Merge remote-tracking branch 'aosp/upstream-master' into master2
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..2a27948
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+---
+BasedOnStyle: Google
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+IndentCaseLabels: false
+AccessModifierOffset: -1
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..d963b7e
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,3 @@
+Dockerfile*
+build
+.*.swp
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2e39a80
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+# Editor's files
+*.swp
+*.swo
+*.pyc
+.idea
+
+# Build artifacts
+/build/
+cmake-build-debug
+debian/**/*.log
+obj-x86_64-linux-gnu
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..dd36669
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,7 @@
+language: generic
+install:
+  - sudo apt-get install -y python-pip
+  - sudo pip install pep8
+script:
+  - ./scripts/check-helpers.sh
+  - find tools/ -type f -name "*.py" | xargs pep8 -r --show-source --ignore=E123,E125,E126,E127,E128,E302
diff --git a/Android.bp b/Android.bp
new file mode 100644
index 0000000..64176d4
--- /dev/null
+++ b/Android.bp
@@ -0,0 +1,77 @@
+//
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+cc_defaults {
+    name: "libbpf_defaults",
+    cpp_std: "c++17",
+    cflags: [
+        "-Wall",
+        "-Werror",
+        "-Wnullable-to-nonnull-conversion",
+        "-Wthread-safety",
+        "-Wunused-parameter",
+    ],
+    tidy: true,
+    tidy_checks: [
+        "android-*",
+        "cert-*",
+        "clang-analyzer-security*",
+        // Disabling due to many unavoidable warnings from POSIX API usage.
+        "-google-runtime-int",
+    ],
+}
+
+cc_library_headers {
+    name: "libbpf_headers",
+    vendor_available: false,
+    host_supported: false,
+    export_include_dirs: ["src/cc/includes/"],
+    target: {
+        linux_bionic: {
+            enabled: true,
+        },
+    },
+}
+
+cc_library {
+    name: "libbpf",
+    vendor_available: false,
+    host_supported: false,
+    target: {
+        android: {
+            srcs: [
+                "src/cc/libbpf.c",
+                "src/cc/perf_reader.c",
+            ],
+            sanitize: {
+                misc_undefined: ["integer"],
+            },
+        },
+    },
+
+    header_libs: [
+        "libbpf_headers"
+    ],
+    export_header_lib_headers: ["libbpf_headers"],
+    local_include_dirs: ["src/cc"],
+
+    defaults: ["libbpf_defaults"],
+    cflags: [
+        "-Werror",
+        "-Wall",
+        "-Wextra",
+    ],
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..bb3f53b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,94 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+cmake_minimum_required(VERSION 2.8.7)
+
+project(bcc)
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+enable_testing()
+
+include(cmake/GetGitRevisionDescription.cmake)
+include(cmake/version.cmake)
+include(CMakeDependentOption)
+include(GNUInstallDirs)
+include(CheckCXXCompilerFlag)
+include(cmake/FindCompilerFlag.cmake)
+
+option(ENABLE_LLVM_SHARED "Enable linking LLVM as a shared library" OFF)
+option(ENABLE_CLANG_JIT "Enable Loading BPF through Clang Frontend" ON)
+option(ENABLE_USDT "Enable User-level Statically Defined Tracing" ON)
+CMAKE_DEPENDENT_OPTION(ENABLE_CPP_API "Enable C++ API" ON "ENABLE_USDT" OFF)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+
+if(NOT PYTHON_ONLY AND ENABLE_CLANG_JIT)
+find_package(BISON)
+find_package(FLEX)
+find_package(LLVM REQUIRED CONFIG)
+message(STATUS "Found LLVM: ${LLVM_INCLUDE_DIRS} ${LLVM_PACKAGE_VERSION}")
+find_package(LibElf REQUIRED)
+
+# clang is linked as a library, but the library path searching is
+# primitively supported, unlike libLLVM
+set(CLANG_SEARCH "/opt/local/llvm/lib;/usr/lib/llvm-3.7/lib;${LLVM_LIBRARY_DIRS}")
+find_library(libclangAnalysis NAMES clangAnalysis HINTS ${CLANG_SEARCH})
+find_library(libclangAST NAMES clangAST HINTS ${CLANG_SEARCH})
+find_library(libclangBasic NAMES clangBasic HINTS ${CLANG_SEARCH})
+find_library(libclangCodeGen NAMES clangCodeGen HINTS ${CLANG_SEARCH})
+find_library(libclangDriver NAMES clangDriver HINTS ${CLANG_SEARCH})
+find_library(libclangEdit NAMES clangEdit HINTS ${CLANG_SEARCH})
+find_library(libclangFrontend NAMES clangFrontend HINTS ${CLANG_SEARCH})
+find_library(libclangLex NAMES clangLex HINTS ${CLANG_SEARCH})
+find_library(libclangParse NAMES clangParse HINTS ${CLANG_SEARCH})
+find_library(libclangRewrite NAMES clangRewrite HINTS ${CLANG_SEARCH})
+find_library(libclangSema NAMES clangSema HINTS ${CLANG_SEARCH})
+find_library(libclangSerialization NAMES clangSerialization HINTS ${CLANG_SEARCH})
+find_library(libclangASTMatchers NAMES clangASTMatchers HINTS ${CLANG_SEARCH})
+if(libclangBasic STREQUAL "libclangBasic-NOTFOUND")
+  message(FATAL_ERROR "Unable to find clang libraries")
+endif()
+FOREACH(DIR ${LLVM_INCLUDE_DIRS})
+  include_directories("${DIR}/../tools/clang/include")
+ENDFOREACH()
+
+# Set to a string path if system places kernel lib directory in
+# non-default location.
+if(NOT DEFINED BCC_KERNEL_MODULES_DIR)
+  set(BCC_KERNEL_MODULES_DIR "/lib/modules")
+endif()
+
+if(NOT DEFINED BCC_PROG_TAG_DIR)
+  set(BCC_PROG_TAG_DIR "/var/tmp/bcc")
+endif()
+
+# As reported in issue #735, GCC 6 has some behavioral problems when
+# dealing with -isystem. Hence, skip the warning optimization
+# altogether on that compiler.
+option(USINGISYSTEM "using -isystem" ON)
+execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+if (USINGISYSTEM AND GCC_VERSION VERSION_LESS 6.0)
+  # iterate over all available directories in LLVM_INCLUDE_DIRS to
+  # generate a correctly tokenized list of parameters
+  foreach(ONE_LLVM_INCLUDE_DIR ${LLVM_INCLUDE_DIRS})
+    set(CXX_ISYSTEM_DIRS "${CXX_ISYSTEM_DIRS} -isystem ${ONE_LLVM_INCLUDE_DIR}")
+  endforeach()
+endif()
+
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 11)
+
+endif(NOT PYTHON_ONLY AND ENABLE_CLANG_JIT)
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall ${CXX_ISYSTEM_DIRS}")
+
+add_subdirectory(src)
+add_subdirectory(introspection)
+if(ENABLE_CLANG_JIT)
+add_subdirectory(examples)
+add_subdirectory(man)
+add_subdirectory(tests)
+add_subdirectory(tools)
+endif(ENABLE_CLANG_JIT)
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 0000000..a009420
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,25 @@
+# This file should be kept up to date with the list of maintainers responsible
+# for the different subdirectories within BCC. One of these people SHOULD
+# review code that touches the respective areas, and MUST review it if the
+# change is substantial or API-breaking.
+
+# see https://help.github.com/articles/about-codeowners/ for syntax
+
+# Miscellaneous
+* @drzaeus77 @goldshtn @yonghong-song @4ast @brendangregg
+
+# Documentation
+/docs/ @brendangregg @goldshtn
+/man/ @brendangregg @goldshtn
+
+# Tools
+/tools/ @brendangregg @goldshtn
+
+# Compiler, C API
+/src/cc/ @drzaeus77 @yonghong-song @4ast
+
+# Python API
+/src/python/ @drzaeus77 @goldshtn
+
+# Tests
+/tests/ @drzaeus77 @yonghong-song
diff --git a/CONTRIBUTING-SCRIPTS.md b/CONTRIBUTING-SCRIPTS.md
new file mode 100644
index 0000000..fdc8768
--- /dev/null
+++ b/CONTRIBUTING-SCRIPTS.md
@@ -0,0 +1,44 @@
+# Contributing bcc/eBPF scripts
+
+If you want to contribute scripts to bcc, or improve your own bcc programs, great! Please read this first.
+
+_(Written by Brendan Gregg.)_
+
+## Type of script
+
+bcc has 2 types of scripts, in different directories:
+
+- **/examples**: intended as short examples of bcc & eBPF code. You should focus on keeping it short, neat, and documented (code comments). A submission can just be the example code.
+- **/tools**: intended as production safe performance and troubleshooting tools. You should focus on it being useful, tested, low overhead, documented (incl. all caveats), and easy to use. A submission should involve 4 changes: the tool, a man page, an example file, and an addition to README.md. Follow [my lead](https://github.com/brendangregg/bcc/commit/9fa156273b395cfc5505f0fff5d6b7b1396f7daa), and see the checklist below. These will be run in mission critical environments as root, so if spending hours testing isn't for you, please submit your idea as an issue instead, or chat with us on irc.
+
+More detail for each below.
+
+## Examples
+
+These are grouped into subdirectories (networking, tracing). Your example can either be a Python program with embedded C (eg, tracing/strlen_count.py), or separate Python and C files (eg, tracing/vfsreadlat.*).
+
+As said earlier: keep it short, neat, and documented (code comments).
+
+## Tools
+
+A checklist for bcc tool development:
+
+1. **Research the topic landscape**. Learn the existing tools and metrics (incl. from /proc). Determine what real world problems exist and need solving. We have too many tools and metrics as it is, we don't need more "I guess that's useful" tools, we need more "ah-hah! I couldn't do this before!" tools. Consider asking other developers about your idea. Many of us can be found in IRC, in the #iovisor channel on irc.oftc.net. There's also the mailing list (see the README.md), and github for issues.
+1. **Create a known workload for testing**. This might involve writing a 10 line C program, using a micro-benchmark, or just improvising at the shell. If you don't know how to create a workload, learn! Figuring this out will provide invaluable context and details that you may have otherwise overlooked. Sometimes it's easy, and I'm able to just use dd(1) from /dev/urandom or a disk device to /dev/null. It lets me set the I/O size, count, and provides throughput statistics for cross-checking my tool output. But other times I need a micro-benchmark, or some C.
+1. **Write the tool to solve the problem and no more**. Unix philosophy: do one thing and do it well. netstat doesn't have an option to dump packets, tcpdump-style. They are two different tools.
+1. **Check your tool correctly measures your known workload**. If possible, run a prime number of events (eg, 23) and check that the numbers match. Try other workload variations.
+1. **Use other observability tools to perform a cross-check or sanity check**. Eg, imagine you write a PCI bus tool that shows current throughput is 28 Gbytes/sec. How could you sanity test that? Well, what PCI devices are there? Disks and network cards? Measure their throughput (iostat, nicstat, sar), and check if it is in the ballpark of 28 Gbytes/sec (which would include PCI frame overheads). Ideally, your numbers match.
+1. **Measure the overhead of the tool**. If you are running a micro-benchmark, how much slower is it with the tool running. Is more CPU consumed? Try to determine the worst case: run the micro-benchmark so that CPU headroom is exhausted, and then run the bcc tool. Can overhead be lowered?
+1. **Test again, and stress test**. You want to discover and fix all the bad things before others hit them.
+1. **Consider command line options**. Should it have -p for filtering on a PID? -T for timestamps? -i for interval? See other tools for examples, and copy the style: the usage message should list example usage at the end. Remember to keep the tool doing one thing and doing it well. Also, if there's one option that seems to be the common case, perhaps it should just be the first argument and not need a switch (no -X). A special case of this is *stat tools, like iostat/vmstat/etc, where the convention is [interval [count]].
+1. **Concise, intuitive, self-explanatory output**. The default output should meet the common need concisely. Leave much less useful fields and data to be shown with options: -v for verbose, etc. Consider including a startup message that's self-explanatory, eg "Tracing block I/O. Output every 1 seconds. Ctrl-C to end.". Also, try hard to keep the output less than 80 characters wide, especially the default output of the tool. That way, the output not only fits on the smallest reasonable terminal, it also fits well in slide decks, blog posts, articles, and printed material, all of which help education and adoption. Publishers of technical books often have templates they require books to conform to: it may not be an option to shrink or narrow the font to fit your output.
+1. **Use pep8 to check Python style**: pep8 --show-source --ignore=E123,E125,E126,E127,E128,E302 filename . Note that it misses some things, like consistent usage, so you'll still need to double check your script.
+1. **Make sure your script is Python3-ready**: Adding `from __future__ import absolute_import, division, print_function, unicode_literals` helps make your script Python3-ready.
+1. **Write an _example.txt file**. Copy the style in tools/biolatency_example.txt: start with an intro sentence, then have examples, and finish with the USAGE message. Explain everything: the first example should explain what we are seeing, even if this seems obvious. For some people it won't be obvious. Also explain why we are running the tool: what problems it's solving. It can take a long time (hours) to come up with good examples, but it's worth it. These will get copied around (eg, presentations, articles).
+1. **Read your example.txt file**. Does this sound too niche or convoluted? Are you spending too much time explaining caveats? These can be hints that perhaps you should fix your tool, or abandon it! Perhaps it better belongs as an /example, and not a tool. I've abandoned many tools at this stage.
+1. **Write a man page**. Either ROFF (.8), markdown (.md), or plain text (.txt): so long as it documents the important sections, particularly columns (fields) and caveats. These go under man/man8. See the other examples. Include a section on overhead, and pull no punches. It's better for end users to know about high overhead beforehand, than to discover it the hard way. Also explain caveats. Don't assume those will be obvious to tool users.
+1. **Read your man page**. For ROFF: nroff -man filename. Like before, this exercise is like saying something out loud. Does it sound too niche or convoluted? Again, hints that you might need to go back and fix things, or abandon it.
+1. **Spell check your documentation**. Use a spell checker like aspell to check your document quality before committing.
+1. **Add an entry to README.md**.
+1. **Add a smoke test** to [test_tools_smoke.py](https://github.com/iovisor/bcc/blob/master/tests/python/test_tools_smoke.py), which serves as a basic check that your tool still works when we make changes to the core library.
+1. If you made it this far, pull request!
diff --git a/Dockerfile.debian b/Dockerfile.debian
new file mode 100644
index 0000000..9b0284d
--- /dev/null
+++ b/Dockerfile.debian
@@ -0,0 +1,17 @@
+FROM debian:stretch
+
+MAINTAINER Brenden Blanco <bblanco@gmail.com>
+
+RUN DEBIAN_RELEASE=stretch && \
+    # Adding non-free repo for netperf
+    echo "deb http://deb.debian.org/debian ${DEBIAN_RELEASE} non-free" > \
+        /etc/apt/sources.list.d/debian-non-free.list && \
+    apt-get -qq update && \
+    apt-get -y install pbuilder aptitude
+
+COPY ./ /root/bcc
+
+WORKDIR /root/bcc
+
+RUN /usr/lib/pbuilder/pbuilder-satisfydepends && \
+    ./scripts/build-deb.sh
diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu
new file mode 100644
index 0000000..71d025d
--- /dev/null
+++ b/Dockerfile.ubuntu
@@ -0,0 +1,13 @@
+FROM ubuntu:bionic
+
+MAINTAINER Brenden Blanco <bblanco@gmail.com>
+
+RUN apt-get -qq update && \
+    apt-get -y install pbuilder aptitude
+
+COPY ./ /root/bcc
+
+WORKDIR /root/bcc
+
+RUN /usr/lib/pbuilder/pbuilder-satisfydepends && \
+    ./scripts/build-deb.sh
diff --git a/FAQ.txt b/FAQ.txt
new file mode 100644
index 0000000..c898afb
--- /dev/null
+++ b/FAQ.txt
@@ -0,0 +1,47 @@
+Q: while running 'make test' I'm seeing:
+   'ImportError: No module named pyroute2'
+A: Install pyroute2:
+   git clone https://github.com/svinota/pyroute2.git
+   cd pyroute2; sudo make install
+
+Q: hello_world.py fails with:
+   OSError: libbcc.so: cannot open shared object file: No such file or directory
+A: make sure to 'make install' and add the directory
+   where libbcc.so was installed into your LD_LIBRARY_PATH
+   export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+
+Q: hello_world.py fails with:
+   ImportError: No module named bpf
A: check the "sudo make install" output to find out bpf package installation site,
+   add it to the PYTHONPATH env variable before running the program.
+   sudo bash -c 'PYTHONPATH=/usr/lib/python2.7/site-packages python examples/hello_world.py'
+
+Q: hello_world.py still fails with:
+   bpf: Operation not permitted
+   Exception: Failed to load BPF program hello
+A: sudo
+
+Q: How do I fulfill the Linux kernel version requirement?
+A: You need to obtain a recent version of the Linux source code
+   (please look at the README for the exact version), enable the
+   configuration options listed in the README file, install the image,
+   modules and headers, update your bootloader and reboot into the new
+   kernel.
+
+   If you want to compile your own kernel, you can fetch the sources
+   from kernel.org or through your Linux distribution.
+   To install, you need all of the following:
+      make install
+      make modules_install
+      make headers_install INSTALL_HDR_PATH=/usr/local/
+
+Q: hello_world.py fails with:
+   ImportError: No module named past.builtins
+A: sudo pip install future
+
+Q: Running one of the bcc tools produces an import error:
+   Traceback (most recent call last):
+   File "./execsnoop", line 20, in <module>
+     from bcc import BPF
+   ImportError: No module named bcc
+A: Make sure the python bcc bindings package (python2-bcc) is installed.
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 0000000..25df2ec
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,402 @@
+# Installing BCC
+
+* [Kernel Configuration](#kernel-configuration)
+* [Packages](#packages)
+  - [Ubuntu](#ubuntu---binary)
+  - [Fedora](#fedora---binary)
+  - [Arch](#arch---aur)
+  - [Gentoo](#gentoo---portage)
+  - [openSUSE](#opensuse---binary)
+  - [RHEL](#rhel---binary)
+* [Source](#source)
+  - [Debian](#debian---source)
+  - [Ubuntu](#ubuntu---source)
+  - [Fedora](#fedora---source)
+  - [openSUSE](#opensuse---source)
+  - [Amazon Linux](#amazon-linux---source)
+* [Older Instructions](#older-instructions)
+
+## Kernel Configuration
+
+In general, to use these features, a Linux kernel version 4.1 or newer is
+required. In addition, the kernel should have been compiled with the following
+flags set:
+
+```
+CONFIG_BPF=y
+CONFIG_BPF_SYSCALL=y
+# [optional, for tc filters]
+CONFIG_NET_CLS_BPF=m
+# [optional, for tc actions]
+CONFIG_NET_ACT_BPF=m
+CONFIG_BPF_JIT=y
+CONFIG_HAVE_BPF_JIT=y
+# [optional, for kprobes]
+CONFIG_BPF_EVENTS=y
+```
+
+There are a few optional kernel flags needed for running bcc networking examples on vanilla kernel:
+
+```
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_ACT_POLICE=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_DUMMY=m
+CONFIG_VXLAN=m
+```
+
+Kernel compile flags can usually be checked by looking at `/proc/config.gz` or
+`/boot/config-<kernel-version>`.
+
+# Packages
+
+## Ubuntu - Binary
+
+The stable and the nightly packages are built for Ubuntu Xenial (16.04), Ubuntu Artful (17.10) and Ubuntu Bionic (18.04). The steps are very straightforward, no need to upgrade the kernel or compile from source!
+
+**Stable and Signed Packages**
+
+```bash
+sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 4052245BD4284CDD
+echo "deb https://repo.iovisor.org/apt/xenial xenial main" | sudo tee /etc/apt/sources.list.d/iovisor.list
+sudo apt-get update
+sudo apt-get install bcc-tools libbcc-examples linux-headers-$(uname -r)
+```
+(replace `xenial` with `artful` or `bionic` as appropriate)
+
+**Nightly Packages**
+
+```bash
+echo "deb [trusted=yes] https://repo.iovisor.org/apt/xenial xenial-nightly main" | sudo tee /etc/apt/sources.list.d/iovisor.list
+sudo apt-get update
+sudo apt-get install bcc-tools libbcc-examples linux-headers-$(uname -r)
+```
+(replace `xenial` with `artful` or `bionic` as appropriate)
+
+## Fedora - Binary
+
+Ensure that you are running a 4.2+ kernel with `uname -r`. If not, install a 4.2+ kernel from
+http://alt.fedoraproject.org/pub/alt/rawhide-kernel-nodebug, for example:
+
+```bash
+sudo dnf config-manager --add-repo=http://alt.fedoraproject.org/pub/alt/rawhide-kernel-nodebug/fedora-rawhide-kernel-nodebug.repo
+sudo dnf update
+# reboot
+```
+
+**Nightly Packages**
+
+Nightly bcc binary packages for Fedora 25, 26, 27, and 28 are hosted at
+`https://repo.iovisor.org/yum/nightly/f{25,26,27,28}`.
+
+To install:
+```bash
+echo -e '[iovisor]\nbaseurl=https://repo.iovisor.org/yum/nightly/f27/$basearch\nenabled=1\ngpgcheck=0' | sudo tee /etc/yum.repos.d/iovisor.repo
+sudo dnf install bcc-tools kernel-headers kernel-devel
+```
+
+**Stable and Signed Packages**
+
+Stable bcc binary packages for Fedora 25, 26, 27, and 28 are hosted at
+`https://repo.iovisor.org/yum/main/f{25,26,27,28}`.
+
+```bash
+echo -e '[iovisor]\nbaseurl=https://repo.iovisor.org/yum/main/f27/$basearch\nenabled=1' | sudo tee /etc/yum.repos.d/iovisor.repo
+sudo dnf install bcc-tools kernel-devel-$(uname -r) kernel-headers-$(uname -r)
+```
+
+## Arch - AUR
+
+Upgrade the kernel to minimum 4.3.1-1 first; the ```CONFIG_BPF_SYSCALL=y``` configuration was not added until [this kernel release](https://bugs.archlinux.org/task/47008).
+
+Install these packages using any AUR helper such as [pacaur](https://aur.archlinux.org/packages/pacaur), [yaourt](https://aur.archlinux.org/packages/yaourt), [cower](https://aur.archlinux.org/packages/cower), etc.:
+```
+bcc bcc-tools python-bcc python2-bcc
+```
+All build and install dependencies are listed [in the PKGBUILD](https://aur.archlinux.org/cgit/aur.git/tree/PKGBUILD?h=bcc) and should install automatically.
+
+## Gentoo - Portage
+
+First of all, upgrade the kernel of your choice to a recent version. For example:
+```
+emerge sys-kernel/gentoo-sources
+```
+Then, configure the kernel enabling the features you need. Please consider the following as a starting point:
+```
+CONFIG_BPF=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_NET_CLS_BPF=m
+CONFIG_NET_ACT_BPF=m
+CONFIG_BPF_JIT=y
+CONFIG_BPF_EVENTS=y
+```
+Finally, you can install bcc with:
+```
+emerge dev-util/bcc
+```
+The appropriate dependencies (e.g., ```clang```, ```llvm``` with BPF backend) will be pulled automatically.
+
+## openSUSE - Binary
+
+For openSUSE Leap 42.2 (and later) and Tumbleweed, bcc is already included in the official repo. Just install
+the packages with zypper.
+
+```bash
+sudo zypper ref
+sudo zypper in bcc-tools bcc-examples
+```
+
+## RHEL - Binary
+
+For Redhat 7.6 (Beta) bcc is already included in the official yum repository as bcc-tools. As part of the install the following dependencies are installed: bcc.x86_64 0:0.6.0-3.el7 ,llvm-private.x86_64 0:6.0.1-2.el7 ,python-bcc.x86_64 0:0.6.0-3.el7,python-netaddr.noarch 0:0.7.5-9.el7
+
+```
+yum install bcc-tools
+```
+
+# Source
+
+## Debian - Source
+### Jessie
+#### Repositories
+
+The automated tests that run as part of the build process require `netperf`.  Since netperf's license is not "certified"
+as an open-source license, it is in Debian's `non-free` repository.
+
+`/etc/apt/sources.list` should include the `non-free` repository and look something like this:
+
+```
+deb http://httpredir.debian.org/debian/ jessie main non-free
+deb-src http://httpredir.debian.org/debian/ jessie main non-free
+
+deb http://security.debian.org/ jessie/updates main non-free
+deb-src http://security.debian.org/ jessie/updates main non-free
+
+# wheezy-updates, previously known as 'volatile'
+deb http://ftp.us.debian.org/debian/ jessie-updates main non-free
+deb-src http://ftp.us.debian.org/debian/ jessie-updates main non-free
+```
+
+BCC also requires kernel version 4.1 or above.  Those kernels are available in the `jessie-backports` repository.  To
+add the `jessie-backports` repository to your system create the file `/etc/apt/sources.list.d/jessie-backports.list`
+with the following contents:
+
+```
+deb http://httpredir.debian.org/debian jessie-backports main
+deb-src http://httpredir.debian.org/debian jessie-backports main
+```
+
+#### Install Build Dependencies
+
+Note, check for the latest `linux-image-4.x` version in `jessie-backports` before proceeding.  Also, have a look at the
+`Build-Depends:` section in `debian/control` file.
+
+```
+# Before you begin
+apt-get update
+
+# Update kernel and linux-base package
+apt-get -t jessie-backports install linux-base linux-image-4.9.0-0.bpo.2-amd64 linux-headers-4.9.0-0.bpo.2-amd64
+
+# BCC build dependencies:
+apt-get install debhelper cmake libllvm3.8 llvm-3.8-dev libclang-3.8-dev \
+  libelf-dev bison flex libedit-dev clang-format-3.8 python python-netaddr \
+  python-pyroute2 luajit libluajit-5.1-dev arping iperf netperf ethtool \
+  devscripts zlib1g-dev libfl-dev
+```
+
+#### Sudo
+
+Adding eBPF probes to the kernel and removing probes from it requires root privileges.  For the build to complete
+successfully, you must build from an account with `sudo` access.  (You may also build as root, but it is bad style.)
+
+`/etc/sudoers` or `/etc/sudoers.d/build-user` should contain
+
+```
+build-user ALL = (ALL) NOPASSWD: ALL
+```
+
+or
+
+```
+build-user ALL = (ALL) ALL
+```
+
+If using the latter sudoers configuration, please keep an eye out for sudo's password prompt while the build is running.
+
+#### Build
+
+```
+cd <preferred development directory>
+git clone https://github.com/iovisor/bcc.git
+cd bcc
+debuild -b -uc -us
+```
+
+#### Install
+
+```
+cd ..
+sudo dpkg -i *bcc*.deb
+```
+
+## Ubuntu - Source
+
+To build the toolchain from source, one needs:
+* LLVM 3.7.1 or newer, compiled with BPF support (default=on)
+* Clang, built from the same tree as LLVM
+* cmake (>=3.1), gcc (>=4.7), flex, bison
+* LuaJIT, if you want Lua support
+
+### Install build dependencies
+```
+# Trusty and older
+VER=trusty
+echo "deb http://llvm.org/apt/$VER/ llvm-toolchain-$VER-3.7 main
+deb-src http://llvm.org/apt/$VER/ llvm-toolchain-$VER-3.7 main" | \
+  sudo tee /etc/apt/sources.list.d/llvm.list
+wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
+sudo apt-get update
+
+# All versions
+sudo apt-get -y install bison build-essential cmake flex git libedit-dev \
+  libllvm3.7 llvm-3.7-dev libclang-3.7-dev python zlib1g-dev libelf-dev
+
+# For Lua support
+sudo apt-get -y install luajit luajit-5.1-dev
+```
+
+### Install and compile BCC
+```
+git clone https://github.com/iovisor/bcc.git
+mkdir bcc/build; cd bcc/build
+cmake .. -DCMAKE_INSTALL_PREFIX=/usr
+make
+sudo make install
+```
+
+## Fedora - Source
+
+### Install build dependencies
+
+```
+sudo dnf install -y bison cmake ethtool flex git iperf libstdc++-static \
+  python-netaddr python-pip gcc gcc-c++ make zlib-devel \
+  elfutils-libelf-devel
+sudo dnf install -y luajit luajit-devel  # for Lua support
+sudo dnf install -y \
+  http://repo.iovisor.org/yum/extra/mageia/cauldron/x86_64/netperf-2.7.0-1.mga6.x86_64.rpm
+sudo pip install pyroute2
+```
+
+### Install binary clang
+
+```
+# FC22
+wget http://llvm.org/releases/3.7.1/clang+llvm-3.7.1-x86_64-fedora22.tar.xz
+sudo tar xf clang+llvm-3.7.1-x86_64-fedora22.tar.xz -C /usr/local --strip 1
+
+# FC23
+wget http://llvm.org/releases/3.9.0/clang+llvm-3.9.0-x86_64-fedora23.tar.xz
+sudo tar xf clang+llvm-3.9.0-x86_64-fedora23.tar.xz -C /usr/local --strip 1
+
+# FC24 and FC25
+sudo dnf install -y clang clang-devel llvm llvm-devel llvm-static ncurses-devel
+```
+
+### Install and compile BCC
+```
+git clone https://github.com/iovisor/bcc.git
+mkdir bcc/build; cd bcc/build
+cmake .. -DCMAKE_INSTALL_PREFIX=/usr
+make
+sudo make install
+```
+
+## openSUSE - Source
+
+### Install build dependencies
+
+```
+sudo zypper in bison cmake flex gcc gcc-c++ git libelf-devel libstdc++-devel \
+  llvm-devel clang-devel pkg-config python-devel python-setuptools python3-devel \
+  python3-setuptools
+sudo zypper in luajit-devel       # for lua support in openSUSE Leap 42.2 or later
+sudo zypper in lua51-luajit-devel # for lua support in openSUSE Tumbleweed
+```
+
+### Install and compile BCC
+```
+git clone https://github.com/iovisor/bcc.git
+mkdir bcc/build; cd bcc/build
+cmake -DCMAKE_INSTALL_PREFIX=/usr \
+      -DLUAJIT_INCLUDE_DIR=`pkg-config --variable=includedir luajit` \ # for lua support
+      ..
+make
+sudo make install
+cmake -DPYTHON_CMD=python3 .. # build python3 binding
+pushd src/python/
+make
+sudo make install
+popd
+```
+
+## Amazon Linux - Source
+
+Tested on Amazon Linux AMI release 2018.03 (kernel 4.14.47-56.37.amzn1.x86_64)
+
+### Install packages required for building
+```
+# enable epel to get iperf, luajit, luajit-devel, cmake3 (cmake3 is required to support c++11) 
+sudo yum-config-manager --enable epel
+
+sudo yum install -y bison cmake3 ethtool flex git iperf libstdc++-static python-netaddr gcc gcc-c++ make zlib-devel elfutils-libelf-devel
+sudo yum install -y luajit luajit-devel
+sudo yum install -y http://repo.iovisor.org/yum/extra/mageia/cauldron/x86_64/netperf-2.7.0-1.mga6.x86_64.rpm
+sudo pip install pyroute2
+sudo yum install -y ncurses-devel
+```
+
+### Install clang 3.7.1 pre-built binaries
+```
+wget http://releases.llvm.org/3.7.1/clang+llvm-3.7.1-x86_64-fedora22.tar.xz
+tar xf clang*
+(cd clang* && sudo cp -R * /usr/local/)
+```
+
+### Build bcc
+```
+git clone https://github.com/iovisor/bcc.git
+pushd .
+mkdir bcc/build; cd bcc/build
+cmake3 .. -DCMAKE_INSTALL_PREFIX=/usr
+time make
+sudo make install
+popd
+```
+
+### Setup required to run the tools
+```
+sudo yum -y install kernel-devel-$(uname -r)
+sudo mount -t debugfs debugfs /sys/kernel/debug
+```
+
+### Test
+```
+sudo /usr/share/bcc/tools/execsnoop
+```
+
+# Older Instructions
+
+## Build LLVM and Clang development libs
+
+```
+git clone http://llvm.org/git/llvm.git
+cd llvm/tools; git clone http://llvm.org/git/clang.git
+cd ..; mkdir -p build/install; cd build
+cmake -G "Unix Makefiles" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
+  -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PWD/install ..
+make
+make install
+export PATH=$PWD/install/bin:$PATH
+```
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..8dada3e
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/LINKS.md b/LINKS.md
new file mode 100644
index 0000000..9eb1abe
--- /dev/null
+++ b/LINKS.md
@@ -0,0 +1,45 @@
+- 2018-05-03: [Linux System Monitoring with eBPF](https://www.circonus.com/2018/05/linux-system-monitoring-with-ebpf)
+- 2018-02-22: [Some advanced BCC topics](https://lwn.net/Articles/747640)
+- 2018-01-23: [BPFd: Running BCC tools remotely across systems and architectures](https://lwn.net/Articles/744522)
+- 2017-12-22: [An introduction to the BPF Compiler Collection](https://lwn.net/Articles/742082)
+- 2017-09-13: [Performance Analysis Superpowers with Linux BPF](https://www.slideshare.net/brendangregg/ossna-2017-performance-analysis-superpowers-with-linux-bpf)
+- 2017-07-28: [Tracing a packet journey using Linux tracepoints, perf and eBPF](https://blog.yadutaf.fr/2017/07/28/tracing-a-packet-journey-using-linux-tracepoints-perf-ebpf/)
+- 2017-07-13: [Performance Superpowers with Enhanced BPF](https://www.usenix.org/conference/atc17/program/presentation/gregg-superpowers)
+- 2017-06-28: [The BSD Packet Filter](https://speakerdeck.com/tuxology/the-bsd-packet-filter)
+- 2017-03-04: [Linux 4.x Tracing: Performance Analysis with bcc/BPF](https://www.slideshare.net/brendangregg/linux-4x-tracing-performance-analysis-with-bccbpf)
+- 2017-02-27: [Profiling a .NET Core Application on Linux](https://blogs.microsoft.co.il/sasha/2017/02/27/profiling-a-net-core-application-on-linux)
+- 2017-02-05: [gobpf - utilizing eBPF from Go](https://fosdem.org/2017/schedule/event/go_bpf/attachments/slides/1681/export/events/attachments/go_bpf/slides/1681/gobpf_utilizing_eBPF_from_Go_FOSDEM_2017.pdf)
+- 2017-01-31: [Golang bcc/BPF Function Tracing](http://www.brendangregg.com/blog/2017-01-31/golang-bcc-bpf-function-tracing.html)
+- 2017-01-18: [BPF: Tracing and more](https://www.slideshare.net/brendangregg/bpf-tracing-and-more)
+- 2016-12-09: [Linux 4.x Tracing Tools: Using BPF Superpowers](https://www.slideshare.net/brendangregg/linux-4x-tracing-tools-using-bpf-superpowers)
+- 2016-11-30: [Introducing gobpf - Using eBPF from Go](https://kinvolk.io/blog/2016/11/introducing-gobpf---using-ebpf-from-go)
+- 2016-11-30: [Linux bcc/BPF tcplife: TCP Lifespans](http://www.brendangregg.com/blog/2016-11-30/linux-bcc-tcplife.html)
+- 2016-10-27: [DTrace for Linux 2016](http://www.brendangregg.com/blog/2016-10-27/dtrace-for-linux-2016.html)
+- 2016-10-21: [Linux 4.9's Efficient BPF-based Profiler](http://www.brendangregg.com/blog/2016-10-21/linux-efficient-profiler.html)
+- 2016-10-15: [Linux bcc tcptop](http://www.brendangregg.com/blog/2016-10-15/linux-bcc-tcptop.html)
+- 2016-10-12: [Linux bcc/BPF Node.js USDT Tracing](http://www.brendangregg.com/blog/2016-10-12/linux-bcc-nodejs-usdt.html)
+- 2016-10-08: [Linux bcc/BPF Run Queue (Scheduler) Latency](http://www.brendangregg.com/blog/2016-10-08/linux-bcc-runqlat.html)
+- 2016-10-06: [Linux bcc ext4 Latency Tracing](http://www.brendangregg.com/blog/2016-10-06/linux-bcc-ext4dist-ext4slower.html)
+- 2016-10-04: [Installing bcc to evaluate BPF and Postgres](http://blog.gregburek.com/2016/10/04/installing-bcc-to-evaluate-bpf-and-postgres)
+- 2016-10-04: [Linux MySQL Slow Query Tracing with bcc/BPF](http://www.brendangregg.com/blog/2016-10-04/linux-bcc-mysqld-qslower.html)
+- 2016-10-01: [Linux bcc Tracing Security Capabilities](http://www.brendangregg.com/blog/2016-10-01/linux-bcc-security-capabilities.html)
+- 2016-09-23: [BCC – Dynamic Tracing Tools for Linux Performance Monitoring, Networking and More](http://www.tecmint.com/bcc-best-linux-performance-monitoring-tools/)
+- 2016-08-22: [BoF - What Can BPF Do For You?](https://events.linuxfoundation.org/sites/events/files/slides/iovisor-lc-bof-2016.pdf)
+- 2016-07-03: [Linux debugging tools I love](https://jvns.ca/blog/2016/07/03/debugging-tools-i-love)
+- 2016-06-14: [Ubuntu Xenial bcc/BPF](http://www.brendangregg.com/blog/2016-06-14/ubuntu-xenial-bcc-bpf.html)
+- 2016-05-26: [Linux BPF/bcc for Oracle Tracing](https://db-blog.web.cern.ch/blog/luca-canali/2016-05-linux-bpfbcc-oracle-tracing)
+- 2016-05-04: [Tracing your TCP IPv4 connections with eBPF and BCC from the Linux kernel JIT-VM to Splunk](https://www.splunk.com/blog/2016/05/04/tracing-your-tcp-ipv4-connections-with-ebpf-and-bcc-from-the-linux-kernel-jit-vm-to-splunk/)
+- 2016-03-31: [Probing the JVM with BPF/BCC](http://blogs.microsoft.co.il/sasha/2016/03/31/probing-the-jvm-with-bpfbcc/)
+- 2016-03-30: [How to turn any syscall into an event: Introducing eBPF Kernel probes](https://blog.yadutaf.fr/2016/03/30/turn-any-syscall-into-event-introducing-ebpf-kernel-probes)
+- 2016-03-30: [USDT Probe Support in BPF/BCC](http://blogs.microsoft.co.il/sasha/2016/03/30/usdt-probe-support-in-bpfbcc)
+- 2016-03-28: [Linux BPF/bcc Road Ahead, March 2016](http://www.brendangregg.com/blog/2016-03-28/linux-bpf-bcc-road-ahead-2016.html)
+- 2016-03-05: [Linux BPF Superpowers](http://www.brendangregg.com/blog/2016-03-05/linux-bpf-superpowers.html)
+- 2016-03-02: [Linux BPF Superpowers](https://www.slideshare.net/brendangregg/linux-bpf-superpowers)
+- 2016-02-14: [Two New eBPF Tools: memleak and argdist](http://blogs.microsoft.co.il/sasha/2016/02/14/two-new-ebpf-tools-memleak-and-argdist/)
+- 2016-02-08: [Linux eBPF/bcc uprobes](http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html)
+- 2016-02-05: [Who is waking the waker? (Linux chain graph prototype)](http://www.brendangregg.com/blog/2016-02-05/ebpf-chaingraph-prototype.html)
+- 2016-02-01: [Linux Wakeup and Off-Wake Profiling](http://www.brendangregg.com/blog/2016-02-01/linux-wakeup-offwake-profiling.html)
+- 2016-01-20: [Linux eBPF Off-CPU Flame Graph](http://www.brendangregg.com/blog/2016-01-20/ebpf-offcpu-flame-graph.html)
+- 2016-01-18: [Linux eBPF Stack Trace Hack](http://www.brendangregg.com/blog/2016-01-18/ebpf-stack-trace-hack.html)
+- 2015-10-31: [tcpconnect and tcpaccept for Linux (bcc)](http://www.brendangregg.com/blog/2015-10-31/tcpconnect-tcpaccept-bcc.html)
+- 2015-09-22: [bcc: Taming Linux 4.3+ Tracing Superpowers](http://www.brendangregg.com/blog/2015-09-22/bcc-linux-4.3-tracing.html)
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..d2d1a3c
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,16 @@
+name: "BCC"
+description:
+    "BCC is a toolkit for creating efficient kernel tracing and manipulation programs"
+
+third_party {
+  url {
+    type: HOMEPAGE
+    value: "https://github.com/iovisor/bcc"
+  }
+  url {
+    type: GIT
+    value: "https://github.com/iovisor/bcc.git"
+  }
+  version: "b998421b18a34d0b47a6bda996c91bad12fa5da0"
+  last_upgrade_date { year: 2018 month: 10 day: 31 }
+}
diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_APACHE2
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/QUICKSTART.md b/QUICKSTART.md
new file mode 100644
index 0000000..29e9b4e
--- /dev/null
+++ b/QUICKSTART.md
@@ -0,0 +1,20 @@
+# Quick Start Guide
+
+A Docker container is provided for users to try out [bcc](https://github.com/iovisor/bcc).
+
+From your host shell:
+```bash
+docker run -it --rm \
+  --privileged \
+  -v /lib/modules:/lib/modules:ro \
+  -v /usr/src:/usr/src:ro \
+  -v /etc/localtime:/etc/localtime:ro \
+  --workdir /usr/share/bcc/tools \
+  zlim/bcc
+```
+
+Now, from the container shell, you can try the various pre-installed bcc tools.
+
+For examples, please refer to the [tutorial](docs/tutorial.md#1-general-performance).
+
+If you wish to install bcc on your host, please refer to [INSTALL.md](INSTALL.md).
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..50d6db0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,250 @@
+![BCC Logo](images/logo2.png)
+# BPF Compiler Collection (BCC)
+
+BCC is a toolkit for creating efficient kernel tracing and manipulation
+programs, and includes several useful tools and examples. It makes use of
+extended BPF (Berkeley Packet Filters), formally known as eBPF, a new feature
+that was first added to Linux 3.15. Much of what BCC uses requires Linux 4.1
+and above.
+
+eBPF was [described by](https://lkml.org/lkml/2015/4/14/232) Ingo Molnár as:
+
+> One of the more interesting features in this cycle is the ability to attach eBPF programs (user-defined, sandboxed bytecode executed by the kernel) to kprobes. This allows user-defined instrumentation on a live kernel image that can never crash, hang or interfere with the kernel negatively.
+
+BCC makes BPF programs easier to write, with kernel instrumentation in C
+(and includes a C wrapper around LLVM), and front-ends in Python and lua.
+It is suited for many tasks, including performance analysis and network
+traffic control.
+
+## Screenshot
+
+This example traces a disk I/O kernel function, and populates an in-kernel
+power-of-2 histogram of the I/O size. For efficiency, only the histogram
+summary is returned to user-level.
+
+```Shell
+# ./bitehist.py
+Tracing... Hit Ctrl-C to end.
+^C
+     kbytes          : count     distribution
+       0 -> 1        : 3        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 211      |**********                            |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 800      |**************************************|
+```
+
+The above output shows a bimodal distribution, where the largest mode of
+800 I/O was between 128 and 255 Kbytes in size.
+
+See the source: [bitehist.py](examples/tracing/bitehist.py). What this traces,
+what this stores, and how the data is presented, can be entirely customized.
+This shows only some of many possible capabilities.
+
+## Installing
+
+See [INSTALL.md](INSTALL.md) for installation steps on your platform.
+
+## FAQ
+
+See [FAQ.txt](FAQ.txt) for the most common troubleshoot questions.
+
+## Reference guide
+
+See [docs/reference_guide.md](docs/reference_guide.md) for the reference guide to the bcc and bcc/BPF APIs.
+
+## Contents
+
+Some of these are single files that contain both C and Python, others have a
+pair of .c and .py files, and some are directories of files.
+
+### Tracing
+
+#### Examples:
+
+- examples/tracing/[bitehist.py](examples/tracing/bitehist.py): Block I/O size histogram. [Examples](examples/tracing/bitehist_example.txt).
+- examples/tracing/[disksnoop.py](examples/tracing/disksnoop.py): Trace block device I/O latency. [Examples](examples/tracing/disksnoop_example.txt).
+- examples/[hello_world.py](examples/hello_world.py): Prints "Hello, World!" for new processes.
+- examples/tracing/[mysqld_query.py](examples/tracing/mysqld_query.py): Trace MySQL server queries using USDT probes. [Examples](examples/tracing/mysqld_query_example.txt).
+- examples/tracing/[nodejs_http_server.py](examples/tracing/nodejs_http_server.py): Trace Node.js HTTP server requests using USDT probes. [Examples](examples/tracing/nodejs_http_server_example.txt).
+- examples/tracing/[stacksnoop](examples/tracing/stacksnoop.py): Trace a kernel function and print all kernel stack traces. [Examples](examples/tracing/stacksnoop_example.txt).
+- tools/[statsnoop](tools/statsnoop.py): Trace stat() syscalls. [Examples](tools/statsnoop_example.txt).
+- examples/tracing/[task_switch.py](examples/tracing/task_switch.py): Count task switches with from and to PIDs.
+- examples/tracing/[tcpv4connect.py](examples/tracing/tcpv4connect.py): Trace TCP IPv4 active connections. [Examples](examples/tracing/tcpv4connect_example.txt).
+- examples/tracing/[trace_fields.py](examples/tracing/trace_fields.py): Simple example of printing fields from traced events.
+- examples/tracing/[urandomread.py](examples/tracing/urandomread.py): A kernel tracepoint example, which traces random:urandom_read. [Examples](examples/tracing/urandomread_example.txt).
+- examples/tracing/[vfsreadlat.py](examples/tracing/vfsreadlat.py) examples/tracing/[vfsreadlat.c](examples/tracing/vfsreadlat.c): VFS read latency distribution. [Examples](examples/tracing/vfsreadlat_example.txt).
+- examples/tracing/[kvm_hypercall.py](examples/tracing/kvm_hypercall.py): Conditional static kernel tracepoints for KVM entry, exit and hypercall [Examples](examples/tracing/kvm_hypercall.txt).
+
+#### Tools:
+<center><a href="images/bcc_tracing_tools_2017.png"><img src="images/bcc_tracing_tools_2017.png" border=0 width=700></a></center>
+
+
+- tools/[argdist](tools/argdist.py): Display function parameter values as a histogram or frequency count. [Examples](tools/argdist_example.txt).
+- tools/[bashreadline](tools/bashreadline.py): Print entered bash commands system wide. [Examples](tools/bashreadline_example.txt).
+- tools/[biolatency](tools/biolatency.py): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt).
+- tools/[biotop](tools/biotop.py): Top for disks: Summarize block device I/O by process. [Examples](tools/biotop_example.txt).
+- tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt).
+- tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt).
+- tools/[bpflist](tools/bpflist.py): Display processes with active BPF programs and maps. [Examples](tools/bpflist_example.txt).
+- tools/[btrfsdist](tools/btrfsdist.py): Summarize btrfs operation latency distribution as a histogram. [Examples](tools/btrfsdist_example.txt).
+- tools/[btrfsslower](tools/btrfsslower.py): Trace slow btrfs operations. [Examples](tools/btrfsslower_example.txt).
+- tools/[capable](tools/capable.py): Trace security capability checks. [Examples](tools/capable_example.txt).
+- tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt).
+- tools/[cachetop](tools/cachetop.py): Trace page cache hit/miss ratio by processes. [Examples](tools/cachetop_example.txt).
+- tools/[cpudist](tools/cpudist.py): Summarize on- and off-CPU time per task as a histogram. [Examples](tools/cpudist_example.txt)
+- tools/[cpuunclaimed](tools/cpuunclaimed.py): Sample CPU run queues and calculate unclaimed idle CPU. [Examples](tools/cpuunclaimed_example.txt)
+- tools/[criticalstat](tools/criticalstat.py): Trace and report long atomic critical sections in the kernel. [Examples](tools/criticalstat_example.txt)
+- tools/[dbslower](tools/dbslower.py): Trace MySQL/PostgreSQL queries slower than a threshold. [Examples](tools/dbslower_example.txt).
+- tools/[dbstat](tools/dbstat.py): Summarize MySQL/PostgreSQL query latency as a histogram. [Examples](tools/dbstat_example.txt).
+- tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt).
+- tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt).
+- tools/[deadlock_detector](tools/deadlock_detector.py): Detect potential deadlocks on a running process. [Examples](tools/deadlock_detector_example.txt).
+- tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt).
+- tools/[ext4dist](tools/ext4dist.py): Summarize ext4 operation latency distribution as a histogram. [Examples](tools/ext4dist_example.txt).
+- tools/[ext4slower](tools/ext4slower.py): Trace slow ext4 operations. [Examples](tools/ext4slower_example.txt).
+- tools/[filelife](tools/filelife.py): Trace the lifespan of short-lived files. [Examples](tools/filelife_example.txt).
+- tools/[fileslower](tools/fileslower.py): Trace slow synchronous file reads and writes. [Examples](tools/fileslower_example.txt).
+- tools/[filetop](tools/filetop.py): File reads and writes by filename and process. Top for files. [Examples](tools/filetop_example.txt).
+- tools/[funccount](tools/funccount.py): Count kernel function calls. [Examples](tools/funccount_example.txt).
+- tools/[funclatency](tools/funclatency.py): Time functions and show their latency distribution. [Examples](tools/funclatency_example.txt).
+- tools/[funcslower](tools/funcslower.py): Trace slow kernel or user function calls. [Examples](tools/funcslower_example.txt).
+- tools/[gethostlatency](tools/gethostlatency.py): Show latency for getaddrinfo/gethostbyname[2] calls. [Examples](tools/gethostlatency_example.txt).
+- tools/[hardirqs](tools/hardirqs.py):  Measure hard IRQ (hard interrupt) event time. [Examples](tools/hardirqs_example.txt).
+- tools/[inject](tools/inject.py): Targeted error injection with call chain and predicates [Examples](tools/inject_example.txt).
+- tools/[killsnoop](tools/killsnoop.py): Trace signals issued by the kill() syscall. [Examples](tools/killsnoop_example.txt).
+- tools/[llcstat](tools/llcstat.py): Summarize CPU cache references and misses by process. [Examples](tools/llcstat_example.txt).
+- tools/[mdflush](tools/mdflush.py): Trace md flush events. [Examples](tools/mdflush_example.txt).
+- tools/[mysqld_qslower](tools/mysqld_qslower.py): Trace MySQL server queries slower than a threshold. [Examples](tools/mysqld_qslower_example.txt).
+- tools/[memleak](tools/memleak.py): Display outstanding memory allocations to find memory leaks. [Examples](tools/memleak_example.txt).
+- tools/[nfsslower](tools/nfsslower.py): Trace slow NFS operations. [Examples](tools/nfsslower_example.txt).
+- tools/[nfsdist](tools/nfsdist.py): Summarize NFS operation latency distribution as a histogram. [Examples](tools/nfsdist_example.txt).
+- tools/[offcputime](tools/offcputime.py): Summarize off-CPU time by kernel stack trace. [Examples](tools/offcputime_example.txt).
+- tools/[offwaketime](tools/offwaketime.py): Summarize blocked time by kernel off-CPU stack and waker stack. [Examples](tools/offwaketime_example.txt).
+- tools/[oomkill](tools/oomkill.py): Trace the out-of-memory (OOM) killer. [Examples](tools/oomkill_example.txt).
+- tools/[opensnoop](tools/opensnoop.py): Trace open() syscalls. [Examples](tools/opensnoop_example.txt).
+- tools/[pidpersec](tools/pidpersec.py): Count new processes (via fork). [Examples](tools/pidpersec_example.txt).
+- tools/[profile](tools/profile.py): Profile CPU usage by sampling stack traces at a timed interval. [Examples](tools/profile_example.txt).
+- tools/[reset-trace](tools/reset-trace.sh): Reset the state of tracing. Maintenance tool only. [Examples](tools/reset-trace_example.txt).
+- tools/[runqlat](tools/runqlat.py): Run queue (scheduler) latency as a histogram. [Examples](tools/runqlat_example.txt).
+- tools/[runqlen](tools/runqlen.py): Run queue length as a histogram. [Examples](tools/runqlen_example.txt).
+- tools/[runqslower](tools/runqslower.py): Trace long process scheduling delays. [Examples](tools/runqslower_example.txt).
+- tools/[slabratetop](tools/slabratetop.py): Kernel SLAB/SLUB memory cache allocation rate top. [Examples](tools/slabratetop_example.txt).
+- tools/[softirqs](tools/softirqs.py):  Measure soft IRQ (soft interrupt) event time. [Examples](tools/softirqs_example.txt).
+- tools/[solisten](tools/solisten.py): Trace TCP socket listen. [Examples](tools/solisten_example.txt).
+- tools/[sslsniff](tools/sslsniff.py): Sniff OpenSSL written and read data. [Examples](tools/sslsniff_example.txt).
+- tools/[stackcount](tools/stackcount.py): Count kernel function calls and their stack traces. [Examples](tools/stackcount_example.txt).
+- tools/[syncsnoop](tools/syncsnoop.py): Trace sync() syscall. [Examples](tools/syncsnoop_example.txt).
+- tools/[syscount](tools/syscount.py): Summarize syscall counts and latencies. [Examples](tools/syscount_example.txt).
+- tools/[tcpaccept](tools/tcpaccept.py): Trace TCP passive connections (accept()). [Examples](tools/tcpaccept_example.txt).
+- tools/[tcpconnect](tools/tcpconnect.py): Trace TCP active connections (connect()). [Examples](tools/tcpconnect_example.txt).
+- tools/[tcpconnlat](tools/tcpconnlat.py): Trace TCP active connection latency (connect()). [Examples](tools/tcpconnlat_example.txt).
+- tools/[tcpdrop](tools/tcpdrop.py): Trace kernel-based TCP packet drops with details. [Examples](tools/tcpdrop_example.txt).
+- tools/[tcplife](tools/tcplife.py): Trace TCP sessions and summarize lifespan. [Examples](tools/tcplife_example.txt).
+- tools/[tcpretrans](tools/tcpretrans.py): Trace TCP retransmits and TLPs. [Examples](tools/tcpretrans_example.txt).
+- tools/[tcpstates](tools/tcpstates.py): Trace TCP session state changes with durations. [Examples](tools/tcpstates_example.txt).
+- tools/[tcpsubnet](tools/tcpsubnet.py): Summarize and aggregate TCP send by subnet. [Examples](tools/tcpsubnet_example.txt).
+- tools/[tcptop](tools/tcptop.py): Summarize TCP send/recv throughput by host. Top for TCP. [Examples](tools/tcptop_example.txt).
+- tools/[tcptracer](tools/tcptracer.py): Trace TCP established connections (connect(), accept(), close()). [Examples](tools/tcptracer_example.txt).
+- tools/[tplist](tools/tplist.py): Display kernel tracepoints or USDT probes and their formats. [Examples](tools/tplist_example.txt).
+- tools/[trace](tools/trace.py): Trace arbitrary functions, with filters. [Examples](tools/trace_example.txt).
+- tools/[ttysnoop](tools/ttysnoop.py): Watch live output from a tty or pts device. [Examples](tools/ttysnoop_example.txt).
+- tools/[ucalls](tools/lib/ucalls.py): Summarize method calls or Linux syscalls in high-level languages. [Examples](tools/lib/ucalls_example.txt).
+- tools/[uflow](tools/lib/uflow.py): Print a method flow graph in high-level languages. [Examples](tools/lib/uflow_example.txt).
+- tools/[ugc](tools/lib/ugc.py): Trace garbage collection events in high-level languages. [Examples](tools/lib/ugc_example.txt).
+- tools/[uobjnew](tools/lib/uobjnew.py): Summarize object allocation events by object type and number of bytes allocated. [Examples](tools/lib/uobjnew_example.txt).
+- tools/[ustat](tools/lib/ustat.py): Collect events such as GCs, thread creations, object allocations, exceptions and more in high-level languages. [Examples](tools/lib/ustat_example.txt).
+- tools/[uthreads](tools/lib/uthreads.py): Trace thread creation events in Java and raw pthreads. [Examples](tools/lib/uthreads_example.txt).
+- tools/[vfscount](tools/vfscount.py) tools/[vfscount.c](tools/vfscount.c): Count VFS calls. [Examples](tools/vfscount_example.txt).
+- tools/[vfsstat](tools/vfsstat.py) tools/[vfsstat.c](tools/vfsstat.c): Count some VFS calls, with column output. [Examples](tools/vfsstat_example.txt).
+- tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt).
+- tools/[xfsdist](tools/xfsdist.py): Summarize XFS operation latency distribution as a histogram. [Examples](tools/xfsdist_example.txt).
+- tools/[xfsslower](tools/xfsslower.py): Trace slow XFS operations. [Examples](tools/xfsslower_example.txt).
+- tools/[zfsdist](tools/zfsdist.py): Summarize ZFS operation latency distribution as a histogram. [Examples](tools/zfsdist_example.txt).
+- tools/[zfsslower](tools/zfsslower.py): Trace slow ZFS operations. [Examples](tools/zfsslower_example.txt).
+
+### Networking
+
+Examples:
+
+- examples/networking/[distributed_bridge/](examples/networking/distributed_bridge): Distributed bridge example.
+- examples/networking/[http_filter/](examples/networking/http_filter): Simple HTTP filter example.
+- examples/networking/[simple_tc.py](examples/networking/simple_tc.py): Simple traffic control example.
+- examples/networking/[simulation.py](examples/networking/simulation.py): Simulation helper.
+- examples/networking/neighbor_sharing/[tc_neighbor_sharing.py](examples/networking/neighbor_sharing/tc_neighbor_sharing.py) examples/networking/neighbor_sharing/[tc_neighbor_sharing.c](examples/networking/neighbor_sharing/tc_neighbor_sharing.c): Per-IP classification and rate limiting.
+- examples/networking/[tunnel_monitor/](examples/networking/tunnel_monitor): Efficiently monitor traffic flows. [Example video](https://www.youtube.com/watch?v=yYy3Cwce02k).
+- examples/networking/vlan_learning/[vlan_learning.py](examples/networking/vlan_learning/vlan_learning.py) examples/[vlan_learning.c](examples/networking/vlan_learning/vlan_learning.c): Demux Ethernet traffic into worker veth+namespaces.
+
+### BPF Introspection:
+
+Tools that help to introspect BPF programs.
+
+- introspection/[bps.c](introspection/bps.c): List all BPF programs loaded into the kernel. 'ps' for BPF programs. [Examples](introspection/bps_example.txt).
+
+## Motivation
+
+BPF guarantees that the programs loaded into the kernel cannot crash, and
+cannot run forever, yet BPF is general-purpose enough to perform many
+arbitrary types of computation. Currently, it is possible to write a program in
+C that will compile into a valid BPF program, yet it is vastly easier to
+write a C program that will compile into invalid BPF (C is like that). The user
+won't know until trying to run the program whether it was valid or not.
+
+With a BPF-specific frontend, one should be able to write in a language and
+receive feedback from the compiler on the validity as it pertains to a BPF
+backend. This toolkit aims to provide a frontend that can only create valid BPF
+programs while still harnessing its full flexibility.
+
+Furthermore, current integrations with BPF have a kludgy workflow, sometimes
+involving compiling directly in a linux kernel source tree. This toolchain aims
+to minimize the time that a developer spends getting BPF compiled, and instead
+focus on the applications that can be written and the problems that can be
+solved with BPF.
+
+The features of this toolkit include:
+* End-to-end BPF workflow in a shared library
+  * A modified C language for BPF backends
+  * Integration with llvm-bpf backend for JIT
+  * Dynamic (un)loading of JITed programs
+  * Support for BPF kernel hooks: socket filters, tc classifiers,
+      tc actions, and kprobes
+* Bindings for Python
+* Examples for socket filters, tc classifiers, and kprobes
+* Self-contained tools for tracing a running system
+
+In the future, more bindings besides python will likely be supported. Feel free
+to add support for the language of your choice and send a pull request!
+
+## Tutorials
+
+- [docs/tutorial.md](docs/tutorial.md): Using bcc tools to solve performance, troubleshooting, and networking issues.
+- [docs/tutorial_bcc_python_developer.md](docs/tutorial_bcc_python_developer.md): Developing new bcc programs using the Python interface.
+
+### Networking
+
+At Red Hat Summit 2015, BCC was presented as part of a [session on BPF](http://www.devnation.org/#7784f1f7513e8542e4db519e79ff5eec).
+A multi-host vxlan environment is simulated and a BPF program used to monitor
+one of the physical interfaces. The BPF program keeps statistics on the inner
+and outer IP addresses traversing the interface, and the userspace component
+turns those statistics into a graph showing the traffic distribution at
+multiple granularities. See the code [here](examples/networking/tunnel_monitor).
+
+[![Screenshot](http://img.youtube.com/vi/yYy3Cwce02k/0.jpg)](https://youtu.be/yYy3Cwce02k)
+
+## Contributing
+
+Already pumped up to commit some code? Here are some resources to join the
+discussions in the [IOVisor](https://www.iovisor.org/) community and see
+what you want to work on.
+
+* _Mailing List:_ http://lists.iovisor.org/mailman/listinfo/iovisor-dev
+* _IRC:_ #iovisor at irc.oftc.net
+* _BCC Issue Tracker:_ [Github Issues](https://github.com/iovisor/bcc/issues)
+* _A guide for contributing scripts:_ [CONTRIBUTING-SCRIPTS.md](CONTRIBUTING-SCRIPTS.md)
+
+## External links
+
+Looking for more information on BCC and how it's being used? You can find links to other BCC content on the web in [LINKS.md](LINKS.md).
diff --git a/SPECS/Dockerfile.fedora b/SPECS/Dockerfile.fedora
new file mode 100644
index 0000000..2ecbb0b
--- /dev/null
+++ b/SPECS/Dockerfile.fedora
@@ -0,0 +1,20 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+FROM fedora:rawhide
+
+MAINTAINER Brenden Blanco <bblanco@plumgrid.com>
+
+RUN dnf -y install bison cmake flex gcc gcc-c++ git libxml2-devel make python2-devel rpm-build wget zlib-devel
+
+WORKDIR /root
+
+RUN wget http://llvm.org/releases/3.7.1/{cfe,llvm}-3.7.1.src.tar.xz
+
+RUN tar -xf llvm-3.7.1.src.tar.xz && mkdir llvm-3.7.1.src/tools/clang && tar -xf cfe-3.7.1.src.tar.xz -C llvm-3.7.1.src/tools/clang --strip 1 && mkdir llvm-3.7.1.src/build
+RUN cd llvm-3.7.1.src/build && cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD="X86;BPF" -DCMAKE_INSTALL_PREFIX=/usr
+RUN cd llvm-3.7.1.src/build && make -j8
+
+COPY . bcc
+WORKDIR /root/bcc
+RUN PATH=/root/llvm-3.7.1.src/build/bin:$PATH ./scripts/build-rpm.sh
diff --git a/SPECS/bcc+clang.spec b/SPECS/bcc+clang.spec
new file mode 100644
index 0000000..bbb6dd4
--- /dev/null
+++ b/SPECS/bcc+clang.spec
@@ -0,0 +1,101 @@
+%define debug_package %{nil}
+%define llvmver 3.7.1
+
+Name:           bcc
+Version:        @REVISION@
+Release:        @GIT_REV_COUNT@
+Summary:        BPF Compiler Collection (BCC)
+
+Group:          Development/Languages
+License:        ASL 2.0
+URL:            https://github.com/iovisor/bcc
+Source0:        https://github.com/iovisor/bcc/archive/v%{version}.tar.gz
+Source1:        http://llvm.org/releases/%{llvmver}/llvm-%{llvmver}.src.tar.xz
+Source2:        http://llvm.org/releases/%{llvmver}/cfe-%{llvmver}.src.tar.xz
+
+BuildArch:      x86_64
+BuildRequires:  bison, cmake >= 2.8.7, flex, gcc, gcc-c++, libxml2-devel, python2-devel, elfutils-libelf-devel-static
+
+%description
+Python bindings for BPF Compiler Collection (BCC). Control a BPF program from
+userspace.
+
+
+%prep
+%setup -T -b 1 -n llvm-%{llvmver}.src
+mkdir tools/clang
+tar -xvvJf %{_sourcedir}/cfe-%{llvmver}.src.tar.xz -C tools/clang --strip 1
+%setup -D -n bcc
+
+%build
+
+export LD_LIBRARY_PATH="%{_builddir}/usr/lib64"
+export PATH="%{_builddir}/usr/bin":$PATH
+
+# build llvm
+pushd %{_builddir}/llvm-%{llvmver}.src
+mkdir build
+cd build
+cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD="X86;BPF" -DCMAKE_INSTALL_PREFIX=/usr
+make %{?_smp_mflags}
+make install DESTDIR="%{_builddir}"
+popd
+
+mkdir build
+pushd build
+cmake .. -DREVISION_LAST=%{version} -DREVISION=%{version} -DCMAKE_INSTALL_PREFIX=/usr
+make %{?_smp_mflags}
+popd
+
+%install
+pushd build
+make install/strip DESTDIR=%{buildroot}
+
+%changelog
+* Fri Jul 03 2015 Brenden Blanco <bblanco@plumgrid.com> - 0.1.1-2
+- Initial RPM Release
+
+%package -n libbcc
+Summary: Shared Library for BPF Compiler Collection (BCC)
+Requires: elfutils-libelf
+%description -n libbcc
+Shared Library for BPF Compiler Collection (BCC)
+
+%package -n libbcc-examples
+Summary: Examples for BPF Compiler Collection (BCC)
+Requires: libbcc
+%description -n libbcc-examples
+Examples for BPF Compiler Collection (BCC)
+
+%package -n python-bcc
+Summary: Python bindings for BPF Compiler Collection (BCC)
+Requires: libbcc
+%description -n python-bcc
+Python bindings for BPF Compiler Collection (BCC)
+
+%package -n bcc-tools
+Summary: Command line tools for BPF Compiler Collection (BCC)
+Requires: python-bcc
+%description -n bcc-tools
+Command line tools for BPF Compiler Collection (BCC)
+
+%files -n python-bcc
+%{python_sitelib}/bcc*
+
+%files -n libbcc
+/usr/lib64/*
+/usr/include/bcc/*
+
+%files -n libbcc-examples
+/usr/share/bcc/examples/*
+%exclude /usr/share/bcc/examples/*.pyc
+%exclude /usr/share/bcc/examples/*.pyo
+%exclude /usr/share/bcc/examples/*/*.pyc
+%exclude /usr/share/bcc/examples/*/*.pyo
+%exclude /usr/share/bcc/examples/*/*/*.pyc
+%exclude /usr/share/bcc/examples/*/*/*.pyo
+
+%files -n bcc-tools
+/usr/share/bcc/introspection/*
+/usr/share/bcc/tools/*
+/usr/share/bcc/man/*
diff --git a/SPECS/bcc.spec b/SPECS/bcc.spec
new file mode 100644
index 0000000..f74bb61
--- /dev/null
+++ b/SPECS/bcc.spec
@@ -0,0 +1,195 @@
+%bcond_with local_clang_static
+# LuaJIT is not available for some architectures
+%ifarch ppc64 aarch64 ppc64le
+%{!?with_lua: %global with_lua 0}
+%else
+%{!?with_lua: %global with_lua 1}
+%endif
+
+# use --with shared to only link against libLLVM.so
+%if 0%{?fedora} >= 28 || 0%{?rhel} > 7
+%bcond_without llvm_shared
+%else
+%bcond_with llvm_shared
+%endif
+
+# Build python3 support for distributions that have it
+%if 0%{?fedora} >= 28 || 0%{?rhel} > 7
+%bcond_without python3
+%else
+%bcond_with python3
+%endif
+
+%if %{with python3}
+%global __python %{__python3}
+%global python_bcc python3-bcc
+%global python_cmds python2;python3
+%else
+%global __python %{__python2}
+%global python_bcc python2-bcc
+%global python_cmds python2
+%endif
+
+%define debug_package %{nil}
+
+Name:           bcc
+Version:        @REVISION@
+Release:        @GIT_REV_COUNT@
+Summary:        BPF Compiler Collection (BCC)
+
+Group:          Development/Languages
+License:        ASL 2.0
+URL:            https://github.com/iovisor/bcc
+Source0:        bcc.tar.gz
+
+ExclusiveArch: x86_64 ppc64 aarch64 ppc64le
+BuildRequires: bison cmake >= 2.8.7 flex make
+BuildRequires: gcc gcc-c++ python2-devel elfutils-libelf-devel-static
+%if %{with python3}
+BuildRequires: python3-devel
+%endif
+%if %{with_lua}
+BuildRequires: luajit luajit-devel
+%endif
+%if %{without local_clang_static}
+BuildRequires: llvm-devel
+BuildRequires: clang-devel
+%if %{without llvm_shared}
+BuildRequires: llvm-static
+%endif
+%endif
+BuildRequires: pkgconfig ncurses-devel
+
+%description
+Python bindings for BPF Compiler Collection (BCC). Control a BPF program from
+userspace.
+
+%if %{with_lua}
+%global lua_include `pkg-config --variable=includedir luajit`
+%global lua_libs `pkg-config --variable=libdir luajit`/lib`pkg-config --variable=libname luajit`.so
+%global lua_config -DLUAJIT_INCLUDE_DIR=%{lua_include} -DLUAJIT_LIBRARIES=%{lua_libs}
+%endif
+
+%prep
+%setup -q -n bcc
+
+%build
+
+mkdir build
+pushd build
+cmake .. -DREVISION_LAST=%{version} -DREVISION=%{version} \
+      -DCMAKE_INSTALL_PREFIX=/usr \
+      %{?lua_config} \
+      -DPYTHON_CMD="%{python_cmds}" \
+      %{?with_llvm_shared:-DENABLE_LLVM_SHARED=1}
+make %{?_smp_mflags}
+popd
+
+%install
+pushd build
+make install/strip DESTDIR=%{buildroot}
+# mangle shebangs
+find %{buildroot}/usr/share/bcc/{tools,examples} -type f -exec \
+    sed -i -e '1 s|^#!/usr/bin/python$|#!'%{__python}'|' \
+           -e '1 s|^#!/usr/bin/env python$|#!'%{__python}'|' {} \;
+
+%package -n libbcc
+Summary: Shared Library for BPF Compiler Collection (BCC)
+Requires: elfutils-libelf
+%description -n libbcc
+Shared Library for BPF Compiler Collection (BCC)
+
+%package -n python2-bcc
+Summary: Python2 bindings for BPF Compiler Collection (BCC)
+Requires: libbcc = %{version}-%{release}
+%{?python_provide:%python_provide python2-bcc}
+%description -n python2-bcc
+Python bindings for BPF Compiler Collection (BCC)
+
+%if %{with python3}
+%package -n python3-bcc
+Summary: Python3 bindings for BPF Compiler Collection (BCC)
+Requires: libbcc = %{version}-%{release}
+%{?python_provide:%python_provide python3-bcc}
+%description -n python3-bcc
+Python bindings for BPF Compiler Collection (BCC)
+%endif
+
+%if %{with_lua}
+%package -n bcc-lua
+Summary: Standalone tool to run BCC tracers written in Lua
+Requires: libbcc = %{version}-%{release}
+%description -n bcc-lua
+Standalone tool to run BCC tracers written in Lua
+%endif
+
+%package -n libbcc-examples
+Summary: Examples for BPF Compiler Collection (BCC)
+Requires: %{python_bcc} = %{version}-%{release}
+%if %{with_lua}
+Requires: bcc-lua = %{version}-%{release}
+%endif
+%description -n libbcc-examples
+Examples for BPF Compiler Collection (BCC)
+
+%package -n bcc-tools
+Summary: Command line tools for BPF Compiler Collection (BCC)
+Requires: %{python_bcc} = %{version}-%{release}
+%description -n bcc-tools
+Command line tools for BPF Compiler Collection (BCC)
+
+%files -n libbcc
+/usr/lib64/*
+/usr/include/bcc/*
+
+%files -n python2-bcc
+%{python2_sitelib}/bcc*
+
+%if %{with python3}
+%files -n python3-bcc
+%{python3_sitelib}/bcc*
+%endif
+
+%if %{with_lua}
+%files -n bcc-lua
+/usr/bin/bcc-lua
+%endif
+
+%files -n libbcc-examples
+/usr/share/bcc/examples/*
+%exclude /usr/share/bcc/examples/*.pyc
+%exclude /usr/share/bcc/examples/*.pyo
+%exclude /usr/share/bcc/examples/*/*.pyc
+%exclude /usr/share/bcc/examples/*/*.pyo
+%exclude /usr/share/bcc/examples/*/*/*.pyc
+%exclude /usr/share/bcc/examples/*/*/*.pyo
+
+%files -n bcc-tools
+/usr/share/bcc/introspection/*
+/usr/share/bcc/tools/*
+/usr/share/bcc/man/*
+
+%post -n libbcc -p /sbin/ldconfig
+
+%postun -n libbcc -p /sbin/ldconfig
+
+%changelog
+* Wed Jul 18 2018 Brenden Blanco <bblanco@gmail.com> - 0.6.0-1
+- Make python3 the default when possible
+- Add with llvm_shared conditional
+- Add python2/python3 package targets
+
+* Mon Nov 21 2016 William Cohen <wcohen@redhat.com> - 0.2.0-1
+- Revise bcc.spec to address rpmlint issues and build properly in Fedora koji.
+
+* Mon Apr 04 2016 Vicent Marti <vicent@github.com> - 0.1.4-1
+- Add bcc-lua package
+
+* Sun Nov 29 2015 Brenden Blanco <bblanco@plumgrid.com> - 0.1.3-1
+- Add bcc-tools package
+
+* Mon Oct 12 2015 Brenden Blanco <bblanco@plumgrid.com> - 0.1.2-1
+- Add better version numbering into libbcc.so
+
+* Fri Jul 03 2015 Brenden Blanco <bblanco@plumgrid.com> - 0.1.1-2
+- Initial RPM Release
diff --git a/cmake/FindCompilerFlag.cmake b/cmake/FindCompilerFlag.cmake
new file mode 100644
index 0000000..31ac82d
--- /dev/null
+++ b/cmake/FindCompilerFlag.cmake
@@ -0,0 +1,17 @@
+# Copyright (c) 2017 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+	set(COMPILER_NOPIE_FLAG "-nopie")
+else()
+	set(_backup_c_flags "${CMAKE_REQUIRED_FLAGS}")
+	set(CMAKE_REQUIRED_FLAGS "-no-pie")
+	CHECK_CXX_SOURCE_COMPILES("int main() {return 0;}"
+				  HAVE_NO_PIE_FLAG)
+	if (HAVE_NO_PIE_FLAG)
+		set(COMPILER_NOPIE_FLAG "-no-pie")
+	else()
+		set(COMPILER_NOPIE_FLAG "")
+	endif()
+	set(CMAKE_REQUIRED_FLAGS "${_backup_c_flags}")
+endif()
diff --git a/cmake/FindLibElf.cmake b/cmake/FindLibElf.cmake
new file mode 100644
index 0000000..8968b3e
--- /dev/null
+++ b/cmake/FindLibElf.cmake
@@ -0,0 +1,64 @@
+# - Try to find libelf
+# Once done this will define
+#
+#  LIBELF_FOUND - system has libelf
+#  LIBELF_INCLUDE_DIRS - the libelf include directory
+#  LIBELF_LIBRARIES - Link these to use libelf
+#  LIBELF_DEFINITIONS - Compiler switches required for using libelf
+#
+#  Copyright (c) 2008 Bernhard Walle <bernhard.walle@gmx.de>
+#
+#  Redistribution and use is allowed according to the terms of the New
+#  BSD license.
+#  For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+#
+
+
+if (LIBELF_LIBRARIES AND LIBELF_INCLUDE_DIRS)
+  set (LibElf_FIND_QUIETLY TRUE)
+endif (LIBELF_LIBRARIES AND LIBELF_INCLUDE_DIRS)
+
+find_path (LIBELF_INCLUDE_DIRS
+  NAMES
+    libelf.h
+  PATHS
+    /usr/include
+    /usr/include/libelf
+    /usr/local/include
+    /usr/local/include/libelf
+    /opt/local/include
+    /opt/local/include/libelf
+    /sw/include
+    /sw/include/libelf
+    ENV CPATH)
+
+find_library (LIBELF_LIBRARIES
+  NAMES
+    elf
+  PATHS
+    /usr/lib
+    /usr/local/lib
+    /opt/local/lib
+    /sw/lib
+    ENV LIBRARY_PATH
+    ENV LD_LIBRARY_PATH)
+
+include (FindPackageHandleStandardArgs)
+
+
+# handle the QUIETLY and REQUIRED arguments and set LIBELF_FOUND to TRUE if all listed variables are TRUE
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibElf DEFAULT_MSG
+  LIBELF_LIBRARIES
+  LIBELF_INCLUDE_DIRS)
+
+SET(CMAKE_REQUIRED_LIBRARIES elf)
+INCLUDE(CheckCXXSourceCompiles)
+CHECK_CXX_SOURCE_COMPILES("#include <libelf.h>
+int main() {
+  Elf *e = (Elf*)0;
+  size_t sz;
+  elf_getshdrstrndx(e, &sz);
+  return 0;
+}" ELF_GETSHDRSTRNDX)
+
+mark_as_advanced(LIBELF_INCLUDE_DIRS LIBELF_LIBRARIES ELF_GETSHDRSTRNDX)
diff --git a/cmake/FindLuaJIT.cmake b/cmake/FindLuaJIT.cmake
new file mode 100644
index 0000000..5a2bcf8
--- /dev/null
+++ b/cmake/FindLuaJIT.cmake
@@ -0,0 +1,77 @@
+# Locate Lua library
+# This module defines
+#  LUAJIT_FOUND, if false, do not try to link to Lua
+#  LUAJIT_LIBRARIES
+#  LUAJIT_INCLUDE_DIR, where to find lua.h
+#
+# Note that the expected include convention is
+#  #include "lua.h"
+# and not
+#  #include <lua/lua.h>
+# This is because the lua location is not standardized and may exist
+# in locations other than lua/
+
+#=============================================================================
+# Copyright 2007-2009 Kitware, Inc.
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+#
+# ################
+# 2010 - modified for cronkite to find luajit instead of lua, as it was before.
+#
+
+FIND_PATH(LUAJIT_INCLUDE_DIR lua.h
+  HINTS
+  $ENV{LUAJIT_DIR}
+  PATH_SUFFIXES luajit-2.0 luajit2.0 luajit luajit-2.1
+  PATHS
+  ~/Library/Frameworks
+  /Library/Frameworks
+  /usr/local
+  /usr
+  /sw # Fink
+  /opt/local # DarwinPorts
+  /opt/csw # Blastwave
+  /opt
+)
+
+FIND_LIBRARY(LUAJIT_LIBRARY
+  NAMES libluajit-51.a libluajit-5.1.a libluajit.a libluajit-5.1.so
+  HINTS
+  $ENV{LUAJIT_DIR}
+  PATH_SUFFIXES lib64 lib
+  PATHS
+  ~/Library/Frameworks
+  /Library/Frameworks
+  /usr/local
+  /usr
+  /sw
+  /opt/local
+  /opt/csw
+  /opt
+)
+
+IF(LUAJIT_LIBRARY)
+  IF(UNIX AND NOT APPLE)
+    FIND_LIBRARY(LUAJIT_MATH_LIBRARY m)
+	FIND_LIBRARY(LUAJIT_DL_LIBRARY dl)
+	SET( LUAJIT_LIBRARIES "${LUAJIT_LIBRARY};${LUAJIT_DL_LIBRARY};${LUAJIT_MATH_LIBRARY}" CACHE STRING "Lua Libraries")
+  ELSE(UNIX AND NOT APPLE)
+    SET( LUAJIT_LIBRARIES "${LUAJIT_LIBRARY}" CACHE STRING "Lua Libraries")
+  ENDIF(UNIX AND NOT APPLE)
+ENDIF(LUAJIT_LIBRARY)
+
+INCLUDE(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set LUAJIT_FOUND to TRUE if
+# all listed variables are TRUE
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(LuaJIT  DEFAULT_MSG  LUAJIT_LIBRARIES LUAJIT_INCLUDE_DIR)
+
+MARK_AS_ADVANCED(LUAJIT_INCLUDE_DIR LUAJIT_LIBRARIES LUAJIT_LIBRARY LUAJIT_MATH_LIBRARY)
diff --git a/cmake/GetGitRevisionDescription.cmake b/cmake/GetGitRevisionDescription.cmake
new file mode 100644
index 0000000..1bf0230
--- /dev/null
+++ b/cmake/GetGitRevisionDescription.cmake
@@ -0,0 +1,123 @@
+# - Returns a version string from Git
+#
+# These functions force a re-configure on each git commit so that you can
+# trust the values of the variables in your build system.
+#
+#  get_git_head_revision(<refspecvar> <hashvar> [<additional arguments to git describe> ...])
+#
+# Returns the refspec and sha hash of the current head revision
+#
+#  git_describe(<var> [<additional arguments to git describe> ...])
+#
+# Returns the results of git describe on the source tree, adjusting
+# the output so that it tests false if an error occurs.
+#
+#  git_get_exact_tag(<var> [<additional arguments to git describe> ...])
+#
+# Returns the results of git describe --exact-match on the source tree,
+# adjusting the output so that it tests false if there was no exact
+# matching tag.
+#
+# Requires CMake 2.6 or newer (uses the 'function' command)
+#
+# Original Author:
+# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
+# http://academic.cleardefinition.com
+# Iowa State University HCI Graduate Program/VRAC
+#
+# Copyright Iowa State University 2009-2010.
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+
+if(__get_git_revision_description)
+	return()
+endif()
+set(__get_git_revision_description YES)
+
+# We must run the following at "include" time, not at function call time,
+# to find the path to this module rather than the path to a calling list file
+get_filename_component(_gitdescmoddir ${CMAKE_CURRENT_LIST_FILE} PATH)
+
+function(get_git_head_revision _refspecvar _hashvar)
+	set(GIT_PARENT_DIR "${CMAKE_SOURCE_DIR}")
+	set(GIT_DIR "${GIT_PARENT_DIR}/.git")
+	while(NOT EXISTS "${GIT_DIR}")	# .git dir not found, search parent directories
+		set(GIT_PREVIOUS_PARENT "${GIT_PARENT_DIR}")
+		get_filename_component(GIT_PARENT_DIR ${GIT_PARENT_DIR} PATH)
+		if(GIT_PARENT_DIR STREQUAL GIT_PREVIOUS_PARENT)
+			# We have reached the root directory, we are not in git
+			set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
+			set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE)
+			return()
+		endif()
+		set(GIT_DIR "${GIT_PARENT_DIR}/.git")
+	endwhile()
+	set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data")
+	if(NOT EXISTS "${GIT_DATA}")
+		file(MAKE_DIRECTORY "${GIT_DATA}")
+	endif()
+
+	if(NOT EXISTS "${GIT_DIR}/HEAD")
+		return()
+	endif()
+	set(HEAD_FILE "${GIT_DATA}/HEAD")
+	configure_file("${GIT_DIR}/HEAD" "${HEAD_FILE}" COPYONLY)
+
+	configure_file("${_gitdescmoddir}/GetGitRevisionDescription.cmake.in"
+		"${GIT_DATA}/grabRef.cmake"
+		@ONLY)
+	include("${GIT_DATA}/grabRef.cmake")
+
+	set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE)
+	set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE)
+endfunction()
+
+function(git_describe _var)
+	if(NOT GIT_FOUND)
+		find_package(Git QUIET)
+	endif()
+	get_git_head_revision(refspec hash)
+	if(NOT GIT_FOUND)
+		set(${_var} "GIT-NOTFOUND" PARENT_SCOPE)
+		return()
+	endif()
+	if(NOT hash)
+		set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE)
+		return()
+	endif()
+
+	# TODO sanitize
+	#if(("${ARGN}" MATCHES "&&") OR
+	#	(ARGN MATCHES "||") OR
+	#	(ARGN MATCHES "\\;"))
+	#	message("Please report the following error to the project!")
+	#	message(FATAL_ERROR "Looks like someone's doing something nefarious with git_describe! Passed arguments ${ARGN}")
+	#endif()
+
+	#message(STATUS "Arguments to execute_process: ${ARGN}")
+
+	execute_process(COMMAND
+		"${GIT_EXECUTABLE}"
+		describe
+		${hash}
+		${ARGN}
+		WORKING_DIRECTORY
+		"${CMAKE_SOURCE_DIR}"
+		RESULT_VARIABLE
+		res
+		OUTPUT_VARIABLE
+		out
+		ERROR_QUIET
+		OUTPUT_STRIP_TRAILING_WHITESPACE)
+	if(NOT res EQUAL 0)
+		set(out "${out}-${res}-NOTFOUND")
+	endif()
+
+	set(${_var} "${out}" PARENT_SCOPE)
+endfunction()
+
+function(git_get_exact_tag _var)
+	git_describe(out --exact-match ${ARGN})
+	set(${_var} "${out}" PARENT_SCOPE)
+endfunction()
diff --git a/cmake/GetGitRevisionDescription.cmake.in b/cmake/GetGitRevisionDescription.cmake.in
new file mode 100644
index 0000000..6faa374
--- /dev/null
+++ b/cmake/GetGitRevisionDescription.cmake.in
@@ -0,0 +1,38 @@
+#
+# Internal file for GetGitRevisionDescription.cmake
+#
+# Requires CMake 2.6 or newer (uses the 'function' command)
+#
+# Original Author:
+# 2009-2010 Ryan Pavlik <rpavlik@iastate.edu> <abiryan@ryand.net>
+# http://academic.cleardefinition.com
+# Iowa State University HCI Graduate Program/VRAC
+#
+# Copyright Iowa State University 2009-2010.
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+
+set(HEAD_HASH)
+
+file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024)
+
+string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS)
+if(HEAD_CONTENTS MATCHES "ref")
+	# named branch
+	string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}")
+	if(EXISTS "@GIT_DIR@/${HEAD_REF}")
+		configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
+	elseif(EXISTS "@GIT_DIR@/logs/${HEAD_REF}")
+		configure_file("@GIT_DIR@/logs/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY)
+		set(HEAD_HASH "${HEAD_REF}")
+	endif()
+else()
+	# detached HEAD
+	configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY)
+endif()
+
+if(NOT HEAD_HASH)
+	file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024)
+	string(STRIP "${HEAD_HASH}" HEAD_HASH)
+endif()
diff --git a/cmake/bump_version.cmake b/cmake/bump_version.cmake
new file mode 100644
index 0000000..ac9dda3
--- /dev/null
+++ b/cmake/bump_version.cmake
@@ -0,0 +1,10 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+configure_file(SPECS/Dockerfile.el6.in SPECS/Dockerfile.el6 @ONLY)
+configure_file(SPECS/Dockerfile.el7.in SPECS/Dockerfile.el7 @ONLY)
+configure_file(SPECS/Dockerfile.f22.in SPECS/Dockerfile.f22 @ONLY)
+configure_file(SPECS/bcc.el6.spec.in SPECS/bcc.el6.spec @ONLY)
+configure_file(SPECS/bcc.el7.spec.in SPECS/bcc.el7.spec @ONLY)
+configure_file(SPECS/bcc.f22.spec.in SPECS/bcc.f22.spec @ONLY)
+configure_file(scripts/build-deb.sh.in scripts/build-deb.sh @ONLY)
diff --git a/cmake/clang_libs.cmake b/cmake/clang_libs.cmake
new file mode 100644
index 0000000..12aa9fd
--- /dev/null
+++ b/cmake/clang_libs.cmake
@@ -0,0 +1,56 @@
+if(ENABLE_LLVM_SHARED)
+set(llvm_libs "LLVM")
+else()
+set(llvm_raw_libs bitwriter bpfcodegen debuginfodwarf irreader linker
+  mcjit objcarcopts option passes nativecodegen lto)
+list(FIND LLVM_AVAILABLE_LIBS "LLVMCoverage" _llvm_coverage)
+if (${_llvm_coverage} GREATER -1)
+  list(APPEND llvm_raw_libs coverage)
+endif()
+list(FIND LLVM_AVAILABLE_LIBS "LLVMCoroutines" _llvm_coroutines)
+if (${_llvm_coroutines} GREATER -1)
+  list(APPEND llvm_raw_libs coroutines)
+endif()
+if (${LLVM_PACKAGE_VERSION} VERSION_EQUAL 6 OR ${LLVM_PACKAGE_VERSION} VERSION_GREATER 6)
+  list(APPEND llvm_raw_libs bpfasmparser)
+  list(APPEND llvm_raw_libs bpfdisassembler)
+endif()
+llvm_map_components_to_libnames(_llvm_libs ${llvm_raw_libs})
+llvm_expand_dependencies(llvm_libs ${_llvm_libs})
+endif()
+
+# order is important
+set(clang_libs
+  ${libclangFrontend}
+  ${libclangSerialization}
+  ${libclangDriver})
+
+if (${LLVM_PACKAGE_VERSION} VERSION_EQUAL 8 OR ${LLVM_PACKAGE_VERSION} VERSION_GREATER 8)
+  list(APPEND clang_libs ${libclangASTMatchers})
+endif()
+
+list(APPEND clang_libs
+  ${libclangParse}
+  ${libclangSema}
+  ${libclangCodeGen}
+  ${libclangAnalysis}
+  ${libclangRewrite}
+  ${libclangEdit}
+  ${libclangAST}
+  ${libclangLex}
+  ${libclangBasic})
+
+# prune unused llvm static library stuff when linking into the new .so
+set(_exclude_flags)
+foreach(_lib ${clang_libs})
+  get_filename_component(_lib ${_lib} NAME)
+  set(_exclude_flags "${_exclude_flags} -Wl,--exclude-libs=${_lib}")
+endforeach(_lib)
+set(clang_lib_exclude_flags "${_exclude_flags}")
+
+set(_exclude_flags)
+foreach(_lib ${llvm_libs})
+  get_filename_component(_lib ${_lib} NAME)
+  set(_exclude_flags "${_exclude_flags} -Wl,--exclude-libs=lib${_lib}.a")
+endforeach(_lib)
+set(llvm_lib_exclude_flags "${_exclude_flags}")
diff --git a/cmake/static_libstdc++.cmake b/cmake/static_libstdc++.cmake
new file mode 100644
index 0000000..3c8ac17
--- /dev/null
+++ b/cmake/static_libstdc++.cmake
@@ -0,0 +1,15 @@
+# only turn on static-libstdc++ if also linking statically against clang
+string(REGEX MATCH ".*[.]a$" LIBCLANG_ISSTATIC "${libclangBasic}")
+# if gcc 4.9 or higher is used, static libstdc++ is a good option
+if (CMAKE_COMPILER_IS_GNUCC AND LIBCLANG_ISSTATIC)
+  execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+  if (GCC_VERSION VERSION_GREATER 4.9 OR GCC_VERSION VERSION_EQUAL 4.9)
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -print-libgcc-file-name OUTPUT_VARIABLE GCC_LIB)
+    get_filename_component(GCC_DIR "${GCC_LIB}" DIRECTORY)
+    find_library(GCC_LIBSTDCPP libstdc++.a PATHS "${GCC_DIR}" NO_DEFAULT_PATH)
+    if (GCC_LIBSTDCPP)
+      message(STATUS "Using static-libstdc++")
+      set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libstdc++")
+    endif()
+  endif()
+endif()
diff --git a/cmake/version.cmake b/cmake/version.cmake
new file mode 100644
index 0000000..fb00408
--- /dev/null
+++ b/cmake/version.cmake
@@ -0,0 +1,27 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+if(NOT REVISION)
+  get_git_head_revision(GIT_REFSPEC GIT_SHA1)
+  string(SUBSTRING "${GIT_SHA1}" 0 8 GIT_SHA1_SHORT)
+  git_describe(GIT_DESCRIPTION)
+  git_describe(GIT_TAG_LAST "--abbrev=0")
+  git_get_exact_tag(GIT_TAG_EXACT)
+  string(SUBSTRING "${GIT_TAG_LAST}-${GIT_SHA1_SHORT}" 1 -1 REVISION)
+  if(GIT_TAG_EXACT)
+    string(SUBSTRING "${GIT_TAG_EXACT}" 1 -1 REVISION)
+    message(STATUS "Currently on Git tag ${GIT_TAG_EXACT}")
+  else ()
+    message(STATUS "Latest recognized Git tag is ${GIT_TAG_LAST}")
+    set(GIT_TAG_EXACT "")
+  endif()
+  message(STATUS "Git HEAD is ${GIT_SHA1}")
+  # rpm/deb packaging uses this, only works on whole tag numbers
+  if(NOT REVISION_LAST)
+    string(SUBSTRING "${GIT_TAG_LAST}" 1 -1 REVISION_LAST)
+  endif()
+else()
+  set(REVISION_LAST "${REVISION}")
+endif()
+
+# strip leading 'v', and make unique for the tag
+message(STATUS "Revision is ${REVISION}")
diff --git a/debian/bcc-lua.install b/debian/bcc-lua.install
new file mode 100644
index 0000000..bcd6d30
--- /dev/null
+++ b/debian/bcc-lua.install
@@ -0,0 +1 @@
+usr/bin/bcc-lua
diff --git a/debian/bcc-tools.install b/debian/bcc-tools.install
new file mode 100644
index 0000000..60d92a5
--- /dev/null
+++ b/debian/bcc-tools.install
@@ -0,0 +1,3 @@
+usr/share/bcc/introspection/*
+usr/share/bcc/tools/*
+usr/share/bcc/man/*
diff --git a/debian/changelog b/debian/changelog
new file mode 100644
index 0000000..1f5be87
--- /dev/null
+++ b/debian/changelog
@@ -0,0 +1,94 @@
+bcc (0.7.0-1) unstable; urgency=low
+
+  * Support for kernel up to 4.18
+
+ -- Brenden Blanco <bblanco@gmail.com>  Tue, 04 Sep 2018 17:00:00 +0000
+
+bcc (0.6.1-1) unstable; urgency=low
+
+  * Build support for Fedora 28 and Ubuntu 18.04
+  * Add option to change license
+  * Optimizations for some uses of bpf_probe_reads
+
+ -- Brenden Blanco <bblanco@gmail.com>  Mon, 23 Jul 2018 17:00:00 +0000
+
+bcc (0.6.0-1) unstable; urgency=low
+
+  * Support for kernel up to 4.17
+  * Many bugfixes
+  * Many new tools
+  * Improved python3 support
+
+ -- Brenden Blanco <bblanco@gmail.com>  Wed, 13 Jun 2018 17:00:00 +0000
+
+bcc (0.5.0-1) unstable; urgency=low
+
+  * Support for USDT in ARM64
+  * Bugfixes for 4.14 in some tools
+  * Fixes for smoke test failures
+  * Runtime memory usage reductions
+
+ -- Brenden Blanco <bblanco@gmail.com>  Wed, 29 Nov 2017 17:00:00 +0000
+
+bcc (0.4.0-1) unstable; urgency=low
+
+  * Bugfixes
+  * Support for kernel up to 4.14
+
+ -- Brenden Blanco <bblanco@gmail.com>  Fri, 20 Oct 2017 17:00:00 +0000
+
+bcc (0.3.0-1) unstable; urgency=low
+
+  * Many bugfixes
+  * Many tools converted to perf ring buffer
+  * New utilities in tools/
+   * capable, cpuunclaimed, dbslower, dbstat, deadlock_detector, llcstat,
+     mountsnoop, runqlen, slabratetop, syscount, tcplife, tcptop, ttysnoop,
+     ucalls, uflow, ugc, uobjnew, ustat, uthreads
+  * New C++ API
+  * Support for kernel up to 4.10
+
+ -- Brenden Blanco <bblanco@gmail.com>  Thu, 09 Mar 2017 19:08:08 +0000
+
+bcc (0.2.0-1) unstable; urgency=low
+
+  * Add many new utilities in tools/
+  * Support for USDT
+  * Support for lua
+  * Many utilities converted to perf ring buffer
+  * Support for tracepoints
+
+ -- Brenden Blanco <bblanco@plumgrid.com>  Thu, 08 Sep 2016 17:05:28 -0700
+
+bcc (0.1.8-1) unstable; urgency=low
+
+  * Add many new utilities in tools/
+   * wakeuptime, offwaketime, argdist, {xfs,zfs,ext4}{slower,dist}, others
+  * Support for bpf_perf_event()
+  * Support for public tables shared between programs
+  * Support for up to 4.4 features
+  * Remove external file dependencies from clang lib
+
+ -- Brenden Blanco <bblanco@plumgrid.com>  Mon, 23 Feb 2016 00:41:00 +0000
+
+bcc (0.1.7-1) unstable; urgency=low
+
+  * Tracing features and bugfixes
+  * Built against LLVM 3.8 HEAD
+
+ -- Brenden Blanco <bblanco@plumgrid.com>  Mon, 12 Oct 2015 16:47:09 +0000
+
+bcc (0.1.6-1) unstable; urgency=low
+
+  * Stability fixes
+  * Improvements to python API
+  * Tracing features
+  * Support for kernel 4.2 features
+
+ -- Brenden Blanco <bblanco@plumgrid.com>  Wed, 02 Sep 2015 16:23:19 +0000
+
+bcc (0.1.5-1) unstable; urgency=low
+
+  * Initial release
+
+ -- Brenden Blanco <bblanco@plumgrid.com>  Mon, 06 Jul 2015 18:04:28 +0000
diff --git a/debian/compat b/debian/compat
new file mode 100644
index 0000000..ec63514
--- /dev/null
+++ b/debian/compat
@@ -0,0 +1 @@
+9
diff --git a/debian/control b/debian/control
new file mode 100644
index 0000000..5143a42
--- /dev/null
+++ b/debian/control
@@ -0,0 +1,47 @@
+Source: bcc
+Maintainer: Brenden Blanco <bblanco@plumgrid.com>
+Section: misc
+Priority: optional
+Standards-Version: 3.9.5
+Build-Depends: debhelper (>= 9), cmake,
+    libllvm3.7 [!arm64] | libllvm3.8 [!arm64] | libllvm6.0,
+    llvm-3.7-dev [!arm64] | llvm-3.8-dev [!arm64] | llvm-6.0-dev,
+    libclang-3.7-dev [!arm64] | libclang-3.8-dev [!arm64] | libclang-6.0-dev,
+    clang-format | clang-format-3.7 [!arm64] | clang-format-3.8 [!arm64] | clang-format-6.0,
+    libelf-dev, bison, flex, libfl-dev, libedit-dev, zlib1g-dev, git,
+    python (>= 2.7), python-netaddr, python-pyroute2, luajit,
+    libluajit-5.1-dev, arping, inetutils-ping | iputils-ping, iperf, netperf,
+    ethtool, devscripts, python3, dh-python
+Homepage: https://github.com/iovisor/bcc
+
+Package: libbcc
+Architecture: all
+Depends: libc6, libstdc++6, libelf1
+Description: Shared Library for BPF Compiler Collection (BCC)
+ Shared Library for BPF Compiler Collection to control BPF programs
+ from userspace.
+
+Package: libbcc-examples
+Architecture: any
+Depends: libbcc (= ${binary:Version})
+Description: Examples for BPF Compiler Collection (BCC)
+
+Package: python-bcc
+Architecture: all
+Depends: libbcc (= ${binary:Version}), python, binutils
+Description: Python wrappers for BPF Compiler Collection (BCC)
+
+Package: python3-bcc
+Architecture: all
+Depends: libbcc (= ${binary:Version}), python3, binutils
+Description: Python3 wrappers for BPF Compiler Collection (BCC)
+
+Package: bcc-tools
+Architecture: all
+Depends: python-bcc (= ${binary:Version})
+Description: Command line tools for BPF Compiler Collection (BCC)
+
+Package: bcc-lua
+Architecture: all
+Depends: libbcc (= ${binary:Version})
+Description: Standalone tool to run BCC tracers written in Lua
diff --git a/debian/copyright b/debian/copyright
new file mode 100644
index 0000000..f262737
--- /dev/null
+++ b/debian/copyright
@@ -0,0 +1,7 @@
+Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: bcc
+Source: https://github.com/iovisor/bcc
+
+Files: *
+Copyright: 2015 PLUMgrid, Inc.
+License: Apache-2.0
diff --git a/debian/docs b/debian/docs
new file mode 100644
index 0000000..7d88104
--- /dev/null
+++ b/debian/docs
@@ -0,0 +1,3 @@
+FAQ.txt
+LICENSE.txt
+README.md
diff --git a/debian/libbcc-examples.install b/debian/libbcc-examples.install
new file mode 100644
index 0000000..94c7b5a
--- /dev/null
+++ b/debian/libbcc-examples.install
@@ -0,0 +1 @@
+usr/share/bcc/examples/*
diff --git a/debian/libbcc.install b/debian/libbcc.install
new file mode 100644
index 0000000..0cb867e
--- /dev/null
+++ b/debian/libbcc.install
@@ -0,0 +1,3 @@
+usr/include/bcc/*
+usr/lib/*/libbcc*
+usr/lib/*/pkgconfig/libbcc.pc
diff --git a/debian/python-bcc.install b/debian/python-bcc.install
new file mode 100644
index 0000000..b2cc136
--- /dev/null
+++ b/debian/python-bcc.install
@@ -0,0 +1 @@
+usr/lib/python2*
diff --git a/debian/python3-bcc.install b/debian/python3-bcc.install
new file mode 100644
index 0000000..4606faa
--- /dev/null
+++ b/debian/python3-bcc.install
@@ -0,0 +1 @@
+usr/lib/python3*
diff --git a/debian/rules b/debian/rules
new file mode 100755
index 0000000..49460be
--- /dev/null
+++ b/debian/rules
@@ -0,0 +1,20 @@
+#!/usr/bin/make -f
+# -*- makefile -*-
+
+# Uncomment this to turn on verbose mode.
+#export DH_VERBOSE=1
+
+DEBIAN_VERSION := $(shell dpkg-parsechangelog | sed -rne "s,^Version: (.*),\1,p")
+DEBIAN_REVISION := $(shell dpkg-parsechangelog | sed -rne "s,^Version: ([0-9.]+)(~|-)(.*),\3,p")
+UPSTREAM_VERSION := $(shell dpkg-parsechangelog | sed -rne "s,^Version: ([0-9.]+)(~|-)(.*),\1,p")
+
+%:
+	dh $@ --buildsystem=cmake --parallel --with python2,python3
+
+# tests cannot be run in parallel
+override_dh_auto_test:
+	dh_auto_test -O--buildsystem=cmake -O--no-parallel
+
+# FIXME: LLVM_DEFINITIONS is broken somehow in LLVM cmake upstream
+override_dh_auto_configure:
+	dh_auto_configure -- -DREVISION_LAST=$(UPSTREAM_VERSION) -DREVISION=$(UPSTREAM_VERSION) -DLLVM_DEFINITIONS="-D_GNU_SOURCE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS" -DPYTHON_CMD="python2;python3"
diff --git a/debian/source/format b/debian/source/format
new file mode 100644
index 0000000..163aaf8
--- /dev/null
+++ b/debian/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md
new file mode 100644
index 0000000..af42c8f
--- /dev/null
+++ b/docs/kernel-versions.md
@@ -0,0 +1,287 @@
+# BPF Features by Linux Kernel Version
+
+## eBPF support
+
+Kernel version | Commit
+---------------|-------
+3.15 | [`bd4cf0ed331a`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8)
+
+## JIT compiling
+
+The list of supported architectures for your kernel can be retrieved with:
+
+    git grep HAVE_EBPF_JIT arch/
+
+Feature / Architecture | Kernel version | Commit
+-----------------------|----------------|-------
+x86\_64 | 3.16 | [`622582786c9e`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=622582786c9e041d0bd52bde201787adeab249f8)
+ARM64 | 3.18 | [`e54bcde3d69d`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=e54bcde3d69d40023ae77727213d14f920eb264a)
+s390 | 4.1 | [`054623105728`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=054623105728b06852f077299e2bf1bf3d5f2b0b)
+Constant blinding for JIT machines | 4.7 | [`4f3446bb809f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4f3446bb809f20ad56cadf712e6006815ae7a8f9)
+PowerPC64 | 4.8 | [`156d0e290e96`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=156d0e290e969caba25f1851c52417c14d141b24)
+Constant blinding - PowerPC64 | 4.9 | [`b7b7013cac55`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=b7b7013cac55d794940bd9cb7b7c55c9dececac4)
+Sparc64 | 4.12 | [`7a12b5031c6b`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7a12b5031c6b947cc13918237ae652b536243b76)
+MIPS | 4.13 | [`f381bf6d82f0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=f381bf6d82f032b7410185b35d000ea370ac706b)
+ARM32 | 4.14 | [`39c13c204bb1`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=39c13c204bb1150d401e27d41a9d8b332be47c49)
+x86\_32 | 4.18 |  [`03f5781be2c7`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=03f5781be2c7b7e728d724ac70ba10799cc710d7)
+
+## Main features
+
+Several (but not all) of these _main features_ translate to an eBPF program type.
+The list of such program types supported in your kernel can be found in file
+[`include/uapi/linux/bpf.h`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/bpf.h):
+
+    git grep -W 'bpf_prog_type {' include/uapi/linux/bpf.h
+
+Feature | Kernel version | Commit
+--------|----------------|-------
+`AF_PACKET` (libpcap/tcpdump, `cls_bpf` classifier, netfilter's `xt_bpf`, team driver's load-balancing mode…) | 3.15 | [`bd4cf0ed331a`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8)
+Kernel helpers | 3.15 | [`bd4cf0ed331a`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8)
+`bpf()` syscall | 3.18 | [`99c55f7d47c0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=99c55f7d47c0dc6fc64729f37bf435abf43f4c60)
+Tables (_a.k.a._ Maps; details below) | 3.18 | [`99c55f7d47c0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=99c55f7d47c0dc6fc64729f37bf435abf43f4c60)
+BPF attached to sockets | 3.19 | [`89aa075832b0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=89aa075832b0da4402acebd698d0411dcc82d03e)
+BPF attached to `kprobes` | 4.1 | [`2541517c32be`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2541517c32be2531e0da59dfd7efc1ce844644f5)
+`cls_bpf` / `act_bpf` for `tc` | 4.1 | [`e2e9b6541dd4`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=e2e9b6541dd4b31848079da80fe2253daaafb549)
+Tail calls | 4.2 | [`04fd61ab36ec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=04fd61ab36ec065e194ab5e74ae34a5240d992bb)
+Non-root programs on sockets | 4.4 | [`1be7f75d1668`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=1be7f75d1668d6296b80bf35dcf6762393530afc)
+Persistent maps and programs (virtual FS) | 4.4 | [`b2197755b263`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=b2197755b2633e164a439682fb05a9b5ea48f706)
+`tc`'s `direct-action` (`da`) mode | 4.4 | [`045efa82ff56`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=045efa82ff563cd4e656ca1c2e354fa5bf6bbda4)
+`tc`'s `clsact` qdisc | 4.5 | [`1f211a1b929c`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=1f211a1b929c804100e138c5d3d656992cfd5622)
+BPF attached to tracepoints | 4.7 | [`98b5c2c65c29`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=98b5c2c65c2951772a8fc661f50d675e450e8bce)
+Direct packet access | 4.7 | [`969bf05eb3ce`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=969bf05eb3cedd5a8d4b7c346a85c2ede87a6d6d)
+XDP (see below) | 4.8 | [`6a773a15a1e8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6a773a15a1e8874e5eccd2f29190c31085912c95)
+BPF attached to perf events | 4.9 | [`0515e5999a46`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=0515e5999a466dfe6e1924f460da599bb6821487)
+Hardware offload for `tc`'s `cls_bpf` | 4.9 | [`332ae8e2f6ec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=332ae8e2f6ecda5e50c5c62ed62894963e3a83f5)
+Verifier exposure and internal hooks | 4.9 | [`13a27dfc6697`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=13a27dfc669724564aafa2699976ee756029fed2)
+BPF attached to cgroups for socket filtering | 4.10 | [`0e33661de493`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=0e33661de493db325435d565a4a722120ae4cbf3)
+Lightweight tunnel encapsulation | 4.10 | [`3a0af8fd61f9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2)
+**e**BPF support for `xt_bpf` module (iptables) | 4.10 | [`2c16d6033264`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2c16d60332643e90d4fa244f4a706c454b8c7569)
+BPF program tag | 4.10 | [`7bd509e311f4`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7bd509e311f408f7a5132fcdde2069af65fa05ae)
+Tracepoints to debug BPF | 4.11 (removed in 4.18) | [`a67edbf4fb6d`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a67edbf4fb6deadcfe57a04a134abed4a5ba3bb5) [`4d220ed0f814`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=4d220ed0f8140c478ab7b0a14d96821da639b646)
+Testing / benchmarking BPF programs | 4.12 | [`1cf1cae963c2`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=1cf1cae963c2e6032aebe1637e995bc2f5d330f4)
+BPF programs and maps IDs | 4.13 | [`dc4bb0e23561`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=dc4bb0e2356149aee4cdae061936f3bbdd45595c)
+BPF support for `sock_ops` | 4.13 | [`40304b2a1567`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=40304b2a1567fecc321f640ee4239556dd0f3ee0)
+BPF support for skbs on sockets | 4.14 | [`b005fd189cec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=b005fd189cec9407b700599e1e80e0552446ee79)
+bpftool utility in kernel sources | 4.15 | [`71bb428fe2c1`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=71bb428fe2c19512ac671d5ee16ef3e73e1b49a8)
+BPF attached to cgroups as device controller | 4.15 | [`ebc614f68736`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ebc614f687369f9df99828572b1d85a7c2de3d92)
+bpf2bpf function calls | 4.16 |  [`cc8b0b92a169`](https://github.com/torvalds/linux/commit/cc8b0b92a1699bc32f7fec71daa2bfc90de43a4d)
+BPF used for monitoring socket RX/TX data | 4.17 | [`4f738adba30a`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4f738adba30a7cfc006f605707e7aee847ffefa0)
+BPF attached to raw tracepoints | 4.17 | [`c4f6699dfcb8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c4f6699dfcb8558d138fe838f741b2c10f416cf9)
+BPF attached to `bind()` system call | 4.17 | [`4fbac77d2d09`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4fbac77d2d092b475dda9eea66da674369665427)
+BPF Type Format (BTF) | 4.18 | [`69b693f0aefa`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=69b693f0aefa0ed521e8bd02260523b5ae446ad7)
+AF_XDP | 4.18 |  [`fbfc504a24f5`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fbfc504a24f53f7ebe128ab55cb5dba634f4ece8)
+bpfilter | 4.18 |  [`d2ba09c17a06`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=d2ba09c17a0647f899d6c20a11bab9e6d3382f07)
+End.BPF action for seg6local LWT | 4.18 |  [`004d4b274e2a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=004d4b274e2a1a895a0e5dc66158b90a7d463d44)
+BPF attached to LIRC devices | 4.18 |  [`f4364dcfc86d`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936)
+
+## Tables (_a.k.a._ Maps)
+
+The list of map types supported in your kernel can be found in file
+[`include/uapi/linux/bpf.h`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/bpf.h):
+
+    git grep -W 'bpf_map_type {' include/uapi/linux/bpf.h
+
+Table type | Kernel version | Commit
+-----------|----------------|-------
+Hash | 3.19 | [`0f8e4bd8a1fc`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=0f8e4bd8a1fc8c4185f1630061d0a1f2d197a475)
+Array | 3.19 | [`28fbcfa08d8e`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=28fbcfa08d8ed7c5a50d41a0433aad222835e8e3)
+Tail call (`PROG_ARRAY`) | 4.2 | [`04fd61ab36ec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=04fd61ab36ec065e194ab5e74ae34a5240d992bb)
+Perf events | 4.3 | [`ea317b267e9d`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ea317b267e9d03a8241893aa176fba7661d07579)
+Per-CPU hash | 4.6 | [`824bd0ce6c7c`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=824bd0ce6c7c43a9e1e210abf124958e54d88342)
+Per-CPU array | 4.6 | [`a10423b87a7e`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a10423b87a7eae75da79ce80a8d9475047a674ee)
+Stack trace | 4.6 | [`d5a3b1f69186`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d5a3b1f691865be576c2bffa708549b8cdccda19)
+Pre-alloc maps memory | 4.6 | [`6c9059817432`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6c90598174322b8888029e40dd84a4eb01f56afe)
+cgroup array | 4.8 | [`4ed8ec521ed5`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4ed8ec521ed57c4e207ad464ca0388776de74d4b)
+LRU hash | 4.10 | [`29ba732acbee`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=29ba732acbeece1e34c68483d1ec1f3720fa1bb3) [`3a08c2fd7634`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3a08c2fd763450a927d1130de078d6f9e74944fb)
+LRU per-CPU hash | 4.10 | [`8f8449384ec3`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8f8449384ec364ba2a654f11f94e754e4ff719e0) [`961578b63474`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=961578b63474d13ad0e2f615fcc2901c5197dda6)
+LPM trie (longest-prefix match) | 4.11 | [`b95a5c4db09b`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=b95a5c4db09bc7c253636cb84dc9b12c577fd5a0)
+Array of maps | 4.12 | [`56f668dfe00d`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=56f668dfe00dcf086734f1c42ea999398fad6572)
+Hash of maps | 4.12 | [`bcc6b1b7ebf8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=bcc6b1b7ebf857a9fe56202e2be3361131588c15)
+Netdevice references | 4.14 | [`546ac1ffb70d`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=546ac1ffb70d25b56c1126940e5ec639c4dd7413)
+Socket references (array) | 4.14 | [`174a79ff9515`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=174a79ff9515f400b9a6115643dafd62a635b7e6)
+CPU references | 4.15 | [`6710e1126934`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6710e1126934d8b4372b4d2f9ae1646cd3f151bf)
+AF_XDP socket (XSK) references | 4.18 | [`fbfc504a24f5`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fbfc504a24f53f7ebe128ab55cb5dba634f4ece8)
+Socket references (hashmap) | 4.18 | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4)
+cgroup storage | 4.19 | [`de9cbbaadba5`](https://github.com/torvalds/linux/commit/de9cbbaadba5adf88a19e46df61f7054000838f6)
+reuseport sockarray | 4.19 | [`5dc4c4b7d4e8`](https://github.com/torvalds/linux/commit/5dc4c4b7d4e8115e7cde96a030f98cb3ab2e458c)
+percpu cgroup storage | 4.20 | [`b741f1630346`](https://github.com/torvalds/linux/commit/b741f1630346defcbc8cc60f1a2bdae8b3b0036f)
+queue | 4.20 | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
+stack | 4.20 | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
+
+## XDP
+
+An approximate list of drivers or components supporting XDP programs for your
+kernel can be retrieved with:
+
+    git grep -l XDP_SETUP_PROG drivers/
+
+Feature / Driver | Kernel version | Commit
+-----------------|----------------|-------
+XDP core architecture | 4.8 | [`6a773a15a1e8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6a773a15a1e8874e5eccd2f29190c31085912c95)
+Action: drop | 4.8 | [`6a773a15a1e8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6a773a15a1e8874e5eccd2f29190c31085912c95)
+Action: pass on to stack | 4.8 | [`6a773a15a1e8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6a773a15a1e8874e5eccd2f29190c31085912c95)
+Action: direct forwarding (on same port) | 4.8 | [`6ce96ca348a9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6ce96ca348a9e949f8c43f4d3e98db367d93cffd)
+Direct packet data write | 4.8 | [`4acf6c0b84c9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4acf6c0b84c91243c705303cd9ff16421914150d)
+Mellanox `mlx4` driver | 4.8 | [`47a38e155037`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=47a38e155037f417c5740e24ccae6482aedf4b68)
+Mellanox `mlx5` driver | 4.9 | [`86994156c736`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=86994156c736978d113e7927455d4eeeb2128b9f)
+Netronome `nfp` driver | 4.10 | [`ecd63a0217d5`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ecd63a0217d5f1e8a92f7516f5586d1177b95de2)
+QLogic (Cavium) `qed*` drivers | 4.10 | [`496e05170958`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=496e051709588f832d7a6a420f44f8642b308a87)
+`virtio_net` driver | 4.10 | [`f600b6905015`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=f600b690501550b94e83e07295d9c8b9c4c39f4e)
+Broadcom `bnxt_en` driver | 4.11 | [`c6d30e8391b8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c6d30e8391b85e00eb544e6cf047ee0160ee9938)
+Intel `ixgbe*` drivers | 4.12 | [`924708081629`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9247080816297de4e31abb684939c0e53e3a8a67)
+Cavium `thunderx` driver | 4.12 | [`05c773f52b96`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=05c773f52b96ef3fbc7d9bfa21caadc6247ef7a8)
+Generic XDP | 4.12 | [`b5cdae3291f7`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=b5cdae3291f7be7a34e75affe4c0ec1f7f328b64)
+Intel `i40e` driver | 4.13 | [`0c8493d90b6b`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=0c8493d90b6bb0f5c4fe9217db8f7203f24c0f28)
+Action: redirect | 4.14 | [`6453073987ba`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6453073987ba392510ab6c8b657844a9312c67f7)
+Support for tap | 4.14 | [`761876c857cb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=761876c857cb2ef8489fbee01907151da902af91)
+Support for veth | 4.14 | [`d445516966dc`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d445516966dcb2924741b13b27738b54df2af01a)
+Intel `e1000` driver | | [Not upstream yet](https://git.kernel.org/pub/scm/linux/kernel/git/ast/bpf.git/commit/?h=xdp&id=0afee87cfc800bf3317f4dc8847e6f36539b820c)
+Intel `e1000e` driver | | [Not planned for upstream at this time](https://github.com/adjavon/e1000e_xdp)
+
+## Helpers
+
+The list of helpers supported in your kernel can be found in file
+[`include/uapi/linux/bpf.h`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/bpf.h):
+
+    git grep '	FN(' include/uapi/linux/bpf.h
+
+Helpers are listed in alphabetical order:
+
+Helper | Kernel version | License | Commit |
+-------|----------------|---------|--------|
+`BPF_FUNC_bind()` | 4.17 |  | [`d74bad4e74ee`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d74bad4e74ee373787a9ae24197c17b7cdc428d5) | 
+`BPF_FUNC_clone_redirect()` | 4.2 |  | [`3896d655f4d4`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3896d655f4d491c67d669a15f275a39f713410f8)
+`BPF_FUNC_csum_diff()` | 4.6 |  | [`7d672345ed29`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7d672345ed295b1356a5d9f7111da1d1d7d65867)
+`BPF_FUNC_csum_update()` | 4.9 |  | [`36bbef52c7eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=36bbef52c7eb646ed6247055a2acd3851e317857)
+`BPF_FUNC_current_task_under_cgroup()` | 4.9 |  | [`60d20f9195b2`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=60d20f9195b260bdf0ac10c275ae9f6016f9c069)
+`BPF_FUNC_fib_lookup()` | 4.18 | GPL | [`87f5fc7e48dd`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=87f5fc7e48dd3175b30dd03b41564e1a8e136323)
+`BPF_FUNC_get_cgroup_classid()` | 4.3 |  | [`8d20aabe1c76`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8d20aabe1c76cccac544d9fcc3ad7823d9e98a2d)
+`BPF_FUNC_get_current_cgroup_id()` | 4.18 |  | [`bf6fa2c893c5`](https://github.com/torvalds/linux/commit/bf6fa2c893c5237b48569a13fa3c673041430b6c)
+`BPF_FUNC_get_current_comm()` | 4.2 |  | [`ffeedafbf023`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89)
+`BPF_FUNC_get_current_pid_tgid()` | 4.2 |  | [`ffeedafbf023`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89)
+`BPF_FUNC_get_current_task()` | 4.8 | GPL | [`606274c5abd8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=606274c5abd8e245add01bc7145a8cbb92b69ba8)
+`BPF_FUNC_get_current_uid_gid()` | 4.2 |  | [`ffeedafbf023`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89)
+`BPF_FUNC_get_hash_recalc()` | 4.8 |  | [`13c5c240f789`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=13c5c240f789bbd2bcacb14a23771491485ae61f)
+`BPF_FUNC_get_local_storage()` | 4.19 |  | [`cd3394317653`](https://github.com/torvalds/linux/commit/cd3394317653837e2eb5c5d0904a8996102af9fc)
+`BPF_FUNC_get_numa_node_id()` | 4.10 |  | [`2d0e30c30f84`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e)
+`BPF_FUNC_get_prandom_u32()` | 4.1 |  | [`03e69b508b6f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=03e69b508b6f7c51743055c9f61d1dfeadf4b635)
+`BPF_FUNC_get_route_realm()` | 4.4 |  | [`c46646d0484f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c46646d0484f5d08e2bede9b45034ba5b8b489cc)
+`BPF_FUNC_get_smp_processor_id()` | 4.1 |  | [`c04167ce2ca0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=c04167ce2ca0ecaeaafef006cb0d65cf01b68e42)
+`BPF_FUNC_get_socket_cookie()` | 4.12 |  | [`91b8270f2a4d`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=91b8270f2a4d1d9b268de90451cdca63a70052d6)
+`BPF_FUNC_get_socket_uid()` | 4.12 |  | [`6acc5c291068`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6acc5c2910689fc6ee181bf63085c5efff6a42bd)
+`BPF_FUNC_get_stack()` | 4.18 | GPL | [`de2ff05f48af`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=de2ff05f48afcde816ff4edb217417f62f624ab5)
+`BPF_FUNC_get_stackid()` | 4.6 | GPL | [`d5a3b1f69186`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d5a3b1f691865be576c2bffa708549b8cdccda19)
+`BPF_FUNC_getsockopt()` | 4.15 |  | [`cd86d1fd2102`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=cd86d1fd21025fdd6daf23d1288da405e7ad0ec6)
+`BPF_FUNC_ktime_get_ns()` | 4.1 | GPL | [`d9847d310ab4`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d9847d310ab4003725e6ed1822682e24bd406908)
+`BPF_FUNC_l3_csum_replace()` | 4.1 |  | [`91bc4822c3d6`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91bc4822c3d61b9bb7ef66d3b77948a4f9177954)
+`BPF_FUNC_l4_csum_replace()` | 4.1 |  | [`91bc4822c3d6`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91bc4822c3d61b9bb7ef66d3b77948a4f9177954)
+`BPF_FUNC_lwt_push_encap()` | 4.18 |  | [`fe94cc290f53`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fe94cc290f535709d3c5ebd1e472dfd0aec7ee79)
+`BPF_FUNC_lwt_seg6_action()` | 4.18 |  | [`fe94cc290f53`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fe94cc290f535709d3c5ebd1e472dfd0aec7ee79)
+`BPF_FUNC_lwt_seg6_adjust_srh()` | 4.18 |  | [`fe94cc290f53`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fe94cc290f535709d3c5ebd1e472dfd0aec7ee79)
+`BPF_FUNC_lwt_seg6_store_bytes()` | 4.18 |  | [`fe94cc290f53`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fe94cc290f535709d3c5ebd1e472dfd0aec7ee79)
+`BPF_FUNC_map_delete_elem()` | 3.19 |  | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
+`BPF_FUNC_map_lookup_elem()` | 3.19 |  | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
+`BPF_FUNC_map_peek_elem()` | 4.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
+`BPF_FUNC_map_pop_elem()` | 4.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
+`BPF_FUNC_map_push_elem()` | 4.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
+`BPF_FUNC_map_update_elem()` | 3.19 |  | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
+`BPF_FUNC_msg_apply_bytes()` | 4.17 |  | [`2a100317c9eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a100317c9ebc204a166f16294884fbf9da074ce)
+`BPF_FUNC_msg_cork_bytes()` | 4.17 |  | [`91843d540a13`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91843d540a139eb8070bcff8aa10089164436deb)
+`BPF_FUNC_msg_pull_data()` | 4.17 |  | [`015632bb30da`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=015632bb30daaaee64e1bcac07570860e0bf3092)
+`BPF_FUNC_msg_push_data()` | 4.20 |  | [`6fff607e2f14`](https://github.com/torvalds/linux/commit/6fff607e2f14bd7c63c06c464a6f93b8efbabe28)
+`BPF_FUNC_msg_redirect_hash()` | 4.18 |  | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4)
+`BPF_FUNC_msg_redirect_map()` | 4.17 |  | [`4f738adba30a`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4f738adba30a7cfc006f605707e7aee847ffefa0)
+`BPF_FUNC_perf_event_output()` | 4.4 | GPL | [`a43eec304259`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a43eec304259a6c637f4014a6d4767159b6a3aa3)
+`BPF_FUNC_perf_event_read()` | 4.3 | GPL | [`35578d798400`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=35578d7984003097af2b1e34502bc943d40c1804)
+`BPF_FUNC_perf_event_read_value()` | 4.15 | GPL | [`908432ca84fc`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=908432ca84fc229e906ba164219e9ad0fe56f755)
+`BPF_FUNC_perf_prog_read_value()` | 4.15 | GPL | [`4bebdc7a85aa`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4bebdc7a85aa400c0222b5329861e4ad9252f1e5)
+`BPF_FUNC_probe_read()` | 4.1 | GPL | [`2541517c32be`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2541517c32be2531e0da59dfd7efc1ce844644f5)
+`BPF_FUNC_probe_read_str()` | 4.11 | GPL | [`a5e8c07059d0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a5e8c07059d0f0b31737408711d44794928ac218)
+`BPF_FUNC_probe_write_user()` | 4.8 | GPL | [`96ae52279594`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=96ae52279594470622ff0585621a13e96b700600)
+`BPF_FUNC_rc_keydown()` | 4.18 | GPL | [`f4364dcfc86d`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936)
+`BPF_FUNC_rc_repeat()` | 4.18 | GPL | [`f4364dcfc86d`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936)
+`BPF_FUNC_redirect()` | 4.4 |  | [`27b29f63058d`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=27b29f63058d26c6c1742f1993338280d5a41dc6)
+`BPF_FUNC_redirect_map()` | 4.14 |  | [`97f91a7cf04f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=97f91a7cf04ff605845c20948b8a80e54cbd3376)
+`BPF_FUNC_set_hash()` | 4.13 |  | [`ded092cd73c2`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ded092cd73c2c56a394b936f86897f29b2e131c0)
+`BPF_FUNC_set_hash_invalid()` | 4.9 |  | [`7a4b28c6cc9f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7a4b28c6cc9ffac50f791b99cc7e46106436e5d8)
+`BPF_FUNC_setsockopt()` | 4.13 |  | [`8c4b4c7e9ff0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8c4b4c7e9ff0447995750d9329949fa082520269)
+`BPF_FUNC_sk_lookup_tcp()` | 4.20 |  | [`6acc9b432e67`](https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71)
+`BPF_FUNC_sk_lookup_udp()` | 4.20 |  | [`6acc9b432e67`](https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71)
+`BPF_FUNC_sk_redirect_hash()` | 4.18 |  | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4)
+`BPF_FUNC_sk_redirect_map()` | 4.14 |  | [`174a79ff9515`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=174a79ff9515f400b9a6115643dafd62a635b7e6)
+`BPF_FUNC_sk_release()` | 4.20 |  | [`6acc9b432e67`](https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71)
+`BPF_FUNC_sk_select_reuseport()` | 4.19 |  | [`2dbb9b9e6df6`](https://github.com/torvalds/linux/commit/2dbb9b9e6df67d444fbe425c7f6014858d337adf)
+`BPF_FUNC_skb_adjust_room()` | 4.13 |  | [`2be7e212d541`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2be7e212d5419a400d051c84ca9fdd083e5aacac)
+`BPF_FUNC_skb_ancestor_cgroup_id()` | 4.19 |  | [`7723628101aa`](https://github.com/torvalds/linux/commit/7723628101aaeb1d723786747529b4ea65c5b5c5)
+`BPF_FUNC_skb_change_head()` | 4.10 |  | [`3a0af8fd61f9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2)
+`BPF_FUNC_skb_change_proto()` | 4.8 |  | [`6578171a7ff0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=6578171a7ff0c31dc73258f93da7407510abf085)
+`BPF_FUNC_skb_change_tail()` | 4.9 |  | [`5293efe62df8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5293efe62df81908f2e90c9820c7edcc8e61f5e9)
+`BPF_FUNC_skb_change_type()` | 4.8 |  | [`d2485c4242a8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d2485c4242a826fdf493fd3a27b8b792965b9b9e)
+`BPF_FUNC_skb_cgroup_id()` | 4.18 |  | [`cb20b08ead40`](https://github.com/torvalds/linux/commit/cb20b08ead401fd17627a36f035c0bf5bfee5567)
+`BPF_FUNC_skb_get_tunnel_key()` | 4.3 |  | [`d3aa45ce6b94`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d3aa45ce6b94c65b83971257317867db13e5f492)
+`BPF_FUNC_skb_get_tunnel_opt()` | 4.6 |  | [`14ca0751c96f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460)
+`BPF_FUNC_skb_get_xfrm_state()` | 4.18 |  | [`12bed760a78d`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=12bed760a78da6e12ac8252fec64d019a9eac523)
+`BPF_FUNC_skb_load_bytes()` | 4.5 |  | [`05c74e5e53f6`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=05c74e5e53f6cb07502c3e6a820f33e2777b6605)
+`BPF_FUNC_skb_load_bytes_relative()` | 4.18 |  | [`4e1ec56cdc59`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=4e1ec56cdc59746943b2acfab3c171b930187bbe)
+`BPF_FUNC_skb_pull_data()` | 4.9 |  | [`36bbef52c7eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=36bbef52c7eb646ed6247055a2acd3851e317857)
+`BPF_FUNC_skb_set_tunnel_key()` | 4.3 |  | [`d3aa45ce6b94`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d3aa45ce6b94c65b83971257317867db13e5f492)
+`BPF_FUNC_skb_set_tunnel_opt()` | 4.6 |  | [`14ca0751c96f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460)
+`BPF_FUNC_skb_store_bytes()` | 4.1 |  | [`91bc4822c3d6`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91bc4822c3d61b9bb7ef66d3b77948a4f9177954)
+`BPF_FUNC_skb_under_cgroup()` | 4.8 |  | [`4a482f34afcc`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4a482f34afcc162d8456f449b137ec2a95be60d8)
+`BPF_FUNC_skb_vlan_pop()` | 4.3 |  | [`4e10df9a60d9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4e10df9a60d96ced321dd2af71da558c6b750078)
+`BPF_FUNC_skb_vlan_push()` | 4.3 |  | [`4e10df9a60d9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4e10df9a60d96ced321dd2af71da558c6b750078)
+`BPF_FUNC_sock_hash_update()` | 4.18 |  | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4)
+`BPF_FUNC_sock_map_update()` | 4.14 |  | [`174a79ff9515`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=174a79ff9515f400b9a6115643dafd62a635b7e6)
+`BPF_FUNC_tail_call()` | 4.2 |  | [`04fd61ab36ec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=04fd61ab36ec065e194ab5e74ae34a5240d992bb)
+`BPF_FUNC_trace_printk()` | 4.1 | GPL | [`9c959c863f82`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9c959c863f8217a2ff3d7c296e8223654d240569)
+`BPF_FUNC_xdp_adjust_head()` | 4.10 |  | [`17bedab27231`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=17bedab2723145d17b14084430743549e6943d03)
+`BPF_FUNC_xdp_adjust_meta()` | 4.15 |  | [`de8f3a83b0a0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=de8f3a83b0a0fddb2cf56e7a718127e9619ea3da)
+`BPF_FUNC_xdp_adjust_tail()` | 4.18 |  | [`b32cc5b9a346`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=b32cc5b9a346319c171e3ad905e0cddda032b5eb)
+`BPF_FUNC_override_return()` | 4.16 | GPL | [`9802d86585db`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9802d86585db91655c7d1929a4f6bbe0952ea88e)
+`BPF_FUNC_sock_ops_cb_flags_set()` | 4.16 |  | [`b13d88072172`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b13d880721729384757f235166068c315326f4a1)
+
+Note: GPL-only BPF helpers require a GPL-compatible license. The current licenses considered GPL-compatible by the kernel are:
+
+* GPL
+* GPL v2
+* GPL and additional rights
+* Dual BSD/GPL
+* Dual MIT/GPL
+* Dual MPL/GPL
+
+Check the list of GPL-compatible licenses in your [kernel source code](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/license.h).
+
+## Program Types
+The list of program types and supported helper functions can be retrieved with:
+
+    git grep -W 'func_proto(enum bpf_func_id func_id' kernel/ net/ drivers/
+
+|Program Type| Helper Functions|
+|------------|-----------------|
+|`BPF_PROG_TYPE_SOCKET_FILTER`|`BPF_FUNC_get_current_uid_gid()` <br> `Base functions`|
+|`BPF_PROG_TYPE_KPROBE`|`BPF_FUNC_perf_event_output()` <br> `BPF_FUNC_get_stackid()` <br> `BPF_FUNC_get_stack()` <br> `BPF_FUNC_perf_event_read_value()` <br> `BPF_FUNC_override_return()` <br> `Tracing functions`|
+|`BPF_PROG_TYPE_SCHED_CLS` <br> `BPF_PROG_TYPE_SCHED_ACT`|`BPF_FUNC_skb_store_bytes()` <br> `BPF_FUNC_skb_load_bytes()` <br> `BPF_FUNC_skb_load_bytes_relative()` <br> `BPF_FUNC_skb_pull_data()` <br> `BPF_FUNC_csum_diff()` <br> `BPF_FUNC_csum_update()` <br> `BPF_FUNC_l3_csum_replace()` <br> `BPF_FUNC_l4_csum_replace()` <br> `BPF_FUNC_clone_redirect()` <br> `BPF_FUNC_get_cgroup_classid()` <br> `BPF_FUNC_skb_vlan_push()` <br> `BPF_FUNC_skb_vlan_pop()` <br> `BPF_FUNC_skb_change_proto()` <br> `BPF_FUNC_skb_change_type()` <br> `BPF_FUNC_skb_adjust_room()` <br> `BPF_FUNC_skb_change_tail()` <br> `BPF_FUNC_skb_get_tunnel_key()` <br> `BPF_FUNC_skb_set_tunnel_key()` <br> `BPF_FUNC_skb_get_tunnel_opt()` <br> `BPF_FUNC_skb_set_tunnel_opt()` <br> `BPF_FUNC_redirect()` <br> `BPF_FUNC_get_route_realm()` <br> `BPF_FUNC_get_hash_recalc()` <br> `BPF_FUNC_set_hash_invalid()` <br> `BPF_FUNC_set_hash()` <br> `BPF_FUNC_perf_event_output()` <br> `BPF_FUNC_get_smp_processor_id()` <br> `BPF_FUNC_skb_under_cgroup()` <br> `BPF_FUNC_get_socket_cookie()` <br> `BPF_FUNC_get_socket_uid()` <br> `BPF_FUNC_fib_lookup()` <br> `BPF_FUNC_skb_get_xfrm_state()` <br> `BPF_FUNC_skb_cgroup_id()` <br> `Base functions`|
+|`BPF_PROG_TYPE_TRACEPOINT`|`BPF_FUNC_perf_event_output()` <br> `BPF_FUNC_get_stackid()` <br> `BPF_FUNC_get_stack()` <br> `Tracing functions`|
+|`BPF_PROG_TYPE_XDP`| `BPF_FUNC_perf_event_output()` <br> `BPF_FUNC_get_smp_processor_id()` <br> `BPF_FUNC_csum_diff()` <br> `BPF_FUNC_xdp_adjust_head()` <br> `BPF_FUNC_xdp_adjust_meta()` <br> `BPF_FUNC_redirect()` <br> `BPF_FUNC_redirect_map()` <br> `BPF_FUNC_xdp_adjust_tail()` <br> `BPF_FUNC_fib_lookup()` <br> `Base functions`|
+|`BPF_PROG_TYPE_PERF_EVENT`| `BPF_FUNC_perf_event_output()` <br> `BPF_FUNC_get_stackid()` <br> `BPF_FUNC_get_stack()` <br> `BPF_FUNC_perf_prog_read_value()` <br> `Tracing functions`|
+|`BPF_PROG_TYPE_CGROUP_SKB`|`BPF_FUNC_skb_load_bytes()` <br> `BPF_FUNC_skb_load_bytes_relative()` <br> `BPF_FUNC_get_socket_cookie()` <br> `BPF_FUNC_get_socket_uid()` <br> `Base functions`|
+|`BPF_PROG_TYPE_CGROUP_SOCK`|`BPF_FUNC_get_current_uid_gid()` <br> `Base functions`|
+|`BPF_PROG_TYPE_LWT_IN`|`BPF_FUNC_lwt_push_encap()` <br> `LWT functions` <br> `Base functions`|
+|`BPF_PROG_TYPE_LWT_OUT`| `LWT functions` <br> `Base functions`|
+|`BPF_PROG_TYPE_LWT_XMIT`| `BPF_FUNC_skb_get_tunnel_key()` <br> `BPF_FUNC_skb_set_tunnel_key()` <br> `BPF_FUNC_skb_get_tunnel_opt()` <br> `BPF_FUNC_skb_set_tunnel_opt()` <br> `BPF_FUNC_redirect()` <br> `BPF_FUNC_clone_redirect()` <br> `BPF_FUNC_skb_change_tail()` <br> `BPF_FUNC_skb_change_head()` <br> `BPF_FUNC_skb_store_bytes()` <br> `BPF_FUNC_csum_update()` <br> `BPF_FUNC_l3_csum_replace()` <br> `BPF_FUNC_l4_csum_replace()` <br> `BPF_FUNC_set_hash_invalid()` <br> `LWT functions`|
+|`BPF_PROG_TYPE_SOCK_OPS`|`BPF_FUNC_setsockopt()` <br> `BPF_FUNC_getsockopt()` <br> `BPF_FUNC_sock_ops_cb_flags_set()` <br> `BPF_FUNC_sock_map_update()` <br> `BPF_FUNC_sock_hash_update()` <br> `BPF_FUNC_get_socket_cookie()` <br> `Base functions`|
+|`BPF_PROG_TYPE_SK_SKB`|`BPF_FUNC_skb_store_bytes()` <br> `BPF_FUNC_skb_load_bytes()` <br> `BPF_FUNC_skb_pull_data()` <br> `BPF_FUNC_skb_change_tail()` <br> `BPF_FUNC_skb_change_head()` <br> `BPF_FUNC_get_socket_cookie()` <br> `BPF_FUNC_get_socket_uid()` <br> `BPF_FUNC_sk_redirect_map()` <br> `BPF_FUNC_sk_redirect_hash()` <br> `BPF_FUNC_sk_lookup_tcp()` <br> `BPF_FUNC_sk_lookup_udp()` <br> `BPF_FUNC_sk_release()` <br> `Base functions`|
+|`BPF_PROG_TYPE_CGROUP_DEVICE`|`BPF_FUNC_map_lookup_elem()` <br> `BPF_FUNC_map_update_elem()` <br> `BPF_FUNC_map_delete_elem()` <br> `BPF_FUNC_get_current_uid_gid()` <br> `BPF_FUNC_trace_printk()`|
+|`BPF_PROG_TYPE_SK_MSG`|`BPF_FUNC_msg_redirect_map()` <br> `BPF_FUNC_msg_redirect_hash()` <br> `BPF_FUNC_msg_apply_bytes()` <br> `BPF_FUNC_msg_cork_bytes()` <br> `BPF_FUNC_msg_pull_data()` <br> `BPF_FUNC_msg_push_data()` <br> `Base functions`|
+|`BPF_PROG_TYPE_RAW_TRACEPOINT`|`BPF_FUNC_perf_event_output()` <br> `BPF_FUNC_get_stackid()` <br> `BPF_FUNC_get_stack()` <br> `Tracing functions`|
+|`BPF_PROG_TYPE_CGROUP_SOCK_ADDR`|`BPF_FUNC_get_current_uid_gid()` <br> `BPF_FUNC_bind()` <br> `BPF_FUNC_get_socket_cookie()` <br> `Base functions`|
+|`BPF_PROG_TYPE_LWT_SEG6LOCAL`|`BPF_FUNC_lwt_seg6_store_bytes()` <br> `BPF_FUNC_lwt_seg6_action()` <br> `BPF_FUNC_lwt_seg6_adjust_srh()` <br> `LWT functions`|
+|`BPF_PROG_TYPE_LIRC_MODE2`|`BPF_FUNC_rc_repeat()` <br> `BPF_FUNC_rc_keydown()` <br> `BPF_FUNC_map_lookup_elem()` <br> `BPF_FUNC_map_update_elem()` <br> `BPF_FUNC_map_delete_elem()` <br> `BPF_FUNC_ktime_get_ns()` <br> `BPF_FUNC_tail_call()` <br> `BPF_FUNC_get_prandom_u32()` <br> `BPF_FUNC_trace_printk()`|
+|`BPF_PROG_TYPE_SK_REUSEPORT`|`BPF_FUNC_sk_select_reuseport()` <br> `BPF_FUNC_skb_load_bytes()` <br> `BPF_FUNC_skb_load_bytes_relative()` <br> `Base functions`|
+|`BPF_PROG_TYPE_FLOW_DISSECTOR`|`BPF_FUNC_skb_load_bytes()` <br> `Base functions`|
+
+|Function Group| Functions|
+|------------------|-------|
+|`Base functions`| `BPF_FUNC_map_lookup_elem()` <br> `BPF_FUNC_map_update_elem()` <br> `BPF_FUNC_map_delete_elem()` <br> `BPF_FUNC_map_peek_elem()` <br> `BPF_FUNC_map_pop_elem()` <br> `BPF_FUNC_map_push_elem()` <br> `BPF_FUNC_get_prandom_u32()` <br> `BPF_FUNC_get_smp_processor_id()` <br> `BPF_FUNC_get_numa_node_id()` <br> `BPF_FUNC_tail_call()` <br> `BPF_FUNC_ktime_get_ns()` <br> `BPF_FUNC_trace_printk()`|
+|`Tracing functions`|`BPF_FUNC_map_lookup_elem()` <br> `BPF_FUNC_map_update_elem()` <br> `BPF_FUNC_map_delete_elem()` <br> `BPF_FUNC_probe_read()` <br> `BPF_FUNC_ktime_get_ns()` <br> `BPF_FUNC_tail_call()` <br> `BPF_FUNC_get_current_pid_tgid()` <br> `BPF_FUNC_get_current_task()` <br> `BPF_FUNC_get_current_uid_gid()` <br> `BPF_FUNC_get_current_comm()` <br> `BPF_FUNC_trace_printk()` <br> `BPF_FUNC_get_smp_processor_id()` <br> `BPF_FUNC_get_numa_node_id()` <br> `BPF_FUNC_perf_event_read()` <br> `BPF_FUNC_probe_write_user()` <br> `BPF_FUNC_current_task_under_cgroup()` <br> `BPF_FUNC_get_prandom_u32()` <br> `BPF_FUNC_probe_read_str()` <br> `BPF_FUNC_get_current_cgroup_id()` |
+|`LWT functions`|  `BPF_FUNC_skb_load_bytes()` <br> `BPF_FUNC_skb_pull_data()` <br> `BPF_FUNC_csum_diff()` <br> `BPF_FUNC_get_cgroup_classid()` <br> `BPF_FUNC_get_route_realm()` <br> `BPF_FUNC_get_hash_recalc()` <br> `BPF_FUNC_perf_event_output()` <br> `BPF_FUNC_get_smp_processor_id()` <br> `BPF_FUNC_skb_under_cgroup()`|
diff --git a/docs/reference_guide.md b/docs/reference_guide.md
new file mode 100644
index 0000000..a90cf31
--- /dev/null
+++ b/docs/reference_guide.md
@@ -0,0 +1,1587 @@
+# bcc Reference Guide
+
+Intended for search (Ctrl-F) and reference. For tutorials, start with [tutorial.md](tutorial.md).
+
+This guide is incomplete. If something feels missing, check the bcc and kernel source. And if you confirm we're missing something, please send a pull request to fix it, and help out everyone.
+
+## Contents
+
+- [BPF C](#bpf-c)
+    - [Events & Arguments](#events--arguments)
+        - [1. kprobes](#1-kprobes)
+        - [2. kretprobes](#2-kretprobes)
+        - [3. Tracepoints](#3-tracepoints)
+        - [4. uprobes](#4-uprobes)
+        - [5. uretprobes](#5-uretprobes)
+        - [6. USDT probes](#6-usdt-probes)
+        - [7. Raw Tracepoints](#7-raw-tracepoints)
+    - [Data](#data)
+        - [1. bpf_probe_read()](#1-bpf_probe_read)
+        - [2. bpf_probe_read_str()](#2-bpf_probe_read_str)
+        - [3. bpf_ktime_get_ns()](#3-bpf_ktime_get_ns)
+        - [4. bpf_get_current_pid_tgid()](#4-bpf_get_current_pid_tgid)
+        - [5. bpf_get_current_uid_gid()](#5-bpf_get_current_uid_gid)
+        - [6. bpf_get_current_comm()](#6-bpf_get_current_comm)
+        - [7. bpf_get_current_task()](#7-bpf_get_current_task)
+        - [8. bpf_log2l()](#8-bpf_log2l)
+        - [9. bpf_get_prandom_u32()](#9-bpf_get_prandom_u32)
+    - [Debugging](#debugging)
+        - [1. bpf_override_return()](#1-bpf_override_return)
+    - [Output](#output)
+        - [1. bpf_trace_printk()](#1-bpf_trace_printk)
+        - [2. BPF_PERF_OUTPUT](#2-bpf_perf_output)
+        - [3. perf_submit()](#3-perf_submit)
+    - [Maps](#maps)
+        - [1. BPF_TABLE](#1-bpf_table)
+        - [2. BPF_HASH](#2-bpf_hash)
+        - [3. BPF_ARRAY](#3-bpf_array)
+        - [4. BPF_HISTOGRAM](#4-bpf_histogram)
+        - [5. BPF_STACK_TRACE](#5-bpf_stack_trace)
+        - [6. BPF_PERF_ARRAY](#6-bpf_perf_array)
+        - [7. BPF_PERCPU_ARRAY](#7-bpf_percpu_array)
+        - [8. BPF_LPM_TRIE](#8-bpf_lpm_trie)
+        - [9. BPF_PROG_ARRAY](#9-bpf_prog_array)
+        - [10. BPF_DEVMAP](#10-bpf_devmap)
+        - [11. BPF_CPUMAP](#11-bpf_cpumap)
+        - [12. map.lookup()](#12-maplookup)
+        - [13. map.lookup_or_init()](#13-maplookup_or_init)
+        - [14. map.delete()](#14-mapdelete)
+        - [15. map.update()](#15-mapupdate)
+        - [16. map.insert()](#16-mapinsert)
+        - [17. map.increment()](#17-mapincrement)
+        - [18. map.get_stackid()](#18-mapget_stackid)
+        - [19. map.perf_read()](#19-mapperf_read)
+        - [20. map.call()](#20-mapcall)
+        - [21. map.redirect_map()](#21-mapredirect_map)
+    - [Licensing](#licensing)
+
+- [bcc Python](#bcc-python)
+    - [Initialization](#initialization)
+        - [1. BPF](#1-bpf)
+        - [2. USDT](#2-usdt)
+    - [Events](#events)
+        - [1. attach_kprobe()](#1-attach_kprobe)
+        - [2. attach_kretprobe()](#2-attach_kretprobe)
+        - [3. attach_tracepoint()](#3-attach_tracepoint)
+        - [4. attach_uprobe()](#4-attach_uprobe)
+        - [5. attach_uretprobe()](#5-attach_uretprobe)
+        - [6. USDT.enable_probe()](#6-usdtenable_probe)
+        - [7. attach_raw_tracepoint()](#7-attach_raw_tracepoint)
+    - [Debug Output](#debug-output)
+        - [1. trace_print()](#1-trace_print)
+        - [2. trace_fields()](#2-trace_fields)
+    - [Output](#output)
+        - [1. perf_buffer_poll()](#1-perf_buffer_poll)
+    - [Maps](#maps)
+        - [1. get_table()](#1-get_table)
+        - [2. open_perf_buffer()](#2-open_perf_buffer)
+        - [3. items()](#3-items)
+        - [4. values()](#4-values)
+        - [5. clear()](#5-clear)
+        - [6. print_log2_hist()](#6-print_log2_hist)
+        - [7. print_linear_hist()](#7-print_linear_hist)
+    - [Helpers](#helpers)
+        - [1. ksym()](#1-ksym)
+        - [2. ksymname()](#2-ksymname)
+        - [3. sym()](#3-sym)
+        - [4. num_open_kprobes()](#4-num_open_kprobes)
+
+- [BPF Errors](#bpf-errors)
+    - [1. Invalid mem access](#1-invalid-mem-access)
+    - [2. Cannot call GPL only function from proprietary program](#2-cannot-call-gpl-only-function-from-proprietary-program)
+
+- [Environment Variables](#envvars)
+    - [1. kernel source directory](#1-kernel-source-directory)
+    - [2. kernel version overriding](#2-kernel-version-overriding)
+
+# BPF C
+
+This section describes the C part of a bcc program.
+
+## Events & Arguments
+
+### 1. kprobes
+
+Syntax: kprobe__*kernel_function_name*
+
+```kprobe__``` is a special prefix that creates a kprobe (dynamic tracing of a kernel function call) for the kernel function name provided as the remainder. You can also use kprobes by declaring a normal C function, then using the Python ```BPF.attach_kprobe()``` (covered later) to associate it with a kernel function.
+
+Arguments are specified on the function declaration: kprobe__*kernel_function_name*(struct pt_regs *ctx [, *argument1* ...])
+
+For example:
+
+```C
+int kprobe__tcp_v4_connect(struct pt_regs *ctx, struct sock *sk) {
+    [...]
+}
+```
+
+This instruments the tcp_v4_connect() kernel function using a kprobe, with the following arguments:
+
+- ```struct pt_regs *ctx```: Registers and BPF context.
+- ```struct sock *sk```: First argument to tcp_v4_connect().
+
+The first argument is always ```struct pt_regs *```, the remainder are the arguments to the function (they don't need to be specified, if you don't intend to use them).
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/tcpv4connect.py#L28) ([output](https://github.com/iovisor/bcc/blob/5bd0eb21fd148927b078deb8ac29fff2fb044b66/examples/tracing/tcpv4connect_example.txt#L8)),
+[code](https://github.com/iovisor/bcc/commit/310ab53710cfd46095c1f6b3e44f1dbc8d1a41d8#diff-8cd1822359ffee26e7469f991ce0ef00R26) ([output](https://github.com/iovisor/bcc/blob/3b9679a3bd9b922c736f6061dc65cb56de7e0250/examples/tracing/bitehist_example.txt#L6))
+<!--- I can't add search links here, since github currently cannot handle partial-word searches needed for "kprobe__" --->
+
+### 2. kretprobes
+
+Syntax: kretprobe__*kernel_function_name*
+
+```kretprobe__``` is a special prefix that creates a kretprobe (dynamic tracing of a kernel function return) for the kernel function name provided as the remainder. You can also use kretprobes by declaring a normal C function, then using the Python ```BPF.attach_kretprobe()``` (covered later) to associate it with a kernel function.
+
+Return value is available as ```PT_REGS_RC(ctx)```, given a function declaration of: kretprobe__*kernel_function_name*(struct pt_regs *ctx)
+
+For example:
+
+```C
+int kretprobe__tcp_v4_connect(struct pt_regs *ctx)
+{
+    int ret = PT_REGS_RC(ctx);
+    [...]
+}
+```
+
+This instruments the return of the tcp_v4_connect() kernel function using a kretprobe, and stores the return value in ```ret```.
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/tcpv4connect.py#L38) ([output](https://github.com/iovisor/bcc/blob/5bd0eb21fd148927b078deb8ac29fff2fb044b66/examples/tracing/tcpv4connect_example.txt#L8))
+
+### 3. Tracepoints
+
+Syntax: TRACEPOINT_PROBE(*category*, *event*)
+
+This is a macro that instruments the tracepoint defined by *category*:*event*.
+
+Arguments are available in an ```args``` struct, which are the tracepoint arguments. One way to list these is to cat the relevant format file under /sys/kernel/debug/tracing/events/*category*/*event*/format.
+
+The ```args``` struct can be used in place of ```ctx``` in each function requiring a context as an argument. This includes notably [perf_submit()](#3-perf_submit).
+
+For example:
+
+```C
+TRACEPOINT_PROBE(random, urandom_read) {
+    // args is from /sys/kernel/debug/tracing/events/random/urandom_read/format
+    bpf_trace_printk("%d\\n", args->got_bits);
+    return 0;
+}
+```
+
+This instruments the random:urandom_read tracepoint, and prints the tracepoint argument ```got_bits```.
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/blob/a4159da8c4ea8a05a3c6e402451f530d6e5a8b41/examples/tracing/urandomread.py#L19) ([output](https://github.com/iovisor/bcc/commit/e422f5e50ecefb96579b6391a2ada7f6367b83c4#diff-41e5ecfae4a3b38de5f4e0887ed160e5R10)),
+[search /examples](https://github.com/iovisor/bcc/search?q=TRACEPOINT_PROBE+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=TRACEPOINT_PROBE+path%3Atools&type=Code)
+
+### 4. uprobes
+
+These are instrumented by declaring a normal function in C, then associating it as a uprobe probe in Python via ```BPF.attach_uprobe()``` (covered later).
+
+Arguments can be examined using ```PT_REGS_PARM``` macros.
+
+For example:
+
+```C
+int count(struct pt_regs *ctx) {
+    char buf[64];
+    bpf_probe_read(&buf, sizeof(buf), (void *)PT_REGS_PARM1(ctx));
+    bpf_trace_printk("%s %d", buf, PT_REGS_PARM2(ctx));
+    return(0);
+}
+```
+
+This reads the first argument as a string, and then prints it with the second argument as an integer.
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/strlen_count.py#L26)
+
+### 5. uretprobes
+
+These are instrumented by declaring a normal function in C, then associating it as a uretprobe probe in Python via ```BPF.attach_uretprobe()``` (covered later).
+
+Return value is available as ```PT_REGS_RC(ctx)```, given a function declaration of: *function_name*(struct pt_regs *ctx)
+
+For example:
+
+```C
+BPF_HISTOGRAM(dist);
+int count(struct pt_regs *ctx) {
+    dist.increment(PT_REGS_RC(ctx));
+    return 0;
+}
+```
+
+This increments the bucket in the ```dist``` histogram that is indexed by the return value.
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/strlen_hist.py#L39) ([output](https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/examples/tracing/strlen_hist.py#L15)),
+[code](https://github.com/iovisor/bcc/blob/4afa96a71c5dbfc4c507c3355e20baa6c184a3a8/tools/bashreadline.py) ([output](https://github.com/iovisor/bcc/commit/aa87997d21e5c1a6a20e2c96dd25eb92adc8e85d#diff-2fd162f9e594206f789246ce97d62cf0R7))
+
+### 6. USDT probes
+
+These are User Statically-Defined Tracing (USDT) probes, which may be placed in some applications or libraries to provide a user-level equivalent of tracepoints. The primary BPF method provided for USDT support is ```enable_probe()```. USDT probes are instrumented by declaring a normal function in C, then associating it as a USDT probe in Python via ```USDT.enable_probe()```.
+
+Arguments can be read via: bpf_usdt_readarg(*index*, ctx, &addr)
+
+For example:
+
+```C
+int do_trace(struct pt_regs *ctx) {
+    uint64_t addr;
+    char path[128];
+    bpf_usdt_readarg(6, ctx, &addr);
+    bpf_probe_read(&path, sizeof(path), (void *)addr);
+    bpf_trace_printk("path:%s\\n", path);
+    return 0;
+};
+```
+
+This reads the sixth USDT argument, and then pulls it in as a string to ```path```.
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/commit/4f88a9401357d7b75e917abd994aa6ea97dda4d3#diff-04a7cad583be5646080970344c48c1f4R24),
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Atools&type=Code)
+
+### 7. Raw Tracepoints
+
+Syntax: RAW_TRACEPOINT_PROBE(*event*)
+
+This is a macro that instruments the raw tracepoint defined by *event*.
+
+The argument is a pointer to struct ```bpf_raw_tracepoint_args```, which is defined in [bpf.h](https://github.com/iovisor/bcc/blob/master/src/cc/compat/linux/bpf.h).  The struct field ```args``` contains all parameters of the raw tracepoint, which you can find in the Linux tree [include/trace/events](https://github.com/torvalds/linux/tree/master/include/trace/events)
+directory.
+
+For example:
+```C
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+    // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
+    struct task_struct *prev = (struct task_struct *)ctx->args[1];
+    struct task_struct *next = (struct task_struct *)ctx->args[2];
+    s32 prev_tgid, next_tgid;
+
+    bpf_probe_read(&prev_tgid, sizeof(prev->tgid), &prev->tgid);
+    bpf_probe_read(&next_tgid, sizeof(next->tgid), &next->tgid);
+    bpf_trace_printk("%d -> %d\\n", prev_tgid, next_tgid);
+}
+```
+
+This instruments the sched:sched_switch tracepoint, and prints the prev and next tgid.
+
+Examples in situ:
+[search /tools](https://github.com/iovisor/bcc/search?q=RAW_TRACEPOINT_PROBE+path%3Atools&type=Code)
+
+## Data
+
+### 1. bpf_probe_read()
+
+Syntax: ```int bpf_probe_read(void *dst, int size, const void *src)```
+
+Return: 0 on success
+
+This copies a memory location to the BPF stack, so that BPF can later operate on it. For safety, all memory reads must pass through bpf_probe_read(). This happens automatically in some cases, such as dereferencing kernel variables, as bcc will rewrite the BPF program to include the necessary bpf_probe_reads().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_probe_read+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_probe_read+path%3Atools&type=Code)
+
+### 2. bpf_probe_read_str()
+
+Syntax: ```int bpf_probe_read_str(void *dst, int size, const void *src)```
+
+Return:
+  - \> 0 length of the string including the trailing NULL on success
+  - \< 0 error
+
+This copies a `NULL` terminated string from memory location to BPF stack, so that BPF can later operate on it. In case the string length is smaller than size, the target is not padded with further `NULL` bytes. In case the string length is larger than size, just `size - 1` bytes are copied and the last byte is set to `NULL`.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_probe_read_str+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_probe_read_str+path%3Atools&type=Code)
+
+### 3. bpf_ktime_get_ns()
+
+Syntax: ```u64 bpf_ktime_get_ns(void)```
+
+Return: current time in nanoseconds
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_ktime_get_ns+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_ktime_get_ns+path%3Atools&type=Code)
+
+### 4. bpf_get_current_pid_tgid()
+
+Syntax: ```u64 bpf_get_current_pid_tgid(void)```
+
+Return: ```current->tgid << 32 | current->pid```
+
+Returns the process ID in the lower 32 bits (kernel's view of the PID, which in user space is usually presented as the thread ID), and the thread group ID in the upper 32 bits (what user space often thinks of as the PID). By directly setting this to a u32, we discard the upper 32 bits.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_get_current_pid_tgid+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_get_current_pid_tgid+path%3Atools&type=Code)
+
+### 5. bpf_get_current_uid_gid()
+
+Syntax: ```u64 bpf_get_current_uid_gid(void)```
+
+Return: ```current_gid << 32 | current_uid```
+
+Returns the user ID and group IDs.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_get_current_uid_gid+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_get_current_uid_gid+path%3Atools&type=Code)
+
+### 6. bpf_get_current_comm()
+
+Syntax: ```bpf_get_current_comm(char *buf, int size_of_buf)```
+
+Return: 0 on success
+
+Populates the first argument address with the current process name. It should be a pointer to a char array of at least size TASK_COMM_LEN, which is defined in linux/sched.h. For example:
+
+```C
+#include <linux/sched.h>
+
+int do_trace(struct pt_regs *ctx) {
+    char comm[TASK_COMM_LEN];
+    bpf_get_current_comm(&comm, sizeof(comm));
+[...]
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_get_current_comm+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_get_current_comm+path%3Atools&type=Code)
+
+### 7. bpf_get_current_task()
+
+Syntax: ```bpf_get_current_task()```
+
+Return: current task as a pointer to struct task_struct.
+
+Returns a pointer to the current task's task_struct object. This helper can be used to compute the on-CPU time for a process, identify kernel threads, get the current CPU's run queue, or retrieve many other pieces of information.
+
+With Linux 4.13, due to issues with field randomization, you may need two #define directives before the includes:
+```C
+#define randomized_struct_fields_start  struct {
+#define randomized_struct_fields_end    };
+#include <linux/sched.h>
+
+int do_trace(void *ctx) {
+    struct task_struct *t = (struct task_struct *)bpf_get_current_task();
+[...]
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_get_current_task+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_get_current_task+path%3Atools&type=Code)
+
+### 8. bpf_log2l()
+
+Syntax: ```unsigned int bpf_log2l(unsigned long v)```
+
+Returns the log-2 of the provided value. This is often used to create indexes for histograms, to construct power-of-2 histograms.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_log2l+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_log2l+path%3Atools&type=Code)
+
+### 9. bpf_get_prandom_u32()
+
+Syntax: ```u32 bpf_get_prandom_u32()```
+
+Returns a pseudo-random u32.
+
+Example in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_get_prandom_u32+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_get_prandom_u32+path%3Atools&type=Code)
+
+## Debugging
+
+### 1. bpf_override_return()
+
+Syntax: ```int bpf_override_return(struct pt_regs *, unsigned long rc)```
+
+Return: 0 on success
+
+When used in a program attached to a function entry kprobe, causes the
+execution of the function to be skipped, immediately returning `rc` instead.
+This is used for targeted error injection.
+
+bpf_override_return will only work when the kprobed function is whitelisted to
+allow error injections. Whitelisting entails tagging a function with
+`BPF_ALLOW_ERROR_INJECTION()` in the kernel source tree; see `io_ctl_init` for
+an example. If the kprobed function is not whitelisted, the bpf program will
+fail to attach with `ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument`.
+
+
+```C
+int kprobe__io_ctl_init(void *ctx) {
+	bpf_override_return(ctx, -ENOMEM);
+	return 0;
+}
+```
+
+## Output
+
+### 1. bpf_trace_printk()
+
+Syntax: ```int bpf_trace_printk(const char *fmt, int fmt_size, ...)```
+
+Return: 0 on success
+
+A simple kernel facility for printf() to the common trace_pipe (/sys/kernel/debug/tracing/trace_pipe). This is ok for some quick examples, but has limitations: 3 args max, 1 %s only, and trace_pipe is globally shared, so concurrent programs will have clashing output. A better interface is via BPF_PERF_OUTPUT().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=bpf_trace_printk+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=bpf_trace_printk+path%3Atools&type=Code)
+
+### 2. BPF_PERF_OUTPUT
+
+Syntax: ```BPF_PERF_OUTPUT(name)```
+
+Creates a BPF table for pushing out custom event data to user space via a perf ring buffer. This is the preferred method for pushing per-event data to user space.
+
+For example:
+
+```C
+struct data_t {
+    u32 pid;
+    u64 ts;
+    char comm[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(events);
+
+int hello(struct pt_regs *ctx) {
+    struct data_t data = {};
+
+    data.pid = bpf_get_current_pid_tgid();
+    data.ts = bpf_ktime_get_ns();
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+```
+
+The output table is named ```events```, and data is pushed to it via ```events.perf_submit()```.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_PERF_OUTPUT+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF_PERF_OUTPUT+path%3Atools&type=Code)
+
+### 3. perf_submit()
+
+Syntax: ```int perf_submit((void *)ctx, (void *)data, u32 data_size)```
+
+Return: 0 on success
+
+A method of a BPF_PERF_OUTPUT table, for submitting custom event data to user space. See the BPF_PERF_OUTPUT entry. (This ultimately calls bpf_perf_event_output().)
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=perf_submit+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=perf_submit+path%3Atools&type=Code)
+
+## Maps
+
+Maps are BPF data stores, and are the basis for higher level object types including tables, hashes, and histograms.
+
+### 1. BPF_TABLE
+
+Syntax: ```BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries)```
+
+Creates a map named ```_name```. Most of the time this will be used via higher-level macros, like BPF_HASH, BPF_HIST, etc.
+
+Methods (covered later): map.lookup(), map.lookup_or_init(), map.delete(), map.update(), map.insert(), map.increment().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_TABLE+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF_TABLE+path%3Atools&type=Code)
+
+### 2. BPF_HASH
+
+Syntax: ```BPF_HASH(name [, key_type [, leaf_type [, size]]])```
+
+Creates a hash map (associative array) named ```name```, with optional parameters.
+
+Defaults: ```BPF_HASH(name, key_type=u64, leaf_type=u64, size=10240)```
+
+For example:
+
+```C
+BPF_HASH(start, struct request *);
+```
+
+This creates a hash named ```start``` where the key is a ```struct request *```, and the value defaults to u64. This hash is used by the disksnoop.py example for saving timestamps for each I/O request, where the key is the pointer to struct request, and the value is the timestamp.
+
+Methods (covered later): map.lookup(), map.lookup_or_init(), map.delete(), map.update(), map.insert(), map.increment().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_HASH+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF_HASH+path%3Atools&type=Code)
+
+### 3. BPF_ARRAY
+
+Syntax: ```BPF_ARRAY(name [, leaf_type [, size]])```
+
+Creates an int-indexed array which is optimized for fastest lookup and update, named ```name```, with optional parameters.
+
+Defaults: ```BPF_ARRAY(name, leaf_type=u64, size=10240)```
+
+For example:
+
+```C
+BPF_ARRAY(counts, u64, 32);
+```
+
+This creates an array named ```counts``` with 32 buckets and 64-bit integer values. This array is used by the funccount.py example for saving call count of each function.
+
+Methods (covered later): map.lookup(), map.update(), map.increment(). Note that all array elements are pre-allocated with zero values and can not be deleted.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_ARRAY+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF_ARRAY+path%3Atools&type=Code)
+
+### 4. BPF_HISTOGRAM
+
+Syntax: ```BPF_HISTOGRAM(name [, key_type [, size ]])```
+
+Creates a histogram map named ```name```, with optional parameters.
+
+Defaults: ```BPF_HISTOGRAM(name, key_type=int, size=64)```
+
+For example:
+
+```C
+BPF_HISTOGRAM(dist);
+```
+
+This creates a histogram named ```dist```, which defaults to 64 buckets indexed by keys of type int.
+
+Methods (covered later): map.increment().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_HISTOGRAM+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF_HISTOGRAM+path%3Atools&type=Code)
+
+### 5. BPF_STACK_TRACE
+
+Syntax: ```BPF_STACK_TRACE(name, max_entries)```
+
+Creates a stack trace map named ```name```, with a maximum entry count provided. These maps are used to store stack traces.
+
+For example:
+
+```C
+BPF_STACK_TRACE(stack_traces, 1024);
+```
+
+This creates stack trace map named ```stack_traces```, with a maximum number of stack trace entries of 1024.
+
+Methods (covered later): map.get_stackid().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_STACK_TRACE+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF_STACK_TRACE+path%3Atools&type=Code)
+
+### 6. BPF_PERF_ARRAY
+
+Syntax: ```BPF_PERF_ARRAY(name, max_entries)```
+
+Creates perf array named ```name```, with a maximum entry count provided, which must be equal to the number of system cpus. These maps are used to fetch hardware performance counters.
+
+For example:
+
+```C
+text="""
+BPF_PERF_ARRAY(cpu_cycles, NUM_CPUS);
+"""
+b = bcc.BPF(text=text, cflags=["-DNUM_CPUS=%d" % multiprocessing.cpu_count()])
+b["cpu_cycles"].open_perf_event(b["cpu_cycles"].HW_CPU_CYCLES)
+```
+
+This creates a perf array named ```cpu_cycles```, with number of entries equal to the number of cpus/cores. The array is configured so that later calling map.perf_read() will return a hardware-calculated counter of the number of cycles elapsed from some point in the past. Only one type of hardware counter may be configured per table at a time.
+
+Methods (covered later): map.perf_read().
+
+Examples in situ:
+[search /tests](https://github.com/iovisor/bcc/search?q=BPF_PERF_ARRAY+path%3Atests&type=Code)
+
+### 7. BPF_PERCPU_ARRAY
+
+Syntax: ```BPF_PERCPU_ARRAY(name [, leaf_type [, size]])```
+
+Creates NUM_CPU int-indexed arrays which are optimized for fastest lookup and update, named ```name```, with optional parameters. Each CPU will have a separate copy of this array. The copies are not kept synchronized in any way.
+
+
+Defaults: ```BPF_PERCPU_ARRAY(name, leaf_type=u64, size=10240)```
+
+For example:
+
+```C
+BPF_PERCPU_ARRAY(counts, u64, 32);
+```
+
+This creates NUM_CPU arrays named ```counts``` with 32 buckets and 64-bit integer values.
+
+Methods (covered later): map.lookup(), map.update(), map.increment(). Note that all array elements are pre-allocated with zero values and can not be deleted.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_PERCPU_ARRAY+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF_PERCPU_ARRAY+path%3Atools&type=Code)
+
+### 8. BPF_LPM_TRIE
+
+Syntax: `BPF_LPM_TRIE(name [, key_type [, leaf_type [, size]]])`
+
+Creates a longest prefix match trie map named `name`, with optional parameters.
+
+Defaults: `BPF_LPM_TRIE(name, key_type=u64, leaf_type=u64, size=10240)`
+
+For example:
+
+```c
+BPF_LPM_TRIE(trie, struct key_v6);
+```
+
+This creates an LPM trie map named `trie` where the key is a `struct key_v6`, and the value defaults to u64.
+
+Methods (covered later): map.lookup(), map.lookup_or_init(), map.delete(), map.update(), map.insert(), map.increment().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_LPM_TRIE+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF_LPM_TRIE+path%3Atools&type=Code)
+
+### 9. BPF_PROG_ARRAY
+
+Syntax: ```BPF_PROG_ARRAY(name, size)```
+
+This creates a program array named ```name``` with ```size``` entries. Each entry of the array is either a file descriptor to a bpf program or ```NULL```. The array acts as a jump table so that bpf programs can "tail-call" other bpf programs.
+
+Methods (covered later): map.call().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_PROG_ARRAY+path%3Aexamples&type=Code),
+[search /tests](https://github.com/iovisor/bcc/search?q=BPF_PROG_ARRAY+path%3Atests&type=Code),
+[assign fd](https://github.com/iovisor/bcc/blob/master/examples/networking/tunnel_monitor/monitor.py#L24-L26)
+
+### 10. BPF_DEVMAP
+
+Syntax: ```BPF_DEVMAP(name, size)```
+
+This creates a device map named ```name``` with ```size``` entries. Each entry of the map is an `ifindex` to a network interface. This map is only used in XDP.
+
+For example:
+```C
+BPF_DEVMAP(devmap, 10);
+```
+
+Methods (covered later): map.redirect_map().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_DEVMAP+path%3Aexamples&type=Code),
+
+### 11. BPF_CPUMAP
+
+Syntax: ```BPF_CPUMAP(name, size)```
+
+This creates a cpu map named ```name``` with ```size``` entries. The index of the map represents the CPU id and each entry is the size of the ring buffer allocated for the CPU. This map is only used in XDP.
+
+For example:
+```C
+BPF_CPUMAP(cpumap, 16);
+```
+
+Methods (covered later): map.redirect_map().
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF_CPUMAP+path%3Aexamples&type=Code),
+
+### 12. map.lookup()
+
+Syntax: ```*val map.lookup(&key)```
+
+Lookup the key in the map, and return a pointer to its value if it exists, else NULL. We pass the key in as an address to a pointer.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=lookup+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=lookup+path%3Atools&type=Code)
+
+### 13. map.lookup_or_init()
+
+Syntax: ```*val map.lookup_or_init(&key, &zero)```
+
+Lookup the key in the map, and return a pointer to its value if it exists, else initialize the key's value to the second argument. This is often used to initialize values to zero.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=lookup_or_init+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=lookup_or_init+path%3Atools&type=Code)
+
+### 14. map.delete()
+
+Syntax: ```map.delete(&key)```
+
+Delete the key from the hash.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=delete+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=delete+path%3Atools&type=Code)
+
+### 15. map.update()
+
+Syntax: ```map.update(&key, &val)```
+
+Associate the value in the second argument to the key, overwriting any previous value.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=update+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=update+path%3Atools&type=Code)
+
+### 16. map.insert()
+
+Syntax: ```map.insert(&key, &val)```
+
+Associate the value in the second argument to the key, only if there was no previous value.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=insert+path%3Aexamples&type=Code)
+
+### 17. map.increment()
+
+Syntax: ```map.increment(key[, increment_amount])```
+
+Increments the key's value by `increment_amount`, which defaults to 1. Used for histograms.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=increment+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=increment+path%3Atools&type=Code)
+
+### 18. map.get_stackid()
+
+Syntax: ```int map.get_stackid(void *ctx, u64 flags)```
+
+This walks the stack found via the struct pt_regs in ```ctx```, saves it in the stack trace map, and returns a unique ID for the stack trace.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=get_stackid+path%3Aexamples&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=get_stackid+path%3Atools&type=Code)
+
+### 19. map.perf_read()
+
+Syntax: ```u64 map.perf_read(u32 cpu)```
+
+This returns the hardware performance counter as configured in [6. BPF_PERF_ARRAY](#6-bpf_perf_array)
+
+Examples in situ:
+[search /tests](https://github.com/iovisor/bcc/search?q=perf_read+path%3Atests&type=Code)
+
+### 20. map.call()
+
+Syntax: ```void map.call(void *ctx, int index)```
+
+This invokes ```bpf_tail_call()``` to tail-call the bpf program which the ```index``` entry in [9. BPF_PROG_ARRAY](#9-bpf_prog_array) points to. A tail-call is different from the normal call. It reuses the current stack frame after jumping to another bpf program and never goes back. If the ```index``` entry is empty, it won't jump anywhere and the program execution continues as normal.
+
+For example:
+
+```C
+BPF_PROG_ARRAY(prog_array, 10);
+
+int tail_call(void *ctx) {
+    bpf_trace_printk("Tail-call\n");
+    return 0;
+}
+
+int do_tail_call(void *ctx) {
+    bpf_trace_printk("Original program\n");
+    prog_array.call(ctx, 2);
+    return 0;
+}
+```
+
+```Python
+b = BPF(src_file="example.c")
+tail_fn = b.load_func("tail_call", BPF.KPROBE)
+prog_array = b.get_table("prog_array")
+prog_array[c_int(2)] = c_int(tail_fn.fd)
+b.attach_kprobe(event="some_kprobe_event", fn_name="do_tail_call")
+```
+
+This assigns ```tail_call()``` to ```prog_array[2]```. In the end of ```do_tail_call()```, ```prog_array.call(ctx, 2)``` tail-calls ```tail_call()``` and executes it.
+
+**NOTE:** To prevent infinite loop, the maximum number of tail-calls is 32 ([```MAX_TAIL_CALL_CNT```](https://github.com/torvalds/linux/search?l=C&q=MAX_TAIL_CALL_CNT+path%3Ainclude%2Flinux&type=Code)).
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?l=C&q=call+path%3Aexamples&type=Code),
+[search /tests](https://github.com/iovisor/bcc/search?l=C&q=call+path%3Atests&type=Code)
+
+### 21. map.redirect_map()
+
+Syntax: ```int map.redirect_map(int index, int flags)```
+
+This redirects the incoming packets based on the ```index``` entry. If the map is [10. BPF_DEVMAP](#10-bpf_devmap), the packet will be sent to the transmit queue of the network interface that the entry points to. If the map is [11. BPF_CPUMAP](#11-bpf_cpumap), the packet will be sent to the ring buffer of the ```index``` CPU and be processed by the CPU later.
+
+If the packet is redirected successfully, the function will return XDP_REDIRECT. Otherwise, it will return XDP_ABORTED to discard the packet.
+
+For example:
+```C
+BPF_DEVMAP(devmap, 1);
+
+int redirect_example(struct xdp_md *ctx) {
+    return devmap.redirect_map(0, 0);
+}
+int xdp_dummy(struct xdp_md *ctx) {
+    return XDP_PASS;
+}
+```
+
+```Python
+ip = pyroute2.IPRoute()
+idx = ip.link_lookup(ifname="eth1")[0]
+
+b = bcc.BPF(src_file="example.c")
+
+devmap = b.get_table("devmap")
+devmap[c_uint32(0)] = c_int(idx)
+
+in_fn = b.load_func("redirect_example", BPF.XDP)
+out_fn = b.load_func("xdp_dummy", BPF.XDP)
+b.attach_xdp("eth0", in_fn, 0)
+b.attach_xdp("eth1", out_fn, 0)
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?l=C&q=redirect_map+path%3Aexamples&type=Code),
+
+## Licensing
+
+Depending on which [BPF helpers](kernel-versions.md#helpers) are used, a GPL-compatible license is required.
+
+The special BCC macro `BPF_LICENSE` specifies the license of the BPF program. You can set the license as a comment in your source code, but the kernel has a special interface to specify it programmatically. If you need to use GPL-only helpers, it is recommended to specify the macro in your C code so that the kernel can understand it:
+
+```C
+// SPDX-License-Identifier: GPL-2.0+
+#define BPF_LICENSE GPL
+```
+
+Otherwise, the kernel may reject loading your program (see the [error description](#2-cannot-call-gpl-only-function-from-proprietary-program) below). Note that it supports multiple words and quotes are not necessary:
+
+```C
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-2-Clause
+#define BPF_LICENSE Dual BSD/GPL
+```
+
+Check the [BPF helpers reference](kernel-versions.md#helpers) to see which helpers are GPL-only and what the kernel understands as GPL-compatible.
+
+**If the macro is not specified, BCC will automatically define the license of the program as GPL.**
+
+# bcc Python
+
+## Initialization
+
+Constructors.
+
+### 1. BPF
+
+Syntax: ```BPF({text=BPF_program | src_file=filename} [, usdt_contexts=[USDT_object, ...]] [, cflags=[arg1, ...]] [, debug=int])```
+
+Creates a BPF object. This is the main object for defining a BPF program, and interacting with its output.
+
+Exactly one of `text` or `src_file` must be supplied (not both).
+
+The `cflags` specifies additional arguments to be passed to the compiler, for example `-DMACRO_NAME=value` or `-I/include/path`.  The arguments are passed as an array, with each element being an additional argument.  Note that strings are not split on whitespace, so each argument must be a different element of the array, e.g. `["-include", "header.h"]`.
+
+The `debug` flags control debug output, and can be or'ed together:
+- `DEBUG_LLVM_IR = 0x1` compiled LLVM IR
+- `DEBUG_BPF = 0x2` loaded BPF bytecode and register state on branches
+- `DEBUG_PREPROCESSOR = 0x4` pre-processor result
+- `DEBUG_SOURCE = 0x8` ASM instructions embedded with source
+- `DEBUG_BPF_REGISTER_STATE = 0x10` register state on all instructions in addition to DEBUG_BPF
+
+Examples:
+
+```Python
+# define entire BPF program in one line:
+BPF(text='int do_trace(void *ctx) { bpf_trace_printk("hit!\\n"); return 0; }');
+
+# define program as a variable:
+prog = """
+int hello(void *ctx) {
+    bpf_trace_printk("Hello, World!\\n");
+    return 0;
+}
+"""
+b = BPF(text=prog)
+
+# source a file:
+b = BPF(src_file = "vfsreadlat.c")
+
+# include a USDT object:
+u = USDT(pid=int(pid))
+[...]
+b = BPF(text=bpf_text, usdt_contexts=[u])
+
+# add include paths:
+u = BPF(text=prog, cflags=["-I/path/to/include"])
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=BPF+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=BPF+path%3Atools+language%3Apython&type=Code)
+
+### 2. USDT
+
+Syntax: ```USDT({pid=pid | path=path})```
+
+Creates an object to instrument User Statically-Defined Tracing (USDT) probes. Its primary method is ```enable_probe()```.
+
+Arguments:
+
+- pid: attach to this process ID.
+- path: instrument USDT probes from this binary path.
+
+Examples:
+
+```Python
+# include a USDT object:
+u = USDT(pid=int(pid))
+[...]
+b = BPF(text=bpf_text, usdt_contexts=[u])
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=USDT+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=USDT+path%3Atools+language%3Apython&type=Code)
+
+## Events
+
+### 1. attach_kprobe()
+
+Syntax: ```BPF.attach_kprobe(event="event", fn_name="name")```
+
+Instruments the kernel function ```event()``` using kernel dynamic tracing of the function entry, and attaches our C defined function ```name()``` to be called when the kernel function is called.
+
+For example:
+
+```Python
+b.attach_kprobe(event="sys_clone", fn_name="do_trace")
+```
+
+This will instrument the kernel ```sys_clone()``` function, which will then run our BPF defined ```do_trace()``` function each time it is called.
+
+You can call attach_kprobe() more than once, and attach your BPF function to multiple kernel functions.
+
+See the previous kprobes section for how to instrument arguments from BPF.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=attach_kprobe+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=attach_kprobe+path%3Atools+language%3Apython&type=Code)
+
+### 2. attach_kretprobe()
+
+Syntax: ```BPF.attach_kretprobe(event="event", fn_name="name")```
+
+Instruments the return of the kernel function ```event()``` using kernel dynamic tracing of the function return, and attaches our C defined function ```name()``` to be called when the kernel function returns.
+
+For example:
+
+```Python
+b.attach_kretprobe(event="vfs_read", fn_name="do_return")
+```
+
+This will instrument the kernel ```vfs_read()``` function, which will then run our BPF defined ```do_return()``` function each time it is called.
+
+You can call attach_kretprobe() more than once, and attach your BPF function to multiple kernel function returns.
+
+See the previous kretprobes section for how to instrument the return value from BPF.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=attach_kretprobe+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=attach_kretprobe+path%3Atools+language%3Apython&type=Code)
+
+### 3. attach_tracepoint()
+
+Syntax: ```BPF.attach_tracepoint(tp="tracepoint", fn_name="name")```
+
+Instruments the kernel tracepoint described by ```tracepoint```, and when hit, runs the BPF function ```name()```.
+
+This is an explicit way to instrument tracepoints. The ```TRACEPOINT_PROBE``` syntax, covered in the earlier tracepoints section, is an alternate method with the advantage of auto-declaring an ```args``` struct containing the tracepoint arguments. With ```attach_tracepoint()```, the tracepoint arguments need to be declared in the BPF program.
+
+For example:
+
+```Python
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+struct urandom_read_args {
+    // from /sys/kernel/debug/tracing/events/random/urandom_read/format
+    u64 __unused__;
+    u32 got_bits;
+    u32 pool_left;
+    u32 input_left;
+};
+
+int printarg(struct urandom_read_args *args) {
+    bpf_trace_printk("%d\\n", args->got_bits);
+    return 0;
+};
+"""
+
+# load BPF program
+b = BPF(text=bpf_text)
+b.attach_tracepoint("random:urandom_read", "printarg")
+```
+
+Notice how the first argument to ```printarg()``` is now our defined struct.
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/blob/a4159da8c4ea8a05a3c6e402451f530d6e5a8b41/examples/tracing/urandomread-explicit.py#L41)
+
+### 4. attach_uprobe()
+
+Syntax: ```BPF.attach_uprobe(name="location", sym="symbol", fn_name="name")```
+
+Instruments the user-level function ```symbol()``` from either the library or binary named by ```location``` using user-level dynamic tracing of the function entry, and attach our C defined function ```name()``` to be called whenever the user-level function is called.
+
+Libraries can be given in the name argument without the lib prefix, or with the full path (/usr/lib/...). Binaries can be given only with the full path (/bin/sh).
+
+For example:
+
+```Python
+b.attach_uprobe(name="c", sym="strlen", fn_name="count")
+```
+
+This will instrument ```strlen()``` function from libc, and call our BPF function ```count()``` when it is called. Note how the "lib" in "libc" is not necessary to specify.
+
+Other examples:
+
+```Python
+b.attach_uprobe(name="c", sym="getaddrinfo", fn_name="do_entry")
+b.attach_uprobe(name="/usr/bin/python", sym="main", fn_name="do_main")
+```
+
+You can call attach_uprobe() more than once, and attach your BPF function to multiple user-level functions.
+
+See the previous uprobes section for how to instrument arguments from BPF.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=attach_uprobe+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=attach_uprobe+path%3Atools+language%3Apython&type=Code)
+
+### 5. attach_uretprobe()
+
+Syntax: ```BPF.attach_uretprobe(name="location", sym="symbol", fn_name="name")```
+
+Instruments the return of the user-level function ```symbol()``` from either the library or binary named by ```location``` using user-level dynamic tracing of the function return, and attach our C defined function ```name()``` to be called whenever the user-level function returns.
+
+For example:
+
+```Python
+b.attach_uretprobe(name="c", sym="strlen", fn_name="count")
+```
+
+This will instrument ```strlen()``` function from libc, and call our BPF function ```count()``` when it returns.
+
+Other examples:
+
+```Python
+b.attach_uprobe(name="c", sym="getaddrinfo", fn_name="do_entry")
+b.attach_uprobe(name="/usr/bin/python", sym="main", fn_name="do_main")
+```
+
+You can call attach_uretprobe() more than once, and attach your BPF function to multiple user-level functions.
+
+See the previous uretprobes section for how to instrument the return value from BPF.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=attach_uretprobe+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=attach_uretprobe+path%3Atools+language%3Apython&type=Code)
+
+### 6. USDT.enable_probe()
+
+Syntax: ```USDT.enable_probe(probe=probe, fn_name=name)```
+
+Attaches a BPF C function ```name``` to the USDT probe ```probe```.
+
+Example:
+
+```Python
+# enable USDT probe from given PID
+u = USDT(pid=int(pid))
+u.enable_probe(probe="http__server__request", fn_name="do_trace")
+```
+
+To check if your binary has USDT probes, and what they are, you can run ```readelf -n binary``` and check the stap debug section.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Atools+language%3Apython&type=Code)
+
+### 7. attach_raw_tracepoint()
+
+Syntax: ```BPF.attach_raw_tracepoint(tp="tracepoint", fn_name="name")```
+
+Instruments the kernel raw tracepoint described by ```tracepoint``` (```event``` only, no ```category```), and when hit, runs the BPF function ```name()```.
+
+This is an explicit way to instrument tracepoints. The ```RAW_TRACEPOINT_PROBE``` syntax, covered in the earlier raw tracepoints section, is an alternate method.
+
+For example:
+
+```Python
+b.attach_raw_tracepoint("sched_switch", "do_trace")
+```
+
+Examples in situ:
+[search /tools](https://github.com/iovisor/bcc/search?q=attach_raw_tracepoint+path%3Atools+language%3Apython&type=Code)
+
+## Debug Output
+
+### 1. trace_print()
+
+Syntax: ```BPF.trace_print(fmt="fields")```
+
+This method continually reads the globally shared /sys/kernel/debug/tracing/trace_pipe file and prints its contents. This file can be written to via BPF and the bpf_trace_printk() function, however, that method has limitations, including a lack of concurrent tracing support. The BPF_PERF_OUTPUT mechanism, covered earlier, is preferred.
+
+Arguments:
+
+- ```fmt```: optional, and can contain a field formatting string. It defaults to ```None```.
+
+Examples:
+
+```Python
+# print trace_pipe output as-is:
+b.trace_print()
+
+# print PID and message:
+b.trace_print(fmt="{1} {5}")
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=trace_print+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=trace_print+path%3Atools+language%3Apython&type=Code)
+
+### 2. trace_fields()
+
+Syntax: ```BPF.trace_fields(nonblocking=False)```
+
+This method reads one line from the globally shared /sys/kernel/debug/tracing/trace_pipe file and returns it as fields. This file can be written to via BPF and the bpf_trace_printk() function, however, that method has limitations, including a lack of concurrent tracing support. The BPF_PERF_OUTPUT mechanism, covered earlier, is preferred.
+
+Arguments:
+
+- ```nonblocking```: optional, defaults to ```False```. When set to ```True```, the program will not block waiting for input.
+
+Examples:
+
+```Python
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    [...]
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=trace_fields+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=trace_fields+path%3Atools+language%3Apython&type=Code)
+
+## Output
+
+Normal output from a BPF program is either:
+
+- per-event: using PERF_EVENT_OUTPUT, open_perf_buffer(), and perf_buffer_poll().
+- map summary: using items(), or print_log2_hist(), covered in the Maps section.
+
+### 1. perf_buffer_poll()
+
+Syntax: ```BPF.perf_buffer_poll()```
+
+This polls from all open perf ring buffers, calling the callback function that was provided when calling open_perf_buffer for each entry.
+
+Example:
+
+```Python
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
+```
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/blob/08fbceb7e828f0e3e77688497727c5b2405905fd/examples/tracing/hello_perf_output.py#L61),
+[search /examples](https://github.com/iovisor/bcc/search?q=perf_buffer_poll+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=perf_buffer_poll+path%3Atools+language%3Apython&type=Code)
+
+## Maps
+
+Maps are BPF data stores, and are used in bcc to implement a table, and then higher level objects on top of tables, including hashes and histograms.
+
+### 1. get_table()
+
+Syntax: ```BPF.get_table(name)```
+
+Returns a table object. This is no longer used, as tables can now be read as items from BPF. Eg: ```BPF[name]```.
+
+Examples:
+
+```Python
+counts = b.get_table("counts")
+
+counts = b["counts"]
+```
+
+These are equivalent.
+
+### 2. open_perf_buffer()
+
+Syntax: ```table.open_perf_buffer(callback, page_cnt=N, lost_cb=None)```
+
+This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space. The size of the perf ring buffer can be specified via the ```page_cnt``` parameter, which must be a power of two number of pages and defaults to 8. If the callback is not processing data fast enough, some submitted data may be lost. ```lost_cb``` will be called to log / monitor the lost count. If ```lost_cb``` is the default ```None``` value, it will just print a line of message to ```stderr```.
+
+Example:
+
+```Python
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    [...]
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
+```
+
+Note that the data structure transferred will need to be declared in C in the BPF program, and in Python. For example:
+
+```C
+// define output data structure in C
+struct data_t {
+    u32 pid;
+    u64 ts;
+    char comm[TASK_COMM_LEN];
+};
+```
+
+```Python
+# define output data structure in Python
+TASK_COMM_LEN = 16    # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [("pid", ct.c_ulonglong),
+                ("ts", ct.c_ulonglong),
+                ("comm", ct.c_char * TASK_COMM_LEN)]
+```
+
+Perhaps in a future bcc version, the Python data structure will be automatically generated from the C declaration.
+
+Examples in situ:
+[code](https://github.com/iovisor/bcc/blob/08fbceb7e828f0e3e77688497727c5b2405905fd/examples/tracing/hello_perf_output.py#L59),
+[search /examples](https://github.com/iovisor/bcc/search?q=open_perf_buffer+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=open_perf_buffer+path%3Atools+language%3Apython&type=Code)
+
+### 3. items()
+
+Syntax: ```table.items()```
+
+Returns an array of the keys in a table. This can be used with BPF_HASH maps to fetch, and iterate, over the keys.
+
+Example:
+
+```Python
+# print output
+print("%10s %s" % ("COUNT", "STRING"))
+counts = b.get_table("counts")
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+    print("%10d \"%s\"" % (v.value, k.c.encode('string-escape')))
+```
+
+This example also uses the ```sorted()``` method to sort by value.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=items+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=items+path%3Atools+language%3Apython&type=Code)
+
+### 4. values()
+
+Syntax: ```table.values()```
+
+Returns an array of the values in a table.
+
+### 5. clear()
+
+Syntax: ```table.clear()```
+
+Clears the table: deletes all entries.
+
+Example:
+
+```Python
+# print map summary every second:
+while True:
+    time.sleep(1)
+    print("%-8s\n" % time.strftime("%H:%M:%S"), end="")
+    dist.print_log2_hist(sym + " return:")
+    dist.clear()
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=clear+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=clear+path%3Atools+language%3Apython&type=Code)
+
+### 6. print_log2_hist()
+
+Syntax: ```table.print_log2_hist(val_type="value", section_header="Bucket ptr", section_print_fn=None)```
+
+Prints a table as a log2 histogram in ASCII. The table must be stored as log2, which can be done using the BPF function ```bpf_log2l()```.
+
+Arguments:
+
+- val_type: optional, column header.
+- section_header: if the histogram has a secondary key, multiple tables will print and section_header can be used as a header description for each.
+- section_print_fn: if section_print_fn is not None, it will be passed the bucket value.
+
+Example:
+
+```Python
+b = BPF(text="""
+BPF_HISTOGRAM(dist);
+
+int kprobe__blk_account_io_completion(struct pt_regs *ctx, struct request *req)
+{
+	dist.increment(bpf_log2l(req->__data_len / 1024));
+	return 0;
+}
+""")
+[...]
+
+b["dist"].print_log2_hist("kbytes")
+```
+
+Output:
+
+```
+     kbytes          : count     distribution
+       0 -> 1        : 3        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 211      |**********                            |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 800      |**************************************|
+```
+
+This output shows a multi-modal distribution, with the largest mode of 128->255 kbytes and a count of 800.
+
+This is an efficient way to summarize data, as the summarization is performed in-kernel, and only the count column is passed to user space.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=print_log2_hist+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=print_log2_hist+path%3Atools+language%3Apython&type=Code)
+
+### 7. print_linear_hist()
+
+Syntax: ```table.print_linear_hist(val_type="value", section_header="Bucket ptr", section_print_fn=None)```
+
+Prints a table as a linear histogram in ASCII. This is intended to visualize small integer ranges, eg, 0 to 100.
+
+Arguments:
+
+- val_type: optional, column header.
+- section_header: if the histogram has a secondary key, multiple tables will print and section_header can be used as a header description for each.
+- section_print_fn: if section_print_fn is not None, it will be passed the bucket value.
+
+Example:
+
+```Python
+b = BPF(text="""
+BPF_HISTOGRAM(dist);
+
+int kprobe__blk_account_io_completion(struct pt_regs *ctx, struct request *req)
+{
+	dist.increment(req->__data_len / 1024);
+	return 0;
+}
+""")
+[...]
+
+b["dist"].print_linear_hist("kbytes")
+```
+
+Output:
+
+```
+     kbytes        : count     distribution
+        0          : 3        |******                                  |
+        1          : 0        |                                        |
+        2          : 0        |                                        |
+        3          : 0        |                                        |
+        4          : 19       |****************************************|
+        5          : 0        |                                        |
+        6          : 0        |                                        |
+        7          : 0        |                                        |
+        8          : 4        |********                                |
+        9          : 0        |                                        |
+        10         : 0        |                                        |
+        11         : 0        |                                        |
+        12         : 0        |                                        |
+        13         : 0        |                                        |
+        14         : 0        |                                        |
+        15         : 0        |                                        |
+        16         : 2        |****                                    |
+[...]
+```
+
+This is an efficient way to summarize data, as the summarization is performed in-kernel, and only the values in the count column are passed to user space.
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=print_linear_hist+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=print_linear_hist+path%3Atools+language%3Apython&type=Code)
+
+## Helpers
+
+Some helper methods provided by bcc. Note that since we're in Python, we can import any Python library and their methods, including, for example, the libraries: argparse, collections, ctypes, datetime, re, socket, struct, subprocess, sys, and time.
+
+### 1. ksym()
+
+Syntax: ```BPF.ksym(addr)```
+
+Translate a kernel memory address into a kernel function name, which is returned.
+
+Example:
+
+```Python
+print("kernel function: " + b.ksym(addr))
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=ksym+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=ksym+path%3Atools+language%3Apython&type=Code)
+
+### 2. ksymname()
+
+Syntax: ```BPF.ksymname(name)```
+
+Translate a kernel name into an address. This is the reverse of ksym. Returns -1 when the function name is unknown.
+
+Example:
+
+```Python
+print("kernel address: %x" % b.ksymname("vfs_read"))
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=ksymname+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=ksymname+path%3Atools+language%3Apython&type=Code)
+
+### 3. sym()
+
+Syntax: ```BPF.sym(addr, pid, show_module=False, show_offset=False)```
+
+Translate a memory address into a function name for a pid, which is returned. A pid of less than zero will access the kernel symbol cache. The `show_module` and `show_offset` parameters control whether the module in which the symbol lies should be displayed, and whether the instruction offset from the beginning of the symbol should be displayed. These extra parameters default to `False`.
+
+Example:
+
+```Python
+print("function: " + b.sym(addr, pid))
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=sym+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=sym+path%3Atools+language%3Apython&type=Code)
+
+### 4. num_open_kprobes()
+
+Syntax: ```BPF.num_open_kprobes()```
+
+Returns the number of open k[ret]probes. Can be useful for scenarios where event_re is used while attaching and detaching probes. Excludes perf_events readers.
+
+Example:
+
+```Python
+b.attach_kprobe(event_re=pattern, fn_name="trace_count")
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("0 functions matched by \"%s\". Exiting." % args.pattern)
+    exit()
+```
+
+Examples in situ:
+[search /examples](https://github.com/iovisor/bcc/search?q=num_open_kprobes+path%3Aexamples+language%3Apython&type=Code),
+[search /tools](https://github.com/iovisor/bcc/search?q=num_open_kprobes+path%3Atools+language%3Apython&type=Code)
+
+# BPF Errors
+
+See the "Understanding eBPF verifier messages" section in the kernel source under Documentation/networking/filter.txt.
+
+## 1. Invalid mem access
+
+This can be due to trying to read memory directly, instead of operating on memory on the BPF stack. All memory reads must be passed via bpf_probe_read() to copy memory into the BPF stack, which can be automatic by the bcc rewriter in some cases of simple dereferencing. bpf_probe_read() does all the required checks.
+
+Example:
+
+```
+bpf: Permission denied
+0: (bf) r6 = r1
+1: (79) r7 = *(u64 *)(r6 +80)
+2: (85) call 14
+3: (bf) r8 = r0
+[...]
+23: (69) r1 = *(u16 *)(r7 +16)
+R7 invalid mem access 'inv'
+
+Traceback (most recent call last):
+  File "./tcpaccept", line 179, in <module>
+    b = BPF(text=bpf_text)
+  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 172, in __init__
+    self._trace_autoload()
+  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 612, in _trace_autoload
+    fn = self.load_func(func_name, BPF.KPROBE)
+  File "/usr/lib/python2.7/dist-packages/bcc/__init__.py", line 212, in load_func
+    raise Exception("Failed to load BPF program %s" % func_name)
+Exception: Failed to load BPF program kretprobe__inet_csk_accept
+```
+
+## 2. Cannot call GPL only function from proprietary program
+
+This error happens when a GPL-only helper is called from a non-GPL BPF program. To fix this error, do not use GPL-only helpers from a proprietary BPF program, or relicense the BPF program under a GPL-compatible license. Check which [BPF helpers](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md#helpers) are GPL-only, and what licenses are considered GPL-compatible.
+
+Example calling `bpf_get_stackid()`, a GPL-only BPF helper, from a proprietary program (`#define BPF_LICENSE Proprietary`):
+
+```
+bpf: Failed to load program: Invalid argument
+[...]
+8: (85) call bpf_get_stackid#27
+cannot call GPL only function from proprietary program
+```
+
+# Environment Variables
+
+## 1. Kernel source directory
+
+eBPF program compilation needs kernel sources or kernel headers with headers
+compiled. In case your kernel sources are at a non-standard location where BCC
+cannot find them, it's possible to provide BCC the absolute path of the location
+by setting `BCC_KERNEL_SOURCE` to it.
+
+## 2. Kernel version overriding
+
+By default, BCC stores the `LINUX_VERSION_CODE` in the generated eBPF object
+which is then passed along to the kernel when the eBPF program is loaded.
+Sometimes this is quite inconvenient especially when the kernel is slightly
+updated such as an LTS kernel release. It's extremely unlikely the slight
+mismatch would cause any issues with the loaded eBPF program. By setting
+`BCC_LINUX_VERSION_CODE` to the version of the kernel that's running, the check
+for verifying the kernel version can be bypassed. This is needed for programs
+that use kprobes. This needs to be encoded in the format: `(VERSION * 65536) +
+(PATCHLEVEL * 256) + SUBLEVEL`. For example, if the running kernel is `4.9.10`,
+then you can set `export BCC_LINUX_VERSION_CODE=264458` to override the kernel
+version check successfully.
diff --git a/docs/tutorial.md b/docs/tutorial.md
new file mode 100644
index 0000000..e00c79d
--- /dev/null
+++ b/docs/tutorial.md
@@ -0,0 +1,422 @@
+# bcc Tutorial
+
+This tutorial covers how to use [bcc](https://github.com/iovisor/bcc) tools to quickly solve performance, troubleshooting, and networking issues. If you want to develop new bcc tools, see [tutorial_bcc_python_developer.md](tutorial_bcc_python_developer.md) for that tutorial.
+
+It is assumed for this tutorial that bcc is already installed, and you can run tools like execsnoop successfully. See [INSTALL.md](../INSTALL.md). This uses enhancements added to the Linux 4.x series.
+
+## Observability
+
+Some quick wins.
+
+### 0. Before bcc
+
+Before using bcc, you should start with the Linux basics. One reference is the [Linux Performance Analysis in 60s](http://techblog.netflix.com/2015/11/linux-performance-analysis-in-60s.html) post, which covers these commands:
+
+1. uptime
+1. dmesg | tail
+1. vmstat 1
+1. mpstat -P ALL 1
+1. pidstat 1
+1. iostat -xz 1
+1. free -m
+1. sar -n DEV 1
+1. sar -n TCP,ETCP 1
+1. top
+
+### 1. General Performance
+
+Here is a generic checklist for performance investigations with bcc, first as a list, then in detail:
+
+1. execsnoop
+1. opensnoop
+1. ext4slower (or btrfs\*, xfs\*, zfs\*)
+1. biolatency
+1. biosnoop
+1. cachestat
+1. tcpconnect
+1. tcpaccept
+1. tcpretrans
+1. runqlat
+1. profile
+
+These tools may be installed on your system under /usr/share/bcc/tools, or you can run them from the bcc github repo under /tools where they have a .py extension. Browse the 50+ tools available for more analysis options.
+
+#### 1.1 execsnoop
+
+```
+# ./execsnoop
+PCOMM            PID    RET ARGS
+supervise        9660     0 ./run
+supervise        9661     0 ./run
+mkdir            9662     0 /bin/mkdir -p ./main
+run              9663     0 ./run
+[...]
+```
+
+execsnoop prints one line of output for each new process. Check for short-lived processes. These can consume CPU resources, but not show up in most monitoring tools that periodically take snapshots of which processes are running.
+
+It works by tracing exec(), not the fork(), so it will catch many types of new processes but not all (eg, it won't see an application launching worker processes, that doesn't exec() anything else).
+
+More [examples](../tools/execsnoop_example.txt).
+
+#### 1.2. opensnoop
+
+```
+# ./opensnoop
+PID    COMM      FD ERR PATH
+1565   redis-server        5   0 /proc/1565/stat
+1565   redis-server        5   0 /proc/1565/stat
+1565   redis-server        5   0 /proc/1565/stat
+1603   snmpd               9   0 /proc/net/dev
+1603   snmpd              11   0 /proc/net/if_inet6
+1603   snmpd              -1   2 /sys/class/net/eth0/device/vendor
+1603   snmpd              11   0 /proc/sys/net/ipv4/neigh/eth0/retrans_time_ms
+1603   snmpd              11   0 /proc/sys/net/ipv6/neigh/eth0/retrans_time_ms
+1603   snmpd              11   0 /proc/sys/net/ipv6/conf/eth0/forwarding
+[...]
+```
+
+opensnoop prints one line of output for each open() syscall, including details.
+
+Files that are opened can tell you a lot about how applications work: identifying their data files, config files, and log files. Sometimes applications can misbehave, and perform poorly, when they are constantly attempting to read files that do not exist. opensnoop gives you a quick look.
+
+More [examples](../tools/opensnoop_example.txt).
+
+#### 1.3. ext4slower (or btrfs\*, xfs\*, zfs\*)
+
+```
+# ./ext4slower
+Tracing ext4 operations slower than 10 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:35:01 cron           16464  R 1249    0          16.05 common-auth
+06:35:01 cron           16463  R 1249    0          16.04 common-auth
+06:35:01 cron           16465  R 1249    0          16.03 common-auth
+06:35:01 cron           16465  R 4096    0          10.62 login.defs
+06:35:01 cron           16464  R 4096    0          10.61 login.defs
+```
+
+ext4slower traces the ext4 file system and times common operations, and then only prints those that exceed a threshold.
+
+This is great for identifying or exonerating one type of performance issue: individually slow disk I/O via the file system. Disks process I/O asynchronously, and it can be difficult to associate latency at that layer with the latency applications experience. Tracing higher up in the kernel stack, at the VFS -> file system interface, will more closely match what an application suffers. Use this tool to identify if file system latency exceeds a given threshold.
+
+Similar tools exist in bcc for other file systems: btrfsslower, xfsslower, and zfsslower. There is also fileslower, which works at the VFS layer and traces everything (although at some higher overhead).
+
+More [examples](../tools/ext4slower_example.txt).
+
+#### 1.4. biolatency
+
+```
+# ./biolatency
+Tracing block device I/O... Hit Ctrl-C to end.
+^C
+     usecs           : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 0        |                                      |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 12       |********                              |
+     256 -> 511      : 15       |**********                            |
+     512 -> 1023     : 43       |*******************************       |
+    1024 -> 2047     : 52       |**************************************|
+    2048 -> 4095     : 47       |**********************************    |
+    4096 -> 8191     : 52       |**************************************|
+    8192 -> 16383    : 36       |**************************            |
+   16384 -> 32767    : 15       |**********                            |
+   32768 -> 65535    : 2        |*                                     |
+   65536 -> 131071   : 2        |*                                     |
+```
+
+biolatency traces disk I/O latency (time from device issue to completion), and when the tool ends (Ctrl-C, or a given interval), it prints a histogram summary of the latency.
+
+This is great for understanding disk I/O latency beyond the average times given by tools like iostat. I/O latency outliers will be visible at the end of the distribution, as well as multi-mode distributions.
+
+More [examples](../tools/biolatency_example.txt).
+
+#### 1.5. biosnoop
+
+```
+# ./biosnoop
+TIME(s)        COMM           PID    DISK    T  SECTOR    BYTES   LAT(ms)
+0.000004001    supervise      1950   xvda1   W  13092560  4096       0.74
+0.000178002    supervise      1950   xvda1   W  13092432  4096       0.61
+0.001469001    supervise      1956   xvda1   W  13092440  4096       1.24
+0.001588002    supervise      1956   xvda1   W  13115128  4096       1.09
+1.022346001    supervise      1950   xvda1   W  13115272  4096       0.98
+1.022568002    supervise      1950   xvda1   W  13188496  4096       0.93
+[...]
+```
+
+biosnoop prints a line of output for each disk I/O, with details including latency (time from device issue to completion).
+
+This allows you to examine disk I/O in more detail, and look for time-ordered patterns (eg, reads queueing behind writes). Note that the output will be verbose if your system performs a high rate of disk I/O.
+
+More [examples](../tools/biosnoop_example.txt).
+
+#### 1.6. cachestat
+
+```
+# ./cachestat
+    HITS   MISSES  DIRTIES  READ_HIT% WRITE_HIT%   BUFFERS_MB  CACHED_MB
+    1074       44       13      94.9%       2.9%            1        223
+    2195      170        8      92.5%       6.8%            1        143
+     182       53       56      53.6%       1.3%            1        143
+   62480    40960    20480      40.6%      19.8%            1        223
+       7        2        5      22.2%      22.2%            1        223
+     348        0        0     100.0%       0.0%            1        223
+[...]
+```
+
+cachestat prints a one line summary every second (or every custom interval) showing statistics from the file system cache.
+
+Use this to identify a low cache hit ratio, and a high rate of misses: which gives one lead for performance tuning.
+
+More [examples](../tools/cachestat_example.txt).
+
+#### 1.7. tcpconnect
+
+```
+# ./tcpconnect
+PID    COMM         IP SADDR            DADDR            DPORT
+1479   telnet       4  127.0.0.1        127.0.0.1        23
+1469   curl         4  10.201.219.236   54.245.105.25    80
+1469   curl         4  10.201.219.236   54.67.101.145    80
+1991   telnet       6  ::1              ::1              23
+2015   ssh          6  fe80::2000:bff:fe82:3ac fe80::2000:bff:fe82:3ac 22
+[...]
+```
+
+tcpconnect prints one line of output for every active TCP connection (eg, via connect()), with details including source and destination addresses.
+
+Look for unexpected connections that may point to inefficiencies in application configuration, or an intruder.
+
+More [examples](../tools/tcpconnect_example.txt).
+
+#### 1.8. tcpaccept
+
+```
+# ./tcpaccept
+PID    COMM         IP RADDR            LADDR            LPORT
+907    sshd         4  192.168.56.1     192.168.56.102   22
+907    sshd         4  127.0.0.1        127.0.0.1        22
+5389   perl         6  1234:ab12:2040:5020:2299:0:5:0 1234:ab12:2040:5020:2299:0:5:0 7001
+[...]
+```
+
+tcpaccept prints one line of output for every passive TCP connection (eg, via accept()), with details including source and destination addresses.
+
+Look for unexpected connections that may point to inefficiencies in application configuration, or an intruder.
+
+More [examples](../tools/tcpaccept_example.txt).
+
+#### 1.9. tcpretrans
+
+```
+# ./tcpretrans
+TIME     PID    IP LADDR:LPORT          T> RADDR:RPORT          STATE
+01:55:05 0      4  10.153.223.157:22    R> 69.53.245.40:34619   ESTABLISHED
+01:55:05 0      4  10.153.223.157:22    R> 69.53.245.40:34619   ESTABLISHED
+01:55:17 0      4  10.153.223.157:22    R> 69.53.245.40:22957   ESTABLISHED
+[...]
+```
+
+tcpretrans prints one line of output for every TCP retransmit packet, with details including source and destination addresses, and kernel state of the TCP connection.
+
+TCP retransmissions cause latency and throughput issues. For ESTABLISHED retransmits, look for patterns with networks. For SYN_SENT, this may point to target kernel CPU saturation and kernel packet drops.
+
+More [examples](../tools/tcpretrans_example.txt).
+
+#### 1.10. runqlat
+
+```
+# ./runqlat
+Tracing run queue latency... Hit Ctrl-C to end.
+^C
+     usecs               : count     distribution
+         0 -> 1          : 233      |***********                             |
+         2 -> 3          : 742      |************************************    |
+         4 -> 7          : 203      |**********                              |
+         8 -> 15         : 173      |********                                |
+        16 -> 31         : 24       |*                                       |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 30       |*                                       |
+       128 -> 255        : 6        |                                        |
+       256 -> 511        : 3        |                                        |
+       512 -> 1023       : 5        |                                        |
+      1024 -> 2047       : 27       |*                                       |
+      2048 -> 4095       : 30       |*                                       |
+      4096 -> 8191       : 20       |                                        |
+      8192 -> 16383      : 29       |*                                       |
+     16384 -> 32767      : 809      |****************************************|
+     32768 -> 65535      : 64       |***                                     |
+```
+
+runqlat times how long threads were waiting on the CPU run queues, and prints this as a histogram.
+
+This can help quantify time lost waiting for a turn on CPU, during periods of CPU saturation.
+
+More [examples](../tools/runqlat_example.txt).
+
+#### 1.11. profile
+
+```
+# ./profile
+Sampling at 49 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end.
+^C
+    00007f31d76c3251 [unknown]
+    47a2c1e752bf47f7 [unknown]
+    -                sign-file (8877)
+        1
+
+    ffffffff813d0af8 __clear_user
+    ffffffff813d5277 iov_iter_zero
+    ffffffff814ec5f2 read_iter_zero
+    ffffffff8120be9d __vfs_read
+    ffffffff8120c385 vfs_read
+    ffffffff8120d786 sys_read
+    ffffffff817cc076 entry_SYSCALL_64_fastpath
+    00007fc5652ad9b0 read
+    -                dd (25036)
+        4
+
+    0000000000400542 func_a
+    0000000000400598 main
+    00007f12a133e830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (13549)
+        5
+
+[...]
+
+    ffffffff8105eb66 native_safe_halt
+    ffffffff8103659e default_idle
+    ffffffff81036d1f arch_cpu_idle
+    ffffffff810bba5a default_idle_call
+    ffffffff810bbd07 cpu_startup_entry
+    ffffffff8104df55 start_secondary
+    -                swapper/1 (0)
+        75
+```
+
+profile is a CPU profiler, which takes samples of stack traces at timed intervals, and prints a summary of unique stack traces and a count of their occurrence.
+
+Use this tool to understand the code paths that are consuming CPU resources.
+
+More [examples](../tools/profile_example.txt).
+
+### 2. Observability with Generic Tools
+
+In addition to the above tools for performance tuning, below is a checklist for bcc generic tools, first as a list, then in detail:
+
+1. trace
+1. argdist
+1. funccount
+
+These generic tools may be useful to provide visibility to solve your specific problems.
+
+#### 2.1. trace
+
+##### Example 1
+
+Suppose you want to track file ownership change. There are three syscalls, `chown`, `fchown` and `lchown` which users can use to change file ownership. The corresponding syscall entry is `SyS_[f|l]chown`.  The following command can be used to print out syscall parameters and the calling process user id. You can use `id` command to find the uid of a particular user.
+
+```
+$ trace.py \
+  'p::SyS_chown "file = %s, to_uid = %d, to_gid = %d, from_uid = %d", arg1, arg2, arg3, $uid' \
+  'p::SyS_fchown "fd = %d, to_uid = %d, to_gid = %d, from_uid = %d", arg1, arg2, arg3, $uid' \
+  'p::SyS_lchown "file = %s, to_uid = %d, to_gid = %d, from_uid = %d", arg1, arg2, arg3, $uid'
+PID    TID    COMM         FUNC             -
+1269255 1269255 python3.6    SyS_lchown       file = /tmp/dotsync-usisgezu/tmp, to_uid = 128203, to_gid = 100, from_uid = 128203
+1269441 1269441 zstd         SyS_chown        file = /tmp/dotsync-vic7ygj0/dotsync-package.zst, to_uid = 128203, to_gid = 100, from_uid = 128203
+1269255 1269255 python3.6    SyS_lchown       file = /tmp/dotsync-a40zd7ev/tmp, to_uid = 128203, to_gid = 100, from_uid = 128203
+1269442 1269442 zstd         SyS_chown        file = /tmp/dotsync-gzp413o_/dotsync-package.zst, to_uid = 128203, to_gid = 100, from_uid = 128203
+1269255 1269255 python3.6    SyS_lchown       file = /tmp/dotsync-whx4fivm/tmp/.bash_profile, to_uid = 128203, to_gid = 100, from_uid = 128203
+```
+
+##### Example 2
+
+Suppose you want to count nonvoluntary context switches (`nvcsw`) in your bpf based performance monitoring tools and you do not know what is the proper method. `/proc/<pid>/status` already tells you the number (`nonvoluntary_ctxt_switches`) for a pid and you can use `trace.py` to do a quick experiment to verify your method. With kernel source code, the `nvcsw` is counted at file `linux/kernel/sched/core.c` function `__schedule` and under condition
+```
+!(!preempt && prev->state) // i.e., preempt || !prev->state
+```
+
+The `__schedule` function is marked as `notrace`, and the best place to evaluate the above condition seems in `sched/sched_switch` tracepoint called inside function `__schedule` and defined in `linux/include/trace/events/sched.h`. `trace.py` already has `args` being the pointer to the tracepoint `TP_STRUCT__entry`.  The above condition in function `__schedule` can be represented as
+```
+args->prev_state == TASK_STATE_MAX || args->prev_state == 0
+```
+
+The below command can be used to count the involuntary context switches (per process or per pid) and compare to `/proc/<pid>/status` or `/proc/<pid>/task/<task_id>/status` for correctness, as in typical cases, involuntary context switches are not very common.
+```
+$ trace.py -p 1134138 't:sched:sched_switch (args->prev_state == TASK_STATE_MAX || args->prev_state == 0)'
+PID    TID    COMM         FUNC
+1134138 1134140 contention_test sched_switch
+1134138 1134142 contention_test sched_switch
+...
+$ trace.py -L 1134140 't:sched:sched_switch (args->prev_state == TASK_STATE_MAX || args->prev_state == 0)'
+PID    TID    COMM         FUNC
+1134138 1134140 contention_test sched_switch
+1134138 1134140 contention_test sched_switch
+...
+```
+
+##### Example 3
+
+This example is related to issue [1231](https://github.com/iovisor/bcc/issues/1231) and [1516](https://github.com/iovisor/bcc/issues/1516) where uprobe does not work at all in certain cases. First, you can do a `strace` as below
+
+```
+$ strace trace.py 'r:bash:readline "%s", retval'
+...
+perf_event_open(0x7ffd968212f0, -1, 0, -1, 0x8 /* PERF_FLAG_??? */) = -1 EIO (Input/output error)
+...
+```
+
+The `perf_event_open` syscall returns `-EIO`. Digging into kernel uprobe related codes in `/kernel/trace` and `/kernel/events` directories to search `EIO`, the function `uprobe_register` is the most suspicious. Let us find whether this function is called or not and what is the return value if it is called. In one terminal using the following command to print out the return value of uprobe_register,
+```
+$ trace.py 'r::uprobe_register "ret = %d", retval'
+```
+In another terminal run the same bash uretprobe tracing example, and you should get
+```
+$ trace.py 'r::uprobe_register "ret = %d", retval'
+PID    TID    COMM         FUNC             -
+1041401 1041401 python2.7    uprobe_register  ret = -5
+```
+
+The `-5` error code is EIO. This confirms that the following code in function `uprobe_register` is the most suspicious culprit.
+```
+ if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+        return -EIO;
+```
+The `shmem_mapping` function is defined as
+```
+bool shmem_mapping(struct address_space *mapping)
+{
+        return mapping->a_ops == &shmem_aops;
+}
+```
+
+To confirm the theory, find what is `inode->i_mapping->a_ops` with the following command
+```
+$ trace.py -I 'linux/fs.h' 'p::uprobe_register(struct inode *inode) "a_ops = %llx", inode->i_mapping->a_ops'
+PID    TID    COMM         FUNC             -
+814288 814288 python2.7    uprobe_register  a_ops = ffffffff81a2adc0
+^C$ grep ffffffff81a2adc0 /proc/kallsyms
+ffffffff81a2adc0 R empty_aops
+```
+
+The kernel symbol `empty_aops` does not have `readpage` defined and hence the above suspicious condition is true. Further examining the kernel source code shows that `overlayfs` does not provide its own `a_ops` while some other file systems (e.g., ext4) define their own `a_ops` (e.g., `ext4_da_aops`), and `ext4_da_aops` defines `readpage`. Hence, uprobe works fine on ext4 while not on overlayfs.
+
+More [examples](../tools/trace_example.txt).
+
+#### 2.2. argdist
+
+More [examples](../tools/argdist_example.txt).
+
+#### 2.3. funccount
+
+More [examples](../tools/funccount_example.txt).
+
+## Networking
+
+To do.
diff --git a/docs/tutorial_bcc_python_developer.md b/docs/tutorial_bcc_python_developer.md
new file mode 100644
index 0000000..40d4985
--- /dev/null
+++ b/docs/tutorial_bcc_python_developer.md
@@ -0,0 +1,725 @@
+# bcc Python Developer Tutorial
+
+This tutorial is about developing [bcc](https://github.com/iovisor/bcc) tools and programs using the Python interface. There are two parts: observability then networking. Snippets are taken from various programs in bcc: see their files for licences.
+
+Also see the bcc developer's [reference_guide.md](reference_guide.md), and a tutorial for end-users of tools: [tutorial.md](tutorial.md). There is also a lua interface for bcc.
+
+## Observability
+
+This observability tutorial contains 17 lessons, and 46 enumerated things to learn.
+
+### Lesson 1. Hello World
+
+Start by running [examples/hello_world.py](../examples/hello_world.py), while running some commands (eg, "ls") in another session. It should print "Hello, World!" for new processes. If not, start by fixing bcc: see [INSTALL.md](../INSTALL.md).
+
+```
+# ./examples/hello_world.py
+            bash-13364 [002] d... 24573433.052937: : Hello, World!
+            bash-13364 [003] d... 24573436.642808: : Hello, World!
+[...]
+```
+
+Here's the code for hello_world.py:
+
+```Python
+from bcc import BPF
+BPF(text='int kprobe__sys_clone(void *ctx) { bpf_trace_printk("Hello, World!\\n"); return 0; }').trace_print()
+```
+
+There are six things to learn from this:
+
+1. ```text='...'```: This defines a BPF program inline. The program is written in C.
+
+1. ```kprobe__sys_clone()```: This is a short-cut for kernel dynamic tracing via kprobes. If the C function begins with ``kprobe__``, the rest is treated as a kernel function name to instrument, in this case, ```sys_clone()```.
+
+1. ```void *ctx```: ctx has arguments, but since we aren't using them here, we'll just cast it to ```void *```.
+
+1. ```bpf_trace_printk()```: A simple kernel facility for printf() to the common trace_pipe (/sys/kernel/debug/tracing/trace_pipe). This is ok for some quick examples, but has limitations: 3 args max, 1 %s only, and trace_pipe is globally shared, so concurrent programs will have clashing output. A better interface is via BPF_PERF_OUTPUT(), covered later.
+
+1. ```return 0;```: Necessary formality (if you want to know why, see [#139](https://github.com/iovisor/bcc/issues/139)).
+
+1. ```.trace_print()```: A bcc routine that reads trace_pipe and prints the output.
+
+### Lesson 2. sys_sync()
+
+Write a program that traces the sys_sync() kernel function. Print "sys_sync() called" when it runs. Test by running ```sync``` in another session while tracing. The hello_world.py program has everything you need for this.
+
+Improve it by printing "Tracing sys_sync()... Ctrl-C to end." when the program first starts. Hint: it's just Python.
+
+### Lesson 3. hello_fields.py
+
+This program is in [examples/tracing/hello_fields.py](../examples/tracing/hello_fields.py). Sample output (run commands in another session):
+
+```
+# ./examples/tracing/hello_fields.py
+TIME(s)            COMM             PID    MESSAGE
+24585001.174885999 sshd             1432   Hello, World!
+24585001.195710000 sshd             15780  Hello, World!
+24585001.991976000 systemd-udevd    484    Hello, World!
+24585002.276147000 bash             15787  Hello, World!
+```
+
+Code:
+
+```Python
+from bcc import BPF
+
+# define BPF program
+prog = """
+int hello(void *ctx) {
+	bpf_trace_printk("Hello, World!\\n");
+	return 0;
+}
+"""
+
+# load BPF program
+b = BPF(text=prog)
+b.attach_kprobe(event="sys_clone", fn_name="hello")
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
+```
+
+This is similar to hello_world.py, and traces new processes via sys_clone() again, but has a few more things to learn:
+
+1. ```prog =```: This time we declare the C program as a variable, and later refer to it. This is useful if you want to add some string substitutions based on command line arguments.
+
+1. ```hello()```: Now we're just declaring a C function, instead of the ```kprobe__``` shortcut. We'll refer to this later. All C functions declared in the BPF program are expected to be executed on a probe, hence they all need to take a ```pt_reg* ctx``` as first argument. If you need to define some helper function that will not be executed on a probe, they need to be defined as ```static inline``` in order to be inlined by the compiler. Sometimes you would also need to add the ```__always_inline``` function attribute to it.
+
+1. ```b.attach_kprobe(event="sys_clone", fn_name="hello")```: Creates a kprobe for the sys_clone() kernel function, which will execute our defined hello() function. You can call attach_kprobe() more than once, and attach your C function to multiple kernel functions.
+
+1. ```b.trace_fields()```: Returns a fixed set of fields from trace_pipe. Similar to trace_print(), this is handy for hacking, but for real tooling we should switch to BPF_PERF_OUTPUT().
+
+### Lesson 4. sync_timing.py
+
+Remember the days of sysadmins typing ```sync``` three times on a slow console before ```reboot```, to give the first asynchronous sync time to complete? Then someone thought ```sync;sync;sync``` was clever, to run them all on one line, which became industry practice despite defeating the original purpose! And then sync became synchronous, so more reasons it was silly. Anyway.
+
+The following example times how quickly the ```do_sync``` function is called, and prints output if it has been called more recently than one second ago. A ```sync;sync;sync``` will print output for the 2nd and 3rd sync's:
+
+```
+# ./examples/tracing/sync_timing.py
+Tracing for quick sync's... Ctrl-C to end
+At time 0.00 s: multiple syncs detected, last 95 ms ago
+At time 0.10 s: multiple syncs detected, last 96 ms ago
+```
+
+This program is [examples/tracing/sync_timing.py](../examples/tracing/sync_timing.py):
+
+```Python
+from bcc import BPF
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+BPF_HASH(last);
+
+int do_trace(struct pt_regs *ctx) {
+	u64 ts, *tsp, delta, key = 0;
+
+	// attempt to read stored timestamp
+	tsp = last.lookup(&key);
+	if (tsp != 0) {
+		delta = bpf_ktime_get_ns() - *tsp;
+		if (delta < 1000000000) {
+			// output if time is less than 1 second
+			bpf_trace_printk("%d\\n", delta / 1000000);
+		}
+		last.delete(&key);
+	}
+
+	// update stored timestamp
+	ts = bpf_ktime_get_ns();
+	last.update(&key, &ts);
+	return 0;
+}
+""")
+
+b.attach_kprobe(event="sys_sync", fn_name="do_trace")
+print("Tracing for quick sync's... Ctrl-C to end")
+
+# format output
+start = 0
+while 1:
+    (task, pid, cpu, flags, ts, ms) = b.trace_fields()
+    if start == 0:
+        start = ts
+    ts = ts - start
+    print("At time %.2f s: multiple syncs detected, last %s ms ago" % (ts, ms))
+```
+
+Things to learn:
+
+1. ```bpf_ktime_get_ns()```: Returns the time as nanoseconds.
+1. ```BPF_HASH(last)```: Creates a BPF map object that is a hash (associative array), called "last". We didn't specify any further arguments, so it defaults to key and value types of u64.
+1. ```key = 0```: We'll only store one key/value pair in this hash, where the key is hardwired to zero.
+1. ```last.lookup(&key)```: Lookup the key in the hash, and return a pointer to its value if it exists, else NULL. We pass the key in as an address to a pointer.
+1. ```last.delete(&key)```: Delete the key from the hash. This is currently required because of [a kernel bug in `.update()`](https://git.kernel.org/cgit/linux/kernel/git/davem/net.git/commit/?id=a6ed3ea65d9868fdf9eff84e6fe4f666b8d14b02).
+1. ```last.update(&key, &ts)```: Associate the value in the 2nd argument to the key, overwriting any previous value. This records the timestamp.
+
+### Lesson 5. sync_count.py
+
+Modify the sync_timing.py program (prior lesson) to store the count of all sys_sync() calls (both fast and slow), and print it with the output. This count can be recorded in the BPF program by adding a new key index to the existing hash.
+
+### Lesson 6. disksnoop.py
+
+Browse the [examples/tracing/disksnoop.py](../examples/tracing/disksnoop.py) program to see what is new. Here is some sample output:
+
+```
+# ./disksnoop.py
+TIME(s)            T  BYTES    LAT(ms)
+16458043.436012    W  4096        3.13
+16458043.437326    W  4096        4.44
+16458044.126545    R  4096       42.82
+16458044.129872    R  4096        3.24
+[...]
+```
+
+And a code snippet:
+
+```Python
+[...]
+REQ_WRITE = 1		# from include/linux/blk_types.h
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+BPF_HASH(start, struct request *);
+
+void trace_start(struct pt_regs *ctx, struct request *req) {
+	// stash start timestamp by request ptr
+	u64 ts = bpf_ktime_get_ns();
+
+	start.update(&req, &ts);
+}
+
+void trace_completion(struct pt_regs *ctx, struct request *req) {
+	u64 *tsp, delta;
+
+	tsp = start.lookup(&req);
+	if (tsp != 0) {
+		delta = bpf_ktime_get_ns() - *tsp;
+		bpf_trace_printk("%d %x %d\\n", req->__data_len,
+			req->cmd_flags, delta / 1000);
+		start.delete(&req);
+	}
+}
+""")
+
+b.attach_kprobe(event="blk_start_request", fn_name="trace_start")
+b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_start")
+b.attach_kprobe(event="blk_account_io_completion", fn_name="trace_completion")
+[...]
+```
+
+Things to learn:
+
+1. ```REQ_WRITE```: We're defining a kernel constant in the Python program because we'll use it there later. If we were using REQ_WRITE in the BPF program, it should just work (without needing to be defined) with the appropriate #includes.
+1. ```trace_start(struct pt_regs *ctx, struct request *req)```: This function will later be attached to kprobes. The arguments to kprobe functions are ```struct pt_regs *ctx```, for registers and BPF context, and then the actual arguments to the function. We'll attach this to blk_start_request(), where the first argument is ```struct request *```.
+1. ```start.update(&req, &ts)```: We're using the pointer to the request struct as a key in our hash. What? This is commonplace in tracing. Pointers to structs turn out to be great keys, as they are unique: two structs can't have the same pointer address. (Just be careful about when it gets free'd and reused.) So what we're really doing is tagging the request struct, which describes the disk I/O, with our own timestamp, so that we can time it. There's two common keys used for storing timestamps: pointers to structs, and, thread IDs (for timing function entry to return).
+1. ```req->__data_len```: We're dereferencing members of ```struct request```. See its definition in the kernel source for what members are there. bcc actually rewrites these expressions to be a series of ```bpf_probe_read()``` calls. Sometimes bcc can't handle a complex dereference, and you need to call ```bpf_probe_read()``` directly.
+
+This is a pretty interesting program, and if you can understand all the code, you'll understand many important basics. We're still using the bpf_trace_printk() hack, so let's fix that next.
+
+### Lesson 7. hello_perf_output.py
+
+Let's finally stop using bpf_trace_printk() and use the proper BPF_PERF_OUTPUT() interface. This will also mean we stop getting the free trace_field() members like PID and timestamp, and will need to fetch them directly. Sample output while commands are run in another session:
+
+```
+# ./hello_perf_output.py
+TIME(s)            COMM             PID    MESSAGE
+0.000000000        bash             22986  Hello, perf_output!
+0.021080275        systemd-udevd    484    Hello, perf_output!
+0.021359520        systemd-udevd    484    Hello, perf_output!
+0.021590610        systemd-udevd    484    Hello, perf_output!
+[...]
+```
+
+Code is [examples/tracing/hello_perf_output.py](../examples/tracing/hello_perf_output.py):
+
+```Python
+from bcc import BPF
+import ctypes as ct
+
+# define BPF program
+prog = """
+#include <linux/sched.h>
+
+// define output data structure in C
+struct data_t {
+	u32 pid;
+	u64 ts;
+	char comm[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(events);
+
+int hello(struct pt_regs *ctx) {
+	struct data_t data = {};
+
+	data.pid = bpf_get_current_pid_tgid();
+	data.ts = bpf_ktime_get_ns();
+	bpf_get_current_comm(&data.comm, sizeof(data.comm));
+
+	events.perf_submit(ctx, &data, sizeof(data));
+
+	return 0;
+}
+"""
+
+# load BPF program
+b = BPF(text=prog)
+b.attach_kprobe(event="sys_clone", fn_name="hello")
+
+# define output data structure in Python
+TASK_COMM_LEN = 16    # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [("pid", ct.c_ulonglong),
+                ("ts", ct.c_ulonglong),
+                ("comm", ct.c_char * TASK_COMM_LEN)]
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE"))
+
+# process event
+start = 0
+def print_event(cpu, data, size):
+    global start
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    if start == 0:
+            start = event.ts
+    time_s = (float(event.ts - start)) / 1000000000
+    print("%-18.9f %-16s %-6d %s" % (time_s, event.comm, event.pid,
+        "Hello, perf_output!"))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
+```
+
+Things to learn:
+
+1. ```struct data_t```: This defines the C struct we'll use to pass data from kernel to user space.
+1. ```BPF_PERF_OUTPUT(events)```: This names our output channel "events".
+1. ```struct data_t data = {};```: Create an empty data_t struct that we'll then populate.
+1. ```bpf_get_current_pid_tgid()```: Returns the process ID in the lower 32 bits (kernel's view of the PID, which in user space is usually presented as the thread ID), and the thread group ID in the upper 32 bits (what user space often thinks of as the PID). By directly setting this to a u32, we discard the upper 32 bits. Should you be presenting the PID or the TGID? For a multi-threaded app, the TGID will be the same, so you need the PID to differentiate them, if that's what you want. It's also a question of expectations for the end user.
+1. ```bpf_get_current_comm()```: Populates the first argument address with the current process name.
+1. ```events.perf_submit()```: Submit the event for user space to read via a perf ring buffer.
+1. ```class Data(ct.Structure)```: Now define the Python version of the C data structure.
+1. ```def print_event()```: Define a Python function that will handle reading events from the ```events``` stream.
+1. ```b["events"].open_perf_buffer(print_event)```: Associate the Python ```print_event``` function with the ```events``` stream.
+1. ```while 1: b.perf_buffer_poll()```: Block waiting for events.
+
+This may be improved in future bcc versions. Eg, the Python data struct could be auto-generated from the C code.
+
+### Lesson 8. sync_perf_output.py
+
+Rewrite sync_timing.py, from a prior lesson, to use ```BPF_PERF_OUTPUT```.
+
+### Lesson 9. bitehist.py
+
+The following tool records a histogram of disk I/O sizes. Sample output:
+
+```
+# ./bitehist.py
+Tracing... Hit Ctrl-C to end.
+^C
+     kbytes          : count     distribution
+       0 -> 1        : 3        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 211      |**********                            |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 800      |**************************************|
+```
+
+Code is [examples/tracing/bitehist.py](../examples/tracing/bitehist.py):
+
+```Python
+from bcc import BPF
+from time import sleep
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+BPF_HISTOGRAM(dist);
+
+int kprobe__blk_account_io_completion(struct pt_regs *ctx, struct request *req)
+{
+	dist.increment(bpf_log2l(req->__data_len / 1024));
+	return 0;
+}
+""")
+
+# header
+print("Tracing... Hit Ctrl-C to end.")
+
+# trace until Ctrl-C
+try:
+    sleep(99999999)
+except KeyboardInterrupt:
+    print
+
+# output
+b["dist"].print_log2_hist("kbytes")
+```
+
+A recap from earlier lessons:
+
+- ```kprobe__```: This prefix means the rest will be treated as a kernel function name that will be instrumented using kprobe.
+- ```struct pt_regs *ctx, struct request *req```: Arguments to kprobe. The ```ctx``` is registers and BPF context, the ```req``` is the first argument to the instrumented function: ```blk_account_io_completion()```.
+- ```req->__data_len```: Dereferencing that member.
+
+New things to learn:
+
+1. ```BPF_HISTOGRAM(dist)```: Defines a BPF map object that is a histogram, and names it "dist".
+1. ```dist.increment()```: Increments the histogram bucket index provided as first argument by one by default. Optionally, custom increments can be passed as the second argument.
+1. ```bpf_log2l()```: Returns the log-2 of the provided value. This becomes the index of our histogram, so that we're constructing a power-of-2 histogram.
+1. ```b["dist"].print_log2_hist("kbytes")```: Prints the "dist" histogram as power-of-2, with a column header of "kbytes". The only data transferred from kernel to user space is the bucket counts, making this efficient.
+
+### Lesson 10. disklatency.py
+
+Write a program that times disk I/O, and prints a histogram of their latency. Disk I/O instrumentation and timing can be found in the disksnoop.py program from a prior lesson, and histogram code can be found in bitehist.py from a prior lesson.
+
+### Lesson 11. vfsreadlat.py
+
+This example is split into separate Python and C files. Example output:
+
+```
+# ./vfsreadlat.py 1
+Tracing... Hit Ctrl-C to end.
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 2        |***********                             |
+         4 -> 7          : 7        |****************************************|
+         8 -> 15         : 4        |**********************                  |
+
+     usecs               : count     distribution
+         0 -> 1          : 29       |****************************************|
+         2 -> 3          : 28       |**************************************  |
+         4 -> 7          : 4        |*****                                   |
+         8 -> 15         : 8        |***********                             |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 2        |**                                      |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 4        |*****                                   |
+      8192 -> 16383      : 6        |********                                |
+     16384 -> 32767      : 9        |************                            |
+     32768 -> 65535      : 6        |********                                |
+     65536 -> 131071     : 2        |**                                      |
+
+     usecs               : count     distribution
+         0 -> 1          : 11       |****************************************|
+         2 -> 3          : 2        |*******                                 |
+         4 -> 7          : 10       |************************************    |
+         8 -> 15         : 8        |*****************************           |
+        16 -> 31         : 1        |***                                     |
+        32 -> 63         : 2        |*******                                 |
+[...]
+```
+
+Browse the code in [examples/tracing/vfsreadlat.py](../examples/tracing/vfsreadlat.py) and [examples/tracing/vfsreadlat.c](../examples/tracing/vfsreadlat.c). Things to learn:
+
+1. ```b = BPF(src_file = "vfsreadlat.c")```: Read the BPF C program from a separate source file.
+1. ```b.attach_kretprobe(event="vfs_read", fn_name="do_return")```: Attaches the BPF C function ```do_return()``` to the return of the kernel function ```vfs_read()```. This is a kretprobe: instrumenting the return from a function, rather than its entry.
+1. ```b["dist"].clear()```: Clears the histogram.
+
+### Lesson 12. urandomread.py
+
+Tracing while a ```dd if=/dev/urandom of=/dev/null bs=8k count=5``` is run:
+
+```
+# ./urandomread.py
+TIME(s)            COMM             PID    GOTBITS
+24652832.956994001 smtp             24690  384
+24652837.726500999 dd               24692  65536
+24652837.727111001 dd               24692  65536
+24652837.727703001 dd               24692  65536
+24652837.728294998 dd               24692  65536
+24652837.728888001 dd               24692  65536
+```
+
+Hah! I caught smtp by accident. Code is [examples/tracing/urandomread.py](../examples/tracing/urandomread.py):
+
+```Python
+from bcc import BPF
+
+# load BPF program
+b = BPF(text="""
+TRACEPOINT_PROBE(random, urandom_read) {
+	// args is from /sys/kernel/debug/tracing/events/random/urandom_read/format
+	bpf_trace_printk("%d\\n", args->got_bits);
+	return 0;
+};
+""")
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "GOTBITS"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
+```
+
+Things to learn:
+
+1. ```TRACEPOINT_PROBE(random, urandom_read)```: Instrument the kernel tracepoint ```random:urandom_read```. Tracepoints have a stable API, and thus are recommended over kprobes, wherever possible. You can run ```perf list``` for a list of tracepoints. Linux >= 4.7 is required to attach BPF programs to tracepoints.
+1. ```args->got_bits```: ```args``` is auto-populated to be a structure of the tracepoint arguments. The comment above says where you can see that structure. Eg:
+
+```
+# cat /sys/kernel/debug/tracing/events/random/urandom_read/format
+name: urandom_read
+ID: 972
+format:
+	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
+	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
+	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
+	field:int common_pid;	offset:4;	size:4;	signed:1;
+
+	field:int got_bits;	offset:8;	size:4;	signed:1;
+	field:int pool_left;	offset:12;	size:4;	signed:1;
+	field:int input_left;	offset:16;	size:4;	signed:1;
+
+print fmt: "got_bits %d nonblocking_pool_entropy_left %d input_entropy_left %d", REC->got_bits, REC->pool_left, REC->input_left
+```
+
+In this case, we were printing the ```got_bits``` member.
+
+### Lesson 13. disksnoop.py fixed
+
+Convert disksnoop.py from a previous lesson to use the ```block:block_rq_issue``` and ```block:block_rq_complete``` tracepoints.
+
+### Lesson 14. strlen_count.py
+
+This program instruments a user-level function, the ```strlen()``` library function, and frequency counts its string argument. Example output:
+
+```
+# ./strlen_count.py
+Tracing strlen()... Hit Ctrl-C to end.
+^C     COUNT STRING
+         1 " "
+         1 "/bin/ls"
+         1 "."
+         1 "cpudist.py.1"
+         1 ".bashrc"
+         1 "ls --color=auto"
+         1 "key_t"
+[...]
+        10 "a7:~# "
+        10 "/root"
+        12 "LC_ALL"
+        12 "en_US.UTF-8"
+        13 "en_US.UTF-8"
+        20 "~"
+        70 "#%^,~:-=?+/}"
+       340 "\x01\x1b]0;root@bgregg-test: ~\x07\x02root@bgregg-test:~# "
+```
+
+These are various strings that are being processed by this library function while tracing, along with their frequency counts. ```strlen()``` was called on "LC_ALL" 12 times, for example.
+
+Code is [examples/tracing/strlen_count.py](../examples/tracing/strlen_count.py):
+
+```Python
+from bcc import BPF
+from time import sleep
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+
+struct key_t {
+	char c[80];
+};
+BPF_HASH(counts, struct key_t);
+
+int count(struct pt_regs *ctx) {
+  if (!PT_REGS_PARM1(ctx))
+    return 0;
+
+  struct key_t key = {};
+  u64 zero = 0, *val;
+
+  bpf_probe_read(&key.c, sizeof(key.c), (void *)PT_REGS_PARM1(ctx));
+
+  // another possibility is using `counts.increment(key);`. It allows a second
+  //   optional parameter to specify the increment step
+  val = counts.lookup_or_init(&key, &zero);
+  (*val)++;
+  return 0;
+};
+""")
+b.attach_uprobe(name="c", sym="strlen", fn_name="count")
+
+# header
+print("Tracing strlen()... Hit Ctrl-C to end.")
+
+# sleep until Ctrl-C
+try:
+    sleep(99999999)
+except KeyboardInterrupt:
+    pass
+
+# print output
+print("%10s %s" % ("COUNT", "STRING"))
+counts = b.get_table("counts")
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+    print("%10d \"%s\"" % (v.value, k.c.encode('string-escape')))
+```
+
+Things to learn:
+
+1. ```PT_REGS_PARM1(ctx)```: This fetches the first argument to ```strlen()```, which is the string.
+1. ```b.attach_uprobe(name="c", sym="strlen", fn_name="count")```: Attach to library "c" (if this is the main program, use its pathname), instrument the user-level function ```strlen()```, and on execution call our C function ```count()```.
+
+### Lesson 15. nodejs_http_server.py
+
+This program instruments a user-defined static tracing (USDT) probe, which is the user-level version of a kernel tracepoint. Sample output:
+
+```
+# ./nodejs_http_server.py 24728
+TIME(s)            COMM             PID    ARGS
+24653324.561322998 node             24728  path:/index.html
+24653335.343401998 node             24728  path:/images/welcome.png
+24653340.510164998 node             24728  path:/images/favicon.png
+```
+
+Relevant code from [examples/tracing/nodejs_http_server.py](../examples/tracing/nodejs_http_server.py):
+
+```Python
+if len(sys.argv) < 2:
+	print("USAGE: nodejs_http_server PID")
+	exit()
+pid = sys.argv[1]
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+int do_trace(struct pt_regs *ctx) {
+	uint64_t addr;
+	char path[128];
+	bpf_usdt_readarg(6, ctx, &addr);
+	bpf_probe_read(&path, sizeof(path), (void *)addr);
+	bpf_trace_printk("path:%s\\n", path);
+	return 0;
+};
+"""
+
+# enable USDT probe from given PID
+u = USDT(pid=int(pid))
+u.enable_probe(probe="http__server__request", fn_name="do_trace")
+
+# initialize BPF
+b = BPF(text=bpf_text, usdt_contexts=[u])
+```
+
+Things to learn:
+
+1. ```bpf_usdt_readarg(6, ctx, &addr)```: Read the address of argument 6 from the USDT probe into ```addr```.
+1. ```bpf_probe_read(&path, sizeof(path), (void *)addr)```: Now read the string that ```addr``` points to into our ```path``` variable.
+1. ```u = USDT(pid=int(pid))```: Initialize USDT tracing for the given PID.
+1. ```u.enable_probe(probe="http__server__request", fn_name="do_trace")```: Attach our ```do_trace()``` BPF C function to the Node.js ```http__server__request``` USDT probe.
+1. ```b = BPF(text=bpf_text, usdt_contexts=[u])```: Need to pass in our USDT object, ```u```, to BPF object creation.
+
+### Lesson 16. task_switch.c
+
+This is an older tutorial included as a bonus lesson. Use this for recap and to reinforce what you've already learned.
+
+This is a slightly more complex tracing example than Hello World. This program
+will be invoked for every task change in the kernel, and record in a BPF map
+the new and old pids.
+
+The C program below introduces two new concepts.
+The first is the macro `BPF_TABLE`. This defines a table (type="hash"), with key
+type `key_t` and leaf type `u64` (a single counter). The table name is `stats`,
+containing 1024 entries maximum. One can `lookup`, `lookup_or_init`, `update`,
+and `delete` entries from the table.
+The second concept is the prev argument. This argument is treated specially by
+the BCC frontend, such that accesses to this variable are read from the saved
+context that is passed by the kprobe infrastructure. The prototype of the args
+starting from position 1 should match the prototype of the kernel function being
+kprobed. If done so, the program will have seamless access to the function
+parameters.
+
+```c
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct key_t {
+  u32 prev_pid;
+  u32 curr_pid;
+};
+// map_type, key_type, leaf_type, table_name, num_entry
+BPF_HASH(stats, struct key_t, u64, 1024);
+// attach to finish_task_switch in kernel/sched/core.c, which has the following
+// prototype:
+//   struct rq *finish_task_switch(struct task_struct *prev)
+int count_sched(struct pt_regs *ctx, struct task_struct *prev) {
+  struct key_t key = {};
+  u64 zero = 0, *val;
+
+  key.curr_pid = bpf_get_current_pid_tgid();
+  key.prev_pid = prev->pid;
+
+  // another possibility is using `counts.increment(key);`. It allows a second
+  //   optional parameter to specify the increment step
+  val = stats.lookup_or_init(&key, &zero);
+  (*val)++;
+  return 0;
+}
+```
+
+The userspace component loads the file shown above, and attaches it to the
+`finish_task_switch` kernel function.
+The [] operator of the BPF object gives access to each BPF_TABLE in the
+program, allowing pass-through access to the values residing in the kernel. Use
+the object as you would any other python dict object: read, update, and deletes
+are all allowed.
+```python
+from bcc import BPF
+from time import sleep
+
+b = BPF(src_file="task_switch.c")
+b.attach_kprobe(event="finish_task_switch", fn_name="count_sched")
+
+# generate many schedule events
+for i in range(0, 100): sleep(0.01)
+
+for k, v in b["stats"].items():
+    print("task_switch[%5d->%5d]=%u" % (k.prev_pid, k.curr_pid, v.value))
+```
+
+These programs have now been merged, and are both in [examples/tracing/task_switch.py](../examples/tracing/task_switch.py).
+
+### Lesson 17. Further Study
+
+For further study, see Sasha Goldshtein's [linux-tracing-workshop](https://github.com/goldshtn/linux-tracing-workshop), which contains additional labs. There are also many tools in bcc /tools to study.
+
+Please read [CONTRIBUTING-SCRIPTS.md](../CONTRIBUTING-SCRIPTS.md) if you wish to contribute tools to bcc. At the bottom of the main [README.md](../README.md), you'll also find methods for contacting us. Good luck, and happy tracing!
+
+## Networking
+
+To do.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..c63553d
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(EXAMPLE_PROGRAMS hello_world.py)
+install(PROGRAMS ${EXAMPLE_PROGRAMS} DESTINATION share/bcc/examples)
+
+# The example subdirectories all build/run BPF programs, which requires the
+# clang JIT; the C++ examples additionally require USDT support.
+if(ENABLE_CLANG_JIT)
+  if(ENABLE_USDT)
+    add_subdirectory(cpp)
+  endif()
+  add_subdirectory(lua)
+  add_subdirectory(networking)
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
new file mode 100644
index 0000000..9d5e5d4
--- /dev/null
+++ b/examples/cpp/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+include_directories(${CMAKE_SOURCE_DIR}/src/cc)
+include_directories(${CMAKE_SOURCE_DIR}/src/cc/api)
+
+option(INSTALL_CPP_EXAMPLES "Install C++ examples. Those binaries are statically linked and can take plenty of disk space" OFF)
+
+add_executable(HelloWorld HelloWorld.cc)
+target_link_libraries(HelloWorld bcc-static)
+
+add_executable(CPUDistribution CPUDistribution.cc)
+target_link_libraries(CPUDistribution bcc-static)
+
+add_executable(RecordMySQLQuery RecordMySQLQuery.cc)
+target_link_libraries(RecordMySQLQuery bcc-static)
+
+add_executable(TCPSendStack TCPSendStack.cc)
+target_link_libraries(TCPSendStack bcc-static)
+
+add_executable(RandomRead RandomRead.cc)
+target_link_libraries(RandomRead bcc-static)
+
+add_executable(LLCStat LLCStat.cc)
+target_link_libraries(LLCStat bcc-static)
+
+add_executable(FollyRequestContextSwitch FollyRequestContextSwitch.cc)
+target_link_libraries(FollyRequestContextSwitch bcc-static)
+
+add_executable(UseExternalMap UseExternalMap.cc)
+target_link_libraries(UseExternalMap bcc-static)
+
+if(INSTALL_CPP_EXAMPLES)
+  install (TARGETS HelloWorld DESTINATION share/bcc/examples/cpp)
+  install (TARGETS CPUDistribution DESTINATION share/bcc/examples/cpp)
+  install (TARGETS RecordMySQLQuery DESTINATION share/bcc/examples/cpp)
+  install (TARGETS TCPSendStack DESTINATION share/bcc/examples/cpp)
+  install (TARGETS RandomRead DESTINATION share/bcc/examples/cpp)
+  install (TARGETS LLCStat DESTINATION share/bcc/examples/cpp)
+  install (TARGETS FollyRequestContextSwitch DESTINATION share/bcc/examples/cpp)
+  install (TARGETS UseExternalMap DESTINATION share/bcc/examples/cpp)
+endif(INSTALL_CPP_EXAMPLES)
diff --git a/examples/cpp/CPUDistribution.cc b/examples/cpp/CPUDistribution.cc
new file mode 100644
index 0000000..010339f
--- /dev/null
+++ b/examples/cpp/CPUDistribution.cc
@@ -0,0 +1,97 @@
+/*
+ * CPUDistribution Show load distribution across CPU cores during a period of
+ *                 time. For Linux, uses BCC, eBPF. Embedded C.
+ *
+ * Basic example of BCC and kprobes.
+ *
+ * USAGE: CPUDistribution [duration]
+ *
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <unistd.h>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+#include "BPF.h"
+
+const std::string BPF_PROGRAM = R"(
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(pid_to_cpu, pid_t, int);
+BPF_HASH(pid_to_ts, pid_t, uint64_t);
+BPF_HASH(cpu_time, int, uint64_t);
+
+int task_switch_event(struct pt_regs *ctx, struct task_struct *prev) {
+  pid_t prev_pid = prev->pid;
+  int* prev_cpu = pid_to_cpu.lookup(&prev_pid);
+  uint64_t* prev_ts = pid_to_ts.lookup(&prev_pid);
+
+  pid_t cur_pid = bpf_get_current_pid_tgid();
+  int cur_cpu = bpf_get_smp_processor_id();
+  uint64_t cur_ts = bpf_ktime_get_ns();
+
+  uint64_t this_cpu_time = 0;
+  if (prev_ts) {
+    pid_to_ts.delete(&prev_pid);
+    this_cpu_time = (cur_ts - *prev_ts);
+  }
+  if (prev_cpu) {
+    pid_to_cpu.delete(&prev_pid);
+    if (this_cpu_time > 0) {
+      int cpu_key = *prev_cpu;
+      uint64_t* history_time = cpu_time.lookup(&cpu_key);
+      if (history_time)
+        this_cpu_time += *history_time;
+      cpu_time.update(&cpu_key, &this_cpu_time);
+    }
+  }
+
+  pid_to_cpu.update(&cur_pid, &cur_cpu);
+  pid_to_ts.update(&cur_pid, &cur_ts);
+
+  return 0;
+}
+)";
+
+int main(int argc, char** argv) {
+  ebpf::BPF bpf;
+  auto init_res = bpf.init(BPF_PROGRAM);
+  if (init_res.code() != 0) {
+    std::cerr << init_res.msg() << std::endl;
+    return 1;
+  }
+
+  auto attach_res =
+      bpf.attach_kprobe("finish_task_switch", "task_switch_event");
+  if (attach_res.code() != 0) {
+    std::cerr << attach_res.msg() << std::endl;
+    return 1;
+  }
+
+  int probe_time = 10;
+  if (argc == 2) {
+    probe_time = atoi(argv[1]);
+  }
+  std::cout << "Probing for " << probe_time << " seconds" << std::endl;
+  sleep(probe_time);
+
+  auto table = bpf.get_hash_table<int, uint64_t>("cpu_time");
+  auto num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+  for (int i = 0; i < num_cores; i++) {
+    std::cout << "CPU " << std::setw(2) << i << " worked for ";
+    std::cout << (table[i] / 1000000.0) << " ms." << std::endl;
+  }
+
+  auto detach_res = bpf.detach_kprobe("finish_task_switch");
+  if (detach_res.code() != 0) {
+    std::cerr << detach_res.msg() << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/examples/cpp/FollyRequestContextSwitch.cc b/examples/cpp/FollyRequestContextSwitch.cc
new file mode 100644
index 0000000..a1692e6
--- /dev/null
+++ b/examples/cpp/FollyRequestContextSwitch.cc
@@ -0,0 +1,131 @@
+/*
+ * FollyRequestContextSwitch Monitor RequestContext switch events for any binary
+ *                           that uses the class from [folly](http://bit.ly/2h6S1yx).
+ *                           For Linux, uses BCC, eBPF. Embedded C.
+ *
+ * Basic example of using USDT with BCC.
+ *
+ * USAGE: FollyRequestContextSwitch PATH_TO_BINARY
+ *
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <signal.h>
+#include <functional>
+#include <iostream>
+#include <vector>
+
+#include "BPF.h"
+
+const std::string BPF_PROGRAM = R"(
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+
+struct event_t {
+  int pid;
+  char name[16];
+  uint64_t old_addr;
+  uint64_t new_addr;
+};
+
+BPF_PERF_OUTPUT(events);
+
+int on_context_switch(struct pt_regs *ctx) {
+  struct event_t event = {};
+
+  event.pid = bpf_get_current_pid_tgid();
+  bpf_get_current_comm(&event.name, sizeof(event.name));
+
+  bpf_usdt_readarg(1, ctx, &event.old_addr);
+  bpf_usdt_readarg(2, ctx, &event.new_addr);
+
+  events.perf_submit(ctx, &event, sizeof(event));
+  return 0;
+}
+)";
+
+// Define the same struct to use in user space.
+struct event_t {
+  int pid;
+  char name[16];
+  uint64_t old_addr;
+  uint64_t new_addr;
+};
+
+void handle_output(void* cb_cookie, void* data, int data_size) {
+  auto event = static_cast<event_t*>(data);
+  std::cout << "PID " << event->pid << " (" << event->name << ") ";
+  std::cout << "folly::RequestContext switch from " << event->old_addr << " to "
+            << event->new_addr << std::endl;
+}
+
+std::function<void(int)> shutdown_handler;
+
+void signal_handler(int s) { shutdown_handler(s); }
+
+int main(int argc, char** argv) {
+  std::string binary;
+  pid_t pid = -1;
+  for (int i = 0; i < argc; i++) {
+    if (strncmp(argv[i], "--pid", 5) == 0) {
+      pid = std::stoi(argv[i + 1]);
+      i++;
+      continue;
+    }
+    if (strncmp(argv[i], "--binary", 8) == 0) {
+      binary = argv[i + 1];
+      i++;
+      continue;
+    }
+  }
+
+  if (pid <= 0 && binary.empty()) {
+    std::cout << "Must specify at least one of binary or PID:" << std::endl
+              << "FollyRequestContextSwitch [--pid PID] [--binary BINARY]"
+              << std::endl;
+    exit(1);
+  }
+
+  ebpf::USDT u(binary, pid, "folly", "request_context_switch_before",
+               "on_context_switch");
+
+  ebpf::BPF* bpf = new ebpf::BPF();
+
+  auto init_res = bpf->init(BPF_PROGRAM, {}, {u});
+  if (init_res.code() != 0) {
+    std::cerr << init_res.msg() << std::endl;
+    return 1;
+  }
+
+  auto attach_res = bpf->attach_usdt(u);
+  if (attach_res.code() != 0) {
+    std::cerr << attach_res.msg() << std::endl;
+    return 1;
+  } else {
+    std::cout << "Attached to USDT " << u;
+  }
+
+  auto open_res = bpf->open_perf_buffer("events", &handle_output);
+  if (open_res.code() != 0) {
+    std::cerr << open_res.msg() << std::endl;
+    return 1;
+  }
+
+  shutdown_handler = [&](int s) {
+    std::cerr << "Terminating..." << std::endl;
+    bpf->detach_usdt(u);
+    delete bpf;
+    exit(0);
+  };
+  signal(SIGINT, signal_handler);
+
+  std::cout << "Started tracing, hit Ctrl-C to terminate." << std::endl;
+  auto perf_buffer = bpf->get_perf_buffer("events");
+  if (perf_buffer)
+    while (true)
+      // 100ms timeout
+      perf_buffer->poll(100);
+
+  return 0;
+}
diff --git a/examples/cpp/HelloWorld.cc b/examples/cpp/HelloWorld.cc
new file mode 100644
index 0000000..05e5509
--- /dev/null
+++ b/examples/cpp/HelloWorld.cc
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <unistd.h>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "BPF.h"
+
+const std::string BPF_PROGRAM = R"(
+int on_sys_clone(void *ctx) {
+  bpf_trace_printk("Hello, World! Here I did a sys_clone call!\n");
+  return 0;
+}
+)";
+
+int main() {
+  ebpf::BPF bpf;
+  auto init_res = bpf.init(BPF_PROGRAM);
+  if (init_res.code() != 0) {
+    std::cerr << init_res.msg() << std::endl;
+    return 1;
+  }
+
+  std::ifstream pipe("/sys/kernel/debug/tracing/trace_pipe");
+  std::string line;
+  std::string clone_fnname = bpf.get_syscall_fnname("clone");
+
+  auto attach_res = bpf.attach_kprobe(clone_fnname, "on_sys_clone");
+  if (attach_res.code() != 0) {
+    std::cerr << attach_res.msg() << std::endl;
+    return 1;
+  }
+
+  while (true) {
+    if (std::getline(pipe, line)) {
+      std::cout << line << std::endl;
+      // Detach the probe if we got at least one line.
+      auto detach_res = bpf.detach_kprobe(clone_fnname);
+      if (detach_res.code() != 0) {
+        std::cerr << detach_res.msg() << std::endl;
+        return 1;
+      }
+      break;
+    } else {
+      std::cout << "Waiting for a sys_clone event" << std::endl;
+      sleep(1);
+    }
+  }
+
+  return 0;
+}
diff --git a/examples/cpp/LLCStat.cc b/examples/cpp/LLCStat.cc
new file mode 100644
index 0000000..2e9d628
--- /dev/null
+++ b/examples/cpp/LLCStat.cc
@@ -0,0 +1,119 @@
+/*
+ * LLCStat Show LLC hit ratio for each process on each CPU core.
+ *         For Linux, uses BCC, eBPF. Embedded C.
+ *
+ * Basic example of BCC timed sampling perf event.
+ *
+ * USAGE: LLCStat [duration]
+ *
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <linux/perf_event.h>
+#include <unistd.h>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+#include "BPF.h"
+
+const std::string BPF_PROGRAM = R"(
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf_perf_event.h>
+
+struct event_t {
+    int cpu;
+    int pid;
+    char name[16];
+};
+
+BPF_HASH(ref_count, struct event_t);
+BPF_HASH(miss_count, struct event_t);
+
+static inline __attribute__((always_inline)) void get_key(struct event_t* key) {
+    key->cpu = bpf_get_smp_processor_id();
+    key->pid = bpf_get_current_pid_tgid();
+    bpf_get_current_comm(&(key->name), sizeof(key->name));
+}
+
+int on_cache_miss(struct bpf_perf_event_data *ctx) {
+    struct event_t key = {};
+    get_key(&key);
+
+    u64 zero = 0, *val;
+    val = miss_count.lookup_or_init(&key, &zero);
+    (*val) += ctx->sample_period;
+
+    return 0;
+}
+
+int on_cache_ref(struct bpf_perf_event_data *ctx) {
+    struct event_t key = {};
+    get_key(&key);
+
+    u64 zero = 0, *val;
+    val = ref_count.lookup_or_init(&key, &zero);
+    (*val) += ctx->sample_period;
+
+    return 0;
+}
+)";
+
+struct event_t {
+  int cpu;
+  int pid;
+  char name[16];
+};
+
+int main(int argc, char** argv) {
+  ebpf::BPF bpf;
+  auto init_res = bpf.init(BPF_PROGRAM);
+  if (init_res.code() != 0) {
+    std::cerr << init_res.msg() << std::endl;
+    return 1;
+  }
+
+  auto attach_ref_res =
+      bpf.attach_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES,
+                            "on_cache_ref", 100, 0);
+  if (attach_ref_res.code() != 0) {
+    std::cerr << attach_ref_res.msg() << std::endl;
+    return 1;
+  }
+  auto attach_miss_res = bpf.attach_perf_event(
+      PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, "on_cache_miss", 100, 0);
+  if (attach_miss_res.code() != 0) {
+    std::cerr << attach_miss_res.msg() << std::endl;
+    return 1;
+  }
+
+  int probe_time = 10;
+  if (argc == 2) {
+    probe_time = atoi(argv[1]);
+  }
+  std::cout << "Probing for " << probe_time << " seconds" << std::endl;
+  sleep(probe_time);
+  bpf.detach_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES);
+  bpf.detach_perf_event(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES);
+
+  auto refs = bpf.get_hash_table<event_t, uint64_t>("ref_count");
+  auto misses = bpf.get_hash_table<event_t, uint64_t>("miss_count");
+  for (auto it : refs.get_table_offline()) {
+    uint64_t hit;
+    try {
+      auto miss = misses[it.first];
+      hit = miss <= it.second ? it.second - miss : 0;
+    } catch (...) {
+      hit = it.second;
+    }
+    double ratio = (double(hit) / double(it.second)) * 100.0;
+    std::cout << "PID " << std::setw(8) << std::setfill(' ') << it.first.pid;
+    std::cout << std::setw(20) << std::setfill(' ') << std::left
+              << " (" + std::string(it.first.name) + ") " << std::right;
+    std::cout << "on CPU " << std::setw(2) << std::setfill(' ') << it.first.cpu;
+    std::cout << " Hit Rate " << std::setprecision(4) << ratio << "% ";
+    std::cout << "(" << hit << "/" << it.second << ")" << std::endl;
+  }
+  return 0;
+}
diff --git a/examples/cpp/RandomRead.cc b/examples/cpp/RandomRead.cc
new file mode 100644
index 0000000..5e0609a
--- /dev/null
+++ b/examples/cpp/RandomRead.cc
@@ -0,0 +1,126 @@
+/*
+ * RandomRead Monitor random number read events.
+ *            For Linux, uses BCC, eBPF. Embedded C.
+ *
+ * Basic example of BCC Tracepoint and perf buffer.
+ *
+ * USAGE: RandomRead
+ *
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <signal.h>
+#include <iostream>
+
+#include "BPF.h"
+
+const std::string BPF_PROGRAM = R"(
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+
+#ifndef CGROUP_FILTER
+#define CGROUP_FILTER 0
+#endif
+
+struct urandom_read_args {
+  // See /sys/kernel/debug/tracing/events/random/urandom_read/format
+  uint64_t common__unused;
+  int got_bits;
+  int pool_left;
+  int input_left;
+};
+
+struct event_t {
+  int pid;
+  char comm[16];
+  int cpu;
+  int got_bits;
+};
+
+BPF_PERF_OUTPUT(events);
+BPF_CGROUP_ARRAY(cgroup, 1);
+
+int on_urandom_read(struct urandom_read_args* attr) {
+  if (CGROUP_FILTER && (cgroup.check_current_task(0) != 1))
+    return 0;
+
+  struct event_t event = {};
+  event.pid = bpf_get_current_pid_tgid();
+  bpf_get_current_comm(&event.comm, sizeof(event.comm));
+  event.cpu = bpf_get_smp_processor_id();
+  event.got_bits = attr->got_bits;
+
+  events.perf_submit(attr, &event, sizeof(event));
+  return 0;
+}
+)";
+
+// Define the same struct to use in user space.
+struct event_t {
+  int pid;
+  char comm[16];
+  int cpu;
+  int got_bits;
+};
+
+void handle_output(void* cb_cookie, void* data, int data_size) {
+  auto event = static_cast<event_t*>(data);
+  std::cout << "PID: " << event->pid << " (" << event->comm << ") on CPU "
+            << event->cpu << " read " << event->got_bits << " bits"
+            << std::endl;
+}
+
+ebpf::BPF* bpf;
+
+void signal_handler(int s) {
+  std::cerr << "Terminating..." << std::endl;
+  delete bpf;
+  exit(0);
+}
+
+int main(int argc, char** argv) {
+  if (argc != 1 && argc != 2) {
+    std::cerr << "USAGE: RandomRead [cgroup2_path]" << std::endl;
+    return 1;
+  }
+
+  std::vector<std::string> cflags = {};
+  if (argc == 2)
+    cflags.emplace_back("-DCGROUP_FILTER=1");
+
+  bpf = new ebpf::BPF();
+  auto init_res = bpf->init(BPF_PROGRAM, cflags, {});
+  if (init_res.code() != 0) {
+    std::cerr << init_res.msg() << std::endl;
+    return 1;
+  }
+  if (argc == 2) {
+    auto cgroup_array = bpf->get_cgroup_array("cgroup");
+    auto update_res = cgroup_array.update_value(0, argv[1]);
+    if (update_res.code() != 0) {
+      std::cerr << update_res.msg() << std::endl;
+      return 1;
+    }
+  }
+
+  auto attach_res =
+      bpf->attach_tracepoint("random:urandom_read", "on_urandom_read");
+  if (attach_res.code() != 0) {
+    std::cerr << attach_res.msg() << std::endl;
+    return 1;
+  }
+
+  auto open_res = bpf->open_perf_buffer("events", &handle_output);
+  if (open_res.code() != 0) {
+    std::cerr << open_res.msg() << std::endl;
+    return 1;
+  }
+
+  signal(SIGINT, signal_handler);
+  std::cout << "Started tracing, hit Ctrl-C to terminate." << std::endl;
+  while (true)
+    bpf->poll_perf_buffer("events");
+
+  return 0;
+}
diff --git a/examples/cpp/RecordMySQLQuery.cc b/examples/cpp/RecordMySQLQuery.cc
new file mode 100644
index 0000000..09233ac
--- /dev/null
+++ b/examples/cpp/RecordMySQLQuery.cc
@@ -0,0 +1,104 @@
+/*
+ * RecordMySQLQuery Record MySQL queries by probing the alloc_query() function
+ *                  in mysqld. For Linux, uses BCC, eBPF. Embedded C.
+ *
+ * Basic example of BCC and uprobes.
+ *
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <unistd.h>
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+
+#include "BPF.h"
+
+const std::string BPF_PROGRAM = R"(
+#include <linux/ptrace.h>
+
+struct query_probe_t {
+  uint64_t ts;
+  pid_t pid;
+  char query[100];
+};
+
+BPF_HASH(queries, struct query_probe_t, int);
+
+int probe_mysql_query(struct pt_regs *ctx, void* thd, char* query, size_t len) {
+  if (query) {
+    struct query_probe_t key = {};
+
+    key.ts = bpf_ktime_get_ns();
+    key.pid = bpf_get_current_pid_tgid();
+
+    bpf_probe_read_str(&key.query, sizeof(key.query), query);
+
+    int one = 1;
+    queries.update(&key, &one);
+  }
+  return 0;
+}
+)";
+const std::string ALLOC_QUERY_FUNC = "_Z11alloc_queryP3THDPKcj";
+
+// Define the same struct to use in user space.
+struct query_probe_t {
+  uint64_t ts;
+  pid_t pid;
+  char query[100];
+};
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    std::cout << "USAGE: RecordMySQLQuery PATH_TO_MYSQLD [duration]"
+              << std::endl;
+    exit(1);
+  }
+
+  std::string mysql_path(argv[1]);
+  std::cout << "Using mysqld path: " << mysql_path << std::endl;
+
+  ebpf::BPF bpf;
+  auto init_res = bpf.init(BPF_PROGRAM);
+  if (init_res.code() != 0) {
+    std::cerr << init_res.msg() << std::endl;
+    return 1;
+  }
+
+  auto attach_res =
+      bpf.attach_uprobe(mysql_path, ALLOC_QUERY_FUNC, "probe_mysql_query");
+  if (attach_res.code() != 0) {
+    std::cerr << attach_res.msg() << std::endl;
+    return 1;
+  }
+
+  int probe_time = 10;
+  if (argc >= 3)
+    probe_time = atoi(argv[2]);
+  std::cout << "Probing for " << probe_time << " seconds" << std::endl;
+  sleep(probe_time);
+
+  auto table_handle = bpf.get_hash_table<query_probe_t, int>("queries");
+  auto table = table_handle.get_table_offline();
+  std::sort(
+      table.begin(), table.end(),
+      [](std::pair<query_probe_t, int> a, std::pair<query_probe_t, int> b) {
+        return a.first.ts < b.first.ts;
+      });
+  std::cout << table.size() << " queries recorded:" << std::endl;
+  for (auto it : table) {
+    std::cout << "Time: " << it.first.ts << " PID: " << it.first.pid
+              << " Query: " << it.first.query << std::endl;
+  }
+
+  auto detach_res = bpf.detach_uprobe(mysql_path, ALLOC_QUERY_FUNC);
+  if (detach_res.code() != 0) {
+    std::cerr << detach_res.msg() << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/examples/cpp/TCPSendStack.cc b/examples/cpp/TCPSendStack.cc
new file mode 100644
index 0000000..183529e
--- /dev/null
+++ b/examples/cpp/TCPSendStack.cc
@@ -0,0 +1,129 @@
+/*
+ * TCPSendStack Summarize tcp_sendmsg() calling stack traces.
+ *              For Linux, uses BCC, eBPF. Embedded C.
+ *
+ * Basic example of BCC in-kernel stack trace dedup.
+ *
+ * USAGE: TCPSendStack [duration]
+ *
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <unistd.h>
+#include <algorithm>
+#include <iostream>
+
+#include "BPF.h"
+
+const std::string BPF_PROGRAM = R"(
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+
+struct stack_key_t {
+  int pid;
+  char name[16];
+  int user_stack;
+  int kernel_stack;
+};
+
+BPF_STACK_TRACE(stack_traces, 16384);
+BPF_HASH(counts, struct stack_key_t, uint64_t);
+
+int on_tcp_send(struct pt_regs *ctx) {
+  struct stack_key_t key = {};
+  key.pid = bpf_get_current_pid_tgid() >> 32;
+  bpf_get_current_comm(&key.name, sizeof(key.name));
+  key.kernel_stack = stack_traces.get_stackid(ctx, 0);
+  key.user_stack = stack_traces.get_stackid(ctx, BPF_F_USER_STACK);
+
+  u64 zero = 0, *val;
+  val = counts.lookup_or_init(&key, &zero);
+  (*val)++;
+
+  return 0;
+}
+)";
+
+// Define the same struct to use in user space.
+struct stack_key_t {
+  int pid;
+  char name[16];
+  int user_stack;
+  int kernel_stack;
+};
+
+int main(int argc, char** argv) {
+  ebpf::BPF bpf;
+  auto init_res = bpf.init(BPF_PROGRAM);
+  if (init_res.code() != 0) {
+    std::cerr << init_res.msg() << std::endl;
+    return 1;
+  }
+
+  auto attach_res = bpf.attach_kprobe("tcp_sendmsg", "on_tcp_send");
+  if (attach_res.code() != 0) {
+    std::cerr << attach_res.msg() << std::endl;
+    return 1;
+  }
+
+  int probe_time = 10;
+  if (argc == 2) {
+    probe_time = atoi(argv[1]);
+  }
+  std::cout << "Probing for " << probe_time << " seconds" << std::endl;
+  sleep(probe_time);
+
+  auto detach_res = bpf.detach_kprobe("tcp_sendmsg");
+  if (detach_res.code() != 0) {
+    std::cerr << detach_res.msg() << std::endl;
+    return 1;
+  }
+
+  auto table =
+      bpf.get_hash_table<stack_key_t, uint64_t>("counts").get_table_offline();
+  std::sort(
+      table.begin(), table.end(),
+      [](std::pair<stack_key_t, uint64_t> a,
+         std::pair<stack_key_t, uint64_t> b) { return a.second < b.second; });
+  auto stacks = bpf.get_stack_table("stack_traces");
+
+  int lost_stacks = 0;
+  for (auto it : table) {
+    std::cout << "PID: " << it.first.pid << " (" << it.first.name << ") "
+              << "made " << it.second
+              << " TCP sends on following stack: " << std::endl;
+    if (it.first.kernel_stack >= 0) {
+      std::cout << "  Kernel Stack:" << std::endl;
+      auto syms = stacks.get_stack_symbol(it.first.kernel_stack, -1);
+      for (auto sym : syms)
+        std::cout << "    " << sym << std::endl;
+    } else {
+      // -EFAULT normally means the stack is not available and not an error
+      if (it.first.kernel_stack != -EFAULT) {
+        lost_stacks++;
+        std::cout << "    [Lost Kernel Stack" << it.first.kernel_stack << "]"
+                  << std::endl;
+      }
+    }
+    if (it.first.user_stack >= 0) {
+      std::cout << "  User Stack:" << std::endl;
+      auto syms = stacks.get_stack_symbol(it.first.user_stack, it.first.pid);
+      for (auto sym : syms)
+        std::cout << "    " << sym << std::endl;
+    } else {
+      // -EFAULT normally means the stack is not available and not an error
+      if (it.first.user_stack != -EFAULT) {
+        lost_stacks++;
+        std::cout << "    [Lost User Stack " << it.first.user_stack << "]"
+                  << std::endl;
+      }
+    }
+  }
+
+  if (lost_stacks > 0)
+    std::cout << "Total " << lost_stacks << " stack-traces lost due to "
+              << "hash collision or stack table full" << std::endl;
+
+  return 0;
+}
diff --git a/examples/cpp/UseExternalMap.cc b/examples/cpp/UseExternalMap.cc
new file mode 100644
index 0000000..3d4d759
--- /dev/null
+++ b/examples/cpp/UseExternalMap.cc
@@ -0,0 +1,134 @@
+/*
+ * UseExternalMap shows how to access an external map through
+ * C++ interface. The external map could be a pinned map.
+ * This example simulates the pinned map through a locally
+ * created map by calling libbpf bpf_create_map.
+ *
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <stdint.h>
+#include <iostream>
+
+#include "BPF.h"
+
+// Used by C++ get hash_table
+struct sched_switch_info {
+  int prev_pid;
+  int next_pid;
+  char prev_comm[16];
+  char next_comm[16];
+};
+
+#define CHECK(condition, msg)        \
+  ({                                 \
+    if (condition) {                 \
+      std::cerr << msg << std::endl; \
+      return 1;                      \
+    }                                \
+  })
+
+const std::string BPF_PROGRAM = R"(
+#include <linux/sched.h>
+
+struct sched_switch_info {
+  int prev_pid;
+  int next_pid;
+  char prev_comm[16];
+  char next_comm[16];
+};
+
+BPF_TABLE("extern", u32, u32, control, 1);
+BPF_HASH(counts, struct sched_switch_info, u32);
+int on_sched_switch(struct tracepoint__sched__sched_switch *args) {
+  struct sched_switch_info key = {};
+  u32 zero = 0, *val;
+
+  /* only do something when control is on */
+  val = control.lookup(&zero);
+  if (!val || *val == 0)
+    return 0;
+
+  /* record sched_switch info in counts table */
+  key.prev_pid = args->prev_pid;
+  key.next_pid = args->next_pid;
+  __builtin_memcpy(&key.prev_comm, args->prev_comm, 16);
+  __builtin_memcpy(&key.next_comm, args->next_comm, 16);
+  val = counts.lookup_or_init(&key, &zero);
+  (*val)++;
+
+  return 0;
+}
+)";
+
+static void print_counts(ebpf::BPF *bpfp, std::string msg) {
+  auto counts_table_hdl =
+      bpfp->get_hash_table<struct sched_switch_info, uint32_t>("counts");
+  printf("%s\n", msg.c_str());
+  printf("%-8s  %-16s      %-8s  %-16s   %-4s\n", "PREV_PID", "PREV_COMM",
+         "CURR_PID", "CURR_COMM", "CNT");
+  for (auto it : counts_table_hdl.get_table_offline()) {
+    printf("%-8d (%-16s) ==> %-8d (%-16s): %-4d\n", it.first.prev_pid,
+           it.first.prev_comm, it.first.next_pid, it.first.next_comm,
+           it.second);
+  }
+}
+
+int main() {
+  int ctrl_map_fd;
+  uint32_t val;
+
+  // create a map through bpf_create_map, bcc knows nothing about this map.
+  ctrl_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, "control", sizeof(uint32_t),
+                               sizeof(uint32_t), 1, 0);
+  CHECK(ctrl_map_fd < 0, "bpf_create_map failure");
+
+  // populate control map into TableStorage
+  std::unique_ptr<ebpf::TableStorage> local_ts =
+      ebpf::createSharedTableStorage();
+  ebpf::Path global_path({"control"});
+  ebpf::TableDesc table_desc("control", ebpf::FileDesc(ctrl_map_fd),
+                             BPF_MAP_TYPE_ARRAY, sizeof(uint32_t),
+                             sizeof(uint32_t), 1, 0);
+  local_ts->Insert(global_path, std::move(table_desc));
+
+  // constructor with the pre-populated table storage
+  ebpf::BPF bpf(0, &*local_ts);
+  auto res = bpf.init(BPF_PROGRAM);
+  CHECK(res.code(), res.msg());
+
+  // attach to the tracepoint sched:sched_switch
+  res = bpf.attach_tracepoint("sched:sched_switch", "on_sched_switch");
+  CHECK(res.code(), res.msg());
+
+  // wait for some scheduling events
+  sleep(1);
+
+  auto control_table_hdl = bpf.get_array_table<uint32_t>("control");
+  res = control_table_hdl.get_value(0, val);
+  CHECK(res.code() || val != 0, res.msg());
+
+  // we should not see any events here
+  print_counts(&bpf, "events with control off:");
+
+  printf("\n");
+
+  // change the control to on so bpf program starts to count events
+  val = 1;
+  res = control_table_hdl.update_value(0, val);
+  CHECK(res.code(), res.msg());
+
+  // verify we get the control on back
+  val = 0;
+  res = control_table_hdl.get_value(0, val);
+  CHECK(res.code() || val != 1, res.msg());
+
+  // wait for some scheduling events
+  sleep(1);
+
+  // we should see a bunch of events here
+  print_counts(&bpf, "events with control on:");
+
+  return 0;
+}
diff --git a/examples/hello_world.py b/examples/hello_world.py
new file mode 100755
index 0000000..49c5535
--- /dev/null
+++ b/examples/hello_world.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# run in project examples directory with:
+# sudo ./hello_world.py
+# see trace_fields.py for a longer example
+
+from bcc import BPF
+
+# This may not work for 4.17 on x64, you need replace kprobe__sys_clone with kprobe____x64_sys_clone
+BPF(text='int kprobe__sys_clone(void *ctx) { bpf_trace_printk("Hello, World!\\n"); return 0; }').trace_print()
diff --git a/examples/lua/CMakeLists.txt b/examples/lua/CMakeLists.txt
new file mode 100644
index 0000000..b322078
--- /dev/null
+++ b/examples/lua/CMakeLists.txt
@@ -0,0 +1,4 @@
+file(GLOB C_FILES *.c)
+file(GLOB LUA_FILES *.lua)
+install(FILES ${C_FILES} DESTINATION share/bcc/examples/lua)
+install(PROGRAMS ${LUA_FILES} DESTINATION share/bcc/examples/lua)
\ No newline at end of file
diff --git a/examples/lua/bashreadline.c b/examples/lua/bashreadline.c
new file mode 100644
index 0000000..fad33d7
--- /dev/null
+++ b/examples/lua/bashreadline.c
@@ -0,0 +1,21 @@
+#include <uapi/linux/ptrace.h>
+
+struct str_t {
+	u64 pid;
+	char str[80];
+};
+
+BPF_PERF_OUTPUT(events);
+
+int printret(struct pt_regs *ctx)
+{
+	struct str_t data  = {};
+	u32 pid;
+        if (!PT_REGS_RC(ctx))
+          return 0;
+        pid = bpf_get_current_pid_tgid();
+        data.pid = pid;
+        bpf_probe_read(&data.str, sizeof(data.str), (void *)PT_REGS_RC(ctx));
+        events.perf_submit(ctx, &data, sizeof(data));
+        return 0;
+};
diff --git a/examples/lua/bashreadline.lua b/examples/lua/bashreadline.lua
new file mode 100755
index 0000000..f7bb9ff
--- /dev/null
+++ b/examples/lua/bashreadline.lua
@@ -0,0 +1,31 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require("ffi")
+
+return function(BPF)
+  local b = BPF:new{src_file="bashreadline.c", debug=0}
+  b:attach_uprobe{name="/bin/bash", sym="readline", fn_name="printret", retprobe=true}
+
+  local function print_readline(cpu, event)
+    print("%-9s %-6d %s" % {os.date("%H:%M:%S"), tonumber(event.pid), ffi.string(event.str)})
+  end
+
+  b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }", nil)
+
+  print("%-9s %-6s %s" % {"TIME", "PID", "COMMAND"})
+  b:perf_buffer_poll_loop()
+end
diff --git a/examples/lua/kprobe-latency.lua b/examples/lua/kprobe-latency.lua
new file mode 100644
index 0000000..60ac2c1
--- /dev/null
+++ b/examples/lua/kprobe-latency.lua
@@ -0,0 +1,79 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- This example program measures latency of block device operations and plots it
+-- in a histogram. It is similar to BPF example:
+-- https://github.com/torvalds/linux/blob/master/samples/bpf/tracex3_kern.c
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+
+-- Shared part of the program
+local bins = 100
+local map = bpf.map('hash', 512, ffi.typeof('uint64_t'), ffi.typeof('uint64_t'))
+local lat_map = bpf.map('array', bins)
+
+-- Kernel-space part of the program
+local trace_start = bpf.kprobe('myprobe:blk_start_request', function (ptregs)
+	map[ptregs.parm1] = time()
+end, false, -1, 0)
+local trace_end = bpf.kprobe('myprobe2:blk_account_io_completion', function (ptregs)
+	-- The lines below are computing index
+	-- using log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+	-- index = 29 ~ 1 usec
+	-- index = 59 ~ 1 msec
+	-- index = 89 ~ 1 sec
+	-- index = 99 ~ 10sec or more
+	local delta = time() - map[ptregs.parm1]
+	local index = 3 * math.log2(delta)
+	if index >= bins then
+		index = bins-1
+	end
+	xadd(lat_map[index], 1)
+	return true
+end, false, -1, 0)
+-- User-space part of the program
+pcall(function()
+	local counter = 0
+	local sym = {' ',' ','.','.','*','*','o','o','O','O','#','#'}
+	while true do
+		-- Print header once in a while
+		if counter % 50 == 0 then
+			print('|1us      |10us     |100us    |1ms      |10ms     |100ms    |1s       |10s')
+			counter = 0
+		end
+		counter = counter + 1
+		-- Collect all events
+		local hist, events = {}, 0
+		for i=29,bins-1 do
+			local v = tonumber(lat_map[i] or 0)
+			if v > 0 then
+				hist[i] = (hist[i] or 0) + v
+				events = events + v
+			end
+		end
+		-- Print histogram symbols based on relative frequency
+		local s = ''
+		for i=29,bins-1 do
+			if hist[i] then
+				local c = math.ceil((hist[i] / (events + 1)) * #sym)
+				s = s .. sym[c]
+			else s = s .. ' ' end
+		end
+		print(s .. string.format('  ; %d events', events))
+		S.sleep(1)
+	end
+end)
\ No newline at end of file
diff --git a/examples/lua/kprobe-write.lua b/examples/lua/kprobe-write.lua
new file mode 100644
index 0000000..38f5a20
--- /dev/null
+++ b/examples/lua/kprobe-write.lua
@@ -0,0 +1,35 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Simple tracing example that executes a program on
+-- return from sys_write() and tracks the number of hits
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+
+-- Shared part of the program
+local map = bpf.map('array', 1)
+-- Kernel-space part of the program
+local probe = bpf.kprobe('myprobe:sys_write', function (ptregs)
+   xadd(map[0], 1)
+end, true)
+-- User-space part of the program
+pcall(function()
+	for _ = 1, 10 do
+	   print('hits: ', tonumber(map[0]))
+	   S.sleep(1)
+	end
+end)
diff --git a/examples/lua/memleak.lua b/examples/lua/memleak.lua
new file mode 100755
index 0000000..99c1522
--- /dev/null
+++ b/examples/lua/memleak.lua
@@ -0,0 +1,205 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+
+local bpf_source = [[
+#include <uapi/linux/ptrace.h>
+
+struct alloc_info_t {
+        u64 size;
+        u64 timestamp_ns;
+        int stack_id;
+};
+
+BPF_HASH(sizes, u64);
+BPF_HASH(allocs, u64, struct alloc_info_t);
+BPF_STACK_TRACE(stack_traces, 10240);
+
+int alloc_enter(struct pt_regs *ctx, size_t size)
+{
+        SIZE_FILTER
+        if (SAMPLE_EVERY_N > 1) {
+                u64 ts = bpf_ktime_get_ns();
+                if (ts % SAMPLE_EVERY_N != 0)
+                        return 0;
+        }
+
+        u64 pid = bpf_get_current_pid_tgid();
+        u64 size64 = size;
+        sizes.update(&pid, &size64);
+
+        if (SHOULD_PRINT)
+                bpf_trace_printk("alloc entered, size = %u\n", size);
+        return 0;
+}
+
+int alloc_exit(struct pt_regs *ctx)
+{
+        u64 address = PT_REGS_RC(ctx);
+        u64 pid = bpf_get_current_pid_tgid();
+        u64* size64 = sizes.lookup(&pid);
+        struct alloc_info_t info = {0};
+
+        if (size64 == 0)
+                return 0; // missed alloc entry
+
+        info.size = *size64;
+        sizes.delete(&pid);
+
+        info.timestamp_ns = bpf_ktime_get_ns();
+        info.stack_id = stack_traces.get_stackid(ctx, STACK_FLAGS);
+
+        allocs.update(&address, &info);
+
+        if (SHOULD_PRINT) {
+                bpf_trace_printk("alloc exited, size = %lu, result = %lx\n",
+                                 info.size, address);
+        }
+        return 0;
+}
+
+int free_enter(struct pt_regs *ctx, void *address)
+{
+        u64 addr = (u64)address;
+        struct alloc_info_t *info = allocs.lookup(&addr);
+        if (info == 0)
+                return 0;
+
+        allocs.delete(&addr);
+
+        if (SHOULD_PRINT) {
+                bpf_trace_printk("free entered, address = %lx, size = %lu\n",
+                                 address, info->size);
+        }
+        return 0;
+}
+]]
+
+return function(BPF, utils)
+  local parser = utils.argparse("memleak", "Catch memory leaks")
+  parser:flag("-t --trace")
+  parser:flag("-a --show-allocs")
+  parser:option("-p --pid"):convert(tonumber)
+
+  parser:option("-i --interval", "", 5):convert(tonumber)
+  parser:option("-o --older", "", 500):convert(tonumber)
+  parser:option("-s --sample-rate", "", 1):convert(tonumber)
+
+  parser:option("-z --min-size", ""):convert(tonumber)
+  parser:option("-Z --max-size", ""):convert(tonumber)
+  parser:option("-T --top", "", 10):convert(tonumber)
+
+  local args = parser:parse()
+
+  local size_filter = ""
+  if args.min_size and args.max_size then
+    size_filter = "if (size < %d || size > %d) return 0;" %  {args.min_size, args.max_size}
+  elseif args.min_size then
+    size_filter = "if (size < %d) return 0;" % args.min_size
+  elseif args.max_size then
+    size_filter = "if (size > %d) return 0;" % args.max_size
+  end
+
+  local stack_flags = "BPF_F_REUSE_STACKID"
+  if args.pid then
+    stack_flags = stack_flags .. "|BPF_F_USER_STACK"
+  end
+
+  local text = bpf_source
+  text = text:gsub("SIZE_FILTER", size_filter)
+  text = text:gsub("STACK_FLAGS",  stack_flags)
+  text = text:gsub("SHOULD_PRINT", args.trace and "1" or "0")
+  text = text:gsub("SAMPLE_EVERY_N", tostring(args.sample_rate))
+
+  local bpf = BPF:new{text=text, debug=0}
+  local syms = nil
+  local min_age_ns = args.older * 1e6
+
+  if args.pid then
+    print("Attaching to malloc and free in pid %d, Ctrl+C to quit." % args.pid)
+    bpf:attach_uprobe{name="c", sym="malloc", fn_name="alloc_enter", pid=args.pid}
+    bpf:attach_uprobe{name="c", sym="malloc", fn_name="alloc_exit", pid=args.pid, retprobe=true}
+    bpf:attach_uprobe{name="c", sym="free", fn_name="free_enter", pid=args.pid}
+  else
+    print("Attaching to kmalloc and kfree, Ctrl+C to quit.")
+    bpf:attach_kprobe{event="__kmalloc", fn_name="alloc_enter"}
+    bpf:attach_kprobe{event="__kmalloc", fn_name="alloc_exit", retprobe=true} -- TODO
+    bpf:attach_kprobe{event="kfree", fn_name="free_enter"}
+  end
+
+  local syms = BPF.SymbolCache(args.pid)
+  local allocs = bpf:get_table("allocs")
+  local stack_traces = bpf:get_table("stack_traces")
+
+  local function resolve(addr)
+    local sym = syms:resolve(addr)
+    if args.pid == nil then
+      sym = sym .. " [kernel]"
+    end
+    return string.format("%s (%p)", sym, addr)
+  end
+
+  local function print_outstanding()
+    local alloc_info = {}
+    local now = utils.posix.time_ns()
+
+    print("[%s] Top %d stacks with outstanding allocations:" %
+      {os.date("%H:%M:%S"), args.top})
+
+    for address, info in allocs:items() do
+      if now - min_age_ns >= tonumber(info.timestamp_ns) then
+        local stack_id = tonumber(info.stack_id)
+
+        if stack_id >= 0 then
+          if alloc_info[stack_id] then
+            local s = alloc_info[stack_id]
+            s.count = s.count + 1
+            s.size = s.size + tonumber(info.size)
+          else
+            local stack = stack_traces:get(stack_id, resolve)
+            alloc_info[stack_id] = { stack=stack, count=1, size=tonumber(info.size) }
+          end
+        end
+
+        if args.show_allocs then
+          print("\taddr = %p size = %s" % {address, tonumber(info.size)})
+        end
+      end
+    end
+
+    local top = table.values(alloc_info)
+    table.sort(top, function(a, b) return a.size > b.size end)
+
+    for n, alloc in ipairs(top) do
+      print("\t%d bytes in %d allocations from stack\n\t\t%s" %
+        {alloc.size, alloc.count, table.concat(alloc.stack, "\n\t\t")})
+      if n == args.top then break end
+    end
+  end
+
+  if args.trace then
+    local pipe = bpf:pipe()
+    while true do
+      print(pipe:trace_fields())
+    end
+  else
+    while true do
+      utils.posix.sleep(args.interval)
+      syms:refresh()
+      print_outstanding()
+    end
+  end
+end
diff --git a/examples/lua/offcputime.lua b/examples/lua/offcputime.lua
new file mode 100755
index 0000000..09c704f
--- /dev/null
+++ b/examples/lua/offcputime.lua
@@ -0,0 +1,116 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+
+local program = [[
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#define MINBLOCK_US	1
+
+struct key_t {
+    char name[TASK_COMM_LEN];
+    int stack_id;
+};
+BPF_HASH(counts, struct key_t);
+BPF_HASH(start, u32);
+BPF_STACK_TRACE(stack_traces, 10240);
+
+int oncpu(struct pt_regs *ctx, struct task_struct *prev) {
+    u32 pid;
+    u64 ts, *tsp;
+
+    // record previous thread sleep time
+    if (FILTER) {
+        pid = prev->pid;
+        ts = bpf_ktime_get_ns();
+        start.update(&pid, &ts);
+    }
+
+    // calculate current thread's delta time
+    pid = bpf_get_current_pid_tgid();
+    tsp = start.lookup(&pid);
+    if (tsp == 0)
+        return 0;        // missed start or filtered
+    u64 delta = bpf_ktime_get_ns() - *tsp;
+    start.delete(&pid);
+    delta = delta / 1000;
+    if (delta < MINBLOCK_US)
+        return 0;
+
+    // create map key
+    u64 zero = 0, *val;
+    struct key_t key = {};
+    int stack_flags = BPF_F_REUSE_STACKID;
+
+    /*
+    if (!(prev->flags & PF_KTHREAD))
+      stack_flags |= BPF_F_USER_STACK;
+    */
+
+    bpf_get_current_comm(&key.name, sizeof(key.name));
+    key.stack_id = stack_traces.get_stackid(ctx, stack_flags);
+
+    val = counts.lookup_or_init(&key, &zero);
+    (*val) += delta;
+    return 0;
+}
+]]
+
+return function(BPF, utils)
+  local ffi = require("ffi")
+
+  local parser = utils.argparse("offcputime", "Summarize off-cpu time")
+  parser:flag("-u --user-only")
+  parser:option("-p --pid"):convert(tonumber)
+  parser:flag("-f --folded")
+  parser:option("-d --duration", "duration to trace for", 9999999):convert(tonumber)
+
+  local args = parser:parse()
+  local ksym = BPF.SymbolCache()
+  local filter = "1"
+  local MAXDEPTH = 20
+
+  if args.pid then
+    filter = "pid == %d" % args.pid
+  elseif args.user_only then
+    filter = "!(prev->flags & PF_KTHREAD)"
+  end
+
+  local text = program:gsub("FILTER", filter)
+  local b = BPF:new{text=text}
+  b:attach_kprobe{event="finish_task_switch", fn_name="oncpu"}
+
+  if BPF.num_open_kprobes() == 0 then
+    print("no functions matched. quitting...")
+    return
+  end
+
+  print("Sleeping for %d seconds..." % args.duration)
+  pcall(utils.posix.sleep, args.duration)
+  print("Tracing...")
+
+  local counts = b:get_table("counts")
+  local stack_traces = b:get_table("stack_traces")
+
+  for k, v in counts:items() do
+    for addr in stack_traces:walk(tonumber(k.stack_id)) do
+      print("    %-16p %s" % {addr, ksym:resolve(addr)})
+    end
+    print("    %-16s %s" % {"-", ffi.string(k.name)})
+    print("        %d\n" % tonumber(v))
+  end
+end
diff --git a/examples/lua/sock-parse-dns.lua b/examples/lua/sock-parse-dns.lua
new file mode 100644
index 0000000..3c20517
--- /dev/null
+++ b/examples/lua/sock-parse-dns.lua
@@ -0,0 +1,56 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Simple parsing example of UDP/DNS that counts frequency of QTYPEs.
+-- It shows how to parse packet variable-length packet structures.
+local ffi = require("ffi")
+local bpf = require("bpf")
+local S = require("syscall")
+
+-- Shared part of the program
+local map = assert(bpf.map('array', 256))
+-- Kernel-space part of the program
+local prog = bpf.socket('lo', function (skb)
+	local ip = pkt.ip   -- Accept only UDP messages
+	if ip.proto ~= c.ip.proto_udp then return false end
+	local udp = ip.udp  -- Only messages >12 octets (DNS header)
+	if udp.length < 12 then return false end
+	-- Unroll QNAME (up to 2 labels)
+	udp = udp.data + 12
+	local label = udp[0]
+	if label > 0 then
+		udp = udp + label + 1
+		label = udp[0]
+		if label > 0 then
+			udp = udp + label + 1
+		end
+	end
+	-- Track QTYPE (low types)
+	if udp[0] == 0 then
+		local qtype = udp[2] -- Low octet from QTYPE
+		xadd(map[qtype], 1)
+	end
+end)
+-- User-space part of the program
+for _ = 1, 10 do
+	for k,v in map.pairs,map,0 do
+		v = tonumber(v)
+		if v > 0 then
+			print(string.format('TYPE%d: %d', k, v))
+		end
+	end
+	S.sleep(1)
+end
\ No newline at end of file
diff --git a/examples/lua/sock-parse-http.lua b/examples/lua/sock-parse-http.lua
new file mode 100644
index 0000000..477b049
--- /dev/null
+++ b/examples/lua/sock-parse-http.lua
@@ -0,0 +1,57 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Simple parsing example of TCP/HTTP that counts frequency of types of requests
+-- and shows more complicated pattern matching constructions and slices.
+-- Rewrite of a BCC example:
+-- https://github.com/iovisor/bcc/blob/master/examples/networking/http_filter/http-parse-simple.c
+local ffi = require("ffi")
+local bpf = require("bpf")
+local S = require("syscall")
+
+-- Shared part of the program
+local map = bpf.map('hash', 64)
+-- Kernel-space part of the program
+local prog = bpf.socket('lo', function (skb)
+	-- Only ingress so we don't count twice on loopback
+	if skb.ingress_ifindex == 0 then return end
+	local data = pkt.ip.tcp.data  -- Get TCP protocol dissector
+	-- Continue only if we have 7 bytes of TCP data
+	if data + 7 > skb.len then return end
+	-- Fetch 4 bytes of TCP data and compare
+	local h = data(0, 4)
+	if h == 'HTTP' or h == 'GET ' or
+	   h == 'POST' or h == 'PUT ' or
+	   h == 'HEAD' or h == 'DELE' then
+	   	-- If hash key doesn't exist, create it
+	   	-- otherwise increment counter
+	   local v = map[h]
+	   if not v then map[h] = 1
+	   else          xadd(map[h], 1)
+	   end
+	end
+end)
+-- User-space part of the program
+for _ = 1, 10 do
+	local strkey = ffi.new('uint32_t [1]')
+	local s = ''
+	for k,v in map.pairs,map,0 do
+		strkey[0] = bpf.ntoh(k)
+		s = s..string.format('%s %d ', ffi.string(strkey, 4):match '^%s*(.-)%s*$', tonumber(v))
+	end
+	if #s > 0 then print(s..'messages') end
+	S.sleep(1)
+end
\ No newline at end of file
diff --git a/examples/lua/sock-proto.lua b/examples/lua/sock-proto.lua
new file mode 100644
index 0000000..ab9d3e2
--- /dev/null
+++ b/examples/lua/sock-proto.lua
@@ -0,0 +1,38 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- This program looks at IP, UDP and ICMP packets and
+-- increments counter for each packet of given type seen
+-- Rewrite of https://github.com/torvalds/linux/blob/master/samples/bpf/sock_example.c
+local ffi = require("ffi")
+local bpf = require("bpf")
+local S = require("syscall")
+
+-- Shared part of the program
+local map = bpf.map('hash', 256)
+map[1], map[6], map[17] = 0, 0, 0
+-- Kernel-space part of the program
+bpf.socket('lo', function (skb)
+   local proto = pkt.ip.proto  -- Get byte (ip.proto) from frame at [23]
+   xadd(map[proto], 1)         -- Atomic `map[proto] += 1`
+end)
+-- User-space part of the program
+for _ = 1, 10 do
+   local icmp, udp, tcp = map[1], map[17], map[6]
+   print(string.format('TCP %d UDP %d ICMP %d packets',
+   	     tonumber(tcp or 0), tonumber(udp or 0), tonumber(icmp or 0)))
+   S.sleep(1)
+end
\ No newline at end of file
diff --git a/examples/lua/sock-protolen.lua b/examples/lua/sock-protolen.lua
new file mode 100644
index 0000000..6ad6e3b
--- /dev/null
+++ b/examples/lua/sock-protolen.lua
@@ -0,0 +1,38 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- This program counts total bytes received per-protocol in 64-bit counters.
+-- The map backend is array in this case to avoid key allocations.
+-- It adds the payload length to the counter for each packet of a given type seen.
+-- Rewrite of https://github.com/torvalds/linux/blob/master/samples/bpf/sock_example.c
+local ffi = require("ffi")
+local bpf = require("bpf")
+local S = require("syscall")
+
+-- Shared part of the program
+local map = bpf.map('array', 256, ffi.typeof('uint32_t'), ffi.typeof('uint64_t'))
+-- Kernel-space part of the program
+bpf.socket('lo', function (skb)
+	local proto = pkt.ip.proto  -- Get byte (ip.proto) from frame at [23]
+	xadd(map[proto], skb.len)   -- Atomic `map[proto] += <payload length>`
+end)
+-- User-space part of the program
+for _ = 1, 10 do
+	local icmp, udp, tcp = map[1], map[17], map[6]
+	print(string.format('TCP %d UDP %d ICMP %d bytes',
+		tonumber(tcp or 0), tonumber(udp or 0), tonumber(icmp or 0)))
+	S.sleep(1)
+end
\ No newline at end of file
diff --git a/examples/lua/strlen_count.lua b/examples/lua/strlen_count.lua
new file mode 100755
index 0000000..553d043
--- /dev/null
+++ b/examples/lua/strlen_count.lua
@@ -0,0 +1,44 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+
+assert(arg[1], "usage: strlen_count PID")
+
+local program = string.gsub([[
+#include <uapi/linux/ptrace.h>
+int printarg(struct pt_regs *ctx) {
+  if (!PT_REGS_PARM1(ctx))
+    return 0;
+  u32 pid = bpf_get_current_pid_tgid();
+  if (pid != PID)
+    return 0;
+  char str[128] = {};
+  bpf_probe_read(&str, sizeof(str), (void *)PT_REGS_PARM1(ctx));
+  bpf_trace_printk("strlen(\"%s\")\n", &str);
+  return 0;
+};
+]], "PID", arg[1])
+
+return function(BPF)
+  local b = BPF:new{text=program, debug=0}
+  b:attach_uprobe{name="c", sym="strlen", fn_name="printarg"}
+
+  local pipe = b:pipe()
+  while true do
+    local task, pid, cpu, flags, ts, msg = pipe:trace_fields()
+    print("%-18.9f %-16s %-6d %s" % {ts, task, pid, msg})
+  end
+end
diff --git a/examples/lua/task_switch.lua b/examples/lua/task_switch.lua
new file mode 100755
index 0000000..1c0aaa8
--- /dev/null
+++ b/examples/lua/task_switch.lua
@@ -0,0 +1,52 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+
+local program = [[
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct key_t {
+  u32 prev_pid;
+  u32 curr_pid;
+};
+// map_type, key_type, leaf_type, table_name, num_entry
+BPF_HASH(stats, struct key_t, u64, 1024);
+int count_sched(struct pt_regs *ctx, struct task_struct *prev) {
+  struct key_t key = {};
+  u64 zero = 0, *val;
+
+  key.curr_pid = bpf_get_current_pid_tgid();
+  key.prev_pid = prev->pid;
+
+  val = stats.lookup_or_init(&key, &zero);
+  (*val)++;
+  return 0;
+}
+]]
+
+return function(BPF)
+  local b = BPF:new{text=program, debug=0}
+  b:attach_kprobe{event="finish_task_switch", fn_name="count_sched"}
+
+  print("Press any key...")
+  io.read()
+
+  local t = b:get_table("stats")
+  for k, v in t:items() do
+    print("task_switch[%d -> %d] = %d" % {k.prev_pid, k.curr_pid, tonumber(v)})
+  end
+end
diff --git a/examples/lua/tracepoint-offcputime.lua b/examples/lua/tracepoint-offcputime.lua
new file mode 100644
index 0000000..fccf0b7
--- /dev/null
+++ b/examples/lua/tracepoint-offcputime.lua
@@ -0,0 +1,80 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Summarize off-CPU time by stack trace
+-- Related tool: https://github.com/iovisor/bcc/blob/master/tools/offcputime.py
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+-- Create BPF maps
+-- TODO: made smaller to fit default memory limits
+local key_t = 'struct { char name[16]; int32_t stack_id; }'
+local starts = assert(bpf.map('hash', 128, ffi.typeof('uint32_t'), ffi.typeof('uint64_t')))
+local counts = assert(bpf.map('hash', 128, ffi.typeof(key_t), ffi.typeof('uint64_t')))
+local stack_traces = assert(bpf.map('stack_trace', 16))
+-- Open tracepoint and attach BPF program
+-- The 'arg' parses tracepoint format automatically
+local tp = bpf.tracepoint('sched/sched_switch', function (arg)
+	-- Update previous thread sleep time
+	local pid = arg.prev_pid
+	local now = time()
+	starts[pid] = now
+	-- Calculate current thread's delta time
+	pid = arg.next_pid
+	local from = starts[pid]
+	if not from then
+		return 0
+	end
+	local delta = (now - from) / 1000
+	starts[pid] = nil
+	-- Check if the delta is below 1us
+	if delta < 1 then
+		return
+	end
+	-- Create key for this thread
+	local key = ffi.new(key_t)
+	comm(key.name)
+	key.stack_id = stack_id(stack_traces, BPF.F_FAST_STACK_CMP)
+	-- Update current thread off cpu time with delta
+	local val = counts[key]
+	if not val then
+		counts[key] = 0
+	end
+	xadd(counts[key], delta)
+end, 0, -1)
+-- Helper: load kernel symbols
+ffi.cdef 'unsigned long long strtoull(const char *, char **, int);'
+local ksyms = {}
+for l in io.lines('/proc/kallsyms') do
+	local addr, sym = l:match '(%w+) %w (%S+)'
+	if addr then ksyms[ffi.C.strtoull(addr, nil, 16)] = sym end
+end
+-- User-space part of the program
+while true do
+	for k,v in counts.pairs,counts,nil do
+		local s = ''
+		local traces = stack_traces[k.stack_id]
+		if traces then
+			for i, ip in ipairs(traces) do
+				s = s .. string.format("    %-16p %s", ip, ksyms[ip])
+			end
+		end
+		s = s .. string.format("    %-16s %s", "-", ffi.string(k.name))
+		s = s .. string.format("        %d", tonumber(v))
+		print(s)
+	end
+	S.sleep(1)
+end
diff --git a/examples/lua/uprobe-readline-perf.lua b/examples/lua/uprobe-readline-perf.lua
new file mode 100644
index 0000000..aaf3f40
--- /dev/null
+++ b/examples/lua/uprobe-readline-perf.lua
@@ -0,0 +1,42 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Trace readline() call from all bash instances (print bash commands from all running shells).
+-- This is a rough equivalent to `bashreadline` with output through the perf event API.
+-- Source: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+-- Perf event map
+local sample_t = 'struct { uint64_t pid; char str[80]; }'
+local events = bpf.map('perf_event_array')
+-- Kernel-space part of the program
+local probe = bpf.uprobe('/bin/bash:readline', function (ptregs)
+	local sample = ffi.new(sample_t)
+	sample.pid = pid_tgid()
+	ffi.copy(sample.str, ffi.cast('char *', ptregs.ax)) -- Cast `ax` to string pointer and copy to buffer
+	perf_submit(events, sample)                         -- Write buffer to perf event map
+end, true, -1, 0)
+-- User-space part of the program
+local log = events:reader(nil, 0, sample_t) -- Must specify PID or CPU_ID to observe
+print('            TASK-PID         TIMESTAMP  FUNCTION')
+print('               | |               |         |')
+while true do
+	log:block()               -- Wait until event reader is readable
+	for _,e in log:read() do  -- Collect available reader events
+		print(string.format('%12s%-16s %-10s %s', '', tonumber(e.pid), os.date("%H:%M:%S"), ffi.string(e.str)))
+	end
+end
diff --git a/examples/lua/uprobe-readline.lua b/examples/lua/uprobe-readline.lua
new file mode 100644
index 0000000..7c76950
--- /dev/null
+++ b/examples/lua/uprobe-readline.lua
@@ -0,0 +1,37 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Trace readline() call from all bash instances (print bash commands from all running shells).
+-- This is a rough equivalent to `bashreadline`
+-- Source: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+-- Kernel-space part of the program
+local probe = bpf.uprobe('/bin/bash:readline', function (ptregs)
+	local line = ffi.new('char [40]')              -- Create a 40 byte buffer on stack
+	ffi.copy(line, ffi.cast('char *', ptregs.ax))  -- Cast `ax` to string pointer and copy to buffer
+	print('%s\n', line)                            -- Print to trace_pipe
+end, true, -1, 0)
+-- User-space part of the program
+local ok, err = pcall(function()
+	local log = bpf.tracelog()
+	print('            TASK-PID   CPU#         TIMESTAMP  FUNCTION')
+	print('               | |      |               |         |')
+	while true do
+		print(log:read())
+	end
+end)
diff --git a/examples/lua/uprobe-tailkt.lua b/examples/lua/uprobe-tailkt.lua
new file mode 100644
index 0000000..071b2de
--- /dev/null
+++ b/examples/lua/uprobe-tailkt.lua
@@ -0,0 +1,65 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Trace operations on keys matching given pattern in KyotoTycoon daemon.
+-- This can show you if certain keys were modified or read during the lifetime
+-- even if KT doesn't support this. It also shows how to attach to C++ mangled symbols.
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+local function help(err)
+	print(string.format('%s [get|set] [key]', arg[0]))
+	if err then print('error: '..err) end
+	os.exit(1)
+end
+-- Accept the same format as ktremotemgr for clarity: <get|set> <key>
+local writeable, watch_key, klen = 'any', arg[2] or '*', 80
+if     arg[1] == 'get' then writeable = 0
+elseif arg[1] == 'set' then writeable = 1
+elseif arg[1] == '-h' or arg[1] == '--help' then help()
+elseif arg[1] and arg[1] ~= 'any' then
+	help(string.format('bad cmd: "%s"', arg[1]))
+end
+if watch_key ~= '*' then klen = #watch_key end
+
+-- Find a good entrypoint that has both key and differentiates read/write in KT
+-- That is going to serve as an attachment point for BPF program
+-- ABI: bool accept(void *this, const char* kbuf, size_t ksiz, Visitor* visitor, bool writable)
+local key_type = string.format('char [%d]', klen)
+local probe = bpf.uprobe('/usr/local/bin/ktserver:kyotocabinet::StashDB::accept',
+function (ptregs)
+	-- Watch either get/set or both
+	if writeable ~= 'any' then
+		if ptregs.parm5 ~= writeable then return end
+	end
+	local line = ffi.new(key_type)
+	ffi.copy(line, ffi.cast('char *', ptregs.parm2))
+	-- Check if we're looking for specific key
+	if watch_key ~= '*' then
+		if ptregs.parm3 ~= klen then return false end
+		if line ~= watch_key then return false end
+	end
+	print('%s write:%d\n', line, ptregs.parm5)
+end, false, -1, 0)
+-- User-space part of the program
+local ok, err = pcall(function()
+	local log = bpf.tracelog()
+	print('            TASK-PID   CPU#         TIMESTAMP  FUNCTION')
+	print('               | |      |               |         |')
+	while true do
+		print(log:read())
+	end
+end)
diff --git a/examples/lua/usdt_ruby.lua b/examples/lua/usdt_ruby.lua
new file mode 100755
index 0000000..5b5df2d
--- /dev/null
+++ b/examples/lua/usdt_ruby.lua
@@ -0,0 +1,46 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+
+local program = [[
+#include <uapi/linux/ptrace.h>
+int trace_method(struct pt_regs *ctx) {
+  uint64_t addr;
+  bpf_usdt_readarg(2, ctx, &addr);
+
+  char fn_name[128] = {};
+  bpf_probe_read(&fn_name, sizeof(fn_name), (void *)addr);
+
+  bpf_trace_printk("%s(...)\n", fn_name);
+  return 0;
+};
+]]
+
+return function(BPF, util)
+  if not arg[1] then
+    print("usage: rubysyms.lua PID")
+    return
+  end
+
+  local u = util.USDT:new{pid=tonumber(arg[1])}
+  u:enable_probe{probe="method__entry", fn_name="trace_method"}
+
+  local b = BPF:new{text=program, usdt=u}
+  local pipe = b:pipe()
+  while true do
+    print(pipe:trace_fields())
+  end
+end
diff --git a/examples/networking/CMakeLists.txt b/examples/networking/CMakeLists.txt
new file mode 100644
index 0000000..790f033
--- /dev/null
+++ b/examples/networking/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(EXAMPLE_FILES simulation.py)
+set(EXAMPLE_PROGRAMS simple_tc.py tc_perf_event.py)
+install(FILES ${EXAMPLE_FILES} DESTINATION share/bcc/examples/networking)
+install(PROGRAMS ${EXAMPLE_PROGRAMS} DESTINATION share/bcc/examples/networking)
+
+add_subdirectory(distributed_bridge)
+add_subdirectory(neighbor_sharing)
+add_subdirectory(vlan_learning)
+add_subdirectory(tunnel_monitor)
+add_subdirectory(http_filter)
+add_subdirectory(xdp)
diff --git a/examples/networking/distributed_bridge/CMakeLists.txt b/examples/networking/distributed_bridge/CMakeLists.txt
new file mode 100644
index 0000000..2c7ec5e
--- /dev/null
+++ b/examples/networking/distributed_bridge/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(EXAMPLE_FILES simulation.py tunnel.c tunnel_mesh.c)
+set(EXAMPLE_PROGRAMS main.py tunnel_mesh.py tunnel.py)
+install(FILES ${EXAMPLE_FILES} DESTINATION share/bcc/examples/networking/distributed_bridge)
+install(PROGRAMS ${EXAMPLE_PROGRAMS} DESTINATION share/bcc/examples/networking/distributed_bridge)
diff --git a/examples/networking/distributed_bridge/main.py b/examples/networking/distributed_bridge/main.py
new file mode 100755
index 0000000..056443e
--- /dev/null
+++ b/examples/networking/distributed_bridge/main.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from sys import argv
+from builtins import input
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+from simulation import Simulation
+from subprocess import PIPE, call, Popen
+import re
+
+multicast = 1
+dhcp = 0
+gretap = 0
+
+if "mesh" in argv:
+    multicast = 0
+
+if "dhcp" in argv:
+    dhcp = 1
+    multicast = 0
+
+if "gretap" in argv:
+    gretap = 1
+    multicast = 0
+
+print("multicast %d dhcp %d gretap %d" % (multicast, dhcp, gretap))
+
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+
+num_hosts = 3
+null = open("/dev/null", "w")
+
+class TunnelSimulation(Simulation):
+    def __init__(self, ipdb):
+        super(TunnelSimulation, self).__init__(ipdb)
+
+    def start(self):
+        # each entry is tuple of ns_ipdb, out_ifc, in_ifc
+        host_info = []
+        for i in range(0, num_hosts):
+            print("Launching host %i of %i" % (i + 1, num_hosts))
+            ipaddr = "172.16.1.%d/24" % (100 + i)
+            host_info.append(self._create_ns("host%d" % i, ipaddr=ipaddr,
+                disable_ipv6=True))
+            if multicast:
+              cmd = ["python", "tunnel.py", str(i)]
+            else:
+              cmd = ["python", "tunnel_mesh.py", str(num_hosts), str(i), str(dhcp), str(gretap)]
+            p = NSPopen(host_info[i][0].nl.netns, cmd, stdin=PIPE)
+            self.processes.append(p)
+        with self.ipdb.create(ifname="br-fabric", kind="bridge") as br:
+            for host in host_info: br.add_port(host[1])
+            br.up()
+
+        # get host0 bridge ip's
+        host0_br_ips = []
+        if dhcp == 1:
+            print("Waiting for host0 br1/br2 ip addresses available")
+            for j in range(0, 2):
+                interface = host_info[0][0].interfaces["br%d" % j]
+                interface.wait_ip("99.1.0.0", 16, timeout=60)
+                host0_br_ips = [x[0] for x in interface.ipaddr
+                                if x[0].startswith("99.1")]
+        else:
+            host0_br_ips.append("99.1.0.1")
+            host0_br_ips.append("99.1.1.1")
+
+        # traffic test
+        print("Validating connectivity")
+        for i in range(1, num_hosts):
+            for j in range(0, 2):
+                interface = host_info[i][0].interfaces["br%d" % j]
+                interface.wait_ip("99.1.0.0", 16, timeout=60)
+                print("VNI%d between host0 and host%d" % (10000 + j, i))
+                call(["ip", "netns", "exec", "host%d" % i,
+                      "ping", host0_br_ips[j], "-c", "3", "-i", "0.2", "-q"])
+
+try:
+    sim = TunnelSimulation(ipdb)
+    sim.start()
+    input("Press enter to quit:")
+    for p in sim.processes: p.communicate(b"\n")
+except:
+    if "sim" in locals():
+        for p in sim.processes: p.kill(); p.wait(); p.release()
+finally:
+    if "br-fabric" in ipdb.interfaces: ipdb.interfaces["br-fabric"].remove().commit()
+    if "sim" in locals(): sim.release()
+    ipdb.release()
+    null.close()
diff --git a/examples/networking/distributed_bridge/simulation.py b/examples/networking/distributed_bridge/simulation.py
new file mode 120000
index 0000000..98a2055
--- /dev/null
+++ b/examples/networking/distributed_bridge/simulation.py
@@ -0,0 +1 @@
+../simulation.py
\ No newline at end of file
diff --git a/examples/networking/distributed_bridge/tunnel.c b/examples/networking/distributed_bridge/tunnel.c
new file mode 100644
index 0000000..0bd9982
--- /dev/null
+++ b/examples/networking/distributed_bridge/tunnel.c
@@ -0,0 +1,83 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <bcc/proto.h>
+
+BPF_HASH(vni2if, u32, int, 1024);
+
+struct vni_key {
+  u64 mac;
+  int ifindex;
+  int pad;
+};
+struct host {
+  u32 tunnel_id;
+  u32 remote_ipv4;
+  u64 rx_pkts;
+  u64 tx_pkts;
+};
+BPF_HASH(mac2host, struct vni_key, struct host);
+
+struct config {
+  int tunnel_ifindex;
+};
+BPF_HASH(conf, int, struct config, 1);
+
+// Handle packets from the encap device, demux into the dest tenant
+int handle_ingress(struct __sk_buff *skb) {
+  u8 *cursor = 0;
+
+  struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+
+  struct bpf_tunnel_key tkey = {};
+  bpf_skb_get_tunnel_key(skb, &tkey,
+      offsetof(struct bpf_tunnel_key, remote_ipv6[1]), 0);
+
+  int *ifindex = vni2if.lookup(&tkey.tunnel_id);
+  if (ifindex) {
+    //bpf_trace_printk("ingress tunnel_id=%d ifindex=%d\n", tkey.tunnel_id, *ifindex);
+    struct vni_key vk = {ethernet->src, *ifindex, 0};
+    struct host *src_host = mac2host.lookup_or_init(&vk,
+        &(struct host){tkey.tunnel_id, tkey.remote_ipv4, 0, 0});
+    lock_xadd(&src_host->rx_pkts, 1);
+    bpf_clone_redirect(skb, *ifindex, 1/*ingress*/);
+  } else {
+    bpf_trace_printk("ingress invalid tunnel_id=%d\n", tkey.tunnel_id);
+  }
+
+  return 1;
+}
+
+// Handle packets from the tenant, mux into the encap device
+int handle_egress(struct __sk_buff *skb) {
+  u8 *cursor = 0;
+
+  int one = 1;
+  struct config *cfg = conf.lookup(&one);
+  if (!cfg) return 1;
+
+  struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+
+  struct vni_key vk = {ethernet->dst, skb->ifindex, 0};
+  struct host *dst_host = mac2host.lookup(&vk);
+  struct bpf_tunnel_key tkey = {};
+  if (dst_host) {
+    u32 zero = 0;
+    tkey.tunnel_id = dst_host->tunnel_id;
+    tkey.remote_ipv4 = dst_host->remote_ipv4;
+    bpf_skb_set_tunnel_key(skb, &tkey,
+        offsetof(struct bpf_tunnel_key, remote_ipv6[1]), 0);
+    lock_xadd(&dst_host->tx_pkts, 1);
+  } else {
+    struct bpf_tunnel_key tkey = {};
+    vk.mac = 0xFFFFFFFFFFFFull;
+    dst_host = mac2host.lookup(&vk);
+    if (!dst_host)
+      return 1;
+    tkey.tunnel_id = dst_host->tunnel_id;
+    tkey.remote_ipv4 = dst_host->remote_ipv4;
+    bpf_skb_set_tunnel_key(skb, &tkey,
+        offsetof(struct bpf_tunnel_key, remote_ipv6[1]), 0);
+  }
+  bpf_clone_redirect(skb, cfg->tunnel_ifindex, 0/*egress*/);
+  return 1;
+}
diff --git a/examples/networking/distributed_bridge/tunnel.py b/examples/networking/distributed_bridge/tunnel.py
new file mode 100755
index 0000000..ef94292
--- /dev/null
+++ b/examples/networking/distributed_bridge/tunnel.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from sys import argv
+from bcc import BPF
+from builtins import input
+from ctypes import c_int, c_uint
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+import json
+from netaddr import EUI, IPAddress
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+from socket import htons, AF_INET
+from threading import Thread
+from subprocess import call
+
+host_id = int(argv[1])
+
+b = BPF(src_file="tunnel.c")
+ingress_fn = b.load_func("handle_ingress", BPF.SCHED_CLS)
+egress_fn = b.load_func("handle_egress", BPF.SCHED_CLS)
+mac2host = b.get_table("mac2host")
+vni2if = b.get_table("vni2if")
+conf = b.get_table("conf")
+
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+
+ifc = ipdb.interfaces.eth0
+mcast = IPAddress("239.1.1.1")
+
+# ifcs to cleanup at the end
+ifc_gc = []
+
+def run():
+    ipdb.routes.add({"dst": "224.0.0.0/4", "oif": ifc.index}).commit()
+    with ipdb.create(ifname="vxlan0", kind="vxlan", vxlan_id=0,
+                     vxlan_link=ifc, vxlan_port=4789,
+                     vxlan_group=str(mcast), vxlan_flowbased=True,
+                     vxlan_collect_metadata=True,
+                     vxlan_learning=False) as vx:
+        vx.up()
+        ifc_gc.append(vx.ifname)
+
+    conf[c_int(1)] = c_int(vx.index)
+
+    ipr.tc("add", "ingress", vx.index, "ffff:")
+    ipr.tc("add-filter", "bpf", vx.index, ":1", fd=ingress_fn.fd,
+           name=ingress_fn.name, parent="ffff:", action="drop", classid=1)
+
+    for i in range(0, 2):
+        vni = 10000 + i
+        with ipdb.create(ifname="br%d" % i, kind="bridge") as br:
+            v = ipdb.create(ifname="dummy%d" % i, kind="dummy").up().commit()
+            mcast_key = mac2host.Key(0xFFFFFFFFFFFF, v.index, 0)
+            mcast_leaf = mac2host.Leaf(vni, mcast.value, 0, 0)
+            mac2host[mcast_key] = mcast_leaf
+
+            ipr.tc("add", "sfq", v.index, "1:")
+            ipr.tc("add-filter", "bpf", v.index, ":1", fd=egress_fn.fd,
+                   name=egress_fn.name, parent="1:", action="drop", classid=1)
+            br.add_port(v)
+            br.up()
+            ifc_gc.append(v.ifname)
+            ifc_gc.append(br.ifname)
+            vni2if[c_uint(vni)] = c_int(v.index)
+            ipaddr = "99.1.%d.%d/24" % (i, host_id + 1)
+            br.add_ip(ipaddr)
+
+try:
+    run()
+    ipdb.release()
+    input("")
+    print("---")
+    for k, v in mac2host.items():
+        print(EUI(k.mac), k.ifindex, IPAddress(v.remote_ipv4),
+              v.tunnel_id, v.rx_pkts, v.tx_pkts)
+finally:
+    for v in ifc_gc: call(["ip", "link", "del", v])
diff --git a/examples/networking/distributed_bridge/tunnel_mesh.c b/examples/networking/distributed_bridge/tunnel_mesh.c
new file mode 100644
index 0000000..ea64511
--- /dev/null
+++ b/examples/networking/distributed_bridge/tunnel_mesh.c
@@ -0,0 +1,66 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <bcc/proto.h>
+
+struct config {
+  int tunnel_ifindex;
+};
+BPF_HASH(conf, int, struct config, 1);
+
+struct tunnel_key {
+  u32 tunnel_id;
+  u32 remote_ipv4;
+};
+BPF_HASH(tunkey2if, struct tunnel_key, int, 1024);
+
+BPF_HASH(if2tunkey, int, struct tunnel_key, 1024);
+
+// Handle packets from the encap device, demux into the dest tenant
+int handle_ingress(struct __sk_buff *skb) {
+  struct bpf_tunnel_key tkey = {};
+  struct tunnel_key key;
+  bpf_skb_get_tunnel_key(skb, &tkey,
+      offsetof(struct bpf_tunnel_key, remote_ipv6[1]), 0);
+
+  key.tunnel_id = tkey.tunnel_id;
+  key.remote_ipv4 = tkey.remote_ipv4;
+  int *ifindex = tunkey2if.lookup(&key);
+  if (ifindex) {
+    //bpf_trace_printk("ingress tunnel_id=%d remote_ip=%08x ifindex=%d\n",
+    //                 key.tunnel_id, key.remote_ipv4, *ifindex);
+    // mark from external
+    skb->tc_index = 1;
+    bpf_clone_redirect(skb, *ifindex, 1/*ingress*/);
+  } else {
+    bpf_trace_printk("ingress invalid tunnel_id=%d\n", key.tunnel_id);
+  }
+
+  return 1;
+}
+
+// Handle packets from the tenant, mux into the encap device
+int handle_egress(struct __sk_buff *skb) {
+  int ifindex = skb->ifindex;
+  struct bpf_tunnel_key tkey = {};
+  struct tunnel_key *key_p;
+  int one = 1;
+  struct config *cfg = conf.lookup(&one);
+
+  if (!cfg) return 1;
+
+  if (skb->tc_index) {
+    //bpf_trace_printk("from external\n");
+    // don't send it back out to encap device
+    return 1;
+  }
+
+  key_p = if2tunkey.lookup(&ifindex);
+  if (key_p) {
+    tkey.tunnel_id = key_p->tunnel_id;
+    tkey.remote_ipv4 = key_p->remote_ipv4;
+    bpf_skb_set_tunnel_key(skb, &tkey,
+        offsetof(struct bpf_tunnel_key, remote_ipv6[1]), 0);
+    bpf_clone_redirect(skb, cfg->tunnel_ifindex, 0/*egress*/);
+  }
+  return 1;
+}
diff --git a/examples/networking/distributed_bridge/tunnel_mesh.py b/examples/networking/distributed_bridge/tunnel_mesh.py
new file mode 100644
index 0000000..f111ac9
--- /dev/null
+++ b/examples/networking/distributed_bridge/tunnel_mesh.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from sys import argv
+from bcc import BPF
+from builtins import input
+from ctypes import c_int, c_uint
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+import json
+from netaddr import EUI, IPAddress
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+from socket import htons, AF_INET
+from threading import Thread
+from subprocess import call, Popen, PIPE
+
+num_hosts = int(argv[1])
+host_id = int(argv[2])
+dhcp = int(argv[3])
+gretap = int(argv[4])
+
+b = BPF(src_file="tunnel_mesh.c")
+ingress_fn = b.load_func("handle_ingress", BPF.SCHED_CLS)
+egress_fn = b.load_func("handle_egress", BPF.SCHED_CLS)
+tunkey2if = b.get_table("tunkey2if")
+if2tunkey = b.get_table("if2tunkey")
+conf = b.get_table("conf")
+
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+
+ifc = ipdb.interfaces.eth0
+
+# ifcs to cleanup at the end
+ifc_gc = []
+
+# dhcp server and client processes
+d_serv = []
+d_client = []
+
+def run():
+    if gretap:
+        with ipdb.create(ifname="gretap1", kind="gretap", gre_ikey=0, gre_okey=0,
+                         gre_local='172.16.1.%d' % (100 + host_id),
+                         gre_ttl=16, gre_collect_metadata=1) as vx:
+            vx.up()
+            ifc_gc.append(vx.ifname)
+    else:
+        with ipdb.create(ifname="vxlan0", kind="vxlan", vxlan_id=0,
+                         vxlan_link=ifc, vxlan_port=4789,
+                         vxlan_collect_metadata=True,
+                         vxlan_learning=False) as vx:
+            vx.up()
+            ifc_gc.append(vx.ifname)
+
+    conf[c_int(1)] = c_int(vx.index)
+
+    ipr.tc("add", "ingress", vx.index, "ffff:")
+    ipr.tc("add-filter", "bpf", vx.index, ":1", fd=ingress_fn.fd,
+           name=ingress_fn.name, parent="ffff:", action="drop", classid=1)
+
+    for j in range(0, 2):
+        vni = 10000 + j
+        with ipdb.create(ifname="br%d" % j, kind="bridge") as br:
+            for i in range(0, num_hosts):
+                if i != host_id:
+                    v = ipdb.create(ifname="dummy%d%d" % (j , i), kind="dummy").up().commit()
+                    ipaddr = "172.16.1.%d" % (100 + i)
+                    tunkey2if_key = tunkey2if.Key(vni)
+                    tunkey2if_key.remote_ipv4 = IPAddress(ipaddr)
+                    tunkey2if_leaf = tunkey2if.Leaf(v.index)
+                    tunkey2if[tunkey2if_key] = tunkey2if_leaf
+
+                    if2tunkey_key = if2tunkey.Key(v.index)
+                    if2tunkey_leaf = if2tunkey.Leaf(vni)
+                    if2tunkey_leaf.remote_ipv4 = IPAddress(ipaddr)
+                    if2tunkey[if2tunkey_key] = if2tunkey_leaf
+
+                    ipr.tc("add", "sfq", v.index, "1:")
+                    ipr.tc("add-filter", "bpf", v.index, ":1", fd=egress_fn.fd,
+                       name=egress_fn.name, parent="1:", action="drop", classid=1)
+                    br.add_port(v)
+                    br.up()
+                    ifc_gc.append(v.ifname)
+            if dhcp == 0:
+                ipaddr = "99.1.%d.%d/24" % (j, host_id + 1)
+                br.add_ip(ipaddr)
+            ifc_gc.append(br.ifname)
+
+    # dhcp server only runs on host 0
+    if dhcp == 1 and host_id == 0:
+        for j in range(0, 2):
+            v1 = "dhcp%d_v1" % j
+            v2 = "dhcp%d_v2" % j
+            br = ipdb.interfaces["br%d" % j]
+            with ipdb.create(ifname=v1, kind="veth", peer=v2) as v:
+                    v.up()
+            br.add_port(ipdb.interfaces[v1]).commit()
+            dhcp_v2 = ipdb.interfaces[v2]
+            dhcp_v2.add_ip("99.1.%d.1/24" % j).up().commit()
+
+            call(["/bin/rm", "-f", "/tmp/dnsmasq.%d.leases" % j])
+            cmd = ["dnsmasq", "-d", "--bind-interfaces", "--strict-order",
+                   "--conf-file=",
+                   "--dhcp-range", "99.1.%d.2,99.1.%d.254,255.255.255.0,12h" % (j, j),
+                   "--dhcp-no-override", "--except-interface=lo",
+                   "--interface=dhcp%d_v2" % j,
+                   "--dhcp-authoritative",
+                   "--dhcp-leasefile=/tmp/dnsmasq.%d.leases" % j]
+            d_serv.append(Popen(cmd, stdout=PIPE, stderr=PIPE))
+
+    # dhcp client to assign ip address for each bridge
+    if dhcp == 1:
+        for j in range(0, 2):
+            call(["/bin/rm", "-rf", "/tmp/dhcp_%d_%d" % (host_id, j)])
+            call(["mkdir", "/tmp/dhcp_%d_%d" % (host_id, j)])
+            call(["touch", "/tmp/dhcp_%d_%d/dhclient.conf" % (host_id, j)])
+            call(["touch", "/tmp/dhcp_%d_%d/dhclient.lease" % (host_id, j)])
+            cmd = ["dhclient", "-d", "br%d" % j,
+                   "-cf", "/tmp/dhcp_%d_%d/dhclient.conf" % (host_id, j),
+                   "-lf", "/tmp/dhcp_%d_%d/dhclient.lease" % (host_id, j)]
+            d_client.append(Popen(cmd, stdout=PIPE, stderr=PIPE))
+
+            # make sure we get address for eth0
+            retry = -1
+            while retry < 0:
+                check = Popen(["ip", "addr", "show", "br%d" % j], stdout=PIPE, stderr=PIPE)
+                out = check.stdout.read()
+                checkip = b"99.1.%d" % j
+                retry = out.find(checkip)
+
+try:
+    run()
+    input("")
+finally:
+    for v in ifc_gc: call(["ip", "link", "del", v])
+    ipdb.release()
+    for p in d_client: p.kill()
+    for p in d_serv: p.kill()
diff --git a/examples/networking/dns_matching/dns_matching.c b/examples/networking/dns_matching/dns_matching.c
new file mode 100644
index 0000000..ce36e60
--- /dev/null
+++ b/examples/networking/dns_matching/dns_matching.c
@@ -0,0 +1,103 @@
+/*
+ * dns_matching.c  Drop DNS packets requesting DNS name contained in hash map
+ *    For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Copyright (c) 2016 Rudi Floren.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 11-May-2016  Rudi Floren Created this.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/udp.h>
+#include <bcc/proto.h>
+
+#define ETH_LEN 14
+
+struct dns_hdr_t
+{
+    uint16_t id;
+    uint16_t flags;
+    uint16_t qdcount;
+    uint16_t ancount;
+    uint16_t nscount;
+    uint16_t arcount;
+} BPF_PACKET_HEADER;
+
+
+struct dns_query_flags_t
+{
+  uint16_t qtype;
+  uint16_t qclass;
+} BPF_PACKET_HEADER;
+
+struct dns_char_t
+{
+    char c;
+} BPF_PACKET_HEADER;
+
+struct Key {
+  unsigned char p[255];
+};
+
+struct Leaf {
+  // Not really needed in this example
+  unsigned char p[4];
+};
+
+BPF_HASH(cache, struct Key, struct Leaf, 128);
+
+int dns_matching(struct __sk_buff *skb)
+{
+  u8 *cursor = 0;
+  struct Key key = {};
+  // Check for ethernet/IP frame.
+  struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+  if(ethernet->type == ETH_P_IP) {
+
+    // Check for UDP.
+    struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+    u16 hlen_bytes = ip->hlen << 2;
+    if(ip->nextp == IPPROTO_UDP) {
+
+      // Check for Port 53, DNS packet.
+      struct udp_t *udp = cursor_advance(cursor, sizeof(*udp));
+      if(udp->dport == 53){
+
+        struct dns_hdr_t *dns_hdr = cursor_advance(cursor, sizeof(*dns_hdr));
+
+        // Do nothing if packet is not a request.
+        if((dns_hdr->flags >>15) != 0) {
+          // Exit if this packet is not a request.
+          return -1;
+        }
+
+        u16 i = 0;
+        struct dns_char_t *c;
+        #pragma unroll
+        for(i = 0; i<255;i++){
+          c = cursor_advance(cursor, 1);
+          if (c->c == 0)
+            break;
+          key.p[i] = c->c;
+        }
+
+        struct Leaf * lookup_leaf = cache.lookup(&key);
+
+        // If DNS name is contained in our map, keep the packet
+        if(lookup_leaf) {
+          bpf_trace_printk("Matched1\n");
+          return -1;
+        }
+      }
+    }
+  }
+  // Drop the packet
+  return 0;
+}
diff --git a/examples/networking/dns_matching/dns_matching.py b/examples/networking/dns_matching/dns_matching.py
new file mode 100755
index 0000000..943dca5
--- /dev/null
+++ b/examples/networking/dns_matching/dns_matching.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+from bcc import BPF
+from ctypes import *
+
+import os
+import sys
+import fcntl
+import dnslib
+import argparse
+
+
+def encode_dns(name):
+  if len(name) + 1 > 255:
+    raise Exception("DNS Name too long.")
+  b = bytearray()
+  for element in name.split('.'):
+    sublen = len(element)
+    if sublen > 63:
+      raise ValueError('DNS label %s is too long' % element)
+    b.append(sublen)
+    b.extend(element.encode('ascii'))
+  b.append(0)  # Add 0-len octet label for the root server
+  return b
+
+def add_cache_entry(cache, name):
+  key = cache.Key()
+  key_len = len(key.p)
+  name_buffer = encode_dns(name)
+  # Pad the buffer with null bytes if it is too short
+  name_buffer.extend((0,) * (key_len - len(name_buffer)))
+  key.p = (c_ubyte * key_len).from_buffer(name_buffer)
+  leaf = cache.Leaf()
+  leaf.p = (c_ubyte * 4).from_buffer(bytearray(4))
+  cache[key] = leaf
+
+
+parser = argparse.ArgumentParser(usage='For detailed information about usage,\
+ try with -h option')
+req_args = parser.add_argument_group("Required arguments")
+req_args.add_argument("-i", "--interface", type=str, default="",
+                      help="Interface name, defaults to all if unspecified.")
+req_args.add_argument("-d", "--domains", type=str, required=True, nargs="+",
+    help='List of domain names separated by space. For example: -d abc.def xyz.mno')
+args = parser.parse_args()
+
+# initialize BPF - load source code from http-parse-simple.c
+bpf = BPF(src_file = "dns_matching.c", debug=0)
+# print(bpf.dump_func("dns_test"))
+
+#load eBPF program http_filter of type SOCKET_FILTER into the kernel eBPF vm
+#more info about eBPF program types
+#http://man7.org/linux/man-pages/man2/bpf.2.html
+function_dns_matching = bpf.load_func("dns_matching", BPF.SOCKET_FILTER)
+
+
+#create raw socket, bind it to user provided interface
+#attach bpf program to socket created
+BPF.attach_raw_socket(function_dns_matching, args.interface)
+
+# Get the table.
+cache = bpf.get_table("cache")
+
+# Add cache entries
+for e in args.domains:
+  print(">>>> Adding map entry: ", e)
+  add_cache_entry(cache, e)
+
+print("\nTry to lookup some domain names using nslookup from another terminal.")
+print("For example:  nslookup foo.bar")
+print("\nBPF program will filter-in DNS packets which match with map entries.")
+print("Packets received by user space program will be printed here")
+print("\nHit Ctrl+C to end...")
+
+socket_fd = function_dns_matching.sock
+fl = fcntl.fcntl(socket_fd, fcntl.F_GETFL)
+fcntl.fcntl(socket_fd, fcntl.F_SETFL, fl & (~os.O_NONBLOCK))
+
+while 1:
+  #retrieve raw packet from socket
+  try:
+    packet_str = os.read(socket_fd, 2048)
+  except KeyboardInterrupt:
+    sys.exit(0)
+  packet_bytearray = bytearray(packet_str)
+
+  ETH_HLEN = 14
+  UDP_HLEN = 8
+
+  #IP HEADER
+  #calculate ip header length
+  ip_header_length = packet_bytearray[ETH_HLEN]               #load Byte
+  ip_header_length = ip_header_length & 0x0F                  #mask bits 0..3
+  ip_header_length = ip_header_length << 2                    #shift to obtain length
+
+  #calculate payload offset
+  payload_offset = ETH_HLEN + ip_header_length + UDP_HLEN
+
+  payload = packet_bytearray[payload_offset:]
+  # pass the payload to dnslib for parsing
+  dnsrec = dnslib.DNSRecord.parse(payload)
+  print (dnsrec.questions, "\n")
diff --git a/examples/networking/http_filter/CMakeLists.txt b/examples/networking/http_filter/CMakeLists.txt
new file mode 100644
index 0000000..6f854e7
--- /dev/null
+++ b/examples/networking/http_filter/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(FILES http-parse-complete.c http-parse-simple.c README.md)
+set(PROGRAMS http-parse-complete.py http-parse-simple.py)
+install(FILES ${FILES} DESTINATION share/bcc/examples/networking/http_filter)
+install(PROGRAMS ${PROGRAMS} DESTINATION share/bcc/examples/networking/http_filter)
diff --git a/examples/networking/http_filter/README.md b/examples/networking/http_filter/README.md
new file mode 100644
index 0000000..8e1daf1
--- /dev/null
+++ b/examples/networking/http_filter/README.md
@@ -0,0 +1,47 @@
+# HTTP Filter
+
+eBPF application that parses HTTP packets and extracts (and prints on screen) the URL contained in the GET/POST request.
+
+[eBPF HTTP Filter - Short Presentation](https://github.com/iovisor/bpf-docs/blob/master/ebpf_http_filter.pdf)
+
+## Usage Example
+
+
+    $ sudo python http-parse-complete.py 
+    GET /pipermail/iovisor-dev/ HTTP/1.1
+    HTTP/1.1 200 OK
+    GET /favicon.ico HTTP/1.1
+    HTTP/1.1 404 Not Found
+    GET /pipermail/iovisor-dev/2016-January/thread.html HTTP/1.1
+    HTTP/1.1 200 OK
+    GET /pipermail/iovisor-dev/2016-January/000046.html HTTP/1.1
+    HTTP/1.1 200 OK
+
+
+## Implementation overview
+
+The implementation is split in two portions: the former that exploits eBPF code, the latter that performs some additional processing in user space (the python wrapper).
+
+### First part: eBPF filter
+This component filters IP and TCP packets containing the "HTTP", "GET", "POST" strings in their payload and all subsequent packets belonging to the same session, having the same (ip.src,ip.dst,port.src,port.dst) tuple.
+
+The program is loaded as PROG_TYPE_SOCKET_FILTER and attached to a socket, bound to eth0.
+
+Matching packets are forwarded to user space, the others are dropped by the filter.
+
+### Second part: python code in user space
+The Python script reads filtered raw packets from the socket, if necessary reassembles packets belonging to the same session, and prints on stdout the first line of the HTTP GET/POST request.
+
+## Simple vs. complete
+
+Two versions of this code are available in this repository:
+
+* simple version: it does not handle URLs that span across multiple packets. For instance, if the URL is too long it shows only the portion contained in the first packet.
+* complete version: it is able to cope with URLs spanning across multiple packets; if such a situation is detected, the code reassembles packets belonging to the same session and prints the complete URL.
+
+## How to execute this sample
+
+This sample can be executed by typing either one of the two commands below:
+ 
+    $ sudo python http-parse-simple.py
+    $ sudo python http-parse-complete.py
diff --git a/examples/networking/http_filter/http-parse-complete.c b/examples/networking/http_filter/http-parse-complete.c
new file mode 100644
index 0000000..dff16b9
--- /dev/null
+++ b/examples/networking/http_filter/http-parse-complete.c
@@ -0,0 +1,157 @@
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+#define IP_TCP 	6
+#define ETH_HLEN 14
+
+struct Key {
+	u32 src_ip;               //source ip
+	u32 dst_ip;               //destination ip
+	unsigned short src_port;  //source port
+	unsigned short dst_port;  //destination port
+};
+
+struct Leaf {
+	int timestamp;            //timestamp in ns
+};
+
+//BPF_TABLE(map_type, key_type, leaf_type, table_name, num_entry)
+//map <Key, Leaf>
+//tracing sessions having same Key(dst_ip, src_ip, dst_port,src_port)
+BPF_HASH(sessions, struct Key, struct Leaf, 1024);
+
+/*eBPF program.
+  Filter IP and TCP packets, having payload not empty
+  and containing "HTTP", "GET", "POST"  as first bytes of payload.
+  AND ALL the other packets having same (src_ip,dst_ip,src_port,dst_port)
+  this means belonging to the same "session"
+  this additional check avoids url truncation, if url is too long
+  userspace script, if necessary, reassembles URLs split across two or more packets.
+  if the program is loaded as PROG_TYPE_SOCKET_FILTER
+  and attached to a socket
+  return  0 -> DROP the packet
+  return -1 -> KEEP the packet and return it to user space (userspace can read it from the socket_fd )
+*/
+int http_filter(struct __sk_buff *skb) {
+
+	u8 *cursor = 0;
+
+	struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+	//filter IP packets (ethernet type = 0x0800)
+	if (!(ethernet->type == 0x0800)) {
+		goto DROP;
+	}
+
+	struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+	//filter TCP packets (ip next protocol = 0x06)
+	if (ip->nextp != IP_TCP) {
+		goto DROP;
+	}
+
+	u32  tcp_header_length = 0;
+	u32  ip_header_length = 0;
+	u32  payload_offset = 0;
+	u32  payload_length = 0;
+	struct Key 	key;
+	struct Leaf zero = {0};
+
+        //calculate ip header length
+        //value to multiply * 4
+        //e.g. ip->hlen = 5 ; IP Header Length = 5 x 4 byte = 20 byte
+        ip_header_length = ip->hlen << 2;    //SHL 2 -> *4 multiply
+
+        //check ip header length against minimum
+        if (ip_header_length < sizeof(*ip)) {
+                goto DROP;
+        }
+
+        //shift cursor forward for dynamic ip header size
+        void *_ = cursor_advance(cursor, (ip_header_length-sizeof(*ip)));
+
+	struct tcp_t *tcp = cursor_advance(cursor, sizeof(*tcp));
+
+	//retrieve ip src/dest and port src/dest of current packet
+	//and save it into struct Key
+	key.dst_ip = ip->dst;
+	key.src_ip = ip->src;
+	key.dst_port = tcp->dst_port;
+	key.src_port = tcp->src_port;
+
+	//calculate tcp header length
+	//value to multiply *4
+	//e.g. tcp->offset = 5 ; TCP Header Length = 5 x 4 byte = 20 byte
+	tcp_header_length = tcp->offset << 2; //SHL 2 -> *4 multiply
+
+	//calculate payload offset and length
+	payload_offset = ETH_HLEN + ip_header_length + tcp_header_length;
+	payload_length = ip->tlen - ip_header_length - tcp_header_length;
+
+	//http://stackoverflow.com/questions/25047905/http-request-minimum-size-in-bytes
+	//minimum length of http request is always greater than 7 bytes
+	//avoid invalid access memory
+	//include empty payload
+	if(payload_length < 7) {
+		goto DROP;
+	}
+
+	//load first 7 byte of payload into p (payload_array)
+	//direct access to skb not allowed
+	unsigned long p[7];
+	int i = 0;
+	int j = 0;
+	const int last_index = payload_offset + 7;
+	for (i = payload_offset ; i < last_index ; i++) {
+		p[j] = load_byte(skb , i);
+		j++;
+	}
+
+	//find a match with an HTTP message
+	//HTTP
+	if ((p[0] == 'H') && (p[1] == 'T') && (p[2] == 'T') && (p[3] == 'P')) {
+		goto HTTP_MATCH;
+	}
+	//GET
+	if ((p[0] == 'G') && (p[1] == 'E') && (p[2] == 'T')) {
+		goto HTTP_MATCH;
+	}
+	//POST
+	if ((p[0] == 'P') && (p[1] == 'O') && (p[2] == 'S') && (p[3] == 'T')) {
+		goto HTTP_MATCH;
+	}
+	//PUT
+	if ((p[0] == 'P') && (p[1] == 'U') && (p[2] == 'T')) {
+		goto HTTP_MATCH;
+	}
+	//DELETE
+	if ((p[0] == 'D') && (p[1] == 'E') && (p[2] == 'L') && (p[3] == 'E') && (p[4] == 'T') && (p[5] == 'E')) {
+		goto HTTP_MATCH;
+	}
+	//HEAD
+	if ((p[0] == 'H') && (p[1] == 'E') && (p[2] == 'A') && (p[3] == 'D')) {
+		goto HTTP_MATCH;
+	}
+
+	//no HTTP match
+	//check if packet belongs to an HTTP session
+	struct Leaf * lookup_leaf = sessions.lookup(&key);
+	if(lookup_leaf) {
+		//send packet to userspace
+		goto KEEP;
+	}
+	goto DROP;
+
+	//keep the packet and send it to userspace returning -1
+	HTTP_MATCH:
+	//if not already present, insert into map <Key, Leaf>
+	sessions.lookup_or_init(&key,&zero);
+
+	//send packet to userspace returning -1
+	KEEP:
+	return -1;
+
+	//drop the packet returning 0
+	DROP:
+	return 0;
+
+}
diff --git a/examples/networking/http_filter/http-parse-complete.py b/examples/networking/http_filter/http-parse-complete.py
new file mode 100644
index 0000000..f1e5e0a
--- /dev/null
+++ b/examples/networking/http_filter/http-parse-complete.py
@@ -0,0 +1,298 @@
+#!/usr/bin/python
+#
+#Bertrone Matteo - Polytechnic of Turin
+#November 2015
+#
+#eBPF application that parses HTTP packets
+#and extracts (and prints on screen) the URL contained in the GET/POST request.
+#
+#eBPF program http_filter is used as SOCKET_FILTER attached to eth0 interface.
+#only packet of type ip and tcp containing HTTP GET/POST are returned to userspace, others dropped
+#
+#python script uses bcc BPF Compiler Collection by iovisor (https://github.com/iovisor/bcc)
+#and prints on stdout the first line of the HTTP GET/POST request containing the url
+
+from __future__ import print_function
+from bcc import BPF
+from ctypes import *
+from struct import *
+from sys import argv
+
+import sys
+import socket
+import os
+import struct
+import binascii
+import time
+
+CLEANUP_N_PACKETS  = 50       #run cleanup every CLEANUP_N_PACKETS packets received
+MAX_URL_STRING_LEN = 8192     #max url string len (usually 8K)
+MAX_AGE_SECONDS    = 30       #max age entry in bpf_sessions map
+
+#convert a bin string into a string of hex char
+#helper function to print raw packet in hex
+def toHex(s):
+    lst = []
+    for ch in s:
+        hv = hex(ord(ch)).replace('0x', '')
+        if len(hv) == 1:
+            hv = '0'+hv
+        lst.append(hv)
+
+    return reduce(lambda x,y:x+y, lst)
+
+#print str until CR+LF
+def printUntilCRLF(str):
+    for k in range (0,len(str)-1):
+      if (str[k] == '\n'):
+        if (str[k-1] == '\r'):
+          print ("")
+          return
+      print ("%c" % (str[k]), end = "")
+    print("")
+    return
+
+#cleanup function
+def cleanup():
+    #get current time in seconds
+    current_time = int(time.time())
+    #looking for leaf having:
+    #timestap  == 0        --> update with current timestamp
+    #AGE > MAX_AGE_SECONDS --> delete item
+    for key,leaf in bpf_sessions.items():
+      try:
+        current_leaf = bpf_sessions[key]
+        #set timestamp if timestamp == 0
+        if (current_leaf.timestamp == 0):
+          bpf_sessions[key] = bpf_sessions.Leaf(current_time)
+        else:
+          #delete older entries
+          if (current_time - current_leaf.timestamp > MAX_AGE_SECONDS):
+            del bpf_sessions[key]
+      except:
+        print("cleanup exception.")
+    return
+
+#args
+def usage():
+    print("USAGE: %s [-i <if_name>]" % argv[0])
+    print("")
+    print("Try '%s -h' for more options." % argv[0])
+    exit()
+
+#help
+def help():
+    print("USAGE: %s [-i <if_name>]" % argv[0])
+    print("")
+    print("optional arguments:")
+    print("   -h                       print this help")
+    print("   -i if_name               select interface if_name. Default is eth0")
+    print("")
+    print("examples:")
+    print("    http-parse              # bind socket to eth0")
+    print("    http-parse -i wlan0     # bind socket to wlan0")
+    exit()
+
+#arguments
+interface="eth0"
+
+if len(argv) == 2:
+  if str(argv[1]) == '-h':
+    help()
+  else:
+    usage()
+
+if len(argv) == 3:
+  if str(argv[1]) == '-i':
+    interface = argv[2]
+  else:
+    usage()
+
+if len(argv) > 3:
+  usage()
+
+print ("binding socket to '%s'" % interface)
+
+# initialize BPF - load source code from http-parse-complete.c
+bpf = BPF(src_file = "http-parse-complete.c",debug = 0)
+
+#load eBPF program http_filter of type SOCKET_FILTER into the kernel eBPF vm
+#more info about eBPF program types
+#http://man7.org/linux/man-pages/man2/bpf.2.html
+function_http_filter = bpf.load_func("http_filter", BPF.SOCKET_FILTER)
+
+#create raw socket, bind it to interface
+#attach bpf program to socket created
+BPF.attach_raw_socket(function_http_filter, interface)
+
+#get file descriptor of the socket previously created inside BPF.attach_raw_socket
+socket_fd = function_http_filter.sock
+
+#create python socket object, from the file descriptor
+sock = socket.fromfd(socket_fd,socket.PF_PACKET,socket.SOCK_RAW,socket.IPPROTO_IP)
+#set it as blocking socket
+sock.setblocking(True)
+
+#get pointer to bpf map of type hash
+bpf_sessions = bpf.get_table("sessions")
+
+#packets counter
+packet_count = 0
+
+#dictionary containing association <key(ipsrc,ipdst,portsrc,portdst),payload_string>
+#if url is not entirely contained in only one packet, save the first part of it in this local dict
+#when I find \r\n in a next pkt, append and print all the url
+local_dictionary = {}
+
+while 1:
+  #retrieve raw packet from socket
+  packet_str = os.read(socket_fd,4096) #set packet length to max packet length on the interface
+  packet_count += 1
+
+  #DEBUG - print raw packet in hex format
+  #packet_hex = toHex(packet_str)
+  #print ("%s" % packet_hex)
+
+  #convert packet into bytearray
+  packet_bytearray = bytearray(packet_str)
+
+  #ethernet header length
+  ETH_HLEN = 14
+
+  #IP HEADER
+  #https://tools.ietf.org/html/rfc791
+  # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+  # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  # |Version|  IHL  |Type of Service|          Total Length         |
+  # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  #
+  #IHL : Internet Header Length is the length of the internet header
+  #value to multiply * 4 byte
+  #e.g. IHL = 5 ; IP Header Length = 5 * 4 byte = 20 byte
+  #
+  #Total length: This 16-bit field defines the entire packet size,
+  #including header and data, in bytes.
+
+  #calculate packet total length
+  total_length = packet_bytearray[ETH_HLEN + 2]               #load MSB
+  total_length = total_length << 8                            #shift MSB
+  total_length = total_length + packet_bytearray[ETH_HLEN+3]  #add LSB
+
+  #calculate ip header length
+  ip_header_length = packet_bytearray[ETH_HLEN]               #load Byte
+  ip_header_length = ip_header_length & 0x0F                  #mask bits 0..3
+  ip_header_length = ip_header_length << 2                    #shift to obtain length
+
+  #retrieve ip source/dest
+  ip_src_str = packet_str[ETH_HLEN+12:ETH_HLEN+16]                #ip source offset 12..15
+  ip_dst_str = packet_str[ETH_HLEN+16:ETH_HLEN+20]                #ip dest   offset 16..19
+
+  ip_src = int(toHex(ip_src_str),16)
+  ip_dst = int(toHex(ip_dst_str),16)
+
+  #TCP HEADER
+  #https://www.rfc-editor.org/rfc/rfc793.txt
+  #  12              13              14              15
+  #  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+  # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  # |  Data |           |U|A|P|R|S|F|                               |
+  # | Offset| Reserved  |R|C|S|S|Y|I|            Window             |
+  # |       |           |G|K|H|T|N|N|                               |
+  # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  #
+  #Data Offset: This indicates where the data begins.
+  #The TCP header is an integral number of 32 bits long.
+  #value to multiply * 4 byte
+  #e.g. DataOffset = 5 ; TCP Header Length = 5 * 4 byte = 20 byte
+
+  #calculate tcp header length
+  tcp_header_length = packet_bytearray[ETH_HLEN + ip_header_length + 12]  #load Byte
+  tcp_header_length = tcp_header_length & 0xF0                            #mask bit 4..7
+  tcp_header_length = tcp_header_length >> 2                              #SHR 4 ; SHL 2 -> SHR 2
+
+  #retrieve port source/dest
+  port_src_str = packet_str[ETH_HLEN+ip_header_length:ETH_HLEN+ip_header_length+2]
+  port_dst_str = packet_str[ETH_HLEN+ip_header_length+2:ETH_HLEN+ip_header_length+4]
+
+  port_src = int(toHex(port_src_str),16)
+  port_dst = int(toHex(port_dst_str),16)
+
+  #calculate payload offset
+  payload_offset = ETH_HLEN + ip_header_length + tcp_header_length
+
+  #payload_string contains only packet payload
+  payload_string = packet_str[(payload_offset):(len(packet_bytearray))]
+
+  #CR + LF (substring to find)
+  crlf = "\r\n"
+
+  #current_Key contains ip source/dest and port source/dest
+  #useful for direct bpf_sessions map access
+  current_Key = bpf_sessions.Key(ip_src,ip_dst,port_src,port_dst)
+
+  #looking for HTTP GET/POST request
+  if ((payload_string[:3] == "GET") or (payload_string[:4] == "POST")   or (payload_string[:4] == "HTTP")  \
+  or ( payload_string[:3] == "PUT") or (payload_string[:6] == "DELETE") or (payload_string[:4] == "HEAD") ):
+    #match: HTTP GET/POST packet found
+    if (crlf in payload_string):
+      #url entirely contained in first packet -> print it all
+      printUntilCRLF(payload_string)
+
+      #delete current_Key from bpf_sessions, url already printed. current session not useful anymore
+      try:
+        del bpf_sessions[current_Key]
+      except:
+        print ("error during delete from bpf map ")
+    else:
+      #url NOT entirely contained in first packet
+      #not found \r\n in payload.
+      #save current part of the payload_string in dictionary <key(ips,ipd,ports,portd),payload_string>
+      local_dictionary[binascii.hexlify(current_Key)] = payload_string
+  else:
+    #NO match: HTTP GET/POST  NOT found
+
+    #check if the packet belongs to a session saved in bpf_sessions
+    if (current_Key in bpf_sessions):
+      #check if the packet belongs to a session saved in local_dictionary
+      #(local_dictionary maintains HTTP GET/POST urls not printed yet because split across N packets)
+      if (binascii.hexlify(current_Key) in local_dictionary):
+        #first part of the HTTP GET/POST url is already present in local dictionary (prev_payload_string)
+        prev_payload_string = local_dictionary[binascii.hexlify(current_Key)]
+        #looking for CR+LF in current packet.
+        if (crlf in payload_string):
+          #last packet. containing last part of HTTP GET/POST url splitted in N packets.
+          #append current payload
+          prev_payload_string += payload_string
+          #print HTTP GET/POST url
+          printUntilCRLF(prev_payload_string)
+          #clean bpf_sessions & local_dictionary
+          try:
+            del bpf_sessions[current_Key]
+            del local_dictionary[binascii.hexlify(current_Key)]
+          except:
+            print ("error deleting from map or dictionary")
+        else:
+          #NOT last packet. containing part of HTTP GET/POST url splitted in N packets.
+          #append current payload
+          prev_payload_string += payload_string
+          #check if not size exceeding (usually HTTP GET/POST url < 8K )
+          if (len(prev_payload_string) > MAX_URL_STRING_LEN):
+            print("url too long")
+            try:
+              del bpf_sessions[current_Key]
+              del local_dictionary[binascii.hexlify(current_Key)]
+            except:
+              print ("error deleting from map or dict")
+          #update dictionary
+          local_dictionary[binascii.hexlify(current_Key)] = prev_payload_string
+      else:
+        #first part of the HTTP GET/POST url is NOT present in local dictionary
+        #bpf_sessions contains invalid entry -> delete it
+        try:
+          del bpf_sessions[current_Key]
+        except:
+          print ("error del bpf_session")
+
+  #check if dirty entry are present in bpf_sessions
+  if (((packet_count) % CLEANUP_N_PACKETS) == 0):
+    cleanup()
diff --git a/examples/networking/http_filter/http-parse-simple.c b/examples/networking/http_filter/http-parse-simple.c
new file mode 100644
index 0000000..b4e49cc
--- /dev/null
+++ b/examples/networking/http_filter/http-parse-simple.c
@@ -0,0 +1,117 @@
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+#define IP_TCP 	6
+#define ETH_HLEN 14
+
+/*eBPF program.
+  Filter IP and TCP packets, having payload not empty
+  and containing "HTTP", "GET", "POST" ... as first bytes of payload
+  if the program is loaded as PROG_TYPE_SOCKET_FILTER
+  and attached to a socket
+  return  0 -> DROP the packet
+  return -1 -> KEEP the packet and return it to user space (userspace can read it from the socket_fd )
+*/
+int http_filter(struct __sk_buff *skb) {
+
+	u8 *cursor = 0;
+
+	struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+	//filter IP packets (ethernet type = 0x0800)
+	if (!(ethernet->type == 0x0800)) {
+		goto DROP;
+	}
+
+	struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+	//filter TCP packets (ip next protocol = 0x06)
+	if (ip->nextp != IP_TCP) {
+		goto DROP;
+	}
+
+	u32  tcp_header_length = 0;
+	u32  ip_header_length = 0;
+	u32  payload_offset = 0;
+	u32  payload_length = 0;
+
+	//calculate ip header length
+	//value to multiply * 4
+	//e.g. ip->hlen = 5 ; IP Header Length = 5 x 4 byte = 20 byte
+	ip_header_length = ip->hlen << 2;    //SHL 2 -> *4 multiply
+
+        //check ip header length against minimum
+	if (ip_header_length < sizeof(*ip)) {
+		goto DROP;
+	}
+
+        //shift cursor forward for dynamic ip header size
+        void *_ = cursor_advance(cursor, (ip_header_length-sizeof(*ip)));
+
+	struct tcp_t *tcp = cursor_advance(cursor, sizeof(*tcp));
+
+	//calculate tcp header length
+	//value to multiply *4
+	//e.g. tcp->offset = 5 ; TCP Header Length = 5 x 4 byte = 20 byte
+	tcp_header_length = tcp->offset << 2; //SHL 2 -> *4 multiply
+
+	//calculate payload offset and length
+	payload_offset = ETH_HLEN + ip_header_length + tcp_header_length;
+	payload_length = ip->tlen - ip_header_length - tcp_header_length;
+
+	//http://stackoverflow.com/questions/25047905/http-request-minimum-size-in-bytes
+	//minimum length of http request is always greater than 7 bytes
+	//avoid invalid access memory
+	//include empty payload
+	if(payload_length < 7) {
+		goto DROP;
+	}
+
+	//load first 7 byte of payload into p (payload_array)
+	//direct access to skb not allowed
+	unsigned long p[7];
+	int i = 0;
+	int j = 0;
+	const int last_index = payload_offset + 7;
+	for (i = payload_offset ; i < last_index ; i++) {
+		p[j] = load_byte(skb , i);
+		j++;
+	}
+
+	//find a match with an HTTP message
+	//HTTP
+	if ((p[0] == 'H') && (p[1] == 'T') && (p[2] == 'T') && (p[3] == 'P')) {
+		goto KEEP;
+	}
+	//GET
+	if ((p[0] == 'G') && (p[1] == 'E') && (p[2] == 'T')) {
+		goto KEEP;
+	}
+	//POST
+	if ((p[0] == 'P') && (p[1] == 'O') && (p[2] == 'S') && (p[3] == 'T')) {
+		goto KEEP;
+	}
+	//PUT
+	if ((p[0] == 'P') && (p[1] == 'U') && (p[2] == 'T')) {
+		goto KEEP;
+	}
+	//DELETE
+	if ((p[0] == 'D') && (p[1] == 'E') && (p[2] == 'L') && (p[3] == 'E') && (p[4] == 'T') && (p[5] == 'E')) {
+		goto KEEP;
+	}
+	//HEAD
+	if ((p[0] == 'H') && (p[1] == 'E') && (p[2] == 'A') && (p[3] == 'D')) {
+		goto KEEP;
+	}
+
+	//no HTTP match
+	goto DROP;
+
+	//keep the packet and send it to userspace returning -1
+	KEEP:
+	return -1;
+
+	//drop the packet returning 0
+	DROP:
+	return 0;
+
+}
diff --git a/examples/networking/http_filter/http-parse-simple.py b/examples/networking/http_filter/http-parse-simple.py
new file mode 100644
index 0000000..b702393
--- /dev/null
+++ b/examples/networking/http_filter/http-parse-simple.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python
+#
+#Bertrone Matteo - Polytechnic of Turin
+#November 2015
+#
+#eBPF application that parses HTTP packets
+#and extracts (and prints on screen) the URL contained in the GET/POST request.
+#
+#eBPF program http_filter is used as SOCKET_FILTER attached to eth0 interface.
+#only packet of type ip and tcp containing HTTP GET/POST are returned to userspace, others dropped
+#
+#python script uses bcc BPF Compiler Collection by iovisor (https://github.com/iovisor/bcc)
+#and prints on stdout the first line of the HTTP GET/POST request containing the url
+
+from __future__ import print_function
+from bcc import BPF
+from sys import argv
+
+import sys
+import socket
+import os
+
+#args
+def usage():
+    print("USAGE: %s [-i <if_name>]" % argv[0])
+    print("")
+    print("Try '%s -h' for more options." % argv[0])
+    exit()
+
+#help
+def help():
+    print("USAGE: %s [-i <if_name>]" % argv[0])
+    print("")
+    print("optional arguments:")
+    print("   -h                       print this help")
+    print("   -i if_name               select interface if_name. Default is eth0")
+    print("")
+    print("examples:")
+    print("    http-parse              # bind socket to eth0")
+    print("    http-parse -i wlan0     # bind socket to wlan0")
+    exit()
+
+#arguments
+interface="eth0"
+
+if len(argv) == 2:
+  if str(argv[1]) == '-h':
+    help()
+  else:
+    usage()
+
+if len(argv) == 3:
+  if str(argv[1]) == '-i':
+    interface = argv[2]
+  else:
+    usage()
+
+if len(argv) > 3:
+  usage()
+
+print ("binding socket to '%s'" % interface)
+
+# initialize BPF - load source code from http-parse-simple.c
+bpf = BPF(src_file = "http-parse-simple.c",debug = 0)
+
+#load eBPF program http_filter of type SOCKET_FILTER into the kernel eBPF vm
+#more info about eBPF program types
+#http://man7.org/linux/man-pages/man2/bpf.2.html
+function_http_filter = bpf.load_func("http_filter", BPF.SOCKET_FILTER)
+
+#create raw socket, bind it to interface
+#attach bpf program to socket created
+BPF.attach_raw_socket(function_http_filter, interface)
+
+#get file descriptor of the socket previously created inside BPF.attach_raw_socket
+socket_fd = function_http_filter.sock
+
+#create python socket object, from the file descriptor
+sock = socket.fromfd(socket_fd,socket.PF_PACKET,socket.SOCK_RAW,socket.IPPROTO_IP)
+#set it as blocking socket
+sock.setblocking(True)
+
+while 1:
+  #retrieve raw packet from socket
+  packet_str = os.read(socket_fd,2048)
+
+  #DEBUG - print raw packet in hex format
+  #packet_hex = toHex(packet_str)
+  #print ("%s" % packet_hex)
+
+  #convert packet into bytearray
+  packet_bytearray = bytearray(packet_str)
+
+  #ethernet header length
+  ETH_HLEN = 14
+
+  #IP HEADER
+  #https://tools.ietf.org/html/rfc791
+  # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+  # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  # |Version|  IHL  |Type of Service|          Total Length         |
+  # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  #
+  #IHL : Internet Header Length is the length of the internet header
+  #value to multiply * 4 byte
+  #e.g. IHL = 5 ; IP Header Length = 5 * 4 byte = 20 byte
+  #
+  #Total length: This 16-bit field defines the entire packet size,
+  #including header and data, in bytes.
+
+  #calculate packet total length
+  total_length = packet_bytearray[ETH_HLEN + 2]               #load MSB
+  total_length = total_length << 8                            #shift MSB
+  total_length = total_length + packet_bytearray[ETH_HLEN+3]  #add LSB
+
+  #calculate ip header length
+  ip_header_length = packet_bytearray[ETH_HLEN]               #load Byte
+  ip_header_length = ip_header_length & 0x0F                  #mask bits 0..3
+  ip_header_length = ip_header_length << 2                    #shift to obtain length
+
+  #TCP HEADER
+  #https://www.rfc-editor.org/rfc/rfc793.txt
+  #  12              13              14              15
+  #  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+  # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  # |  Data |           |U|A|P|R|S|F|                               |
+  # | Offset| Reserved  |R|C|S|S|Y|I|            Window             |
+  # |       |           |G|K|H|T|N|N|                               |
+  # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  #
+  #Data Offset: This indicates where the data begins.
+  #The TCP header is an integral number of 32 bits long.
+  #value to multiply * 4 byte
+  #e.g. DataOffset = 5 ; TCP Header Length = 5 * 4 byte = 20 byte
+
+  #calculate tcp header length
+  tcp_header_length = packet_bytearray[ETH_HLEN + ip_header_length + 12]  #load Byte
+  tcp_header_length = tcp_header_length & 0xF0                            #mask bit 4..7
+  tcp_header_length = tcp_header_length >> 2                              #SHR 4 ; SHL 2 -> SHR 2
+
+  #calculate payload offset
+  payload_offset = ETH_HLEN + ip_header_length + tcp_header_length
+
+  #print first line of the HTTP GET/POST request
+  #line ends with 0x0D 0x0A (\r\n)
+  #(if we want to print all the header print until \r\n\r\n)
+  for i in range (payload_offset-1,len(packet_bytearray)-1):
+    if (packet_bytearray[i]== 0x0A):
+      if (packet_bytearray[i-1] == 0x0D):
+        break
+    print ("%c" % chr(packet_bytearray[i]), end = "")
+  print("")
+
diff --git a/examples/networking/neighbor_sharing/CMakeLists.txt b/examples/networking/neighbor_sharing/CMakeLists.txt
new file mode 100644
index 0000000..a00ae65
--- /dev/null
+++ b/examples/networking/neighbor_sharing/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(EXAMPLE_FILES README.txt simulation.py tc_neighbor_sharing.c)
+set(EXAMPLE_PROGRAMS tc_neighbor_sharing.py)
+install(FILES ${EXAMPLE_FILES} DESTINATION share/bcc/examples/networking/neighbor_sharing)
+install(PROGRAMS ${EXAMPLE_PROGRAMS} DESTINATION share/bcc/examples/networking/neighbor_sharing)
diff --git a/examples/networking/neighbor_sharing/README.txt b/examples/networking/neighbor_sharing/README.txt
new file mode 100644
index 0000000..9f3aba2
--- /dev/null
+++ b/examples/networking/neighbor_sharing/README.txt
@@ -0,0 +1,57 @@
+This example shows how a combination of BPF programs can be used to perform
+per-IP classification and rate limiting. The simulation in this example
+shows an example where N+M devices are combined and use 1 WAN. Traffic sent
+from/to the "neighbor" devices have their combined bandwidth capped at
+128kbit, and the rest of the traffic can use an additional 1Mbit.
+
+This works by sharing a map between various tc ingress filters, each with
+a related set of bpf functions attached. The map stores a list of dynamically
+learned ip addresses that were seen on the neighbor devices and should be
+throttled.
+
+                         /------------\                        |
+neigh1 --|->->->->->->->-|            |                        |
+neigh2 --|->->->->->->->-|    <-128kb-|        /------\        |
+neigh3 --|->->->->->->->-|            |  wan0  | wan  |        |
+         | ^             |   br100    |-<-<-<--| sim  |        |
+         | clsfy_neigh() |            |   ^    \------/        |
+lan1 ----|->->->->->->->-|    <--1Mb--|   |                    |
+lan2 ----|->->->->->->->-|            |   classify_wan()       |
+           ^             \------------/                        |
+           pass()                                              |
+
+To run the example:
+
+$ sudo /path/to/neighbor_sharing/tc_neighbor_sharing.py
+Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
+Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
+Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
+Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
+Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
+Network ready. Create a shell in the wan0 namespace and test with netperf
+   (Neighbors are 172.16.1.100-102, and LAN clients are 172.16.1.150-151)
+ e.g.: ip netns exec wan0 netperf -H 172.16.1.100 -l 2
+Press enter when finished:
+
+
+In another shell:
+$ sudo ip netns exec wan0 netperf -H 172.16.1.100 -l 2
+MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 172.16.1.100 () port 0 AF_INET : demo
+Recv   Send    Send
+Socket Socket  Message  Elapsed
+Size   Size    Size     Time     Throughput
+bytes  bytes   bytes    secs.    10^6bits/sec
+
+ 87380  16384  16384    4.30        0.18
+
+$ sudo ip netns exec wan0 netperf -H 172.16.1.150 -l 2
+MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 172.16.1.150 () port 0 AF_INET : demo
+Recv   Send    Send
+Socket Socket  Message  Elapsed
+Size   Size    Size     Time     Throughput
+bytes  bytes   bytes    secs.    10^6bits/sec
+
+ 87380  16384  16384    4.10        1.01
+
+
+The bandwidth is throttled according to the IP.
diff --git a/examples/networking/neighbor_sharing/simulation.py b/examples/networking/neighbor_sharing/simulation.py
new file mode 120000
index 0000000..98a2055
--- /dev/null
+++ b/examples/networking/neighbor_sharing/simulation.py
@@ -0,0 +1 @@
+../simulation.py
\ No newline at end of file
diff --git a/examples/networking/neighbor_sharing/tc_neighbor_sharing.c b/examples/networking/neighbor_sharing/tc_neighbor_sharing.c
new file mode 100644
index 0000000..6594862
--- /dev/null
+++ b/examples/networking/neighbor_sharing/tc_neighbor_sharing.c
@@ -0,0 +1,65 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+#include <bcc/proto.h>
+
+struct ipkey {
+  u32 client_ip;
+};
+
+BPF_HASH(learned_ips, struct ipkey, int, 1024);
+
+// trivial action
+int pass(struct __sk_buff *skb) {
+  return 1;
+}
+
+// Process each wan packet, and determine if the packet is in the IP
+// table or not. Learned IPs are rate-limited and unclassified are not.
+// returns: > 0 when an IP is known
+//          = 0 when an IP is not known, or non-IP traffic
+int classify_wan(struct __sk_buff *skb) {
+  u8 *cursor = 0;
+  ethernet: {
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+    switch (ethernet->type) {
+      case ETH_P_IP: goto ip;
+      default: goto EOP;
+    }
+  }
+  ip: {
+    struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+    u32 dip = ip->dst;
+    struct ipkey key = {.client_ip=dip};
+    int *val = learned_ips.lookup(&key);
+    if (val)
+      return *val;
+    goto EOP;
+  }
+EOP:
+  return 0;
+}
+
+// Process each neighbor packet, and store the source IP in the learned table.
+// Mark the inserted entry with a non-zero value to be used by the classify_wan
+// lookup.
+int classify_neighbor(struct __sk_buff *skb) {
+  u8 *cursor = 0;
+  ethernet: {
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+    switch (ethernet->type) {
+      case ETH_P_IP: goto ip;
+      default: goto EOP;
+    }
+  }
+  ip: {
+    struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+    u32 sip = ip->src;
+    struct ipkey key = {.client_ip=sip};
+    int val = 1;
+    learned_ips.insert(&key, &val);
+    goto EOP;
+  }
+EOP:
+  return 1;
+}
diff --git a/examples/networking/neighbor_sharing/tc_neighbor_sharing.py b/examples/networking/neighbor_sharing/tc_neighbor_sharing.py
new file mode 100755
index 0000000..43799c9
--- /dev/null
+++ b/examples/networking/neighbor_sharing/tc_neighbor_sharing.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+from simulation import Simulation
+import sys
+from time import sleep
+from builtins import input
+
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+b = BPF(src_file="tc_neighbor_sharing.c", debug=0)
+
+wan_fn = b.load_func("classify_wan", BPF.SCHED_CLS)
+pass_fn = b.load_func("pass", BPF.SCHED_CLS)
+neighbor_fn = b.load_func("classify_neighbor", BPF.SCHED_CLS)
+
+num_neighbors = 3
+num_locals = 2
+
+# class to build the simulation network
+class SharedNetSimulation(Simulation):
+
+    def __init__(self, ipdb):
+        super(SharedNetSimulation, self).__init__(ipdb)
+
+        # Create the wan namespace, and attach an ingress filter for throttling
+        # inbound (download) traffic
+        wan_if = self._create_ns("wan0", ipaddr="172.16.1.5/24")[1]
+        ipr.tc("add", "ingress", wan_if["index"], "ffff:")
+        ipr.tc("add-filter", "bpf", wan_if["index"], ":1", fd=wan_fn.fd,
+               prio=1, name=wan_fn.name, parent="ffff:", action="drop",
+               classid=1, rate="128kbit", burst=1024 * 32, mtu=16 * 1024)
+        ipr.tc("add-filter", "bpf", wan_if["index"], ":2", fd=pass_fn.fd,
+               prio=2, name=pass_fn.name, parent="ffff:", action="drop",
+               classid=2, rate="1024kbit", burst=1024 * 32, mtu=16 * 1024)
+        self.wan_if = wan_if
+
+    # start the namespaces that compose the network, interconnect them with the
+    # bridge, and attach the tc filters
+    def start(self):
+        neighbor_list = []
+        local_list = []
+        cmd = ["netserver", "-D"]
+        for i in range(0, num_neighbors):
+            ipaddr = "172.16.1.%d/24" % (i + 100)
+            ret = self._create_ns("neighbor%d" % i, ipaddr=ipaddr,
+                                  fn=neighbor_fn, cmd=cmd)
+            neighbor_list.append(ret)
+        for i in range(0, num_locals):
+            ipaddr = "172.16.1.%d/24" % (i + 150)
+            ret = self._create_ns("local%d" % i, ipaddr=ipaddr,
+                                  fn=pass_fn, cmd=cmd)
+            local_list.append(ret)
+
+        with ipdb.create(ifname="br100", kind="bridge") as br100:
+            for x in neighbor_list:
+                br100.add_port(x[1])
+            for x in local_list:
+                br100.add_port(x[1])
+            br100.add_port(self.wan_if)
+            br100.up()
+
+try:
+    sim = SharedNetSimulation(ipdb)
+    sim.start()
+    print("Network ready. Create a shell in the wan0 namespace and test with netperf")
+    print("   (Neighbors are 172.16.1.100-%d, and LAN clients are 172.16.1.150-%d)"
+            % (100 + num_neighbors - 1, 150 + num_locals - 1))
+    print(" e.g.: ip netns exec wan0 netperf -H 172.16.1.100 -l 2")
+    input("Press enter when finished: ")
+finally:
+    if "sim" in locals(): sim.release()
+    if "br100" in ipdb.interfaces: ipdb.interfaces.br100.remove().commit()
+    ipdb.release()
+
+
diff --git a/examples/networking/simple_tc.py b/examples/networking/simple_tc.py
new file mode 100755
index 0000000..ec0a3e7
--- /dev/null
+++ b/examples/networking/simple_tc.py
@@ -0,0 +1,30 @@
+#!/usr/bin/python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+from pyroute2 import IPRoute
+
+ipr = IPRoute()
+
+text = """
+int hello(struct __sk_buff *skb) {
+  return 1;
+}
+"""
+
+try:
+    b = BPF(text=text, debug=0)
+    fn = b.load_func("hello", BPF.SCHED_CLS)
+    ipr.link_create(ifname="t1a", kind="veth", peer="t1b")
+    idx = ipr.link_lookup(ifname="t1a")[0]
+
+    ipr.tc("add", "ingress", idx, "ffff:")
+    ipr.tc("add-filter", "bpf", idx, ":1", fd=fn.fd,
+           name=fn.name, parent="ffff:", action="ok", classid=1)
+    ipr.tc("add", "sfq", idx, "1:")
+    ipr.tc("add-filter", "bpf", idx, ":1", fd=fn.fd,
+           name=fn.name, parent="1:", action="ok", classid=1)
+finally:
+    if "idx" in locals(): ipr.link_remove(idx)
+print("BPF tc functionality - SCHED_CLS: OK")
diff --git a/examples/networking/simulation.py b/examples/networking/simulation.py
new file mode 100644
index 0000000..2c6a0f3
--- /dev/null
+++ b/examples/networking/simulation.py
@@ -0,0 +1,111 @@
+import os
+import subprocess
+import pyroute2
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+
+class Simulation(object):
+    """
+    Helper class for controlling multiple namespaces. Inherit from
+    this class and setup your namespaces.
+    """
+
+    def __init__(self, ipdb):
+        self.ipdb = ipdb
+        self.ipdbs = {}
+        self.namespaces = []
+        self.processes = []
+        self.released = False
+
+    # helper function to add additional ifc to namespace
+    # if called directly outside Simulation class, "ifc_base_name" should be
+    # different from "name", the "ifc_base_name" and "name" are the same for
+    # the first ifc created by namespace
+    def _ns_add_ifc(self, name, ns_ifc, ifc_base_name=None, in_ifc=None,
+                    out_ifc=None, ipaddr=None, macaddr=None, fn=None, cmd=None,
+                    action="ok", disable_ipv6=False):
+        if name in self.ipdbs:
+            ns_ipdb = self.ipdbs[name]
+        else:
+            try:
+                nl=NetNS(name)
+                self.namespaces.append(nl)
+            except KeyboardInterrupt:
+                # remove the namespace if it has been created
+                pyroute2.netns.remove(name)
+                raise
+            ns_ipdb = IPDB(nl)
+            self.ipdbs[nl.netns] = ns_ipdb
+            if disable_ipv6:
+                cmd1 = ["sysctl", "-q", "-w",
+                       "net.ipv6.conf.default.disable_ipv6=1"]
+                nsp = NSPopen(ns_ipdb.nl.netns, cmd1)
+                nsp.wait(); nsp.release()
+            ns_ipdb.interfaces.lo.up().commit()
+        if in_ifc:
+            in_ifname = in_ifc.ifname
+            with in_ifc as v:
+                # move half of veth into namespace
+                v.net_ns_fd = ns_ipdb.nl.netns
+        else:
+      # delete the potentially left-over veth interfaces
+            ipr = IPRoute()
+            for i in ipr.link_lookup(ifname='%sa' % ifc_base_name): ipr.link_remove(i)
+            ipr.close()
+            try:
+                out_ifc = self.ipdb.create(ifname="%sa" % ifc_base_name, kind="veth",
+                                           peer="%sb" % ifc_base_name).commit()
+                in_ifc = self.ipdb.interfaces[out_ifc.peer]
+                in_ifname = in_ifc.ifname
+                with in_ifc as v:
+                    v.net_ns_fd = ns_ipdb.nl.netns
+            except KeyboardInterrupt:
+                # explicitly remove the interface
+                out_ifname = "%sa" % ifc_base_name
+                if out_ifname in self.ipdb.interfaces: self.ipdb.interfaces[out_ifname].remove().commit()
+                raise
+
+        if out_ifc: out_ifc.up().commit()
+        ns_ipdb.interfaces.lo.up().commit()
+        ns_ipdb.initdb()
+        in_ifc = ns_ipdb.interfaces[in_ifname]
+        with in_ifc as v:
+            v.ifname = ns_ifc
+            if ipaddr: v.add_ip("%s" % ipaddr)
+            if macaddr: v.address = macaddr
+            v.up()
+        if disable_ipv6:
+            cmd1 = ["sysctl", "-q", "-w",
+                   "net.ipv6.conf.%s.disable_ipv6=1" % out_ifc.ifname]
+            subprocess.call(cmd1)
+        if fn and out_ifc:
+            self.ipdb.nl.tc("add", "ingress", out_ifc["index"], "ffff:")
+            self.ipdb.nl.tc("add-filter", "bpf", out_ifc["index"], ":1",
+                            fd=fn.fd, name=fn.name, parent="ffff:",
+                            action=action, classid=1)
+        if cmd:
+            self.processes.append(NSPopen(ns_ipdb.nl.netns, cmd))
+        return (ns_ipdb, out_ifc, in_ifc)
+
+    # helper function to create a namespace and a veth connecting it
+    def _create_ns(self, name, in_ifc=None, out_ifc=None, ipaddr=None,
+                   macaddr=None, fn=None, cmd=None, action="ok", disable_ipv6=False):
+        (ns_ipdb, out_ifc, in_ifc) = self._ns_add_ifc(name, "eth0", name, in_ifc, out_ifc,
+                                                      ipaddr, macaddr, fn, cmd, action,
+                                                      disable_ipv6)
+        return (ns_ipdb, out_ifc, in_ifc)
+
+    def release(self):
+        if self.released: return
+        self.released = True
+        for p in self.processes:
+            if p.released: continue
+            try:
+                p.kill()
+                p.wait()
+            except:
+                pass
+            finally:
+                p.release()
+        for name, db in self.ipdbs.items(): db.release()
+        for ns in self.namespaces: ns.remove()
+
diff --git a/examples/networking/tc_perf_event.py b/examples/networking/tc_perf_event.py
new file mode 100755
index 0000000..40e7411
--- /dev/null
+++ b/examples/networking/tc_perf_event.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+#
+# tc_perf_event.py  Output skb and meta data through perf event
+#
+# Copyright (c) 2016-present, Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+import ctypes as ct
+import pyroute2
+import socket
+
+bpf_txt = """
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/in6.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/pkt_cls.h>
+#include <uapi/linux/bpf.h>
+
+BPF_PERF_OUTPUT(skb_events);
+
+struct eth_hdr {
+	unsigned char   h_dest[ETH_ALEN];
+	unsigned char   h_source[ETH_ALEN];
+	unsigned short  h_proto;
+};
+
+int handle_egress(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	void *data_end = (void *)(long)skb->data_end;
+	struct eth_hdr *eth = data;
+	struct ipv6hdr *ip6h = data + sizeof(*eth);
+	u32 magic = 0xfaceb00c;
+
+	/* single length check */
+	if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
+		return TC_ACT_OK;
+
+	if (eth->h_proto == htons(ETH_P_IPV6) &&
+	    ip6h->nexthdr == IPPROTO_ICMPV6)
+	        skb_events.perf_submit_skb(skb, skb->len, &magic, sizeof(magic));
+
+	return TC_ACT_OK;
+}"""
+
+def print_skb_event(cpu, data, size):
+    class SkbEvent(ct.Structure):
+        _fields_ =  [ ("magic", ct.c_uint32),
+                      ("raw", ct.c_ubyte * (size - ct.sizeof(ct.c_uint32))) ]
+
+    skb_event = ct.cast(data, ct.POINTER(SkbEvent)).contents
+    icmp_type = int(skb_event.raw[54])
+
+    # Only print for echo request
+    if icmp_type == 128:
+        src_ip = bytes(bytearray(skb_event.raw[22:38]))
+        dst_ip = bytes(bytearray(skb_event.raw[38:54]))
+        print("%-3s %-32s %-12s 0x%08x" %
+              (cpu, socket.inet_ntop(socket.AF_INET6, src_ip),
+               socket.inet_ntop(socket.AF_INET6, dst_ip),
+               skb_event.magic))
+
+try:
+    b = BPF(text=bpf_txt)
+    fn = b.load_func("handle_egress", BPF.SCHED_CLS)
+
+    ipr = pyroute2.IPRoute()
+    ipr.link("add", ifname="me", kind="veth", peer="you")
+    me = ipr.link_lookup(ifname="me")[0]
+    you = ipr.link_lookup(ifname="you")[0]
+    for idx in (me, you):
+        ipr.link('set', index=idx, state='up')
+
+    ipr.tc("add", "clsact", me)
+    ipr.tc("add-filter", "bpf", me, ":1", fd=fn.fd, name=fn.name,
+           parent="ffff:fff3", classid=1, direct_action=True)
+
+    b["skb_events"].open_perf_buffer(print_skb_event)
+    print('Try: "ping6 ff02::1%me"\n')
+    print("%-3s %-32s %-12s %-10s" % ("CPU", "SRC IP", "DST IP", "Magic"))
+    while True:
+        b.perf_buffer_poll()
+finally:
+    if "me" in locals(): ipr.link("del", index=me)
diff --git a/examples/networking/tunnel_monitor/CMakeLists.txt b/examples/networking/tunnel_monitor/CMakeLists.txt
new file mode 100644
index 0000000..edc7c08
--- /dev/null
+++ b/examples/networking/tunnel_monitor/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(FILES README.md chord.png monitor.c simulation.py vxlan.jpg)
+set(PROGRAMS main.py monitor.py setup.sh traffic.sh)
+install(FILES ${FILES} DESTINATION share/bcc/examples/networking/tunnel_monitor)
+install(PROGRAMS ${PROGRAMS} DESTINATION share/bcc/examples/networking/tunnel_monitor)
\ No newline at end of file
diff --git a/examples/networking/tunnel_monitor/README.md b/examples/networking/tunnel_monitor/README.md
new file mode 100644
index 0000000..92cb467
--- /dev/null
+++ b/examples/networking/tunnel_monitor/README.md
@@ -0,0 +1,65 @@
+## Tunnel Monitor Example
+
+This example shows how to use a BPF program to parse packets across an
+encapsulation boundary. It uses this ability to record inner+outer ip addresses
+as well as vxlan id into a hash table. The entries in that table store bytes
+and packets received/transmitted. One novel part of this program is its use of
+`bpf_tail_call` to parse two different IP headers (inner/outer) using the same
+state machine logic.
+
+Also part of this example is a simulation of a multi-host environment with an
+overlay network (using vxlan in this case), and each host contains multiple
+clients in different segments of the overlay network. The script `traffic.sh`
+can be used to simulate a subset of clients on host0 talking to various other
+clients+hosts at different traffic rates.
+
+![Overlay Diagram](vxlan.jpg)
+
+Once the simulation is running, the statistics kept by the BPF program can be
+displayed to give a visual clue as to the nature of the traffic flowing over
+the physical interface, post-encapsulation.
+
+![Chord Diagram](chord.png)
+
+To get the example running, change into the examples/networking/tunnel_monitor directory.
+If this is the first time, run `setup.sh` to pull in the UI component and
+dependencies. You will need nodejs+npm installed on the system to run this, but
+the setup script will only install packages in the local directory.
+
+```
+[user@localhost tunnel_monitor]$ ./setup.sh 
+Cloning into 'chord-transitions'...
+remote: Counting objects: 294, done.
+...
+jquery#2.1.4 bower_components/jquery
+modernizr#2.8.3 bower_components/modernizr
+fastclick#1.0.6 bower_components/fastclick
+[user@localhost tunnel_monitor]$
+```
+
+Then, start the simulation by running main.py:
+
+```
+[root@bcc-dev tunnel_monitor]# python main.py 
+Launching host 1 of 9
+Launching host 2 of 9
+...
+Starting tunnel 8 of 9
+Starting tunnel 9 of 9
+HTTPServer listening on 0.0.0.0:8080
+Press enter to quit:
+```
+
+The prompt will remain until you choose to exit. In the background, the script
+has started a python SimpleHTTPServer on port 8080, which you may now try to
+connect to from your browser. There will likely be a blank canvas until traffic
+is sent through the tunnels.
+
+To simulate traffic, use the traffic.sh script to generate a distribution of
+pings between various clients and hosts. Check back on the chord diagram to
+see a visualization. Try clicking on a host IP address to see a breakdown of
+the inner IP addresses sent to/from that host.
+
+As an exercise, try modifying the traffic.sh script to cause one of the clients
+to send much more traffic than the others, and use the chord diagram to identify
+the culprit.
diff --git a/examples/networking/tunnel_monitor/chord.png b/examples/networking/tunnel_monitor/chord.png
new file mode 100644
index 0000000..b9754e5
--- /dev/null
+++ b/examples/networking/tunnel_monitor/chord.png
Binary files differ
diff --git a/examples/networking/tunnel_monitor/main.py b/examples/networking/tunnel_monitor/main.py
new file mode 100755
index 0000000..d3359ef
--- /dev/null
+++ b/examples/networking/tunnel_monitor/main.py
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from builtins import input
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+from netaddr import IPNetwork
+from os import chdir
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+from random import choice, randint
+from simulation import Simulation
+from socket import htons
+from threading import Thread
+import sys
+
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+
+num_hosts = 9
+num_vnis = 4
+null = open("/dev/null", "w")
+
+class TunnelSimulation(Simulation):
+    def __init__(self, ipdb):
+        super(TunnelSimulation, self).__init__(ipdb)
+        self.available_ips = [list(IPNetwork("192.168.%d.0/24" % i)[1:-1])
+                              for i in range(0, num_vnis)]
+
+    def start(self):
+        # each entry is tuple of ns_ipdb, out_ifc, in_ifc
+        host_info = []
+        for i in range(0, num_hosts):
+            print("Launching host %i of %i" % (i + 1, num_hosts))
+            ipaddr = "172.16.1.%d/24" % (100 + i)
+            host_info.append(self._create_ns("host%d" % i, ipaddr=ipaddr))
+        with self.ipdb.create(ifname="br100", kind="bridge") as br100:
+            for host in host_info: br100.add_port(host[1])
+            br100.up()
+        # create a vxlan device inside each namespace
+        for host in host_info:
+            print("Starting tunnel %i of %i" % (len(self.processes) + 1, num_hosts))
+            cmd = ["netserver", "-D"]
+            self.processes.append(NSPopen(host[0].nl.netns, cmd, stdout=null))
+            for i in range(0, num_vnis):
+                with host[0].create(ifname="vxlan%d" % i, kind="vxlan",
+                                    vxlan_id=10000 + i,
+                                    vxlan_link=host[0].interfaces.eth0,
+                                    vxlan_port=4789,
+                                    vxlan_group="239.1.1.%d" % (1 + i)) as vx:
+                    vx.up()
+                with host[0].create(ifname="br%d" % i, kind="bridge") as br:
+                    br.add_port(host[0].interfaces["vxlan%d" % i])
+                    br.up()
+                    with host[0].create(ifname="c%da" % i, kind="veth",
+                                        peer="c%db" % i) as c:
+                        c.up()
+                        c.add_ip("%s/24" % self.available_ips[i].pop(0))
+                        c.mtu = 1450
+                    br.add_port(host[0].interfaces["c%db" % i])
+                    host[0].interfaces["c%db" % i].up().commit()
+
+        # pick one host to start the monitor in
+        host = host_info[0]
+        cmd = ["python", "monitor.py"]
+        p = NSPopen(host[0].nl.netns, cmd)
+        self.processes.append(p)
+
+    def serve_http(self):
+        chdir("chord-transitions")
+        # comment below line to see http server log messages
+        SimpleHTTPRequestHandler.log_message = lambda self, format, *args: None
+        self.srv = HTTPServer(("", 8080), SimpleHTTPRequestHandler)
+        self.t = Thread(target=self.srv.serve_forever)
+        self.t.setDaemon(True)
+        self.t.start()
+        print("HTTPServer listening on 0.0.0.0:8080")
+
+try:
+    sim = TunnelSimulation(ipdb)
+    sim.start()
+    sim.serve_http()
+    input("Press enter to quit:")
+finally:
+    if "br100" in ipdb.interfaces: ipdb.interfaces.br100.remove().commit()
+    sim.release()
+    ipdb.release()
+    null.close()
diff --git a/examples/networking/tunnel_monitor/monitor.c b/examples/networking/tunnel_monitor/monitor.c
new file mode 100644
index 0000000..630e4a6
--- /dev/null
+++ b/examples/networking/tunnel_monitor/monitor.c
@@ -0,0 +1,137 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <bcc/proto.h>
+
+struct ipkey {
+  u32 inner_sip;
+  u32 inner_dip;
+  u32 outer_sip;
+  u32 outer_dip;
+  u32 vni;
+};
+struct counters {
+  u64 tx_pkts;
+  u64 rx_pkts;
+  u64 tx_bytes;
+  u64 rx_bytes;
+};
+
+BPF_HASH(stats, struct ipkey, struct counters, 1024);
+BPF_PROG_ARRAY(parser, 10);
+
+enum cb_index {
+  CB_FLAGS = 0,
+  CB_SIP,
+  CB_DIP,
+  CB_VNI,
+  CB_OFFSET,
+};
+
+// helper func to swap two memory locations
+static inline
+void swap32(u32 *a, u32 *b) {
+  u32 t = *a;
+  *a = *b;
+  *b = t;
+}
+
+// helper to swap the fields in an ipkey to give consistent ordering
+static inline
+void swap_ipkey(struct ipkey *key) {
+  swap32(&key->outer_sip, &key->outer_dip);
+  swap32(&key->inner_sip, &key->inner_dip);
+}
+
+#define IS_INGRESS 0x1
+// initial handler for each packet on an ingress tc filter
+int handle_ingress(struct __sk_buff *skb) {
+  skb->cb[CB_FLAGS] = IS_INGRESS;
+  parser.call(skb, 1);  // jump to generic packet parser
+  return 1;
+}
+
+// initial handler for each packet on an egress tc filter
+int handle_egress(struct __sk_buff *skb) {
+  skb->cb[CB_FLAGS] = 0;
+  parser.call(skb, 1);  // jump to generic packet parser
+  return 1;
+}
+
+// parse the outer vxlan frame
+int handle_outer(struct __sk_buff *skb) {
+  u8 *cursor = 0;
+
+  struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+
+  // filter bcast/mcast from the stats
+  if (ethernet->dst & (1ull << 40))
+    goto finish;
+
+  switch (ethernet->type) {
+    case 0x0800: goto ip;
+    default: goto finish;
+  }
+
+ip: ;
+  struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+  skb->cb[CB_SIP] = ip->src;
+  skb->cb[CB_DIP] = ip->dst;
+
+  switch (ip->nextp) {
+    case 17: goto udp;
+    default: goto finish;
+  }
+
+udp: ;
+  struct udp_t *udp = cursor_advance(cursor, sizeof(*udp));
+  switch (udp->dport) {
+    case 4789: goto vxlan;
+    default: goto finish;
+  }
+
+vxlan: ;
+  struct vxlan_t *vxlan = cursor_advance(cursor, sizeof(*vxlan));
+  skb->cb[CB_VNI] = vxlan->key;
+  skb->cb[CB_OFFSET] = (u64)vxlan + sizeof(*vxlan);
+  parser.call(skb, 2);
+
+finish:
+  return 1;
+}
+
+// Parse the inner frame, whatever it may be. If it is ipv4, add the inner
+// source/dest ip to the key, for finer grained stats
+int handle_inner(struct __sk_buff *skb) {
+  int is_ingress = skb->cb[CB_FLAGS] & IS_INGRESS;
+  struct ipkey key = {
+    .vni=skb->cb[CB_VNI],
+    .outer_sip = skb->cb[CB_SIP],
+    .outer_dip = skb->cb[CB_DIP]
+  };
+  u8 *cursor = (u8 *)(u64)skb->cb[CB_OFFSET];
+
+  struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+  switch (ethernet->type) {
+    case 0x0800: goto ip;
+    default: goto finish;
+  }
+ip: ;
+  struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+  key.inner_sip = ip->src;
+  key.inner_dip = ip->dst;
+
+finish:
+  // consistent ordering
+  if (key.outer_dip < key.outer_sip)
+    swap_ipkey(&key);
+  struct counters zleaf = {0};
+  struct counters *leaf = stats.lookup_or_init(&key, &zleaf);
+  if (is_ingress) {
+    lock_xadd(&leaf->rx_pkts, 1);
+    lock_xadd(&leaf->rx_bytes, skb->len);
+  } else {
+    lock_xadd(&leaf->tx_pkts, 1);
+    lock_xadd(&leaf->tx_bytes, skb->len);
+  }
+  return 1;
+}
diff --git a/examples/networking/tunnel_monitor/monitor.py b/examples/networking/tunnel_monitor/monitor.py
new file mode 100644
index 0000000..bac3420
--- /dev/null
+++ b/examples/networking/tunnel_monitor/monitor.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+from ctypes import c_uint, c_int, c_ulonglong, Structure
+import json
+from netaddr import IPAddress
+from os import rename
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+import sys
+from time import sleep
+
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+
+b = BPF(src_file="monitor.c", debug=0)
+ingress_fn = b.load_func("handle_ingress", BPF.SCHED_CLS)
+egress_fn = b.load_func("handle_egress", BPF.SCHED_CLS)
+outer_fn = b.load_func("handle_outer", BPF.SCHED_CLS)
+inner_fn = b.load_func("handle_inner", BPF.SCHED_CLS)
+stats = b.get_table("stats")
+# using jump table for inner and outer packet split
+parser = b.get_table("parser")
+parser[c_int(1)] = c_int(outer_fn.fd)
+parser[c_int(2)] = c_int(inner_fn.fd)
+
+ifc = ipdb.interfaces.eth0
+
+ipr.tc("add", "ingress", ifc.index, "ffff:")
+ipr.tc("add-filter", "bpf", ifc.index, ":1", fd=ingress_fn.fd,
+       name=ingress_fn.name, parent="ffff:", action="ok", classid=1)
+ipr.tc("add", "sfq", ifc.index, "1:")
+ipr.tc("add-filter", "bpf", ifc.index, ":1", fd=egress_fn.fd,
+       name=egress_fn.name, parent="1:", action="ok", classid=1)
+
+def stats2json(k, v):
+    return {
+        "vni": int(k.vni),
+        "outer_sip": str(IPAddress(k.outer_sip)),
+        "outer_dip": str(IPAddress(k.outer_dip)),
+        "inner_sip": str(IPAddress(k.inner_sip)),
+        "inner_dip": str(IPAddress(k.inner_dip)),
+        "tx_pkts": v.tx_pkts, "tx_bytes": v.tx_bytes,
+        "rx_pkts": v.rx_pkts, "rx_bytes": v.rx_bytes,
+    }
+
+def delta_stats(v, oldv):
+    return stats.Leaf(v.tx_pkts - oldv.tx_pkts, v.rx_pkts - oldv.rx_pkts,
+                      v.tx_bytes - oldv.tx_bytes, v.rx_bytes - oldv.rx_bytes)
+def key2str(k):
+    return "%s,%s,%d,%s,%s" % (IPAddress(k.outer_sip), IPAddress(k.outer_dip), k.vni,
+                               IPAddress(k.inner_sip), IPAddress(k.inner_dip))
+
+prev = {}
+
+while True:
+    result_total = []
+    result_delta = []
+    tmp = {}
+    # compute both the total and last-N-seconds statistics
+    for k, v in stats.items():
+        # subtract the previous totals from the current, or 0 if none exists
+        v2 = delta_stats(v, prev.get(key2str(k), stats.Leaf(0, 0, 0, 0)))
+        if v2.tx_pkts != 0 or v2.rx_pkts != 0:
+            result_delta.append(stats2json(k, v2))
+        tmp[key2str(k)] = v
+        result_total.append(stats2json(k, v))
+
+    prev = tmp
+
+    with open("./chord-transitions/data/tunnel.json.new", "w") as f:
+        json.dump(result_total, f)
+    rename("./chord-transitions/data/tunnel.json.new", "./chord-transitions/data/tunnel.json")
+    with open("./chord-transitions/data/tunnel-delta.json.new", "w") as f:
+        json.dump(result_delta, f)
+    rename("./chord-transitions/data/tunnel-delta.json.new", "./chord-transitions/data/tunnel-delta.json")
+    sleep(5)
+ipdb.release()
+
diff --git a/examples/networking/tunnel_monitor/setup.sh b/examples/networking/tunnel_monitor/setup.sh
new file mode 100755
index 0000000..5849cd3
--- /dev/null
+++ b/examples/networking/tunnel_monitor/setup.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# this script:
+#  1. checks for bower to be installed
+#  2. clones the chord-transitions UI from github
+#  3. installs locally the packages required by the UI
+
+function which_() { hash "$1" &>/dev/null; }
+
+if [[ ! -d chord-transitions ]]; then
+  git clone https://github.com/iovisor/chord-transitions.git
+fi
+
+cd chord-transitions
+
+export PATH=node_modules/.bin:$PATH
+
+if ! which_ bower; then
+  if ! which_ npm; then
+    echo "Error: required binary 'npm' not found, please install nodejs"
+    exit 1
+  fi
+  npm install bower
+fi
+
+if [[ "$(id -u)" = "0" ]]; then
+  args="--allow-root"
+fi
+
+bower install $args
diff --git a/examples/networking/tunnel_monitor/simulation.py b/examples/networking/tunnel_monitor/simulation.py
new file mode 120000
index 0000000..98a2055
--- /dev/null
+++ b/examples/networking/tunnel_monitor/simulation.py
@@ -0,0 +1 @@
+../simulation.py
\ No newline at end of file
diff --git a/examples/networking/tunnel_monitor/traffic.sh b/examples/networking/tunnel_monitor/traffic.sh
new file mode 100755
index 0000000..753c047
--- /dev/null
+++ b/examples/networking/tunnel_monitor/traffic.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+cmd="ip netns exec host0"
+if [[ "$(id -u)" != "0" ]]; then
+  cmd="sudo $cmd"
+fi
+
+B=/usr/bin/byobu
+S=tunnel1
+
+tmux has-session -t $S &> /dev/null
+
+if [[ $? != 0 ]]; then
+  $B new-session -s $S -n "c1" -d
+  tmux send -t $S "$cmd ping 192.168.0.1 -s512" C-m
+  tmux new-window -t $S -n "c2"
+  tmux send -t $S "$cmd ping 192.168.0.2 -s128" C-m
+  tmux new-window -t $S -n "c3"
+  tmux send -t $S "$cmd ping 192.168.0.3 -s1024" C-m
+  tmux new-window -t $S -n "c3"
+  tmux send -t $S "$cmd ping 192.168.0.4 -s128" C-m
+  tmux new-window -t $S -n "c3"
+  tmux send -t $S "$cmd ping 192.168.0.5 -s128" C-m
+  tmux new-window -t $S -n "c3"
+  tmux send -t $S "$cmd ping 192.168.0.6 -s128" C-m
+  tmux new-window -t $S -n "c4"
+  tmux send -t $S "$cmd ping 192.168.1.2 -s128" C-m
+  tmux new-window -t $S -n "c5"
+  tmux send -t $S "$cmd ping 192.168.1.4 -s768" C-m
+  tmux new-window -t $S -n "c2"
+  tmux send -t $S "$cmd ping 192.168.2.2 -s128" C-m
+  tmux new-window -t $S -n "c3"
+  tmux send -t $S "$cmd ping 192.168.2.7 -s1024" C-m
+  tmux new-window -t $S -n "c4"
+  tmux send -t $S "$cmd ping 192.168.2.2 -s128" C-m
+  tmux new-window -t $S -n "c5"
+  tmux send -t $S "$cmd ping 192.168.3.8 -s768" C-m
+  tmux new-window -t $S -n "c5"
+  tmux send -t $S "$cmd ping 192.168.3.9 -s768" C-m
+fi
+
+exec tmux attach -t $S
+
diff --git a/examples/networking/tunnel_monitor/vxlan.jpg b/examples/networking/tunnel_monitor/vxlan.jpg
new file mode 100644
index 0000000..15acb6f
--- /dev/null
+++ b/examples/networking/tunnel_monitor/vxlan.jpg
Binary files differ
diff --git a/examples/networking/vlan_filter/README.md b/examples/networking/vlan_filter/README.md
new file mode 100644
index 0000000..9c17a54
--- /dev/null
+++ b/examples/networking/vlan_filter/README.md
@@ -0,0 +1,34 @@
+# VLAN Filter #
+This is an eBPF application that parses VXLAN packets and extracts the encapsulated VLAN packets to monitor traffic from each VLAN. Extracted packet header fields can be stored in a file or sent to a remote server via the Apache Kafka messaging system.
+
+Also part of this example is a simulation of a multi-host environment. The simulation environment can be set up by using the test_setup.sh script. Then a sample script (traffic.sh) can be used to send traffic from one client (VLAN=100) on host1 to another client on host2, and from one client (VLAN=200) on host2 to another client on host1, while running the vlan_filter application in parallel by using the command 'python data-plane-tracing.py -i veth7'.
+
+![picture](scenario.jpg)
+
+### Usage Example ###
+* $ sudo python data-plane-tracing.py
+
+Timestamp | Host Name  | Host IP   | IP Version   | Source Host IP   | Dest Host IP   | Source Host Port   | Dest Host Port   | VNI   | Source VM MAC  | Dest VM MAC  | VLAN ID  | Source VM IP   | Dest VM IP   | Protocol   | Source VM Port   | Dest VM Port   | Packet Length   |
+---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
+ 2018-05-24 18:43:30.386228 | Box1 | x.x.x.x  | 4 | 10.1.1.11 | 10.1.1.12 | 54836 | 4789 | 10 | fa:16:3e:ec:22:99 | fa:16:3e:1c:6f:2d | 100 | 192.168.100.11 | 192.168.100.12 | 6 | 1285 | 20302 | 1200
+
+
+# Implementation overview #
+Example application implementation is split into two parts: the former that exploits eBPF code, the latter that performs some additional processing in user space (python wrapper).
+
+### First part: eBPF Filter ###
+This component filters VXLAN packets.
+The program is loaded as PROG_TYPE_SOCKET_FILTER and attached to a socket, bind to eth0.
+Packets matching VXLAN filter are forwarded to the user space, while other packets are dropped.
+
+### Python code in user space ###
+The Python script reads filtered raw packets from the socket, extracts all the useful header fields and stores extracted packet into a file by default or can be sent to a remote server via Apache Kafka messaging system.
+
+# How to execute this example application #
+VLAN Filter application can be executed by using one of the below commands:
+* $ sudo python data-plane-tracing.py
+* $ sudo python data-plane-tracing.py -i eth2 -k vc.manage.overcloud:9092
+
+# How to install Required Dependencies #
+* $ pip install kafka-python
+* $ pip install netifaces
diff --git a/examples/networking/vlan_filter/data-plane-tracing.c b/examples/networking/vlan_filter/data-plane-tracing.c
new file mode 100644
index 0000000..8b725a5
--- /dev/null
+++ b/examples/networking/vlan_filter/data-plane-tracing.c
@@ -0,0 +1,54 @@
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+#define IP_TCP 	6
+#define IP_UDP 17
+#define IP_ICMP 1
+/* 
+  In 802.3, both the source and destination addresses are 48-bit (6-byte) MAC addresses.
+  6 bytes (src) + 6 bytes (dst) + 2 bytes (type) = 14 bytes 
+*/
+#define ETH_HLEN 14
+
+/*eBPF program.
+  Filter VXLAN packets (IPv4 / UDP destination port 4789)
+  if the program is loaded as PROG_TYPE_SOCKET_FILTER
+  and attached to a socket
+  return  0 -> DROP the packet
+  return -1 -> KEEP the packet and return it to user space (userspace can read it from the socket_fd )
+*/
+int vlan_filter(struct __sk_buff *skb) { 
+	u8 *cursor = 0;	
+
+	struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+	
+	//filter IP packets (ethernet type = 0x0800) 0x0800 is IPv4 packet
+	switch(ethernet->type){
+		case 0x0800: goto IP;
+	    	default: goto DROP;
+	}
+
+	
+	IP: ;
+		struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));  // IP header (datagram)
+	        switch (ip->nextp){
+			case 17: goto UDP;
+			default: goto DROP;
+		}
+
+	UDP: ;
+		struct udp_t *udp = cursor_advance(cursor, sizeof(*udp));
+		switch (udp->dport) {
+    			case 4789: goto KEEP;
+    			default: goto DROP;
+  		}
+
+	//keep the packet and send it to userspace returning -1
+	KEEP:
+		return -1;
+
+	//drop the packet returning 0
+	DROP:
+		return 0;
+}
\ No newline at end of file
diff --git a/examples/networking/vlan_filter/data-plane-tracing.py b/examples/networking/vlan_filter/data-plane-tracing.py
new file mode 100755
index 0000000..efaa7f1
--- /dev/null
+++ b/examples/networking/vlan_filter/data-plane-tracing.py
@@ -0,0 +1,193 @@
+#!/usr/bin/python
+from __future__ import print_function
+from bcc import BPF
+
+import sys
+import socket
+import os
+import argparse
+import time
+import netifaces as ni
+
+from sys import argv
+from kafka import KafkaProducer
+from kafka.errors import KafkaError
+from datetime import datetime
+
+#args
+def usage():
+    print("USAGE: %s [-i <if_name>]" % argv[0])
+    print("")
+    print("Try '%s -h' for more options." % argv[0])
+    exit()
+
+#help
+def help():
+    print("USAGE: %s [-i <if_name>][-k <kafka_server_name:kafka_port>]" % argv[0])
+    print("")
+    print("optional arguments:")
+    print("   -h                       print this help")
+    print("   -i if_name               select interface if_name. Default is eth0")
+    print("   -k kafka_server_name     select kafka server name. Default is save to file")
+    print("                            If -k option is not specified data will be saved to file.")
+    
+    print("")
+    print("examples:")
+    print("    data-plane-tracing                                      # bind socket to eth0")
+    print("    data-plane-tracing -i eno2 -k vc.manage.overcloud:9092  # bind socket to eno2 and send data to kafka server in iovisor-topic.")
+    exit()
+
+#arguments
+interface="eth0"
+kafkaserver=''
+        
+#check provided arguments
+if len(argv) == 2:
+    if str(argv[1]) == '-h':
+        help()
+    else:
+        usage()
+
+if len(argv) == 3:
+    if str(argv[1]) == '-i':
+        interface = argv[2]
+    elif str(argv[1]) == '-k':
+        kafkaserver = argv[2] 
+    else:
+        usage()
+    
+if len(argv) == 5:
+    if str(argv[1]) == '-i':
+        interface = argv[2]
+        kafkaserver = argv[4]
+    elif str(argv[1]) == '-k':
+        kafkaserver = argv[2] 
+        interface = argv[4]
+    else:
+        usage()
+
+if len(argv) > 5:
+    usage()
+
+print ("binding socket to '%s'" % interface)	
+ 
+#initialize BPF - load source code from data-plane-tracing.c
+bpf = BPF(src_file = "data-plane-tracing.c", debug = 0)
+
+#load eBPF program vlan_filter of type SOCKET_FILTER into the kernel eBPF vm
+#more info about eBPF program types http://man7.org/linux/man-pages/man2/bpf.2.html
+function_vlan_filter = bpf.load_func("vlan_filter", BPF.SOCKET_FILTER)
+
+#create raw socket, bind it to the selected interface (eth0 by default)
+#attach bpf program to socket created
+BPF.attach_raw_socket(function_vlan_filter, interface)
+
+#get file descriptor of the socket previously created inside BPF.attach_raw_socket
+socket_fd = function_vlan_filter.sock
+
+#create python socket object, from the file descriptor
+sock = socket.fromfd(socket_fd,socket.PF_PACKET,socket.SOCK_RAW,socket.IPPROTO_IP)
+
+#set it as blocking socket
+sock.setblocking(True)
+
+#get interface ip address. In case ip is not set then just add 127.0.0.1.
+ni.ifaddresses(interface)
+try:
+    ip = ni.ifaddresses(interface)[ni.AF_INET][0]['addr']
+except:
+    ip = '127.0.0.1'    
+
+print("| Timestamp | Host Name | Host IP | IP Version | Source Host IP | Dest Host IP | Source Host Port | Dest Host Port | VNI | Source VM MAC | Dest VM MAC | VLAN ID | Source VM IP | Dest VM IP | Protocol | Source VM Port | Dest VM Port | Packet Length |")
+
+while 1:
+    #retrieve raw packet from socket
+    packet_str = os.read(socket_fd, 2048)
+    
+    #convert packet into bytearray
+    packet_bytearray = bytearray(packet_str)
+    
+    #ethernet header length
+    ETH_HLEN = 14 
+    
+    #VXLAN header length
+    VXLAN_HLEN = 8
+    
+    #VLAN header length
+    VLAN_HLEN = 4
+    
+    #Inner TCP/UDP header length
+    TCP_HLEN = 20
+    UDP_HLEN = 8
+    
+    #calculate packet total length
+    total_length = packet_bytearray[ETH_HLEN + 2]               #load MSB
+    total_length = total_length << 8                            #shift MSB
+    total_length = total_length + packet_bytearray[ETH_HLEN+3]  #add LSB
+    
+    #calculate ip header length
+    ip_header_length = packet_bytearray[ETH_HLEN]               #load Byte
+    ip_header_length = ip_header_length & 0x0F                  #mask bits 0..3
+    ip_header_length = ip_header_length << 2                    #shift to obtain length
+    
+    #calculate payload offset
+    payload_offset = ETH_HLEN + ip_header_length + UDP_HLEN + VXLAN_HLEN
+    
+    #parsing ip version from ip packet header
+    ipversion = str(bin(packet_bytearray[14])[2:5])
+    
+    #parsing source ip address, destination ip address from ip packet header
+    src_host_ip = str(packet_bytearray[26]) + "." + str(packet_bytearray[27]) + "." + str(packet_bytearray[28]) + "." + str(packet_bytearray[29])
+    dest_host_ip = str(packet_bytearray[30]) + "." + str(packet_bytearray[31]) + "." + str(packet_bytearray[32]) + "." + str(packet_bytearray[33])
+    
+    #parsing source port and destination port
+    src_host_port = packet_bytearray[34] << 8 | packet_bytearray[35]
+    dest_host_port = packet_bytearray[36] << 8 | packet_bytearray[37]
+    
+    #parsing VNI from VXLAN header
+    VNI = str((packet_bytearray[46])+(packet_bytearray[47])+(packet_bytearray[48]))
+    
+    #parsing source mac address and destination mac address
+    mac_add = [packet_bytearray[50], packet_bytearray[51], packet_bytearray[52], packet_bytearray[53], packet_bytearray[54], packet_bytearray[55]]
+    src_vm_mac = ":".join(map(lambda b: format(b, "02x"), mac_add))
+    mac_add = [packet_bytearray[56], packet_bytearray[57], packet_bytearray[58], packet_bytearray[59], packet_bytearray[60], packet_bytearray[61]]
+    dest_vm_mac = ":".join(map(lambda b: format(b, "02x"), mac_add))
+    
+    #parsing VLANID from VLAN header
+    VLANID=""
+    VLANID = str((packet_bytearray[64])+(packet_bytearray[65]))
+
+    #parsing source vm ip address, destination vm ip address from encapsulated ip packet header
+    src_vm_ip = str(packet_bytearray[80]) + "." + str(packet_bytearray[81]) + "." + str(packet_bytearray[82]) + "." + str(packet_bytearray[83])
+    dest_vm_ip = str(packet_bytearray[84]) + "." + str(packet_bytearray[85]) + "." + str(packet_bytearray[86]) + "." + str(packet_bytearray[87]) 
+    
+    #parsing source port and destination port
+    if (packet_bytearray[77]==6 or packet_bytearray[77]==17):
+        src_vm_port = packet_bytearray[88] << 8 | packet_bytearray[89]
+        dest_vm_port = packet_bytearray[90] << 8 | packet_bytearray[91]
+    elif (packet_bytearray[77]==1):
+        src_vm_port = -1
+        dest_vm_port = -1
+        type = str(packet_bytearray[88])
+    else:
+        continue
+    
+    timestamp = str(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))
+    
+    #send data to remote server via Kafka Messaging Bus
+    if kafkaserver:
+        MESSAGE = (timestamp, socket.gethostname(),ip, str(int(ipversion, 2)), str(src_host_ip), str(dest_host_ip), str(src_host_port), str(dest_host_port), str(int(VNI)), str(src_vm_mac), str(dest_vm_mac), str(int(VLANID)), src_vm_ip, dest_vm_ip, str(packet_bytearray[77]), str(src_vm_port), str(dest_vm_port), str(total_length))
+        print (MESSAGE)
+        MESSAGE = ','.join(MESSAGE)
+        MESSAGE = MESSAGE.encode() 
+        producer = KafkaProducer(bootstrap_servers=[kafkaserver])
+        producer.send('iovisor-topic', key=b'iovisor', value=MESSAGE)
+    
+    #save data to files
+    else:
+        MESSAGE = timestamp+","+socket.gethostname()+","+ip+","+str(int(ipversion, 2))+","+src_host_ip+","+dest_host_ip+","+str(src_host_port)+","+str(dest_host_port)+","+str(int(VNI))+","+str(src_vm_mac)+","+str(dest_vm_mac)+","+str(int(VLANID))+","+src_vm_ip+","+dest_vm_ip+","+str(packet_bytearray[77])+","+str(src_vm_port)+","+str(dest_vm_port)+","+str(total_length)
+        print (MESSAGE)
+        #save data to a file on hour basis 
+        filename = "./vlan-data-"+time.strftime("%Y-%m-%d-%H")+"-00"
+        with open(filename, "a") as f:
+            f.write("%s\n" % MESSAGE)
diff --git a/examples/networking/vlan_filter/scenario.jpg b/examples/networking/vlan_filter/scenario.jpg
new file mode 100644
index 0000000..ba3d7ab
--- /dev/null
+++ b/examples/networking/vlan_filter/scenario.jpg
Binary files differ
diff --git a/examples/networking/vlan_filter/test_setup.sh b/examples/networking/vlan_filter/test_setup.sh
new file mode 100755
index 0000000..967cf21
--- /dev/null
+++ b/examples/networking/vlan_filter/test_setup.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+
+# This script must be executed by root user
+if [ "$(id -u)" != "0" ]; then
+   echo "This script must be run as root" 1>&2
+   exit 1
+fi
+
+# add namespaces
+ip netns add netns11
+ip netns add netns12
+ip netns add netns21
+ip netns add netns22
+ip netns add netns3
+ip netns add netns4
+
+# set up veth devices in netns11 to netns21 with connection to netns3  
+ip link add veth11 type veth peer name veth13
+ip link add veth21 type veth peer name veth23
+ip link set veth11 netns netns11
+ip link set veth21 netns netns21
+ip link set veth13 netns netns3
+ip link set veth23 netns netns3
+
+# set up veth devices in netns12 and netns22 with connection to netns4 
+ip link add veth12 type veth peer name veth14
+ip link add veth22 type veth peer name veth24
+ip link set veth12 netns netns12
+ip link set veth22 netns netns22
+ip link set veth14 netns netns4
+ip link set veth24 netns netns4
+  
+# assign IP addresses and set the devices up 
+ip netns exec netns11 ifconfig veth11 192.168.100.11/24 up
+ip netns exec netns11 ip link set lo up
+ip netns exec netns12 ifconfig veth12 192.168.100.12/24 up
+ip netns exec netns12 ip link set lo up
+ip netns exec netns21 ifconfig veth21 192.168.200.21/24 up
+ip netns exec netns21 ip link set lo up
+ip netns exec netns22 ifconfig veth22 192.168.200.22/24 up
+ip netns exec netns22 ip link set lo up
+
+# set up bridge brx and its ports 
+ip netns exec netns3 brctl addbr brx  
+ip netns exec netns3 ip link set brx up
+ip netns exec netns3 ip link set veth13 up
+ip netns exec netns3 ip link set veth23 up
+ip netns exec netns3 brctl addif brx veth13
+ip netns exec netns3 brctl addif brx veth23
+
+# set up bridge bry and its ports 
+ip netns exec netns4 brctl addbr bry  
+ip netns exec netns4 ip link set bry up
+ip netns exec netns4 ip link set veth14 up
+ip netns exec netns4 ip link set veth24 up
+ip netns exec netns4 brctl addif bry veth14
+ip netns exec netns4 brctl addif bry veth24
+
+# create veth devices to connect the bridges
+ip link add vethx type veth peer name vethx11
+ip link add vethy type veth peer name vethy11
+ip link set vethx netns netns3
+ip link set vethx11 netns netns3
+ip link set vethy netns netns4
+ip link set vethy11 netns netns4
+
+ip netns exec netns3 brctl addif brx vethx
+ip netns exec netns3 ip link set vethx up
+ip netns exec netns3 bridge vlan add vid 100 tagged dev vethx
+ip netns exec netns3 bridge vlan add vid 200 tagged dev vethx
+ip netns exec netns3 bridge vlan del vid 1 dev vethx
+ip netns exec netns3 bridge vlan show
+
+ip netns exec netns4 brctl addif bry vethy
+ip netns exec netns4 ip link set vethy up
+ip netns exec netns4 bridge vlan add vid 100 tagged dev vethy
+ip netns exec netns4 bridge vlan add vid 200 tagged dev vethy
+ip netns exec netns4 bridge vlan del vid 1 dev vethy
+ip netns exec netns4 bridge vlan show
+
+ip netns exec netns3 ip link set dev brx type bridge vlan_filtering 1
+ip netns exec netns4 ip link set dev bry type bridge vlan_filtering 1
+ip netns exec netns3 bridge vlan del vid 1 dev brx self
+ip netns exec netns4 bridge vlan del vid 1 dev bry self
+ip netns exec netns3 bridge vlan show
+ip netns exec netns4 bridge vlan show
+
+ip netns exec netns3 bridge vlan add vid 100 pvid untagged dev veth13
+ip netns exec netns3 bridge vlan add vid 200 pvid untagged dev veth23
+ip netns exec netns4 bridge vlan add vid 100 pvid untagged dev veth14
+ip netns exec netns4 bridge vlan add vid 200 pvid untagged dev veth24
+
+ip netns exec netns3 bridge vlan del vid 1 dev veth13
+ip netns exec netns3 bridge vlan del vid 1 dev veth23
+ip netns exec netns4 bridge vlan del vid 1 dev veth14
+ip netns exec netns4 bridge vlan del vid 1 dev veth24
+
+# set up bridge brvx and its ports 
+ip netns exec netns3 brctl addbr brvx  
+ip netns exec netns3 ip link set brvx up
+ip netns exec netns3 ip link set vethx11 up
+ip netns exec netns3 brctl addif brvx vethx11
+
+# set up bridge brvy and its ports 
+ip netns exec netns4 brctl addbr brvy  
+ip netns exec netns4 ip link set brvy up
+ip netns exec netns4 ip link set vethy11 up
+ip netns exec netns4 brctl addif brvy vethy11
+
+# create veth devices to connect the vxlan bridges
+ip link add veth3 type veth peer name veth4
+ip link add veth5 type veth peer name veth6
+ip link set veth3 netns netns3
+ip link set veth5 netns netns4
+ip netns exec netns3 ip link set veth3 up
+ip netns exec netns4 ip link set veth5 up
+ip link set veth4 up
+ip link set veth6 up
+ip netns exec netns3 ifconfig veth3 10.1.1.11/24 up
+ip netns exec netns4 ifconfig veth5 10.1.1.12/24 up
+
+# add vxlan ports
+ip netns exec netns3 ip link add vxlan-10 type vxlan id 10 remote 10.1.1.12 dstport 4789 dev veth3
+ip netns exec netns4 ip link add vxlan-10 type vxlan id 10 remote 10.1.1.11 dstport 4789 dev veth5
+ip netns exec netns3 ip link set vxlan-10 up
+ip netns exec netns4 ip link set vxlan-10 up
+ip netns exec netns3 brctl addif brvx vxlan-10
+ip netns exec netns4 brctl addif brvy vxlan-10
+
+# create veth devices to connect the vxlan bridges
+ip link add veth7 type veth peer name veth8
+ip link set veth7 up
+ip link set veth8 up
+
+# set up bridge brjx and its ports 
+brctl addbr brjx  
+ip link set brjx up
+ip link set veth4 up
+brctl addif brjx veth4
+brctl addif brjx veth7
+
+# set up bridge brjy and its ports 
+brctl addbr brjy  
+ip link set brjy up
+ip link set veth6 up
+brctl addif brjy veth6
+brctl addif brjy veth8
diff --git a/examples/networking/vlan_filter/test_traffic.sh b/examples/networking/vlan_filter/test_traffic.sh
new file mode 100755
index 0000000..4be4515
--- /dev/null
+++ b/examples/networking/vlan_filter/test_traffic.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+ip netns exec netns11 ping 192.168.100.12 -c 10
+ip netns exec netns22 ping 192.168.200.21 -c 10
diff --git a/examples/networking/vlan_learning/CMakeLists.txt b/examples/networking/vlan_learning/CMakeLists.txt
new file mode 100644
index 0000000..0572444
--- /dev/null
+++ b/examples/networking/vlan_learning/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(EXAMPLE_FILES README.txt simulation.py vlan_learning.c)
+set(EXAMPLE_PROGRAMS vlan_learning.py)
+install(FILES ${EXAMPLE_FILES} DESTINATION share/bcc/examples/networking/vlan_learning)
+install(PROGRAMS ${EXAMPLE_PROGRAMS} DESTINATION share/bcc/examples/networking/vlan_learning)
diff --git a/examples/networking/vlan_learning/README.txt b/examples/networking/vlan_learning/README.txt
new file mode 100644
index 0000000..bbe181a
--- /dev/null
+++ b/examples/networking/vlan_learning/README.txt
@@ -0,0 +1,43 @@
+This example shows a unique way to use a BPF program to demux any ethernet
+traffic into a pool of worker veth+namespaces (or any ifindex-based
+destination) depending on a configurable mapping of src-mac to ifindex. As
+part of the ingress processing, the program will dynamically learn the source
+ifindex of the matched source mac.
+
+Simulate a physical network with a vlan aware switch and clients that may
+connect to any vlan. The program will detect the known clients and pass the
+traffic through to a dedicated namespace for processing. Clients may have
+overlapping IP spaces and the traffic will still work.
+
+               |           bpf program                      |
+cli0 --|       |                            /--|-- worker0  |
+cli1 --| trunk | +->--->-handle_p2v(pkt)-> /---|-- worker1  |
+cli2 --|=======|=+                        /----|-- worker2  |
+...  --|       | +-<---<-handle_v2p(pkt)-<-----|--  ...     |
+cliN --|       |                          \----|-- workerM  |
+       |       |                              ^             |
+     phys      |                            veth            |
+    switch     |                                            |
+
+To run the example, simply:
+
+sudo /path/to/vlan_learning/vlan_learning.py
+
+Serving HTTP on 0.0.0.0 port 80 ...
+Serving HTTP on 0.0.0.0 port 80 ...
+Serving HTTP on 0.0.0.0 port 80 ...
+  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
+                                 Dload  Upload   Total   Spent    Left  Speed
+  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0172.16.1.100 - - [04/Nov/2015 10:54:47] "GET / HTTP/1.1" 200 -
+100   574  100   574    0     0  45580      0 --:--:-- --:--:-- --:--:-- 47833
+
+...
+
+Press enter to exit:
+mac 020000000000 rx pkts = 95, rx bytes = 7022
+                 tx pkts = 0, tx bytes = 0
+mac 020000000001 rx pkts = 95, rx bytes = 7022
+                 tx pkts = 0, tx bytes = 0
+mac 020000000002 rx pkts = 97, rx bytes = 7154
+                 tx pkts = 0, tx bytes = 0
+
diff --git a/examples/networking/vlan_learning/simulation.py b/examples/networking/vlan_learning/simulation.py
new file mode 120000
index 0000000..98a2055
--- /dev/null
+++ b/examples/networking/vlan_learning/simulation.py
@@ -0,0 +1 @@
+../simulation.py
\ No newline at end of file
diff --git a/examples/networking/vlan_learning/vlan_learning.c b/examples/networking/vlan_learning/vlan_learning.c
new file mode 100644
index 0000000..3774d74
--- /dev/null
+++ b/examples/networking/vlan_learning/vlan_learning.c
@@ -0,0 +1,62 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+#include <bcc/proto.h>
+
+struct ifindex_leaf_t {
+  int out_ifindex;
+  int vlan_tci; // populated by phys2virt and used by virt2phys
+  int vlan_proto; // populated by phys2virt and used by virt2phys
+  u64 tx_pkts;
+  u64 tx_bytes;
+};
+
+// redirect based on mac -> out_ifindex (auto-learning)
+BPF_HASH(egress, int, struct ifindex_leaf_t, 4096);
+
+// redirect based on mac -> out_ifindex (config-driven)
+BPF_HASH(ingress, u64, struct ifindex_leaf_t, 4096);
+
+int handle_phys2virt(struct __sk_buff *skb) {
+  // only handle vlan packets
+  if (!skb->vlan_present)
+    return 1;
+  u8 *cursor = 0;
+  ethernet: {
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+    u64 src_mac = ethernet->src;
+    struct ifindex_leaf_t *leaf = ingress.lookup(&src_mac);
+    if (leaf) {
+      lock_xadd(&leaf->tx_pkts, 1);
+      lock_xadd(&leaf->tx_bytes, skb->len);
+      // auto-program reverse direction table
+      int out_ifindex = leaf->out_ifindex;
+      struct ifindex_leaf_t zleaf = {0};
+      struct ifindex_leaf_t *out_leaf = egress.lookup_or_init(&out_ifindex, &zleaf);
+      // to capture potential configuration changes
+      out_leaf->out_ifindex = skb->ifindex;
+      out_leaf->vlan_tci = skb->vlan_tci;
+      out_leaf->vlan_proto = skb->vlan_proto;
+      // pop the vlan header and send to the destination
+      bpf_skb_vlan_pop(skb);
+      bpf_clone_redirect(skb, leaf->out_ifindex, 0);
+    }
+  }
+  return 1;
+}
+
+int handle_virt2phys(struct __sk_buff *skb) {
+  u8 *cursor = 0;
+  ethernet: {
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+    int src_ifindex = skb->ifindex;
+    struct ifindex_leaf_t *leaf = egress.lookup(&src_ifindex);
+    if (leaf) {
+      lock_xadd(&leaf->tx_pkts, 1);
+      lock_xadd(&leaf->tx_bytes, skb->len);
+      bpf_skb_vlan_push(skb, leaf->vlan_proto, leaf->vlan_tci);
+      bpf_clone_redirect(skb, leaf->out_ifindex, 0);
+    }
+  }
+  return 1;
+}
diff --git a/examples/networking/vlan_learning/vlan_learning.py b/examples/networking/vlan_learning/vlan_learning.py
new file mode 100755
index 0000000..a902320
--- /dev/null
+++ b/examples/networking/vlan_learning/vlan_learning.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+from builtins import input
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+from random import shuffle
+from time import sleep
+from simulation import Simulation
+import sys
+
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+
+num_clients = 3
+num_vlans = 16
+
+# load the bpf program
+b = BPF(src_file="vlan_learning.c", debug=0)
+phys_fn = b.load_func("handle_phys2virt", BPF.SCHED_CLS)
+virt_fn = b.load_func("handle_virt2phys", BPF.SCHED_CLS)
+
+ingress = b.get_table("ingress")
+egress = b.get_table("egress")
+
+class VlanSimulation(Simulation):
+    def __init__(self, ipdb):
+        super(VlanSimulation, self).__init__(ipdb)
+
+    def start(self):
+        # start identical workers each in a namespace
+        for i in range(0, num_clients):
+            httpmod = ("SimpleHTTPServer" if sys.version_info[0] < 3
+                       else "http.server")
+            cmd = ["python", "-m", httpmod, "80"]
+            self._create_ns("worker%d" % i, cmd=cmd, fn=virt_fn, action="drop",
+                            ipaddr="172.16.1.5/24")
+
+        # simulate a physical eth vlan trunk
+        with self.ipdb.create(ifname="eth0a", kind="veth", peer="eth0b") as v:
+            v.up()
+        self.ipdb.interfaces.eth0b.up().commit()
+
+        # eth0a will be hooked to clients with vlan interfaces
+        # add the bpf program to eth0b for demuxing phys2virt packets
+        v = self.ipdb.interfaces["eth0b"]
+        ipr.tc("add", "ingress", v["index"], "ffff:")
+        ipr.tc("add-filter", "bpf", v["index"], ":1", fd=phys_fn.fd,
+               name=phys_fn.name, parent="ffff:", action="drop", classid=1)
+
+        # allocate vlans randomly
+        available_vlans = [i for i in range(2, 2 + num_vlans)]
+        shuffle(available_vlans)
+        available_ips = [[i for i in range(100, 105)] for i in range(0, num_clients)]
+
+        # these are simulations of physical clients
+        for i in range(0, num_clients):
+            macaddr = ("02:00:00:%.2x:%.2x:%.2x" %
+                       ((i >> 16) & 0xff, (i >> 8) & 0xff, i & 0xff))
+
+            # assign this client to the given worker
+            idx = self.ipdb.interfaces["worker%da" % i]["index"]
+            mac = int(macaddr.replace(":", ""), 16)
+            ingress[ingress.Key(mac)] = ingress.Leaf(idx, 0, 0, 0, 0)
+
+            # test traffic with curl loop
+            cmd = ["bash", "-c",
+                   "for i in {1..8}; do curl 172.16.1.5 -o /dev/null; sleep 1; done"]
+            client_ifc = self.ipdb.create(ifname="eth0a.%d" % i, kind="vlan",
+                                          link=self.ipdb.interfaces["eth0a"],
+                                          vlan_id=available_vlans.pop(0)).commit()
+            (out_ifc, in_ifc) = self._create_ns("client%d" % i, in_ifc=client_ifc,
+                                                ipaddr="172.16.1.100/24",
+                                                macaddr=macaddr, cmd=cmd)[1:3]
+
+try:
+    sim = VlanSimulation(ipdb)
+    sim.start()
+    sleep(10)
+    input("Press enter to exit: ")
+
+    stats_collect = {}
+    for key, leaf in ingress.items():
+        stats_collect[key.value] = [leaf.tx_pkts, leaf.tx_bytes, 0, 0]
+    for key, leaf in egress.items():
+        x = stats_collect.get(key.value, [0, 0, 0, 0])
+        x[2] = leaf.tx_pkts
+        x[3] = leaf.tx_bytes
+    for k, v in stats_collect.items():
+        print("mac %.12x rx pkts = %u, rx bytes = %u" % (k, v[0], v[1]))
+        print("                 tx pkts = %u, tx bytes = %u" % (v[2], v[3]))
+finally:
+    if "eth0a" in ipdb.interfaces: ipdb.interfaces.eth0a.remove().commit()
+    if "sim" in locals(): sim.release()
+    ipdb.release()
diff --git a/examples/networking/xdp/CMakeLists.txt b/examples/networking/xdp/CMakeLists.txt
new file mode 100644
index 0000000..ebe5239
--- /dev/null
+++ b/examples/networking/xdp/CMakeLists.txt
@@ -0,0 +1 @@
+install(PROGRAMS xdp_drop_count.py DESTINATION share/bcc/examples/networking/xdp)
diff --git a/examples/networking/xdp/xdp_drop_count.py b/examples/networking/xdp/xdp_drop_count.py
new file mode 100755
index 0000000..ff0af0f
--- /dev/null
+++ b/examples/networking/xdp/xdp_drop_count.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+#
+# xdp_drop_count.py Drop incoming packets on XDP layer and count for which
+#                   protocol type
+#
+# Copyright (c) 2016 PLUMgrid
+# Copyright (c) 2016 Jan Ruth
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+import pyroute2
+import time
+import sys
+
+flags = 0
+def usage():
+    print("Usage: {0} [-S] <ifdev>".format(sys.argv[0]))
+    print("       -S: use skb mode\n")
+    print("e.g.: {0} eth0\n".format(sys.argv[0]))
+    exit(1)
+
+if len(sys.argv) < 2 or len(sys.argv) > 3:
+    usage()
+
+if len(sys.argv) == 2:
+    device = sys.argv[1]
+
+if len(sys.argv) == 3:
+    if "-S" in sys.argv:
+        # XDP_FLAGS_SKB_MODE
+        flags |= 2 << 0
+
+    if "-S" == sys.argv[1]:
+        device = sys.argv[2]
+    else:
+        device = sys.argv[1]
+
+mode = BPF.XDP
+#mode = BPF.SCHED_CLS
+
+if mode == BPF.XDP:
+    ret = "XDP_DROP"
+    ctxtype = "xdp_md"
+else:
+    ret = "TC_ACT_SHOT"
+    ctxtype = "__sk_buff"
+
+# load BPF program
+b = BPF(text = """
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+
+BPF_TABLE("percpu_array", uint32_t, long, dropcnt, 256);
+
+static inline int parse_ipv4(void *data, u64 nh_off, void *data_end) {
+    struct iphdr *iph = data + nh_off;
+
+    if ((void*)&iph[1] > data_end)
+        return 0;
+    return iph->protocol;
+}
+
+static inline int parse_ipv6(void *data, u64 nh_off, void *data_end) {
+    struct ipv6hdr *ip6h = data + nh_off;
+
+    if ((void*)&ip6h[1] > data_end)
+        return 0;
+    return ip6h->nexthdr;
+}
+
+int xdp_prog1(struct CTXTYPE *ctx) {
+
+    void* data_end = (void*)(long)ctx->data_end;
+    void* data = (void*)(long)ctx->data;
+
+    struct ethhdr *eth = data;
+
+    // drop packets
+    int rc = RETURNCODE; // let pass XDP_PASS or redirect to tx via XDP_TX
+    long *value;
+    uint16_t h_proto;
+    uint64_t nh_off = 0;
+    uint32_t index;
+
+    nh_off = sizeof(*eth);
+
+    if (data + nh_off  > data_end)
+        return rc;
+
+    h_proto = eth->h_proto;
+
+    if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+        struct vlan_hdr *vhdr;
+
+        vhdr = data + nh_off;
+        nh_off += sizeof(struct vlan_hdr);
+        if (data + nh_off > data_end)
+            return rc;
+            h_proto = vhdr->h_vlan_encapsulated_proto;
+    }
+    if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+        struct vlan_hdr *vhdr;
+
+        vhdr = data + nh_off;
+        nh_off += sizeof(struct vlan_hdr);
+        if (data + nh_off > data_end)
+            return rc;
+            h_proto = vhdr->h_vlan_encapsulated_proto;
+    }
+
+    if (h_proto == htons(ETH_P_IP))
+        index = parse_ipv4(data, nh_off, data_end);
+    else if (h_proto == htons(ETH_P_IPV6))
+       index = parse_ipv6(data, nh_off, data_end);
+    else
+        index = 0;
+
+    value = dropcnt.lookup(&index);
+    if (value)
+        *value += 1;
+
+    return rc;
+}
+""", cflags=["-w", "-DRETURNCODE=%s" % ret, "-DCTXTYPE=%s" % ctxtype])
+
+fn = b.load_func("xdp_prog1", mode)
+
+if mode == BPF.XDP:
+    b.attach_xdp(device, fn, flags)
+else:
+    ip = pyroute2.IPRoute()
+    ipdb = pyroute2.IPDB(nl=ip)
+    idx = ipdb.interfaces[device].index
+    ip.tc("add", "clsact", idx)
+    ip.tc("add-filter", "bpf", idx, ":1", fd=fn.fd, name=fn.name,
+          parent="ffff:fff2", classid=1, direct_action=True)
+
+dropcnt = b.get_table("dropcnt")
+prev = [0] * 256
+print("Printing drops per IP protocol-number, hit CTRL+C to stop")
+while 1:
+    try:
+        for k in dropcnt.keys():
+            val = dropcnt.sum(k).value
+            i = k.value
+            if val:
+                delta = val - prev[i]
+                prev[i] = val
+                print("{}: {} pkt/s".format(i, delta))
+        time.sleep(1)
+    except KeyboardInterrupt:
+        print("Removing filter from device")
+        break;
+
+if mode == BPF.XDP:
+    b.remove_xdp(device, flags)
+else:
+    ip.tc("del", "clsact", idx)
+    ipdb.release()
diff --git a/examples/networking/xdp/xdp_redirect_cpu.py b/examples/networking/xdp/xdp_redirect_cpu.py
new file mode 100755
index 0000000..f7aa2bc
--- /dev/null
+++ b/examples/networking/xdp/xdp_redirect_cpu.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+#
+# xdp_redirect_cpu.py Redirect the incoming packet to the specific CPU
+#
+# Copyright (c) 2018 Gary Lin
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+import time
+import sys
+from multiprocessing import cpu_count
+import ctypes as ct
+
+flags = 0
+def usage():
+    print("Usage: {0} <in ifdev> <CPU id>".format(sys.argv[0]))
+    print("e.g.: {0} eth0 2\n".format(sys.argv[0]))
+    exit(1)
+
+if len(sys.argv) != 3:
+    usage()
+
+in_if = sys.argv[1]
+cpu_id = int(sys.argv[2])
+
+max_cpu = cpu_count()
+if (cpu_id > max_cpu):
+    print("Invalid CPU id")
+    exit(1)
+
+# load BPF program
+b = BPF(text = """
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+
+BPF_CPUMAP(cpumap, __MAX_CPU__);
+BPF_ARRAY(dest, uint32_t, 1);
+BPF_PERCPU_ARRAY(rxcnt, long, 1);
+
+int xdp_redirect_cpu(struct xdp_md *ctx) {
+    void* data_end = (void*)(long)ctx->data_end;
+    void* data = (void*)(long)ctx->data;
+    struct ethhdr *eth = data;
+    uint32_t key = 0;
+    long *value;
+    uint32_t *cpu;
+    uint64_t nh_off;
+
+    nh_off = sizeof(*eth);
+    if (data + nh_off  > data_end)
+        return XDP_DROP;
+
+    cpu = dest.lookup(&key);
+    if (!cpu)
+        return XDP_PASS;
+
+    value = rxcnt.lookup(&key);
+    if (value)
+        *value += 1;
+
+    return cpumap.redirect_map(*cpu, 0);
+}
+
+int xdp_dummy(struct xdp_md *ctx) {
+    return XDP_PASS;
+}
+""", cflags=["-w", "-D__MAX_CPU__=%u" % max_cpu], debug=0)
+
+dest = b.get_table("dest")
+dest[0] = ct.c_uint32(cpu_id)
+
+cpumap = b.get_table("cpumap")
+cpumap[cpu_id] = ct.c_uint32(192)
+
+in_fn = b.load_func("xdp_redirect_cpu", BPF.XDP)
+b.attach_xdp(in_if, in_fn, flags)
+
+rxcnt = b.get_table("rxcnt")
+prev = 0
+print("Printing redirected packets, hit CTRL+C to stop")
+while 1:
+    try:
+        val = rxcnt.sum(0).value
+        if val:
+            delta = val - prev
+            prev = val
+            print("{} pkt/s to CPU {}".format(delta, cpu_id))
+        time.sleep(1)
+    except KeyboardInterrupt:
+        print("Removing filter from device")
+        break
+
+b.remove_xdp(in_if, flags)
diff --git a/examples/networking/xdp/xdp_redirect_map.py b/examples/networking/xdp/xdp_redirect_map.py
new file mode 100755
index 0000000..e3b90a3
--- /dev/null
+++ b/examples/networking/xdp/xdp_redirect_map.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+#
+# xdp_redirect_map.py Redirect the incoming packet to another interface
+#                     with the helper: bpf_redirect_map()
+#
+# Copyright (c) 2018 Gary Lin
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+import pyroute2
+import time
+import sys
+import ctypes as ct
+
+flags = 0
+def usage():
+    print("Usage: {0} <in ifdev> <out ifdev>".format(sys.argv[0]))
+    print("e.g.: {0} eth0 eth1\n".format(sys.argv[0]))
+    exit(1)
+
+if len(sys.argv) != 3:
+    usage()
+
+in_if = sys.argv[1]
+out_if = sys.argv[2]
+
+ip = pyroute2.IPRoute()
+out_idx = ip.link_lookup(ifname=out_if)[0]
+
+# load BPF program
+b = BPF(text = """
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+
+BPF_DEVMAP(tx_port, 1);
+BPF_PERCPU_ARRAY(rxcnt, long, 1);
+
+static inline void swap_src_dst_mac(void *data)
+{
+    unsigned short *p = data;
+    unsigned short dst[3];
+
+    dst[0] = p[0];
+    dst[1] = p[1];
+    dst[2] = p[2];
+    p[0] = p[3];
+    p[1] = p[4];
+    p[2] = p[5];
+    p[3] = dst[0];
+    p[4] = dst[1];
+    p[5] = dst[2];
+}
+
+int xdp_redirect_map(struct xdp_md *ctx) {
+    void* data_end = (void*)(long)ctx->data_end;
+    void* data = (void*)(long)ctx->data;
+    struct ethhdr *eth = data;
+    uint32_t key = 0;
+    long *value;
+    uint64_t nh_off;
+
+    nh_off = sizeof(*eth);
+    if (data + nh_off  > data_end)
+        return XDP_DROP;
+
+    value = rxcnt.lookup(&key);
+    if (value)
+        *value += 1;
+
+    swap_src_dst_mac(data);
+
+    return tx_port.redirect_map(0, 0);
+}
+
+int xdp_dummy(struct xdp_md *ctx) {
+    return XDP_PASS;
+}
+""", cflags=["-w"])
+
+tx_port = b.get_table("tx_port")
+tx_port[0] = ct.c_int(out_idx)
+
+in_fn = b.load_func("xdp_redirect_map", BPF.XDP)
+out_fn = b.load_func("xdp_dummy", BPF.XDP)
+
+b.attach_xdp(in_if, in_fn, flags)
+b.attach_xdp(out_if, out_fn, flags)
+
+rxcnt = b.get_table("rxcnt")
+prev = 0
+print("Printing redirected packets, hit CTRL+C to stop")
+while 1:
+    try:
+        val = rxcnt.sum(0).value
+        if val:
+            delta = val - prev
+            prev = val
+            print("{} pkt/s".format(delta))
+        time.sleep(1)
+    except KeyboardInterrupt:
+        print("Removing filter from device")
+        break;
+
+b.remove_xdp(in_if, flags)
+b.remove_xdp(out_if, flags)
diff --git a/examples/tracing/CMakeLists.txt b/examples/tracing/CMakeLists.txt
new file mode 100644
index 0000000..dfce81b
--- /dev/null
+++ b/examples/tracing/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB C_FILES *.c)
+file(GLOB PY_FILES *.py)
+file(GLOB TXT_FILES *.txt)
+install(PROGRAMS ${PY_FILES} DESTINATION share/bcc/examples/tracing)
+install(FILES ${C_FILES} DESTINATION share/bcc/examples/tracing)
+install(FILES ${TXT_FILES} DESTINATION share/bcc/examples/tracing)
diff --git a/examples/tracing/bitehist.py b/examples/tracing/bitehist.py
new file mode 100755
index 0000000..c8c7f7a
--- /dev/null
+++ b/examples/tracing/bitehist.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+#
+# bitehist.py	Block I/O size histogram.
+#		For Linux, uses BCC, eBPF. Embedded C.
+#
+# Written as a basic example of using a histogram to show a distribution.
+#
+# The default interval is 5 seconds. A Ctrl-C will print the partially
+# gathered histogram then exit.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Aug-2015	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+BPF_HISTOGRAM(dist);
+
+int kprobe__blk_account_io_completion(struct pt_regs *ctx, struct request *req)
+{
+	dist.increment(bpf_log2l(req->__data_len / 1024));
+	return 0;
+}
+""")
+
+# header
+print("Tracing... Hit Ctrl-C to end.")
+
+# trace until Ctrl-C
+try:
+	sleep(99999999)
+except KeyboardInterrupt:
+	print()
+
+# output
+b["dist"].print_log2_hist("kbytes")
diff --git a/examples/tracing/bitehist_example.txt b/examples/tracing/bitehist_example.txt
new file mode 100644
index 0000000..90bbdf7
--- /dev/null
+++ b/examples/tracing/bitehist_example.txt
@@ -0,0 +1,25 @@
+Demonstrations of bitehist.py, the Linux eBPF/bcc version.
+
+This prints a power-of-2 histogram to show the block I/O size distribution.
+A summary is printed after Ctrl-C is hit.
+
+# ./bitehist.py
+Tracing... Hit Ctrl-C to end.
+^C
+     kbytes          : count     distribution
+       0 -> 1        : 3        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 211      |**********                            |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 800      |**************************************|
+
+This output shows a bimodal distribution. The largest mode of 800 I/O were
+between 128 and 255 Kbytes in size, and another mode of 211 I/O were between
+4 and 7 Kbytes in size.
+
+Understanding this distribution is useful for characterizing workloads and
+understanding performance. The existence of this distribution is not visible
+from averages alone.
diff --git a/examples/tracing/disksnoop.py b/examples/tracing/disksnoop.py
new file mode 100755
index 0000000..ed3dd81
--- /dev/null
+++ b/examples/tracing/disksnoop.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+#
+# disksnoop.py	Trace block device I/O: basic version of iosnoop.
+#		For Linux, uses BCC, eBPF. Embedded C.
+#
+# Written as a basic example of tracing latency.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 11-Aug-2015	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+
+REQ_WRITE = 1		# from include/linux/blk_types.h
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+BPF_HASH(start, struct request *);
+
+void trace_start(struct pt_regs *ctx, struct request *req) {
+	// stash start timestamp by request ptr
+	u64 ts = bpf_ktime_get_ns();
+
+	start.update(&req, &ts);
+}
+
+void trace_completion(struct pt_regs *ctx, struct request *req) {
+	u64 *tsp, delta;
+
+	tsp = start.lookup(&req);
+	if (tsp != 0) {
+		delta = bpf_ktime_get_ns() - *tsp;
+		bpf_trace_printk("%d %x %d\\n", req->__data_len,
+		    req->cmd_flags, delta / 1000);
+		start.delete(&req);
+	}
+}
+""")
+
+b.attach_kprobe(event="blk_start_request", fn_name="trace_start")
+b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_start")
+b.attach_kprobe(event="blk_account_io_completion", fn_name="trace_completion")
+
+# header
+print("%-18s %-2s %-7s %8s" % ("TIME(s)", "T", "BYTES", "LAT(ms)"))
+
+# format output
+while 1:
+	(task, pid, cpu, flags, ts, msg) = b.trace_fields()
+	(bytes_s, bflags_s, us_s) = msg.split()
+
+	if int(bflags_s, 16) & REQ_WRITE:
+		type_s = "W"
+	elif bytes_s == "0":	# see blk_fill_rwbs() for logic
+		type_s = "M"
+	else:
+		type_s = "R"
+	ms = float(int(us_s, 10)) / 1000
+
+	print("%-18.9f %-2s %-7s %8.2f" % (ts, type_s, bytes_s, ms))
diff --git a/examples/tracing/disksnoop_example.txt b/examples/tracing/disksnoop_example.txt
new file mode 100644
index 0000000..8352912
--- /dev/null
+++ b/examples/tracing/disksnoop_example.txt
@@ -0,0 +1,40 @@
+Demonstrations of disksnoop.py, the Linux eBPF/bcc version.
+
+
+This traces block I/O, and prints a line to summarize each I/O completed:
+
+# ./disksnoop.py 
+TIME(s)            T  BYTES    LAT(ms)
+16458043.435457    W  4096        2.73
+16458043.435981    W  4096        3.24
+16458043.436012    W  4096        3.13
+16458043.437326    W  4096        4.44
+16458044.126545    R  4096       42.82
+16458044.129872    R  4096        3.24
+16458044.130705    R  4096        0.73
+16458044.142813    R  4096       12.01
+16458044.147302    R  4096        4.33
+16458044.148117    R  4096        0.71
+16458044.148950    R  4096        0.70
+16458044.164332    R  4096       15.29
+16458044.168003    R  4096        3.58
+16458044.171676    R  4096        3.59
+16458044.172453    R  4096        0.72
+16458044.173213    R  4096        0.71
+16458044.173989    R  4096        0.72
+16458044.174739    R  4096        0.70
+16458044.190334    R  4096       15.52
+16458044.196608    R  4096        6.17
+16458044.203091    R  4096        6.35
+
+The output includes a basic timestamp (in seconds), the type of I/O (W == write,
+R == read, M == metadata), the size of the I/O in bytes, and the latency (or
+duration) of the I/O in milliseconds.
+
+The latency is measured from when the I/O request was issued to the device
+until its completion. This excludes latency spent queued in the OS.
+
+Most of the I/O in this example were 0.7 and 4 milliseconds in duration. There
+was an outlier of 42.82 milliseconds, a read which followed many writes (the
+high latency may have been caused by the writes still being serviced on the
+storage device).
diff --git a/examples/tracing/hello_fields.py b/examples/tracing/hello_fields.py
new file mode 100755
index 0000000..bad1a22
--- /dev/null
+++ b/examples/tracing/hello_fields.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+#
+# This is a Hello World example that formats output as fields.
+
+from bcc import BPF
+
+# define BPF program
+prog = """
+int hello(void *ctx) {
+    bpf_trace_printk("Hello, World!\\n");
+    return 0;
+}
+"""
+
+# load BPF program
+b = BPF(text=prog)
+b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
diff --git a/examples/tracing/hello_perf_output.py b/examples/tracing/hello_perf_output.py
new file mode 100755
index 0000000..eb1e997
--- /dev/null
+++ b/examples/tracing/hello_perf_output.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+#
+# This is a Hello World example that uses BPF_PERF_OUTPUT.
+
+from bcc import BPF
+import ctypes as ct
+
+# define BPF program
+prog = """
+#include <linux/sched.h>
+
+// define output data structure in C
+struct data_t {
+    u32 pid;
+    u64 ts;
+    char comm[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(events);
+
+int hello(struct pt_regs *ctx) {
+    struct data_t data = {};
+
+    data.pid = bpf_get_current_pid_tgid();
+    data.ts = bpf_ktime_get_ns();
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+"""
+
+# load BPF program
+b = BPF(text=prog)
+b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")
+
+# define output data structure in Python
+TASK_COMM_LEN = 16    # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [("pid", ct.c_uint),
+                ("ts", ct.c_ulonglong),
+                ("comm", ct.c_char * TASK_COMM_LEN)]
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE"))
+
+# process event
+start = 0
+def print_event(cpu, data, size):
+    global start
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    if start == 0:
+            start = event.ts
+    time_s = (float(event.ts - start)) / 1000000000
+    print("%-18.9f %-16s %-6d %s" % (time_s, event.comm, event.pid,
+        "Hello, perf_output!"))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/examples/tracing/kvm_hypercall.py b/examples/tracing/kvm_hypercall.py
new file mode 100755
index 0000000..322bb8e
--- /dev/null
+++ b/examples/tracing/kvm_hypercall.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+#
+# kvm_hypercall.py
+#
+# Demonstrates stateful kvm_entry and kvm_exit recording along with the
+# associated hypercall when exit_reason is VMCALL. See kvm_hypercall.txt
+# for usage
+#
+# REQUIRES: Linux 4.7+ (BPF_PROG_TYPE_TRACEPOINT support)
+#
+# Copyright (c) 2017 ShiftLeft Inc.
+#
+# Author(s):
+#   Suchakrapani Sharma <suchakra@shiftleft.io>
+
+
+from __future__ import print_function
+from bcc import BPF
+
+# load BPF program
+b = BPF(text="""
+#define EXIT_REASON 18
+BPF_HASH(start, u8, u8);
+
+TRACEPOINT_PROBE(kvm, kvm_exit) {
+    u8 e = EXIT_REASON;
+    u8 one = 1;
+    if (args->exit_reason == EXIT_REASON) {
+        bpf_trace_printk("KVM_EXIT exit_reason : %d\\n", args->exit_reason);
+        start.update(&e, &one);
+    }
+    return 0;
+}
+
+TRACEPOINT_PROBE(kvm, kvm_entry) {
+    u8 e = EXIT_REASON;
+    u8 zero = 0;
+    u8 *s = start.lookup(&e);
+    if (s != NULL && *s == 1) {
+        bpf_trace_printk("KVM_ENTRY vcpu_id : %u\\n", args->vcpu_id);
+        start.update(&e, &zero);
+    }
+    return 0;
+}
+
+TRACEPOINT_PROBE(kvm, kvm_hypercall) {
+    u8 e = EXIT_REASON;
+    u8 zero = 0;
+    u8 *s = start.lookup(&e);
+    if (s != NULL && *s == 1) {
+        bpf_trace_printk("HYPERCALL nr : %d\\n", args->nr);
+    }
+    return 0;
+};
+""")
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "EVENT"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
+
diff --git a/examples/tracing/kvm_hypercall.txt b/examples/tracing/kvm_hypercall.txt
new file mode 100644
index 0000000..6c31b11
--- /dev/null
+++ b/examples/tracing/kvm_hypercall.txt
@@ -0,0 +1,33 @@
+Demonstrations of kvm_hypercall.py, showing eBPF/bcc based hypercall analysis
+
+This example demonstrates how we can statefully save static tracepoint
+events based on conditions being met for other events with which they are
+associated. Here, we wish to record kvm_exit and kvm_entry events which are
+linked to the kvm_hypercall event. We are interested in kvm_exit with exit
+reason as VMCALL (18). This may be useful to analyze latency caused by a
+hypercall itself.
+
+To test this, while the python script is run, induce a hypercall from a
+guest based on the following example:
+https://gist.github.com/abenbachir/344822b5ba9fc5ac384cdec3f087e018
+
+# ./kvm_hypercall.py
+TIME(s)            COMM             PID    MESSAGE
+2445.577087000     CPU 0/KVM        8896   KVM_EXIT exit_reason : 18
+2445.577122000     CPU 0/KVM        8896   HYPERCALL nr : 0
+2445.577129000     CPU 0/KVM        8896   KVM_ENTRY vcpu_id : 0
+2445.577136000     CPU 0/KVM        8896   KVM_EXIT exit_reason : 18
+2445.577145000     CPU 0/KVM        8896   HYPERCALL nr : 1
+2445.577149000     CPU 0/KVM        8896   KVM_ENTRY vcpu_id : 0
+2445.577155000     CPU 0/KVM        8896   KVM_EXIT exit_reason : 18
+2445.577160000     CPU 0/KVM        8896   HYPERCALL nr : 2
+2445.577164000     CPU 0/KVM        8896   KVM_ENTRY vcpu_id : 0
+2445.577170000     CPU 0/KVM        8896   KVM_EXIT exit_reason : 18
+2445.577175000     CPU 0/KVM        8896   HYPERCALL nr : 3
+2445.577179000     CPU 0/KVM        8896   KVM_ENTRY vcpu_id : 0
+2445.577185000     CPU 0/KVM        8896   KVM_EXIT exit_reason : 18
+2445.577190000     CPU 0/KVM        8896   HYPERCALL nr : 4
+2445.577194000     CPU 0/KVM        8896   KVM_ENTRY vcpu_id : 0
+
+This output shows a sequence of exit -> hypercall -> entry where the
+exit_reason was VMCALL.
diff --git a/examples/tracing/mallocstacks.py b/examples/tracing/mallocstacks.py
new file mode 100644
index 0000000..2f3eb25
--- /dev/null
+++ b/examples/tracing/mallocstacks.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+#
+# mallocstacks  Trace malloc() calls in a process and print the full
+#               stack trace for all callsites.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# This script is a basic example of the new Linux 4.6+ BPF_STACK_TRACE
+# table API.
+#
+# Copyright 2016 GitHub, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep
+import sys
+
+if len(sys.argv) < 2:
+    print("USAGE: mallocstacks PID")
+    exit()
+pid = int(sys.argv[1])
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(calls, int);
+BPF_STACK_TRACE(stack_traces, 1024);
+
+int alloc_enter(struct pt_regs *ctx, size_t size) {
+    int key = stack_traces.get_stackid(ctx,
+        BPF_F_USER_STACK|BPF_F_REUSE_STACKID);
+    if (key < 0)
+        return 0;
+
+    // could also use `calls.increment(key, size);`
+    u64 zero = 0, *val;
+    val = calls.lookup_or_init(&key, &zero);
+    (*val) += size;
+    return 0;
+};
+""")
+
+b.attach_uprobe(name="c", sym="malloc", fn_name="alloc_enter", pid=pid)
+print("Attaching to malloc in pid %d, Ctrl+C to quit." % pid)
+
+# sleep until Ctrl-C
+try:
+    sleep(99999999)
+except KeyboardInterrupt:
+    pass
+
+calls = b.get_table("calls")
+stack_traces = b.get_table("stack_traces")
+
+for k, v in reversed(sorted(calls.items(), key=lambda c: c[1].value)):
+    print("%d bytes allocated at:" % v.value)
+    for addr in stack_traces.walk(k.value):
+        print("\t%s" % b.sym(addr, pid, show_offset=True))
diff --git a/examples/tracing/mysqld_query.py b/examples/tracing/mysqld_query.py
new file mode 100755
index 0000000..15ff297
--- /dev/null
+++ b/examples/tracing/mysqld_query.py
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+#
+# mysqld_query    Trace MySQL server queries. Example of USDT tracing.
+#                 For Linux, uses BCC, BPF. Embedded C.
+#
+# USAGE: mysqld_query PID
+#
+# This uses USDT probes, and needs a MySQL server with -DENABLE_DTRACE=1.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF, USDT
+import sys
+
+if len(sys.argv) < 2:
+    print("USAGE: mysqld_latency PID")
+    exit()
+pid = sys.argv[1]
+debug = 0
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+int do_trace(struct pt_regs *ctx) {
+    uint64_t addr;
+    char query[128];
+    /*
+     * Read the first argument from the query-start probe, which is the query.
+     * The format of this probe is:
+     * query-start(query, connectionid, database, user, host)
+     * see: https://dev.mysql.com/doc/refman/5.7/en/dba-dtrace-ref-query.html
+     */
+    bpf_usdt_readarg(1, ctx, &addr);
+    bpf_trace_printk("%s\\n", addr);
+    return 0;
+};
+"""
+
+# enable USDT probe from given PID
+u = USDT(pid=int(pid))
+u.enable_probe(probe="query__start", fn_name="do_trace")
+if debug:
+    print(u.get_text())
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text, usdt_contexts=[u])
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "QUERY"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        print("value error")
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
diff --git a/examples/tracing/mysqld_query_example.txt b/examples/tracing/mysqld_query_example.txt
new file mode 100644
index 0000000..630c932
--- /dev/null
+++ b/examples/tracing/mysqld_query_example.txt
@@ -0,0 +1,8 @@
+# ./mysqld_query.py `pgrep -n mysqld`
+TIME(s)            COMM             PID    QUERY
+17450459.549910001 mysqld           18608  select @@version_comment limit 1
+17450463.822668001 mysqld           18608  SELECT DATABASE()
+17450463.824042998 mysqld           18608  show databases
+17450463.824570000 mysqld           18608  show tables
+17450465.602717999 mysqld           18608  SELECT COUNT(*) FROM words
+17450479.944897000 mysqld           18608  SELECT * FROM words WHERE word REGEXP '^bre.*n$'
diff --git a/examples/tracing/nodejs_http_server.py b/examples/tracing/nodejs_http_server.py
new file mode 100755
index 0000000..1017de5
--- /dev/null
+++ b/examples/tracing/nodejs_http_server.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+#
+# nodejs_http_server    Basic example of node.js USDT tracing.
+#                       For Linux, uses BCC, BPF. Embedded C.
+#
+# USAGE: nodejs_http_server PID
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF, USDT
+import sys
+
+if len(sys.argv) < 2:
+    print("USAGE: nodejs_http_server PID")
+    exit()
+pid = sys.argv[1]
+debug = 0
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+int do_trace(struct pt_regs *ctx) {
+    uint64_t addr;
+    char path[128]={0};
+    bpf_usdt_readarg(6, ctx, &addr);
+    bpf_probe_read(&path, sizeof(path), (void *)addr);
+    bpf_trace_printk("path:%s\\n", path);
+    return 0;
+};
+"""
+
+# enable USDT probe from given PID
+u = USDT(pid=int(pid))
+u.enable_probe(probe="http__server__request", fn_name="do_trace")
+if debug:
+    print(u.get_text())
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text, usdt_contexts=[u])
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "ARGS"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        print("value error")
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
diff --git a/examples/tracing/nodejs_http_server_example.txt b/examples/tracing/nodejs_http_server_example.txt
new file mode 100644
index 0000000..ae223e0
--- /dev/null
+++ b/examples/tracing/nodejs_http_server_example.txt
@@ -0,0 +1,5 @@
+# ./nodejs_http_server.py 24728
+TIME(s)            COMM             PID    ARGS
+24653324.561322998 node             24728  path:/index.html
+24653335.343401998 node             24728  path:/images/welcome.png
+24653340.510164998 node             24728  path:/images/favicon.png
diff --git a/examples/tracing/stacksnoop.py b/examples/tracing/stacksnoop.py
new file mode 100755
index 0000000..bced93f
--- /dev/null
+++ b/examples/tracing/stacksnoop.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+#
+# stacksnoop    Trace a kernel function and print all kernel stack traces.
+#               For Linux, uses BCC, eBPF, and currently x86_64 only. Inline C.
+#
+# USAGE: stacksnoop [-h] [-p PID] [-s] [-v] function
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Jan-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+import ctypes as ct
+import time
+
+# arguments
+examples = """examples:
+    ./stacksnoop ext4_sync_fs    # print kernel stack traces for ext4_sync_fs
+    ./stacksnoop -s ext4_sync_fs    # ... also show symbol offsets
+    ./stacksnoop -v ext4_sync_fs    # ... show extra columns
+    ./stacksnoop -p 185 ext4_sync_fs    # ... only when PID 185 is on-CPU
+"""
+parser = argparse.ArgumentParser(
+    description="Trace and print kernel stack traces for a kernel function",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-s", "--offset", action="store_true",
+    help="show address offsets")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="print more fields")
+parser.add_argument("function",
+    help="kernel function name")
+args = parser.parse_args()
+function = args.function
+offset = args.offset
+verbose = args.verbose
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct data_t {
+    u64 stack_id;
+    u32 pid;
+    char comm[TASK_COMM_LEN];
+};
+
+BPF_STACK_TRACE(stack_traces, 128);
+BPF_PERF_OUTPUT(events);
+
+void trace_stack(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+    struct data_t data = {};
+    data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID),
+    data.pid = pid;
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    events.perf_submit(ctx, &data, sizeof(data));
+}
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event=function, fn_name="trace_stack")
+
+TASK_COMM_LEN = 16  # linux/sched.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("stack_id", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+    ]
+
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("Function \"%s\" not found. Exiting." % function)
+    exit()
+
+stack_traces = b.get_table("stack_traces")
+start_ts = time.time()
+
+# header
+if verbose:
+    print("%-18s %-12s %-6s %-3s %s" %
+            ("TIME(s)", "COMM", "PID", "CPU", "FUNCTION"))
+else:
+    print("%-18s %s" % ("TIME(s)", "FUNCTION"))
+
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    ts = time.time() - start_ts
+
+    if verbose:
+        print("%-18.9f %-12.12s %-6d %-3d %s" %
+              (ts, event.comm.decode(), event.pid, cpu, function))
+    else:
+        print("%-18.9f %s" % (ts, function))
+
+    for addr in stack_traces.walk(event.stack_id):
+        sym = b.ksym(addr, show_offset=offset)
+        print("\t%s" % sym)
+
+    print()
+
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/examples/tracing/stacksnoop_example.txt b/examples/tracing/stacksnoop_example.txt
new file mode 100644
index 0000000..76784fd
--- /dev/null
+++ b/examples/tracing/stacksnoop_example.txt
@@ -0,0 +1,96 @@
+Demonstrations of stacksnoop, the Linux eBPF/bcc version.
+
+
+This program traces the given kernel function and prints the kernel stack trace
+for every call. This tool is useful for studying low frequency kernel functions,
+to see how they were invoked. For example, tracing the submit_bio() call:
+
+# ./stacksnoop submit_bio
+TIME(s)            SYSCALL
+3592.838736000     submit_bio
+        submit_bio
+        submit_bh
+        jbd2_journal_commit_transaction
+        kjournald2
+        kthread
+        ret_from_fork
+
+This shows that submit_bio() was called by submit_bh(), which was called
+by jbd2_journal_commit_transaction(), and so on. 
+
+For high frequency functions, see stackcount, which summarizes stack traces
+in-kernel for efficiency. If you don't know if your function is low or high
+frequency, try funccount.
+
+
+The -v option includes more fields, including the on-CPU process (COMM and PID):
+
+# ./stacksnoop -v submit_bio
+TIME(s)            COMM         PID    CPU SYSCALL
+3734.855027000     jbd2/dm-0-8  313    0   submit_bio
+        submit_bio
+        submit_bh
+        jbd2_journal_commit_transaction
+        kjournald2
+        kthread
+        ret_from_fork
+
+This identifies the application issuing the sync syscall: the jbd2 process
+(COMM column).
+
+
+Here's another example, showing the path to second_overflow() and on-CPU
+process:
+
+# ./stacksnoop -v second_overflow
+TIME(s)            COMM         PID    CPU SYSCALL
+3837.526433000     <idle>       0      1   second_overflow
+        second_overflow
+        tick_do_update_jiffies64
+        tick_irq_enter
+        irq_enter
+        smp_apic_timer_interrupt
+        apic_timer_interrupt
+        default_idle
+        arch_cpu_idle
+        default_idle_call
+        cpu_startup_entry
+        start_secondary
+
+3838.526953000     <idle>       0      1   second_overflow
+        second_overflow
+        tick_do_update_jiffies64
+        tick_irq_enter
+        irq_enter
+        smp_apic_timer_interrupt
+        apic_timer_interrupt
+        default_idle
+        arch_cpu_idle
+        default_idle_call
+        cpu_startup_entry
+        start_secondary
+
+This fires every second (see TIME(s)), and is from tick_do_update_jiffies64().
+
+
+USAGE message:
+
+# ./stacksnoop -h
+usage: stacksnoop [-h] [-p PID] [-s] [-v] function
+
+Trace and print kernel stack traces for a kernel function
+
+positional arguments:
+  function           kernel function name
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -p PID, --pid PID  trace this PID only
+  -s, --offset       show address offsets
+  -v, --verbose      print more fields
+
+examples:
+    ./stacksnoop ext4_sync_fs    # print kernel stack traces for ext4_sync_fs
+    ./stacksnoop -s ext4_sync_fs    # ... also show symbol offsets
+    ./stacksnoop -v ext4_sync_fs    # ... show extra columns
+    ./stacksnoop -p 185 ext4_sync_fs    # ... only when PID 185 is on-CPU
diff --git a/examples/tracing/strlen_count.py b/examples/tracing/strlen_count.py
new file mode 100755
index 0000000..49d7080
--- /dev/null
+++ b/examples/tracing/strlen_count.py
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+#
+# strlen_count  Trace strlen() and print a frequency count of strings.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# Written as a basic example of BCC and uprobes.
+#
+# Also see strlensnoop.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+
+struct key_t {
+    char c[80];
+};
+BPF_HASH(counts, struct key_t);
+
+int count(struct pt_regs *ctx) {
+    if (!PT_REGS_PARM1(ctx))
+        return 0;
+
+    struct key_t key = {};
+    u64 zero = 0, *val;
+
+    bpf_probe_read(&key.c, sizeof(key.c), (void *)PT_REGS_PARM1(ctx));
+    // could also use `counts.increment(key)`
+    val = counts.lookup_or_init(&key, &zero);
+    (*val)++;
+    return 0;
+};
+""")
+b.attach_uprobe(name="c", sym="strlen", fn_name="count")
+
+# header
+print("Tracing strlen()... Hit Ctrl-C to end.")
+
+# sleep until Ctrl-C
+try:
+    sleep(99999999)
+except KeyboardInterrupt:
+    pass
+
+# print output
+print("%10s %s" % ("COUNT", "STRING"))
+counts = b.get_table("counts")
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+    print("%10d \"%s\"" % (v.value, k.c.encode('string-escape')))
diff --git a/examples/tracing/strlen_hist.py b/examples/tracing/strlen_hist.py
new file mode 100755
index 0000000..dda1cb2
--- /dev/null
+++ b/examples/tracing/strlen_hist.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+#
+# strlen_hist.py   Histogram of system-wide strlen return values
+#
+# A basic example of using uprobes along with a histogram to show
+# distributions.
+#
+# Runs until ctrl-c is pressed.
+#
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Example output:
+# $ sudo ./strlen_hist.py
+# 22:12:52
+#      strlen return:      : count     distribution
+#          0 -> 1          : 2106     |****************                        |
+#          2 -> 3          : 1172     |*********                               |
+#          4 -> 7          : 3892     |******************************          |
+#          8 -> 15         : 5096     |****************************************|
+#         16 -> 31         : 2201     |*****************                       |
+#         32 -> 63         : 547      |****                                    |
+#         64 -> 127        : 106      |                                        |
+#        128 -> 255        : 13       |                                        |
+#        256 -> 511        : 27       |                                        |
+#        512 -> 1023       : 6        |                                        |
+#       1024 -> 2047       : 10       |                                        |
+# ^C$
+#
+
+from __future__ import print_function
+import bcc
+import time
+
+text = """
+#include <uapi/linux/ptrace.h>
+BPF_HISTOGRAM(dist);
+int count(struct pt_regs *ctx) {
+    dist.increment(bpf_log2l(PT_REGS_RC(ctx)));
+    return 0;
+}
+"""
+
+b = bcc.BPF(text=text)
+sym="strlen"
+b.attach_uretprobe(name="c", sym=sym, fn_name="count")
+
+dist = b["dist"]
+
+try:
+    while True:
+        time.sleep(1)
+        print("%-8s\n" % time.strftime("%H:%M:%S"), end="")
+        dist.print_log2_hist(sym + " return:")
+        dist.clear()
+
+except KeyboardInterrupt:
+    pass
diff --git a/examples/tracing/strlen_snoop.py b/examples/tracing/strlen_snoop.py
new file mode 100755
index 0000000..c3c7199
--- /dev/null
+++ b/examples/tracing/strlen_snoop.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+#
+# strlen_snoop  Trace strlen() library function for a given PID.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: strlensnoop PID
+#
+# Try running this on a separate bash shell.
+#
+# Written as a basic example of BCC and uprobes.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+from os import getpid
+import sys
+
+if len(sys.argv) < 2:
+    print("USAGE: strlensnoop PID")
+    exit()
+pid = sys.argv[1]
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+int printarg(struct pt_regs *ctx) {
+    if (!PT_REGS_PARM1(ctx))
+        return 0;
+
+    u32 pid = bpf_get_current_pid_tgid();
+    if (pid != PID)
+        return 0;
+
+    char str[80] = {};
+    bpf_probe_read(&str, sizeof(str), (void *)PT_REGS_PARM1(ctx));
+    bpf_trace_printk("%s\\n", &str);
+
+    return 0;
+};
+"""
+bpf_text = bpf_text.replace('PID', pid)
+b = BPF(text=bpf_text)
+b.attach_uprobe(name="c", sym="strlen", fn_name="printarg")
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "STRLEN"))
+
+# format output
+me = getpid()
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    if pid == me or msg == "":
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
diff --git a/examples/tracing/sync_timing.py b/examples/tracing/sync_timing.py
new file mode 100755
index 0000000..675ad14
--- /dev/null
+++ b/examples/tracing/sync_timing.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+#
+# sync_timing.py    Trace time between syncs.
+#                   For Linux, uses BCC, eBPF. Embedded C.
+#
+# Written as a basic example of tracing time between events.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(last);
+
+int do_trace(struct pt_regs *ctx) {
+    u64 ts, *tsp, delta, key = 0;
+
+    // attempt to read stored timestamp
+    tsp = last.lookup(&key);
+    if (tsp != 0) {
+        delta = bpf_ktime_get_ns() - *tsp;
+        if (delta < 1000000000) {
+            // output if time is less than 1 second
+            bpf_trace_printk("%d\\n", delta / 1000000);
+        }
+        last.delete(&key);
+    }
+
+    // update stored timestamp
+    ts = bpf_ktime_get_ns();
+    last.update(&key, &ts);
+    return 0;
+}
+""")
+
+b.attach_kprobe(event=b.get_syscall_fnname("sync"), fn_name="do_trace")
+print("Tracing for quick sync's... Ctrl-C to end")
+
+# format output
+start = 0
+while 1:
+    (task, pid, cpu, flags, ts, ms) = b.trace_fields()
+    if start == 0:
+        start = ts
+    ts = ts - start
+    print("At time %.2f s: multiple syncs detected, last %s ms ago" % (ts, ms))
diff --git a/examples/tracing/task_switch.py b/examples/tracing/task_switch.py
new file mode 100755
index 0000000..161edfb
--- /dev/null
+++ b/examples/tracing/task_switch.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+from time import sleep
+
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct key_t {
+  u32 prev_pid;
+  u32 curr_pid;
+};
+// map_type, key_type, leaf_type, table_name, num_entry
+BPF_HASH(stats, struct key_t, u64, 1024);
+int count_sched(struct pt_regs *ctx, struct task_struct *prev) {
+  struct key_t key = {};
+  u64 zero = 0, *val;
+
+  key.curr_pid = bpf_get_current_pid_tgid();
+  key.prev_pid = prev->pid;
+
+  // could also use `stats.increment(key);`
+  val = stats.lookup_or_init(&key, &zero);
+  (*val)++;
+  return 0;
+}
+""")
+b.attach_kprobe(event="finish_task_switch", fn_name="count_sched")
+
+# generate many schedule events
+for i in range(0, 100): sleep(0.01)
+
+for k, v in b["stats"].items():
+    print("task_switch[%5d->%5d]=%u" % (k.prev_pid, k.curr_pid, v.value))
diff --git a/examples/tracing/tcpv4connect.py b/examples/tracing/tcpv4connect.py
new file mode 100755
index 0000000..8a89469
--- /dev/null
+++ b/examples/tracing/tcpv4connect.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python
+#
+# tcpv4connect	Trace TCP IPv4 connect()s.
+#		For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpv4connect [-h] [-t] [-p PID]
+#
+# This is provided as a basic example of TCP connection & socket tracing.
+#
+# All IPv4 connection attempts are traced, even if they ultimately fail.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Oct-2015	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(currsock, u32, struct sock *);
+
+int kprobe__tcp_v4_connect(struct pt_regs *ctx, struct sock *sk)
+{
+	u32 pid = bpf_get_current_pid_tgid();
+
+	// stash the sock ptr for lookup on return
+	currsock.update(&pid, &sk);
+
+	return 0;
+};
+
+int kretprobe__tcp_v4_connect(struct pt_regs *ctx)
+{
+	int ret = PT_REGS_RC(ctx);
+	u32 pid = bpf_get_current_pid_tgid();
+
+	struct sock **skpp;
+	skpp = currsock.lookup(&pid);
+	if (skpp == 0) {
+		return 0;	// missed entry
+	}
+
+	if (ret != 0) {
+		// failed to send SYN packet, may not have populated
+		// socket __sk_common.{skc_rcv_saddr, ...}
+		currsock.delete(&pid);
+		return 0;
+	}
+
+	// pull in details
+	struct sock *skp = *skpp;
+	u32 saddr = skp->__sk_common.skc_rcv_saddr;
+	u32 daddr = skp->__sk_common.skc_daddr;
+	u16 dport = skp->__sk_common.skc_dport;
+
+	// output
+	bpf_trace_printk("trace_tcp4connect %x %x %d\\n", saddr, daddr, ntohs(dport));
+
+	currsock.delete(&pid);
+
+	return 0;
+}
+"""
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# header
+print("%-6s %-12s %-16s %-16s %-4s" % ("PID", "COMM", "SADDR", "DADDR",
+    "DPORT"))
+
+def inet_ntoa(addr):
+	dq = ''
+	for i in range(0, 4):
+		dq = dq + str(addr & 0xff)
+		if (i != 3):
+			dq = dq + '.'
+		addr = addr >> 8
+	return dq
+
+# filter and format output
+while 1:
+	# Read messages from kernel pipe
+	try:
+	    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+	    (_tag, saddr_hs, daddr_hs, dport_s) = msg.split(" ")
+	except ValueError:
+	    # Ignore messages from other tracers
+	    continue
+
+	# Ignore messages from other tracers
+	if _tag != "trace_tcp4connect":
+	    continue
+
+	print("%-6d %-12.12s %-16s %-16s %-4s" % (pid, task,
+	    inet_ntoa(int(saddr_hs, 16)),
+	    inet_ntoa(int(daddr_hs, 16)),
+	    dport_s))
diff --git a/examples/tracing/tcpv4connect_example.txt b/examples/tracing/tcpv4connect_example.txt
new file mode 100644
index 0000000..0ff06e3
--- /dev/null
+++ b/examples/tracing/tcpv4connect_example.txt
@@ -0,0 +1,23 @@
+Demonstrations of tcpv4connect.py, the Linux eBPF/bcc version.
+
+
+This example traces the kernel function performing active TCP IPv4 connections
+(eg, via a connect() syscall; connections via accept() are passive). Some example
+output (IP addresses changed to protect the innocent):
+
+# ./tcpv4connect.py
+PID    COMM         SADDR            DADDR            DPORT
+1479   telnet       127.0.0.1        127.0.0.1        23  
+1469   curl         10.201.219.236   54.245.105.25    80  
+1469   curl         10.201.219.236   54.67.101.145    80  
+
+This output shows three connections, one from a "telnet" process and two from
+"curl". The output details shows the source address, destination address,
+and destination port. This traces attempted connections: these may have failed.
+
+The overhead of this tool should be negligible, since it is only tracing the
+kernel function performing a connect. It is not tracing every packet and then
+filtering.
+
+This is provided as a basic example of TCP tracing. See tools/tcpconnect for a
+more featured version of this example (a tool).
diff --git a/examples/tracing/trace_fields.py b/examples/tracing/trace_fields.py
new file mode 100755
index 0000000..63a7b53
--- /dev/null
+++ b/examples/tracing/trace_fields.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# This is an example of tracing an event and printing custom fields.
+# run in project examples directory with:
+# sudo ./trace_fields.py
+
+from __future__ import print_function
+from bcc import BPF
+
+prog = """
+int hello(void *ctx) {
+  bpf_trace_printk("Hello, World!\\n");
+  return 0;
+}
+"""
+b = BPF(text=prog)
+b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")
+print("PID MESSAGE")
+b.trace_print(fmt="{1} {5}")
diff --git a/examples/tracing/trace_perf_output.py b/examples/tracing/trace_perf_output.py
new file mode 100755
index 0000000..865a459
--- /dev/null
+++ b/examples/tracing/trace_perf_output.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# This is an example of tracing an event and printing custom fields.
+# run in project examples directory with:
+# sudo ./trace_perf_output.py
+
+import atexit
+from bcc import BPF
+import ctypes as ct
+
+class Data(ct.Structure):
+    _fields_ = [("ts", ct.c_ulonglong),
+                ("magic", ct.c_ulonglong)]
+
+counter = 0
+def cb(cpu, data, size):
+    assert size >= ct.sizeof(Data)
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("[%0d] %f: %x" % (cpu, float(event.ts) / 1000000, event.magic))
+    global counter
+    counter += 1
+
+prog = """
+BPF_PERF_OUTPUT(events);
+BPF_ARRAY(counters, u64, 10);
+int do_sys_clone(void *ctx) {
+  struct {
+    u64 ts;
+    u64 magic;
+  } data = {bpf_ktime_get_ns(), 0x12345678};
+  int rc;
+  if ((rc = events.perf_submit(ctx, &data, sizeof(data))) < 0)
+    bpf_trace_printk("perf_output failed: %d\\n", rc);
+  int zero = 0;
+  u64 *val = counters.lookup(&zero);
+  if (val) lock_xadd(val, 1);
+  return 0;
+}
+"""
+b = BPF(text=prog)
+event_name = b.get_syscall_fnname("clone")
+b.attach_kprobe(event=event_name, fn_name="do_sys_clone")
+b["events"].open_perf_buffer(cb)
+
+@atexit.register
+def print_counter():
+    global counter
+    global b
+    print("counter = %d vs %d" % (counter, b["counters"][ct.c_int(0)].value))
+
+print("Tracing " + event_name + ", try `dd if=/dev/zero of=/dev/null`")
+print("Tracing... Hit Ctrl-C to end.")
+while 1:
+    b.perf_buffer_poll()
diff --git a/examples/tracing/urandomread-explicit.py b/examples/tracing/urandomread-explicit.py
new file mode 100755
index 0000000..448ffdf
--- /dev/null
+++ b/examples/tracing/urandomread-explicit.py
@@ -0,0 +1,52 @@
+#!/usr/bin/python
+#
+# urandomread-explicit  Example of instrumenting a kernel tracepoint.
+#                       For Linux, uses BCC, BPF. Embedded C.
+#
+# This is an older example of instrumenting a tracepoint, which defines
+# the argument struct and makes an explicit call to attach_tracepoint().
+# See urandomread for a newer version that uses TRACEPOINT_PROBE().
+#
+# REQUIRES: Linux 4.7+ (BPF_PROG_TYPE_TRACEPOINT support).
+#
+# Test by running this, then in another shell, run:
+#     dd if=/dev/urandom of=/dev/null bs=1k count=5
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+struct urandom_read_args {
+    // from /sys/kernel/debug/tracing/events/random/urandom_read/format
+    u64 __unused__;
+    u32 got_bits;
+    u32 pool_left;
+    u32 input_left;
+};
+
+int printarg(struct urandom_read_args *args) {
+    bpf_trace_printk("%d\\n", args->got_bits);
+    return 0;
+}
+"""
+
+# load BPF program
+b = BPF(text=bpf_text)
+b.attach_tracepoint(tp="random:urandom_read", fn_name="printarg")
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "GOTBITS"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
diff --git a/examples/tracing/urandomread.py b/examples/tracing/urandomread.py
new file mode 100755
index 0000000..319db2c
--- /dev/null
+++ b/examples/tracing/urandomread.py
@@ -0,0 +1,35 @@
+#!/usr/bin/python
+#
+# urandomread  Example of instrumenting a kernel tracepoint.
+#              For Linux, uses BCC, BPF. Embedded C.
+#
+# REQUIRES: Linux 4.7+ (BPF_PROG_TYPE_TRACEPOINT support).
+#
+# Test by running this, then in another shell, run:
+#     dd if=/dev/urandom of=/dev/null bs=1k count=5
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+
+# load BPF program
+b = BPF(text="""
+TRACEPOINT_PROBE(random, urandom_read) {
+    // args is from /sys/kernel/debug/tracing/events/random/urandom_read/format
+    bpf_trace_printk("%d\\n", args->got_bits);
+    return 0;
+}
+""")
+
+# header
+print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "GOTBITS"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg))
diff --git a/examples/tracing/urandomread_example.txt b/examples/tracing/urandomread_example.txt
new file mode 100755
index 0000000..43d962b
--- /dev/null
+++ b/examples/tracing/urandomread_example.txt
@@ -0,0 +1,20 @@
+Examples of urandomread.py, the Linux eBPF/bcc version.
+
+
+To demonstrate this, the following workload was issued:
+
+# dd if=/dev/urandom of=/dev/null bs=1k count=5
+
+While urandomread.py was tracing in another session:
+
+# ./urandomread.py
+TIME(s)            COMM             PID    GOTBITS
+22592556.392825000 dd               14228  8192
+22592556.392949000 dd               14228  8192
+22592556.393068999 dd               14228  8192
+22592556.393183999 dd               14228  8192
+22592556.393298000 dd               14228  8192
+
+The GOTBITS of 8192 matches the workload of 1 Kbyte (8 Kbit) reads.
+
+This program was really written as a simple example of tracing a tracepoint.
diff --git a/examples/tracing/vfsreadlat.c b/examples/tracing/vfsreadlat.c
new file mode 100644
index 0000000..77da22e
--- /dev/null
+++ b/examples/tracing/vfsreadlat.c
@@ -0,0 +1,44 @@
+/*
+ * vfsreadlat.c		VFS read latency distribution.
+ *			For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Copyright (c) 2013-2015 PLUMgrid, http://plumgrid.com
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * 15-Aug-2015	Brendan Gregg	Created this.
+ */
+
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(start, u32);
+BPF_HISTOGRAM(dist);
+
+int do_entry(struct pt_regs *ctx)
+{
+	u32 pid;
+	u64 ts, *val;
+
+	pid = bpf_get_current_pid_tgid();
+	ts = bpf_ktime_get_ns();
+	start.update(&pid, &ts);
+	return 0;
+}
+
+int do_return(struct pt_regs *ctx)
+{
+	u32 pid;
+	u64 *tsp, delta;
+
+	pid = bpf_get_current_pid_tgid();
+	tsp = start.lookup(&pid);
+
+	if (tsp != 0) {
+		delta = bpf_ktime_get_ns() - *tsp;
+		dist.increment(bpf_log2l(delta / 1000));
+		start.delete(&pid);
+	}
+
+	return 0;
+}
diff --git a/examples/tracing/vfsreadlat.py b/examples/tracing/vfsreadlat.py
new file mode 100755
index 0000000..b2c4156
--- /dev/null
+++ b/examples/tracing/vfsreadlat.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+#
+# vfsreadlat.py		VFS read latency distribution.
+#			For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of a function latency distribution histogram.
+#
+# USAGE: vfsreadlat.py [interval [count]]
+#
+# The default interval is 5 seconds. A Ctrl-C will print the partially
+# gathered histogram then exit.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Aug-2015	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from ctypes import c_ushort, c_int, c_ulonglong
+from time import sleep
+from sys import argv
+
+def usage():
+	print("USAGE: %s [interval [count]]" % argv[0])
+	exit()
+
+# arguments
+interval = 5
+count = -1
+if len(argv) > 1:
+	try:
+		interval = int(argv[1])
+		if interval == 0:
+			raise
+		if len(argv) > 2:
+			count = int(argv[2])
+	except:	# also catches -h, --help
+		usage()
+
+# load BPF program
+b = BPF(src_file = "vfsreadlat.c")
+b.attach_kprobe(event="vfs_read", fn_name="do_entry")
+b.attach_kretprobe(event="vfs_read", fn_name="do_return")
+
+# header
+print("Tracing... Hit Ctrl-C to end.")
+
+# output
+loop = 0
+do_exit = 0
+while (1):
+	if count > 0:
+		loop += 1
+		if loop > count:
+			exit()
+	try:
+		sleep(interval)
+	except KeyboardInterrupt:
+		pass; do_exit = 1
+
+	print()
+	b["dist"].print_log2_hist("usecs")
+	b["dist"].clear()
+	if do_exit:
+		exit()
diff --git a/examples/tracing/vfsreadlat_example.txt b/examples/tracing/vfsreadlat_example.txt
new file mode 100644
index 0000000..1d95f6a
--- /dev/null
+++ b/examples/tracing/vfsreadlat_example.txt
@@ -0,0 +1,63 @@
+Demonstrations of vfsreadlat.py, the Linux eBPF/bcc version.
+
+
+This example traces the latency of vfs_read (time from call to return), printing 
+it as a histogram distribution. By default, output is every five seconds:
+
+# ./vfsreadlat.py 
+Tracing... Hit Ctrl-C to end.
+
+     usecs           : count     distribution
+       0 -> 1        : 4457     |*************************************+|
+       2 -> 3        : 447      |***                                   |
+       4 -> 7        : 2059     |*****************                     |
+       8 -> 15       : 1179     |**********                            |
+      16 -> 31       : 63       |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 2        |                                      |
+     128 -> 255      : 0        |                                      |
+     256 -> 511      : 3        |                                      |
+     512 -> 1023     : 1        |                                      |
+    1024 -> 2047     : 3        |                                      |
+    2048 -> 4095     : 2        |                                      |
+    4096 -> 8191     : 0        |                                      |
+    8192 -> 16383    : 0        |                                      |
+   16384 -> 32767    : 0        |                                      |
+   32768 -> 65535    : 0        |                                      |
+   65536 -> 131071   : 4        |                                      |
+  131072 -> 262143   : 2        |                                      |
+  262144 -> 524287   : 0        |                                      |
+  524288 -> 1048575  : 4        |                                      |
+^C
+     usecs           : count     distribution
+       0 -> 1        : 241      |*************************************+|
+       2 -> 3        : 17       |**                                    |
+       4 -> 7        : 2        |                                      |
+       8 -> 15       : 4        |                                      |
+      16 -> 31       : 2        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 0        |                                      |
+     256 -> 511      : 1        |                                      |
+     512 -> 1023     : 1        |                                      |
+    1024 -> 2047     : 0        |                                      |
+    2048 -> 4095     : 1        |                                      |
+    4096 -> 8191     : 0        |                                      |
+    8192 -> 16383    : 0        |                                      |
+   16384 -> 32767    : 0        |                                      |
+   32768 -> 65535    : 0        |                                      |
+   65536 -> 131071   : 0        |                                      |
+  131072 -> 262143   : 0        |                                      |
+  262144 -> 524287   : 0        |                                      |
+  524288 -> 1048575  : 1        |                                      |
+
+These examples show outliers in the 524 - 1048 milliseconds range. Since
+vfs_read() will catch many types of events, this could be anything including
+keystroke latency on ssh sessions. Further drilling with bcc will be necessary
+to identify more details.
+
+
+Full usage:
+
+# ./vfsreadlat.py -h
+USAGE: ./vfsreadlat.py [interval [count]]
diff --git a/examples/usdt_sample/CMakeLists.txt b/examples/usdt_sample/CMakeLists.txt
new file mode 100755
index 0000000..04e5092
--- /dev/null
+++ b/examples/usdt_sample/CMakeLists.txt
@@ -0,0 +1,7 @@
+cmake_minimum_required(VERSION 3.0)
+
+# This sample requires C++11 enabled.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Weffc++")
+
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/usdt_sample_lib1)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/usdt_sample_app1)
diff --git a/examples/usdt_sample/scripts/bpf_text_shared.c b/examples/usdt_sample/scripts/bpf_text_shared.c
new file mode 100644
index 0000000..d8e7464
--- /dev/null
+++ b/examples/usdt_sample/scripts/bpf_text_shared.c
@@ -0,0 +1,55 @@
+#include <linux/blkdev.h>
+#include <uapi/linux/ptrace.h>
+
+/**
+ * @brief Helper method to filter based on the specified inputString.
+ * @param inputString The operation input string to check against the filter.
+ * @return True if the specified inputString starts with the hard-coded FILTER_STRING; otherwise, false.
+ */
+static inline bool filter(char const* inputString)
+{
+    char needle[] = "FILTER_STRING"; ///< The FILTER STRING is replaced by python code.
+    char haystack[sizeof(needle)] = {};
+    bpf_probe_read(&haystack, sizeof(haystack), (void*)inputString);
+    for (int i = 0; i < sizeof(needle) - 1; ++i) {
+        if (needle[i] != haystack[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * @brief Contains the operation start data to trace.
+ */
+struct start_data_t
+{
+    u64 operation_id; ///< The id of the operation.
+    char input[64];   ///< The input string of the request.
+    u64 start;        ///< Timestamp of the start operation (start timestamp).
+};
+
+/**
+ * @brief Contains the operation start data.
+ * key: the operation id.
+ * value: The operation start latency data.
+ */
+BPF_HASH(start_hash, u64, struct start_data_t);
+
+/**
+ * @brief Reads the operation request arguments and stores the start data in the hash.
+ * @param ctx The BPF context.
+ */
+int trace_operation_start(struct pt_regs* ctx)
+{
+    struct start_data_t start_data = {};
+    bpf_usdt_readarg_p(2, ctx, &start_data.input, sizeof(start_data.input));
+
+    FILTER ///< Replaced by python code.
+
+    bpf_usdt_readarg(1, ctx, &start_data.operation_id);
+
+    start_data.start = bpf_ktime_get_ns();
+    start_hash.update(&start_data.operation_id, &start_data);
+    return 0;
+}
diff --git a/examples/usdt_sample/scripts/lat_avg.py b/examples/usdt_sample/scripts/lat_avg.py
new file mode 100755
index 0000000..be473d1
--- /dev/null
+++ b/examples/usdt_sample/scripts/lat_avg.py
@@ -0,0 +1,126 @@
+import argparse
+from time import sleep, strftime
+from sys import argv
+import ctypes as ct
+from bcc import BPF, USDT
+import inspect
+import os
+
+# Parse command line arguments
+parser = argparse.ArgumentParser(description="Trace the moving average of the latency of an operation using usdt probes.",
+    formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-p", "--pid", type=int, help="The id of the process to trace.")
+parser.add_argument("-i", "--interval", type=int, help="The interval in seconds on which to report the latency distribution.")
+parser.add_argument("-c", "--count", type=int, default=16, help="The count of samples over which to calculate the moving average.")
+parser.add_argument("-f", "--filterstr", type=str, default="", help="The prefix filter for the operation input. If specified, only operations for which the input string starts with the filterstr are traced.")
+parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", help="If true, will output verbose logging information.")
+parser.set_defaults(verbose=False)
+args = parser.parse_args()
+this_pid = int(args.pid)
+this_interval = int(args.interval)
+this_count = int(args.count)
+this_filter = str(args.filterstr)
+
+if this_interval < 1:
+    print("Invalid value for interval, using 1.")
+    this_interval = 1
+
+if this_count < 1:
+    print("Invalid value for count, using 1.")
+    this_count = 1
+
+debugLevel=0
+if args.verbose:
+    debugLevel=4
+
+# BPF program
+bpf_text_shared = "%s/bpf_text_shared.c" % os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+bpf_text = open(bpf_text_shared, 'r').read()
+bpf_text += """
+
+const u32 MAX_SAMPLES = SAMPLE_COUNT;
+
+struct hash_key_t
+{
+    char input[64];
+};
+
+struct hash_leaf_t
+{
+    u32 count;
+    u64 total;
+    u64 average;
+};
+
+/**
+ * @brief Contains the averages for the operation latencies by operation input.
+ */
+BPF_HASH(lat_hash, struct hash_key_t, struct hash_leaf_t, 512);
+
+/**
+ * @brief Reads the operation response arguments, calculates the latency, and stores it in the histogram.
+ * @param ctx The BPF context.
+ */
+int trace_operation_end(struct pt_regs* ctx)
+{
+    u64 operation_id;
+    bpf_usdt_readarg(1, ctx, &operation_id);
+
+    struct start_data_t* start_data = start_hash.lookup(&operation_id);
+    if (0 == start_data) {
+        return 0;
+    }
+
+    u64 duration = bpf_ktime_get_ns() - start_data->start;
+    struct hash_key_t hash_key = {};
+    __builtin_memcpy(&hash_key.input, start_data->input, sizeof(hash_key.input));
+    start_hash.delete(&operation_id);
+
+    struct hash_leaf_t zero = {};
+    struct hash_leaf_t* hash_leaf = lat_hash.lookup_or_init(&hash_key, &zero);
+    if (0 == hash_leaf) {
+        return 0;
+    }
+
+    if (hash_leaf->count < MAX_SAMPLES) {
+        hash_leaf->count++;
+    } else {
+        hash_leaf->total -= hash_leaf->average;
+    }
+
+    hash_leaf->total += duration;
+    hash_leaf->average = hash_leaf->total / hash_leaf->count;
+
+    return 0;
+}
+"""
+
+bpf_text = bpf_text.replace("SAMPLE_COUNT", str(this_count))
+bpf_text = bpf_text.replace("FILTER_STRING", this_filter)
+if this_filter:
+    bpf_text = bpf_text.replace("FILTER", "if (!filter(start_data.input)) { return 0; }")
+else:
+    bpf_text = bpf_text.replace("FILTER", "")
+
+# Create USDT context
+print("Attaching probes to pid %d" % this_pid)
+usdt_ctx = USDT(pid=this_pid)
+usdt_ctx.enable_probe(probe="operation_start", fn_name="trace_operation_start")
+usdt_ctx.enable_probe(probe="operation_end", fn_name="trace_operation_end")
+
+# Create BPF context, load BPF program
+bpf_ctx = BPF(text=bpf_text, usdt_contexts=[usdt_ctx], debug=debugLevel)
+
+print("Tracing... Hit Ctrl-C to end.")
+
+lat_hash = bpf_ctx.get_table("lat_hash")
+while (1):
+    try:
+        sleep(this_interval)
+    except KeyboardInterrupt:
+        exit()
+
+    print("[%s]" % strftime("%H:%M:%S"))
+    print("%-64s %8s %16s" % ("input", "count", "latency (us)"))
+    for k, v in lat_hash.items():
+        print("%-64s %8d %16d" % (k.input, v.count, v.average / 1000))
diff --git a/examples/usdt_sample/scripts/lat_dist.py b/examples/usdt_sample/scripts/lat_dist.py
new file mode 100755
index 0000000..af13e89
--- /dev/null
+++ b/examples/usdt_sample/scripts/lat_dist.py
@@ -0,0 +1,98 @@
+import argparse
+from time import sleep, strftime
+from sys import argv
+import ctypes as ct
+from bcc import BPF, USDT
+import inspect
+import os
+
+# Parse command line arguments
+parser = argparse.ArgumentParser(description="Trace the latency distribution of an operation using usdt probes.",
+    formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-p", "--pid", type=int, help="The id of the process to trace.")
+parser.add_argument("-i", "--interval", type=int, help="The interval in seconds on which to report the latency distribution.")
+parser.add_argument("-f", "--filterstr", type=str, default="", help="The prefix filter for the operation input. If specified, only operations for which the input string starts with the filterstr are traced.")
+parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", help="If true, will output verbose logging information.")
+parser.set_defaults(verbose=False)
+args = parser.parse_args()
+this_pid = int(args.pid)
+this_interval = int(args.interval)
+this_filter = str(args.filterstr)
+
+if this_interval < 1:
+    print("Invalid value for interval, using 1.")
+    this_interval = 1
+
+debugLevel=0
+if args.verbose:
+    debugLevel=4
+
+# BPF program
+bpf_text_shared = "%s/bpf_text_shared.c" % os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+bpf_text = open(bpf_text_shared, 'r').read()
+bpf_text += """
+
+/**
+ * @brief The key to use for the latency histogram.
+ */
+struct dist_key_t
+{
+    char input[64];   ///< The input string of the request.
+    u64 slot;         ///< The histogram slot.
+};
+
+/**
+ * @brief Contains the histogram for the operation latencies.
+ */
+BPF_HISTOGRAM(dist, struct dist_key_t);
+
+/**
+ * @brief Reads the operation response arguments, calculates the latency, and stores it in the histogram.
+ * @param ctx The BPF context.
+ */
+int trace_operation_end(struct pt_regs* ctx)
+{
+    u64 operation_id;
+    bpf_usdt_readarg(1, ctx, &operation_id);
+
+    struct start_data_t* start_data = start_hash.lookup(&operation_id);
+    if (0 == start_data) {
+        return 0;
+    }
+
+    u64 duration = bpf_ktime_get_ns() - start_data->start;
+    struct dist_key_t dist_key = {};
+    __builtin_memcpy(&dist_key.input, start_data->input, sizeof(dist_key.input));
+    dist_key.slot = bpf_log2l(duration / 1000);
+    start_hash.delete(&operation_id);
+
+    dist.increment(dist_key);
+    return 0;
+}
+"""
+
+bpf_text = bpf_text.replace("FILTER_STRING", this_filter)
+if this_filter:
+    bpf_text = bpf_text.replace("FILTER", "if (!filter(start_data.input)) { return 0; }")
+else:
+    bpf_text = bpf_text.replace("FILTER", "")
+
+# Create USDT context
+print("Attaching probes to pid %d" % this_pid)
+usdt_ctx = USDT(pid=this_pid)
+usdt_ctx.enable_probe(probe="operation_start", fn_name="trace_operation_start")
+usdt_ctx.enable_probe(probe="operation_end", fn_name="trace_operation_end")
+
+# Create BPF context, load BPF program
+bpf_ctx = BPF(text=bpf_text, usdt_contexts=[usdt_ctx], debug=debugLevel)
+
+start = 0
+dist = bpf_ctx.get_table("dist")
+while (1):
+    try:
+        sleep(this_interval)
+    except KeyboardInterrupt:
+        exit()
+
+    print("[%s]" % strftime("%H:%M:%S"))
+    dist.print_log2_hist("latency (us)")
diff --git a/examples/usdt_sample/scripts/latency.py b/examples/usdt_sample/scripts/latency.py
new file mode 100755
index 0000000..4170592
--- /dev/null
+++ b/examples/usdt_sample/scripts/latency.py
@@ -0,0 +1,117 @@
+import argparse
+from time import sleep
+from sys import argv
+import ctypes as ct
+from bcc import BPF, USDT
+import inspect
+import os
+
+# Parse command line arguments
+parser = argparse.ArgumentParser(description="Trace the latency of an operation using usdt probes.",
+    formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-p", "--pid", type=int, help="The id of the process to trace.")
+parser.add_argument("-f", "--filterstr", type=str, default="", help="The prefix filter for the operation input. If specified, only operations for which the input string starts with the filterstr are traced.")
+parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", help="If true, will output verbose logging information.")
+parser.set_defaults(verbose=False)
+args = parser.parse_args()
+this_pid = int(args.pid)
+this_filter = str(args.filterstr)
+
+debugLevel=0
+if args.verbose:
+    debugLevel=4
+
+# BPF program
+bpf_text_shared = "%s/bpf_text_shared.c" % os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+bpf_text = open(bpf_text_shared, 'r').read()
+bpf_text += """
+
+/**
+ * @brief Contains the latency data w.r.t. the complete operation from request to response.
+ */
+struct end_data_t
+{
+    u64 operation_id; ///< The id of the operation.
+    char input[64];   ///< The request (input) string.
+    char output[64];  ///< The response (output) string.
+    u64 start;        ///< The start timestamp of the operation.
+    u64 end;          ///< The end timestamp of the operation.
+    u64 duration;     ///< The duration of the operation.
+};
+
+/**
+ * The output buffer, which will be used to push the latency event data to user space.
+ */
+BPF_PERF_OUTPUT(operation_event);
+
+/**
+ * @brief Reads the operation response arguments, calculates the latency event data, and writes it to the user output buffer.
+ * @param ctx The BPF context.
+ */
+int trace_operation_end(struct pt_regs* ctx)
+{
+    u64 operation_id;
+    bpf_usdt_readarg(1, ctx, &operation_id);
+
+    struct start_data_t* start_data = start_hash.lookup(&operation_id);
+    if (0 == start_data) {
+        return 0;
+    }
+
+    struct end_data_t end_data = {};
+    end_data.operation_id = operation_id;
+    bpf_usdt_readarg_p(2, ctx, &end_data.output, sizeof(end_data.output));
+    end_data.end = bpf_ktime_get_ns();
+    end_data.start = start_data->start;
+    end_data.duration = end_data.end - end_data.start;
+    __builtin_memcpy(&end_data.input, start_data->input, sizeof(end_data.input));
+
+    start_hash.delete(&end_data.operation_id);
+
+    operation_event.perf_submit(ctx, &end_data, sizeof(end_data));
+    return 0;
+}
+"""
+
+bpf_text = bpf_text.replace("FILTER_STRING", this_filter)
+if this_filter:
+    bpf_text = bpf_text.replace("FILTER", "if (!filter(start_data.input)) { return 0; }")
+else:
+    bpf_text = bpf_text.replace("FILTER", "")
+
+# Create USDT context
+print("Attaching probes to pid %d" % this_pid)
+usdt_ctx = USDT(pid=this_pid)
+usdt_ctx.enable_probe(probe="operation_start", fn_name="trace_operation_start")
+usdt_ctx.enable_probe(probe="operation_end", fn_name="trace_operation_end")
+
+# Create BPF context, load BPF program
+bpf_ctx = BPF(text=bpf_text, usdt_contexts=[usdt_ctx], debug=debugLevel)
+
+# Define latency event and print function
+class OperationEventData(ct.Structure):
+  _fields_ = [("operation_id", ct.c_ulonglong),
+              ("input", ct.c_char * 64),
+              ("output", ct.c_char * 64),
+              ("start", ct.c_ulonglong),
+              ("end", ct.c_ulonglong),
+              ("duration", ct.c_ulonglong)]
+
+start = 0
+def print_event(cpu, data, size):
+    global start
+    event = ct.cast(data, ct.POINTER(OperationEventData)).contents
+    if start == 0:
+        start = event.start
+    time_s = (float(event.start - start)) / 1000000000
+    latency = (float(event.duration) / 1000)
+    print("%-18.9f %-10d %-32s %-32s %16d %16d %16d" % (time_s, event.operation_id, event.input, event.output, event.start, event.end, latency))
+
+# Print header
+print("Tracing... Hit Ctrl-C to end.")
+print("%-18s %-10s %-32s %-32s %16s %16s %16s" % ("time(s)", "id", "input", "output", "start (ns)", "end (ns)", "duration (us)"))
+
+# Output latency events
+bpf_ctx["operation_event"].open_perf_buffer(print_event)
+while 1:
+    bpf_ctx.perf_buffer_poll()
diff --git a/examples/usdt_sample/usdt_sample.md b/examples/usdt_sample/usdt_sample.md
new file mode 100644
index 0000000..c6b5a07
--- /dev/null
+++ b/examples/usdt_sample/usdt_sample.md
@@ -0,0 +1,163 @@
+Tested on Fedora25 4.11.3-200.fc25.x86_64, gcc (GCC) 6.3.1 20161221 (Red Hat 6.3.1-1)
+
+As an alternative to using ...bcc/tests/python/include/folly/tracing/StaticTracepoint.h,
+it's possible to use systemtap-sdt-devel.
+However, this is *not* required for this sample.
+```bash
+$ sudo dnf install systemtap-sdt-devel  # For Fedora25; other distros might have differently named packages.
+```
+
+If using systemtap-sdt-devel, the following commands can be used to generate the corresponding header and object files:
+Also see the CMakeLists.txt file for an example of how to do this using CMake.
+```bash
+$ dtrace -h -s usdt_sample_lib1/src/lib1_sdt.d -o usdt_sample_lib1/include/usdt_sample_lib1/lib1_sdt.h
+$ dtrace -G -s usdt_sample_lib1/src/lib1_sdt.d -o lib1_sdt.o
+```
+
+Build the sample:
+```bash
+$ pwd
+~/src/bcc
+$ mkdir -p examples/usdt_sample/build && pushd examples/usdt_sample/build
+$ cmake .. && make
+$ popd
+```
+
+After building, you should see the available probes:
+```bash
+$ python tools/tplist.py -l examples/usdt_sample/build/usdt_sample_lib1/libusdt_sample_lib1.so
+examples/usdt_sample/build/usdt_sample_lib1/libusdt_sample_lib1.so usdt_sample_lib1:operation_end
+examples/usdt_sample/build/usdt_sample_lib1/libusdt_sample_lib1.so usdt_sample_lib1:operation_start
+$ readelf -n examples/usdt_sample/build/usdt_sample_lib1/libusdt_sample_lib1.so
+
+Displaying notes found at file offset 0x000001c8 with length 0x00000024:
+  Owner                 Data size	Description
+  GNU                  0x00000014	NT_GNU_BUILD_ID (unique build ID bitstring)
+    Build ID: 3930c19f654990159563394669f2ed5281513302
+
+Displaying notes found at file offset 0x0001b9ec with length 0x000000c0:
+  Owner                 Data size	Description
+  stapsdt              0x00000047	NT_STAPSDT (SystemTap probe descriptors)
+    Provider: usdt_sample_lib1
+    Name: operation_end
+    Location: 0x000000000000ed6d, Base: 0x0000000000000000, Semaphore: 0x0000000000000000
+    Arguments: -8@%rbx -8@%rax
+  stapsdt              0x0000004e	NT_STAPSDT (SystemTap probe descriptors)
+    Provider: usdt_sample_lib1
+    Name: operation_start
+    Location: 0x000000000000ee2c, Base: 0x0000000000000000, Semaphore: 0x0000000000000000
+    Arguments: -8@-24(%rbp) -8@%rax
+```
+
+Start the usdt sample application:
+```bash
+$ examples/usdt_sample/build/usdt_sample_app1/usdt_sample_app1 "pf" 1 30 10 1 50
+Applying the following parameters:
+Input prefix: pf.
+Input range: [1, 30].
+Calls Per Second: 10.
+Latency range: [1, 50] ms.
+You can now run the bcc scripts, see usdt_sample.md for examples.
+pid: 25433
+Press ctrl-c to exit.
+```
+
+Use argdist.py on the individual probes:
+```bash
+$ sudo python tools/argdist.py -p 25433 -i 5 -C 'u:usdt_sample_lib1:operation_start():char*:arg2#input' -z 32
+[11:18:29]
+input
+	COUNT      EVENT
+	1          arg2 = pf_10
+	1          arg2 = pf_5
+	1          arg2 = pf_12
+	1          arg2 = pf_1
+	1          arg2 = pf_11
+	1          arg2 = pf_28
+	1          arg2 = pf_16
+	1          arg2 = pf_19
+	1          arg2 = pf_15
+	1          arg2 = pf_2
+	2          arg2 = pf_17
+	2          arg2 = pf_3
+	2          arg2 = pf_25
+	2          arg2 = pf_30
+	2          arg2 = pf_13
+	2          arg2 = pf_18
+	2          arg2 = pf_7
+	2          arg2 = pf_29
+	2          arg2 = pf_26
+	3          arg2 = pf_8
+	3          arg2 = pf_21
+	3          arg2 = pf_14
+	4          arg2 = pf_6
+	4          arg2 = pf_23
+	5          arg2 = pf_24
+```
+
+Use latency.py to trace the operation latencies:
+```bash
+$ sudo python examples/usdt_sample/scripts/latency.py -p=25433 -f="pf_2"
+Attaching probes to pid 25433
+Tracing... Hit Ctrl-C to end.
+time(s)            id         input                            output                                 start (ns)         end (ns)    duration (us)
+0.000000000        7204       pf_28                            resp_pf_28                         11949439999644   11949489234565            49234
+0.100211886        7205       pf_28                            resp_pf_28                         11949540211530   11949574403064            34191
+0.300586675        7207       pf_21                            resp_pf_21                         11949740586319   11949742773571             2187
+0.400774366        7208       pf_28                            resp_pf_28                         11949840774010   11949859965498            19191
+0.701365719        7211       pf_21                            resp_pf_21                         11950141365363   11950152551131            11185
+0.901736620        7213       pf_25                            resp_pf_25                         11950341736264   11950347924333             6188
+1.102162217        7215       pf_21                            resp_pf_21                         11950542161861   11950567484183            25322
+1.302595998        7217       pf_23                            resp_pf_23                         11950742595642   11950761841242            19245
+1.503047601        7219       pf_2                             resp_pf_2                          11950943047245   11950951213474             8166
+1.703371457        7221       pf_27                            resp_pf_27                         11951143371101   11951176568051            33196
+2.104228899        7225       pf_24                            resp_pf_24                         11951544228543   11951588432769            44204
+2.304608175        7227       pf_21                            resp_pf_21                         11951744607819   11951790796068            46188
+2.404796703        7228       pf_21                            resp_pf_21                         11951844796347   11951877984160            33187
+2.605134923        7230       pf_27                            resp_pf_27                         11952045134567   11952065327660            20193
+3.206291642        7236       pf_29                            resp_pf_29                         11952646291286   11952660443343            14152
+3.506887492        7239       pf_21                            resp_pf_21                         11952946887136   11952995060987            48173
+```
+
+Use lat_dist.py to trace the latency distribution:
+```bash
+$ sudo python examples/usdt_sample/scripts/lat_dist.py -p=25433 -i=30 -f="pf_20"
+Attaching probes to pid 25433
+[11:23:47]
+
+Bucket ptr = 'pf_20'
+     latency (us)        : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 1        |**********                              |
+      2048 -> 4095       : 1        |**********                              |
+      4096 -> 8191       : 0        |                                        |
+      8192 -> 16383      : 1        |**********                              |
+     16384 -> 32767      : 4        |****************************************|
+     32768 -> 65535      : 3        |******************************          |
+```
+
+Use lat_avg.py to trace the moving average of the latencies:
+```bash
+$ sudo python examples/usdt_sample/scripts/lat_avg.py -p=25433 -i=5 -c=10 -f="pf_2"
+Attaching probes to pid 25433
+Tracing... Hit Ctrl-C to end.
+[11:28:32]
+input                                                               count     latency (us)
+pf_22                                                                   3             7807
+pf_23                                                                   4            36914
+pf_25                                                                   3            31473
+pf_28                                                                   2            10627
+pf_27                                                                   1            47174
+pf_29                                                                   1             8138
+pf_26                                                                   1            49121
+pf_20                                                                   2            29158
+```
diff --git a/examples/usdt_sample/usdt_sample_app1/CMakeLists.txt b/examples/usdt_sample/usdt_sample_app1/CMakeLists.txt
new file mode 100755
index 0000000..b447e21
--- /dev/null
+++ b/examples/usdt_sample/usdt_sample_app1/CMakeLists.txt
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.0)
+project(usdt_sample_app1)
+
+include_directories(
+    ${USDT_SAMPLE_LIB1_INCLUDE_DIR}
+)
+
+link_directories(
+    ${USDT_SAMPLE_LIB1_LINK_DIR}
+)
+
+add_executable( ${PROJECT_NAME}
+    ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+)
+
+target_link_libraries( ${PROJECT_NAME}
+    ${USDT_SAMPLE_LIB1_LIB}
+    pthread
+)
diff --git a/examples/usdt_sample/usdt_sample_app1/main.cpp b/examples/usdt_sample/usdt_sample_app1/main.cpp
new file mode 100644
index 0000000..c75c783
--- /dev/null
+++ b/examples/usdt_sample/usdt_sample_app1/main.cpp
@@ -0,0 +1,145 @@
+// std
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <sstream>
+#include <string>
+#include <thread>
+
+// gnu-c
+#include <sys/types.h>
+#include <unistd.h>
+
+// usdt_sample_lib1
+#include "usdt_sample_lib1/lib1.h"
+
+void print_usage(int argc, char** argv)
+{
+    std::cout << "Usage:" << std::endl;
+    std::cout << argv[0]
+              << " <InputPrefix> <InputMinimum (1-50)> <InputMaximum (1-50)> <CallsPerSec (1-50)> <MinimumLatencyMs (1-50)> <MaximumLatencyMs (1-50)>"
+              << std::endl;
+    std::cout << "InputPrefix: Prefix of the input string to the operation. Default: dummy" << std::endl;
+    std::cout << "InputMinimum: Minimum number to make the input string to the operation somewhat unique. Default: 1" << std::endl;
+    std::cout << "InputMaximum: Maximum number to make the input string to the operation somewhat unique. Default: 50" << std::endl;
+    std::cout << "CallsPerSec: Rate of calls to the operation. Default: 10" << std::endl;
+    std::cout << "MinimumLatencyMs: Minimum latency to apply to the operation. Default: 20" << std::endl;
+    std::cout << "MaximumLatencyMs: Maximum latency to apply to the operation. Default: 40" << std::endl;
+}
+
+int main(int argc, char** argv)
+{
+    std::string inputPrefix("dummy");
+    std::uint32_t inputMinimum = 1;
+    std::uint32_t inputMaximum = 50;
+    std::uint32_t callsPerSec = 10;
+    std::uint32_t minLatMs = 20;
+    std::uint32_t maxLatMs = 40;
+
+    try {
+        if (argc > 1) {
+            inputPrefix = argv[1];
+        }
+
+        if (argc > 2) {
+            inputMinimum = static_cast<std::uint32_t>(std::max(1, std::min(50, std::atoi(argv[2]))));
+        }
+
+        if (argc > 3) {
+            inputMaximum = static_cast<std::uint32_t>(std::max(1, std::min(50, std::atoi(argv[3]))));
+        }
+
+        if (argc > 4) {
+            callsPerSec = static_cast<std::uint32_t>(std::max(1, std::min(50, std::atoi(argv[4]))));
+        }
+
+        if (argc > 5) {
+            minLatMs = static_cast<std::uint32_t>(std::max(1, std::min(50, std::atoi(argv[5]))));
+        }
+
+        if (argc > 6) {
+            maxLatMs = static_cast<std::uint32_t>(std::max(1, std::min(50, std::atoi(argv[6]))));
+        }
+    }
+    catch (const std::exception& exc) {
+        std::cout << "Exception while reading arguments: " << exc.what() << std::endl;
+        print_usage(argc, argv);
+        return -1;
+    }
+    catch (...) {
+        std::cout << "Unknown exception while reading arguments." << std::endl;
+        print_usage(argc, argv);
+        return -1;
+    }
+
+    if (inputMinimum > inputMaximum) {
+        std::cout << "InputMinimum must be smaller than InputMaximum." << std::endl;
+        print_usage(argc, argv);
+        return -1;
+    }
+
+    if (minLatMs > maxLatMs) {
+        std::cout << "MinimumLatencyMs must be smaller than MaximumLatencyMs." << std::endl;
+        print_usage(argc, argv);
+        return -1;
+    }
+
+    std::cout << "Applying the following parameters:" << std::endl
+              << "Input prefix: " << inputPrefix << "." << std::endl
+              << "Input range: [" << inputMinimum << ", " << inputMaximum << "]." << std::endl
+              << "Calls Per Second: " << callsPerSec << "." << std::endl
+              << "Latency range: [" << minLatMs << ", " << maxLatMs << "] ms." << std::endl;
+
+    const int sleepTimeMs = 1000 / callsPerSec;
+    OperationProvider op(minLatMs, maxLatMs);
+
+    std::mutex queueMutex;
+    std::queue<std::shared_future<OperationResponse>> responseQueue;
+
+    auto dequeueFuture = std::async(std::launch::async, [&]() {
+        while (true) {
+            bool empty = false;
+            {
+                std::lock_guard<std::mutex> lg(queueMutex);
+                empty = responseQueue.empty();
+            }
+
+            if (empty) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(sleepTimeMs));
+                continue;
+            }
+
+            responseQueue.front().get();
+
+            // std::cout << "Removing item from queue." << std::endl;
+            std::lock_guard<std::mutex> lg(queueMutex);
+            responseQueue.pop();
+        }
+    });
+
+    std::random_device rd;
+    std::uniform_int_distribution<> dis(inputMinimum, inputMaximum);
+
+    std::cout << "You can now run the bcc scripts, see usdt_sample.md for examples." << std::endl;
+    std::cout << "pid: " << ::getpid() << std::endl;
+    std::cout << "Press ctrl-c to exit." << std::endl;
+    while (true) {
+        std::ostringstream inputOss;
+        inputOss << inputPrefix << "_" << dis(rd);
+        auto responseFuture = op.executeAsync(OperationRequest(inputOss.str()));
+
+        {
+            std::lock_guard<std::mutex> lg(queueMutex);
+            responseQueue.push(responseFuture);
+        }
+
+        // For a sample application, this is good enough to simulate callsPerSec.
+        std::this_thread::sleep_for(std::chrono::milliseconds(sleepTimeMs));
+    }
+
+    dequeueFuture.get();
+    return 0;
+}
diff --git a/examples/usdt_sample/usdt_sample_lib1/CMakeLists.txt b/examples/usdt_sample/usdt_sample_lib1/CMakeLists.txt
new file mode 100755
index 0000000..3f1c7b2
--- /dev/null
+++ b/examples/usdt_sample/usdt_sample_lib1/CMakeLists.txt
@@ -0,0 +1,45 @@
+cmake_minimum_required(VERSION 3.0)
+project(usdt_sample_lib1)
+
+# Define variables.
+set(USDT_SAMPLE_LIB1_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE STRING "USDT_SAMPLE_LIB1_INCLUDE_DIR" FORCE)
+set(USDT_SAMPLE_LIB1_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src CACHE STRING "USDT_SAMPLE_LIB1_SRC_DIR" FORCE)
+set(USDT_SAMPLE_LIB1_LINK_DIR ${CMAKE_CURRENT_BINARY_DIR} CACHE STRING "USDT_SAMPLE_LIB1_LINK_DIR" FORCE)
+set(USDT_SAMPLE_LIB1_LIB ${PROJECT_NAME} CACHE STRING "USDT_SAMPLE_LIB1_LIB" FORCE)
+set(USDT_SAMPLE_LIB1_GENERATED ${CMAKE_CURRENT_BINARY_DIR}/generated)
+
+## Start - N.B. Following section only relevant when using systemtap-sdt-devel.
+
+# Create usdt header file.
+# N.B. ${USDT_SAMPLE_LIB1_INCLUDE_DIR}/usdt_sample_lib1/lib1_sdt.h must be removed manually in order for it to be (re-)created.
+#  i.e. after making changes to lib1_sdt.d
+#add_custom_command(
+#    OUTPUT ${USDT_SAMPLE_LIB1_INCLUDE_DIR}/usdt_sample_lib1/lib1_sdt.h
+#    PRE_BUILD
+#    COMMAND dtrace -h -s ${USDT_SAMPLE_LIB1_SRC_DIR}/lib1_sdt.d -o ${USDT_SAMPLE_LIB1_INCLUDE_DIR}/usdt_sample_lib1/lib1_sdt.h
+#    COMMENT "Create usdt probes header file"
+#)
+
+# Create usdt object file.
+#file(MAKE_DIRECTORY ${USDT_SAMPLE_LIB1_GENERATED})
+#add_custom_command(
+#    OUTPUT ${USDT_SAMPLE_LIB1_GENERATED}/lib1_sdt.o
+#    PRE_BUILD
+#    COMMAND dtrace -G -s ${USDT_SAMPLE_LIB1_SRC_DIR}/lib1_sdt.d -o ${USDT_SAMPLE_LIB1_GENERATED}/lib1_sdt.o
+#    COMMENT "Create usdt probes object file"
+#)
+
+## End
+
+include_directories(
+    ${USDT_SAMPLE_LIB1_INCLUDE_DIR}
+    # For folly StaticTracepoint.h:
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../tests/python/include
+)
+
+add_library( ${PROJECT_NAME} SHARED
+## Only relevant when using systemtap-sdt-devel
+#    ${USDT_SAMPLE_LIB1_INCLUDE_DIR}/usdt_sample_lib1/lib1_sdt.h
+#    ${USDT_SAMPLE_LIB1_GENERATED}/lib1_sdt.o
+    ${USDT_SAMPLE_LIB1_SRC_DIR}/lib1.cpp
+)
diff --git a/examples/usdt_sample/usdt_sample_lib1/include/usdt_sample_lib1/lib1.h b/examples/usdt_sample/usdt_sample_lib1/include/usdt_sample_lib1/lib1.h
new file mode 100644
index 0000000..1a9d13e
--- /dev/null
+++ b/examples/usdt_sample/usdt_sample_lib1/include/usdt_sample_lib1/lib1.h
@@ -0,0 +1,58 @@
+#pragma once
+
+// std
+#include <cstdint>
+#include <future>
+#include <random>
+#include <string>
+
+/**
+ * @brief Contains the operation request data.
+ */
+class OperationRequest
+{
+public:
+    OperationRequest(const std::string& input);
+    const std::string& input() const { return _input; }
+
+private:
+    std::string _input;
+};
+
+/**
+ * @brief Contains the operation response data.
+ */
+class OperationResponse
+{
+public:
+    OperationResponse(const std::string& output);
+    const std::string& output() const { return _output; }
+
+private:
+    std::string _output;
+};
+
+/**
+ * @brief Provides the operation.
+ */
+class OperationProvider
+{
+public:
+    /**
+     * @brief Constructs an instance of OperationProvider.
+     * @param minLatencyMs The minimum latency to simulate for the operation.
+     * @param maxLatencyMs The maximum latency to simulate for the operation.
+     */
+    OperationProvider(std::uint32_t minLatencyMs, std::uint32_t maxLatencyMs);
+
+    /**
+     * @brief Asynchronously executes the operation.
+     * @param request The request input data for the operation.
+     * @return A shared_future of the response of the operation.
+     */
+    std::shared_future<OperationResponse> executeAsync(const OperationRequest& request);
+
+private:
+    std::mt19937 _gen;                    ///< Used to randomly determine an operation latency to simulate.
+    std::uniform_int_distribution<> _dis; ///< Used to randomly determine an operation latency to simulate.
+};
diff --git a/examples/usdt_sample/usdt_sample_lib1/include/usdt_sample_lib1/lib1_sdt.h b/examples/usdt_sample/usdt_sample_lib1/include/usdt_sample_lib1/lib1_sdt.h
new file mode 100644
index 0000000..6b8a51a
--- /dev/null
+++ b/examples/usdt_sample/usdt_sample_lib1/include/usdt_sample_lib1/lib1_sdt.h
@@ -0,0 +1,36 @@
+// N.B. This file is not used by this usdt_sample. Instead, the StaticTracepoint.h file from folly is used.
+//  It is here only for demonstration purposes.
+
+/* Generated by the Systemtap dtrace wrapper */
+
+
+#define _SDT_HAS_SEMAPHORES 1
+
+
+#define STAP_HAS_SEMAPHORES 1 /* deprecated */
+
+
+#include <sys/sdt.h>
+
+/* USDT_SAMPLE_LIB1_OPERATION_START ( uint64_t operation_id, const char * input ) */
+#if defined STAP_SDT_V1
+#define USDT_SAMPLE_LIB1_OPERATION_START_ENABLED() __builtin_expect (operation_start_semaphore, 0)
+#define usdt_sample_lib1_operation_start_semaphore operation_start_semaphore
+#else
+#define USDT_SAMPLE_LIB1_OPERATION_START_ENABLED() __builtin_expect (usdt_sample_lib1_operation_start_semaphore, 0)
+#endif
+__extension__ extern unsigned short usdt_sample_lib1_operation_start_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));
+#define USDT_SAMPLE_LIB1_OPERATION_START(arg1, arg2) \
+DTRACE_PROBE2 (usdt_sample_lib1, operation_start, arg1, arg2)
+
+/* USDT_SAMPLE_LIB1_OPERATION_END ( uint64_t operation_id, const char * output ) */
+#if defined STAP_SDT_V1
+#define USDT_SAMPLE_LIB1_OPERATION_END_ENABLED() __builtin_expect (operation_end_semaphore, 0)
+#define usdt_sample_lib1_operation_end_semaphore operation_end_semaphore
+#else
+#define USDT_SAMPLE_LIB1_OPERATION_END_ENABLED() __builtin_expect (usdt_sample_lib1_operation_end_semaphore, 0)
+#endif
+__extension__ extern unsigned short usdt_sample_lib1_operation_end_semaphore __attribute__ ((unused)) __attribute__ ((section (".probes")));
+#define USDT_SAMPLE_LIB1_OPERATION_END(arg1, arg2) \
+DTRACE_PROBE2 (usdt_sample_lib1, operation_end, arg1, arg2)
+
diff --git a/examples/usdt_sample/usdt_sample_lib1/src/lib1.cpp b/examples/usdt_sample/usdt_sample_lib1/src/lib1.cpp
new file mode 100644
index 0000000..f19a7ea
--- /dev/null
+++ b/examples/usdt_sample/usdt_sample_lib1/src/lib1.cpp
@@ -0,0 +1,64 @@
+#include "usdt_sample_lib1/lib1.h"
+
+// std
+#include <atomic>
+#include <chrono>
+#include <iostream>
+#include <thread>
+
+// usdt_sample_lib1
+#include "folly/tracing/StaticTracepoint.h"
+
+// When using systemtap-sdt-devel, the following file should be included:
+// #include "usdt_sample_lib1/lib1_sdt.h"
+
+OperationRequest::OperationRequest(const std::string& input_)
+    : _input(input_)
+{
+}
+
+OperationResponse::OperationResponse(const std::string& output_)
+    : _output(output_)
+{
+}
+
+OperationProvider::OperationProvider(std::uint32_t minLatencyMs_, std::uint32_t maxLatencyMs_)
+    : _gen(std::random_device()())
+    , _dis(minLatencyMs_, maxLatencyMs_)
+{
+}
+
+std::shared_future<OperationResponse> OperationProvider::executeAsync(const OperationRequest& request)
+{
+    static std::atomic<std::uint64_t> operationIdCounter(0);
+    std::uint64_t operationId = operationIdCounter++;
+
+    FOLLY_SDT(usdt_sample_lib1, operation_start, operationId, request.input().c_str());
+
+/* Below an example of how to use this sample with systemtap-sdt-devel:
+    if (USDT_SAMPLE_LIB1_OPERATION_START_ENABLED()) {
+        //std::cout << "operation_start probe enabled." << std::endl;
+        USDT_SAMPLE_LIB1_OPERATION_START(operationId, &inputBuf);
+    }
+*/
+
+    auto latencyMs = _dis(_gen);
+
+    return std::async(std::launch::async, [latencyMs, operationId, request]() {
+        std::this_thread::sleep_for(std::chrono::milliseconds(latencyMs));
+
+        auto output = std::string("resp_") + request.input();
+        OperationResponse response(output);
+
+        FOLLY_SDT(usdt_sample_lib1, operation_end, operationId, response.output().c_str());
+
+/* Below an example of how to use this sample with systemtap-sdt-devel:
+        if (USDT_SAMPLE_LIB1_OPERATION_END_ENABLED()) {
+            //std::cout << "operation_end probe enabled." << std::endl;
+            USDT_SAMPLE_LIB1_OPERATION_END(operationId, &outputBuf);
+        }
+*/
+
+        return response;
+    });
+}
diff --git a/examples/usdt_sample/usdt_sample_lib1/src/lib1_sdt.d b/examples/usdt_sample/usdt_sample_lib1/src/lib1_sdt.d
new file mode 100644
index 0000000..4f7129e
--- /dev/null
+++ b/examples/usdt_sample/usdt_sample_lib1/src/lib1_sdt.d
@@ -0,0 +1,7 @@
+# This file is only relevant when using systemtap-sdt-devel (see usdt_sample.md).
+#  This usdt_sample uses the StaticTracepoint.h header file (from folly) instead.
+provider usdt_sample_lib1
+{
+    probe operation_start(uint64_t operation_id, const char* input);
+    probe operation_end(uint64_t operation_id, const char* output);
+};
diff --git a/images/bcc_tracing_tools_2016.png b/images/bcc_tracing_tools_2016.png
new file mode 100644
index 0000000..73d0dd7
--- /dev/null
+++ b/images/bcc_tracing_tools_2016.png
Binary files differ
diff --git a/images/bcc_tracing_tools_2017.png b/images/bcc_tracing_tools_2017.png
new file mode 100644
index 0000000..7e66561
--- /dev/null
+++ b/images/bcc_tracing_tools_2017.png
Binary files differ
diff --git a/images/logo1.png b/images/logo1.png
new file mode 100644
index 0000000..47c5834
--- /dev/null
+++ b/images/logo1.png
Binary files differ
diff --git a/images/logo1.svg b/images/logo1.svg
new file mode 100644
index 0000000..ccfb0c2
--- /dev/null
+++ b/images/logo1.svg
@@ -0,0 +1,137 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="343.1369"
+   height="192.88474"
+   id="svg4001"
+   version="1.1"
+   inkscape:version="0.48.5 r10040"
+   sodipodi:docname="logo1.svg"
+   inkscape:export-filename="/home/suchakra/Projects/repos/bcc/images/logo1.png"
+   inkscape:export-xdpi="82.220001"
+   inkscape:export-ydpi="82.220001">
+  <defs
+     id="defs4003" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="1.4"
+     inkscape:cx="106.20637"
+     inkscape:cy="47.374369"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="1280"
+     inkscape:window-height="996"
+     inkscape:window-x="0"
+     inkscape:window-y="0"
+     inkscape:window-maximized="1"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0" />
+  <metadata
+     id="metadata4006">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-12.707259,-27.658902)">
+    <g
+       id="g4237"
+       transform="translate(-9.2594183,0)">
+      <g
+         style="font-size:159.42726135px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:end;line-height:0%;letter-spacing:-220.96620178px;word-spacing:0px;text-anchor:end;fill:#5a2a7a;fill-opacity:1;stroke:none;font-family:Open Sans;-inkscape-font-specification:Sans"
+         id="text3894-8">
+        <path
+           style="font-weight:bold;letter-spacing:0px;fill:#5a2a7a;font-family:Open Sans;-inkscape-font-specification:Open Sans Light"
+           d="m 20,20 0,121.125 18.125,0 2.5,-6.03125 c 0.8784,-1.42156 0.80734,-4.26454 3.125,-3.53125 7.792014,10.57538 22.717836,13.42947 34.75,9.46875 13.075885,-4.35147 20.19391,-18.12418 21.875,-31.0625 0.11735,-0.92644 0.24681,-1.84573 0.34375,-2.78125 l -24.5,0 c -0.672646,3.33799 -1.792798,6.56729 -3.53125,9.5625 -3.219708,5.87821 -10.565247,7.97697 -16.75,6.125 -7.207936,-1.44191 -10.751371,-8.81172 -11.5,-15.5 -0.938115,-8.75323 -1.423638,-17.920742 1.34375,-26.40625 1.900727,-6.370935 8.634375,-9.884146 15,-9.5 4.696235,-0.182402 8.393519,2.043213 11.03125,5.34375 l 26.6875,0 C 97.417905,73.363813 95.992292,70.00998 94.125,66.84375 88.994034,57.788433 78.819496,52.080562 68.40625,52.46875 63.182778,52.45477 57.879945,53.480878 53.3125,56.09375 48.771891,57.830316 46.386127,63.260648 42.8125,65.375 44.249967,50.285021 43.606677,35.131316 43.75,20 L 20,20 z"
+           transform="translate(1.9666778,7.658902)"
+           id="path4363"
+           inkscape:connector-curvature="0" />
+      </g>
+      <g
+         id="g4194"
+         transform="matrix(-1,0,0,-1,286.62081,200.49829)">
+        <path
+           style="color:#000000;fill:#4d4d4d;stroke:none"
+           d="m 140.21875,42.5 c -1.05652,0.0723 -2.08693,0.25622 -3.125,0.375 l -0.0625,0 -2.46875,13.53125 c -4.04737,0.92166 -7.85564,2.49945 -11.28125,4.625 L 112.125,53 c -3.01102,2.33761 -5.74899,5.0785 -8.15625,8.03125 l 7.71875,11.28125 c -0.97245,1.486128 -1.83447,3.048161 -2.59375,4.6875 l 32.96875,0 0.78125,0 1.09375,0 0,0.03125 c 7.89174,0.563622 14.15625,7.188282 14.15625,15.21875 0,8.39769 -6.85231,15.1875 -15.25,15.1875 -0.52486,0 -1.05151,-0.042 -1.5625,-0.0937 l -32.5625,0 c 0.85371,1.98646 1.886,3.90048 3.0625,5.6875 l -8.03125,11 c 2.29593,2.8503 4.95533,5.43121 7.8125,7.71875 l 11.34375,-7.78125 c 3.96715,2.53068 8.3874,4.28248 13.1875,5.15625 l 2.125,13.40625 c 1.51246,0.13757 3.07702,0.15625 4.625,0.15625 2.18534,0 4.26295,-0.0718 6.375,-0.34375 l 2.5625,-13.71875 c 4.55742,-1.13418 8.84452,-3.10528 12.5625,-5.75 L 175.25,130.8438 c 2.8333,-2.41051 5.4495,-5.17856 7.6875,-8.15625 l -7.96875,-11.5 c 2.15194,-3.71633 3.65055,-7.83338 4.40625,-12.21875 l 13.40625,-2.125 c 0.11799,-1.39922 0.15625,-2.75799 0.15625,-4.1875 0,-2.484124 -0.30647,-4.89404 -0.65625,-7.28125 l -13.625,-2.5 c -1.06744,-3.94162 -2.81323,-7.63094 -5.03125,-10.9375 l 8.03125,-10.96875 c -2.48498,-3.03894 -5.31509,-5.86655 -8.4375,-8.25 L 161.625,60.6563 c -3.32363,-1.96568 -6.89791,-3.45871 -10.78125,-4.3125 l -2.125,-13.46875 c -1.9371,-0.22788 -3.87704,-0.375 -5.875,-0.375 -0.54011,0 -1.0889,-0.0159 -1.625,0 -0.26124,0.008 -0.52086,-0.0159 -0.78125,0 -0.0703,0.005 -0.14832,-0.005 -0.21875,0 z"
+           transform="translate(1.9666778,7.658902)"
+           id="path3243-2-8"
+           inkscape:connector-curvature="0" />
+      </g>
+    </g>
+    <g
+       id="g4227"
+       transform="matrix(0.89637358,0,0,0.89637358,1.0019582,16.606833)">
+      <text
+         sodipodi:linespacing="0%"
+         id="text3963"
+         y="221.37347"
+         x="194.31525"
+         style="font-size:29.82613182px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:end;line-height:0%;letter-spacing:0px;word-spacing:0px;text-anchor:end;fill:#cccccc;fill-opacity:1;stroke:none;font-family:Open Sans;-inkscape-font-specification:Sans"
+         xml:space="preserve"><tspan
+           style="font-size:100.66319275px;font-weight:normal;fill:#cccccc;-inkscape-font-specification:Open Sans"
+           y="221.37347"
+           x="194.31525"
+           id="tspan3965"
+           sodipodi:role="line">:</tspan></text>
+      <text
+         sodipodi:linespacing="100%"
+         id="text3894-1-5"
+         y="227.51317"
+         x="3.4564569"
+         style="font-size:316.62283325px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:end;line-height:100%;letter-spacing:-438.83929443px;word-spacing:0px;text-anchor:end;fill:#800080;fill-opacity:1;stroke:none;font-family:Open Sans;-inkscape-font-specification:Sans"
+         xml:space="preserve"><tspan
+           style="font-size:95px;font-style:normal;font-variant:normal;font-weight:300;font-stretch:normal;text-align:start;line-height:100%;letter-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#5a2a7a;fill-opacity:1;font-family:Open Sans;-inkscape-font-specification:Open Sans Light"
+           y="227.51317"
+           x="3.4564569"
+           id="tspan3896-4-0"
+           sodipodi:role="line">BPF</tspan><tspan
+           id="tspan3942-7"
+           style="font-size:27.08731651px;font-style:normal;font-variant:normal;font-weight:300;font-stretch:normal;text-align:start;line-height:100%;letter-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#666666;font-family:Open Sans;-inkscape-font-specification:Open Sans Light"
+           y="322.51318"
+           x="3.4564569"
+           sodipodi:role="line" /></text>
+      <text
+         sodipodi:linespacing="100%"
+         id="text3894-1-7-4"
+         y="190.06371"
+         x="190.77078"
+         style="font-size:413.49078369px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:end;line-height:100%;letter-spacing:-573.0982666px;word-spacing:0px;text-anchor:end;fill:#800080;fill-opacity:1;stroke:none;font-family:Open Sans;-inkscape-font-specification:Sans"
+         xml:space="preserve"><tspan
+           id="tspan3940-6-7"
+           style="font-size:43.41408539px;font-style:normal;font-variant:normal;font-weight:300;font-stretch:normal;text-align:start;line-height:100%;letter-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#666666;font-family:Open Sans;-inkscape-font-specification:Open Sans Light"
+           y="190.06371"
+           x="190.77078"
+           sodipodi:role="line">COMPILER</tspan><tspan
+           id="tspan3942-8-3"
+           style="font-size:35.37443924px;font-style:normal;font-variant:normal;font-weight:300;font-stretch:normal;text-align:start;line-height:100%;letter-spacing:0px;writing-mode:lr-tb;text-anchor:start;fill:#666666;font-family:Open Sans;-inkscape-font-specification:Open Sans Light"
+           y="227.16772"
+           x="190.77078"
+           sodipodi:role="line">COLLECTION</tspan></text>
+    </g>
+  </g>
+</svg>
diff --git a/images/logo2.png b/images/logo2.png
new file mode 100644
index 0000000..7f2cf76
--- /dev/null
+++ b/images/logo2.png
Binary files differ
diff --git a/images/logo2.svg b/images/logo2.svg
new file mode 100644
index 0000000..8753977
--- /dev/null
+++ b/images/logo2.svg
@@ -0,0 +1,87 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+   width="158.93745"
+   height="122.73608"
+   id="svg4001"
+   version="1.1"
+   inkscape:version="0.48.5 r10040"
+   sodipodi:docname="logo2.svg"
+   inkscape:export-filename="/home/suchakra/Projects/repos/bcc/images/logo2.png"
+   inkscape:export-xdpi="108.39"
+   inkscape:export-ydpi="108.39">
+  <defs
+     id="defs4003" />
+  <sodipodi:namedview
+     id="base"
+     pagecolor="#ffffff"
+     bordercolor="#666666"
+     borderopacity="1.0"
+     inkscape:pageopacity="0.0"
+     inkscape:pageshadow="2"
+     inkscape:zoom="1.4"
+     inkscape:cx="117.2778"
+     inkscape:cy="105.94581"
+     inkscape:document-units="px"
+     inkscape:current-layer="layer1"
+     showgrid="false"
+     inkscape:window-width="1280"
+     inkscape:window-height="752"
+     inkscape:window-x="0"
+     inkscape:window-y="1044"
+     inkscape:window-maximized="1"
+     fit-margin-top="0"
+     fit-margin-left="0"
+     fit-margin-right="0"
+     fit-margin-bottom="0" />
+  <metadata
+     id="metadata4006">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <g
+     inkscape:label="Layer 1"
+     inkscape:groupmode="layer"
+     id="layer1"
+     transform="translate(-12.707259,-27.658902)">
+    <g
+       id="g4237"
+       transform="translate(-9.2594183,0)">
+      <g
+         style="font-size:159.42726135px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:end;line-height:0%;letter-spacing:-220.96620178px;word-spacing:0px;text-anchor:end;fill:#5a2a7a;fill-opacity:1;stroke:none;font-family:Open Sans;-inkscape-font-specification:Sans"
+         id="text3894-8">
+        <path
+           style="font-weight:bold;letter-spacing:0px;fill:#5a2a7a;font-family:Open Sans;-inkscape-font-specification:Open Sans Light"
+           d="m 20,20 0,121.125 18.125,0 2.5,-6.03125 c 0.8784,-1.42156 0.80734,-4.26454 3.125,-3.53125 7.792014,10.57538 22.717836,13.42947 34.75,9.46875 13.075885,-4.35147 20.19391,-18.12418 21.875,-31.0625 0.11735,-0.92644 0.24681,-1.84573 0.34375,-2.78125 l -24.5,0 c -0.672646,3.33799 -1.792798,6.56729 -3.53125,9.5625 -3.219708,5.87821 -10.565247,7.97697 -16.75,6.125 -7.207936,-1.44191 -10.751371,-8.81172 -11.5,-15.5 -0.938115,-8.75323 -1.423638,-17.920742 1.34375,-26.40625 1.900727,-6.370935 8.634375,-9.884146 15,-9.5 4.696235,-0.182402 8.393519,2.043213 11.03125,5.34375 l 26.6875,0 C 97.417905,73.363813 95.992292,70.00998 94.125,66.84375 88.994034,57.788433 78.819496,52.080562 68.40625,52.46875 63.182778,52.45477 57.879945,53.480878 53.3125,56.09375 48.771891,57.830316 46.386127,63.260648 42.8125,65.375 44.249967,50.285021 43.606677,35.131316 43.75,20 L 20,20 z"
+           transform="translate(1.9666778,7.658902)"
+           id="path4363"
+           inkscape:connector-curvature="0" />
+      </g>
+      <g
+         id="g4194"
+         transform="matrix(-1,0,0,-1,286.62081,200.49829)">
+        <path
+           style="color:#000000;fill:#4d4d4d;stroke:none"
+           d="m 140.21875,42.5 c -1.05652,0.0723 -2.08693,0.25622 -3.125,0.375 l -0.0625,0 -2.46875,13.53125 c -4.04737,0.92166 -7.85564,2.49945 -11.28125,4.625 L 112.125,53 c -3.01102,2.33761 -5.74899,5.0785 -8.15625,8.03125 l 7.71875,11.28125 c -0.97245,1.486128 -1.83447,3.048161 -2.59375,4.6875 l 32.96875,0 0.78125,0 1.09375,0 0,0.03125 c 7.89174,0.563622 14.15625,7.188282 14.15625,15.21875 0,8.39769 -6.85231,15.1875 -15.25,15.1875 -0.52486,0 -1.05151,-0.042 -1.5625,-0.0937 l -32.5625,0 c 0.85371,1.98646 1.886,3.90048 3.0625,5.6875 l -8.03125,11 c 2.29593,2.8503 4.95533,5.43121 7.8125,7.71875 l 11.34375,-7.78125 c 3.96715,2.53068 8.3874,4.28248 13.1875,5.15625 l 2.125,13.40625 c 1.51246,0.13757 3.07702,0.15625 4.625,0.15625 2.18534,0 4.26295,-0.0718 6.375,-0.34375 l 2.5625,-13.71875 c 4.55742,-1.13418 8.84452,-3.10528 12.5625,-5.75 L 175.25,130.8438 c 2.8333,-2.41051 5.4495,-5.17856 7.6875,-8.15625 l -7.96875,-11.5 c 2.15194,-3.71633 3.65055,-7.83338 4.40625,-12.21875 l 13.40625,-2.125 c 0.11799,-1.39922 0.15625,-2.75799 0.15625,-4.1875 0,-2.484124 -0.30647,-4.89404 -0.65625,-7.28125 l -13.625,-2.5 c -1.06744,-3.94162 -2.81323,-7.63094 -5.03125,-10.9375 l 8.03125,-10.96875 c -2.48498,-3.03894 -5.31509,-5.86655 -8.4375,-8.25 L 161.625,60.6563 c -3.32363,-1.96568 -6.89791,-3.45871 -10.78125,-4.3125 l -2.125,-13.46875 c -1.9371,-0.22788 -3.87704,-0.375 -5.875,-0.375 -0.54011,0 -1.0889,-0.0159 -1.625,0 -0.26124,0.008 -0.52086,-0.0159 -0.78125,0 -0.0703,0.005 -0.14832,-0.005 -0.21875,0 z"
+           transform="translate(1.9666778,7.658902)"
+           id="path3243-2-8"
+           inkscape:connector-curvature="0" />
+      </g>
+    </g>
+  </g>
+</svg>
diff --git a/introspection/CMakeLists.txt b/introspection/CMakeLists.txt
new file mode 100644
index 0000000..836bc0a
--- /dev/null
+++ b/introspection/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Copyright (c) Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+option(INSTALL_INTROSPECTION "Install BPF introspection tools" ON)
+
+add_executable(bps bps.c)
+# Scope the bcc include paths to the bps target instead of the whole directory.
+target_include_directories(bps PRIVATE ${CMAKE_SOURCE_DIR}/src/cc ${CMAKE_SOURCE_DIR}/src/cc/api)
+target_link_libraries(bps PRIVATE bpf-static)
+if(INSTALL_INTROSPECTION)
+  install(TARGETS bps DESTINATION share/bcc/introspection)
+endif()
diff --git a/introspection/bps.c b/introspection/bps.c
new file mode 100644
index 0000000..4993b8e
--- /dev/null
+++ b/introspection/bps.c
@@ -0,0 +1,328 @@
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <sysexits.h>
+
+#include "libbpf.h"
+
+// TODO: Remove this when CentOS 6 support is not needed anymore
+#ifndef CLOCK_BOOTTIME
+#define CLOCK_BOOTTIME 7
+#endif
+
+static const char * const prog_type_strings[] = {
+  [BPF_PROG_TYPE_UNSPEC] = "unspec",
+  [BPF_PROG_TYPE_SOCKET_FILTER] = "socket filter",
+  [BPF_PROG_TYPE_KPROBE] = "kprobe",
+  [BPF_PROG_TYPE_SCHED_CLS] = "sched cls",
+  [BPF_PROG_TYPE_SCHED_ACT] = "sched act",
+  [BPF_PROG_TYPE_TRACEPOINT] = "tracepoint",
+  [BPF_PROG_TYPE_XDP] = "xdp",
+  [BPF_PROG_TYPE_PERF_EVENT] = "perf event",
+  [BPF_PROG_TYPE_CGROUP_SKB] = "cgroup skb",
+  [BPF_PROG_TYPE_CGROUP_SOCK] = "cgroup sock",
+  [BPF_PROG_TYPE_LWT_IN] = "lwt in",
+  [BPF_PROG_TYPE_LWT_OUT] = "lwt out",
+  [BPF_PROG_TYPE_LWT_XMIT] = "lwt xmit",
+  [BPF_PROG_TYPE_SOCK_OPS] = "sock ops",
+  [BPF_PROG_TYPE_SK_SKB] = "sk skb",
+  [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device",
+  [BPF_PROG_TYPE_SK_MSG] = "sk_msg",
+  [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint",
+  [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
+  [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2",
+  [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport",
+  [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector",
+};
+
+static const char * const map_type_strings[] = {
+  [BPF_MAP_TYPE_UNSPEC] = "unspec",
+  [BPF_MAP_TYPE_HASH] = "hash",
+  [BPF_MAP_TYPE_ARRAY] = "array",
+  [BPF_MAP_TYPE_PROG_ARRAY] = "prog array",
+  [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf-ev array",
+  [BPF_MAP_TYPE_PERCPU_HASH] = "percpu hash",
+  [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu array",
+  [BPF_MAP_TYPE_STACK_TRACE] = "stack trace",
+  [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup array",
+  [BPF_MAP_TYPE_LRU_HASH] = "lru hash",
+  [BPF_MAP_TYPE_LRU_PERCPU_HASH] = "lru percpu hash",
+  [BPF_MAP_TYPE_LPM_TRIE] = "lpm trie",
+  [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array of maps",
+  [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash of maps",
+  [BPF_MAP_TYPE_DEVMAP] = "devmap",
+  [BPF_MAP_TYPE_SOCKMAP] = "sockmap",
+  [BPF_MAP_TYPE_CPUMAP] = "cpumap",
+  [BPF_MAP_TYPE_SOCKHASH] = "sockhash",
+  [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage",
+  [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray",
+  [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "precpu_cgroup_storage",
+  [BPF_MAP_TYPE_QUEUE] = "queue",
+  [BPF_MAP_TYPE_STACK] = "stack",
+};
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#define LAST_KNOWN_PROG_TYPE (ARRAY_SIZE(prog_type_strings) - 1)
+#define LAST_KNOWN_MAP_TYPE (ARRAY_SIZE(map_type_strings) - 1)
+#define min(x, y) ((x) < (y) ? (x) : (y))
+
+/* Cast a pointer to the u64 field representation used by the bpf(2) ABI. */
+static inline uint64_t ptr_to_u64(const void *ptr)
+{
+  return (uint64_t) (unsigned long) ptr;
+}
+
+/* Inverse of ptr_to_u64(): recover a pointer stored in a bpf(2) u64 field. */
+static inline void * u64_to_ptr(uint64_t ptr)
+{
+  return (void *) (unsigned long ) ptr;
+}
+
+/*
+ * Map an errno value from a BPF get-next-id/get-fd-by-id call to a
+ * process exit code.  ENOENT means the id iteration finished cleanly,
+ * so it is not an error.
+ */
+static int handle_get_next_errno(int eno)
+{
+  switch (eno) {
+    case ENOENT:
+      return 0;
+    case EINVAL:
+      fprintf(stderr, "Kernel does not support BPF introspection\n");
+      return EX_UNAVAILABLE;
+    case EPERM:
+      fprintf(stderr,
+              "Require CAP_SYS_ADMIN capability.  Please retry as root\n");
+      return EX_NOPERM;
+    default:
+      /* Report the errno we were handed, not the global errno, which may
+       * have been clobbered by an intervening libc call. */
+      fprintf(stderr, "%s\n", strerror(eno));
+      return 1;
+  }
+}
+
+/* Print the column header for the per-program listing. */
+static void print_prog_hdr(void)
+{
+  printf("%9s %-15s %8s %6s %-12s %-15s\n",
+         "BID", "TYPE", "UID", "#MAPS", "LoadTime", "NAME");
+}
+
+/*
+ * Print one row describing a loaded BPF program: id, type, owner uid,
+ * number of maps, load time and name.  A trailing '-' after the id marks
+ * a program with no JITed image.
+ */
+static void print_prog_info(const struct bpf_prog_info *prog_info)
+{
+  struct timespec real_time_ts, boot_time_ts;
+  time_t wallclock_load_time = 0;
+  char unknown_prog_type[16];
+  const char *prog_type;
+  char load_time[16];
+  struct tm load_tm;
+
+  /* Types newer than this build's table are shown as "<N>". */
+  if (prog_info->type > LAST_KNOWN_PROG_TYPE) {
+    snprintf(unknown_prog_type, sizeof(unknown_prog_type), "<%u>",
+             prog_info->type);
+    unknown_prog_type[sizeof(unknown_prog_type) - 1] = '\0';
+    prog_type = unknown_prog_type;
+  } else {
+    prog_type = prog_type_strings[prog_info->type];
+  }
+
+  /* load_time is nanoseconds since boot; convert it to wall-clock time
+   * by adding the (realtime - boottime) offset sampled right now. */
+  if (!clock_gettime(CLOCK_REALTIME, &real_time_ts) &&
+      !clock_gettime(CLOCK_BOOTTIME, &boot_time_ts) &&
+      real_time_ts.tv_sec >= boot_time_ts.tv_sec)
+    wallclock_load_time =
+      (real_time_ts.tv_sec - boot_time_ts.tv_sec) +
+      prog_info->load_time / 1000000000;
+
+  /* Fall back to the raw boot-relative seconds ("<N>") if the wall-clock
+   * conversion or localtime_r() failed. */
+  if (wallclock_load_time && localtime_r(&wallclock_load_time, &load_tm))
+    strftime(load_time, sizeof(load_time), "%b%d/%H:%M", &load_tm);
+  else
+    snprintf(load_time, sizeof(load_time), "<%llu>",
+             prog_info->load_time / 1000000000);
+  load_time[sizeof(load_time) - 1] = '\0';
+
+  if (prog_info->jited_prog_len)
+    printf("%9u %-15s %8u %6u %-12s %-15s\n",
+           prog_info->id, prog_type, prog_info->created_by_uid,
+           prog_info->nr_map_ids, load_time, prog_info->name);
+  else
+    printf("%8u- %-15s %8u %6u %-12s %-15s\n",
+           prog_info->id, prog_type, prog_info->created_by_uid,
+           prog_info->nr_map_ids, load_time, prog_info->name);
+}
+
+/* Print the column header for the per-map listing. */
+static void print_map_hdr(void)
+{
+  printf("%8s %-15s %-10s %8s %8s %8s %-15s\n",
+         "MID", "TYPE", "FLAGS", "KeySz", "ValueSz", "MaxEnts",
+         "NAME");
+}
+
+/*
+ * Print one row describing a BPF map: id, type, flags, key/value sizes,
+ * max entries and name.  Types newer than this build's table show as "<N>".
+ */
+static void print_map_info(const struct bpf_map_info *map_info)
+{
+  char unknown_map_type[16];
+  const char *map_type;
+
+  if (map_info->type > LAST_KNOWN_MAP_TYPE) {
+    snprintf(unknown_map_type, sizeof(unknown_map_type),
+             "<%u>", map_info->type);
+    unknown_map_type[sizeof(unknown_map_type) - 1] = '\0';
+    map_type = unknown_map_type;
+  } else {
+    map_type = map_type_strings[map_info->type];
+  }
+
+  printf("%8u %-15s 0x%-8x %8u %8u %8u %-15s\n",
+         map_info->id, map_type, map_info->map_flags, map_info->key_size,
+         map_info->value_size, map_info->max_entries,
+         map_info->name);
+}
+
+/*
+ * Print the details of a single BPF program (by id), followed by the info
+ * of every map the program uses.  Returns 0 on success, a non-zero exit
+ * code otherwise.
+ */
+static int print_one_prog(uint32_t prog_id)
+{
+  const uint32_t usual_nr_map_ids = 64;
+  uint32_t nr_map_ids = usual_nr_map_ids;
+  struct bpf_prog_info prog_info;
+  uint32_t *map_ids = NULL;
+  uint32_t info_len;
+  int ret = 0;
+  int prog_fd;
+  uint32_t i;
+
+  prog_fd = bpf_prog_get_fd_by_id(prog_id);
+  if (prog_fd == -1) {
+    if (errno == ENOENT) {
+      fprintf(stderr, "BID:%u not found\n", prog_id);
+      return EX_DATAERR;
+    } else {
+      return handle_get_next_errno(errno);
+    }
+  }
+
+  /* Retry at most one time for larger map_ids array */
+  for (i = 0; i < 2; i++) {
+    bzero(&prog_info, sizeof(prog_info));
+    prog_info.map_ids = ptr_to_u64(realloc(map_ids,
+                                           nr_map_ids * sizeof(*map_ids)));
+    if (!prog_info.map_ids) {
+      fprintf(stderr,
+              "Cannot allocate memory for %u map_ids for BID:%u\n",
+              nr_map_ids, prog_id);
+      close(prog_fd);
+      free(map_ids);
+      return 1;
+    }
+
+    map_ids = u64_to_ptr(prog_info.map_ids);
+    prog_info.nr_map_ids = nr_map_ids;
+    info_len = sizeof(prog_info);
+    ret = bpf_obj_get_info(prog_fd, &prog_info, &info_len);
+    if (ret) {
+      fprintf(stderr, "Cannot get info for BID:%u. %s(%d)\n",
+              prog_id, strerror(errno), errno);
+      close(prog_fd);
+      free(map_ids);
+      return ret;
+    }
+
+    if (prog_info.nr_map_ids <= nr_map_ids)
+      break;
+
+    nr_map_ids = prog_info.nr_map_ids;
+  }
+  close(prog_fd);
+
+  print_prog_hdr();
+  print_prog_info(&prog_info);
+  printf("\n");
+
+  /* Print all map_info used by the prog */
+  print_map_hdr();
+  nr_map_ids = min(prog_info.nr_map_ids, nr_map_ids);
+  for (i = 0; i < nr_map_ids; i++) {
+    struct bpf_map_info map_info = {};
+    info_len = sizeof(map_info);
+    int map_fd;
+
+    map_fd = bpf_map_get_fd_by_id(map_ids[i]);
+    if (map_fd == -1) {
+      /* The map may have been removed between the prog query and now;
+       * errno is positive, so compare against ENOENT (not -ENOENT). */
+      if (errno == ENOENT)
+        continue;
+
+      fprintf(stderr,
+              "Cannot get fd for map:%u. %s(%d)\n",
+              map_ids[i], strerror(errno), errno);
+      ret = map_fd;
+      break;
+    }
+
+    ret = bpf_obj_get_info(map_fd, &map_info, &info_len);
+    close(map_fd);
+    if (ret) {
+      fprintf(stderr, "Cannot get info for map:%u. %s(%d)\n",
+              map_ids[i], strerror(errno), errno);
+      break;
+    }
+
+    print_map_info(&map_info);
+  }
+
+  free(map_ids);
+  return ret;
+}
+
+/*
+ * Iterate over every BPF program id loaded in the kernel and print one
+ * row per program.  Returns 0 on a clean end of iteration (ENOENT),
+ * otherwise an exit code derived from the failing errno.
+ */
+int print_all_progs(void)
+{
+  uint32_t next_id = 0;
+
+  print_prog_hdr();
+
+  while (!bpf_prog_get_next_id(next_id, &next_id)) {
+    struct bpf_prog_info prog_info = {};
+    uint32_t prog_info_len = sizeof(prog_info);
+    int prog_fd;
+    int ret;
+
+    prog_fd = bpf_prog_get_fd_by_id(next_id);
+    if (prog_fd < 0) {
+      /* The program may have been unloaded since get_next_id. */
+      if (errno == ENOENT)
+        continue;
+      fprintf(stderr,
+              "Cannot get fd for BID:%u. %s(%d)\n",
+              next_id, strerror(errno), errno);
+      return 1;
+    }
+
+    ret = bpf_obj_get_info(prog_fd, &prog_info, &prog_info_len);
+    close(prog_fd);
+    if (ret) {
+      fprintf(stderr,
+              "Cannot get bpf_prog_info for BID:%u. %s(%d)\n",
+              next_id, strerror(errno), errno);
+      return ret;
+    }
+
+    print_prog_info(&prog_info);
+  }
+
+  /* errno left by the failed bpf_prog_get_next_id() call. */
+  return handle_get_next_errno(errno);
+}
+
+/* Print command-line help for bps. */
+void usage(void)
+{
+  printf("BPF Program Snapshot (bps):\n"
+         "List of all BPF programs loaded into the system.\n\n");
+  printf("Usage: bps [bpf-prog-id]\n");
+  printf("    [bpf-prog-id] If specified, it shows the details info of the bpf-prog\n");
+  printf("\n");
+}
+
+/*
+ * Entry point: with no argument, list all BPF programs; with a decimal
+ * program id, show that program and its maps.
+ */
+int main(int argc, char **argv)
+{
+  if (argc > 1) {
+    char *endptr;
+    unsigned long prog_id;
+
+    /* Validate the whole argument as a decimal id; the previous
+     * isdigit() check only looked at the first character, so junk
+     * such as "12abc" was silently accepted as 12. */
+    errno = 0;
+    prog_id = strtoul(argv[1], &endptr, 10);
+    if (endptr == argv[1] || *endptr != '\0' || errno || prog_id > UINT32_MAX) {
+      usage();
+      return EX_USAGE;
+    }
+    return print_one_prog((uint32_t)prog_id);
+  }
+
+  return print_all_progs();
+}
diff --git a/introspection/bps_example.txt b/introspection/bps_example.txt
new file mode 100644
index 0000000..6bbb08f
--- /dev/null
+++ b/introspection/bps_example.txt
@@ -0,0 +1,22 @@
+* List all BPF programs *
+# bps
+   BID TYPE                 UID  #MAPS LoadTime     NAME
+    82 kprobe                 0      1 Oct19/23:52  map_perf_test
+    83 kprobe                 0      1 Oct19/23:52  map_perf_test
+    84 kprobe                 0      1 Oct19/23:52  map_perf_test
+    85 kprobe                 0      1 Oct19/23:52  map_perf_test
+    86 kprobe                 0      4 Oct19/23:52  map_perf_test
+    87 kprobe                 0      1 Oct19/23:52  map_perf_test
+    88 kprobe                 0      1 Oct19/23:52  map_perf_test
+    89 kprobe                 0      1 Oct19/23:52  map_perf_test
+
+* List a particular BPF program and its maps *
+# bps 86
+   BID TYPE                 UID  #MAPS LoadTime     NAME
+    86 kprobe                 0      4 Oct19/23:52  map_perf_test
+
+MID TYPE            FLAGS         KeySz  ValueSz  MaxEnts NAME
+120 lru hash        0x0               4        8    10000 lru_hash_map
+129 lru hash        0x0               4        8       43 lru_hash_lookup
+123 array of maps   0x0               4        4     1024 array_of_lru_ha
+121 lru hash        0x2               4
diff --git a/man/CMakeLists.txt b/man/CMakeLists.txt
new file mode 100644
index 0000000..1b350a3
--- /dev/null
+++ b/man/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(man8)
diff --git a/man/man8/CMakeLists.txt b/man/man8/CMakeLists.txt
new file mode 100644
index 0000000..718c700
--- /dev/null
+++ b/man/man8/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Compress every man page (*.8) with gzip and install the .gz files.
+find_program(GZIP gzip)
+if(NOT GZIP)
+  message(FATAL_ERROR "gzip is required to build the bcc man pages")
+endif()
+file(GLOB FILES *.8)
+set(GZFILES "")
+foreach(FIL ${FILES})
+  get_filename_component(NAME ${FIL} NAME)
+  # NOTE(review): the '>' redirection relies on a shell-based generator
+  # (Unix Makefiles); it is not portable to generators that invoke the
+  # command directly.
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz
+    COMMAND ${GZIP} -c ${FIL} > ${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz
+    DEPENDS ${FIL})
+  list(APPEND GZFILES "${CMAKE_CURRENT_BINARY_DIR}/${NAME}.gz")
+endforeach()
+add_custom_target(man ALL DEPENDS ${GZFILES})
+install(FILES ${GZFILES} DESTINATION share/bcc/man/man8)
diff --git a/man/man8/argdist.8 b/man/man8/argdist.8
new file mode 100644
index 0000000..4116cd4
--- /dev/null
+++ b/man/man8/argdist.8
@@ -0,0 +1,194 @@
+.TH argdist 8  "2016-02-11" "USER COMMANDS"
+.SH NAME
+argdist \- Trace a function and display a histogram or frequency count of its parameter values. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-d DURATION] [-n COUNT] [-v] [-T TOP] [-H specifier] [-C specifier] [-I header]
+.SH DESCRIPTION
+argdist attaches to function entry and exit points, collects specified parameter
+values, and stores them in a histogram or a frequency collection that counts
+the number of times a parameter value occurred. It can also filter parameter
+values and instrument multiple entry points at once.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace only functions in the process PID.
+.TP
+\-z STRING_SIZE
+When collecting string arguments (of type char*), collect up to STRING_SIZE 
+characters. Longer strings will be truncated.
+.TP
+\-i INTERVAL
+Print the collected data every INTERVAL seconds. The default is 1 second.
+.TP
+\-d DURATION
+Total duration of trace in seconds.
+.TP
+\-n COUNT
+Print the collected data COUNT times and then exit.
+.TP
+\-v
+Display the generated BPF program, for debugging purposes.
+.TP
+\-T TOP
+When collecting frequency counts, display only the top TOP entries.
+.TP
+\-H specifiers, \-C specifiers
+One or more probe specifications that instruct argdist which functions to
+probe, which parameters to collect, how to aggregate them, and whether to perform
+any filtering. See SPECIFIER SYNTAX below.
+.TP
+\-I header
+One or more header files that should be included in the BPF program. This 
+enables the use of structure definitions, enumerations, and constants that
+are available in these headers. You should provide the same path you would
+include in the BPF program, e.g. 'linux/blkdev.h' or 'linux/time.h'. Note: in
+many cases, argdist will deduce the necessary header files automatically. 
+.SH SPECIFIER SYNTAX
+The general specifier syntax is as follows:
+
+.B {p,r,t,u}:{[library],category}:function(signature)[:type[,type...]:expr[,expr...][:filter]][#label]
+.TP
+.B {p,r,t,u}
+Probe type \- "p" for function entry, "r" for function return, "t" for kernel
+tracepoint, "u" for USDT probe; \-H for histogram collection, \-C for frequency count.
+Indicates where to place the probe and whether the probe should collect frequency
+count information, or aggregate the collected values into a histogram. Counting 
+probes will collect the number of times every parameter value was observed,
+whereas histogram probes will collect the parameter values into a histogram.
+Only integral types can be used with histogram probes; there is no such limitation
+for counting probes.
+.TP
+.B [library]
+Library containing the probe.
+Specify the full path to the .so or executable file where the function to probe
+resides. Alternatively, you can specify just the lib name: for example, "c"
+refers to libc. If no library name is specified, the kernel is assumed.
+.TP
+.B category
+The category of the kernel tracepoint. For example: net, sched, block.
+.TP
+.B function(signature)
+The function to probe, and its signature.
+The function name must match exactly for the probe to be placed. The signature,
+on the other hand, is only required if you plan to collect parameter values 
+based on that signature. For example, if you only want to collect the first
+parameter, you don't have to specify the rest of the parameters in the signature.
+When capturing kernel tracepoints, this should be the name of the event, e.g.
+net_dev_start_xmit. The signature for kernel tracepoints should be empty. When
+capturing USDT probes, this should be the name of the probe, e.g. reloc_complete.
+The signature for USDT probes should be empty.
+.TP
+.B [type[,type...]]
+The type(s) of the expression(s) to capture.
+This is the type of the keys in the histogram or raw event collection that are
+collected by the probes.
+.TP
+.B [expr[,expr...]]
+The expression(s) to capture.
+These are the values that are assigned to the histogram or raw event collection.
+You may use the parameters directly, or valid C expressions that involve the
+parameters, such as "size % 10".
+Tracepoints may access a special structure called "args" that is formatted
+according to the tracepoint format (which you can obtain using tplist).
+For example, the block:block_rq_complete tracepoint can access args->nr_sector.
+USDT probes may access the arguments defined by the tracing program in the 
+special arg1, arg2, ... variables. To obtain their types, use the tplist tool.
+Return probes can use the argument values received by the
+function when it was entered, through the $entry(paramname) special variable.
+Return probes can also access the function's return value in $retval, and the
+function's execution time in nanoseconds in $latency. Note that adding the
+$latency or $entry(paramname) variables to the expression will introduce an
+additional probe at the function's entry to collect this data, and therefore
+introduce additional overhead.
+.TP
+.B [filter]
+The filter applied to the captured data.
+Only parameter values that pass the filter will be collected. This is any valid
+C expression that refers to the parameter values, such as "fd == 1 && length > 16".
+The $entry, $retval, and $latency variables can be used here as well, in return
+probes.
+The filter expression may also use the STRCMP pseudo-function to compare
+a predefined string to a string argument. For example: STRCMP("test.txt", file).
+The order of arguments is important: the first argument MUST be a quoted
+literal string, and the second argument can be a runtime string.
+.TP
+.B [label]
+The label that will be displayed when printing the probed values. By default,
+this is the probe specifier. 
+.SH EXAMPLES
+.TP
+Print a histogram of allocation sizes passed to kmalloc:
+#
+.B argdist -H 'p::__kmalloc(u64 size):u64:size'
+.TP
+Print a count of how many times process 1005 called malloc with an allocation size of 16 bytes:
+#
+.B argdist -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16'
+.TP
+Snoop on all strings returned by gets():
+#
+.B argdist -C 'r:c:gets():char*:$retval'
+.TP
+Print a histogram of read sizes that were longer than 1ms:
+#
+.B argdist -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000'
+.TP
+Print frequency counts of how many times writes were issued to a particular file descriptor number, in process 1005:
+#
+.B argdist -p 1005 -C 'p:c:write(int fd):int:fd'
+.TP
+Print a histogram of error codes returned by read() in process 1005:
+#
+.B argdist -p 1005 -H 'r:c:read()'
+.TP
+Print a histogram of buffer sizes passed to write() across all processes, where the file descriptor was 1 (STDOUT):
+#
+.B argdist -H 'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1'
+.TP
+Count fork() calls in libc across all processes, grouped by pid:
+#
+.B argdist -C 'p:c:fork():int:$PID;fork per process'
+.TP
+Print histogram of number of sectors in completing block I/O requests:
+#
+.B argdist -H 't:block:block_rq_complete():u32:nr_sector'
+.TP
+Aggregate interrupts by interrupt request (IRQ):
+#
+.B argdist -C 't:irq:irq_handler_entry():int:irq'
+.TP
+Print the functions used as thread entry points and how common they are:
+#
+.B argdist -C 'u:pthread:pthread_start():u64:arg2' -p 1337
+.TP
+Print histograms of sleep() and nanosleep() parameter values:
+#
+.B argdist -H 'p:c:sleep(u32 seconds):u32:seconds' -H 'p:c:nanosleep(struct timespec *req):long:req->tv_nsec'
+.TP
+Spy on writes to STDOUT performed by process 2780, up to a string size of 120 characters:
+#
+.B argdist -p 2780 -z 120 -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1'
+.TP
+Group files being read from and the read sizes from __vfs_read:
+#
+.B argdist -C 'p::__vfs_read(struct file *file, void *buf, size_t count):char*,size_t:file->f_path.dentry->d_iname,count:file->f_path.dentry->d_iname[0]!=0'
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
diff --git a/man/man8/bashreadline.8 b/man/man8/bashreadline.8
new file mode 100644
index 0000000..a70fc58
--- /dev/null
+++ b/man/man8/bashreadline.8
@@ -0,0 +1,52 @@
+.TH bashreadline 8  "2016-01-28" "USER COMMANDS"
+.SH NAME
+bashreadline \- Print entered bash commands system wide. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B bashreadline
+.SH DESCRIPTION
+bashreadline traces the return of the readline() function using uprobes, to
+show the bash commands that were entered interactively, system wide. The
+entered command may fail: this is just showing what was entered.
+
+This program is also a basic example of eBPF/bcc and uprobes.
+
+This makes use of a Linux 4.5 feature (bpf_perf_event_output()); 
+for kernels older than 4.5, see the version under tools/old,
+which uses an older mechanism.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Trace bash commands system wide:
+#
+.B bashreadline
+.SH FIELDS
+.TP
+TIME
+Time of the command (HH:MM:SS).
+.TP
+PID
+Process ID of the bash shell.
+.TP
+COMMAND
+Entered command.
+.SH OVERHEAD
+As the rate of interactive bash commands is expected to be very low (<<100/s),
+the overhead of this program is expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+opensnoop(8)
diff --git a/man/man8/biolatency.8 b/man/man8/biolatency.8
new file mode 100644
index 0000000..84d261e
--- /dev/null
+++ b/man/man8/biolatency.8
@@ -0,0 +1,96 @@
+.TH biolatency 8  "2015-08-20" "USER COMMANDS"
+.SH NAME
+biolatency \- Summarize block device I/O latency as a histogram.
+.SH SYNOPSIS
+.B biolatency [\-h] [\-T] [\-Q] [\-m] [\-D] [interval [count]]
+.SH DESCRIPTION
+biolatency traces block device I/O (disk I/O), and records the distribution
+of I/O latency (time). This is printed as a histogram either on Ctrl-C, or
+after a given interval in seconds.
+
+The latency of the disk I/O is measured from the issue to the device to its
+completion. A \-Q option can be used to include time queued in the kernel.
+
+This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
+for efficiency.
+
+This works by tracing various kernel blk_*() functions using dynamic tracing,
+and will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+\-h
+Print usage message.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-m
+Output histogram in milliseconds.
+.TP
+\-D
+Print a histogram per disk device.
+.TP
+interval
+Output interval, in seconds.
+.TP
+count
+Number of outputs.
+.SH EXAMPLES
+.TP
+Summarize block device I/O latency as a histogram:
+#
+.B biolatency
+.TP
+Print 1 second summaries, 10 times:
+#
+.B biolatency 1 10
+.TP
+Print 1 second summaries, using milliseconds as units for the histogram, and
+include timestamps on output:
+#
+.B biolatency \-mT 1
+.TP
+Include OS queued time in I/O time:
+#
+.B biolatency \-Q
+.TP
+Show a latency histogram for each disk device separately:
+#
+.B biolatency \-D
+.SH FIELDS
+.TP
+usecs
+Microsecond range
+.TP
+msecs
+Millisecond range
+.TP
+count
+How many I/O fell into this range
+.TP
+distribution
+An ASCII bar chart to visualize the distribution (count column)
+.SH OVERHEAD
+This traces kernel functions and maintains in-kernel timestamps and a histogram,
+which are asynchronously copied to user-space. This method is very efficient,
+and the overhead for most storage I/O rates (< 10k IOPS) should be negligible.
+If you have a higher IOPS storage environment, test and quantify the overhead
+before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8)
diff --git a/man/man8/biosnoop.8 b/man/man8/biosnoop.8
new file mode 100644
index 0000000..e5dbeaa
--- /dev/null
+++ b/man/man8/biosnoop.8
@@ -0,0 +1,76 @@
+.TH biosnoop 8  "2015-09-16" "USER COMMANDS"
+.SH NAME
+biosnoop \- Trace block device I/O and print details incl. issuing PID.
+.SH SYNOPSIS
+.B biosnoop
+.SH DESCRIPTION
+This tool traces block device I/O (disk I/O), and prints a one-line summary
+for each I/O showing various details. These include the latency from the time of
+issue to the device to its completion, and the PID and process name from when
+the I/O was first created (which usually identifies the responsible process).
+
+This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
+request, as well as a starting timestamp for calculating I/O latency.
+
+This works by tracing various kernel blk_*() functions using dynamic tracing,
+and will need updating to match any changes to these functions.
+
+This makes use of a Linux 4.5 feature (bpf_perf_event_output());
+for kernels older than 4.5, see the version under tools/old,
+which uses an older mechanism.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Trace all block device I/O and print a summary line per I/O:
+#
+.B biosnoop
+.SH FIELDS
+.TP
+TIME(s)
+Time of the I/O, in seconds since the first I/O was seen.
+.TP
+COMM
+Cached process name, if present. This usually (but is not guaranteed to) identifies
+the responsible process for the I/O.
+.TP
+PID
+Cached process ID, if present. This usually (but is not guaranteed to) identifies
+the responsible process for the I/O.
+.TP
+DISK
+Disk device name.
+.TP
+T
+Type of I/O: R = read, W = write. This is a simplification.
+.TP
+SECTOR
+Device sector for the I/O.
+.TP
+BYTES
+Size of the I/O, in bytes.
+.TP
+LAT(ms)
+Time for the I/O (latency) from the issue to the device, to its completion,
+in milliseconds.
+.SH OVERHEAD
+Since block device I/O usually has a relatively low frequency (< 10,000/s),
+the overhead for this tool is expected to be negligible. For high IOPS storage
+systems, test and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+disksnoop(8), iostat(1)
diff --git a/man/man8/biotop.8 b/man/man8/biotop.8
new file mode 100644
index 0000000..8b872aa
--- /dev/null
+++ b/man/man8/biotop.8
@@ -0,0 +1,108 @@
+.TH biotop 8  "2016-02-06" "USER COMMANDS"
+.SH NAME
+biotop \- Block device (disk) I/O by process top.
+.SH SYNOPSIS
+.B biotop [\-h] [\-C] [\-r MAXROWS] [interval] [count]
+.SH DESCRIPTION
+This is top for disks. 
+
+This traces block device I/O (disk I/O), and prints a per-process summary every
+interval (by default, 1 second). The summary is sorted on the top disk
+consumers by throughput (Kbytes). The PID and process name shown are measured
+from when the I/O was first created, which usually identifies the responsible
+process.
+
+For efficiency, this uses in-kernel eBPF maps to cache process details (PID and
+comm) by I/O request, as well as a starting timestamp for calculating I/O
+latency, and the final summary.
+
+This works by tracing various kernel blk_*() functions using dynamic tracing,
+and will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-C
+Don't clear the screen.
+.TP
+\-r MAXROWS
+Maximum number of rows to print. Default is 20.
+.TP
+\-p PID
+Trace this PID only.
+.TP
+interval
+Interval between updates, seconds.
+.TP
+count
+Number of interval summaries.
+.SH EXAMPLES
+.TP
+Summarize block device I/O by process, 1 second screen refresh:
+#
+.B biotop
+.TP
+Don't clear the screen:
+#
+.B biotop -C
+.TP
+5 second summaries, 10 times only:
+#
+.B biotop 5 10
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+PID
+Cached process ID, if present. This usually (but is not guaranteed to) identifies
+the responsible process for the I/O.
+.TP
+COMM
+Cached process name, if present. This usually (but is not guaranteed to) identifies
+the responsible process for the I/O.
+.TP
+D
+Direction: R == read, W == write. This is a simplification.
+.TP
+MAJ
+Major device number.
+.TP
+MIN
+Minor device number.
+.TP
+DISK
+Disk device name.
+.TP
+I/O
+Number of I/O during the interval.
+.TP
+Kbytes
+Total Kbytes for these I/O, during the interval.
+.TP
+AVGms
+Average time for the I/O (latency) from the issue to the device, to its
+completion, in milliseconds.
+.SH OVERHEAD
+Since block device I/O usually has a relatively low frequency (< 10,000/s),
+the overhead for this tool is expected to be low or negligible. For high IOPS
+storage systems, test and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH INSPIRATION
+top(1) by William LeFebvre
+.SH SEE ALSO
+biosnoop(8), biolatency(8), iostat(1)
diff --git a/man/man8/bitesize.8 b/man/man8/bitesize.8
new file mode 100644
index 0000000..07046e8
--- /dev/null
+++ b/man/man8/bitesize.8
@@ -0,0 +1,52 @@
+.TH bitesize 8  "2016-02-05" "USER COMMANDS"
+.SH NAME
+bitesize \- Summarize block device I/O size as a histogram \- Linux eBPF/bcc.
+.SH SYNOPSIS
+.B bitesize
+.SH DESCRIPTION
+Show I/O distribution for requested block sizes, by process name.
+
+This works by tracing block I/O kernel functions using dynamic
+tracing and prints a histogram of I/O size.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Count I/O size per process until Ctrl-C is hit:
+#
+.B bitesize
+.SH FIELDS
+.TP
+Kbytes
+Size in kilobytes of range
+.TP
+count
+How many I/O fell into this range
+.TP
+distribution
+An ASCII bar chart to visualize the distribution (count column)
+
+.SH OVERHEAD
+This traces kernel block I/O functions to update a histogram, which is
+asynchronously copied to user-space. This method is very efficient, and 
+the overhead for most storage I/O rates (< 10k IOPS) should be negligible. 
+If you have a higher IOPS storage environment, test and quantify the overhead 
+before use.
+
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Allan McAleavy
+.SH SEE ALSO
+https://github.com/brendangregg/systemtap-lwtools/blob/master/disk/bitesize-nd.stp
diff --git a/man/man8/bpflist.8 b/man/man8/bpflist.8
new file mode 100644
index 0000000..1cab0c8
--- /dev/null
+++ b/man/man8/bpflist.8
@@ -0,0 +1,60 @@
+.TH bpflist 8  "2017-03-09" "USER COMMANDS"
+.SH NAME
+bpflist \- Display processes currently using BPF programs and maps.
+.SH SYNOPSIS
+.B bpflist [-v]
+.SH DESCRIPTION
+This tool displays processes currently using BPF programs and maps, and
+optionally also kprobes and uprobes on the system. This is useful to understand
+which BPF programs are loaded on the system.
+
+Currently, for lack of a better alternative, this tool pipes into 'ls' and
+parses its output to snoop for BPF file descriptors in all running processes.
+In the future, when BPF accounting is provided by the kernel, this tool should
+use these accounting features.
+
+Only the root user can use this tool, because it accesses debugfs.
+.SH REQUIREMENTS
+bcc, debugfs
+.SH OPTIONS
+\-h
+Print usage message.
+.TP
+\-v
+Count kprobes and uprobes as well as BPF programs. Repeating verbose mode twice
+also prints the kprobe and uprobe definitions in addition to counting them.
+.SH EXAMPLES
+.TP
+Display processes currently using BPF programs:
+#
+.B bpflist
+.TP
+Also count kprobes and uprobes:
+#
+.B bpflist -v
+.SH FIELDS
+.TP
+PID
+Process ID.
+.TP
+COMM
+Process comm.
+.TP
+TYPE
+The type of the data displayed: BPF program, BPF map, kprobe, or uprobe.
+.TP
+COUNT
+The number of items of this type that belong to the specified process.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
diff --git a/man/man8/bps.8 b/man/man8/bps.8
new file mode 100644
index 0000000..4316be0
--- /dev/null
+++ b/man/man8/bps.8
@@ -0,0 +1,87 @@
+.TH bps 8  "2017-10-19" "USER COMMANDS"
+.SH NAME
+bps \- List all BPF programs. 'ps' for BPF programs.
+.SH SYNOPSIS
+.B bps [bpf-prog-id]
+.SH DESCRIPTION
+.B bps
+lists all BPF programs loaded into the kernel.  It is similar
+to the ps command but for the BPF programs.
+
+Each loaded bpf program is identified by a unique integer (i.e.
+.B bpf-prog-id
+or simply BID).  If
+a
+.B bpf-prog-id
+is specified, the maps used by
+.B bpf-prog-id
+will also be listed.
+
+.SH EXAMPLES
+.TP
+List all BPF programs loaded into the kernel:
+.B bps
+.TP
+Show the details and maps of BID 6:
+.B bps 6
+.SH BPF PROGRAM FIELDS
+.TP
+.B BID
+BPF program ID.  It ends with '-' if it is not jitted.
+.TP
+.B TYPE
+The type of a BPF program. e.g. kprobe, tracepoint, xdp...etc.
+.TP
+.B UID
+The user ID that loaded the BPF program.
+.TP
+.B #MAPS
+Total number of maps used by a BPF program.
+.TP
+.B LoadTime
+When was the BPF program loaded?
+.TP
+.B NAME
+The name of a BPF program.  The user space library (like
+.B bcc
+) usually
+uses the C function name of the original BPF's source code as
+the program name.  It could be empty if the user space did not
+provide a name.
+
+.SH BPF MAP FIELDS
+.TP
+.B MID
+BPF map ID.
+.TP
+.B TYPE
+The type of a BPF map. e.g. hash, array, stack trace...etc.
+.TP
+.B FLAGS
+The flags used to create the BPF map.
+.TP
+.B KeySz
+The key size of a BPF map.
+.TP
+.B ValueSz
+The value size of a BPF map.
+.TP
+.B MaxEnts
+The maximum number of entries of a map.
+.TP
+.B NAME
+The name of a BPF map.  The user space library (like
+.B bcc
+) usually uses the C variable name of the BPF map as its name.
+It could be empty if the user space did not provide a name.
+
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Martin Lau
diff --git a/man/man8/btrfsdist.8 b/man/man8/btrfsdist.8
new file mode 100644
index 0000000..480264f
--- /dev/null
+++ b/man/man8/btrfsdist.8
@@ -0,0 +1,82 @@
+.TH btrfsdist 8  "2016-02-15" "USER COMMANDS"
+.SH NAME
+btrfsdist \- Summarize btrfs operation latency. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B btrfsdist [\-h] [\-T] [\-N] [\-d] [interval] [count]
+.SH DESCRIPTION
+This tool summarizes time (latency) spent in common btrfs file operations:
+reads, writes, opens, and syncs, and presents it as a power-of-2 histogram. It
+uses an in-kernel eBPF map to store the histogram for efficiency.
+
+Since this works by tracing the btrfs_file_operations interface functions, it
+will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Don't include timestamps on interval output.
+.TP
+\-m
+Output in milliseconds.
+.TP
+\-p PID
+Trace this PID only.
+.SH EXAMPLES
+.TP
+Trace btrfs operation time, and print a summary on Ctrl-C:
+#
+.B btrfsdist
+.TP
+Trace PID 181 only:
+#
+.B btrfsdist -p 181
+.TP
+Print 1 second summaries, 10 times:
+#
+.B btrfsdist 1 10
+.TP
+1 second summaries, printed in milliseconds
+#
+.B btrfsdist \-m 1
+.SH FIELDS
+.TP
+msecs
+Range of milliseconds for this bucket.
+.TP
+usecs
+Range of microseconds for this bucket.
+.TP
+count
+Number of operations in this time range.
+.TP
+distribution
+ASCII representation of the distribution (the count column).
+.SH OVERHEAD
+This adds low-overhead instrumentation to btrfs writes and fsyncs, as well
+as all system reads and opens (due to the current implementation of the
+btrfs_file_operations interface). Particularly, all reads and writes from
+the file system cache will incur extra overhead while tracing. Such reads and
+writes can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool may become noticeable.
+Measure and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+btrfsslower(8)
diff --git a/man/man8/btrfsslower.8 b/man/man8/btrfsslower.8
new file mode 100644
index 0000000..35af5df
--- /dev/null
+++ b/man/man8/btrfsslower.8
@@ -0,0 +1,115 @@
+.TH btrfsslower 8  "2016-02-15" "USER COMMANDS"
+.SH NAME
+btrfsslower \- Trace slow btrfs file operations, with per-event details.
+.SH SYNOPSIS
+.B btrfsslower [\-h] [\-j] [\-p PID] [min_ms]
+.SH DESCRIPTION
+This tool traces common btrfs file operations: reads, writes, opens, and
+syncs. It measures the time spent in these operations, and prints details
+for each that exceeded a threshold.
+
+WARNING: See the OVERHEAD section.
+
+By default, a minimum millisecond threshold of 10 is used. If a threshold of 0
+is used, all events are printed (warning: verbose).
+
+Since this works by tracing the btrfs_file_operations interface functions, it
+will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+\-p PID
+Trace this PID only.
+.TP
+min_ms
+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
+.SH EXAMPLES
+.TP
+Trace synchronous file reads and writes slower than 10 ms:
+#
+.B btrfsslower
+.TP
+Trace slower than 1 ms:
+#
+.B btrfsslower 1
+.TP
+Trace slower than 1 ms, and output just the fields in parsable format (csv):
+#
+.B btrfsslower \-j 1
+.TP
+Trace all file reads and writes (warning: the output will be verbose):
+#
+.B btrfsslower 0
+.TP
+Trace slower than 1 ms, for PID 181 only:
+#
+.B btrfsslower \-p 181 1
+.SH FIELDS
+.TP
+TIME(s)
+Time of I/O completion since the first I/O seen, in seconds.
+.TP
+COMM
+Process name.
+.TP
+PID
+Process ID.
+.TP
+T
+Type of operation. R == read, W == write, O == open, S == fsync.
+.TP
+OFF_KB
+File offset for the I/O, in Kbytes.
+.TP
+BYTES
+Size of I/O, in bytes.
+.TP
+LAT(ms)
+Latency (duration) of I/O, measured from when it was issued by VFS to the
+filesystem, to when it completed. This time is inclusive of block device I/O,
+file system CPU cycles, file system locks, run queue latency, etc. It's a more
+accurate measure of the latency suffered by applications performing file
+system I/O, than to measure this down at the block device interface.
+.TP
+FILENAME
+A cached kernel file name (comes from dentry->d_iname).
+.TP
+ENDTIME_us
+Completion timestamp, microseconds (\-j only).
+.TP
+OFFSET_b
+File offset, bytes (\-j only).
+.TP
+LATENCY_us
+Latency (duration) of the I/O, in microseconds (\-j only).
+.SH OVERHEAD
+This adds low-overhead instrumentation to btrfs writes and fsyncs, as well
+as all system reads and opens (due to the current implementation of the
+btrfs_file_operations interface). Particularly, all reads and writes from
+the file system cache will incur extra overhead while tracing. Such reads and
+writes can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool may become noticeable.
+Measure and quantify before use. If this
+continues to be a problem, consider switching to a tool that prints in-kernel
+summaries only, such as btrfsdist(8).
+.PP
+Note that the overhead of this tool should be less than fileslower(8), as
+this tool targets btrfs functions only, and not all file read/write paths
+(which can include socket I/O).
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+btrfsdist(8), biosnoop(8), funccount(8), fileslower(8)
diff --git a/man/man8/cachestat.8 b/man/man8/cachestat.8
new file mode 100644
index 0000000..897d5af
--- /dev/null
+++ b/man/man8/cachestat.8
@@ -0,0 +1,85 @@
+.TH cachestat 8  "2016-01-30" "USER COMMANDS"
+.SH NAME
+cachestat \- Statistics for Linux page cache hit/miss ratios. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B cachestat
+[-T] [interval [count]]
+.SH DESCRIPTION
+This traces four kernel functions and prints per-second summaries. This can
+be useful for general workload characterization, and looking for patterns
+in operation usage over time.
+
+This works by tracing kernel page cache functions using dynamic tracing, and will
+need updating to match any changes to these functions. Edit the script to
+customize which functions are traced.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Print summaries every five seconds:
+#
+.B cachestat
+.TP
+Print summaries every five seconds with timestamp:
+#
+.B cachestat -T
+.TP
+Print summaries each second:
+#
+.B cachestat 1
+.TP
+Print output every five seconds, three times:
+#
+.B cachestat 5 3
+.TP
+Print output with timestamp every five seconds, three times:
+#
+.B cachestat -T 5 3
+.SH FIELDS
+.TP
+TIME
+Timestamp.
+.TP
+HITS
+Number of page cache hits.
+.TP
+MISSES
+Number of page cache misses.
+.TP
+DIRTIES
+Number of dirty pages added to the page cache.
+.TP
+READ_HIT%
+Read hit percent of page cache usage.
+.TP
+WRITE_HIT%
+Write hit percent of page cache usage.
+.TP
+BUFFERS_MB
+Buffers size taken from /proc/meminfo.
+.TP
+CACHED_MB
+Cached amount of data in current page cache taken from /proc/meminfo.
+.SH OVERHEAD
+This traces various kernel page cache functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of operations can
+be very high (>1G/sec); although overhead can reach up to 34%, this is still a relatively efficient way to trace
+these events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Allan McAleavy
+.SH SEE ALSO
+https://github.com/brendangregg/perf-tools/blob/master/fs/cachestat
diff --git a/man/man8/cachetop.8 b/man/man8/cachetop.8
new file mode 100644
index 0000000..5642fa1
--- /dev/null
+++ b/man/man8/cachetop.8
@@ -0,0 +1,91 @@
+.TH cachetop 8  "2016-01-30" "USER COMMANDS"
+.SH NAME
+cachetop \- Statistics for Linux page cache hit/miss ratios per process. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B cachetop
+[interval]
+.SH DESCRIPTION
+This traces four kernel functions and prints per-process summaries every
+\fBinterval\fR seconds. This can be useful for process workload characterization,
+and looking for patterns in operation usage over time. It provides a \fBtop\fR-like interface
+which by default sorts by \fBHITS\fR in ascending order.
+
+This works by tracing kernel page cache functions using dynamic tracing, and will
+need updating to match any changes to these functions. Edit the script to
+customize which functions are traced.
+
+Since this uses BPF, only the root user can use this tool.
+.SH KEYBINDINGS
+The following keybindings can be used to control the output of \fBcachetop\fR.
+.TP
+.B <
+Use the previous column for sorting.
+.TP
+.B >
+Use the next column for sorting.
+.TP
+.B r
+Toggle sorting order (default ascending).
+.TP
+.B q
+Quit cachetop.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Update summaries every five seconds:
+#
+.B cachetop
+.TP
+Print summaries each second:
+#
+.B cachetop 1
+.SH FIELDS
+.TP
+PID
+Process ID of the process causing the cache activity.
+.TP
+UID
+User ID of the process causing the cache activity.
+.TP
+HITS
+Number of page cache hits.
+.TP
+MISSES
+Number of page cache misses.
+.TP
+DIRTIES
+Number of dirty pages added to the page cache.
+.TP
+READ_HIT%
+Read hit percent of page cache usage.
+.TP
+WRITE_HIT%
+Write hit percent of page cache usage.
+.TP
+BUFFERS_MB
+Buffers size taken from /proc/meminfo.
+.TP
+CACHED_MB
+Cached amount of data in current page cache taken from /proc/meminfo.
+.SH OVERHEAD
+This traces various kernel page cache functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of operations can
+be very high (>1G/sec); although overhead can reach up to 34%, this is still a relatively efficient way to trace
+these events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Emmanuel Bretelle
+.SH SEE ALSO
+cachestat (8)
diff --git a/man/man8/capable.8 b/man/man8/capable.8
new file mode 100644
index 0000000..c847ff0
--- /dev/null
+++ b/man/man8/capable.8
@@ -0,0 +1,69 @@
+.TH capable 8  "2016-09-13" "USER COMMANDS"
+.SH NAME
+capable \- Trace security capability checks (cap_capable()).
+.SH SYNOPSIS
+.B capable [\-h] [\-v] [\-p PID]
+.SH DESCRIPTION
+This traces security capability checks in the kernel, and prints details for
+each call. This can be useful for general debugging, and also security
+enforcement: determining a white list of capabilities an application needs.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF, bcc.
+.SH OPTIONS
+\-h
+USAGE message.
+.TP
+\-v
+Include non-audit capability checks. These are those deemed not interesting and
+not necessary to audit, such as CAP_SYS_ADMIN checks on memory allocation to
+affect the behavior of overcommit.
+.SH EXAMPLES
+.TP
+Trace all capability checks system-wide:
+#
+.B capable
+.TP
+Trace capability checks for PID 181:
+#
+.B capable \-p 181
+.SH FIELDS
+.TP
+TIME(s)
+Time of capability check: HH:MM:SS.
+.TP
+UID
+User ID.
+.TP
+PID
+Process ID.
+.TP
+COMM
+Process name.
+CAP
+Capability number.
+NAME
+Capability name. See capabilities(7) for descriptions.
+.TP
+AUDIT
+Whether this was an audit event. Use \-v to include non-audit events.
+.SH OVERHEAD
+This adds low-overhead instrumentation to capability checks, which are expected
+to be low frequency, however, that depends on the application. Test in a lab
+environment before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+capabilities(7)
diff --git a/man/man8/cobjnew.8 b/man/man8/cobjnew.8
new file mode 120000
index 0000000..b384265
--- /dev/null
+++ b/man/man8/cobjnew.8
@@ -0,0 +1 @@
+uobjnew.8
\ No newline at end of file
diff --git a/man/man8/cpudist.8 b/man/man8/cpudist.8
new file mode 100644
index 0000000..6ee1f3b
--- /dev/null
+++ b/man/man8/cpudist.8
@@ -0,0 +1,107 @@
+.TH cpudist 8  "2016-06-28" "USER COMMANDS"
+.SH NAME
+cpudist \- On- and off-CPU task time as a histogram.
+.SH SYNOPSIS
+.B cpudist [\-h] [-O] [\-T] [\-m] [\-P] [\-L] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This measures the time a task spends on the CPU before being descheduled, and
+shows the times as a histogram. Tasks that spend a very short time on the CPU
+can be indicative of excessive context-switches and poor workload distribution,
+and possibly point to a shared source of contention that keeps tasks switching
+in and out as it becomes available (such as a mutex).
+
+Similarly, the tool can also measure the time a task spends off-CPU before it
+is scheduled again. This can be helpful in identifying long blocking and I/O
+operations, or alternatively very short descheduling times due to short-lived
+locks or timers.
+
+This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
+for efficiency. Despite this, the overhead of this tool may become significant
+for some workloads: see the OVERHEAD section.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-O
+Measure off-CPU time instead of on-CPU time.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-m
+Output histogram in milliseconds.
+.TP
+\-P
+Print a histogram for each PID (tgid from the kernel's perspective).
+.TP
+\-L
+Print a histogram for each TID (pid from the kernel's perspective).
+.TP
+\-p PID
+Only show this PID (filtered in kernel for efficiency).
+.TP
+interval
+Output interval, in seconds.
+.TP
+count
+Number of outputs.
+.SH EXAMPLES
+.TP
+Summarize task on-CPU time as a histogram:
+#
+.B cpudist
+.TP
+Summarize task off-CPU time as a histogram:
+#
+.B cpudist -O
+.TP
+Print 1 second summaries, 10 times:
+#
+.B cpudist 1 10
+.TP
+Print 1 second summaries, using milliseconds as units for the histogram, and include timestamps on output:
+#
+.B cpudist \-mT 1
+.TP
+Trace PID 185 only, 1 second summaries:
+#
+.B cpudist -p 185 1
+.SH FIELDS
+.TP
+usecs
+Microsecond range
+.TP
+msecs
+Millisecond range
+.TP
+count
+How many times a task event fell into this range
+.TP
+distribution
+An ASCII bar chart to visualize the distribution (count column)
+.SH OVERHEAD
+This traces scheduler tracepoints, which can become very frequent. While eBPF
+has very low overhead, and this tool uses in-kernel maps for efficiency, the
+frequency of scheduler events for some workloads may be high enough that the
+overhead of this tool becomes significant. Measure in a lab environment
+to quantify the overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+pidstat(1), runqlat(8)
diff --git a/man/man8/cpuunclaimed.8 b/man/man8/cpuunclaimed.8
new file mode 100644
index 0000000..674be49
--- /dev/null
+++ b/man/man8/cpuunclaimed.8
@@ -0,0 +1,106 @@
+.TH cpuunclaimed 8  "2016-12-21" "USER COMMANDS"
+.SH NAME
+cpuunclaimed \- Sample CPU run queues and calculate unclaimed idle CPU. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B cpuunclaimed
+[\-T] [\-j] [\-J] [interval [count]]
+.SH DESCRIPTION
+This tool samples the length of the run queues and determines when there are idle
+CPUs, yet queued threads waiting their turn. It reports the amount of idle
+(yet unclaimed by waiting threads) CPU as a system-wide percentage.
+
+This situation can happen for a number of reasons:
+.IP -
+An application has been bound to some, but not all, CPUs, and has runnable
+threads that cannot migrate to other CPUs due to this configuration.
+.IP -
+CPU affinity: an optimization that leaves threads on CPUs where the CPU
+caches are warm, even if this means short periods of waiting while other
+CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*).
+.IP -
+Scheduler bugs.
+.P
+An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
+cause for concern. By leaving the CPU idle, overall throughput of the system
+may be improved. This tool is best for identifying larger issues, > 2%, due
+to the coarseness of its 99 Hertz samples.
+
+This is an experimental tool that currently works by use of sampling to
+keep overheads low. Tool assumptions:
+.IP -
+CPU samples consistently fire around the same offset. There will sometimes
+be a lag as a sample is delayed by higher-priority interrupts, but it is
+assumed the subsequent samples will catch up to the expected offsets (as
+is seen in practice). You can use -J to inspect sample offsets. Some
+systems can power down CPUs when idle, and when they wake up again they
+may begin firing at a skewed offset: this tool will detect the skew, print
+an error, and exit.
+.IP -
+All CPUs are online (see ncpu).
+.P
+If this identifies unclaimed CPU, you can double check it by dumping raw
+samples (-j), as well as using other tracing tools to instrument scheduler
+events (although this latter approach has much higher overhead).
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Sample and calculate unclaimed idle CPUs, output every 1 second (default):
+#
+.B cpuunclaimed
+.TP
+Print 5 second summaries, 10 times:
+#
+.B cpuunclaimed 5 10
+.TP
+Print 1 second summaries with timestamps:
+#
+.B cpuunclaimed \-T 1
+.TP
+Raw dump of all samples (verbose), as comma-separated values:
+#
+.B cpuunclaimed \-j
+.SH FIELDS
+.TP
+%CPU
+CPU utilization as a system-wide percentage.
+.TP
+unclaimed idle
+Percentage of CPU resources that were idle when work was queued on other CPUs,
+as a system-wide percentage.
+.TP
+TIME
+Time (HH:MM:SS)
+.TP
+TIMESTAMP_ns
+Timestamp, nanoseconds.
+.TP
+CPU#
+CPU ID.
+.TP
+OFFSET_ns_CPU#
+Time offset that a sample fired within a sample group for this CPU.
+.SH OVERHEAD
+The overhead is expected to be low/negligible as this tool uses sampling at
+99 Hertz (on all CPUs), which has a fixed and low cost, rather than sampling
+every scheduler event as many other approaches use (which can involve
+instrumenting millions of events per second). Sampled CPUs, run queue lengths,
+and timestamps are written to ring buffers that are periodically read by
+user space for reporting. Measure overhead in a test environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+runqlen(8)
diff --git a/man/man8/criticalstat.8 b/man/man8/criticalstat.8
new file mode 100644
index 0000000..52baf1d
--- /dev/null
+++ b/man/man8/criticalstat.8
@@ -0,0 +1,74 @@
+.TH criticalstat 8  "2018-06-07" "USER COMMANDS"
+.SH NAME
+criticalstat \- A tracer to find and report long atomic critical sections in kernel
+.SH SYNOPSIS
+.B criticalstat [\-h] [\-p] [\-i] [\-d DURATION]
+.SH DESCRIPTION
+
+criticalstat traces and reports occurrences of atomic critical sections in the
+kernel with useful stacktraces showing the origin of them. Such critical
+sections frequently occur due to use of spinlocks, or if interrupts or
+preemption were explicitly disabled by a driver. IRQ routines in Linux are also
+executed with interrupts disabled. There are many reasons. Such critical
+sections are a source of long latency/responsive issues for real-time systems.
+
+This works by probing the preempt/irq and cpuidle tracepoints in the kernel.
+Since this uses BPF, only the root user can use this tool. Further, the kernel
+has to be built with certain CONFIG options enabled. See below.
+
+.SH REQUIREMENTS
+Enable CONFIG_PREEMPTIRQ_EVENTS and CONFIG_DEBUG_PREEMPT. Additionally, the
+following options should be DISABLED on older kernels: CONFIG_PROVE_LOCKING,
+CONFIG_LOCKDEP.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p
+Find long sections where preemption was disabled on local CPU.
+.TP
+\-i
+Find long sections where interrupt was disabled on local CPU.
+.TP
+\-d DURATION
+Only identify sections that are longer than DURATION in microseconds.
+.SH EXAMPLES
+.TP
+Run with default options: irq disabled for more than 100 uS
+#
+.B criticalstat
+.TP
+Find sections with preemption disabled for more than 100 uS.
+#
+.B criticalstat -p
+.TP
+Find sections with IRQ disabled for more than 500 uS.
+#
+.B criticalstat -d 500
+.TP
+Find sections with preemption disabled for more than 500 uS.
+#
+.B criticalstat -p -d 500
+.SH OVERHEAD
+This tool can cause overhead if the application is spending a lot of time in
+kernel mode. The overhead is variable but can be 2-4% of performance
+degradation. If overhead is seen to be too much, please pass a higher DURATION
+to the -d option to filter more aggressively.
+
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Joel Fernandes
+.SH SEE ALSO
+Linux kernel's preemptoff and irqoff tracers.
+
diff --git a/man/man8/cthreads.8 b/man/man8/cthreads.8
new file mode 120000
index 0000000..baf8205
--- /dev/null
+++ b/man/man8/cthreads.8
@@ -0,0 +1 @@
+uthreads.8
\ No newline at end of file
diff --git a/man/man8/dbslower.8 b/man/man8/dbslower.8
new file mode 100644
index 0000000..740fdb6
--- /dev/null
+++ b/man/man8/dbslower.8
@@ -0,0 +1,74 @@
+.TH dbslower 8  "2017-02-15" "USER COMMANDS"
+.SH NAME
+dbslower \- Trace MySQL/PostgreSQL server queries slower than a threshold.
+.SH SYNOPSIS
+.B dbslower [-v] [-p PID [PID ...]] [-m THRESHOLD] {mysql,postgres}
+.SH DESCRIPTION
+This traces queries served by a MySQL or PostgreSQL server, and prints
+those that exceed a latency (query time) threshold. By default a threshold of
+1 ms is used.
+
+This uses User Statically-Defined Tracing (USDT) probes, a feature added to
+MySQL and PostgreSQL for DTrace support, but which may not be enabled on a
+given installation. See requirements.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF, bcc, and MySQL server with USDT probe support (when configuring
+the build: \-DENABLE_DTRACE=1) or PostgreSQL server with USDT probe support
+(when configuring the build: \-\-enable-dtrace).
+.SH OPTIONS
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace this PID. If no PID is specified, the tool will attempt to automatically
+detect the MySQL or PostgreSQL processes running on the system.
+.TP
+\-m THRESHOLD
+Minimum query latency (duration) to trace, in milliseconds. Default is 1 ms.
+.TP
+{mysql,postgres}
+The database engine to trace.
+.SH EXAMPLES
+.TP
+Trace MySQL server queries slower than 1 ms:
+#
+.B dbslower mysql
+.TP
+Trace slower than 10 ms for PostgreSQL in process 408:
+#
+.B dbslower postgres -p 408 -m 10
+.SH FIELDS
+.TP
+TIME(s)
+Time of query start, in seconds.
+.TP
+PID
+Process ID of the traced server.
+.TP
+MS
+Milliseconds for the query, from start to end.
+.TP
+QUERY
+Query string, truncated to 256 characters.
+.SH OVERHEAD
+This adds low-overhead instrumentation to queries, and only emits output
+data from kernel to user-level if the query exceeds the threshold. If the
+server query rate is less than 1,000/sec, the overhead is expected to be
+negligible. If the query rate is higher, test to gauge overhead.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein, Brendan Gregg
+.SH SEE ALSO
+biosnoop(8), mysqld_qslower(8), dbstat(8)
diff --git a/man/man8/dbstat.8 b/man/man8/dbstat.8
new file mode 100644
index 0000000..c8e8fd8
--- /dev/null
+++ b/man/man8/dbstat.8
@@ -0,0 +1,72 @@
+.TH dbstat 8  "2017-02-15" "USER COMMANDS"
+.SH NAME
+dbstat \- Collect histograms of MySQL/PostgreSQL query latencies.
+.SH SYNOPSIS
+.B dbstat [-v] [-p PID [PID ...]] [-m THRESHOLD] [-u] [-i INTERVAL] {mysql,postgres}
+.SH DESCRIPTION
+This traces queries served by a MySQL or PostgreSQL server, and collects a
+histogram of query latencies. The histogram is printed at the end of collection,
+or at specified intervals.
+
+This uses User Statically-Defined Tracing (USDT) probes, a feature added to
+MySQL and PostgreSQL for DTrace support, but which may not be enabled on a
+given installation. See requirements.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF, bcc, and MySQL server with USDT probe support (when configuring
+the build: \-DENABLE_DTRACE=1) or PostgreSQL server with USDT probe support
+(when configuring the build: \-\-enable-dtrace).
+.SH OPTIONS
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace this PID. If no PID is specified, the tool will attempt to automatically
+detect the MySQL or PostgreSQL processes running on the system.
+.TP
+\-m THRESHOLD
+Minimum query latency (duration) to trace, in milliseconds.
+Default is all queries.
+.TP
+\-u
+Display query latencies in microseconds (default: milliseconds).
+.TP
+\-i INTERVAL
+Print summaries (histograms) at this interval, specified in seconds.
+.TP
+{mysql,postgres}
+The database engine to trace.
+.SH EXAMPLES
+.TP
+Display histogram of MySQL query latencies:
+#
+.B dbstat mysql
+.TP
+Display histogram of PostgreSQL query latencies slower than 10ms in pid 408:
+#
+.B dbstat postgres -p 408 -m 10
+.TP
+Display histogram of PostgreSQL query latencies at 3-second intervals:
+#
+.B dbstat postgres -i 3
+.SH OVERHEAD
+This adds low-overhead instrumentation to queries, and only emits output
+data from kernel to user-level if the query exceeds the threshold. If the
+server query rate is less than 1,000/sec, the overhead is expected to be
+negligible. If the query rate is higher, test to gauge overhead.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+dbslower(8)
diff --git a/man/man8/dcsnoop.8 b/man/man8/dcsnoop.8
new file mode 100644
index 0000000..6ccb3b6
--- /dev/null
+++ b/man/man8/dcsnoop.8
@@ -0,0 +1,77 @@
+.TH dcsnoop 8  "2016-02-10" "USER COMMANDS"
+.SH NAME
+dcsnoop \- Trace directory entry cache (dcache) lookups. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B dcsnoop [\-h] [\-a]
+.SH DESCRIPTION
+By default, this traces every failed dcache lookup (cache miss), and shows the
+process performing the lookup and the filename requested. A \-a option can be
+used to show all lookups, not just failed ones.
+
+The output of this tool can be verbose, and is intended for further
+investigations of dcache performance beyond dcstat(8), which prints
+per-second summaries.
+
+This uses kernel dynamic tracing of the d_lookup() function, and will need
+updating to match any changes to this function.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-a
+Trace references, not just failed lookups.
+.SH EXAMPLES
+.TP
+Trace failed dcache lookups:
+#
+.B dcsnoop
+.TP
+Trace all dcache lookups:
+#
+.B dcsnoop \-a
+.SH FIELDS
+.TP
+TIME(s)
+Time of lookup, in seconds.
+.TP
+PID
+Process ID.
+.TP
+COMM
+Process name.
+.TP
+T
+Type: R == reference (only visible with \-a), M == miss. A miss will print two
+lines, one for the reference, and one for the miss.
+.TP
+FILE
+The file name component that was being looked up. This contains trailing
+pathname components (after '/'), which will be the subject of subsequent
+lookups.
+.SH OVERHEAD
+File name lookups can be frequent (depending on the workload), and this tool
+prints a line for each failed lookup, and with \-a, each reference as well. The
+output may be verbose, and the incurred overhead, while optimized to some
+extent, may still range from noticeable to significant. This is only really
+intended for deeper investigations beyond dcstat(8), when absolutely necessary.
+Measure and quantify the overhead in a test environment before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+dcstat(8)
diff --git a/man/man8/dcstat.8 b/man/man8/dcstat.8
new file mode 100644
index 0000000..e2bc4dc
--- /dev/null
+++ b/man/man8/dcstat.8
@@ -0,0 +1,61 @@
+.TH dcstat 8  "2016-02-09" "USER COMMANDS"
+.SH NAME
+dcstat \- Directory entry cache (dcache) stats. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B dcstat
+[interval [count]]
+.SH DESCRIPTION
+The Linux directory entry cache (dcache) improves the performance of file and
+directory name lookups. This tool provides per-second summary statistics of
+dcache performance.
+
+This uses kernel dynamic tracing of kernel functions, lookup_fast() and
+d_lookup(), which will need to be modified to match kernel changes.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Print summaries each second:
+#
+.B dcstat
+.TP
+Print output every five seconds, three times:
+#
+.B dcstat 5 3
+.SH FIELDS
+.TP
+REFS/s
+Number of dcache lookups (references) per second.
+.TP
+SLOW/s
+Number of dcache lookups that failed the lookup_fast() path and executed the
+lookup_slow() path instead.
+.TP
+MISS/s
+Number of dcache misses (failed both fast and slow lookups).
+.TP
+HIT%
+Percentage of dcache hits over total references.
+.SH OVERHEAD
+The overhead depends on the frequency of file and directory name lookups.
+While the per-event overhead is low, some applications may make over 100k
+lookups per second, and the low per-event overhead will begin to add up, and
+could begin to be measurable (over 10% CPU usage). Measure in a test
+environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+dcsnoop(8)
diff --git a/man/man8/deadlock_detector.8 b/man/man8/deadlock_detector.8
new file mode 100644
index 0000000..0b23e3e
--- /dev/null
+++ b/man/man8/deadlock_detector.8
@@ -0,0 +1,142 @@
+.TH deadlock_detector 8  "2017-02-01" "USER COMMANDS"
+.SH NAME
+deadlock_detector \- Find potential deadlocks (lock order inversions)
+in a running program.
+.SH SYNOPSIS
+.B deadlock_detector [\-h] [\--binary BINARY] [\--dump-graph DUMP_GRAPH]
+.B                  [\--verbose] [\--lock-symbols LOCK_SYMBOLS]
+.B                  [\--unlock-symbols UNLOCK_SYMBOLS]
+.B                  pid
+.SH DESCRIPTION
+deadlock_detector finds potential deadlocks in a running process. The program
+attaches uprobes on `pthread_mutex_lock` and `pthread_mutex_unlock` by default
+to build a mutex wait directed graph, and then looks for a cycle in this graph.
+This graph has the following properties:
+
+- Nodes in the graph represent mutexes.
+
+- Edge (A, B) exists if there exists some thread T where lock(A) was called
+and lock(B) was called before unlock(A) was called.
+
+If there is a cycle in this graph, this indicates that there is a lock order
+inversion (potential deadlock). If the program finds a lock order inversion, the
+program will dump the cycle of mutexes, dump the stack traces where each mutex
+was acquired, and then exit.
+
+This program can only find potential deadlocks that occur while the program is
+tracing the process. It cannot find deadlocks that may have occurred before the
+program was attached to the process.
+
+This tool does not work for shared mutexes or recursive mutexes.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc
+.SH OPTIONS
+.TP
+\-h, --help
+show this help message and exit
+.TP
+\--binary BINARY
+If set, trace the mutexes from the binary at this path. For
+statically-linked binaries, this argument is not required.
+For dynamically-linked binaries, this argument is required and should be the
+path of the pthread library the binary is using.
+Example: /lib/x86_64-linux-gnu/libpthread.so.0
+.TP
+\--dump-graph DUMP_GRAPH
+If set, this will dump the mutex graph to the specified file.
+.TP
+\--verbose
+Print statistics about the mutex wait graph.
+.TP
+\--lock-symbols LOCK_SYMBOLS
+Comma-separated list of lock symbols to trace. Default is pthread_mutex_lock.
+These symbols cannot be inlined in the binary.
+.TP
+\--unlock-symbols UNLOCK_SYMBOLS
+Comma-separated list of unlock symbols to trace. Default is
+pthread_mutex_unlock. These symbols cannot be inlined in the binary.
+.TP
+pid
+Pid to trace
+.SH EXAMPLES
+.TP
+Find potential deadlocks in PID 181. The --binary argument is not needed for \
+statically-linked binaries.
+#
+.B deadlock_detector 181
+.TP
+Find potential deadlocks in PID 181. If the process was created from a \
+dynamically-linked executable, the --binary argument is required and must be \
+the path of the pthread library:
+#
+.B deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0
+.TP
+Find potential deadlocks in PID 181. If the process was created from a \
+statically-linked executable, optionally pass the location of the binary. \
+On older kernels without https://lkml.org/lkml/2017/1/13/585, binaries that \
+contain `:` in the path cannot be attached with uprobes. As a workaround, we \
+can create a symlink to the binary, and provide the symlink name instead with \
+the `--binary` option:
+#
+.B deadlock_detector 181 --binary /usr/local/bin/lockinversion
+.TP
+Find potential deadlocks in PID 181 and dump the mutex wait graph to a file:
+#
+.B deadlock_detector 181 --dump-graph graph.json
+.TP
+Find potential deadlocks in PID 181 and print mutex wait graph statistics:
+#
+.B deadlock_detector 181 --verbose
+.TP
+Find potential deadlocks in PID 181 with custom mutexes:
+#
+.B deadlock_detector 181
+.B      --lock-symbols custom_mutex1_lock,custom_mutex2_lock
+.B      --unlock-symbols custom_mutex1_unlock,custom_mutex2_unlock
+.SH OUTPUT
+This program does not output any fields. Rather, it will keep running until
+it finds a potential deadlock, or the user hits Ctrl-C. If the program finds
+a potential deadlock, it will output the stack traces and lock order inversion
+in the following format and exit:
+.TP
+Potential Deadlock Detected!
+.TP
+Cycle in lock order graph: Mutex M0 => Mutex M1 => Mutex M0
+.TP
+Mutex M1 acquired here while holding Mutex M0 in Thread T:
+.B [stack trace]
+.TP
+Mutex M0 previously acquired by the same Thread T here:
+.B [stack trace]
+.TP
+Mutex M0 acquired here while holding Mutex M1 in Thread S:
+.B [stack trace]
+.TP
+Mutex M1 previously acquired by the same Thread S here:
+.B [stack trace]
+.TP
+Thread T created by Thread R here:
+.B [stack trace]
+.TP
+Thread S created by Thread Q here:
+.B [stack trace]
+.SH OVERHEAD
+This traces all mutex lock and unlock events and all thread creation events
+on the traced process. The overhead of this can be high if the process has many
+threads and mutexes. You should only run this on a process where the slowdown
+is acceptable.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Kenny Yu
diff --git a/man/man8/execsnoop.8 b/man/man8/execsnoop.8
new file mode 100644
index 0000000..0efd89f
--- /dev/null
+++ b/man/man8/execsnoop.8
@@ -0,0 +1,107 @@
+.TH execsnoop 8  "2016-02-07" "USER COMMANDS"
+.SH NAME
+execsnoop \- Trace new processes via exec() syscalls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B execsnoop [\-h] [\-t] [\-x] [\-n NAME] [\-l LINE]
+.SH DESCRIPTION
+execsnoop traces new processes, showing the filename executed and argument
+list.
+
+It works by tracing the execve() system call (commonly used exec() variant).
+This catches new processes that follow the fork->exec sequence, as well as
+processes that re-exec() themselves. Some applications fork() but do not
+exec(), eg, for worker processes, which won't be included in the execsnoop
+output.
+
+This works by tracing the kernel sys_execve() function using dynamic tracing,
+and will need updating to match any changes to this function.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-t
+Include a timestamp column.
+.TP
+\-x
+Include failed exec()s
+.TP
+\-q
+Add "quotemarks" around arguments. Escape quotemarks in arguments with a
+backslash. For tracing empty arguments or arguments that contain whitespace. 
+.TP
+\-n NAME
+Only print command lines matching this name (regex)
+.TP
+\-l LINE
+Only print commands where arg contains this line (regex)
+.TP
+\--max-args MAXARGS
+Maximum number of arguments parsed and displayed, defaults to 20
+.SH EXAMPLES
+.TP
+Trace all exec() syscalls:
+#
+.B execsnoop
+.TP
+Trace all exec() syscalls, and include timestamps:
+#
+.B execsnoop \-t
+.TP
+Include failed exec()s:
+#
+.B execsnoop \-x
+.TP
+Put quotemarks around arguments:
+#
+.B execsnoop \-q
+.TP
+Only trace exec()s where the filename contains "mount":
+#
+.B execsnoop \-n mount
+.TP
+Only trace exec()s where argument's line contains "testpkg":
+#
+.B execsnoop \-l testpkg
+.SH FIELDS
+.TP
+TIME(s)
+Time of exec() return, in seconds.
+.TP
+PCOMM
+Parent process/command name.
+.TP
+PID
+Process ID
+.TP
+RET
+Return value of exec(). 0 == success. Failures are only shown when using the
+\-x option.
+.TP
+ARGS
+Filename for the exec(), followed by up to 19 arguments. An ellipsis "..." is
+shown if the argument list is known to be truncated.
+.SH OVERHEAD
+This traces the kernel execve function and prints output for each event. As the
+rate of this is generally expected to be low (< 1000/s), the overhead is also
+expected to be negligible. If you have an application that is calling a high
+rate of exec()s, then test and understand overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+opensnoop(1)
diff --git a/man/man8/ext4dist.8 b/man/man8/ext4dist.8
new file mode 100644
index 0000000..61ecb72
--- /dev/null
+++ b/man/man8/ext4dist.8
@@ -0,0 +1,80 @@
+.TH ext4dist 8  "2016-02-12" "USER COMMANDS"
+.SH NAME
+ext4dist \- Summarize ext4 operation latency. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B ext4dist [\-h] [\-T] [\-m] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This tool summarizes time (latency) spent in common ext4 file operations: reads,
+writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an
+in-kernel eBPF map to store the histogram for efficiency.
+
+Since this works by tracing the ext4_file_operations interface functions, it
+will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Don't include timestamps on interval output.
+.TP
+\-m
+Output in milliseconds.
+.TP
+\-p PID
+Trace this PID only.
+.SH EXAMPLES
+.TP
+Trace ext4 operation time, and print a summary on Ctrl-C:
+#
+.B ext4dist
+.TP
+Trace PID 181 only:
+#
+.B ext4dist -p 181
+.TP
+Print 1 second summaries, 10 times:
+#
+.B ext4dist 1 10
+.TP
+1 second summaries, printed in milliseconds
+#
+.B ext4dist \-m 1
+.SH FIELDS
+.TP
+msecs
+Range of milliseconds for this bucket.
+.TP
+usecs
+Range of microseconds for this bucket.
+.TP
+count
+Number of operations in this time range.
+.TP
+distribution
+ASCII representation of the distribution (the count column).
+.SH OVERHEAD
+This adds low-overhead instrumentation to these ext4 operations,
+including reads and writes from the file system cache. Such reads and writes
+can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool may become noticeable.
+Measure and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+ext4snoop(8)
diff --git a/man/man8/ext4slower.8 b/man/man8/ext4slower.8
new file mode 100644
index 0000000..7591f28
--- /dev/null
+++ b/man/man8/ext4slower.8
@@ -0,0 +1,113 @@
+.TH ext4slower 8  "2016-02-11" "USER COMMANDS"
+.SH NAME
+ext4slower \- Trace slow ext4 file operations, with per-event details.
+.SH SYNOPSIS
+.B ext4slower [\-h] [\-j] [\-p PID] [min_ms]
+.SH DESCRIPTION
+This tool traces common ext4 file operations: reads, writes, opens, and
+syncs. It measures the time spent in these operations, and prints details
+for each that exceeded a threshold.
+
+WARNING: See the OVERHEAD section.
+
+By default, a minimum millisecond threshold of 10 is used. If a threshold of 0
+is used, all events are printed (warning: verbose).
+
+Since this works by tracing the ext4_file_operations interface functions, it
+will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+\-p PID
+Trace this PID only.
+.TP
+min_ms
+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
+.SH EXAMPLES
+.TP
+Trace synchronous file reads and writes slower than 10 ms:
+#
+.B ext4slower
+.TP
+Trace slower than 1 ms:
+#
+.B ext4slower 1
+.TP
+Trace slower than 1 ms, and output just the fields in parsable format (csv):
+#
+.B ext4slower \-j 1
+.TP
+Trace all file reads and writes (warning: the output will be verbose):
+#
+.B ext4slower 0
+.TP
+Trace slower than 1 ms, for PID 181 only:
+#
+.B ext4slower \-p 181 1
+.SH FIELDS
+.TP
+TIME(s)
+Time of I/O completion since the first I/O seen, in seconds.
+.TP
+COMM
+Process name.
+.TP
+PID
+Process ID.
+.TP
+T
+Type of operation. R == read, W == write, O == open, S == fsync.
+.TP
+OFF_KB
+File offset for the I/O, in Kbytes.
+.TP
+BYTES
+Size of I/O, in bytes.
+.TP
+LAT(ms)
+Latency (duration) of I/O, measured from when it was issued by VFS to the
+filesystem, to when it completed. This time is inclusive of block device I/O,
+file system CPU cycles, file system locks, run queue latency, etc. It's a more
+accurate measure of the latency suffered by applications performing file
+system I/O, than to measure this down at the block device interface.
+.TP
+FILENAME
+A cached kernel file name (comes from dentry->d_iname).
+.TP
+ENDTIME_us
+Completion timestamp, microseconds (\-j only).
+.TP
+OFFSET_b
+File offset, bytes (\-j only).
+.TP
+LATENCY_us
+Latency (duration) of the I/O, in microseconds (\-j only).
+.SH OVERHEAD
+This adds low-overhead instrumentation to these ext4 operations,
+including reads and writes from the file system cache. Such reads and writes
+can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool (even if it prints no "slower" events) can
+begin to become significant. Measure and quantify before use. If this
+continues to be a problem, consider switching to a tool that prints in-kernel
+summaries only.
+.PP
+Note that the overhead of this tool should be less than fileslower(8), as
+this tool targets ext4 functions only, and not all file read/write paths
+(which can include socket I/O).
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8), funccount(8), fileslower(8)
diff --git a/man/man8/filelife.8 b/man/man8/filelife.8
new file mode 100644
index 0000000..1e4e423
--- /dev/null
+++ b/man/man8/filelife.8
@@ -0,0 +1,72 @@
+.TH filelife 8  "2016-02-08" "USER COMMANDS"
+.SH NAME
+filelife \- Trace the lifespan of short-lived files. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B filelife [\-h] [\-p PID]
+.SH DESCRIPTION
+This traces the creation and deletion of files, providing information
+on who deleted the file, the file age, and the file name. The intent is to
+provide information on short-lived files, for debugging or performance
+analysis.
+
+This works by tracing the kernel vfs_create() and vfs_delete() functions (and
+maybe more, see the source) using dynamic tracing, and will need updating to
+match any changes to these functions.
+
+This makes use of a Linux 4.5 feature (bpf_perf_event_output());
+for kernels older than 4.5, see the version under tools/old,
+which uses an older mechanism.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.SH EXAMPLES
+.TP
+Trace all short-lived files, and print details:
+#
+.B filelife
+.TP
+Trace all short-lived files created AND deleted by PID 181:
+#
+.B filelife \-p 181
+.SH FIELDS
+.TP
+TIME
+Time of the deletion.
+.TP
+PID
+Process ID that deleted the file.
+.TP
+COMM
+Process name for the PID.
+.TP
+AGE(s)
+Age of the file, from creation to deletion, in seconds.
+.TP
+FILE
+Filename.
+.SH OVERHEAD
+This traces the kernel VFS file create and delete functions and prints output
+for each delete. As the rate of this is generally expected to be low
+(< 1000/s), the overhead is also expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+opensnoop(8)
diff --git a/man/man8/fileslower.8 b/man/man8/fileslower.8
new file mode 100644
index 0000000..169013b
--- /dev/null
+++ b/man/man8/fileslower.8
@@ -0,0 +1,114 @@
+.TH fileslower 8  "2016-02-07" "USER COMMANDS"
+.SH NAME
+fileslower \- Trace slow synchronous file reads and writes.
+.SH SYNOPSIS
+.B fileslower [\-h] [\-p PID] [-a] [min_ms]
+.SH DESCRIPTION
+This script uses kernel dynamic tracing of synchronous reads and writes
+at the VFS interface, to identify slow file reads and writes for any file
+system.
+
+This version traces __vfs_read() and __vfs_write() and only shows
+synchronous I/O (the path to new_sync_read() and new_sync_write()), and
+I/O with filenames. This approach provides a view of just two file
+system request types: file reads and writes. There are typically many others:
+asynchronous I/O, directory operations, file handle operations, file open()s,
+fflush(), etc.
+
+WARNING: See the OVERHEAD section.
+
+By default, a minimum millisecond threshold of 10 is used.
+
+Since this works by tracing various kernel __vfs_*() functions using dynamic
+tracing, it will need updating to match any changes to these functions. A
+future version should switch to using FS tracepoints instead.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-p PID
+Trace this PID only.
+.TP
+\-a
+Include non-regular file types in output (sockets, FIFOs, etc).
+.TP
+min_ms
+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
+.SH EXAMPLES
+.TP
+Trace synchronous file reads and writes slower than 10 ms:
+#
+.B fileslower
+.TP
+Trace slower than 1 ms:
+#
+.B fileslower 1
+.TP
+Trace slower than 1 ms, for PID 181 only:
+#
+.B fileslower \-p 181 1
+.SH FIELDS
+.TP
+TIME(s)
+Time of I/O completion since the first I/O seen, in seconds.
+.TP
+COMM
+Process name.
+.TP
+PID
+Process ID.
+.TP
+D
+Direction of I/O. R == read, W == write.
+.TP
+BYTES
+Size of I/O, in bytes.
+.TP
+LAT(ms)
+Latency (duration) of I/O, measured from when the application issued it to VFS
+to when it completed. This time is inclusive of block device I/O, file system
+CPU cycles, file system locks, run queue latency, etc. It's a more accurate
+measure of the latency suffered by applications performing file system I/O,
+than to measure this down at the block device interface.
+.TP
+FILENAME
+A cached kernel file name (comes from dentry->d_iname).
+.SH OVERHEAD
+Depending on the frequency of application reads and writes, overhead can become
+severe, in the worst case slowing applications by 2x. In the best case, the
+overhead is negligible. Hopefully for real world workloads the overhead is
+often at the lower end of the spectrum -- test before use. The reason for
+high overhead is that this traces VFS reads and writes, which includes FS
+cache reads and writes, and can exceed one million events per second if the
+application is I/O heavy. While the instrumentation is extremely lightweight,
+and uses in-kernel eBPF maps for efficient timing and filtering, multiply that
+cost by one million events per second and that cost becomes a million times
+worse. You can get an idea of the possible cost by just counting the
+instrumented events using the bcc funccount tool, eg:
+.PP
+# ./funccount.py -i 1 -r '^__vfs_(read|write)$'
+.PP
+This also costs overhead, but is somewhat less than fileslower.
+.PP
+If the overhead is prohibitive for your workload, I'd recommend moving
+down-stack a little from VFS into the file system functions (ext4, xfs, etc).
+Look for updates to bcc for specific file system tools that do this. The
+advantage of a per-file system approach is that we can trace post-cache,
+greatly reducing events and overhead. The disadvantage is needing custom
+tracing approaches for each different file system (whereas VFS is generic).
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8), funccount(8)
diff --git a/man/man8/filetop.8 b/man/man8/filetop.8
new file mode 100644
index 0000000..e70d908
--- /dev/null
+++ b/man/man8/filetop.8
@@ -0,0 +1,118 @@
+.TH filetop 8  "2016-02-08" "USER COMMANDS"
+.SH NAME
+filetop \- File reads and writes by filename and process. Top for files.
+.SH SYNOPSIS
+.B filetop [\-h] [\-C] [\-r MAXROWS] [\-s {reads,writes,rbytes,wbytes}] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This is top for files.
+
+This traces file reads and writes, and prints a per-file summary every interval
+(by default, 1 second). By default the summary is sorted on the highest read
+throughput (Kbytes). Sorting order can be changed via -s option. By default only
+IO on regular files is shown. The -a option will list all file types (sockets,
+FIFOs, etc).
+
+This uses in-kernel eBPF maps to store per process summaries for efficiency.
+
+This script works by tracing the __vfs_read() and __vfs_write() functions using
+kernel dynamic tracing, which instruments explicit read and write calls. If
+files are read or written using another means (eg, via mmap()), then they
+will not be visible using this tool. Also, this tool will need updating to
+match any code changes to those vfs functions.
+
+This should be useful for file system workload characterization when analyzing
+the performance of applications.
+
+Note that tracing VFS level reads and writes can be a frequent activity, and
+this tool can begin to cost measurable overhead at high I/O rates.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-a
+Include non-regular file types (sockets, FIFOs, etc).
+.TP
+\-C
+Don't clear the screen.
+.TP
+\-r MAXROWS
+Maximum number of rows to print. Default is 20.
+.TP
+\-s {reads,writes,rbytes,wbytes}
+Sort column. Default is rbytes (read throughput).
+.TP
+\-p PID
+Trace this PID only.
+.TP
+interval
+Interval between updates, seconds.
+.TP
+count
+Number of interval summaries.
+
+.SH EXAMPLES
+.TP
+Summarize file I/O by process, 1 second screen refresh:
+#
+.B filetop
+.TP
+Don't clear the screen, and top 8 rows only:
+#
+.B filetop -Cr 8
+.TP
+5 second summaries, 10 times only:
+#
+.B filetop 5 10
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+PID
+Process ID.
+.TP
+COMM
+Process name.
+.TP
+READS
+Count of reads during interval.
+.TP
+WRITES
+Count of writes during interval.
+.TP
+R_Kb
+Total read Kbytes during interval.
+.TP
+W_Kb
+Total write Kbytes during interval.
+.TP
+T
+Type of file: R == regular, S == socket, O == other (pipe, etc).
+.SH OVERHEAD
+Depending on the frequency of application reads and writes, overhead can become
+significant, in the worst case slowing applications by over 50%. Hopefully for
+real world workloads the overhead is much less -- test before use. The reason
+for the high overhead is that VFS reads and writes can be a frequent event, and
+despite the eBPF overhead being very small per event, if you multiply this
+small overhead by a million events per second, it becomes a million times
+worse. Literally. You can gauge the number of reads and writes using the
+vfsstat(8) tool, also from bcc.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH INSPIRATION
+top(1) by William LeFebvre
+.SH SEE ALSO
+vfsstat(8), vfscount(8), fileslower(8)
diff --git a/man/man8/funccount.8 b/man/man8/funccount.8
new file mode 100644
index 0000000..9039ab3
--- /dev/null
+++ b/man/man8/funccount.8
@@ -0,0 +1,107 @@
+.TH funccount 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+funccount \- Count function, tracepoint, and USDT probe calls matching a pattern. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B funccount [\-h] [\-p PID] [\-i INTERVAL] [\-d DURATION] [\-T] [\-r] [\-D] pattern
+.SH DESCRIPTION
+This tool is a quick way to determine which functions are being called,
+and at what rate. It uses in-kernel eBPF maps to count function calls.
+
+WARNING: This uses dynamic tracing of (what can be many) functions, an
+activity that has had issues on some kernel versions (risk of panics or
+freezes). Test, and know what you are doing, before use.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+pattern
+Search pattern. Supports "*" wildcards. See EXAMPLES. You can also use \-r for regular expressions.
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace this process ID only.
+.TP
+\-i INTERVAL
+Print output every interval seconds.
+.TP
+\-d DURATION
+Total duration of trace in seconds.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-r
+Use regular expressions for the search pattern.
+.TP
+\-D
+Print the BPF program before starting (for debugging purposes).
+.SH EXAMPLES
+.TP
+Count kernel functions beginning with "vfs_", until Ctrl-C is hit:
+#
+.B funccount 'vfs_*'
+.TP
+Count kernel functions beginning with "tcp_send", until Ctrl-C is hit:
+#
+.B funccount 'tcp_send*'
+.TP
+Print kernel functions beginning with "vfs_", every second:
+#
+.B funccount \-i 1 'vfs_*'
+.TP
+Print kernel functions beginning with "vfs_", for ten seconds only:
+#
+.B funccount \-d 10 'vfs_*'
+.TP
+Match kernel functions beginning with "vfs_", using regular expressions:
+#
+.B funccount \-r '^vfs_.*'
+.TP
+Count vfs calls for process ID 181 only:
+#
+.B funccount \-p 181 'vfs_*'
+.TP
+Count calls to the sched_fork tracepoint, indicating a fork() performed:
+#
+.B funccount t:sched:sched_fork
+.TP
+Count all GC USDT probes in the Node process:
+#
+.B funccount -p 185 u:node:gc*
+.TP
+Count all malloc() calls in libc:
+#
+.B funccount c:malloc
+.SH FIELDS
+.TP
+FUNC
+Function name
+.TP
+COUNT
+Number of calls while tracing
+.SH OVERHEAD
+This traces functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of calls
+can be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg, Sasha Goldshtein
+.SH SEE ALSO
+stackcount(8)
+funclatency(8)
+vfscount(8)
diff --git a/man/man8/funclatency.8 b/man/man8/funclatency.8
new file mode 100644
index 0000000..b82626c
--- /dev/null
+++ b/man/man8/funclatency.8
@@ -0,0 +1,135 @@
+.TH funclatency 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+funclatency \- Time functions and print latency as a histogram.
+.SH SYNOPSIS
+.B funclatency [\-h] [\-p PID] [\-i INTERVAL] [\-d DURATION] [\-T] [\-u] [\-m] [\-F] [\-r] [\-v] pattern
+.SH DESCRIPTION
+This tool traces function calls and times their duration (latency), and
+shows the latency distribution as a histogram. The time is measured from when
+the function is called to when it returns, and is inclusive of both on-CPU
+time and time spent blocked.
+
+This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
+for efficiency.
+
+Currently nested or recursive functions are not supported properly, and
+timestamps will be overwritten, creating dubious output. Try to match single
+functions, or groups of functions that run at the same stack layer, and
+don't ultimately call each other.
+
+WARNING: This uses dynamic tracing of (what can be many) functions, an
+activity that has had issues on some kernel versions (risk of panics or
+freezes). Test, and know what you are doing, before use.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+pattern
+Function name or search pattern. Supports "*" wildcards. See EXAMPLES.
+You can also use \-r for regular expressions.
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace this process ID only.
+.TP
+\-i INTERVAL
+Print output every interval seconds.
+.TP
+\-d DURATION
+Total duration of trace, in seconds.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-u
+Output histogram in microseconds.
+.TP
+\-m
+Output histogram in milliseconds.
+.TP
+\-F
+Print a separate histogram per function matched.
+.TP
+\-r
+Use regular expressions for the search pattern.
+.TP
+\-v
+Print the BPF program (for debugging purposes).
+.SH EXAMPLES
+.TP
+Time the do_sys_open() kernel function, and print the distribution as a histogram:
+#
+.B funclatency do_sys_open
+.TP
+Time the read() function in libc across all processes on the system:
+#
+.B funclatency c:read
+.TP
+Time vfs_read(), and print the histogram in units of microseconds:
+#
+.B funclatency \-u vfs_read
+.TP
+Time do_nanosleep(), and print the histogram in units of milliseconds:
+#
+.B funclatency \-m do_nanosleep
+.TP
+Time libc open(), and print output every 2 seconds, for duration 10 seconds:
+#
+.B funclatency \-i 2 -d 10 c:open
+.TP
+Time vfs_read(), and print output every 5 seconds, with timestamps:
+#
+.B funclatency \-mTi 5 vfs_read
+.TP
+Time vfs_read() for process ID 181 only:
+#
+.B funclatency \-p 181 vfs_read
+.TP
+Time both vfs_fstat() and vfs_fstatat() calls, by use of a wildcard:
+#
+.B funclatency 'vfs_fstat*'
+.TP
+Time both vfs_fstat* calls, and print a separate histogram for each:
+#
+.B funclatency -F 'vfs_fstat*'
+.SH FIELDS
+.TP
+nsecs
+Nanosecond range
+.TP
+usecs
+Microsecond range
+.TP
+msecs
+Millisecond range
+.TP
+count
+How many calls fell into this range
+.TP
+distribution
+An ASCII bar chart to visualize the distribution (count column)
+.SH OVERHEAD
+This traces kernel functions and maintains in-kernel timestamps and a histogram,
+which are asynchronously copied to user-space. While this method is very
+efficient, the rate of kernel functions can also be very high (>1M/sec), at
+which point the overhead is expected to be measurable. Measure in a test
+environment and understand overheads before use. You can also use funccount
+to measure the rate of kernel functions over a short duration, to set some
+expectations before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg, Sasha Goldshtein
+.SH SEE ALSO
+funccount(8)
diff --git a/man/man8/funcslower.8 b/man/man8/funcslower.8
new file mode 100644
index 0000000..06f1793
--- /dev/null
+++ b/man/man8/funcslower.8
@@ -0,0 +1,125 @@
+.TH funcslower 8  "2017-03-30" "USER COMMANDS"
+.SH NAME
+funcslower \- Trace slow kernel or user function calls.
+.SH SYNOPSIS
+.B funcslower [\-hf] [\-p PID] [\-U | \-K] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS] [-T] [-t] [-v] function [function ...]
+.SH DESCRIPTION
+This script traces a kernel or user function's entry and return points, and
+prints a message when the function's latency exceeded the specified threshold.
+Multiple functions are supported, and you can mix kernel functions with user
+functions in different libraries.
+
+WARNING: See the OVERHEAD section.
+
+By default, a minimum millisecond threshold of 1 is used. Recursive functions
+are not supported: only the inner-most recursive invocation will be traced.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-p PID
+Trace this PID only.
+.TP
+\-m MIN_MS
+Minimum duration to trace, in milliseconds. Default is 1 ms.
+.TP
+\-u MIN_US
+Minimum duration to trace, in microseconds.
+.TP
+\-a ARGUMENTS
+Print the function's arguments, up to 6.
+.TP
+\-T
+Print a HH:MM:SS timestamp with each entry.
+.TP
+\-t
+Print a seconds timestamp with each entry, at microsecond resolution.
+.TP
+\-f
+Print output in folded stack format.
+.TP
+\-U
+Show stacks from user space only (no kernel space stacks).
+.TP
+\-K
+Show stacks from kernel space only (no user space stacks).
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+function
+The function to trace -- multiple functions are supported. If a plain function
+name is provided, the function is assumed to be a kernel function. For user
+functions, provide the library name and the function name, e.g. bash:readline
+or c:malloc.
+.SH EXAMPLES
+.TP
+Trace vfs_write calls slower than 1ms:
+#
+.B funcslower vfs_write
+.TP
+Trace open() calls in libc slower than 10us:
+#
+.B funcslower \-u 10 c:open
+.TP
+Trace both malloc() and free() slower than 10us, in pid 135 only:
+#
+.B funcslower \-p 135 \-u 10 c:malloc c:free
+.TP
+Trace the write syscall and print its first 4 arguments:
+#
+.B funcslower -a 4 SyS_write
+.TP
+Trace opens from libc and print the user and kernel stack frames:
+#
+.B funcslower -UK c:open
+.SH FIELDS
+.TP
+TIME
+Time of the event as a human-readable HH:MM:SS format, or a timestamp in seconds
+at microsecond-accuracy from the first event seen.
+.TP
+COMM
+Process name.
+.TP
+PID
+Process ID.
+.TP
+LAT
+Latency of the operation in either microseconds (us) or milliseconds (ms).
+.TP
+RVAL
+The return value from the function. Often useful for diagnosing a relationship
+between slow and failed function calls.
+.TP
+FUNC
+The function name, followed by its arguments if requested.
+.SH OVERHEAD
+Depending on the function(s) being traced, overhead can become severe. For 
+example, tracing a common function like malloc() can slow down a C/C++ program
+by a factor of 2 or more. On the other hand, tracing a low-frequency event like
+the SyS_setreuid() function will probably not be as prohibitive, and in fact
+negligible for functions that are called up to 100-1000 times per second.
+
+You should first use the funclatency and argdist tools for investigation, 
+because they summarize data in-kernel and have a much lower overhead than this
+tool. To get a general idea of the number of times a particular function is
+called (and estimate the overhead), use the funccount tool, e.g.:
+.PP
+# funccount c:open
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+funccount(8), funclatency(8), argdist(8), trace(8)
diff --git a/man/man8/gethostlatency.8 b/man/man8/gethostlatency.8
new file mode 100644
index 0000000..c5a5330
--- /dev/null
+++ b/man/man8/gethostlatency.8
@@ -0,0 +1,63 @@
+.TH gethostlatency 8  "2016-01-28" "USER COMMANDS"
+.SH NAME
+gethostlatency \- Show latency for getaddrinfo/gethostbyname[2] calls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B gethostlatency
+.SH DESCRIPTION
+This traces and prints when getaddrinfo(), gethostbyname(), and gethostbyname2()
+are called, system wide, and shows the responsible PID and command name,
+latency of the call (duration) in milliseconds, and the host string.
+
+This tool can be useful for identifying DNS latency, by identifying which
+remote host name lookups were slow, and by how much.
+
+This makes use of a Linux 4.5 feature (bpf_perf_event_output());
+for kernels older than 4.5, see the version under tools/old,
+which uses an older mechanism.
+
+This tool currently uses dynamic tracing of user-level functions and registers,
+and may need modifications to match your software and processor architecture.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-p PID
+Trace this process ID only.
+.SH EXAMPLES
+.TP
+Trace host lookups (getaddrinfo/gethostbyname[2]) system wide:
+#
+.B gethostlatency
+.SH FIELDS
+.TP
+TIME
+Time of the command (HH:MM:SS).
+.TP
+PID
+Process ID of the client performing the call.
+.TP
+COMM
+Process (command) name of the client performing the call.
+.TP
+HOST
+Host name string: the target of the lookup.
+.SH OVERHEAD
+The rate of lookups should be relatively low, so the overhead is not expected
+to be a problem.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+tcpdump(8)
diff --git a/man/man8/hardirqs.8 b/man/man8/hardirqs.8
new file mode 100644
index 0000000..8e7237a
--- /dev/null
+++ b/man/man8/hardirqs.8
@@ -0,0 +1,95 @@
+.TH hardirqs 8  "2015-10-20" "USER COMMANDS"
+.SH NAME
+hardirqs \- Measure hard IRQ (hard interrupt) event time. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B hardirqs [\-h] [\-T] [\-N] [\-C] [\-d] [interval] [outputs]
+.SH DESCRIPTION
+This summarizes the time spent servicing hard IRQs (hard interrupts), and can
+show this time as either totals or histogram distributions. A system-wide
+summary of this time is shown by the %irq column of mpstat(1), and event
+counts (but not times) are shown by /proc/interrupts.
+
+WARNING: This currently uses dynamic tracing of hard interrupts. You should
+understand what this means before use. Try in a test environment. Future
+versions should switch to tracepoints.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-N
+Output in nanoseconds.
+.TP
+\-C
+Count events only.
+.TP
+\-d
+Show IRQ time distribution as histograms.
+.SH EXAMPLES
+.TP
+Sum hard IRQ event time until Ctrl-C:
+#
+.B hardirqs
+.TP
+Show hard IRQ event time as histograms:
+#
+.B hardirqs \-d
+.TP
+Print 1 second summaries, 10 times:
+#
+.B hardirqs 1 10
+.TP
+1 second summaries, printed in nanoseconds, with timestamps:
+#
+.B hardirqs \-NT 1
+.SH FIELDS
+.TP
+HARDIRQ
+The irq action name for this hard IRQ.
+.TP
+TOTAL_usecs
+Total time spent in this hard IRQ in microseconds.
+.TP
+TOTAL_nsecs
+Total time spent in this hard IRQ in nanoseconds.
+.TP
+usecs
+Range of microseconds for this bucket.
+.TP
+nsecs
+Range of nanoseconds for this bucket.
+.TP
+count
+Number of hard IRQs in this time range.
+.TP
+distribution
+ASCII representation of the distribution (the count column).
+.SH OVERHEAD
+This traces kernel functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of interrupts
+can be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads, but
+could become noticeable for heavy workloads. Measure in a test environment
+before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+softirqs(8)
diff --git a/man/man8/inject.8 b/man/man8/inject.8
new file mode 100644
index 0000000..e97613b
--- /dev/null
+++ b/man/man8/inject.8
@@ -0,0 +1,46 @@
+.TH inject 8  "2018-03-16" "USER COMMANDS"
+.SH NAME
+inject \- injects appropriate error into function if input call chain and
+predicates are satisfied. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B inject -h [-I header] [-P probability] [-v] mode spec
+.SH DESCRIPTION
+inject injects errors into specified kernel functionality when a given call
+chain and associated predicates are satisfied.
+
+WARNING: This tool injects failures into key kernel functions and may crash the
+kernel. You should know what you're doing if you're using this tool.
+
+This makes use of a Linux 4.16 feature (bpf_override_return())
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF, CONFIG_BPF_KPROBE_OVERRIDE, bcc
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-v
+Display the generated BPF program, for debugging or modification.
+.TP
+\-I header
+Necessary headers to be included.
+.TP
+\-P probability
+Optional probability of failure, default 1.
+.SH EXAMPLES
+Please see inject_example.txt
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Howard McLauchlan
diff --git a/man/man8/javacalls.8 b/man/man8/javacalls.8
new file mode 120000
index 0000000..a9a6054
--- /dev/null
+++ b/man/man8/javacalls.8
@@ -0,0 +1 @@
+ucalls.8
\ No newline at end of file
diff --git a/man/man8/javaflow.8 b/man/man8/javaflow.8
new file mode 120000
index 0000000..84ccb94
--- /dev/null
+++ b/man/man8/javaflow.8
@@ -0,0 +1 @@
+uflow.8
\ No newline at end of file
diff --git a/man/man8/javagc.8 b/man/man8/javagc.8
new file mode 120000
index 0000000..219d8a3
--- /dev/null
+++ b/man/man8/javagc.8
@@ -0,0 +1 @@
+ugc.8
\ No newline at end of file
diff --git a/man/man8/javaobjnew.8 b/man/man8/javaobjnew.8
new file mode 120000
index 0000000..b384265
--- /dev/null
+++ b/man/man8/javaobjnew.8
@@ -0,0 +1 @@
+uobjnew.8
\ No newline at end of file
diff --git a/man/man8/javastat.8 b/man/man8/javastat.8
new file mode 120000
index 0000000..e3a3a29
--- /dev/null
+++ b/man/man8/javastat.8
@@ -0,0 +1 @@
+ustat.8
\ No newline at end of file
diff --git a/man/man8/javathreads.8 b/man/man8/javathreads.8
new file mode 120000
index 0000000..baf8205
--- /dev/null
+++ b/man/man8/javathreads.8
@@ -0,0 +1 @@
+uthreads.8
\ No newline at end of file
diff --git a/man/man8/killsnoop.8 b/man/man8/killsnoop.8
new file mode 100644
index 0000000..b7048ed
--- /dev/null
+++ b/man/man8/killsnoop.8
@@ -0,0 +1,83 @@
+.TH killsnoop 8  "2015-08-20" "USER COMMANDS"
+.SH NAME
+killsnoop \- Trace signals issued by the kill() syscall. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B killsnoop [\-h] [\-x] [-p PID]
+.SH DESCRIPTION
+killsnoop traces the kill() syscall, to show signals sent via this method. This
+may be useful to troubleshoot failing applications, where an unknown mechanism
+is sending signals.
+
+This works by tracing the kernel sys_kill() function using dynamic tracing, and
+will need updating to match any changes to this function.
+
+This makes use of a Linux 4.5 feature (bpf_perf_event_output());
+for kernels older than 4.5, see the version under tools/old,
+which uses an older mechanism.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-x
+Only print failed kill() syscalls.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.SH EXAMPLES
+.TP
+Trace all kill() syscalls:
+#
+.B killsnoop
+.TP
+Trace only kill() syscalls that failed:
+#
+.B killsnoop \-x
+.TP
+Trace PID 181 only:
+#
+.B killsnoop \-p 181
+.SH FIELDS
+.TP
+TIME
+Time of the kill call.
+.TP
+PID
+Source process ID
+.TP
+COMM
+Source process name
+.TP
+SIG
+Signal number. See signal(7).
+.TP
+TPID
+Target process ID
+.TP
+RES
+Result. 0 == success, a negative value (of the error code) for failure.
+.SH OVERHEAD
+This traces the kernel kill function and prints output for each event. As the
+rate of this is generally expected to be low (< 100/s), the overhead is also
+expected to be negligible. If you have an application that is calling a very
+high rate of kill()s for some reason, then test and understand overhead before
+use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+opensnoop(8), funccount(8)
diff --git a/man/man8/llcstat.8 b/man/man8/llcstat.8
new file mode 100644
index 0000000..36dbed7
--- /dev/null
+++ b/man/man8/llcstat.8
@@ -0,0 +1,74 @@
+.TH llcstat 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+llcstat \- Summarize CPU cache references and misses by process. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B llcstat [\-h] [\-c SAMPLE_PERIOD] [duration]
+.SH DESCRIPTION
+llcstat instruments CPU cache references and cache misses system-wide, and
+summarizes them by PID and CPU. These events have different meanings on
+different architectures. For x86-64, they mean misses and references to LLC.
+This can be useful to locate and debug performance issues
+caused by cache hit rate.
+
+This works by sampling corresponding events defined in uapi/linux/perf_event.h,
+namely PERF_COUNT_HW_CACHE_REFERENCES and PERF_COUNT_HW_CACHE_MISSES, using
+BPF perf event tracing. Upon each sampled event, the attached BPF program
+records the PID and CPU ID on which the event happened, and stores it in a table.
+
+This makes use of a Linux 4.9 feature (BPF_PROG_TYPE_PERF_EVENT).
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-c SAMPLE_PERIOD
+Sample one in this many cache reference and cache miss events.
+.TP
+duration
+Duration to trace, in seconds.
+.SH EXAMPLES
+.TP
+Sample one in 100 events, trace for 20 seconds:
+#
+.B llcstat -c 100 20
+.SH FIELDS
+.TP
+PID
+Process ID
+.TP
+NAME
+Process name
+.TP
+CPU
+CPU ID
+.TP
+REFERENCE
+Number of cache reference events
+.TP
+MISS
+Number of cache miss events
+.TP
+HIT%
+Cache hit ratio
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Teng Qin
+.SH SEE ALSO
+.TP
+Perf can be used as a generic event counter tool. An example for LLC:
+#
+.B perf top -e cache-misses -e cache-references -a -ns pid,cpu,comm
diff --git a/man/man8/mdflush.8 b/man/man8/mdflush.8
new file mode 100644
index 0000000..9d10ca8
--- /dev/null
+++ b/man/man8/mdflush.8
@@ -0,0 +1,57 @@
+.TH mdflush 8  "2016-02-13" "USER COMMANDS"
+.SH NAME
+mdflush \- Trace md flush events. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B mdflush
+.SH DESCRIPTION
+This tool traces flush events by md, the Linux multiple device driver
+(software RAID). The timestamp and md device for the flush are printed.
+Knowing when these flushes happen can be useful for correlation with
+unexplained spikes in disk latency.
+
+This works by tracing the kernel md_flush_request() function using dynamic
+tracing, and will need updating to match any changes to this function.
+
+Note that the flushes themselves are likely to originate from higher in the
+I/O stack, such as from the file systems.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Trace md flush events:
+#
+.B mdflush
+.SH FIELDS
+.TP
+TIME
+Time of the flush event (HH:MM:SS).
+.TP
+PID
+The process ID that was on-CPU when the event was issued. This may identify
+the cause of the flush (eg, the "sync" command), but will often identify a
+kernel worker thread that was managing I/O.
+.TP
+COMM
+The command name for the PID.
+.TP
+DEVICE
+The md device name.
+.SH OVERHEAD
+Expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8)
diff --git a/man/man8/memleak.8 b/man/man8/memleak.8
new file mode 100644
index 0000000..fa52c8c
--- /dev/null
+++ b/man/man8/memleak.8
@@ -0,0 +1,135 @@
+.TH memleak 8  "2016-01-14" "USER COMMANDS"
+.SH NAME
+memleak \- Print a summary of outstanding allocations and their call stacks to detect memory leaks. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B memleak [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND] [--combined-only]
+[-s SAMPLE_RATE] [-T TOP] [-z MIN_SIZE] [-Z MAX_SIZE] [-O OBJ] [INTERVAL]
+[COUNT]
+.SH DESCRIPTION
+memleak traces and matches memory allocation and deallocation requests, and
+collects call stacks for each allocation. memleak can then print a summary
+of which call stacks performed allocations that weren't subsequently freed.
+
+When tracing a specific process, memleak instruments a list of allocation
+functions from libc, specifically: malloc, calloc, realloc, posix_memalign,
+valloc, memalign, pvalloc, aligned_alloc, and free.
+When tracing all processes, memleak instruments kmalloc/kfree,
+kmem_cache_alloc/kmem_cache_free, and also page allocations made by
+get_free_pages/free_pages.
+
+memleak may introduce significant overhead when tracing processes that allocate
+and free many blocks very quickly. See the OVERHEAD section below.
+
+This tool only works on Linux 4.6+. Stack traces are obtained using the new BPF_STACK_TRACE APIs.
+For kernels older than 4.6, see the version under tools/old.
+Kernel memory allocations are intercepted through tracepoints, which are
+available on Linux 4.7+.
+
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel). This traces libc allocator.
+.TP
+\-t
+Print a trace of all allocation and free requests and results.
+.TP
+\-a
+Print a list of allocations that weren't freed (and their sizes) in addition to their call stacks.
+.TP
+\-o OLDER
+Print only allocations older than OLDER milliseconds. Useful to remove false positives.
+The default value is 500 milliseconds.
+.TP
+\-c COMMAND
+Run the specified command and trace its allocations only. This traces libc allocator.
+.TP
+\-\-combined-only
+Use statistics precalculated in kernel space. Amount of data to be pulled from
+kernel significantly decreases, at the cost of losing capabilities of time-based
+false positives filtering (\-o).
+.TP
+\-s SAMPLE_RATE
+Record roughly every SAMPLE_RATE-th allocation to reduce overhead.
+.TP
+\-T TOP
+Print only the top TOP stacks (sorted by size).
+The default value is 10.
+.TP
+\-z MIN_SIZE
+Capture only allocations that are larger than or equal to MIN_SIZE bytes.
+.TP
+\-Z MAX_SIZE
+Capture only allocations that are smaller than or equal to MAX_SIZE bytes.
+.TP
+\-O OBJ
+Attach to allocation functions in specified object instead of resolving libc. Ignored when kernel allocations are profiled.
+.TP
+INTERVAL
+Print a summary of outstanding allocations and their call stacks every INTERVAL seconds.
+The default interval is 5 seconds.
+.TP
+COUNT
+Print the outstanding allocations summary COUNT times and then exit.
+.SH EXAMPLES
+.TP
+Print outstanding kernel allocation stacks every 3 seconds:
+#
+.B memleak 3
+.TP
+Print user outstanding allocation stacks and allocation details for the process 1005:
+#
+.B memleak -p 1005 -a
+.TP
+Sample roughly every 5th allocation (~20%) of the call stacks and print the top 5
+stacks 10 times before quitting.
+#
+.B memleak -s 5 --top=5 10
+.TP
+Run ./allocs and print outstanding allocation stacks for that process:
+#
+.B memleak -c "./allocs"
+.TP
+Capture only allocations between 16 and 32 bytes in size:
+#
+.B memleak -z 16 -Z 32
+.SH OVERHEAD
+memleak can have significant overhead if the target process or kernel performs
+allocations at a very high rate. Pathological cases may exhibit up to 100x
+degradation in running time. Most of the time, however, memleak shouldn't cause
+a significant slowdown. You can use the \-s switch to reduce the overhead
+further by capturing only every N-th allocation. The \-z and \-Z switches can
+also reduce overhead by capturing only allocations of specific sizes.
+
+Additionally, option \-\-combined-only saves processing time by reusing already
+calculated allocation statistics from kernel. It's faster, but lacks information
+about particular allocations.
+
+To determine the rate at which your application is calling malloc/free, or the
+rate at which your kernel is calling kmalloc/kfree, place a probe with perf and
+collect statistics. For example, to determine how many calls to __kmalloc are
+placed in a typical period of 10 seconds:
+
+#
+.B perf probe '__kmalloc'
+
+#
+.B perf stat -a -e 'probe:__kmalloc' -- sleep 10
+
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
diff --git a/man/man8/mountsnoop.8 b/man/man8/mountsnoop.8
new file mode 100644
index 0000000..450301a
--- /dev/null
+++ b/man/man8/mountsnoop.8
@@ -0,0 +1,55 @@
+.TH mountsnoop 8  "2016-10-14" "USER COMMANDS"
+.SH NAME
+mountsnoop \- Trace mount() and umount() syscalls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B mountsnoop
+.SH DESCRIPTION
+mountsnoop traces the mount() and umount() syscalls, showing which processes
+are mounting and unmounting filesystems in what mount namespaces. This can be
+useful for troubleshooting system and container setup.
+
+This works by tracing the kernel sys_mount() and sys_umount() functions using
+dynamic tracing, and will need updating to match any changes to this function.
+
+This makes use of a Linux 4.4 feature (bpf_perf_event_output()).
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH FIELDS
+.TP
+COMM
+Process name
+.TP
+PID
+Process ID
+.TP
+TID
+Thread ID
+.TP
+MNT_NS
+Mount namespace inode number
+.TP
+CALL
+System call, arguments, and return value
+.SH OVERHEAD
+This traces the kernel mount and umount functions and prints output for each
+event. As the rate of these calls is generally expected to be very low, the
+overhead is also expected to be negligible. If your system calls mount() and
+umount() at a high rate, then test and understand overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Omar Sandoval
+.SH SEE ALSO
+mount(2)
+umount(2)
diff --git a/man/man8/mysqld_qslower.8 b/man/man8/mysqld_qslower.8
new file mode 100644
index 0000000..5753079
--- /dev/null
+++ b/man/man8/mysqld_qslower.8
@@ -0,0 +1,66 @@
+.TH mysqld_qslower 8  "2016-08-01" "USER COMMANDS"
+.SH NAME
+mysqld_qslower \- Trace MySQL server queries slower than a threshold.
+.SH SYNOPSIS
+.B mysqld_qslower PID [min_ms]
+.SH DESCRIPTION
+This traces queries served by a MySQL server, and prints those that exceed a
+custom latency (query duration) threshold. By default, a minimum threshold of 1
+millisecond is used. If a threshold of 0 is used, all queries are printed.
+
+This uses User Statically-Defined Tracing (USDT) probes, a feature added to
+MySQL for DTrace support, but which may not be enabled on a given MySQL
+installation. See requirements.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF, bcc, and MySQL server with USDT probe support (when configuring
+the build: \-DENABLE_DTRACE=1).
+.SH OPTIONS
+.TP
+PID
+Trace this mysqld PID.
+.TP
+min_ms
+Minimum query latency (duration) to trace, in milliseconds. Default is 1 ms.
+.SH EXAMPLES
+.TP
+Trace MySQL server queries slower than 1 ms for PID 1981:
+#
+.B mysqld_qslower 1981
+.TP
+Trace slower than 10 ms for PID 1981:
+#
+.B mysqld_qslower 1981 10
+.SH FIELDS
+.TP
+TIME(s)
+Time of query start, in seconds.
+.TP
+PID
+Process ID of the traced server.
+.TP
+MS
+Milliseconds for the query, from start to end.
+.TP
+QUERY
+Query string, truncated to 128 characters.
+.SH OVERHEAD
+This adds low-overhead instrumentation to MySQL queries, and only emits output
+data from kernel to user-level if the query exceeds the threshold. If the
+server query rate is less than 1,000/sec, the overhead is expected to be
+negligible. If the query rate is higher, test to gauge overhead.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8)
diff --git a/man/man8/nfsdist.8 b/man/man8/nfsdist.8
new file mode 100644
index 0000000..c72cfaa
--- /dev/null
+++ b/man/man8/nfsdist.8
@@ -0,0 +1,80 @@
+.TH nfsdist 8  "2017-09-08" "USER COMMANDS"
+.SH NAME
+nfsdist \- Summarize NFS operation latency. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B nfsdist [\-h] [\-T] [\-m] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This tool summarizes time (latency) spent in common NFS file operations: reads,
+writes, opens, and getattrs, and presents it as a power-of-2 histogram. It uses an
+in-kernel eBPF map to store the histogram for efficiency.
+
+Since this works by tracing the nfs_file_operations interface functions, it
+will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Don't include timestamps on interval output.
+.TP
+\-m
+Output in milliseconds.
+.TP
+\-p PID
+Trace this PID only.
+.SH EXAMPLES
+.TP
+Trace NFS operation time, and print a summary on Ctrl-C:
+#
+.B nfsdist
+.TP
+Trace PID 181 only:
+#
+.B nfsdist -p 181
+.TP
+Print 1 second summaries, 10 times:
+#
+.B nfsdist 1 10
+.TP
+1 second summaries, printed in milliseconds
+#
+.B nfsdist \-m 1
+.SH FIELDS
+.TP
+msecs
+Range of milliseconds for this bucket.
+.TP
+usecs
+Range of microseconds for this bucket.
+.TP
+count
+Number of operations in this time range.
+.TP
+distribution
+ASCII representation of the distribution (the count column).
+.SH OVERHEAD
+This adds low-overhead instrumentation to these NFS operations,
+including reads and writes from the file system cache. Such reads and writes
+can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool may become noticeable.
+Measure and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Samuel Nair
+.SH SEE ALSO
+nfsslower(8)
diff --git a/man/man8/nfsslower.8 b/man/man8/nfsslower.8
new file mode 100644
index 0000000..19eb635
--- /dev/null
+++ b/man/man8/nfsslower.8
@@ -0,0 +1,122 @@
+.TH nfsslower 8  "2017-09-01" "USER COMMANDS"
+.SH NAME
+nfsslower \- Trace slow NFS file operations, with per-event details.
+.SH SYNOPSIS
+.B nfsslower [\-h] [\-j] [\-p PID] [min_ms]
+.SH DESCRIPTION
+This tool traces common NFSv3 & NFSv4 file operations: reads, writes, opens, and
+getattrs. It measures the time spent in these operations, and prints details
+for each that exceeded a threshold.
+
+WARNING: See the OVERHEAD section.
+
+By default, a minimum millisecond threshold of 10 is used. If a threshold of 0
+is used, all events are printed (warning: verbose).
+
+Since this works by tracing the nfs_file_operations interface functions, it
+will need updating to match any changes to these functions.
+
+This tool uses kprobes to instrument the kernel for entry and exit
+information, in the future a preferred way would be to use tracepoints.
+Currently there aren't any tracepoints available for nfs_read_file,
+nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit
+tracepoints but we chose to use kprobes for consistency
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-p PID
+Trace this PID only.
+.TP
+\-j
+Trace output in CSV format.
+.TP
+min_ms
+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
+.SH EXAMPLES
+.TP
+Trace synchronous file reads and writes slower than 10 ms:
+#
+.B nfsslower
+.TP
+Trace slower than 1 ms:
+#
+.B nfsslower 1
+.TP
+Trace slower than 1 ms, and output just the fields in parsable format (CSV):
+#
+.B nfsslower \-j 1
+.TP
+Trace all file reads and writes (warning: the output will be verbose):
+#
+.B nfsslower 0
+.TP
+Trace slower than 1 ms, for PID 181 only:
+#
+.B nfsslower \-p 181 1
+.SH FIELDS
+.TP
+TIME(s)
+Time of I/O completion since the first I/O seen, in seconds.
+.TP
+COMM
+Process name.
+.TP
+PID
+Process ID.
+.TP
+T
+Type of operation. R == read, W == write, O == open, G == getattr.
+.TP
+OFF_KB
+File offset for the I/O, in Kbytes.
+.TP
+BYTES
+Size of I/O, in bytes.
+.TP
+LAT(ms)
+Latency (duration) of I/O, measured from when it was issued by VFS to the
+filesystem, to when it completed. This time is inclusive of RPC latency,
+network latency, cache lookup, remote fileserver processing latency, etc.
+It's a more accurate measure of the latency suffered by applications performing
+NFS read/write calls to a fileserver.
+.TP
+FILENAME
+A cached kernel file name (comes from dentry->d_iname).
+.TP
+ENDTIME_us
+Completion timestamp, microseconds (\-j only).
+.TP
+OFFSET_b
+File offset, bytes (\-j only).
+.TP
+LATENCY_us
+Latency (duration) of the I/O, in microseconds (\-j only).
+.SH OVERHEAD
+This adds low-overhead instrumentation to NFS operations,
+including reads and writes from the file system cache. Such read, writes and
+particularly getattrs can be very frequent (depending on the workload; eg, 1M/sec),
+at which point the overhead of this tool (even if it prints no "slower" events) can
+begin to become significant. Measure and quantify before use. If this
+continues to be a problem, consider switching to a tool that prints in-kernel
+summaries only. This tool has been tested with NFSv3 & NFSv4, but it might work
+with NFSv{1,2}, since it is tracing the generic functions from nfs_file_operations.
+.PP
+Note that the overhead of this tool should be less than fileslower(8), as
+this tool targets NFS functions only, and not all file read/write paths.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion nfsslower_examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Samuel Nair
+.SH SEE ALSO
+biosnoop(8), funccount(8), fileslower(8)
diff --git a/man/man8/nodegc.8 b/man/man8/nodegc.8
new file mode 120000
index 0000000..219d8a3
--- /dev/null
+++ b/man/man8/nodegc.8
@@ -0,0 +1 @@
+ugc.8
\ No newline at end of file
diff --git a/man/man8/nodestat.8 b/man/man8/nodestat.8
new file mode 120000
index 0000000..e3a3a29
--- /dev/null
+++ b/man/man8/nodestat.8
@@ -0,0 +1 @@
+ustat.8
\ No newline at end of file
diff --git a/man/man8/offcputime.8 b/man/man8/offcputime.8
new file mode 100644
index 0000000..440c1dd
--- /dev/null
+++ b/man/man8/offcputime.8
@@ -0,0 +1,121 @@
+.TH offcputime 8  "2016-01-14" "USER COMMANDS"
+.SH NAME
+offcputime \- Summarize off-CPU time by kernel stack trace. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B offcputime [\-h] [\-p PID | \-t TID | \-u | \-k] [\-U | \-K] [\-d] [\-f] [\-\-stack\-storage\-size STACK_STORAGE_SIZE] [\-m MIN_BLOCK_TIME] [\-M MAX_BLOCK_TIME] [\-\-state STATE] [duration]
+.SH DESCRIPTION
+This program shows stack traces and task names that were blocked and "off-CPU",
+and the total duration they were not running: their "off-CPU time".
+It works by tracing when threads block and when they return to CPU, measuring
+both the time they were off-CPU and the blocked stack trace and the task name.
+This data is summarized in the kernel using an eBPF map, and by summing the
+off-CPU time by unique stack trace and task name.
+
+The output summary will help you identify reasons why threads were blocking,
+and quantify the time they were off-CPU. This spans all types of blocking
+activity: disk I/O, network I/O, locks, page faults, involuntary context
+switches, etc.
+
+This is complementary to CPU profiling (e.g., CPU flame graphs) which shows
+the time spent on-CPU. This shows the time spent off-CPU, and the output,
+especially the -f format, can be used to generate an "off-CPU time flame graph".
+
+See http://www.brendangregg.com/FlameGraphs/offcpuflamegraphs.html
+
+This tool only works on Linux 4.6+. It uses the new `BPF_STACK_TRACE` table
+APIs to generate the in-kernel stack traces.
+For kernels older than 4.6, see the version under tools/old.
+
+Note: this tool only traces off-CPU times that began and ended while tracing.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-f
+Print output in folded stack format.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-t TID
+Trace this thread ID only (filtered in-kernel).
+.TP
+\-u
+Only trace user threads (no kernel threads).
+.TP
+\-k
+Only trace kernel threads (no user threads).
+.TP
+\-U
+Show stacks from user space only (no kernel space stacks).
+.TP
+\-K
+Show stacks from kernel space only (no user space stacks).
+.TP
+\-d
+Insert delimiter between kernel/user stacks.
+.TP
+\-f
+Output folded format.
+.TP
+\-\-stack-storage-size STACK_STORAGE_SIZE
+Change the number of unique stack traces that can be stored and displayed.
+.TP
+\-m MIN_BLOCK_TIME
+The minimum time in microseconds over which we store traces (default 1)
+.TP
+\-M MAX_BLOCK_TIME
+The maximum time in microseconds under which we store traces (default U64_MAX)
+.TP
+\-\-state
+Filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE).
+See include/linux/sched.h for states.
+.TP
+duration
+Duration to trace, in seconds.
+.SH EXAMPLES
+.TP
+Trace all thread blocking events, and summarize (in-kernel) by kernel stack trace and total off-CPU time:
+#
+.B offcputime
+.TP
+Trace for 5 seconds only:
+#
+.B offcputime 5
+.TP
+Trace for 5 seconds, and emit output in folded stack format (suitable for flame graphs):
+#
+.B offcputime -f 5
+.TP
+Trace PID 185 only:
+#
+.B offcputime -p 185
+.SH OVERHEAD
+This summarizes unique stack traces in-kernel for efficiency, allowing it to
+trace a higher rate of events than methods that post-process in user space. The
+stack trace and time data is only copied to user space once, when the output is
+printed. While these techniques greatly lower overhead, scheduler events are
+still a high frequency event, as they can exceed 1 million events per second,
+and so caution should still be used. Test before production use.
+
+If the overhead is still a problem, take a look at the MINBLOCK_US tunable in
+the code. If your aim is to chase down longer blocking events, then this could
+be increased to filter shorter blocking events, further lowering overhead.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+stackcount(8)
diff --git a/man/man8/offwaketime.8 b/man/man8/offwaketime.8
new file mode 100644
index 0000000..cdc49a7
--- /dev/null
+++ b/man/man8/offwaketime.8
@@ -0,0 +1,109 @@
+.TH offwaketime 8  "2016-01-30" "USER COMMANDS"
+.SH NAME
+offwaketime \- Summarize blocked time by off-CPU stack + waker stack. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B offwaketime [\-h] [\-p PID | \-t TID | \-u | \-k] [\-U | \-K] [\-f] [\-\-stack-storage-size STACK_STORAGE_SIZE] [\-m MIN_BLOCK_TIME] [\-M MAX_BLOCK_TIME] [duration]
+.SH DESCRIPTION
+This program shows kernel stack traces and task names that were blocked and
+"off-CPU", along with the stack traces and task names for the threads that woke
+them, and the total elapsed time from when they blocked to when they were woken
+up.  This combines the summaries from both the offcputime and wakeuptime tools.
+The time measurement will be very similar to off-CPU time, however, off-CPU time
+may include a little extra time spent waiting on a run queue to be scheduled.
+The combined stacks, task names, and total time is summarized in kernel context
+for efficiency, using an eBPF map.
+
+The output summary will further help you identify reasons why threads
+were blocking, and quantify the time from when they were blocked to woken up.
+This spans all types of blocking activity: disk I/O, network I/O, locks, page
+faults, swapping, sleeping, involuntary context switches, etc.
+
+This is complementary to CPU profiling (e.g., CPU flame graphs) which shows
+the time spent on-CPU. This shows the time spent blocked off-CPU, and the
+output, especially the -f format, can be used to generate an "off-wake time
+flame graph".
+
+See http://www.brendangregg.com/FlameGraphs/offcpuflamegraphs.html
+
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-f
+Print output in folded stack format.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-t TID
+Trace this thread ID only (filtered in-kernel).
+.TP
+\-u
+Only trace user threads (no kernel threads).
+.TP
+\-k
+Only trace kernel threads (no user threads).
+.TP
+\-U
+Show stacks from user space only (no kernel space stacks).
+.TP
+\-K
+Show stacks from kernel space only (no user space stacks).
+.TP
+\-\-stack-storage-size STACK_STORAGE_SIZE
+Change the number of unique stack traces that can be stored and displayed.
+.TP
+duration
+Duration to trace, in seconds.
+.TP
+\-m MIN_BLOCK_TIME
+The amount of time in microseconds over which we store traces (default 1)
+.TP
+\-M MAX_BLOCK_TIME
+The amount of time in microseconds under which we store traces (default U64_MAX)
+.SH EXAMPLES
+.TP
+Trace all thread blocking events, and summarize (in-kernel) by user and kernel off-CPU stack trace, waker stack traces, task names, and total blocked time:
+#
+.B offwaketime
+.TP
+Trace for 5 seconds only:
+#
+.B offwaketime 5
+.TP
+Trace for 5 seconds, and emit output in folded stack format (suitable for flame graphs), user-mode threads only:
+#
+.B offwaketime -fu 5
+.TP
+Trace PID 185 only:
+#
+.B offwaketime -p 185
+.SH OVERHEAD
+This summarizes unique stack trace pairs in-kernel for efficiency, allowing it
+to trace a higher rate of events than methods that post-process in user space.
+The stack trace and time data is only copied to user space once, when the output
+is printed. While these techniques greatly lower overhead, scheduler events are
+still a high frequency event, as they can exceed 1 million events per second,
+and so caution should still be used. Test before production use.
+
+If the overhead is still a problem, take a look at the min block option.
+If your aim is to chase down longer blocking events, then this could
+be increased to filter shorter blocking events, further lowering overhead.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+offcputime(8), wakeuptime(8)
diff --git a/man/man8/oomkill.8 b/man/man8/oomkill.8
new file mode 100644
index 0000000..b6d6f9e
--- /dev/null
+++ b/man/man8/oomkill.8
@@ -0,0 +1,55 @@
+.TH oomkill 8  "2016-02-09" "USER COMMANDS"
+.SH NAME
+oomkill \- Trace oom_kill_process(). Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B oomkill
+.SH DESCRIPTION
+This traces the kernel out-of-memory killer, and prints basic details,
+including the system load averages at the time of the OOM kill. This can
+provide more context on the system state at the time: was it getting busier
+or steady, based on the load averages? This tool may also be useful to
+customize for investigations; for example, by adding other task_struct
+details at the time of OOM.
+
+This program is also a basic example of eBPF/bcc.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Trace OOM kill events:
+#
+.B oomkill
+.SH FIELDS
+.TP
+Triggered by ...
+The process ID and process name of the task that was running when another task was OOM
+killed.
+.TP
+OOM kill of ...
+The process ID and name of the target process that was OOM killed.
+.TP
+loadavg
+Contents of /proc/loadavg. The first three numbers are 1, 5, and 15 minute
+load averages (where the average is an exponentially damped moving sum, and
+those numbers are constants in the equation); then there is the number of
+running tasks, a slash, and the total number of tasks; and then the last number
+is the last PID to be created.
+.SH OVERHEAD
+Negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+memleak(8)
diff --git a/man/man8/opensnoop.8 b/man/man8/opensnoop.8
new file mode 100644
index 0000000..f7b74c1
--- /dev/null
+++ b/man/man8/opensnoop.8
@@ -0,0 +1,110 @@
+.TH opensnoop 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+opensnoop \- Trace open() syscalls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B opensnoop [\-h] [\-T] [\-x] [\-p PID] [\-t TID] [\-d DURATION] [\-n name]
+.SH DESCRIPTION
+opensnoop traces the open() syscall, showing which processes are attempting
+to open which files. This can be useful for determining the location of config
+and log files, or for troubleshooting applications that are failing, especially
+on startup.
+
+This works by tracing the kernel sys_open() function using dynamic tracing, and
+will need updating to match any changes to this function.
+
+This makes use of a Linux 4.5 feature (bpf_perf_event_output());
+for kernels older than 4.5, see the version under tools/old,
+which uses an older mechanism.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include a timestamp column.
+.TP
+\-x
+Only print failed opens.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-t TID
+Trace this thread ID only (filtered in-kernel).
+.TP
+\-d DURATION
+Total duration of trace in seconds.
+.TP
+\-n name
+Only print processes where its name partially matches 'name'
+.SH EXAMPLES
+.TP
+Trace all open() syscalls:
+#
+.B opensnoop
+.TP
+Trace all open() syscalls, for 10 seconds only:
+#
+.B opensnoop -d 10
+.TP
+Trace all open() syscalls, and include timestamps:
+#
+.B opensnoop \-T
+.TP
+Trace only open() syscalls that failed:
+#
+.B opensnoop \-x
+.TP
+Trace PID 181 only:
+#
+.B opensnoop \-p 181
+.TP
+Trace all open() syscalls from processes where its name partially matches 'ed':
+#
+.B opensnoop \-n ed
+.SH FIELDS
+.TP
+TIME(s)
+Time of the call, in seconds.
+.TP
+PID
+Process ID
+.TP
+TID
+Thread ID
+.TP
+COMM
+Process name
+.TP
+FD
+File descriptor (if success), or -1 (if failed)
+.TP
+ERR
+Error number (see the system's errno.h)
+.TP
+PATH
+Open path
+.SH OVERHEAD
+This traces the kernel open function and prints output for each event. As the
+rate of this is generally expected to be low (< 1000/s), the overhead is also
+expected to be negligible. If you have an application that is calling a high
+rate of open()s, then test and understand overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+funccount(1)
diff --git a/man/man8/perlcalls.8 b/man/man8/perlcalls.8
new file mode 120000
index 0000000..a9a6054
--- /dev/null
+++ b/man/man8/perlcalls.8
@@ -0,0 +1 @@
+ucalls.8
\ No newline at end of file
diff --git a/man/man8/perlflow.8 b/man/man8/perlflow.8
new file mode 120000
index 0000000..84ccb94
--- /dev/null
+++ b/man/man8/perlflow.8
@@ -0,0 +1 @@
+uflow.8
\ No newline at end of file
diff --git a/man/man8/perlstat.8 b/man/man8/perlstat.8
new file mode 120000
index 0000000..e3a3a29
--- /dev/null
+++ b/man/man8/perlstat.8
@@ -0,0 +1 @@
+ustat.8
\ No newline at end of file
diff --git a/man/man8/phpcalls.8 b/man/man8/phpcalls.8
new file mode 120000
index 0000000..a9a6054
--- /dev/null
+++ b/man/man8/phpcalls.8
@@ -0,0 +1 @@
+ucalls.8
\ No newline at end of file
diff --git a/man/man8/phpflow.8 b/man/man8/phpflow.8
new file mode 120000
index 0000000..84ccb94
--- /dev/null
+++ b/man/man8/phpflow.8
@@ -0,0 +1 @@
+uflow.8
\ No newline at end of file
diff --git a/man/man8/phpstat.8 b/man/man8/phpstat.8
new file mode 120000
index 0000000..e3a3a29
--- /dev/null
+++ b/man/man8/phpstat.8
@@ -0,0 +1 @@
+ustat.8
\ No newline at end of file
diff --git a/man/man8/pidpersec.8 b/man/man8/pidpersec.8
new file mode 100644
index 0000000..2164ffa
--- /dev/null
+++ b/man/man8/pidpersec.8
@@ -0,0 +1,41 @@
+.TH pidpersec 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+pidpersec \- Count new processes (via fork()). Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B pidpersec
+.SH DESCRIPTION
+pidpersec shows how many new processes were created each second. There
+can be performance issues caused by many short-lived processes, which may not
+be visible in sampling tools like top(1). pidpersec provides one way to
+investigate this behavior.
+
+This works by tracing the kernel sched_fork() function using dynamic tracing,
+and will need updating to match any changes to this function.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Count new processes created each second:
+#
+.B pidpersec
+.SH OVERHEAD
+This traces the kernel fork function, and maintains an in-kernel count which is
+read asynchronously from user-space. As the rate of this is generally expected to
+be low (<< 1000/s), the overhead is also expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+top(1)
diff --git a/man/man8/profile.8 b/man/man8/profile.8
new file mode 100644
index 0000000..abdd6e3
--- /dev/null
+++ b/man/man8/profile.8
@@ -0,0 +1,140 @@
+.TH profile 8  "2016-07-17" "USER COMMANDS"
+.SH NAME
+profile \- Profile CPU usage by sampling stack traces. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B profile [\-adfh] [\-p PID] [\-U | \-K] [\-F FREQUENCY | \-c COUNT]
+.B [\-\-stack\-storage\-size COUNT] [duration]
+.SH DESCRIPTION
+This is a CPU profiler. It works by taking samples of stack traces at timed
+intervals. It will help you understand and quantify CPU usage: which code is
+executing, and by how much, including both user-level and kernel code.
+
+By default this samples at 49 Hertz (samples per second), across all CPUs.
+This frequency can be tuned using a command line option. The reason for 49, and
+not 50, is to avoid lock-step sampling.
+
+This is also an efficient profiler, as stack traces are frequency counted in
+kernel context, rather than passing each stack to user space for frequency
+counting there. Only the unique stacks and counts are passed to user space
+at the end of the profile, greatly reducing the kernel<->user transfer.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+
+This also requires Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). See tools/old
+for an older version that may work on Linux 4.6 - 4.8.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel). Without this, all CPUs are
+profiled.
+.TP
+\-F frequency
+Frequency to sample stacks.
+.TP
+\-c count
+Sample stacks every one in this many events.
+.TP
+\-f
+Print output in folded stack format.
+.TP
+\-d
+Include an output delimiter between kernel and user stacks (either "--", or,
+in folded mode, "-").
+.TP
+\-U
+Show stacks from user space only (no kernel space stacks).
+.TP
+\-K
+Show stacks from kernel space only (no user space stacks).
+.TP
+\-\-stack-storage-size COUNT
+The maximum number of unique stack traces that the kernel will count (default
+16384). If the sampled count exceeds this, a warning will be printed.
+.TP
+\-C cpu
+Collect stacks only from specified cpu.
+.TP
+duration
+Duration to trace, in seconds.
+.SH EXAMPLES
+.TP
+Profile (sample) stack traces system-wide at 49 Hertz (samples per second) until Ctrl-C:
+#
+.B profile
+.TP
+Profile for 5 seconds only:
+#
+.B profile 5
+.TP
+Profile at 99 Hertz for 5 seconds only:
+#
+.B profile -F 99 5
+.TP
+Profile 1 in a million events for 5 seconds only:
+#
+.B profile -c 1000000 5
+.TP
+Profile PID 181 only:
+#
+.B profile -p 181
+.TP
+Profile for 5 seconds and output in folded stack format (suitable as input for flame graphs), including a delimiter between kernel and user stacks:
+#
+.B profile -df 5
+.TP
+Profile kernel stacks only:
+#
+.B profile -K
+.SH DEBUGGING
+See "[unknown]" frames with bogus addresses? This can happen for different
+reasons. Your best approach is to get Linux perf to work first, and then to
+try this tool. Eg, "perf record \-F 49 \-a \-g \-\- sleep 1; perf script", and
+to check for unknown frames there.
+
+The most common reason for "[unknown]" frames is that the target software has
+not been compiled
+with frame pointers, and so we can't use that simple method for walking the
+stack. The fix in that case is to use software that does have frame pointers,
+eg, gcc -fno-omit-frame-pointer, or Java's -XX:+PreserveFramePointer.
+
+Another reason for "[unknown]" frames is JIT compilers, which don't use a
+traditional symbol table. The fix in that case is to populate a
+/tmp/perf-PID.map file with the symbols, which this tool should read. How you
+do this depends on the runtime (Java, Node.js).
+
+If you seem to have unrelated samples in the output, check for other
+sampling or tracing tools that may be running. The current version of this
+tool can include their events if profiling happened concurrently. Those
+samples may be filtered in a future version.
+.SH OVERHEAD
+This is an efficient profiler, as stack traces are frequency counted in
+kernel context, and only the unique stacks and their counts are passed to
+user space. Contrast this with the current "perf record -F 99 -a" method
+of profiling, which writes each sample to user space (via a ring buffer),
+and then to the file system (perf.data), which must be post-processed.
+
+This uses perf_event_open to setup a timer which is instrumented by BPF,
+and for efficiency it does not initialize the perf ring buffer, so the
+redundant perf samples are not collected.
+
+It's expected that the overhead while sampling at 49 Hertz (the default),
+across all CPUs, should be negligible. If you increase the sample rate, the
+overhead might begin to be measurable.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+offcputime(8)
diff --git a/man/man8/pythoncalls.8 b/man/man8/pythoncalls.8
new file mode 120000
index 0000000..a9a6054
--- /dev/null
+++ b/man/man8/pythoncalls.8
@@ -0,0 +1 @@
+ucalls.8
\ No newline at end of file
diff --git a/man/man8/pythonflow.8 b/man/man8/pythonflow.8
new file mode 120000
index 0000000..84ccb94
--- /dev/null
+++ b/man/man8/pythonflow.8
@@ -0,0 +1 @@
+uflow.8
\ No newline at end of file
diff --git a/man/man8/pythongc.8 b/man/man8/pythongc.8
new file mode 120000
index 0000000..219d8a3
--- /dev/null
+++ b/man/man8/pythongc.8
@@ -0,0 +1 @@
+ugc.8
\ No newline at end of file
diff --git a/man/man8/pythonstat.8 b/man/man8/pythonstat.8
new file mode 120000
index 0000000..e3a3a29
--- /dev/null
+++ b/man/man8/pythonstat.8
@@ -0,0 +1 @@
+ustat.8
\ No newline at end of file
diff --git a/man/man8/reset-trace.8 b/man/man8/reset-trace.8
new file mode 100644
index 0000000..e96d479
--- /dev/null
+++ b/man/man8/reset-trace.8
@@ -0,0 +1,59 @@
+.TH reset-trace 8  "2016-10-18" "USER COMMANDS"
+.SH NAME
+reset-trace \- reset the state of tracing.
+.SH SYNOPSIS
+.B reset-trace [\-F] [\-h] [\-q] [\-v]
+.SH DESCRIPTION
+You will probably never need this tool. If you kill \-9 a bcc tool (plus other
+signals, like SIGTERM), or if a bcc tool crashes, then kernel tracing can be
+left in a semi-enabled state. It's not as bad as it sounds: there may just be
+overhead for writing to ring buffers that are never read. This tool can be
+used to clean up the tracing state, and reset and disable active tracing.
+
+Make sure no other tracing sessions are active. This tool might stop them from
+functioning (perhaps ungracefully).
+
+This specifically clears the state in at least the following files in
+/sys/kernel/debug/tracing: kprobe_events, uprobe_events, trace_pipe.
+Other tracing facilities (ftrace) are checked, and if not in an expected state,
+a note is printed. All tracing files can be reset with \-F for force, but this
+will interfere with any other running tracing sessions (eg, ftrace).
+.SH REQUIREMENTS
+/sys/kernel/debug mounted as debugfs
+.SH OPTIONS
+.TP
+\-F
+Force. Will reset all tracing facilities, including those not used by bcc
+(ftrace). You shouldn't need to use this.
+.TP
+\-h
+USAGE message.
+.TP
+\-q
+Quiet. No output while working.
+.TP
+\-v
+Verbose: print what it is doing.
+.SH EXAMPLES
+.TP
+Reset the state of tracing:
+#
+.B reset-trace
+.TP
+Verbose:
+#
+.B reset-trace \-v
+.TP
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
diff --git a/man/man8/rubycalls.8 b/man/man8/rubycalls.8
new file mode 120000
index 0000000..a9a6054
--- /dev/null
+++ b/man/man8/rubycalls.8
@@ -0,0 +1 @@
+ucalls.8
\ No newline at end of file
diff --git a/man/man8/rubyflow.8 b/man/man8/rubyflow.8
new file mode 120000
index 0000000..84ccb94
--- /dev/null
+++ b/man/man8/rubyflow.8
@@ -0,0 +1 @@
+uflow.8
\ No newline at end of file
diff --git a/man/man8/rubygc.8 b/man/man8/rubygc.8
new file mode 120000
index 0000000..219d8a3
--- /dev/null
+++ b/man/man8/rubygc.8
@@ -0,0 +1 @@
+ugc.8
\ No newline at end of file
diff --git a/man/man8/rubyobjnew.8 b/man/man8/rubyobjnew.8
new file mode 120000
index 0000000..b384265
--- /dev/null
+++ b/man/man8/rubyobjnew.8
@@ -0,0 +1 @@
+uobjnew.8
\ No newline at end of file
diff --git a/man/man8/rubystat.8 b/man/man8/rubystat.8
new file mode 120000
index 0000000..e3a3a29
--- /dev/null
+++ b/man/man8/rubystat.8
@@ -0,0 +1 @@
+ustat.8
\ No newline at end of file
diff --git a/man/man8/runqlat.8 b/man/man8/runqlat.8
new file mode 100644
index 0000000..d535ebb
--- /dev/null
+++ b/man/man8/runqlat.8
@@ -0,0 +1,113 @@
+.TH runqlat 8  "2016-02-07" "USER COMMANDS"
+.SH NAME
+runqlat \- Run queue (scheduler) latency as a histogram.
+.SH SYNOPSIS
+.B runqlat [\-h] [\-T] [\-m] [\-P] [\-\-pidnss] [\-L] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This measures the time a task spends waiting on a run queue (or equivalent
+scheduler data structure) for a turn on-CPU, and shows this time as a
+histogram. This time should be small, but a task may need to wait its turn due
+to CPU load. The higher the CPU load, the longer a task will generally need to
+wait its turn.
+
+This tool measures two types of run queue latency:
+
+1. The time from a task being enqueued on a run queue to its context switch
+and execution. This traces ttwu_do_wakeup(), wake_up_new_task() ->
+finish_task_switch() with either raw tracepoints (if supported) or kprobes
+and instruments the run queue latency after a voluntary context switch.
+
+2. The time from when a task was involuntarily context switched and still
+in the runnable state, to when it next executed. This is instrumented
+from finish_task_switch() alone.
+
+This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
+for efficiency. Despite this, the overhead of this tool may become significant
+for some workloads: see the OVERHEAD section.
+
+This works by tracing various kernel scheduler functions using dynamic tracing,
+and will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-m
+Output histogram in milliseconds.
+.TP
+\-P
+Print a histogram for each PID.
+.TP
+\-\-pidnss
+Print a histogram for each PID namespace (short for PID namespaces). For
+container analysis.
+.TP
+\-L
+Print a histogram for each thread ID.
+.TP
+\-p PID
+Only show this PID (filtered in kernel for efficiency).
+.TP
+interval
+Output interval, in seconds.
+.TP
+count
+Number of outputs.
+.SH EXAMPLES
+.TP
+Summarize run queue latency as a histogram:
+#
+.B runqlat
+.TP
+Print 1 second summaries, 10 times:
+#
+.B runqlat 1 10
+.TP
+Print 1 second summaries, using milliseconds as units for the histogram, and include timestamps on output:
+#
+.B runqlat \-mT 1
+.TP
+Trace PID 186 only, 1 second summaries:
+#
+.B runqlat -P 185 1
+.SH FIELDS
+.TP
+usecs
+Microsecond range
+.TP
+msecs
+Millisecond range
+.TP
+count
+How many times a task event fell into this range
+.TP
+distribution
+An ASCII bar chart to visualize the distribution (count column)
+.SH OVERHEAD
+This traces scheduler functions, which can become very frequent. While eBPF
+has very low overhead, and this tool uses in-kernel maps for efficiency, the
+frequency of scheduler events for some workloads may be high enough that the
+overhead of this tool becomes significant. Measure in a lab environment
+to quantify the overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+runqlen(8), runqslower(8), pidstat(1)
diff --git a/man/man8/runqlen.8 b/man/man8/runqlen.8
new file mode 100644
index 0000000..27a649d
--- /dev/null
+++ b/man/man8/runqlen.8
@@ -0,0 +1,86 @@
+.TH runqlen 8  "2016-12-12" "USER COMMANDS"
+.SH NAME
+runqlen \- Scheduler run queue length as a histogram.
+.SH SYNOPSIS
+.B runqlen [\-h] [\-T] [\-O] [\-C] [interval] [count]
+.SH DESCRIPTION
+This program summarizes scheduler queue length as a histogram, and can also
+show run queue occupancy. It works by sampling the run queue length on all
+CPUs at 99 Hertz.
+
+This tool can be used to identify imbalances, eg, when processes are bound
+to CPUs causing queueing, or interrupt mappings causing the same.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-O
+Report run queue occupancy.
+.TP
+\-C
+Report for each CPU.
+.TP
+interval
+Output interval, in seconds.
+.TP
+count
+Number of outputs.
+.SH EXAMPLES
+.TP
+Summarize run queue length as a histogram:
+#
+.B runqlen
+.TP
+Print 1 second summaries, 10 times:
+#
+.B runqlen 1 10
+.TP
+Print output every second, with timestamps, and show each CPU separately:
+#
+.B runqlen \-CT 1
+.TP
+Print run queue occupancy every second:
+#
+.B runqlen \-O 1
+.TP
+Print run queue occupancy, with timestamps, for each CPU:
+#
+.B runqlen \-COT 1
+.SH FIELDS
+.TP
+runqlen
+Scheduler run queue length: the number of threads (tasks) waiting to run,
+(excluding the currently running task).
+.TP
+count
+Number of samples at this queue length.
+.TP
+distribution
+An ASCII bar chart to visualize the distribution (count column)
+.SH OVERHEAD
+This uses sampling at 99 Hertz (on all CPUs), and in-kernel summaries, which
+should make overhead negligible. This does not trace scheduler events, like
+runqlat does, which comes at a much higher overhead cost.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+runqlat(8), runqslower(8), pidstat(1)
diff --git a/man/man8/runqslower.8 b/man/man8/runqslower.8
new file mode 100644
index 0000000..0baee64
--- /dev/null
+++ b/man/man8/runqslower.8
@@ -0,0 +1,86 @@
+.TH runqslower 8  "2016-02-07" "USER COMMANDS"
+.SH NAME
+runqslower \- Trace long process scheduling delays.
+.SH SYNOPSIS
+.B runqslower [\-p PID] [min_us]
+.SH DESCRIPTION
+This measures the time a task spends waiting on a run queue (or equivalent
+scheduler data structure) for a turn on-CPU, and shows occurrences of time
+exceeding passed threshold. This time should be small, but a task may need
+to wait its turn due to CPU load. The higher the CPU load, the longer a task
+will generally need to wait its turn.
+
+This tool measures two types of run queue latency:
+
+1. The time from a task being enqueued on a run queue to its context switch
+and execution. This traces ttwu_do_wakeup(), wake_up_new_task() ->
+finish_task_switch() with either raw tracepoints (if supported) or kprobes
+and instruments the run queue latency after a voluntary context switch.
+
+2. The time from when a task was involuntarily context switched and still
+in the runnable state, to when it next executed. This is instrumented
+from finish_task_switch() alone.
+
+The overhead of this tool may become significant for some workloads:
+see the OVERHEAD section.
+
+This works by tracing various kernel scheduler functions using dynamic tracing,
+and will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Only show this PID (filtered in kernel for efficiency).
+.TP
+min_us
+Minimum scheduling delay in microseconds to output.
+.SH EXAMPLES
+.TP
+Show scheduling delays longer than 10ms:
+#
+.B runqslower
+.TP
+Show scheduling delays longer than 1ms for process with PID 123:
+#
+.B runqslower -p 123 1000
+.SH FIELDS
+.TP
+TIME
+Time of when scheduling event occurred.
+.TP
+COMM
+Process name.
+.TP
+PID
+Process ID.
+.TP
+LAT(us)
+Scheduling latency from time when task was ready to run to the time it was
+assigned to a CPU to run.
+.SH OVERHEAD
+This traces scheduler functions, which can become very frequent. While eBPF
+has very low overhead, and this tool uses in-kernel maps for efficiency, the
+frequency of scheduler events for some workloads may be high enough that the
+overhead of this tool becomes significant. Measure in a lab environment
+to quantify the overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Ivan Babrou
+.SH SEE ALSO
+runqlen(8), runqlat(8), pidstat(1)
diff --git a/man/man8/slabratetop.8 b/man/man8/slabratetop.8
new file mode 100644
index 0000000..bfe897f
--- /dev/null
+++ b/man/man8/slabratetop.8
@@ -0,0 +1,76 @@
+.TH slabratetop 8  "2016-10-17" "USER COMMANDS"
+.SH NAME
+slabratetop \- Kernel SLAB/SLUB memory cache allocation rate top.
+Uses Linux BPF/bcc.
+.SH SYNOPSIS
+.B slabratetop [\-h] [\-C] [\-r MAXROWS] [interval] [count]
+.SH DESCRIPTION
+This is top for the rate of kernel SLAB/SLUB memory allocations.
+It works by tracing kmem_cache_alloc() calls, a commonly used interface for
+kernel memory allocation (SLAB or SLUB). It summarizes the rate and total bytes
+allocated of these calls per interval: the activity. Compare this to
+slabtop(1), which shows the current static volume of the caches.
+
+This tool uses kernel dynamic tracing of the kmem_cache_alloc() function.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-C
+Don't clear the screen.
+.TP
+\-r MAXROWS
+Maximum number of rows to print. Default is 20.
+.TP
+interval
+Interval between updates, seconds.
+.TP
+count
+Number of interval summaries.
+.SH EXAMPLES
+.TP
+Summarize active kernel SLAB/SLUB calls (kmem_cache_alloc()), showing the top 20 caches every second:
+#
+.B slabratetop
+.TP
+Don't clear the screen, and top 8 rows only:
+#
+.B slabratetop -Cr 8
+.TP
+5 second summaries, 10 times only:
+#
+.B slabratetop 5 10
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+CACHE
+Kernel cache name.
+.TP
+ALLOCS
+Allocations (number of calls).
+.TP
+BYTES
+Total bytes allocated.
+.SH OVERHEAD
+If kmem_cache_alloc() is called at a high rate (eg, >100k/second) the overhead
+of this tool might begin to be measurable. The rate can be seen in the ALLOCS
+column of the output.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+slabtop(1)
diff --git a/man/man8/softirqs.8 b/man/man8/softirqs.8
new file mode 100644
index 0000000..a9a1441
--- /dev/null
+++ b/man/man8/softirqs.8
@@ -0,0 +1,93 @@
+.TH softirqs 8  "2015-10-20" "USER COMMANDS"
+.SH NAME
+softirqs \- Measure soft IRQ (soft interrupt) event time. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B softirqs [\-h] [\-T] [\-N] [\-d] [interval] [count]
+.SH DESCRIPTION
+This summarizes the time spent servicing soft IRQs (soft interrupts), and can
+show this time as either totals or histogram distributions. A system-wide
+summary of this time is shown by the %soft column of mpstat(1), and soft IRQ
+event counts (but not times) are available in /proc/softirqs.
+
+This tool uses the irq:softirq_enter and irq:softirq_exit kernel tracepoints,
+which is a stable tracing mechanism. BPF programs can attach to tracepoints
+from Linux 4.7 only. An older version of this tool is available in tools/old,
+and uses kprobes instead of tracepoints.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include timestamps on output.
+.TP
+\-N
+Output in nanoseconds
+.TP
+\-d
+Show IRQ time distribution as histograms
+.SH EXAMPLES
+.TP
+Sum soft IRQ event time until Ctrl-C:
+#
+.B softirqs
+.TP
+Show soft IRQ event time as histograms:
+#
+.B softirqs \-d
+.TP
+Print 1 second summaries, 10 times:
+#
+.B softirqs 1 10
+.TP
+1 second summaries, printed in nanoseconds, with timestamps:
+#
+.B softirqs \-NT 1
+.SH FIELDS
+.TP
+SOFTIRQ
+The kernel function name that performs the soft IRQ action.
+.TP
+TOTAL_usecs
+Total time spent in this soft IRQ function in microseconds.
+.TP
+TOTAL_nsecs
+Total time spent in this soft IRQ function in nanoseconds.
+.TP
+usecs
+Range of microseconds for this bucket.
+.TP
+nsecs
+Range of nanoseconds for this bucket.
+.TP
+count
+Number of soft IRQs in this time range.
+.TP
+distribution
+ASCII representation of the distribution (the count column).
+.SH OVERHEAD
+This traces kernel functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of interrupts
+can be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads, but
+could become noticeable for heavy workloads. Measure in a test environment
+before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHORS
+Brendan Gregg, Sasha Goldshtein
+.SH SEE ALSO
+hardirqs(8)
diff --git a/man/man8/sslsniff.8 b/man/man8/sslsniff.8
new file mode 100644
index 0000000..72836e2
--- /dev/null
+++ b/man/man8/sslsniff.8
@@ -0,0 +1,51 @@
+.TH sslsniff 8  "2016-08-16" "USER COMMANDS"
+.SH NAME
+sslsniff \- Print data passed to OpenSSL, GnuTLS or NSS. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B sslsniff [-h] [-p PID] [-c COMM] [-o] [-g] [-n] [-d]
+.SH DESCRIPTION
+sslsniff prints data sent to write/send and read/recv functions of
+OpenSSL, GnuTLS and NSS, allowing us to read plain text content before
+encryption (when writing) and after decryption (when reading).
+
+This works by reading the second parameter of both functions (*buf).
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Print all calls to SSL write/send and read/recv system-wide:
+#
+.B sslsniff
+.SH FIELDS
+.TP
+FUNC
+Which function is being called (write/send or read/recv)
+.TP
+TIME
+Time of the command, in seconds.
+.TP
+COMM
+Entered command.
+.TP
+PID
+Process ID calling SSL.
+.TP
+LEN
+Bytes written or read by SSL functions.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHORS
+Adrian Lopez and Mark Drayton
+.SH SEE ALSO
+trace(8)
diff --git a/man/man8/stackcount.8 b/man/man8/stackcount.8
new file mode 100644
index 0000000..d6ab993
--- /dev/null
+++ b/man/man8/stackcount.8
@@ -0,0 +1,142 @@
+.TH stackcount 8  "2016-01-14" "USER COMMANDS"
+.SH NAME
+stackcount \- Count function calls and their stack traces. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-D DURATION] [\-T] [\-r] [\-s]
+              [\-P] [\-K] [\-U] [\-v] [\-d] [\-f] pattern
+.SH DESCRIPTION
+stackcount traces functions and frequency counts them with their entire
+stack trace, kernel stack and user stack, summarized in-kernel for efficiency.
+This allows higher frequency events to be studied. The output consists of
+unique stack traces, and their occurrence counts. In addition to kernel and
+user functions, kernel tracepoints and USDT tracepoints are also supported.
+
+The pattern is a string with optional '*' wildcards, similar to file globbing.
+If you'd prefer to use regular expressions, use the \-r option.
+
+This tool only works on Linux 4.6+. Stack traces are obtained using the new `BPF_STACK_TRACE` APIs.
+For kernels older than 4.6, see the version under tools/old.
+
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-r
+Allow regular expressions for the search pattern. The default allows "*"
+wildcards only.
+.TP
+\-s
+Show address offsets.
+.TP
+\-T
+Include a timestamp with interval output.
+.TP
+\-v
+Show raw addresses.
+.TP
+\-d
+Print a delimiter ("--") in-between the kernel and user stacks.
+.TP
+\-\-debug
+Print the source of the BPF program when loading it (for debugging purposes).
+.TP
+\-i interval
+Summary interval, in seconds.
+.TP
+\-D duration
+Total duration of trace, in seconds.
+\-f
+Folded output format.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+.TP
+pattern
+A function name, or a search pattern. Can include wildcards ("*"). If the
+\-r option is used, can include regular expressions.
+.SH EXAMPLES
+.TP
+Count kernel and user stack traces for submit_bio():
+#
+.B stackcount submit_bio
+.TP
+Count stacks with a delimiter for submit_bio():
+#
+.B stackcount \-d submit_bio
+.TP
+Count kernel stack trace only for submit_bio():
+#
+.B stackcount \-K submit_bio
+.TP
+Count user stack trace only for submit_bio():
+#
+.B stackcount \-U submit_bio
+.TP
+Count stack traces for ip_output():
+#
+.B stackcount ip_output
+.TP
+Show symbol offsets:
+#
+.B stackcount \-s ip_output
+.TP
+Show offsets and raw addresses (verbose):
+#
+.B stackcount \-sv ip_output
+.TP
+Count stacks for kernel functions matching tcp_send*:
+#
+.B stackcount 'tcp_send*'
+.TP
+Same as previous, but using regular expressions:
+#
+.B stackcount \-r '^tcp_send.*'
+.TP
+Output every 5 seconds, with timestamps:
+#
+.B stackcount \-Ti 5 ip_output
+.TP
+Only count stacks when PID 185 is on-CPU:
+#
+.B stackcount \-p 185 ip_output
+.TP
+Count user stacks for dynamic heap allocations with malloc in PID 185:
+#
+.B stackcount \-p 185 c:malloc
+.TP
+Count user stacks for thread creation (USDT tracepoint) in PID 185:
+#
+.B stackcount \-p 185 u:pthread:pthread_create
+.TP
+Count stacks for context switch events using a kernel tracepoint:
+#
+.B stackcount t:sched:sched_switch
+.SH OVERHEAD
+This summarizes unique stack traces in-kernel for efficiency, allowing it to
+trace a higher rate of function calls than methods that post-process in user
+space. The stack trace data is only copied to user space when the output is
+printed, which usually only happens once. The stack walking also happens in an
+optimized code path in the kernel thanks to the new BPF_STACK_TRACE table APIs,
+which should be more efficient than the manual walker in the eBPF tracer which
+older versions of this script used. With this in mind, call rates of <
+10,000/sec would incur negligible overhead. Test before production use. You can
+also use funccount to get a handle on function call rates first.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg, Sasha Goldshtein
+.SH SEE ALSO
+stacksnoop(8), funccount(8)
diff --git a/man/man8/statsnoop.8 b/man/man8/statsnoop.8
new file mode 100644
index 0000000..00921d6
--- /dev/null
+++ b/man/man8/statsnoop.8
@@ -0,0 +1,91 @@
+.TH statsnoop 8  "2016-02-08" "USER COMMANDS"
+.SH NAME
+statsnoop \- Trace stat() syscalls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B statsnoop [\-h] [\-t] [\-x] [\-p PID]
+.SH DESCRIPTION
+statsnoop traces the different stat() syscalls, showing which processes are
+attempting to read information about which files. This can be useful for
+determining the location of config and log files, or for troubleshooting
+applications that are failing, especially on startup.
+
+This works by tracing various kernel sys_stat() functions using dynamic
+tracing, and will need updating to match any changes to these functions.
+
+This makes use of a Linux 4.5 feature (bpf_perf_event_output());
+for kernels older than 4.5, see the version under tools/old,
+which uses an older mechanism.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-t
+Include a timestamp column: in seconds since the first event, with decimal
+places.
+.TP
+\-x
+Only print failed stats.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.SH EXAMPLES
+.TP
+Trace all stat() syscalls:
+#
+.B statsnoop
+.TP
+Trace all stat() syscalls, and include timestamps:
+#
+.B statsnoop \-t
+.TP
+Trace only stat() syscalls that failed:
+#
+.B statsnoop \-x
+.TP
+Trace PID 181 only:
+#
+.B statsnoop \-p 181
+.SH FIELDS
+.TP
+TIME(s)
+Time of the call, in seconds.
+.TP
+PID
+Process ID
+.TP
+COMM
+Process name
+.TP
+FD
+File descriptor (if success), or -1 (if failed)
+.TP
+ERR
+Error number (see the system's errno.h)
+.TP
+PATH
+Open path
+.SH OVERHEAD
+This traces the kernel stat function and prints output for each event. As the
+rate of this is generally expected to be low (< 1000/s), the overhead is also
+expected to be negligible. If you have an application that is calling a high
+rate of stat()s, then test and understand overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+opensnoop(8)
diff --git a/man/man8/syncsnoop.8 b/man/man8/syncsnoop.8
new file mode 100644
index 0000000..3d4c8c4
--- /dev/null
+++ b/man/man8/syncsnoop.8
@@ -0,0 +1,53 @@
+.TH syncsnoop 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+syncsnoop \- Trace sync() syscall. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B syncsnoop
+.SH DESCRIPTION
+syncsnoop traces calls to sync(), which flushes file system buffers to
+storage devices. These calls can cause performance perturbations, and it can
+be useful to know if they are happening and how frequently.
+
+This works by tracing the kernel sys_sync() function using dynamic tracing, and
+will need updating to match any changes to this function.
+
+This makes use of a Linux 4.5 feature (bpf_perf_event_output());
+for kernels older than 4.5, see the version under tools/old,
+which uses an older mechanism.
+
+This program is also a basic example of eBPF/bcc.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Trace calls to sync():
+#
+.B syncsnoop
+.SH FIELDS
+.TP
+TIME(s)
+Time of the call, in seconds.
+.TP
+CALL
+Call traced.
+.SH OVERHEAD
+This traces the kernel sync function and prints output for each event. As the
+rate of this is generally expected to be low (<< 100/s), the overhead is also
+expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+iostat(1)
diff --git a/man/man8/syscount.8 b/man/man8/syscount.8
new file mode 100644
index 0000000..d13793b
--- /dev/null
+++ b/man/man8/syscount.8
@@ -0,0 +1,110 @@
+.TH syscount 8  "2017-02-15" "USER COMMANDS"
+.SH NAME
+syscount \- Summarize syscall counts and latencies.
+.SH SYNOPSIS
+.B syscount [-h] [-p PID] [-i INTERVAL] [-d DURATION] [-T TOP] [-x] [-e ERRNO] [-L] [-m] [-P] [-l]
+.SH DESCRIPTION
+This tool traces syscall entry and exit tracepoints and summarizes either the
+number of syscalls of each type, or the number of syscalls per process. It can
+also collect latency (invocation time) for each syscall or each process.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc. Linux 4.7+ is required to attach a BPF program to the
+raw_syscalls:sys_{enter,exit} tracepoints, used by this tool.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace only this process.
+.TP
+\-i INTERVAL
+Print the summary at the specified interval (in seconds).
+.TP
+\-d DURATION
+Total duration of trace (in seconds).
+.TP
+\-T TOP
+Print only this many entries. Default: 10.
+.TP
+\-x
+Trace only failed syscalls (i.e., the return value from the syscall was < 0).
+.TP
+\-e ERRNO
+Trace only syscalls that failed with that error (e.g. -e EPERM or -e 1).
+.TP
+\-m
+Display times in milliseconds. Default: microseconds.
+.TP
+\-P
+Summarize by process and not by syscall.
+.TP
+\-l
+List the syscalls recognized by the tool (hard-coded list). Syscalls beyond this
+list will still be displayed, as "[unknown: nnn]" where nnn is the syscall
+number.
+.SH EXAMPLES
+.TP
+Summarize all syscalls by syscall:
+#
+.B syscount
+.TP
+Summarize all syscalls by process:
+#
+.B syscount \-P
+.TP
+Summarize only failed syscalls:
+#
+.B syscount \-x
+.TP
+Summarize only syscalls that failed with EPERM:
+#
+.B syscount \-e EPERM
+.TP
+Trace PID 181 only:
+#
+.B syscount \-p 181
+.TP
+Summarize syscall counts and latencies:
+#
+.B syscount \-L
+.SH FIELDS
+.TP
+PID
+Process ID
+.TP
+COMM
+Process name
+.TP
+SYSCALL
+Syscall name, or "[unknown: nnn]" for syscalls that aren't recognized
+.TP
+COUNT
+The number of events
+.TP
+TIME
+The total elapsed time (in us or ms)
+.SH OVERHEAD
+For most applications, the overhead should be manageable if they perform 1000's
+or even 10,000's of syscalls per second. For higher rates, the overhead may
+become considerable. For example, tracing a loop of 4 million calls to geteuid(),
+slows it down by 1.85x when tracing only syscall counts, and slows it down by
+more than 5x when tracing syscall counts and latencies. However, this represents
+a rate of >3.5 million syscalls per second, which should not be typical.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+funccount(8), ucalls(8), argdist(8), trace(8), funclatency(8)
diff --git a/man/man8/tclcalls.8 b/man/man8/tclcalls.8
new file mode 120000
index 0000000..a9a6054
--- /dev/null
+++ b/man/man8/tclcalls.8
@@ -0,0 +1 @@
+ucalls.8
\ No newline at end of file
diff --git a/man/man8/tclflow.8 b/man/man8/tclflow.8
new file mode 120000
index 0000000..84ccb94
--- /dev/null
+++ b/man/man8/tclflow.8
@@ -0,0 +1 @@
+uflow.8
\ No newline at end of file
diff --git a/man/man8/tclobjnew.8 b/man/man8/tclobjnew.8
new file mode 120000
index 0000000..b384265
--- /dev/null
+++ b/man/man8/tclobjnew.8
@@ -0,0 +1 @@
+uobjnew.8
\ No newline at end of file
diff --git a/man/man8/tclstat.8 b/man/man8/tclstat.8
new file mode 120000
index 0000000..e3a3a29
--- /dev/null
+++ b/man/man8/tclstat.8
@@ -0,0 +1 @@
+ustat.8
\ No newline at end of file
diff --git a/man/man8/tcpaccept.8 b/man/man8/tcpaccept.8
new file mode 100644
index 0000000..837717b
--- /dev/null
+++ b/man/man8/tcpaccept.8
@@ -0,0 +1,86 @@
+.TH tcpaccept 8  "2015-08-25" "USER COMMANDS"
+.SH NAME
+tcpaccept \- Trace TCP passive connections (accept()). Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcpaccept [\-h] [\-t] [\-x] [\-p PID]
+.SH DESCRIPTION
+This tool traces passive TCP connections (eg, via an accept() syscall;
+connect() are active connections). This can be useful for general
+troubleshooting to see what new connections the local server is accepting.
+
+This uses dynamic tracing of the kernel inet_csk_accept() socket function (from
+tcp_prot.accept), and will need to be modified to match kernel changes.
+
+This tool only traces successful TCP accept()s. Connection attempts to closed
+ports will not be shown (those can be traced via other functions).
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-t
+Include a timestamp column.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.SH EXAMPLES
+.TP
+Trace all passive TCP connections (accept()s):
+#
+.B tcpaccept
+.TP
+Trace all TCP accepts, and include timestamps:
+#
+.B tcpaccept \-t
+.TP
+Trace PID 181 only:
+#
+.B tcpaccept \-p 181
+.SH FIELDS
+.TP
+TIME(s)
+Time of the event, in seconds.
+.TP
+PID
+Process ID
+.TP
+COMM
+Process name
+.TP
+IP
+IP address family (4 or 6)
+.TP
+RADDR
+Remote IP address.
+.TP
+LADDR
+Local IP address.
+.TP
+LPORT
+Local port
+.SH OVERHEAD
+This traces the kernel inet_csk_accept function and prints output for each event.
+The rate of this depends on your server application. If it is a web or proxy server
+accepting many tens of thousands of connections per second, then the overhead
+of this tool may be measurable (although, still a lot better than tracing
+every packet). If it is less than a thousand a second, then the overhead is
+expected to be negligible. Test and understand this overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+tcpconnect(8), funccount(8), tcpdump(8)
diff --git a/man/man8/tcpconnect.8 b/man/man8/tcpconnect.8
new file mode 100644
index 0000000..eb1f4ad
--- /dev/null
+++ b/man/man8/tcpconnect.8
@@ -0,0 +1,92 @@
+.TH tcpconnect 8  "2015-08-25" "USER COMMANDS"
+.SH NAME
+tcpconnect \- Trace TCP active connections (connect()). Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcpconnect [\-h] [\-t] [\-x] [\-p PID] [-P PORT]
+.SH DESCRIPTION
+This tool traces active TCP connections (eg, via a connect() syscall;
+accept() are passive connections). This can be useful for general
+troubleshooting to see what connections are initiated by the local server.
+
+All connection attempts are traced, even if they ultimately fail.
+
+This works by tracing the kernel tcp_v4_connect() and tcp_v6_connect() functions
+using dynamic tracing, and will need updating to match any changes to these
+functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-t
+Include a timestamp column.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-P PORT
+Comma-separated list of destination ports to trace (filtered in-kernel).
+.SH EXAMPLES
+.TP
+Trace all active TCP connections:
+#
+.B tcpconnect
+.TP
+Trace all TCP connects, and include timestamps:
+#
+.B tcpconnect \-t
+.TP
+Trace PID 181 only:
+#
+.B tcpconnect \-p 181
+.TP
+Trace ports 80 and 81 only:
+#
+.B tcpconnect \-P 80,81
+.SH FIELDS
+.TP
+TIME(s)
+Time of the call, in seconds.
+.TP
+PID
+Process ID
+.TP
+COMM
+Process name
+.TP
+IP
+IP address family (4 or 6)
+.TP
+SADDR
+Source IP address.
+.TP
+DADDR
+Destination IP address.
+.TP
+DPORT
+Destination port
+.SH OVERHEAD
+This traces the kernel tcp_v[46]_connect functions and prints output for each
+event. As the rate of this is generally expected to be low (< 1000/s), the
+overhead is also expected to be negligible. If you have an application that
+is calling a high rate of connect()s, such as a proxy server, then test and
+understand this overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+tcpaccept(8), funccount(8), tcpdump(8)
diff --git a/man/man8/tcpconnlat.8 b/man/man8/tcpconnlat.8
new file mode 100644
index 0000000..996c21b
--- /dev/null
+++ b/man/man8/tcpconnlat.8
@@ -0,0 +1,109 @@
+.TH tcpconnlat 8  "2016-02-19" "USER COMMANDS"
+.SH NAME
+tcpconnlat \- Trace TCP active connection latency. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcpconnlat [\-h] [\-t] [\-p PID] [-v] [min_ms]
+.SH DESCRIPTION
+This tool traces active TCP connections
+(eg, via a connect() syscall), and shows the latency (time) for the connection
+as measured locally: the time from SYN sent to the response packet.
+This is a useful performance metric that typically spans kernel TCP/IP
+processing and the network round trip time (not application runtime).
+
+All connection attempts are traced, even if they ultimately fail (RST packet
+in response).
+
+This tool works by use of kernel dynamic tracing of TCP/IP functions, and will
+need updating to match any changes to these functions. This tool should be
+updated in the future to use static tracepoints, once they are available.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-t
+Include a timestamp column.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+min_ms
+Minimum duration to trace, in milliseconds.
+.SH EXAMPLES
+.TP
+Trace all active TCP connections, and show connection latency (SYN->response round trip):
+#
+.B tcpconnlat
+.TP
+Include timestamps:
+#
+.B tcpconnlat \-t
+.TP
+Trace PID 181 only:
+#
+.B tcpconnlat \-p 181
+.TP
+Trace connects with latency longer than 10 ms:
+#
+.B tcpconnlat 10
+.TP
+Print the BPF program:
+#
+.B tcpconnlat \-v
+.SH FIELDS
+.TP
+TIME(s)
+Time of the response packet, in seconds.
+.TP
+PID
+Process ID that initiated the connection.
+.TP
+COMM
+Process name that initiated the connection.
+.TP
+IP
+IP address family (4 or 6).
+.TP
+SADDR
+Source IP address.
+.TP
+DADDR
+Destination IP address.
+.TP
+DPORT
+Destination port
+.TP
+LAT(ms)
+The time from when a TCP connect was issued (measured in-kernel) to when a
+response packet was received for this connection (can be SYN,ACK, or RST, etc).
+This time spans kernel to kernel latency, involving kernel TCP/IP processing
+and the network round trip in between. This typically does not include
+time spent by the application processing the new connection.
+.SH OVERHEAD
+This traces the kernel tcp_v[46]_connect functions and prints output for each
+event. As the rate of this is generally expected to be low (< 1000/s), the
+overhead is also expected to be negligible. If you have an application that
+is calling a high rate of connect()s, such as a proxy server, then test and
+understand this overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+tcpconnect(8), tcpaccept(8), funccount(8), tcpdump(8)
diff --git a/man/man8/tcpdrop.8 b/man/man8/tcpdrop.8
new file mode 100644
index 0000000..a21e885
--- /dev/null
+++ b/man/man8/tcpdrop.8
@@ -0,0 +1,73 @@
+.TH tcpdrop 8  "2018-05-30" "USER COMMANDS"
+.SH NAME
+tcpdrop \- Trace kernel-based TCP packet drops with details. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcpdrop [\-h] [\-T] [\-t] [\-w] [\-s] [\-p PID] [\-D PORTS] [\-L PORTS]
+.SH DESCRIPTION
+This tool traces TCP packets or segments that were dropped by the kernel, and
+shows details from the IP and TCP headers, the socket state, and the
+kernel stack trace. This is useful for debugging cases of high kernel drops,
+which can cause timer-based retransmits and performance issues.
+
+This tool works using dynamic tracing of the tcp_drop() kernel function,
+which requires a recent kernel version.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.B tcpdrop
+.SH FIELDS
+.TP
+TIME
+Time of the drop, in HH:MM:SS format.
+.TP
+PID
+Process ID that was on-CPU during the drop. This may be unrelated, as drops
+can occur on the receive interrupt and be unrelated to the PID that was
+interrupted.
+.TP
+IP
+IP address family (4 or 6)
+.TP
+SADDR
+Source IP address.
+.TP
+SPORT
+Source TCP port.
+.TP
+DADDR
+Destination IP address.
+.TP
+DPORT
+Destination TCP port.
+.TP
+STATE
+TCP session state ("ESTABLISHED", etc).
+.TP
+FLAGS
+TCP flags ("SYN", etc).
+.SH OVERHEAD
+This traces the kernel tcp_drop() function, which should be low frequency,
+and therefore the overhead of this tool should be negligible.
+
+As always, test and understand this tool's overhead for your types of
+workloads before production use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+tcplife(8), tcpaccept(8), tcpconnect(8), tcptop(8)
diff --git a/man/man8/tcplife.8 b/man/man8/tcplife.8
new file mode 100644
index 0000000..f6b8991
--- /dev/null
+++ b/man/man8/tcplife.8
@@ -0,0 +1,130 @@
+.TH tcplife 8  "2016-10-19" "USER COMMANDS"
+.SH NAME
+tcplife \- Trace TCP sessions and summarize lifespan. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcplife [\-h] [\-T] [\-t] [\-w] [\-s] [\-p PID] [\-D PORTS] [\-L PORTS]
+.SH DESCRIPTION
+This tool traces TCP sessions that open and close while tracing, and prints
+a line of output to summarize each one. This includes the IP addresses, ports,
+duration, and throughput for the session. This is useful for workload
+characterisation and flow accounting: identifying what connections are
+happening, with the bytes transferred.
+
+This tool works using the sock:inet_sock_set_state tracepoint if it exists,
+added to Linux 4.16, and switches to using kernel dynamic tracing for older
+kernels. Only TCP state changes are traced, so it is expected that the
+overhead of this tool is much lower than typical send/receive tracing.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-s
+Comma separated values output (parseable).
+.TP
+\-t
+Include a timestamp column (seconds).
+.TP
+\-T
+Include a time column (HH:MM:SS).
+.TP
+\-w
+Wide column output (fits IPv6 addresses).
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-L PORTS
+Comma-separated list of local ports to trace (filtered in-kernel).
+.TP
+\-D PORTS
+Comma-separated list of destination ports to trace (filtered in-kernel).
+.SH EXAMPLES
+.TP
+Trace all TCP sessions, and summarize lifespan and throughput:
+#
+.B tcplife
+.TP
+Include a timestamp column, and wide column output:
+#
+.B tcplife \-tw
+.TP
+Trace PID 181 only:
+#
+.B tcplife \-p 181
+.TP
+Trace connections to local ports 80 and 81 only:
+#
+.B tcplife \-L 80,81
+.TP
+Trace connections to remote port 80 only:
+#
+.B tcplife \-D 80
+.SH FIELDS
+.TP
+TIME
+Time of the call, in HH:MM:SS format.
+.TP
+TIME(s)
+Time of the call, in seconds.
+.TP
+PID
+Process ID
+.TP
+COMM
+Process name
+.TP
+IP
+IP address family (4 or 6)
+.TP
+LADDR
+Local IP address.
+.TP
+DADDR
+Remote IP address.
+.TP
+LPORT
+Local port.
+.TP
+DPORT
+Destination port.
+.TP
+TX_KB
+Total transmitted Kbytes.
+.TP
+RX_KB
+Total received Kbytes.
+.TP
+MS
+Lifespan of the session, in milliseconds.
+.SH OVERHEAD
+This traces the kernel TCP set state function, which should be called much
+less often than send/receive tracing, and therefore have lower overhead. The
+overhead of the tool is relative to the rate of new TCP sessions: if this is
+high, over 10,000 per second, then there may be noticeable overhead just to
+print out 10k lines of formatted output per second.
+
+You can find out the rate of new TCP sessions using "sar \-n TCP 1", and
+adding the active/s and passive/s columns.
+
+As always, test and understand this tool's overhead for your types of
+workloads before production use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+tcpaccept(8), tcpconnect(8), tcptop(8)
diff --git a/man/man8/tcpretrans.8 b/man/man8/tcpretrans.8
new file mode 100644
index 0000000..e4f6fbf
--- /dev/null
+++ b/man/man8/tcpretrans.8
@@ -0,0 +1,91 @@
+.TH tcpretrans 8  "2016-02-14" "USER COMMANDS"
+.SH NAME
+tcpretrans \- Trace or count TCP retransmits and TLPs. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcpretrans [\-h] [\-l] [\-c]
+.SH DESCRIPTION
+This traces TCP retransmits, showing address, port, and TCP state information,
+and sometimes the PID (although usually not, since retransmits are usually
+sent by the kernel on timeouts). To keep overhead very low, only
+the TCP retransmit functions are traced. This does not trace every packet
+(like tcpdump(8) or a packet sniffer). Optionally, it can count retransmits
+over a user signalled interval to spot potentially dropping network paths the
+flows are traversing. 
+
+This uses dynamic tracing of the kernel tcp_retransmit_skb() and
+tcp_send_loss_probe() functions, and will need to be updated to
+match kernel changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-l
+Include tail loss probe attempts (in some cases the kernel may not
+complete the TLP send).
+.TP
+\-c
+Count occurring retransmits per flow. 
+.SH EXAMPLES
+.TP
+Trace TCP retransmits:
+#
+.B tcpretrans
+.TP
+Trace TCP retransmits and TLP attempts:
+#
+.B tcpretrans \-l
+.SH FIELDS
+.TP
+TIME
+Time of the retransmit.
+.TP
+PID
+Process ID that was on-CPU. This is less useful than it might sound, as it
+may usually be 0, for the kernel, for timer-based retransmits.
+.TP
+IP
+IP address family (4 or 6).
+.TP
+LADDR
+Local IP address.
+.TP
+LPORT
+Local port.
+.TP
+T>
+Type of event: R> == retransmit, L> == tail loss probe.
+.TP
+RADDR
+Remote IP address.
+.TP
+RPORT
+Remote port.
+.TP
+STATE
+TCP session state.
+.TP
+RETRANSMITS
+Number of retransmits that have occurred for this flow since tracing started.
+.SH OVERHEAD
+Should be negligible: TCP retransmit events should be low (<1000/s), and the
+low overhead this tool adds to each event should make the cost negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+tcpconnect(8), tcpaccept(8)
diff --git a/man/man8/tcpstates.8 b/man/man8/tcpstates.8
new file mode 100644
index 0000000..b31fd64
--- /dev/null
+++ b/man/man8/tcpstates.8
@@ -0,0 +1,128 @@
+.TH tcpstates 8  "2018-03-20" "USER COMMANDS"
+.SH NAME
+tcpstates \- Trace TCP session state changes with durations. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcpstates [\-h] [\-T] [\-t] [\-w] [\-s] [\-D PORTS] [\-L PORTS]
+.SH DESCRIPTION
+This tool traces TCP session state changes while tracing, and prints details
+including the duration in each state. This can help explain the latency of
+TCP connections: whether the time is spent in the ESTABLISHED state (data
+transfer), or initialization state (SYN_SENT), etc.
+
+This tool works using the sock:inet_sock_set_state tracepoint, which was
+added to Linux 4.16. Linux 4.16 also included extra state transitions so that
+all TCP transitions could be observed by this tracepoint.
+
+Only TCP state changes are traced, so it is expected that the
+overhead of this tool is much lower than typical send/receive tracing.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc, and the sock:inet_sock_set_state tracepoint.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-s
+Comma separated values output (parseable).
+.TP
+\-t
+Include a timestamp column (seconds).
+.TP
+\-T
+Include a time column (HH:MM:SS).
+.TP
+\-w
+Wide column output (fits IPv6 addresses).
+.TP
+\-L PORTS
+Comma-separated list of local ports to trace (filtered in-kernel).
+.TP
+\-D PORTS
+Comma-separated list of destination ports to trace (filtered in-kernel).
+.SH EXAMPLES
+.TP
+Trace all TCP sessions, and show all state changes:
+#
+.B tcpstates
+.TP
+Include a timestamp column, and wide column output:
+#
+.B tcpstates \-tw
+.TP
+Trace connections to local ports 80 and 81 only:
+#
+.B tcpstates \-L 80,81
+.TP
+Trace connections to remote port 80 only:
+#
+.B tcpstates \-D 80
+.SH FIELDS
+.TP
+TIME
+Time of the change, in HH:MM:SS format.
+.TP
+TIME(s)
+Time of the change, in seconds.
+.TP
+C-PID
+The current on-CPU process ID. This may show the process that owns the TCP
+session if the state change executes in synchronous process context, else it
+is likely to show the kernel (asynchronous state change).
+.TP
+C-COMM
+The current on-CPU process name. This may show the process that owns the TCP
+session if the state change executes in synchronous process context, else it
+is likely to show the kernel (asynchronous state change).
+.TP
+IP
+IP address family (4 or 6)
+.TP
+LADDR
+Local IP address.
+.TP
+DADDR
+Remote IP address.
+.TP
+LPORT
+Local port.
+.TP
+DPORT
+Destination port.
+.TP
+OLDSTATE
+Previous TCP state.
+.TP
+NEWSTATE
+New TCP state.
+.TP
+MS
+Duration of this state.
+.SH OVERHEAD
+This traces the kernel TCP set state function, which should be called much
+less often than send/receive tracing, and therefore have lower overhead. The
+overhead of the tool is relative to the rate of new TCP sessions: if this is
+high, over 10,000 per second, then there may be noticeable overhead just to
+print out 10k lines of formatted output per second.
+
+You can find out the rate of new TCP sessions using "sar \-n TCP 1", and
+adding the active/s and passive/s columns.
+
+As always, test and understand this tool's overhead for your types of
+workloads before production use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+tcpaccept(8), tcpconnect(8), tcptop(8), tcplife(8)
diff --git a/man/man8/tcpsubnet.8 b/man/man8/tcpsubnet.8
new file mode 100644
index 0000000..525b808
--- /dev/null
+++ b/man/man8/tcpsubnet.8
@@ -0,0 +1,100 @@
+.TH tcpsubnet 8  "2018-03-01" "USER COMMANDS"
+.SH NAME
+tcpsubnet \- Summarize and aggregate IPv4 TCP traffic by subnet.
+.SH SYNOPSIS
+.B tcpsubnet [\-h] [\-v] [\--ebpf] [\-J] [\-f FORMAT] [\-i INTERVAL] [subnets]
+.SH DESCRIPTION
+This tool summarizes and aggregates IPv4 TCP sent to the subnets
+passed in argument and prints to stdout on a fixed interval.
+
+This uses dynamic tracing of kernel TCP send/receive functions, and will
+need to be updated to match kernel changes.
+
+The traced data is summarized in-kernel using a BPF map to reduce overhead.
+At very high TCP event rates, the overhead may still be measurable.
+See the OVERHEAD section for more details.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-v
+Run in verbose mode. Will output subnet evaluation and the BPF program
+.TP
+\-J
+Format output in JSON.
+.TP
+\-i
+Interval between updates, seconds (default 1).
+.TP
+\-f
+Format output units. Supported values are bkmBKM. When using
+kmKM the output will be rounded to floor.
+.TP
+\--ebpf
+Prints the BPF program.
+.TP
+subnets
+Comma separated list of subnets. Traffic will be categorized
+in these subnets. Order matters.
+(default 127.0.0.1/32,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16,0.0.0.0/0)
+.SH EXAMPLES
+.TP
+Summarize TCP traffic by the default subnets:
+#
+.B tcpsubnet
+.TP
+Summarize all TCP traffic:
+#
+.B tcpsubnet 0.0.0.0/0
+.TP
+Summarize all TCP traffic and output in JSON and Kb:
+#
+.B tcpsubnet -J -fk 0.0.0.0/0
+.SH FIELDS
+.TP
+(Standard output) Left hand side column:
+Subnet
+.TP
+(Standard output) Right hand side column:
+Aggregate traffic in units passed as argument
+.TP
+(JSON output) date
+Current date formatted in the system locale
+.TP
+(JSON output) time
+Current time formatted in the system locale
+.TP
+(JSON output) entries
+Map of subnets to aggregates. Values will be in format passed to -f
+.SH OVERHEAD
+This traces all tcp_sendmsg function calls in the TCP/IP stack.
+It summarizes data in-kernel to reduce overhead.
+A simple iperf test (v2.0.5) with the default values shows a loss
+of ~5% throughput. On 10 runs without tcpsubnet running the average
+throughput was 32.42Gb/s, with tcpsubnet enabled it was 31.26Gb/s.
+This is not meant to be used as a long running service. Use it
+for troubleshooting or for a controlled interval. As always,
+try it out in a test environment first.
+
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Rodrigo Manyari
+.SH INSPIRATION
+tcptop(8) by Brendan Gregg
+.SH SEE ALSO
+netlink(7)
diff --git a/man/man8/tcptop.8 b/man/man8/tcptop.8
new file mode 100644
index 0000000..672e8ed
--- /dev/null
+++ b/man/man8/tcptop.8
@@ -0,0 +1,112 @@
+.TH tcptop 8  "2016-09-13" "USER COMMANDS"
+.SH NAME
+tcptop \- Summarize TCP send/recv throughput by host. Top for TCP.
+.SH SYNOPSIS
+.B tcptop [\-h] [\-C] [\-S] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This is top for TCP sessions.
+
+This summarizes TCP send/receive Kbytes by host, and prints a summary that
+refreshes, along other system-wide metrics.
+
+This uses dynamic tracing of kernel TCP send/receive functions, and will
+need to be updated to match kernel changes.
+
+The traced TCP functions are usually called at a lower rate than
+per-packet functions, and therefore have lower overhead. The traced data is
+summarized in-kernel using a BPF map to further reduce overhead. At very high
+TCP event rates, the overhead may still be measurable. See the OVERHEAD
+section for more details.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-C
+Don't clear the screen.
+.TP
+\-S
+Don't print the system summary line (load averages).
+.TP
+\-p PID
+Trace this PID only.
+.TP
+interval
+Interval between updates, seconds (default 1).
+.TP
+count
+Number of interval summaries (default is many).
+.SH EXAMPLES
+.TP
+Summarize TCP throughput by active sessions, 1 second refresh:
+#
+.B tcptop
+.TP
+Don't clear the screen (rolling output), and 5 second summaries:
+#
+.B tcptop \-C 5
+.TP
+Trace PID 181 only, and don't clear the screen:
+#
+.B tcptop \-Cp 181
+.SH FIELDS
+.TP
+loadavg:
+The contents of /proc/loadavg
+.TP
+PID
+Process ID.
+.TP
+COMM
+Process name.
+.TP
+LADDR
+Local address (IPv4), and TCP port
+.TP
+RADDR
+Remote address (IPv4), and TCP port
+.TP
+LADDR6
+Source address (IPv6), and TCP port
+.TP
+RADDR6
+Destination address (IPv6), and TCP port
+.TP
+RX_KB
+Received Kbytes
+.TP
+TX_KB
+Transmitted Kbytes
+.SH OVERHEAD
+This traces all send/receives in TCP, high in the TCP/IP stack (close to the
+application) which are usually called at a lower rate than per-packet
+functions, lowering overhead. It also summarizes data in-kernel to further
+reduce overhead. These techniques help, but there may still be measurable
+overhead at high send/receive rates, eg, ~13% of one CPU at 100k events/sec.
+Use funccount to count the kprobes in the tool to find out this rate, as the
+overhead is relative to the rate. Some sample production servers tested found
+total TCP event rates of 4k to 15k per second, and the CPU overhead at these
+rates ranged from 0.5% to 2.0% of one CPU. If your send/receive rate is low
+(eg, <1000/sec) then the overhead is expected to be negligible. Test in a lab
+environment first.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH INSPIRATION
+top(1) by William LeFebvre
+.SH SEE ALSO
+tcpconnect(8), tcpaccept(8)
diff --git a/man/man8/tcptracer.8 b/man/man8/tcptracer.8
new file mode 100644
index 0000000..b5b3061
--- /dev/null
+++ b/man/man8/tcptracer.8
@@ -0,0 +1,98 @@
+.TH tcptracer 8  "2017-03-27" "USER COMMANDS"
+.SH NAME
+tcptracer \- Trace TCP established connections. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcptracer [\-h] [\-v] [\-p PID] [\-N NETNS]
+.SH DESCRIPTION
+This tool traces established TCP connections that open and close while tracing,
+and prints a line of output per connect, accept and close events. This includes
+the type of event, PID, IP addresses and ports.
+
+This tool works by using kernel dynamic tracing, and will need to be updated if
+the kernel implementation changes. Only established TCP connections are traced,
+so it is expected that the overhead of this tool is rather low.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-v
+Print full lines, with long event type names and network namespace numbers.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-N NETNS
+Trace this network namespace only (filtered in-kernel).
+.TP
+.SH EXAMPLES
+.TP
+Trace all TCP established connections:
+#
+.B tcptracer
+.TP
+Trace all TCP established connections with verbose lines:
+#
+.B tcptracer \-v
+.TP
+Trace PID 181 only:
+#
+.B tcptracer \-p 181
+.TP
+Trace connections in network namespace 4026531969 only:
+#
+.B tcptracer \-N 4026531969
+.SH FIELDS
+.TP
+TYPE
+Type of event. In non-verbose mode: C for connect, A for accept, X for close.
+.TP
+PID
+Process ID
+.TP
+COMM
+Process name
+.TP
+IP
+IP address family (4 or 6)
+.TP
+SADDR
+Source IP address.
+.TP
+DADDR
+Destination IP address.
+.TP
+SPORT
+Source port.
+.TP
+DPORT
+Destination port.
+.TP
+NETNS
+Network namespace where the event originated.
+.SH OVERHEAD
+This traces the kernel inet accept function, and the TCP connect, close,
+and set state functions. However, it only prints information for connections
+that are established, so it shouldn't have a huge overhead.
+
+As always, test and understand this tool's overhead for your types of workloads
+before production use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Iago López Galeiras
+.SH SEE ALSO
+tcpaccept(8), tcpconnect(8), tcptop(8), tcplife(8)
diff --git a/man/man8/tplist.8 b/man/man8/tplist.8
new file mode 100644
index 0000000..da5edf3
--- /dev/null
+++ b/man/man8/tplist.8
@@ -0,0 +1,62 @@
+.TH tplist 8  "2016-03-20" "USER COMMANDS"
+.SH NAME
+tplist \- Display kernel tracepoints or USDT probes and their formats.
+.SH SYNOPSIS
+.B tplist [-p PID] [-l LIB] [-v] [filter]
+.SH DESCRIPTION
+tplist lists all kernel tracepoints, and can optionally print out the tracepoint
+format; namely, the variables that you can trace when the tracepoint is hit. 
+tplist can also list USDT probes embedded in a specific library or executable,
+and can list USDT probes for all the libraries loaded by a specific process.
+These features are usually used in conjunction with the argdist and/or trace tools.
+
+On a typical system, accessing the tracepoint list and format requires root.
+However, accessing USDT probes does not require root.
+.SH OPTIONS
+.TP
+\-p PID
+Display the USDT probes from all the libraries loaded by the specified process.
+.TP
+\-l LIB
+Display the USDT probes from the specified library or executable. If the library
+or executable can be found in the standard paths, a full path is not required.
+.TP
+\-v
+Increase the verbosity level. Can be used to display the variables, locations,
+and arguments of tracepoints and USDT probes.
+.TP
+[filter]
+A wildcard expression that specifies which tracepoints or probes to print.
+For example, block:* will print all block tracepoints (block:block_rq_complete,
+etc.). Regular expressions are not supported.
+.SH EXAMPLES
+.TP
+Print all kernel tracepoints:
+#
+.B tplist
+.TP
+Print all net tracepoints with their format:
+#
+.B tplist -v 'net:*'
+.TP
+Print all USDT probes in libpthread:
+$ 
+.B tplist -l pthread
+.TP
+Print all USDT probes in process 4717 from the libc provider:
+$
+.B tplist -p 4717 'libc:*'
+.TP
+Print all the USDT probes in the node executable:
+$
+.B tplist -l node
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
diff --git a/man/man8/trace.8 b/man/man8/trace.8
new file mode 100644
index 0000000..c12dd79
--- /dev/null
+++ b/man/man8/trace.8
@@ -0,0 +1,192 @@
+.TH trace 8  "2016-02-18" "USER COMMANDS"
+.SH NAME
+trace \- Trace a function and print its arguments or return value, optionally evaluating a filter. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S]
+         [-M MAX_EVENTS] [-t] [-T] [-C] [-K] [-U] [-a] [-I header]
+         probe [probe ...]
+.SH DESCRIPTION
+trace probes functions you specify and displays trace messages if a particular
+condition is met. You can control the message format to display function 
+arguments and return values. 
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-p PID
+Trace only functions in the process PID.
+.TP
+\-L TID
+Trace only functions in the thread TID.
+.TP
+\-v
+Display the generated BPF program, for debugging purposes.
+.TP
+\-Z STRING_SIZE
+When collecting string arguments (of type char*), collect up to STRING_SIZE 
+characters. Longer strings will be truncated.
+.TP
+\-S
+If set, trace messages from trace's own process. By default, this is off to
+avoid tracing storms -- for example, if you trace the write system call, and
+consider that trace is writing to the standard output.
+.TP
+\-M MAX_EVENTS
+Print up to MAX_EVENTS trace messages and then exit.
+.TP
+\-t
+Print times relative to the beginning of the trace (offsets), in seconds.
+.TP
+\-T
+Print the time column.
+.TP
+\-C
+Print CPU id.
+.TP
+\-B
+Treat argument of STRCMP helper as a binary value
+.TP
+\-K
+Print the kernel stack for each event.
+.TP
+\-U
+Print the user stack for each event.
+.TP
+\-a
+Print virtual address in kernel and user stacks.
+.TP
+\-I header
+Additional header files to include in the BPF program. This is needed if your
+filter or print expressions use types or data structures that are not available
+in the standard headers. For example: 'linux/mm.h'
+.TP
+probe [probe ...]
+One or more probes that attach to functions, filter conditions, and print
+information. See PROBE SYNTAX below.
+.SH PROBE SYNTAX
+The general probe syntax is as follows:
+
+.B [{p,r}]:[library]:function[(signature)] [(predicate)] ["format string"[, arguments]]
+
+.B {t:category:event,u:library:probe} [(predicate)] ["format string"[, arguments]]
+.TP
+.B {[{p,r}],t,u}
+Probe type \- "p" for function entry, "r" for function return, "t" for kernel
+tracepoint, "u" for USDT probe. The default probe type is "p".
+.TP
+.B [library]
+Library containing the probe.
+Specify the full path to the .so or executable file where the function to probe
+resides. Alternatively, you can specify just the lib name: for example, "c"
+refers to libc. If no library name is specified, the kernel is assumed. Also,
+you can specify an executable name (without a full path) if it is in the PATH.
+For example, "bash".
+.TP
+.B category
+The tracepoint category. For example, "sched" or "irq".
+.TP
+.B function
+The function to probe.
+.TP
+.B signature
+The optional signature of the function to probe. This can make it easier to
+access the function's arguments, instead of using the "arg1", "arg2" etc.
+argument specifiers. For example, "(struct timespec *ts)" in the signature
+position lets you use "ts" in the filter or print expressions.
+.TP
+.B event
+The tracepoint event. For example, "block_rq_complete".
+.TP
+.B probe
+The USDT probe name. For example, "pthread_create".
+.TP
+.B [(predicate)]
+The filter applied to the captured data. Only if the filter evaluates as true,
+the trace message will be printed. The filter can use any valid C expression
+that refers to the argument values: arg1, arg2, etc., or to the return value
+retval in a return probe. If necessary, use C cast operators to coerce the
+arguments to the desired type. For example, if arg1 is of type int, use the
+expression ((int)arg1 < 0) to trace only invocations where arg1 is negative.
+Note that only arg1-arg6 are supported, and only if the function is using the
+standard x86_64 convention where the first six arguments are in the RDI, RSI, 
+RDX, RCX, R8, R9 registers. If no predicate is specified, all function 
+invocations are traced.
+
+The predicate expression may also use the STRCMP pseudo-function to compare
+a predefined string to a string argument. For example: STRCMP("test", arg1).
+The order of arguments is important: the first argument MUST be a quoted
+literal string, and the second argument can be a runtime string, most typically
+an argument. 
+.TP
+.B ["format string"[, arguments]]
+A printf-style format string that will be used for the trace message. You can
+use the following format specifiers: %s, %d, %u, %lld, %llu, %hd, %hu, %c,
+%x, %llx -- with the same semantics as printf's. Make sure to pass the exact
+number of arguments as there are placeholders in the format string. The
+format specifier replacements may be any C expressions, and may refer to the
+same special keywords as in the predicate (arg1, arg2, etc.).
+
+In addition to the above format specifiers, you can also use %K and %U when
+the expression is an address that potentially points to executable code (i.e.,
+a symbol). trace will resolve %K specifiers to a kernel symbol, such as
+vfs__read, and will resolve %U specifiers to a user-space symbol in that
+process, such as sprintf.
+
+In tracepoints, both the predicate and the arguments may refer to the tracepoint
+format structure, which is stored in the special "args" variable. For example, the
+block:block_rq_complete tracepoint can print or filter by args->nr_sector. To 
+discover the format of your tracepoint, use the tplist tool. 
+
+In USDT probes, the arg1, ..., argN variables refer to the probe's arguments.
+To determine which arguments your probe has, use the tplist tool.
+
+The predicate expression and the format specifier replacements for printing
+may also use the following special keywords: $pid, $tgid to refer to the 
+current process' pid and tgid; $uid, $gid to refer to the current user's
+uid and gid; $cpu to refer to the current processor number.
+.SH EXAMPLES
+.TP
+Trace all invocations of the open system call with the name of the file being opened:
+#
+.B trace '::do_sys_open """%s"", arg2'
+.TP
+Trace all invocations of the read system call where the number of bytes requested is greater than 20,000:
+#
+.B trace '::sys_read (arg3 > 20000) """read %d bytes"", arg3'
+.TP
+Trace all malloc calls and print the size of the requested allocation:
+#
+.B trace ':c:malloc """size = %d"", arg1'
+.TP
+Trace returns from the readline function in bash and print the return value as a string:
+#
+.B trace 'r:bash:readline """%s"", retval' 
+.TP
+Trace the block:block_rq_complete tracepoint and print the number of sectors completed:
+#
+.B trace 't:block:block_rq_complete """%d sectors"", args->nr_sector'
+.TP
+Trace the pthread_create USDT probe from the pthread library and print the address of the thread's start function:
+#
+.B trace 'u:pthread:pthread_create """start addr = %llx"", arg3'
+.TP
+Trace the nanosleep system call and print the sleep duration in nanoseconds:
+#
+.B trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
diff --git a/man/man8/ttysnoop.8 b/man/man8/ttysnoop.8
new file mode 100644
index 0000000..9f37aaa
--- /dev/null
+++ b/man/man8/ttysnoop.8
@@ -0,0 +1,60 @@
+.TH ttysnoop 8  "2016-02-08" "USER COMMANDS"
+.SH NAME
+ttysnoop \- Watch output from a tty or pts device. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B ttysnoop [\-h] [\-C] device
+.SH DESCRIPTION
+ttysnoop watches a tty or pts device, and prints the same output that is
+appearing on that device. It can be used to mirror the output from a shell
+session, or the system console.
+
+This works by use of kernel dynamic tracing of the tty_write() function.
+This tool will need updating in case that kernel function changes in a future
+kernel version.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-C
+Don't clear the screen.
+.TP
+device
+Either a path to a tty device (eg, /dev/tty0) or a pts number (eg, the "3"
+from /dev/pts/3).
+.SH EXAMPLES
+.TP
+Snoop output from /dev/pts/2
+#
+.B ttysnoop /dev/pts/2
+.TP
+Snoop output from /dev/pts/2 (shortcut)
+#
+.B ttysnoop 2
+.TP
+Snoop output from the system console
+#
+.B ttysnoop /dev/console
+.TP
+Snoop output from /dev/tty0
+#
+.B ttysnoop /dev/tty0
+.SH OVERHEAD
+As the rate of tty_write() is expected to be very low (<100/s), the overhead
+of this tool is expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+opensnoop(1)
diff --git a/man/man8/ucalls.8 b/man/man8/ucalls.8
new file mode 100644
index 0000000..dfc4b8b
--- /dev/null
+++ b/man/man8/ucalls.8
@@ -0,0 +1,98 @@
+.TH ucalls 8  "2018-10-09" "USER COMMANDS"
+.SH NAME
+ucalls, javacalls, perlcalls, phpcalls, pythoncalls, rubycalls, tclcalls \- Summarize method calls
+from high-level languages and Linux syscalls.
+.SH SYNOPSIS
+.B javacalls [-h] [-T TOP] [-L] [-S] [-v] [-m] pid [interval]
+.br
+.B perlcalls [-h] [-T TOP] [-L] [-S] [-v] [-m] pid [interval]
+.br
+.B phpcalls [-h] [-T TOP] [-L] [-S] [-v] [-m] pid [interval]
+.br
+.B pythoncalls [-h] [-T TOP] [-L] [-S] [-v] [-m] pid [interval]
+.br
+.B rubycalls [-h] [-T TOP] [-L] [-S] [-v] [-m] pid [interval]
+.br
+.B tclcalls [-h] [-T TOP] [-L] [-S] [-v] [-m] pid [interval]
+.br
+.B ucalls [-l {java,perl,php,python,ruby}] [-h] [-T TOP] [-L] [-S] [-v] [-m] pid [interval]
+.SH DESCRIPTION
+This tool summarizes method calls from high-level languages such as Java, Perl,
+PHP, Python, Ruby, and Tcl. It can also trace Linux system calls. Whenever a method
+is invoked, ucalls records the call count and optionally the method's execution
+time (latency) and displays a summary.
+
+This uses in-kernel eBPF maps to store per process summaries for efficiency.
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Java, Perl, PHP, Python, Ruby, and Tcl. It requires a runtime instrumented with these
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace". For Java, method probes are
+not enabled by default, and can be turned on by running the Java process with
+the "-XX:+ExtendedDTraceProbes" flag. For PHP processes, the environment
+variable USE_ZEND_DTRACE must be set to 1.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-l {java,perl,php,python,ruby,tcl}
+The language to trace. If not provided, only syscalls are traced (when the \-S
+option is used).
+.TP
+\-T TOP
+Print only the top methods by frequency or latency.
+.TP
+\-L
+Collect method invocation latency (duration).
+.TP
+\-S
+Collect Linux syscalls frequency and timing.
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+\-m
+Print times in milliseconds (the default is microseconds).
+.TP
+pid
+The process id to trace.
+.TP
+interval
+Print summary after this number of seconds and then exit. By default, wait for
+Ctrl+C to terminate.
+.SH EXAMPLES
+.TP
+Trace the top 10 Ruby method calls:
+#
+.B ucalls -T 10 -l ruby 1344
+.TP
+Trace Python method calls and Linux syscalls including latency in milliseconds:
+#
+.B ucalls -l python -mL 2020
+.TP
+Trace only syscalls and print a summary after 10 seconds:
+#
+.B ucalls -S 788 10
+.SH OVERHEAD
+Tracing individual method calls will produce a considerable overhead in all
+high-level languages. For languages with just-in-time compilation, such as
+Java, the overhead can be more considerable than for interpreted languages.
+On the other hand, syscall tracing will typically be tolerable for most
+processes, unless they have a very unusual rate of system calls.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+ustat(8), argdist(8)
diff --git a/man/man8/uflow.8 b/man/man8/uflow.8
new file mode 100644
index 0000000..1d0951c
--- /dev/null
+++ b/man/man8/uflow.8
@@ -0,0 +1,98 @@
+.TH uflow 8  "2018-10-09" "USER COMMANDS"
+.SH NAME
+uflow, javaflow, perlflow, phpflow, pythonflow, rubyflow, tclflow \- Print a flow graph of method
+calls in high-level languages.
+.SH SYNOPSIS
+.B javaflow [-h] [-M METHOD] [-C CLAZZ] [-v] pid
+.br
+.B perlflow [-h] [-M METHOD] [-C CLAZZ] [-v] pid
+.br
+.B phpflow [-h] [-M METHOD] [-C CLAZZ] [-v] pid
+.br
+.B pythonflow [-h] [-M METHOD] [-C CLAZZ] [-v] pid
+.br
+.B rubyflow [-h] [-M METHOD] [-C CLAZZ] [-v] pid
+.br
+.B tclflow [-h] [-M METHOD] [-C CLAZZ] [-v] pid
+.br
+.B uflow [-h] [-M METHOD] [-C CLAZZ] [-v] [-l {java,perl,php,python,ruby,tcl}] pid
+.SH DESCRIPTION
+uflow traces method calls and prints them in a flow graph that can facilitate
+debugging and diagnostics by following the program's execution (method flow).
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Java, Perl, PHP, Python, Ruby, and Tcl. It requires a runtime instrumented with these
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace". For Java processes, the
+startup flag "-XX:+ExtendedDTraceProbes" is required. For PHP processes, the
+environment variable USE_ZEND_DTRACE must be set to 1.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-M METHOD
+Print only method calls where the method name begins with this string.
+.TP
+\-C CLAZZ
+Print only method calls where the class name begins with this string. The class
+name interpretation strongly depends on the language. For example, in Java use
+"package/subpackage/ClassName" to refer to classes.
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+{java,perl,php,python,ruby,tcl}
+The language to trace.
+.TP
+pid
+The process id to trace.
+.SH EXAMPLES
+.TP
+Follow method flow in a Ruby process:
+#
+.B uflow -l ruby 148
+.TP
+Follow method flow in a Java process where the class name is java.lang.Thread:
+#
+.B uflow -C java/lang/Thread -l java 1802
+.SH FIELDS
+.TP
+CPU
+The CPU number on which the method was invoked. This is useful to easily see
+where the output skips to a different CPU.
+.TP
+PID
+The process id.
+.TP
+TID
+The thread id.
+.TP
+TIME
+The duration of the method call.
+.TP
+METHOD
+The method name.
+.SH OVERHEAD
+This tool has extremely high overhead because it prints every method call. For
+some scenarios, you might see lost samples in the output as the tool is unable
+to keep up with the rate of data coming from the kernel. Filtering by class 
+or method prefix can help reduce the amount of data printed, but there is still
+a very high overhead in the collection mechanism. Do not use for performance-
+sensitive production scenarios, and always test first.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+trace(8), ustat(8)
diff --git a/man/man8/ugc.8 b/man/man8/ugc.8
new file mode 100644
index 0000000..782ae63
--- /dev/null
+++ b/man/man8/ugc.8
@@ -0,0 +1,98 @@
+.TH ugc 8  "2018-10-09" "USER COMMANDS"
+.SH NAME
+ugc, javagc, nodegc, pythongc, rubygc \- Trace garbage collection events in
+high-level languages.
+.SH SYNOPSIS
+.B javagc [-h] [-v] [-m] [-M MINIMUM] [-F FILTER] pid
+.br
+.B nodegc [-h] [-v] [-m] [-M MINIMUM] [-F FILTER] pid
+.br
+.B pythongc [-h] [-v] [-m] [-M MINIMUM] [-F FILTER] pid
+.br
+.B rubygc [-h] [-v] [-m] [-M MINIMUM] [-F FILTER] pid
+.br
+.B ugc [-h] [-v] [-m] [-M MINIMUM] [-F FILTER] [-l {java,node,python,ruby}] pid
+.SH DESCRIPTION
+This traces garbage collection events as they occur, including their duration
+and any additional information (such as generation collected or type of GC)
+provided by the respective language's runtime.
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Java, Node, Python, and Ruby. It requires a runtime instrumented with these
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace".
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+\-m
+Print times in milliseconds. The default is microseconds.
+.TP
+\-M MINIMUM
+Display only collections that are longer than this threshold. The value is
+given in milliseconds. The default is to display all collections.
+.TP
+\-F FILTER
+Display only collections whose textual description matches (contains) this
+string. The default is to display all collections. Note that the filtering here
+is performed in user-space, and not as part of the BPF program. This means that
+if you have thousands of collection events, specifying this filter will not
+reduce the amount of data that has to be transferred from the BPF program to
+the user-space script.
+.TP
+{java,node,python,ruby}
+The language to trace.
+.TP
+pid
+The process id to trace.
+.SH EXAMPLES
+.TP
+Trace garbage collections in a specific Node process:
+#
+.B ugc -l node 148
+.TP
+Trace garbage collections in a specific Java process, and print GC times in
+milliseconds:
+#
+.B ugc -m -l java 6004
+.TP
+Trace garbage collections in a specific Java process, and display them only if
+they are longer than 10ms and have the string "Tenured" in their detailed
+description:
+#
+.B ugc -M 10 -F Tenured -l java 6004
+.SH FIELDS
+.TP
+START
+The start time of the GC, in seconds from the beginning of the trace.
+.TP
+TIME
+The duration of the garbage collection event.
+.TP
+DESCRIPTION
+The runtime-provided description of this garbage collection event.
+.SH OVERHEAD
+Garbage collection events, even if frequent, should not produce a considerable
+overhead when traced because they are still not very common. Even hundreds of 
+GCs per second (which is a very high rate) will still produce a fairly 
+negligible overhead.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+trace(8), ustat(8), uobjnew(8)
diff --git a/man/man8/uobjnew.8 b/man/man8/uobjnew.8
new file mode 100644
index 0000000..f4a9c74
--- /dev/null
+++ b/man/man8/uobjnew.8
@@ -0,0 +1,88 @@
+.TH uobjnew 8  "2018-10-09" "USER COMMANDS"
+.SH NAME
+uobjnew, cobjnew, javaobjnew, rubyobjnew, tclobjnew \- Summarize object allocations in
+high-level languages.
+.SH SYNOPSIS
+.B cobjnew [-h] [-C TOP_COUNT] [-S TOP_SIZE] [-v] pid [interval]
+.br
+.B javaobjnew [-h] [-C TOP_COUNT] [-S TOP_SIZE] [-v] pid [interval]
+.br
+.B rubyobjnew [-h] [-C TOP_COUNT] [-S TOP_SIZE] [-v] pid [interval]
+.br
+.B tclobjnew [-h] [-C TOP_COUNT] [-S TOP_SIZE] [-v] pid [interval]
+.br
+.B uobjnew [-h] [-C TOP_COUNT] [-S TOP_SIZE] [-v] [-l {c,java,ruby,tcl}] pid [interval]
+.SH DESCRIPTION
+uobjnew traces object allocations in high-level languages (including "malloc")
+and prints summaries of the most frequently allocated types by number of
+objects or number of bytes.
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+C, Java, Ruby, and Tcl. It requires a runtime instrumented with these
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace". For Java, the Java process
+must be started with the "-XX:+ExtendedDTraceProbes" flag.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-C TOP_COUNT
+Print the top object types sorted by number of instances.
+.TP
+\-S TOP_SIZE
+Print the top object types sorted by size.
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+{c,java,ruby,tcl}
+The language to trace.
+.TP
+pid
+The process id to trace.
+.TP
+interval
+Wait this many seconds and then print the summary and exit. By default, wait
+for Ctrl+C to exit.
+.SH EXAMPLES
+.TP
+Trace object allocations in a Ruby process:
+#
+.B uobjnew -l ruby 148
+.TP
+Trace object allocations from "malloc" and print the top 10 by total size:
+#
+.B uobjnew -S 10 -l c 1788
+.SH FIELDS
+.TP
+TYPE
+The object type being allocated. For C (malloc), this is the block size.
+.TP
+ALLOCS
+The number of objects allocated.
+.TP
+BYTES
+The number of bytes allocated.
+.SH OVERHEAD
+Object allocation events are quite frequent, and therefore the overhead from
+running this tool can be considerable. Use with caution and make sure to 
+test before using in a production environment. Nonetheless, even thousands of
+allocations per second will likely produce a reasonable overhead when 
+investigating a problem.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+ustat(8), ugc(8), memleak(8)
diff --git a/man/man8/ustat.8 b/man/man8/ustat.8
new file mode 100644
index 0000000..371d855
--- /dev/null
+++ b/man/man8/ustat.8
@@ -0,0 +1,133 @@
+.TH ustat 8  "2018-10-09" "USER COMMANDS"
+.SH NAME
+ustat, javastat, nodestat, perlstat, phpstat, pythonstat, rubystat, tclstat \- Activity stats from
+high-level languages.
+.SH SYNOPSIS
+.B javastat [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.br
+.B nodestat [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.br
+.B perlstat [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.br
+.B phpstat [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.br
+.B pythonstat [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.br
+.B rubystat [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.br
+.B tclstat [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.br
+.B ustat [-l {java,node,perl,php,python,ruby,tcl}] [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.SH DESCRIPTION
+This is "top" for high-level language events, such as garbage collections,
+exceptions, thread creations, object allocations, method calls, and more. The
+events are aggregated for each process and printed in a top-like table, which
+can be sorted by various fields. Not all language runtimes provide the same
+set of details.
+
+This uses in-kernel eBPF maps to store per process summaries for efficiency.
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Java, Node, Perl, PHP, Python, Ruby, and Tcl. It requires a runtime instrumented with
+these probes, which in some cases requires building from source with a
+USDT-specific flag, such as "--enable-dtrace" or "--with-dtrace". For Java,
+some probes are not enabled by default, and can be turned on by running the Java
+process with the "-XX:+ExtendedDTraceProbes" flag. For PHP processes, the
+environment variable USE_ZEND_DTRACE must be set to 1.
+
+Newly-created processes will only be traced at the next interval. If you run
+this tool with a short interval (say, 1-5 seconds), this should be virtually
+unnoticeable. For longer intervals, you might miss processes that were started
+and terminated during the interval window.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-l {java,node,perl,php,python,ruby,tcl}
+The language to trace. By default, all languages are traced.
+.TP
+\-C
+Do not clear the screen between updates.
+.TP
+\-S {cload,excp,gc,method,objnew,thread}
+Sort the output by the specified field.
+.TP
+\-r MAXROWS
+Do not print more than this number of rows.
+.TP
+\-d
+Print the resulting BPF program, for debugging purposes.
+.TP
+interval
+Interval between updates, seconds.
+.TP
+count
+Number of interval summaries.
+.SH EXAMPLES
+.TP
+Summarize activity in high-level languages, 1 second refresh:
+#
+.B ustat
+.TP
+Don't clear the screen, and top 8 rows only:
+#
+.B ustat -Cr 8
+.TP
+5 second summaries, 10 times only:
+#
+.B ustat 5 10
+.SH FIELDS
+.TP
+loadavg
+The contents of /proc/loadavg
+.TP
+PID
+Process ID.
+.TP
+CMDLINE
+Process command line (often the second and following arguments will give you a
+hint as to which application is being run).
+.TP
+METHOD/s
+Count of method invocations during interval.
+.TP
+GC/s
+Count of garbage collections during interval.
+.TP
+OBJNEW/s
+Count of objects allocated during interval.
+.TP
+CLOAD/s
+Count of classes loaded during interval.
+.TP
+EXC/s
+Count of exceptions thrown during interval.
+.TP
+THR/s
+Count of threads created during interval.
+.SH OVERHEAD
+When using this tool with high-frequency events, such as method calls, a very
+significant slow-down can be expected. However, many of the high-level
+languages covered by this tool already have a fairly high per-method invocation
+cost, especially when running in interpreted mode. For the lower-frequency
+events, such as garbage collections or thread creations, the overhead should
+not be significant. Specifically, when probing Java processes and not using the
+"-XX:+ExtendedDTraceProbes" flag, the most expensive probes are not emitted,
+and the overhead should be acceptable.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+trace(8), argdist(8), tplist(8)
diff --git a/man/man8/uthreads.8 b/man/man8/uthreads.8
new file mode 100644
index 0000000..6acffa5
--- /dev/null
+++ b/man/man8/uthreads.8
@@ -0,0 +1,68 @@
+.TH uthreads 8  "2018-10-09" "USER COMMANDS"
+.SH NAME
+uthreads, cthreads, javathreads \- Trace thread creation events in Java or pthreads.
+.SH SYNOPSIS
+.B cthreads [-h] [-v] pid
+.br
+.B javathreads [-h] [-v] pid
+.br
+.B uthreads [-h] [-l {c,java,none}] [-v] pid
+.SH DESCRIPTION
+This traces thread creation events in Java processes, or pthread creation
+events in any process. When a thread is created, its name or start address
+is printed.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-l {c,java,none}
+The language to trace. C and none select tracing pthreads only, regardless
+of the runtime being traced.
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+pid
+The process id to trace.
+.SH EXAMPLES
+.TP
+Trace Java thread creations:
+#
+.B uthreads -l java 148
+.TP
+Trace pthread creations:
+#
+.B uthreads 1802
+.SH FIELDS
+.TP
+TIME
+The event's time in seconds from the beginning of the trace.
+.TP
+ID
+The thread's ID. The information in this column depends on the runtime.
+.TP
+TYPE
+Event type -- thread start, stop, or pthread event.
+.TP
+DESCRIPTION
+The thread's name or start address function name.
+.SH OVERHEAD
+Thread start and stop events are usually not very frequent, which makes this
+tool's overhead negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+ustat(8), trace(8)
diff --git a/man/man8/vfscount.8 b/man/man8/vfscount.8
new file mode 100644
index 0000000..44acffc
--- /dev/null
+++ b/man/man8/vfscount.8
@@ -0,0 +1,54 @@
+.TH vfscount 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+vfscount \- Count VFS calls ("vfs_*"). Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B vfscount
+.SH DESCRIPTION
+This counts VFS calls. This can be useful for general workload
+characterization of these operations.
+
+This works by tracing all kernel functions beginning with "vfs_" using dynamic
+tracing. This may match more functions than you are interested in measuring:
+Edit the script to customize which functions to trace.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Count some VFS calls until Ctrl-C is hit:
+#
+.B vfscount
+.SH FIELDS
+.TP
+ADDR
+Address of the instruction pointer that was traced (only useful if the FUNC column is suspicious and you would like to double check the translation).
+.TP
+FUNC
+Kernel function name
+.TP
+COUNT
+Number of calls while tracing
+.SH OVERHEAD
+This traces kernel vfs functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of VFS operations can
+be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment, and if overheads are an issue, edit the script
+to reduce the types of vfs functions traced (currently all beginning with
+"vfs_").
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+vfsstat(8)
diff --git a/man/man8/vfsstat.8 b/man/man8/vfsstat.8
new file mode 100644
index 0000000..929e6b6
--- /dev/null
+++ b/man/man8/vfsstat.8
@@ -0,0 +1,65 @@
+.TH vfsstat 8  "2015-08-18" "USER COMMANDS"
+.SH NAME
+vfsstat \- Statistics for some common VFS calls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B vfsstat
+[interval [count]]
+.SH DESCRIPTION
+This traces some common VFS calls and prints per-second summaries. This can
+be useful for general workload characterization, and looking for patterns
+in operation usage over time.
+
+This works by tracing some kernel vfs functions using dynamic tracing, and will
+need updating to match any changes to these functions. Edit the script to
+customize which functions are traced. Also see vfscount, which is more
+easily customized to trace multiple functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH EXAMPLES
+.TP
+Print summaries each second:
+#
+.B vfsstat
+.TP
+Print output every five seconds, three times:
+#
+.B vfsstat 5 3
+.SH FIELDS
+.TP
+READ/s
+Number of vfs_read() calls as a per-second average.
+.TP
+WRITE/s
+Number of vfs_write() calls as a per-second average.
+.TP
+CREATE/s
+Number of vfs_create() calls as a per-second average.
+.TP
+OPEN/s
+Number of vfs_open() calls as a per-second average.
+.TP
+FSYNC/s
+Number of vfs_fsync() calls as a per-second average.
+.SH OVERHEAD
+This traces various kernel vfs functions and maintains in-kernel counts, which
+are asynchronously copied to user-space. While the rate of VFS operations can
+be very high (>1M/sec), this is a relatively efficient way to trace these
+events, and so the overhead is expected to be small for normal workloads.
+Measure in a test environment.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+vfscount(8)
diff --git a/man/man8/wakeuptime.8 b/man/man8/wakeuptime.8
new file mode 100644
index 0000000..8630ae4
--- /dev/null
+++ b/man/man8/wakeuptime.8
@@ -0,0 +1,104 @@
+.TH wakeuptime 8  "2016-01-27" "USER COMMANDS"
+.SH NAME
+wakeuptime \- Summarize sleep to wakeup time by waker kernel stack. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B wakeuptime [\-h] [\-u] [\-p PID] [\-v] [\-f] [\-\-stack-storage-size STACK_STORAGE_SIZE] [\-m MIN_BLOCK_TIME] [\-M MAX_BLOCK_TIME] [duration]
+.SH DESCRIPTION
+This program shows the kernel stack traces for threads that woke up other
+blocked threads, along with the process names of the waker and target, along
+with a sum of the time that the target was blocked: the "blocked time".
+It works by tracing when threads block and when they were then woken up, and
+measuring the time delta. This time measurement will be very similar to off-CPU
+time, however, off-CPU time may include a little extra time spent waiting
+on a run queue to be scheduled. The stack traces, process names, and time spent
+blocked is summarized in the kernel using an eBPF map for efficiency.
+
+The output summary will help you identify reasons why threads
+were blocking by showing who woke them up, along with the time they were
+blocked. This spans all types of blocking activity: disk I/O, network I/O,
+locks, page faults, involuntary context switches, etc.
+
+This can be used in conjunction with offcputime, which shows the stack trace
+of the blocked thread. wakeuptime shows the stack trace of the waker thread.
+
+See http://www.brendangregg.com/FlameGraphs/offcpuflamegraphs.html
+
+This tool only works on Linux 4.6+. It uses the new `BPF_STACK_TRACE` table
+APIs to generate the in-kernel stack traces.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-f
+Print output in folded stack format.
+.TP
+\-u
+Only trace user threads (not kernel threads).
+.TP
+\-v
+Show raw addresses (for non-folded format).
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-\-stack-storage-size STACK_STORAGE_SIZE
+Change the number of unique stack traces that can be stored and displayed.
+.TP
+duration
+Duration to trace, in seconds.
+.TP
+\-m MIN_BLOCK_TIME
+The amount of time in microseconds over which we store traces (default 1)
+.TP
+\-M MAX_BLOCK_TIME
+The amount of time in microseconds under which we store traces (default U64_MAX)
+.SH EXAMPLES
+.TP
+Trace all thread blocking events, and summarize (in-kernel) by kernel stack trace and total blocked time:
+#
+.B wakeuptime
+.TP
+Trace user-mode target threads only:
+#
+.B wakeuptime -u
+.TP
+Trace for 5 seconds only:
+#
+.B wakeuptime 5
+.TP
+Trace for 5 seconds, and emit output in folded stack format (suitable for flame graphs):
+#
+.B wakeuptime -f 5
+.TP
+Trace PID 185 only:
+#
+.B wakeuptime -p 185
+.SH OVERHEAD
+This summarizes unique stack traces in-kernel for efficiency, allowing it to
+trace a higher rate of events than methods that post-process in user space. The
+stack trace and time data is only copied to user space once, when the output is
+printed. While these techniques greatly lower overhead, scheduler events are
+still a high frequency event, as they can exceed 1 million events per second,
+and so caution should still be used. Test before production use.
+
+If the overhead is still a problem, take a look at the min block option.
+If your aim is to chase down longer blocking events, then this could
+be increased to filter shorter blocking events, further lowering overhead.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+offcputime(8), stackcount(8)
diff --git a/man/man8/xfsdist.8 b/man/man8/xfsdist.8
new file mode 100644
index 0000000..3c89000
--- /dev/null
+++ b/man/man8/xfsdist.8
@@ -0,0 +1,80 @@
+.TH xfsdist 8  "2016-02-12" "USER COMMANDS"
+.SH NAME
+xfsdist \- Summarize XFS operation latency. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B xfsdist [\-h] [\-T] [\-m] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This tool summarizes time (latency) spent in common XFS file operations: reads,
+writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an
+in-kernel eBPF map to store the histogram for efficiency.
+
+Since this works by tracing the xfs_file_operations interface functions, it
+will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Don't include timestamps on interval output.
+.TP
+\-m
+Output in milliseconds.
+.TP
+\-p PID
+Trace this PID only.
+.SH EXAMPLES
+.TP
+Trace XFS operation time, and print a summary on Ctrl-C:
+#
+.B xfsdist
+.TP
+Trace PID 181 only:
+#
+.B xfsdist -p 181
+.TP
+Print 1 second summaries, 10 times:
+#
+.B xfsdist 1 10
+.TP
+1 second summaries, printed in milliseconds:
+#
+.B xfsdist \-m 1
+.SH FIELDS
+.TP
+msecs
+Range of milliseconds for this bucket.
+.TP
+usecs
+Range of microseconds for this bucket.
+.TP
+count
+Number of operations in this time range.
+.TP
+distribution
+ASCII representation of the distribution (the count column).
+.SH OVERHEAD
+This adds low-overhead instrumentation to these XFS operations,
+including reads and writes from the file system cache. Such reads and writes
+can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool may become noticeable.
+Measure and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+xfssnoop(8)
diff --git a/man/man8/xfsslower.8 b/man/man8/xfsslower.8
new file mode 100644
index 0000000..533a701
--- /dev/null
+++ b/man/man8/xfsslower.8
@@ -0,0 +1,113 @@
+.TH xfsslower 8  "2016-02-11" "USER COMMANDS"
+.SH NAME
+xfsslower \- Trace slow xfs file operations, with per-event details.
+.SH SYNOPSIS
+.B xfsslower [\-h] [\-j] [\-p PID] [min_ms]
+.SH DESCRIPTION
+This tool traces common XFS file operations: reads, writes, opens, and
+syncs. It measures the time spent in these operations, and prints details
+for each that exceeded a threshold.
+
+WARNING: See the OVERHEAD section.
+
+By default, a minimum millisecond threshold of 10 is used. If a threshold of 0
+is used, all events are printed (warning: verbose).
+
+Since this works by tracing the xfs_file_operations interface functions, it
+will need updating to match any changes to these functions.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+\-p PID
+Trace this PID only.
+.TP
+min_ms
+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
+.SH EXAMPLES
+.TP
+Trace synchronous file reads and writes slower than 10 ms:
+#
+.B xfsslower
+.TP
+Trace slower than 1 ms:
+#
+.B xfsslower 1
+.TP
+Trace slower than 1 ms, and output just the fields in parsable format (csv):
+#
+.B xfsslower \-j 1
+.TP
+Trace all file reads and writes (warning: the output will be verbose):
+#
+.B xfsslower 0
+.TP
+Trace slower than 1 ms, for PID 181 only:
+#
+.B xfsslower \-p 181 1
+.SH FIELDS
+.TP
+TIME(s)
+Time of I/O completion since the first I/O seen, in seconds.
+.TP
+COMM
+Process name.
+.TP
+PID
+Process ID.
+.TP
+T
+Type of operation. R == read, W == write, O == open, S == fsync.
+.TP
+OFF_KB
+File offset for the I/O, in Kbytes.
+.TP
+BYTES
+Size of I/O, in bytes.
+.TP
+LAT(ms)
+Latency (duration) of I/O, measured from when it was issued by VFS to the
+filesystem, to when it completed. This time is inclusive of block device I/O,
+file system CPU cycles, file system locks, run queue latency, etc. It's a more
+accurate measure of the latency suffered by applications performing file
+system I/O, than to measure this down at the block device interface.
+.TP
+FILENAME
+A cached kernel file name (comes from dentry->d_iname).
+.TP
+ENDTIME_us
+Completion timestamp, microseconds (\-j only).
+.TP
+OFFSET_b
+File offset, bytes (\-j only).
+.TP
+LATENCY_us
+Latency (duration) of the I/O, in microseconds (\-j only).
+.SH OVERHEAD
+This adds low-overhead instrumentation to these XFS operations,
+including reads and writes from the file system cache. Such reads and writes
+can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool (even if it prints no "slower" events) can
+begin to become significant. Measure and quantify before use. If this
+continues to be a problem, consider switching to a tool that prints in-kernel
+summaries only.
+.PP
+Note that the overhead of this tool should be less than fileslower(8), as
+this tool targets xfs functions only, and not all file read/write paths
+(which can include socket I/O).
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8), funccount(8), fileslower(8)
diff --git a/man/man8/zfsdist.8 b/man/man8/zfsdist.8
new file mode 100644
index 0000000..1e5632f
--- /dev/null
+++ b/man/man8/zfsdist.8
@@ -0,0 +1,83 @@
+.TH zfsdist 8  "2016-02-12" "USER COMMANDS"
+.SH NAME
+zfsdist \- Summarize ZFS operation latency. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B zfsdist [\-h] [\-T] [\-m] [\-p PID] [interval] [count]
+.SH DESCRIPTION
+This tool summarizes time (latency) spent in common ZFS file operations: reads,
+writes, opens, and syncs, and presents it as a power-of-2 histogram. It uses an
+in-kernel eBPF map to store the histogram for efficiency.
+
+This uses kernel dynamic tracing of the ZPL interface (ZFS POSIX
+Layer), and will need updates to match any changes to this interface.
+.TP
+This is intended to work with the ZFS on Linux project:
+http://zfsonlinux.org
+.PP
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Don't include timestamps on interval output.
+.TP
+\-m
+Output in milliseconds.
+.TP
+\-p PID
+Trace this PID only.
+.SH EXAMPLES
+.TP
+Trace ZFS operation time, and print a summary on Ctrl-C:
+#
+.B zfsdist
+.TP
+Trace PID 181 only:
+#
+.B zfsdist -p 181
+.TP
+Print 1 second summaries, 10 times:
+#
+.B zfsdist 1 10
+.TP
+1 second summaries, printed in milliseconds:
+#
+.B zfsdist \-m 1
+.SH FIELDS
+.TP
+msecs
+Range of milliseconds for this bucket.
+.TP
+usecs
+Range of microseconds for this bucket.
+.TP
+count
+Number of operations in this time range.
+.TP
+distribution
+ASCII representation of the distribution (the count column).
+.SH OVERHEAD
+This adds low-overhead instrumentation to these ZFS operations,
+including reads and writes from the file system cache. Such reads and writes
+can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool may become noticeable.
+Measure and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+zfssnoop(8)
diff --git a/man/man8/zfsslower.8 b/man/man8/zfsslower.8
new file mode 100644
index 0000000..8f5c8cf
--- /dev/null
+++ b/man/man8/zfsslower.8
@@ -0,0 +1,116 @@
+.TH zfsslower 8  "2016-02-11" "USER COMMANDS"
+.SH NAME
+zfsslower \- Trace slow zfs file operations, with per-event details.
+.SH SYNOPSIS
+.B zfsslower [\-h] [\-j] [\-p PID] [min_ms]
+.SH DESCRIPTION
+This tool traces common ZFS file operations: reads, writes, opens, and
+syncs. It measures the time spent in these operations, and prints details
+for each that exceeded a threshold.
+
+WARNING: See the OVERHEAD section.
+
+By default, a minimum millisecond threshold of 10 is used. If a threshold of 0
+is used, all events are printed (warning: verbose).
+
+This uses kernel dynamic tracing of the ZPL interface (ZFS POSIX
+Layer), and will need updates to match any changes to this interface.
+.TP
+This is intended to work with the ZFS on Linux project:
+http://zfsonlinux.org
+.PP
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+\-p PID
+Trace this PID only.
+.TP
+min_ms
+Minimum I/O latency (duration) to trace, in milliseconds. Default is 10 ms.
+.SH EXAMPLES
+.TP
+Trace synchronous file reads and writes slower than 10 ms:
+#
+.B zfsslower
+.TP
+Trace slower than 1 ms:
+#
+.B zfsslower 1
+.TP
+Trace slower than 1 ms, and output just the fields in parsable format (csv):
+#
+.B zfsslower \-j 1
+.TP
+Trace all file reads and writes (warning: the output will be verbose):
+#
+.B zfsslower 0
+.TP
+Trace slower than 1 ms, for PID 181 only:
+#
+.B zfsslower \-p 181 1
+.SH FIELDS
+.TP
+TIME(s)
+Time of I/O completion since the first I/O seen, in seconds.
+.TP
+COMM
+Process name.
+.TP
+PID
+Process ID.
+.TP
+T
+Type of operation. R == read, W == write, O == open, S == fsync.
+.TP
+OFF_KB
+File offset for the I/O, in Kbytes.
+.TP
+BYTES
+Size of I/O, in bytes.
+.TP
+LAT(ms)
+Latency (duration) of I/O, measured from when it was issued by VFS to the
+filesystem, to when it completed. This time is inclusive of block device I/O,
+file system CPU cycles, file system locks, run queue latency, etc. It's a more
+accurate measure of the latency suffered by applications performing file
+system I/O, than to measure this down at the block device interface.
+.TP
+FILENAME
+A cached kernel file name (comes from dentry->d_iname).
+.TP
+ENDTIME_us
+Completion timestamp, microseconds (\-j only).
+.TP
+OFFSET_b
+File offset, bytes (\-j only).
+.TP
+LATENCY_us
+Latency (duration) of the I/O, in microseconds (\-j only).
+.SH OVERHEAD
+This adds low-overhead instrumentation to these ZFS operations,
+including reads and writes from the file system cache. Such reads and writes
+can be very frequent (depending on the workload; eg, 1M/sec), at which
+point the overhead of this tool (even if it prints no "slower" events) can
+begin to become significant. Measure and quantify before use. If this
+continues to be a problem, consider switching to a tool that prints in-kernel
+summaries only.
+.PP
+Note that the overhead of this tool should be less than fileslower(8), as
+this tool targets zfs functions only, and not all file read/write paths
+(which can include socket I/O).
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+biosnoop(8), funccount(8), fileslower(8)
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..8dd200e
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,68 @@
+
+## Fedora Demo VM
+
+Before running the script, ensure that virt-install is available on the system.
+
+`./build_bpf_demo.sh -n bpf-demo -k bpf_demo.ks.erb`
+
+After setting up the initial VM, log in (the default password is 'iovisor')
+and determine the DHCP IP. SSH to this IP as root.
+
+To set up a kernel with the right options, run `bpf-kernel-setup`.
+
+```
+[root@bpf-demo ~]# bpf-kernel-setup
+Cloning into 'net-next'...
+```
+After pulling the net-next branch, the kernel config menu should pop up. Ensure
+that the below settings are proper.
+```
+General setup --->
+  [*] Enable bpf() system call
+Networking support --->
+  Networking options --->
+    QoS and/or fair queueing --->
+      <M> BPF-based classifier
+      <M> BPF based action
+    [*] enable BPF Just In Time compiler
+```
+Once the .config is saved, the build will proceed and install the resulting
+kernel. This kernel has updated userspace headers (e.g. the bpf() syscall) which
+install into /usr/local/include...proper packaging for this will be
+distro-dependent.
+
+Next, run `bpf-llvm-setup` to pull and compile LLVM with BPF support enabled.
+```
+[root@bpf-demo ~]# bpf-llvm-setup
+Cloning into 'llvm'...
+```
+The resulting libraries will be installed into /opt/local/llvm.
+
+Next, reboot into the new kernel, either manually or by using the kexec helper.
+```
+[root@bpf-demo ~]# kexec-4.1.0-rc1+
+Connection to 192.168.122.247 closed by remote host.
+Connection to 192.168.122.247 closed.
+```
+
+Reconnect and run the final step, building and testing bcc.
+```
+[root@bpf-demo ~]# bcc-setup
+Cloning into 'bcc'...
+...
+Linking CXX shared library libcc.so
+[100%] Built target bcc
+...
+Running tests...
+Test project /root/bcc/build
+    Start 1: py_test1
+1/4 Test #1: py_test1 .........................   Passed    0.24 sec
+    Start 2: py_test2
+2/4 Test #2: py_test2 .........................   Passed    0.53 sec
+    Start 3: py_trace1
+3/4 Test #3: py_trace1 ........................   Passed    0.09 sec
+    Start 4: py_trace2
+4/4 Test #4: py_trace2 ........................   Passed    1.06 sec
+
+100% tests passed, 0 tests failed out of 4
+```
diff --git a/scripts/bpf_demo.ks.erb b/scripts/bpf_demo.ks.erb
new file mode 100644
index 0000000..a32f0b6
--- /dev/null
+++ b/scripts/bpf_demo.ks.erb
@@ -0,0 +1,126 @@
+# Minimal Kickstart file
+install
+text
+reboot
+lang en_US.UTF-8
+
+# repo to install the OS
+url --url=<%= @mirror %>/Everything/x86_64/os/
+
+keyboard us
+network --bootproto dhcp
+rootpw <%= @password %>
+authconfig --enableshadow --passalgo=sha512 --enablefingerprint
+firewall --enabled --ssh
+selinux --enforcing
+timezone --utc America/Los_Angeles
+#firstboot --disable
+bootloader --location=mbr --append="console=tty0 console=ttyS0,115200 rd_NO_PLYMOUTH crashkernel=auto"
+zerombr
+clearpart --all --initlabel
+autopart --type=lvm
+repo --name=everything --baseurl=<%= @mirror %>/Everything/x86_64/os/
+
+#Just core packages
+%packages --nobase
+@core
+ntp
+@c-development
+@development-tools
+@rpm-development-tools
+ncurses-devel
+vim
+bc
+kexec-tools
+cmake
+libstdc++-static
+python-netaddr
+python-futures
+%end
+
+%post --log=/root/anaconda-post.log
+echo Kickstart post
+
+chkconfig NetworkManager off
+chkconfig network on
+
+chkconfig ntpd on
+
+dnf config-manager --add-repo=http://alt.fedoraproject.org/pub/alt/rawhide-kernel-nodebug/fedora-rawhide-kernel-nodebug.repo
+
+yum -y clean metadata
+yum -y update
+
+hostname <%= @name %>.<%= @domain %>
+echo "<%= @name %>.<%= @domain %>" > /etc/hostname
+
+cat > /usr/local/bin/bpf-kernel-setup <<'DELIM__'
+#!/bin/bash
+set -e -x
+numcpu=$(grep -c ^processor /proc/cpuinfo)
+
+git clone https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
+cd net-next/
+
+cp /boot/config-$(uname -r) ./
+cp ./config-$(uname -r) .config
+
+make -j$numcpu mrproper
+make -j$numcpu nconfig
+make -j$numcpu bzImage
+make -j$numcpu modules
+sudo make modules_install
+sudo make install
+sudo make INSTALL_HDR_PATH=/usr/local headers_install
+
+release=$(<include/config/kernel.release)
+echo "kexec -l /boot/vmlinuz-$release --initrd=/boot/initramfs-$release.img --reuse-cmdline; reboot" > /usr/local/bin/kexec-$release
+chmod +x /usr/local/bin/kexec-$release
+ln -fs kexec-$release /usr/local/bin/kexec-latest
+
+DELIM__
+chmod +x /usr/local/bin/bpf-kernel-setup
+
+cat > /usr/local/bin/bpf-llvm-setup <<'DELIM__'
+#!/bin/bash
+set -e -x
+numcpu=$(grep -c ^processor /proc/cpuinfo)
+
+git clone https://github.com/llvm-mirror/llvm.git
+git clone https://github.com/llvm-mirror/clang.git llvm/tools/clang
+mkdir llvm/build/
+cd llvm/build/
+
+cmake .. \
+  -DBUILD_SHARED_LIBS=OFF \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DLLVM_ENABLE_TERMINFO=OFF \
+  -DLLVM_TARGETS_TO_BUILD="ARM;CppBackend;X86;BPF" \
+  -DCMAKE_INSTALL_PREFIX=/opt/local/llvm
+
+make -j$numcpu
+sudo make install
+grep -q llvm $HOME/.bashrc || echo 'PATH=/opt/local/llvm/bin:$PATH' >> $HOME/.bashrc
+
+DELIM__
+chmod +x /usr/local/bin/bpf-llvm-setup
+
+cat > /usr/local/bin/bcc-setup <<'DELIM__'
+#!/bin/bash
+set -e -x
+
+git clone https://github.com/svinota/pyroute2.git
+(cd pyroute2; make install)
+
+numcpu=$(grep -c ^processor /proc/cpuinfo)
+
+git clone https://github.com/iovisor/bcc.git
+mkdir bcc/build/
+cd bcc/build/
+export PATH=/opt/local/llvm/bin:$PATH
+cmake .. -DCMAKE_INSTALL_PREFIX=/usr
+make -j$numcpu
+DELIM__
+chmod +x /usr/local/bin/bcc-setup
+
+%end
diff --git a/scripts/build-deb.sh b/scripts/build-deb.sh
new file mode 100755
index 0000000..97bed85
--- /dev/null
+++ b/scripts/build-deb.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# helper script to be invoked by jenkins/buildbot
+
+# $1 [optional]: the build type - release | nightly | test
+buildtype=${1:-test}
+
+set -x
+set -e
+
+PARALLEL=${PARALLEL:-1}
+TMP=$(mktemp -d /tmp/debuild.XXXXXX)
+
+function cleanup() {
+  [[ -d $TMP ]] && rm -rf $TMP
+}
+trap cleanup EXIT
+
+. scripts/git-tag.sh
+
+git archive HEAD --prefix=bcc/ --format=tar.gz -o $TMP/bcc_$revision.orig.tar.gz
+
+pushd $TMP
+tar xf bcc_$revision.orig.tar.gz
+cd bcc
+
+debuild=debuild
+if [[ "$buildtype" = "test" ]]; then
+  # when testing, use faster compression options
+  debuild+=" --preserve-envvar PATH"
+  echo -e '#!/bin/bash\nexec /usr/bin/dpkg-deb -z1 "$@"' \
+    | sudo tee /usr/local/bin/dpkg-deb
+  sudo chmod +x /usr/local/bin/dpkg-deb
+  dch -b -v $revision-$release "$git_subject"
+fi
+if [[ "$buildtype" = "nightly" ]]; then
+  dch -v $revision-$release "$git_subject"
+fi
+
+DEB_BUILD_OPTIONS="nocheck parallel=${PARALLEL}" $debuild -us -uc
+popd
+
+cp $TMP/*.deb .
diff --git a/scripts/build-deb.sh.in b/scripts/build-deb.sh.in
new file mode 100755
index 0000000..ec0cee9
--- /dev/null
+++ b/scripts/build-deb.sh.in
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -x
+set -e
+
+PARALLEL=${PARALLEL:-1}
+TMP=$(mktemp -d /tmp/debuild.XXXXXX)
+
+function cleanup() {
+  [[ -d $TMP ]] && rm -rf $TMP
+}
+trap cleanup EXIT
+
+mkdir $TMP/bcc
+cp -a * $TMP/bcc
+pushd $TMP
+tar zcf bcc_@REVISION_LAST@.orig.tar.gz bcc/
+cd bcc
+DEB_BUILD_OPTIONS="nocheck parallel=${PARALLEL}" debuild -us -uc
+popd
+
+cp $TMP/*.deb .
diff --git a/scripts/build-release-rpm.sh b/scripts/build-release-rpm.sh
new file mode 100755
index 0000000..0fd6b70
--- /dev/null
+++ b/scripts/build-release-rpm.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -x
+set -e
+
+TMP=$(mktemp -d /tmp/rpmbuild.XXXXXX)
+
+function cleanup() {
+  [[ -d $TMP ]] && rm -rf $TMP
+}
+trap cleanup EXIT
+
+mkdir $TMP/{BUILD,RPMS,SOURCES,SPECS,SRPMS}
+
+llvmver=3.7.1
+
+. scripts/git-tag.sh
+
+git archive HEAD --prefix=bcc/ --format=tar.gz -o $TMP/SOURCES/$git_tag_latest.tar.gz
+wget -P $TMP/SOURCES http://llvm.org/releases/$llvmver/{cfe,llvm}-$llvmver.src.tar.xz
+
+sed \
+  -e "s/^\(Version:\s*\)@REVISION@/\1$revision/" \
+  -e "s/^\(Release:\s*\)@GIT_REV_COUNT@/\1$release/" \
+  SPECS/bcc+clang.spec > $TMP/SPECS/bcc.spec
+
+pushd $TMP
+rpmbuild --define "_topdir `pwd`" -ba SPECS/bcc.spec
+popd
+
+cp $TMP/RPMS/*/*.rpm .
+cp $TMP/SRPMS/*.rpm .
diff --git a/scripts/build-rpm.sh b/scripts/build-rpm.sh
new file mode 100755
index 0000000..0616501
--- /dev/null
+++ b/scripts/build-rpm.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -x
+set -e
+
+TMP=$(mktemp -d /tmp/rpmbuild.XXXXXX)
+
+function cleanup() {
+  [[ -d $TMP ]] && rm -rf $TMP
+}
+trap cleanup EXIT
+
+mkdir $TMP/{BUILD,RPMS,SOURCES,SPECS,SRPMS}
+
+llvmver=3.7.1
+
+. scripts/git-tag.sh
+
+git archive HEAD --prefix=bcc/ --format=tar.gz -o $TMP/SOURCES/bcc.tar.gz
+
+sed \
+  -e "s/^\(Version:\s*\)@REVISION@/\1$revision/" \
+  -e "s/^\(Release:\s*\)@GIT_REV_COUNT@/\1$release/" \
+  SPECS/bcc.spec > $TMP/SPECS/bcc.spec
+
+pushd $TMP
+rpmbuild $RPM_WITH_OPTS --define "_topdir `pwd`" -ba SPECS/bcc.spec
+popd
+
+cp $TMP/RPMS/*/*.rpm .
+cp $TMP/SRPMS/*.rpm .
diff --git a/scripts/build_bpf_demo.sh b/scripts/build_bpf_demo.sh
new file mode 100755
index 0000000..ce26914
--- /dev/null
+++ b/scripts/build_bpf_demo.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+#set -x
+set -e
+
+function usage() {
+  cat <<DELIM__
+usage: $(basename $0) [options]
+
+Options:
+  -b, --bridge BRNAME   Which linux bridge to attach to
+  -c, --cpu NUM         Number of CPUs to reserve to the instance (default 4)
+  -g, --github_token X  HTTP Github oauth token (for buildbots)
+  -k, --kickstart KS    Path to kickstart file to use (required)
+  -m, --mirror URL      URL at which to reach netinstallable packages
+  -M, --mem NUM         Number of MB to reserve to the instance (default 4094)
+  -n, --name NAME       Name of the instance (required)
+  -p, --password PASS   Password to set in the VM
+  -s, --size NUM        Size in GB to reserve for the virtual HDD (default 40GB)
+DELIM__
+}
+
+TEMP=$(getopt -o b:c:k:m:M:n:p:s: --long bridge:,cpu:,kickstart:,mirror:,mem:,name:,password:,size: -- "$@")
+if [[ $? -ne 0 ]]; then
+  usage
+  exit 1
+fi
+
+eval set -- "$TEMP"
+
+while true; do
+  case "$1" in
+    -b|--bridge) BRIDGE="$2"; shift 2 ;;
+    -c|--cpu) CPU="$2"; shift 2 ;;
+    -k|--kickstart) KICKSTART="$2"; shift 2 ;;
+    -n|--name) NAME="$2"; shift 2 ;;
+    -m|--mirror) MIRROR="$2"; shift 2 ;;
+    -M|--mem) MEM="$2"; shift 2 ;;
+    -p|--password) PASSWORD="$2"; shift 2 ;;
+    -s|--size) SIZE="$2"; shift 2 ;;
+    --) shift; break ;;
+    *) usage; exit 1
+      ;;
+  esac
+done
+[[ ! -f "$KICKSTART" ]] && { usage; exit 1; }
+[[ -z "$NAME" ]] && { usage; exit 1; }
+
+PASSWORD=${PASSWORD:-"iovisor"}
+BRIDGE=${BRIDGE:-virbr0}
+MIRROR=${MIRROR:-http://mirror.pnl.gov/fedora/linux/releases/22}
+MEM=${MEM:-4094}
+CPU=${CPU:-4}
+SIZE=${SIZE:-40}
+
+if [[ "$(id -u)" != "0" ]]; then
+  sudo="sudo"
+fi
+
+if ! which virt-install &> /dev/null; then
+  echo "Error: virt-install is not installed"
+  exit 1
+fi
+
+libvirt_dir=/var/lib/libvirt/images
+img_name=$NAME
+tmpdir=$(mktemp -d /tmp/virt-install_XXXXX)
+tmp_ks_file=$tmpdir/$img_name.ks
+
+function cleanup() {
+  set +e
+  [[ -d "$tmpdir" ]] && rm -fr "$tmpdir"
+  local destroy_kvm=n
+  [[ -f "/etc/libvirt/qemu/$img_name.xml" ]] && read -p "Destroy libvirt VM (y/n)? " destroy_kvm
+  if [[ "$destroy_kvm" != n* ]]; then
+    virsh destroy $img_name
+    virsh undefine $img_name
+    virsh vol-delete $img_name.img --pool default
+    $sudo rm -f $libvirt_dir/$img_name.img
+  fi
+}
+trap cleanup EXIT
+
+ruby <<DELIM__
+require 'erb'
+@password="$PASSWORD"
+@name="$NAME"
+@domain="example.com"
+@github_access_token="$GITHUB_ACCESS_TOKEN"
+@mirror="$MIRROR"
+File.open('$tmp_ks_file', 'w') do |f|
+  f.puts ERB.new(File.open('$KICKSTART', 'rb').read, nil, '-').result()
+end
+DELIM__
+
+tree=$MIRROR/Server/x86_64/os/
+virt-install --connect=qemu:///system \
+    --network=bridge:$BRIDGE \
+    --initrd-inject=$tmp_ks_file \
+    --controller type=scsi,model=virtio-scsi \
+    --extra-args="ks=file:/$(basename $tmp_ks_file) console=tty0 console=ttyS0,115200" \
+    --name=$img_name \
+    --disk $libvirt_dir/$img_name.img,cache=none,format=qcow2,size=$SIZE,bus=scsi \
+    --ram $MEM \
+    --vcpus=$CPU \
+    --check-cpu \
+    --accelerate \
+    --hvm \
+    --location=$tree \
+    --nographics
+
+echo "SUCCESS"
+exit 0
diff --git a/scripts/check-helpers.sh b/scripts/check-helpers.sh
new file mode 100755
index 0000000..fe79971
--- /dev/null
+++ b/scripts/check-helpers.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+ret=0
+
+libbpf=$(grep -oP '(?<={")\w+(?=", "\d\.\d+")' src/cc/libbpf.c | sort)
+doc=$(grep -oP "(?<=BPF_FUNC_)\w+" docs/kernel-versions.md | sort)
+dif=$(diff <(echo "$doc") <(echo "$libbpf"))
+if [ $? -ne 0 ]; then
+	echo "The lists of helpers in src/cc/libbpf.c and docs/kernel-versions.md differ:"
+	echo -e "$dif\n"
+	((ret++))
+fi
+
+compat=$(grep -oP "(?<=^\sFN\()\w+" src/cc/compat/linux/bpf.h | tail -n +2 | sort)
+dif=$(diff <(echo "$doc") <(echo "$compat"))
+if [ $? -ne 0 ]; then
+	echo "The lists of helpers in docs/kernel-versions.md and src/cc/compat/linux/bpf.h differ:"
+	echo -e "$dif\n"
+	((ret++))
+fi
+
+virtual=$(grep -oP "(?<=^\sFN\()\w+" src/cc/compat/linux/virtual_bpf.h | tail -n +2 | sort -u)
+dif=$(diff <(echo "$compat") <(echo "$virtual"))
+if [ $? -ne 0 ]; then
+	echo "The lists of helpers in src/cc/compat/linux/bpf.h and src/cc/compat/linux/virtual_bpf.h differ:"
+	echo "$dif"
+	((ret++))
+fi
+
+export=$(grep -oP "(?<=BPF_FUNC_)\w+" src/cc/export/helpers.h | sort -u)
+dif=$(diff <(echo "$compat") <(echo "$export"))
+if [ $? -ne 0 ]; then
+	echo "The lists of helpers in src/cc/compat/linux/bpf.h and src/cc/export/helpers.h differ:"
+	echo "$dif"
+	((ret++))
+fi
+
+exit $ret
diff --git a/scripts/git-clang-format b/scripts/git-clang-format
new file mode 100755
index 0000000..74310b7
--- /dev/null
+++ b/scripts/git-clang-format
@@ -0,0 +1,485 @@
+#!/usr/bin/env python2
+#
+#===- git-clang-format - ClangFormat Git Integration ---------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+r"""                                                                             
+clang-format git integration                                                     
+============================                                                     
+                                                                                 
+This file provides a clang-format integration for git. Put it somewhere in your  
+path and ensure that it is executable. Then, "git clang-format" will invoke      
+clang-format on the changes in current files or a specific commit.               
+                                                                                 
+For further details, run:                                                        
+git clang-format -h                                                              
+                                                                                 
+Requires Python 2.7                                                              
+"""               
+
+import argparse
+import collections
+import contextlib
+import errno
+import os
+import re
+import subprocess
+import sys
+
+# One-line usage string shown by argparse.
+usage = 'git clang-format [OPTIONS] [<commit>] [--] [<file>...]'
+
+desc = '''
+Run clang-format on all lines that differ between the working directory
+and <commit>, which defaults to HEAD.  Changes are only applied to the working
+directory.
+
+The following git-config settings set the default of the corresponding option:
+  clangFormat.binary
+  clangFormat.commit
+  clangFormat.extension
+  clangFormat.style
+'''
+
+# Name of the temporary index file in which to save the output of clang-format.
+# This file is created within the .git directory.
+temp_index_basename = 'clang-format-index'
+
+
+# Start line (1-based) and line count of one changed region of a file.
+Range = collections.namedtuple('Range', 'start, count')
+
+
+def main():
+  """Entry point: parse options, find the changed lines, run clang-format on
+  them, and apply or display the result."""
+  config = load_git_config()
+
+  # In order to keep '--' yet allow options after positionals, we need to
+  # check for '--' ourselves.  (Setting nargs='*' throws away the '--', while
+  # nargs=argparse.REMAINDER disallows options after positionals.)
+  argv = sys.argv[1:]
+  try:
+    idx = argv.index('--')
+  except ValueError:
+    dash_dash = []
+  else:
+    dash_dash = argv[idx:]
+    argv = argv[:idx]
+
+  default_extensions = ','.join([
+      # From clang/lib/Frontend/FrontendOptions.cpp, all lower case
+      'c', 'h',  # C
+      'm',  # ObjC
+      'mm',  # ObjC++
+      'cc', 'cp', 'cpp', 'c++', 'cxx', 'hpp',  # C++
+      # Other languages that clang-format supports
+      'proto', 'protodevel',  # Protocol Buffers
+      'js',  # JavaScript
+      'ts',  # TypeScript
+      ])
+
+  p = argparse.ArgumentParser(
+    usage=usage, formatter_class=argparse.RawDescriptionHelpFormatter,
+    description=desc)
+  p.add_argument('--binary',
+                 default=config.get('clangformat.binary', 'clang-format'),
+                 help='path to clang-format'),
+  p.add_argument('--commit',
+                 default=config.get('clangformat.commit', 'HEAD'),
+                 help='default commit to use if none is specified'),
+  p.add_argument('--diff', action='store_true',
+                 help='print a diff instead of applying the changes')
+  p.add_argument('--extensions',
+                 default=config.get('clangformat.extensions',
+                                    default_extensions),
+                 help=('comma-separated list of file extensions to format, '
+                       'excluding the period and case-insensitive')),
+  p.add_argument('-f', '--force', action='store_true',
+                 help='allow changes to unstaged files')
+  p.add_argument('-p', '--patch', action='store_true',
+                 help='select hunks interactively')
+  p.add_argument('-q', '--quiet', action='count', default=0,
+                 help='print less information')
+  p.add_argument('--style',
+                 default=config.get('clangformat.style', None),
+                 help='passed to clang-format'),
+  p.add_argument('-v', '--verbose', action='count', default=0,
+                 help='print extra information')
+  # We gather all the remaining positional arguments into 'args' since we need
+  # to use some heuristics to determine whether or not <commit> was present.
+  # However, to print pretty messages, we make use of metavar and help.
+  p.add_argument('args', nargs='*', metavar='<commit>',
+                 help='revision from which to compute the diff')
+  p.add_argument('ignored', nargs='*', metavar='<file>...',
+                 help='if specified, only consider differences in these files')
+  opts = p.parse_args(argv)
+
+  # Collapse -v/-q counts into a single signed verbosity level.
+  opts.verbose -= opts.quiet
+  del opts.quiet
+
+  commit, files = interpret_args(opts.args, dash_dash, opts.commit)
+  changed_lines = compute_diff_and_extract_lines(commit, files)
+  if opts.verbose >= 1:
+    ignored_files = set(changed_lines)
+  filter_by_extension(changed_lines, opts.extensions.lower().split(','))
+  if opts.verbose >= 1:
+    ignored_files.difference_update(changed_lines)
+    if ignored_files:
+      print 'Ignoring changes in the following files (wrong extension):'
+      for filename in ignored_files:
+        print '   ', filename
+    if changed_lines:
+      print 'Running clang-format on the following files:'
+      for filename in changed_lines:
+        print '   ', filename
+    else:
+      # NOTE(review): this early return for "no modified files" only triggers
+      # when verbose >= 1; at default verbosity an empty set falls through to
+      # the tree-creation steps below.
+      print 'no modified files to format'
+      return
+  # The computed diff outputs absolute paths, so we must cd before accessing
+  # those files.
+  cd_to_toplevel()
+  old_tree = create_tree_from_workdir(changed_lines)
+  new_tree = run_clang_format_and_save_to_tree(changed_lines,
+                                               binary=opts.binary,
+                                               style=opts.style)
+  if opts.verbose >= 1:
+    print 'old tree:', old_tree
+    print 'new tree:', new_tree
+  if old_tree == new_tree:
+    if opts.verbose >= 0:
+      print 'clang-format did not modify any files'
+  elif opts.diff:
+    print_diff(old_tree, new_tree)
+  else:
+    changed_files = apply_changes(old_tree, new_tree, force=opts.force,
+                                  patch_mode=opts.patch)
+    if (opts.verbose >= 0 and not opts.patch) or opts.verbose >= 1:
+      print 'changed files:'
+      for filename in changed_files:
+        print '   ', filename
+
+
+def load_git_config(non_string_options=None):
+  """Return the git configuration as a dictionary.
+
+  All options are assumed to be strings unless listed in `non_string_options`,
+  which is a dictionary mapping option name (in lower case) to either "--bool"
+  or "--int"."""
+  if non_string_options is None:
+    non_string_options = {}
+  out = {}
+  # `git config --list --null` NUL-terminates entries and separates name from
+  # value with a newline, which is unambiguous even for multi-line values.
+  for entry in run('git', 'config', '--list', '--null').split('\0'):
+    if entry:
+      name, value = entry.split('\n', 1)
+      if name in non_string_options:
+        # Re-query with --bool/--int so git canonicalizes the value.
+        value = run('git', 'config', non_string_options[name], name)
+      out[name] = value
+  return out
+
+
+def interpret_args(args, dash_dash, default_commit):
+  """Interpret `args` as "[commit] [--] [files...]" and return (commit, files).
+
+  It is assumed that "--" and everything that follows has been removed from
+  args and placed in `dash_dash`.
+
+  If "--" is present (i.e., `dash_dash` is non-empty), the argument to its
+  left (if present) is taken as commit.  Otherwise, the first argument is
+  checked if it is a commit or a file.  If commit is not given,
+  `default_commit` is used."""
+  if dash_dash:
+    if len(args) == 0:
+      commit = default_commit
+    elif len(args) > 1:
+      die('at most one commit allowed; %d given' % len(args))
+    else:
+      commit = args[0]
+    object_type = get_object_type(commit)
+    if object_type not in ('commit', 'tag'):
+      if object_type is None:
+        die("'%s' is not a commit" % commit)
+      else:
+        die("'%s' is a %s, but a commit was expected" % (commit, object_type))
+    # Everything after the "--" marker itself is a pathspec.
+    files = dash_dash[1:]
+  elif args:
+    if disambiguate_revision(args[0]):
+      commit = args[0]
+      files = args[1:]
+    else:
+      commit = default_commit
+      files = args
+  else:
+    commit = default_commit
+    files = []
+  return commit, files
+
+
+def disambiguate_revision(value):
+  """Returns True if `value` is a revision, False if it is a file, or dies."""
+  # If `value` is ambiguous (neither a commit nor a file), the following
+  # command will die with an appropriate error message.
+  run('git', 'rev-parse', value, verbose=False)
+  object_type = get_object_type(value)
+  if object_type is None:
+    return False
+  if object_type in ('commit', 'tag'):
+    return True
+  die('`%s` is a %s, but a commit or filename was expected' %
+      (value, object_type))
+
+
+def get_object_type(value):
+  """Returns a string description of an object's type, or None if it is not
+  a valid git object."""
+  cmd = ['git', 'cat-file', '-t', value]
+  p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+  stdout, stderr = p.communicate()
+  if p.returncode != 0:
+    # Not a git object; cat-file's error text is deliberately discarded.
+    return None
+  return stdout.strip()
+
+
+def compute_diff_and_extract_lines(commit, files):
+  """Calls compute_diff() followed by extract_lines()."""
+  diff_process = compute_diff(commit, files)
+  changed_lines = extract_lines(diff_process.stdout)
+  diff_process.stdout.close()
+  diff_process.wait()
+  if diff_process.returncode != 0:
+    # Assume error was already printed to stderr.
+    sys.exit(2)
+  return changed_lines
+
+
+def compute_diff(commit, files):
+  """Return a subprocess object producing the diff from `commit`.
+
+  The return value's `stdout` file object will produce a patch with the
+  differences between the working directory and `commit`, filtered on `files`
+  (if non-empty).  Zero context lines are used in the patch."""
+  cmd = ['git', 'diff-index', '-p', '-U0', commit, '--']
+  cmd.extend(files)
+  p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+  # No input is sent; close stdin so the child never blocks waiting on it.
+  p.stdin.close()
+  return p
+
+
+def extract_lines(patch_file):
+  """Extract the changed lines in `patch_file`.
+
+  The return value is a dictionary mapping filename to a list of (start_line,
+  line_count) pairs.
+
+  The input must have been produced with ``-U0``, meaning unidiff format with
+  zero lines of context.  The return value is a dict mapping filename to a
+  list of line `Range`s."""
+  matches = {}
+  for line in patch_file:
+    # "+++ b/path/to/file" headers: remember which file the following hunks
+    # belong to (the leading "b/" component is stripped by the capture group).
+    match = re.search(r'^\+\+\+\ [^/]+/(.*)', line)
+    if match:
+      filename = match.group(1).rstrip('\r\n')
+    # "@@ -a,b +start,count @@" hunk headers: record the post-image range.
+    match = re.search(r'^@@ -[0-9,]+ \+(\d+)(,(\d+))?', line)
+    if match:
+      start_line = int(match.group(1))
+      # A missing count means a single-line hunk.
+      line_count = 1
+      if match.group(3):
+        line_count = int(match.group(3))
+      # A count of 0 is a pure deletion; there is nothing to format there.
+      if line_count > 0:
+        matches.setdefault(filename, []).append(Range(start_line, line_count))
+  return matches
+
+
+def filter_by_extension(dictionary, allowed_extensions):
+  """Delete every key in `dictionary` that doesn't have an allowed extension.
+
+  `allowed_extensions` must be a collection of lowercase file extensions,
+  excluding the period."""
+  allowed_extensions = frozenset(allowed_extensions)
+  # NOTE(review): deleting while iterating over .keys() is only safe on
+  # Python 2 (the shebang pins python2), where keys() returns a list; under
+  # Python 3 this would raise RuntimeError and needs list(dictionary.keys()).
+  for filename in dictionary.keys():
+    base_ext = filename.rsplit('.', 1)
+    if len(base_ext) == 1 or base_ext[1].lower() not in allowed_extensions:
+      del dictionary[filename]
+
+
+def cd_to_toplevel():
+  """Change to the top level of the git repository."""
+  toplevel = run('git', 'rev-parse', '--show-toplevel')
+  os.chdir(toplevel)
+
+
+def create_tree_from_workdir(filenames):
+  """Create a new git tree with the given files from the working directory.
+
+  Returns the object ID (SHA-1) of the created tree."""
+  return create_tree(filenames, '--stdin')
+
+
+def run_clang_format_and_save_to_tree(changed_lines, binary='clang-format',
+                                      style=None):
+  """Run clang-format on each file and save the result to a git tree.
+
+  Returns the object ID (SHA-1) of the created tree."""
+  # Lazily yields "<mode> <sha1>\t<filename>" lines for update-index.
+  def index_info_generator():
+    for filename, line_ranges in changed_lines.iteritems():
+      # NOTE(review): Python 2 oct() yields e.g. "0100644", which git accepts;
+      # Python 3 would yield "0o100644" and break this line.
+      mode = oct(os.stat(filename).st_mode)
+      blob_id = clang_format_to_blob(filename, line_ranges, binary=binary,
+                                     style=style)
+      yield '%s %s\t%s' % (mode, blob_id, filename)
+  return create_tree(index_info_generator(), '--index-info')
+
+
+def create_tree(input_lines, mode):
+  """Create a tree object from the given input.
+
+  If mode is '--stdin', it must be a list of filenames.  If mode is
+  '--index-info' it must be a list of values suitable for "git update-index
+  --index-info", such as "<mode> <SP> <sha1> <TAB> <filename>".  Any other mode
+  is invalid."""
+  assert mode in ('--stdin', '--index-info')
+  cmd = ['git', 'update-index', '--add', '-z', mode]
+  # Use a scratch index so the user's real index is never touched.
+  with temporary_index_file():
+    p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
+    for line in input_lines:
+      # -z mode: entries are NUL-terminated.
+      p.stdin.write('%s\0' % line)
+    p.stdin.close()
+    if p.wait() != 0:
+      die('`%s` failed' % ' '.join(cmd))
+    tree_id = run('git', 'write-tree')
+    return tree_id
+
+
+def clang_format_to_blob(filename, line_ranges, binary='clang-format',
+                         style=None):
+  """Run clang-format on the given file and save the result to a git blob.
+
+  Returns the object ID (SHA-1) of the created blob."""
+  clang_format_cmd = [binary, filename]
+  if style:
+    clang_format_cmd.extend(['-style='+style])
+  # Restrict formatting to the changed ranges only (-lines=start:end,
+  # inclusive on both ends).
+  clang_format_cmd.extend([
+      '-lines=%s:%s' % (start_line, start_line+line_count-1)
+      for start_line, line_count in line_ranges])
+  try:
+    clang_format = subprocess.Popen(clang_format_cmd, stdin=subprocess.PIPE,
+                                    stdout=subprocess.PIPE)
+  except OSError as e:
+    if e.errno == errno.ENOENT:
+      die('cannot find executable "%s"' % binary)
+    else:
+      raise
+  clang_format.stdin.close()
+  # Pipe clang-format's output straight into `git hash-object -w` so the
+  # formatted result is stored as a blob without touching the worktree.
+  hash_object_cmd = ['git', 'hash-object', '-w', '--path='+filename, '--stdin']
+  hash_object = subprocess.Popen(hash_object_cmd, stdin=clang_format.stdout,
+                                 stdout=subprocess.PIPE)
+  # Close our handle on the pipe so hash-object sees EOF once clang-format
+  # exits.
+  clang_format.stdout.close()
+  stdout = hash_object.communicate()[0]
+  if hash_object.returncode != 0:
+    die('`%s` failed' % ' '.join(hash_object_cmd))
+  if clang_format.wait() != 0:
+    die('`%s` failed' % ' '.join(clang_format_cmd))
+  return stdout.rstrip('\r\n')
+
+
+@contextlib.contextmanager
+def temporary_index_file(tree=None):
+  """Context manager for setting GIT_INDEX_FILE to a temporary file and deleting
+  the file afterward."""
+  index_path = create_temporary_index(tree)
+  old_index_path = os.environ.get('GIT_INDEX_FILE')
+  os.environ['GIT_INDEX_FILE'] = index_path
+  try:
+    yield
+  finally:
+    # Restore the caller's GIT_INDEX_FILE exactly: previously-unset stays
+    # unset, otherwise the previous value is put back.
+    if old_index_path is None:
+      del os.environ['GIT_INDEX_FILE']
+    else:
+      os.environ['GIT_INDEX_FILE'] = old_index_path
+    os.remove(index_path)
+
+
+def create_temporary_index(tree=None):
+  """Create a temporary index file and return the created file's path.
+
+  If `tree` is not None, use that as the tree to read in.  Otherwise, an
+  empty index is created."""
+  gitdir = run('git', 'rev-parse', '--git-dir')
+  path = os.path.join(gitdir, temp_index_basename)
+  if tree is None:
+    tree = '--empty'
+  run('git', 'read-tree', '--index-output='+path, tree)
+  return path
+
+
+def print_diff(old_tree, new_tree):
+  """Print the diff between the two trees to stdout."""
+  # We use the porcelain 'diff' and not plumbing 'diff-tree' because the output
+  # is expected to be viewed by the user, and only the former does nice things
+  # like color and pagination.
+  subprocess.check_call(['git', 'diff', old_tree, new_tree, '--'])
+
+
+def apply_changes(old_tree, new_tree, force=False, patch_mode=False):
+  """Apply the changes in `new_tree` to the working directory.
+
+  Bails if there are local changes in those files and not `force`.  If
+  `patch_mode`, runs `git checkout --patch` to select hunks interactively."""
+  # -z/--name-only: NUL-separated list of paths that differ between the trees.
+  changed_files = run('git', 'diff-tree', '-r', '-z', '--name-only', old_tree,
+                      new_tree).rstrip('\0').split('\0')
+  if not force:
+    unstaged_files = run('git', 'diff-files', '--name-status', *changed_files)
+    if unstaged_files:
+      print >>sys.stderr, ('The following files would be modified but '
+                           'have unstaged changes:')
+      print >>sys.stderr, unstaged_files
+      print >>sys.stderr, 'Please commit, stage, or stash them first.'
+      sys.exit(2)
+  if patch_mode:
+    # In patch mode, we could just as well create an index from the new tree
+    # and checkout from that, but then the user will be presented with a
+    # message saying "Discard ... from worktree".  Instead, we use the old
+    # tree as the index and checkout from new_tree, which gives the slightly
+    # better message, "Apply ... to index and worktree".  This is not quite
+    # right, since it won't be applied to the user's index, but oh well.
+    with temporary_index_file(old_tree):
+      subprocess.check_call(['git', 'checkout', '--patch', new_tree])
+    # NOTE(review): index_tree is never read after this assignment.
+    index_tree = old_tree
+  else:
+    with temporary_index_file(new_tree):
+      run('git', 'checkout-index', '-a', '-f')
+  return changed_files
+
+
+def run(*args, **kwargs):
+  """Run `args` as a command and return its stdout.
+
+  Optional keyword arguments: `stdin` (data fed to the process), `verbose`
+  (print diagnostic headers to stderr), `strip` (strip trailing newlines from
+  stdout).  Exits the program with status 2 if the command fails."""
+  stdin = kwargs.pop('stdin', '')
+  verbose = kwargs.pop('verbose', True)
+  strip = kwargs.pop('strip', True)
+  for name in kwargs:
+    raise TypeError("run() got an unexpected keyword argument '%s'" % name)
+  p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                       stdin=subprocess.PIPE)
+  stdout, stderr = p.communicate(input=stdin)
+  if p.returncode == 0:
+    if stderr:
+      # NOTE(review): stderr content is printed even when verbose is False;
+      # only the header line is gated on `verbose`.
+      if verbose:
+        print >>sys.stderr, '`%s` printed to stderr:' % ' '.join(args)
+      print >>sys.stderr, stderr.rstrip()
+    if strip:
+      stdout = stdout.rstrip('\r\n')
+    return stdout
+  if verbose:
+    print >>sys.stderr, '`%s` returned %s' % (' '.join(args), p.returncode)
+  if stderr:
+    print >>sys.stderr, stderr.rstrip()
+  sys.exit(2)
+
+
+def die(message):
+  """Print an error message to stderr and exit with status 2."""
+  print >>sys.stderr, 'error:', message
+  sys.exit(2)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/scripts/git-tag.sh b/scripts/git-tag.sh
new file mode 100644
index 0000000..073b4a1
--- /dev/null
+++ b/scripts/git-tag.sh
@@ -0,0 +1,9 @@
+git_tag_latest=$(git describe --abbrev=0)
+git_rev_count=$(git rev-list $git_tag_latest.. --count)
+git_rev_count=$[$git_rev_count+1]
+git_subject=$(git log --pretty="%s" -n 1)
+release=$git_rev_count
+if [[ "$release" != "1" ]]; then
+  release="${release}.git.$(git log --pretty='%h' -n 1)"
+fi
+revision=${git_tag_latest:1}
diff --git a/scripts/style-check.sh b/scripts/style-check.sh
new file mode 100755
index 0000000..04aa1b2
--- /dev/null
+++ b/scripts/style-check.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Runs clang-format on the files changed between HEAD and $1, which defaults to
+# origin/master.  Exits non-zero (printing the diff) if formatting differs.
+
+# to pick up git-clang-format from scripts/
+export PATH=$(dirname $0):$PATH
+
+# The formatter binary may be overridden via the CLANG_FORMAT env variable.
+CLANG_FORMAT=${CLANG_FORMAT:-clang-format}
+GITREF=${1:-origin/master}
+
+if ! hash $CLANG_FORMAT 2> /dev/null; then
+  echo "Could not find clang-format tool" 1>&2
+  exit 1
+fi
+
+cmd="git clang-format $GITREF --binary $CLANG_FORMAT --diff --extensions h,c,cc"
+
+# First pass counts diff lines quietly; rerun verbosely only on failure so
+# the offending diff is actually shown.
+n=$($cmd --quiet | wc -l)
+if [ $n -gt 0 ]; then
+  $cmd -v
+  exit 1
+fi
diff --git a/snapcraft/Makefile b/snapcraft/Makefile
new file mode 100644
index 0000000..92d1f6c
--- /dev/null
+++ b/snapcraft/Makefile
@@ -0,0 +1,53 @@
+#
+# Copyright (C) 2016 Canonical
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+
+#
+# Simple makefile to mangle version info in the yaml file
+#
+# Version string components: <latest tag minus "v">-<date>-<commit count>-<sha>.
+# NOTE(review): `git tag | tail -1` picks the lexically last tag, which is not
+# necessarily the most recent release — confirm this is intended.
+VERSION=$(shell git tag | tail -1 | cut -c2-)
+COMMITS=$(shell git log --oneline | wc -l)
+SHA=$(shell git log -1 --oneline | cut -d' ' -f1)
+DATE=$(shell date +'%Y%m%d')
+V=$(VERSION)-$(DATE)-$(COMMITS)-$(SHA)
+
+# Stamp the computed version into snapcraft.yaml, then build the snap.
+all: set_version
+	snapcraft
+
+# Rewrite the "version:" field via a temporary file, then move it into place.
+set_version:
+	cat snapcraft.yaml | sed 's/version: .*/version: $(V)/' > snapcraft-tmp.yaml
+	mv snapcraft-tmp.yaml snapcraft.yaml
+
+install:
+	#
+	# Install latest snap
+	#
+	sudo snap install --devmode bcc_*.snap
+
+	#
+	# Connect up interfaces
+	#
+	sudo snap connect bcc:mount-observe
+	sudo snap connect bcc:system-observe
+	sudo snap connect bcc:system-trace
+
+remove:
+	sudo snap remove bcc
+
+clean:
+	snapcraft clean
+	rm -rf setup *.snap snapcraft
diff --git a/snapcraft/README.md b/snapcraft/README.md
new file mode 100644
index 0000000..b95729a
--- /dev/null
+++ b/snapcraft/README.md
@@ -0,0 +1,44 @@
+# bcc snap
+
+This is an unconfined snap of the BPF Compiler Collection (BCC), a toolkit for
+creating efficient kernel tracing and manipulation programs.
+
+First, install snapcraft, e.g. on Ubuntu:
+
+sudo apt install snapcraft
+
+Clone the bcc repo (if you haven't done so already) and create the snap:
+
+git clone https://github.com/iovisor/bcc.git
+cd snapcraft
+make
+
+Note: running `make` just gets the version from the current bcc git
+repository and uses this in the snapcraft yaml file to version the bcc
+snap. The Makefile basically runs snapcraft to snap up bcc.
+
+Install the snap by running:
+
+sudo snap install --devmode bcc_*.snap
+
+One may need to ensure the snap plugins are enabled for the snap using:
+
+sudo snap connect bcc:mount-observe
+sudo snap connect bcc:system-observe
+sudo snap connect bcc:system-trace
+
+Now run a bcc tool, for example, to run opensnoop use:
+
+sudo bcc.opensnoop
+
+Note that this may fail to build and run if you do not have the kernel
+headers installed or perhaps the kernel config is not set up correctly.
+
+This snap has been tested mainly using the 4.8 and 4.9 kernels built
+with the Ubuntu Yakkety and Zesty kernel configs as well as the default
+Ubuntu 4.8 Yakkety and 4.9 Zesty kernels.
+
+Contact Colin Ian King <colin.king@canonical.com> for support on this
+bcc snap.
+
+Thu 15 Dec 17:08:29 GMT 2016
diff --git a/snapcraft/snapcraft.yaml b/snapcraft/snapcraft.yaml
new file mode 100644
index 0000000..e4acdb2
--- /dev/null
+++ b/snapcraft/snapcraft.yaml
@@ -0,0 +1,368 @@
+#
+# Copyright (C) 2016 Canonical
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+name: bcc
+version: 0.3.0-20170401-1747-c5f48c9
+summary: BPF Compiler Collection (BCC)
+description: A toolkit for creating efficient kernel tracing and manipulation programs
+confinement: strict
+grade: stable
+plugs:
+    mount-observe: null
+    system-observe: null
+    system-trace: null
+assumes: [snapd2.23]
+apps:
+    argdist:
+        command: wrapper argdist
+        aliases: [argdist]
+    bashreadline:
+        command: wrapper bashreadline
+        aliases: [bashreadline]
+    biolatency:
+        command: wrapper biolatency
+        aliases: [biolatency]
+    biosnoop:
+        command: wrapper biosnoop
+        aliases: [biosnoop]
+    biotop:
+        command: wrapper biotop
+        aliases: [biotop]
+    bitesize:
+        command: wrapper bitesize
+        aliases: [bitesize]
+    bpflist:
+        command: wrapper bpflist
+        aliases: [bpflist]
+    btrfsdist:
+        command: wrapper btrfsdist
+        aliases: [btrfsdist]
+    btrfsslower:
+        command: wrapper btrfsslower
+        aliases: [btrfsslower]
+    cachestat:
+        command: wrapper cachestat
+        aliases: [cachestat]
+    cachetop:
+        command: wrapper cachetop
+        aliases: [cachetop]
+    capable:
+        command: wrapper capable
+        aliases: [capable]
+    cobjnew:
+        command: wrapper cobjnew
+        aliases: [cobjnew]
+    cpudist:
+        command: wrapper cpudist
+        aliases: [cpudist]
+    cpuunclaimed:
+        command: wrapper cpuunclaimed
+        aliases: [cpuunclaimed]
+    dbslower:
+        command: wrapper dbslower
+        aliases: [dbslower]
+    dbstat:
+        command: wrapper dbstat
+        aliases: [dbstat]
+    dcsnoop:
+        command: wrapper dcsnoop
+        aliases: [dcsnoop]
+    dcstat:
+        command: wrapper dcstat
+        aliases: [dcstat]
+    deadlock-detector:
+        command: wrapper deadlock_detector
+        aliases: [deadlock-detector]
+    execsnoop:
+        command: wrapper execsnoop
+        aliases: [execsnoop]
+    ext4dist:
+        command: wrapper ext4dist
+        aliases: [ext4dist]
+    ext4slower:
+        command: wrapper ext4slower
+        aliases: [ext4slower]
+    filelife:
+        command: wrapper filelife
+        aliases: [filelife]
+    fileslower:
+        command: wrapper fileslower
+        aliases: [fileslower]
+    filetop:
+        command: wrapper filetop
+        aliases: [filetop]
+    funccount:
+        command: wrapper funccount
+        aliases: [funccount]
+    funclatency:
+        command: wrapper funclatency
+        aliases: [funclatency]
+    funcslower:
+        command: wrapper funcslower
+        aliases: [funcslower]
+    gethostlatency:
+        command: wrapper gethostlatency
+        aliases: [gethostlatency]
+    hardirqs:
+        command: wrapper hardirqs
+        aliases: [hardirqs]
+    javacalls:
+        command: wrapper javacalls
+        aliases: [javacalls]
+    javaflow:
+        command: wrapper javaflow
+        aliases: [javaflow]
+    javagc:
+        command: wrapper javagc
+        aliases: [javagc]
+    javaobjnew:
+        command: wrapper javaobjnew
+        aliases: [javaobjnew]
+    javastat:
+        command: wrapper javastat
+        aliases: [javastat]
+    javathreads:
+        command: wrapper javathreads
+        aliases: [javathreads]
+    killsnoop:
+        command: wrapper killsnoop
+        aliases: [killsnoop]
+    llcstat:
+        command: wrapper llcstat
+        aliases: [llcstat]
+    mdflush:
+        command: wrapper mdflush
+        aliases: [mdflush]
+    memleak:
+        command: wrapper memleak
+        aliases: [memleak]
+    mountsnoop:
+        command: wrapper mountsnoop
+        aliases: [mountsnoop]
+    mysqld-qslower:
+        command: wrapper mysqld_qslower
+        aliases: [mysqld-qslower]
+    nfsdist:
+        command: wrapper nfsdist
+        aliases: [nfsdist]
+    nfsslower:
+        command: wrapper nfsslower
+        aliases: [nfsslower]
+    nodegc:
+        command: wrapper nodegc
+        aliases: [nodegc]
+    nodestat:
+        command: wrapper nodestat
+        aliases: [nodestat]
+    offcputime:
+        command: wrapper offcputime
+        aliases: [offcputime]
+    offwaketime:
+        command: wrapper offwaketime
+        aliases: [offwaketime]
+    oomkill:
+        command: wrapper oomkill
+        aliases: [oomkill]
+    opensnoop:
+        command: wrapper opensnoop
+        aliases: [opensnoop]
+    perlcalls:
+        command: wrapper perlcalls
+        aliases: [perlcalls]
+    perlflow:
+        command: wrapper perlflow
+        aliases: [perlflow]
+    perlstat:
+        command: wrapper perlstat
+        aliases: [perlstat]
+    phpcalls:
+        command: wrapper phpcalls
+        aliases: [phpcalls]
+    phpflow:
+        command: wrapper phpflow
+        aliases: [phpflow]
+    phpstat:
+        command: wrapper phpstat
+        aliases: [phpstat]
+    pidpersec:
+        command: wrapper pidpersec
+        aliases: [pidpersec]
+    profile:
+        command: wrapper profile
+        aliases: [profile]
+    pythoncalls:
+        command: wrapper pythoncalls
+        aliases: [pythoncalls]
+    pythonflow:
+        command: wrapper pythonflow
+        aliases: [pythonflow]
+    pythongc:
+        command: wrapper pythongc
+        aliases: [pythongc]
+    pythonstat:
+        command: wrapper pythonstat
+        aliases: [pythonstat]
+    rubycalls:
+        command: wrapper rubycalls
+        aliases: [rubycalls]
+    rubyflow:
+        command: wrapper rubyflow
+        aliases: [rubyflow]
+    rubygc:
+        command: wrapper rubygc
+        aliases: [rubygc]
+    rubyobjnew:
+        command: wrapper rubyobjnew
+        aliases: [rubyobjnew]
+    rubystat:
+        command: wrapper rubystat
+        aliases: [rubystat]
+    runqlat:
+        command: wrapper runqlat
+        aliases: [runqlat]
+    runqlen:
+        command: wrapper runqlen
+        aliases: [runqlen]
+    slabratetop:
+        command: wrapper slabratetop
+        aliases: [slabratetop]
+    softirqs:
+        command: wrapper softirqs
+        aliases: [softirqs]
+    solisten:
+        command: wrapper solisten
+        aliases: [solisten]
+    sslsniff:
+        command: wrapper sslsniff
+        aliases: [sslsniff]
+    stackcount:
+        command: wrapper stackcount
+        aliases: [stackcount]
+    stacksnoop:
+        command: wrapper stacksnoop
+        aliases: [stacksnoop]
+    statsnoop:
+        command: wrapper statsnoop
+        aliases: [statsnoop]
+    syncsnoop:
+        command: wrapper syncsnoop
+        aliases: [syncsnoop]
+    syscount:
+        command: wrapper syscount
+        aliases: [syscount]
+    tcpaccept:
+        command: wrapper tcpaccept
+        aliases: [tcpaccept]
+    tcpconnect:
+        command: wrapper tcpconnect
+        aliases: [tcpconnect]
+    tcpconnlat:
+        command: wrapper tcpconnlat
+        aliases: [tcpconnlat]
+    tcplife:
+        command: wrapper tcplife
+        aliases: [tcplife]
+    tcpretrans:
+        command: wrapper tcpretrans
+        aliases: [tcpretrans]
+    tcptop:
+        command: wrapper tcptop
+        aliases: [tcptop]
+    tcptracer:
+        command: wrapper tcptracer
+        aliases: [tcptracer]
+    tplist:
+        command: wrapper tplist
+        aliases: [tplist]
+    trace:
+        command: wrapper trace
+        aliases: [trace]
+    ttysnoop:
+        command: wrapper ttysnoop
+        aliases: [ttysnoop]
+    ucalls:
+        command: wrapper lib/ucalls
+        aliases: [ucalls]
+    uflow:
+        command: wrapper lib/uflow
+        aliases: [uflow]
+    ugc:
+        command: wrapper lib/ugc
+        aliases: [ugc]
+    uobjnew:
+        command: wrapper lib/uobjnew
+        aliases: [uobjnew]
+    ustat:
+        command: wrapper lib/ustat
+        aliases: [ustat]
+    uthreads:
+        command: wrapper lib/uthreads
+        aliases: [uthreads]
+    vfscount:
+        command: wrapper vfscount
+        aliases: [vfscount]
+    vfsstat:
+        command: wrapper vfsstat
+        aliases: [vfsstat]
+    wakeuptime:
+        command: wrapper wakeuptime
+        aliases: [wakeuptime]
+    xfsdist:
+        command: wrapper xfsdist
+        aliases: [xfsdist]
+    xfsslower:
+        command: wrapper xfsslower
+        aliases: [xfsslower]
+    zfsdist:
+        command: wrapper zfsdist
+        aliases: [zfsdist]
+    zfsslower:
+        command: wrapper zfsslower
+        aliases: [zfsslower]
+parts:
+    bcc:
+        plugin: cmake
+        configflags:
+            - -DCMAKE_INSTALL_PREFIX=/usr
+        source: ..
+        build-packages:
+            - bison
+            - build-essential
+            - cmake
+            - flex
+            - libedit-dev
+            - libllvm4.0
+            - llvm-4.0-dev
+            - libclang-4.0-dev
+            - python
+            - zlib1g-dev
+            - libelf-dev
+        stage-packages:
+            - python
+        snap:
+            - usr/bin/python*
+            - usr/share/bcc/tools
+            - usr/lib/*/lib*.so*
+            - usr/lib/python2.7
+            - -usr/share/bcc/tools/doc
+    wrapper:
+        source: .
+        plugin: copy
+        files:
+            wrapper: bin/wrapper
+
+# vim: set ai et sts=4 tabstop=4 sw=4:
diff --git a/snapcraft/wrapper b/snapcraft/wrapper
new file mode 100755
index 0000000..0256962
--- /dev/null
+++ b/snapcraft/wrapper
@@ -0,0 +1,14 @@
+#!/bin/sh -e
+# Snappy does not yet support CAP_SYS_ADMIN for unconfined snaps, thus sudo:
+# https://bugs.launchpad.net/snappy/+bug/1586581
+# stdout isn't set to line buffered mode:
+# https://bugs.launchpad.net/snappy/+bug/1587675
+
+cmd="$1"
+if [ `id -u` = 0 ] ; then
+	shift
+	stdbuf -oL $SNAP/usr/bin/python "$SNAP/usr/share/bcc/tools/$cmd" $@
+else
+	echo "Need to run $cmd as root (use sudo $@)"
+	exit 1
+fi
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..7daca5b
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+if(NOT PYTHON_ONLY)
+add_subdirectory(cc)
+endif()
+if(ENABLE_CLANG_JIT)
+add_subdirectory(python)
+add_subdirectory(lua)
+endif()
diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt
new file mode 100644
index 0000000..fda165d
--- /dev/null
+++ b/src/cc/CMakeLists.txt
@@ -0,0 +1,105 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+# to be removed
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/frontends/b)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/frontends/b)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/frontends/clang)
+include_directories(${LLVM_INCLUDE_DIRS})
+include_directories(${LIBELF_INCLUDE_DIRS})
+# todo: if check for kernel version
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/compat)
+add_definitions(${LLVM_DEFINITIONS})
+configure_file(libbcc.pc.in ${CMAKE_CURRENT_BINARY_DIR}/libbcc.pc @ONLY)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -DBCC_PROG_TAG_DIR='\"${BCC_PROG_TAG_DIR}\"'")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+
+string(REGEX MATCH "^([0-9]+).*" _ ${LLVM_PACKAGE_VERSION})
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_MAJOR_VERSION=${CMAKE_MATCH_1}")
+
+include(static_libstdc++)
+
+add_library(bpf-static STATIC libbpf.c perf_reader.c)
+set_target_properties(bpf-static PROPERTIES OUTPUT_NAME bpf)
+add_library(bpf-shared SHARED libbpf.c perf_reader.c)
+set_target_properties(bpf-shared PROPERTIES VERSION ${REVISION_LAST} SOVERSION 0)
+set_target_properties(bpf-shared PROPERTIES OUTPUT_NAME bpf)
+
+set(bcc_common_sources bpf_common.cc bpf_module.cc exported_files.cc)
+if (${LLVM_PACKAGE_VERSION} VERSION_EQUAL 6 OR ${LLVM_PACKAGE_VERSION} VERSION_GREATER 6)
+  set(bcc_common_sources ${bcc_common_sources} bcc_debug.cc)
+endif()
+
+set(bcc_table_sources table_storage.cc shared_table.cc bpffs_table.cc json_map_decl_visitor.cc)
+set(bcc_util_sources ns_guard.cc common.cc)
+set(bcc_sym_sources bcc_syms.cc bcc_elf.c bcc_perf_map.c bcc_proc.c)
+set(bcc_common_headers libbpf.h perf_reader.h)
+set(bcc_table_headers file_desc.h table_desc.h table_storage.h)
+set(bcc_api_headers bpf_common.h bpf_module.h bcc_exception.h bcc_syms.h)
+
+if(ENABLE_CLANG_JIT)
+add_library(bcc-shared SHARED
+  link_all.cc ${bcc_common_sources} ${bcc_table_sources} ${bcc_sym_sources}
+  ${bcc_util_sources})
+set_target_properties(bcc-shared PROPERTIES VERSION ${REVISION_LAST} SOVERSION 0)
+set_target_properties(bcc-shared PROPERTIES OUTPUT_NAME bcc)
+
+if(ENABLE_USDT)
+  set(bcc_usdt_sources usdt/usdt.cc usdt/usdt_args.cc)
+  # else undefined
+endif()
+
+add_library(bcc-loader-static STATIC ${bcc_sym_sources} ${bcc_util_sources})
+target_link_libraries(bcc-loader-static elf)
+add_library(bcc-static STATIC
+  ${bcc_common_sources} ${bcc_table_sources} ${bcc_util_sources} ${bcc_usdt_sources})
+set_target_properties(bcc-static PROPERTIES OUTPUT_NAME bcc)
+set(bcc-lua-static
+  ${bcc_common_sources} ${bcc_table_sources} ${bcc_sym_sources} ${bcc_util_sources})
+
+include(clang_libs)
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${clang_lib_exclude_flags}")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${llvm_lib_exclude_flags}")
+
+# bcc_common_libs_for_a for archive libraries
+# bcc_common_libs_for_s for shared libraries
+set(bcc_common_libs_for_a b_frontend clang_frontend bpf-static
+  -Wl,--whole-archive ${clang_libs} ${llvm_libs} -Wl,--no-whole-archive
+  ${LIBELF_LIBRARIES})
+set(bcc_common_libs_for_s ${bcc_common_libs_for_a})
+set(bcc_common_libs_for_lua b_frontend clang_frontend bpf-static
+  ${clang_libs} ${llvm_libs} ${LIBELF_LIBRARIES})
+
+if(ENABLE_CPP_API)
+  add_subdirectory(api)
+  list(APPEND bcc_common_libs_for_a api-static)
+  # Keep all API functions
+  list(APPEND bcc_common_libs_for_s -Wl,--whole-archive api-static -Wl,--no-whole-archive)
+endif()
+
+if(ENABLE_USDT)
+  list(APPEND bcc_api_headers bcc_usdt.h)
+  add_subdirectory(usdt)
+  list(APPEND bcc_common_libs_for_a usdt-static)
+  list(APPEND bcc_common_libs_for_s usdt-static)
+  list(APPEND bcc_common_libs_for_lua usdt-static)
+endif()
+
+add_subdirectory(frontends)
+
+# Link against LLVM libraries
+target_link_libraries(bcc-shared ${bcc_common_libs_for_s})
+target_link_libraries(bcc-static ${bcc_common_libs_for_a} bcc-loader-static)
+set(bcc-lua-static ${bcc-lua-static} ${bcc_common_libs_for_lua})
+
+install(TARGETS bcc-shared LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+install(FILES ${bcc_table_headers} DESTINATION include/bcc)
+install(FILES ${bcc_api_headers} DESTINATION include/bcc)
+install(DIRECTORY compat/linux/ DESTINATION include/bcc/compat/linux FILES_MATCHING PATTERN "*.h")
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libbcc.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+endif(ENABLE_CLANG_JIT)
+install(FILES ${bcc_common_headers} DESTINATION include/bcc)
+install(TARGETS bpf-shared LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
diff --git a/src/cc/api/BPF.cc b/src/cc/api/BPF.cc
new file mode 100644
index 0000000..5f451f7
--- /dev/null
+++ b/src/cc/api/BPF.cc
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <linux/bpf.h>
+#include <linux/perf_event.h>
+#include <unistd.h>
+#include <cstdio>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <utility>
+#include <vector>
+
+#include "bcc_exception.h"
+#include "bcc_syms.h"
+#include "bpf_module.h"
+#include "common.h"
+#include "libbpf.h"
+#include "perf_reader.h"
+#include "syms.h"
+#include "table_storage.h"
+#include "usdt.h"
+
+#include "BPF.h"
+
+namespace ebpf {
+
+std::string uint_to_hex(uint64_t value) {
+  std::stringstream ss;
+  ss << std::hex << value;
+  return ss.str();
+}
+
+std::string sanitize_str(std::string str, bool (*validator)(char),
+                         char replacement = '_') {
+  for (size_t i = 0; i < str.length(); i++)
+    if (!validator(str[i]))
+      str[i] = replacement;
+  return str;
+}
+
+StatusTuple BPF::init(const std::string& bpf_program,
+                      const std::vector<std::string>& cflags,
+                      const std::vector<USDT>& usdt) {
+  std::string all_bpf_program;
+
+  usdt_.reserve(usdt.size());
+  for (const auto& u : usdt) {
+    usdt_.emplace_back(u);
+  }
+  for (auto& u : usdt_) {
+    TRY2(u.init());
+    all_bpf_program += u.program_text_;
+  }
+
+  auto flags_len = cflags.size();
+  const char* flags[flags_len];
+  for (size_t i = 0; i < flags_len; i++)
+    flags[i] = cflags[i].c_str();
+
+  all_bpf_program += bpf_program;
+  if (bpf_module_->load_string(all_bpf_program, flags, flags_len) != 0)
+    return StatusTuple(-1, "Unable to initialize BPF program");
+
+  return StatusTuple(0);
+};
+
+BPF::~BPF() {
+  auto res = detach_all();
+  if (res.code() != 0)
+    std::cerr << "Failed to detach all probes on destruction: " << std::endl
+              << res.msg() << std::endl;
+}
+
+StatusTuple BPF::detach_all() {
+  bool has_error = false;
+  std::string error_msg;
+
+  for (auto& it : kprobes_) {
+    auto res = detach_kprobe_event(it.first, it.second);
+    if (res.code() != 0) {
+      error_msg += "Failed to detach kprobe event " + it.first + ": ";
+      error_msg += res.msg() + "\n";
+      has_error = true;
+    }
+  }
+
+  for (auto& it : uprobes_) {
+    auto res = detach_uprobe_event(it.first, it.second);
+    if (res.code() != 0) {
+      error_msg += "Failed to detach uprobe event " + it.first + ": ";
+      error_msg += res.msg() + "\n";
+      has_error = true;
+    }
+  }
+
+  for (auto& it : tracepoints_) {
+    auto res = detach_tracepoint_event(it.first, it.second);
+    if (res.code() != 0) {
+      error_msg += "Failed to detach Tracepoint " + it.first + ": ";
+      error_msg += res.msg() + "\n";
+      has_error = true;
+    }
+  }
+
+  for (auto& it : perf_buffers_) {
+    auto res = it.second->close_all_cpu();
+    if (res.code() != 0) {
+      error_msg += "Failed to close perf buffer " + it.first + ": ";
+      error_msg += res.msg() + "\n";
+      has_error = true;
+    }
+    delete it.second;
+  }
+
+  for (auto& it : perf_event_arrays_) {
+    auto res = it.second->close_all_cpu();
+    if (res.code() != 0) {
+      error_msg += "Failed to close perf event array " + it.first + ": ";
+      error_msg += res.msg() + "\n";
+      has_error = true;
+    }
+    delete it.second;
+  }
+
+  for (auto& it : perf_events_) {
+    auto res = detach_perf_event_all_cpu(it.second);
+    if (res.code() != 0) {
+      error_msg += res.msg() + "\n";
+      has_error = true;
+    }
+  }
+
+  for (auto& it : funcs_) {
+    int res = close(it.second);
+    if (res != 0) {
+      error_msg += "Failed to unload BPF program for " + it.first + ": ";
+      error_msg += std::string(std::strerror(errno)) + "\n";
+      has_error = true;
+    }
+  }
+
+  if (has_error)
+    return StatusTuple(-1, error_msg);
+  else
+    return StatusTuple(0);
+}
+
+StatusTuple BPF::attach_kprobe(const std::string& kernel_func,
+                               const std::string& probe_func,
+                               uint64_t kernel_func_offset,
+                               bpf_probe_attach_type attach_type) {
+  std::string probe_event = get_kprobe_event(kernel_func, attach_type);
+  if (kprobes_.find(probe_event) != kprobes_.end())
+    return StatusTuple(-1, "kprobe %s already attached", probe_event.c_str());
+
+  int probe_fd;
+  TRY2(load_func(probe_func, BPF_PROG_TYPE_KPROBE, probe_fd));
+
+  int res_fd = bpf_attach_kprobe(probe_fd, attach_type, probe_event.c_str(),
+                                 kernel_func.c_str(), kernel_func_offset);
+
+  if (res_fd < 0) {
+    TRY2(unload_func(probe_func));
+    return StatusTuple(-1, "Unable to attach %skprobe for %s using %s",
+                       attach_type_debug(attach_type).c_str(),
+                       kernel_func.c_str(), probe_func.c_str());
+  }
+
+  open_probe_t p = {};
+  p.perf_event_fd = res_fd;
+  p.func = probe_func;
+  kprobes_[probe_event] = std::move(p);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::attach_uprobe(const std::string& binary_path,
+                               const std::string& symbol,
+                               const std::string& probe_func,
+                               uint64_t symbol_addr,
+                               bpf_probe_attach_type attach_type, pid_t pid) {
+  std::string module;
+  uint64_t offset;
+  TRY2(check_binary_symbol(binary_path, symbol, symbol_addr, module, offset));
+
+  std::string probe_event = get_uprobe_event(module, offset, attach_type, pid);
+  if (uprobes_.find(probe_event) != uprobes_.end())
+    return StatusTuple(-1, "uprobe %s already attached", probe_event.c_str());
+
+  int probe_fd;
+  TRY2(load_func(probe_func, BPF_PROG_TYPE_KPROBE, probe_fd));
+
+  int res_fd = bpf_attach_uprobe(probe_fd, attach_type, probe_event.c_str(),
+                                 binary_path.c_str(), offset, pid);
+
+  if (res_fd < 0) {
+    TRY2(unload_func(probe_func));
+    return StatusTuple(
+        -1,
+        "Unable to attach %suprobe for binary %s symbol %s addr %lx using %s\n",
+        attach_type_debug(attach_type).c_str(), binary_path.c_str(),
+        symbol.c_str(), symbol_addr, probe_func.c_str());
+  }
+
+  open_probe_t p = {};
+  p.perf_event_fd = res_fd;
+  p.func = probe_func;
+  uprobes_[probe_event] = std::move(p);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::attach_usdt(const USDT& usdt, pid_t pid) {
+  for (const auto& u : usdt_) {
+    if (u == usdt) {
+      auto& probe = *static_cast<::USDT::Probe*>(u.probe_.get());
+      if (!probe.enable(u.probe_func_))
+        return StatusTuple(-1, "Unable to enable USDT " + u.print_name());
+
+      bool failed = false;
+      std::string err_msg;
+      int cnt = 0;
+      for (const auto& loc : probe.locations_) {
+        auto res = attach_uprobe(loc.bin_path_, std::string(), u.probe_func_,
+                                 loc.address_, BPF_PROBE_ENTRY, pid);
+        if (res.code() != 0) {
+          failed = true;
+          err_msg += "USDT " + u.print_name() + " at " + loc.bin_path_ +
+                     " address " + std::to_string(loc.address_);
+          err_msg += ": " + res.msg() + "\n";
+          break;
+        }
+        cnt++;
+      }
+      if (failed) {
+        for (int i = 0; i < cnt; i++) {
+          auto res =
+              detach_uprobe(probe.locations_[i].bin_path_, std::string(),
+                            probe.locations_[i].address_, BPF_PROBE_ENTRY, pid);
+          if (res.code() != 0)
+            err_msg += "During clean up: " + res.msg() + "\n";
+        }
+        return StatusTuple(-1, err_msg);
+      } else {
+        return StatusTuple(0);
+      }
+    }
+  }
+
+  return StatusTuple(-1, "USDT %s not found", usdt.print_name().c_str());
+}
+
+StatusTuple BPF::attach_tracepoint(const std::string& tracepoint,
+                                   const std::string& probe_func) {
+  if (tracepoints_.find(tracepoint) != tracepoints_.end())
+    return StatusTuple(-1, "Tracepoint %s already attached",
+                       tracepoint.c_str());
+
+  auto pos = tracepoint.find(":");
+  if ((pos == std::string::npos) || (pos != tracepoint.rfind(":")))
+    return StatusTuple(-1, "Unable to parse Tracepoint %s", tracepoint.c_str());
+  std::string tp_category = tracepoint.substr(0, pos);
+  std::string tp_name = tracepoint.substr(pos + 1);
+
+  int probe_fd;
+  TRY2(load_func(probe_func, BPF_PROG_TYPE_TRACEPOINT, probe_fd));
+
+  int res_fd =
+      bpf_attach_tracepoint(probe_fd, tp_category.c_str(), tp_name.c_str());
+
+  if (res_fd < 0) {
+    TRY2(unload_func(probe_func));
+    return StatusTuple(-1, "Unable to attach Tracepoint %s using %s",
+                       tracepoint.c_str(), probe_func.c_str());
+  }
+
+  open_probe_t p = {};
+  p.perf_event_fd = res_fd;
+  p.func = probe_func;
+  tracepoints_[tracepoint] = std::move(p);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::attach_perf_event(uint32_t ev_type, uint32_t ev_config,
+                                   const std::string& probe_func,
+                                   uint64_t sample_period, uint64_t sample_freq,
+                                   pid_t pid, int cpu, int group_fd) {
+  auto ev_pair = std::make_pair(ev_type, ev_config);
+  if (perf_events_.find(ev_pair) != perf_events_.end())
+    return StatusTuple(-1, "Perf event type %d config %d already attached",
+                       ev_type, ev_config);
+
+  int probe_fd;
+  TRY2(load_func(probe_func, BPF_PROG_TYPE_PERF_EVENT, probe_fd));
+
+  std::vector<int> cpus;
+  if (cpu >= 0)
+    cpus.push_back(cpu);
+  else
+    cpus = get_online_cpus();
+  auto fds = new std::vector<std::pair<int, int>>();
+  fds->reserve(cpus.size());
+  for (int i : cpus) {
+    int fd = bpf_attach_perf_event(probe_fd, ev_type, ev_config, sample_period,
+                                   sample_freq, pid, i, group_fd);
+    if (fd < 0) {
+      for (const auto& it : *fds)
+        close(it.second);
+      delete fds;
+      TRY2(unload_func(probe_func));
+      return StatusTuple(-1, "Failed to attach perf event type %d config %d",
+                         ev_type, ev_config);
+    }
+    fds->emplace_back(i, fd);
+  }
+
+  open_probe_t p = {};
+  p.func = probe_func;
+  p.per_cpu_fd = fds;
+  perf_events_[ev_pair] = std::move(p);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::attach_perf_event_raw(void* perf_event_attr,
+                                       const std::string& probe_func, pid_t pid,
+                                       int cpu, int group_fd,
+                                       unsigned long extra_flags) {
+  auto attr = static_cast<struct perf_event_attr*>(perf_event_attr);
+  auto ev_pair = std::make_pair(attr->type, attr->config);
+  if (perf_events_.find(ev_pair) != perf_events_.end())
+    return StatusTuple(-1, "Perf event type %d config %d already attached",
+                       attr->type, attr->config);
+
+  int probe_fd;
+  TRY2(load_func(probe_func, BPF_PROG_TYPE_PERF_EVENT, probe_fd));
+
+  std::vector<int> cpus;
+  if (cpu >= 0)
+    cpus.push_back(cpu);
+  else
+    cpus = get_online_cpus();
+  auto fds = new std::vector<std::pair<int, int>>();
+  fds->reserve(cpus.size());
+  for (int i : cpus) {
+    int fd = bpf_attach_perf_event_raw(probe_fd, attr, pid, i, group_fd,
+                                       extra_flags);
+    if (fd < 0) {
+      for (const auto& it : *fds)
+        close(it.second);
+      delete fds;
+      TRY2(unload_func(probe_func));
+      return StatusTuple(-1, "Failed to attach perf event type %d config %d",
+                         attr->type, attr->config);
+    }
+    fds->emplace_back(i, fd);
+  }
+
+  open_probe_t p = {};
+  p.func = probe_func;
+  p.per_cpu_fd = fds;
+  perf_events_[ev_pair] = std::move(p);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::detach_kprobe(const std::string& kernel_func,
+                               bpf_probe_attach_type attach_type) {
+  std::string event = get_kprobe_event(kernel_func, attach_type);
+
+  auto it = kprobes_.find(event);
+  if (it == kprobes_.end())
+    return StatusTuple(-1, "No open %skprobe for %s",
+                       attach_type_debug(attach_type).c_str(),
+                       kernel_func.c_str());
+
+  TRY2(detach_kprobe_event(it->first, it->second));
+  kprobes_.erase(it);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::detach_uprobe(const std::string& binary_path,
+                               const std::string& symbol, uint64_t symbol_addr,
+                               bpf_probe_attach_type attach_type, pid_t pid) {
+  std::string module;
+  uint64_t offset;
+  TRY2(check_binary_symbol(binary_path, symbol, symbol_addr, module, offset));
+
+  std::string event = get_uprobe_event(module, offset, attach_type, pid);
+  auto it = uprobes_.find(event);
+  if (it == uprobes_.end())
+    return StatusTuple(-1, "No open %suprobe for binary %s symbol %s addr %lx",
+                       attach_type_debug(attach_type).c_str(),
+                       binary_path.c_str(), symbol.c_str(), symbol_addr);
+
+  TRY2(detach_uprobe_event(it->first, it->second));
+  uprobes_.erase(it);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::detach_usdt(const USDT& usdt, pid_t pid) {
+  for (const auto& u : usdt_) {
+    if (u == usdt) {
+      auto& probe = *static_cast<::USDT::Probe*>(u.probe_.get());
+      bool failed = false;
+      std::string err_msg;
+      for (const auto& loc : probe.locations_) {
+        auto res = detach_uprobe(loc.bin_path_, std::string(), loc.address_,
+                                 BPF_PROBE_ENTRY, pid);
+        if (res.code() != 0) {
+          failed = true;
+          err_msg += "USDT " + u.print_name() + " at " + loc.bin_path_ +
+                     " address " + std::to_string(loc.address_);
+          err_msg += ": " + res.msg() + "\n";
+        }
+      }
+
+      if (!probe.disable()) {
+        failed = true;
+        err_msg += "Unable to disable USDT " + u.print_name();
+      }
+
+      if (failed)
+        return StatusTuple(-1, err_msg);
+      else
+        return StatusTuple(0);
+    }
+  }
+
+  return StatusTuple(-1, "USDT %s not found", usdt.print_name().c_str());
+}
+
+StatusTuple BPF::detach_tracepoint(const std::string& tracepoint) {
+  auto it = tracepoints_.find(tracepoint);
+  if (it == tracepoints_.end())
+    return StatusTuple(-1, "No open Tracepoint %s", tracepoint.c_str());
+
+  TRY2(detach_tracepoint_event(it->first, it->second));
+  tracepoints_.erase(it);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::detach_perf_event(uint32_t ev_type, uint32_t ev_config) {
+  auto it = perf_events_.find(std::make_pair(ev_type, ev_config));
+  if (it == perf_events_.end())
+    return StatusTuple(-1, "Perf Event type %d config %d not attached", ev_type,
+                       ev_config);
+  TRY2(detach_perf_event_all_cpu(it->second));
+  perf_events_.erase(it);
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::detach_perf_event_raw(void* perf_event_attr) {
+  auto attr = static_cast<struct perf_event_attr*>(perf_event_attr);
+  return detach_perf_event(attr->type, attr->config);
+}
+
+StatusTuple BPF::open_perf_event(const std::string& name, uint32_t type,
+                                 uint64_t config) {
+  if (perf_event_arrays_.find(name) == perf_event_arrays_.end()) {
+    TableStorage::iterator it;
+    if (!bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+      return StatusTuple(-1, "open_perf_event: unable to find table_storage %s",
+                         name.c_str());
+    perf_event_arrays_[name] = new BPFPerfEventArray(it->second);
+  }
+  auto table = perf_event_arrays_[name];
+  TRY2(table->open_all_cpu(type, config));
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::close_perf_event(const std::string& name) {
+  auto it = perf_event_arrays_.find(name);
+  if (it == perf_event_arrays_.end())
+    return StatusTuple(-1, "Perf Event for %s not open", name.c_str());
+  TRY2(it->second->close_all_cpu());
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::open_perf_buffer(const std::string& name,
+                                  perf_reader_raw_cb cb,
+                                  perf_reader_lost_cb lost_cb, void* cb_cookie,
+                                  int page_cnt) {
+  if (perf_buffers_.find(name) == perf_buffers_.end()) {
+    TableStorage::iterator it;
+    if (!bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+      return StatusTuple(-1,
+                         "open_perf_buffer: unable to find table_storage %s",
+                         name.c_str());
+    perf_buffers_[name] = new BPFPerfBuffer(it->second);
+  }
+  if ((page_cnt & (page_cnt - 1)) != 0)
+    return StatusTuple(-1, "open_perf_buffer page_cnt must be a power of two");
+  auto table = perf_buffers_[name];
+  TRY2(table->open_all_cpu(cb, lost_cb, cb_cookie, page_cnt));
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::close_perf_buffer(const std::string& name) {
+  auto it = perf_buffers_.find(name);
+  if (it == perf_buffers_.end())
+    return StatusTuple(-1, "Perf buffer for %s not open", name.c_str());
+  TRY2(it->second->close_all_cpu());
+  return StatusTuple(0);
+}
+
+BPFPerfBuffer* BPF::get_perf_buffer(const std::string& name) {
+  auto it = perf_buffers_.find(name);
+  return (it == perf_buffers_.end()) ? nullptr : it->second;
+}
+
+int BPF::poll_perf_buffer(const std::string& name, int timeout_ms) {
+  auto it = perf_buffers_.find(name);
+  if (it == perf_buffers_.end())
+    return -1;
+  return it->second->poll(timeout_ms);
+}
+
+StatusTuple BPF::load_func(const std::string& func_name, bpf_prog_type type,
+                           int& fd) {
+  if (funcs_.find(func_name) != funcs_.end()) {
+    fd = funcs_[func_name];
+    return StatusTuple(0);
+  }
+
+  uint8_t* func_start = bpf_module_->function_start(func_name);
+  if (!func_start)
+    return StatusTuple(-1, "Can't find start of function %s",
+                       func_name.c_str());
+  size_t func_size = bpf_module_->function_size(func_name);
+
+  int log_level = 0;
+  if (flag_ & DEBUG_BPF_REGISTER_STATE)
+    log_level = 2;
+  else if (flag_ & DEBUG_BPF)
+    log_level = 1;
+
+  fd = bpf_prog_load(type, func_name.c_str(),
+                     reinterpret_cast<struct bpf_insn*>(func_start), func_size,
+                     bpf_module_->license(), bpf_module_->kern_version(),
+                     log_level, nullptr, 0);
+
+  if (fd < 0)
+    return StatusTuple(-1, "Failed to load %s: %d", func_name.c_str(), fd);
+
+  bpf_module_->annotate_prog_tag(
+      func_name, fd, reinterpret_cast<struct bpf_insn*>(func_start), func_size);
+  funcs_[func_name] = fd;
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::unload_func(const std::string& func_name) {
+  auto it = funcs_.find(func_name);
+  if (it == funcs_.end())
+    return StatusTuple(0);
+
+  int res = close(it->second);
+  if (res != 0)
+    return StatusTuple(-1, "Can't close FD for %s: %d", it->first.c_str(), res);
+
+  funcs_.erase(it);
+  return StatusTuple(0);
+}
+
+std::string BPF::get_syscall_fnname(const std::string& name) {
+  if (syscall_prefix_ == nullptr) {
+    KSyms ksym;
+    uint64_t addr;
+
+    if (ksym.resolve_name(nullptr, "sys_bpf", &addr))
+      syscall_prefix_.reset(new std::string("sys_"));
+    else if (ksym.resolve_name(nullptr, "__x64_sys_bpf", &addr))
+      syscall_prefix_.reset(new std::string("__x64_sys_"));
+    else
+      syscall_prefix_.reset(new std::string());
+  }
+
+  return *syscall_prefix_ + name;
+}
+
+StatusTuple BPF::check_binary_symbol(const std::string& binary_path,
+                                     const std::string& symbol,
+                                     uint64_t symbol_addr,
+                                     std::string& module_res,
+                                     uint64_t& offset_res) {
+  bcc_symbol output;
+  int res = bcc_resolve_symname(binary_path.c_str(), symbol.c_str(),
+                                symbol_addr, -1, nullptr, &output);
+  if (res < 0)
+    return StatusTuple(
+        -1, "Unable to find offset for binary %s symbol %s address %lx",
+        binary_path.c_str(), symbol.c_str(), symbol_addr);
+
+  if (output.module) {
+    module_res = output.module;
+    ::free(const_cast<char*>(output.module));
+  } else {
+    module_res = "";
+  }
+  offset_res = output.offset;
+  return StatusTuple(0);
+}
+
+std::string BPF::get_kprobe_event(const std::string& kernel_func,
+                                  bpf_probe_attach_type type) {
+  std::string res = attach_type_prefix(type) + "_";
+  res += sanitize_str(kernel_func, &BPF::kprobe_event_validator);
+  return res;
+}
+
+BPFProgTable BPF::get_prog_table(const std::string& name) {
+  TableStorage::iterator it;
+  if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+    return BPFProgTable(it->second);
+  return BPFProgTable({});
+}
+
+BPFCgroupArray BPF::get_cgroup_array(const std::string& name) {
+  TableStorage::iterator it;
+  if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+    return BPFCgroupArray(it->second);
+  return BPFCgroupArray({});
+}
+
+BPFDevmapTable BPF::get_devmap_table(const std::string& name) {
+  TableStorage::iterator it;
+  if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+    return BPFDevmapTable(it->second);
+  return BPFDevmapTable({});
+}
+
+BPFStackTable BPF::get_stack_table(const std::string& name, bool use_debug_file,
+                                   bool check_debug_file_crc) {
+  TableStorage::iterator it;
+  if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+    return BPFStackTable(it->second, use_debug_file, check_debug_file_crc);
+  return BPFStackTable({}, use_debug_file, check_debug_file_crc);
+}
+
+std::string BPF::get_uprobe_event(const std::string& binary_path,
+                                  uint64_t offset, bpf_probe_attach_type type,
+                                  pid_t pid) {
+  std::string res = attach_type_prefix(type) + "_";
+  res += sanitize_str(binary_path, &BPF::uprobe_path_validator);
+  res += "_0x" + uint_to_hex(offset);
+  if (pid != -1)
+    res += "_" + std::to_string(pid);
+  return res;
+}
+
+StatusTuple BPF::detach_kprobe_event(const std::string& event,
+                                     open_probe_t& attr) {
+  bpf_close_perf_event_fd(attr.perf_event_fd);
+  TRY2(unload_func(attr.func));
+  if (bpf_detach_kprobe(event.c_str()) < 0)
+    return StatusTuple(-1, "Unable to detach kprobe %s", event.c_str());
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::detach_uprobe_event(const std::string& event,
+                                     open_probe_t& attr) {
+  bpf_close_perf_event_fd(attr.perf_event_fd);
+  TRY2(unload_func(attr.func));
+  if (bpf_detach_uprobe(event.c_str()) < 0)
+    return StatusTuple(-1, "Unable to detach uprobe %s", event.c_str());
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::detach_tracepoint_event(const std::string& tracepoint,
+                                         open_probe_t& attr) {
+  bpf_close_perf_event_fd(attr.perf_event_fd);
+  TRY2(unload_func(attr.func));
+
+  // TODO: bpf_detach_tracepoint currently does nothing.
+  return StatusTuple(0);
+}
+
+StatusTuple BPF::detach_perf_event_all_cpu(open_probe_t& attr) {
+  bool has_error = false;
+  std::string err_msg;
+  for (const auto& it : *attr.per_cpu_fd) {
+    int res = bpf_close_perf_event_fd(it.second);
+    if (res != 0) {
+      has_error = true;
+      err_msg += "Failed to close perf event FD " + std::to_string(it.second) +
+                 " For CPU " + std::to_string(it.first) + ": ";
+      err_msg += std::string(std::strerror(errno)) + "\n";
+    }
+  }
+  delete attr.per_cpu_fd;
+  TRY2(unload_func(attr.func));
+
+  if (has_error)
+    return StatusTuple(-1, err_msg);
+  return StatusTuple(0);
+}
+
+USDT::USDT(const std::string& binary_path, const std::string& provider,
+           const std::string& name, const std::string& probe_func)
+    : initialized_(false),
+      binary_path_(binary_path),
+      pid_(-1),
+      provider_(provider),
+      name_(name),
+      probe_func_(probe_func) {}
+
+USDT::USDT(pid_t pid, const std::string& provider, const std::string& name,
+           const std::string& probe_func)
+    : initialized_(false),
+      binary_path_(),
+      pid_(pid),
+      provider_(provider),
+      name_(name),
+      probe_func_(probe_func) {}
+
+USDT::USDT(const std::string& binary_path, pid_t pid,
+           const std::string& provider, const std::string& name,
+           const std::string& probe_func)
+    : initialized_(false),
+      binary_path_(binary_path),
+      pid_(pid),
+      provider_(provider),
+      name_(name),
+      probe_func_(probe_func) {}
+
+USDT::USDT(const USDT& usdt)
+    : initialized_(false),
+      binary_path_(usdt.binary_path_),
+      pid_(usdt.pid_),
+      provider_(usdt.provider_),
+      name_(usdt.name_),
+      probe_func_(usdt.probe_func_) {}
+
+USDT::USDT(USDT&& usdt) noexcept
+    : initialized_(usdt.initialized_),
+      binary_path_(std::move(usdt.binary_path_)),
+      pid_(usdt.pid_),
+      provider_(std::move(usdt.provider_)),
+      name_(std::move(usdt.name_)),
+      probe_func_(std::move(usdt.probe_func_)),
+      probe_(std::move(usdt.probe_)),
+      program_text_(std::move(usdt.program_text_)) {
+  usdt.initialized_ = false;
+}
+
+bool USDT::operator==(const USDT& other) const {
+  return (provider_ == other.provider_) && (name_ == other.name_) &&
+         (binary_path_ == other.binary_path_) && (pid_ == other.pid_) &&
+         (probe_func_ == other.probe_func_);
+}
+
+StatusTuple USDT::init() {
+  std::unique_ptr<::USDT::Context> ctx;
+  if (!binary_path_.empty() && pid_ > 0)
+    ctx.reset(new ::USDT::Context(pid_, binary_path_));
+  else if (!binary_path_.empty())
+    ctx.reset(new ::USDT::Context(binary_path_));
+  else if (pid_ > 0)
+    ctx.reset(new ::USDT::Context(pid_));
+  else
+    return StatusTuple(-1, "No valid Binary Path or PID provided");
+
+  if (!ctx->loaded())
+    return StatusTuple(-1, "Unable to load USDT " + print_name());
+
+  auto deleter = [](void* probe) { delete static_cast<::USDT::Probe*>(probe); };
+  for (auto& p : ctx->probes_) {
+    if (p->provider_ == provider_ && p->name_ == name_) {
+      // Take ownership of the probe that we are interested in, and prevent it
+      // from being destroyed when we destruct the USDT::Context instance
+      probe_ = std::unique_ptr<void, std::function<void(void*)>>(p.release(),
+                                                                 deleter);
+      p.swap(ctx->probes_.back());
+      ctx->probes_.pop_back();
+      break;
+    }
+  }
+  if (!probe_)
+    return StatusTuple(-1, "Unable to find USDT " + print_name());
+  ctx.reset(nullptr);
+  auto& probe = *static_cast<::USDT::Probe*>(probe_.get());
+
+  std::ostringstream stream;
+  if (!probe.usdt_getarg(stream, probe_func_))
+    return StatusTuple(
+        -1, "Unable to generate program text for USDT " + print_name());
+  program_text_ = ::USDT::USDT_PROGRAM_HEADER + stream.str();
+
+  initialized_ = true;
+  return StatusTuple(0);
+}
+
+}  // namespace ebpf
diff --git a/src/cc/api/BPF.h b/src/cc/api/BPF.h
new file mode 100644
index 0000000..21fb42d
--- /dev/null
+++ b/src/cc/api/BPF.h
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cctype>
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+
+#include "BPFTable.h"
+#include "bcc_exception.h"
+#include "bcc_syms.h"
+#include "bpf_module.h"
+#include "compat/linux/bpf.h"
+#include "libbpf.h"
+#include "table_storage.h"
+
+static const int DEFAULT_PERF_BUFFER_PAGE_CNT = 8;
+
+namespace ebpf {
+
+struct open_probe_t {
+  int perf_event_fd;
+  std::string func;
+  std::vector<std::pair<int, int>>* per_cpu_fd;
+};
+
+class USDT;
+
+class BPF {
+ public:
+  static const int BPF_MAX_STACK_DEPTH = 127;
+
+  explicit BPF(unsigned int flag = 0, TableStorage* ts = nullptr,
+               bool rw_engine_enabled = true, const std::string &maps_ns = "")
+      : flag_(flag),
+      bpf_module_(new BPFModule(flag, ts, rw_engine_enabled, maps_ns)) {}
+  StatusTuple init(const std::string& bpf_program,
+                   const std::vector<std::string>& cflags = {},
+                   const std::vector<USDT>& usdt = {});
+
+  ~BPF();
+  StatusTuple detach_all();
+
+  StatusTuple attach_kprobe(const std::string& kernel_func,
+                            const std::string& probe_func,
+                            uint64_t kernel_func_offset = 0,
+                            bpf_probe_attach_type = BPF_PROBE_ENTRY);
+  StatusTuple detach_kprobe(
+      const std::string& kernel_func,
+      bpf_probe_attach_type attach_type = BPF_PROBE_ENTRY);
+
+  StatusTuple attach_uprobe(const std::string& binary_path,
+                            const std::string& symbol,
+                            const std::string& probe_func,
+                            uint64_t symbol_addr = 0,
+                            bpf_probe_attach_type attach_type = BPF_PROBE_ENTRY,
+                            pid_t pid = -1);
+  StatusTuple detach_uprobe(const std::string& binary_path,
+                            const std::string& symbol, uint64_t symbol_addr = 0,
+                            bpf_probe_attach_type attach_type = BPF_PROBE_ENTRY,
+                            pid_t pid = -1);
+  StatusTuple attach_usdt(const USDT& usdt, pid_t pid = -1);
+  StatusTuple detach_usdt(const USDT& usdt, pid_t pid = -1);
+
+  StatusTuple attach_tracepoint(const std::string& tracepoint,
+                                const std::string& probe_func);
+  StatusTuple detach_tracepoint(const std::string& tracepoint);
+
+  StatusTuple attach_perf_event(uint32_t ev_type, uint32_t ev_config,
+                                const std::string& probe_func,
+                                uint64_t sample_period, uint64_t sample_freq,
+                                pid_t pid = -1, int cpu = -1,
+                                int group_fd = -1);
+  StatusTuple attach_perf_event_raw(void* perf_event_attr,
+                                    const std::string& probe_func,
+                                    pid_t pid = -1, int cpu = -1,
+                                    int group_fd = -1,
+                                    unsigned long extra_flags = 0);
+  StatusTuple detach_perf_event(uint32_t ev_type, uint32_t ev_config);
+  StatusTuple detach_perf_event_raw(void* perf_event_attr);
+  std::string get_syscall_fnname(const std::string& name);
+
+  BPFTable get_table(const std::string& name) {
+    TableStorage::iterator it;
+    if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+      return BPFTable(it->second);
+    return BPFTable({});
+  }
+
+  template <class ValueType>
+  BPFArrayTable<ValueType> get_array_table(const std::string& name) {
+    TableStorage::iterator it;
+    if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+      return BPFArrayTable<ValueType>(it->second);
+    return BPFArrayTable<ValueType>({});
+  }
+
+  template <class ValueType>
+  BPFPercpuArrayTable<ValueType> get_percpu_array_table(
+      const std::string& name) {
+    TableStorage::iterator it;
+    if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+      return BPFPercpuArrayTable<ValueType>(it->second);
+    return BPFPercpuArrayTable<ValueType>({});
+  }
+
+  template <class KeyType, class ValueType>
+  BPFHashTable<KeyType, ValueType> get_hash_table(const std::string& name) {
+    TableStorage::iterator it;
+    if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+      return BPFHashTable<KeyType, ValueType>(it->second);
+    return BPFHashTable<KeyType, ValueType>({});
+  }
+
+  template <class KeyType, class ValueType>
+  BPFPercpuHashTable<KeyType, ValueType> get_percpu_hash_table(
+      const std::string& name) {
+    TableStorage::iterator it;
+    if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it))
+      return BPFPercpuHashTable<KeyType, ValueType>(it->second);
+    return BPFPercpuHashTable<KeyType, ValueType>({});
+  }
+
+  BPFProgTable get_prog_table(const std::string& name);
+
+  BPFCgroupArray get_cgroup_array(const std::string& name);
+
+  BPFDevmapTable get_devmap_table(const std::string& name);
+
+  BPFStackTable get_stack_table(const std::string& name,
+                                bool use_debug_file = true,
+                                bool check_debug_file_crc = true);
+
+  StatusTuple open_perf_event(const std::string& name, uint32_t type,
+                              uint64_t config);
+
+  StatusTuple close_perf_event(const std::string& name);
+
+  // Open a Perf Buffer of given name, providing callback and callback cookie
+  // to use when polling. BPF class owns the opened Perf Buffer and will free
+  // it on-demand or on destruction.
+  StatusTuple open_perf_buffer(const std::string& name, perf_reader_raw_cb cb,
+                               perf_reader_lost_cb lost_cb = nullptr,
+                               void* cb_cookie = nullptr,
+                               int page_cnt = DEFAULT_PERF_BUFFER_PAGE_CNT);
+  // Close and free the Perf Buffer of given name.
+  StatusTuple close_perf_buffer(const std::string& name);
+  // Obtain a pointer to the opened BPFPerfBuffer instance of given name.
+  // Will return nullptr if such open Perf Buffer doesn't exist.
+  BPFPerfBuffer* get_perf_buffer(const std::string& name);
+  // Poll an opened Perf Buffer of given name with given timeout, using callback
+  // provided when opening. Do nothing if such open Perf Buffer doesn't exist.
+  // Returns:
+  //   -1 on error or if perf buffer with such name doesn't exist;
+  //   0, if no data was available before timeout;
+  //   number of CPUs that have new data, otherwise.
+  int poll_perf_buffer(const std::string& name, int timeout_ms = -1);
+
+  StatusTuple load_func(const std::string& func_name, enum bpf_prog_type type,
+                        int& fd);
+  StatusTuple unload_func(const std::string& func_name);
+
+ private:
+  std::string get_kprobe_event(const std::string& kernel_func,
+                               bpf_probe_attach_type type);
+  std::string get_uprobe_event(const std::string& binary_path, uint64_t offset,
+                               bpf_probe_attach_type type, pid_t pid);
+
+  StatusTuple detach_kprobe_event(const std::string& event, open_probe_t& attr);
+  StatusTuple detach_uprobe_event(const std::string& event, open_probe_t& attr);
+  StatusTuple detach_tracepoint_event(const std::string& tracepoint,
+                                      open_probe_t& attr);
+  StatusTuple detach_perf_event_all_cpu(open_probe_t& attr);
+
+  std::string attach_type_debug(bpf_probe_attach_type type) {
+    switch (type) {
+    case BPF_PROBE_ENTRY:
+      return "";
+    case BPF_PROBE_RETURN:
+      return "return ";
+    }
+    return "ERROR";
+  }
+
+  std::string attach_type_prefix(bpf_probe_attach_type type) {
+    switch (type) {
+    case BPF_PROBE_ENTRY:
+      return "p";
+    case BPF_PROBE_RETURN:
+      return "r";
+    }
+    return "ERROR";
+  }
+
+  static bool kprobe_event_validator(char c) {
+    return (c != '+') && (c != '.');
+  }
+
+  static bool uprobe_path_validator(char c) {
+    return std::isalpha(c) || std::isdigit(c) || (c == '_');
+  }
+
+  StatusTuple check_binary_symbol(const std::string& binary_path,
+                                  const std::string& symbol,
+                                  uint64_t symbol_addr, std::string& module_res,
+                                  uint64_t& offset_res);
+
+  int flag_;
+
+  std::unique_ptr<std::string> syscall_prefix_;
+
+  std::unique_ptr<BPFModule> bpf_module_;
+
+  std::map<std::string, int> funcs_;
+
+  std::vector<USDT> usdt_;
+
+  std::map<std::string, open_probe_t> kprobes_;
+  std::map<std::string, open_probe_t> uprobes_;
+  std::map<std::string, open_probe_t> tracepoints_;
+  std::map<std::string, BPFPerfBuffer*> perf_buffers_;
+  std::map<std::string, BPFPerfEventArray*> perf_event_arrays_;
+  std::map<std::pair<uint32_t, uint32_t>, open_probe_t> perf_events_;
+};
+
+class USDT {
+ public:
+  USDT(const std::string& binary_path, const std::string& provider,
+       const std::string& name, const std::string& probe_func);
+  USDT(pid_t pid, const std::string& provider, const std::string& name,
+       const std::string& probe_func);
+  USDT(const std::string& binary_path, pid_t pid, const std::string& provider,
+       const std::string& name, const std::string& probe_func);
+  USDT(const USDT& usdt);
+  USDT(USDT&& usdt) noexcept;
+
+  StatusTuple init();
+
+  bool operator==(const USDT& other) const;
+
+  std::string print_name() const {
+    return provider_ + ":" + name_ + " from binary " + binary_path_ + " PID " +
+           std::to_string(pid_) + " for probe " + probe_func_;
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, const USDT& usdt) {
+    return out << usdt.provider_ << ":" << usdt.name_ << " from binary "
+               << usdt.binary_path_ << " PID " << usdt.pid_ << " for probe "
+               << usdt.probe_func_;
+  }
+
+ private:
+  bool initialized_;
+
+  std::string binary_path_;
+  pid_t pid_;
+
+  std::string provider_;
+  std::string name_;
+  std::string probe_func_;
+
+  std::unique_ptr<void, std::function<void(void*)>> probe_;
+  std::string program_text_;
+
+  friend class BPF;
+};
+
+}  // namespace ebpf
diff --git a/src/cc/api/BPFTable.cc b/src/cc/api/BPFTable.cc
new file mode 100644
index 0000000..64fe77c
--- /dev/null
+++ b/src/cc/api/BPFTable.cc
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fcntl.h>
+#include <linux/elf.h>
+#include <linux/perf_event.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <cerrno>
+#include <cinttypes>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <memory>
+
+#include "BPFTable.h"
+
+#include "bcc_exception.h"
+#include "bcc_syms.h"
+#include "common.h"
+#include "file_desc.h"
+#include "libbpf.h"
+#include "perf_reader.h"
+
+namespace ebpf {
+
+BPFTable::BPFTable(const TableDesc& desc) : BPFTableBase<void, void>(desc) {}
+
+StatusTuple BPFTable::get_value(const std::string& key_str,
+                                std::string& value_str) {
+  char key[desc.key_size];
+  char value[desc.leaf_size];
+
+  StatusTuple r(0);
+
+  r = string_to_key(key_str, key);
+  if (r.code() != 0)
+    return r;
+
+  if (!lookup(key, value))
+    return StatusTuple(-1, "error getting value");
+
+  return leaf_to_string(value, value_str);
+}
+
+StatusTuple BPFTable::get_value(const std::string& key_str,
+                                std::vector<std::string>& value_str) {
+  size_t ncpus = get_possible_cpus().size();
+  char key[desc.key_size];
+  char value[desc.leaf_size * ncpus];
+
+  StatusTuple r(0);
+
+  r = string_to_key(key_str, key);
+  if (r.code() != 0)
+    return r;
+
+  if (!lookup(key, value))
+    return StatusTuple(-1, "error getting value");
+
+  value_str.resize(ncpus);
+
+  for (size_t i = 0; i < ncpus; i++) {
+    r = leaf_to_string(value + i * desc.leaf_size, value_str.at(i));
+    if (r.code() != 0)
+      return r;
+  }
+  return StatusTuple(0);
+}
+
+StatusTuple BPFTable::update_value(const std::string& key_str,
+                                   const std::string& value_str) {
+  char key[desc.key_size];
+  char value[desc.leaf_size];
+
+  StatusTuple r(0);
+
+  r = string_to_key(key_str, key);
+  if (r.code() != 0)
+    return r;
+
+  r = string_to_leaf(value_str, value);
+  if (r.code() != 0)
+    return r;
+
+  if (!update(key, value))
+    return StatusTuple(-1, "error updating element");
+
+  return StatusTuple(0);
+}
+
+StatusTuple BPFTable::update_value(const std::string& key_str,
+                                   const std::vector<std::string>& value_str) {
+  size_t ncpus = get_possible_cpus().size();
+  char key[desc.key_size];
+  char value[desc.leaf_size * ncpus];
+
+  StatusTuple r(0);
+
+  r = string_to_key(key_str, key);
+  if (r.code() != 0)
+    return r;
+
+  if (value_str.size() != ncpus)
+    return StatusTuple(-1, "bad value size");
+
+  for (size_t i = 0; i < ncpus; i++) {
+    r = string_to_leaf(value_str.at(i), value + i * desc.leaf_size);
+    if (r.code() != 0)
+      return r;
+  }
+
+  if (!update(key, value))
+    return StatusTuple(-1, "error updating element");
+
+  return StatusTuple(0);
+}
+
+StatusTuple BPFTable::remove_value(const std::string& key_str) {
+  char key[desc.key_size];
+
+  StatusTuple r(0);
+
+  r = string_to_key(key_str, key);
+  if (r.code() != 0)
+    return r;
+
+  if (!remove(key))
+    return StatusTuple(-1, "error removing element");
+
+  return StatusTuple(0);
+}
+
+StatusTuple BPFTable::clear_table_non_atomic() {
+  if (desc.type == BPF_MAP_TYPE_HASH || desc.type == BPF_MAP_TYPE_PERCPU_HASH ||
+      desc.type == BPF_MAP_TYPE_LRU_HASH ||
+      desc.type == BPF_MAP_TYPE_PERCPU_HASH ||
+      desc.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+    // For hash maps, use the first() interface (which uses get_next_key) to
+    // iterate through the map and clear elements
+    auto key = std::unique_ptr<void, decltype(::free)*>(::malloc(desc.key_size),
+                                                        ::free);
+
+    while (this->first(key.get()))
+      if (!this->remove(key.get())) {
+        return StatusTuple(-1,
+                           "Failed to delete element when clearing table %s",
+                           desc.name.c_str());
+      }
+  } else if (desc.type == BPF_MAP_TYPE_ARRAY ||
+             desc.type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+    return StatusTuple(-1, "Array map %s do not support clearing elements",
+                       desc.name.c_str());
+  } else if (desc.type == BPF_MAP_TYPE_PROG_ARRAY ||
+             desc.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+             desc.type == BPF_MAP_TYPE_STACK_TRACE ||
+             desc.type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
+    // For Stack-trace and FD arrays, just iterate over all indices
+    for (size_t i = 0; i < desc.max_entries; i++) {
+      this->remove(&i);
+    }
+  } else {
+    return StatusTuple(-1, "Clearing for map type of %s not supported yet",
+                       desc.name.c_str());
+  }
+
+  return StatusTuple(0);
+}
+
+StatusTuple BPFTable::get_table_offline(
+  std::vector<std::pair<std::string, std::string>> &res) {
+  StatusTuple r(0);
+  int err;
+
+  auto key = std::unique_ptr<void, decltype(::free)*>(::malloc(desc.key_size),
+                                                      ::free);
+  auto value = std::unique_ptr<void, decltype(::free)*>(::malloc(desc.leaf_size),
+                                                      ::free);
+  std::string key_str;
+  std::string value_str;
+
+  if (desc.type == BPF_MAP_TYPE_ARRAY ||
+      desc.type == BPF_MAP_TYPE_PROG_ARRAY ||
+      desc.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+      desc.type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+      desc.type == BPF_MAP_TYPE_CGROUP_ARRAY ||
+      desc.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+      desc.type == BPF_MAP_TYPE_DEVMAP ||
+      desc.type == BPF_MAP_TYPE_CPUMAP ||
+      desc.type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+    // For arrays, just iterate over all indices
+    for (size_t i = 0; i < desc.max_entries; i++) {
+      err = bpf_lookup_elem(desc.fd, &i, value.get());
+      if (err < 0 && errno == ENOENT) {
+        // Element is not present, skip it
+        continue;
+      } else if (err < 0) {
+        // Other error, abort
+        return StatusTuple(-1, "Error looking up value: %s", std::strerror(errno));
+      }
+
+      r = key_to_string(&i, key_str);
+      if (r.code() != 0)
+        return r;
+
+      r = leaf_to_string(value.get(), value_str);
+      if (r.code() != 0)
+        return r;
+      res.emplace_back(key_str, value_str);
+    }
+  } else {
+    res.clear();
+    // For other maps, try to use the first() and next() interfaces
+    if (!this->first(key.get()))
+      return StatusTuple(0);
+
+    while (true) {
+      if (!this->lookup(key.get(), value.get()))
+        break;
+      r = key_to_string(key.get(), key_str);
+      if (r.code() != 0)
+        return r;
+
+      r = leaf_to_string(value.get(), value_str);
+      if (r.code() != 0)
+        return r;
+      res.emplace_back(key_str, value_str);
+      if (!this->next(key.get(), key.get()))
+        break;
+    }
+  }
+
+  return StatusTuple(0);
+}
+
+size_t BPFTable::get_possible_cpu_count() { return get_possible_cpus().size(); }
+
+BPFStackTable::BPFStackTable(const TableDesc& desc, bool use_debug_file,
+                             bool check_debug_file_crc)
+    : BPFTableBase<int, stacktrace_t>(desc) {
+  if (desc.type != BPF_MAP_TYPE_STACK_TRACE)
+    throw std::invalid_argument("Table '" + desc.name +
+                                "' is not a stack table");
+
+  symbol_option_ = {.use_debug_file = use_debug_file,
+                    .check_debug_file_crc = check_debug_file_crc,
+                    .use_symbol_type = (1 << STT_FUNC) | (1 << STT_GNU_IFUNC)};
+}
+
+BPFStackTable::BPFStackTable(BPFStackTable&& that)
+    : BPFTableBase<int, stacktrace_t>(that.desc),
+      symbol_option_(std::move(that.symbol_option_)),
+      pid_sym_(std::move(that.pid_sym_)) {
+  that.pid_sym_.clear();
+}
+
+BPFStackTable::~BPFStackTable() {
+  for (auto it : pid_sym_)
+    bcc_free_symcache(it.second, it.first);
+}
+
+void BPFStackTable::clear_table_non_atomic() {
+  for (int i = 0; size_t(i) < capacity(); i++) {
+    remove(&i);
+  }
+}
+
+std::vector<uintptr_t> BPFStackTable::get_stack_addr(int stack_id) {
+  std::vector<uintptr_t> res;
+  stacktrace_t stack;
+  if (stack_id < 0)
+    return res;
+  if (!lookup(&stack_id, &stack))
+    return res;
+  for (int i = 0; (i < BPF_MAX_STACK_DEPTH) && (stack.ip[i] != 0); i++)
+    res.push_back(stack.ip[i]);
+  return res;
+}
+
+std::vector<std::string> BPFStackTable::get_stack_symbol(int stack_id,
+                                                         int pid) {
+  auto addresses = get_stack_addr(stack_id);
+  std::vector<std::string> res;
+  if (addresses.empty())
+    return res;
+  res.reserve(addresses.size());
+
+  if (pid < 0)
+    pid = -1;
+  if (pid_sym_.find(pid) == pid_sym_.end())
+    pid_sym_[pid] = bcc_symcache_new(pid, &symbol_option_);
+  void* cache = pid_sym_[pid];
+
+  bcc_symbol symbol;
+  for (auto addr : addresses)
+    if (bcc_symcache_resolve(cache, addr, &symbol) != 0)
+      res.emplace_back("[UNKNOWN]");
+    else {
+      res.push_back(symbol.demangle_name);
+      bcc_symbol_free_demangle_name(&symbol);
+    }
+
+  return res;
+}
+
+BPFPerfBuffer::BPFPerfBuffer(const TableDesc& desc)
+    : BPFTableBase<int, int>(desc), epfd_(-1) {
+  if (desc.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+    throw std::invalid_argument("Table '" + desc.name +
+                                "' is not a perf buffer");
+}
+
+StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb,
+                                       perf_reader_lost_cb lost_cb, int cpu,
+                                       void* cb_cookie, int page_cnt) {
+  if (cpu_readers_.find(cpu) != cpu_readers_.end())
+    return StatusTuple(-1, "Perf buffer already open on CPU %d", cpu);
+
+  auto reader = static_cast<perf_reader*>(
+      bpf_open_perf_buffer(cb, lost_cb, cb_cookie, -1, cpu, page_cnt));
+  if (reader == nullptr)
+    return StatusTuple(-1, "Unable to construct perf reader");
+
+  int reader_fd = perf_reader_fd(reader);
+  if (!update(&cpu, &reader_fd)) {
+    perf_reader_free(static_cast<void*>(reader));
+    return StatusTuple(-1, "Unable to open perf buffer on CPU %d: %s", cpu,
+                       std::strerror(errno));
+  }
+
+  struct epoll_event event = {};
+  event.events = EPOLLIN;
+  event.data.ptr = static_cast<void*>(reader);
+  if (epoll_ctl(epfd_, EPOLL_CTL_ADD, reader_fd, &event) != 0) {
+    perf_reader_free(static_cast<void*>(reader));
+    return StatusTuple(-1, "Unable to add perf_reader FD to epoll: %s",
+                       std::strerror(errno));
+  }
+
+  cpu_readers_[cpu] = reader;
+  return StatusTuple(0);
+}
+
+StatusTuple BPFPerfBuffer::open_all_cpu(perf_reader_raw_cb cb,
+                                        perf_reader_lost_cb lost_cb,
+                                        void* cb_cookie, int page_cnt) {
+  if (cpu_readers_.size() != 0 || epfd_ != -1)
+    return StatusTuple(-1, "Previously opened perf buffer not cleaned");
+
+  std::vector<int> cpus = get_online_cpus();
+  ep_events_.reset(new epoll_event[cpus.size()]);
+  epfd_ = epoll_create1(EPOLL_CLOEXEC);
+
+  for (int i : cpus) {
+    auto res = open_on_cpu(cb, lost_cb, i, cb_cookie, page_cnt);
+    if (res.code() != 0) {
+      TRY2(close_all_cpu());
+      return res;
+    }
+  }
+  return StatusTuple(0);
+}
+
+StatusTuple BPFPerfBuffer::close_on_cpu(int cpu) {
+  auto it = cpu_readers_.find(cpu);
+  if (it == cpu_readers_.end())
+    return StatusTuple(0);
+  perf_reader_free(static_cast<void*>(it->second));
+  if (!remove(const_cast<int*>(&(it->first))))
+    return StatusTuple(-1, "Unable to close perf buffer on CPU %d", it->first);
+  cpu_readers_.erase(it);
+  return StatusTuple(0);
+}
+
+StatusTuple BPFPerfBuffer::close_all_cpu() {
+  std::string errors;
+  bool has_error = false;
+
+  if (epfd_ >= 0) {
+    int close_res = close(epfd_);
+    epfd_ = -1;
+    ep_events_.reset();
+    if (close_res != 0) {
+      has_error = true;
+      errors += std::string(std::strerror(errno)) + "\n";
+    }
+  }
+
+  std::vector<int> opened_cpus;
+  for (auto it : cpu_readers_)
+    opened_cpus.push_back(it.first);
+  for (int i : opened_cpus) {
+    auto res = close_on_cpu(i);
+    if (res.code() != 0) {
+      errors += "Failed to close CPU" + std::to_string(i) + " perf buffer: ";
+      errors += res.msg() + "\n";
+      has_error = true;
+    }
+  }
+
+  if (has_error)
+    return StatusTuple(-1, errors);
+  return StatusTuple(0);
+}
+
+int BPFPerfBuffer::poll(int timeout_ms) {
+  if (epfd_ < 0)
+    return -1;
+  int cnt =
+      epoll_wait(epfd_, ep_events_.get(), cpu_readers_.size(), timeout_ms);
+  for (int i = 0; i < cnt; i++)
+    perf_reader_event_read(static_cast<perf_reader*>(ep_events_[i].data.ptr));
+  return cnt;
+}
+
+BPFPerfBuffer::~BPFPerfBuffer() {
+  auto res = close_all_cpu();
+  if (res.code() != 0)
+    std::cerr << "Failed to close all perf buffer on destruction: " << res.msg()
+              << std::endl;
+}
+
+BPFPerfEventArray::BPFPerfEventArray(const TableDesc& desc)
+    : BPFTableBase<int, int>(desc) {
+  if (desc.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+    throw std::invalid_argument("Table '" + desc.name +
+                                "' is not a perf event array");
+}
+
+StatusTuple BPFPerfEventArray::open_all_cpu(uint32_t type, uint64_t config) {
+  if (cpu_fds_.size() != 0)
+    return StatusTuple(-1, "Previously opened perf event not cleaned");
+
+  std::vector<int> cpus = get_online_cpus();
+
+  for (int i : cpus) {
+    auto res = open_on_cpu(i, type, config);
+    if (res.code() != 0) {
+      TRY2(close_all_cpu());
+      return res;
+    }
+  }
+  return StatusTuple(0);
+}
+
+StatusTuple BPFPerfEventArray::close_all_cpu() {
+  std::string errors;
+  bool has_error = false;
+
+  std::vector<int> opened_cpus;
+  for (auto it : cpu_fds_)
+    opened_cpus.push_back(it.first);
+  for (int i : opened_cpus) {
+    auto res = close_on_cpu(i);
+    if (res.code() != 0) {
+      errors += "Failed to close CPU" + std::to_string(i) + " perf event: ";
+      errors += res.msg() + "\n";
+      has_error = true;
+    }
+  }
+
+  if (has_error)
+    return StatusTuple(-1, errors);
+  return StatusTuple(0);
+}
+
+StatusTuple BPFPerfEventArray::open_on_cpu(int cpu, uint32_t type,
+                                           uint64_t config) {
+  if (cpu_fds_.find(cpu) != cpu_fds_.end())
+    return StatusTuple(-1, "Perf event already open on CPU %d", cpu);
+  int fd = bpf_open_perf_event(type, config, -1, cpu);
+  if (fd < 0) {
+    return StatusTuple(-1, "Error constructing perf event %" PRIu32 ":%" PRIu64,
+                       type, config);
+  }
+  if (!update(&cpu, &fd)) {
+    bpf_close_perf_event_fd(fd);
+    return StatusTuple(-1, "Unable to open perf event on CPU %d: %s", cpu,
+                       std::strerror(errno));
+  }
+  cpu_fds_[cpu] = fd;
+  return StatusTuple(0);
+}
+
+StatusTuple BPFPerfEventArray::close_on_cpu(int cpu) {
+  auto it = cpu_fds_.find(cpu);
+  if (it == cpu_fds_.end()) {
+    return StatusTuple(0);
+  }
+  bpf_close_perf_event_fd(it->second);
+  cpu_fds_.erase(it);
+  return StatusTuple(0);
+}
+
+BPFPerfEventArray::~BPFPerfEventArray() {
+  auto res = close_all_cpu();
+  if (res.code() != 0) {
+    std::cerr << "Failed to close all perf buffer on destruction: " << res.msg()
+              << std::endl;
+  }
+}
+
+BPFProgTable::BPFProgTable(const TableDesc& desc)
+    : BPFTableBase<int, int>(desc) {
+  if (desc.type != BPF_MAP_TYPE_PROG_ARRAY)
+    throw std::invalid_argument("Table '" + desc.name +
+                                "' is not a prog table");
+}
+
+StatusTuple BPFProgTable::update_value(const int& index, const int& prog_fd) {
+  if (!this->update(const_cast<int*>(&index), const_cast<int*>(&prog_fd)))
+    return StatusTuple(-1, "Error updating value: %s", std::strerror(errno));
+  return StatusTuple(0);
+}
+
+StatusTuple BPFProgTable::remove_value(const int& index) {
+  if (!this->remove(const_cast<int*>(&index)))
+    return StatusTuple(-1, "Error removing value: %s", std::strerror(errno));
+  return StatusTuple(0);
+}
+
+BPFCgroupArray::BPFCgroupArray(const TableDesc& desc)
+    : BPFTableBase<int, int>(desc) {
+  if (desc.type != BPF_MAP_TYPE_CGROUP_ARRAY)
+    throw std::invalid_argument("Table '" + desc.name +
+                                "' is not a cgroup array");
+}
+
+StatusTuple BPFCgroupArray::update_value(const int& index,
+                                         const int& cgroup2_fd) {
+  if (!this->update(const_cast<int*>(&index), const_cast<int*>(&cgroup2_fd)))
+    return StatusTuple(-1, "Error updating value: %s", std::strerror(errno));
+  return StatusTuple(0);
+}
+
+StatusTuple BPFCgroupArray::update_value(const int& index,
+                                         const std::string& cgroup2_path) {
+  FileDesc f(::open(cgroup2_path.c_str(), O_RDONLY | O_CLOEXEC));
+  if ((int)f < 0)
+    return StatusTuple(-1, "Unable to open %s", cgroup2_path.c_str());
+  TRY2(update_value(index, (int)f));
+  return StatusTuple(0);
+}
+
+StatusTuple BPFCgroupArray::remove_value(const int& index) {
+  if (!this->remove(const_cast<int*>(&index)))
+    return StatusTuple(-1, "Error removing value: %s", std::strerror(errno));
+  return StatusTuple(0);
+}
+
+BPFDevmapTable::BPFDevmapTable(const TableDesc& desc) 
+    : BPFTableBase<int, int>(desc) {
+    if(desc.type != BPF_MAP_TYPE_DEVMAP)
+      throw std::invalid_argument("Table '" + desc.name + 
+                                  "' is not a devmap table");
+}
+
+StatusTuple BPFDevmapTable::update_value(const int& index, 
+                                         const int& value) {
+    if (!this->update(const_cast<int*>(&index), const_cast<int*>(&value)))
+      return StatusTuple(-1, "Error updating value: %s", std::strerror(errno));
+    return StatusTuple(0);
+}
+
+StatusTuple BPFDevmapTable::get_value(const int& index, 
+                                      int& value) {
+    if (!this->lookup(const_cast<int*>(&index), &value))
+      return StatusTuple(-1, "Error getting value: %s", std::strerror(errno));
+    return StatusTuple(0);
+}
+
+StatusTuple BPFDevmapTable::remove_value(const int& index) {
+    if (!this->remove(const_cast<int*>(&index)))
+      return StatusTuple(-1, "Error removing value: %s", std::strerror(errno));
+    return StatusTuple(0);
+}
+
+}  // namespace ebpf
diff --git a/src/cc/api/BPFTable.h b/src/cc/api/BPFTable.h
new file mode 100644
index 0000000..3a183f4
--- /dev/null
+++ b/src/cc/api/BPFTable.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <errno.h>
+#include <sys/epoll.h>
+#include <cstring>
+#include <exception>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "bcc_exception.h"
+#include "bcc_syms.h"
+#include "bpf_module.h"
+#include "libbpf.h"
+#include "perf_reader.h"
+#include "table_desc.h"
+
+namespace ebpf {
+
+// Thin typed wrapper around one BPF map file descriptor described by a
+// TableDesc.  Subclasses expose type-safe operations; the protected helpers
+// map directly onto the libbpf *_elem wrappers and return true when the
+// underlying call returns >= 0.
+template <class KeyType, class ValueType>
+class BPFTableBase {
+ public:
+  // Maximum number of entries the map was created with.
+  size_t capacity() { return desc.max_entries; }
+
+  // Parse a textual key using the sscanf helper generated for this table.
+  StatusTuple string_to_key(const std::string& key_str, KeyType* key) {
+    return desc.key_sscanf(key_str.c_str(), key);
+  }
+
+  // Parse a textual leaf using the sscanf helper generated for this table.
+  StatusTuple string_to_leaf(const std::string& value_str, ValueType* value) {
+    return desc.leaf_sscanf(value_str.c_str(), value);
+  }
+
+  // Format a key as text.  The buffer allows 8 text bytes per key byte —
+  // presumably sufficient for the generated snprintf helper (TODO confirm).
+  StatusTuple key_to_string(const KeyType* key, std::string& key_str) {
+    char buf[8 * desc.key_size];
+    StatusTuple rc = desc.key_snprintf(buf, sizeof(buf), key);
+    if (!rc.code())
+      key_str.assign(buf);
+    return rc;
+  }
+
+  // Format a leaf as text; same buffer sizing rule as key_to_string.
+  StatusTuple leaf_to_string(const ValueType* value, std::string& value_str) {
+    char buf[8 * desc.leaf_size];
+    StatusTuple rc = desc.leaf_snprintf(buf, sizeof(buf), value);
+    if (!rc.code())
+      value_str.assign(buf);
+    return rc;
+  }
+
+ protected:
+  explicit BPFTableBase(const TableDesc& desc) : desc(desc) {}
+
+  bool lookup(void* key, void* value) {
+    return bpf_lookup_elem(desc.fd, key, value) >= 0;
+  }
+
+  bool first(void* key) {
+    return bpf_get_first_key(desc.fd, key, desc.key_size) >= 0;
+  }
+
+  bool next(void* key, void* next_key) {
+    return bpf_get_next_key(desc.fd, key, next_key) >= 0;
+  }
+
+  // Passes flags = 0 to bpf_update_elem (presumably BPF_ANY, i.e.
+  // create-or-replace — TODO confirm against libbpf.h).
+  bool update(void* key, void* value) {
+    return bpf_update_elem(desc.fd, key, value, 0) >= 0;
+  }
+
+  bool remove(void* key) { return bpf_delete_elem(desc.fd, key) >= 0; }
+
+  // Descriptor (fd, sizes, converters) for the wrapped map; not owned.
+  const TableDesc& desc;
+};
+
+// String-keyed, string-valued facade over an arbitrary BPF table; all
+// conversions go through the TableDesc sscanf/snprintf helpers.  The
+// std::vector<std::string> overloads presumably serve per-cpu tables
+// (cf. get_possible_cpu_count) — implementation lives in BPFTable.cc.
+class BPFTable : public BPFTableBase<void, void> {
+ public:
+  BPFTable(const TableDesc& desc);
+
+  StatusTuple get_value(const std::string& key_str, std::string& value);
+  StatusTuple get_value(const std::string& key_str,
+                        std::vector<std::string>& value);
+
+  StatusTuple update_value(const std::string& key_str,
+                           const std::string& value_str);
+  StatusTuple update_value(const std::string& key_str,
+                           const std::vector<std::string>& value_str);
+
+  StatusTuple remove_value(const std::string& key_str);
+
+  // Deletes entries one by one; not atomic w.r.t. concurrent writers.
+  StatusTuple clear_table_non_atomic();
+  StatusTuple get_table_offline(std::vector<std::pair<std::string, std::string>> &res);
+
+  static size_t get_possible_cpu_count();
+};
+
+// Address-of helpers for passing leaves to the raw lookup/update calls:
+// scalar leaves pass &t; vector leaves (per-cpu layouts) pass the
+// contiguous element buffer via t.data().
+template <class ValueType>
+void* get_value_addr(ValueType& t) {
+  return &t;
+}
+
+template <class ValueType>
+void* get_value_addr(std::vector<ValueType>& t) {
+  return t.data();
+}
+
+// Typed view of a BPF array map (BPF_MAP_TYPE_ARRAY or PERCPU_ARRAY),
+// indexed by int.  Throws std::invalid_argument from the constructor for
+// any other map type.
+template <class ValueType>
+class BPFArrayTable : public BPFTableBase<int, ValueType> {
+ public:
+  BPFArrayTable(const TableDesc& desc) : BPFTableBase<int, ValueType>(desc) {
+    if (desc.type != BPF_MAP_TYPE_ARRAY &&
+        desc.type != BPF_MAP_TYPE_PERCPU_ARRAY)
+      throw std::invalid_argument("Table '" + desc.name +
+                                  "' is not an array table");
+  }
+
+  // Read entry `index` into `value`; errno message on failure.
+  virtual StatusTuple get_value(const int& index, ValueType& value) {
+    if (!this->lookup(const_cast<int*>(&index), get_value_addr(value)))
+      return StatusTuple(-1, "Error getting value: %s", std::strerror(errno));
+    return StatusTuple(0);
+  }
+
+  virtual StatusTuple update_value(const int& index, const ValueType& value) {
+    if (!this->update(const_cast<int*>(&index),
+                      get_value_addr(const_cast<ValueType&>(value))))
+      return StatusTuple(-1, "Error updating value: %s", std::strerror(errno));
+    return StatusTuple(0);
+  }
+
+  // Convenience read; NOTE: lookup errors are silently ignored and a
+  // default-constructed value is returned.
+  ValueType operator[](const int& key) {
+    ValueType value;
+    get_value(key, value);
+    return value;
+  }
+
+  // Snapshot of all capacity() slots; per-slot read errors are ignored,
+  // leaving that slot default-constructed.
+  std::vector<ValueType> get_table_offline() {
+    std::vector<ValueType> res(this->capacity());
+
+    for (int i = 0; i < (int)this->capacity(); i++) {
+      get_value(i, res[i]);
+    }
+
+    return res;
+  }
+};
+
+// Per-cpu array table: each logical entry is a vector with one ValueType
+// per possible CPU.  get_value resizes the output to ncpus; update_value
+// requires exactly ncpus elements.
+template <class ValueType>
+class BPFPercpuArrayTable : public BPFArrayTable<std::vector<ValueType>> {
+ public:
+  BPFPercpuArrayTable(const TableDesc& desc)
+      : BPFArrayTable<std::vector<ValueType>>(desc),
+        ncpus(BPFTable::get_possible_cpu_count()) {
+    if (desc.type != BPF_MAP_TYPE_PERCPU_ARRAY)
+      throw std::invalid_argument("Table '" + desc.name +
+                                  "' is not a percpu array table");
+    // leaf structures have to be aligned to 8 bytes as hardcoded in the linux
+    // kernel.
+    if (sizeof(ValueType) % 8)
+      throw std::invalid_argument("leaf must be aligned to 8 bytes");
+  }
+
+  StatusTuple get_value(const int& index, std::vector<ValueType>& value) {
+    value.resize(ncpus);
+    return BPFArrayTable<std::vector<ValueType>>::get_value(index, value);
+  }
+
+  StatusTuple update_value(const int& index,
+                           const std::vector<ValueType>& value) {
+    if (value.size() != ncpus)
+      return StatusTuple(-1, "bad value size");
+    return BPFArrayTable<std::vector<ValueType>>::update_value(index, value);
+  }
+
+ private:
+  // Number of possible CPUs, captured once at construction.
+  unsigned int ncpus;
+};
+
+// Typed view of a BPF hash map (plain, per-cpu, LRU, or LRU per-cpu).
+// Throws std::invalid_argument from the constructor for any other type.
+template <class KeyType, class ValueType>
+class BPFHashTable : public BPFTableBase<KeyType, ValueType> {
+ public:
+  explicit BPFHashTable(const TableDesc& desc)
+      : BPFTableBase<KeyType, ValueType>(desc) {
+    if (desc.type != BPF_MAP_TYPE_HASH &&
+        desc.type != BPF_MAP_TYPE_PERCPU_HASH &&
+        desc.type != BPF_MAP_TYPE_LRU_HASH &&
+        desc.type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
+      throw std::invalid_argument("Table '" + desc.name +
+                                  "' is not a hash table");
+  }
+
+  virtual StatusTuple get_value(const KeyType& key, ValueType& value) {
+    if (!this->lookup(const_cast<KeyType*>(&key), get_value_addr(value)))
+      return StatusTuple(-1, "Error getting value: %s", std::strerror(errno));
+    return StatusTuple(0);
+  }
+
+  virtual StatusTuple update_value(const KeyType& key, const ValueType& value) {
+    if (!this->update(const_cast<KeyType*>(&key),
+                      get_value_addr(const_cast<ValueType&>(value))))
+      return StatusTuple(-1, "Error updating value: %s", std::strerror(errno));
+    return StatusTuple(0);
+  }
+
+  virtual StatusTuple remove_value(const KeyType& key) {
+    if (!this->remove(const_cast<KeyType*>(&key)))
+      return StatusTuple(-1, "Error removing value: %s", std::strerror(errno));
+    return StatusTuple(0);
+  }
+
+  // Convenience read; NOTE: lookup errors are silently ignored and a
+  // default-constructed value is returned.
+  ValueType operator[](const KeyType& key) {
+    ValueType value;
+    get_value(key, value);
+    return value;
+  }
+
+  // Walk the map with first/next and collect (key, value) pairs.  Stops at
+  // the first failed lookup.  Note next() advances `cur` in place.  Not
+  // atomic w.r.t. concurrent writers.
+  std::vector<std::pair<KeyType, ValueType>> get_table_offline() {
+    std::vector<std::pair<KeyType, ValueType>> res;
+    KeyType cur;
+    ValueType value;
+
+    StatusTuple r(0);
+
+    if (!this->first(&cur))
+      return res;
+
+    while (true) {
+      r = get_value(cur, value);
+      if (r.code() != 0)
+        break;
+      res.emplace_back(cur, value);
+      if (!this->next(&cur, &cur))
+        break;
+    }
+
+    return res;
+  }
+
+  // Repeatedly delete the first remaining key until the map is empty;
+  // not atomic w.r.t. concurrent writers.
+  StatusTuple clear_table_non_atomic() {
+    KeyType cur;
+    while (this->first(&cur))
+      TRY2(remove_value(cur));
+
+    return StatusTuple(0);
+  }
+};
+
+// Per-cpu hash table: each logical value is a vector with one ValueType
+// per possible CPU.  get_value resizes the output to ncpus; update_value
+// requires exactly ncpus elements.
+template <class KeyType, class ValueType>
+class BPFPercpuHashTable
+    : public BPFHashTable<KeyType, std::vector<ValueType>> {
+ public:
+  explicit BPFPercpuHashTable(const TableDesc& desc)
+      : BPFHashTable<KeyType, std::vector<ValueType>>(desc),
+        ncpus(BPFTable::get_possible_cpu_count()) {
+    if (desc.type != BPF_MAP_TYPE_PERCPU_HASH &&
+        desc.type != BPF_MAP_TYPE_LRU_PERCPU_HASH)
+      throw std::invalid_argument("Table '" + desc.name +
+                                  "' is not a percpu hash table");
+    // leaf structures have to be aligned to 8 bytes as hardcoded in the linux
+    // kernel.
+    if (sizeof(ValueType) % 8)
+      throw std::invalid_argument("leaf must be aligned to 8 bytes");
+  }
+
+  StatusTuple get_value(const KeyType& key, std::vector<ValueType>& value) {
+    value.resize(ncpus);
+    return BPFHashTable<KeyType, std::vector<ValueType>>::get_value(key, value);
+  }
+
+  StatusTuple update_value(const KeyType& key,
+                           const std::vector<ValueType>& value) {
+    if (value.size() != ncpus)
+      return StatusTuple(-1, "bad value size");
+    return BPFHashTable<KeyType, std::vector<ValueType>>::update_value(key,
+                                                                       value);
+  }
+
+ private:
+  // Number of possible CPUs, captured once at construction.
+  unsigned int ncpus;
+};
+
+// From src/cc/export/helpers.h
+// User-space mirror of the kernel-side stack trace leaf: a fixed array of
+// up to BPF_MAX_STACK_DEPTH instruction pointers.  Must stay in sync with
+// the definition in helpers.h.
+static const int BPF_MAX_STACK_DEPTH = 127;
+struct stacktrace_t {
+  uintptr_t ip[BPF_MAX_STACK_DEPTH];
+};
+
+// Reads stack traces (stack_id -> stacktrace_t) from a stack trace table
+// and symbolizes them per pid.  pid_sym_ caches a symbol-resolver handle
+// per pid; symbol_option_ carries the debug-file flags given at
+// construction.  Movable but not copyable (owns resolver handles).
+class BPFStackTable : public BPFTableBase<int, stacktrace_t> {
+ public:
+  BPFStackTable(const TableDesc& desc, bool use_debug_file,
+                bool check_debug_file_crc);
+  BPFStackTable(BPFStackTable&& that);
+  ~BPFStackTable();
+
+  void clear_table_non_atomic();
+  std::vector<uintptr_t> get_stack_addr(int stack_id);
+  std::vector<std::string> get_stack_symbol(int stack_id, int pid);
+
+ private:
+  bcc_symbol_option symbol_option_;
+  // pid -> opaque symbol cache handle (see bcc_syms.h).
+  std::map<int, void*> pid_sym_;
+};
+
+// Collection of per-CPU perf ring-buffer readers.  open_all_cpu installs
+// one perf_reader per CPU (stored in cpu_readers_); poll() waits on all of
+// them through the single epoll fd epfd_.
+class BPFPerfBuffer : public BPFTableBase<int, int> {
+ public:
+  BPFPerfBuffer(const TableDesc& desc);
+  ~BPFPerfBuffer();
+
+  StatusTuple open_all_cpu(perf_reader_raw_cb cb, perf_reader_lost_cb lost_cb,
+                           void* cb_cookie, int page_cnt);
+  StatusTuple close_all_cpu();
+  int poll(int timeout_ms);
+
+ private:
+  StatusTuple open_on_cpu(perf_reader_raw_cb cb, perf_reader_lost_cb lost_cb,
+                          int cpu, void* cb_cookie, int page_cnt);
+  StatusTuple close_on_cpu(int cpu);
+
+  // cpu -> reader for that cpu's ring buffer.
+  std::map<int, perf_reader*> cpu_readers_;
+
+  int epfd_;
+  std::unique_ptr<epoll_event[]> ep_events_;
+};
+
+// Opens one perf event fd per CPU for the given (type, config) pair and
+// stores the fds in cpu_fds_; close_all_cpu tears them down.
+class BPFPerfEventArray : public BPFTableBase<int, int> {
+ public:
+  BPFPerfEventArray(const TableDesc& desc);
+  ~BPFPerfEventArray();
+
+  StatusTuple open_all_cpu(uint32_t type, uint64_t config);
+  StatusTuple close_all_cpu();
+
+ private:
+  StatusTuple open_on_cpu(int cpu, uint32_t type, uint64_t config);
+  StatusTuple close_on_cpu(int cpu);
+
+  // cpu -> perf event fd.
+  std::map<int, int> cpu_fds_;
+};
+
+// Array of BPF program fds, presumably a BPF_MAP_TYPE_PROG_ARRAY used for
+// tail calls — the type check lives in BPFTable.cc.
+class BPFProgTable : public BPFTableBase<int, int> {
+ public:
+  BPFProgTable(const TableDesc& desc);
+
+  StatusTuple update_value(const int& index, const int& prog_fd);
+  StatusTuple remove_value(const int& index);
+};
+
+// Array of cgroup fds; entries can be set either from an already-open fd
+// or by path to a cgroup-v2 directory (opened internally).  Type check
+// lives in BPFTable.cc.
+class BPFCgroupArray : public BPFTableBase<int, int> {
+ public:
+  BPFCgroupArray(const TableDesc& desc);
+
+  StatusTuple update_value(const int& index, const int& cgroup2_fd);
+  StatusTuple update_value(const int& index, const std::string& cgroup2_path);
+  StatusTuple remove_value(const int& index);
+};
+
+// User-space handle for a BPF_MAP_TYPE_DEVMAP table (int -> int).
+// Reformatted to match the file's declaration style (single-space access
+// specifier indent, no trailing whitespace, no stray blank line).
+class BPFDevmapTable : public BPFTableBase<int, int> {
+ public:
+  BPFDevmapTable(const TableDesc& desc);
+
+  StatusTuple update_value(const int& index, const int& value);
+  StatusTuple get_value(const int& index, int& value);
+  StatusTuple remove_value(const int& index);
+};
+
+}  // namespace ebpf
diff --git a/src/cc/api/CMakeLists.txt b/src/cc/api/CMakeLists.txt
new file mode 100644
index 0000000..4234e20
--- /dev/null
+++ b/src/cc/api/CMakeLists.txt
@@ -0,0 +1,3 @@
+# Static archive for the C++ API wrappers (BPF/BPFTable); the public headers
+# are installed into include/bcc as part of the libbcc component.
+set(bcc_api_sources BPF.cc BPFTable.cc)
+add_library(api-static STATIC ${bcc_api_sources})
+install(FILES BPF.h BPFTable.h COMPONENT libbcc DESTINATION include/bcc)
diff --git a/src/cc/bcc_debug.cc b/src/cc/bcc_debug.cc
new file mode 100644
index 0000000..786074a
--- /dev/null
+++ b/src/cc/bcc_debug.cc
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <map>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <llvm/DebugInfo/DWARF/DWARFContext.h>
+#include <llvm/DebugInfo/DWARF/DWARFDebugLine.h>
+#include <llvm/IR/Module.h>
+#include <llvm/MC/MCAsmInfo.h>
+#include <llvm/MC/MCContext.h>
+#include <llvm/MC/MCDisassembler/MCDisassembler.h>
+#include <llvm/MC/MCInstPrinter.h>
+#include <llvm/MC/MCInstrInfo.h>
+#include <llvm/MC/MCObjectFileInfo.h>
+#include <llvm/MC/MCRegisterInfo.h>
+#include <llvm/Support/TargetRegistry.h>
+
+#include "bcc_debug.h"
+
+namespace ebpf {
+
+// ld_pseudo can only be disassembled properly
+// in llvm 6.0, so having this workaround now
+// until disto llvm versions catch up
+#define WORKAROUND_FOR_LD_PSEUDO
+
+using std::get;
+using std::map;
+using std::string;
+using std::tuple;
+using std::vector;
+using namespace llvm;
+using DWARFLineTable = DWARFDebugLine::LineTable;
+
+// Workaround (see WORKAROUND_FOR_LD_PSEUDO above): the 0x18 ld_pseudo
+// instruction occupies 16 bytes, but pre-6.0 LLVM disassemblers misreport
+// its size, so detect it from the first two bytes and force Size to 16.
+// The byte1 nibble test presumably distinguishes the ld_pseudo encoding
+// depending on module endianness — TODO confirm against the BPF ISA.
+void SourceDebugger::adjustInstSize(uint64_t &Size, uint8_t byte0,
+                                    uint8_t byte1) {
+#ifdef WORKAROUND_FOR_LD_PSEUDO
+  bool isLittleEndian = mod_->getDataLayout().isLittleEndian();
+  if (byte0 == 0x18 && ((isLittleEndian && (byte1 & 0xf) == 0x1) ||
+                        (!isLittleEndian && (byte1 & 0xf0) == 0x10)))
+    Size = 16;
+#endif
+}
+
+// Split the module source (mod_src_) into individual lines.  Handles '\n',
+// "\r\n", and a final line with no terminator; line terminators are not
+// stored.  Source line N ends up at index N - 1 of the returned vector.
+vector<string> SourceDebugger::buildLineCache() {
+  vector<string> LineCache;
+  size_t FileBufSize = mod_src_.size();
+
+  for (uint32_t start = 0, end = start; end < FileBufSize; end++)
+    if (mod_src_[end] == '\n' || end == FileBufSize - 1 ||
+        (mod_src_[end] == '\r' && mod_src_[end + 1] == '\n')) {
+      // Not including the endline
+      LineCache.push_back(string(mod_src_.substr(start, end - start)));
+      // Skip the '\n' of a "\r\n" pair so it does not produce an empty line.
+      if (mod_src_[end] == '\r')
+        end++;
+      start = end + 1;
+    }
+
+  return LineCache;
+}
+
+// Print source line `Line` of `FileName` (via LineCache, 1-based) to `os`,
+// tracking CurrentSrcLine so a run of instructions on the same line prints
+// it only once.  Lines from files other than the module source are skipped.
+//
+// Fixes vs. original: (1) `Line <= LineCache.size()` — LineCache[Line - 1]
+// is valid for Line == size(), so the last source line was never printed;
+// (2) PRIu32 matches the uint32_t `Line` vararg (PRIu64 read 64 bits from a
+// 32-bit argument, which is undefined behavior in variadic calls).
+void SourceDebugger::dumpSrcLine(const vector<string> &LineCache,
+                                 const string &FileName, uint32_t Line,
+                                 uint32_t &CurrentSrcLine,
+                                 llvm::raw_ostream &os) {
+  if (Line != 0 && Line != CurrentSrcLine && Line <= LineCache.size() &&
+      FileName == mod_->getSourceFileName()) {
+    os << "; " << StringRef(LineCache[Line - 1]).ltrim()
+       << format(
+              " // Line"
+              "%4" PRIu32 "\n",
+              Line);
+    CurrentSrcLine = Line;
+  }
+}
+
+// Copy every ELF section whose name starts with ".debug" into a StringMap
+// keyed by the section name with its leading '.' stripped, which is the
+// form DWARFContext::create consumes below.
+void SourceDebugger::getDebugSections(
+    StringMap<std::unique_ptr<MemoryBuffer>> &DebugSections) {
+  for (auto section : sections_) {
+    if (strncmp(section.first.c_str(), ".debug", 6) == 0) {
+      StringRef SecData(reinterpret_cast<const char *>(get<0>(section.second)),
+                        get<1>(section.second));
+      DebugSections[section.first.substr(1)] =
+          MemoryBuffer::getMemBufferCopy(SecData);
+    }
+  }
+}
+
+// Disassemble every function section (those named fn_prefix_ + <func>) with
+// source lines interleaved from the module's DWARF line table.  The
+// annotated listing is written to stderr and also cached per function name
+// in src_dbg_fmap_.  Each MC component is checked and the whole dump is
+// abandoned (with a message) if any is unavailable.
+void SourceDebugger::dump() {
+  string Error;
+  string TripleStr(mod_->getTargetTriple());
+  Triple TheTriple(TripleStr);
+  const Target *T = TargetRegistry::lookupTarget(TripleStr, Error);
+  if (!T) {
+    errs() << "Debug Error: cannot get target\n";
+    return;
+  }
+
+  std::unique_ptr<MCRegisterInfo> MRI(T->createMCRegInfo(TripleStr));
+  if (!MRI) {
+    errs() << "Debug Error: cannot get register info\n";
+    return;
+  }
+  std::unique_ptr<MCAsmInfo> MAI(T->createMCAsmInfo(*MRI, TripleStr));
+  if (!MAI) {
+    errs() << "Debug Error: cannot get assembly info\n";
+    return;
+  }
+
+  MCObjectFileInfo MOFI;
+  MCContext Ctx(MAI.get(), MRI.get(), &MOFI, nullptr);
+  MOFI.InitMCObjectFileInfo(TheTriple, false, Ctx, false);
+  std::unique_ptr<MCSubtargetInfo> STI(
+      T->createMCSubtargetInfo(TripleStr, "", ""));
+
+  std::unique_ptr<MCInstrInfo> MCII(T->createMCInstrInfo());
+  MCInstPrinter *IP = T->createMCInstPrinter(TheTriple, 0, *MAI, *MCII, *MRI);
+  if (!IP) {
+    errs() << "Debug Error: unable to create instruction printer\n";
+    return;
+  }
+
+  std::unique_ptr<const MCDisassembler> DisAsm(
+      T->createMCDisassembler(*STI, Ctx));
+  if (!DisAsm) {
+    errs() << "Debug Error: no disassembler\n";
+    return;
+  }
+
+  // Set up the dwarf debug context
+  StringMap<std::unique_ptr<MemoryBuffer>> DebugSections;
+  getDebugSections(DebugSections);
+  std::unique_ptr<DWARFContext> DwarfCtx =
+      DWARFContext::create(DebugSections, 8);
+  if (!DwarfCtx) {
+    errs() << "Debug Error: dwarf context creation failed\n";
+    return;
+  }
+
+  // bcc has only one compilation unit
+  // getCompileUnitAtIndex() was gone in llvm 8.0 (https://reviews.llvm.org/D49741)
+#if LLVM_MAJOR_VERSION >= 8
+  DWARFCompileUnit *CU = cast<DWARFCompileUnit>(DwarfCtx->getUnitAtIndex(0));
+#else
+  DWARFCompileUnit *CU = DwarfCtx->getCompileUnitAtIndex(0);
+#endif
+  if (!CU) {
+    errs() << "Debug Error: dwarf context failed to get compile unit\n";
+    return;
+  }
+
+  const DWARFLineTable *LineTable = DwarfCtx->getLineTableForUnit(CU);
+  if (!LineTable) {
+    errs() << "Debug Error: dwarf context failed to get line table\n";
+    return;
+  }
+
+  // Build LineCache for later source code printing
+  vector<string> LineCache = buildLineCache();
+
+  // Start to disassemble with source code annotation section by section
+  for (auto section : sections_)
+    if (!strncmp(fn_prefix_.c_str(), section.first.c_str(),
+                 fn_prefix_.size())) {
+      MCDisassembler::DecodeStatus S;
+      MCInst Inst;
+      uint64_t Size;
+      uint8_t *FuncStart = get<0>(section.second);
+      uint64_t FuncSize = get<1>(section.second);
+      ArrayRef<uint8_t> Data(FuncStart, FuncSize);
+      uint32_t CurrentSrcLine = 0;
+      // Section name is fn_prefix_ + function name; strip the prefix.
+      string func_name = section.first.substr(fn_prefix_.size());
+
+      errs() << "Disassembly of section " << section.first << ":\n"
+             << func_name << ":\n";
+
+      // Accumulate the annotated listing into a string so it can be both
+      // printed and stored in src_dbg_fmap_.
+      string src_dbg_str;
+      llvm::raw_string_ostream os(src_dbg_str);
+      for (uint64_t Index = 0; Index < FuncSize; Index += Size) {
+        S = DisAsm->getInstruction(Inst, Size, Data.slice(Index), Index,
+                                   nulls(), nulls());
+        if (S != MCDisassembler::Success) {
+          os << "Debug Error: disassembler failed: " << std::to_string(S)
+             << '\n';
+          break;
+        } else {
+          DILineInfo LineInfo;
+          LineTable->getFileLineInfoForAddress(
+              (uint64_t)FuncStart + Index, CU->getCompilationDir(),
+              DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
+              LineInfo);
+
+          // ld_pseudo size fix-up; see adjustInstSize.
+          adjustInstSize(Size, Data[Index], Data[Index + 1]);
+          dumpSrcLine(LineCache, LineInfo.FileName, LineInfo.Line,
+                      CurrentSrcLine, os);
+          // Index >> 3: instruction slot number in 8-byte units.
+          os << format("%4" PRIu64 ":", Index >> 3) << '\t';
+          dumpBytes(Data.slice(Index, Size), os);
+          IP->printInst(&Inst, os, "", *STI);
+          os << '\n';
+        }
+      }
+      os.flush();
+      errs() << src_dbg_str << '\n';
+      src_dbg_fmap_[func_name] = src_dbg_str;
+    }
+}
+
+}  // namespace ebpf
diff --git a/src/cc/bcc_debug.h b/src/cc/bcc_debug.h
new file mode 100644
index 0000000..9b195be
--- /dev/null
+++ b/src/cc/bcc_debug.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+namespace ebpf {
+
+// Produces a per-function, source-annotated disassembly of a compiled BPF
+// module.  Inputs (module, section map, function-section prefix, module
+// source text) are borrowed by reference; results are written into the
+// caller-owned src_dbg_fmap (function name -> annotated listing).
+class SourceDebugger {
+ public:
+  SourceDebugger(
+      llvm::Module *mod,
+      std::map<std::string, std::tuple<uint8_t *, uintptr_t>> &sections,
+      const std::string &fn_prefix, const std::string &mod_src,
+      std::map<std::string, std::string> &src_dbg_fmap)
+      : mod_(mod),
+        sections_(sections),
+        fn_prefix_(fn_prefix),
+        mod_src_(mod_src),
+        src_dbg_fmap_(src_dbg_fmap) {}
+// Only support dump for llvm 6.x and later.
+//
+// The llvm 5.x, but not earlier versions, also supports create
+// a dwarf context for source debugging based
+// on a set of in-memory sections with slightly different interfaces.
+// FIXME: possibly to support 5.x
+//
+#if LLVM_MAJOR_VERSION >= 6
+  void dump();
+
+ private:
+  void adjustInstSize(uint64_t &Size, uint8_t byte0, uint8_t byte1);
+  std::vector<std::string> buildLineCache();
+  void dumpSrcLine(const std::vector<std::string> &LineCache,
+                   const std::string &FileName, uint32_t Line,
+                   uint32_t &CurrentSrcLine, llvm::raw_ostream &os);
+  void getDebugSections(
+      llvm::StringMap<std::unique_ptr<llvm::MemoryBuffer>> &DebugSections);
+#else
+  // No-op on pre-6.0 LLVM (see comment above).
+  void dump() {
+  }
+#endif
+
+ private:
+  llvm::Module *mod_;
+  // Section name -> (data pointer, size); borrowed, not owned.
+  const std::map<std::string, std::tuple<uint8_t *, uintptr_t>> &sections_;
+  const std::string &fn_prefix_;
+  const std::string &mod_src_;
+  std::map<std::string, std::string> &src_dbg_fmap_;
+};
+
+}  // namespace ebpf
diff --git a/src/cc/bcc_elf.c b/src/cc/bcc_elf.c
new file mode 100644
index 0000000..c425db6
--- /dev/null
+++ b/src/cc/bcc_elf.c
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <string.h>
+#include <libgen.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+
+#include <gelf.h>
+#include "bcc_elf.h"
+#include "bcc_proc.h"
+#include "bcc_syms.h"
+
+#define NT_STAPSDT 3
+#define ELF_ST_TYPE(x) (((uint32_t) x) & 0xf)
+
+/* Initialize libelf's version handshake and begin a read-only ELF session
+ * on `fd`.  On success *elf_out holds the session (caller must elf_end it)
+ * and 0 is returned; on any failure *elf_out may be NULL and -1 is
+ * returned. */
+static int openelf_fd(int fd, Elf **elf_out) {
+  if (elf_version(EV_CURRENT) == EV_NONE)
+    return -1;
+
+  *elf_out = elf_begin(fd, ELF_C_READ, 0);
+  return (*elf_out == NULL) ? -1 : 0;
+}
+
+/* Open `path` and start an ELF read session on it.  On success the caller
+ * owns both *elf_out (release with elf_end) and *fd_out (close).  Returns
+ * 0 on success, -1 on failure (the fd is closed on the error path). */
+static int openelf(const char *path, Elf **elf_out, int *fd_out) {
+  *fd_out = open(path, O_RDONLY);
+  if (*fd_out < 0)
+    return -1;
+
+  if (openelf_fd(*fd_out, elf_out) == -1) {
+    close(*fd_out);
+    return -1;
+  }
+
+  return 0;
+}
+
+/* Decode one stapsdt note descriptor into `probe`: three addresses (pc,
+ * base address, semaphore address; 4 bytes each for ELFCLASS32, 8 bytes
+ * for 64-bit) followed by three NUL-terminated strings (provider, name,
+ * argument format).  The string pointers alias the note buffer — they stay
+ * valid only while the Elf data does.  Returns the position just past the
+ * parsed data; callers compare it to the note's end to validate. */
+static const char *parse_stapsdt_note(struct bcc_elf_usdt *probe,
+                                      const char *desc, int elf_class) {
+  if (elf_class == ELFCLASS32) {
+    probe->pc = *((uint32_t *)(desc));
+    probe->base_addr = *((uint32_t *)(desc + 4));
+    probe->semaphore = *((uint32_t *)(desc + 8));
+    desc = desc + 12;
+  } else {
+    probe->pc = *((uint64_t *)(desc));
+    probe->base_addr = *((uint64_t *)(desc + 8));
+    probe->semaphore = *((uint64_t *)(desc + 16));
+    desc = desc + 24;
+  }
+
+  probe->provider = desc;
+  desc += strlen(desc) + 1;
+
+  probe->name = desc;
+  desc += strlen(desc) + 1;
+
+  probe->arg_fmt = desc;
+  desc += strlen(desc) + 1;
+
+  return desc;
+}
+
+/* Scan one SHT_NOTE section for NT_STAPSDT notes named "stapsdt" and invoke
+ * `callback` for each note that parses to exactly its descriptor length.
+ * Probes whose address precedes the first executable byte are reported to
+ * stderr and skipped.  Always returns 0.
+ *
+ * Fix vs. original: the warning used "0x%lx" for the probe address, which
+ * truncates/misreads a 64-bit value on hosts where long is 32 bits; cast to
+ * uint64_t and use PRIx64 instead (needs <inttypes.h>). */
+static int do_note_segment(Elf_Scn *section, int elf_class,
+                           bcc_elf_probecb callback, const char *binpath,
+                           uint64_t first_inst_offset, void *payload) {
+  Elf_Data *data = NULL;
+
+  while ((data = elf_getdata(section, data)) != 0) {
+    size_t offset = 0;
+    GElf_Nhdr hdr;
+    size_t name_off, desc_off;
+
+    while ((offset = gelf_getnote(data, offset, &hdr, &name_off, &desc_off)) !=
+           0) {
+      const char *desc, *desc_end;
+      struct bcc_elf_usdt probe;
+
+      if (hdr.n_type != NT_STAPSDT)
+        continue;
+
+      /* Note name must be exactly "stapsdt" (7 chars + NUL). */
+      if (hdr.n_namesz != 8)
+        continue;
+
+      if (memcmp((const char *)data->d_buf + name_off, "stapsdt", 8) != 0)
+        continue;
+
+      desc = (const char *)data->d_buf + desc_off;
+      desc_end = desc + hdr.n_descsz;
+
+      if (parse_stapsdt_note(&probe, desc, elf_class) == desc_end) {
+        if (probe.pc < first_inst_offset)
+          fprintf(stderr,
+                  "WARNING: invalid address 0x%" PRIx64
+                  " for probe (%s,%s) in binary %s\n",
+                  (uint64_t)probe.pc, probe.provider, probe.name, binpath);
+        else
+          callback(binpath, &probe, payload);
+      }
+    }
+  }
+  return 0;
+}
+
+/* Walk the ELF's sections and invoke `callback` for every USDT probe found
+ * in a ".note.stapsdt" section.  Returns 0 on success, -1 on error. */
+static int listprobes(Elf *e, bcc_elf_probecb callback, const char *binpath,
+                      void *payload) {
+  Elf_Scn *section = NULL;
+  size_t stridx;
+  int elf_class = gelf_getclass(e);
+  uint64_t first_inst_offset = 0;
+
+  if (elf_getshdrstrndx(e, &stridx) != 0)
+    return -1;
+
+  // Get the offset to the first instruction
+  while ((section = elf_nextscn(e, section)) != 0) {
+    GElf_Shdr header;
+
+    if (!gelf_getshdr(section, &header))
+      continue;
+
+    // The elf file section layout is based on increasing virtual address,
+    // getting the first section with SHF_EXECINSTR is enough.
+    if (header.sh_flags & SHF_EXECINSTR) {
+      first_inst_offset = header.sh_addr;
+      break;
+    }
+  }
+
+  // NOTE(review): this loop resumes from the first executable section
+  // instead of rescanning from the start, so it relies on .note.stapsdt
+  // appearing after that section in section-header order — TODO confirm
+  // this holds for all producers.
+  while ((section = elf_nextscn(e, section)) != 0) {
+    GElf_Shdr header;
+    char *name;
+
+    if (!gelf_getshdr(section, &header))
+      continue;
+
+    if (header.sh_type != SHT_NOTE)
+      continue;
+
+    name = elf_strptr(e, stridx, header.sh_name);
+    if (name && !strcmp(name, ".note.stapsdt")) {
+      if (do_note_segment(section, elf_class, callback, binpath,
+                          first_inst_offset, payload) < 0)
+        return -1;
+    }
+  }
+
+  return 0;
+}
+
+/* Public entry point: open the binary at `path` and invoke `callback` for
+ * each USDT probe it declares.  Returns listprobes' result, or -1 if the
+ * file cannot be opened as ELF.  The Elf handle and fd are always released
+ * before returning. */
+int bcc_elf_foreach_usdt(const char *path, bcc_elf_probecb callback,
+                         void *payload) {
+  Elf *e;
+  int fd, res;
+
+  if (openelf(path, &e, &fd) < 0)
+    return -1;
+
+  res = listprobes(e, callback, path, payload);
+  elf_end(e);
+  close(fd);
+
+  return res;
+}
+
+/* Enumerate the symbols of one symtab/dynsym section, invoking `callback`
+ * for each symbol that has a non-empty name, a non-zero value, and a type
+ * enabled in option->use_symbol_type (a bitmask over ELF st_type values).
+ * Returns 0 when done, -1 if the section size is not a multiple of the
+ * symbol entry size, or 1 if the callback requested termination. */
+static int list_in_scn(Elf *e, Elf_Scn *section, size_t stridx, size_t symsize,
+                       struct bcc_symbol_option *option,
+                       bcc_elf_symcb callback, void *payload) {
+  Elf_Data *data = NULL;
+
+  while ((data = elf_getdata(section, data)) != 0) {
+    size_t i, symcount = data->d_size / symsize;
+
+    if (data->d_size % symsize)
+      return -1;
+
+    for (i = 0; i < symcount; ++i) {
+      GElf_Sym sym;
+      const char *name;
+
+      if (!gelf_getsym(data, (int)i, &sym))
+        continue;
+
+      if ((name = elf_strptr(e, stridx, sym.st_name)) == NULL)
+        continue;
+      if (name[0] == 0)
+        continue;
+
+      if (sym.st_value == 0)
+        continue;
+
+      uint32_t st_type = ELF_ST_TYPE(sym.st_info);
+      if (!(option->use_symbol_type & (1 << st_type)))
+        continue;
+
+      if (callback(name, sym.st_value, sym.st_size, payload) < 0)
+        return 1;      // signal termination to caller
+    }
+  }
+
+  return 0;
+}
+
+/* Walk every SHT_SYMTAB / SHT_DYNSYM section and list its symbols through
+ * list_in_scn.  A return of 1 from list_in_scn (callback-requested stop)
+ * ends the walk but still yields 0; a negative return propagates. */
+static int listsymbols(Elf *e, bcc_elf_symcb callback, void *payload,
+                       struct bcc_symbol_option *option) {
+  Elf_Scn *section = NULL;
+
+  while ((section = elf_nextscn(e, section)) != 0) {
+    GElf_Shdr header;
+
+    if (!gelf_getshdr(section, &header))
+      continue;
+
+    if (header.sh_type != SHT_SYMTAB && header.sh_type != SHT_DYNSYM)
+      continue;
+
+    // sh_link is the index of the associated string table.
+    int rc = list_in_scn(e, section, header.sh_link, header.sh_entsize,
+                         option, callback, payload);
+    if (rc == 1)
+      break;    // callback signaled termination
+
+    if (rc < 0)
+      return rc;
+  }
+
+  return 0;
+}
+
+/* Return the first data descriptor of the section named `section_name`,
+ * or NULL if the section (or the section-header string table) cannot be
+ * found.  The returned data belongs to the Elf handle. */
+static Elf_Data * get_section_elf_data(Elf *e, const char *section_name) {
+  Elf_Scn *section = NULL;
+  GElf_Shdr header;
+  char *name;
+
+  size_t stridx;
+  if (elf_getshdrstrndx(e, &stridx) != 0)
+    return NULL;
+
+  while ((section = elf_nextscn(e, section)) != 0) {
+    if (!gelf_getshdr(section, &header))
+      continue;
+
+    name = elf_strptr(e, stridx, header.sh_name);
+    if (name && !strcmp(name, section_name)) {
+      return elf_getdata(section, NULL);
+    }
+  }
+
+  return NULL;
+}
+
+/* Extract the separate-debug-file name and its CRC32 from the
+ * ".gnu_debuglink" section (NUL-terminated filename at the start, 4-byte
+ * CRC at the very end — see the GDB separate-debug-files format).  The
+ * returned pointer aliases the Elf data.  Returns 1 when found, else 0. */
+static int find_debuglink(Elf *e, char **debug_file, unsigned int *crc) {
+  Elf_Data *data = NULL;
+
+  *debug_file = NULL;
+  *crc = 0;
+
+  // Minimum plausible size: 1-char name + NUL + 4-byte CRC = 6 bytes.
+  data = get_section_elf_data(e, ".gnu_debuglink");
+  if (!data || data->d_size <= 5)
+    return 0;
+
+  *debug_file = (char *)data->d_buf;
+  *crc = *(unsigned int*)((char *)data->d_buf + data->d_size - 4);
+
+  return *debug_file ? 1 : 0;
+}
+
+/* Render the GNU build id from ".note.gnu.build-id" as lowercase hex into
+ * `buildid`.  The note layout used here: the name "GNU" at offset 12 and
+ * the id bytes from offset 16 to the end of the section.  NOTE(review):
+ * the caller must supply a buffer of at least 2 * (d_size - 16) chars —
+ * and sprintf writes a trailing NUL — TODO confirm callers size it so.
+ * Returns 1 when a build id was written, else 0. */
+static int find_buildid(Elf *e, char *buildid) {
+  Elf_Data *data = get_section_elf_data(e, ".note.gnu.build-id");
+  if (!data || data->d_size <= 16 || strcmp((char *)data->d_buf + 12, "GNU"))
+    return 0;
+
+  char *buf = (char *)data->d_buf + 16;
+  size_t length = data->d_size - 16;
+  size_t i = 0;
+  for (i = 0; i < length; ++i) {
+    sprintf(buildid + (i * 2), "%02hhx", buf[i]);
+  }
+
+  return 1;
+}
+
// The CRC algorithm used by GNU debuglink. Taken from:
//    https://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html
// Computes the CRC-32 of buf[0..len) with initial value 'crc' (pass 0 for a
// fresh checksum); standard table-driven, reflected CRC-32 implementation.
static unsigned int gnu_debuglink_crc32(unsigned int crc,
                                        char *buf, size_t len) {
  // Precomputed table for the reflected polynomial 0xedb88320 (as published
  // in the GDB documentation; do not modify).
  static const unsigned int crc32_table[256] =
  {
    0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
    0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
    0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
    0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
    0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
    0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
    0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
    0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
    0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
    0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
    0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
    0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
    0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
    0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
    0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
    0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
    0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
    0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
    0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
    0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
    0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
    0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
    0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
    0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
    0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
    0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
    0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
    0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
    0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
    0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
    0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
    0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
    0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
    0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
    0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
    0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
    0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
    0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
    0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
    0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
    0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
    0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
    0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
    0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
    0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
    0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
    0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
    0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
    0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
    0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
    0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
    0x2d02ef8d
  };
  char *end;

  // Pre- and post-invert, masked to 32 bits for platforms where
  // unsigned int could be wider.
  crc = ~crc & 0xffffffff;
  for (end = buf + len; buf < end; ++buf)
    crc = crc32_table[(crc ^ *buf) & 0xff] ^ (crc >> 8);
  return ~crc & 0xffffffff;
}
+
// Check whether 'file' matches the GNU debuglink CRC-32 'crc'.
// Returns 1 when the file can be read and the checksum matches, 0 otherwise.
static int verify_checksum(const char *file, unsigned int crc) {
  struct stat st;
  int fd;
  void *buf;
  unsigned int actual;

  fd = open(file, O_RDONLY);
  if (fd < 0)
    return 0;

  // mmap() fails with EINVAL for a zero-length mapping, so treat an empty
  // file as a mismatch up front.
  if (fstat(fd, &st) < 0 || st.st_size == 0) {
    close(fd);
    return 0;
  }

  buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
  // mmap() reports failure with MAP_FAILED ((void *)-1), not NULL; the old
  // `if (!buf)` check let errors through and could crash in the CRC loop.
  if (buf == MAP_FAILED) {
    close(fd);
    return 0;
  }

  actual = gnu_debuglink_crc32(0, buf, st.st_size);

  munmap(buf, st.st_size);
  close(fd);
  return actual == crc;
}
+
+static char *find_debug_via_debuglink(Elf *e, const char *binpath,
+                                      int check_crc) {
+  char fullpath[PATH_MAX];
+  char *bindir = NULL;
+  char *res = NULL;
+  unsigned int crc;
+  char *name;  // the name of the debuginfo file
+
+  if (!find_debuglink(e, &name, &crc))
+    return NULL;
+
+  bindir = strdup(binpath);
+  bindir = dirname(bindir);
+
+  // Search for the file in 'binpath', but ignore the file we find if it
+  // matches the binary itself: the binary will always be probed later on,
+  // and it might contain poorer symbols (e.g. stripped or partial symbols)
+  // than the external debuginfo that might be available elsewhere.
+  snprintf(fullpath, sizeof(fullpath),"%s/%s", bindir, name);
+  if (strcmp(fullpath, binpath) != 0 && access(fullpath, F_OK) != -1) {
+    res = strdup(fullpath);
+    goto DONE;
+  }
+
+  // Search for the file in 'binpath'/.debug
+  snprintf(fullpath, sizeof(fullpath), "%s/.debug/%s", bindir, name);
+  if (access(fullpath, F_OK) != -1) {
+    res = strdup(fullpath);
+    goto DONE;
+  }
+
+  // Search for the file in the global debug directory /usr/lib/debug/'binpath'
+  snprintf(fullpath, sizeof(fullpath), "/usr/lib/debug%s/%s", bindir, name);
+  if (access(fullpath, F_OK) != -1) {
+    res = strdup(fullpath);
+    goto DONE;
+  }
+
+DONE:
+  free(bindir);
+  if (res && check_crc && !verify_checksum(res, crc))
+    return NULL;
+  return res;
+}
+
+static char *find_debug_via_buildid(Elf *e) {
+  char fullpath[PATH_MAX];
+  char buildid[128];  // currently 40 seems to be default, let's be safe
+
+  if (!find_buildid(e, buildid))
+    return NULL;
+
+  // Search for the file in the global debug directory with a sub-path:
+  //    mm/nnnnnn...nnnn.debug
+  // Where mm are the first two characters of the buildid, and nnnn are the
+  // rest of the build id, followed by .debug.
+  snprintf(fullpath, sizeof(fullpath), "/usr/lib/debug/.build-id/%c%c/%s.debug",
+          buildid[0], buildid[1], buildid + 2);
+  if (access(fullpath, F_OK) != -1) {
+    return strdup(fullpath);
+  }
+
+  return NULL;
+}
+
// Shared implementation of symbol iteration over an ELF module.
// 'is_debug_file' marks recursive invocations on a separate debuginfo file
// so the debuginfo lookup is not repeated for the debuginfo file itself.
// Returns -1 on error, otherwise the result of listsymbols().
static int foreach_sym_core(const char *path, bcc_elf_symcb callback,
                            struct bcc_symbol_option *option, void *payload,
                            int is_debug_file) {
  Elf *e;
  int fd, res;
  char *debug_file;

  if (!option)
    return -1;

  if (openelf(path, &e, &fd) < 0)
    return -1;

  // If there is a separate debuginfo file, try to locate and read it, first
  // using the build-id section, then using the debuglink section. These are
  // also the rules that GDB follows.
  // See: https://sourceware.org/gdb/onlinedocs/gdb/Separate-Debug-Files.html
  if (option->use_debug_file && !is_debug_file) {
    // The is_debug_file argument helps avoid infinitely resolving debuginfo
    // files for debuginfo files and so on.
    debug_file = find_debug_via_buildid(e);
    if (!debug_file)
      debug_file = find_debug_via_debuglink(e, path,
                                            option->check_debug_file_crc);
    if (debug_file) {
      // The debuginfo file's symbols are reported before (in addition to)
      // the module's own; its failure is deliberately ignored.
      foreach_sym_core(debug_file, callback, option, payload, 1);
      free(debug_file);
    }
  }

  res = listsymbols(e, callback, payload, option);
  elf_end(e);
  close(fd);
  return res;
}
+
+int bcc_elf_foreach_sym(const char *path, bcc_elf_symcb callback,
+                        void *option, void *payload) {
+  return foreach_sym_core(
+      path, callback, (struct bcc_symbol_option*)option, payload, 0);
+}
+
+int bcc_elf_get_text_scn_info(const char *path, uint64_t *addr,
+				   uint64_t *offset) {
+  Elf *e = NULL;
+  int fd = -1, err;
+  Elf_Scn *section = NULL;
+  GElf_Shdr header;
+  size_t stridx;
+  char *name;
+
+  if ((err = openelf(path, &e, &fd)) < 0 ||
+      (err = elf_getshdrstrndx(e, &stridx)) < 0)
+    goto exit;
+
+  err = -1;
+  while ((section = elf_nextscn(e, section)) != 0) {
+    if (!gelf_getshdr(section, &header))
+      continue;
+
+    name = elf_strptr(e, stridx, header.sh_name);
+    if (name && !strcmp(name, ".text")) {
+      *addr = (uint64_t)header.sh_addr;
+      *offset = (uint64_t)header.sh_offset;
+      err = 0;
+      break;
+    }
+  }
+
+exit:
+  if (e)
+    elf_end(e);
+  if (fd >= 0)
+    close(fd);
+  return err;
+}
+
+int bcc_elf_foreach_load_section(const char *path,
+                                 bcc_elf_load_sectioncb callback,
+                                 void *payload) {
+  Elf *e = NULL;
+  int fd = -1, err = -1, res;
+  size_t nhdrs, i;
+
+  if (openelf(path, &e, &fd) < 0)
+    goto exit;
+
+  if (elf_getphdrnum(e, &nhdrs) != 0)
+    goto exit;
+
+  GElf_Phdr header;
+  for (i = 0; i < nhdrs; i++) {
+    if (!gelf_getphdr(e, (int)i, &header))
+      continue;
+    if (header.p_type != PT_LOAD || !(header.p_flags & PF_X))
+      continue;
+    res = callback(header.p_vaddr, header.p_memsz, header.p_offset, payload);
+    if (res < 0) {
+      err = 1;
+      goto exit;
+    }
+  }
+  err = 0;
+
+exit:
+  if (e)
+    elf_end(e);
+  if (fd >= 0)
+    close(fd);
+  return err;
+}
+
+int bcc_elf_get_type(const char *path) {
+  Elf *e;
+  GElf_Ehdr hdr;
+  int fd;
+  void* res = NULL;
+
+  if (openelf(path, &e, &fd) < 0)
+    return -1;
+
+  res = (void*)gelf_getehdr(e, &hdr);
+  elf_end(e);
+  close(fd);
+
+  if (!res)
+    return -1;
+  else
+    return hdr.e_type;
+}
+
// A path counts as an executable when it is a readable ELF file of any type
// and carries the execute permission bit for the current user.
int bcc_elf_is_exe(const char *path) {
  if (bcc_elf_get_type(path) == -1)
    return 0;
  return access(path, X_OK) == 0;
}
+
// Whether the file at 'path' is an ELF shared object (e_type == ET_DYN).
int bcc_elf_is_shared_obj(const char *path) {
  int type = bcc_elf_get_type(path);
  return type == ET_DYN;
}
+
// The kernel lists the vDSO in /proc/<pid>/maps under the pseudo-name
// "[vdso]"; match that name exactly.
int bcc_elf_is_vdso(const char *name) {
  return !strcmp(name, "[vdso]");
}
+
// Cached descriptor of an unlinked temp file holding a copy of the vDSO:
// -2: Failed
// -1: Not initialized
// >0: Initialized
static int vdso_image_fd = -1;

// bcc_procutils_each_module() callback: when the "[vdso]" mapping is found,
// copy the in-memory image [st, en) into an unlinked temporary file and
// cache its descriptor in vdso_image_fd.
// Returns 0 to keep scanning other mappings; returns -1 (stop iteration)
// once the vDSO mapping has been handled, whether or not the copy worked.
static int find_vdso(const char *name, uint64_t st, uint64_t en,
                     uint64_t offset, bool enter_ns, void *payload) {
  int fd;
  char tmpfile[128];
  if (!bcc_elf_is_vdso(name))
    return 0;

  void *image = malloc(en - st);
  if (!image)
    goto on_error;
  // The vDSO is mapped into our own address space, so it can be copied
  // straight out of memory starting at the mapping's begin address.
  memcpy(image, (void *)st, en - st);

  snprintf(tmpfile, sizeof(tmpfile), "/tmp/bcc_%d_vdso_image_XXXXXX", getpid());
  fd = mkostemp(tmpfile, O_CLOEXEC);
  if (fd < 0) {
    fprintf(stderr, "Unable to create temp file: %s\n", strerror(errno));
    goto on_error;
  }
  // Unlink the file to avoid leaking
  if (unlink(tmpfile) == -1)
    fprintf(stderr, "Unlink %s failed: %s\n", tmpfile, strerror(errno));

  if (write(fd, image, en - st) == -1) {
    fprintf(stderr, "Failed to write to vDSO image: %s\n", strerror(errno));
    close(fd);
    goto on_error;
  }
  // Intentionally keep the fd open: it backs all later vDSO symbol lookups.
  vdso_image_fd = fd;

on_error:
  if (image)
    free(image);
  // Always stop the iteration
  return -1;
}
+
+int bcc_elf_foreach_vdso_sym(bcc_elf_symcb callback, void *payload) {
+  Elf *elf;
+  static struct bcc_symbol_option default_option = {
+    .use_debug_file = 0,
+    .check_debug_file_crc = 0,
+    .use_symbol_type = (1 << STT_FUNC) | (1 << STT_GNU_IFUNC)
+  };
+
+  if (vdso_image_fd == -1) {
+    vdso_image_fd = -2;
+    bcc_procutils_each_module(getpid(), &find_vdso, NULL);
+  }
+  if (vdso_image_fd == -2)
+    return -1;
+
+  if (openelf_fd(vdso_image_fd, &elf) == -1)
+    return -1;
+
+  return listsymbols(elf, callback, payload, &default_option);
+}
+
+#if 0
+#include <stdio.h>
+
+int main(int argc, char *argv[])
+{
+  uint64_t addr;
+  if (bcc_elf_findsym(argv[1], argv[2], -1, STT_FUNC, &addr) < 0)
+    return -1;
+
+  printf("%s: %p\n", argv[2], (void *)addr);
+  return 0;
+}
+#endif
diff --git a/src/cc/bcc_elf.h b/src/cc/bcc_elf.h
new file mode 100644
index 0000000..bbe2494
--- /dev/null
+++ b/src/cc/bcc_elf.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef LIBBCC_ELF_H
+#define LIBBCC_ELF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
// Description of a single USDT probe found in an ELF module.
// NOTE(review): field semantics inferred from names and the probe callback
// documented below -- confirm against the USDT note parser.
struct bcc_elf_usdt {
  uint64_t pc;         // probe location (program counter)
  uint64_t base_addr;  // base address the probe location is relative to
  uint64_t semaphore;  // address of the probe's semaphore, if any

  const char *provider;  // provider (namespace) of the probe
  const char *name;      // probe name
  const char *arg_fmt;   // textual description of the probe's arguments
};
+
+// Binary module path, bcc_elf_usdt struct, payload
+typedef void (*bcc_elf_probecb)(const char *, const struct bcc_elf_usdt *,
+                                void *);
+// Symbol name, start address, length, payload
+// Callback returning a negative value indicates to stop the iteration
+typedef int (*bcc_elf_symcb)(const char *, uint64_t, uint64_t, void *);
+// Segment virtual address, memory size, file offset, payload
+// Callback returning a negative value indicates to stop the iteration
+typedef int (*bcc_elf_load_sectioncb)(uint64_t, uint64_t, uint64_t, void *);
+
+// Iterate over all USDT probes noted in a binary module
+// Returns -1 on error, and 0 on success
+int bcc_elf_foreach_usdt(const char *path, bcc_elf_probecb callback,
+                         void *payload);
+// Iterate over all executable load sections of an ELF
+// Returns -1 on error, 1 if stopped by callback, and 0 on success
+int bcc_elf_foreach_load_section(const char *path,
+                                 bcc_elf_load_sectioncb callback,
+                                 void *payload);
+// Iterate over symbol table of a binary module
// Parameter "option" points to a bcc_symbol_option struct to indicate whether
+// and how to use debuginfo file, and what types of symbols to load.
+// Returns -1 on error, and 0 on success or stopped by callback
+int bcc_elf_foreach_sym(const char *path, bcc_elf_symcb callback, void *option,
+                        void *payload);
+// Iterate over all symbols from current system's vDSO
+// Returns -1 on error, and 0 on success or stopped by callback
+int bcc_elf_foreach_vdso_sym(bcc_elf_symcb callback, void *payload);
+
+int bcc_elf_get_text_scn_info(const char *path, uint64_t *addr,
+                              uint64_t *offset);
+
+int bcc_elf_get_type(const char *path);
+int bcc_elf_is_shared_obj(const char *path);
+int bcc_elf_is_exe(const char *path);
+int bcc_elf_is_vdso(const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/cc/bcc_exception.h b/src/cc/bcc_exception.h
new file mode 100644
index 0000000..1f8aee6
--- /dev/null
+++ b/src/cc/bcc_exception.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdio>
+#include <string>
+
+namespace ebpf {
+
// Lightweight status/result value: an integer return code (0 == success by
// the convention of the TRY2 macro below) plus an optional human-readable
// message.
class StatusTuple {
public:
  StatusTuple(int ret) : ret_(ret) {}

  StatusTuple(int ret, const char *msg) : ret_(ret), msg_(msg) {}

  StatusTuple(int ret, const std::string &msg) : ret_(ret), msg_(msg) {}

  // printf-style constructor; the formatted message is truncated to fit a
  // 2048-byte buffer.
  template <typename... Args>
  StatusTuple(int ret, const char *fmt, Args... args) : ret_(ret) {
    char buf[2048];
    snprintf(buf, sizeof(buf), fmt, args...);
    msg_ = std::string(buf);
  }

  // Append additional context to the stored message.
  void append_msg(const std::string& msg) {
    msg_ += msg;
  }

  // Accessors are const so a StatusTuple can be inspected through const
  // references; existing non-const callers are unaffected.
  int code() const { return ret_; }

  std::string msg() const { return msg_; }

private:
  int ret_;
  std::string msg_;
};
+
// Evaluate CMD (an expression yielding a StatusTuple) and propagate any
// failure: when the code is non-zero, return that StatusTuple from the
// enclosing function.
#define TRY2(CMD)              \
  do {                         \
    StatusTuple __stp = (CMD); \
    if (__stp.code() != 0) {   \
      return __stp;            \
    }                          \
  } while (0)
+
+}  // namespace ebpf
diff --git a/src/cc/bcc_perf_map.c b/src/cc/bcc_perf_map.c
new file mode 100644
index 0000000..a86dfe9
--- /dev/null
+++ b/src/cc/bcc_perf_map.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bcc_perf_map.h"
+
// Return true if 'path' ends with the ".map" suffix.
// Note: the previous strstr()-based check matched the *first* ".map"
// occurrence, so a path like "a.map.map" was incorrectly rejected; use a
// proper suffix comparison instead.
bool bcc_is_perf_map(const char *path) {
  size_t len = strlen(path);
  // Path ends with ".map"
  return len >= 4 && strcmp(path + len - 4, ".map") == 0;
}
+
// A usable perf map must both look like one (".map" suffix) and be readable
// by the current process.
bool bcc_is_valid_perf_map(const char *path) {
  if (!bcc_is_perf_map(path))
    return false;
  return access(path, R_OK) == 0;
}
+
// Work out the thread-group id of 'pid' as seen from inside its own PID
// namespace, by parsing /proc/<pid>/status.
// Falls back to 'pid' itself when the fields cannot be parsed, and returns
// -1 when the status file cannot be opened.
int bcc_perf_map_nstgid(int pid) {
  char status_path[64];
  FILE *status;
  char *tab;

  snprintf(status_path, sizeof(status_path), "/proc/%d/status", pid);
  status = fopen(status_path, "r");

  if (!status)
    return -1;

  // return the original PID if we fail to work out the TGID
  int nstgid = pid;

  size_t size = 0;
  char *line = NULL;
  while (getline(&line, &size, status) != -1) {
    // check Tgid line first in case CONFIG_PID_NS is off
    if (strstr(line, "Tgid:") != NULL) {
      tab = strrchr(line, '\t');
      if (tab)  // guard against a malformed line without a '\t' separator
        nstgid = (int)strtol(tab, NULL, 10);
    }
    if (strstr(line, "NStgid:") != NULL) {
      // PID namespaces can be nested -- the last number is the innermost PID
      tab = strrchr(line, '\t');
      if (tab)  // previously dereferenced unchecked: NULL would crash here
        nstgid = (int)strtol(tab, NULL, 10);
    }
  }
  free(line);
  fclose(status);

  return nstgid;
}
+
// Build the perf map path for 'pid' as seen through the *target* process's
// filesystem root: <root>/tmp/perf-<nstgid>.map, where <root> is the
// resolved /proc/<pid>/root link and <nstgid> is the pid inside the
// target's PID namespace. Returns false if the root link cannot be read.
bool bcc_perf_map_path(char *map_path, size_t map_len, int pid) {
  char source[64];
  snprintf(source, sizeof(source), "/proc/%d/root", pid);

  char target[4096];
  ssize_t target_len = readlink(source, target, sizeof(target) - 1);
  if (target_len == -1)
    return false;

  // readlink() does not NUL-terminate.
  target[target_len] = '\0';
  // A non-chroot'ed process resolves to "/"; drop it so the final path does
  // not start with a double slash.
  if (strcmp(target, "/") == 0)
    target[0] = '\0';

  int nstgid = bcc_perf_map_nstgid(pid);

  // NOTE(review): if bcc_perf_map_nstgid() failed, -1 is embedded in the
  // filename; the resulting path simply will not exist -- confirm intended.
  snprintf(map_path, map_len, "%s/tmp/perf-%d.map", target, nstgid);
  return true;
}
+
// Parse a perf map file -- one "<start> <size> <name>" entry per line, with
// start and size in hex -- and invoke 'callback' for each well-formed line.
// Malformed lines are skipped. Returns -1 if the file cannot be opened,
// 0 otherwise.
int bcc_perf_map_foreach_sym(const char *path, bcc_perf_map_symcb callback,
                             void* payload) {
  FILE* file = fopen(path, "r");
  if (!file)
    return -1;

  char *line = NULL;
  size_t size = 0;
  long long begin, len;
  while (getline(&line, &size, file) != -1) {
    char *cursor = line;
    char *newline, *sep;

    // Start address: must be non-zero, in range, and followed by a space.
    // NOTE(review): errno is not reset before strtoull, so the ERANGE test
    // can react to a stale errno from an earlier call -- confirm intended.
    begin = strtoull(cursor, &sep, 16);
    if (begin == 0 || *sep != ' ' || (begin == ULLONG_MAX && errno == ERANGE))
      continue;
    cursor = sep;
    while (*cursor && isspace(*cursor)) cursor++;

    // Size: zero is accepted only when digits were actually consumed.
    len = strtoull(cursor, &sep, 16);
    if (*sep != ' ' ||
        (sep == cursor && len == 0) ||
        (len == ULLONG_MAX && errno == ERANGE))
      continue;
    cursor = sep;
    while (*cursor && isspace(*cursor)) cursor++;

    // The remainder of the line, minus the trailing newline, is the name.
    newline = strchr(cursor, '\n');
    if (newline)
        newline[0] = '\0';

    callback(cursor, begin, len, payload);
  }

  free(line);
  fclose(file);

  return 0;
}
diff --git a/src/cc/bcc_perf_map.h b/src/cc/bcc_perf_map.h
new file mode 100644
index 0000000..137508c
--- /dev/null
+++ b/src/cc/bcc_perf_map.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef LIBBCC_PERF_MAP_H
+#define LIBBCC_PERF_MAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <unistd.h>
+
+// Symbol name, start address, length, payload
+typedef int (*bcc_perf_map_symcb)(const char *, uint64_t, uint64_t, void *);
+
+bool bcc_is_perf_map(const char *path);
+bool bcc_is_valid_perf_map(const char *path);
+
+int bcc_perf_map_nstgid(int pid);
+bool bcc_perf_map_path(char *map_path, size_t map_len, int pid);
+int bcc_perf_map_foreach_sym(const char *path, bcc_perf_map_symcb callback,
+                             void* payload);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/cc/bcc_proc.c b/src/cc/bcc_proc.c
new file mode 100644
index 0000000..d694eb9
--- /dev/null
+++ b/src/cc/bcc_proc.c
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "bcc_perf_map.h"
+#include "bcc_proc.h"
+#include "bcc_elf.h"
+
+#ifdef __x86_64__
+// https://www.kernel.org/doc/Documentation/x86/x86_64/mm.txt
+const unsigned long long kernelAddrSpace = 0x00ffffffffffffff;
+#else
+const unsigned long long kernelAddrSpace = 0x0;
+#endif
+
// Resolve 'binpath' the way a shell would: a name containing '/' is checked
// directly; otherwise each $PATH component is searched for an executable
// ELF file. Returns a malloc'ed path, or NULL (0) when nothing matches.
char *bcc_procutils_which(const char *binpath) {
  char buffer[4096];
  const char *PATH;

  if (strchr(binpath, '/'))
    return bcc_elf_is_exe(binpath) ? strdup(binpath) : 0;

  if (!(PATH = getenv("PATH")))
    return 0;

  while (PATH) {
    // Each component runs up to the next ':' or the end of the string.
    const char *next = strchr(PATH, ':') ?: strchr(PATH, '\0');
    const size_t path_len = next - PATH;

    if (path_len) {
      // Give up (rather than probe a truncated path) if the candidate
      // does not fit in the buffer.
      int ret = snprintf(buffer, sizeof(buffer), "%.*s/%s",
                          (int)path_len, PATH, binpath);
      if (ret < 0 || ret >= sizeof(buffer))
        return 0;

      if (bcc_elf_is_exe(buffer))
        return strdup(buffer);
    }

    PATH = *next ? (next + 1) : 0;
  }

  return 0;
}
+
// Compile-time-sized prefix test (prefix must be a string literal).
#define STARTS_WITH(mapname, prefix) (!strncmp(mapname, prefix, sizeof(prefix)-1))

// Whether a /proc/<pid>/maps entry name refers to an actual file on disk,
// as opposed to anonymous memory, shared memory segments, stack, heap, etc.
int bcc_mapping_is_file_backed(const char *mapname) {
  return mapname[0] && !(
    STARTS_WITH(mapname, "//anon") ||
    STARTS_WITH(mapname, "/dev/zero") ||
    STARTS_WITH(mapname, "/anon_hugepage") ||
    STARTS_WITH(mapname, "[stack") ||
    STARTS_WITH(mapname, "/SYSV") ||
    STARTS_WITH(mapname, "[heap]") ||
    STARTS_WITH(mapname, "[vsyscall]"));
}
+
// Iterate over the executable, file-backed mappings of process 'pid', then
// offer the process's potential perf map files to the callback as well.
// Returns -1 only when /proc/<pid>/maps cannot be opened; 0 otherwise.
int bcc_procutils_each_module(int pid, bcc_procutils_modulecb callback,
                              void *payload) {
  char procmap_filename[128];
  FILE *procmap;
  snprintf(procmap_filename, sizeof(procmap_filename), "/proc/%ld/maps",
           (long)pid);
  procmap = fopen(procmap_filename, "r");
  if (!procmap)
    return -1;

  char buf[PATH_MAX + 1], perm[5], dev[8];
  char *name;
  uint64_t begin, end, inode;
  unsigned long long offset;
  while (true) {
    buf[0] = '\0';
    // From fs/proc/task_mmu.c:show_map_vma
    // NOTE(review): "%lx" assumes unsigned long is 64-bit (matches uint64_t
    // on LP64 targets only) -- not portable to 32-bit builds.
    if (fscanf(procmap, "%lx-%lx %s %llx %s %lu%[^\n]", &begin, &end, perm,
               &offset, dev, &inode, buf) != 7)
      break;

    // Only executable mappings are reported.
    if (perm[2] != 'x')
      continue;

    name = buf;
    while (isspace(*name))
      name++;
    if (!bcc_mapping_is_file_backed(name))
      continue;

    // A negative callback return stops the maps iteration; the perf map
    // candidates below are still offered afterwards.
    if (callback(name, begin, end, (uint64_t)offset, true, payload) < 0)
      break;
  }

  fclose(procmap);

  // An address mapping for the entire address space may be in
  // /tmp/perf-<PID>.map. This will be used if symbols aren't resolved in
  // an earlier mapping.
  char map_path[4096];
  // Try the perf-<PID>.map path with the process's mount namespace, chroot
  // and NSPID, in case it is generated by the process itself.
  if (bcc_perf_map_path(map_path, sizeof(map_path), pid))
    if (callback(map_path, 0, -1, 0, true, payload) < 0)
      return 0;
  // Try the perf-<PID>.map path with the global root and PID, in case it is
  // generated by another process. Avoid checking mount namespace for this.
  int res = snprintf(map_path, 4096, "/tmp/perf-%d.map", pid);
  if (res > 0 && res < 4096)
    if (callback(map_path, 0, -1, 0, false, payload) < 0)
      return 0;

  return 0;
}
+
// Iterate over the non-data kernel symbols listed in /proc/kallsyms.
// Returns -1 when not running as root (addresses read as zero otherwise)
// or when the file cannot be opened; 0 on completion.
int bcc_procutils_each_ksym(bcc_procutils_ksymcb callback, void *payload) {
  char line[2048];
  char *symname, *endsym;
  FILE *kallsyms;
  unsigned long long addr;

  /* root is needed to list ksym addresses */
  if (geteuid() != 0)
    return -1;

  kallsyms = fopen("/proc/kallsyms", "r");
  if (!kallsyms)
    return -1;

  // Each line looks like "<addr> <type> <name> [module]".
  while (fgets(line, sizeof(line), kallsyms)) {
    addr = strtoull(line, &symname, 16);
    if (addr == 0 || addr == ULLONG_MAX)
      continue;
    // Skip symbols whose address lies below kernelAddrSpace (the user-space
    // range on x86-64; see the constant's definition above).
    if (addr < kernelAddrSpace)
      continue;

    symname++;
    // Ignore data symbols
    if (*symname == 'b' || *symname == 'B' || *symname == 'd' ||
        *symname == 'D' || *symname == 'r' || *symname =='R')
      continue;

    // Skip the type character and the following space, then terminate the
    // symbol name at the first whitespace.
    endsym = (symname = symname + 2);
    while (*endsym && !isspace(*endsym)) endsym++;
    *endsym = '\0';

    callback(symname, addr, payload);
  }

  fclose(kallsyms);
  return 0;
}
+
// /etc/ld.so.cache exists in two historical on-disk formats: the old
// "ld.so-1.7.0" layout and the newer glibc "glibc-ld.so.cache" 1.1 layout.
// A new-format cache may be embedded after an old-format one (see
// load_ld_cache below).
#define CACHE1_HEADER "ld.so-1.7.0"
#define CACHE1_HEADER_LEN (sizeof(CACHE1_HEADER) - 1)

#define CACHE2_HEADER "glibc-ld.so.cache"
#define CACHE2_HEADER_LEN (sizeof(CACHE2_HEADER) - 1)
#define CACHE2_VERSION "1.1"

// Old-format entry: key/value are offsets into the string table that
// follows the entry array (see read_cache1).
struct ld_cache1_entry {
  int32_t flags;
  uint32_t key;
  uint32_t value;
};

struct ld_cache1 {
  char header[CACHE1_HEADER_LEN];
  uint32_t entry_count;
  struct ld_cache1_entry entries[0];  // zero-length (flexible) array
};

// New-format entry: key/value are offsets from the start of the file
// (see read_cache2).
struct ld_cache2_entry {
  int32_t flags;
  uint32_t key;
  uint32_t value;
  uint32_t pad1_;
  uint64_t pad2_;
};

struct ld_cache2 {
  char header[CACHE2_HEADER_LEN];
  char version[3];
  uint32_t entry_count;
  uint32_t string_table_len;
  uint32_t pad_[5];
  struct ld_cache2_entry entries[0];
};

// Lazily populated, process-wide copy of the parsed cache.
// lib_cache_count is 0 before loading and -1 after a failed load attempt.
static int lib_cache_count;
static struct ld_lib {
  char *libname;
  char *path;
  int flags;
} * lib_cache;
+
// Parse an old-format ("ld.so-1.7.0") cache image into lib_cache.
// The string table immediately follows the entry array; each entry's
// key/value are offsets into that table. Always returns 0.
// NOTE(review): the malloc() result is used unchecked -- confirm whether
// OOM handling is expected here.
static int read_cache1(const char *ld_map) {
  struct ld_cache1 *ldcache = (struct ld_cache1 *)ld_map;
  const char *ldstrings =
      (const char *)(ldcache->entries + ldcache->entry_count);
  uint32_t i;

  lib_cache =
      (struct ld_lib *)malloc(ldcache->entry_count * sizeof(struct ld_lib));
  lib_cache_count = (int)ldcache->entry_count;

  for (i = 0; i < ldcache->entry_count; ++i) {
    const char *key = ldstrings + ldcache->entries[i].key;  // library name
    const char *val = ldstrings + ldcache->entries[i].value; // library path
    const int flags = ldcache->entries[i].flags;

    // Strings are duplicated because ld_map is unmapped by the caller.
    lib_cache[i].libname = strdup(key);
    lib_cache[i].path = strdup(val);
    lib_cache[i].flags = flags;
  }
  return 0;
}
+
// Parse a new-format ("glibc-ld.so.cache") cache image into lib_cache.
// Entry key/value are offsets from the start of the image. Returns -1 when
// the header does not match, 0 otherwise.
// NOTE(review): the malloc() result is used unchecked, as in read_cache1.
static int read_cache2(const char *ld_map) {
  struct ld_cache2 *ldcache = (struct ld_cache2 *)ld_map;
  uint32_t i;

  if (memcmp(ld_map, CACHE2_HEADER, CACHE2_HEADER_LEN))
    return -1;

  lib_cache =
      (struct ld_lib *)malloc(ldcache->entry_count * sizeof(struct ld_lib));
  lib_cache_count = (int)ldcache->entry_count;

  for (i = 0; i < ldcache->entry_count; ++i) {
    const char *key = ld_map + ldcache->entries[i].key;   // library name
    const char *val = ld_map + ldcache->entries[i].value; // library path
    const int flags = ldcache->entries[i].flags;

    // Strings are duplicated because ld_map is unmapped by the caller.
    lib_cache[i].libname = strdup(key);
    lib_cache[i].path = strdup(val);
    lib_cache[i].flags = flags;
  }
  return 0;
}
+
// mmap the ld.so cache at 'cache_path' and populate lib_cache with
// whichever format is detected. When a new-format cache is embedded after
// an old-format one, the new format is preferred.
// Returns 0 on success, -1 on failure.
static int load_ld_cache(const char *cache_path) {
  struct stat st;
  size_t ld_size;
  const char *ld_map;
  int ret, fd = open(cache_path, O_RDONLY);

  if (fd < 0)
    return -1;

  // Reject files too small to contain even an old-format header.
  if (fstat(fd, &st) < 0 || st.st_size < sizeof(struct ld_cache1)) {
    close(fd);
    return -1;
  }

  ld_size = st.st_size;
  ld_map = (const char *)mmap(NULL, ld_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (ld_map == MAP_FAILED) {
    close(fd);
    return -1;
  }

  if (memcmp(ld_map, CACHE1_HEADER, CACHE1_HEADER_LEN) == 0) {
    const struct ld_cache1 *cache1 = (struct ld_cache1 *)ld_map;
    size_t cache1_len = sizeof(struct ld_cache1) +
                        (cache1->entry_count * sizeof(struct ld_cache1_entry));
    // An embedded new-format cache starts at the next 8-byte boundary.
    cache1_len = (cache1_len + 0x7) & ~0x7ULL;

    if (ld_size > (cache1_len + sizeof(struct ld_cache2)))
      ret = read_cache2(ld_map + cache1_len);
    else
      ret = read_cache1(ld_map);
  } else {
    ret = read_cache2(ld_map);
  }

  munmap((void *)ld_map, ld_size);
  close(fd);
  return ret;
}
+
+#define LD_SO_CACHE "/etc/ld.so.cache"
+#define FLAG_TYPE_MASK 0x00ff
+#define TYPE_ELF_LIBC6 0x0003
+#define FLAG_ABI_MASK 0xff00
+#define ABI_SPARC_LIB64 0x0100
+#define ABI_IA64_LIB64 0x0200
+#define ABI_X8664_LIB64 0x0300
+#define ABI_S390_LIB64 0x0400
+#define ABI_POWERPC_LIB64 0x0500
+#define ABI_AARCH64_LIB64 0x0a00
+
+static bool match_so_flags(int flags) {
+  if ((flags & FLAG_TYPE_MASK) != TYPE_ELF_LIBC6)
+    return false;
+
+  switch (flags & FLAG_ABI_MASK) {
+  case ABI_SPARC_LIB64:
+  case ABI_IA64_LIB64:
+  case ABI_X8664_LIB64:
+  case ABI_S390_LIB64:
+  case ABI_POWERPC_LIB64:
+  case ABI_AARCH64_LIB64:
+    return (sizeof(void *) == 8);
+  }
+
+  return sizeof(void *) == 4;
+}
+
// Scan /proc/<pid>/maps for a shared library whose basename matches
// "lib<libname>." or "lib<libname>-". On success, copy the full mapping
// path (including the terminating NUL) into 'libpath' and return true.
static bool which_so_in_process(const char* libname, int pid, char* libpath) {
  int ret;
  bool found = false;  // was 'int found'; the function's contract is bool
  char endline[4096], *mapname = NULL, *newline;
  char mappings_file[128];
  const size_t search_len = strlen(libname) + strlen("/lib.");
  char search1[search_len + 1];
  char search2[search_len + 1];

  snprintf(mappings_file, sizeof(mappings_file), "/proc/%ld/maps", (long)pid);
  FILE *fp = fopen(mappings_file, "r");
  if (!fp)
    return false;  // previously 'return NULL' -- wrong type for bool

  snprintf(search1, search_len + 1, "/lib%s.", libname);
  snprintf(search2, search_len + 1, "/lib%s-", libname);

  do {
    // Consume the fixed-format columns; the rest of the line (if present)
    // is the mapping name.
    ret = fscanf(fp, "%*x-%*x %*s %*x %*s %*d");
    if (!fgets(endline, sizeof(endline), fp))
      break;

    mapname = endline;
    newline = strchr(endline, '\n');
    if (newline)
      newline[0] = '\0';

    while (isspace(mapname[0])) mapname++;

    if (strstr(mapname, ".so") && (strstr(mapname, search1) ||
                                   strstr(mapname, search2))) {
      found = true;
      memcpy(libpath, mapname, strlen(mapname) + 1);
      break;
    }
  } while (ret != EOF);

  fclose(fp);
  return found;
}
+
// Resolve a bare library name ("c", "pthread", ...) to a full .so path.
// Resolution order: an explicit path (contains '/') is returned as-is; the
// mappings of process 'pid' are searched when pid is non-zero; finally the
// ld.so cache is consulted. Returns a malloc'ed path or NULL.
// lib_cache_count == -1 marks an earlier failed cache load, which is not
// retried.
char *bcc_procutils_which_so(const char *libname, int pid) {
  const size_t soname_len = strlen(libname) + strlen("lib.so");
  char soname[soname_len + 1];
  char libpath[4096];
  int i;

  if (strchr(libname, '/'))
    return strdup(libname);

  if (pid && which_so_in_process(libname, pid, libpath))
    return strdup(libpath);

  if (lib_cache_count < 0)
    return NULL;

  if (!lib_cache_count && load_ld_cache(LD_SO_CACHE) < 0) {
    lib_cache_count = -1;
    return NULL;
  }

  snprintf(soname, soname_len + 1, "lib%s.so", libname);

  // Prefix match: "libc.so" also matches versioned entries like "libc.so.6".
  for (i = 0; i < lib_cache_count; ++i) {
    if (!strncmp(lib_cache[i].libname, soname, soname_len) &&
        match_so_flags(lib_cache[i].flags)) {
      return strdup(lib_cache[i].path);
    }
  }
  return NULL;
}
+
// Free a string returned by the bcc_procutils_which* functions.
// NOTE(review): presumably exposed so bindings/FFI callers can release
// memory allocated by this library -- confirm against the public API.
void bcc_procutils_free(const char *ptr) {
  free((void *)ptr);
}
+
/* Detects the following languages + C. */
const char *languages[] = {"java", "node", "perl", "php", "python", "ruby"};
const char *language_c = "c";
const int nb_languages = 6;

// Best-effort guess of the implementation language of process 'pid', based
// first on its executable path and then on its memory mappings.
// Returns a pointer into the static 'languages' table, "c" when only libc
// was detected, or NULL when nothing could be determined.
const char *bcc_procutils_language(int pid) {
  char procfilename[24], line[4096], pathname[32], *str;
  FILE *procfile;
  int i, ret;

  /* Look for clues in the absolute path to the executable. */
  snprintf(procfilename, sizeof(procfilename), "/proc/%ld/exe", (long)pid);
  if (realpath(procfilename, line)) {
    for (i = 0; i < nb_languages; i++)
      if (strstr(line, languages[i]))
        return languages[i];
  }

  snprintf(procfilename, sizeof(procfilename), "/proc/%ld/maps", (long)pid);
  procfile = fopen(procfilename, "r");
  if (!procfile)
    return NULL;

  /* Look for clues in memory mappings. */
  bool libc = false;
  do {
    char perm[8], dev[8];
    long long begin, end, size, inode;
    // Fixed-format columns of a maps line; the mapping name follows.
    ret = fscanf(procfile, "%llx-%llx %s %llx %s %lld", &begin, &end, perm,
                 &size, dev, &inode);
    if (!fgets(line, sizeof(line), procfile))
      break;
    if (ret == 6) {
      char *mapname = line;
      char *newline = strchr(line, '\n');
      if (newline)
        newline[0] = '\0';
      while (isspace(mapname[0])) mapname++;
      for (i = 0; i < nb_languages; i++) {
        snprintf(pathname, sizeof(pathname), "/lib%s", languages[i]);
        if (strstr(mapname, pathname)) {
          fclose(procfile);
          return languages[i];
        }
      }
      // The libc test does not depend on the loop variable; it previously
      // ran once per language inside the loop above. Checking once per
      // mapping is equivalent and avoids the redundant work.
      if ((str = strstr(mapname, "libc")) &&
          (str[4] == '-' || str[4] == '.'))
        libc = true;
    }
  } while (ret && ret != EOF);

  fclose(procfile);

  /* Return C as the language if libc was found and nothing else. */
  return libc ? language_c : NULL;
}
diff --git a/src/cc/bcc_proc.h b/src/cc/bcc_proc.h
new file mode 100644
index 0000000..1e5a720
--- /dev/null
+++ b/src/cc/bcc_proc.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef LIBBCC_PROC_H
+#define LIBBCC_PROC_H
+
+#include "bcc_syms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdbool.h>
+
+// Module name, start address, end address, file_offset,
+// whether to check mount namespace, payload
+// Callback returning a negative value indicates to stop the iteration
+typedef int (*bcc_procutils_modulecb)(const char *, uint64_t, uint64_t,
+                                      uint64_t, bool, void *);
+// Symbol name, address, payload
+typedef void (*bcc_procutils_ksymcb)(const char *, uint64_t, void *);
+
+// Locate the shared object `libname` for process `pid`; returns a
+// heap-allocated path the caller must free (bcc_syms.cc frees the result
+// it stores in bcc_symbol::module). NULL when not found.
+char *bcc_procutils_which_so(const char *libname, int pid);
+// Resolve `binpath` to a full path — presumably via PATH lookup; verify
+// against the implementation (not visible here). Caller frees.
+char *bcc_procutils_which(const char *binpath);
+// Non-zero when `mapname` refers to a file-backed mapping (as opposed to
+// pseudo entries such as [heap]/[stack] or anonymous regions).
+int bcc_mapping_is_file_backed(const char *mapname);
+// Iterate over all executable memory mapping sections of a Process.
+// All anonymous and non-file-backed mapping sections, namely those
+// listed in bcc_mapping_is_file_backed, will be ignored.
+// Returns -1 on error, and 0 on success
+int bcc_procutils_each_module(int pid, bcc_procutils_modulecb callback,
+                              void *payload);
+// Iterate over all non-data Kernel symbols.
+// Returns -1 on error, and 0 on success
+int bcc_procutils_each_ksym(bcc_procutils_ksymcb callback, void *payload);
+// Free a buffer previously returned by a bcc_procutils_* function.
+void bcc_procutils_free(const char *ptr);
+// Best-effort language guess for process `pid` ("java", "python", "c",
+// ...); NULL when unknown. Result is static storage — do not free.
+const char *bcc_procutils_language(int pid);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/cc/bcc_syms.cc b/src/cc/bcc_syms.cc
new file mode 100644
index 0000000..be9781a
--- /dev/null
+++ b/src/cc/bcc_syms.cc
@@ -0,0 +1,555 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cxxabi.h>
+#include <cstring>
+#include <fcntl.h>
+#include <linux/elf.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <cstdio>
+
+#include "bcc_elf.h"
+#include "bcc_perf_map.h"
+#include "bcc_proc.h"
+#include "bcc_syms.h"
+#include "common.h"
+#include "vendor/tinyformat.hpp"
+
+#include "syms.h"
+
+// Inode of /proc/<pid>/exe, or -1 (converted to ino_t) when stat() fails.
+ino_t ProcStat::getinode_() {
+  struct stat s;
+  return (!stat(procfs_.c_str(), &s)) ? s.st_ino : -1;
+}
+
+// True when the executable behind /proc/<pid>/exe no longer matches the
+// cached inode (e.g. the process exec'd a new image).
+// NOTE(review): ino_t is unsigned on common platforms, so the -1 error
+// sentinel from getinode_() satisfies `cur_inode > 0` and would read as
+// stale — confirm whether that is intended.
+bool ProcStat::is_stale() {
+  ino_t cur_inode = getinode_();
+  return (cur_inode > 0) && (cur_inode != inode_);
+}
+
+// Cache the current inode of /proc/<pid>/exe for later staleness checks.
+ProcStat::ProcStat(int pid)
+    : procfs_(tfm::format("/proc/%d/exe", pid)), inode_(getinode_()) {}
+
+// Callback for bcc_procutils_each_ksym: append one kernel symbol to the
+// KSyms instance passed through `p`.
+void KSyms::_add_symbol(const char *symname, uint64_t addr, void *p) {
+  KSyms *ks = static_cast<KSyms *>(p);
+  ks->syms_.emplace_back(symname, addr);
+}
+
+// Lazily load and address-sort the kernel symbol table. A non-empty
+// table is treated as current and is not re-read.
+void KSyms::refresh() {
+  if (syms_.empty()) {
+    bcc_procutils_each_ksym(_add_symbol, this);
+    std::sort(syms_.begin(), syms_.end());
+  }
+}
+
+// Resolve a kernel address to the nearest symbol at or below it. On
+// success fills sym->name, sym->module (the literal "kernel") and
+// sym->offset; when `demangle` is set, demangle_name simply aliases name
+// (kernel symbols carry no C++ mangling here). On failure *sym is zeroed
+// and false is returned.
+bool KSyms::resolve_addr(uint64_t addr, struct bcc_symbol *sym, bool demangle) {
+  refresh();
+
+  std::vector<Symbol>::iterator it;
+
+  if (syms_.empty())
+    goto unknown_symbol;
+
+  // upper_bound yields the first symbol strictly above addr; the entry
+  // just before it (if any) is the symbol containing addr.
+  it = std::upper_bound(syms_.begin(), syms_.end(), Symbol("", addr));
+  if (it != syms_.begin()) {
+    it--;
+    sym->name = (*it).name.c_str();
+    if (demangle)
+      sym->demangle_name = sym->name;
+    sym->module = "kernel";
+    sym->offset = addr - (*it).addr;
+    return true;
+  }
+
+unknown_symbol:
+  memset(sym, 0, sizeof(struct bcc_symbol));
+  return false;
+}
+
+// Resolve a kernel symbol name to its address. The first parameter only
+// exists to match the SymbolCache interface and is ignored here. The
+// name->address map is rebuilt lazily whenever its size no longer matches
+// the symbol list (i.e. after a refresh added symbols).
+bool KSyms::resolve_name(const char *_unused, const char *name,
+                         uint64_t *addr) {
+  refresh();
+
+  if (syms_.size() != symnames_.size()) {
+    symnames_.clear();
+    for (Symbol &sym : syms_) {
+      symnames_[sym.name] = sym.addr;
+    }
+  }
+
+  auto it = symnames_.find(name);
+  if (it == symnames_.end())
+    return false;
+
+  *addr = it->second;
+  return true;
+}
+
+// Build the symbol resolver for process `pid`. When no option struct is
+// supplied, default to using debug files (with CRC verification) and to
+// function-like symbols only (STT_FUNC and STT_GNU_IFUNC).
+ProcSyms::ProcSyms(int pid, struct bcc_symbol_option *option)
+    : pid_(pid), procstat_(pid), mount_ns_instance_(new ProcMountNS(pid_)) {
+  if (option)
+    std::memcpy(&symbol_option_, option, sizeof(bcc_symbol_option));
+  else
+    symbol_option_ = {
+      .use_debug_file = 1,
+      .check_debug_file_crc = 1,
+      .use_symbol_type = (1 << STT_FUNC) | (1 << STT_GNU_IFUNC)
+    };
+  load_modules();
+}
+
+// Callback for bcc_elf_foreach_load_section: record one load segment's
+// [v_addr, v_addr + mem_sz) range and its file offset on the Module.
+int ProcSyms::_add_load_sections(uint64_t v_addr, uint64_t mem_sz,
+                                 uint64_t file_offset, void *payload) {
+  Module *mod = static_cast<Module *>(payload);
+  mod->ranges_.emplace_back(v_addr, v_addr + mem_sz, file_offset);
+  return 0;
+}
+
+// Load the main executable's ELF load sections as the first module.
+// Only ET_EXEC binaries are handled here; modules of any other type are
+// skipped and left to the /proc/<pid>/maps walk in load_modules().
+void ProcSyms::load_exe() {
+  std::string exe = ebpf::get_pid_exe(pid_);
+  Module module(exe.c_str(), mount_ns_instance_.get(), &symbol_option_);
+
+  if (module.type_ != ModuleType::EXEC)
+    return;
+
+  // Enter the target's mount namespace so the path resolves there.
+  ProcMountNSGuard g(mount_ns_instance_.get());
+
+  bcc_elf_foreach_load_section(exe.c_str(), &_add_load_sections, &module);
+
+  if (!module.ranges_.empty())
+    modules_.emplace_back(std::move(module));
+}
+
+// Populate modules_ from the executable and the process's memory maps.
+void ProcSyms::load_modules() {
+  load_exe();
+  bcc_procutils_each_module(pid_, _add_module, this);
+}
+
+// Drop all cached modules and reload them, re-entering the (possibly
+// changed) mount namespace; called when procstat_ reports staleness.
+void ProcSyms::refresh() {
+  modules_.clear();
+  mount_ns_instance_.reset(new ProcMountNS(pid_));
+  load_modules();
+  procstat_.reset();
+}
+
+// Callback for bcc_procutils_each_module: record one mapped range for
+// module `modname`, creating the Module entry on first sight. Returns -1
+// (stop iteration) once a perf map has been added, 0 otherwise.
+int ProcSyms::_add_module(const char *modname, uint64_t start, uint64_t end,
+                          uint64_t offset, bool check_mount_ns, void *payload) {
+  ProcSyms *ps = static_cast<ProcSyms *>(payload);
+  auto it = std::find_if(
+      ps->modules_.begin(), ps->modules_.end(),
+      [=](const ProcSyms::Module &m) { return m.name_ == modname; });
+  if (it == ps->modules_.end()) {
+    auto module = Module(
+        modname, check_mount_ns ? ps->mount_ns_instance_.get() : nullptr,
+        &ps->symbol_option_);
+
+    // pid/maps doesn't account for file_offset of text within the ELF.
+    // It only gives the mmap offset. We need the real offset for symbol
+    // lookup.
+    if (module.type_ == ModuleType::SO) {
+      if (bcc_elf_get_text_scn_info(modname, &module.elf_so_addr_,
+                                    &module.elf_so_offset_) < 0) {
+        fprintf(stderr, "WARNING: Couldn't find .text section in %s\n", modname);
+        // Fix: this warning was missing its trailing newline, leaving the
+        // stderr line unterminated and fused with whatever followed.
+        fprintf(stderr, "WARNING: BCC can't handle sym look ups for %s\n", modname);
+      }
+    }
+
+    if (!bcc_is_perf_map(modname) || module.type_ != ModuleType::UNKNOWN)
+      // Always add the module even if we can't read it, so that we could
+      // report correct module name. Unless it's a perf map that we only
+      // add readable ones.
+      it = ps->modules_.insert(ps->modules_.end(), std::move(module));
+    else
+      return 0;
+  }
+  it->ranges_.emplace_back(start, end, offset);
+  // perf-PID map is added last. We try both inside the Process's mount
+  // namespace + chroot, and in global /tmp. Make sure we only add one.
+  if (it->type_ == ModuleType::PERF_MAP)
+    return -1;
+
+  return 0;
+}
+
+// Resolve a user-space address to a symbol in this process. On success
+// fills *sym; when `demangle` is set, _Z/___Z-prefixed names are run
+// through __cxa_demangle, with the mangled name as fallback. On failure
+// *sym is zeroed, except that sym->module may carry the name of a module
+// that contained the address but had no matching symbol (see below).
+bool ProcSyms::resolve_addr(uint64_t addr, struct bcc_symbol *sym,
+                            bool demangle) {
+  if (procstat_.is_stale())
+    refresh();
+
+  memset(sym, 0, sizeof(struct bcc_symbol));
+
+  const char *original_module = nullptr;
+  uint64_t offset;
+  bool only_perf_map = false;
+  for (Module &mod : modules_) {
+    if (only_perf_map && (mod.type_ != ModuleType::PERF_MAP))
+      continue;
+    if (mod.contains(addr, offset)) {
+      if (mod.find_addr(offset, sym)) {
+        if (demangle) {
+          if (sym->name && (!strncmp(sym->name, "_Z", 2) || !strncmp(sym->name, "___Z", 4)))
+            sym->demangle_name =
+                abi::__cxa_demangle(sym->name, nullptr, nullptr, nullptr);
+          // __cxa_demangle may return NULL on failure; fall back to the
+          // mangled name so demangle_name is always usable.
+          if (!sym->demangle_name)
+            sym->demangle_name = sym->name;
+        }
+        return true;
+      } else if (mod.type_ != ModuleType::PERF_MAP) {
+        // In this case, we found the address in the range of a module, but
+        // not able to find a symbol of that address in the module.
+        // Thus, we would try to find the address in perf map, and
+        // save the module's name in case we will need it later.
+        original_module = mod.name_.c_str();
+        only_perf_map = true;
+      }
+    }
+  }
+  // If we didn't find the symbol anywhere, the module name is probably
+  // set to be the perf map's name as it would be the last we tried.
+  // In this case, if we have found the address previously in a module,
+  // report the saved original module name instead.
+  if (original_module)
+    sym->module = original_module;
+  return false;
+}
+
+// Resolve (module, name) to an address. Only the first module whose name
+// matches exactly is consulted; its lookup result is final.
+bool ProcSyms::resolve_name(const char *module, const char *name,
+                            uint64_t *addr) {
+  if (procstat_.is_stale())
+    refresh();
+
+  for (Module &mod : modules_) {
+    if (mod.name_ == module)
+      return mod.find_name(name, addr);
+  }
+  return false;
+}
+
+// Classify the module by probing it: ELF executable (ET_EXEC), shared
+// object (ET_DYN), perf map, vDSO, or UNKNOWN.
+ProcSyms::Module::Module(const char *name, ProcMountNS *mount_ns,
+                         struct bcc_symbol_option *option)
+    : name_(name),
+      loaded_(false),
+      mount_ns_(mount_ns),
+      symbol_option_(option),
+      type_(ModuleType::UNKNOWN) {
+  // Fix: initialize these before any early return. The original assigned
+  // them only at the end of the body, so the ELF branch's `return` left
+  // them uninitialized for EXEC/SO modules — and SO modules read
+  // elf_so_addr_/elf_so_offset_ in contains() if the later
+  // bcc_elf_get_text_scn_info() call fails to overwrite them.
+  elf_so_offset_ = 0;
+  elf_so_addr_ = 0;
+
+  ProcMountNSGuard g(mount_ns_);
+  int elf_type = bcc_elf_get_type(name_.c_str());
+  // The Module is an ELF file
+  if (elf_type >= 0) {
+    if (elf_type == ET_EXEC)
+      type_ = ModuleType::EXEC;
+    else if (elf_type == ET_DYN)
+      type_ = ModuleType::SO;
+    return;
+  }
+  // Other symbol files
+  if (bcc_is_valid_perf_map(name_.c_str()) == 1)
+    type_ = ModuleType::PERF_MAP;
+  else if (bcc_elf_is_vdso(name_.c_str()) == 1)
+    type_ = ModuleType::VDSO;
+}
+
+// Callback: intern the symbol name in symnames_ (stable storage, since
+// syms_ keeps only a pointer to the interned string) and record the
+// symbol's start address and size.
+int ProcSyms::Module::_add_symbol(const char *symname, uint64_t start,
+                                  uint64_t size, void *p) {
+  Module *m = static_cast<Module *>(p);
+  auto res = m->symnames_.emplace(symname);
+  m->syms_.emplace_back(&*(res.first), start, size);
+  return 0;
+}
+
+// Load this module's symbols exactly once, dispatching on module type,
+// then sort by address so find_addr() can binary-search.
+void ProcSyms::Module::load_sym_table() {
+  if (loaded_)
+    return;
+  loaded_ = true;
+
+  if (type_ == ModuleType::UNKNOWN)
+    return;
+
+  // Read the symbol source from within the target's mount namespace.
+  ProcMountNSGuard g(mount_ns_);
+
+  if (type_ == ModuleType::PERF_MAP)
+    bcc_perf_map_foreach_sym(name_.c_str(), _add_symbol, this);
+  if (type_ == ModuleType::EXEC || type_ == ModuleType::SO)
+    bcc_elf_foreach_sym(name_.c_str(), _add_symbol, symbol_option_, this);
+  if (type_ == ModuleType::VDSO)
+    bcc_elf_foreach_vdso_sym(_add_symbol, this);
+
+  std::sort(syms_.begin(), syms_.end());
+}
+
+// Test whether `addr` falls inside one of this module's mapped ranges.
+// On success, `offset` is set to the value suitable for symbol lookup:
+// for SO/VDSO modules, the in-ELF offset (mmap offset shifted by the
+// .text addr/offset delta recorded at load time); for other types,
+// `addr` itself.
+bool ProcSyms::Module::contains(uint64_t addr, uint64_t &offset) const {
+  for (const auto &range : ranges_) {
+    if (addr >= range.start && addr < range.end) {
+      if (type_ == ModuleType::SO || type_ == ModuleType::VDSO) {
+        // Offset within the mmap
+        offset = addr - range.start + range.file_offset;
+
+        // Offset within the ELF for SO symbol lookup
+        offset += (elf_so_addr_ - elf_so_offset_);
+      } else {
+        offset = addr;
+      }
+
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Linear search for `symname` in the symbol table; on success writes its
+// address (rebased by the module start for shared objects) into *addr.
+bool ProcSyms::Module::find_name(const char *symname, uint64_t *addr) {
+  load_sym_table();
+
+  for (Symbol &s : syms_) {
+    if (*(s.name) == symname) {
+      *addr = type_ == ModuleType::SO ? start() + s.start : s.start;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Find the symbol covering `offset`. Always records sym->module and
+// sym->offset; on success sym->name is set and sym->offset becomes the
+// offset relative to the symbol's start.
+bool ProcSyms::Module::find_addr(uint64_t offset, struct bcc_symbol *sym) {
+  load_sym_table();
+
+  sym->module = name_.c_str();
+  sym->offset = offset;
+
+  auto it = std::upper_bound(syms_.begin(), syms_.end(), Symbol(nullptr, offset, 0));
+  if (it == syms_.begin())
+    return false;
+
+  // 'it' points to the symbol whose start address is strictly greater than
+  // the address we're looking for. Start stepping backwards as long as the
+  // current symbol is still below the desired address, and see if the end
+  // of the current symbol (start + size) is above the desired address. Once
+  // we have a matching symbol, return it. Note that simply looking at '--it'
+  // is not enough, because symbols can be nested. For example, we could be
+  // looking for offset 0x12 with the following symbols available:
+  // SYMBOL   START   SIZE    END
+  // goo      0x0     0x6     0x0 + 0x6 = 0x6
+  // foo      0x6     0x10    0x6 + 0x10 = 0x16
+  // bar      0x8     0x4     0x8 + 0x4 = 0xc
+  // baz      0x16    0x10    0x16 + 0x10 = 0x26
+  // The upper_bound lookup will return baz, and then going one symbol back
+  // brings us to bar, which does not contain offset 0x12 and is nested inside
+  // foo. Going back one more symbol brings us to foo, which contains 0x12
+  // and is a match.
+  // However, we also don't want to walk through the entire symbol list for
+  // unknown / missing symbols. So we will break if we reach a function that
+  // doesn't cover the function immediately before 'it', which means it is
+  // not possibly a nested function containing the address we're looking for.
+  --it;
+  uint64_t limit = it->start;
+  for (; offset >= it->start; --it) {
+    if (offset < it->start + it->size) {
+      sym->name = it->name->c_str();
+      sym->offset = (offset - it->start);
+      return true;
+    }
+    if (limit > it->start + it->size)
+      break;
+    // But don't step beyond begin()!
+    if (it == syms_.begin())
+      break;
+  }
+
+  return false;
+}
+
+extern "C" {
+
+// Create a symbol cache: kernel symbols when pid < 0 (option is unused
+// in that case), process symbols otherwise. Destroy with
+// bcc_free_symcache(), passing a pid of the same sign.
+void *bcc_symcache_new(int pid, struct bcc_symbol_option *option) {
+  if (pid < 0)
+    return static_cast<void *>(new KSyms());
+  return static_cast<void *>(new ProcSyms(pid, option));
+}
+
+// Delete a cache from bcc_symcache_new(). The pid's sign selects the
+// concrete type so the correct destructor runs.
+void bcc_free_symcache(void *symcache, int pid) {
+  if (pid < 0)
+    delete static_cast<KSyms*>(symcache);
+  else
+    delete static_cast<ProcSyms*>(symcache);
+}
+
+// Free sym->demangle_name (allocated by __cxa_demangle) unless it merely
+// aliases sym->name, in which case nothing was allocated.
+void bcc_symbol_free_demangle_name(struct bcc_symbol *sym) {
+  if (sym->demangle_name && (sym->demangle_name != sym->name))
+    free(const_cast<char*>(sym->demangle_name));
+}
+
+// C wrappers over the SymbolCache interface: 0 on success, -1 on failure.
+int bcc_symcache_resolve(void *resolver, uint64_t addr,
+                         struct bcc_symbol *sym) {
+  SymbolCache *cache = static_cast<SymbolCache *>(resolver);
+  return cache->resolve_addr(addr, sym) ? 0 : -1;
+}
+
+int bcc_symcache_resolve_no_demangle(void *resolver, uint64_t addr,
+                                     struct bcc_symbol *sym) {
+  SymbolCache *cache = static_cast<SymbolCache *>(resolver);
+  return cache->resolve_addr(addr, sym, false) ? 0 : -1;
+}
+
+int bcc_symcache_resolve_name(void *resolver, const char *module,
+                              const char *name, uint64_t *addr) {
+  SymbolCache *cache = static_cast<SymbolCache *>(resolver);
+  return cache->resolve_name(module, name, addr) ? 0 : -1;
+}
+
+void bcc_symcache_refresh(void *resolver) {
+  SymbolCache *cache = static_cast<SymbolCache *>(resolver);
+  cache->refresh();
+}
+
+// Scratch record for _find_module: `name` is the module to search for;
+// `start` and `file_offset` are filled in when it is found.
+struct mod_st {
+  const char *name;
+  uint64_t start;
+  uint64_t file_offset;
+};
+
+// Callback: capture the start/offset of the module named mod->name and
+// stop the iteration (-1) once found.
+static int _find_module(const char *modname, uint64_t start, uint64_t end,
+                        uint64_t offset, bool, void *p) {
+  struct mod_st *mod = (struct mod_st *)p;
+  if (!strcmp(modname, mod->name)) {
+    mod->start = start;
+    mod->file_offset = offset;
+    return -1;
+  }
+  return 0;
+}
+
+// Translate a module-relative address into the process's address space:
+// global = module load start - file offset + address. Returns -1 when the
+// module is not mapped in `pid`. ({module, 0x0} value-initializes the
+// remaining file_offset member to zero as well.)
+int bcc_resolve_global_addr(int pid, const char *module, const uint64_t address,
+                            uint64_t *global) {
+  struct mod_st mod = {module, 0x0};
+  if (bcc_procutils_each_module(pid, _find_module, &mod) < 0 ||
+      mod.start == 0x0)
+    return -1;
+
+  *global = mod.start - mod.file_offset + address;
+  return 0;
+}
+
+// Adapter letting the two-argument SYM_CB be driven by the richer
+// bcc_elf_foreach_sym callback (the size argument is dropped).
+static int _sym_cb_wrapper(const char *symname, uint64_t addr, uint64_t,
+                           void *payload) {
+  SYM_CB cb = (SYM_CB) payload;
+  return cb(symname, addr);
+}
+
+// Invoke `cb` for every function symbol (STT_FUNC / STT_GNU_IFUNC) in
+// `module`, preferring debug files and verifying their CRC.
+// Returns -1 on bad arguments; otherwise bcc_elf_foreach_sym's result.
+int bcc_foreach_function_symbol(const char *module, SYM_CB cb) {
+  if (module == 0 || cb == 0)
+    return -1;
+
+  static struct bcc_symbol_option default_option = {
+    .use_debug_file = 1,
+    .check_debug_file_crc = 1,
+    .use_symbol_type = (1 << STT_FUNC) | (1 << STT_GNU_IFUNC)
+  };
+
+  return bcc_elf_foreach_sym(
+      module, _sym_cb_wrapper, &default_option, (void *)cb);
+}
+
+// Callback: when `symname` matches sym->name, record its address in
+// sym->offset and stop the iteration (-1).
+static int _find_sym(const char *symname, uint64_t addr, uint64_t,
+                     void *payload) {
+  struct bcc_symbol *sym = (struct bcc_symbol *)payload;
+  if (!strcmp(sym->name, symname)) {
+    sym->offset = addr;
+    return -1;
+  }
+  return 0;
+}
+
+// Scratch record for _find_load: target_addr is the virtual address to
+// translate; binary_addr receives the corresponding file offset.
+struct load_addr_t {
+  uint64_t target_addr;
+  uint64_t binary_addr;
+};
+// Callback for bcc_elf_foreach_load_section: translate target_addr via
+// the load segment containing it, then stop the iteration (-1).
+// Marked static for internal linkage, consistent with the sibling
+// helpers _find_module and _find_sym above.
+static int _find_load(uint64_t v_addr, uint64_t mem_sz, uint64_t file_offset,
+                      void *payload) {
+  struct load_addr_t *addr = static_cast<load_addr_t *>(payload);
+  if (addr->target_addr >= v_addr && addr->target_addr < (v_addr + mem_sz)) {
+    addr->binary_addr = addr->target_addr - v_addr + file_offset;
+    return -1;
+  }
+  return 0;
+}
+
+// Resolve `symname` (or translate the given `addr`) within `module` for
+// process `pid`; full contract in bcc_syms.h. On success sym->module is
+// heap-allocated and owned by the caller; on any failure path it is
+// freed and NULLed here before returning -1.
+int bcc_resolve_symname(const char *module, const char *symname,
+                        const uint64_t addr, int pid,
+                        struct bcc_symbol_option *option,
+                        struct bcc_symbol *sym) {
+  static struct bcc_symbol_option default_option = {
+    .use_debug_file = 1,
+    .check_debug_file_crc = 1,
+    .use_symbol_type = BCC_SYM_ALL_TYPES,
+  };
+
+  if (module == NULL)
+    return -1;
+
+  memset(sym, 0, sizeof(bcc_symbol));
+
+  // A path is used verbatim; a bare library name is located among the
+  // process's loaded shared objects.
+  if (strchr(module, '/')) {
+    sym->module = strdup(module);
+  } else {
+    sym->module = bcc_procutils_which_so(module, pid);
+  }
+  if (sym->module == NULL)
+    return -1;
+
+  // Enter the target's mount namespace for the ELF reads below.
+  ProcMountNSGuard g(pid);
+
+  sym->name = symname;
+  sym->offset = addr;
+  if (option == NULL)
+    option = &default_option;
+
+  // Only look the name up when no explicit address was supplied.
+  if (sym->name && sym->offset == 0x0)
+    if (bcc_elf_foreach_sym(sym->module, _find_sym, option, sym) < 0)
+      goto invalid_module;
+  if (sym->offset == 0x0)
+    goto invalid_module;
+
+  // For executable (ET_EXEC) binaries, translate the virtual address
+  // to physical address in the binary file.
+  // For shared object binaries (ET_DYN), the address from symbol table should
+  // already be physical address in the binary file.
+  if (bcc_elf_get_type(sym->module) == ET_EXEC) {
+    struct load_addr_t addr = {
+      .target_addr = sym->offset,
+      .binary_addr = 0x0,
+    };
+    if (bcc_elf_foreach_load_section(sym->module, &_find_load, &addr) < 0)
+      goto invalid_module;
+    if (!addr.binary_addr)
+      goto invalid_module;
+    sym->offset = addr.binary_addr;
+  }
+  return 0;
+
+invalid_module:
+  if (sym->module) {
+    ::free(const_cast<char*>(sym->module));
+    sym->module = NULL;
+  }
+  return -1;
+}
+}
diff --git a/src/cc/bcc_syms.h b/src/cc/bcc_syms.h
new file mode 100644
index 0000000..d617c1d
--- /dev/null
+++ b/src/cc/bcc_syms.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef LIBBCC_SYMS_H
+#define LIBBCC_SYMS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+// Result of a symbol resolution.
+struct bcc_symbol {
+  const char *name;           // raw (possibly mangled) symbol name
+  const char *demangle_name;  // demangled name; may alias `name`
+  const char *module;         // module path, or "kernel" for kernel syms
+  uint64_t offset;            // offset of the address within the symbol
+};
+
+typedef int (*SYM_CB)(const char *symname, uint64_t addr);
+
+#ifndef STT_GNU_IFUNC
+#define STT_GNU_IFUNC 10
+#endif
+// Bitmask with bits 0-15 set: accept every ELF symbol type.
+static const uint32_t BCC_SYM_ALL_TYPES = 65535;
+struct bcc_symbol_option {
+  int use_debug_file;
+  int check_debug_file_crc;
+  // Bitmask flags indicating what types of ELF symbols to use
+  uint32_t use_symbol_type;
+};
+
+// pid < 0 creates a kernel symbol cache; otherwise a cache for that
+// process. Pass a pid of the same sign to bcc_free_symcache.
+void *bcc_symcache_new(int pid, struct bcc_symbol_option *option);
+void bcc_free_symcache(void *symcache, int pid);
+
+// The demangle_name pointer in bcc_symbol struct is returned from the
+// __cxa_demangle function call, which is supposed to be freed by caller. Call
+// this function after done using returned result of bcc_symcache_resolve.
+void bcc_symbol_free_demangle_name(struct bcc_symbol *sym);
+int bcc_symcache_resolve(void *symcache, uint64_t addr, struct bcc_symbol *sym);
+int bcc_symcache_resolve_no_demangle(void *symcache, uint64_t addr,
+                                     struct bcc_symbol *sym);
+
+// Resolve (module, name) to an address; returns 0 on success, -1 on
+// failure.
+int bcc_symcache_resolve_name(void *resolver, const char *module,
+                              const char *name, uint64_t *addr);
+void bcc_symcache_refresh(void *resolver);
+
+int bcc_resolve_global_addr(int pid, const char *module, const uint64_t address,
+                            uint64_t *global);
+
+// Call cb on every function symbol in the specified module. Uses simpler
+// SYM_CB callback mainly for easier to use in Python API.
+// Will prefer use debug file and check debug file CRC when reading the module.
+int bcc_foreach_function_symbol(const char *module, SYM_CB cb);
+
+// Find the offset of a symbol in a module binary. If addr is not zero, will
+// calculate the offset using the provided addr and the module's load address.
+//
+// If pid is provided, will use it to help lookup the module in the Process and
+// enter the Process's mount Namespace.
+//
+// If option is not NULL, will respect the specified options for lookup.
+// Otherwise default option will apply, which is to use debug file, verify
+// checksum, and try all types of symbols.
+//
+// Return 0 on success and -1 on failure. Output will be write to sym. After
+// use, sym->module need to be freed if it's not empty.
+int bcc_resolve_symname(const char *module, const char *symname,
+                        const uint64_t addr, int pid,
+                        struct bcc_symbol_option* option,
+                        struct bcc_symbol *sym);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/cc/bcc_usdt.h b/src/cc/bcc_usdt.h
new file mode 100644
index 0000000..a031bc6
--- /dev/null
+++ b/src/cc/bcc_usdt.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef LIBBCC_USDT_H
+#define LIBBCC_USDT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+// Create a USDT context from a running process (optionally restricted to
+// the binary at `path`) or from a binary on disk; release with
+// bcc_usdt_close(). NOTE(review): error behavior (presumably NULL on
+// failure) is defined by the implementation, which is not visible here.
+void *bcc_usdt_new_frompid(int pid, const char *path);
+void *bcc_usdt_new_frompath(const char *path);
+void bcc_usdt_close(void *usdt);
+
+// Description of one USDT probe as reported to bcc_usdt_cb.
+struct bcc_usdt {
+    const char *provider;
+    const char *name;
+    const char *bin_path;
+    uint64_t semaphore;
+    int num_locations;
+    int num_arguments;
+};
+
+// One concrete instrumentation point of a probe.
+struct bcc_usdt_location {
+    uint64_t address;
+    const char *bin_path;
+};
+
+// Flags describing which fields of bcc_usdt_argument are meaningful.
+#define BCC_USDT_ARGUMENT_NONE                0x0
+#define BCC_USDT_ARGUMENT_CONSTANT            0x1
+#define BCC_USDT_ARGUMENT_DEREF_OFFSET        0x2
+#define BCC_USDT_ARGUMENT_DEREF_IDENT         0x4
+#define BCC_USDT_ARGUMENT_BASE_REGISTER_NAME  0x8
+#define BCC_USDT_ARGUMENT_INDEX_REGISTER_NAME 0x10
+#define BCC_USDT_ARGUMENT_SCALE               0x20
+
+struct bcc_usdt_argument {
+    int size;
+    // presumably a bitmask of the BCC_USDT_ARGUMENT_* flags above —
+    // verify against the implementation
+    int valid;
+    int constant;
+    int deref_offset;
+    const char *deref_ident;
+    const char *base_register_name;
+    const char *index_register_name;
+    int scale;
+};
+
+// Enumerate every probe known to the context.
+typedef void (*bcc_usdt_cb)(struct bcc_usdt *);
+void bcc_usdt_foreach(void *usdt, bcc_usdt_cb callback);
+int bcc_usdt_get_location(void *usdt, const char *provider_name,
+                          const char *probe_name,
+                          int index, struct bcc_usdt_location *location);
+int bcc_usdt_get_argument(void *usdt, const char *provider_name,
+                          const char *probe_name,
+                          int location_index, int argument_index,
+                          struct bcc_usdt_argument *argument);
+
+int bcc_usdt_enable_probe(void *, const char *, const char *);
+const char *bcc_usdt_genargs(void **ctx_array, int len);
+const char *bcc_usdt_get_probe_argctype(
+  void *ctx, const char* probe_name, const int arg_index
+);
+
+// Enumerate the uprobe attach points for all enabled probes:
+// (binary path, probe fn name, address, pid) per the callback signature.
+typedef void (*bcc_usdt_uprobe_cb)(const char *, const char *, uint64_t, int);
+void bcc_usdt_foreach_uprobe(void *usdt, bcc_usdt_uprobe_cb callback);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/cc/bpf_common.cc b/src/cc/bpf_common.cc
new file mode 100644
index 0000000..4a71197
--- /dev/null
+++ b/src/cc/bpf_common.cc
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "bpf_common.h"
+#include "bpf_module.h"
+
+extern "C" {
+// Build a BPFModule from a .b file and its proto file; returns NULL on
+// load failure (the partially-constructed module is destroyed).
+void * bpf_module_create_b(const char *filename, const char *proto_filename, unsigned flags) {
+  auto *mod = new ebpf::BPFModule(flags);
+  if (mod->load_b(filename, proto_filename) == 0)
+    return mod;
+  delete mod;
+  return nullptr;
+}
+
+// Build a BPFModule by compiling the C file `filename` with the given
+// cflags; returns NULL on load failure.
+void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], int ncflags) {
+  auto *mod = new ebpf::BPFModule(flags);
+  if (mod->load_c(filename, cflags, ncflags) == 0)
+    return mod;
+  delete mod;
+  return nullptr;
+}
+
+// Build a BPFModule by compiling in-memory C source `text`; returns NULL
+// on load failure.
+void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], int ncflags) {
+  auto *mod = new ebpf::BPFModule(flags);
+  if (mod->load_string(text, cflags, ncflags) == 0)
+    return mod;
+  delete mod;
+  return nullptr;
+}
+
+// Destroy a module created by one of the bpf_module_create_* functions.
+void bpf_module_destroy(void *program) {
+  // delete on a null pointer is a no-op, so no explicit guard is needed.
+  delete static_cast<ebpf::BPFModule *>(program);
+}
+
+// --- Function enumeration accessors -------------------------------------
+// Each wrapper null-checks `program` before forwarding to the BPFModule
+// method; a null module yields an empty result (0 or nullptr).
+size_t bpf_num_functions(void *program) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->num_functions();
+}
+
+const char * bpf_function_name(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->function_name(id);
+}
+
+void * bpf_function_start(void *program, const char *name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->function_start(name);
+}
+
+void * bpf_function_start_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->function_start(id);
+}
+
+size_t bpf_function_size(void *program, const char *name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->function_size(name);
+}
+
+size_t bpf_function_size_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->function_size(id);
+}
+
+char * bpf_module_license(void *program) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->license();
+}
+
+// --- Module/table metadata accessors ------------------------------------
+// Null-checking forwarders to BPFModule.
+unsigned bpf_module_kern_version(void *program) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->kern_version();
+}
+
+size_t bpf_num_tables(void *program) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  // NOTE(review): returning -1 from a size_t function yields SIZE_MAX;
+  // callers cannot distinguish this error from a huge count — confirm
+  // this is the intended sentinel (bpf_table_id uses ~0ull explicitly).
+  if (!mod) return -1;
+  return mod->num_tables();
+}
+
+size_t bpf_table_id(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return ~0ull;  // all-ones sentinel for "invalid id"
+  return mod->table_id(table_name);
+}
+
+int bpf_table_fd(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_fd(table_name);
+}
+
+int bpf_table_fd_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_fd(id);
+}
+
+int bpf_table_type(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_type(table_name);
+}
+
+int bpf_table_type_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_type(id);
+}
+
+size_t bpf_table_max_entries(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->table_max_entries(table_name);
+}
+
+size_t bpf_table_max_entries_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->table_max_entries(id);
+}
+
+int bpf_table_flags(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_flags(table_name);
+}
+
+int bpf_table_flags_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_flags(id);
+}
+
+// --- Table name / key-leaf descriptor and size accessors ----------------
+// Null-checking forwarders to BPFModule.
+const char * bpf_table_name(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->table_name(id);
+}
+
+const char * bpf_table_key_desc(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->table_key_desc(table_name);
+}
+
+const char * bpf_table_key_desc_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->table_key_desc(id);
+}
+
+const char * bpf_table_leaf_desc(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->table_leaf_desc(table_name);
+}
+
+const char * bpf_table_leaf_desc_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return nullptr;
+  return mod->table_leaf_desc(id);
+}
+
+size_t bpf_table_key_size(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->table_key_size(table_name);
+}
+
+size_t bpf_table_key_size_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->table_key_size(id);
+}
+
+size_t bpf_table_leaf_size(void *program, const char *table_name) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->table_leaf_size(table_name);
+}
+
+size_t bpf_table_leaf_size_id(void *program, size_t id) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return 0;
+  return mod->table_leaf_size(id);
+}
+
+// --- Table key/leaf (de)serialization ------------------------------------
+// snprintf-style printers and sscanf-style parsers for table keys and
+// leaves; each returns -1 when `program` is null, otherwise the module's
+// own result.
+int bpf_table_key_snprintf(void *program, size_t id, char *buf, size_t buflen, const void *key) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_key_printf(id, buf, buflen, key);
+}
+int bpf_table_leaf_snprintf(void *program, size_t id, char *buf, size_t buflen, const void *leaf) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_leaf_printf(id, buf, buflen, leaf);
+}
+
+int bpf_table_key_sscanf(void *program, size_t id, const char *buf, void *key) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_key_scanf(id, buf, key);
+}
+int bpf_table_leaf_sscanf(void *program, size_t id, const char *buf, void *leaf) {
+  auto mod = static_cast<ebpf::BPFModule *>(program);
+  if (!mod) return -1;
+  return mod->table_leaf_scanf(id, buf, leaf);
+}
+
+}
diff --git a/src/cc/bpf_common.h b/src/cc/bpf_common.h
new file mode 100644
index 0000000..0abdbd4
--- /dev/null
+++ b/src/cc/bpf_common.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BPF_COMMON_H
+#define BPF_COMMON_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C-linkage facade over the C++ ebpf::BPFModule class (see bpf_common.cc).
+// `program` is the opaque handle returned by a bpf_module_create_* call.
+
+// Module lifecycle: compile from a .b file, a C file, or an in-memory C string.
+void * bpf_module_create_b(const char *filename, const char *proto_filename, unsigned flags);
+void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], int ncflags);
+void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], int ncflags);
+void bpf_module_destroy(void *program);
+// Module-level metadata.
+char * bpf_module_license(void *program);
+unsigned bpf_module_kern_version(void *program);
+// Function enumeration and access to their compiled BPF instructions.
+size_t bpf_num_functions(void *program);
+const char * bpf_function_name(void *program, size_t id);
+void * bpf_function_start_id(void *program, size_t id);
+void * bpf_function_start(void *program, const char *name);
+size_t bpf_function_size_id(void *program, size_t id);
+size_t bpf_function_size(void *program, const char *name);
+// Table (map) enumeration and metadata; *_id variants take a table index,
+// the others take the table name.
+size_t bpf_num_tables(void *program);
+size_t bpf_table_id(void *program, const char *table_name);
+int bpf_table_fd(void *program, const char *table_name);
+int bpf_table_fd_id(void *program, size_t id);
+int bpf_table_type(void *program, const char *table_name);
+int bpf_table_type_id(void *program, size_t id);
+size_t bpf_table_max_entries(void *program, const char *table_name);
+size_t bpf_table_max_entries_id(void *program, size_t id);
+int bpf_table_flags(void *program, const char *table_name);
+int bpf_table_flags_id(void *program, size_t id);
+const char * bpf_table_name(void *program, size_t id);
+const char * bpf_table_key_desc(void *program, const char *table_name);
+const char * bpf_table_key_desc_id(void *program, size_t id);
+const char * bpf_table_leaf_desc(void *program, const char *table_name);
+const char * bpf_table_leaf_desc_id(void *program, size_t id);
+size_t bpf_table_key_size(void *program, const char *table_name);
+size_t bpf_table_key_size_id(void *program, size_t id);
+size_t bpf_table_leaf_size(void *program, const char *table_name);
+size_t bpf_table_leaf_size_id(void *program, size_t id);
+// Text <-> binary conversion of keys/leaves, backed by the JITed
+// reader/writer functions in BPFModule (rw_engine).
+int bpf_table_key_snprintf(void *program, size_t id, char *buf, size_t buflen, const void *key);
+int bpf_table_leaf_snprintf(void *program, size_t id, char *buf, size_t buflen, const void *leaf);
+int bpf_table_key_sscanf(void *program, size_t id, const char *buf, void *key);
+int bpf_table_leaf_sscanf(void *program, size_t id, const char *buf, void *leaf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc
new file mode 100644
index 0000000..a8174be
--- /dev/null
+++ b/src/cc/bpf_module.cc
@@ -0,0 +1,1041 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <fcntl.h>
+#include <ftw.h>
+#include <map>
+#include <stdio.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <vector>
+#include <linux/bpf.h>
+
+#include <llvm/ADT/STLExtras.h>
+#include <llvm/ExecutionEngine/MCJIT.h>
+#include <llvm/ExecutionEngine/SectionMemoryManager.h>
+#include <llvm/IRReader/IRReader.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/IRPrintingPasses.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Verifier.h>
+#include <llvm/Object/ObjectFile.h>
+#include <llvm/Support/FormattedStream.h>
+#include <llvm/Support/Host.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/Support/TargetSelect.h>
+#include <llvm/Transforms/IPO.h>
+#include <llvm/Transforms/IPO/PassManagerBuilder.h>
+#include <llvm-c/Transforms/IPO.h>
+
+#include "common.h"
+#include "bcc_debug.h"
+#include "bcc_exception.h"
+#include "frontends/b/loader.h"
+#include "frontends/clang/loader.h"
+#include "frontends/clang/b_frontend_action.h"
+#include "bpf_module.h"
+#include "exported_files.h"
+#include "kbuild_helper.h"
+#include "libbpf.h"
+
+namespace ebpf {
+
+using std::get;
+using std::make_tuple;
+using std::map;
+using std::move;
+using std::string;
+using std::tuple;
+using std::unique_ptr;
+using std::vector;
+using namespace llvm;
+
+// Section-name prefix used to identify BPF function sections in the JIT
+// output; BPF_FN_PREFIX is defined elsewhere (bpf_module.h, presumably).
+const string BPFModule::FN_PREFIX = BPF_FN_PREFIX;
+
+// Snooping class to remember the sections as the JIT creates them.
+// It delegates all allocation to SectionMemoryManager and records each
+// section's (address, size) pair in the caller-owned `sections_` map,
+// keyed by section name, so BPFModule can later find function/map/license
+// sections by name.
+class MyMemoryManager : public SectionMemoryManager {
+ public:
+
+  explicit MyMemoryManager(map<string, tuple<uint8_t *, uintptr_t>> *sections)
+      : sections_(sections) {
+  }
+
+  virtual ~MyMemoryManager() {}
+  uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                               unsigned SectionID,
+                               StringRef SectionName) override {
+    uint8_t *Addr = SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, SectionName);
+    //printf("allocateCodeSection: %s Addr %p Size %ld Alignment %d SectionID %d\n",
+    //       SectionName.str().c_str(), (void *)Addr, Size, Alignment, SectionID);
+    (*sections_)[SectionName.str()] = make_tuple(Addr, Size);
+    return Addr;
+  }
+  uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                               unsigned SectionID, StringRef SectionName,
+                               bool isReadOnly) override {
+    uint8_t *Addr = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, isReadOnly);
+    //printf("allocateDataSection: %s Addr %p Size %ld Alignment %d SectionID %d RO %d\n",
+    //       SectionName.str().c_str(), (void *)Addr, Size, Alignment, SectionID, isReadOnly);
+    (*sections_)[SectionName.str()] = make_tuple(Addr, Size);
+    return Addr;
+  }
+  // Not owned; points at BPFModule::sections_ (or a temporary map).
+  map<string, tuple<uint8_t *, uintptr_t>> *sections_;
+};
+
+// Construct a module: initialize the native target (for the host-side
+// reader/writer JIT) and the BPF target (for the program itself), and fall
+// back to a private TableStorage when the caller supplies none.
+BPFModule::BPFModule(unsigned flags, TableStorage *ts, bool rw_engine_enabled,
+                     const std::string &maps_ns)
+    : flags_(flags),
+      rw_engine_enabled_(rw_engine_enabled),
+      used_b_loader_(false),
+      ctx_(new LLVMContext),
+      // The module's address doubles as a unique id, used as the
+      // TableStorage path prefix for this module's tables.
+      id_(std::to_string((uintptr_t)this)),
+      maps_ns_(maps_ns),
+      ts_(ts) {
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+  LLVMInitializeBPFTarget();
+  LLVMInitializeBPFTargetMC();
+  LLVMInitializeBPFTargetInfo();
+  LLVMInitializeBPFAsmPrinter();
+#if LLVM_MAJOR_VERSION >= 6
+  LLVMInitializeBPFAsmParser();
+  // The disassembler is only needed when dumping annotated source.
+  if (flags & DEBUG_SOURCE)
+    LLVMInitializeBPFDisassembler();
+#endif
+  LLVMLinkInMCJIT(); /* call empty function to force linking of MCJIT */
+  if (!ts_) {
+    local_ts_ = createSharedTableStorage();
+    ts_ = &*local_ts_;
+  }
+  func_src_ = ebpf::make_unique<FuncSource>();
+}
+
+// Fallback table callbacks installed at destruction time (see ~BPFModule):
+// they always fail, replacing callbacks bound to the dying module.
+static StatusTuple unimplemented_sscanf(const char *, void *) {
+  return StatusTuple(-1, "sscanf unimplemented");
+}
+static StatusTuple unimplemented_snprintf(char *, size_t, const void *) {
+  return StatusTuple(-1, "snprintf unimplemented");
+}
+
+BPFModule::~BPFModule() {
+  // Tables live in TableStorage, which may outlive this module; the
+  // snprintf/sscanf callbacks were bound to this object (see annotate()),
+  // so swap in stubs rather than leave dangling bindings.
+  for (auto &v : tables_) {
+    v->key_sscanf = unimplemented_sscanf;
+    v->leaf_sscanf = unimplemented_sscanf;
+    v->key_snprintf = unimplemented_snprintf;
+    v->leaf_snprintf = unimplemented_snprintf;
+  }
+
+  // When the rw engine is disabled, finalize() copied non-map sections into
+  // heap buffers we own; free them here. (Map sections were stored as NULL.)
+  if (!rw_engine_enabled_) {
+    for (auto section : sections_)
+      delete[] get<0>(section.second);
+  }
+
+  engine_.reset();
+  rw_engine_.reset();
+  ctx_.reset();
+  func_src_.reset();
+
+  // Drop all of this module's table entries from shared storage.
+  ts_->DeletePrefix(Path({id_}));
+}
+
+// Emit IR that calls fprintf(stderr, fmt, args...) at runtime — a debugging
+// aid for the generated reader/writer functions (only reachable from the
+// `if (0)` sites below).
+static void debug_printf(Module *mod, IRBuilder<> &B, const string &fmt, vector<Value *> args) {
+  GlobalVariable *fmt_gvar = B.CreateGlobalString(fmt, "fmt");
+  // Prepend the format string and then the stderr FILE* (as an i64 literal
+  // of the host pointer — only valid within this process).
+  args.insert(args.begin(), B.CreateInBoundsGEP(fmt_gvar, vector<Value *>({B.getInt64(0), B.getInt64(0)})));
+  args.insert(args.begin(), B.getInt64((uintptr_t)stderr));
+  Function *fprintf_fn = mod->getFunction("fprintf");
+  if (!fprintf_fn) {
+    // Declare variadic int fprintf(i64, i8*, ...) on first use.
+    vector<Type *> fprintf_fn_args({B.getInt64Ty(), B.getInt8PtrTy()});
+    FunctionType *fprintf_fn_type = FunctionType::get(B.getInt32Ty(), fprintf_fn_args, /*isvarArg=*/true);
+    fprintf_fn = Function::Create(fprintf_fn_type, GlobalValue::ExternalLinkage, "fprintf", mod);
+    fprintf_fn->setCallingConv(CallingConv::C);
+    fprintf_fn->addFnAttr(Attribute::NoUnwind);
+  }
+  B.CreateCall(fprintf_fn, args);
+}
+
+// Emit one sscanf call for the format accumulated so far, with error check
+// and input-pointer advance, then reset `args`/`fmt` for the next segment.
+// Pseudocode of the emitted IR:
+//   fmt += "%n";
+//   int nread = 0;
+//   int n = sscanf(s, fmt, args..., &nread);
+//   if (n < 0) return -1;
+//   s = &s[nread];
+static void finish_sscanf(IRBuilder<> &B, vector<Value *> *args, string *fmt,
+                          const map<string, Value *> &locals, bool exact_args) {
+  // fmt += "%n";
+  // int nread = 0;
+  // int n = sscanf(s, fmt, args..., &nread);
+  // if (n < 0) return -1;
+  // s = &s[nread];
+  Value *sptr = locals.at("sptr");
+  Value *nread = locals.at("nread");
+  Function *cur_fn = B.GetInsertBlock()->getParent();
+  Function *sscanf_fn = B.GetInsertBlock()->getModule()->getFunction("sscanf");
+  *fmt += "%n";
+  B.CreateStore(B.getInt32(0), nread);
+  GlobalVariable *fmt_gvar = B.CreateGlobalString(*fmt, "fmt");
+  // args[0] = current input cursor, args[1] = format string; the field
+  // pointers pushed by parse_type() follow, then &nread for %n.
+  (*args)[1] = B.CreateInBoundsGEP(fmt_gvar, {B.getInt64(0), B.getInt64(0)});
+  (*args)[0] = B.CreateLoad(sptr);
+  args->push_back(nread);
+  CallInst *call = B.CreateCall(sscanf_fn, *args);
+  call->setTailCall(true);
+
+  BasicBlock *label_true = BasicBlock::Create(B.getContext(), "", cur_fn);
+  BasicBlock *label_false = BasicBlock::Create(B.getContext(), "", cur_fn);
+
+  // exact_args means fail if don't consume exact number of "%" inputs
+  // exact_args is disabled for string parsing (empty case)
+  // (args->size() - 3 = number of field conversions: total minus s, fmt, nread;
+  // %n does not count toward sscanf's return value.)
+  Value *cond = exact_args ? B.CreateICmpNE(call, B.getInt32(args->size() - 3))
+                           : B.CreateICmpSLT(call, B.getInt32(0));
+  B.CreateCondBr(cond, label_true, label_false);
+
+  B.SetInsertPoint(label_true);
+  B.CreateRet(B.getInt32(-1));
+
+  B.SetInsertPoint(label_false);
+  // s = &s[nread];  (second CreateLoad arg marks the load volatile)
+  B.CreateStore(
+      B.CreateInBoundsGEP(B.CreateLoad(sptr), B.CreateLoad(nread, true)), sptr);
+
+  // Keep the two leading slots (s, fmt) and start a fresh format segment.
+  args->resize(2);
+  fmt->clear();
+}
+
+// recursive helper to capture the arguments
+// Walks an LLVM type, appending printf/scanf conversions to `fmt` and
+// pushing the matching field pointers/values onto `args`. `is_writer`
+// selects the snprintf form (hex output) vs the sscanf form (%i input).
+static void parse_type(IRBuilder<> &B, vector<Value *> *args, string *fmt,
+                       Type *type, Value *out,
+                       const map<string, Value *> &locals, bool is_writer) {
+  if (StructType *st = dyn_cast<StructType>(type)) {
+    // struct -> "{ field field ... }", recursing per element.
+    *fmt += "{ ";
+    unsigned idx = 0;
+    for (auto field : st->elements()) {
+      parse_type(B, args, fmt, field, B.CreateStructGEP(type, out, idx++),
+                 locals, is_writer);
+      *fmt += " ";
+    }
+    *fmt += "}";
+  } else if (ArrayType *at = dyn_cast<ArrayType>(type)) {
+    if (at->getElementType() == B.getInt8Ty()) {
+      // treat i8[] as a char string instead of as an array of u8's
+      if (is_writer) {
+        *fmt += "\"%s\"";
+        args->push_back(out);
+      } else {
+        // When reading strings, scanf doesn't support empty "", so we need to
+        // break this up into multiple scanf calls. To understand it, let's take
+        // an example:
+        // struct Event {
+        //   u32 a;
+        //   struct {
+        //     char x[64];
+        //     int y;
+        //   } b[2];
+        //   u32 c;
+        // };
+        // The writer string would look like:
+        //  "{ 0x%x [ { \"%s\" 0x%x } { \"%s\" 0x%x } ] 0x%x }"
+        // But the reader string needs to restart at each \"\".
+        //  reader0(const char *s, struct Event *val) {
+        //    int nread, rc;
+        //    nread = 0;
+        //    rc = sscanf(s, "{ %i [ { \"%n", &val->a, &nread);
+        //    if (rc != 1) return -1;
+        //    s += nread; nread = 0;
+        //    rc = sscanf(s, "%[^\"]%n", &val->b[0].x, &nread);
+        //    if (rc < 0) return -1;
+        //    s += nread; nread = 0;
+        //    rc = sscanf(s, "\" %i } { \"%n", &val->b[0].y, &nread);
+        //    if (rc != 1) return -1;
+        //    s += nread; nread = 0;
+        //    rc = sscanf(s, "%[^\"]%n", &val->b[1].x, &nread);
+        //    if (rc < 0) return -1;
+        //    s += nread; nread = 0;
+        //    rc = sscanf(s, "\" %i } ] %i }%n", &val->b[1].y, &val->c, &nread);
+        //    if (rc != 2) return -1;
+        //    s += nread; nread = 0;
+        //    return 0;
+        //  }
+        *fmt += "\"";
+        finish_sscanf(B, args, fmt, locals, true);
+
+        *fmt = "%[^\"]";
+        args->push_back(out);
+        finish_sscanf(B, args, fmt, locals, false);
+
+        *fmt = "\"";
+      }
+    } else {
+      // Non-char arrays -> "[ elem elem ... ]".
+      *fmt += "[ ";
+      for (size_t i = 0; i < at->getNumElements(); ++i) {
+        parse_type(B, args, fmt, at->getElementType(),
+                   B.CreateStructGEP(type, out, i), locals, is_writer);
+        *fmt += " ";
+      }
+      *fmt += "]";
+    }
+  } else if (isa<PointerType>(type)) {
+    // NOTE(review): this emits the literal text "0xlx"/"0xli" — there is no
+    // '%' so it is not a conversion, and no argument is pushed. Pointers are
+    // effectively placeholders in the format; confirm whether "0x%l" was
+    // intended upstream before changing.
+    *fmt += "0xl";
+    if (is_writer)
+      *fmt += "x";
+    else
+      *fmt += "i";
+  } else if (IntegerType *it = dyn_cast<IntegerType>(type)) {
+    // Integers: writer prints "0x%<len>x", reader scans "%<len>i", with the
+    // length modifier chosen from the bit width (hh/h/none/l).
+    if (is_writer)
+      *fmt += "0x";
+    if (it->getBitWidth() <= 8)
+      *fmt += "%hh";
+    else if (it->getBitWidth() <= 16)
+      *fmt += "%h";
+    else if (it->getBitWidth() <= 32)
+      *fmt += "%";
+    else
+      *fmt += "%l";
+    if (is_writer)
+      *fmt += "x";
+    else
+      *fmt += "i";
+    // Writer passes the value itself; reader passes the destination pointer.
+    args->push_back(is_writer ? B.CreateLoad(out) : out);
+  }
+}
+
+// make_reader generates a dynamic function in the instruction set of the host
+// (not bpf) that is able to convert c-strings in the pretty-print format of
+// make_writer back into binary representations. The encoding of the string
+// takes the llvm ir structure format, which closely maps the c structure but
+// not exactly (no support for unions for instance).
+// The general algorithm is:
+//  pod types (u8..u64)                <= %i
+//  array types
+//   u8[]  no nested quotes :(         <= "..."
+//   !u8[]                             <= [ %i %i ... ]
+//  struct types
+//   struct { u8 a; u64 b; }           <= { %i %i }
+//  nesting is supported
+//   struct { struct { u8 a[]; }; }    <= { "" }
+//   struct { struct { u64 a[]; }; }   <= { [ %i %i .. ] }
+// Returns the generated function's name ("readerN"); results are memoized
+// per LLVM type in readers_.
+string BPFModule::make_reader(Module *mod, Type *type) {
+  auto fn_it = readers_.find(type);
+  if (fn_it != readers_.end())
+    return fn_it->second;
+
+  // int read(const char *in, Type *out) {
+  //   int n = sscanf(in, "{ %i ... }", &out->field1, ...);
+  //   if (n != num_fields) return -1;
+  //   return 0;
+  // }
+
+  IRBuilder<> B(*ctx_);
+
+  // Declare variadic int sscanf(i8*, i8*, ...) once per module.
+  FunctionType *sscanf_fn_type = FunctionType::get(
+      B.getInt32Ty(), {B.getInt8PtrTy(), B.getInt8PtrTy()}, /*isVarArg=*/true);
+  Function *sscanf_fn = mod->getFunction("sscanf");
+  if (!sscanf_fn) {
+    sscanf_fn = Function::Create(sscanf_fn_type, GlobalValue::ExternalLinkage,
+                                 "sscanf", mod);
+    sscanf_fn->setCallingConv(CallingConv::C);
+    sscanf_fn->addFnAttr(Attribute::NoUnwind);
+  }
+
+  string name = "reader" + std::to_string(readers_.size());
+  vector<Type *> fn_args({B.getInt8PtrTy(), PointerType::getUnqual(type)});
+  FunctionType *fn_type = FunctionType::get(B.getInt32Ty(), fn_args, /*isVarArg=*/false);
+  Function *fn =
+      Function::Create(fn_type, GlobalValue::ExternalLinkage, name, mod);
+  auto arg_it = fn->arg_begin();
+  Argument *arg_in = &*arg_it;
+  ++arg_it;
+  arg_in->setName("in");
+  Argument *arg_out = &*arg_it;
+  ++arg_it;
+  arg_out->setName("out");
+
+  BasicBlock *label_entry = BasicBlock::Create(*ctx_, "entry", fn);
+  B.SetInsertPoint(label_entry);
+
+  // Locals shared with finish_sscanf: the %n byte counter and the moving
+  // input cursor.
+  Value *nread = B.CreateAlloca(B.getInt32Ty());
+  Value *sptr = B.CreateAlloca(B.getInt8PtrTy());
+  map<string, Value *> locals{{"nread", nread}, {"sptr", sptr}};
+  B.CreateStore(arg_in, sptr);
+  // Slots 0/1 are filled in by finish_sscanf (input cursor, format string).
+  vector<Value *> args({nullptr, nullptr});
+  string fmt;
+  parse_type(B, &args, &fmt, type, arg_out, locals, false);
+
+  if (0)
+    debug_printf(mod, B, "%p %p\n", vector<Value *>({arg_in, arg_out}));
+
+  // Flush whatever format remains after the last string segment.
+  finish_sscanf(B, &args, &fmt, locals, true);
+
+  B.CreateRet(B.getInt32(0));
+
+  readers_[type] = name;
+  return name;
+}
+
+// make_writer generates a dynamic function in the instruction set of the host
+// (not bpf) that is able to pretty-print key/leaf entries as a c-string. The
+// encoding of the string takes the llvm ir structure format, which closely maps
+// the c structure but not exactly (no support for unions for instance).
+// The general algorithm is:
+//  pod types (u8..u64)                => 0x%x
+//  array types
+//   u8[]                              => "..."
+//   !u8[]                             => [ 0x%x 0x%x ... ]
+//  struct types
+//   struct { u8 a; u64 b; }           => { 0x%x 0x%x }
+//  nesting is supported
+//   struct { struct { u8 a[]; }; }    => { "" }
+//   struct { struct { u64 a[]; }; }   => { [ 0x%x 0x%x .. ] }
+// Returns the generated function's name ("writerN"); memoized in writers_.
+string BPFModule::make_writer(Module *mod, Type *type) {
+  auto fn_it = writers_.find(type);
+  if (fn_it != writers_.end())
+    return fn_it->second;
+
+  // int write(char *out, int len, Type *in) {
+  //   return snprintf(out, len, "{ 0x%x ... }", in->field1, ...);
+  // }
+
+  IRBuilder<> B(*ctx_);
+
+  string name = "writer" + std::to_string(writers_.size());
+  vector<Type *> fn_args({B.getInt8PtrTy(), B.getInt64Ty(), PointerType::getUnqual(type)});
+  FunctionType *fn_type = FunctionType::get(B.getInt32Ty(), fn_args, /*isVarArg=*/false);
+  Function *fn =
+      Function::Create(fn_type, GlobalValue::ExternalLinkage, name, mod);
+  auto arg_it = fn->arg_begin();
+  Argument *arg_out = &*arg_it;
+  ++arg_it;
+  arg_out->setName("out");
+  Argument *arg_len = &*arg_it;
+  ++arg_it;
+  arg_len->setName("len");
+  Argument *arg_in = &*arg_it;
+  ++arg_it;
+  arg_in->setName("in");
+
+  BasicBlock *label_entry = BasicBlock::Create(*ctx_, "entry", fn);
+  B.SetInsertPoint(label_entry);
+
+  // parse_type() only consults "nread"/"sptr" on the reader path, but pass
+  // a locals map for signature symmetry.
+  map<string, Value *> locals{
+      {"nread", B.CreateAlloca(B.getInt64Ty())},
+  };
+  // Slot 2 (the format string) is filled in after parse_type runs.
+  vector<Value *> args({arg_out, B.CreateZExt(arg_len, B.getInt64Ty()), nullptr});
+  string fmt;
+  parse_type(B, &args, &fmt, type, arg_in, locals, true);
+
+  GlobalVariable *fmt_gvar = B.CreateGlobalString(fmt, "fmt");
+
+  args[2] = B.CreateInBoundsGEP(fmt_gvar, vector<Value *>({B.getInt64(0), B.getInt64(0)}));
+
+  if (0)
+    debug_printf(mod, B, "%d %p %p\n", vector<Value *>({arg_len, arg_out, arg_in}));
+
+  // Declare variadic int snprintf(i8*, i64, i8*, ...) on first use.
+  vector<Type *> snprintf_fn_args({B.getInt8PtrTy(), B.getInt64Ty(), B.getInt8PtrTy()});
+  FunctionType *snprintf_fn_type = FunctionType::get(B.getInt32Ty(), snprintf_fn_args, /*isVarArg=*/true);
+  Function *snprintf_fn = mod->getFunction("snprintf");
+  if (!snprintf_fn)
+    snprintf_fn = Function::Create(snprintf_fn_type, GlobalValue::ExternalLinkage, "snprintf", mod);
+  snprintf_fn->setCallingConv(CallingConv::C);
+  snprintf_fn->addFnAttr(Attribute::NoUnwind);
+
+  CallInst *call = B.CreateCall(snprintf_fn, args);
+  call->setTailCall(true);
+
+  // Return snprintf's result directly (callers check for truncation).
+  B.CreateRet(call);
+
+  writers_[type] = name;
+  return name;
+}
+
+// Run the optimization passes over the reader/writer module and JIT it on
+// the host with MCJIT. Returns nullptr (after logging) on engine-creation
+// failure.
+unique_ptr<ExecutionEngine> BPFModule::finalize_rw(unique_ptr<Module> m) {
+  Module *mod = &*m;
+
+  run_pass_manager(*mod);
+
+  string err;
+  EngineBuilder builder(move(m));
+  builder.setErrorStr(&err);
+  // Stay on classic MCJIT rather than the ORC replacement.
+  builder.setUseOrcMCJITReplacement(false);
+  auto engine = unique_ptr<ExecutionEngine>(builder.create());
+  if (!engine)
+    fprintf(stderr, "Could not create ExecutionEngine: %s\n", err.c_str());
+  return engine;
+}
+
+// load an entire c file as a module
+// `in_memory` selects whether `file` is a path or the source text itself.
+// Returns 0 on success, -1 on a clang front-end failure.
+int BPFModule::load_cfile(const string &file, bool in_memory, const char *cflags[], int ncflags) {
+  ClangLoader clang_loader(&*ctx_, flags_);
+  if (clang_loader.parse(&mod_, *ts_, file, in_memory, cflags, ncflags, id_,
+                         *func_src_, mod_src_, maps_ns_))
+    return -1;
+  return 0;
+}
+
+// NOTE: this is a duplicate of the above, but planning to deprecate if we
+// settle on clang as the frontend
+
+// Load in a pre-built list of functions into the initial Module object, then
+// build an ExecutionEngine.
+// Like load_cfile but always in-memory, with no extra cflags, an empty id
+// and no maps namespace.
+int BPFModule::load_includes(const string &text) {
+  ClangLoader clang_loader(&*ctx_, flags_);
+  if (clang_loader.parse(&mod_, *ts_, text, true, nullptr, 0, "", *func_src_,
+                         mod_src_, ""))
+    return -1;
+  return 0;
+}
+
+// Lightweight variant of annotate(): marks functions always-inline and
+// indexes this module's tables, but does NOT build the JITed reader/writer
+// functions (used when the rw engine is disabled).
+void BPFModule::annotate_light() {
+  // Force inlining of everything not explicitly noinline, as required for
+  // BPF code generation.
+  for (auto fn = mod_->getFunctionList().begin(); fn != mod_->getFunctionList().end(); ++fn)
+    if (!fn->hasFnAttribute(Attribute::NoInline))
+      fn->addFnAttr(Attribute::AlwaysInline);
+
+  // Collect this module's tables from shared storage and assign dense ids.
+  size_t id = 0;
+  Path path({id_});
+  for (auto it = ts_->lower_bound(path), up = ts_->upper_bound(path); it != up; ++it) {
+    TableDesc &table = it->second;
+    tables_.push_back(&it->second);
+    table_names_[table.name] = id++;
+  }
+}
+
+// Full annotation pass: inline-mark all functions, index tables, and for
+// every table whose global has a struct type of at least {key, leaf},
+// generate and JIT host-side reader/writer functions for text <-> binary
+// conversion of keys and leaves. Returns 0 on success, -1 if the rw engine
+// could not be built.
+int BPFModule::annotate() {
+  for (auto fn = mod_->getFunctionList().begin(); fn != mod_->getFunctionList().end(); ++fn)
+    if (!fn->hasFnAttribute(Attribute::NoInline))
+      fn->addFnAttr(Attribute::AlwaysInline);
+
+  // separate module to hold the reader functions
+  auto m = ebpf::make_unique<Module>("sscanf", *ctx_);
+
+  size_t id = 0;
+  Path path({id_});
+  for (auto it = ts_->lower_bound(path), up = ts_->upper_bound(path); it != up; ++it) {
+    TableDesc &table = it->second;
+    tables_.push_back(&it->second);
+    table_names_[table.name] = id++;
+    // The table's global variable (if any) carries the key/leaf types as
+    // the first two elements of its pointee struct.
+    GlobalValue *gvar = mod_->getNamedValue(table.name);
+    if (!gvar) continue;
+    if (PointerType *pt = dyn_cast<PointerType>(gvar->getType())) {
+      if (StructType *st = dyn_cast<StructType>(pt->getElementType())) {
+        if (st->getNumElements() < 2) continue;
+        Type *key_type = st->elements()[0];
+        Type *leaf_type = st->elements()[1];
+
+        // Bind the table's conversion callbacks to this module's JITed
+        // reader/writer functions (unbound again in ~BPFModule).
+        using std::placeholders::_1;
+        using std::placeholders::_2;
+        using std::placeholders::_3;
+        table.key_sscanf = std::bind(&BPFModule::sscanf, this,
+                                     make_reader(&*m, key_type), _1, _2);
+        table.leaf_sscanf = std::bind(&BPFModule::sscanf, this,
+                                      make_reader(&*m, leaf_type), _1, _2);
+        table.key_snprintf = std::bind(&BPFModule::snprintf, this,
+                                       make_writer(&*m, key_type), _1, _2, _3);
+        table.leaf_snprintf =
+            std::bind(&BPFModule::snprintf, this, make_writer(&*m, leaf_type),
+                      _1, _2, _3);
+      }
+    }
+  }
+
+  rw_engine_ = finalize_rw(move(m));
+  if (!rw_engine_)
+    return -1;
+  return 0;
+}
+
+// Invoke the JITed reader function `fn_name` to parse `str` into `val`.
+// Fails if the rw engine is disabled, the function is missing, or the
+// reader itself returns a negative code.
+StatusTuple BPFModule::sscanf(string fn_name, const char *str, void *val) {
+  if (!rw_engine_enabled_)
+    return StatusTuple(-1, "rw_engine not enabled");
+  auto fn =
+      (int (*)(const char *, void *))rw_engine_->getFunctionAddress(fn_name);
+  if (!fn)
+    return StatusTuple(-1, "sscanf not available");
+  int rc = fn(str, val);
+  if (rc < 0)
+    return StatusTuple(rc, "error in sscanf: %s", std::strerror(errno));
+  return StatusTuple(rc);
+}
+
+// Invoke the JITed writer function `fn_name` to pretty-print `val` into
+// `str` (capacity `sz`). A result equal to `sz` is treated as truncation.
+StatusTuple BPFModule::snprintf(string fn_name, char *str, size_t sz,
+                                const void *val) {
+  if (!rw_engine_enabled_)
+    return StatusTuple(-1, "rw_engine not enabled");
+  auto fn = (int (*)(char *, size_t,
+                     const void *))rw_engine_->getFunctionAddress(fn_name);
+  if (!fn)
+    return StatusTuple(-1, "snprintf not available");
+  int rc = fn(str, sz, val);
+  if (rc < 0)
+    return StatusTuple(rc, "error in snprintf: %s", std::strerror(errno));
+  if ((size_t)rc == sz)
+    return StatusTuple(-1, "buffer of size %zd too small", sz);
+  return StatusTuple(0);
+}
+
+// Print the module's LLVM IR to stderr (debugging aid for DEBUG_LLVM_IR).
+void BPFModule::dump_ir(Module &mod) {
+  legacy::PassManager PM;
+  PM.add(createPrintModulePass(errs()));
+  PM.run(mod);
+}
+
+// Verify the module, then run the -O3 pipeline plus forced inlining.
+// Returns 0 on success, -1 if verification fails (dumping IR first when
+// DEBUG_LLVM_IR is set).
+int BPFModule::run_pass_manager(Module &mod) {
+  if (verifyModule(mod, &errs())) {
+    if (flags_ & DEBUG_LLVM_IR)
+      dump_ir(mod);
+    return -1;
+  }
+
+  legacy::PassManager PM;
+  PassManagerBuilder PMB;
+  PMB.OptLevel = 3;
+  PM.add(createFunctionInliningPass());
+  /*
+   * llvm < 4.0 needs
+   * PM.add(createAlwaysInlinerPass());
+   * llvm >= 4.0 needs
+   * PM.add(createAlwaysInlinerLegacyPass());
+   * use below 'stable' workaround
+   */
+  LLVMAddAlwaysInlinerPass(reinterpret_cast<LLVMPassManagerRef>(&PM));
+  PMB.populateModulePassManager(PM);
+  if (flags_ & DEBUG_LLVM_IR)
+    PM.add(createPrintModulePass(outs()));
+  PM.run(mod);
+  return 0;
+}
+
+// Compile the BPF module to machine sections via MCJIT, record the sections
+// by name, and build the function-name index. When the rw engine is
+// disabled, the LLVM engine/context are torn down immediately and the
+// section bytes are copied into heap buffers owned by this object.
+// Returns 0 on success, -1 on engine or pass-manager failure.
+int BPFModule::finalize() {
+  Module *mod = &*mod_;
+  std::map<std::string, std::tuple<uint8_t *, uintptr_t>> tmp_sections,
+      *sections_p;
+
+  mod->setTargetTriple("bpf-pc-linux");
+  // With the rw engine alive the JIT memory stays valid, so record sections
+  // directly; otherwise stage them in tmp_sections and copy below.
+  sections_p = rw_engine_enabled_ ? &sections_ : &tmp_sections;
+
+  string err;
+  EngineBuilder builder(move(mod_));
+  builder.setErrorStr(&err);
+  builder.setMCJITMemoryManager(ebpf::make_unique<MyMemoryManager>(sections_p));
+  builder.setMArch("bpf");
+  builder.setUseOrcMCJITReplacement(false);
+  engine_ = unique_ptr<ExecutionEngine>(builder.create());
+  if (!engine_) {
+    fprintf(stderr, "Could not create ExecutionEngine: %s\n", err.c_str());
+    return -1;
+  }
+
+  // Keep debug sections around so the source debugger can read them.
+  if (flags_ & DEBUG_SOURCE)
+    engine_->setProcessAllSections(true);
+
+  if (int rc = run_pass_manager(*mod))
+    return rc;
+
+  engine_->finalizeObject();
+
+  if (flags_ & DEBUG_SOURCE) {
+    SourceDebugger src_debugger(mod, *sections_p, FN_PREFIX, mod_src_,
+                                src_dbg_fmap_);
+    src_debugger.dump();
+  }
+
+  if (!rw_engine_enabled_) {
+    // Setup sections_ correctly and then free llvm internal memory
+    for (auto section : tmp_sections) {
+      auto fname = section.first;
+      uintptr_t size = get<1>(section.second);
+      uint8_t *tmp_p = NULL;
+      // Only copy data for non-map sections
+      // (map sections keep a NULL pointer; ~BPFModule deletes these buffers)
+      if (strncmp("maps/", section.first.c_str(), 5)) {
+        uint8_t *addr = get<0>(section.second);
+        tmp_p = new uint8_t[size];
+        memcpy(tmp_p, addr, size);
+      }
+      sections_[fname] = make_tuple(tmp_p, size);
+    }
+    engine_.reset();
+    ctx_.reset();
+  }
+
+  // give functions an id
+  for (auto section : sections_)
+    if (!strncmp(FN_PREFIX.c_str(), section.first.c_str(), FN_PREFIX.size()))
+      function_names_.push_back(section.first);
+
+  return 0;
+}
+
+// Number of BPF functions discovered in the compiled sections.
+size_t BPFModule::num_functions() const {
+  return function_names_.size();
+}
+
+// Function name by id, with the section prefix (FN_PREFIX) stripped;
+// nullptr for an out-of-range id.
+const char * BPFModule::function_name(size_t id) const {
+  if (id >= function_names_.size())
+    return nullptr;
+  return function_names_[id].c_str() + FN_PREFIX.size();
+}
+
+// Start address of the compiled instructions for function `id`;
+// nullptr if unknown.
+uint8_t * BPFModule::function_start(size_t id) const {
+  if (id >= function_names_.size())
+    return nullptr;
+  auto section = sections_.find(function_names_[id]);
+  if (section == sections_.end())
+    return nullptr;
+  return get<0>(section->second);
+}
+
+// Start address by (unprefixed) function name; nullptr if unknown.
+uint8_t * BPFModule::function_start(const string &name) const {
+  auto section = sections_.find(FN_PREFIX + name);
+  if (section == sections_.end())
+    return nullptr;
+
+  return get<0>(section->second);
+}
+
+// Original C source of a function, as recorded by the clang front end.
+const char * BPFModule::function_source(const string &name) const {
+  return func_src_->src(name);
+}
+
+// Rewritten (post-frontend-transform) source of a function.
+const char * BPFModule::function_source_rewritten(const string &name) const {
+  return func_src_->src_rewritten(name);
+}
+
+// Cross-check the locally computed program tag against the kernel's, then
+// persist the program's original, rewritten, and (when available)
+// disassembled source under BCC_PROG_TAG_DIR/bpf_prog_<tag>/.
+// Returns 0 on success, nonzero/-1 on tag mismatch or filesystem errors.
+int BPFModule::annotate_prog_tag(const string &name, int prog_fd,
+                                 struct bpf_insn *insns, int prog_len) {
+  unsigned long long tag1, tag2;
+  int err;
+
+  err = bpf_prog_compute_tag(insns, prog_len, &tag1);
+  if (err)
+    return err;
+  err = bpf_prog_get_tag(prog_fd, &tag2);
+  if (err)
+    return err;
+  if (tag1 != tag2) {
+    fprintf(stderr, "prog tag mismatch %llx %llx\n", tag1, tag2);
+    return -1;
+  }
+
+  // EEXIST is fine: the directory may persist across runs/programs.
+  err = mkdir(BCC_PROG_TAG_DIR, 0777);
+  if (err && errno != EEXIST) {
+    fprintf(stderr, "cannot create " BCC_PROG_TAG_DIR "\n");
+    return -1;
+  }
+
+  char buf[128];
+  ::snprintf(buf, sizeof(buf), BCC_PROG_TAG_DIR "/bpf_prog_%llx", tag1);
+  err = mkdir(buf, 0777);
+  if (err && errno != EEXIST) {
+    fprintf(stderr, "cannot create %s\n", buf);
+    return -1;
+  }
+
+  ::snprintf(buf, sizeof(buf), BCC_PROG_TAG_DIR "/bpf_prog_%llx/%s.c",
+             tag1, name.data());
+  // FileDesc presumably closes the fd on destruction/reassignment — confirm.
+  FileDesc fd(open(buf, O_CREAT | O_WRONLY | O_TRUNC, 0644));
+  if (fd < 0) {
+    fprintf(stderr, "cannot create %s\n", buf);
+    return -1;
+  }
+
+  // NOTE(review): write() return values are ignored throughout this
+  // function, and function_source() is not checked for nullptr before
+  // strlen(); best-effort persistence, but worth confirming upstream.
+  const char *src = function_source(name);
+  write(fd, src, strlen(src));
+
+  ::snprintf(buf, sizeof(buf), BCC_PROG_TAG_DIR "/bpf_prog_%llx/%s.rewritten.c",
+             tag1, name.data());
+  fd = open(buf, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+  if (fd < 0) {
+    fprintf(stderr, "cannot create %s\n", buf);
+    return -1;
+  }
+
+  src = function_source_rewritten(name);
+  write(fd, src, strlen(src));
+
+  // Disassembly is only present when DEBUG_SOURCE populated src_dbg_fmap_.
+  if (!src_dbg_fmap_[name].empty()) {
+    ::snprintf(buf, sizeof(buf), BCC_PROG_TAG_DIR "/bpf_prog_%llx/%s.dis.txt",
+               tag1, name.data());
+    fd = open(buf, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+    if (fd < 0) {
+      fprintf(stderr, "cannot create %s\n", buf);
+      return -1;
+    }
+
+    const char *src = src_dbg_fmap_[name].c_str();
+    write(fd, src, strlen(src));
+  }
+
+  return 0;
+}
+
+// Size in bytes of function `id`'s compiled section; 0 if unknown.
+size_t BPFModule::function_size(size_t id) const {
+  if (id >= function_names_.size())
+    return 0;
+  auto section = sections_.find(function_names_[id]);
+  if (section == sections_.end())
+    return 0;
+  return get<1>(section->second);
+}
+
+// Size in bytes by (unprefixed) function name; 0 if unknown.
+size_t BPFModule::function_size(const string &name) const {
+  auto section = sections_.find(FN_PREFIX + name);
+  if (section == sections_.end())
+    return 0;
+
+  return get<1>(section->second);
+}
+
+// Raw contents of the "license" section; nullptr if absent.
+char * BPFModule::license() const {
+  auto section = sections_.find("license");
+  if (section == sections_.end())
+    return nullptr;
+
+  return (char *)get<0>(section->second);
+}
+
+// Kernel version recorded in the "version" section; 0 if absent.
+unsigned BPFModule::kern_version() const {
+  auto section = sections_.find("version");
+  if (section == sections_.end())
+    return 0;
+
+  return *(unsigned *)get<0>(section->second);
+}
+
+size_t BPFModule::num_tables() const { return tables_.size(); }
+
+size_t BPFModule::table_id(const string &name) const {
+  auto it = table_names_.find(name);
+  if (it == table_names_.end()) return ~0ull;
+  return it->second;
+}
+
+int BPFModule::table_fd(const string &name) const {
+  return table_fd(table_id(name));
+}
+
+int BPFModule::table_fd(size_t id) const {
+  if (id >= tables_.size())
+    return -1;
+  return tables_[id]->fd;
+}
+
+int BPFModule::table_type(const string &name) const {
+  return table_type(table_id(name));
+}
+
+int BPFModule::table_type(size_t id) const {
+  if (id >= tables_.size())
+    return -1;
+  return tables_[id]->type;
+}
+
+size_t BPFModule::table_max_entries(const string &name) const {
+  return table_max_entries(table_id(name));
+}
+
+size_t BPFModule::table_max_entries(size_t id) const {
+  if (id >= tables_.size())
+    return 0;
+  return tables_[id]->max_entries;
+}
+
+// Map-creation flags of the named table, or -1 if the name is unknown.
+int BPFModule::table_flags(const string &name) const {
+  return table_flags(table_id(name));
+}
+
+// Map-creation flags of table `id`, or -1 when id is out of range.
+int BPFModule::table_flags(size_t id) const {
+  if (id >= tables_.size())
+    return -1;
+  return tables_[id]->flags;
+}
+
+// Name of table `id`, or nullptr when id is out of range.  The returned
+// pointer aliases internal storage and stays valid only as long as the
+// table descriptor does.
+const char * BPFModule::table_name(size_t id) const {
+  if (id >= tables_.size())
+    return nullptr;
+  return tables_[id]->name.c_str();
+}
+
+// Textual description of the key type of table `id`, or nullptr when id is
+// out of range.  Unavailable (nullptr) when the module was built by the B
+// loader, which does not populate descriptions.
+const char * BPFModule::table_key_desc(size_t id) const {
+  if (used_b_loader_) return nullptr;
+  if (id >= tables_.size())
+    return nullptr;
+  return tables_[id]->key_desc.c_str();
+}
+
+// Key description lookup by table name; nullptr if the name is unknown.
+const char * BPFModule::table_key_desc(const string &name) const {
+  return table_key_desc(table_id(name));
+}
+
+// Textual description of the leaf (value) type of table `id`; same
+// availability rules as table_key_desc.
+const char * BPFModule::table_leaf_desc(size_t id) const {
+  if (used_b_loader_) return nullptr;
+  if (id >= tables_.size())
+    return nullptr;
+  return tables_[id]->leaf_desc.c_str();
+}
+
+// Leaf description lookup by table name; nullptr if the name is unknown.
+const char * BPFModule::table_leaf_desc(const string &name) const {
+  return table_leaf_desc(table_id(name));
+}
+// Size in bytes of the key of table `id`, or 0 when id is out of range.
+size_t BPFModule::table_key_size(size_t id) const {
+  if (id >= tables_.size())
+    return 0;
+  return tables_[id]->key_size;
+}
+// Key size lookup by table name; 0 if the name is unknown.
+size_t BPFModule::table_key_size(const string &name) const {
+  return table_key_size(table_id(name));
+}
+
+// Size in bytes of the leaf (value) of table `id`, or 0 when out of range.
+size_t BPFModule::table_leaf_size(size_t id) const {
+  if (id >= tables_.size())
+    return 0;
+  return tables_[id]->leaf_size;
+}
+// Leaf size lookup by table name; 0 if the name is unknown.
+size_t BPFModule::table_leaf_size(const string &name) const {
+  return table_leaf_size(table_id(name));
+}
+
+// Scratch buffers sized for one key/leaf pair of a table, used while
+// iterating table entries.
+struct TableIterator {
+  TableIterator(size_t key_size, size_t leaf_size)
+      : key(new uint8_t[key_size]), leaf(new uint8_t[leaf_size]) {
+  }
+  unique_ptr<uint8_t[]> key;
+  unique_ptr<uint8_t[]> leaf;
+  // NOTE(review): keyb is never referenced in this file's visible code —
+  // confirm whether it is still needed or is dead weight (512 bytes/iterator).
+  uint8_t keyb[512];
+};
+
+// Format the binary `key` of table `id` into `buf` using the table's
+// generated key_snprintf writer.  Returns 0 on success; on failure logs the
+// error to stderr and returns -1.
+int BPFModule::table_key_printf(size_t id, char *buf, size_t buflen, const void *key) {
+  if (id >= tables_.size())
+    return -1;
+  const TableDesc &desc = *tables_[id];
+  StatusTuple rc = desc.key_snprintf(buf, buflen, key);
+  if (rc.code() < 0) {
+    fprintf(stderr, "%s\n", rc.msg().c_str());
+    return -1;
+  }
+  return 0;
+}
+
+// Format the binary `leaf` (value) of table `id` into `buf`; same contract
+// as table_key_printf.
+int BPFModule::table_leaf_printf(size_t id, char *buf, size_t buflen, const void *leaf) {
+  if (id >= tables_.size())
+    return -1;
+  const TableDesc &desc = *tables_[id];
+  StatusTuple rc = desc.leaf_snprintf(buf, buflen, leaf);
+  if (rc.code() < 0) {
+    fprintf(stderr, "%s\n", rc.msg().c_str());
+    return -1;
+  }
+  return 0;
+}
+
+// Parse the textual `key_str` into the binary key buffer `key` using the
+// table's generated key_sscanf reader.  Returns 0 on success; on failure
+// logs the error to stderr and returns -1.
+int BPFModule::table_key_scanf(size_t id, const char *key_str, void *key) {
+  if (id >= tables_.size())
+    return -1;
+  const TableDesc &desc = *tables_[id];
+  StatusTuple rc = desc.key_sscanf(key_str, key);
+  if (rc.code() < 0) {
+    fprintf(stderr, "%s\n", rc.msg().c_str());
+    return -1;
+  }
+  return 0;
+}
+
+// Parse the textual `leaf_str` into the binary leaf buffer `leaf`; same
+// contract as table_key_scanf.
+int BPFModule::table_leaf_scanf(size_t id, const char *leaf_str, void *leaf) {
+  if (id >= tables_.size())
+    return -1;
+  const TableDesc &desc = *tables_[id];
+  StatusTuple rc = desc.leaf_sscanf(leaf_str, leaf);
+  if (rc.code() < 0) {
+    fprintf(stderr, "%s\n", rc.msg().c_str());
+    return -1;
+  }
+  return 0;
+}
+
+// load a B file, which comes in two parts
+// Returns 0 on success, non-zero on error.  May only be called once per
+// BPFModule instance (sections_ must still be empty).
+int BPFModule::load_b(const string &filename, const string &proto_filename) {
+  if (!sections_.empty()) {
+    fprintf(stderr, "Program already initialized\n");
+    return -1;
+  }
+  if (filename.empty() || proto_filename.empty()) {
+    fprintf(stderr, "Invalid filenames\n");
+    return -1;
+  }
+
+  // Helpers are inlined in the following file (C). Load the definitions and
+  // pass the partially compiled module to the B frontend to continue with.
+  auto helpers_h = ExportedFiles::headers().find("/virtual/include/bcc/helpers.h");
+  if (helpers_h == ExportedFiles::headers().end()) {
+    // Fix: this diagnostic was missing its trailing newline, unlike every
+    // other stderr message in this file.
+    fprintf(stderr, "Internal error: missing bcc/helpers.h\n");
+    return -1;
+  }
+  if (int rc = load_includes(helpers_h->second))
+    return rc;
+
+  BLoader b_loader(flags_);
+  used_b_loader_ = true;
+  if (int rc = b_loader.parse(&*mod_, filename, proto_filename, *ts_, id_,
+                              maps_ns_))
+    return rc;
+  // With the rw engine, annotate() also builds table reader/writer helpers;
+  // otherwise a lighter annotation pass suffices.
+  if (rw_engine_enabled_) {
+    if (int rc = annotate())
+      return rc;
+  } else {
+    annotate_light();
+  }
+  if (int rc = finalize())
+    return rc;
+  return 0;
+}
+
+// load a C file
+// Returns 0 on success, non-zero on error.  May only be called once per
+// BPFModule instance (sections_ must still be empty).
+int BPFModule::load_c(const string &filename, const char *cflags[], int ncflags) {
+  if (!sections_.empty()) {
+    fprintf(stderr, "Program already initialized\n");
+    return -1;
+  }
+  if (filename.empty()) {
+    fprintf(stderr, "Invalid filename\n");
+    return -1;
+  }
+  if (int rc = load_cfile(filename, false, cflags, ncflags))
+    return rc;
+  // With the rw engine, annotate() also builds table reader/writer helpers;
+  // otherwise a lighter annotation pass suffices.
+  if (rw_engine_enabled_) {
+    if (int rc = annotate())
+      return rc;
+  } else {
+    annotate_light();
+  }
+  if (int rc = finalize())
+    return rc;
+  return 0;
+}
+
+// load a C text string
+// Same pipeline as load_c, but compiles `text` from memory instead of a
+// file (load_cfile is called with in_memory = true).
+int BPFModule::load_string(const string &text, const char *cflags[], int ncflags) {
+  if (!sections_.empty()) {
+    fprintf(stderr, "Program already initialized\n");
+    return -1;
+  }
+  if (int rc = load_cfile(text, true, cflags, ncflags))
+    return rc;
+  // With the rw engine, annotate() also builds table reader/writer helpers;
+  // otherwise a lighter annotation pass suffices.
+  if (rw_engine_enabled_) {
+    if (int rc = annotate())
+      return rc;
+  } else {
+    annotate_light();
+  }
+
+  if (int rc = finalize())
+    return rc;
+  return 0;
+}
+
+} // namespace ebpf
diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h
new file mode 100644
index 0000000..ff237a5
--- /dev/null
+++ b/src/cc/bpf_module.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "bcc_exception.h"
+
+namespace llvm {
+class ExecutionEngine;
+class Function;
+class LLVMContext;
+class Module;
+class Type;
+}
+
+namespace ebpf {
+
+// Options to enable different debug logging.
+enum {
+  // Debug output compiled LLVM IR.
+  DEBUG_LLVM_IR = 0x1,
+  // Debug output loaded BPF bytecode and register state on branches.
+  DEBUG_BPF = 0x2,
+  // Debug output pre-processor result.
+  DEBUG_PREPROCESSOR = 0x4,
+  // Debug output ASM instructions embedded with source.
+  DEBUG_SOURCE = 0x8,
+  // Debug output register state on all instructions in addition to DEBUG_BPF.
+  DEBUG_BPF_REGISTER_STATE = 0x10,
+};
+
+class TableDesc;
+class TableStorage;
+class BLoader;
+class ClangLoader;
+class FuncSource;
+
+// Compiles a BPF program (from B or C source) into an in-memory LLVM
+// module and exposes the resulting functions, tables (maps), license and
+// kernel-version sections to callers.  One instance handles exactly one
+// program: the load_* entry points refuse to run twice.
+class BPFModule {
+ private:
+  static const std::string FN_PREFIX;
+  int init_engine();
+  int parse(llvm::Module *mod);
+  int finalize();
+  int annotate();
+  void annotate_light();
+  std::unique_ptr<llvm::ExecutionEngine> finalize_rw(std::unique_ptr<llvm::Module> mod);
+  std::string make_reader(llvm::Module *mod, llvm::Type *type);
+  std::string make_writer(llvm::Module *mod, llvm::Type *type);
+  void dump_ir(llvm::Module &mod);
+  int load_file_module(std::unique_ptr<llvm::Module> *mod, const std::string &file, bool in_memory);
+  int load_includes(const std::string &text);
+  int load_cfile(const std::string &file, bool in_memory, const char *cflags[], int ncflags);
+  int kbuild_flags(const char *uname_release, std::vector<std::string> *cflags);
+  int run_pass_manager(llvm::Module &mod);
+  StatusTuple sscanf(std::string fn_name, const char *str, void *val);
+  StatusTuple snprintf(std::string fn_name, char *str, size_t sz,
+                       const void *val);
+
+ public:
+  BPFModule(unsigned flags, TableStorage *ts = nullptr, bool rw_engine_enabled = true,
+            const std::string &maps_ns = "");
+  ~BPFModule();
+  // Program loaders: exactly one of these may be called, once.
+  int load_b(const std::string &filename, const std::string &proto_filename);
+  int load_c(const std::string &filename, const char *cflags[], int ncflags);
+  int load_string(const std::string &text, const char *cflags[], int ncflags);
+  std::string id() const { return id_; }
+  std::string maps_ns() const { return maps_ns_; }
+  // Function accessors: id-based overloads return 0/nullptr when `id` is
+  // out of range; name-based overloads when the name is unknown.
+  size_t num_functions() const;
+  uint8_t * function_start(size_t id) const;
+  uint8_t * function_start(const std::string &name) const;
+  const char * function_source(const std::string &name) const;
+  const char * function_source_rewritten(const std::string &name) const;
+  int annotate_prog_tag(const std::string &name, int fd,
+			struct bpf_insn *insn, int prog_len);
+  const char * function_name(size_t id) const;
+  size_t function_size(size_t id) const;
+  size_t function_size(const std::string &name) const;
+  // Table accessors: same not-found conventions as above (-1 for fd/type/
+  // flags, 0 for sizes, nullptr for strings).
+  size_t num_tables() const;
+  size_t table_id(const std::string &name) const;
+  int table_fd(size_t id) const;
+  int table_fd(const std::string &name) const;
+  const char * table_name(size_t id) const;
+  int table_type(const std::string &name) const;
+  int table_type(size_t id) const;
+  size_t table_max_entries(const std::string &name) const;
+  size_t table_max_entries(size_t id) const;
+  int table_flags(const std::string &name) const;
+  int table_flags(size_t id) const;
+  const char * table_key_desc(size_t id) const;
+  const char * table_key_desc(const std::string &name) const;
+  size_t table_key_size(size_t id) const;
+  size_t table_key_size(const std::string &name) const;
+  int table_key_printf(size_t id, char *buf, size_t buflen, const void *key);
+  int table_key_scanf(size_t id, const char *buf, void *key);
+  const char * table_leaf_desc(size_t id) const;
+  const char * table_leaf_desc(const std::string &name) const;
+  size_t table_leaf_size(size_t id) const;
+  size_t table_leaf_size(const std::string &name) const;
+  int table_leaf_printf(size_t id, char *buf, size_t buflen, const void *leaf);
+  int table_leaf_scanf(size_t id, const char *buf, void *leaf);
+  char * license() const;
+  unsigned kern_version() const;
+  TableStorage &table_storage() { return *ts_; }
+
+ private:
+  unsigned flags_;  // 0x1 for printing
+  bool rw_engine_enabled_;
+  bool used_b_loader_;
+  std::string filename_;
+  std::string proto_filename_;
+  std::unique_ptr<llvm::LLVMContext> ctx_;
+  std::unique_ptr<llvm::ExecutionEngine> engine_;
+  std::unique_ptr<llvm::ExecutionEngine> rw_engine_;
+  std::unique_ptr<llvm::Module> mod_;
+  std::unique_ptr<FuncSource> func_src_;
+  // Section name -> (start pointer, size) for compiled program sections.
+  std::map<std::string, std::tuple<uint8_t *, uintptr_t>> sections_;
+  std::vector<TableDesc *> tables_;
+  std::map<std::string, size_t> table_names_;
+  std::vector<std::string> function_names_;
+  std::map<llvm::Type *, std::string> readers_;
+  std::map<llvm::Type *, std::string> writers_;
+  std::string id_;
+  std::string maps_ns_;
+  std::string mod_src_;
+  std::map<std::string, std::string> src_dbg_fmap_;
+  // Either points at caller-supplied storage or at local_ts_ below.
+  TableStorage *ts_;
+  std::unique_ptr<TableStorage> local_ts_;
+};
+
+}  // namespace ebpf
diff --git a/src/cc/bpffs_table.cc b/src/cc/bpffs_table.cc
new file mode 100644
index 0000000..6b49942
--- /dev/null
+++ b/src/cc/bpffs_table.cc
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.h"
+#include "table_storage_impl.h"
+
+namespace ebpf {
+
+using std::string;
+using std::unique_ptr;
+
+/// A filesystem backed table storage
+/// A filesystem backed table storage
+/// NOTE(review): every method below is currently an unimplemented stub
+/// (returns false / an empty pointer) — see the definitions that follow.
+class BpfFsTableStorage : public TableStorageImpl {
+ public:
+  class iterator : public TableStorageIteratorImpl {
+   public:
+    virtual ~iterator() {}
+    virtual unique_ptr<self_type> clone() const override;
+    virtual self_type &operator++() override;
+    virtual value_type &operator*() const override;
+    virtual pointer operator->() const override;
+  };
+  virtual ~BpfFsTableStorage() {}
+  virtual bool Find(const string &name, TableStorage::iterator &result) const override;
+  virtual bool Insert(const string &name, TableDesc &&desc) override;
+  virtual bool Delete(const string &name) override;
+  virtual unique_ptr<TableStorageIteratorImpl> begin() override;
+  virtual unique_ptr<TableStorageIteratorImpl> end() override;
+  virtual unique_ptr<TableStorageIteratorImpl> lower_bound(const string &k) override;
+  virtual unique_ptr<TableStorageIteratorImpl> upper_bound(const string &k) override;
+  virtual unique_ptr<TableStorageIteratorImpl> erase(const TableStorageIteratorImpl &it) override;
+
+ private:
+};
+
+// Placeholder implementations: bpffs-backed storage is not implemented yet.
+// Lookups/mutations report failure and the iterator factories hand back
+// empty (null) iterators.
+bool BpfFsTableStorage::Find(const string &name, TableStorage::iterator &result) const {
+  return false;
+}
+
+bool BpfFsTableStorage::Insert(const string &name, TableDesc &&desc) { return false; }
+
+bool BpfFsTableStorage::Delete(const string &name) { return false; }
+
+unique_ptr<TableStorageIteratorImpl> BpfFsTableStorage::begin() { return unique_ptr<iterator>(); }
+unique_ptr<TableStorageIteratorImpl> BpfFsTableStorage::end() { return unique_ptr<iterator>(); }
+unique_ptr<TableStorageIteratorImpl> BpfFsTableStorage::lower_bound(const string &k) {
+  return unique_ptr<iterator>();
+}
+unique_ptr<TableStorageIteratorImpl> BpfFsTableStorage::upper_bound(const string &k) {
+  return unique_ptr<iterator>();
+}
+unique_ptr<TableStorageIteratorImpl> BpfFsTableStorage::erase(const TableStorageIteratorImpl &it) {
+  return unique_ptr<iterator>();
+}
+
+// Factory: build a TableStorage wrapper around the (stub) bpffs backend.
+unique_ptr<TableStorage> createBpfFsTableStorage() {
+  auto t = make_unique<TableStorage>();
+  t->Init(make_unique<BpfFsTableStorage>());
+  return t;
+}
+
+}  // namespace ebpf
diff --git a/src/cc/clang/include/stdarg.h b/src/cc/clang/include/stdarg.h
new file mode 100644
index 0000000..e0a43f5
--- /dev/null
+++ b/src/cc/clang/include/stdarg.h
@@ -0,0 +1,54 @@
+R"********(
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+/* Hack required to make standard headers work, at least on Ubuntu */
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+#endif
+typedef __builtin_va_list __gnuc_va_list;
+
+#endif /* __STDARG_H */
+)********"
diff --git a/src/cc/common.cc b/src/cc/common.cc
new file mode 100644
index 0000000..c8370a3
--- /dev/null
+++ b/src/cc/common.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016 Catalysts GmbH
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <fstream>
+#include <sstream>
+
+#include "common.h"
+#include "vendor/tinyformat.hpp"
+
+namespace ebpf {
+
+// Parse a sysfs-style CPU list (e.g. "0-3,5,7-8") from `path` into a flat
+// vector of CPU ids.  Comma-separated entries are either a single id or an
+// inclusive "start-end" range.
+// NOTE(review): std::stoi throws on malformed input; this assumes the
+// kernel-provided sysfs format is always well formed — confirm callers are
+// prepared for an exception otherwise.
+std::vector<int> read_cpu_range(std::string path) {
+  std::ifstream cpus_range_stream { path };
+  std::vector<int> cpus;
+  std::string cpu_range;
+
+  while (std::getline(cpus_range_stream, cpu_range, ',')) {
+    std::size_t rangeop = cpu_range.find('-');
+    if (rangeop == std::string::npos) {
+      cpus.push_back(std::stoi(cpu_range));
+    }
+    else {
+      int start = std::stoi(cpu_range.substr(0, rangeop));
+      int end = std::stoi(cpu_range.substr(rangeop + 1));
+      for (int i = start; i <= end; i++)
+        cpus.push_back(i);
+    }
+  }
+  return cpus;
+}
+
+// CPUs currently online, per /sys/devices/system/cpu/online.
+std::vector<int> get_online_cpus() {
+  return read_cpu_range("/sys/devices/system/cpu/online");
+}
+
+// CPUs that could ever be brought online, per
+// /sys/devices/system/cpu/possible (superset of the online list).
+std::vector<int> get_possible_cpus() {
+  return read_cpu_range("/sys/devices/system/cpu/possible");
+}
+
+// Resolve the executable path of process `pid` via /proc/<pid>/exe.
+// Returns "" when the link cannot be read (process gone, or insufficient
+// permissions).  readlink() does not NUL-terminate, so the terminator is
+// added manually; a result that fills the whole buffer is clamped and may
+// be silently truncated.
+std::string get_pid_exe(pid_t pid) {
+  char exe_path[4096];
+  int res;  // NOTE(review): readlink returns ssize_t; int narrows but is
+            // safe for this 4096-byte buffer
+
+  std::string exe_link = tfm::format("/proc/%d/exe", pid);
+  res = readlink(exe_link.c_str(), exe_path, sizeof(exe_path));
+  if (res == -1)
+    return "";
+  if (res >= sizeof(exe_path))
+    res = sizeof(exe_path) - 1;
+  exe_path[res] = '\0';
+  return std::string(exe_path);
+}
+
+} // namespace ebpf
diff --git a/src/cc/common.h b/src/cc/common.h
new file mode 100644
index 0000000..c227474
--- /dev/null
+++ b/src/cc/common.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unistd.h>
+#include <vector>
+
+namespace ebpf {
+
+// Local stand-in for std::make_unique (C++14) so the codebase builds with
+// C++11 toolchains; restricted to non-array T via SFINAE.
+template <class T, class... Args>
+typename std::enable_if<!std::is_array<T>::value, std::unique_ptr<T>>::type
+make_unique(Args &&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+std::vector<int> get_online_cpus();
+
+std::vector<int> get_possible_cpus();
+
+std::string get_pid_exe(pid_t pid);
+
+}  // namespace ebpf
diff --git a/src/cc/compat/linux/bpf.h b/src/cc/compat/linux/bpf.h
new file mode 100644
index 0000000..f780fd4
--- /dev/null
+++ b/src/cc/compat/linux/bpf.h
@@ -0,0 +1,2945 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _UAPI__LINUX_BPF_H__
+#define _UAPI__LINUX_BPF_H__
+
+#include <linux/types.h>
+#include "bpf_common.h"
+
+/* Extended instruction set based on top of classic BPF */
+
+/* instruction classes */
+#define BPF_ALU64	0x07	/* alu mode in double word width */
+
+/* ld/ldx fields */
+#define BPF_DW		0x18	/* double word (64-bit) */
+#define BPF_XADD	0xc0	/* exclusive add */
+
+/* alu/jmp fields */
+#define BPF_MOV		0xb0	/* mov reg to reg */
+#define BPF_ARSH	0xc0	/* sign extending arithmetic shift right */
+
+/* change endianness of a register */
+#define BPF_END		0xd0	/* flags for endianness conversion: */
+#define BPF_TO_LE	0x00	/* convert to little-endian */
+#define BPF_TO_BE	0x08	/* convert to big-endian */
+#define BPF_FROM_LE	BPF_TO_LE
+#define BPF_FROM_BE	BPF_TO_BE
+
+/* jmp encodings */
+#define BPF_JNE		0x50	/* jump != */
+#define BPF_JLT		0xa0	/* LT is unsigned, '<' */
+#define BPF_JLE		0xb0	/* LE is unsigned, '<=' */
+#define BPF_JSGT	0x60	/* SGT is signed '>', GT in x86 */
+#define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
+#define BPF_JSLT	0xc0	/* SLT is signed, '<' */
+#define BPF_JSLE	0xd0	/* SLE is signed, '<=' */
+#define BPF_CALL	0x80	/* function call */
+#define BPF_EXIT	0x90	/* function return */
+
+/* Register numbers */
+enum {
+	BPF_REG_0 = 0,
+	BPF_REG_1,
+	BPF_REG_2,
+	BPF_REG_3,
+	BPF_REG_4,
+	BPF_REG_5,
+	BPF_REG_6,
+	BPF_REG_7,
+	BPF_REG_8,
+	BPF_REG_9,
+	BPF_REG_10,
+	__MAX_BPF_REG,
+};
+
+/* BPF has 10 general purpose 64-bit registers and stack frame. */
+#define MAX_BPF_REG	__MAX_BPF_REG
+
+struct bpf_insn {
+	__u8	code;		/* opcode */
+	__u8	dst_reg:4;	/* dest register */
+	__u8	src_reg:4;	/* source register */
+	__s16	off;		/* signed offset */
+	__s32	imm;		/* signed immediate constant */
+};
+
+/* Key of a BPF_MAP_TYPE_LPM_TRIE entry */
+struct bpf_lpm_trie_key {
+	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
+	__u8	data[0];	/* Arbitrary size */
+};
+
+struct bpf_cgroup_storage_key {
+	__u64	cgroup_inode_id;	/* cgroup inode id */
+	__u32	attach_type;		/* program attach type */
+};
+
+/* BPF syscall commands, see bpf(2) man-page for details. */
+enum bpf_cmd {
+	BPF_MAP_CREATE,
+	BPF_MAP_LOOKUP_ELEM,
+	BPF_MAP_UPDATE_ELEM,
+	BPF_MAP_DELETE_ELEM,
+	BPF_MAP_GET_NEXT_KEY,
+	BPF_PROG_LOAD,
+	BPF_OBJ_PIN,
+	BPF_OBJ_GET,
+	BPF_PROG_ATTACH,
+	BPF_PROG_DETACH,
+	BPF_PROG_TEST_RUN,
+	BPF_PROG_GET_NEXT_ID,
+	BPF_MAP_GET_NEXT_ID,
+	BPF_PROG_GET_FD_BY_ID,
+	BPF_MAP_GET_FD_BY_ID,
+	BPF_OBJ_GET_INFO_BY_FD,
+	BPF_PROG_QUERY,
+	BPF_RAW_TRACEPOINT_OPEN,
+	BPF_BTF_LOAD,
+	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
+	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
+};
+
+enum bpf_map_type {
+	BPF_MAP_TYPE_UNSPEC,
+	BPF_MAP_TYPE_HASH,
+	BPF_MAP_TYPE_ARRAY,
+	BPF_MAP_TYPE_PROG_ARRAY,
+	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	BPF_MAP_TYPE_PERCPU_HASH,
+	BPF_MAP_TYPE_PERCPU_ARRAY,
+	BPF_MAP_TYPE_STACK_TRACE,
+	BPF_MAP_TYPE_CGROUP_ARRAY,
+	BPF_MAP_TYPE_LRU_HASH,
+	BPF_MAP_TYPE_LRU_PERCPU_HASH,
+	BPF_MAP_TYPE_LPM_TRIE,
+	BPF_MAP_TYPE_ARRAY_OF_MAPS,
+	BPF_MAP_TYPE_HASH_OF_MAPS,
+	BPF_MAP_TYPE_DEVMAP,
+	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
+	BPF_MAP_TYPE_SOCKHASH,
+	BPF_MAP_TYPE_CGROUP_STORAGE,
+	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	BPF_MAP_TYPE_QUEUE,
+	BPF_MAP_TYPE_STACK,
+};
+
+enum bpf_prog_type {
+	BPF_PROG_TYPE_UNSPEC,
+	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_KPROBE,
+	BPF_PROG_TYPE_SCHED_CLS,
+	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_TRACEPOINT,
+	BPF_PROG_TYPE_XDP,
+	BPF_PROG_TYPE_PERF_EVENT,
+	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_CGROUP_SOCK,
+	BPF_PROG_TYPE_LWT_IN,
+	BPF_PROG_TYPE_LWT_OUT,
+	BPF_PROG_TYPE_LWT_XMIT,
+	BPF_PROG_TYPE_SOCK_OPS,
+	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
+	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_RAW_TRACEPOINT,
+	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_LWT_SEG6LOCAL,
+	BPF_PROG_TYPE_LIRC_MODE2,
+	BPF_PROG_TYPE_SK_REUSEPORT,
+	BPF_PROG_TYPE_FLOW_DISSECTOR,
+};
+
+enum bpf_attach_type {
+	BPF_CGROUP_INET_INGRESS,
+	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK_CREATE,
+	BPF_CGROUP_SOCK_OPS,
+	BPF_SK_SKB_STREAM_PARSER,
+	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
+	BPF_SK_MSG_VERDICT,
+	BPF_CGROUP_INET4_BIND,
+	BPF_CGROUP_INET6_BIND,
+	BPF_CGROUP_INET4_CONNECT,
+	BPF_CGROUP_INET6_CONNECT,
+	BPF_CGROUP_INET4_POST_BIND,
+	BPF_CGROUP_INET6_POST_BIND,
+	BPF_CGROUP_UDP4_SENDMSG,
+	BPF_CGROUP_UDP6_SENDMSG,
+	BPF_LIRC_MODE2,
+	BPF_FLOW_DISSECTOR,
+	__MAX_BPF_ATTACH_TYPE
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags has to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When children program makes decision (like picking TCP CA or sock bind)
+ * parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
+ */
+#define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
+
+/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
+ * verifier will perform strict alignment checking as if the kernel
+ * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set,
+ * and NET_IP_ALIGN defined to 2.
+ */
+#define BPF_F_STRICT_ALIGNMENT	(1U << 0)
+
+/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
+#define BPF_PSEUDO_MAP_FD	1
+
+/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
+ * offset to another bpf function
+ */
+#define BPF_PSEUDO_CALL		1
+
+/* flags for BPF_MAP_UPDATE_ELEM command */
+#define BPF_ANY		0 /* create new element or update existing */
+#define BPF_NOEXIST	1 /* create new element if it didn't exist */
+#define BPF_EXIST	2 /* update existing element */
+
+/* flags for BPF_MAP_CREATE command */
+#define BPF_F_NO_PREALLOC	(1U << 0)
+/* Instead of having one common LRU list in the
+ * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
+ * which can scale and perform better.
+ * Note, the LRU nodes (including free nodes) cannot be moved
+ * across different LRU lists.
+ */
+#define BPF_F_NO_COMMON_LRU	(1U << 1)
+/* Specify numa node during map creation */
+#define BPF_F_NUMA_NODE		(1U << 2)
+
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
+#define BPF_OBJ_NAME_LEN 16U
+
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY		(1U << 3)
+#define BPF_F_WRONLY		(1U << 4)
+
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID	(1U << 5)
+
+enum bpf_stack_build_id_status {
+	/* user space need an empty entry to identify end of a trace */
+	BPF_STACK_BUILD_ID_EMPTY = 0,
+	/* with valid build_id and offset */
+	BPF_STACK_BUILD_ID_VALID = 1,
+	/* couldn't get build_id, fallback to ip */
+	BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+	__s32		status;
+	unsigned char	build_id[BPF_BUILD_ID_SIZE];
+	union {
+		__u64	offset;
+		__u64	ip;
+	};
+};
+
+union bpf_attr {
+	struct { /* anonymous struct used by BPF_MAP_CREATE command */
+		__u32	map_type;	/* one of enum bpf_map_type */
+		__u32	key_size;	/* size of key in bytes */
+		__u32	value_size;	/* size of value in bytes */
+		__u32	max_entries;	/* max number of entries in a map */
+		__u32	map_flags;	/* BPF_MAP_CREATE related
+					 * flags defined above.
+					 */
+		__u32	inner_map_fd;	/* fd pointing to the inner map */
+		__u32	numa_node;	/* numa node (effective only if
+					 * BPF_F_NUMA_NODE is set).
+					 */
+		char	map_name[BPF_OBJ_NAME_LEN];
+		__u32	map_ifindex;	/* ifindex of netdev to create on */
+		__u32	btf_fd;		/* fd pointing to a BTF type data */
+		__u32	btf_key_type_id;	/* BTF type_id of the key */
+		__u32	btf_value_type_id;	/* BTF type_id of the value */
+	};
+
+	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+		__u32		map_fd;
+		__aligned_u64	key;
+		union {
+			__aligned_u64 value;
+			__aligned_u64 next_key;
+		};
+		__u64		flags;
+	};
+
+	struct { /* anonymous struct used by BPF_PROG_LOAD command */
+		__u32		prog_type;	/* one of enum bpf_prog_type */
+		__u32		insn_cnt;
+		__aligned_u64	insns;
+		__aligned_u64	license;
+		__u32		log_level;	/* verbosity level of verifier */
+		__u32		log_size;	/* size of user buffer */
+		__aligned_u64	log_buf;	/* user supplied buffer */
+		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		prog_flags;
+		char		prog_name[BPF_OBJ_NAME_LEN];
+		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
+		/* For some prog types expected attach type must be known at
+		 * load time to verify attach type specific parts of prog
+		 * (context accesses, allowed helpers, etc).
+		 */
+		__u32		expected_attach_type;
+	};
+
+	struct { /* anonymous struct used by BPF_OBJ_* commands */
+		__aligned_u64	pathname;
+		__u32		bpf_fd;
+		__u32		file_flags;
+	};
+
+	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+		__u32		target_fd;	/* container object to attach to */
+		__u32		attach_bpf_fd;	/* eBPF program to attach */
+		__u32		attach_type;
+		__u32		attach_flags;
+	};
+
+	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+		__u32		prog_fd;
+		__u32		retval;
+		__u32		data_size_in;
+		__u32		data_size_out;
+		__aligned_u64	data_in;
+		__aligned_u64	data_out;
+		__u32		repeat;
+		__u32		duration;
+	} test;
+
+	struct { /* anonymous struct used by BPF_*_GET_*_ID */
+		union {
+			__u32		start_id;
+			__u32		prog_id;
+			__u32		map_id;
+			__u32		btf_id;
+		};
+		__u32		next_id;
+		__u32		open_flags;
+	};
+
+	struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
+		__u32		bpf_fd;
+		__u32		info_len;
+		__aligned_u64	info;
+	} info;
+
+	struct { /* anonymous struct used by BPF_PROG_QUERY command */
+		__u32		target_fd;	/* container object to query */
+		__u32		attach_type;
+		__u32		query_flags;
+		__u32		attach_flags;
+		__aligned_u64	prog_ids;
+		__u32		prog_cnt;
+	} query;
+
+	struct {
+		__u64 name;
+		__u32 prog_fd;
+	} raw_tracepoint;
+
+	struct { /* anonymous struct for BPF_BTF_LOAD */
+		__aligned_u64	btf;
+		__aligned_u64	btf_log_buf;
+		__u32		btf_size;
+		__u32		btf_log_size;
+		__u32		btf_log_level;
+	};
+
+	struct { /* anonymous struct used by BPF_TASK_FD_QUERY command */
+		__u32		pid;		/* input: pid */
+		__u32		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input/output: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prog_id */
+		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
+} __attribute__((aligned(8)));
+
+/* The description below is an attempt at providing documentation to eBPF
+ * developers about the multiple available eBPF helper functions. It can be
+ * parsed and used to produce a manual page. The workflow is the following,
+ * and requires the rst2man utility:
+ *
+ *     $ ./scripts/bpf_helpers_doc.py \
+ *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
+ *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
+ *     $ man /tmp/bpf-helpers.7
+ *
+ * Note that in order to produce this external documentation, some RST
+ * formatting is used in the descriptions to get "bold" and "italics" in
+ * manual pages. Also note that the few trailing white spaces are
+ * intentional, removing them would break paragraphs for rst2man.
+ *
+ * Start of BPF helper function descriptions:
+ *
+ * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Perform a lookup in *map* for an entry associated to *key*.
+ * 	Return
+ * 		Map value associated to *key*, or **NULL** if no entry was
+ * 		found.
+ *
+ * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+ * 	Description
+ * 		Add or update the value of the entry associated to *key* in
+ * 		*map* with *value*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		Flag value **BPF_NOEXIST** cannot be used for maps of types
+ * 		**BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY**  (all
+ * 		elements always exist), the helper would return an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_delete_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Delete entry with *key* from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ * 	Description
+ * 		Push an element *value* in *map*. *flags* is one of:
+ *
+ * 		**BPF_EXIST**
+ * 		If the queue/stack is full, the oldest element is removed to
+ * 		make room for this.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * 	Description
+ * 		For tracing programs, safely attempt to read *size* bytes from
+ * 		address *src* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_ktime_get_ns(void)
+ * 	Description
+ * 		Return the time elapsed since system boot, in nanoseconds.
+ * 	Return
+ * 		Current *ktime*.
+ *
+ * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		This helper is a "printk()-like" facility for debugging. It
+ * 		prints a message defined by format *fmt* (of size *fmt_size*)
+ * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		available. It can take up to three additional **u64**
+ * 		arguments (as for eBPF helpers, the total number of arguments is
+ * 		limited to five).
+ *
+ * 		Each time the helper is called, it appends a line to the trace.
+ * 		The format of the trace is customizable, and the exact output
+ * 		one will get depends on the options set in
+ * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*README* file under the same directory). However, it usually
+ * 		defaults to something like:
+ *
+ * 		::
+ *
+ * 			telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * 		In the above:
+ *
+ * 			* ``telnet`` is the name of the current task.
+ * 			* ``470`` is the PID of the current task.
+ * 			* ``001`` is the CPU number on which the task is
+ * 			  running.
+ * 			* In ``.N..``, each character refers to a set of
+ * 			  options (whether irqs are enabled, scheduling
+ * 			  options, whether hard/softirqs are running, level of
+ * 			  preempt_disabled respectively). **N** means that
+ * 			  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 			  are set.
+ * 			* ``419421.045894`` is a timestamp.
+ * 			* ``0x00000001`` is a fake value used by BPF for the
+ * 			  instruction pointer register.
+ * 			* ``<formatted msg>`` is the message formatted with
+ * 			  *fmt*.
+ *
+ * 		The conversion specifiers supported by *fmt* are similar, but
+ * 		more limited than for printk(). They are **%d**, **%i**,
+ * 		**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 		**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 		of field, padding with zeroes, etc.) is available, and the
+ * 		helper will return **-EINVAL** (but print nothing) if it
+ * 		encounters an unknown specifier.
+ *
+ * 		Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 		only be used for debugging purposes. For this reason, a notice
+ * 		bloc (spanning several lines) is printed to kernel logs and
+ * 		states that the helper should not be used "for production use"
+ * 		the first time this helper is used (or more precisely, when
+ * 		**trace_printk**\ () buffers are allocated). For passing values
+ * 		to user space, perf events should be preferred.
+ * 	Return
+ * 		The number of bytes written to the buffer, or a negative error
+ * 		in case of failure.
+ *
+ * u32 bpf_get_prandom_u32(void)
+ * 	Description
+ * 		Get a pseudo-random number.
+ *
+ * 		From a security point of view, this helper uses its own
+ * 		pseudo-random internal state, and cannot be used to infer the
+ * 		seed of other random functions in the kernel. However, it is
+ * 		essential to note that the generator used by the helper is not
+ * 		cryptographically secure.
+ * 	Return
+ * 		A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * 	Description
+ * 		Get the SMP (symmetric multiprocessing) processor id. Note that
+ * 		all programs run with preemption disabled, which means that the
+ * 		SMP processor id is stable during all the execution of the
+ * 		program.
+ * 	Return
+ * 		The SMP id of the processor running the program.
+ *
+ * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * 	Description
+ * 		Store *len* bytes from address *from* into the packet
+ * 		associated to *skb*, at *offset*. *flags* are a combination of
+ * 		**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 		checksum for the packet after storing the bytes) and
+ * 		**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 		**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * 	Description
+ * 		Recompute the layer 3 (e.g. IP) checksum for the packet
+ * 		associated to *skb*. Computation is incremental, so the helper
+ * 		must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored in *size*.
+ * 		Alternatively, it is possible to store the difference between
+ * 		the previous and the new values of the header field in *to*, by
+ * 		setting *from* and *size* to 0. For both methods, *offset*
+ * 		indicates the location of the IP checksum within the packet.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * 	Description
+ * 		Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
+ * 		packet associated to *skb*. Computation is incremental, so the
+ * 		helper must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored on the lowest
+ * 		four bits of *flags*. Alternatively, it is possible to store
+ * 		the difference between the previous and the new values of the
+ * 		header field in *to*, by setting *from* and the four lowest
+ * 		bits of *flags* to 0. For both methods, *offset* indicates the
+ * 		location of the IP checksum within the packet. In addition to
+ * 		the size of the field, *flags* can be added (bitwise OR) actual
+ * 		flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
+ * 		untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
+ * 		for updates resulting in a null checksum the value is set to
+ * 		**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
+ * 		the checksum is to be computed against a pseudo-header.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * 	Description
+ * 		This special helper is used to trigger a "tail call", or in
+ * 		other words, to jump into another eBPF program. The same stack
+ * 		frame is used (but values on stack and in registers for the
+ * 		caller are not accessible to the callee). This mechanism allows
+ * 		for program chaining, either for raising the maximum number of
+ * 		available eBPF instructions, or to execute given programs in
+ * 		conditional blocks. For security reasons, there is an upper
+ * 		limit to the number of successive tail calls that can be
+ * 		performed.
+ *
+ * 		Upon call of this helper, the program attempts to jump into a
+ * 		program referenced at index *index* in *prog_array_map*, a
+ * 		special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 		*ctx*, a pointer to the context.
+ *
+ * 		If the call succeeds, the kernel immediately runs the first
+ * 		instruction of the new program. This is not a function call,
+ * 		and it never returns to the previous program. If the call
+ * 		fails, then the helper has no effect, and the caller continues
+ * 		to run its subsequent instructions. A call can fail if the
+ * 		destination program for the jump does not exist (i.e. *index*
+ * 		is superior to the number of entries in *prog_array_map*), or
+ * 		if the maximum number of tail calls has been reached for this
+ * 		chain of programs. This limit is defined in the kernel by the
+ * 		macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
+ * 		which is currently set to 32.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * 	Description
+ * 		Clone and redirect the packet associated to *skb* to another
+ * 		net device of index *ifindex*. Both ingress and egress
+ * 		interfaces can be used for redirection. The **BPF_F_INGRESS**
+ * 		value in *flags* is used to make the distinction (ingress path
+ * 		is selected if the flag is present, egress path otherwise).
+ * 		This is the only flag supported for now.
+ *
+ * 		In comparison with **bpf_redirect**\ () helper,
+ * 		**bpf_clone_redirect**\ () has the associated cost of
+ * 		duplicating the packet buffer, but this can be executed out of
+ * 		the eBPF program. Conversely, **bpf_redirect**\ () is more
+ * 		efficient, but it is handled through an action code where the
+ * 		redirection happens only after the eBPF program has returned.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_current_pid_tgid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current tgid and pid, and
+ * 		created as such:
+ * 		*current_task*\ **->tgid << 32 \|**
+ * 		*current_task*\ **->pid**.
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current GID and UID, and
+ * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * 	Description
+ * 		Copy the **comm** attribute of the current task into *buf* of
+ * 		*size_of_buf*. The **comm** attribute contains the name of
+ * 		the executable (excluding the path) for the current task. The
+ * 		*size_of_buf* must be strictly positive. On success, the
+ * 		helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 		it is filled with zeroes.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the classid for the current task, i.e. for the net_cls
+ * 		cgroup to which *skb* belongs.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress.
+ *
+ * 		The net_cls cgroup provides an interface to tag network packets
+ * 		based on a user-provided identifier for all traffic coming from
+ * 		the tasks belonging to the related cgroup. See also the related
+ * 		kernel documentation, available from the Linux sources in file
+ * 		*Documentation/cgroup-v1/net_cls.txt*.
+ *
+ * 		The Linux kernel has two versions for cgroups: there are
+ * 		cgroups v1 and cgroups v2. Both are available to users, who can
+ * 		use a mixture of them, but note that the net_cls cgroup is for
+ * 		cgroup v1 only. This makes it incompatible with BPF programs
+ * 		run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * 		only hold data for one version of cgroups at a time).
+ *
+ * 		This helper is only available if the kernel was compiled with
+ * 		the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * 		"**y**" or to "**m**".
+ * 	Return
+ * 		The classid, or 0 for the default unconfigured classid.
+ *
+ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * 	Description
+ * 		Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 		*vlan_proto* to the packet associated to *skb*, then update
+ * 		the checksum. Note that if *vlan_proto* is different from
+ * 		**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 		be **ETH_P_8021Q**.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ * 	Description
+ * 		Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Get tunnel metadata. This helper takes a pointer *key* to an
+ * 		empty **struct bpf_tunnel_key** of **size**, that will be
+ * 		filled with tunnel metadata for the packet associated to *skb*.
+ * 		The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 		indicates that the tunnel is based on IPv6 protocol instead of
+ * 		IPv4.
+ *
+ * 		The **struct bpf_tunnel_key** is an object that generalizes the
+ * 		principal parameters used by various tunneling protocols into a
+ * 		single struct. This way, it can be used to easily make a
+ * 		decision based on the contents of the encapsulation header,
+ * 		"summarized" in this struct. In particular, it holds the IP
+ * 		address of the remote end (IPv4 or IPv6, depending on the case)
+ * 		in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ * 		this struct exposes the *key*\ **->tunnel_id**, which is
+ * 		generally mapped to a VNI (Virtual Network Identifier), making
+ * 		it programmable together with the **bpf_skb_set_tunnel_key**\
+ * 		() helper.
+ *
+ * 		Let's imagine that the following code is part of a program
+ * 		attached to the TC ingress interface, on one end of a GRE
+ * 		tunnel, and is supposed to filter out all messages coming from
+ * 		remote ends with IPv4 address other than 10.0.0.1:
+ *
+ * 		::
+ *
+ * 			int ret;
+ * 			struct bpf_tunnel_key key = {};
+ * 			
+ * 			ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			if (ret < 0)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			if (key.remote_ipv4 != 0x0a000001)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			return TC_ACT_OK;		// accept packet
+ *
+ * 		This interface can also be used with all encapsulation devices
+ * 		that can operate in "collect metadata" mode: instead of having
+ * 		one network device per specific configuration, the "collect
+ * 		metadata" mode only requires a single device where the
+ * 		configuration can be extracted from this helper.
+ *
+ * 		This can be used together with various tunnels such as VXLan,
+ * 		Geneve, GRE or IP in IP (IPIP).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Populate tunnel metadata for packet associated to *skb*. The
+ * 		tunnel metadata is set to the contents of *key*, of *size*. The
+ * 		*flags* can be set to a combination of the following values:
+ *
+ * 		**BPF_F_TUNINFO_IPV6**
+ * 			Indicate that the tunnel is based on IPv6 protocol
+ * 			instead of IPv4.
+ * 		**BPF_F_ZERO_CSUM_TX**
+ * 			For IPv4 packets, add a flag to tunnel metadata
+ * 			indicating that checksum computation should be skipped
+ * 			and checksum set to zeroes.
+ * 		**BPF_F_DONT_FRAGMENT**
+ * 			Add a flag to tunnel metadata indicating that the
+ * 			packet should not be fragmented.
+ * 		**BPF_F_SEQ_NUMBER**
+ * 			Add a flag to tunnel metadata indicating that a
+ * 			sequence number should be added to tunnel header before
+ * 			sending the packet. This flag was added for GRE
+ * 			encapsulation, but might be used with other protocols
+ * 			as well in the future.
+ *
+ * 		Here is a typical usage on the transmit path:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key;
+ * 			     populate key ...
+ * 			bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Read the value of a perf event counter. This helper relies on a
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * 		the perf event counter is selected when *map* is updated with
+ * 		perf event file descriptors. The *map* is an array whose size
+ * 		is the number of available CPUs, and each cell contains a value
+ * 		relative to one CPU. The value to retrieve is indicated by
+ * 		*flags*, that contains the index of the CPU to look up, masked
+ * 		with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		Note that before Linux 4.13, only hardware perf event can be
+ * 		retrieved.
+ *
+ * 		Also, be aware that the newer helper
+ * 		**bpf_perf_event_read_value**\ () is recommended over
+ * 		**bpf_perf_event_read**\ () in general. The latter has some ABI
+ * 		quirks where error and counter value are used as a return code
+ * 		(which is wrong to do since ranges may overlap). This issue is
+ * 		fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * 		time provides more features over the **bpf_perf_event_read**\
+ * 		() interface. Please refer to the description of
+ * 		**bpf_perf_event_read_value**\ () for details.
+ * 	Return
+ * 		The value of the perf event counter read from the map, or a
+ * 		negative error code in case of failure.
+ *
+ * int bpf_redirect(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_clone_redirect**\
+ * 		(), except that the packet is not cloned, which provides
+ * 		increased performance.
+ *
+ * 		Except for XDP, both ingress and egress interfaces can be used
+ * 		for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * 		to make the distinction (ingress path is selected if the flag
+ * 		is present, egress path otherwise). Currently, XDP only
+ * 		supports redirection to the egress interface, and accepts no
+ * 		flag at all.
+ *
+ * 		The same effect can be attained with the more generic
+ * 		**bpf_redirect_map**\ (), which requires specific maps to be
+ * 		used but offers better performance.
+ * 	Return
+ * 		For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 		**XDP_ABORTED** on error. For other program types, the values
+ * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 		error.
+ *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the realm of the route, that is to say the
+ * 		**tclassid** field of the destination for the *skb*. The
+ * 		identifier retrieved is a user-provided tag, similar to the
+ * 		one used with the net_cls cgroup (see description for
+ * 		**bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * 		held by a route (a destination entry), not by a task.
+ *
+ * 		Retrieving this identifier works with the clsact TC egress hook
+ * 		(see also **tc-bpf(8)**), or alternatively on conventional
+ * 		classful egress qdiscs, but not on TC ingress path. In case of
+ * 		clsact TC egress hook, this has the advantage that, internally,
+ * 		the destination entry has not been dropped yet in the transmit
+ * 		path. Therefore, the destination entry does not need to be
+ * 		artificially held via **netif_keep_dst**\ () for a classful
+ * 		qdisc until the *skb* is freed.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_IP_ROUTE_CLASSID** configuration option.
+ * 	Return
+ * 		The realm of the route for the packet associated to *skb*, or 0
+ * 		if none was found.
+ *
+ * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write raw *data* blob into a special BPF perf event held by
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 		event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		The context of the program *ctx* needs also be passed to the
+ * 		helper.
+ *
+ * 		On user space, a program willing to read the values needs to
+ * 		call **perf_event_open**\ () on the perf event (either for
+ * 		one or for all CPUs) and to store the file descriptor into the
+ * 		*map*. This must be done before the eBPF program can send data
+ * 		into it. An example is available in file
+ * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 		tree (the eBPF program counterpart is in
+ * 		*samples/bpf/trace_output_kern.c*).
+ *
+ * 		**bpf_perf_event_output**\ () achieves better performance
+ * 		than **bpf_trace_printk**\ () for sharing data with user
+ * 		space, and is much better suited for streaming data from eBPF
+ * 		programs.
+ *
+ * 		Note that this helper is not restricted to tracing use cases
+ * 		and can be used with programs attached to TC or XDP as well,
+ * 		where it allows for passing data to user space listeners. Data
+ * 		can be:
+ *
+ * 		* Only custom structs,
+ * 		* Only the packet payload, or
+ * 		* A combination of both.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * 	Description
+ * 		This helper was provided as an easy way to load data from a
+ * 		packet. It can be used to load *len* bytes from *offset* from
+ * 		the packet associated to *skb*, into the buffer pointed by
+ * 		*to*.
+ *
+ * 		Since Linux 4.7, usage of this helper has mostly been replaced
+ * 		by "direct packet access", enabling packet data to be
+ * 		manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ * 		pointing respectively to the first byte of packet data and to
+ * 		the byte after the last byte of packet data. However, it
+ * 		remains useful if one wishes to read large quantities of data
+ * 		at once from a packet into the eBPF stack.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Walk a user or a kernel stack and return its id. To achieve
+ * 		this, the helper needs *ctx*, which is a pointer to the context
+ * 		on which the tracing program is executed, and a pointer to a
+ * 		*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		a combination of the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_FAST_STACK_CMP**
+ * 			Compare stacks by hash only.
+ * 		**BPF_F_REUSE_STACKID**
+ * 			If two different stacks hash into the same *stackid*,
+ * 			discard the old one.
+ *
+ * 		The stack id retrieved is a 32 bit long integer handle which
+ * 		can be further combined with other data (including other stack
+ * 		ids) and used as a key into maps. This can be useful for
+ * 		generating a variety of graphs (such as flame graphs or off-cpu
+ * 		graphs).
+ *
+ * 		For walking a stack, this helper is an improvement over
+ * 		**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 		but is not efficient and consumes a lot of eBPF instructions.
+ * 		Instead, **bpf_get_stackid**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
+ * 	Return
+ * 		The positive or null stack id on success, or a negative error
+ * 		in case of failure.
+ *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * 	Description
+ * 		Compute a checksum difference, from the raw buffer pointed by
+ * 		*from*, of length *from_size* (that must be a multiple of 4),
+ * 		towards the raw buffer pointed by *to*, of size *to_size*
+ * 		(same remark). An optional *seed* can be added to the value
+ * 		(this can be cascaded, the seed may come from a previous call
+ * 		to the helper).
+ *
+ * 		This is flexible enough to be used in several ways:
+ *
+ * 		* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 		  checksum, it can be used when pushing new data.
+ * 		* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 		  checksum, it can be used when removing data from a packet.
+ * 		* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 		  can be used to compute a diff. Note that *from_size* and
+ * 		  *to_size* do not need to be equal.
+ *
+ * 		This helper can be used in combination with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * 		which one can feed in the difference computed with
+ * 		**bpf_csum_diff**\ ().
+ * 	Return
+ * 		The checksum result, or a negative error code in case of
+ * 		failure.
+ *
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Retrieve tunnel options metadata for the packet associated to
+ * 		*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 		of *size*.
+ *
+ * 		This helper can be used with encapsulation devices that can
+ * 		operate in "collect metadata" mode (please refer to the related
+ * 		note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * 		more details). A particular example where this can be used is
+ * 		in combination with the Geneve encapsulation protocol, where it
+ * 		allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper)
+ * 		and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * 		the eBPF program. This allows for full customization of these
+ * 		headers.
+ * 	Return
+ * 		The size of the option data retrieved.
+ *
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Set tunnel options metadata for the packet associated to *skb*
+ * 		to the option data contained in the raw buffer *opt* of *size*.
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * 	Description
+ * 		Change the protocol of the *skb* to *proto*. Currently
+ * 		supported are transition from IPv4 to IPv6, and from IPv6 to
+ * 		IPv4. The helper takes care of the groundwork for the
+ * 		transition, including resizing the socket buffer. The eBPF
+ * 		program is expected to fill the new headers, if any, via
+ * 		**skb_store_bytes**\ () and to recompute the checksums with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 		(). The main case for this helper is to perform NAT64
+ * 		operations out of an eBPF program.
+ *
+ * 		Internally, the GSO type is marked as dodgy so that headers are
+ * 		checked and segments are recalculated by the GSO/GRO engine.
+ * 		The size for GSO target is adapted as well.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * 	Description
+ * 		Change the packet type for the packet associated to *skb*. This
+ * 		comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 		the eBPF program does not have a write access to *skb*\
+ * 		**->pkt_type** beside this helper. Using a helper here allows
+ * 		for graceful handling of errors.
+ *
+ * 		The major use case is to change incoming *skb*s to
+ * 		**PACKET_HOST** in a programmatic way instead of having to
+ * 		recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 		example.
+ *
+ * 		Note that *type* only allows certain values. At this time, they
+ * 		are:
+ *
+ * 		**PACKET_HOST**
+ * 			Packet is for us.
+ * 		**PACKET_BROADCAST**
+ * 			Send packet to all.
+ * 		**PACKET_MULTICAST**
+ * 			Send packet to group.
+ * 		**PACKET_OTHERHOST**
+ * 			Send packet to someone else.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether *skb* is a descendant of the cgroup2 held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* failed the cgroup2 descendant test.
+ * 		* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 		* A negative error code, if an error occurred.
+ *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 		not set, in particular if the hash was cleared due to mangling,
+ * 		recompute this hash. Later accesses to the hash can be done
+ * 		directly with *skb*\ **->hash**.
+ *
+ * 		Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 		prototype with **bpf_skb_change_proto**\ (), or calling
+ * 		**bpf_skb_store_bytes**\ () with the
+ * 		**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 		the hash and to trigger a new computation for the next call to
+ * 		**bpf_get_hash_recalc**\ ().
+ * 	Return
+ * 		The 32-bit hash.
+ *
+ * u64 bpf_get_current_task(void)
+ * 	Return
+ * 		A pointer to the current task struct.
+ *
+ * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * 	Description
+ * 		Attempt in a safe way to write *len* bytes from the buffer
+ * 		*src* to *dst* in memory. It only works for threads that are in
+ * 		user context, and *dst* must be a valid user space address.
+ *
+ * 		This helper should not be used to implement any kind of
+ * 		security mechanism because of TOC-TOU attacks, but rather to
+ * 		debug, divert, and manipulate execution of semi-cooperative
+ * 		processes.
+ *
+ * 		Keep in mind that this feature is meant for experiments, and it
+ * 		has a risk of crashing the system and running programs.
+ * 		Therefore, when an eBPF program using this helper is attached,
+ * 		a warning including PID and process name is printed to kernel
+ * 		logs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether the probe is being run in the context of a given
+ * 		subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 1, if current task belongs to the cgroup2.
+ * 		* 0, if current task does not belong to the cgroup2.
+ * 		* A negative error code, if an error occurred.
+ *
+ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Resize (trim or grow) the packet associated to *skb* to the
+ * 		new *len*. The *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		The basic idea is that the helper performs the needed work to
+ * 		change the size of the packet, then the eBPF program rewrites
+ * 		the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 		**bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
+ * 		and others. This helper is a slow path utility intended for
+ * 		replies with control messages. And because it is targeted for
+ * 		slow path, the helper itself can afford to be slow: it
+ * 		implicitly linearizes, unclones and drops offloads from the
+ * 		*skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * 	Description
+ * 		Pull in non-linear data in case the *skb* is non-linear and not
+ * 		all of *len* are part of the linear section. Make *len* bytes
+ * 		from *skb* readable and writable. If a zero value is passed for
+ * 		*len*, then the whole length of the *skb* is pulled.
+ *
+ * 		This helper is only needed for reading and writing with direct
+ * 		packet access.
+ *
+ * 		For direct packet access, testing that offsets to access
+ * 		are within packet boundaries (test on *skb*\ **->data_end**) is
+ * 		susceptible to fail if offsets are invalid, or if the requested
+ * 		data is in non-linear parts of the *skb*. On failure the
+ * 		program can just bail out, or in the case of a non-linear
+ * 		buffer, use a helper to make the data available. The
+ * 		**bpf_skb_load_bytes**\ () helper is a first solution to access
+ * 		the data. Another one consists in using **bpf_skb_pull_data**
+ * 		to pull in once the non-linear parts, then retesting and
+ * 		eventually access the data.
+ *
+ * 		At the same time, this also makes sure the *skb* is uncloned,
+ * 		which is a necessary condition for direct write. As this needs
+ * 		to be an invariant for the write part only, the verifier
+ * 		detects writes and adds a prologue that is calling
+ * 		**bpf_skb_pull_data()** to effectively unclone the *skb* from
+ * 		the very beginning in case it is indeed cloned.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ * 	Description
+ * 		Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 		driver has supplied a checksum for the entire packet into that
+ * 		field. Return an error otherwise. This helper is intended to be
+ * 		used in combination with **bpf_csum_diff**\ (), in particular
+ * 		when the checksum needs to be updated after data has been
+ * 		written into the packet through direct packet access.
+ * 	Return
+ * 		The checksum on success, or a negative error code in case of
+ * 		failure.
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ * 	Description
+ * 		Invalidate the current *skb*\ **->hash**. It can be used after
+ * 		mangling on headers through direct packet access, in order to
+ * 		indicate that the hash is outdated and to trigger a
+ * 		recalculation the next time the kernel tries to access this
+ * 		hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *
+ * int bpf_get_numa_node_id(void)
+ * 	Description
+ * 		Return the id of the current NUMA node. The primary use case
+ * 		for this helper is the selection of sockets for the local NUMA
+ * 		node, when the program is attached to sockets using the
+ * 		**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ * 		but the helper is also available to other eBPF program types,
+ * 		similarly to **bpf_get_smp_processor_id**\ ().
+ * 	Return
+ * 		The id of current NUMA node.
+ *
+ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Grows headroom of packet associated to *skb* and adjusts the
+ * 		offset of the MAC header accordingly, adding *len* bytes of
+ * 		space. It automatically extends and reallocates memory as
+ * 		required.
+ *
+ * 		This helper can be used on a layer 3 *skb* to push a MAC header
+ * 		for redirection into a layer 2 device.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 		it is possible to use a negative value for *delta*. This helper
+ * 		can be used to prepare the packet for pushing or popping
+ * 		headers.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
+ * 				                             ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read()** helper here instead
+ * 		to read the string would require to estimate the length at
+ * 		compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * 	Description
+ * 		If the **struct sk_buff** pointed by *skb* has a known socket,
+ * 		retrieve the cookie (generated by the kernel) of this socket.
+ * 		If no cookie has been set yet, generate a new cookie. Once
+ * 		generated, the socket cookie remains stable for the life of the
+ * 		socket. This helper can be useful for monitoring per socket
+ * 		networking traffic statistics as it provides a unique socket
+ * 		identifier per namespace.
+ * 	Return
+ * 		An 8-byte long non-decreasing number on success, or 0 if the
+ * 		socket field is missing inside *skb*.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
+ * 	Description
+ * 		Equivalent to bpf_get_socket_cookie() helper that accepts
+ * 		*skb*, but gets socket from **struct bpf_sock_addr** context.
+ * 	Return
+ * 		An 8-byte long non-decreasing number.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
+ * 	Description
+ * 		Equivalent to bpf_get_socket_cookie() helper that accepts
+ * 		*skb*, but gets socket from **struct bpf_sock_ops** context.
+ * 	Return
+ * 		An 8-byte long non-decreasing number.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * 	Return
+ * 		The owner UID of the socket associated to *skb*. If the socket
+ * 		is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 		time-wait or a request socket instead), **overflowuid** value
+ * 		is returned (note that **overflowuid** might also be the actual
+ * 		UID value for the socket).
+ *
+ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * 	Description
+ * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 		to value *hash*.
+ * 	Return
+ * 		0
+ *
+ * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **setsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **setsockopt(2)** for more information.
+ * 		The option value of length *optlen* is pointed by *optval*.
+ *
+ * 		This helper actually implements a subset of **setsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 		  **TCP_BPF_SNDCWND_CLAMP**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
+ * 	Description
+ * 		Grow or shrink the room for data in the packet associated to
+ * 		*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 		There is a single supported mode at this time:
+ *
+ * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 		  (room space is added or removed below the layer 3 header).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the endpoint referenced by *map* at
+ * 		index *key*. Depending on its type, this *map* can contain
+ * 		references to net devices (for forwarding packets through other
+ * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 		but this is only implemented for native XDP (with driver
+ * 		support) as of this writing).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		When used to redirect packets to net devices, this helper
+ * 		provides a high performance increase over **bpf_redirect**\ ().
+ * 		This is due to various implementation details of the underlying
+ * 		mechanisms, one of which is the fact that **bpf_redirect_map**\
+ * 		() tries to send packet as a "bulk" to the device.
+ * 	Return
+ * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ *
+ * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
+ * 	Description
+ * 		Add an entry to, or update a *map* referencing sockets. The
+ * 		*skops* is used as a new value for the entry associated to
+ * 		*key*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		If the *map* has eBPF programs (parser and verdict), those will
+ * 		be inherited by the socket being added. If the socket is
+ * 		already attached to eBPF programs, this results in an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 		*delta* (which can be positive or negative). Note that this
+ * 		operation modifies the address stored in *xdp_md*\ **->data**,
+ * 		so the latter must be loaded only after the helper has been
+ * 		called.
+ *
+ * 		The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 		are not required to use it. The rationale is that when the
+ * 		packet is processed with XDP (e.g. as DoS filter), it is
+ * 		possible to push further meta data along with it before passing
+ * 		to the stack, and to give the guarantee that an ingress eBPF
+ * 		program attached as a TC classifier on the same device can pick
+ * 		this up for further post-processing. Since TC works with socket
+ * 		buffers, it remains possible to set from XDP the **mark** or
+ * 		**priority** pointers, or other pointers for the socket buffer.
+ * 		Having this scratch space generic and programmable allows for
+ * 		more flexibility as the user is free to store whatever meta
+ * 		data they need.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		Read the value of a perf event counter, and store it into *buf*
+ * 		of size *buf_size*. This helper relies on a *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ * 		counter is selected when *map* is updated with perf event file
+ * 		descriptors. The *map* is an array whose size is the number of
+ * 		available CPUs, and each cell contains a value relative to one
+ * 		CPU. The value to retrieve is indicated by *flags*, that
+ * 		contains the index of the CPU to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		This helper behaves in a way close to
+ * 		**bpf_perf_event_read**\ () helper, save that instead of
+ * 		just returning the value observed, it fills the *buf*
+ * 		structure. This allows for additional data to be retrieved: in
+ * 		particular, the enabled and running times (in *buf*\
+ * 		**->enabled** and *buf*\ **->running**, respectively) are
+ * 		copied. In general, **bpf_perf_event_read_value**\ () is
+ * 		recommended over **bpf_perf_event_read**\ (), which has some
+ * 		ABI issues and provides fewer functionalities.
+ *
+ * 		These values are interesting, because hardware PMU (Performance
+ * 		Monitoring Unit) counters are limited resources. When there are
+ * 		more PMU based perf events opened than available counters,
+ * 		kernel will multiplex these events so each event gets certain
+ * 		percentage (but not all) of the PMU time. In case that
+ * 		multiplexing happens, the number of samples or counter value
+ * 		will not reflect the case compared to when no multiplexing
+ * 		occurs. This makes comparison between different runs difficult.
+ * 		Typically, the counter value should be normalized before
+ * 		comparing to other experiments. The usual normalization is done
+ * 		as follows.
+ *
+ * 		::
+ *
+ * 			normalized_counter = counter * t_enabled / t_running
+ *
+ * 		Where t_enabled is the time enabled for event and t_running is
+ * 		the time running for event since last normalization. The
+ * 		enabled and running times are accumulated since the perf event
+ * 		open. To achieve scaling factor between two invocations of an
+ * 		eBPF program, users can use CPU id as the key (which is
+ * 		typical for perf array usage model) to remember the previous
+ * 		value and do the calculation inside the eBPF program.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		For an eBPF program attached to a perf event, retrieve the
+ * 		value of the event counter associated to *ctx* and store it in
+ * 		the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 		and running times are also stored in the structure (see
+ * 		description of helper **bpf_perf_event_read_value**\ () for
+ * 		more details).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **getsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **getsockopt(2)** for more information.
+ * 		The retrieved value is stored in the structure pointed by
+ * 		*optval* and of length *optlen*.
+ *
+ * 		This helper actually implements a subset of **getsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **IPPROTO_TCP**, which supports *optname*
+ * 		  **TCP_CONGESTION**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_override_return(struct pt_regs *regs, u64 rc)
+ * 	Description
+ * 		Used for error injection, this helper uses kprobes to override
+ * 		the return value of the probed function, and to set it to *rc*.
+ * 		The first argument is the context *regs* on which the kprobe
+ * 		works.
+ *
+ * 		This helper works by setting the PC (program counter)
+ * 		to an override function which is run in place of the original
+ * 		probed function. This means the probed function is not run at
+ * 		all. The replacement function just returns with the required
+ * 		value.
+ *
+ * 		This helper has security implications, and thus is subject to
+ * 		restrictions. It is only available if the kernel was compiled
+ * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 		option, and in this case it only works on functions tagged with
+ * 		**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 		Also, the helper is only available for the architectures having
+ * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 		x86 architecture is the only one to support this feature.
+ * 	Return
+ * 		0
+ *
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
+ * 	Description
+ * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 		for the full TCP socket associated to *bpf_sock* to
+ * 		*argval*.
+ *
+ * 		The primary use of this field is to determine if there should
+ * 		be calls to eBPF programs of type
+ * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 		code. A program of the same type can change its value, per
+ * 		connection and as necessary, when the connection is
+ * 		established. This field is directly accessible for reading, but
+ * 		this helper must be used for updates in order to return an
+ * 		error if an eBPF program tries to set a callback that is not
+ * 		supported in the current kernel.
+ *
+ * 		The supported callback values that *argval* can combine are:
+ *
+ * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *
+ * 		Here are some examples of where one could call such eBPF
+ * 		program:
+ *
+ * 		* When RTO fires.
+ * 		* When a packet is retransmitted.
+ * 		* When the connection terminates.
+ * 		* When a packet is sent.
+ * 		* When a packet is received.
+ * 	Return
+ * 		Code **-EINVAL** if the socket is not a full TCP socket;
+ * 		otherwise, a positive number containing the bits that could not
+ * 		be set is returned (which comes down to 0 if all bits were set
+ * 		as required).
+ *
+ * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		This helper is used in programs implementing policies at the
+ * 		socket level. If the message *msg* is allowed to pass (i.e. if
+ * 		the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 		the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, apply the verdict of the eBPF program to
+ * 		the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 		For example, this helper can be used in the following cases:
+ *
+ * 		* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 		  contains multiple logical messages that the eBPF program is
+ * 		  supposed to read and for which it should apply a verdict.
+ * 		* An eBPF program only cares to read the first *bytes* of a
+ * 		  *msg*. If the message has a large payload, then setting up
+ * 		  and calling the eBPF program repeatedly for all bytes, even
+ * 		  though the verdict is already known, would create unnecessary
+ * 		  overhead.
+ *
+ * 		When called from within an eBPF program, the helper sets a
+ * 		counter internal to the BPF infrastructure, that is used to
+ * 		apply the last verdict to the next *bytes*. If *bytes* is
+ * 		smaller than the current data being processed from a
+ * 		**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 		*bytes* will be sent and the eBPF program will be re-run with
+ * 		the pointer for start of data pointing to byte number *bytes*
+ * 		**+ 1**. If *bytes* is larger than the current data being
+ * 		processed, then the eBPF verdict will be applied to multiple
+ * 		**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 		consumed.
+ *
+ * 		Note that if a socket closes with the internal counter holding
+ * 		a non-zero value, this is not a problem because data is not
+ * 		being buffered for *bytes* and is sent as it is received.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, prevent the execution of the verdict eBPF
+ * 		program for message *msg* until *bytes* (byte number) have been
+ * 		accumulated.
+ *
+ * 		This can be used when one needs a specific number of bytes
+ * 		before a verdict can be assigned, even if the data spans
+ * 		multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 		case would be a user calling **sendmsg**\ () repeatedly with
+ * 		1-byte long message segments. Obviously, this is bad for
+ * 		performance, but it is still valid. If the eBPF program needs
+ * 		*bytes* bytes to validate a header, this helper can be used to
+ * 		prevent the eBPF program to be called again until *bytes* have
+ * 		been accumulated.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * 	Description
+ * 		For socket policies, pull in non-linear data from user space
+ * 		for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 		**->data_end** to *start* and *end* bytes offsets into *msg*,
+ * 		respectively.
+ *
+ * 		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 		*msg* it can only parse data that the (**data**, **data_end**)
+ * 		pointers have already consumed. For **sendmsg**\ () hooks this
+ * 		is likely the first scatterlist element. But for calls relying
+ * 		on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 		be the range (**0**, **0**) because the data is shared with
+ * 		user space and by default the objective is to avoid allowing
+ * 		user space to modify data while (or after) eBPF verdict is
+ * 		being decided. This helper can be used to pull in data and to
+ * 		set the start and end pointer to given values. Data will be
+ * 		copied if necessary (i.e. if data was not linear and if start
+ * 		and end pointers do not point to the same chunk).
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
+ * 	Description
+ * 		Bind the socket associated to *ctx* to the address pointed by
+ * 		*addr*, of length *addr_len*. This allows for making outgoing
+ * 		connection from the desired IP address, which can be useful for
+ * 		example when all processes inside a cgroup should use one
+ * 		single IP address on a host that has multiple IP configured.
+ *
+ * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 		**AF_INET6**). Looking for a free port to bind to can be
+ * 		expensive, therefore binding to port is not permitted by the
+ * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
+ * 		must be set to zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
+ * 		only possible to shrink the packet as of this writing,
+ * 		therefore *delta* must be a negative integer.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
+ * 	Description
+ * 		Retrieve the XFRM state (IP transform framework, see also
+ * 		**ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
+ *
+ * 		The retrieved value is stored in the **struct bpf_xfrm_state**
+ * 		pointed by *xfrm_state* and of length *size*.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_XFRM** configuration option.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * 	Description
+ * 		Return a user or a kernel stack in bpf program provided buffer.
+ * 		To achieve this, the helper needs *ctx*, which is a pointer
+ * 		to the context on which the tracing program is executed.
+ * 		To store the stacktrace, the bpf program provides *buf* with
+ * 		a nonnegative *size*.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_USER_BUILD_ID**
+ * 			Collect buildid+offset instead of ips for user stack,
+ * 			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 		**bpf_get_stack**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * 		to sufficient large buffer size. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
+ * 	Return
+ * 		A non-negative value equal to or less than *size* on success,
+ * 		or a negative error in case of failure.
+ *
+ * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * 	Description
+ * 		This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * 		it provides an easy way to load *len* bytes from *offset*
+ * 		from the packet associated to *skb*, into the buffer pointed
+ * 		by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * 		a fifth argument *start_header* exists in order to select a
+ * 		base offset to start from. *start_header* can be one of:
+ *
+ * 		**BPF_HDR_START_MAC**
+ * 			Base offset to load data from is *skb*'s mac header.
+ * 		**BPF_HDR_START_NET**
+ * 			Base offset to load data from is *skb*'s network header.
+ *
+ * 		In general, "direct packet access" is the preferred method to
+ * 		access packet data, however, this helper is in particular useful
+ * 		in socket filters where *skb*\ **->data** does not always point
+ * 		to the start of the mac header and where "direct packet access"
+ * 		is not available.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ *	Description
+ *		Do FIB lookup in kernel tables using parameters in *params*.
+ *		If lookup is successful and result shows packet is to be
+ *		forwarded, the neighbor tables are searched for the nexthop.
+ *		If successful (i.e., FIB lookup shows forwarding and nexthop
+ *		is resolved), the nexthop address is returned in ipv4_dst
+ *		or ipv6_dst based on family, smac is set to mac address of
+ *		egress device, dmac is set to nexthop mac address, rt_metric
+ *		is set to metric from route (IPv4/IPv6 only), and ifindex
+ *		is set to the device index of the nexthop from the FIB lookup.
+ *
+ *             *plen* argument is the size of the passed in struct.
+ *             *flags* argument can be a combination of one or more of the
+ *             following values:
+ *
+ *		**BPF_FIB_LOOKUP_DIRECT**
+ *			Do a direct table lookup vs full lookup using FIB
+ *			rules.
+ *		**BPF_FIB_LOOKUP_OUTPUT**
+ *			Perform lookup from an egress perspective (default is
+ *			ingress).
+ *
+ *             *ctx* is either **struct xdp_md** for XDP programs or
+ *             **struct sk_buff** for tc cls_act programs.
+ *     Return
+ *		* < 0 if any input argument is invalid
+ *		*   0 on success (packet is forwarded, nexthop neighbor exists)
+ *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
+ *		  packet is not forwarded or needs assist from full stack
+ *
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Add an entry to, or update a sockhash *map* referencing sockets.
+ *		The *skops* is used as a new value for the entry associated to
+ *		*key*. *flags* is one of:
+ *
+ *		**BPF_NOEXIST**
+ *			The entry for *key* must not exist in the map.
+ *		**BPF_EXIST**
+ *			The entry for *key* must already exist in the map.
+ *		**BPF_ANY**
+ *			No condition on the existence of the entry for *key*.
+ *
+ *		If the *map* has eBPF programs (parser and verdict), those will
+ *		be inherited by the socket being added. If the socket is
+ *		already attached to eBPF programs, this results in an error.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		socket level. If the message *msg* is allowed to pass (i.e. if
+ *		the verdict eBPF program returns **SK_PASS**), redirect it to
+ *		the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress path otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *		if the verdict eBPF program returns **SK_PASS**), redirect it
+ *		to the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+ *	Description
+ *		Encapsulate the packet associated to *skb* within a Layer 3
+ *		protocol header. This header is provided in the buffer at
+ *		address *hdr*, with *len* its size in bytes. *type* indicates
+ *		the protocol of the header and can be one of:
+ *
+ *		**BPF_LWT_ENCAP_SEG6**
+ *			IPv6 encapsulation with Segment Routing Header
+ *			(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ *			the IPv6 header is computed by the kernel.
+ *		**BPF_LWT_ENCAP_SEG6_INLINE**
+ *			Only works if *skb* contains an IPv6 packet. Insert a
+ *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ *			the IPv6 header.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
+ *	Description
+ *		Store *len* bytes from address *from* into the packet
+ *		associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ *		inside the outermost IPv6 Segment Routing Header can be
+ *		modified through this helper.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
+ *	Description
+ *		Adjust the size allocated to TLVs in the outermost IPv6
+ *		Segment Routing Header contained in the packet associated to
+ *		*skb*, at position *offset* by *delta* bytes. Only offsets
+ *		after the segments are accepted. *delta* can be as well
+ *		positive (growing) as negative (shrinking).
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
+ *	Description
+ *		Apply an IPv6 Segment Routing action of type *action* to the
+ *		packet associated to *skb*. Each action takes a parameter
+ *		contained at address *param*, and of length *param_len* bytes.
+ *		*action* can be one of:
+ *
+ *		**SEG6_LOCAL_ACTION_END_X**
+ *			End.X action: Endpoint with Layer-3 cross-connect.
+ *			Type of *param*: **struct in6_addr**.
+ *		**SEG6_LOCAL_ACTION_END_T**
+ *			End.T action: Endpoint with specific IPv6 table lookup.
+ *			Type of *param*: **int**.
+ *		**SEG6_LOCAL_ACTION_END_B6**
+ *			End.B6 action: Endpoint bound to an SRv6 policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *		**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ *			End.B6.Encap action: Endpoint bound to an SRv6
+ *			encapsulation policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown** () again with the same values, or calling
+ *		**bpf_rc_repeat** ().
+ *
+ *		Some protocols include a toggle bit, in case the button was
+ *		released and pressed again between consecutive scancodes.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
+ *		This helper is only available if the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *	Return
+ *		0
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
+ *
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available if the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *	Return
+ *		0
+ *
+ * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * 	Description
+ * 		Return the cgroup v2 id of the socket associated with the *skb*.
+ * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 		helper for cgroup v1 by providing a tag resp. identifier that
+ * 		can be matched on or used for map lookups e.g. to implement
+ * 		policy. The cgroup v2 id of a given path in the hierarchy is
+ * 		exposed in user space through the f_handle API in order to get
+ * 		to the same 64-bit id.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress,
+ * 		and is available only if the kernel was compiled with the
+ * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *skb* at the *ancestor_level*.  The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *skb*, then return value will be same as that
+ *		of **bpf_skb_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *skb*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_skb_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_get_current_cgroup_id(void)
+ * 	Return
+ * 		A 64-bit integer containing the current cgroup id based
+ * 		on the cgroup within which the current task is running.
+ *
+ * void* get_local_storage(void *map, u64 flags)
+ *	Description
+ *		Get the pointer to the local storage area.
+ *		The type and the size of the local storage is defined
+ *		by the *map* argument.
+ *		The *flags* meaning is specific for each map type,
+ *		and has to be 0 for cgroup local storage.
+ *
+ *		Depending on the bpf program type, a local storage area
+ *		can be shared between multiple instances of the bpf program,
+ *		running simultaneously.
+ *
+ *		A user should care about the synchronization by himself.
+ *		For example, by using the BPF_STX_XADD instruction to alter
+ *		the shared data.
+ *	Return
+ *		Pointer to the local storage area.
+ *
+ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map.
+ *		It checks the selected sk is matching the incoming
+ *		request in the skb.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for TCP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this is in the netns of the device in the skb. For socket hooks,
+ *		this is in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for UDP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this is in the netns of the device in the skb. For socket hooks,
+ *		this is in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * int bpf_sk_release(struct bpf_sock *sk)
+ *	Description
+ *		Release the reference held by *sk*. *sk* must be a non-NULL
+ *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
+ *	Description
+ *		For socket policies, insert *len* bytes into msg at offset
+ *		*start*.
+ *
+ *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ *		*msg* it may want to insert metadata or options into the msg.
+ *		This can later be read and used by any of the lower layer BPF
+ *		hooks.
+ *
+ *		This helper may fail if under memory pressure (a malloc
+ *		fails) in these cases BPF programs will get an appropriate
+ *		error and BPF programs will need to handle them.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ */
+#define __BPF_FUNC_MAPPER(FN)		\
+	FN(unspec),			\
+	FN(map_lookup_elem),		\
+	FN(map_update_elem),		\
+	FN(map_delete_elem),		\
+	FN(probe_read),			\
+	FN(ktime_get_ns),		\
+	FN(trace_printk),		\
+	FN(get_prandom_u32),		\
+	FN(get_smp_processor_id),	\
+	FN(skb_store_bytes),		\
+	FN(l3_csum_replace),		\
+	FN(l4_csum_replace),		\
+	FN(tail_call),			\
+	FN(clone_redirect),		\
+	FN(get_current_pid_tgid),	\
+	FN(get_current_uid_gid),	\
+	FN(get_current_comm),		\
+	FN(get_cgroup_classid),		\
+	FN(skb_vlan_push),		\
+	FN(skb_vlan_pop),		\
+	FN(skb_get_tunnel_key),		\
+	FN(skb_set_tunnel_key),		\
+	FN(perf_event_read),		\
+	FN(redirect),			\
+	FN(get_route_realm),		\
+	FN(perf_event_output),		\
+	FN(skb_load_bytes),		\
+	FN(get_stackid),		\
+	FN(csum_diff),			\
+	FN(skb_get_tunnel_opt),		\
+	FN(skb_set_tunnel_opt),		\
+	FN(skb_change_proto),		\
+	FN(skb_change_type),		\
+	FN(skb_under_cgroup),		\
+	FN(get_hash_recalc),		\
+	FN(get_current_task),		\
+	FN(probe_write_user),		\
+	FN(current_task_under_cgroup),	\
+	FN(skb_change_tail),		\
+	FN(skb_pull_data),		\
+	FN(csum_update),		\
+	FN(set_hash_invalid),		\
+	FN(get_numa_node_id),		\
+	FN(skb_change_head),		\
+	FN(xdp_adjust_head),		\
+	FN(probe_read_str),		\
+	FN(get_socket_cookie),		\
+	FN(get_socket_uid),		\
+	FN(set_hash),			\
+	FN(setsockopt),			\
+	FN(skb_adjust_room),		\
+	FN(redirect_map),		\
+	FN(sk_redirect_map),		\
+	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),		\
+	FN(perf_event_read_value),	\
+	FN(perf_prog_read_value),	\
+	FN(getsockopt),			\
+	FN(override_return),		\
+	FN(sock_ops_cb_flags_set),	\
+	FN(msg_redirect_map),		\
+	FN(msg_apply_bytes),		\
+	FN(msg_cork_bytes),		\
+	FN(msg_pull_data),		\
+	FN(bind),			\
+	FN(xdp_adjust_tail),		\
+	FN(skb_get_xfrm_state),		\
+	FN(get_stack),			\
+	FN(skb_load_bytes_relative),	\
+	FN(fib_lookup),			\
+	FN(sock_hash_update),		\
+	FN(msg_redirect_hash),		\
+	FN(sk_redirect_hash),		\
+	FN(lwt_push_encap),		\
+	FN(lwt_seg6_store_bytes),	\
+	FN(lwt_seg6_adjust_srh),	\
+	FN(lwt_seg6_action),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),			\
+	FN(skb_cgroup_id),		\
+	FN(get_current_cgroup_id),	\
+	FN(get_local_storage),		\
+	FN(sk_select_reuseport),	\
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),			\
+	FN(map_push_elem),		\
+	FN(map_pop_elem),		\
+	FN(map_peek_elem),		\
+	FN(msg_push_data),
+
+/* integer value in 'imm' field of BPF_CALL instruction selects which helper
+ * function eBPF program intends to call
+ */
+#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x
+enum bpf_func_id {
+	__BPF_FUNC_MAPPER(__BPF_ENUM_FN)
+	__BPF_FUNC_MAX_ID,
+};
+#undef __BPF_ENUM_FN
+
+/* All flags used by eBPF helper functions, placed here. */
+
+/* BPF_FUNC_skb_store_bytes flags. */
+#define BPF_F_RECOMPUTE_CSUM		(1ULL << 0)
+#define BPF_F_INVALIDATE_HASH		(1ULL << 1)
+
+/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
+ * First 4 bits are for passing the header field size.
+ */
+#define BPF_F_HDR_FIELD_MASK		0xfULL
+
+/* BPF_FUNC_l4_csum_replace flags. */
+#define BPF_F_PSEUDO_HDR		(1ULL << 4)
+#define BPF_F_MARK_MANGLED_0		(1ULL << 5)
+#define BPF_F_MARK_ENFORCE		(1ULL << 6)
+
+/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
+#define BPF_F_INGRESS			(1ULL << 0)
+
+/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
+#define BPF_F_TUNINFO_IPV6		(1ULL << 0)
+
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
+#define BPF_F_SKIP_FIELD_MASK		0xffULL
+#define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
+#define BPF_F_FAST_STACK_CMP		(1ULL << 9)
+#define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
+
+/* BPF_FUNC_skb_set_tunnel_key flags. */
+#define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
+#define BPF_F_DONT_FRAGMENT		(1ULL << 2)
+#define BPF_F_SEQ_NUMBER		(1ULL << 3)
+
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
+#define BPF_F_INDEX_MASK		0xffffffffULL
+#define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
+/* BPF_FUNC_perf_event_output for sk_buff input context. */
+#define BPF_F_CTXLEN_MASK		(0xfffffULL << 32)
+
+/* Mode for BPF_FUNC_skb_adjust_room helper. */
+enum bpf_adj_room_mode {
+	BPF_ADJ_ROOM_NET,
+};
+
+/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+enum bpf_hdr_start_off {
+	BPF_HDR_START_MAC,
+	BPF_HDR_START_NET,
+};
+
+/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
+enum bpf_lwt_encap_mode {
+	BPF_LWT_ENCAP_SEG6,
+	BPF_LWT_ENCAP_SEG6_INLINE
+};
+
+/* user accessible mirror of in-kernel sk_buff.
+ * new fields can only be added to the end of this structure
+ */
+struct __sk_buff {
+	__u32 len;
+	__u32 pkt_type;
+	__u32 mark;
+	__u32 queue_mapping;
+	__u32 protocol;
+	__u32 vlan_present;
+	__u32 vlan_tci;
+	__u32 vlan_proto;
+	__u32 priority;
+	__u32 ingress_ifindex;
+	__u32 ifindex;
+	__u32 tc_index;
+	__u32 cb[5];
+	__u32 hash;
+	__u32 tc_classid;
+	__u32 data;
+	__u32 data_end;
+	__u32 napi_id;
+
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
+	struct bpf_flow_keys *flow_keys;
+};
+
+struct bpf_tunnel_key {
+	__u32 tunnel_id;
+	union {
+		__u32 remote_ipv4;
+		__u32 remote_ipv6[4];
+	};
+	__u8 tunnel_tos;
+	__u8 tunnel_ttl;
+	__u16 tunnel_ext;	/* Padding, future use. */
+	__u32 tunnel_label;
+};
+
+/* user accessible mirror of in-kernel xfrm_state.
+ * new fields can only be added to the end of this structure
+ */
+struct bpf_xfrm_state {
+	__u32 reqid;
+	__u32 spi;	/* Stored in network byte order */
+	__u16 family;
+	__u16 ext;	/* Padding, future use. */
+	union {
+		__u32 remote_ipv4;	/* Stored in network byte order */
+		__u32 remote_ipv6[4];	/* Stored in network byte order */
+	};
+};
+
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled separately, see XDP_*.
+ */
+enum bpf_ret_code {
+	BPF_OK = 0,
+	/* 1 reserved */
+	BPF_DROP = 2,
+	/* 3-6 reserved */
+	BPF_REDIRECT = 7,
+	/* >127 are reserved for prog type specific return codes */
+};
+
+struct bpf_sock {
+	__u32 bound_dev_if;
+	__u32 family;
+	__u32 type;
+	__u32 protocol;
+	__u32 mark;
+	__u32 priority;
+	__u32 src_ip4;		/* Allows 1,2,4-byte read.
+				 * Stored in network byte order.
+				 */
+	__u32 src_ip6[4];	/* Allows 1,2,4-byte read.
+				 * Stored in network byte order.
+				 */
+	__u32 src_port;		/* Allows 4-byte read.
+				 * Stored in host byte order
+				 */
+};
+
+struct bpf_sock_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
+#define XDP_PACKET_HEADROOM 256
+
+/* User return codes for XDP prog type.
+ * A valid XDP program must return one of these defined values. All other
+ * return codes are reserved for future use. Unknown return codes will
+ * result in packet drops and a warning via bpf_warn_invalid_xdp_action().
+ */
+enum xdp_action {
+	XDP_ABORTED = 0,
+	XDP_DROP,
+	XDP_PASS,
+	XDP_TX,
+	XDP_REDIRECT,
+};
+
+/* user accessible metadata for XDP packet hook
+ * new fields must be added to the end of this structure
+ */
+struct xdp_md {
+	__u32 data;
+	__u32 data_end;
+	__u32 data_meta;
+	/* Below access go through struct xdp_rxq_info */
+	__u32 ingress_ifindex; /* rxq->dev->ifindex */
+	__u32 rx_queue_index;  /* rxq->queue_index  */
+};
+
+enum sk_action {
+	SK_DROP = 0,
+	SK_PASS,
+};
+
+/* user accessible metadata for SK_MSG packet hook, new fields must
+ * be added to the end of this structure
+ */
+struct sk_msg_md {
+	void *data;
+	void *data_end;
+
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
+};
+
+struct sk_reuseport_md {
+	/*
+	 * Start of directly accessible data. It begins from
+	 * the tcp/udp header.
+	 */
+	void *data;
+	void *data_end;		/* End of directly accessible data */
+	/*
+	 * Total length of packet (starting from the tcp/udp header).
+	 * Note that the directly accessible bytes (data_end - data)
+	 * could be less than this "len".  Those bytes could be
+	 * indirectly read by a helper "bpf_skb_load_bytes()".
+	 */
+	__u32 len;
+	/*
+	 * Eth protocol in the mac header (network byte order). e.g.
+	 * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+	 */
+	__u32 eth_protocol;
+	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+	__u32 bind_inany;	/* Is sock bound to an INANY address? */
+	__u32 hash;		/* A hash of the packet 4 tuples */
+};
+
+#define BPF_TAG_SIZE	8
+
+struct bpf_prog_info {
+	__u32 type;
+	__u32 id;
+	__u8  tag[BPF_TAG_SIZE];
+	__u32 jited_prog_len;
+	__u32 xlated_prog_len;
+	__aligned_u64 jited_prog_insns;
+	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u32 gpl_compatible:1;
+	__u64 netns_dev;
+	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
+	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
+} __attribute__((aligned(8)));
+
+struct bpf_map_info {
+	__u32 type;
+	__u32 id;
+	__u32 key_size;
+	__u32 value_size;
+	__u32 max_entries;
+	__u32 map_flags;
+	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u32 :32;
+	__u64 netns_dev;
+	__u64 netns_ino;
+	__u32 btf_id;
+	__u32 btf_key_type_id;
+	__u32 btf_value_type_id;
+} __attribute__((aligned(8)));
+
+struct bpf_btf_info {
+	__aligned_u64 btf;
+	__u32 btf_size;
+	__u32 id;
+} __attribute__((aligned(8)));
+
+/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
+ * by user and intended to be used by socket (e.g. to bind to, depends on
+ * attach type).
+ */
+struct bpf_sock_addr {
+	__u32 user_family;	/* Allows 4-byte read, but no write. */
+	__u32 user_ip4;		/* Allows 1,2,4-byte read and 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 user_ip6[4];	/* Allows 1,2,4-byte read and 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 user_port;	/* Allows 4-byte read and write.
+				 * Stored in network byte order
+				 */
+	__u32 family;		/* Allows 4-byte read, but no write */
+	__u32 type;		/* Allows 4-byte read, but no write */
+	__u32 protocol;		/* Allows 4-byte read, but no write */
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read and 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read and 4-byte write.
+				 * Stored in network byte order.
+				 */
+};
+
+/* User bpf_sock_ops struct to access socket values and specify request ops
+ * and their replies.
+ * Some of these fields are in network (bigendian) byte order and may need
+ * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h).
+ * New fields can only be added at the end of this structure
+ */
+struct bpf_sock_ops {
+	__u32 op;
+	union {
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
+	};
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u32 lost_out;
+	__u32 sacked_out;
+	__u32 sk_txhash;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+};
+
+/* Definitions for bpf_sock_ops_cb_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
+							 * supported cb flags
+							 */
+
+/* List of known BPF sock_ops operators.
+ * New entries can only be added at the end
+ */
+enum {
+	BPF_SOCK_OPS_VOID,
+	BPF_SOCK_OPS_TIMEOUT_INIT,	/* Should return SYN-RTO value to use or
+					 * -1 if default value should be used
+					 */
+	BPF_SOCK_OPS_RWND_INIT,		/* Should return initial advertized
+					 * window (in packets) or -1 if default
+					 * value should be used
+					 */
+	BPF_SOCK_OPS_TCP_CONNECT_CB,	/* Calls BPF program right before an
+					 * active connection is initialized
+					 */
+	BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB,	/* Calls BPF program when an
+						 * active connection is
+						 * established
+						 */
+	BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,	/* Calls BPF program when a
+						 * passive connection is
+						 * established
+						 */
+	BPF_SOCK_OPS_NEEDS_ECN,		/* If connection's congestion control
+					 * needs ECN
+					 */
+	BPF_SOCK_OPS_BASE_RTT,		/* Get base RTT. The correct value is
+					 * based on the path and may be
+					 * dependent on the congestion control
+					 * algorithm. In general it indicates
+					 * a congestion threshold. RTTs above
+					 * this indicate congestion
+					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 * Arg3: return value of
+					 *       tcp_transmit_skb (0 => success)
+					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+	BPF_SOCK_OPS_TCP_LISTEN_CB,	/* Called on listen(2), right after
+					 * socket transition to LISTEN state.
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
+};
+
+#define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
+#define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
+
+struct bpf_perf_event_value {
+	__u64 counter;
+	__u64 enabled;
+	__u64 running;
+};
+
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	/* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
+	__u32 access_type;
+	__u32 major;
+	__u32 minor;
+};
+
+struct bpf_raw_tracepoint_args {
+	__u64 args[0];
+};
+
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+enum {
+	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
+	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed; can be dropped */
+	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable; can be dropped */
+	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed; can be dropped */
+	BPF_FIB_LKUP_RET_NOT_FWDED,    /* packet is not forwarded */
+	BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
+	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires encapsulation */
+	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neighbor entry for nh */
+	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* fragmentation required to fwd */
+};
+
+struct bpf_fib_lookup {
+	/* input:  network family for lookup (AF_INET, AF_INET6)
+	 * output: network family of egress nexthop
+	 */
+	__u8	family;
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+
+	/* input: L3 device index for lookup
+	 * output: device index from FIB lookup
+	 */
+	__u32	ifindex;
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowinfo;	/* AF_INET6, flow_label + priority */
+
+		/* output: metric of fib result (IPv4/IPv6 only) */
+		__u32	rt_metric;
+	};
+
+	union {
+		__be32		ipv4_src;
+		__u32		ipv6_src[4];  /* in6_addr; network order */
+	};
+
+	/* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+	 * network header. output: bpf_fib_lookup sets to gateway address
+	 * if FIB lookup returns gateway route
+	 */
+	union {
+		__be32		ipv4_dst;
+		__u32		ipv6_dst[4];  /* in6_addr; network order */
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[6];     /* ETH_ALEN */
+	__u8	dmac[6];     /* ETH_ALEN */
+};
+
+enum bpf_task_fd_type {
+	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
+	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
+	BPF_FD_TYPE_KPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_UPROBE,		/* filename + offset */
+	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
+};
+
+struct bpf_flow_keys {
+	__u16	nhoff;
+	__u16	thoff;
+	__u16	addr_proto;			/* ETH_P_* of valid addrs */
+	__u8	is_frag;
+	__u8	is_first_frag;
+	__u8	is_encap;
+	__u8	ip_proto;
+	__be16	n_proto;
+	__be16	sport;
+	__be16	dport;
+	union {
+		struct {
+			__be32	ipv4_src;
+			__be32	ipv4_dst;
+		};
+		struct {
+			__u32	ipv6_src[4];	/* in6_addr; network order */
+			__u32	ipv6_dst[4];	/* in6_addr; network order */
+		};
+	};
+};
+
+#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/src/cc/compat/linux/bpf_common.h b/src/cc/compat/linux/bpf_common.h
new file mode 100644
index 0000000..a5c220e
--- /dev/null
+++ b/src/cc/compat/linux/bpf_common.h
@@ -0,0 +1,55 @@
+#ifndef _UAPI__LINUX_BPF_COMMON_H__
+#define _UAPI__LINUX_BPF_COMMON_H__
+
+/* Instruction classes */
+#define BPF_CLASS(code) ((code) & 0x07)
+#define		BPF_LD		0x00
+#define		BPF_LDX		0x01
+#define		BPF_ST		0x02
+#define		BPF_STX		0x03
+#define		BPF_ALU		0x04
+#define		BPF_JMP		0x05
+#define		BPF_RET		0x06
+#define		BPF_MISC        0x07
+
+/* ld/ldx fields */
+#define BPF_SIZE(code)  ((code) & 0x18)
+#define		BPF_W		0x00
+#define		BPF_H		0x08
+#define		BPF_B		0x10
+#define BPF_MODE(code)  ((code) & 0xe0)
+#define		BPF_IMM		0x00
+#define		BPF_ABS		0x20
+#define		BPF_IND		0x40
+#define		BPF_MEM		0x60
+#define		BPF_LEN		0x80
+#define		BPF_MSH		0xa0
+
+/* alu/jmp fields */
+#define BPF_OP(code)    ((code) & 0xf0)
+#define		BPF_ADD		0x00
+#define		BPF_SUB		0x10
+#define		BPF_MUL		0x20
+#define		BPF_DIV		0x30
+#define		BPF_OR		0x40
+#define		BPF_AND		0x50
+#define		BPF_LSH		0x60
+#define		BPF_RSH		0x70
+#define		BPF_NEG		0x80
+#define		BPF_MOD		0x90
+#define		BPF_XOR		0xa0
+
+#define		BPF_JA		0x00
+#define		BPF_JEQ		0x10
+#define		BPF_JGT		0x20
+#define		BPF_JGE		0x30
+#define		BPF_JSET        0x40
+#define BPF_SRC(code)   ((code) & 0x08)
+#define		BPF_K		0x00
+#define		BPF_X		0x08
+
+#ifndef BPF_MAXINSNS
+#define BPF_MAXINSNS 4096
+#endif
+
+#endif /* _UAPI__LINUX_BPF_COMMON_H__ */
diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h
new file mode 100644
index 0000000..26039d5
--- /dev/null
+++ b/src/cc/compat/linux/virtual_bpf.h
@@ -0,0 +1,2947 @@
+R"********(
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _UAPI__LINUX_BPF_H__
+#define _UAPI__LINUX_BPF_H__
+
+#include <linux/types.h>
+#include <linux/bpf_common.h>
+
+/* Extended instruction set based on top of classic BPF */
+
+/* instruction classes */
+#define BPF_ALU64	0x07	/* alu mode in double word width */
+
+/* ld/ldx fields */
+#define BPF_DW		0x18	/* double word (64-bit) */
+#define BPF_XADD	0xc0	/* exclusive add */
+
+/* alu/jmp fields */
+#define BPF_MOV		0xb0	/* mov reg to reg */
+#define BPF_ARSH	0xc0	/* sign extending arithmetic shift right */
+
+/* change endianness of a register */
+#define BPF_END		0xd0	/* flags for endianness conversion: */
+#define BPF_TO_LE	0x00	/* convert to little-endian */
+#define BPF_TO_BE	0x08	/* convert to big-endian */
+#define BPF_FROM_LE	BPF_TO_LE
+#define BPF_FROM_BE	BPF_TO_BE
+
+/* jmp encodings */
+#define BPF_JNE		0x50	/* jump != */
+#define BPF_JLT		0xa0	/* LT is unsigned, '<' */
+#define BPF_JLE		0xb0	/* LE is unsigned, '<=' */
+#define BPF_JSGT	0x60	/* SGT is signed '>', GT in x86 */
+#define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
+#define BPF_JSLT	0xc0	/* SLT is signed, '<' */
+#define BPF_JSLE	0xd0	/* SLE is signed, '<=' */
+#define BPF_CALL	0x80	/* function call */
+#define BPF_EXIT	0x90	/* function return */
+
+/* Register numbers */
+enum {
+	BPF_REG_0 = 0,
+	BPF_REG_1,
+	BPF_REG_2,
+	BPF_REG_3,
+	BPF_REG_4,
+	BPF_REG_5,
+	BPF_REG_6,
+	BPF_REG_7,
+	BPF_REG_8,
+	BPF_REG_9,
+	BPF_REG_10,
+	__MAX_BPF_REG,
+};
+
+/* BPF has 10 general purpose 64-bit registers and stack frame. */
+#define MAX_BPF_REG	__MAX_BPF_REG
+
+struct bpf_insn {
+	__u8	code;		/* opcode */
+	__u8	dst_reg:4;	/* dest register */
+	__u8	src_reg:4;	/* source register */
+	__s16	off;		/* signed offset */
+	__s32	imm;		/* signed immediate constant */
+};
+
+/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */
+struct bpf_lpm_trie_key {
+	__u32	prefixlen;	/* up to 32 for AF_INET, 128 for AF_INET6 */
+	__u8	data[0];	/* Arbitrary size */
+};
+
+struct bpf_cgroup_storage_key {
+	__u64	cgroup_inode_id;	/* cgroup inode id */
+	__u32	attach_type;		/* program attach type */
+};
+
+/* BPF syscall commands, see bpf(2) man-page for details. */
+enum bpf_cmd {
+	BPF_MAP_CREATE,
+	BPF_MAP_LOOKUP_ELEM,
+	BPF_MAP_UPDATE_ELEM,
+	BPF_MAP_DELETE_ELEM,
+	BPF_MAP_GET_NEXT_KEY,
+	BPF_PROG_LOAD,
+	BPF_OBJ_PIN,
+	BPF_OBJ_GET,
+	BPF_PROG_ATTACH,
+	BPF_PROG_DETACH,
+	BPF_PROG_TEST_RUN,
+	BPF_PROG_GET_NEXT_ID,
+	BPF_MAP_GET_NEXT_ID,
+	BPF_PROG_GET_FD_BY_ID,
+	BPF_MAP_GET_FD_BY_ID,
+	BPF_OBJ_GET_INFO_BY_FD,
+	BPF_PROG_QUERY,
+	BPF_RAW_TRACEPOINT_OPEN,
+	BPF_BTF_LOAD,
+	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
+	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
+};
+
+enum bpf_map_type {
+	BPF_MAP_TYPE_UNSPEC,
+	BPF_MAP_TYPE_HASH,
+	BPF_MAP_TYPE_ARRAY,
+	BPF_MAP_TYPE_PROG_ARRAY,
+	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	BPF_MAP_TYPE_PERCPU_HASH,
+	BPF_MAP_TYPE_PERCPU_ARRAY,
+	BPF_MAP_TYPE_STACK_TRACE,
+	BPF_MAP_TYPE_CGROUP_ARRAY,
+	BPF_MAP_TYPE_LRU_HASH,
+	BPF_MAP_TYPE_LRU_PERCPU_HASH,
+	BPF_MAP_TYPE_LPM_TRIE,
+	BPF_MAP_TYPE_ARRAY_OF_MAPS,
+	BPF_MAP_TYPE_HASH_OF_MAPS,
+	BPF_MAP_TYPE_DEVMAP,
+	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
+	BPF_MAP_TYPE_SOCKHASH,
+	BPF_MAP_TYPE_CGROUP_STORAGE,
+	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	BPF_MAP_TYPE_QUEUE,
+	BPF_MAP_TYPE_STACK,
+};
+
+enum bpf_prog_type {
+	BPF_PROG_TYPE_UNSPEC,
+	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_KPROBE,
+	BPF_PROG_TYPE_SCHED_CLS,
+	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_TRACEPOINT,
+	BPF_PROG_TYPE_XDP,
+	BPF_PROG_TYPE_PERF_EVENT,
+	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_CGROUP_SOCK,
+	BPF_PROG_TYPE_LWT_IN,
+	BPF_PROG_TYPE_LWT_OUT,
+	BPF_PROG_TYPE_LWT_XMIT,
+	BPF_PROG_TYPE_SOCK_OPS,
+	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
+	BPF_PROG_TYPE_SK_MSG,
+	BPF_PROG_TYPE_RAW_TRACEPOINT,
+	BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+	BPF_PROG_TYPE_LWT_SEG6LOCAL,
+	BPF_PROG_TYPE_LIRC_MODE2,
+	BPF_PROG_TYPE_SK_REUSEPORT,
+	BPF_PROG_TYPE_FLOW_DISSECTOR,
+};
+
+enum bpf_attach_type {
+	BPF_CGROUP_INET_INGRESS,
+	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK_CREATE,
+	BPF_CGROUP_SOCK_OPS,
+	BPF_SK_SKB_STREAM_PARSER,
+	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
+	BPF_SK_MSG_VERDICT,
+	BPF_CGROUP_INET4_BIND,
+	BPF_CGROUP_INET6_BIND,
+	BPF_CGROUP_INET4_CONNECT,
+	BPF_CGROUP_INET6_CONNECT,
+	BPF_CGROUP_INET4_POST_BIND,
+	BPF_CGROUP_INET6_POST_BIND,
+	BPF_CGROUP_UDP4_SENDMSG,
+	BPF_CGROUP_UDP6_SENDMSG,
+	BPF_LIRC_MODE2,
+	BPF_FLOW_DISSECTOR,
+	__MAX_BPF_ATTACH_TYPE
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags has to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When children program makes decision (like picking TCP CA or sock bind)
+ * parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
+ */
+#define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
+
+/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
+ * verifier will perform strict alignment checking as if the kernel
+ * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set,
+ * and NET_IP_ALIGN defined to 2.
+ */
+#define BPF_F_STRICT_ALIGNMENT	(1U << 0)
+
+/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
+#define BPF_PSEUDO_MAP_FD	1
+
+/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
+ * offset to another bpf function
+ */
+#define BPF_PSEUDO_CALL		1
+
+/* flags for BPF_MAP_UPDATE_ELEM command */
+#define BPF_ANY		0 /* create new element or update existing */
+#define BPF_NOEXIST	1 /* create new element if it didn't exist */
+#define BPF_EXIST	2 /* update existing element */
+
+/* flags for BPF_MAP_CREATE command */
+#define BPF_F_NO_PREALLOC	(1U << 0)
+/* Instead of having one common LRU list in the
+ * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
+ * which can scale and perform better.
+ * Note, the LRU nodes (including free nodes) cannot be moved
+ * across different LRU lists.
+ */
+#define BPF_F_NO_COMMON_LRU	(1U << 1)
+/* Specify numa node during map creation */
+#define BPF_F_NUMA_NODE		(1U << 2)
+
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
+#define BPF_OBJ_NAME_LEN 16U
+
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY		(1U << 3)
+#define BPF_F_WRONLY		(1U << 4)
+
+/* Flag for stack_map, store build_id+offset instead of pointer */
+#define BPF_F_STACK_BUILD_ID	(1U << 5)
+
+enum bpf_stack_build_id_status {
+	/* user space need an empty entry to identify end of a trace */
+	BPF_STACK_BUILD_ID_EMPTY = 0,
+	/* with valid build_id and offset */
+	BPF_STACK_BUILD_ID_VALID = 1,
+	/* couldn't get build_id, fallback to ip */
+	BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+	__s32		status;
+	unsigned char	build_id[BPF_BUILD_ID_SIZE];
+	union {
+		__u64	offset;
+		__u64	ip;
+	};
+};
+
+union bpf_attr {
+	struct { /* anonymous struct used by BPF_MAP_CREATE command */
+		__u32	map_type;	/* one of enum bpf_map_type */
+		__u32	key_size;	/* size of key in bytes */
+		__u32	value_size;	/* size of value in bytes */
+		__u32	max_entries;	/* max number of entries in a map */
+		__u32	map_flags;	/* BPF_MAP_CREATE related
+					 * flags defined above.
+					 */
+		__u32	inner_map_fd;	/* fd pointing to the inner map */
+		__u32	numa_node;	/* numa node (effective only if
+					 * BPF_F_NUMA_NODE is set).
+					 */
+		char	map_name[BPF_OBJ_NAME_LEN];
+		__u32	map_ifindex;	/* ifindex of netdev to create on */
+		__u32	btf_fd;		/* fd pointing to a BTF type data */
+		__u32	btf_key_type_id;	/* BTF type_id of the key */
+		__u32	btf_value_type_id;	/* BTF type_id of the value */
+	};
+
+	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+		__u32		map_fd;
+		__aligned_u64	key;
+		union {
+			__aligned_u64 value;
+			__aligned_u64 next_key;
+		};
+		__u64		flags;
+	};
+
+	struct { /* anonymous struct used by BPF_PROG_LOAD command */
+		__u32		prog_type;	/* one of enum bpf_prog_type */
+		__u32		insn_cnt;
+		__aligned_u64	insns;
+		__aligned_u64	license;
+		__u32		log_level;	/* verbosity level of verifier */
+		__u32		log_size;	/* size of user buffer */
+		__aligned_u64	log_buf;	/* user supplied buffer */
+		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		prog_flags;
+		char		prog_name[BPF_OBJ_NAME_LEN];
+		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
+		/* For some prog types expected attach type must be known at
+		 * load time to verify attach type specific parts of prog
+		 * (context accesses, allowed helpers, etc).
+		 */
+		__u32		expected_attach_type;
+	};
+
+	struct { /* anonymous struct used by BPF_OBJ_* commands */
+		__aligned_u64	pathname;
+		__u32		bpf_fd;
+		__u32		file_flags;
+	};
+
+	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+		__u32		target_fd;	/* container object to attach to */
+		__u32		attach_bpf_fd;	/* eBPF program to attach */
+		__u32		attach_type;
+		__u32		attach_flags;
+	};
+
+	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+		__u32		prog_fd;
+		__u32		retval;
+		__u32		data_size_in;
+		__u32		data_size_out;
+		__aligned_u64	data_in;
+		__aligned_u64	data_out;
+		__u32		repeat;
+		__u32		duration;
+	} test;
+
+	struct { /* anonymous struct used by BPF_*_GET_*_ID */
+		union {
+			__u32		start_id;
+			__u32		prog_id;
+			__u32		map_id;
+			__u32		btf_id;
+		};
+		__u32		next_id;
+		__u32		open_flags;
+	};
+
+	struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
+		__u32		bpf_fd;
+		__u32		info_len;
+		__aligned_u64	info;
+	} info;
+
+	struct { /* anonymous struct used by BPF_PROG_QUERY command */
+		__u32		target_fd;	/* container object to query */
+		__u32		attach_type;
+		__u32		query_flags;
+		__u32		attach_flags;
+		__aligned_u64	prog_ids;
+		__u32		prog_cnt;
+	} query;
+
+	struct {
+		__u64 name;
+		__u32 prog_fd;
+	} raw_tracepoint;
+
+	struct { /* anonymous struct for BPF_BTF_LOAD */
+		__aligned_u64	btf;
+		__aligned_u64	btf_log_buf;
+		__u32		btf_size;
+		__u32		btf_log_size;
+		__u32		btf_log_level;
+	};
+
+	struct {
+		__u32		pid;		/* input: pid */
+		__u32		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input/output: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prod_id */
+		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
+} __attribute__((aligned(8)));
+
+/* The description below is an attempt at providing documentation to eBPF
+ * developers about the multiple available eBPF helper functions. It can be
+ * parsed and used to produce a manual page. The workflow is the following,
+ * and requires the rst2man utility:
+ *
+ *     $ ./scripts/bpf_helpers_doc.py \
+ *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
+ *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
+ *     $ man /tmp/bpf-helpers.7
+ *
+ * Note that in order to produce this external documentation, some RST
+ * formatting is used in the descriptions to get "bold" and "italics" in
+ * manual pages. Also note that the few trailing white spaces are
+ * intentional, removing them would break paragraphs for rst2man.
+ *
+ * Start of BPF helper function descriptions:
+ *
+ * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Perform a lookup in *map* for an entry associated to *key*.
+ * 	Return
+ * 		Map value associated to *key*, or **NULL** if no entry was
+ * 		found.
+ *
+ * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+ * 	Description
+ * 		Add or update the value of the entry associated to *key* in
+ * 		*map* with *value*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		Flag value **BPF_NOEXIST** cannot be used for maps of types
+ * 		**BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY**  (all
+ * 		elements always exist), the helper would return an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_delete_elem(struct bpf_map *map, const void *key)
+ * 	Description
+ * 		Delete entry with *key* from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ * 	Description
+ * 		Push an element *value* in *map*. *flags* is one of:
+ *
+ * 		**BPF_EXIST**
+ * 		If the queue/stack is full, the oldest element is removed to
+ * 		make room for this.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read(void *dst, u32 size, const void *src)
+ * 	Description
+ * 		For tracing programs, safely attempt to read *size* bytes from
+ * 		address *src* and store the data in *dst*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_ktime_get_ns(void)
+ * 	Description
+ * 		Return the time elapsed since system boot, in nanoseconds.
+ * 	Return
+ * 		Current *ktime*.
+ *
+ * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		This helper is a "printk()-like" facility for debugging. It
+ * 		prints a message defined by format *fmt* (of size *fmt_size*)
+ * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		available. It can take up to three additional **u64**
+ * 		arguments (as an eBPF helpers, the total number of arguments is
+ * 		limited to five).
+ *
+ * 		Each time the helper is called, it appends a line to the trace.
+ * 		The format of the trace is customizable, and the exact output
+ * 		one will get depends on the options set in
+ * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*README* file under the same directory). However, it usually
+ * 		defaults to something like:
+ *
+ * 		::
+ *
+ * 			telnet-470   [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * 		In the above:
+ *
+ * 			* ``telnet`` is the name of the current task.
+ * 			* ``470`` is the PID of the current task.
+ * 			* ``001`` is the CPU number on which the task is
+ * 			  running.
+ * 			* In ``.N..``, each character refers to a set of
+ * 			  options (whether irqs are enabled, scheduling
+ * 			  options, whether hard/softirqs are running, level of
+ * 			  preempt_disabled respectively). **N** means that
+ * 			  **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * 			  are set.
+ * 			* ``419421.045894`` is a timestamp.
+ * 			* ``0x00000001`` is a fake value used by BPF for the
+ * 			  instruction pointer register.
+ * 			* ``<formatted msg>`` is the message formatted with
+ * 			  *fmt*.
+ *
+ * 		The conversion specifiers supported by *fmt* are similar, but
+ * 		more limited than for printk(). They are **%d**, **%i**,
+ * 		**%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * 		**%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * 		of field, padding with zeroes, etc.) is available, and the
+ * 		helper will return **-EINVAL** (but print nothing) if it
+ * 		encounters an unknown specifier.
+ *
+ * 		Also, note that **bpf_trace_printk**\ () is slow, and should
+ * 		only be used for debugging purposes. For this reason, a notice
+ * 		bloc (spanning several lines) is printed to kernel logs and
+ * 		states that the helper should not be used "for production use"
+ * 		the first time this helper is used (or more precisely, when
+ * 		**trace_printk**\ () buffers are allocated). For passing values
+ * 		to user space, perf events should be preferred.
+ * 	Return
+ * 		The number of bytes written to the buffer, or a negative error
+ * 		in case of failure.
+ *
+ * u32 bpf_get_prandom_u32(void)
+ * 	Description
+ * 		Get a pseudo-random number.
+ *
+ * 		From a security point of view, this helper uses its own
+ * 		pseudo-random internal state, and cannot be used to infer the
+ * 		seed of other random functions in the kernel. However, it is
+ * 		essential to note that the generator used by the helper is not
+ * 		cryptographically secure.
+ * 	Return
+ * 		A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * 	Description
+ * 		Get the SMP (symmetric multiprocessing) processor id. Note that
+ * 		all programs run with preemption disabled, which means that the
+ * 		SMP processor id is stable during all the execution of the
+ * 		program.
+ * 	Return
+ * 		The SMP id of the processor running the program.
+ *
+ * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * 	Description
+ * 		Store *len* bytes from address *from* into the packet
+ * 		associated to *skb*, at *offset*. *flags* are a combination of
+ * 		**BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * 		checksum for the packet after storing the bytes) and
+ * 		**BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * 		**->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * 	Description
+ * 		Recompute the layer 3 (e.g. IP) checksum for the packet
+ * 		associated to *skb*. Computation is incremental, so the helper
+ * 		must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored in *size*.
+ * 		Alternatively, it is possible to store the difference between
+ * 		the previous and the new values of the header field in *to*, by
+ * 		setting *from* and *size* to 0. For both methods, *offset*
+ * 		indicates the location of the IP checksum within the packet.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * 	Description
+ * 		Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
+ * 		packet associated to *skb*. Computation is incremental, so the
+ * 		helper must know the former value of the header field that was
+ * 		modified (*from*), the new value of this field (*to*), and the
+ * 		number of bytes (2 or 4) for this field, stored on the lowest
+ * 		four bits of *flags*. Alternatively, it is possible to store
+ * 		the difference between the previous and the new values of the
+ * 		header field in *to*, by setting *from* and the four lowest
+ * 		bits of *flags* to 0. For both methods, *offset* indicates the
+ * 		location of the IP checksum within the packet. In addition to
+ * 		the size of the field, *flags* can be added (bitwise OR) actual
+ * 		flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
+ * 		untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
+ * 		for updates resulting in a null checksum the value is set to
+ * 		**CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
+ * 		the checksum is to be computed against a pseudo-header.
+ *
+ * 		This helper works in combination with **bpf_csum_diff**\ (),
+ * 		which does not update the checksum in-place, but offers more
+ * 		flexibility and can handle sizes larger than 2 or 4 for the
+ * 		checksum to update.
+ *
+ * 		A call to this helper is susceptible to change the underlaying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * 	Description
+ * 		This special helper is used to trigger a "tail call", or in
+ * 		other words, to jump into another eBPF program. The same stack
+ * 		frame is used (but values on stack and in registers for the
+ * 		caller are not accessible to the callee). This mechanism allows
+ * 		for program chaining, either for raising the maximum number of
+ * 		available eBPF instructions, or to execute given programs in
+ * 		conditional blocks. For security reasons, there is an upper
+ * 		limit to the number of successive tail calls that can be
+ * 		performed.
+ *
+ * 		Upon call of this helper, the program attempts to jump into a
+ * 		program referenced at index *index* in *prog_array_map*, a
+ * 		special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * 		*ctx*, a pointer to the context.
+ *
+ * 		If the call succeeds, the kernel immediately runs the first
+ * 		instruction of the new program. This is not a function call,
+ * 		and it never returns to the previous program. If the call
+ * 		fails, then the helper has no effect, and the caller continues
+ * 		to run its subsequent instructions. A call can fail if the
+ * 		destination program for the jump does not exist (i.e. *index*
+ * 		is superior to the number of entries in *prog_array_map*), or
+ * 		if the maximum number of tail calls has been reached for this
+ * 		chain of programs. This limit is defined in the kernel by the
+ * 		macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
+ * 		which is currently set to 32.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * 	Description
+ * 		Clone and redirect the packet associated to *skb* to another
+ * 		net device of index *ifindex*. Both ingress and egress
+ * 		interfaces can be used for redirection. The **BPF_F_INGRESS**
+ * 		value in *flags* is used to make the distinction (ingress path
+ * 		is selected if the flag is present, egress path otherwise).
+ * 		This is the only flag supported for now.
+ *
+ * 		In comparison with **bpf_redirect**\ () helper,
+ * 		**bpf_clone_redirect**\ () has the associated cost of
+ * 		duplicating the packet buffer, but this can be executed out of
+ * 		the eBPF program. Conversely, **bpf_redirect**\ () is more
+ * 		efficient, but it is handled through an action code where the
+ * 		redirection happens only after the eBPF program has returned.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_current_pid_tgid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current tgid and pid, and
+ * 		created as such:
+ * 		*current_task*\ **->tgid << 32 \|**
+ * 		*current_task*\ **->pid**.
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ * 	Return
+ * 		A 64-bit integer containing the current GID and UID, and
+ * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * int bpf_get_current_comm(char *buf, u32 size_of_buf)
+ * 	Description
+ * 		Copy the **comm** attribute of the current task into *buf* of
+ * 		*size_of_buf*. The **comm** attribute contains the name of
+ * 		the executable (excluding the path) for the current task. The
+ * 		*size_of_buf* must be strictly positive. On success, the
+ * 		helper makes sure that the *buf* is NUL-terminated. On failure,
+ * 		it is filled with zeroes.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the classid for the current task, i.e. for the net_cls
+ * 		cgroup to which *skb* belongs.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress.
+ *
+ * 		The net_cls cgroup provides an interface to tag network packets
+ * 		based on a user-provided identifier for all traffic coming from
+ * 		the tasks belonging to the related cgroup. See also the related
+ * 		kernel documentation, available from the Linux sources in file
+ * 		*Documentation/cgroup-v1/net_cls.txt*.
+ *
+ * 		The Linux kernel has two versions for cgroups: there are
+ * 		cgroups v1 and cgroups v2. Both are available to users, who can
+ * 		use a mixture of them, but note that the net_cls cgroup is for
+ * 		cgroup v1 only. This makes it incompatible with BPF programs
+ * 		run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * 		only hold data for one version of cgroups at a time).
+ *
+ * 		This helper is only available if the kernel was compiled with
+ * 		the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * 		"**y**" or to "**m**".
+ * 	Return
+ * 		The classid, or 0 for the default unconfigured classid.
+ *
+ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * 	Description
+ * 		Push a *vlan_tci* (VLAN tag control information) of protocol
+ * 		*vlan_proto* to the packet associated to *skb*, then update
+ * 		the checksum. Note that if *vlan_proto* is different from
+ * 		**ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * 		be **ETH_P_8021Q**.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_vlan_pop(struct sk_buff *skb)
+ * 	Description
+ * 		Pop a VLAN header from the packet associated to *skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Get tunnel metadata. This helper takes a pointer *key* to an
+ * 		empty **struct bpf_tunnel_key** of **size**, that will be
+ * 		filled with tunnel metadata for the packet associated to *skb*.
+ * 		The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * 		indicates that the tunnel is based on IPv6 protocol instead of
+ * 		IPv4.
+ *
+ * 		The **struct bpf_tunnel_key** is an object that generalizes the
+ * 		principal parameters used by various tunneling protocols into a
+ * 		single struct. This way, it can be used to easily make a
+ * 		decision based on the contents of the encapsulation header,
+ * 		"summarized" in this struct. In particular, it holds the IP
+ * 		address of the remote end (IPv4 or IPv6, depending on the case)
+ * 		in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ * 		this struct exposes the *key*\ **->tunnel_id**, which is
+ * 		generally mapped to a VNI (Virtual Network Identifier), making
+ * 		it programmable together with the **bpf_skb_set_tunnel_key**\
+ * 		() helper.
+ *
+ * 		Let's imagine that the following code is part of a program
+ * 		attached to the TC ingress interface, on one end of a GRE
+ * 		tunnel, and is supposed to filter out all messages coming from
+ * 		remote ends with IPv4 address other than 10.0.0.1:
+ *
+ * 		::
+ *
+ * 			int ret;
+ * 			struct bpf_tunnel_key key = {};
+ * 			
+ * 			ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			if (ret < 0)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			if (key.remote_ipv4 != 0x0a000001)
+ * 				return TC_ACT_SHOT;	// drop packet
+ * 			
+ * 			return TC_ACT_OK;		// accept packet
+ *
+ * 		This interface can also be used with all encapsulation devices
+ * 		that can operate in "collect metadata" mode: instead of having
+ * 		one network device per specific configuration, the "collect
+ * 		metadata" mode only requires a single device where the
+ * 		configuration can be extracted from this helper.
+ *
+ * 		This can be used together with various tunnels such as VXLan,
+ * 		Geneve, GRE or IP in IP (IPIP).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * 	Description
+ * 		Populate tunnel metadata for packet associated to *skb*. The
+ * 		tunnel metadata is set to the contents of *key*, of *size*. The
+ * 		*flags* can be set to a combination of the following values:
+ *
+ * 		**BPF_F_TUNINFO_IPV6**
+ * 			Indicate that the tunnel is based on IPv6 protocol
+ * 			instead of IPv4.
+ * 		**BPF_F_ZERO_CSUM_TX**
+ * 			For IPv4 packets, add a flag to tunnel metadata
+ * 			indicating that checksum computation should be skipped
+ * 			and checksum set to zeroes.
+ * 		**BPF_F_DONT_FRAGMENT**
+ * 			Add a flag to tunnel metadata indicating that the
+ * 			packet should not be fragmented.
+ * 		**BPF_F_SEQ_NUMBER**
+ * 			Add a flag to tunnel metadata indicating that a
+ * 			sequence number should be added to tunnel header before
+ * 			sending the packet. This flag was added for GRE
+ * 			encapsulation, but might be used with other protocols
+ * 			as well in the future.
+ *
+ * 		Here is a typical usage on the transmit path:
+ *
+ * 		::
+ *
+ * 			struct bpf_tunnel_key key;
+ * 			     populate key ...
+ * 			bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * 			bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Read the value of a perf event counter. This helper relies on a
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * 		the perf event counter is selected when *map* is updated with
+ * 		perf event file descriptors. The *map* is an array whose size
+ * 		is the number of available CPUs, and each cell contains a value
+ * 		relative to one CPU. The value to retrieve is indicated by
+ * 		*flags*, that contains the index of the CPU to look up, masked
+ * 		with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		Note that before Linux 4.13, only hardware perf event can be
+ * 		retrieved.
+ *
+ * 		Also, be aware that the newer helper
+ * 		**bpf_perf_event_read_value**\ () is recommended over
+ * 		**bpf_perf_event_read**\ () in general. The latter has some ABI
+ * 		quirks where error and counter value are used as a return code
+ * 		(which is wrong to do since ranges may overlap). This issue is
+ * 		fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * 		time provides more features over the **bpf_perf_event_read**\
+ * 		() interface. Please refer to the description of
+ * 		**bpf_perf_event_read_value**\ () for details.
+ * 	Return
+ * 		The value of the perf event counter read from the map, or a
+ * 		negative error code in case of failure.
+ *
+ * int bpf_redirect(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_clone_redirect**\
+ * 		(), except that the packet is not cloned, which provides
+ * 		increased performance.
+ *
+ * 		Except for XDP, both ingress and egress interfaces can be used
+ * 		for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * 		to make the distinction (ingress path is selected if the flag
+ * 		is present, egress path otherwise). Currently, XDP only
+ * 		supports redirection to the egress interface, and accepts no
+ * 		flag at all.
+ *
+ * 		The same effect can be attained with the more generic
+ * 		**bpf_redirect_map**\ (), which requires specific maps to be
+ * 		used but offers better performance.
+ * 	Return
+ * 		For XDP, the helper returns **XDP_REDIRECT** on success or
+ * 		**XDP_ABORTED** on error. For other program types, the values
+ * 		are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * 		error.
+ *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the realm of the route, that is to say the
+ * 		**tclassid** field of the destination for the *skb*. The
+ * 		identifier retrieved is a user-provided tag, similar to the
+ * 		one used with the net_cls cgroup (see description for
+ * 		**bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * 		held by a route (a destination entry), not by a task.
+ *
+ * 		Retrieving this identifier works with the clsact TC egress hook
+ * 		(see also **tc-bpf(8)**), or alternatively on conventional
+ * 		classful egress qdiscs, but not on TC ingress path. In case of
+ * 		clsact TC egress hook, this has the advantage that, internally,
+ * 		the destination entry has not been dropped yet in the transmit
+ * 		path. Therefore, the destination entry does not need to be
+ * 		artificially held via **netif_keep_dst**\ () for a classful
+ * 		qdisc until the *skb* is freed.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_IP_ROUTE_CLASSID** configuration option.
+ * 	Return
+ * 		The realm of the route for the packet associated to *skb*, or 0
+ * 		if none was found.
+ *
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * 	Description
+ * 		Write raw *data* blob into a special BPF perf event held by
+ * 		*map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * 		event must have the following attributes: **PERF_SAMPLE_RAW**
+ * 		as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * 		**PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * 		The *flags* are used to indicate the index in *map* for which
+ * 		the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * 		Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * 		to indicate that the index of the current CPU core should be
+ * 		used.
+ *
+ * 		The value to write, of *size*, is passed through eBPF stack and
+ * 		pointed by *data*.
+ *
+ * 		The context of the program *ctx* needs also be passed to the
+ * 		helper.
+ *
+ * 		On user space, a program willing to read the values needs to
+ * 		call **perf_event_open**\ () on the perf event (either for
+ * 		one or for all CPUs) and to store the file descriptor into the
+ * 		*map*. This must be done before the eBPF program can send data
+ * 		into it. An example is available in file
+ * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
+ * 		tree (the eBPF program counterpart is in
+ * 		*samples/bpf/trace_output_kern.c*).
+ *
+ * 		**bpf_perf_event_output**\ () achieves better performance
+ * 		than **bpf_trace_printk**\ () for sharing data with user
+ * 		space, and is much better suited for streaming data from eBPF
+ * 		programs.
+ *
+ * 		Note that this helper is not restricted to tracing use cases
+ * 		and can be used with programs attached to TC or XDP as well,
+ * 		where it allows for passing data to user space listeners. Data
+ * 		can be:
+ *
+ * 		* Only custom structs,
+ * 		* Only the packet payload, or
+ * 		* A combination of both.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+ * 	Description
+ * 		This helper was provided as an easy way to load data from a
+ * 		packet. It can be used to load *len* bytes from *offset* from
+ * 		the packet associated to *skb*, into the buffer pointed by
+ * 		*to*.
+ *
+ * 		Since Linux 4.7, usage of this helper has mostly been replaced
+ * 		by "direct packet access", enabling packet data to be
+ * 		manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ * 		pointing respectively to the first byte of packet data and to
+ * 		the byte after the last byte of packet data. However, it
+ * 		remains useful if one wishes to read large quantities of data
+ * 		at once from a packet into the eBPF stack.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
+ * 	Description
+ * 		Walk a user or a kernel stack and return its id. To achieve
+ * 		this, the helper needs *ctx*, which is a pointer to the context
+ * 		on which the tracing program is executed, and a pointer to a
+ * 		*map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		a combination of the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_FAST_STACK_CMP**
+ * 			Compare stacks by hash only.
+ * 		**BPF_F_REUSE_STACKID**
+ * 			If two different stacks hash into the same *stackid*,
+ * 			discard the old one.
+ *
+ * 		The stack id retrieved is a 32 bit long integer handle which
+ * 		can be further combined with other data (including other stack
+ * 		ids) and used as a key into maps. This can be useful for
+ * 		generating a variety of graphs (such as flame graphs or off-cpu
+ * 		graphs).
+ *
+ * 		For walking a stack, this helper is an improvement over
+ * 		**bpf_probe_read**\ (), which can be used with unrolled loops
+ * 		but is not efficient and consumes a lot of eBPF instructions.
+ * 		Instead, **bpf_get_stackid**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
+ * 	Return
+ * 		The positive or null stack id on success, or a negative error
+ * 		in case of failure.
+ *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * 	Description
+ * 		Compute a checksum difference, from the raw buffer pointed by
+ * 		*from*, of length *from_size* (that must be a multiple of 4),
+ * 		towards the raw buffer pointed by *to*, of size *to_size*
+ * 		(same remark). An optional *seed* can be added to the value
+ * 		(this can be cascaded, the seed may come from a previous call
+ * 		to the helper).
+ *
+ * 		This is flexible enough to be used in several ways:
+ *
+ * 		* With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * 		  checksum, it can be used when pushing new data.
+ * 		* With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * 		  checksum, it can be used when removing data from a packet.
+ * 		* With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * 		  can be used to compute a diff. Note that *from_size* and
+ * 		  *to_size* do not need to be equal.
+ *
+ * 		This helper can be used in combination with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * 		which one can feed in the difference computed with
+ * 		**bpf_csum_diff**\ ().
+ * 	Return
+ * 		The checksum result, or a negative error code in case of
+ * 		failure.
+ *
+ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Retrieve tunnel options metadata for the packet associated to
+ * 		*skb*, and store the raw tunnel option data to the buffer *opt*
+ * 		of *size*.
+ *
+ * 		This helper can be used with encapsulation devices that can
+ * 		operate in "collect metadata" mode (please refer to the related
+ * 		note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * 		more details). A particular example where this can be used is
+ * 		in combination with the Geneve encapsulation protocol, where it
+ * 		allows for pushing (with **bpf_skb_set_tunnel_opt**\ () helper)
+ * 		and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * 		the eBPF program. This allows for full customization of these
+ * 		headers.
+ * 	Return
+ * 		The size of the option data retrieved.
+ *
+ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size)
+ * 	Description
+ * 		Set tunnel options metadata for the packet associated to *skb*
+ * 		to the option data contained in the raw buffer *opt* of *size*.
+ *
+ * 		See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ * 		helper for additional information.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * 	Description
+ * 		Change the protocol of the *skb* to *proto*. Currently
+ * 		supported are transition from IPv4 to IPv6, and from IPv6 to
+ * 		IPv4. The helper takes care of the groundwork for the
+ * 		transition, including resizing the socket buffer. The eBPF
+ * 		program is expected to fill the new headers, if any, via
+ * 		**skb_store_bytes**\ () and to recompute the checksums with
+ * 		**bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * 		(). The main case for this helper is to perform NAT64
+ * 		operations out of an eBPF program.
+ *
+ * 		Internally, the GSO type is marked as dodgy so that headers are
+ * 		checked and segments are recalculated by the GSO/GRO engine.
+ * 		The size for GSO target is adapted as well.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * 	Description
+ * 		Change the packet type for the packet associated to *skb*. This
+ * 		comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * 		the eBPF program does not have a write access to *skb*\
+ * 		**->pkt_type** beside this helper. Using a helper here allows
+ * 		for graceful handling of errors.
+ *
+ * 		The major use case is to change incoming *skb*s to
+ * 		**PACKET_HOST** in a programmatic way instead of having to
+ * 		recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * 		example.
+ *
+ * 		Note that *type* only allows certain values. At this time, they
+ * 		are:
+ *
+ * 		**PACKET_HOST**
+ * 			Packet is for us.
+ * 		**PACKET_BROADCAST**
+ * 			Send packet to all.
+ * 		**PACKET_MULTICAST**
+ * 			Send packet to group.
+ * 		**PACKET_OTHERHOST**
+ * 			Send packet to someone else.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether *skb* is a descendant of the cgroup2 held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the *skb* failed the cgroup2 descendant test.
+ * 		* 1, if the *skb* succeeded the cgroup2 descendant test.
+ * 		* A negative error code, if an error occurred.
+ *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * 	Description
+ * 		Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * 		not set, in particular if the hash was cleared due to mangling,
+ * 		recompute this hash. Later accesses to the hash can be done
+ * 		directly with *skb*\ **->hash**.
+ *
+ * 		Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * 		protocol with **bpf_skb_change_proto**\ (), or calling
+ * 		**bpf_skb_store_bytes**\ () with the
+ * 		**BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * 		the hash and to trigger a new computation for the next call to
+ * 		**bpf_get_hash_recalc**\ ().
+ * 	Return
+ * 		The 32-bit hash.
+ *
+ * u64 bpf_get_current_task(void)
+ * 	Return
+ * 		A pointer to the current task struct.
+ *
+ * int bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * 	Description
+ * 		Attempt in a safe way to write *len* bytes from the buffer
+ * 		*src* to *dst* in memory. It only works for threads that are in
+ * 		user context, and *dst* must be a valid user space address.
+ *
+ * 		This helper should not be used to implement any kind of
+ * 		security mechanism because of TOC-TOU attacks, but rather to
+ * 		debug, divert, and manipulate execution of semi-cooperative
+ * 		processes.
+ *
+ * 		Keep in mind that this feature is meant for experiments, and it
+ * 		has a risk of crashing the system and running programs.
+ * 		Therefore, when an eBPF program using this helper is attached,
+ * 		a warning including PID and process name is printed to kernel
+ * 		logs.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * 	Description
+ * 		Check whether the probe is being run in the context of a given
+ * 		subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * 		*map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * 	Return
+ * 		The return value depends on the result of the test, and can be:
+ *
+ * 		* 0, if the current task belongs to the cgroup2.
+ * 		* 1, if the current task does not belong to the cgroup2.
+ * 		* A negative error code, if an error occurred.
+ *
+ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Resize (trim or grow) the packet associated to *skb* to the
+ * 		new *len*. The *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		The basic idea is that the helper performs the needed work to
+ * 		change the size of the packet, then the eBPF program rewrites
+ * 		the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * 		**bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
+ * 		and others. This helper is a slow path utility intended for
+ * 		replies with control messages. And because it is targeted for
+ * 		slow path, the helper itself can afford to be slow: it
+ * 		implicitly linearizes, unclones and drops offloads from the
+ * 		*skb*.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * 	Description
+ * 		Pull in non-linear data in case the *skb* is non-linear and not
+ * 		all of *len* are part of the linear section. Make *len* bytes
+ * 		from *skb* readable and writable. If a zero value is passed for
+ * 		*len*, then the whole length of the *skb* is pulled.
+ *
+ * 		This helper is only needed for reading and writing with direct
+ * 		packet access.
+ *
+ * 		For direct packet access, testing that offsets to access
+ * 		are within packet boundaries (test on *skb*\ **->data_end**) is
+ * 		susceptible to fail if offsets are invalid, or if the requested
+ * 		data is in non-linear parts of the *skb*. On failure the
+ * 		program can just bail out, or in the case of a non-linear
+ * 		buffer, use a helper to make the data available. The
+ * 		**bpf_skb_load_bytes**\ () helper is a first solution to access
+ * 		the data. Another one consists in using **bpf_skb_pull_data**
+ * 		to pull in once the non-linear parts, then retesting and
+ * 		eventually access the data.
+ *
+ * 		At the same time, this also makes sure the *skb* is uncloned,
+ * 		which is a necessary condition for direct write. As this needs
+ * 		to be an invariant for the write part only, the verifier
+ * 		detects writes and adds a prologue that is calling
+ * 		**bpf_skb_pull_data()** to effectively unclone the *skb* from
+ * 		the very beginning in case it is indeed cloned.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ * 	Description
+ * 		Add the checksum *csum* into *skb*\ **->csum** in case the
+ * 		driver has supplied a checksum for the entire packet into that
+ * 		field. Return an error otherwise. This helper is intended to be
+ * 		used in combination with **bpf_csum_diff**\ (), in particular
+ * 		when the checksum needs to be updated after data has been
+ * 		written into the packet through direct packet access.
+ * 	Return
+ * 		The checksum on success, or a negative error code in case of
+ * 		failure.
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ * 	Description
+ * 		Invalidate the current *skb*\ **->hash**. It can be used after
+ * 		mangling on headers through direct packet access, in order to
+ * 		indicate that the hash is outdated and to trigger a
+ * 		recalculation the next time the kernel tries to access this
+ * 		hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ *
+ * int bpf_get_numa_node_id(void)
+ * 	Description
+ * 		Return the id of the current NUMA node. The primary use case
+ * 		for this helper is the selection of sockets for the local NUMA
+ * 		node, when the program is attached to sockets using the
+ * 		**SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ * 		but the helper is also available to other eBPF program types,
+ * 		similarly to **bpf_get_smp_processor_id**\ ().
+ * 	Return
+ * 		The id of current NUMA node.
+ *
+ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * 	Description
+ * 		Grows headroom of packet associated to *skb* and adjusts the
+ * 		offset of the MAC header accordingly, adding *len* bytes of
+ * 		space. It automatically extends and reallocates memory as
+ * 		required.
+ *
+ * 		This helper can be used on a layer 3 *skb* to push a MAC header
+ * 		for redirection into a layer 2 device.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * 		it is possible to use a negative value for *delta*. This helper
+ * 		can be used to prepare the packet for pushing or popping
+ * 		headers.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
+ * 	Description
+ * 		Copy a NUL terminated string from an unsafe address
+ * 		*unsafe_ptr* to *dst*. The *size* should include the
+ * 		terminating NUL byte. In case the string length is smaller than
+ * 		*size*, the target is not padded with further NUL bytes. If the
+ * 		string length is larger than *size*, just *size*-1 bytes are
+ * 		copied and the last byte is set to NUL.
+ *
+ * 		On success, the length of the copied string is returned. This
+ * 		makes this helper useful in tracing programs for reading
+ * 		strings, and more importantly to get its length at runtime. See
+ * 		the following snippet:
+ *
+ * 		::
+ *
+ * 			SEC("kprobe/sys_open")
+ * 			void bpf_sys_open(struct pt_regs *ctx)
+ * 			{
+ * 			        char buf[PATHLEN]; // PATHLEN is defined to 256
+ * 			        int res = bpf_probe_read_str(buf, sizeof(buf),
+ * 				                             ctx->di);
+ *
+ * 				// Consume buf, for example push it to
+ * 				// userspace via bpf_perf_event_output(); we
+ * 				// can use res (the string length) as event
+ * 				// size, after checking its boundaries.
+ * 			}
+ *
+ * 		In comparison, using **bpf_probe_read()** helper here instead
+ * 		to read the string would require to estimate the length at
+ * 		compile time, and would often result in copying more memory
+ * 		than necessary.
+ *
+ * 		Another useful use case is when parsing individual process
+ * 		arguments or individual environment variables navigating
+ * 		*current*\ **->mm->arg_start** and *current*\
+ * 		**->mm->env_start**: using this helper and the return value,
+ * 		one can quickly iterate at the right offset of the memory area.
+ * 	Return
+ * 		On success, the strictly positive length of the string,
+ * 		including the trailing NUL character. On error, a negative
+ * 		value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * 	Description
+ * 		If the **struct sk_buff** pointed by *skb* has a known socket,
+ * 		retrieve the cookie (generated by the kernel) of this socket.
+ * 		If no cookie has been set yet, generate a new cookie. Once
+ * 		generated, the socket cookie remains stable for the life of the
+ * 		socket. This helper can be useful for monitoring per socket
+ * 		networking traffic statistics as it provides a unique socket
+ * 		identifier per namespace.
+ * 	Return
+ * 		An 8-byte long non-decreasing number on success, or 0 if the
+ * 		socket field is missing inside *skb*.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
+ * 	Description
+ * 		Equivalent to bpf_get_socket_cookie() helper that accepts
+ * 		*skb*, but gets socket from **struct bpf_sock_addr** context.
+ * 	Return
+ * 		An 8-byte long non-decreasing number.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
+ * 	Description
+ * 		Equivalent to bpf_get_socket_cookie() helper that accepts
+ * 		*skb*, but gets socket from **struct bpf_sock_ops** context.
+ * 	Return
+ * 		An 8-byte long non-decreasing number.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * 	Return
+ * 		The owner UID of the socket associated to *skb*. If the socket
+ * 		is **NULL**, or if it is not a full socket (i.e. if it is a
+ * 		time-wait or a request socket instead), **overflowuid** value
+ * 		is returned (note that **overflowuid** might also be the actual
+ * 		UID value for the socket).
+ *
+ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * 	Description
+ * 		Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * 		to value *hash*.
+ * 	Return
+ * 		0
+ *
+ * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **setsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **setsockopt(2)** for more information.
+ * 		The option value of length *optlen* is pointed by *optval*.
+ *
+ * 		This helper actually implements a subset of **setsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **SOL_SOCKET**, which supports the following *optname*\ s:
+ * 		  **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * 		  **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**.
+ * 		* **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * 		  **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * 		  **TCP_BPF_SNDCWND_CLAMP**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
+ * 	Description
+ * 		Grow or shrink the room for data in the packet associated to
+ * 		*skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * 		There is a single supported mode at this time:
+ *
+ * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * 		  (room space is added or removed below the layer 3 header).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the endpoint referenced by *map* at
+ * 		index *key*. Depending on its type, this *map* can contain
+ * 		references to net devices (for forwarding packets through other
+ * 		ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * 		but this is only implemented for native XDP (with driver
+ * 		support) as of this writing).
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		When used to redirect packets to net devices, this helper
+ * 		provides a high performance increase over **bpf_redirect**\ ().
+ * 		This is due to various implementation details of the underlying
+ * 		mechanisms, one of which is the fact that **bpf_redirect_map**\
+ * 		() tries to send packet as a "bulk" to the device.
+ * 	Return
+ * 		**XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
+ *
+ * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		Redirect the packet to the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
+ * 	Description
+ * 		Add an entry to, or update a *map* referencing sockets. The
+ * 		*skops* is used as a new value for the entry associated to
+ * 		*key*. *flags* is one of:
+ *
+ * 		**BPF_NOEXIST**
+ * 			The entry for *key* must not exist in the map.
+ * 		**BPF_EXIST**
+ * 			The entry for *key* must already exist in the map.
+ * 		**BPF_ANY**
+ * 			No condition on the existence of the entry for *key*.
+ *
+ * 		If the *map* has eBPF programs (parser and verdict), those will
+ * 		be inherited by the socket being added. If the socket is
+ * 		already attached to eBPF programs, this results in an error.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * 		*delta* (which can be positive or negative). Note that this
+ * 		operation modifies the address stored in *xdp_md*\ **->data**,
+ * 		so the latter must be loaded only after the helper has been
+ * 		called.
+ *
+ * 		The use of *xdp_md*\ **->data_meta** is optional and programs
+ * 		are not required to use it. The rationale is that when the
+ * 		packet is processed with XDP (e.g. as DoS filter), it is
+ * 		possible to push further meta data along with it before passing
+ * 		to the stack, and to give the guarantee that an ingress eBPF
+ * 		program attached as a TC classifier on the same device can pick
+ * 		this up for further post-processing. Since TC works with socket
+ * 		buffers, it remains possible to set from XDP the **mark** or
+ * 		**priority** pointers, or other pointers for the socket buffer.
+ * 		Having this scratch space generic and programmable allows for
+ * 		more flexibility as the user is free to store whatever meta
+ * 		data they need.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		Read the value of a perf event counter, and store it into *buf*
+ * 		of size *buf_size*. This helper relies on a *map* of type
+ * 		**BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ * 		counter is selected when *map* is updated with perf event file
+ * 		descriptors. The *map* is an array whose size is the number of
+ * 		available CPUs, and each cell contains a value relative to one
+ * 		CPU. The value to retrieve is indicated by *flags*, that
+ * 		contains the index of the CPU to look up, masked with
+ * 		**BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * 		**BPF_F_CURRENT_CPU** to indicate that the value for the
+ * 		current CPU should be retrieved.
+ *
+ * 		This helper behaves in a way close to
+ * 		**bpf_perf_event_read**\ () helper, save that instead of
+ * 		just returning the value observed, it fills the *buf*
+ * 		structure. This allows for additional data to be retrieved: in
+ * 		particular, the enabled and running times (in *buf*\
+ * 		**->enabled** and *buf*\ **->running**, respectively) are
+ * 		copied. In general, **bpf_perf_event_read_value**\ () is
+ * 		recommended over **bpf_perf_event_read**\ (), which has some
+ * 		ABI issues and provides fewer functionalities.
+ *
+ * 		These values are interesting, because hardware PMU (Performance
+ * 		Monitoring Unit) counters are limited resources. When there are
+ * 		more PMU based perf events opened than available counters,
+ * 		kernel will multiplex these events so each event gets certain
+ * 		percentage (but not all) of the PMU time. In case that
+ * 		multiplexing happens, the number of samples or counter value
+ * 		will not reflect the case compared to when no multiplexing
+ * 		occurs. This makes comparison between different runs difficult.
+ * 		Typically, the counter value should be normalized before
+ * 		comparing to other experiments. The usual normalization is done
+ * 		as follows.
+ *
+ * 		::
+ *
+ * 			normalized_counter = counter * t_enabled / t_running
+ *
+ * 		Where t_enabled is the time enabled for event and t_running is
+ * 		the time running for event since last normalization. The
+ * 		enabled and running times are accumulated since the perf event
+ * 		open. To achieve scaling factor between two invocations of an
+ * 		eBPF program, users can use CPU id as the key (which is
+ * 		typical for perf array usage model) to remember the previous
+ * 		value and do the calculation inside the eBPF program.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * 	Description
+ * 		For an eBPF program attached to a perf event, retrieve the
+ * 		value of the event counter associated to *ctx* and store it in
+ * 		the structure pointed by *buf* and of size *buf_size*. Enabled
+ * 		and running times are also stored in the structure (see
+ * 		description of helper **bpf_perf_event_read_value**\ () for
+ * 		more details).
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+ * 	Description
+ * 		Emulate a call to **getsockopt()** on the socket associated to
+ * 		*bpf_socket*, which must be a full socket. The *level* at
+ * 		which the option resides and the name *optname* of the option
+ * 		must be specified, see **getsockopt(2)** for more information.
+ * 		The retrieved value is stored in the structure pointed by
+ * 		*optval* and of length *optlen*.
+ *
+ * 		This helper actually implements a subset of **getsockopt()**.
+ * 		It supports the following *level*\ s:
+ *
+ * 		* **IPPROTO_TCP**, which supports *optname*
+ * 		  **TCP_CONGESTION**.
+ * 		* **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * 		* **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * 	Description
+ * 		Used for error injection, this helper uses kprobes to override
+ * 		the return value of the probed function, and to set it to *rc*.
+ * 		The first argument is the context *regs* on which the kprobe
+ * 		works.
+ *
+ * 		This helper works by setting the PC (program counter)
+ * 		to an override function which is run in place of the original
+ * 		probed function. This means the probed function is not run at
+ * 		all. The replacement function just returns with the required
+ * 		value.
+ *
+ * 		This helper has security implications, and thus is subject to
+ * 		restrictions. It is only available if the kernel was compiled
+ * 		with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * 		option, and in this case it only works on functions tagged with
+ * 		**ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * 		Also, the helper is only available for the architectures having
+ * 		the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * 		x86 architecture is the only one to support this feature.
+ * 	Return
+ * 		0
+ *
+ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
+ * 	Description
+ * 		Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * 		for the full TCP socket associated to *bpf_sock_ops* to
+ * 		*argval*.
+ *
+ * 		The primary use of this field is to determine if there should
+ * 		be calls to eBPF programs of type
+ * 		**BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * 		code. A program of the same type can change its value, per
+ * 		connection and as necessary, when the connection is
+ * 		established. This field is directly accessible for reading, but
+ * 		this helper must be used for updates in order to return an
+ * 		error if an eBPF program tries to set a callback that is not
+ * 		supported in the current kernel.
+ *
+ * 		The supported callback values that *argval* can combine are:
+ *
+ * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *
+ * 		Here are some examples of where one could call such eBPF
+ * 		program:
+ *
+ * 		* When RTO fires.
+ * 		* When a packet is retransmitted.
+ * 		* When the connection terminates.
+ * 		* When a packet is sent.
+ * 		* When a packet is received.
+ * 	Return
+ * 		Code **-EINVAL** if the socket is not a full TCP socket;
+ * 		otherwise, a positive number containing the bits that could not
+ * 		be set is returned (which comes down to 0 if all bits were set
+ * 		as required).
+ *
+ * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * 	Description
+ * 		This helper is used in programs implementing policies at the
+ * 		socket level. If the message *msg* is allowed to pass (i.e. if
+ * 		the verdict eBPF program returns **SK_PASS**), redirect it to
+ * 		the socket referenced by *map* (of type
+ * 		**BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * 		egress interfaces can be used for redirection. The
+ * 		**BPF_F_INGRESS** value in *flags* is used to make the
+ * 		distinction (ingress path is selected if the flag is present,
+ * 		egress path otherwise). This is the only flag supported for now.
+ * 	Return
+ * 		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, apply the verdict of the eBPF program to
+ * 		the next *bytes* (number of bytes) of message *msg*.
+ *
+ * 		For example, this helper can be used in the following cases:
+ *
+ * 		* A single **sendmsg**\ () or **sendfile**\ () system call
+ * 		  contains multiple logical messages that the eBPF program is
+ * 		  supposed to read and for which it should apply a verdict.
+ * 		* An eBPF program only cares to read the first *bytes* of a
+ * 		  *msg*. If the message has a large payload, then setting up
+ * 		  and calling the eBPF program repeatedly for all bytes, even
+ * 		  though the verdict is already known, would create unnecessary
+ * 		  overhead.
+ *
+ * 		When called from within an eBPF program, the helper sets a
+ * 		counter internal to the BPF infrastructure, that is used to
+ * 		apply the last verdict to the next *bytes*. If *bytes* is
+ * 		smaller than the current data being processed from a
+ * 		**sendmsg**\ () or **sendfile**\ () system call, the first
+ * 		*bytes* will be sent and the eBPF program will be re-run with
+ * 		the pointer for start of data pointing to byte number *bytes*
+ * 		**+ 1**. If *bytes* is larger than the current data being
+ * 		processed, then the eBPF verdict will be applied to multiple
+ * 		**sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * 		consumed.
+ *
+ * 		Note that if a socket closes with the internal counter holding
+ * 		a non-zero value, this is not a problem because data is not
+ * 		being buffered for *bytes* and is sent as it is received.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * 	Description
+ * 		For socket policies, prevent the execution of the verdict eBPF
+ * 		program for message *msg* until *bytes* (byte number) have been
+ * 		accumulated.
+ *
+ * 		This can be used when one needs a specific number of bytes
+ * 		before a verdict can be assigned, even if the data spans
+ * 		multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * 		case would be a user calling **sendmsg**\ () repeatedly with
+ * 		1-byte long message segments. Obviously, this is bad for
+ * 		performance, but it is still valid. If the eBPF program needs
+ * 		*bytes* bytes to validate a header, this helper can be used to
+ * 		prevent the eBPF program to be called again until *bytes* have
+ * 		been accumulated.
+ * 	Return
+ * 		0
+ *
+ * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * 	Description
+ * 		For socket policies, pull in non-linear data from user space
+ * 		for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * 		**->data_end** to *start* and *end* bytes offsets into *msg*,
+ * 		respectively.
+ *
+ * 		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * 		*msg* it can only parse data that the (**data**, **data_end**)
+ * 		pointers have already consumed. For **sendmsg**\ () hooks this
+ * 		is likely the first scatterlist element. But for calls relying
+ * 		on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * 		be the range (**0**, **0**) because the data is shared with
+ * 		user space and by default the objective is to avoid allowing
+ * 		user space to modify data while (or after) eBPF verdict is
+ * 		being decided. This helper can be used to pull in data and to
+ * 		set the start and end pointer to given values. Data will be
+ * 		copied if necessary (i.e. if data was not linear and if start
+ * 		and end pointers do not point to the same chunk).
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
+ * 	Description
+ * 		Bind the socket associated to *ctx* to the address pointed by
+ * 		*addr*, of length *addr_len*. This allows for making outgoing
+ * 		connection from the desired IP address, which can be useful for
+ * 		example when all processes inside a cgroup should use one
+ * 		single IP address on a host that has multiple IP configured.
+ *
+ * 		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * 		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * 		**AF_INET6**). Looking for a free port to bind to can be
+ * 		expensive, therefore binding to port is not permitted by the
+ * 		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
+ * 		must be set to zero.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
+ * 	Description
+ * 		Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
+ * 		only possible to shrink the packet as of this writing,
+ * 		therefore *delta* must be a negative integer.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
+ * 	Description
+ * 		Retrieve the XFRM state (IP transform framework, see also
+ * 		**ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
+ *
+ * 		The retrieved value is stored in the **struct bpf_xfrm_state**
+ * 		pointed by *xfrm_state* and of length *size*.
+ *
+ * 		All values for *flags* are reserved for future usage, and must
+ * 		be left at zero.
+ *
+ * 		This helper is available only if the kernel was compiled with
+ * 		**CONFIG_XFRM** configuration option.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags)
+ * 	Description
+ * 		Return a user or a kernel stack in bpf program provided buffer.
+ * 		To achieve this, the helper needs *ctx*, which is a pointer
+ * 		to the context on which the tracing program is executed.
+ * 		To store the stacktrace, the bpf program provides *buf* with
+ * 		a nonnegative *size*.
+ *
+ * 		The last argument, *flags*, holds the number of stack frames to
+ * 		skip (from 0 to 255), masked with
+ * 		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * 		the following flags:
+ *
+ * 		**BPF_F_USER_STACK**
+ * 			Collect a user space stack instead of a kernel stack.
+ * 		**BPF_F_USER_BUILD_ID**
+ * 			Collect buildid+offset instead of ips for user stack,
+ * 			only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * 		**bpf_get_stack**\ () can collect up to
+ * 		**PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * 		to sufficient large buffer size. Note that
+ * 		this limit can be controlled with the **sysctl** program, and
+ * 		that it should be manually increased in order to profile long
+ * 		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * 		::
+ *
+ * 			# sysctl kernel.perf_event_max_stack=<new value>
+ * 	Return
+ * 		A non-negative value equal to or less than *size* on success,
+ * 		or a negative error in case of failure.
+ *
+ * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * 	Description
+ * 		This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * 		it provides an easy way to load *len* bytes from *offset*
+ * 		from the packet associated to *skb*, into the buffer pointed
+ * 		by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * 		a fifth argument *start_header* exists in order to select a
+ * 		base offset to start from. *start_header* can be one of:
+ *
+ * 		**BPF_HDR_START_MAC**
+ * 			Base offset to load data from is *skb*'s mac header.
+ * 		**BPF_HDR_START_NET**
+ * 			Base offset to load data from is *skb*'s network header.
+ *
+ * 		In general, "direct packet access" is the preferred method to
+ * 		access packet data, however, this helper is in particular useful
+ * 		in socket filters where *skb*\ **->data** does not always point
+ * 		to the start of the mac header and where "direct packet access"
+ * 		is not available.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ *	Description
+ *		Do FIB lookup in kernel tables using parameters in *params*.
+ *		If lookup is successful and result shows packet is to be
+ *		forwarded, the neighbor tables are searched for the nexthop.
+ *		If successful (ie., FIB lookup shows forwarding and nexthop
+ *		is resolved), the nexthop address is returned in ipv4_dst
+ *		or ipv6_dst based on family, smac is set to mac address of
+ *		egress device, dmac is set to nexthop mac address, rt_metric
+ *		is set to metric from route (IPv4/IPv6 only), and ifindex
+ *		is set to the device index of the nexthop from the FIB lookup.
+ *
+ *             *plen* argument is the size of the passed in struct.
+ *             *flags* argument can be a combination of one or more of the
+ *             following values:
+ *
+ *		**BPF_FIB_LOOKUP_DIRECT**
+ *			Do a direct table lookup vs full lookup using FIB
+ *			rules.
+ *		**BPF_FIB_LOOKUP_OUTPUT**
+ *			Perform lookup from an egress perspective (default is
+ *			ingress).
+ *
+ *             *ctx* is either **struct xdp_md** for XDP programs or
+ *             **struct sk_buff** for tc cls_act programs.
+ *     Return
+ *		* < 0 if any input argument is invalid
+ *		*   0 on success (packet is forwarded, nexthop neighbor exists)
+ *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
+ *		  packet is not forwarded or needs assist from full stack
+ *
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Add an entry to, or update a sockhash *map* referencing sockets.
+ *		The *skops* is used as a new value for the entry associated to
+ *		*key*. *flags* is one of:
+ *
+ *		**BPF_NOEXIST**
+ *			The entry for *key* must not exist in the map.
+ *		**BPF_EXIST**
+ *			The entry for *key* must already exist in the map.
+ *		**BPF_ANY**
+ *			No condition on the existence of the entry for *key*.
+ *
+ *		If the *map* has eBPF programs (parser and verdict), those will
+ *		be inherited by the socket being added. If the socket is
+ *		already attached to eBPF programs, this results in an error.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		socket level. If the message *msg* is allowed to pass (i.e. if
+ *		the verdict eBPF program returns **SK_PASS**), redirect it to
+ *		the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress path otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		This helper is used in programs implementing policies at the
+ *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ *		if the verdict eBPF program returns **SK_PASS**), redirect it
+ *		to the socket referenced by *map* (of type
+ *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ *		egress interfaces can be used for redirection. The
+ *		**BPF_F_INGRESS** value in *flags* is used to make the
+ *		distinction (ingress path is selected if the flag is present,
+ *		egress otherwise). This is the only flag supported for now.
+ *	Return
+ *		**SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+ *	Description
+ *		Encapsulate the packet associated to *skb* within a Layer 3
+ *		protocol header. This header is provided in the buffer at
+ *		address *hdr*, with *len* its size in bytes. *type* indicates
+ *		the protocol of the header and can be one of:
+ *
+ *		**BPF_LWT_ENCAP_SEG6**
+ *			IPv6 encapsulation with Segment Routing Header
+ *			(**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ *			the IPv6 header is computed by the kernel.
+ *		**BPF_LWT_ENCAP_SEG6_INLINE**
+ *			Only works if *skb* contains an IPv6 packet. Insert a
+ *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ *			the IPv6 header.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
+ *	Description
+ *		Store *len* bytes from address *from* into the packet
+ *		associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ *		inside the outermost IPv6 Segment Routing Header can be
+ *		modified through this helper.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
+ *	Description
+ *		Adjust the size allocated to TLVs in the outermost IPv6
+ *		Segment Routing Header contained in the packet associated to
+ *		*skb*, at position *offset* by *delta* bytes. Only offsets
+ *		after the segments are accepted. *delta* can be as well
+ *		positive (growing) as negative (shrinking).
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
+ *	Description
+ *		Apply an IPv6 Segment Routing action of type *action* to the
+ *		packet associated to *skb*. Each action takes a parameter
+ *		contained at address *param*, and of length *param_len* bytes.
+ *		*action* can be one of:
+ *
+ *		**SEG6_LOCAL_ACTION_END_X**
+ *			End.X action: Endpoint with Layer-3 cross-connect.
+ *			Type of *param*: **struct in6_addr**.
+ *		**SEG6_LOCAL_ACTION_END_T**
+ *			End.T action: Endpoint with specific IPv6 table lookup.
+ *			Type of *param*: **int**.
+ *		**SEG6_LOCAL_ACTION_END_B6**
+ *			End.B6 action: Endpoint bound to an SRv6 policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *		**SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ *			End.B6.Encap action: Endpoint bound to an SRv6
+ *			encapsulation policy.
+ *			Type of param: **struct ipv6_sr_hdr**.
+ *
+ * 		A call to this helper is susceptible to change the underlying
+ * 		packet buffer. Therefore, at load time, all checks on pointers
+ * 		previously done by the verifier are invalidated and must be
+ * 		performed again, if the helper is used in combination with
+ * 		direct packet access.
+ *	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded key press with *scancode*,
+ *		*toggle* value in the given *protocol*. The scancode will be
+ *		translated to a keycode using the rc keymap, and reported as
+ *		an input key down event. After a period a key up event is
+ *		generated. This period can be extended by calling either
+ *		**bpf_rc_keydown** () again with the same values, or calling
+ *		**bpf_rc_repeat** ().
+ *
+ *		Some protocols include a toggle bit, in case the button was
+ *		released and pressed again between consecutive scancodes.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		The *protocol* is the decoded protocol number (see
+ *		**enum rc_proto** for some predefined values).
+ *
+ *		This helper is only available if the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *	Return
+ *		0
+ *
+ * int bpf_rc_repeat(void *ctx)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded repeat key message. This delays
+ *		the generation of a key up event for previously generated
+ *		key down event.
+ *
+ *		Some IR protocols like NEC have a special IR message for
+ *		repeating last button, for when a button is held down.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available if the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *	Return
+ *		0
+ *
+ * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
+ * 	Description
+ * 		Return the cgroup v2 id of the socket associated with the *skb*.
+ * 		This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * 		helper for cgroup v1 by providing a tag resp. identifier that
+ * 		can be matched on or used for map lookups e.g. to implement
+ * 		policy. The cgroup v2 id of a given path in the hierarchy is
+ * 		exposed in user space through the f_handle API in order to get
+ * 		to the same 64-bit id.
+ *
+ * 		This helper can be used on TC egress path, but not on ingress,
+ * 		and is available only if the kernel was compiled with the
+ * 		**CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * 	Return
+ * 		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *skb* at the *ancestor_level*.  The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *skb*, then return value will be same as that
+ *		of **bpf_skb_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *skb*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_skb_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_get_current_cgroup_id(void)
+ * 	Return
+ * 		A 64-bit integer containing the current cgroup id based
+ * 		on the cgroup within which the current task is running.
+ *
+ * void* get_local_storage(void *map, u64 flags)
+ *	Description
+ *		Get the pointer to the local storage area.
+ *		The type and the size of the local storage is defined
+ *		by the *map* argument.
+ *		The *flags* meaning is specific for each map type,
+ *		and has to be 0 for cgroup local storage.
+ *
+ *		Depending on the bpf program type, a local storage area
+ *		can be shared between multiple instances of the bpf program,
+ *		running simultaneously.
+ *
+ *		Users should take care of the synchronization themselves.
+ *		For example, by using the BPF_STX_XADD instruction to alter
+ *		the shared data.
+ *	Return
+ *		Pointer to the local storage area.
+ *
+ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ *	Description
+ *		Select a SO_REUSEPORT sk from a BPF_MAP_TYPE_REUSEPORT_ARRAY map.
+ *		It checks the selected sk is matching the incoming
+ *		request in the skb.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for TCP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ *	Description
+ *		Look for UDP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *
+ *		The *ctx* should point to the context of the program, such as
+ *		the skb or socket (depending on the hook in use). This is used
+ *		to determine the base network namespace for the lookup.
+ *
+ *		*tuple_size* must be one of:
+ *
+ *		**sizeof**\ (*tuple*\ **->ipv4**)
+ *			Look for an IPv4 socket.
+ *		**sizeof**\ (*tuple*\ **->ipv6**)
+ *			Look for an IPv6 socket.
+ *
+ *		If the *netns* is zero, then the socket lookup table in the
+ *		netns associated with the *ctx* will be used. For the TC hooks,
+ *		this in the netns of the device in the skb. For socket hooks,
+ *		this in the netns of the socket. If *netns* is non-zero, then
+ *		it specifies the ID of the netns relative to the netns
+ *		associated with the *ctx*.
+ *
+ *		All values for *flags* are reserved for future usage, and must
+ *		be left at zero.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *
+ * int bpf_sk_release(struct bpf_sock *sk)
+ *	Description
+ *		Release the reference held by *sk*. *sk* must be a non-NULL
+ *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
+ *	Description
+ *		For socket policies, insert *len* bytes into msg at offset
+ *		*start*.
+ *
+ *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ *		*msg* it may want to insert metadata or options into the msg.
+ *		This can later be read and used by any of the lower layer BPF
+ *		hooks.
+ *
+ *		This helper may fail if under memory pressure (a malloc
+ *		fails) in these cases BPF programs will get an appropriate
+ *		error and BPF programs will need to handle them.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ */
+#define __BPF_FUNC_MAPPER(FN)		\
+	FN(unspec),			\
+	FN(map_lookup_elem),		\
+	FN(map_update_elem),		\
+	FN(map_delete_elem),		\
+	FN(probe_read),			\
+	FN(ktime_get_ns),		\
+	FN(trace_printk),		\
+	FN(get_prandom_u32),		\
+	FN(get_smp_processor_id),	\
+	FN(skb_store_bytes),		\
+	FN(l3_csum_replace),		\
+	FN(l4_csum_replace),		\
+	FN(tail_call),			\
+	FN(clone_redirect),		\
+	FN(get_current_pid_tgid),	\
+	FN(get_current_uid_gid),	\
+	FN(get_current_comm),		\
+	FN(get_cgroup_classid),		\
+	FN(skb_vlan_push),		\
+	FN(skb_vlan_pop),		\
+	FN(skb_get_tunnel_key),		\
+	FN(skb_set_tunnel_key),		\
+	FN(perf_event_read),		\
+	FN(redirect),			\
+	FN(get_route_realm),		\
+	FN(perf_event_output),		\
+	FN(skb_load_bytes),		\
+	FN(get_stackid),		\
+	FN(csum_diff),			\
+	FN(skb_get_tunnel_opt),		\
+	FN(skb_set_tunnel_opt),		\
+	FN(skb_change_proto),		\
+	FN(skb_change_type),		\
+	FN(skb_under_cgroup),		\
+	FN(get_hash_recalc),		\
+	FN(get_current_task),		\
+	FN(probe_write_user),		\
+	FN(current_task_under_cgroup),	\
+	FN(skb_change_tail),		\
+	FN(skb_pull_data),		\
+	FN(csum_update),		\
+	FN(set_hash_invalid),		\
+	FN(get_numa_node_id),		\
+	FN(skb_change_head),		\
+	FN(xdp_adjust_head),		\
+	FN(probe_read_str),		\
+	FN(get_socket_cookie),		\
+	FN(get_socket_uid),		\
+	FN(set_hash),			\
+	FN(setsockopt),			\
+	FN(skb_adjust_room),		\
+	FN(redirect_map),		\
+	FN(sk_redirect_map),		\
+	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),		\
+	FN(perf_event_read_value),	\
+	FN(perf_prog_read_value),	\
+	FN(getsockopt),			\
+	FN(override_return),		\
+	FN(sock_ops_cb_flags_set),	\
+	FN(msg_redirect_map),		\
+	FN(msg_apply_bytes),		\
+	FN(msg_cork_bytes),		\
+	FN(msg_pull_data),		\
+	FN(bind),			\
+	FN(xdp_adjust_tail),		\
+	FN(skb_get_xfrm_state),		\
+	FN(get_stack),			\
+	FN(skb_load_bytes_relative),	\
+	FN(fib_lookup),			\
+	FN(sock_hash_update),		\
+	FN(msg_redirect_hash),		\
+	FN(sk_redirect_hash),		\
+	FN(lwt_push_encap),		\
+	FN(lwt_seg6_store_bytes),	\
+	FN(lwt_seg6_adjust_srh),	\
+	FN(lwt_seg6_action),		\
+	FN(rc_repeat),			\
+	FN(rc_keydown),			\
+	FN(skb_cgroup_id),		\
+	FN(get_current_cgroup_id),	\
+	FN(get_local_storage),		\
+	FN(sk_select_reuseport),	\
+	FN(skb_ancestor_cgroup_id),	\
+	FN(sk_lookup_tcp),		\
+	FN(sk_lookup_udp),		\
+	FN(sk_release),			\
+	FN(map_push_elem),		\
+	FN(map_pop_elem),		\
+	FN(map_peek_elem),		\
+	FN(msg_push_data),
+
+/* integer value in 'imm' field of BPF_CALL instruction selects which helper
+ * function eBPF program intends to call
+ */
+#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x
+enum bpf_func_id {
+	__BPF_FUNC_MAPPER(__BPF_ENUM_FN)
+	__BPF_FUNC_MAX_ID,
+};
+#undef __BPF_ENUM_FN
+
+/* All flags used by eBPF helper functions, placed here. */
+
+/* BPF_FUNC_skb_store_bytes flags. */
+#define BPF_F_RECOMPUTE_CSUM		(1ULL << 0)
+#define BPF_F_INVALIDATE_HASH		(1ULL << 1)
+
+/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
+ * First 4 bits are for passing the header field size.
+ */
+#define BPF_F_HDR_FIELD_MASK		0xfULL
+
+/* BPF_FUNC_l4_csum_replace flags. */
+#define BPF_F_PSEUDO_HDR		(1ULL << 4)
+#define BPF_F_MARK_MANGLED_0		(1ULL << 5)
+#define BPF_F_MARK_ENFORCE		(1ULL << 6)
+
+/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
+#define BPF_F_INGRESS			(1ULL << 0)
+
+/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
+#define BPF_F_TUNINFO_IPV6		(1ULL << 0)
+
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
+#define BPF_F_SKIP_FIELD_MASK		0xffULL
+#define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
+#define BPF_F_FAST_STACK_CMP		(1ULL << 9)
+#define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
+
+/* BPF_FUNC_skb_set_tunnel_key flags. */
+#define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
+#define BPF_F_DONT_FRAGMENT		(1ULL << 2)
+#define BPF_F_SEQ_NUMBER		(1ULL << 3)
+
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
+#define BPF_F_INDEX_MASK		0xffffffffULL
+#define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
+/* BPF_FUNC_perf_event_output for sk_buff input context. */
+#define BPF_F_CTXLEN_MASK		(0xfffffULL << 32)
+
+/* Mode for BPF_FUNC_skb_adjust_room helper. */
+enum bpf_adj_room_mode {
+	BPF_ADJ_ROOM_NET,
+};
+
+/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+enum bpf_hdr_start_off {
+	BPF_HDR_START_MAC,
+	BPF_HDR_START_NET,
+};
+
+/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
+enum bpf_lwt_encap_mode {
+	BPF_LWT_ENCAP_SEG6,
+	BPF_LWT_ENCAP_SEG6_INLINE
+};
+
+/* user accessible mirror of in-kernel sk_buff.
+ * new fields can only be added to the end of this structure
+ */
+struct __sk_buff {
+	__u32 len;
+	__u32 pkt_type;
+	__u32 mark;
+	__u32 queue_mapping;
+	__u32 protocol;
+	__u32 vlan_present;
+	__u32 vlan_tci;
+	__u32 vlan_proto;
+	__u32 priority;
+	__u32 ingress_ifindex;
+	__u32 ifindex;
+	__u32 tc_index;
+	__u32 cb[5];
+	__u32 hash;
+	__u32 tc_classid;
+	__u32 data;
+	__u32 data_end;
+	__u32 napi_id;
+
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
+	struct bpf_flow_keys *flow_keys;
+};
+
+struct bpf_tunnel_key {
+	__u32 tunnel_id;
+	union {
+		__u32 remote_ipv4;
+		__u32 remote_ipv6[4];
+	};
+	__u8 tunnel_tos;
+	__u8 tunnel_ttl;
+	__u16 tunnel_ext;	/* Padding, future use. */
+	__u32 tunnel_label;
+};
+
+/* user accessible mirror of in-kernel xfrm_state.
+ * new fields can only be added to the end of this structure
+ */
+struct bpf_xfrm_state {
+	__u32 reqid;
+	__u32 spi;	/* Stored in network byte order */
+	__u16 family;
+	__u16 ext;	/* Padding, future use. */
+	union {
+		__u32 remote_ipv4;	/* Stored in network byte order */
+		__u32 remote_ipv6[4];	/* Stored in network byte order */
+	};
+};
+
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled separately, see XDP_*.
+ */
+enum bpf_ret_code {
+	BPF_OK = 0,
+	/* 1 reserved */
+	BPF_DROP = 2,
+	/* 3-6 reserved */
+	BPF_REDIRECT = 7,
+	/* >127 are reserved for prog type specific return codes */
+};
+
+struct bpf_sock {
+	__u32 bound_dev_if;
+	__u32 family;
+	__u32 type;
+	__u32 protocol;
+	__u32 mark;
+	__u32 priority;
+	__u32 src_ip4;		/* Allows 1,2,4-byte read.
+				 * Stored in network byte order.
+				 */
+	__u32 src_ip6[4];	/* Allows 1,2,4-byte read.
+				 * Stored in network byte order.
+				 */
+	__u32 src_port;		/* Allows 4-byte read.
+				 * Stored in host byte order
+				 */
+};
+
+struct bpf_sock_tuple {
+	union {
+		struct {
+			__be32 saddr;
+			__be32 daddr;
+			__be16 sport;
+			__be16 dport;
+		} ipv4;
+		struct {
+			__be32 saddr[4];
+			__be32 daddr[4];
+			__be16 sport;
+			__be16 dport;
+		} ipv6;
+	};
+};
+
+#define XDP_PACKET_HEADROOM 256
+
+/* User return codes for XDP prog type.
+ * A valid XDP program must return one of these defined values. All other
+ * return codes are reserved for future use. Unknown return codes will
+ * result in packet drops and a warning via bpf_warn_invalid_xdp_action().
+ */
+enum xdp_action {
+	XDP_ABORTED = 0,
+	XDP_DROP,
+	XDP_PASS,
+	XDP_TX,
+	XDP_REDIRECT,
+};
+
+/* user accessible metadata for XDP packet hook
+ * new fields must be added to the end of this structure
+ */
+struct xdp_md {
+	__u32 data;
+	__u32 data_end;
+	__u32 data_meta;
+	/* Below access go through struct xdp_rxq_info */
+	__u32 ingress_ifindex; /* rxq->dev->ifindex */
+	__u32 rx_queue_index;  /* rxq->queue_index  */
+};
+
+enum sk_action {
+	SK_DROP = 0,
+	SK_PASS,
+};
+
+/* user accessible metadata for SK_MSG packet hook, new fields must
+ * be added to the end of this structure
+ */
+struct sk_msg_md {
+	void *data;
+	void *data_end;
+
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
+};
+
+struct sk_reuseport_md {
+	/*
+	 * Start of directly accessible data. It begins from
+	 * the tcp/udp header.
+	 */
+	void *data;
+	void *data_end;		/* End of directly accessible data */
+	/*
+	 * Total length of packet (starting from the tcp/udp header).
+	 * Note that the directly accessible bytes (data_end - data)
+	 * could be less than this "len".  Those bytes could be
+	 * indirectly read by a helper "bpf_skb_load_bytes()".
+	 */
+	__u32 len;
+	/*
+	 * Eth protocol in the mac header (network byte order). e.g.
+	 * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+	 */
+	__u32 eth_protocol;
+	__u32 ip_protocol;	/* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+	__u32 bind_inany;	/* Is sock bound to an INANY address? */
+	__u32 hash;		/* A hash of the packet 4 tuples */
+};
+
+#define BPF_TAG_SIZE	8
+
+struct bpf_prog_info {
+	__u32 type;
+	__u32 id;
+	__u8  tag[BPF_TAG_SIZE];
+	__u32 jited_prog_len;
+	__u32 xlated_prog_len;
+	__aligned_u64 jited_prog_insns;
+	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u32 gpl_compatible:1;
+	__u64 netns_dev;
+	__u64 netns_ino;
+	__u32 nr_jited_ksyms;
+	__u32 nr_jited_func_lens;
+	__aligned_u64 jited_ksyms;
+	__aligned_u64 jited_func_lens;
+} __attribute__((aligned(8)));
+
+struct bpf_map_info {
+	__u32 type;
+	__u32 id;
+	__u32 key_size;
+	__u32 value_size;
+	__u32 max_entries;
+	__u32 map_flags;
+	char  name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u32 :32;
+	__u64 netns_dev;
+	__u64 netns_ino;
+	__u32 btf_id;
+	__u32 btf_key_type_id;
+	__u32 btf_value_type_id;
+} __attribute__((aligned(8)));
+
+struct bpf_btf_info {
+	__aligned_u64 btf;
+	__u32 btf_size;
+	__u32 id;
+} __attribute__((aligned(8)));
+
+/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
+ * by user and intended to be used by socket (e.g. to bind to, depends on
+ * attach type).
+ */
+struct bpf_sock_addr {
+	__u32 user_family;	/* Allows 4-byte read, but no write. */
+	__u32 user_ip4;		/* Allows 1,2,4-byte read and 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 user_ip6[4];	/* Allows 1,2,4-byte read and 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 user_port;	/* Allows 4-byte read and write.
+				 * Stored in network byte order
+				 */
+	__u32 family;		/* Allows 4-byte read, but no write */
+	__u32 type;		/* Allows 4-byte read, but no write */
+	__u32 protocol;		/* Allows 4-byte read, but no write */
+	__u32 msg_src_ip4;	/* Allows 1,2,4-byte read and 4-byte write.
+				 * Stored in network byte order.
+				 */
+	__u32 msg_src_ip6[4];	/* Allows 1,2,4-byte read and 4-byte write.
+				 * Stored in network byte order.
+				 */
+};
+
+/* User bpf_sock_ops struct to access socket values and specify request ops
+ * and their replies.
+ * Some of these fields are in network (bigendian) byte order and may need
+ * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h).
+ * New fields can only be added at the end of this structure
+ */
+struct bpf_sock_ops {
+	__u32 op;
+	union {
+		__u32 args[4];		/* Optionally passed to bpf program */
+		__u32 reply;		/* Returned by bpf program	    */
+		__u32 replylong[4];	/* Optionally returned by bpf prog  */
+	};
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
+	__u32 is_fullsock;	/* Some TCP fields are only valid if
+				 * there is a full socket. If not, the
+				 * fields read as zero.
+				 */
+	__u32 snd_cwnd;
+	__u32 srtt_us;		/* Averaged RTT << 3 in usecs */
+	__u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
+	__u32 state;
+	__u32 rtt_min;
+	__u32 snd_ssthresh;
+	__u32 rcv_nxt;
+	__u32 snd_nxt;
+	__u32 snd_una;
+	__u32 mss_cache;
+	__u32 ecn_flags;
+	__u32 rate_delivered;
+	__u32 rate_interval_us;
+	__u32 packets_out;
+	__u32 retrans_out;
+	__u32 total_retrans;
+	__u32 segs_in;
+	__u32 data_segs_in;
+	__u32 segs_out;
+	__u32 data_segs_out;
+	__u32 lost_out;
+	__u32 sacked_out;
+	__u32 sk_txhash;
+	__u64 bytes_received;
+	__u64 bytes_acked;
+};
+
+/* Definitions for bpf_sock_ops_cb_flags */
+#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
+#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
+#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
+							 * supported cb flags
+							 */
+
+/* List of known BPF sock_ops operators.
+ * New entries can only be added at the end
+ */
+enum {
+	BPF_SOCK_OPS_VOID,
+	BPF_SOCK_OPS_TIMEOUT_INIT,	/* Should return SYN-RTO value to use or
+					 * -1 if default value should be used
+					 */
+	BPF_SOCK_OPS_RWND_INIT,		/* Should return initial advertised
+					 * window (in packets) or -1 if default
+					 * value should be used
+					 */
+	BPF_SOCK_OPS_TCP_CONNECT_CB,	/* Calls BPF program right before an
+					 * active connection is initialized
+					 */
+	BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB,	/* Calls BPF program when an
+						 * active connection is
+						 * established
+						 */
+	BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,	/* Calls BPF program when a
+						 * passive connection is
+						 * established
+						 */
+	BPF_SOCK_OPS_NEEDS_ECN,		/* If connection's congestion control
+					 * needs ECN
+					 */
+	BPF_SOCK_OPS_BASE_RTT,		/* Get base RTT. The correct value is
+					 * based on the path and may be
+					 * dependent on the congestion control
+					 * algorithm. In general it indicates
+					 * a congestion threshold. RTTs above
+					 * this indicate congestion
+					 */
+	BPF_SOCK_OPS_RTO_CB,		/* Called when an RTO has triggered.
+					 * Arg1: value of icsk_retransmits
+					 * Arg2: value of icsk_rto
+					 * Arg3: whether RTO has expired
+					 */
+	BPF_SOCK_OPS_RETRANS_CB,	/* Called when skb is retransmitted.
+					 * Arg1: sequence number of 1st byte
+					 * Arg2: # segments
+					 * Arg3: return value of
+					 *       tcp_transmit_skb (0 => success)
+					 */
+	BPF_SOCK_OPS_STATE_CB,		/* Called when TCP changes state.
+					 * Arg1: old_state
+					 * Arg2: new_state
+					 */
+	BPF_SOCK_OPS_TCP_LISTEN_CB,	/* Called on listen(2), right after
+					 * socket transition to LISTEN state.
+					 */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+	BPF_TCP_ESTABLISHED = 1,
+	BPF_TCP_SYN_SENT,
+	BPF_TCP_SYN_RECV,
+	BPF_TCP_FIN_WAIT1,
+	BPF_TCP_FIN_WAIT2,
+	BPF_TCP_TIME_WAIT,
+	BPF_TCP_CLOSE,
+	BPF_TCP_CLOSE_WAIT,
+	BPF_TCP_LAST_ACK,
+	BPF_TCP_LISTEN,
+	BPF_TCP_CLOSING,	/* Now a valid state */
+	BPF_TCP_NEW_SYN_RECV,
+
+	BPF_TCP_MAX_STATES	/* Leave at the end! */
+};
+
+#define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
+#define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
+
+struct bpf_perf_event_value {
+	__u64 counter;
+	__u64 enabled;
+	__u64 running;
+};
+
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	/* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
+	__u32 access_type;
+	__u32 major;
+	__u32 minor;
+};
+
+struct bpf_raw_tracepoint_args {
+	__u64 args[0];
+};
+
+/* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT:  Do lookup from egress perspective; default is ingress
+ */
+#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
+
+enum {
+	BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
+	BPF_FIB_LKUP_RET_BLACKHOLE,    /* dest is blackholed; can be dropped */
+	BPF_FIB_LKUP_RET_UNREACHABLE,  /* dest is unreachable; can be dropped */
+	BPF_FIB_LKUP_RET_PROHIBIT,     /* dest not allowed; can be dropped */
+	BPF_FIB_LKUP_RET_NOT_FWDED,    /* packet is not forwarded */
+	BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
+	BPF_FIB_LKUP_RET_UNSUPP_LWT,   /* fwd requires encapsulation */
+	BPF_FIB_LKUP_RET_NO_NEIGH,     /* no neighbor entry for nh */
+	BPF_FIB_LKUP_RET_FRAG_NEEDED,  /* fragmentation required to fwd */
+};
+
+struct bpf_fib_lookup {
+	/* input:  network family for lookup (AF_INET, AF_INET6)
+	 * output: network family of egress nexthop
+	 */
+	__u8	family;
+
+	/* set if lookup is to consider L4 data - e.g., FIB rules */
+	__u8	l4_protocol;
+	__be16	sport;
+	__be16	dport;
+
+	/* total length of packet from network header - used for MTU check */
+	__u16	tot_len;
+
+	/* input: L3 device index for lookup
+	 * output: device index from FIB lookup
+	 */
+	__u32	ifindex;
+
+	union {
+		/* inputs to lookup */
+		__u8	tos;		/* AF_INET  */
+		__be32	flowinfo;	/* AF_INET6, flow_label + priority */
+
+		/* output: metric of fib result (IPv4/IPv6 only) */
+		__u32	rt_metric;
+	};
+
+	union {
+		__be32		ipv4_src;
+		__u32		ipv6_src[4];  /* in6_addr; network order */
+	};
+
+	/* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+	 * network header. output: bpf_fib_lookup sets to gateway address
+	 * if FIB lookup returns gateway route
+	 */
+	union {
+		__be32		ipv4_dst;
+		__u32		ipv6_dst[4];  /* in6_addr; network order */
+	};
+
+	/* output */
+	__be16	h_vlan_proto;
+	__be16	h_vlan_TCI;
+	__u8	smac[6];     /* ETH_ALEN */
+	__u8	dmac[6];     /* ETH_ALEN */
+};
+
+enum bpf_task_fd_type {
+	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
+	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
+	BPF_FD_TYPE_KPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_UPROBE,		/* filename + offset */
+	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
+};
+
+struct bpf_flow_keys {
+	__u16	nhoff;
+	__u16	thoff;
+	__u16	addr_proto;			/* ETH_P_* of valid addrs */
+	__u8	is_frag;
+	__u8	is_first_frag;
+	__u8	is_encap;
+	__u8	ip_proto;
+	__be16	n_proto;
+	__be16	sport;
+	__be16	dport;
+	union {
+		struct {
+			__be32	ipv4_src;
+			__be32	ipv4_dst;
+		};
+		struct {
+			__u32	ipv6_src[4];	/* in6_addr; network order */
+			__u32	ipv6_dst[4];	/* in6_addr; network order */
+		};
+	};
+};
+
+#endif /* _UAPI__LINUX_BPF_H__ */
+)********"
diff --git a/src/cc/export/footer.h b/src/cc/export/footer.h
new file mode 100644
index 0000000..4e20dd4
--- /dev/null
+++ b/src/cc/export/footer.h
@@ -0,0 +1,28 @@
+R"********(
+/*
+ * Copyright (c) 2018 Clevernet, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef BPF_LICENSE
+/* No license defined, using GPL
+ * You can define your own BPF_LICENSE in your C code */
+#define BPF_LICENSE GPL
+#endif
+#define ___LICENSE(s) #s
+#define __LICENSE(s) ___LICENSE(s)
+#define _LICENSE __LICENSE(BPF_LICENSE)
+char _license[] SEC("license") = _LICENSE;
+
+)********"
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
new file mode 100755
index 0000000..882a79d
--- /dev/null
+++ b/src/cc/export/helpers.h
@@ -0,0 +1,782 @@
+R"********(
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __BPF_HELPERS_H
+#define __BPF_HELPERS_H
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_packet.h>
+#include <linux/version.h>
+#include <linux/log2.h>
+
+#ifndef CONFIG_BPF_SYSCALL
+#error "CONFIG_BPF_SYSCALL is undefined, please check your .config or ask your Linux distro to enable this feature"
+#endif
+
+#ifdef PERF_MAX_STACK_DEPTH
+#define BPF_MAX_STACK_DEPTH PERF_MAX_STACK_DEPTH
+#else
+#define BPF_MAX_STACK_DEPTH 127
+#endif
+
+/* helper macro to place programs, maps, license in
+ * different sections in elf_bpf file. Section names
+ * are interpreted by elf_bpf loader
+ */
+#define SEC(NAME) __attribute__((section(NAME), used))
+
+// Changes to the macro require changes in BFrontendAction classes
+#define BPF_F_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries, _flags) \
+struct _name##_table_t { \
+  _key_type key; \
+  _leaf_type leaf; \
+  _leaf_type * (*lookup) (_key_type *); \
+  _leaf_type * (*lookup_or_init) (_key_type *, _leaf_type *); \
+  int (*update) (_key_type *, _leaf_type *); \
+  int (*insert) (_key_type *, _leaf_type *); \
+  int (*delete) (_key_type *); \
+  void (*call) (void *, int index); \
+  void (*increment) (_key_type, ...); \
+  int (*get_stackid) (void *, u64); \
+  u32 max_entries; \
+  int flags; \
+}; \
+__attribute__((section("maps/" _table_type))) \
+struct _name##_table_t _name = { .flags = (_flags), .max_entries = (_max_entries) }
+
+#define BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries) \
+BPF_F_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries, 0)
+
+// define a table same as above but allow it to be referenced by other modules
+#define BPF_TABLE_PUBLIC(_table_type, _key_type, _leaf_type, _name, _max_entries) \
+BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries); \
+__attribute__((section("maps/export"))) \
+struct _name##_table_t __##_name
+
+// define a table that is shared across the programs in the same namespace
+#define BPF_TABLE_SHARED(_table_type, _key_type, _leaf_type, _name, _max_entries) \
+BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries); \
+__attribute__((section("maps/shared"))) \
+struct _name##_table_t __##_name
+
+// Identifier for current CPU used in perf_submit and perf_read
+// Prefer BPF_F_CURRENT_CPU flag, falls back to call helper for older kernel
+// Can be overridden from BCC
+#ifndef CUR_CPU_IDENTIFIER
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)
+#define CUR_CPU_IDENTIFIER BPF_F_CURRENT_CPU
+#else
+#define CUR_CPU_IDENTIFIER bpf_get_smp_processor_id()
+#endif
+#endif
+
+// Table for pushing custom events to userspace via ring buffer
+#define BPF_PERF_OUTPUT(_name) \
+struct _name##_table_t { \
+  int key; \
+  u32 leaf; \
+  /* map.perf_submit(ctx, data, data_size) */ \
+  int (*perf_submit) (void *, void *, u32); \
+  int (*perf_submit_skb) (void *, u32, void *, u32); \
+  u32 max_entries; \
+}; \
+__attribute__((section("maps/perf_output"))) \
+struct _name##_table_t _name = { .max_entries = 0 }
+
+// Table for reading hw perf cpu counters
+#define BPF_PERF_ARRAY(_name, _max_entries) \
+struct _name##_table_t { \
+  int key; \
+  u32 leaf; \
+  /* counter = map.perf_read(index) */ \
+  u64 (*perf_read) (int); \
+  int (*perf_counter_value) (int, void *, u32); \
+  u32 max_entries; \
+}; \
+__attribute__((section("maps/perf_array"))) \
+struct _name##_table_t _name = { .max_entries = (_max_entries) }
+
+// Table for cgroup file descriptors
+#define BPF_CGROUP_ARRAY(_name, _max_entries) \
+struct _name##_table_t { \
+  int key; \
+  u32 leaf; \
+  int (*check_current_task) (int); \
+  u32 max_entries; \
+}; \
+__attribute__((section("maps/cgroup_array"))) \
+struct _name##_table_t _name = { .max_entries = (_max_entries) }
+
+#define BPF_HASH1(_name) \
+  BPF_TABLE("hash", u64, u64, _name, 10240)
+#define BPF_HASH2(_name, _key_type) \
+  BPF_TABLE("hash", _key_type, u64, _name, 10240)
+#define BPF_HASH3(_name, _key_type, _leaf_type) \
+  BPF_TABLE("hash", _key_type, _leaf_type, _name, 10240)
+#define BPF_HASH4(_name, _key_type, _leaf_type, _size) \
+  BPF_TABLE("hash", _key_type, _leaf_type, _name, _size)
+
+// helper for default-variable macro function
+#define BPF_HASHX(_1, _2, _3, _4, NAME, ...) NAME
+
+// Define a hash function, some arguments optional
+// BPF_HASH(name, key_type=u64, leaf_type=u64, size=10240)
+#define BPF_HASH(...) \
+  BPF_HASHX(__VA_ARGS__, BPF_HASH4, BPF_HASH3, BPF_HASH2, BPF_HASH1)(__VA_ARGS__)
+
+#define BPF_ARRAY1(_name) \
+  BPF_TABLE("array", int, u64, _name, 10240)
+#define BPF_ARRAY2(_name, _leaf_type) \
+  BPF_TABLE("array", int, _leaf_type, _name, 10240)
+#define BPF_ARRAY3(_name, _leaf_type, _size) \
+  BPF_TABLE("array", int, _leaf_type, _name, _size)
+
+// helper for default-variable macro function
+#define BPF_ARRAYX(_1, _2, _3, NAME, ...) NAME
+
+// Define an array function, some arguments optional
+// BPF_ARRAY(name, leaf_type=u64, size=10240)
+#define BPF_ARRAY(...) \
+  BPF_ARRAYX(__VA_ARGS__, BPF_ARRAY3, BPF_ARRAY2, BPF_ARRAY1)(__VA_ARGS__)
+
+#define BPF_PERCPU_ARRAY1(_name)                        \
+    BPF_TABLE("percpu_array", int, u64, _name, 10240)
+#define BPF_PERCPU_ARRAY2(_name, _leaf_type) \
+    BPF_TABLE("percpu_array", int, _leaf_type, _name, 10240)
+#define BPF_PERCPU_ARRAY3(_name, _leaf_type, _size) \
+    BPF_TABLE("percpu_array", int, _leaf_type, _name, _size)
+
+// helper for default-variable macro function
+#define BPF_PERCPU_ARRAYX(_1, _2, _3, NAME, ...) NAME
+
+// Define an array function (per CPU), some arguments optional
+// BPF_PERCPU_ARRAY(name, leaf_type=u64, size=10240)
+#define BPF_PERCPU_ARRAY(...)                                           \
+  BPF_PERCPU_ARRAYX(                                                    \
+    __VA_ARGS__, BPF_PERCPU_ARRAY3, BPF_PERCPU_ARRAY2, BPF_PERCPU_ARRAY1) \
+           (__VA_ARGS__)
+
+#define BPF_HIST1(_name) \
+  BPF_TABLE("histogram", int, u64, _name, 64)
+#define BPF_HIST2(_name, _key_type) \
+  BPF_TABLE("histogram", _key_type, u64, _name, 64)
+#define BPF_HIST3(_name, _key_type, _size) \
+  BPF_TABLE("histogram", _key_type, u64, _name, _size)
+#define BPF_HISTX(_1, _2, _3, NAME, ...) NAME
+
+// Define a histogram, some arguments optional
+// BPF_HISTOGRAM(name, key_type=int, size=64)
+#define BPF_HISTOGRAM(...) \
+  BPF_HISTX(__VA_ARGS__, BPF_HIST3, BPF_HIST2, BPF_HIST1)(__VA_ARGS__)
+
+#define BPF_LPM_TRIE1(_name) \
+  BPF_F_TABLE("lpm_trie", u64, u64, _name, 10240, BPF_F_NO_PREALLOC)
+#define BPF_LPM_TRIE2(_name, _key_type) \
+  BPF_F_TABLE("lpm_trie", _key_type, u64, _name, 10240, BPF_F_NO_PREALLOC)
+#define BPF_LPM_TRIE3(_name, _key_type, _leaf_type) \
+  BPF_F_TABLE("lpm_trie", _key_type, _leaf_type, _name, 10240, BPF_F_NO_PREALLOC)
+#define BPF_LPM_TRIE4(_name, _key_type, _leaf_type, _size) \
+  BPF_F_TABLE("lpm_trie", _key_type, _leaf_type, _name, _size, BPF_F_NO_PREALLOC)
+#define BPF_LPM_TRIEX(_1, _2, _3, _4, NAME, ...) NAME
+
+// Define a LPM trie function, some arguments optional
+// BPF_LPM_TRIE(name, key_type=u64, leaf_type=u64, size=10240)
+#define BPF_LPM_TRIE(...) \
+  BPF_LPM_TRIEX(__VA_ARGS__, BPF_LPM_TRIE4, BPF_LPM_TRIE3, BPF_LPM_TRIE2, BPF_LPM_TRIE1)(__VA_ARGS__)
+
+struct bpf_stacktrace {
+  u64 ip[BPF_MAX_STACK_DEPTH];
+};
+
+#define BPF_STACK_TRACE(_name, _max_entries) \
+  BPF_TABLE("stacktrace", int, struct bpf_stacktrace, _name, roundup_pow_of_two(_max_entries))
+
+#define BPF_PROG_ARRAY(_name, _max_entries) \
+  BPF_TABLE("prog", u32, u32, _name, _max_entries)
+
+#define BPF_XDP_REDIRECT_MAP(_table_type, _leaf_type, _name, _max_entries) \
+struct _name##_table_t { \
+  u32 key; \
+  _leaf_type leaf; \
+  /* xdp_act = map.redirect_map(index, flag) */ \
+  u64 (*redirect_map) (int, int); \
+  u32 max_entries; \
+}; \
+__attribute__((section("maps/"_table_type))) \
+struct _name##_table_t _name = { .max_entries = (_max_entries) }
+
+#define BPF_DEVMAP(_name, _max_entries) \
+  BPF_XDP_REDIRECT_MAP("devmap", int, _name, _max_entries)
+
+#define BPF_CPUMAP(_name, _max_entries) \
+  BPF_XDP_REDIRECT_MAP("cpumap", u32, _name, _max_entries)
+
+// packet parsing state machine helpers
+#define cursor_advance(_cursor, _len) \
+  ({ void *_tmp = _cursor; _cursor += _len; _tmp; })
+
+#ifdef LINUX_VERSION_CODE_OVERRIDE
+unsigned _version SEC("version") = LINUX_VERSION_CODE_OVERRIDE;
+#else
+unsigned _version SEC("version") = LINUX_VERSION_CODE;
+#endif
+
+/* helper functions called from eBPF programs written in C */
+static void *(*bpf_map_lookup_elem)(void *map, void *key) =
+  (void *) BPF_FUNC_map_lookup_elem;
+static int (*bpf_map_update_elem)(void *map, void *key, void *value, u64 flags) =
+  (void *) BPF_FUNC_map_update_elem;
+static int (*bpf_map_delete_elem)(void *map, void *key) =
+  (void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_probe_read)(void *dst, u64 size, const void *unsafe_ptr) =
+  (void *) BPF_FUNC_probe_read;
+static u64 (*bpf_ktime_get_ns)(void) =
+  (void *) BPF_FUNC_ktime_get_ns;
+static u32 (*bpf_get_prandom_u32)(void) =
+  (void *) BPF_FUNC_get_prandom_u32;
+static int (*bpf_trace_printk_)(const char *fmt, u64 fmt_size, ...) =
+  (void *) BPF_FUNC_trace_printk;
+static int (*bpf_probe_read_str)(void *dst, u64 size, const void *unsafe_ptr) =
+  (void *) BPF_FUNC_probe_read_str;
+int bpf_trace_printk(const char *fmt, ...) asm("llvm.bpf.extra");
+static inline __attribute__((always_inline))
+void bpf_tail_call_(u64 map_fd, void *ctx, int index) {
+  ((void (*)(void *, u64, int))BPF_FUNC_tail_call)(ctx, map_fd, index);
+}
+static int (*bpf_clone_redirect)(void *ctx, int ifindex, u32 flags) =
+  (void *) BPF_FUNC_clone_redirect;
+static u64 (*bpf_get_smp_processor_id)(void) =
+  (void *) BPF_FUNC_get_smp_processor_id;
+static u64 (*bpf_get_current_pid_tgid)(void) =
+  (void *) BPF_FUNC_get_current_pid_tgid;
+static u64 (*bpf_get_current_uid_gid)(void) =
+  (void *) BPF_FUNC_get_current_uid_gid;
+static int (*bpf_get_current_comm)(void *buf, int buf_size) =
+  (void *) BPF_FUNC_get_current_comm;
+static u64 (*bpf_get_cgroup_classid)(void *ctx) =
+  (void *) BPF_FUNC_get_cgroup_classid;
+static u64 (*bpf_skb_vlan_push)(void *ctx, u16 proto, u16 vlan_tci) =
+  (void *) BPF_FUNC_skb_vlan_push;
+static u64 (*bpf_skb_vlan_pop)(void *ctx) =
+  (void *) BPF_FUNC_skb_vlan_pop;
+static int (*bpf_skb_get_tunnel_key)(void *ctx, void *to, u32 size, u64 flags) =
+  (void *) BPF_FUNC_skb_get_tunnel_key;
+static int (*bpf_skb_set_tunnel_key)(void *ctx, void *from, u32 size, u64 flags) =
+  (void *) BPF_FUNC_skb_set_tunnel_key;
+static u64 (*bpf_perf_event_read)(void *map, u64 flags) =
+  (void *) BPF_FUNC_perf_event_read;
+static int (*bpf_redirect)(int ifindex, u32 flags) =
+  (void *) BPF_FUNC_redirect;
+static u32 (*bpf_get_route_realm)(void *ctx) =
+  (void *) BPF_FUNC_get_route_realm;
+static int (*bpf_perf_event_output)(void *ctx, void *map, u64 index, void *data, u32 size) =
+  (void *) BPF_FUNC_perf_event_output;
+static int (*bpf_skb_load_bytes)(void *ctx, int offset, void *to, u32 len) =
+  (void *) BPF_FUNC_skb_load_bytes;
+static int (*bpf_perf_event_read_value)(void *map, u64 flags, void *buf, u32 buf_size) =
+  (void *) BPF_FUNC_perf_event_read_value;
+static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, u32 buf_size) =
+  (void *) BPF_FUNC_perf_prog_read_value;
+static int (*bpf_current_task_under_cgroup)(void *map, int index) =
+  (void *) BPF_FUNC_current_task_under_cgroup;
+static u32 (*bpf_get_socket_cookie)(void *ctx) =
+  (void *) BPF_FUNC_get_socket_cookie;
+static u64 (*bpf_get_socket_uid)(void *ctx) =
+  (void *) BPF_FUNC_get_socket_uid;
+static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, int optlen) =
+  (void *) BPF_FUNC_getsockopt;
+static int (*bpf_redirect_map)(void *map, int key, int flags) =
+  (void *) BPF_FUNC_redirect_map;
+static int (*bpf_set_hash)(void *ctx, u32 hash) =
+  (void *) BPF_FUNC_set_hash;
+static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, int optlen) =
+  (void *) BPF_FUNC_setsockopt;
+static int (*bpf_skb_adjust_room)(void *ctx, int len_diff, u32 mode, u64 flags) =
+  (void *) BPF_FUNC_skb_adjust_room;
+static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) =
+  (void *) BPF_FUNC_skb_under_cgroup;
+static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) =
+  (void *) BPF_FUNC_sk_redirect_map;
+static int (*bpf_sock_map_update)(void *map, void *key, void *value, unsigned long long flags) =
+  (void *) BPF_FUNC_sock_map_update;
+static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) =
+  (void *) BPF_FUNC_xdp_adjust_meta;
+
+/* bcc_get_stackid will return a negative value in the case of an error
+ *
+ * BPF_STACK_TRACE(_name, _size) will allocate space for _size stack traces.
+ *  -ENOMEM will be returned when this limit is reached.
+ *
+ * -EFAULT is typically returned when requesting user-space stack traces (using
+ * BPF_F_USER_STACK) for kernel threads. However, a valid stackid may be
+ * returned in some cases; consider a tracepoint or kprobe executing in the
+ * kernel context. Given this you can typically ignore -EFAULT errors when
+ * retrieving user-space stack traces.
+ */
+static int (*bcc_get_stackid_)(void *ctx, void *map, u64 flags) =
+  (void *) BPF_FUNC_get_stackid;
+static inline __attribute__((always_inline))
+int bcc_get_stackid(uintptr_t map, void *ctx, u64 flags) {
+  return bcc_get_stackid_(ctx, (void *)map, flags);
+}
+
+static int (*bpf_csum_diff)(void *from, u64 from_size, void *to, u64 to_size, u64 seed) =
+  (void *) BPF_FUNC_csum_diff;
+static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, u32 size) =
+  (void *) BPF_FUNC_skb_get_tunnel_opt;
+static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, u32 size) =
+  (void *) BPF_FUNC_skb_set_tunnel_opt;
+static int (*bpf_skb_change_proto)(void *ctx, u16 proto, u64 flags) =
+  (void *) BPF_FUNC_skb_change_proto;
+static int (*bpf_skb_change_type)(void *ctx, u32 type) =
+  (void *) BPF_FUNC_skb_change_type;
+static u32 (*bpf_get_hash_recalc)(void *ctx) =
+  (void *) BPF_FUNC_get_hash_recalc;
+static u64 (*bpf_get_current_task)(void) =
+  (void *) BPF_FUNC_get_current_task;
+static int (*bpf_probe_write_user)(void *dst, void *src, u32 size) =
+  (void *) BPF_FUNC_probe_write_user;
+static int (*bpf_skb_change_tail)(void *ctx, u32 new_len, u64 flags) =
+  (void *) BPF_FUNC_skb_change_tail;
+static int (*bpf_skb_pull_data)(void *ctx, u32 len) =
+  (void *) BPF_FUNC_skb_pull_data;
+static int (*bpf_csum_update)(void *ctx, u16 csum) =
+  (void *) BPF_FUNC_csum_update;
+static int (*bpf_set_hash_invalid)(void *ctx) =
+  (void *) BPF_FUNC_set_hash_invalid;
+static int (*bpf_get_numa_node_id)(void) =
+  (void *) BPF_FUNC_get_numa_node_id;
+static int (*bpf_skb_change_head)(void *ctx, u32 len, u64 flags) =
+  (void *) BPF_FUNC_skb_change_head;
+static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
+  (void *) BPF_FUNC_xdp_adjust_head;
+static int (*bpf_override_return)(void *pt_regs, unsigned long rc) =
+  (void *) BPF_FUNC_override_return;
+static int (*bpf_sock_ops_cb_flags_set)(void *skops, int flags) =
+  (void *) BPF_FUNC_sock_ops_cb_flags_set;
+static int (*bpf_msg_redirect_map)(void *msg, void *map, u32 key, u64 flags) =
+  (void *) BPF_FUNC_msg_redirect_map;
+static int (*bpf_msg_apply_bytes)(void *msg, u32 bytes) =
+  (void *) BPF_FUNC_msg_apply_bytes;
+static int (*bpf_msg_cork_bytes)(void *msg, u32 bytes) =
+  (void *) BPF_FUNC_msg_cork_bytes;
+static int (*bpf_msg_pull_data)(void *msg, u32 start, u32 end, u64 flags) =
+  (void *) BPF_FUNC_msg_pull_data;
+static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
+  (void *) BPF_FUNC_bind;
+static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
+  (void *) BPF_FUNC_xdp_adjust_tail;
+static int (*bpf_skb_get_xfrm_state)(void *ctx, u32 index, void *xfrm_state, u32 size, u64 flags) =
+  (void *) BPF_FUNC_skb_get_xfrm_state;
+static int (*bpf_get_stack)(void *ctx, void *buf, u32 size, u64 flags) =
+  (void *) BPF_FUNC_get_stack;
+static int (*bpf_skb_load_bytes_relative)(void *ctx, u32 offset, void *to, u32 len, u32 start_header) =
+  (void *) BPF_FUNC_skb_load_bytes_relative;
+static int (*bpf_fib_lookup)(void *ctx, void *params, int plen, u32 flags) =
+  (void *) BPF_FUNC_fib_lookup;
+static int (*bpf_sock_hash_update)(void *ctx, void *map, void *key, u64 flags) =
+  (void *) BPF_FUNC_sock_hash_update;
+static int (*bpf_msg_redirect_hash)(void *ctx, void *map, void *key, u64 flags) =
+  (void *) BPF_FUNC_msg_redirect_hash;
+static int (*bpf_sk_redirect_hash)(void *ctx, void *map, void *key, u64 flags) =
+  (void *) BPF_FUNC_sk_redirect_hash;
+static int (*bpf_lwt_push_encap)(void *skb, u32 type, void *hdr, u32 len) =
+  (void *) BPF_FUNC_lwt_push_encap;
+static int (*bpf_lwt_seg6_store_bytes)(void *ctx, u32 offset, const void *from, u32 len) =
+  (void *) BPF_FUNC_lwt_seg6_store_bytes;
+static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, u32 offset, s32 delta) =
+  (void *) BPF_FUNC_lwt_seg6_adjust_srh;
+static int (*bpf_lwt_seg6_action)(void *ctx, u32 action, void *param, u32 param_len) =
+  (void *) BPF_FUNC_lwt_seg6_action;
+static int (*bpf_rc_keydown)(void *ctx, u32 protocol, u64 scancode, u32 toggle) =
+  (void *) BPF_FUNC_rc_keydown;
+static int (*bpf_rc_repeat)(void *ctx) =
+  (void *) BPF_FUNC_rc_repeat;
+static u64 (*bpf_skb_cgroup_id)(void *skb) =
+  (void *) BPF_FUNC_skb_cgroup_id;
+static u64 (*bpf_get_current_cgroup_id)(void) =
+  (void *) BPF_FUNC_get_current_cgroup_id;
+static u64 (*bpf_skb_ancestor_cgroup_id)(void *skb, int ancestor_level) =
+  (void *) BPF_FUNC_skb_ancestor_cgroup_id;
+static void * (*bpf_get_local_storage)(void *map, u64 flags) =
+  (void *) BPF_FUNC_get_local_storage;
+static int (*bpf_sk_select_reuseport)(void *reuse, void *map, void *key, u64 flags) =
+  (void *) BPF_FUNC_sk_select_reuseport;
+static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx,
+                                             struct bpf_sock_tuple *tuple,
+                                             int size, unsigned int netns_id,
+                                             unsigned long long flags) =
+  (void *) BPF_FUNC_sk_lookup_tcp;
+static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
+                                             struct bpf_sock_tuple *tuple,
+                                             int size, unsigned int netns_id,
+                                             unsigned long long flags) =
+  (void *) BPF_FUNC_sk_lookup_udp;
+static int (*bpf_sk_release)(struct bpf_sock *sk) =
+  (void *) BPF_FUNC_sk_release;
+static int (*bpf_map_push_elem)(void *map, const void *value, u64 flags) =
+  (void *) BPF_FUNC_map_push_elem;
+static int (*bpf_map_pop_elem)(void *map, void *value) =
+  (void *) BPF_FUNC_map_pop_elem;
+static int (*bpf_map_peek_elem)(void *map, void *value) =
+  (void *) BPF_FUNC_map_peek_elem;
+static int (*bpf_msg_push_data)(void *skb, u32 start, u32 len, u64 flags) =
+  (void *) BPF_FUNC_msg_push_data;
+
+/* llvm builtin functions that eBPF C program may use to
+ * emit BPF_LD_ABS and BPF_LD_IND instructions
+ */
+struct sk_buff;
+unsigned long long load_byte(void *skb,
+  unsigned long long off) asm("llvm.bpf.load.byte");
+unsigned long long load_half(void *skb,
+  unsigned long long off) asm("llvm.bpf.load.half");
+unsigned long long load_word(void *skb,
+  unsigned long long off) asm("llvm.bpf.load.word");
+
+/* a helper structure used by eBPF C program
+ * to describe map attributes to elf_bpf loader
+ */
+struct bpf_map_def {
+  unsigned int type;
+  unsigned int key_size;
+  unsigned int value_size;
+  unsigned int max_entries;
+};
+
+static int (*bpf_skb_store_bytes)(void *ctx, unsigned long long off, void *from,
+                                  unsigned long long len, unsigned long long flags) =
+  (void *) BPF_FUNC_skb_store_bytes;
+static int (*bpf_l3_csum_replace)(void *ctx, unsigned long long off, unsigned long long from,
+                                  unsigned long long to, unsigned long long flags) =
+  (void *) BPF_FUNC_l3_csum_replace;
+static int (*bpf_l4_csum_replace)(void *ctx, unsigned long long off, unsigned long long from,
+                                  unsigned long long to, unsigned long long flags) =
+  (void *) BPF_FUNC_l4_csum_replace;
+
+static inline __attribute__((always_inline))
+u16 bpf_ntohs(u16 val) {
+  /* will be recognized by gcc into rotate insn and eventually rolw 8 */
+  return (val << 8) | (val >> 8);
+}
+
+static inline __attribute__((always_inline))
+u32 bpf_ntohl(u32 val) {
+  /* gcc will use bswapsi2 insn */
+  return __builtin_bswap32(val);
+}
+
+static inline __attribute__((always_inline))
+u64 bpf_ntohll(u64 val) {
+  /* gcc will use bswapdi2 insn */
+  return __builtin_bswap64(val);
+}
+
+static inline __attribute__((always_inline))
+unsigned __int128 bpf_ntoh128(unsigned __int128 val) {
+  return (((unsigned __int128)bpf_ntohll(val) << 64) | (u64)bpf_ntohll(val >> 64));
+}
+
+static inline __attribute__((always_inline))
+u16 bpf_htons(u16 val) {
+  return bpf_ntohs(val);
+}
+
+static inline __attribute__((always_inline))
+u32 bpf_htonl(u32 val) {
+  return bpf_ntohl(val);
+}
+
+static inline __attribute__((always_inline))
+u64 bpf_htonll(u64 val) {
+  return bpf_ntohll(val);
+}
+
+static inline __attribute__((always_inline))
+unsigned __int128 bpf_hton128(unsigned __int128 val) {
+  return bpf_ntoh128(val);
+}
+
+static inline __attribute__((always_inline))
+u64 load_dword(void *skb, u64 off) {
+  return ((u64)load_word(skb, off) << 32) | load_word(skb, off + 4);
+}
+
+void bpf_store_byte(void *skb, u64 off, u64 val) asm("llvm.bpf.store.byte");
+void bpf_store_half(void *skb, u64 off, u64 val) asm("llvm.bpf.store.half");
+void bpf_store_word(void *skb, u64 off, u64 val) asm("llvm.bpf.store.word");
+u64 bpf_pseudo_fd(u64, u64) asm("llvm.bpf.pseudo");
+
+static inline void __attribute__((always_inline))
+bpf_store_dword(void *skb, u64 off, u64 val) {
+  bpf_store_word(skb, off, (u32)val);
+  bpf_store_word(skb, off + 4, val >> 32);
+}
+
+#define MASK(_n) ((_n) < 64 ? (1ull << (_n)) - 1 : ((u64)-1LL))
+#define MASK128(_n) ((_n) < 128 ? ((unsigned __int128)1 << (_n)) - 1 : ((unsigned __int128)-1))
+
+static inline __attribute__((always_inline))
+unsigned int bpf_log2(unsigned int v)
+{
+  unsigned int r;
+  unsigned int shift;
+
+  r = (v > 0xFFFF) << 4; v >>= r;
+  shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+  shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+  shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+  r |= (v >> 1);
+  return r;
+}
+
+static inline __attribute__((always_inline))
+unsigned int bpf_log2l(unsigned long v)
+{
+  unsigned int hi = v >> 32;
+  if (hi)
+    return bpf_log2(hi) + 32 + 1;
+  else
+    return bpf_log2(v) + 1;
+}
+
+struct bpf_context;
+
+static inline __attribute__((always_inline))
+SEC("helpers")
+u64 bpf_dext_pkt(void *pkt, u64 off, u64 bofs, u64 bsz) {
+  if (bofs == 0 && bsz == 8) {
+    return load_byte(pkt, off);
+  } else if (bofs + bsz <= 8) {
+    return load_byte(pkt, off) >> (8 - (bofs + bsz))  &  MASK(bsz);
+  } else if (bofs == 0 && bsz == 16) {
+    return load_half(pkt, off);
+  } else if (bofs + bsz <= 16) {
+    return load_half(pkt, off) >> (16 - (bofs + bsz))  &  MASK(bsz);
+  } else if (bofs == 0 && bsz == 32) {
+    return load_word(pkt, off);
+  } else if (bofs + bsz <= 32) {
+    return load_word(pkt, off) >> (32 - (bofs + bsz))  &  MASK(bsz);
+  } else if (bofs == 0 && bsz == 64) {
+    return load_dword(pkt, off);
+  } else if (bofs + bsz <= 64) {
+    return load_dword(pkt, off) >> (64 - (bofs + bsz))  &  MASK(bsz);
+  }
+  return 0;
+}
+
+static inline __attribute__((always_inline))
+SEC("helpers")
+void bpf_dins_pkt(void *pkt, u64 off, u64 bofs, u64 bsz, u64 val) {
+  // The load_xxx function does a bswap before returning the short/word/dword,
+  // so the value in register will always be host endian. However, the bytes
+  // written back need to be in network order.
+  if (bofs == 0 && bsz == 8) {
+    bpf_skb_store_bytes(pkt, off, &val, 1, 0);
+  } else if (bofs + bsz <= 8) {
+    u8 v = load_byte(pkt, off);
+    v &= ~(MASK(bsz) << (8 - (bofs + bsz)));
+    v |= ((val & MASK(bsz)) << (8 - (bofs + bsz)));
+    bpf_skb_store_bytes(pkt, off, &v, 1, 0);
+  } else if (bofs == 0 && bsz == 16) {
+    u16 v = bpf_htons(val);
+    bpf_skb_store_bytes(pkt, off, &v, 2, 0);
+  } else if (bofs + bsz <= 16) {
+    u16 v = load_half(pkt, off);
+    v &= ~(MASK(bsz) << (16 - (bofs + bsz)));
+    v |= ((val & MASK(bsz)) << (16 - (bofs + bsz)));
+    v = bpf_htons(v);
+    bpf_skb_store_bytes(pkt, off, &v, 2, 0);
+  } else if (bofs == 0 && bsz == 32) {
+    u32 v = bpf_htonl(val);
+    bpf_skb_store_bytes(pkt, off, &v, 4, 0);
+  } else if (bofs + bsz <= 32) {
+    u32 v = load_word(pkt, off);
+    v &= ~(MASK(bsz) << (32 - (bofs + bsz)));
+    v |= ((val & MASK(bsz)) << (32 - (bofs + bsz)));
+    v = bpf_htonl(v);
+    bpf_skb_store_bytes(pkt, off, &v, 4, 0);
+  } else if (bofs == 0 && bsz == 64) {
+    u64 v = bpf_htonll(val);
+    bpf_skb_store_bytes(pkt, off, &v, 8, 0);
+  } else if (bofs + bsz <= 64) {
+    u64 v = load_dword(pkt, off);
+    v &= ~(MASK(bsz) << (64 - (bofs + bsz)));
+    v |= ((val & MASK(bsz)) << (64 - (bofs + bsz)));
+    v = bpf_htonll(v);
+    bpf_skb_store_bytes(pkt, off, &v, 8, 0);
+  }
+}
+
+static inline __attribute__((always_inline))
+SEC("helpers")
+void * bpf_map_lookup_elem_(uintptr_t map, void *key) {
+  return bpf_map_lookup_elem((void *)map, key);
+}
+
+static inline __attribute__((always_inline))
+SEC("helpers")
+int bpf_map_update_elem_(uintptr_t map, void *key, void *value, u64 flags) {
+  return bpf_map_update_elem((void *)map, key, value, flags);
+}
+
+static inline __attribute__((always_inline))
+SEC("helpers")
+int bpf_map_delete_elem_(uintptr_t map, void *key) {
+  return bpf_map_delete_elem((void *)map, key);
+}
+
+static inline __attribute__((always_inline))
+SEC("helpers")
+int bpf_l3_csum_replace_(void *ctx, u64 off, u64 from, u64 to, u64 flags) {
+  switch (flags & 0xf) {
+    case 2:
+      return bpf_l3_csum_replace(ctx, off, bpf_htons(from), bpf_htons(to), flags);
+    case 4:
+      return bpf_l3_csum_replace(ctx, off, bpf_htonl(from), bpf_htonl(to), flags);
+    case 8:
+      return bpf_l3_csum_replace(ctx, off, bpf_htonll(from), bpf_htonll(to), flags);
+    default:
+      {}
+  }
+  return bpf_l3_csum_replace(ctx, off, from, to, flags);
+}
+
+static inline __attribute__((always_inline))
+SEC("helpers")
+int bpf_l4_csum_replace_(void *ctx, u64 off, u64 from, u64 to, u64 flags) {
+  switch (flags & 0xf) {
+    case 2:
+      return bpf_l4_csum_replace(ctx, off, bpf_htons(from), bpf_htons(to), flags);
+    case 4:
+      return bpf_l4_csum_replace(ctx, off, bpf_htonl(from), bpf_htonl(to), flags);
+    case 8:
+      return bpf_l4_csum_replace(ctx, off, bpf_htonll(from), bpf_htonll(to), flags);
+    default:
+      {}
+  }
+  return bpf_l4_csum_replace(ctx, off, from, to, flags);
+}
+
+int incr_cksum_l3(void *off, u64 oldval, u64 newval) asm("llvm.bpf.extra");
+int incr_cksum_l4(void *off, u64 oldval, u64 newval, u64 flags) asm("llvm.bpf.extra");
+int bpf_num_cpus() asm("llvm.bpf.extra");
+
+struct pt_regs;
+int bpf_usdt_readarg(int argc, struct pt_regs *ctx, void *arg) asm("llvm.bpf.extra");
+int bpf_usdt_readarg_p(int argc, struct pt_regs *ctx, void *buf, u64 len) asm("llvm.bpf.extra");
+
+/* Scan the ARCH passed in from ARCH env variable (see kbuild_helper.cc) */
+#if defined(__TARGET_ARCH_x86)
+#define bpf_target_x86
+#define bpf_target_defined
+#elif defined(__TARGET_ARCH_s930x)
+#define bpf_target_s930x
+#define bpf_target_defined
+#elif defined(__TARGET_ARCH_arm64)
+#define bpf_target_arm64
+#define bpf_target_defined
+#elif defined(__TARGET_ARCH_powerpc)
+#define bpf_target_powerpc
+#define bpf_target_defined
+#else
+#undef bpf_target_defined
+#endif
+
+/* Fall back to what the compiler says */
+#ifndef bpf_target_defined
+#if defined(__x86_64__)
+#define bpf_target_x86
+#elif defined(__s390x__)
+#define bpf_target_s930x
+#elif defined(__aarch64__)
+#define bpf_target_arm64
+#elif defined(__powerpc__)
+#define bpf_target_powerpc
+#endif
+#endif
+
+#if defined(bpf_target_powerpc)
+#define PT_REGS_PARM1(ctx)	((ctx)->gpr[3])
+#define PT_REGS_PARM2(ctx)	((ctx)->gpr[4])
+#define PT_REGS_PARM3(ctx)	((ctx)->gpr[5])
+#define PT_REGS_PARM4(ctx)	((ctx)->gpr[6])
+#define PT_REGS_PARM5(ctx)	((ctx)->gpr[7])
+#define PT_REGS_PARM6(ctx)	((ctx)->gpr[8])
+#define PT_REGS_RC(ctx)		((ctx)->gpr[3])
+#define PT_REGS_IP(ctx)		((ctx)->nip)
+#define PT_REGS_SP(ctx)		((ctx)->gpr[1])
+#elif defined(bpf_target_s930x)
+#define PT_REGS_PARM1(x) ((x)->gprs[2])
+#define PT_REGS_PARM2(x) ((x)->gprs[3])
+#define PT_REGS_PARM3(x) ((x)->gprs[4])
+#define PT_REGS_PARM4(x) ((x)->gprs[5])
+#define PT_REGS_PARM5(x) ((x)->gprs[6])
+#define PT_REGS_RET(x) ((x)->gprs[14])
+#define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */
+#define PT_REGS_RC(x) ((x)->gprs[2])
+#define PT_REGS_SP(x) ((x)->gprs[15])
+#define PT_REGS_IP(x) ((x)->psw.addr)
+#elif defined(bpf_target_x86)
+#define PT_REGS_PARM1(ctx)	((ctx)->di)
+#define PT_REGS_PARM2(ctx)	((ctx)->si)
+#define PT_REGS_PARM3(ctx)	((ctx)->dx)
+#define PT_REGS_PARM4(ctx)	((ctx)->cx)
+#define PT_REGS_PARM5(ctx)	((ctx)->r8)
+#define PT_REGS_PARM6(ctx)	((ctx)->r9)
+#define PT_REGS_FP(ctx)         ((ctx)->bp) /* Works only with CONFIG_FRAME_POINTER */
+#define PT_REGS_RC(ctx)		((ctx)->ax)
+#define PT_REGS_IP(ctx)		((ctx)->ip)
+#define PT_REGS_SP(ctx)		((ctx)->sp)
+#elif defined(bpf_target_arm64)
+#define PT_REGS_PARM1(x)	((x)->regs[0])
+#define PT_REGS_PARM2(x)	((x)->regs[1])
+#define PT_REGS_PARM3(x)	((x)->regs[2])
+#define PT_REGS_PARM4(x)	((x)->regs[3])
+#define PT_REGS_PARM5(x)	((x)->regs[4])
+#define PT_REGS_PARM6(x)	((x)->regs[5])
+#define PT_REGS_RET(x)		((x)->regs[30])
+#define PT_REGS_FP(x)		((x)->regs[29]) /*  Works only with CONFIG_FRAME_POINTER */
+#define PT_REGS_RC(x)		((x)->regs[0])
+#define PT_REGS_SP(x)		((x)->sp)
+#define PT_REGS_IP(x)		((x)->pc)
+#else
+#error "bcc does not support this platform yet"
+#endif
+
+#define lock_xadd(ptr, val) ((void)__sync_fetch_and_add(ptr, val))
+
+#define TRACEPOINT_PROBE(category, event) \
+int tracepoint__##category##__##event(struct tracepoint__##category##__##event *args)
+
+#define RAW_TRACEPOINT_PROBE(event) \
+int raw_tracepoint__##event(struct bpf_raw_tracepoint_args *ctx)
+
+#define TP_DATA_LOC_READ_CONST(dst, field, length)                        \
+        do {                                                              \
+            unsigned short __offset = args->data_loc_##field & 0xFFFF;    \
+            bpf_probe_read((void *)dst, length, (char *)args + __offset); \
+        } while (0);
+
+#define TP_DATA_LOC_READ(dst, field)                                        \
+        do {                                                                \
+            unsigned short __offset = args->data_loc_##field & 0xFFFF;      \
+            unsigned short __length = args->data_loc_##field >> 16;         \
+            bpf_probe_read((void *)dst, __length, (char *)args + __offset); \
+        } while (0);
+
+#endif
+)********"
diff --git a/src/cc/export/proto.h b/src/cc/export/proto.h
new file mode 100644
index 0000000..5acaf31
--- /dev/null
+++ b/src/cc/export/proto.h
@@ -0,0 +1,150 @@
+R"********(
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BCC_PROTO_H
+#define __BCC_PROTO_H
+
+#include <uapi/linux/if_ether.h>
+
+/* Marks the packet-header structs below as packed; the deprecated("packet")
+ * attribute acts as a tag on accesses — NOTE(review): presumably consumed by
+ * the bcc clang rewriter to special-case packet field loads/stores; confirm. */
+#define BPF_PACKET_HEADER __attribute__((packed)) __attribute__((deprecated("packet")))
+
+/* Ethernet header: 48-bit destination and source MACs plus EtherType. */
+struct ethernet_t {
+  unsigned long long  dst:48;
+  unsigned long long  src:48;
+  unsigned int        type:16;
+} BPF_PACKET_HEADER;
+
+/* 802.1Q VLAN tag: priority, CFI bit, 12-bit VLAN id, inner EtherType. */
+struct dot1q_t {
+  unsigned short pri:3;
+  unsigned short cfi:1;
+  unsigned short vlanid:12;
+  unsigned short type;
+} BPF_PACKET_HEADER;
+
+/* ARP packet, laid out for Ethernet/IPv4 (48-bit hw addrs, 32-bit proto addrs). */
+struct arp_t {
+  unsigned short      htype;
+  unsigned short      ptype;
+  unsigned char       hlen;
+  unsigned char       plen;
+  unsigned short      oper;
+  unsigned long long  sha:48;
+  unsigned long long  spa:32;
+  unsigned long long  tha:48;
+  unsigned int        tpa;
+} BPF_PACKET_HEADER;
+
+/* IPv4 header (fixed 20-byte part; no options). */
+struct ip_t {
+  unsigned char   ver:4;           // byte 0
+  unsigned char   hlen:4;
+  unsigned char   tos;
+  unsigned short  tlen;
+  unsigned short  identification; // byte 4
+  unsigned short  ffo_unused:1;
+  unsigned short  df:1;
+  unsigned short  mf:1;
+  unsigned short  foffset:13;
+  unsigned char   ttl;             // byte 8
+  unsigned char   nextp;
+  unsigned short  hchecksum;
+  unsigned int    src;            // byte 12
+  unsigned int    dst;            // byte 16
+} BPF_PACKET_HEADER;
+
+/* ICMP header (type/code/checksum; rest-of-header not modeled). */
+struct icmp_t {
+  unsigned char   type;
+  unsigned char   code;
+  unsigned short  checksum;
+} BPF_PACKET_HEADER;
+
+/* IPv6 fixed header; the two 128-bit addresses are split into hi/lo 64-bit halves. */
+struct ip6_t {
+  unsigned int        ver:4;
+  unsigned int        priority:8;
+  unsigned int        flow_label:20;
+  unsigned short      payload_len;
+  unsigned char       next_header;
+  unsigned char       hop_limit;
+  unsigned long long  src_hi;
+  unsigned long long  src_lo;
+  unsigned long long  dst_hi;
+  unsigned long long  dst_lo;
+} BPF_PACKET_HEADER;
+
+/* IPv6 extension (options) header prefix. */
+struct ip6_opt_t {
+  unsigned char  next_header;
+  unsigned char  ext_len;
+  unsigned char  pad[6];
+} BPF_PACKET_HEADER;
+
+/* ICMPv6 header. */
+struct icmp6_t {
+  unsigned char   type;
+  unsigned char   code;
+  unsigned short  checksum;
+} BPF_PACKET_HEADER;
+
+/* UDP header. */
+struct udp_t {
+  unsigned short sport;
+  unsigned short dport;
+  unsigned short length;
+  unsigned short crc;
+} BPF_PACKET_HEADER;
+
+/* TCP header (fixed part) with individual flag bits broken out. */
+struct tcp_t {
+  unsigned short  src_port;   // byte 0
+  unsigned short  dst_port;
+  unsigned int    seq_num;    // byte 4
+  unsigned int    ack_num;    // byte 8
+  unsigned char   offset:4;    // byte 12
+  unsigned char   reserved:4;
+  unsigned char   flag_cwr:1;
+  unsigned char   flag_ece:1;
+  unsigned char   flag_urg:1;
+  unsigned char   flag_ack:1;
+  unsigned char   flag_psh:1;
+  unsigned char   flag_rst:1;
+  unsigned char   flag_syn:1;
+  unsigned char   flag_fin:1;
+  unsigned short  rcv_wnd;
+  unsigned short  cksum;      // byte 16
+  unsigned short  urg_ptr;
+} BPF_PACKET_HEADER;
+
+/* VXLAN header: I flag plus 24-bit VNI (key); remaining bits reserved. */
+struct vxlan_t {
+  unsigned int rsv1:4;
+  unsigned int iflag:1;
+  unsigned int rsv2:3;
+  unsigned int rsv3:24;
+  unsigned int key:24;
+  unsigned int rsv4:8;
+} BPF_PACKET_HEADER;
+
+/* VXLAN header with the Group Based Policy extension (G/D/A flags + 16-bit tag). */
+struct vxlan_gbp_t {
+  unsigned int gflag:1;
+  unsigned int rsv1:3;
+  unsigned int iflag:1;
+  unsigned int rsv2:3;
+  unsigned int rsv3:1;
+  unsigned int dflag:1;
+  unsigned int rsv4:1;
+  unsigned int aflag:1;
+  unsigned int rsv5:3;
+  unsigned int tag:16;
+  unsigned int key:24;
+  unsigned int rsv6:8;
+} BPF_PACKET_HEADER;
+
+#endif
+)********"
diff --git a/src/cc/exported_files.cc b/src/cc/exported_files.cc
new file mode 100644
index 0000000..b9818e1
--- /dev/null
+++ b/src/cc/exported_files.cc
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "exported_files.h"
+
+using std::map;
+using std::string;
+
+namespace ebpf {
+
+// c++11 feature for including raw string literals
+// see http://www.stroustrup.com/C++11FAQ.html#raw-strings
+
+// Map of virtual file path -> file contents. Each #include below pulls in a
+// header whose entire body is wrapped in a raw string literal, so the string
+// becomes the map value. NOTE(review): the "/virtual/..." keys are presumably
+// the paths under which the clang frontend resolves these headers — confirm.
+map<string, const char *> ExportedFiles::headers_ = {
+  {
+    "/virtual/include/bcc/bpf.h",
+    #include "compat/linux/virtual_bpf.h"
+  },
+  {
+    "/virtual/include/bcc/proto.h",
+    #include "export/proto.h"
+  },
+  {
+    "/virtual/include/bcc/helpers.h",
+    #include "export/helpers.h"
+  },
+  {
+    "/virtual/lib/clang/include/stdarg.h",
+    #include "clang/include/stdarg.h"
+  },
+};
+
+// Footer text exported under a virtual path, same mechanism as above.
+map<string, const char *> ExportedFiles::footers_ = {
+  {
+    "/virtual/include/bcc/footer.h",
+    #include "export/footer.h"
+  },
+};
+
+}  // namespace ebpf
diff --git a/src/cc/exported_files.h b/src/cc/exported_files.h
new file mode 100644
index 0000000..121e558
--- /dev/null
+++ b/src/cc/exported_files.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <string>
+
+namespace ebpf {
+
+// Read-only accessors for the built-in header/footer files (virtual file
+// path -> contents); the maps themselves are defined in exported_files.cc.
+class ExportedFiles {
+  static std::map<std::string, const char *> headers_;
+  static std::map<std::string, const char *> footers_;
+ public:
+  static const std::map<std::string, const char *> & headers() { return headers_; }
+  static const std::map<std::string, const char *> & footers() { return footers_; }
+};
+
+}  // namespace ebpf
diff --git a/src/cc/file_desc.h b/src/cc/file_desc.h
new file mode 100644
index 0000000..a55ba0b
--- /dev/null
+++ b/src/cc/file_desc.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <unistd.h>
+
+#include <utility>
+
+namespace ebpf {
+
+/// FileDesc is a helper class for managing open file descriptors. Copy is
+/// disallowed (call dup instead), and cleanup happens automatically.
+class FileDesc {
+ public:
+  // Takes ownership of |fd|; -1 means "no descriptor held".
+  explicit FileDesc(int fd = -1) : fd_(fd) {}
+  FileDesc(FileDesc &&that) : fd_(-1) { *this = std::move(that); }
+  FileDesc(const FileDesc &that) = delete;
+
+  ~FileDesc() {
+    if (fd_ >= 0)
+      ::close(fd_);
+  }
+
+  // Take ownership of a raw fd, closing any descriptor currently held.
+  FileDesc &operator=(int fd) {
+    if (fd_ >= 0)
+      ::close(fd_);
+    fd_ = fd;
+    return *this;
+  }
+  // Move assignment: steal the descriptor from |that|. Guarded against
+  // self-move, which previously closed the descriptor and left the object
+  // holding a dead fd value.
+  FileDesc &operator=(FileDesc &&that) {
+    if (this != &that) {
+      if (fd_ >= 0)
+        ::close(fd_);
+      fd_ = that.fd_;
+      that.fd_ = -1;
+    }
+    return *this;
+  }
+  FileDesc &operator=(const FileDesc &that) = delete;
+
+  // Duplicate the underlying descriptor. If none is held, ::dup(-1) fails
+  // (EBADF) and the returned FileDesc holds -1.
+  FileDesc dup() const {
+    int dup_fd = ::dup(fd_);
+    return FileDesc(dup_fd);
+  }
+
+  // Implicit conversion so a FileDesc can be passed straight to POSIX calls.
+  operator int() { return fd_; }
+  operator int() const { return fd_; }
+
+ private:
+  int fd_;  // owned descriptor, or -1 when empty
+};
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/CMakeLists.txt b/src/cc/frontends/CMakeLists.txt
new file mode 100644
index 0000000..cef6c3c
--- /dev/null
+++ b/src/cc/frontends/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# Build both compiler frontends: the legacy B language and the clang/C one.
+add_subdirectory(b)
+add_subdirectory(clang)
diff --git a/src/cc/frontends/b/CMakeLists.txt b/src/cc/frontends/b/CMakeLists.txt
new file mode 100644
index 0000000..391ab27
--- /dev/null
+++ b/src/cc/frontends/b/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# Generated parser/lexer sources land in the binary dir; headers are found in
+# both the binary and source dirs.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+BISON_TARGET(Parser parser.yy ${CMAKE_CURRENT_BINARY_DIR}/parser.yy.cc COMPILE_FLAGS "-o parser.yy.cc -v --debug")
+FLEX_TARGET(Lexer lexer.ll ${CMAKE_CURRENT_BINARY_DIR}/lexer.ll.cc COMPILE_FLAGS "--c++ --o lexer.ll.cc")
+ADD_FLEX_BISON_DEPENDENCY(Lexer Parser)
+# lexer.ll.cc is compiled as C++, so gate the clang-only warning suppression
+# on the C++ compiler id (MATCHES also covers AppleClang); the expansion is
+# quoted so the comparison is safe even if the variable is empty.
+if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
+  set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/lexer.ll.cc PROPERTIES COMPILE_FLAGS "-Wno-deprecated-register")
+endif()
+
+add_library(b_frontend STATIC loader.cc codegen_llvm.cc node.cc parser.cc printer.cc
+  type_check.cc ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS})
diff --git a/src/cc/frontends/b/codegen_llvm.cc b/src/cc/frontends/b/codegen_llvm.cc
new file mode 100644
index 0000000..dc9651b
--- /dev/null
+++ b/src/cc/frontends/b/codegen_llvm.cc
@@ -0,0 +1,1351 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <set>
+#include <algorithm>
+#include <sstream>
+
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/CallingConv.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/GlobalVariable.h>
+#include <llvm/IR/InlineAsm.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IRPrintingPasses.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+
+#include "bcc_exception.h"
+#include "codegen_llvm.h"
+#include "file_desc.h"
+#include "lexer.h"
+#include "libbpf.h"
+#include "linux/bpf.h"
+#include "table_storage.h"
+#include "type_helper.h"
+
+namespace ebpf {
+namespace cc {
+
+using namespace llvm;
+
+using std::for_each;
+using std::make_tuple;
+using std::map;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+using std::to_string;
+using std::vector;
+
+// can't forward declare IRBuilder in .h file (template with default
+// parameters), so cast it instead :(
+#define B (*((IRBuilder<> *)this->b_))
+
+// Helper class to push/pop the insert block. RAII: the constructor records
+// the current insertion point and moves it to |bb|; the destructor restores
+// the old point (or clears it if there was none).
+class BlockStack {
+ public:
+  explicit BlockStack(CodegenLLVM *cc, BasicBlock *bb)
+    : old_bb_(cc->b_->GetInsertBlock()), cc_(cc) {
+    cc_->b_->SetInsertPoint(bb);
+  }
+  ~BlockStack() {
+    if (old_bb_)
+      cc_->b_->SetInsertPoint(old_bb_);
+    else
+      cc_->b_->ClearInsertionPoint();
+  }
+ private:
+  BasicBlock *old_bb_;
+  CodegenLLVM *cc_;
+};
+
+// Helper class to push/pop switch statement insert block: swaps the codegen's
+// current SwitchInst for the scope's lifetime so nested switches nest cleanly.
+class SwitchStack {
+ public:
+  explicit SwitchStack(CodegenLLVM *cc, SwitchInst *sw)
+    : old_sw_(cc->cur_switch_), cc_(cc) {
+    cc_->cur_switch_ = sw;
+  }
+  ~SwitchStack() {
+    cc_->cur_switch_ = old_sw_;
+  }
+ private:
+  SwitchInst *old_sw_;
+  CodegenLLVM *cc_;
+};
+
+// The IRBuilder is heap-allocated here (see the B macro above: the header
+// cannot forward-declare the templated IRBuilder) and freed in the destructor.
+CodegenLLVM::CodegenLLVM(llvm::Module *mod, Scopes *scopes, Scopes *proto_scopes)
+  : out_(stdout), mod_(mod), indent_(0), tmp_reg_index_(0), scopes_(scopes),
+    proto_scopes_(proto_scopes), expr_(nullptr) {
+  b_ = new IRBuilder<>(ctx());
+}
+CodegenLLVM::~CodegenLLVM() {
+  delete b_;
+}
+
+// Legacy text emitter; both overloads are currently no-ops (bodies are
+// commented out), kept so existing emit() call sites still compile.
+template <typename... Args>
+void CodegenLLVM::emit(const char *fmt, Args&&... params) {
+  //fprintf(out_, fmt, std::forward<Args>(params)...);
+  //fflush(out_);
+}
+void CodegenLLVM::emit(const char *s) {
+  //fprintf(out_, "%s", s);
+  //fflush(out_);
+}
+
+// Visit a { ... } block: push the block's variable scope (if any), generate
+// code for each contained statement, then pop the scope.
+StatusTuple CodegenLLVM::visit_block_stmt_node(BlockStmtNode *n) {
+
+  // enter scope
+  if (n->scope_)
+    scopes_->push_var(n->scope_);
+
+  if (!n->stmts_.empty()) {
+    for (auto it = n->stmts_.begin(); it != n->stmts_.end(); ++it)
+      TRY2((*it)->accept(this));
+  }
+  // exit scope
+  if (n->scope_)
+    scopes_->pop_var();
+
+  return StatusTuple(0);
+}
+
+// Lower an if/else statement to a conditional branch across if.then /
+// if.else / if.end blocks. A branch to if.end is added only when the body
+// did not already end with its own terminator (e.g. a goto or return).
+StatusTuple CodegenLLVM::visit_if_stmt_node(IfStmtNode *n) {
+  Function *parent = B.GetInsertBlock()->getParent();
+  BasicBlock *label_then = BasicBlock::Create(ctx(), "if.then", parent);
+  BasicBlock *label_else = n->false_block_ ? BasicBlock::Create(ctx(), "if.else", parent) : nullptr;
+  BasicBlock *label_end = BasicBlock::Create(ctx(), "if.end", parent);
+
+  // condition is "true" when the produced value is non-zero/non-null
+  TRY2(n->cond_->accept(this));
+  Value *is_not_null = B.CreateIsNotNull(pop_expr());
+
+  if (n->false_block_)
+    B.CreateCondBr(is_not_null, label_then, label_else);
+  else
+    B.CreateCondBr(is_not_null, label_then, label_end);
+
+  {
+    BlockStack bstack(this, label_then);
+    TRY2(n->true_block_->accept(this));
+    if (!B.GetInsertBlock()->getTerminator())
+      B.CreateBr(label_end);
+  }
+
+  if (n->false_block_) {
+    BlockStack bstack(this, label_else);
+    TRY2(n->false_block_->accept(this));
+    if (!B.GetInsertBlock()->getTerminator())
+      B.CreateBr(label_end);
+  }
+
+  B.SetInsertPoint(label_end);
+
+  return StatusTuple(0);
+}
+
+// Lower an "on_valid" statement: execute the body when the condition value
+// is a non-null pointer, the else-block (if present) otherwise. Structure
+// mirrors visit_if_stmt_node.
+StatusTuple CodegenLLVM::visit_onvalid_stmt_node(OnValidStmtNode *n) {
+  TRY2(n->cond_->accept(this));
+
+  // renamed from is_null: the value holds CreateIsNotNull(...), so the old
+  // name contradicted the actual semantics
+  Value *is_not_null = B.CreateIsNotNull(pop_expr());
+
+  Function *parent = B.GetInsertBlock()->getParent();
+  BasicBlock *label_then = BasicBlock::Create(ctx(), "onvalid.then", parent);
+  BasicBlock *label_else = n->else_block_ ? BasicBlock::Create(ctx(), "onvalid.else", parent) : nullptr;
+  BasicBlock *label_end = BasicBlock::Create(ctx(), "onvalid.end", parent);
+
+  if (n->else_block_)
+    B.CreateCondBr(is_not_null, label_then, label_else);
+  else
+    B.CreateCondBr(is_not_null, label_then, label_end);
+
+  {
+    BlockStack bstack(this, label_then);
+    TRY2(n->block_->accept(this));
+    if (!B.GetInsertBlock()->getTerminator())
+      B.CreateBr(label_end);
+  }
+
+  if (n->else_block_) {
+    BlockStack bstack(this, label_else);
+    TRY2(n->else_block_->accept(this));
+    if (!B.GetInsertBlock()->getTerminator())
+      B.CreateBr(label_end);
+  }
+
+  B.SetInsertPoint(label_end);
+  return StatusTuple(0);
+}
+
+// Lower a switch statement: create the SwitchInst with a default block, then
+// let the case statements register themselves via cur_switch_ (pushed with
+// SwitchStack). If no case falls through to switch.end, the end block is
+// removed and codegen resumes at the DONE label.
+StatusTuple CodegenLLVM::visit_switch_stmt_node(SwitchStmtNode *n) {
+  Function *parent = B.GetInsertBlock()->getParent();
+  BasicBlock *label_default = BasicBlock::Create(ctx(), "switch.default", parent);
+  BasicBlock *label_end = BasicBlock::Create(ctx(), "switch.end", parent);
+  // switch (cond)
+  TRY2(n->cond_->accept(this));
+  SwitchInst *switch_inst = B.CreateSwitch(pop_expr(), label_default);
+  B.SetInsertPoint(label_end);
+  {
+    // case 1..N
+    SwitchStack sstack(this, switch_inst);
+    TRY2(n->block_->accept(this));
+  }
+  // if other cases are terminal, erase the end label
+  if (pred_empty(label_end)) {
+    B.SetInsertPoint(resolve_label("DONE"));
+    label_end->eraseFromParent();
+  }
+  return StatusTuple(0);
+}
+
+// Lower one case of the enclosing switch. A valued case adds an entry to the
+// SwitchInst; the valueless case reuses the switch's default block. Bodies
+// without a trailing goto fall through to the block that was current when the
+// case was visited (switch.end).
+StatusTuple CodegenLLVM::visit_case_stmt_node(CaseStmtNode *n) {
+  if (!cur_switch_) return mkstatus_(n, "no valid switch instruction");
+  Function *parent = B.GetInsertBlock()->getParent();
+  BasicBlock *label_end = B.GetInsertBlock();
+  BasicBlock *dest;
+  if (n->value_) {
+    TRY2(n->value_->accept(this));
+    dest = BasicBlock::Create(ctx(), "switch.case", parent);
+    // case labels are compared against the switch condition's integer type
+    Value *cond = B.CreateIntCast(pop_expr(), cur_switch_->getCondition()->getType(), false);
+    cur_switch_->addCase(cast<ConstantInt>(cond), dest);
+  } else {
+    dest = cur_switch_->getDefaultDest();
+  }
+  {
+    BlockStack bstack(this, dest);
+    TRY2(n->block_->accept(this));
+    // if no trailing goto, fall to end
+    if (!B.GetInsertBlock()->getTerminator())
+      B.CreateBr(label_end);
+  }
+  return StatusTuple(0);
+}
+
+// Resolve an identifier reference into an LLVM value in expr_. Pointer
+// identifiers with a subfield become a GEP (loaded unless used as an lvalue);
+// plain identifiers load from (or expose) their alloca slot in vars_.
+// The emit() calls are leftovers of the disabled text emitter (no-ops); the
+// branches that end in mkstatus_(..., "unsupported") are not implemented in
+// the LLVM backend.
+StatusTuple CodegenLLVM::visit_ident_expr_node(IdentExprNode *n) {
+  if (!n->decl_)
+    return mkstatus_(n, "variable lookup failed: %s", n->name_.c_str());
+  if (n->decl_->is_pointer()) {
+    if (n->sub_name_.size()) {
+      if (n->bitop_) {
+        // ident is holding a host endian number, don't use dext
+        if (n->is_lhs()) {
+          emit("%s%s->%s", n->decl_->scope_id(), n->c_str(), n->sub_name_.c_str());
+        } else {
+          emit("(((%s%s->%s) >> %d) & (((%s)1 << %d) - 1))", n->decl_->scope_id(), n->c_str(), n->sub_name_.c_str(),
+              n->bitop_->bit_offset_, bits_to_uint(n->bitop_->bit_width_ + 1), n->bitop_->bit_width_);
+        }
+        return mkstatus_(n, "unsupported");
+      } else {
+        if (n->struct_type_->id_->name_ == "_Packet" && n->sub_name_.substr(0, 3) == "arg") {
+          // convert arg1~arg8 into args[0]~args[7] assuming type_check verified the range already
+          auto arg_num = stoi(n->sub_name_.substr(3, 3));
+          if (arg_num < 5) {
+            emit("%s%s->args_lo[%d]", n->decl_->scope_id(), n->c_str(), arg_num - 1);
+          } else {
+            emit("%s%s->args_hi[%d]", n->decl_->scope_id(), n->c_str(), arg_num - 5);
+          }
+          return mkstatus_(n, "unsupported");
+        } else {
+          emit("%s%s->%s", n->decl_->scope_id(), n->c_str(), n->sub_name_.c_str());
+          auto it = vars_.find(n->decl_);
+          if (it == vars_.end()) return mkstatus_(n, "Cannot locate variable %s in vars_ table", n->c_str());
+          // load the struct pointer, then GEP to the subfield's slot
+          LoadInst *load_1 = B.CreateLoad(it->second);
+          vector<Value *> indices({B.getInt32(0), B.getInt32(n->sub_decl_->slot_)});
+          expr_ = B.CreateInBoundsGEP(load_1, indices);
+          if (!n->is_lhs())
+            expr_ = B.CreateLoad(pop_expr());
+        }
+      }
+    } else {
+      auto it = vars_.find(n->decl_);
+      if (it == vars_.end()) return mkstatus_(n, "Cannot locate variable %s in vars_ table", n->c_str());
+      // lvalue use yields the slot itself; rvalue use loads from it
+      expr_ = n->is_lhs() ? it->second : (Value *)B.CreateLoad(it->second);
+    }
+  } else {
+    if (n->sub_name_.size()) {
+      emit("%s%s.%s", n->decl_->scope_id(), n->c_str(), n->sub_name_.c_str());
+      auto it = vars_.find(n->decl_);
+      if (it == vars_.end()) return mkstatus_(n, "Cannot locate variable %s in vars_ table", n->c_str());
+      vector<Value *> indices({const_int(0), const_int(n->sub_decl_->slot_, 32)});
+      expr_ = B.CreateGEP(nullptr, it->second, indices);
+      if (!n->is_lhs())
+        expr_ = B.CreateLoad(pop_expr());
+    } else {
+      if (n->bitop_) {
+        // ident is holding a host endian number, don't use dext
+        if (n->is_lhs())
+          return mkstatus_(n, "illegal: ident %s is a left-hand-side type", n->name_.c_str());
+        if (n->decl_->is_struct())
+          return mkstatus_(n, "illegal: can only take bitop of a struct subfield");
+        emit("(((%s%s) >> %d) & (((%s)1 << %d) - 1))", n->decl_->scope_id(), n->c_str(),
+             n->bitop_->bit_offset_, bits_to_uint(n->bitop_->bit_width_ + 1), n->bitop_->bit_width_);
+      } else {
+        emit("%s%s", n->decl_->scope_id(), n->c_str());
+        auto it = vars_.find(n->decl_);
+        if (it == vars_.end()) return mkstatus_(n, "Cannot locate variable %s in vars_ table", n->c_str());
+        if (n->is_lhs() || n->decl_->is_struct())
+          expr_ = it->second;
+        else
+          expr_ = B.CreateLoad(it->second);
+      }
+    }
+  }
+  return StatusTuple(0);
+}
+
+// Lower an assignment. Bitop targets and PROTO lhs are unsupported in the
+// LLVM backend (the emit() text paths are dead). Packet lhs delegates the
+// store to visit_packet_expr_node with the rhs left in expr_; otherwise the
+// rhs is cast to the lhs slot's element type and stored.
+StatusTuple CodegenLLVM::visit_assign_expr_node(AssignExprNode *n) {
+  if (n->bitop_) {
+    TRY2(n->lhs_->accept(this));
+    emit(" = (");
+    TRY2(n->lhs_->accept(this));
+    emit(" & ~((((%s)1 << %d) - 1) << %d)) | (", bits_to_uint(n->lhs_->bit_width_),
+         n->bitop_->bit_width_, n->bitop_->bit_offset_);
+    TRY2(n->rhs_->accept(this));
+    emit(" << %d)", n->bitop_->bit_offset_);
+    return mkstatus_(n, "unsupported");
+  } else {
+    if (n->lhs_->flags_[ExprNode::PROTO]) {
+      // auto f = n->lhs_->struct_type_->field(n->id_->sub_name_);
+      // emit("bpf_dins(%s%s + %zu, %zu, %zu, ", n->id_->decl_->scope_id(), n->id_->c_str(),
+      //      f->bit_offset_ >> 3, f->bit_offset_ & 0x7, f->bit_width_);
+      // TRY2(n->rhs_->accept(this));
+      // emit(")");
+      return mkstatus_(n, "unsupported");
+    } else {
+      TRY2(n->rhs_->accept(this));
+      if (n->lhs_->is_pkt()) {
+        // packet lhs consumes expr_ (the rhs) inside visit_packet_expr_node
+        TRY2(n->lhs_->accept(this));
+      } else {
+        Value *rhs = pop_expr();
+        TRY2(n->lhs_->accept(this));
+        Value *lhs = pop_expr();
+        if (!n->rhs_->is_ref())
+          rhs = B.CreateIntCast(rhs, cast<PointerType>(lhs->getType())->getElementType(), false);
+        B.CreateStore(rhs, lhs);
+      }
+    }
+  }
+  return StatusTuple(0);
+}
+
+// Look up |name| in |scope| and return both its declaration and its alloca
+// slot from vars_; errors if either is missing.
+StatusTuple CodegenLLVM::lookup_var(Node *n, const string &name, Scopes::VarScope *scope,
+                                    VariableDeclStmtNode **decl, Value **mem) const {
+  *decl = scope->lookup(name, SCOPE_GLOBAL);
+  if (!*decl) return mkstatus_(n, "cannot find %s variable", name.c_str());
+  auto it = vars_.find(*decl);
+  if (it == vars_.end()) return mkstatus_(n, "unable to find %s memory location", name.c_str());
+  *mem = it->second;
+  return StatusTuple(0);
+}
+
+// Lower a packet field access ($proto.field). Reference use produces the
+// field's byte offset within the packet; lvalue use stores expr_ (set by the
+// caller, see visit_assign_expr_node) via bpf_dins_pkt; rvalue use loads the
+// field via bpf_dext_pkt. Bit offsets/widths come from the protocol scope,
+// optionally narrowed by a bitop.
+StatusTuple CodegenLLVM::visit_packet_expr_node(PacketExprNode *n) {
+  auto p = proto_scopes_->top_struct()->lookup(n->id_->name_, true);
+  VariableDeclStmtNode *offset_decl, *skb_decl;
+  Value *offset_mem, *skb_mem;
+  // "skb" and "$<proto>" (the header's running offset) must already be in scope
+  TRY2(lookup_var(n, "skb", scopes_->current_var(), &skb_decl, &skb_mem));
+  TRY2(lookup_var(n, "$" + n->id_->name_, scopes_->current_var(), &offset_decl, &offset_mem));
+
+  if (p) {
+    auto f = p->field(n->id_->sub_name_);
+    if (f) {
+      size_t bit_offset = f->bit_offset_;
+      size_t bit_width = f->bit_width_;
+      if (n->bitop_) {
+        // narrow to the requested sub-range of the field
+        bit_offset += f->bit_width_ - (n->bitop_->bit_offset_ + n->bitop_->bit_width_);
+        bit_width = std::min(bit_width - n->bitop_->bit_offset_, n->bitop_->bit_width_);
+      }
+      if (n->is_ref()) {
+        // e.g.: @ip.hchecksum, return offset of the header within packet
+        LoadInst *offset_ptr = B.CreateLoad(offset_mem);
+        Value *skb_hdr_offset = B.CreateAdd(offset_ptr, B.getInt64(bit_offset >> 3));
+        expr_ = B.CreateIntCast(skb_hdr_offset, B.getInt64Ty(), false);
+      } else if (n->is_lhs()) {
+        emit("bpf_dins_pkt(pkt, %s + %zu, %zu, %zu, ", n->id_->c_str(), bit_offset >> 3, bit_offset & 0x7, bit_width);
+        Function *store_fn = mod_->getFunction("bpf_dins_pkt");
+        if (!store_fn) return mkstatus_(n, "unable to find function bpf_dins_pkt");
+        LoadInst *skb_ptr = B.CreateLoad(skb_mem);
+        Value *skb_ptr8 = B.CreateBitCast(skb_ptr, B.getInt8PtrTy());
+        LoadInst *offset_ptr = B.CreateLoad(offset_mem);
+        Value *skb_hdr_offset = B.CreateAdd(offset_ptr, B.getInt64(bit_offset >> 3));
+        Value *rhs = B.CreateIntCast(pop_expr(), B.getInt64Ty(), false);
+        B.CreateCall(store_fn, vector<Value *>({skb_ptr8, skb_hdr_offset, B.getInt64(bit_offset & 0x7),
+                                               B.getInt64(bit_width), rhs}));
+      } else {
+        emit("bpf_dext_pkt(pkt, %s + %zu, %zu, %zu)", n->id_->c_str(), bit_offset >> 3, bit_offset & 0x7, bit_width);
+        Function *load_fn = mod_->getFunction("bpf_dext_pkt");
+        if (!load_fn) return mkstatus_(n, "unable to find function bpf_dext_pkt");
+        LoadInst *skb_ptr = B.CreateLoad(skb_mem);
+        Value *skb_ptr8 = B.CreateBitCast(skb_ptr, B.getInt8PtrTy());
+        LoadInst *offset_ptr = B.CreateLoad(offset_mem);
+        Value *skb_hdr_offset = B.CreateAdd(offset_ptr, B.getInt64(bit_offset >> 3));
+        expr_ = B.CreateCall(load_fn, vector<Value *>({skb_ptr8, skb_hdr_offset,
+                                                      B.getInt64(bit_offset & 0x7), B.getInt64(bit_width)}));
+        // this generates extra trunc insns whereas the bpf.load fns already
+        // trunc the values internally in the bpf interpeter
+        //expr_ = B.CreateTrunc(pop_expr(), B.getIntNTy(bit_width));
+      }
+    } else {
+      emit("pkt->start + pkt->offset + %s", n->id_->c_str());
+      return mkstatus_(n, "unsupported");
+    }
+  }
+  return StatusTuple(0);
+}
+
+// Lower an integer literal; the textual value is parsed with base
+// auto-detection (0x/0 prefixes) and optionally cast to an explicit bit width.
+StatusTuple CodegenLLVM::visit_integer_expr_node(IntegerExprNode *n) {
+  APInt val;
+  StringRef(n->val_).getAsInteger(0, val);
+  expr_ = ConstantInt::get(mod_->getContext(), val);
+  if (n->bits_)
+    expr_ = B.CreateIntCast(expr_, B.getIntNTy(n->bits_), false);
+  return StatusTuple(0);
+}
+
+// Lower a string literal: copy the global constant (including the NUL
+// terminator) into stack storage and leave the pointer in expr_.
+StatusTuple CodegenLLVM::visit_string_expr_node(StringExprNode *n) {
+  if (n->is_lhs()) return mkstatus_(n, "cannot assign to a string");
+
+  Value *global = B.CreateGlobalString(n->val_);
+  Value *ptr = make_alloca(resolve_entry_stack(), B.getInt8Ty(), "",
+                           B.getInt64(n->val_.size() + 1));
+#if LLVM_MAJOR_VERSION >= 7
+  // LLVM 7 changed CreateMemCpy's signature (per-operand alignment)
+  B.CreateMemCpy(ptr, 1, global, 1, n->val_.size() + 1);
+#else
+  B.CreateMemCpy(ptr, global, n->val_.size() + 1, 1);
+#endif
+  expr_ = ptr;
+
+  return StatusTuple(0);
+}
+
+// Lower "a && b" with short-circuit control flow: the rhs block only runs
+// when the lhs is non-zero; the result is merged through a phi node.
+StatusTuple CodegenLLVM::emit_short_circuit_and(BinopExprNode *n) {
+  Function *parent = B.GetInsertBlock()->getParent();
+  BasicBlock *label_start = B.GetInsertBlock();
+  BasicBlock *label_then = BasicBlock::Create(ctx(), "and.then", parent);
+  BasicBlock *label_end = BasicBlock::Create(ctx(), "and.end", parent);
+
+  TRY2(n->lhs_->accept(this));
+  Value *neq_zero = B.CreateICmpNE(pop_expr(), B.getIntN(n->lhs_->bit_width_, 0));
+  B.CreateCondBr(neq_zero, label_then, label_end);
+
+  {
+    BlockStack bstack(this, label_then);
+    TRY2(n->rhs_->accept(this));
+    expr_ = B.CreateICmpNE(pop_expr(), B.getIntN(n->rhs_->bit_width_, 0));
+    B.CreateBr(label_end);
+  }
+
+  B.SetInsertPoint(label_end);
+
+  // false when the lhs short-circuited, rhs != 0 otherwise
+  PHINode *phi = B.CreatePHI(B.getInt1Ty(), 2);
+  phi->addIncoming(B.getFalse(), label_start);
+  phi->addIncoming(pop_expr(), label_then);
+  expr_ = phi;
+
+  return StatusTuple(0);
+}
+
+// Lower "a || b": mirror image of emit_short_circuit_and — the rhs block
+// only runs when the lhs is zero.
+StatusTuple CodegenLLVM::emit_short_circuit_or(BinopExprNode *n) {
+  Function *parent = B.GetInsertBlock()->getParent();
+  BasicBlock *label_start = B.GetInsertBlock();
+  BasicBlock *label_then = BasicBlock::Create(ctx(), "or.then", parent);
+  BasicBlock *label_end = BasicBlock::Create(ctx(), "or.end", parent);
+
+  TRY2(n->lhs_->accept(this));
+  Value *neq_zero = B.CreateICmpNE(pop_expr(), B.getIntN(n->lhs_->bit_width_, 0));
+  B.CreateCondBr(neq_zero, label_end, label_then);
+
+  {
+    BlockStack bstack(this, label_then);
+    TRY2(n->rhs_->accept(this));
+    expr_ = B.CreateICmpNE(pop_expr(), B.getIntN(n->rhs_->bit_width_, 0));
+    B.CreateBr(label_end);
+  }
+
+  B.SetInsertPoint(label_end);
+
+  // true when the lhs short-circuited, rhs != 0 otherwise
+  PHINode *phi = B.CreatePHI(B.getInt1Ty(), 2);
+  phi->addIncoming(B.getTrue(), label_start);
+  phi->addIncoming(pop_expr(), label_then);
+  expr_ = phi;
+
+  return StatusTuple(0);
+}
+
+// Lower a binary operation. Logical &&/|| take the short-circuit paths;
+// everything else evaluates both sides, casts the rhs to the lhs type
+// (unsigned), and maps the token to the corresponding LLVM instruction.
+// Note: comparisons and division/remainder are unsigned throughout.
+StatusTuple CodegenLLVM::visit_binop_expr_node(BinopExprNode *n) {
+  if (n->op_ == Tok::TAND)
+    return emit_short_circuit_and(n);
+  if (n->op_ == Tok::TOR)
+    return emit_short_circuit_or(n);
+
+  TRY2(n->lhs_->accept(this));
+  Value *lhs = pop_expr();
+  TRY2(n->rhs_->accept(this));
+  Value *rhs = B.CreateIntCast(pop_expr(), lhs->getType(), false);
+  switch (n->op_) {
+    case Tok::TCEQ: expr_ = B.CreateICmpEQ(lhs, rhs); break;
+    case Tok::TCNE: expr_ = B.CreateICmpNE(lhs, rhs); break;
+    case Tok::TXOR: expr_ = B.CreateXor(lhs, rhs); break;
+    case Tok::TMOD: expr_ = B.CreateURem(lhs, rhs); break;
+    case Tok::TCLT: expr_ = B.CreateICmpULT(lhs, rhs); break;
+    case Tok::TCLE: expr_ = B.CreateICmpULE(lhs, rhs); break;
+    case Tok::TCGT: expr_ = B.CreateICmpUGT(lhs, rhs); break;
+    case Tok::TCGE: expr_ = B.CreateICmpUGE(lhs, rhs); break;
+    case Tok::TPLUS: expr_ = B.CreateAdd(lhs, rhs); break;
+    case Tok::TMINUS: expr_ = B.CreateSub(lhs, rhs); break;
+    case Tok::TLAND: expr_ = B.CreateAnd(lhs, rhs); break;
+    case Tok::TLOR: expr_ = B.CreateOr(lhs, rhs); break;
+    default: return mkstatus_(n, "unsupported binary operator");
+  }
+  return StatusTuple(0);
+}
+
+// Lower a unary operation; unknown operators leave expr_ untouched.
+StatusTuple CodegenLLVM::visit_unop_expr_node(UnopExprNode *n) {
+  TRY2(n->expr_->accept(this));
+  switch (n->op_) {
+    case Tok::TNOT: expr_ = B.CreateNot(pop_expr()); break;
+    case Tok::TCMPL: expr_ = B.CreateNeg(pop_expr()); break;
+    default: {}
+  }
+  return StatusTuple(0);
+}
+
+// Bitops are handled at their use sites (ident/packet visitors); nothing to
+// generate for the node itself.
+StatusTuple CodegenLLVM::visit_bitop_expr_node(BitopExprNode *n) {
+  return StatusTuple(0);
+}
+
+// Lower a goto: resolve the target label, honoring proto_rewrites_ overrides
+// (per-target first, then the "" default). Otherwise look the state up in the
+// current scope ("_continue" suffix for continue-style gotos), falling back
+// to the EOP state when the name is unknown. "goto DONE" is rejected.
+StatusTuple CodegenLLVM::visit_goto_expr_node(GotoExprNode *n) {
+  if (n->id_->name_ == "DONE") {
+    return mkstatus_(n, "use return statement instead");
+  }
+  string jump_label;
+  // when dealing with multistates, goto statements may be overridden
+  auto rewrite_it = proto_rewrites_.find(n->id_->full_name());
+  auto default_it = proto_rewrites_.find("");
+  if (rewrite_it != proto_rewrites_.end()) {
+    jump_label = rewrite_it->second;
+  } else if (default_it != proto_rewrites_.end()) {
+    jump_label = default_it->second;
+  } else {
+    auto state = scopes_->current_state()->lookup(n->id_->full_name(), false);
+    if (state) {
+      jump_label = state->scoped_name();
+      if (n->is_continue_) {
+        jump_label += "_continue";
+      }
+    } else {
+      state = scopes_->current_state()->lookup("EOP", false);
+      if (state) {
+        jump_label = state->scoped_name();
+      }
+    }
+  }
+  B.CreateBr(resolve_label(jump_label));
+  return StatusTuple(0);
+}
+
+// Lower a return: store the (sign-extended) value into the function's retval
+// slot and branch to the shared DONE label rather than emitting ret directly.
+StatusTuple CodegenLLVM::visit_return_expr_node(ReturnExprNode *n) {
+  TRY2(n->expr_->accept(this));
+  Function *parent = B.GetInsertBlock()->getParent();
+  Value *cast_1 = B.CreateIntCast(pop_expr(), parent->getReturnType(), true);
+  B.CreateStore(cast_1, retval_);
+  B.CreateBr(resolve_label("DONE"));
+  return StatusTuple(0);
+}
+
+// Lower table.lookup(key[, leaf]): materialize the map fd via the
+// llvm.bpf.pseudo intrinsic (BPF_PSEUDO_MAP_FD relocation), call
+// bpf_map_lookup_elem_ with the key pointer, and — for FIXED_MATCH/INDEXED
+// tables with a second argument — store the result pointer into the caller's
+// leaf variable after checking the leaf type matches the table's leaf type.
+StatusTuple CodegenLLVM::emit_table_lookup(MethodCallExprNode *n) {
+  TableDeclStmtNode* table = scopes_->top_table()->lookup(n->id_->name_);
+  IdentExprNode* arg0 = static_cast<IdentExprNode*>(n->args_.at(0).get());
+  IdentExprNode* arg1;
+  StructVariableDeclStmtNode* arg1_type;
+
+  auto table_fd_it = table_fds_.find(table);
+  if (table_fd_it == table_fds_.end())
+    return mkstatus_(n, "unable to find table %s in table_fds_", n->id_->c_str());
+
+  Function *pseudo_fn = mod_->getFunction("llvm.bpf.pseudo");
+  if (!pseudo_fn) return mkstatus_(n, "pseudo fd loader doesn't exist");
+  Function *lookup_fn = mod_->getFunction("bpf_map_lookup_elem_");
+  if (!lookup_fn) return mkstatus_(n, "bpf_map_lookup_elem_ undefined");
+
+  CallInst *pseudo_call = B.CreateCall(pseudo_fn, vector<Value *>({B.getInt64(BPF_PSEUDO_MAP_FD),
+                                                                  B.getInt64(table_fd_it->second)}));
+  Value *pseudo_map_fd = pseudo_call;
+
+  TRY2(arg0->accept(this));
+  Value *key_ptr = B.CreateBitCast(pop_expr(), B.getInt8PtrTy());
+
+  expr_ = B.CreateCall(lookup_fn, vector<Value *>({pseudo_map_fd, key_ptr}));
+
+  if (table->type_id()->name_ == "FIXED_MATCH" || table->type_id()->name_ == "INDEXED") {
+    if (n->args_.size() == 2) {
+      arg1 = static_cast<IdentExprNode*>(n->args_.at(1).get());
+      arg1_type = static_cast<StructVariableDeclStmtNode*>(arg1->decl_);
+      if (table->leaf_id()->name_ != arg1_type->struct_id_->name_) {
+        return mkstatus_(n, "lookup pointer type mismatch %s != %s", table->leaf_id()->c_str(),
+                        arg1_type->struct_id_->c_str());
+      }
+      auto it = vars_.find(arg1_type);
+      if (it == vars_.end()) return mkstatus_(n, "Cannot locate variable %s in vars_ table", n->id_->c_str());
+      // cast the lookup result to the leaf pointer type and store into the slot
+      expr_ = B.CreateBitCast(pop_expr(), cast<PointerType>(it->second->getType())->getElementType());
+      B.CreateStore(pop_expr(), it->second);
+    }
+  } else {
+    return mkstatus_(n, "lookup in table type %s unsupported", table->type_id()->c_str());
+  }
+  return StatusTuple(0);
+}
+
+// table.update(key, value): emit bpf_map_update_elem_(map, key, value, BPF_ANY)
+// (create-or-replace) against this table's pseudo map fd.
+// Only FIXED_MATCH and INDEXED table types are supported.
+StatusTuple CodegenLLVM::emit_table_update(MethodCallExprNode *n) {
+  TableDeclStmtNode* table = scopes_->top_table()->lookup(n->id_->name_);
+  IdentExprNode* arg0 = static_cast<IdentExprNode*>(n->args_.at(0).get());
+  IdentExprNode* arg1 = static_cast<IdentExprNode*>(n->args_.at(1).get());
+
+  auto table_fd_it = table_fds_.find(table);
+  if (table_fd_it == table_fds_.end())
+    return mkstatus_(n, "unable to find table %s in table_fds_", n->id_->c_str());
+  Function *pseudo_fn = mod_->getFunction("llvm.bpf.pseudo");
+  if (!pseudo_fn) return mkstatus_(n, "pseudo fd loader doesn't exist");
+  Function *update_fn = mod_->getFunction("bpf_map_update_elem_");
+  if (!update_fn) return mkstatus_(n, "bpf_map_update_elem_ undefined");
+
+  // materialize the map fd via the llvm.bpf.pseudo relocation intrinsic
+  CallInst *pseudo_call = B.CreateCall(pseudo_fn, vector<Value *>({B.getInt64(BPF_PSEUDO_MAP_FD),
+                                        B.getInt64(table_fd_it->second)}));
+  Value *pseudo_map_fd = pseudo_call;
+
+  TRY2(arg0->accept(this));
+  Value *key_ptr = B.CreateBitCast(pop_expr(), B.getInt8PtrTy());
+
+  if (table->type_id()->name_ == "FIXED_MATCH" || table->type_id()->name_ == "INDEXED") {
+    TRY2(arg1->accept(this));
+    Value *value_ptr = B.CreateBitCast(pop_expr(), B.getInt8PtrTy());
+
+    expr_ = B.CreateCall(update_fn, vector<Value *>({pseudo_map_fd, key_ptr, value_ptr, B.getInt64(BPF_ANY)}));
+  } else {
+    return mkstatus_(n, "unsupported");
+  }
+  return StatusTuple(0);
+}
+
+// table.delete(key): emit a map-delete helper call against this table's
+// pseudo map fd.  Only FIXED_MATCH and INDEXED table types are supported.
+StatusTuple CodegenLLVM::emit_table_delete(MethodCallExprNode *n) {
+  TableDeclStmtNode* table = scopes_->top_table()->lookup(n->id_->name_);
+  IdentExprNode* arg0 = static_cast<IdentExprNode*>(n->args_.at(0).get());
+
+  auto table_fd_it = table_fds_.find(table);
+  if (table_fd_it == table_fds_.end())
+    return mkstatus_(n, "unable to find table %s in table_fds_", n->id_->c_str());
+  Function *pseudo_fn = mod_->getFunction("llvm.bpf.pseudo");
+  if (!pseudo_fn) return mkstatus_(n, "pseudo fd loader doesn't exist");
+  // bugfix: delete must use the delete helper, not bpf_map_update_elem_.
+  // The two-argument (map, key) call below matches bpf_map_delete_elem_'s
+  // signature; bpf_map_update_elem_ takes (map, key, value, flags) as seen
+  // in emit_table_update.
+  Function *delete_fn = mod_->getFunction("bpf_map_delete_elem_");
+  if (!delete_fn) return mkstatus_(n, "bpf_map_delete_elem_ undefined");
+
+  // materialize the map fd via the llvm.bpf.pseudo relocation intrinsic
+  CallInst *pseudo_call = B.CreateCall(pseudo_fn, vector<Value *>({B.getInt64(BPF_PSEUDO_MAP_FD),
+                                        B.getInt64(table_fd_it->second)}));
+  Value *pseudo_map_fd = pseudo_call;
+
+  TRY2(arg0->accept(this));
+  Value *key_ptr = B.CreateBitCast(pop_expr(), B.getInt8PtrTy());
+
+  if (table->type_id()->name_ == "FIXED_MATCH" || table->type_id()->name_ == "INDEXED") {
+    expr_ = B.CreateCall(delete_fn, vector<Value *>({pseudo_map_fd, key_ptr}));
+  } else {
+    return mkstatus_(n, "unsupported");
+  }
+  return StatusTuple(0);
+}
+
+// log(fmt, ...): lower to bpf_trace_printk(fmt, sizeof(fmt), ...).  The first
+// argument is the format string; its byte size is derived from bit_width_
+// (+1, presumably for the NUL terminator — TODO confirm against the lexer).
+StatusTuple CodegenLLVM::emit_log(MethodCallExprNode *n) {
+  vector<Value *> args;
+  auto arg = n->args_.begin();
+  TRY2((*arg)->accept(this));
+  args.push_back(pop_expr());
+  args.push_back(B.getInt64(((*arg)->bit_width_ >> 3) + 1));
+  ++arg;
+  // remaining varargs are passed through unchanged
+  for (; arg != n->args_.end(); ++arg) {
+    TRY2((*arg)->accept(this));
+    args.push_back(pop_expr());
+  }
+
+  // int bpf_trace_printk(fmt, sizeof(fmt), ...)
+  // helpers are called by casting the helper id to a function pointer
+  FunctionType *printk_fn_type = FunctionType::get(B.getInt32Ty(), vector<Type *>({B.getInt8PtrTy(), B.getInt64Ty()}), true);
+  Value *printk_fn = B.CreateIntToPtr(B.getInt64(BPF_FUNC_trace_printk),
+                                         PointerType::getUnqual(printk_fn_type));
+
+  expr_ = B.CreateCall(printk_fn, args);
+  return StatusTuple(0);
+}
+
+// pkt.rewrite_field(field, value): evaluate the value (args_[1]) first so it
+// is on the expression stack when the field lvalue (args_[0]) is visited;
+// the store itself is emitted by the field's visitor.
+StatusTuple CodegenLLVM::emit_packet_rewrite_field(MethodCallExprNode *n) {
+  TRY2(n->args_[1]->accept(this));
+  TRY2(n->args_[0]->accept(this));
+  return StatusTuple(0);
+}
+
+// atomic_add(ptr, val): emit an atomicrmw add on a 64-bit slot.  The result
+// value is discarded (expr_ is not set); only the in-memory side effect is
+// kept.
+StatusTuple CodegenLLVM::emit_atomic_add(MethodCallExprNode *n) {
+  TRY2(n->args_[0]->accept(this));
+  Value *lhs = B.CreateBitCast(pop_expr(), Type::getInt64PtrTy(ctx()));
+  TRY2(n->args_[1]->accept(this));
+  // sign-extend the addend to 64 bits to match the target slot
+  Value *rhs = B.CreateSExt(pop_expr(), B.getInt64Ty());
+  AtomicRMWInst *atomic_inst = B.CreateAtomicRMW(
+      AtomicRMWInst::Add, lhs, rhs, AtomicOrdering::SequentiallyConsistent);
+  atomic_inst->setVolatile(false);
+  return StatusTuple(0);
+}
+
+// incr_cksum(offset, old, new[, is_pseudo]): incrementally fix a packet
+// checksum via the bpf_l3/l4_csum_replace_ helpers.  The optional 4th
+// argument selects the L4 variant and sets the pseudo-header flag bit.
+StatusTuple CodegenLLVM::emit_incr_cksum(MethodCallExprNode *n, size_t sz) {
+  Value *is_pseudo;
+  string csum_fn_str;
+  if (n->args_.size() == 4) {
+    TRY2(n->args_[3]->accept(this));
+    // normalize any non-zero pseudo flag to 1 as an i64
+    is_pseudo = B.CreateIntCast(B.CreateIsNotNull(pop_expr()), B.getInt64Ty(), false);
+    csum_fn_str = "bpf_l4_csum_replace_";
+  } else {
+    is_pseudo = B.getInt64(0);
+    csum_fn_str = "bpf_l3_csum_replace_";
+  }
+
+  // evaluate in reverse argument order: new, old, offset
+  TRY2(n->args_[2]->accept(this));
+  Value *new_val = B.CreateZExt(pop_expr(), B.getInt64Ty());
+  TRY2(n->args_[1]->accept(this));
+  Value *old_val = B.CreateZExt(pop_expr(), B.getInt64Ty());
+  TRY2(n->args_[0]->accept(this));
+  Value *offset = B.CreateZExt(pop_expr(), B.getInt64Ty());
+
+  Function *csum_fn = mod_->getFunction(csum_fn_str);
+  if (!csum_fn) return mkstatus_(n, "Undefined built-in %s", csum_fn_str.c_str());
+
+  // flags = (is_pseudo << 4) | sizeof(old_val)
+  // sz, when non-zero, overrides the size derived from the old value's width
+  Value *flags_lower = B.getInt64(sz ? sz : bits_to_size(n->args_[1]->bit_width_));
+  Value *flags_upper = B.CreateShl(is_pseudo, B.getInt64(4));
+  Value *flags = B.CreateOr(flags_upper, flags_lower);
+
+  // the helpers need the skb context pointer as their first argument
+  VariableDeclStmtNode *skb_decl;
+  Value *skb_mem;
+  TRY2(lookup_var(n, "skb", scopes_->current_var(), &skb_decl, &skb_mem));
+  LoadInst *skb_ptr = B.CreateLoad(skb_mem);
+  Value *skb_ptr8 = B.CreateBitCast(skb_ptr, B.getInt8PtrTy());
+
+  expr_ = B.CreateCall(csum_fn, vector<Value *>({skb_ptr8, offset, old_val, new_val, flags}));
+  return StatusTuple(0);
+}
+
+// get_usec_time(): intentionally a no-op stub in the LLVM backend; nothing is
+// emitted and expr_ is left untouched.
+StatusTuple CodegenLLVM::emit_get_usec_time(MethodCallExprNode *n) {
+  return StatusTuple(0);
+}
+
+// Dispatch a method call: calls of the form obj.sub() go to the table/packet
+// emitters; bare built-ins dispatch on the name.  After the call, the
+// attached block (e.g. an on_match/on_miss body) is visited.
+// NOTE(review): an unrecognized sub_name_ falls through without error and
+// still visits block_ — confirm this silent pass-through is intended.
+StatusTuple CodegenLLVM::visit_method_call_expr_node(MethodCallExprNode *n) {
+  if (n->id_->sub_name_.size()) {
+    if (n->id_->sub_name_ == "lookup") {
+      TRY2(emit_table_lookup(n));
+    } else if (n->id_->sub_name_ == "update") {
+      TRY2(emit_table_update(n));
+    } else if (n->id_->sub_name_ == "delete") {
+      TRY2(emit_table_delete(n));
+    } else if (n->id_->sub_name_ == "rewrite_field" && n->id_->name_ == "pkt") {
+      TRY2(emit_packet_rewrite_field(n));
+    }
+  } else if (n->id_->name_ == "atomic_add") {
+    TRY2(emit_atomic_add(n));
+  } else if (n->id_->name_ == "log") {
+    TRY2(emit_log(n));
+  } else if (n->id_->name_ == "incr_cksum") {
+    TRY2(emit_incr_cksum(n));
+  } else if (n->id_->name_ == "get_usec_time") {
+    TRY2(emit_get_usec_time(n));
+  } else {
+    return mkstatus_(n, "unsupported");
+  }
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+/* result = lookup(key)
+ * if (!result) {
+ *   update(key, {0}, BPF_NOEXIST)
+ *   result = lookup(key)
+ * }
+ */
+// table[key] indexing.  Per the comment above: lookup, and for AUTO-policy
+// tables insert a zeroed leaf and retry on miss, joining both paths with a
+// PHI.  As an lvalue for a sub-field, a null result aborts the program with
+// retval 2 instead of dereferencing.
+StatusTuple CodegenLLVM::visit_table_index_expr_node(TableIndexExprNode *n) {
+  auto table_fd_it = table_fds_.find(n->table_);
+  if (table_fd_it == table_fds_.end())
+    return mkstatus_(n, "unable to find table %s in table_fds_", n->id_->c_str());
+
+  Function *pseudo_fn = mod_->getFunction("llvm.bpf.pseudo");
+  if (!pseudo_fn) return mkstatus_(n, "pseudo fd loader doesn't exist");
+  Function *update_fn = mod_->getFunction("bpf_map_update_elem_");
+  if (!update_fn) return mkstatus_(n, "bpf_map_update_elem_ undefined");
+  Function *lookup_fn = mod_->getFunction("bpf_map_lookup_elem_");
+  if (!lookup_fn) return mkstatus_(n, "bpf_map_lookup_elem_ undefined");
+  StructType *leaf_type;
+  TRY2(lookup_struct_type(n->table_->leaf_type_, &leaf_type));
+  PointerType *leaf_ptype = PointerType::getUnqual(leaf_type);
+
+  // materialize the map fd via the llvm.bpf.pseudo relocation intrinsic
+  CallInst *pseudo_call = B.CreateCall(pseudo_fn, vector<Value *>({B.getInt64(BPF_PSEUDO_MAP_FD),
+                                        B.getInt64(table_fd_it->second)}));
+  Value *pseudo_map_fd = pseudo_call;
+
+  TRY2(n->index_->accept(this));
+  Value *key_ptr = B.CreateBitCast(pop_expr(), B.getInt8PtrTy());
+
+  // result = lookup(key)
+  Value *lookup1 = B.CreateBitCast(B.CreateCall(lookup_fn, vector<Value *>({pseudo_map_fd, key_ptr})), leaf_ptype);
+
+  Value *result = nullptr;
+  if (n->table_->policy_id()->name_ == "AUTO") {
+    // AUTO policy: on miss, insert a zero-initialized leaf and look up again
+    Function *parent = B.GetInsertBlock()->getParent();
+    BasicBlock *label_start = B.GetInsertBlock();
+    BasicBlock *label_then = BasicBlock::Create(ctx(), n->id_->name_ + "[].then", parent);
+    BasicBlock *label_end = BasicBlock::Create(ctx(), n->id_->name_ + "[].end", parent);
+
+    Value *eq_zero = B.CreateIsNull(lookup1);
+    B.CreateCondBr(eq_zero, label_then, label_end);
+
+    B.SetInsertPoint(label_then);
+    // var Leaf leaf {0}
+    Value *leaf_ptr = B.CreateBitCast(
+        make_alloca(resolve_entry_stack(), leaf_type), B.getInt8PtrTy());
+    B.CreateMemSet(leaf_ptr, B.getInt8(0), B.getInt64(n->table_->leaf_id()->bit_width_ >> 3), 1);
+    // update(key, leaf)
+    B.CreateCall(update_fn, vector<Value *>({pseudo_map_fd, key_ptr, leaf_ptr, B.getInt64(BPF_NOEXIST)}));
+
+    // result = lookup(key)
+    Value *lookup2 = B.CreateBitCast(B.CreateCall(lookup_fn, vector<Value *>({pseudo_map_fd, key_ptr})), leaf_ptype);
+    B.CreateBr(label_end);
+
+    B.SetInsertPoint(label_end);
+
+    // merge the hit path and the insert-then-lookup path
+    PHINode *phi = B.CreatePHI(leaf_ptype, 2);
+    phi->addIncoming(lookup1, label_start);
+    phi->addIncoming(lookup2, label_then);
+    result = phi;
+  } else if (n->table_->policy_id()->name_ == "NONE") {
+    result = lookup1;
+  }
+
+  if (n->is_lhs()) {
+    if (n->sub_decl_) {
+      Type *ptr_type = PointerType::getUnqual(B.getIntNTy(n->sub_decl_->bit_width_));
+      // u64 *errval -> uN *errval
+      Value *err_cast = B.CreateBitCast(errval_, ptr_type);
+      // if valid then &field, else &errval
+      Function *parent = B.GetInsertBlock()->getParent();
+      BasicBlock *label_start = B.GetInsertBlock();
+      BasicBlock *label_then = BasicBlock::Create(ctx(), n->id_->name_ + "[]field.then", parent);
+      BasicBlock *label_end = BasicBlock::Create(ctx(), n->id_->name_ + "[]field.end", parent);
+
+      // if (1): the abort-on-null variant below is the one actually emitted;
+      // the PHI-based else arm is kept for reference but is unreachable
+      if (1) {
+        // the PHI implementation of this doesn't load, maybe eBPF limitation?
+        B.CreateCondBr(B.CreateIsNull(result), label_then, label_end);
+        B.SetInsertPoint(label_then);
+        // null leaf: set retval to 2 and bail out through DONE
+        B.CreateStore(B.getInt32(2), retval_);
+        B.CreateBr(resolve_label("DONE"));
+
+        B.SetInsertPoint(label_end);
+        vector<Value *> indices({B.getInt32(0), B.getInt32(n->sub_decl_->slot_)});
+        expr_ = B.CreateInBoundsGEP(result, indices);
+      } else {
+        B.CreateCondBr(B.CreateIsNotNull(result), label_then, label_end);
+
+        B.SetInsertPoint(label_then);
+        vector<Value *> indices({B.getInt32(0), B.getInt32(n->sub_decl_->slot_)});
+        Value *field = B.CreateInBoundsGEP(result, indices);
+        B.CreateBr(label_end);
+
+        B.SetInsertPoint(label_end);
+        PHINode *phi = B.CreatePHI(ptr_type, 2);
+        phi->addIncoming(err_cast, label_start);
+        phi->addIncoming(field, label_then);
+        expr_ = phi;
+      }
+    } else {
+      return mkstatus_(n, "unsupported");
+    }
+  } else {
+    // rvalue use: hand back the (possibly null) leaf pointer
+    expr_ = result;
+  }
+  return StatusTuple(0);
+}
+
+/// on_match
+// on_match handler: bind the single leaf-pointer formal to the _result slot
+// of the preceding lookup, then emit the handler body guarded by a
+// result-is-not-null branch.
+StatusTuple CodegenLLVM::visit_match_decl_stmt_node(MatchDeclStmtNode *n) {
+  if (n->formals_.size() != 1)
+    return mkstatus_(n, "on_match expected 1 argument, %zu given", n->formals_.size());
+  StructVariableDeclStmtNode* leaf_n = static_cast<StructVariableDeclStmtNode*>(n->formals_.at(0).get());
+  if (!leaf_n)
+    return mkstatus_(n, "invalid parameter type");
+  // lookup result variable
+  auto result_decl = scopes_->current_var()->lookup("_result", false);
+  if (!result_decl) return mkstatus_(n, "unable to find _result built-in");
+  auto result = vars_.find(result_decl);
+  if (result == vars_.end()) return mkstatus_(n, "unable to find memory for _result built-in");
+  // alias the handler's formal to the same storage as _result
+  vars_[leaf_n] = result->second;
+
+  Value *load_1 = B.CreateLoad(result->second);
+  // renamed from is_null: this is the *not*-null predicate (on_match fires
+  // when the lookup succeeded)
+  Value *is_not_null = B.CreateIsNotNull(load_1);
+
+  Function *parent = B.GetInsertBlock()->getParent();
+  BasicBlock *label_then = BasicBlock::Create(ctx(), "onvalid.then", parent);
+  BasicBlock *label_end = BasicBlock::Create(ctx(), "onvalid.end", parent);
+  B.CreateCondBr(is_not_null, label_then, label_end);
+
+  {
+    // emit the body inside label_then; fall through to end if the body did
+    // not already terminate the block (e.g. with a goto/return)
+    BlockStack bstack(this, label_then);
+    TRY2(n->block_->accept(this));
+    if (!B.GetInsertBlock()->getTerminator())
+      B.CreateBr(label_end);
+  }
+
+  B.SetInsertPoint(label_end);
+  return StatusTuple(0);
+}
+
+/// on_miss
+// on_miss handler: takes no formals; emit the handler body guarded by a
+// result-is-null branch on the _result slot of the preceding lookup.
+StatusTuple CodegenLLVM::visit_miss_decl_stmt_node(MissDeclStmtNode *n) {
+  if (n->formals_.size() != 0)
+    // bugfix: message previously said "on_match" in the on_miss handler
+    return mkstatus_(n, "on_miss expected 0 arguments, %zu given", n->formals_.size());
+  auto result_decl = scopes_->current_var()->lookup("_result", false);
+  if (!result_decl) return mkstatus_(n, "unable to find _result built-in");
+  auto result = vars_.find(result_decl);
+  if (result == vars_.end()) return mkstatus_(n, "unable to find memory for _result built-in");
+
+  Value *load_1 = B.CreateLoad(result->second);
+  // on_miss fires when the lookup returned null
+  Value *is_null = B.CreateIsNull(load_1);
+
+  Function *parent = B.GetInsertBlock()->getParent();
+  BasicBlock *label_then = BasicBlock::Create(ctx(), "onvalid.then", parent);
+  BasicBlock *label_end = BasicBlock::Create(ctx(), "onvalid.end", parent);
+  B.CreateCondBr(is_null, label_then, label_end);
+
+  {
+    // emit the body; fall through to end unless the body already terminated
+    BlockStack bstack(this, label_then);
+    TRY2(n->block_->accept(this));
+    if (!B.GetInsertBlock()->getTerminator())
+      B.CreateBr(label_end);
+  }
+
+  B.SetInsertPoint(label_end);
+  return StatusTuple(0);
+}
+
+// on_failure handlers are not implemented in the LLVM backend.
+StatusTuple CodegenLLVM::visit_failure_decl_stmt_node(FailureDeclStmtNode *n) {
+  return mkstatus_(n, "unsupported");
+}
+
+// Expression statement: evaluate for side effects and discard the value.
+StatusTuple CodegenLLVM::visit_expr_stmt_node(ExprStmtNode *n) {
+  TRY2(n->expr_->accept(this));
+  expr_ = nullptr;
+  return StatusTuple(0);
+}
+
+// Declare a struct-typed local: allocate stack storage (a pointer slot for
+// pointer declarations, inline storage otherwise) and initialize it.
+// Declarations with an empty or underscore-prefixed type name are compiler
+// internals and are skipped.
+StatusTuple CodegenLLVM::visit_struct_variable_decl_stmt_node(StructVariableDeclStmtNode *n) {
+  if (n->struct_id_->name_ == "" || n->struct_id_->name_[0] == '_') {
+    return StatusTuple(0);
+  }
+
+  StructType *stype;
+  StructDeclStmtNode *decl;
+  TRY2(lookup_struct_type(n, &stype, &decl));
+
+  // pointer declarations allocate a slot holding struct*, others the struct itself
+  Type *ptr_stype = n->is_pointer() ? PointerType::getUnqual(stype) : (PointerType *)stype;
+  AllocaInst *ptr_a = make_alloca(resolve_entry_stack(), ptr_stype);
+  vars_[n] = ptr_a;
+
+  if (n->struct_id_->scope_name_ == "proto") {
+    // protocol-header variables: pointer form is null-initialized and filled
+    // in later by the parser; by-value proto structs are unsupported here
+    if (n->is_pointer()) {
+      ConstantPointerNull *const_null = ConstantPointerNull::get(cast<PointerType>(ptr_stype));
+      B.CreateStore(const_null, ptr_a);
+    } else {
+      return mkstatus_(n, "unsupported");
+      // string var = n->scope_id() + n->id_->name_;
+      // /* zero initialize array to be filled in with packet header */
+      // emit("uint64_t __%s[%zu] = {}; uint8_t *%s = (uint8_t*)__%s;",
+      //      var.c_str(), ((decl->bit_width_ >> 3) + 7) >> 3, var.c_str(), var.c_str());
+      // for (auto it = n->init_.begin(); it != n->init_.end(); ++it) {
+      //   auto asn = static_cast<AssignExprNode*>(it->get());
+      //   if (auto f = decl->field(asn->id_->sub_name_)) {
+      //     size_t bit_offset = f->bit_offset_;
+      //     size_t bit_width = f->bit_width_;
+      //     if (asn->bitop_) {
+      //       bit_offset += f->bit_width_ - (asn->bitop_->bit_offset_ + asn->bitop_->bit_width_);
+      //       bit_width = std::min(bit_width - asn->bitop_->bit_offset_, asn->bitop_->bit_width_);
+      //     }
+      //     emit(" bpf_dins(%s + %zu, %zu, %zu, ", var.c_str(), bit_offset >> 3, bit_offset & 0x7, bit_width);
+      //     TRY2(asn->rhs_->accept(this));
+      //     emit(");");
+      //   }
+      // }
+    }
+  } else {
+    if (n->is_pointer()) {
+      if (n->id_->name_ == "_result") {
+        // special case for capturing the return value of a previous method call
+        Value *cast_1 = B.CreateBitCast(pop_expr(), ptr_stype);
+        B.CreateStore(cast_1, ptr_a);
+      } else {
+        ConstantPointerNull *const_null = ConstantPointerNull::get(cast<PointerType>(ptr_stype));
+        B.CreateStore(const_null, ptr_a);
+      }
+    } else {
+      // zero the struct, then run any field initializers
+      B.CreateMemSet(ptr_a, B.getInt8(0), B.getInt64(decl->bit_width_ >> 3), 1);
+      if (!n->init_.empty()) {
+        for (auto it = n->init_.begin(); it != n->init_.end(); ++it)
+          TRY2((*it)->accept(this));
+      }
+    }
+  }
+  return StatusTuple(0);
+}
+
+// Declare an integer local: allocate an iN stack slot and run the (optional)
+// initializer.  A missing insert block means we are outside any function
+// body, in which case the declaration is ignored.
+StatusTuple CodegenLLVM::visit_integer_variable_decl_stmt_node(IntegerVariableDeclStmtNode *n) {
+  if (!B.GetInsertBlock())
+    return StatusTuple(0);
+
+  // uintX var = init
+  AllocaInst *ptr_a = make_alloca(resolve_entry_stack(),
+                                  B.getIntNTy(n->bit_width_), n->id_->name_);
+  vars_[n] = ptr_a;
+
+  // todo: only the first initializer expression is honored
+  if (!n->init_.empty())
+    TRY2(n->init_[0]->accept(this));
+  return StatusTuple(0);
+}
+
+// Declare an IR struct type "_struct.<name>" with one iN member per field
+// and register it in structs_ for later lookups.
+// NOTE(review): ++indent_ with no matching decrement looks like a leftover
+// from the text-emitting backend — confirm it is intentional.
+StatusTuple CodegenLLVM::visit_struct_decl_stmt_node(StructDeclStmtNode *n) {
+  ++indent_;
+  StructType *struct_type = StructType::create(ctx(), "_struct." + n->id_->name_);
+  vector<Type *> fields;
+  for (auto it = n->stmts_.begin(); it != n->stmts_.end(); ++it)
+    fields.push_back(B.getIntNTy((*it)->bit_width_));
+  struct_type->setBody(fields, n->is_packed());
+  structs_[n] = struct_type;
+  return StatusTuple(0);
+}
+
+// Parser continuation: position the builder at this state's "_continue"
+// label and emit the transition to the next state, if any.
+StatusTuple CodegenLLVM::visit_parser_state_stmt_node(ParserStateStmtNode *n) {
+  string jump_label = n->scoped_name() + "_continue";
+  BasicBlock *label_entry = resolve_label(jump_label);
+  B.SetInsertPoint(label_entry);
+  if (n->next_state_)
+    TRY2(n->next_state_->accept(this));
+  return StatusTuple(0);
+}
+
+// Parser state declaration: emit the state's basic block, its initializers,
+// and its (single) body.  Only single-substate protocols are supported;
+// anonymous states (no id) are skipped.
+StatusTuple CodegenLLVM::visit_state_decl_stmt_node(StateDeclStmtNode *n) {
+  if (!n->id_)
+    return StatusTuple(0);
+  string jump_label = n->scoped_name();
+  BasicBlock *label_entry = resolve_label(jump_label);
+  B.SetInsertPoint(label_entry);
+
+  auto it = n->subs_.begin();
+
+  scopes_->push_state(it->scope_);
+
+  for (auto in = n->init_.begin(); in != n->init_.end(); ++in)
+    TRY2((*in)->accept(this));
+
+  if (n->subs_.size() == 1 && it->id_->name_ == "") {
+    // this is not a multistate protocol, emit everything and finish
+    TRY2(it->block_->accept(this));
+    if (n->parser_) {
+      // chain into the "_continue" block, then let the parser pick the next state
+      B.CreateBr(resolve_label(jump_label + "_continue"));
+      TRY2(n->parser_->accept(this));
+    }
+  } else {
+    return mkstatus_(n, "unsupported");
+  }
+
+  scopes_->pop_state();
+  return StatusTuple(0);
+}
+
+// Table declaration: validate key/leaf types, create the "maps"-section
+// global describing the table, and create the kernel map, recording its fd.
+// FIXED_MATCH maps to a hash map, INDEXED to an array map.
+StatusTuple CodegenLLVM::visit_table_decl_stmt_node(TableDeclStmtNode *n) {
+  if (n->table_type_->name_ == "Table"
+      || n->table_type_->name_ == "SharedTable") {
+    if (n->templates_.size() != 4)
+      return mkstatus_(n, "%s expected 4 arguments, %zu given", n->table_type_->c_str(), n->templates_.size());
+    auto key = scopes_->top_struct()->lookup(n->key_id()->name_, /*search_local*/true);
+    if (!key) return mkstatus_(n, "cannot find key %s", n->key_id()->name_.c_str());
+    auto leaf = scopes_->top_struct()->lookup(n->leaf_id()->name_, /*search_local*/true);
+    if (!leaf) return mkstatus_(n, "cannot find leaf %s", n->leaf_id()->name_.c_str());
+
+    bpf_map_type map_type = BPF_MAP_TYPE_UNSPEC;
+    if (n->type_id()->name_ == "FIXED_MATCH")
+      map_type = BPF_MAP_TYPE_HASH;
+    else if (n->type_id()->name_ == "INDEXED")
+      map_type = BPF_MAP_TYPE_ARRAY;
+    else
+      return mkstatus_(n, "Table type %s not implemented", n->type_id()->name_.c_str());
+
+    StructType *key_stype, *leaf_stype;
+    TRY2(lookup_struct_type(n->key_type_, &key_stype));
+    TRY2(lookup_struct_type(n->leaf_type_, &leaf_stype));
+    // reuse or create the {key, leaf} descriptor struct for this table
+    StructType *decl_struct = mod_->getTypeByName("_struct." + n->id_->name_);
+    if (!decl_struct)
+      decl_struct = StructType::create(ctx(), "_struct." + n->id_->name_);
+    if (decl_struct->isOpaque())
+      decl_struct->setBody(vector<Type *>({key_stype, leaf_stype}), /*isPacked=*/false);
+    GlobalVariable *decl_gvar = new GlobalVariable(*mod_, decl_struct, false,
+                                                   GlobalValue::ExternalLinkage, 0, n->id_->name_);
+    decl_gvar->setSection("maps");
+    tables_[n] = decl_gvar;
+
+    int map_fd = bpf_create_map(map_type, n->id_->name_.c_str(),
+                                key->bit_width_ / 8, leaf->bit_width_ / 8,
+                                n->size_, 0);
+    // NOTE(review): a failed bpf_create_map is silently skipped here; later
+    // lookups report "unable to find table ... in table_fds_" instead —
+    // confirm this deferral is intended.
+    if (map_fd >= 0)
+      table_fds_[n] = map_fd;
+  } else {
+    return mkstatus_(n, "Table %s not implemented", n->table_type_->name_.c_str());
+  }
+  return StatusTuple(0);
+}
+
+// Resolve the LLVM StructType previously registered for an AST struct
+// declaration; errors if the struct was never visited.
+StatusTuple CodegenLLVM::lookup_struct_type(StructDeclStmtNode *decl, StructType **stype) const {
+  auto struct_it = structs_.find(decl);
+  if (struct_it == structs_.end())
+    return mkstatus_(decl, "could not find IR for type %s", decl->id_->c_str());
+  *stype = struct_it->second;
+
+  return StatusTuple(0);
+}
+
+// Resolve the LLVM StructType for a struct-typed variable declaration,
+// searching the proto scope for "proto"-scoped types and the program scope
+// otherwise.  Optionally also returns the AST struct declaration via *decl.
+StatusTuple CodegenLLVM::lookup_struct_type(VariableDeclStmtNode *n, StructType **stype,
+                                            StructDeclStmtNode **decl) const {
+  if (!n->is_struct())
+    return mkstatus_(n, "attempt to search for struct with a non-struct type %s", n->id_->c_str());
+
+  auto var = (StructVariableDeclStmtNode *)n;
+  StructDeclStmtNode *type;
+  if (var->struct_id_->scope_name_ == "proto")
+    type = proto_scopes_->top_struct()->lookup(var->struct_id_->name_, true);
+  else
+    type = scopes_->top_struct()->lookup(var->struct_id_->name_, true);
+
+  if (!type) return mkstatus_(n, "could not find type %s", var->struct_id_->c_str());
+
+  TRY2(lookup_struct_type(type, stype));
+
+  if (decl)
+    *decl = type;
+
+  return StatusTuple(0);
+}
+
+// Function declaration: create the BPF program function (exactly one formal,
+// i32 return), set up entry/DONE blocks and the retval_/errval_ slots, spill
+// the argument to its variable slot, emit function-scoped variables and the
+// body, and finish with the shared DONE epilogue returning retval_.
+StatusTuple CodegenLLVM::visit_func_decl_stmt_node(FuncDeclStmtNode *n) {
+  if (n->formals_.size() != 1)
+    return mkstatus_(n, "Functions must have exactly 1 argument, %zd given", n->formals_.size());
+
+  vector<Type *> formals;
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    VariableDeclStmtNode *formal = it->get();
+    if (formal->is_struct()) {
+      StructType *stype;
+      //TRY2(lookup_struct_type(formal, &stype));
+      // resolve by IR type name; the commented call above is the AST-based path
+      auto var = (StructVariableDeclStmtNode *)formal;
+      stype = mod_->getTypeByName("_struct." + var->struct_id_->name_);
+      if (!stype) return mkstatus_(n, "could not find type %s", var->struct_id_->c_str());
+      formals.push_back(PointerType::getUnqual(stype));
+    } else {
+      formals.push_back(B.getIntNTy(formal->bit_width_));
+    }
+  }
+  FunctionType *fn_type = FunctionType::get(B.getInt32Ty(), formals, /*isVarArg=*/false);
+
+  Function *fn = mod_->getFunction(n->id_->name_);
+  if (fn) return mkstatus_(n, "Function %s already defined", n->id_->c_str());
+  fn = Function::Create(fn_type, GlobalValue::ExternalLinkage, n->id_->name_, mod_);
+  fn->setCallingConv(CallingConv::C);
+  fn->addFnAttr(Attribute::NoUnwind);
+  // section name tells the loader which BPF program this is
+  fn->setSection(BPF_FN_PREFIX + n->id_->name_);
+
+  BasicBlock *label_entry = BasicBlock::Create(ctx(), "entry", fn);
+  B.SetInsertPoint(label_entry);
+  // labels are keyed per-function using the function pointer value
+  string scoped_entry_label = to_string((uintptr_t)fn) + "::entry";
+  labels_[scoped_entry_label] = label_entry;
+  BasicBlock *label_return = resolve_label("DONE");
+  retval_ = make_alloca(label_entry, fn->getReturnType(), "ret");
+  B.CreateStore(B.getInt32(0), retval_);
+  errval_ = make_alloca(label_entry, B.getInt64Ty(), "err");
+  B.CreateStore(B.getInt64(0), errval_);
+
+  // declare each formal as a local and spill the incoming argument into it
+  auto formal = n->formals_.begin();
+  for (auto arg = fn->arg_begin(); arg != fn->arg_end(); ++arg, ++formal) {
+    TRY2((*formal)->accept(this));
+    Value *ptr = vars_[formal->get()];
+    if (!ptr) return mkstatus_(n, "cannot locate memory location for arg %s", (*formal)->id_->c_str());
+    B.CreateStore(&*arg, ptr);
+
+    // Type *ptype;
+    // if ((*formal)->is_struct()) {
+    //   StructType *type;
+    //   TRY2(lookup_struct_type(formal->get(), &type));
+    //   ptype = PointerType::getUnqual(type);
+    // } else {
+    //   ptype = PointerType::getUnqual(B.getIntNTy((*formal)->bit_width_));
+    // }
+
+    // arg->setName((*formal)->id_->name_);
+    // AllocaInst *ptr = make_alloca(label_entry, ptype, (*formal)->id_->name_);
+    // B.CreateStore(arg, ptr);
+    // vars_[formal->get()] = ptr;
+  }
+
+  // visit function scoped variables
+  {
+    scopes_->push_state(n->scope_);
+
+    for (auto it = scopes_->current_var()->obegin(); it != scopes_->current_var()->oend(); ++it)
+      TRY2((*it)->accept(this));
+
+    TRY2(n->block_->accept(this));
+
+    scopes_->pop_state();
+    if (!B.GetInsertBlock()->getTerminator())
+      B.CreateBr(resolve_label("DONE"));
+
+    // always return something
+    B.SetInsertPoint(label_return);
+    B.CreateRet(B.CreateLoad(retval_));
+  }
+
+  return StatusTuple(0);
+}
+
+// Top-level entry: emit module preamble, all table and function declarations,
+// then record each created table's metadata (fd, map type, key/leaf sizes)
+// into the shared TableStorage under <id>/<table name>.
+StatusTuple CodegenLLVM::visit(Node *root, TableStorage &ts, const string &id,
+                               const string &maps_ns) {
+  scopes_->set_current(scopes_->top_state());
+  scopes_->set_current(scopes_->top_var());
+
+  TRY2(print_header());
+
+  for (auto it = scopes_->top_table()->obegin(); it != scopes_->top_table()->oend(); ++it)
+    TRY2((*it)->accept(this));
+
+  for (auto it = scopes_->top_func()->obegin(); it != scopes_->top_func()->oend(); ++it)
+    TRY2((*it)->accept(this));
+  //TRY2(print_parser());
+
+  // publish table metadata; map_type mirrors the mapping used at creation
+  for (auto table : tables_) {
+    bpf_map_type map_type = BPF_MAP_TYPE_UNSPEC;
+    if (table.first->type_id()->name_ == "FIXED_MATCH")
+      map_type = BPF_MAP_TYPE_HASH;
+    else if (table.first->type_id()->name_ == "INDEXED")
+      map_type = BPF_MAP_TYPE_ARRAY;
+    ts.Insert(Path({id, table.first->id_->name_}),
+              {
+                  table.first->id_->name_, FileDesc(table_fds_[table.first]), map_type,
+                  table.first->key_type_->bit_width_ >> 3, table.first->leaf_type_->bit_width_ >> 3,
+                  table.first->size_, 0,
+              });
+  }
+  return StatusTuple(0);
+}
+
+// Module preamble: emit the "license" section global ("GPL"), declare the
+// llvm.bpf.pseudo intrinsic used for map fd relocation, and declare all
+// program and protocol struct types (skipping the builtin _Packet).
+StatusTuple CodegenLLVM::print_header() {
+
+  GlobalVariable *gvar_license = new GlobalVariable(*mod_, ArrayType::get(Type::getInt8Ty(ctx()), 4),
+                                                    false, GlobalValue::ExternalLinkage, 0, "_license");
+  gvar_license->setSection("license");
+  gvar_license->setInitializer(ConstantDataArray::getString(ctx(), "GPL", true));
+
+  Function *pseudo_fn = mod_->getFunction("llvm.bpf.pseudo");
+  if (!pseudo_fn) {
+    pseudo_fn = Function::Create(
+        FunctionType::get(B.getInt64Ty(), vector<Type *>({B.getInt64Ty(), B.getInt64Ty()}), false),
+        GlobalValue::ExternalLinkage, "llvm.bpf.pseudo", mod_);
+  }
+
+  // declare structures
+  for (auto it = scopes_->top_struct()->obegin(); it != scopes_->top_struct()->oend(); ++it) {
+    if ((*it)->id_->name_ == "_Packet")
+      continue;
+    TRY2((*it)->accept(this));
+  }
+  for (auto it = proto_scopes_->top_struct()->obegin(); it != proto_scopes_->top_struct()->oend(); ++it) {
+    if ((*it)->id_->name_ == "_Packet")
+      continue;
+    TRY2((*it)->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// Return the kernel fd for a named table, or -1 if the table is unknown or
+// its map was never created.
+int CodegenLLVM::get_table_fd(const string &name) const {
+  TableDeclStmtNode *table = scopes_->top_table()->lookup(name);
+  if (!table)
+    return -1;
+
+  auto table_fd_it = table_fds_.find(table);
+  if (table_fd_it == table_fds_.end())
+    return -1;
+
+  return table_fd_it->second;
+}
+
+// Convenience accessor for the module's LLVMContext.
+LLVMContext & CodegenLLVM::ctx() const {
+  return mod_->getContext();
+}
+
+// Build an integer constant of the given bit width / signedness.
+Constant * CodegenLLVM::const_int(uint64_t val, unsigned bits, bool is_signed) {
+  return ConstantInt::get(ctx(), APInt(bits, val, is_signed));
+}
+
+// Consume the single-slot expression stack: return expr_ and clear it.
+Value * CodegenLLVM::pop_expr() {
+  Value *ret = expr_;
+  expr_ = nullptr;
+  return ret;
+}
+
+// Find or create a basic block for `label` scoped to the current function
+// (labels_ keys are "<fn-ptr>::<label>" so names don't clash across functions).
+BasicBlock * CodegenLLVM::resolve_label(const string &label) {
+  Function *parent = B.GetInsertBlock()->getParent();
+  string scoped_label = to_string((uintptr_t)parent) + "::" + label;
+  auto it = labels_.find(scoped_label);
+  if (it != labels_.end()) return it->second;
+  BasicBlock *label_new = BasicBlock::Create(ctx(), label, parent);
+  labels_[scoped_label] = label_new;
+  return label_new;
+}
+
+// Return the last instruction of the entry block, used as the insert point
+// so that all allocas land in the entry block (as eBPF/LLVM expects).
+Instruction * CodegenLLVM::resolve_entry_stack() {
+  BasicBlock *label_entry = resolve_label("entry");
+  return &label_entry->back();
+}
+
+// Emit an alloca just before `Inst`, preserving the builder's current
+// insertion point.
+AllocaInst *CodegenLLVM::make_alloca(Instruction *Inst, Type *Ty,
+                                     const string &name, Value *ArraySize) {
+  IRBuilderBase::InsertPoint ip = B.saveIP();
+  B.SetInsertPoint(Inst);
+  AllocaInst *a = B.CreateAlloca(Ty, ArraySize, name);
+  B.restoreIP(ip);
+  return a;
+}
+
+// Emit an alloca at the end of `BB`, preserving the builder's current
+// insertion point.
+AllocaInst *CodegenLLVM::make_alloca(BasicBlock *BB, Type *Ty,
+                                     const string &name, Value *ArraySize) {
+  IRBuilderBase::InsertPoint ip = B.saveIP();
+  B.SetInsertPoint(BB);
+  AllocaInst *a = B.CreateAlloca(Ty, ArraySize, name);
+  B.restoreIP(ip);
+  return a;
+}
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/codegen_llvm.h b/src/cc/frontends/b/codegen_llvm.h
new file mode 100644
index 0000000..c2947f7
--- /dev/null
+++ b/src/cc/frontends/b/codegen_llvm.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <stdio.h>
+#include <vector>
+#include <string>
+#include <set>
+
+#include "node.h"
+#include "scope.h"
+
+namespace llvm {
+class AllocaInst;
+class BasicBlock;
+class BranchInst;
+class Constant;
+class Instruction;
+class IRBuilderBase;
+class LLVMContext;
+class Module;
+class StructType;
+class SwitchInst;
+class Type;
+class Value;
+class GlobalVariable;
+}
+
+namespace ebpf {
+
+class TableStorage;
+
+namespace cc {
+
+class BlockStack;
+class SwitchStack;
+
+using std::vector;
+using std::string;
+using std::set;
+
+// AST visitor that lowers the b-language front end to LLVM IR for eBPF.
+// One visit_* method per node type (declared via EXPAND_NODES); emit_*
+// helpers lower specific built-in method calls.
+class CodegenLLVM : public Visitor {
+  friend class BlockStack;
+  friend class SwitchStack;
+ public:
+  CodegenLLVM(llvm::Module *mod, Scopes *scopes, Scopes *proto_scopes);
+  virtual ~CodegenLLVM();
+
+#define VISIT(type, func) virtual STATUS_RETURN visit_##func(type* n);
+  EXPAND_NODES(VISIT)
+#undef VISIT
+
+  // Entry point: emit the whole program and register table metadata in ts.
+  STATUS_RETURN visit(Node *n, TableStorage &ts, const std::string &id,
+                      const std::string &maps_ns);
+
+  // Kernel fd of a created table, or -1 when unknown.
+  int get_table_fd(const std::string &name) const;
+
+ private:
+  STATUS_RETURN emit_short_circuit_and(BinopExprNode* n);
+  STATUS_RETURN emit_short_circuit_or(BinopExprNode* n);
+  STATUS_RETURN emit_table_lookup(MethodCallExprNode* n);
+  STATUS_RETURN emit_table_update(MethodCallExprNode* n);
+  STATUS_RETURN emit_table_delete(MethodCallExprNode* n);
+  STATUS_RETURN emit_log(MethodCallExprNode* n);
+  STATUS_RETURN emit_packet_rewrite_field(MethodCallExprNode* n);
+  STATUS_RETURN emit_atomic_add(MethodCallExprNode* n);
+  STATUS_RETURN emit_cksum(MethodCallExprNode* n);
+  STATUS_RETURN emit_incr_cksum(MethodCallExprNode* n, size_t sz = 0);
+  STATUS_RETURN emit_lb_hash(MethodCallExprNode* n);
+  STATUS_RETURN emit_sizeof(MethodCallExprNode* n);
+  STATUS_RETURN emit_get_usec_time(MethodCallExprNode* n);
+  STATUS_RETURN emit_forward_to_vnf(MethodCallExprNode* n);
+  STATUS_RETURN emit_forward_to_group(MethodCallExprNode* n);
+  STATUS_RETURN print_header();
+
+  llvm::LLVMContext & ctx() const;
+  llvm::Constant * const_int(uint64_t val, unsigned bits = 64, bool is_signed = false);
+  llvm::Value * pop_expr();
+  llvm::BasicBlock * resolve_label(const string &label);
+  llvm::Instruction * resolve_entry_stack();
+  llvm::AllocaInst *make_alloca(llvm::Instruction *Inst, llvm::Type *Ty,
+                                const std::string &name = "",
+                                llvm::Value *ArraySize = nullptr);
+  llvm::AllocaInst *make_alloca(llvm::BasicBlock *BB, llvm::Type *Ty,
+                                const std::string &name = "",
+                                llvm::Value *ArraySize = nullptr);
+  StatusTuple lookup_var(Node *n, const std::string &name, Scopes::VarScope *scope,
+                         VariableDeclStmtNode **decl, llvm::Value **mem) const;
+  StatusTuple lookup_struct_type(StructDeclStmtNode *decl, llvm::StructType **stype) const;
+  StatusTuple lookup_struct_type(VariableDeclStmtNode *n, llvm::StructType **stype,
+                                 StructDeclStmtNode **decl = nullptr) const;
+
+  template <typename... Args> void emit(const char *fmt, Args&&... params);
+  void emit(const char *s);
+
+  FILE* out_;
+  llvm::Module* mod_;
+  llvm::IRBuilderBase *b_;
+  int indent_;
+  int tmp_reg_index_;
+  Scopes *scopes_;
+  Scopes *proto_scopes_;
+  vector<vector<string> > free_instructions_;
+  vector<string> table_inits_;
+  map<string, string> proto_rewrites_;          // goto-rewrite targets per proto
+  map<TableDeclStmtNode *, llvm::GlobalVariable *> tables_;   // "maps" globals
+  map<TableDeclStmtNode *, int> table_fds_;     // kernel fds of created maps
+  map<VariableDeclStmtNode *, llvm::Value *> vars_;           // decl -> storage
+  map<StructDeclStmtNode *, llvm::StructType *> structs_;     // decl -> IR type
+  map<string, llvm::BasicBlock *> labels_;      // "<fn-ptr>::<label>" -> block
+  llvm::SwitchInst *cur_switch_;
+  llvm::Value *expr_;                           // single-slot expression stack
+  llvm::AllocaInst *retval_;                    // function return-value slot
+  llvm::AllocaInst *errval_;                    // error sink for bad lvalues
+};
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/lexer.h b/src/cc/frontends/b/lexer.h
new file mode 100644
index 0000000..14f3110
--- /dev/null
+++ b/src/cc/frontends/b/lexer.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef yyFlexLexerOnce
+#undef yyFlexLexer
+#define yyFlexLexer ebpfccFlexLexer
+#include <FlexLexer.h>
+#endif
+
+#undef YY_DECL
+#define YY_DECL int ebpf::cc::Lexer::yylex()
+
+#include <iostream> // NOLINT
+#include <list>
+#include "parser.yy.hh"
+
+namespace ebpf {
+namespace cc {
+
+typedef BisonParser::token::yytokentype Tok;
+
+class Lexer : public yyFlexLexer {
+ public:
+  explicit Lexer(std::istream* in)
+      : yyFlexLexer(in), prev_tok_(Tok::TSEMI), lines_({""}), yylval_(NULL), yylloc_(NULL) {
+    if (!in || !*in)
+      fprintf(stderr, "Unable to open input stream\n");
+  }
+  int yylex(BisonParser::semantic_type *lval, BisonParser::location_type *lloc) {
+    yylval_ = lval;
+    yylloc_ = lloc;
+    return yylex();
+  }
+  std::string text(const BisonParser::location_type& loc) const {
+    return text(loc.begin, loc.end);
+  }
+  std::string text(const position& begin, const position& end) const {
+    std::string result;
+    for (size_t i = begin.line; i <= end.line; ++i) {
+      if (i == begin.line && i == end.line) {
+        result += lines_.at(i - 1).substr(begin.column - 1, end.column - begin.column);
+      } else if (i == begin.line && i < end.line) {
+        result += lines_.at(i - 1).substr(begin.column - 1);
+      } else if (i > begin.line && i == end.line) {
+        result += lines_.at(i - 1).substr(0, end.column);
+      } else if (i > begin.line && i < end.line) {
+        result += lines_.at(i - 1);
+      }
+    }
+    return result;
+  }
+ private:
+
+  // true if a semicolon should be replaced here
+  bool next_line() {
+    lines_.push_back("");
+    yylloc_->lines();
+    yylloc_->step();
+    switch (prev_tok_) {
+    case Tok::TIDENTIFIER:
+    case Tok::TINTEGER:
+    case Tok::THEXINTEGER:
+    case Tok::TRBRACE:
+    case Tok::TRPAREN:
+    case Tok::TRBRACK:
+    case Tok::TTRUE:
+    case Tok::TFALSE:
+      // uncomment to add implicit semicolons
+      //return true;
+    default:
+      break;
+    }
+    return false;
+  }
+
+  Tok save(Tok tok, bool ignore_text = false) {
+    if (!ignore_text) {
+      save_text();
+    }
+
+    switch (tok) {
+    case Tok::TIDENTIFIER:
+    case Tok::TINTEGER:
+    case Tok::THEXINTEGER:
+      yylval_->string = new std::string(yytext, yyleng);
+      break;
+    default:
+      yylval_->token = tok;
+    }
+    prev_tok_ = tok;
+    return tok;
+  }
+
+  /*
+  std::string * alloc_string(const char *c, size_t len) {
+    strings_.push_back(std::unique_ptr<std::string>(new std::string(c, len)));
+    return strings_.back().get();
+  }
+
+  std::string * alloc_string(const std::string &s) {
+    strings_.push_back(std::unique_ptr<std::string>(new std::string(s)));
+    return strings_.back().get();
+  }
+  */
+
+  void save_text() {
+    lines_.back().append(yytext, yyleng);
+    yylloc_->columns(yyleng);
+  }
+
+  int yylex();
+  Tok prev_tok_;
+  std::vector<std::string> lines_;
+  //std::list<std::unique_ptr<std::string>> strings_;
+  BisonParser::semantic_type *yylval_;
+  BisonParser::location_type *yylloc_;
+};
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/lexer.ll b/src/cc/frontends/b/lexer.ll
new file mode 100644
index 0000000..1072b59
--- /dev/null
+++ b/src/cc/frontends/b/lexer.ll
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+%{
+#include "lexer.h"
+%}
+
+%option yylineno nodefault yyclass="Lexer" noyywrap c++ prefix="ebpfcc"
+%option never-interactive
+%{
+#include <string>
+#include "parser.yy.hh"
+std::string tmp_str_cc;
+%}
+
+%x STRING_
+%%
+
+\"                      {BEGIN STRING_;}
+<STRING_>\"             { BEGIN 0;
+                        yylval_->string = new std::string(tmp_str_cc);
+                        tmp_str_cc = "";
+                        return Tok::TSTRING;
+                        }
+<STRING_>\\n            {tmp_str_cc += "\n"; }
+<STRING_>.              {tmp_str_cc += *yytext; }
+
+
+
+[ \t]+                  { save_text(); }
+\n                      { if (next_line()) { return save(Tok::TSEMI, true); } }
+"//".*\n                { if (next_line()) { return save(Tok::TSEMI, true); } }
+^"#"                    return save(Tok::TPRAGMA);
+"="                     return save(Tok::TEQUAL);
+"=="                    return save(Tok::TCEQ);
+"!="                    return save(Tok::TCNE);
+"<"                     return save(Tok::TCLT);
+"<="                    return save(Tok::TCLE);
+">"                     return save(Tok::TCGT);
+">="                    return save(Tok::TCGE);
+"("                     return save(Tok::TLPAREN);
+")"                     return save(Tok::TRPAREN);
+"{"                     return save(Tok::TLBRACE);
+"}"                     return save(Tok::TRBRACE);
+"["                     return save(Tok::TLBRACK);
+"]"                     return save(Tok::TRBRACK);
+"->"                    return save(Tok::TARROW);
+"."                     return save(Tok::TDOT);
+","                     return save(Tok::TCOMMA);
+"+"                     return save(Tok::TPLUS);
+"++"                    return save(Tok::TINCR);
+"-"                     return save(Tok::TMINUS);
+"--"                    return save(Tok::TDECR);
+"*"                     return save(Tok::TMUL);
+"/"                     return save(Tok::TDIV);
+"%"                     return save(Tok::TMOD);
+"^"                     return save(Tok::TXOR);
+"$"                     return save(Tok::TDOLLAR);
+"!"                     return save(Tok::TNOT);
+"~"                     return save(Tok::TCMPL);
+":"                     return save(Tok::TCOLON);
+"::"                    return save(Tok::TSCOPE);
+";"                     return save(Tok::TSEMI);
+"&&"                    return save(Tok::TAND);
+"||"                    return save(Tok::TOR);
+"&"                     return save(Tok::TLAND);
+"|"                     return save(Tok::TLOR);
+"@"                     return save(Tok::TAT);
+
+"case"                  return save(Tok::TCASE);
+"continue"              return save(Tok::TCONTINUE);
+"else"                  return save(Tok::TELSE);
+"false"                 return save(Tok::TFALSE);
+"goto"                  return save(Tok::TGOTO);
+"if"                    return save(Tok::TIF);
+"next"                  return save(Tok::TNEXT);
+"on_match"              return save(Tok::TMATCH);
+"on_miss"               return save(Tok::TMISS);
+"on_failure"            return save(Tok::TFAILURE);
+"on_valid"              return save(Tok::TVALID);
+"return"                return save(Tok::TRETURN);
+"state"                 return save(Tok::TSTATE);
+"struct"                return save(Tok::TSTRUCT);
+"switch"                return save(Tok::TSWITCH);
+"true"                  return save(Tok::TTRUE);
+"u8"                    return save(Tok::TU8);
+"u16"                   return save(Tok::TU16);
+"u32"                   return save(Tok::TU32);
+"u64"                   return save(Tok::TU64);
+
+[a-zA-Z_][a-zA-Z0-9_]*  return save(Tok::TIDENTIFIER);
+[0-9]+                  return save(Tok::TINTEGER);
+0x[0-9a-fA-F]+          return save(Tok::THEXINTEGER);
+
+.                       printf("Unknown token \"%s\"\n", yytext); yyterminate();
+
+%%
diff --git a/src/cc/frontends/b/loader.cc b/src/cc/frontends/b/loader.cc
new file mode 100644
index 0000000..8d7f8a2
--- /dev/null
+++ b/src/cc/frontends/b/loader.cc
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "parser.h"
+#include "type_check.h"
+#include "codegen_llvm.h"
+#include "loader.h"
+
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+namespace ebpf {
+
+BLoader::BLoader(unsigned flags) : flags_(flags) {
+  (void)flags_;
+}
+
+BLoader::~BLoader() {
+}
+
+int BLoader::parse(llvm::Module *mod, const string &filename, const string &proto_filename,
+                   TableStorage &ts, const string &id, const std::string &maps_ns) {
+  int rc;
+
+  proto_parser_ = make_unique<ebpf::cc::Parser>(proto_filename);
+  rc = proto_parser_->parse();
+  if (rc) {
+    fprintf(stderr, "In file: %s\n", filename.c_str());
+    return rc;
+  }
+
+  parser_ = make_unique<ebpf::cc::Parser>(filename);
+  rc = parser_->parse();
+  if (rc) {
+    fprintf(stderr, "In file: %s\n", filename.c_str());
+    return rc;
+  }
+
+  //ebpf::cc::Printer printer(stderr);
+  //printer.visit(parser_->root_node_);
+
+  ebpf::cc::TypeCheck type_check(parser_->scopes_.get(), proto_parser_->scopes_.get());
+  auto ret = type_check.visit(parser_->root_node_);
+  if (ret.code() != 0 || ret.msg().size()) {
+    fprintf(stderr, "Type error @line=%d: %s\n", ret.code(), ret.msg().c_str());
+    return -1;
+  }
+
+  codegen_ = ebpf::make_unique<ebpf::cc::CodegenLLVM>(mod, parser_->scopes_.get(), proto_parser_->scopes_.get());
+  ret = codegen_->visit(parser_->root_node_, ts, id, maps_ns);
+  if (ret.code() != 0 || ret.msg().size()) {
+    fprintf(stderr, "Codegen error @line=%d: %s\n", ret.code(), ret.msg().c_str());
+    return ret.code();
+  }
+
+  return 0;
+}
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/loader.h b/src/cc/frontends/b/loader.h
new file mode 100644
index 0000000..6330d5c
--- /dev/null
+++ b/src/cc/frontends/b/loader.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "table_storage.h"
+
+namespace llvm {
+class Module;
+}
+
+namespace ebpf {
+
+namespace cc {
+class Parser;
+class CodegenLLVM;
+}
+
+class BLoader {
+ public:
+  explicit BLoader(unsigned flags);
+  ~BLoader();
+  int parse(llvm::Module *mod, const std::string &filename, const std::string &proto_filename,
+            TableStorage &ts, const std::string &id, const std::string &maps_ns);
+
+ private:
+  unsigned flags_;
+  std::unique_ptr<cc::Parser> parser_;
+  std::unique_ptr<cc::Parser> proto_parser_;
+  std::unique_ptr<cc::CodegenLLVM> codegen_;
+};
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/node.cc b/src/cc/frontends/b/node.cc
new file mode 100644
index 0000000..6dac700
--- /dev/null
+++ b/src/cc/frontends/b/node.cc
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <vector>
+#include <string>
+
+#include "node.h"
+
+namespace ebpf {
+namespace cc {
+
+#define ACCEPT(type, func) \
+  STATUS_RETURN type::accept(Visitor* v) { return v->visit_##func(this); }
+EXPAND_NODES(ACCEPT)
+#undef ACCEPT
+
+VariableDeclStmtNode* StructDeclStmtNode::field(const string& name) const {
+  for (auto it = stmts_.begin(); it != stmts_.end(); ++it) {
+    if ((*it)->id_->name_ == name) {
+      return it->get();
+    }
+  }
+  return NULL;
+}
+
+int StructDeclStmtNode::indexof(const string& name) const {
+  int i = 0;
+  for (auto it = stmts_.begin(); it != stmts_.end(); ++it, ++i) {
+    if ((*it)->id_->name_ == name) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/node.h b/src/cc/frontends/b/node.h
new file mode 100644
index 0000000..6490566
--- /dev/null
+++ b/src/cc/frontends/b/node.h
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <vector>
+#include <bitset>
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <stdint.h>
+
+#include "common.h"
+#include "bcc_exception.h"
+#include "scope.h"
+
+#define REVISION_MASK 0xfff
+#define MAJOR_VER_POS 22
+#define MAJOR_VER_MASK ~((1 << MAJOR_VER_POS) - 1)
+#define MINOR_VER_POS 12
+#define MINOR_VER_MASK (~((1 << MINOR_VER_POS) - 1) & (~(MAJOR_VER_MASK)))
+#define GET_MAJOR_VER(version) ((version & MAJOR_VER_MASK) >> MAJOR_VER_POS)
+#define GET_MINOR_VER(version) ((version & MINOR_VER_MASK) >> MINOR_VER_POS)
+#define GET_REVISION(version) (version & REVISION_MASK)
+#define MAKE_VERSION(major, minor, rev) \
+    ((major << MAJOR_VER_POS) | \
+     (minor << MINOR_VER_POS) | \
+     (rev & REVISION_MASK))
+
+#define STATUS_RETURN __attribute((warn_unused_result)) StatusTuple
+
+namespace ebpf {
+
+namespace cc {
+
+using std::unique_ptr;
+using std::move;
+using std::string;
+using std::vector;
+using std::bitset;
+using std::find;
+
+typedef unique_ptr<string> String;
+
+#define NODE_EXPRESSIONS(EXPAND) \
+  EXPAND(IdentExprNode, ident_expr_node) \
+  EXPAND(AssignExprNode, assign_expr_node) \
+  EXPAND(PacketExprNode, packet_expr_node) \
+  EXPAND(IntegerExprNode, integer_expr_node) \
+  EXPAND(StringExprNode, string_expr_node) \
+  EXPAND(BinopExprNode, binop_expr_node) \
+  EXPAND(UnopExprNode, unop_expr_node) \
+  EXPAND(BitopExprNode, bitop_expr_node) \
+  EXPAND(GotoExprNode, goto_expr_node) \
+  EXPAND(ReturnExprNode, return_expr_node) \
+  EXPAND(MethodCallExprNode, method_call_expr_node) \
+  EXPAND(TableIndexExprNode, table_index_expr_node)
+
+#define NODE_STATEMENTS(EXPAND) \
+  EXPAND(ExprStmtNode, expr_stmt_node) \
+  EXPAND(BlockStmtNode, block_stmt_node) \
+  EXPAND(IfStmtNode, if_stmt_node) \
+  EXPAND(OnValidStmtNode, onvalid_stmt_node) \
+  EXPAND(SwitchStmtNode, switch_stmt_node) \
+  EXPAND(CaseStmtNode, case_stmt_node) \
+  EXPAND(StructVariableDeclStmtNode, struct_variable_decl_stmt_node) \
+  EXPAND(IntegerVariableDeclStmtNode, integer_variable_decl_stmt_node) \
+  EXPAND(StructDeclStmtNode, struct_decl_stmt_node) \
+  EXPAND(StateDeclStmtNode, state_decl_stmt_node) \
+  EXPAND(ParserStateStmtNode, parser_state_stmt_node) \
+  EXPAND(MatchDeclStmtNode, match_decl_stmt_node) \
+  EXPAND(MissDeclStmtNode, miss_decl_stmt_node) \
+  EXPAND(FailureDeclStmtNode, failure_decl_stmt_node) \
+  EXPAND(TableDeclStmtNode, table_decl_stmt_node) \
+  EXPAND(FuncDeclStmtNode, func_decl_stmt_node)
+
+#define EXPAND_NODES(EXPAND) \
+  NODE_EXPRESSIONS(EXPAND) \
+  NODE_STATEMENTS(EXPAND)
+
+class Visitor;
+
+// forward declare all classes
+#define FORWARD(type, func) class type;
+EXPAND_NODES(FORWARD)
+#undef FORWARD
+
+#define DECLARE(type) \
+  typedef unique_ptr<type> Ptr; \
+  virtual StatusTuple accept(Visitor* v);
+
+class Node {
+ public:
+  typedef unique_ptr<Node> Ptr;
+  Node() : line_(-1), column_(-1) {}
+  virtual ~Node() {}
+  virtual StatusTuple accept(Visitor* v) = 0;
+  int line_;
+  int column_;
+  string text_;
+};
+
+template <typename... Args>
+StatusTuple mkstatus_(Node *n, const char *fmt, Args... args) {
+  StatusTuple status = StatusTuple(n->line_ ? n->line_ : -1, fmt, args...);
+  if (n->line_ > 0)
+    status.append_msg("\n" + n->text_);
+  return status;
+}
+
+static inline StatusTuple mkstatus_(Node *n, const char *msg) {
+  StatusTuple status = StatusTuple(n->line_ ? n->line_ : -1, msg);
+  if (n->line_ > 0)
+    status.append_msg("\n" + n->text_);
+  return status;
+}
+
+class StmtNode : public Node {
+ public:
+  typedef unique_ptr<StmtNode> Ptr;
+  virtual StatusTuple accept(Visitor* v) = 0;
+
+};
+typedef vector<StmtNode::Ptr> StmtNodeList;
+
+class ExprNode : public Node {
+ public:
+  typedef unique_ptr<ExprNode> Ptr;
+  virtual StatusTuple accept(Visitor* v) = 0;
+  enum expr_type { STRUCT, INTEGER, STRING, VOID, UNKNOWN };
+  enum prop_flag { READ = 0, WRITE, PROTO, IS_LHS, IS_REF, IS_PKT, LAST };
+  expr_type typeof_;
+  StructDeclStmtNode *struct_type_;
+  size_t bit_width_;
+  bitset<LAST> flags_;
+  unique_ptr<BitopExprNode> bitop_;
+  ExprNode() : typeof_(UNKNOWN), struct_type_(NULL), bit_width_(0), flags_(1 << READ) {}
+  void copy_type(const ExprNode& other) {
+    typeof_ = other.typeof_;
+    struct_type_ = other.struct_type_;
+    bit_width_ = other.bit_width_;
+    flags_ = other.flags_;
+  }
+  bool is_lhs() const { return flags_[IS_LHS]; }
+  bool is_ref() const { return flags_[IS_REF]; }
+  bool is_pkt() const { return flags_[IS_PKT]; }
+};
+
+typedef vector<ExprNode::Ptr> ExprNodeList;
+
+class IdentExprNode : public ExprNode {
+ public:
+  DECLARE(IdentExprNode)
+
+  string name_;
+  string sub_name_;
+  string scope_name_;
+  VariableDeclStmtNode *decl_;
+  VariableDeclStmtNode *sub_decl_;
+  IdentExprNode(const IdentExprNode& other) {
+    name_ = other.name_;
+    sub_name_ = other.sub_name_;
+    scope_name_ = other.scope_name_;
+    decl_ = other.decl_;
+    sub_decl_ = other.sub_decl_;
+  }
+  IdentExprNode::Ptr copy() const {
+    return IdentExprNode::Ptr(new IdentExprNode(*this));
+  }
+  explicit IdentExprNode(const string& id) : name_(id) {}
+  explicit IdentExprNode(const char* id) : name_(id) {}
+  void prepend_scope(const string& id) {
+    scope_name_ = id;
+  }
+  void append_scope(const string& id) {
+    scope_name_ = move(name_);
+    name_ = id;
+  }
+  void prepend_dot(const string& id) {
+    sub_name_ = move(name_);
+    name_ = id;
+  }
+  void append_dot(const string& id) {
+    // we don't support nested struct so keep all subs as single variable
+    if (!sub_name_.empty()) {
+      sub_name_ += "." + id;
+    } else {
+      sub_name_ = id;
+    }
+  }
+  const string& full_name() {
+    if (full_name_.size()) {
+      return full_name_;  // lazy init
+    }
+    if (scope_name_.size()) {
+      full_name_ += scope_name_ + "::";
+    }
+    full_name_ += name_;
+    if (sub_name_.size()) {
+      full_name_ += "." + sub_name_;
+    }
+    return full_name_;
+  }
+  const char* c_str() const { return name_.c_str(); }
+ private:
+  string full_name_;
+};
+
+class BitopExprNode : public ExprNode {
+ public:
+  DECLARE(BitopExprNode)
+
+  ExprNode::Ptr expr_;
+  size_t bit_offset_;
+  size_t bit_width_;
+  BitopExprNode(const string& bofs, const string& bsz)
+      : bit_offset_(strtoul(bofs.c_str(), NULL, 0)), bit_width_(strtoul(bsz.c_str(), NULL, 0)) {}
+};
+
+typedef vector<IdentExprNode::Ptr> IdentExprNodeList;
+
+class AssignExprNode : public ExprNode {
+ public:
+  DECLARE(AssignExprNode)
+
+  //IdentExprNode *id_;
+  ExprNode::Ptr lhs_;
+  ExprNode::Ptr rhs_;
+  AssignExprNode(IdentExprNode::Ptr id, ExprNode::Ptr rhs)
+      : lhs_(move(id)), rhs_(move(rhs)) {
+    //id_ = (IdentExprNode *)lhs_.get();
+    lhs_->flags_[ExprNode::IS_LHS] = true;
+  }
+  AssignExprNode(ExprNode::Ptr lhs, ExprNode::Ptr rhs)
+      : lhs_(move(lhs)), rhs_(move(rhs)) {
+    //id_ = nullptr;
+    lhs_->flags_[ExprNode::IS_LHS] = true;
+  }
+};
+
+class PacketExprNode : public ExprNode {
+ public:
+  DECLARE(PacketExprNode)
+
+  IdentExprNode::Ptr id_;
+  explicit PacketExprNode(IdentExprNode::Ptr id) : id_(move(id)) {}
+};
+
+class StringExprNode : public ExprNode {
+ public:
+  DECLARE(StringExprNode)
+
+  string val_;
+  explicit StringExprNode(string *val) : val_(move(*val)) {
+    delete val;
+  }
+  explicit StringExprNode(const string &val) : val_(val) {}
+};
+
+class IntegerExprNode : public ExprNode {
+ public:
+  DECLARE(IntegerExprNode)
+
+  size_t bits_;
+  string val_;
+  IntegerExprNode(string* val, string* bits)
+      : bits_(strtoul(bits->c_str(), NULL, 0)), val_(move(*val)) {
+    delete val;
+    delete bits;
+  }
+  explicit IntegerExprNode(string* val)
+      : bits_(0), val_(move(*val)) {
+    delete val;
+  }
+  explicit IntegerExprNode(const string& val) : bits_(0), val_(val) {}
+  explicit IntegerExprNode(const string& val, size_t bits) : bits_(bits), val_(val) {}
+};
+
+class BinopExprNode : public ExprNode {
+ public:
+  DECLARE(BinopExprNode)
+
+  ExprNode::Ptr lhs_;
+  int op_;
+  ExprNode::Ptr rhs_;
+  BinopExprNode(ExprNode::Ptr lhs, int op, ExprNode::Ptr rhs)
+      : lhs_(move(lhs)), op_(op), rhs_(move(rhs))
+  {}
+};
+
+class UnopExprNode : public ExprNode {
+ public:
+  DECLARE(UnopExprNode)
+
+  ExprNode::Ptr expr_;
+  int op_;
+  UnopExprNode(int op, ExprNode::Ptr expr) : expr_(move(expr)), op_(op) {}
+};
+
+class GotoExprNode : public ExprNode {
+ public:
+  DECLARE(GotoExprNode)
+
+  bool is_continue_;
+  IdentExprNode::Ptr id_;
+  GotoExprNode(IdentExprNode::Ptr id, bool is_continue = false)
+      : is_continue_(is_continue), id_(move(id)) {}
+};
+
+class ReturnExprNode : public ExprNode {
+ public:
+  DECLARE(ReturnExprNode)
+
+  ExprNode::Ptr expr_;
+  ReturnExprNode(ExprNode::Ptr expr)
+      : expr_(move(expr)) {}
+};
+
+class BlockStmtNode : public StmtNode {
+ public:
+  DECLARE(BlockStmtNode)
+
+  explicit BlockStmtNode(StmtNodeList stmts = StmtNodeList())
+    : stmts_(move(stmts)), scope_(NULL) {}
+  ~BlockStmtNode() { delete scope_; }
+  StmtNodeList stmts_;
+  Scopes::VarScope* scope_;
+};
+
+class MethodCallExprNode : public ExprNode {
+ public:
+  DECLARE(MethodCallExprNode)
+
+  IdentExprNode::Ptr id_;
+  ExprNodeList args_;
+  BlockStmtNode::Ptr block_;
+  MethodCallExprNode(IdentExprNode::Ptr id, ExprNodeList&& args, int lineno)
+      : id_(move(id)), args_(move(args)), block_(make_unique<BlockStmtNode>()) {
+    line_ = lineno;
+  }
+};
+
+class TableIndexExprNode : public ExprNode {
+ public:
+  DECLARE(TableIndexExprNode)
+
+  IdentExprNode::Ptr id_;
+  IdentExprNode::Ptr sub_;
+  ExprNode::Ptr index_;
+  TableDeclStmtNode *table_;
+  VariableDeclStmtNode *sub_decl_;
+  TableIndexExprNode(IdentExprNode::Ptr id, ExprNode::Ptr index)
+      : id_(move(id)), index_(move(index)), table_(nullptr), sub_decl_(nullptr)
+  {}
+};
+
+class ExprStmtNode : public StmtNode {
+ public:
+  DECLARE(ExprStmtNode)
+
+  ExprNode::Ptr expr_;
+  explicit ExprStmtNode(ExprNode::Ptr expr) : expr_(move(expr)) {}
+};
+
+class IfStmtNode : public StmtNode {
+ public:
+  DECLARE(IfStmtNode)
+
+  ExprNode::Ptr cond_;
+  StmtNode::Ptr true_block_;
+  StmtNode::Ptr false_block_;
+  // create an if () {} expression
+  IfStmtNode(ExprNode::Ptr cond, StmtNode::Ptr true_block)
+      : cond_(move(cond)), true_block_(move(true_block)) {}
+  // create an if () {} else {} expression
+  IfStmtNode(ExprNode::Ptr cond, StmtNode::Ptr true_block, StmtNode::Ptr false_block)
+      : cond_(move(cond)), true_block_(move(true_block)),
+      false_block_(move(false_block)) {}
+};
+
+class OnValidStmtNode : public StmtNode {
+ public:
+  DECLARE(OnValidStmtNode)
+
+  IdentExprNode::Ptr cond_;
+  StmtNode::Ptr block_;
+  StmtNode::Ptr else_block_;
+  // create an onvalid () {} expression
+  OnValidStmtNode(IdentExprNode::Ptr cond, StmtNode::Ptr block)
+      : cond_(move(cond)), block_(move(block)) {}
+  // create an onvalid () {} else {} expression
+  OnValidStmtNode(IdentExprNode::Ptr cond, StmtNode::Ptr block, StmtNode::Ptr else_block)
+      : cond_(move(cond)), block_(move(block)),
+      else_block_(move(else_block)) {}
+};
+
+class SwitchStmtNode : public StmtNode {
+ public:
+  DECLARE(SwitchStmtNode)
+  ExprNode::Ptr cond_;
+  BlockStmtNode::Ptr block_;
+  SwitchStmtNode(ExprNode::Ptr cond, BlockStmtNode::Ptr block)
+      : cond_(move(cond)), block_(move(block)) {}
+};
+
+class CaseStmtNode : public StmtNode {
+ public:
+  DECLARE(CaseStmtNode)
+  IntegerExprNode::Ptr value_;
+  BlockStmtNode::Ptr block_;
+  CaseStmtNode(IntegerExprNode::Ptr value, BlockStmtNode::Ptr block)
+      : value_(move(value)), block_(move(block)) {}
+  explicit CaseStmtNode(BlockStmtNode::Ptr block) : block_(move(block)) {}
+};
+
+class VariableDeclStmtNode : public StmtNode {
+ public:
+  typedef unique_ptr<VariableDeclStmtNode> Ptr;
+  virtual StatusTuple accept(Visitor* v) = 0;
+  enum storage_type { INTEGER, STRUCT, STRUCT_REFERENCE };
+
+  IdentExprNode::Ptr id_;
+  ExprNodeList init_;
+  enum storage_type storage_type_;
+  size_t bit_width_;
+  size_t bit_offset_;
+  int slot_;
+  string scope_id_;
+  explicit VariableDeclStmtNode(IdentExprNode::Ptr id, storage_type t, size_t bit_width = 0, size_t bit_offset = 0)
+      : id_(move(id)), storage_type_(t), bit_width_(bit_width), bit_offset_(bit_offset), slot_(0) {}
+  const char* scope_id() const { return scope_id_.c_str(); }
+  bool is_struct() { return (storage_type_ == STRUCT || storage_type_ == STRUCT_REFERENCE); }
+  bool is_pointer() { return (storage_type_ == STRUCT_REFERENCE); }
+};
+
+typedef vector<VariableDeclStmtNode::Ptr> FormalList;
+
+class StructVariableDeclStmtNode : public VariableDeclStmtNode {
+ public:
+  DECLARE(StructVariableDeclStmtNode)
+
+  IdentExprNode::Ptr struct_id_;
+  StructVariableDeclStmtNode(IdentExprNode::Ptr struct_id, IdentExprNode::Ptr id,
+                             VariableDeclStmtNode::storage_type t = VariableDeclStmtNode::STRUCT)
+      : VariableDeclStmtNode(move(id), t), struct_id_(move(struct_id)) {}
+};
+
+class IntegerVariableDeclStmtNode : public VariableDeclStmtNode {
+ public:
+  DECLARE(IntegerVariableDeclStmtNode)
+
+  IntegerVariableDeclStmtNode(IdentExprNode::Ptr id, const string& bits)
+      : VariableDeclStmtNode(move(id), VariableDeclStmtNode::INTEGER, strtoul(bits.c_str(), NULL, 0)) {}
+};
+
+class StructDeclStmtNode : public StmtNode {
+ public:
+  DECLARE(StructDeclStmtNode)
+
+  IdentExprNode::Ptr id_;
+  FormalList stmts_;
+  size_t bit_width_;
+  bool packed_;
+  StructDeclStmtNode(IdentExprNode::Ptr id, FormalList&& stmts = FormalList())
+      : id_(move(id)), stmts_(move(stmts)), bit_width_(0), packed_(false) {}
+  VariableDeclStmtNode* field(const string& name) const;
+  int indexof(const string& name) const;
+  bool is_packed() const { return packed_; }
+};
+
+class ParserStateStmtNode : public StmtNode {
+ public:
+  DECLARE(ParserStateStmtNode)
+
+  IdentExprNode::Ptr id_;
+  StmtNode* next_state_;
+  string scope_id_;
+  explicit ParserStateStmtNode(IdentExprNode::Ptr id)
+      : id_(move(id)), next_state_(nullptr) {}
+  static Ptr make(const IdentExprNode::Ptr& id) {
+    return Ptr(new ParserStateStmtNode(id->copy()));
+  }
+  string scoped_name() const { return scope_id_ + id_->name_; }
+};
+
+class StateDeclStmtNode : public StmtNode {
+ public:
+  DECLARE(StateDeclStmtNode)
+
+  struct Sub {
+    IdentExprNode::Ptr id_;
+    BlockStmtNode::Ptr block_;
+    ParserStateStmtNode::Ptr parser_;
+    Scopes::StateScope* scope_;
+    Sub(decltype(id_) id, decltype(block_) block, decltype(parser_) parser, decltype(scope_) scope)
+        : id_(move(id)), block_(move(block)), parser_(move(parser)), scope_(scope) {}
+    ~Sub() { delete scope_; }
+    Sub(Sub&& other) : scope_(NULL) {
+      *this = move(other);
+    }
+    Sub& operator=(Sub&& other) {
+      if (this == &other) {
+        return *this;
+      }
+      id_ = move(other.id_);
+      block_ = move(other.block_);
+      parser_ = move(other.parser_);
+      std::swap(scope_, other.scope_);
+      return *this;
+    }
+  };
+
+  IdentExprNode::Ptr id_;
+  StmtNodeList init_;
+  string scope_id_;
+  ParserStateStmtNode::Ptr parser_;
+  vector<Sub> subs_;
+  StateDeclStmtNode() {}
+  StateDeclStmtNode(IdentExprNode::Ptr id, BlockStmtNode::Ptr block) : id_(move(id)) {
+    subs_.push_back(Sub(make_unique<IdentExprNode>(""), move(block), ParserStateStmtNode::Ptr(), NULL));
+  }
+  StateDeclStmtNode(IdentExprNode::Ptr id1, IdentExprNode::Ptr id2, BlockStmtNode::Ptr block)
+      : id_(move(id1)) {
+    subs_.push_back(Sub(move(id2), move(block), ParserStateStmtNode::Ptr(), NULL));
+  }
+  string scoped_name() const { return scope_id_ + id_->name_; }
+  vector<Sub>::iterator find_sub(const string& id) {
+    return find_if(subs_.begin(), subs_.end(), [&id] (const Sub& sub) {
+      if (sub.id_->name_ == id)
+        return true;
+      return false;
+    });
+
+  }
+};
+
+class MatchDeclStmtNode : public StmtNode {
+ public:
+  DECLARE(MatchDeclStmtNode)
+
+  IdentExprNode::Ptr id_;
+  FormalList formals_;
+  BlockStmtNode::Ptr block_;
+  MatchDeclStmtNode(IdentExprNode::Ptr id, FormalList&& formals, BlockStmtNode::Ptr block)
+      : id_(move(id)), formals_(move(formals)), block_(move(block)) {}
+};
+
+class MissDeclStmtNode : public StmtNode {
+ public:
+  DECLARE(MissDeclStmtNode)
+
+  IdentExprNode::Ptr id_;
+  FormalList formals_;
+  BlockStmtNode::Ptr block_;
+  MissDeclStmtNode(IdentExprNode::Ptr id, FormalList&& formals, BlockStmtNode::Ptr block)
+      : id_(move(id)), formals_(move(formals)), block_(move(block)) {}
+};
+
+class FailureDeclStmtNode : public StmtNode {
+ public:
+  DECLARE(FailureDeclStmtNode)
+
+  IdentExprNode::Ptr id_;
+  FormalList formals_;
+  BlockStmtNode::Ptr block_;
+  FailureDeclStmtNode(IdentExprNode::Ptr id, FormalList&& formals, BlockStmtNode::Ptr block)
+      : id_(move(id)), formals_(move(formals)), block_(move(block)) {}
+};
+
+class TableDeclStmtNode : public StmtNode {
+ public:
+  DECLARE(TableDeclStmtNode)
+
+  IdentExprNode::Ptr table_type_;
+  IdentExprNodeList templates_;
+  IdentExprNode::Ptr id_;
+  StructDeclStmtNode *key_type_;
+  StructDeclStmtNode *leaf_type_;
+  IdentExprNode * key_id() { return templates_.at(0).get(); }
+  IdentExprNode * leaf_id() { return templates_.at(1).get(); }
+  IdentExprNode * type_id() { return templates_.at(2).get(); }
+  IdentExprNode * policy_id() { return templates_.at(3).get(); }
+  size_t size_;
+  TableDeclStmtNode(IdentExprNode::Ptr table_type, IdentExprNodeList&& templates,
+                    IdentExprNode::Ptr id, string* size)
+      : table_type_(move(table_type)), templates_(move(templates)), id_(move(id)),
+      key_type_(nullptr), leaf_type_(nullptr), size_(strtoul(size->c_str(), NULL, 0)) {
+    delete size;
+  }
+};
+
+class FuncDeclStmtNode : public StmtNode {
+ public:
+  DECLARE(FuncDeclStmtNode)
+
+  IdentExprNode::Ptr id_;
+  FormalList formals_;
+  BlockStmtNode::Ptr block_;
+  Scopes::StateScope* scope_;
+  FuncDeclStmtNode(IdentExprNode::Ptr id, FormalList&& formals, BlockStmtNode::Ptr block)
+      : id_(move(id)), formals_(move(formals)), block_(move(block)), scope_(NULL) {}
+};
+
+class Visitor {
+ public:
+  typedef StatusTuple Ret;
+  virtual ~Visitor() {}
+#define VISIT(type, func) virtual STATUS_RETURN visit_##func(type* n) = 0;
+  EXPAND_NODES(VISIT)
+#undef VISIT
+};
+
+#undef DECLARE
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/parser.cc b/src/cc/frontends/b/parser.cc
new file mode 100644
index 0000000..8a5e149
--- /dev/null
+++ b/src/cc/frontends/b/parser.cc
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <algorithm>
+#include "bcc_exception.h"
+#include "parser.h"
+#include "type_helper.h"
+
+namespace ebpf {
+namespace cc {
+
+using std::find;
+using std::move;
+using std::string;
+using std::unique_ptr;
+
+// True when decl's name is already bound in the current variable scope
+// (local lookup only).
+bool Parser::variable_exists(VariableDeclStmtNode *decl) const {
+  return scopes_->current_var()->lookup(decl->id_->name_, SCOPE_LOCAL) != NULL;
+}
+
+// Register a variable declaration in the current variable scope.
+// `types` is accepted for signature parity with the initializer overload but
+// is not consulted here.  On success the decl is tagged with a unique
+// "v<scope-id>_" prefix and returned; on redeclaration an error is printed
+// and nullptr is returned (the decl is not registered).
+VariableDeclStmtNode *Parser::variable_add(vector<int> *types, VariableDeclStmtNode *decl) {
+  if (variable_exists(decl)) {
+    // Fix: terminate the diagnostic with a newline so it is not glued to the
+    // next message on stderr (matches the other diagnostics in this file).
+    fprintf(stderr, "redeclaration of variable %s\n", decl->id_->name_.c_str());
+    return nullptr;
+  }
+  decl->scope_id_ = string("v") + std::to_string(scopes_->current_var()->id_) + string("_");
+  scopes_->current_var()->add(decl->id_->name_, decl);
+  return decl;
+}
+
+// Register a variable declaration with an initializer expression.  The
+// initializer is wrapped in an AssignExprNode (id = init_expr) and attached
+// to the decl before the redeclaration check, so it is owned by the decl in
+// either case.  Returns the decl on success, nullptr on redeclaration.
+VariableDeclStmtNode *Parser::variable_add(vector<int> *types, VariableDeclStmtNode *decl, ExprNode *init_expr) {
+  AssignExprNode::Ptr assign(new AssignExprNode(decl->id_->copy(), ExprNode::Ptr(init_expr)));
+  decl->init_.push_back(move(assign));
+
+  if (variable_exists(decl)) {
+    // Fix: terminate the diagnostic with a newline so it is not glued to the
+    // next message on stderr (matches the other diagnostics in this file).
+    fprintf(stderr, "redeclaration of variable %s\n", decl->id_->name_.c_str());
+    return nullptr;
+  }
+  decl->scope_id_ = string("v") + std::to_string(scopes_->current_var()->id_) + string("_");
+  scopes_->current_var()->add(decl->id_->name_, decl);
+  return decl;
+}
+
+// Register a struct variable declaration initialized with designated
+// key = value arguments.  Only the key/value form is supported; positional
+// initializers are rejected with an error.
+StructVariableDeclStmtNode *Parser::variable_add(StructVariableDeclStmtNode *decl, ExprNodeList *args, bool is_kv) {
+  if (is_kv) {
+    // annotate the init expressions with the declared id
+    for (auto arg = args->begin(); arg != args->end(); ++arg) {
+      // decorate with the name of this decl
+      auto n = static_cast<AssignExprNode *>(arg->get());
+      auto id = static_cast<IdentExprNode *>(n->lhs_.get());
+      id->prepend_dot(decl->id_->name_);
+    }
+  } else {
+    fprintf(stderr, "must use key = value syntax\n");
+    return NULL;
+  }
+
+  // Take ownership of the initializer list; the container itself is freed.
+  decl->init_ = move(*args);
+  delete args;
+
+  if (variable_exists(decl)) {
+    // NOTE(review): the message says "warning" but the call still fails by
+    // returning nullptr — message text and behavior disagree; confirm intent.
+    fprintf(stderr, "ccpg: warning: redeclaration of variable '%s'\n", decl->id_->name_.c_str());
+    return nullptr;
+  }
+  decl->scope_id_ = string("v") + std::to_string(scopes_->current_var()->id_) + string("_");
+  scopes_->current_var()->add(decl->id_->name_, decl);
+  return decl;
+}
+
+// Register a single-subject state declaration in the current state scope.
+// Returns the new StateDeclStmtNode, or NULL on redeclaration.
+StmtNode *Parser::state_add(Scopes::StateScope *scope, IdentExprNode *id, BlockStmtNode *body) {
+  if (scopes_->current_state()->lookup(id->full_name(), SCOPE_LOCAL)) {
+    fprintf(stderr, "redeclaration of state %s\n", id->full_name().c_str());
+    // redeclaration
+    return NULL;
+  }
+  auto state = new StateDeclStmtNode(IdentExprNode::Ptr(id), BlockStmtNode::Ptr(body));
+  // add a reference to the lower scope
+  state->subs_[0].scope_ = scope;
+
+  // add me to the upper scope
+  scopes_->current_state()->add(state->id_->full_name(), state);
+  state->scope_id_ = string("s") + std::to_string(scopes_->current_state()->id_) + string("_");
+
+  return state;
+}
+
+// Register a two-subject state declaration ("state id1, id2 { ... }").
+// If id1's state does not exist yet it is created with id2 as its first sub;
+// otherwise id2 is appended as a new sub of the existing state and a stub
+// node is returned so the caller still gets a valid statement.  Returns NULL
+// when (id1, id2) is already declared.
+StmtNode *Parser::state_add(Scopes::StateScope *scope, IdentExprNode *id1, IdentExprNode *id2, BlockStmtNode *body) {
+  auto state = scopes_->current_state()->lookup(id1->full_name(), SCOPE_LOCAL);
+  if (!state) {
+    state = new StateDeclStmtNode(IdentExprNode::Ptr(id1), IdentExprNode::Ptr(id2), BlockStmtNode::Ptr(body));
+    // add a reference to the lower scope
+    state->subs_[0].scope_ = scope;
+
+    // add me to the upper scope
+    scopes_->current_state()->add(state->id_->full_name(), state);
+    state->scope_id_ = string("s") + std::to_string(scopes_->current_state()->id_) + string("_");
+    return state;
+  } else {
+    if (state->find_sub(id2->name_) != state->subs_.end()) {
+      fprintf(stderr, "redeclaration of state %s, %s\n", id1->full_name().c_str(), id2->full_name().c_str());
+      return NULL;
+    }
+    state->subs_.push_back(StateDeclStmtNode::Sub(IdentExprNode::Ptr(id2), BlockStmtNode::Ptr(body),
+                                                  ParserStateStmtNode::Ptr(), scope));
+    // id1 was only used for the lookup; the existing state owns its own id.
+    delete id1;
+
+    return new StateDeclStmtNode(); // stub
+  }
+}
+
+// True when a table with decl's name is already registered in the top-level
+// table scope.
+bool Parser::table_exists(TableDeclStmtNode *decl, bool search_local) {
+  return scopes_->top_table()->lookup(decl->id_->name_, search_local) != NULL;
+}
+
+// Build a TableDeclStmtNode and register it in the top-level table scope.
+// NOTE(review): on redeclaration an error is printed but the duplicate node
+// is still returned (just never added to the scope), so callers cannot
+// distinguish the failure — unlike state_add/variable_add which return NULL.
+StmtNode *Parser::table_add(IdentExprNode *type, IdentExprNodeList *templates,
+                            IdentExprNode *id, string *size) {
+  auto table = new TableDeclStmtNode(IdentExprNode::Ptr(type),
+                                     move(*templates),
+                                     IdentExprNode::Ptr(id), size);
+  if (table_exists(table, true)) {
+    fprintf(stderr, "redeclaration of table %s\n", id->name_.c_str());
+    return table;
+  }
+  scopes_->top_table()->add(id->name_, table);
+  return table;
+}
+
+// Build a StructDeclStmtNode from a formal list, lay out its fields, and
+// register it in the top-level struct scope.  Layout assigns each field a
+// slot index and a bit offset; unless "#pragma packed true" was seen, every
+// offset is aligned to the field's natural width and the total struct width
+// is rounded up to a 32-bit multiple.
+StmtNode * Parser::struct_add(IdentExprNode *type, FormalList *formals) {
+  auto struct_decl = new StructDeclStmtNode(IdentExprNode::Ptr(type), move(*formals));
+  if (scopes_->top_struct()->lookup(type->name_, SCOPE_LOCAL) != NULL) {
+    // NOTE(review): like table_add, the duplicate node is returned rather
+    // than NULL, so the caller cannot detect the redeclaration.
+    fprintf(stderr, "redeclaration of struct %s\n", type->name_.c_str());
+    return struct_decl;
+  }
+
+  auto pr_it = pragmas_.find("packed");
+  if (pr_it != pragmas_.end() && pr_it->second == "true")
+    struct_decl->packed_ = true;
+
+  int i = 0;
+  size_t offset = 0;
+  for (auto it = struct_decl->stmts_.begin(); it != struct_decl->stmts_.end(); ++it, ++i) {
+    FieldType ft = bits_to_enum((*it)->bit_width_);
+    offset = struct_decl->is_packed() ? offset : align_offset(offset, ft);
+    (*it)->slot_ = i;
+    (*it)->bit_offset_ = offset;
+    offset += (*it)->bit_width_;
+  }
+  struct_decl->bit_width_ = struct_decl->is_packed() ? offset : align_offset(offset, UINT32_T);
+
+  scopes_->top_struct()->add(type->name_, struct_decl);
+  return struct_decl;
+}
+
+// Build the table-result handler node matching `token`: TMATCH, TMISS and
+// TFAILURE map to Match/Miss/FailureDeclStmtNode respectively.  Any other
+// token yields NULL (callers treat that as a parse error via YYERROR).
+StmtNode * Parser::result_add(int token, IdentExprNode *id, FormalList *formals, BlockStmtNode *body) {
+  StmtNode *stmt = NULL;
+  switch (token) {
+    case Tok::TMATCH:
+      stmt = new MatchDeclStmtNode(IdentExprNode::Ptr(id), move(*formals), BlockStmtNode::Ptr(body));
+      break;
+    case Tok::TMISS:
+      stmt = new MissDeclStmtNode(IdentExprNode::Ptr(id), move(*formals), BlockStmtNode::Ptr(body));
+      break;
+    case Tok::TFAILURE:
+      stmt = new FailureDeclStmtNode(IdentExprNode::Ptr(id), move(*formals), BlockStmtNode::Ptr(body));
+      break;
+    default:
+      {}
+  }
+  return stmt;
+}
+
+// Build a FuncDeclStmtNode, register its formals inside the function's own
+// variable scope, and add it to the top-level function scope.  Returns the
+// decl on success; nullptr if a formal fails to register.
+// NOTE(review): on redeclaration the duplicate decl is returned without
+// being registered — inconsistent with the nullptr convention used above.
+StmtNode * Parser::func_add(vector<int> *types, Scopes::StateScope *scope,
+                            IdentExprNode *id, FormalList *formals, BlockStmtNode *body) {
+  auto decl = new FuncDeclStmtNode(IdentExprNode::Ptr(id), move(*formals), BlockStmtNode::Ptr(body));
+  if (scopes_->top_func()->lookup(decl->id_->name_, SCOPE_LOCAL)) {
+    fprintf(stderr, "redeclaration of func %s\n", id->name_.c_str());
+    return decl;
+  }
+  // Temporarily switch to the function's scope so the formals are bound
+  // there rather than in the enclosing scope.
+  auto cur_scope = scopes_->current_var();
+  scopes_->set_current(scope);
+  for (auto it = formals->begin(); it != formals->end(); ++it)
+    if (!variable_add(nullptr, it->get())) {
+      delete decl;
+      return nullptr;
+    }
+  scopes_->set_current(cur_scope);
+  decl->scope_ = scope;
+  scopes_->top_func()->add(id->name_, decl);
+  return decl;
+}
+
+// Copy the Bison location (start line/column) and the matched source text
+// onto an AST node, for later error reporting.
+void Parser::set_loc(Node *n, const BisonParser::location_type &loc) const {
+  n->line_ = loc.begin.line;
+  n->column_ = loc.begin.column;
+  n->text_ = lexer.text(loc);
+}
+
+// Look up a pragma value recorded during parsing; unknown names fall back to
+// the default value "main".
+string Parser::pragma(const string &name) const {
+  auto it = pragmas_.find(name);
+  return it != pragmas_.end() ? it->second : string("main");
+}
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/parser.h b/src/cc/frontends/b/parser.h
new file mode 100644
index 0000000..21338b5
--- /dev/null
+++ b/src/cc/frontends/b/parser.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <fstream> // NOLINT
+#include "node.h"
+#include "lexer.h"
+#include "scope.h"
+
+namespace ebpf {
+namespace cc {
+
+using std::pair;
+using std::string;
+using std::vector;
+
+// Driver for the B-language frontend: owns the lexer, the generated Bison
+// parser, and the scope tables, and exposes the semantic-action helpers the
+// grammar calls (variable_add, state_add, table_add, ...).
+class Parser {
+ public:
+  explicit Parser(const string& infile)
+      : root_node_(NULL), scopes_(new Scopes), in_(infile), lexer(&in_), parser(lexer, *this) {
+    // parser.set_debug_level(1);
+  }
+  ~Parser() { delete root_node_; }
+  // Run the Bison parser over the input file; returns its status code.
+  int parse() {
+    return parser.parse();
+  }
+
+  // Semantic-action helpers invoked from parser.yy.  Most return nullptr/NULL
+  // on redeclaration so the grammar can raise YYERROR.
+  VariableDeclStmtNode * variable_add(vector<int> *types, VariableDeclStmtNode *decl);
+  VariableDeclStmtNode * variable_add(vector<int> *types, VariableDeclStmtNode *decl, ExprNode *init_expr);
+  StructVariableDeclStmtNode * variable_add(StructVariableDeclStmtNode *decl, ExprNodeList *args, bool is_kv);
+  StmtNode * state_add(Scopes::StateScope *scope, IdentExprNode *id1, BlockStmtNode *body);
+  StmtNode * state_add(Scopes::StateScope *scope, IdentExprNode *id1, IdentExprNode *id2, BlockStmtNode *body);
+  StmtNode * func_add(std::vector<int> *types, Scopes::StateScope *scope,
+                      IdentExprNode *id, FormalList *formals, BlockStmtNode *body);
+  StmtNode * table_add(IdentExprNode *type, IdentExprNodeList *templates, IdentExprNode *id, string *size);
+  StmtNode * struct_add(IdentExprNode *type, FormalList *formals);
+  StmtNode * result_add(int token, IdentExprNode *id, FormalList *formals, BlockStmtNode *body);
+  bool variable_exists(VariableDeclStmtNode *decl) const;
+  bool table_exists(TableDeclStmtNode *decl, bool search_local = true);
+  void add_pragma(const std::string& pr, const std::string& v) { pragmas_[pr] = v; }
+  // Stamp line/column/text from a Bison location onto an AST node.
+  void set_loc(Node *n, const BisonParser::location_type &loc) const;
+  // Pragma lookup; returns "main" for unknown names.
+  std::string pragma(const std::string &name) const;
+
+  Node *root_node_;
+  Scopes::Ptr scopes_;
+  std::map<std::string, std::string> pragmas_;
+ private:
+  std::ifstream in_;
+  Lexer lexer;
+  BisonParser parser;
+};
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/parser.yy b/src/cc/frontends/b/parser.yy
new file mode 100644
index 0000000..527e84f
--- /dev/null
+++ b/src/cc/frontends/b/parser.yy
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+%skeleton "lalr1.cc"
+%defines
+%define namespace "ebpf::cc"
+%define parser_class_name "BisonParser"
+%parse-param { ebpf::cc::Lexer &lexer }
+%parse-param { ebpf::cc::Parser &parser }
+%lex-param { ebpf::cc::Lexer &lexer }
+%locations
+
+%code requires {
+    #include <memory>
+    #include <vector>
+    #include <string>
+    #include "node.h"
+    // forward declaration
+    namespace ebpf { namespace cc {
+        class Lexer;
+        class Parser;
+    } }
+}
+
+%code {
+    static int yylex(ebpf::cc::BisonParser::semantic_type *yylval,
+                     ebpf::cc::BisonParser::location_type *yylloc,
+                     ebpf::cc::Lexer &lexer);
+}
+
+%{
+    #include "node.h"
+    #include "parser.h"
+    using std::unique_ptr;
+    using std::vector;
+    using std::string;
+    using std::move;
+%}
+
+%union {
+    Scopes::StateScope *state_scope;
+    Scopes::VarScope *var_scope;
+    BlockStmtNode *block;
+    ExprNode *expr;
+    MethodCallExprNode *call;
+    StmtNode *stmt;
+    IdentExprNode *ident;
+    IntegerExprNode *numeric;
+    BitopExprNode *bitop;
+    ExprNodeList *args;
+    IdentExprNodeList *ident_args;
+    StmtNodeList *stmts;
+    FormalList *formals;
+    VariableDeclStmtNode *decl;
+    StructVariableDeclStmtNode *type_decl;
+    TableIndexExprNode *table_index;
+    std::vector<int> *type_specifiers;
+    std::string* string;
+    int token;
+}
+
+/* Define the terminal symbols. */
+%token <string> TIDENTIFIER TINTEGER THEXINTEGER TPRAGMA TSTRING
+%token <token> TU8 TU16 TU32 TU64
+%token <token> TEQUAL TCEQ TCNE TCLT TCLE TCGT TCGE TAND TOR
+%token <token> TLPAREN TRPAREN TLBRACE TRBRACE TLBRACK TRBRACK
+%token <token> TDOT TARROW TCOMMA TPLUS TMINUS TMUL TDIV TMOD TXOR TDOLLAR TCOLON TSCOPE TNOT TSEMI TCMPL TLAND TLOR
+%token <token> TSTRUCT TSTATE TFUNC TGOTO TCONTINUE TNEXT TTRUE TFALSE TRETURN
+%token <token> TIF TELSE TSWITCH TCASE
+%token <token> TMATCH TMISS TFAILURE TVALID
+%token <token> TAT
+
+/* Define non-terminal symbols as defined in the above union */
+%type <ident> ident scoped_ident dotted_ident any_ident
+%type <expr> expr assign_expr return_expr init_arg_kv
+%type <numeric> numeric
+%type <bitop> bitop
+%type <args> call_args /*init_args*/ init_args_kv
+%type <ident_args> table_decl_args
+%type <formals> struct_decl_stmts formals
+%type <block> program block prog_decls
+%type <decl> decl_stmt int_decl ref_stmt
+%type <type_decl> type_decl ptr_decl
+%type <stmt> stmt prog_decl var_decl struct_decl state_decl func_decl
+%type <stmt> table_decl table_result_stmt if_stmt switch_stmt case_stmt onvalid_stmt
+%type <var_scope> enter_varscope exit_varscope
+%type <state_scope> enter_statescope exit_statescope
+%type <stmts> stmts table_result_stmts case_stmts
+%type <call> call_expr
+%type <table_index> table_index_expr
+%type <type_specifiers> type_specifiers
+%type <stmt> pragma_decl
+%type <token> type_specifier
+
+/* taken from C++ operator precedence wiki page */
+%nonassoc TSCOPE
+%left TDOT TLBRACK TLBRACE TLPAREN TINCR TDECR
+%right TNOT TCMPL
+%left TMUL
+%left TDIV
+%left TMOD
+%left TPLUS
+%left TMINUS
+%left TCLT TCLE TCGT TCGE
+%left TCEQ
+%left TCNE
+%left TXOR
+%left TAND
+%left TOR
+%left TLAND
+%left TLOR
+%right TEQUAL
+
+%start program
+
+%%
+
+program
+  : enter_statescope enter_varscope prog_decls exit_varscope exit_statescope
+    { parser.root_node_ = $3; $3->scope_ = $2; }
+  ;
+
+/* program is a list of declarations */
+prog_decls
+  : prog_decl
+    { $$ = new BlockStmtNode; $$->stmts_.push_back(StmtNode::Ptr($1)); }
+  | prog_decls prog_decl
+    { $1->stmts_.push_back(StmtNode::Ptr($2)); }
+  ;
+
+/*
+ possible program declarations are:
+  "struct {}"
+  "state|on_miss|on_match|on_valid {}"
+  "var <var_decl>"
+  "Table <...> <ident>(size)"
+ */
+prog_decl
+  : var_decl TSEMI
+  | struct_decl TSEMI
+  | state_decl
+  | table_decl TSEMI
+  | pragma_decl
+  | func_decl
+  ;
+
+pragma_decl
+  : TPRAGMA TIDENTIFIER TIDENTIFIER
+    { $$ = new BlockStmtNode; parser.add_pragma(*$2, *$3); delete $2; delete $3; }
+  | TPRAGMA TIDENTIFIER TSTRING
+    { $$ = new BlockStmtNode; parser.add_pragma(*$2, *$3); delete $2; delete $3; }
+  ;
+
+stmts
+  : stmt
+    { $$ = new StmtNodeList; $$->push_back(StmtNode::Ptr($1)); }
+  | stmts stmt
+    { $1->push_back(StmtNode::Ptr($2)); }
+  ;
+
+stmt
+  : expr TSEMI
+    { $$ = new ExprStmtNode(ExprNode::Ptr($1));
+      parser.set_loc($$, @$); }
+  | assign_expr TSEMI
+    { $$ = new ExprStmtNode(ExprNode::Ptr($1));
+      parser.set_loc($$, @$); }
+  | return_expr TSEMI
+    { $$ = new ExprStmtNode(ExprNode::Ptr($1));
+      parser.set_loc($$, @$); }
+  | call_expr TLBRACE enter_varscope table_result_stmts exit_varscope TRBRACE TSEMI
+    { $$ = new ExprStmtNode(ExprNode::Ptr($1));
+      $1->block_->stmts_ = move(*$4); delete $4;
+      $1->block_->scope_ = $3;
+      parser.set_loc($$, @$); }
+  | call_expr TLBRACE TRBRACE TSEMI  // support empty curly braces
+    { $$ = new ExprStmtNode(ExprNode::Ptr($1));
+      parser.set_loc($$, @$); }
+  | if_stmt
+  | switch_stmt
+  | var_decl TSEMI
+    { $$ = $1; }
+  | state_decl
+  | onvalid_stmt
+  ;
+
+call_expr
+  : any_ident TLPAREN call_args TRPAREN
+    { $$ = new MethodCallExprNode(IdentExprNode::Ptr($1), move(*$3), lexer.lineno()); delete $3;
+      parser.set_loc($$, @$); }
+  ;
+
+block
+  : TLBRACE stmts TRBRACE
+    { $$ = new BlockStmtNode; $$->stmts_ = move(*$2); delete $2;
+      parser.set_loc($$, @$); }
+  | TLBRACE TRBRACE
+    { $$ = new BlockStmtNode;
+      parser.set_loc($$, @$); }
+  ;
+
+enter_varscope : /* empty */ { $$ = parser.scopes_->enter_var_scope(); } ;
+exit_varscope : /* empty */ { $$ = parser.scopes_->exit_var_scope(); } ;
+enter_statescope : /* empty */ { $$ = parser.scopes_->enter_state_scope(); } ;
+exit_statescope : /* empty */ { $$ = parser.scopes_->exit_state_scope(); } ;
+
+struct_decl
+  : TSTRUCT ident TLBRACE struct_decl_stmts TRBRACE
+    { $$ = parser.struct_add($2, $4); delete $4;
+      parser.set_loc($$, @$); }
+  ;
+
+struct_decl_stmts
+  : type_specifiers decl_stmt TSEMI
+    { $$ = new FormalList; $$->push_back(VariableDeclStmtNode::Ptr($2)); }
+  | struct_decl_stmts type_specifiers decl_stmt TSEMI
+    { $1->push_back(VariableDeclStmtNode::Ptr($3)); }
+  ;
+
+table_decl
+  : ident TCLT table_decl_args TCGT ident TLPAREN TINTEGER TRPAREN
+    { $$ = parser.table_add($1, $3, $5, $7); delete $3;
+      parser.set_loc($$, @$); }
+  ;
+
+table_decl_args
+  : ident
+    { $$ = new IdentExprNodeList; $$->push_back(IdentExprNode::Ptr($1)); }
+  | table_decl_args TCOMMA ident
+    { $$->push_back(IdentExprNode::Ptr($3)); }
+  ;
+
+state_decl
+  : TSTATE scoped_ident enter_statescope enter_varscope block exit_varscope exit_statescope
+    { $$ = parser.state_add($3, $2, $5); $5->scope_ = $4;
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  | TSTATE scoped_ident TCOMMA TMUL enter_statescope enter_varscope block exit_varscope exit_statescope
+    { $$ = parser.state_add($5, $2, new IdentExprNode(""), $7); $7->scope_ = $6;
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  | TSTATE scoped_ident TCOMMA scoped_ident enter_statescope enter_varscope block exit_varscope exit_statescope
+    { $$ = parser.state_add($5, $2, $4, $7); $7->scope_ = $6;
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  ;
+
+func_decl
+  : type_specifiers ident enter_statescope enter_varscope TLPAREN formals TRPAREN block exit_varscope exit_statescope
+    { $$ = parser.func_add($1, $3, $2, $6, $8); $8->scope_ = $4;
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  ;
+
+table_result_stmts
+  : table_result_stmt
+    { $$ = new StmtNodeList; $$->push_back(StmtNode::Ptr($1)); }
+  | table_result_stmts table_result_stmt
+    { $$->push_back(StmtNode::Ptr($2)); }
+  ;
+
+table_result_stmt
+  : TMATCH ident enter_varscope TLPAREN formals TRPAREN block exit_varscope TSEMI
+    { $$ = parser.result_add($1, $2, $5, $7); delete $5; $7->scope_ = $3;
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  | TMISS ident enter_varscope TLPAREN TRPAREN block exit_varscope TSEMI
+    { $$ = parser.result_add($1, $2, new FormalList, $6); $6->scope_ = $3;
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  | TFAILURE ident enter_varscope TLPAREN formals TRPAREN block exit_varscope TSEMI
+    { $$ = parser.result_add($1, $2, $5, $7); delete $5; $7->scope_ = $3;
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  ;
+
+formals
+  : TSTRUCT ptr_decl
+    { $$ = new FormalList; $$->push_back(VariableDeclStmtNode::Ptr(parser.variable_add(nullptr, $2))); }
+  | formals TCOMMA TSTRUCT ptr_decl
+    { $1->push_back(VariableDeclStmtNode::Ptr(parser.variable_add(nullptr, $4))); }
+  ;
+
+type_specifier
+  : TU8
+  | TU16
+  | TU32
+  | TU64
+  ;
+
+type_specifiers
+  : type_specifier { $$ = new std::vector<int>; $$->push_back($1); }
+  | type_specifiers type_specifier { $$->push_back($2); }
+  ;
+
+var_decl
+  : type_specifiers decl_stmt
+    { $$ = parser.variable_add($1, $2);
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  | type_specifiers int_decl TEQUAL expr
+    { $$ = parser.variable_add($1, $2, $4);
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  | TSTRUCT type_decl TEQUAL TLBRACE init_args_kv TRBRACE
+    { $$ = parser.variable_add($2, $5, true);
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  /*| TSTRUCT type_decl TEQUAL TLBRACE init_args TRBRACE
+    { $$ = parser.variable_add($2, $5, false);
+      parser.set_loc($$, @$); }*/
+  | TSTRUCT ref_stmt
+    { $$ = parser.variable_add(nullptr, $2);
+      if (!$$) YYERROR;
+      parser.set_loc($$, @$); }
+  ;
+
+/* "id":"bitsize" or "type" "id" */
+decl_stmt : int_decl { $$ = $1; } | type_decl { $$ = $1; };
+int_decl : ident TCOLON TINTEGER
+    { $$ = new IntegerVariableDeclStmtNode(IdentExprNode::Ptr($1), *$3); delete $3;
+      parser.set_loc($$, @$); }
+  ;
+
+type_decl : scoped_ident ident
+    { $$ = new StructVariableDeclStmtNode(IdentExprNode::Ptr($1), IdentExprNode::Ptr($2));
+      parser.set_loc($$, @$); }
+  ;
+
+/* "type" "*" "id" */
+ref_stmt : ptr_decl { $$ = $1; };
+ptr_decl : scoped_ident TMUL ident
+    { $$ = new StructVariableDeclStmtNode(IdentExprNode::Ptr($1), IdentExprNode::Ptr($3),
+                                          VariableDeclStmtNode::STRUCT_REFERENCE);
+      parser.set_loc($$, @$); }
+  ;
+
+/* normal initializer */
+/* init_args
+  : expr { $$ = new ExprNodeList; $$->push_back(ExprNode::Ptr($1)); }
+  | init_args TCOMMA expr { $$->push_back(ExprNode::Ptr($3)); }
+  ;*/
+
+/* one or more of "field" = "expr" */
+init_args_kv
+  : init_arg_kv { $$ = new ExprNodeList; $$->push_back(ExprNode::Ptr($1)); }
+  | init_args_kv TCOMMA init_arg_kv { $$->push_back(ExprNode::Ptr($3)); }
+  ;
+init_arg_kv
+  : TDOT ident TEQUAL expr
+    { $$ = new AssignExprNode(IdentExprNode::Ptr($2), ExprNode::Ptr($4));
+      parser.set_loc($$, @$); }
+  | TDOT ident bitop TEQUAL expr
+    { $$ = new AssignExprNode(IdentExprNode::Ptr($2), ExprNode::Ptr($5)); $$->bitop_ = BitopExprNode::Ptr($3);
+      parser.set_loc($$, @$); }
+  ;
+
+if_stmt
+  : TIF expr enter_varscope block exit_varscope
+    { $$ = new IfStmtNode(ExprNode::Ptr($2), StmtNode::Ptr($4));
+      $4->scope_ = $3;
+      parser.set_loc($$, @$); }
+  | TIF expr enter_varscope block exit_varscope TELSE enter_varscope block exit_varscope
+    { $$ = new IfStmtNode(ExprNode::Ptr($2), StmtNode::Ptr($4), StmtNode::Ptr($8));
+      $4->scope_ = $3; $8->scope_ = $7;
+      parser.set_loc($$, @$); }
+  | TIF expr enter_varscope block exit_varscope TELSE if_stmt
+    { $$ = new IfStmtNode(ExprNode::Ptr($2), StmtNode::Ptr($4), StmtNode::Ptr($7));
+      $4->scope_ = $3;
+      parser.set_loc($$, @$); }
+  ;
+
+onvalid_stmt
+  : TVALID TLPAREN ident TRPAREN enter_varscope block exit_varscope
+    { $$ = new OnValidStmtNode(IdentExprNode::Ptr($3), StmtNode::Ptr($6));
+      $6->scope_ = $5;
+      parser.set_loc($$, @$); }
+  | TVALID TLPAREN ident TRPAREN enter_varscope block exit_varscope TELSE enter_varscope block exit_varscope
+    { $$ = new OnValidStmtNode(IdentExprNode::Ptr($3), StmtNode::Ptr($6), StmtNode::Ptr($10));
+      $6->scope_ = $5; $10->scope_ = $9;
+      parser.set_loc($$, @$); }
+  ;
+
+switch_stmt
+  : TSWITCH expr TLBRACE case_stmts TRBRACE
+    { $$ = new SwitchStmtNode(ExprNode::Ptr($2), make_unique<BlockStmtNode>(move(*$4))); delete $4;
+      parser.set_loc($$, @$); }
+  ;
+
+case_stmts
+  : case_stmt
+    { $$ = new StmtNodeList; $$->push_back(StmtNode::Ptr($1)); }
+  | case_stmts case_stmt
+    { $$->push_back(StmtNode::Ptr($2)); }
+  ;
+
+case_stmt
+  : TCASE numeric block TSEMI
+    { $$ = new CaseStmtNode(IntegerExprNode::Ptr($2), BlockStmtNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | TCASE TMUL block TSEMI
+    { $$ = new CaseStmtNode(BlockStmtNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  ;
+
+numeric
+  : TINTEGER
+    { $$ = new IntegerExprNode($1);
+      parser.set_loc($$, @$); }
+  | THEXINTEGER
+    { $$ = new IntegerExprNode($1);
+      parser.set_loc($$, @$); }
+  | TINTEGER TCOLON TINTEGER
+    { $$ = new IntegerExprNode($1, $3);
+      parser.set_loc($$, @$); }
+  | THEXINTEGER TCOLON TINTEGER
+    { $$ = new IntegerExprNode($1, $3);
+      parser.set_loc($$, @$); }
+  | TTRUE
+    { $$ = new IntegerExprNode(new string("1"), new string("1"));
+      parser.set_loc($$, @$); }
+  | TFALSE
+    { $$ = new IntegerExprNode(new string("0"), new string("1"));
+      parser.set_loc($$, @$); }
+  ;
+
+assign_expr
+  : expr TEQUAL expr
+    { $$ = new AssignExprNode(ExprNode::Ptr($1), ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  /* The below has a reduce/reduce conflict.
+     TODO: ensure the above is handled in the type check properly */
+  /*| dotted_ident TEQUAL expr
+    { $$ = new AssignExprNode(IdentExprNode::Ptr($1), ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | dotted_ident bitop TEQUAL expr
+    { $$ = new AssignExprNode(IdentExprNode::Ptr($1), ExprNode::Ptr($4)); $$->bitop_ = BitopExprNode::Ptr($2);
+      parser.set_loc($$, @$); }*/
+  ;
+
+return_expr
+  : TRETURN expr
+    { $$ = new ReturnExprNode(ExprNode::Ptr($2));
+      parser.set_loc($$, @$); }
+  ;
+
+expr
+  : call_expr
+    { $$ = $1; }
+  | call_expr bitop
+    { $$ = $1; $$->bitop_ = BitopExprNode::Ptr($2); }
+  | table_index_expr
+    { $$ = $1; }
+  | table_index_expr TDOT ident
+    { $$ = $1; $1->sub_ = IdentExprNode::Ptr($3); }
+  | any_ident
+    { $$ = $1; }
+  | TAT dotted_ident
+    { $$ = new PacketExprNode(IdentExprNode::Ptr($2));
+      $$->flags_[ExprNode::IS_REF] = true;
+      parser.set_loc($$, @$); }
+  | TDOLLAR dotted_ident
+    { $$ = new PacketExprNode(IdentExprNode::Ptr($2));
+      $$->flags_[ExprNode::IS_PKT] = true;
+      parser.set_loc($$, @$); }
+  | TDOLLAR dotted_ident bitop
+    { $$ = new PacketExprNode(IdentExprNode::Ptr($2)); $$->bitop_ = BitopExprNode::Ptr($3);
+      $$->flags_[ExprNode::IS_PKT] = true;
+      parser.set_loc($$, @$); }
+  | TGOTO scoped_ident
+    { $$ = new GotoExprNode(IdentExprNode::Ptr($2), false);
+      parser.set_loc($$, @$); }
+  | TNEXT scoped_ident
+    { $$ = new GotoExprNode(IdentExprNode::Ptr($2), false);
+      parser.set_loc($$, @$); }
+  | TCONTINUE scoped_ident
+    { $$ = new GotoExprNode(IdentExprNode::Ptr($2), true);
+      parser.set_loc($$, @$); }
+  | TLPAREN expr TRPAREN
+    { $$ = $2; }
+  | TLPAREN expr TRPAREN bitop
+    { $$ = $2; $$->bitop_ = BitopExprNode::Ptr($4); }
+  | TSTRING
+    { $$ = new StringExprNode($1);
+      parser.set_loc($$, @$); }
+  | numeric
+    { $$ = $1; }
+  | numeric bitop
+    { $$ = $1; $$->bitop_ = BitopExprNode::Ptr($2); }
+  | expr TCLT expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TCGT expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TCGE expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TCLE expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TCNE expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TCEQ expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TPLUS expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TMINUS expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TMUL expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TDIV expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TMOD expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TXOR expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TAND expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TOR expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TLAND expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  | expr TLOR expr
+    { $$ = new BinopExprNode(ExprNode::Ptr($1), $2, ExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  /*| expr bitop
+    { $$ = $1; $$->bitop_ = BitopExprNode::Ptr($2); }*/
+  | TNOT expr
+    { $$ = new UnopExprNode($1, ExprNode::Ptr($2));
+      parser.set_loc($$, @$); }
+  | TCMPL expr
+    { $$ = new UnopExprNode($1, ExprNode::Ptr($2));
+      parser.set_loc($$, @$); }
+  ;
+
+call_args
+  : /* empty */
+    { $$ = new ExprNodeList; }
+  | expr
+    { $$ = new ExprNodeList; $$->push_back(ExprNode::Ptr($1)); }
+  | call_args TCOMMA expr
+    { $$->push_back(ExprNode::Ptr($3)); }
+  ;
+
+bitop
+  : TLBRACK TCOLON TPLUS TINTEGER TRBRACK
+    { $$ = new BitopExprNode(string("0"), *$4); delete $4;
+      parser.set_loc($$, @$); }
+  | TLBRACK TINTEGER TCOLON TPLUS TINTEGER TRBRACK
+    { $$ = new BitopExprNode(*$2, *$5); delete $2; delete $5;
+      parser.set_loc($$, @$); }
+  ;
+
+table_index_expr
+  : dotted_ident TLBRACK ident TRBRACK
+    { $$ = new TableIndexExprNode(IdentExprNode::Ptr($1), IdentExprNode::Ptr($3));
+      parser.set_loc($$, @$); }
+  ;
+
+scoped_ident
+  : ident
+    { $$ = $1; }
+  | scoped_ident TSCOPE TIDENTIFIER
+    { $$->append_scope(*$3); delete $3; }
+  ;
+
+dotted_ident
+  : ident
+    { $$ = $1; }
+  | dotted_ident TDOT TIDENTIFIER
+    { $$->append_dot(*$3); delete $3; }
+  ;
+
+any_ident
+  : ident
+    { $$ = $1; }
+  | dotted_ident TARROW TIDENTIFIER
+    { $$->append_dot(*$3); delete $3; }
+  | dotted_ident TDOT TIDENTIFIER
+    { $$->append_dot(*$3); delete $3; }
+  | scoped_ident TSCOPE TIDENTIFIER
+    { $$->append_scope(*$3); delete $3; }
+  ;
+
+ident
+  : TIDENTIFIER
+    { $$ = new IdentExprNode(*$1); delete $1;
+      parser.set_loc($$, @$); }
+  ;
+
+%%
+
+// Bison error hook: report the location and message on stderr.
+void ebpf::cc::BisonParser::error(const ebpf::cc::BisonParser::location_type &loc,
+                            const string& msg) {
+    std::cerr << "Error: " << loc << " " << msg << std::endl;
+}
+
+#include "lexer.h"
+static int yylex(ebpf::cc::BisonParser::semantic_type *yylval,
+                 ebpf::cc::BisonParser::location_type *yylloc,
+                 ebpf::cc::Lexer &lexer) {
+    return lexer.yylex(yylval, yylloc);
+}
+
diff --git a/src/cc/frontends/b/printer.cc b/src/cc/frontends/b/printer.cc
new file mode 100644
index 0000000..e16a823
--- /dev/null
+++ b/src/cc/frontends/b/printer.cc
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "printer.h"
+#include "lexer.h"
+#include "bcc_exception.h"
+
+namespace ebpf {
+namespace cc {
+
+// Write `indent_` space characters to align nested output.
+void Printer::print_indent() {
+  fprintf(out_, "%*s", indent_, "");
+}
+
+// Print a brace-delimited statement block, one statement per line,
+// one level deeper than the surrounding text.
+StatusTuple Printer::visit_block_stmt_node(BlockStmtNode* n) {
+  fprintf(out_, "{\n");
+
+  if (!n->stmts_.empty()) {
+    ++indent_;
+    for (auto it = n->stmts_.begin(); it != n->stmts_.end(); ++it) {
+      print_indent();
+      TRY2((*it)->accept(this));
+      fprintf(out_, "\n");
+    }
+    --indent_;
+  }
+  fprintf(out_, "%*s}", indent_, "");
+  return StatusTuple(0);
+}
+
+// Print "if <cond> <block> [else <block>]".
+StatusTuple Printer::visit_if_stmt_node(IfStmtNode* n) {
+  fprintf(out_, "if ");
+  TRY2(n->cond_->accept(this));
+  fprintf(out_, " ");
+  TRY2(n->true_block_->accept(this));
+  if (n->false_block_) {
+    fprintf(out_, " else ");
+    TRY2(n->false_block_->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// "on_valid" statements are rendered with the same surface syntax as "if".
+StatusTuple Printer::visit_onvalid_stmt_node(OnValidStmtNode* n) {
+  fprintf(out_, "if ");
+  TRY2(n->cond_->accept(this));
+  fprintf(out_, " ");
+  TRY2(n->block_->accept(this));
+  if (n->else_block_) {
+    fprintf(out_, " else ");
+    TRY2(n->else_block_->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// Print "switch (<cond>) <block>".
+StatusTuple Printer::visit_switch_stmt_node(SwitchStmtNode* n) {
+  fprintf(out_, "switch (");
+  TRY2(n->cond_->accept(this));
+  fprintf(out_, ") ");
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print "case <value>" (or "default") followed by the case body.
+// NOTE(review): no separator is emitted between the label and its block;
+// confirm the B grammar expects "case N{...}" with no colon/space.
+StatusTuple Printer::visit_case_stmt_node(CaseStmtNode* n) {
+  if (n->value_) {
+    fprintf(out_, "case ");
+    TRY2(n->value_->accept(this));
+  } else {
+    fprintf(out_, "default");
+  }
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print an identifier as [scope::]name[.sub].
+StatusTuple Printer::visit_ident_expr_node(IdentExprNode* n) {
+  if (n->scope_name_.size()) {
+    fprintf(out_, "%s::", n->scope_name_.c_str());
+  }
+  fprintf(out_, "%s", n->name_.c_str());
+  if (n->sub_name_.size()) {
+    fprintf(out_, ".%s", n->sub_name_.c_str());
+  }
+  return StatusTuple(0);
+}
+
+// Print "<lhs> = <rhs>".
+StatusTuple Printer::visit_assign_expr_node(AssignExprNode* n) {
+  TRY2(n->lhs_->accept(this));
+  fprintf(out_, " = ");
+  TRY2(n->rhs_->accept(this));
+  return StatusTuple(0);
+}
+
+// Packet-header access is spelled "$<ident>".
+StatusTuple Printer::visit_packet_expr_node(PacketExprNode* n) {
+  fprintf(out_, "$");
+  TRY2(n->id_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print a literal as "<value>:<bit-width>".
+StatusTuple Printer::visit_integer_expr_node(IntegerExprNode* n) {
+  fprintf(out_, "%s:%zu", n->val_.c_str(), n->bits_);
+  return StatusTuple(0);
+}
+
+// Print the raw string value (no surrounding quotes are emitted).
+StatusTuple Printer::visit_string_expr_node(StringExprNode *n) {
+  fprintf(out_, "%s", n->val_.c_str());
+  return StatusTuple(0);
+}
+
+// FIXME(review): this prints the operator's numeric token id via "%d"
+// rather than the operator's source spelling ("+", "==", ...), so the
+// output is not valid B source. Confirm whether a token->string mapping
+// was intended here.
+StatusTuple Printer::visit_binop_expr_node(BinopExprNode* n) {
+  TRY2(n->lhs_->accept(this));
+  fprintf(out_, "%d", n->op_);
+  TRY2(n->rhs_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print a unary operator prefix, then the operand. Unknown operators
+// print no prefix at all (the default branch is intentionally empty).
+StatusTuple Printer::visit_unop_expr_node(UnopExprNode* n) {
+  const char* s = "";
+  switch (n->op_) {
+    case Tok::TNOT: s = "!"; break;
+    case Tok::TCMPL: s = "~"; break;
+    case Tok::TMOD:  s = "%"; break;
+    default: {}
+  }
+  fprintf(out_, "%s", s);
+  TRY2(n->expr_->accept(this));
+  return StatusTuple(0);
+}
+
+// NOTE(review): bit-slice expressions are silently dropped from the
+// printed output — nothing is emitted here. Confirm this is intentional.
+StatusTuple Printer::visit_bitop_expr_node(BitopExprNode* n) {
+
+  return StatusTuple(0);
+}
+
+// Print "return <expr>".
+StatusTuple Printer::visit_return_expr_node(ReturnExprNode* n) {
+  fprintf(out_, "return ");
+  TRY2(n->expr_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print "continue <target>" or "goto <target>".
+StatusTuple Printer::visit_goto_expr_node(GotoExprNode* n) {
+  const char* s = n->is_continue_ ? "continue " : "goto ";
+  fprintf(out_, "%s", s);
+  TRY2(n->id_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print "<id>(<args...>)" and, if the call carries a trailing block
+// (e.g. table lookup/update bodies), print that block inline.
+StatusTuple Printer::visit_method_call_expr_node(MethodCallExprNode* n) {
+  TRY2(n->id_->accept(this));
+  fprintf(out_, "(");
+  for (auto it = n->args_.begin(); it != n->args_.end(); ++it) {
+    TRY2((*it)->accept(this));
+    if (it + 1 != n->args_.end()) {
+      fprintf(out_, ", ");
+    }
+  }
+  fprintf(out_, ")");
+  if (!n->block_->stmts_.empty()) {
+    fprintf(out_, " {\n");
+    ++indent_;
+    for (auto it = n->block_->stmts_.begin(); it != n->block_->stmts_.end(); ++it) {
+      print_indent();
+      TRY2((*it)->accept(this));
+      fprintf(out_, "\n");
+    }
+    --indent_;
+    fprintf(out_, "%*s}", indent_, "");
+  }
+  return StatusTuple(0);
+}
+
+// Print "<table>[<index>]".
+StatusTuple Printer::visit_table_index_expr_node(TableIndexExprNode *n) {
+  fprintf(out_, "%s[", n->id_->c_str());
+  TRY2(n->index_->accept(this));
+  fprintf(out_, "]");
+  return StatusTuple(0);
+}
+
+// An expression statement prints as its bare expression.
+StatusTuple Printer::visit_expr_stmt_node(ExprStmtNode* n) {
+  TRY2(n->expr_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print "var <StructType> <name>{init, ...}"; the initializer braces are
+// omitted when there are no initializers.
+StatusTuple Printer::visit_struct_variable_decl_stmt_node(StructVariableDeclStmtNode* n) {
+  fprintf(out_, "var ");
+  TRY2(n->struct_id_->accept(this));
+  fprintf(out_, " ");
+  TRY2(n->id_->accept(this));
+  if (!n->init_.empty()) {
+    fprintf(out_, "{");
+    for (auto it = n->init_.begin(); it != n->init_.end(); ++it) {
+      TRY2((*it)->accept(this));
+      if (it + 1 != n->init_.end()) {
+        fprintf(out_, ", ");
+      }
+    }
+    fprintf(out_, "}");
+  }
+  return StatusTuple(0);
+}
+
+// Print "var <name>:<bits>[; <init>]"; only the first initializer (if
+// any) is printed.
+StatusTuple Printer::visit_integer_variable_decl_stmt_node(IntegerVariableDeclStmtNode* n) {
+  fprintf(out_, "var ");
+  TRY2(n->id_->accept(this));
+  fprintf(out_, ":%zu", n->bit_width_);
+  if (!n->init_.empty()) {
+    fprintf(out_, "; ");
+    TRY2(n->init_[0]->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// Print "struct <name> { fields... }" with one field per line.
+StatusTuple Printer::visit_struct_decl_stmt_node(StructDeclStmtNode* n) {
+  fprintf(out_, "struct ");
+  TRY2(n->id_->accept(this));
+  fprintf(out_, " {\n");
+  ++indent_;
+  for (auto it = n->stmts_.begin(); it != n->stmts_.end(); ++it) {
+    print_indent();
+    TRY2((*it)->accept(this));
+    fprintf(out_, "\n");
+  }
+  --indent_;
+  fprintf(out_, "%*s}", indent_, "");
+  return StatusTuple(0);
+}
+
+// Print only "state <name>"; anonymous states print nothing. The state
+// body printing is currently disabled (see commented-out code below).
+StatusTuple Printer::visit_state_decl_stmt_node(StateDeclStmtNode* n) {
+  if (!n->id_) {
+    return StatusTuple(0);
+  }
+  fprintf(out_, "state ");
+  TRY2(n->id_->accept(this));
+  //if (!n->id2_) {
+  //  fprintf(out_, ", * ");
+  //} else {
+  //  fprintf(out_, ", ");
+  //  TRY2(n->id2_->accept(this));
+  //}
+  //TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+// Parser-state statements are synthesized nodes with no source form;
+// nothing is printed.
+StatusTuple Printer::visit_parser_state_stmt_node(ParserStateStmtNode* n) {
+  return StatusTuple(0);
+}
+
+// Print "on_match <id> (<formals...>) <block>".
+StatusTuple Printer::visit_match_decl_stmt_node(MatchDeclStmtNode* n) {
+  fprintf(out_, "on_match ");
+  TRY2(n->id_->accept(this));
+  fprintf(out_, " (");
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    TRY2((*it)->accept(this));
+    if (it + 1 != n->formals_.end()) {
+      fprintf(out_, ", ");
+    }
+  }
+  fprintf(out_, ") ");
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print "on_miss <id> (<formals...>) <block>".
+StatusTuple Printer::visit_miss_decl_stmt_node(MissDeclStmtNode* n) {
+  fprintf(out_, "on_miss ");
+  TRY2(n->id_->accept(this));
+  fprintf(out_, " (");
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    TRY2((*it)->accept(this));
+    if (it + 1 != n->formals_.end()) {
+      fprintf(out_, ", ");
+    }
+  }
+  fprintf(out_, ") ");
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print "on_failure <id> (<formals...>) <block>".
+StatusTuple Printer::visit_failure_decl_stmt_node(FailureDeclStmtNode* n) {
+  fprintf(out_, "on_failure ");
+  TRY2(n->id_->accept(this));
+  fprintf(out_, " (");
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    TRY2((*it)->accept(this));
+    if (it + 1 != n->formals_.end()) {
+      fprintf(out_, ", ");
+    }
+  }
+  fprintf(out_, ") ");
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+// Print "<TableType><templates...> <name>(<size>)".
+StatusTuple Printer::visit_table_decl_stmt_node(TableDeclStmtNode* n) {
+  TRY2(n->table_type_->accept(this));
+  fprintf(out_, "<");
+  for (auto it = n->templates_.begin(); it != n->templates_.end(); ++it) {
+    TRY2((*it)->accept(this));
+    if (it + 1 != n->templates_.end()) {
+      fprintf(out_, ", ");
+    }
+  }
+  fprintf(out_, "> ");
+  TRY2(n->id_->accept(this));
+  fprintf(out_, "(%zu)", n->size_);
+  return StatusTuple(0);
+}
+
+// Print "func <name>(<formals...>) <block>".
+StatusTuple Printer::visit_func_decl_stmt_node(FuncDeclStmtNode *n) {
+  fprintf(out_, "func ");
+  TRY2(n->id_->accept(this));
+  fprintf(out_, "(");
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    TRY2((*it)->accept(this));
+    if (it + 1 != n->formals_.end()) {
+      fprintf(out_, ", ");
+    }
+  }
+  fprintf(out_, ") ");
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/printer.h b/src/cc/frontends/b/printer.h
new file mode 100644
index 0000000..6dd4894
--- /dev/null
+++ b/src/cc/frontends/b/printer.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <stdio.h>
+
+#include "node.h"
+
+namespace ebpf {
+namespace cc {
+
+// AST pretty-printer: walks the tree through the Visitor interface and
+// writes B-language source text to `out_`.
+class Printer : public Visitor {
+ public:
+  explicit Printer(FILE* out) : out_(out), indent_(0) {}
+
+  // Write `indent_` leading spaces for the current nesting depth.
+  void print_indent();
+
+// Declare one visit_* override per AST node type via the node list macro.
+#define VISIT(type, func) virtual STATUS_RETURN visit_##func(type* n);
+  EXPAND_NODES(VISIT)
+#undef VISIT
+
+ private:
+  FILE* out_;   // destination stream; not owned by this class
+  int indent_;  // current nesting depth, measured in space-columns
+};
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/scope.h b/src/cc/frontends/b/scope.h
new file mode 100644
index 0000000..b0358b88
--- /dev/null
+++ b/src/cc/frontends/b/scope.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace ebpf {
+namespace cc {
+
+using std::string;
+using std::vector;
+using std::map;
+using std::pair;
+using std::unique_ptr;
+
+class StateDeclStmtNode;
+class VariableDeclStmtNode;
+class TableDeclStmtNode;
+class StructDeclStmtNode;
+class FuncDeclStmtNode;
+
+enum search_type { SCOPE_LOCAL, SCOPE_GLOBAL };
+
+// A single symbol table mapping names to AST declaration nodes of type T.
+// Scopes chain through `parent_`; lookups can be local-only or walk the
+// parent chain. Pointers are borrowed — the AST owns the nodes.
+template <typename T>
+class Scope {
+ public:
+  Scope() {}
+  Scope(Scope<T>* scope, int id) : parent_(scope), id_(id) {}
+
+  // Convenience overload: search_local=true limits lookup to this scope.
+  T* lookup(const string &name, bool search_local = true) {
+    return lookup(name, search_local ? SCOPE_LOCAL : SCOPE_GLOBAL);
+  }
+  T * lookup(const string &name, search_type stype) {
+    auto it = elems_.find(name);
+    if (it != elems_.end())
+      return it->second;
+
+    if (stype == SCOPE_LOCAL || !parent_)
+      return nullptr;
+    return parent_->lookup(name, stype);
+  }
+  // Register a declaration; a duplicate name silently replaces the map
+  // entry but both nodes remain in elems_ordered_.
+  void add(const string& name, T* n) {
+    elems_[name] = n;
+    elems_ordered_.push_back(n);
+  }
+  // begin/end iterate name-sorted; obegin/oend iterate insertion order.
+  typename map<string, T*>::iterator begin() { return elems_.begin(); }
+  typename map<string, T*>::iterator end() { return elems_.end(); }
+  typename vector<T*>::iterator obegin() { return elems_ordered_.begin(); }
+  typename vector<T*>::iterator oend() { return elems_ordered_.end(); }
+
+  Scope<T> *parent_;            // enclosing scope, or nullptr at the top
+  int id_;                      // unique id assigned by Scopes
+  map<string, T*> elems_;       // name -> declaration (borrowed)
+  vector<T*> elems_ordered_;    // declarations in insertion order
+};
+
+/**
+ * Hold the current stack of scope pointers.  Lookups search upwards.
+ * Actual scope pointers are kept in the AST.
+ */
+class Scopes {
+ public:
+  typedef unique_ptr<Scopes> Ptr;
+  typedef Scope<StructDeclStmtNode> StructScope;
+  typedef Scope<StateDeclStmtNode> StateScope;
+  typedef Scope<VariableDeclStmtNode> VarScope;
+  typedef Scope<TableDeclStmtNode> TableScope;
+  typedef Scope<FuncDeclStmtNode> FuncScope;
+
+  // Struct/table/func scopes are flat and allocated eagerly; var/state
+  // scopes are created lazily as the AST is built (enter_*_scope).
+  Scopes() : var_id__(0), state_id_(0), var_id_(0),
+    current_var_scope_(nullptr), top_var_scope_(nullptr),
+    current_state_scope_(nullptr), top_state_scope_(nullptr),
+    top_struct_scope_(new StructScope(nullptr, 1)),
+    top_table_scope_(new TableScope(nullptr, 1)),
+    top_func_scope_(new FuncScope(nullptr, 1)) {}
+  // NOTE(review): top_var_scope_ is never deleted here although
+  // enter_var_scope() heap-allocates it — looks like a leak; also
+  // non-top scopes allocated by enter_*_scope() have no visible owner
+  // in this header. Confirm ownership against the AST code.
+  ~Scopes() {
+    delete top_func_scope_;
+    delete top_struct_scope_;
+    delete top_table_scope_;
+    delete top_state_scope_;
+  }
+
+  // Re-enter an existing scope during a later traversal pass; pushing
+  // the top scope is a no-op so its parent link stays null.
+  void push_var(VarScope *scope) {
+    if (scope == top_var_scope_)
+      return;
+    scope->parent_ = current_var_scope_;
+    current_var_scope_ = scope;
+  }
+  void pop_var() {
+    if (current_var_scope_ == top_var_scope_)
+      return;
+    VarScope *old = current_var_scope_;
+    current_var_scope_ = old->parent_;
+    old->parent_ = nullptr;  // detach so a later push re-links cleanly
+  }
+
+  // Same protocol as push_var/pop_var, for parser-state scopes.
+  void push_state(StateScope *scope) {
+    if (scope == top_state_scope_)
+      return;
+    scope->parent_ = current_state_scope_;
+    current_state_scope_ = scope;
+  }
+  void pop_state() {
+    if (current_state_scope_ == top_state_scope_)
+      return;
+    StateScope *old = current_state_scope_;
+    current_state_scope_ = old->parent_;
+    old->parent_ = nullptr;
+  }
+
+  /// While building the AST, allocate a new scope
+  VarScope* enter_var_scope() {
+    current_var_scope_ = new VarScope(current_var_scope_, next_var_id());
+    if (!top_var_scope_) {
+      top_var_scope_ = current_var_scope_;
+    }
+    return current_var_scope_;
+  }
+
+  VarScope* exit_var_scope() {
+    current_var_scope_ = current_var_scope_->parent_;
+    return current_var_scope_;
+  }
+
+  StateScope* enter_state_scope() {
+    current_state_scope_ = new StateScope(current_state_scope_, next_state_id());
+    if (!top_state_scope_) {
+      top_state_scope_ = current_state_scope_;
+    }
+    return current_state_scope_;
+  }
+
+  StateScope* exit_state_scope() {
+    current_state_scope_ = current_state_scope_->parent_;
+    return current_state_scope_;
+  }
+
+  void set_current(VarScope* s) { current_var_scope_ = s; }
+  VarScope* current_var() const { return current_var_scope_; }
+  VarScope* top_var() const { return top_var_scope_; }
+
+  void set_current(StateScope* s) { current_state_scope_ = s; }
+  StateScope* current_state() const { return current_state_scope_; }
+  StateScope* top_state() const { return top_state_scope_; }
+
+  StructScope* top_struct() const { return top_struct_scope_; }
+
+  TableScope* top_table() const { return top_table_scope_; }
+  FuncScope* top_func() const { return top_func_scope_; }
+
+  // Monotonic id generators. NOTE(review): next_id()/var_id__ appears to
+  // be a separate counter from next_var_id()/var_id_ — naming is easy to
+  // confuse; verify which callers use which.
+  int next_id() { return ++var_id__; }
+  int next_state_id() { return ++state_id_; }
+  int next_var_id() { return ++var_id_; }
+
+  int var_id__;
+  int state_id_;
+  int var_id_;
+  VarScope* current_var_scope_;
+  VarScope* top_var_scope_;
+  StateScope* current_state_scope_;
+  StateScope* top_state_scope_;
+  StructScope* top_struct_scope_;
+  TableScope* top_table_scope_;
+  FuncScope* top_func_scope_;
+};
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/type_check.cc b/src/cc/frontends/b/type_check.cc
new file mode 100644
index 0000000..8d49de9
--- /dev/null
+++ b/src/cc/frontends/b/type_check.cc
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <set>
+#include <algorithm>
+#include "bcc_exception.h"
+#include "type_check.h"
+#include "lexer.h"
+
+namespace ebpf {
+namespace cc {
+
+using std::for_each;
+using std::set;
+
+// Type-check every statement in a block, entering the block's variable
+// scope (if it has one) for the duration.
+StatusTuple TypeCheck::visit_block_stmt_node(BlockStmtNode *n) {
+  // enter scope
+  if (n->scope_)
+    scopes_->push_var(n->scope_);
+  if (!n->stmts_.empty()) {
+    for (auto it = n->stmts_.begin(); it != n->stmts_.end(); ++it)
+      TRY2((*it)->accept(this));
+  }
+
+  if (n->scope_)
+    scopes_->pop_var();
+  return StatusTuple(0);
+}
+
+// Check the condition and both branches; the numeric-condition check is
+// currently disabled (commented out).
+StatusTuple TypeCheck::visit_if_stmt_node(IfStmtNode *n) {
+  TRY2(n->cond_->accept(this));
+  //if (n->cond_->typeof_ != ExprNode::INTEGER)
+  //  return mkstatus_(n, "If condition must be a numeric type");
+  TRY2(n->true_block_->accept(this));
+  if (n->false_block_) {
+    TRY2(n->false_block_->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// "on_valid" requires its condition to be a struct reference.
+// NOTE(review): the static_cast assumes cond_->decl_ is a struct decl
+// without checking is_struct() first — confirm the parser guarantees it.
+StatusTuple TypeCheck::visit_onvalid_stmt_node(OnValidStmtNode *n) {
+  TRY2(n->cond_->accept(this));
+  auto sdecl = static_cast<StructVariableDeclStmtNode*>(n->cond_->decl_);
+  if (sdecl->storage_type_ != StructVariableDeclStmtNode::STRUCT_REFERENCE)
+    return mkstatus_(n, "on_valid condition must be a reference type");
+  TRY2(n->block_->accept(this));
+  if (n->else_block_) {
+    TRY2(n->else_block_->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// A switch condition must be numeric; duplicate-case detection is a TODO.
+StatusTuple TypeCheck::visit_switch_stmt_node(SwitchStmtNode *n) {
+  TRY2(n->cond_->accept(this));
+  if (n->cond_->typeof_ != ExprNode::INTEGER)
+    return mkstatus_(n, "Switch condition must be a numeric type");
+  TRY2(n->block_->accept(this));
+  for (auto it = n->block_->stmts_.begin(); it != n->block_->stmts_.end(); ++it) {
+    /// @todo check for duplicates
+  }
+  return StatusTuple(0);
+}
+
+// A case label, when present, must be numeric.
+// NOTE(review): the error text says "Switch condition" for a case label —
+// likely copy-paste; consider "Case label must be a numeric type".
+StatusTuple TypeCheck::visit_case_stmt_node(CaseStmtNode *n) {
+  if (n->value_) {
+    TRY2(n->value_->accept(this));
+    if (n->value_->typeof_ != ExprNode::INTEGER)
+      return mkstatus_(n, "Switch condition must be a numeric type");
+  }
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+// Resolve an identifier against the variable scope chain and derive its
+// type: plain integer variables become INTEGER, struct variables become
+// STRUCT (resolving the struct type in either the "proto" scope or the
+// program scope), and "name.sub" accesses resolve the named field, which
+// must itself be numeric.
+StatusTuple TypeCheck::visit_ident_expr_node(IdentExprNode *n) {
+  n->decl_ = scopes_->current_var()->lookup(n->name_, SCOPE_GLOBAL);
+  if (!n->decl_)
+    return mkstatus_(n, "Variable %s lookup failed", n->c_str());
+
+  n->typeof_ = ExprNode::UNKNOWN;
+  if (n->sub_name_.empty()) {
+    if (n->decl_->storage_type_ == VariableDeclStmtNode::INTEGER) {
+      n->typeof_ = ExprNode::INTEGER;
+      n->bit_width_ = n->decl_->bit_width_;
+      n->flags_[ExprNode::WRITE] = true;
+    } else if (n->decl_->is_struct()) {
+      n->typeof_ = ExprNode::STRUCT;
+      auto sdecl = static_cast<StructVariableDeclStmtNode*>(n->decl_);
+      // "proto::" types live in the separate protocol scope set.
+      if (sdecl->struct_id_->scope_name_ == "proto") {
+        n->struct_type_ = proto_scopes_->top_struct()->lookup(sdecl->struct_id_->name_, true);
+        n->flags_[ExprNode::PROTO] = true;
+      } else {
+        n->struct_type_ = scopes_->top_struct()->lookup(sdecl->struct_id_->name_, true);
+      }
+      if (!n->struct_type_)
+        return mkstatus_(n, "Type %s has not been declared", sdecl->struct_id_->full_name().c_str());
+      n->bit_width_ = n->struct_type_->bit_width_;
+    }
+  } else {
+    if (n->decl_->storage_type_ == VariableDeclStmtNode::INTEGER)
+      return mkstatus_(n, "Subfield access not valid for numeric types");
+    auto sdecl = static_cast<StructVariableDeclStmtNode*>(n->decl_);
+    if (sdecl->struct_id_->scope_name_ == "proto") {
+      n->struct_type_ = proto_scopes_->top_struct()->lookup(sdecl->struct_id_->name_, true);
+      n->flags_[ExprNode::PROTO] = true;
+    } else {
+      n->struct_type_ = scopes_->top_struct()->lookup(sdecl->struct_id_->name_, true);
+    }
+    if (!n->struct_type_)
+      return mkstatus_(n, "Type %s has not been declared", sdecl->struct_id_->full_name().c_str());
+    n->sub_decl_ = n->struct_type_->field(n->sub_name_);
+
+    if (!n->sub_decl_)
+      return mkstatus_(n, "Access to invalid subfield %s.%s", n->c_str(), n->sub_name_.c_str());
+    if (n->sub_decl_->storage_type_ != VariableDeclStmtNode::INTEGER)
+      return mkstatus_(n, "Accessing non-numeric subfield %s.%s", n->c_str(), n->sub_name_.c_str());
+
+    n->typeof_ = ExprNode::INTEGER;
+    n->bit_width_ = n->sub_decl_->bit_width_;
+    n->flags_[ExprNode::WRITE] = true;
+  }
+  return StatusTuple(0);
+}
+
+// Assignment: struct lhs requires struct rhs; otherwise both sides must
+// be numeric and the lhs must be writable. The expression itself is VOID.
+StatusTuple TypeCheck::visit_assign_expr_node(AssignExprNode *n) {
+  /// @todo check lhs is assignable
+  TRY2(n->lhs_->accept(this));
+  if (n->lhs_->typeof_ == ExprNode::STRUCT) {
+    TRY2(n->rhs_->accept(this));
+    if (n->rhs_->typeof_ != ExprNode::STRUCT)
+      return mkstatus_(n, "Right-hand side of assignment must be a struct");
+  } else {
+    if (n->lhs_->typeof_ != ExprNode::INTEGER)
+      return mkstatus_(n, "Left-hand side of assignment must be a numeric type");
+    if (!n->lhs_->flags_[ExprNode::WRITE])
+      return mkstatus_(n, "Left-hand side of assignment is read-only");
+    TRY2(n->rhs_->accept(this));
+    if (n->rhs_->typeof_ != ExprNode::INTEGER)
+      return mkstatus_(n, "Right-hand side of assignment must be a numeric type");
+  }
+  n->typeof_ = ExprNode::VOID;
+  return StatusTuple(0);
+}
+
+// "$hdr" / "$hdr.field": resolve the header struct in the protocol
+// scope. A bare header is a STRUCT; a field access is an INTEGER whose
+// width is the field's width (or 64 when taken by reference). Packet
+// accesses are always writable.
+StatusTuple TypeCheck::visit_packet_expr_node(PacketExprNode *n) {
+  StructDeclStmtNode *struct_type = proto_scopes_->top_struct()->lookup(n->id_->name_, true);
+  if (!struct_type)
+    return mkstatus_(n, "Undefined packet header %s", n->id_->c_str());
+  if (n->id_->sub_name_.empty()) {
+    n->typeof_ = ExprNode::STRUCT;
+    n->struct_type_ = struct_type;
+  } else {
+    VariableDeclStmtNode *sub_decl = struct_type->field(n->id_->sub_name_);
+    if (!sub_decl)
+      return mkstatus_(n, "Access to invalid subfield %s.%s", n->id_->c_str(), n->id_->sub_name_.c_str());
+    n->typeof_ = ExprNode::INTEGER;
+    if (n->is_ref())
+      n->bit_width_ = 64;
+    else
+      n->bit_width_ = sub_decl->bit_width_;
+  }
+  n->flags_[ExprNode::WRITE] = true;
+  return StatusTuple(0);
+}
+
+// An integer literal's type and width come straight from the literal.
+StatusTuple TypeCheck::visit_integer_expr_node(IntegerExprNode *n) {
+  n->typeof_ = ExprNode::INTEGER;
+  n->bit_width_ = n->bits_;
+  return StatusTuple(0);
+}
+
+// A string literal is a by-reference STRING whose width is 8 * length.
+StatusTuple TypeCheck::visit_string_expr_node(StringExprNode *n) {
+  n->typeof_ = ExprNode::STRING;
+  n->flags_[ExprNode::IS_REF] = true;
+  n->bit_width_ = n->val_.size() << 3;
+  return StatusTuple(0);
+}
+
+// Both operands of a binary expression must be numeric; the result is
+// numeric with a width derived from the operator below.
+StatusTuple TypeCheck::visit_binop_expr_node(BinopExprNode *n) {
+  TRY2(n->lhs_->accept(this));
+  if (n->lhs_->typeof_ != ExprNode::INTEGER)
+    return mkstatus_(n, "Left-hand side of binary expression must be a numeric type");
+  TRY2(n->rhs_->accept(this));
+  if (n->rhs_->typeof_ != ExprNode::INTEGER)
+    return mkstatus_(n, "Right-hand side of binary expression must be a numeric type");
+  n->typeof_ = ExprNode::INTEGER;
+  switch(n->op_) {
+    case Tok::TCEQ:
+    case Tok::TCNE:
+    case Tok::TCLT:
+    case Tok::TCLE:
+    case Tok::TCGT:
+    case Tok::TCGE:
+      n->bit_width_ = 1;
+      // FIXME(review): no `break` here — the comparison cases fall
+      // through to `default`, so the 1-bit width just assigned is
+      // immediately overwritten with max(lhs, rhs). Confirm whether the
+      // fall-through is intentional or a missing `break;`.
+    default:
+      n->bit_width_ = std::max(n->lhs_->bit_width_, n->rhs_->bit_width_);
+  }
+  return StatusTuple(0);
+}
+
+// A unary operand must be numeric; the result copies the operand's type.
+StatusTuple TypeCheck::visit_unop_expr_node(UnopExprNode *n) {
+  TRY2(n->expr_->accept(this));
+  if (n->expr_->typeof_ != ExprNode::INTEGER)
+    return mkstatus_(n, "Unary operand must be a numeric type");
+  n->copy_type(*n->expr_);
+  return StatusTuple(0);
+}
+
+// Bit-slice "[a:+b]" only applies to numeric expressions.
+// NOTE(review): expr_ is not visited here, so typeof_ relies on the
+// operand having been checked elsewhere — confirm the traversal order.
+StatusTuple TypeCheck::visit_bitop_expr_node(BitopExprNode *n) {
+  if (n->expr_->typeof_ != ExprNode::INTEGER)
+    return mkstatus_(n, "Bitop [] can only operate on numeric types");
+  n->typeof_ = ExprNode::INTEGER;
+  return StatusTuple(0);
+}
+
+// goto/continue produce no value; the target id is not resolved here.
+StatusTuple TypeCheck::visit_goto_expr_node(GotoExprNode *n) {
+  //n->id_->accept(this);
+  n->typeof_ = ExprNode::VOID;
+  return StatusTuple(0);
+}
+
+// "return <expr>" checks the expression; the statement itself is VOID.
+StatusTuple TypeCheck::visit_return_expr_node(ReturnExprNode *n) {
+  TRY2(n->expr_->accept(this));
+  n->typeof_ = ExprNode::VOID;
+  return StatusTuple(0);
+}
+
+// Verify the call has `num` arguments, allowing the last `num_def_args`
+// of them to be omitted.
+// FIXME(review): `num` and `num_def_args` are size_t but are printed
+// with "%d" — on LP64 this is a format-string mismatch; should be "%zu".
+StatusTuple TypeCheck::expect_method_arg(MethodCallExprNode *n, size_t num, size_t num_def_args = 0) {
+  if (num_def_args == 0) {
+    if (n->args_.size() != num)
+      return mkstatus_(n, "%s expected %d argument%s, %zu given", n->id_->sub_name_.c_str(),
+                      num, num == 1 ? "" : "s", n->args_.size());
+  } else {
+    if (n->args_.size() < num - num_def_args || n->args_.size() > num)
+      return mkstatus_(n, "%s expected %d argument%s (%d default), %zu given", n->id_->sub_name_.c_str(),
+                      num, num == 1 ? "" : "s", num_def_args, n->args_.size());
+  }
+  return StatusTuple(0);
+}
+
+// table.lookup(key[, default]): validate the table exists and the arg
+// count, then inject a hidden "_result" struct reference into the
+// trailing block's scope so the block body can use it.
+StatusTuple TypeCheck::check_lookup_method(MethodCallExprNode *n) {
+  auto table = scopes_->top_table()->lookup(n->id_->name_);
+  if (!table)
+    return mkstatus_(n, "Unknown table name %s", n->id_->c_str());
+  TRY2(expect_method_arg(n, 2, 1));
+  if (table->type_id()->name_ == "LPM")
+    return mkstatus_(n, "LPM unsupported");
+  if (n->block_->scope_) {
+    auto result = make_unique<StructVariableDeclStmtNode>(table->leaf_id()->copy(), make_unique<IdentExprNode>("_result"),
+                                                          VariableDeclStmtNode::STRUCT_REFERENCE);
+    n->block_->scope_->add("_result", result.get());
+    n->block_->stmts_.insert(n->block_->stmts_.begin(), move(result));
+  }
+  return StatusTuple(0);
+}
+
+// table.update(...): arg count depends on the table kind (2 for exact
+// match / indexed tables, 3 for LPM).
+StatusTuple TypeCheck::check_update_method(MethodCallExprNode *n) {
+  auto table = scopes_->top_table()->lookup(n->id_->name_);
+  if (!table)
+    return mkstatus_(n, "Unknown table name %s", n->id_->c_str());
+  if (table->type_id()->name_ == "FIXED_MATCH" || table->type_id()->name_ == "INDEXED")
+    TRY2(expect_method_arg(n, 2));
+  else if (table->type_id()->name_ == "LPM")
+    TRY2(expect_method_arg(n, 3));
+  return StatusTuple(0);
+}
+
+// table.delete(key): one argument for match/indexed tables.
+// NOTE(review): the LPM branch is an empty statement — LPM deletes pass
+// with no argument validation at all. Confirm this is intended.
+StatusTuple TypeCheck::check_delete_method(MethodCallExprNode *n) {
+  auto table = scopes_->top_table()->lookup(n->id_->name_);
+  if (!table)
+    return mkstatus_(n, "Unknown table name %s", n->id_->c_str());
+  if (table->type_id()->name_ == "FIXED_MATCH" || table->type_id()->name_ == "INDEXED")
+    TRY2(expect_method_arg(n, 1));
+  else if (table->type_id()->name_ == "LPM")
+    {}
+  return StatusTuple(0);
+}
+
+// Type-check a call. Table methods (x.lookup/update/delete) and the
+// builtin functions (log, atomic_add, incr_cksum, sizeof, get_usec_time)
+// each set the call's result type/width; anything else stays VOID.
+StatusTuple TypeCheck::visit_method_call_expr_node(MethodCallExprNode *n) {
+  // be sure to visit those child nodes ASAP, so their properties can
+  // be propagated up to this node and be ready to be used
+  for (auto it = n->args_.begin(); it != n->args_.end(); ++it) {
+    TRY2((*it)->accept(this));
+  }
+
+  n->typeof_ = ExprNode::VOID;
+  if (n->id_->sub_name_.size()) {
+    if (n->id_->sub_name_ == "lookup") {
+      TRY2(check_lookup_method(n));
+    } else if (n->id_->sub_name_ == "update") {
+      TRY2(check_update_method(n));
+    } else if (n->id_->sub_name_ == "delete") {
+      TRY2(check_delete_method(n));
+    } else if (n->id_->sub_name_ == "rewrite_field" && n->id_->name_ == "pkt") {
+      TRY2(expect_method_arg(n, 2));
+      n->args_[0]->flags_[ExprNode::IS_LHS] = true;
+    }
+  } else if (n->id_->name_ == "log") {
+    if (n->args_.size() < 1)
+      return mkstatus_(n, "%s expected at least 1 argument", n->id_->c_str());
+    if (n->args_[0]->typeof_ != ExprNode::STRING)
+      return mkstatus_(n, "%s expected a string for argument 1", n->id_->c_str());
+    n->typeof_ = ExprNode::INTEGER;
+    n->bit_width_ = 32;
+  } else if (n->id_->name_ == "atomic_add") {
+    TRY2(expect_method_arg(n, 2));
+    n->typeof_ = ExprNode::INTEGER;
+    n->bit_width_ = n->args_[0]->bit_width_;
+    n->args_[0]->flags_[ExprNode::IS_LHS] = true;
+  } else if (n->id_->name_ == "incr_cksum") {
+    TRY2(expect_method_arg(n, 4, 1));
+    n->typeof_ = ExprNode::INTEGER;
+    n->bit_width_ = 16;
+  } else if (n->id_->name_ == "sizeof") {
+    TRY2(expect_method_arg(n, 1));
+    n->typeof_ = ExprNode::INTEGER;
+    n->bit_width_ = 32;
+  } else if (n->id_->name_ == "get_usec_time") {
+     TRY2(expect_method_arg(n, 0));
+     n->typeof_ = ExprNode::INTEGER;
+     n->bit_width_ = 64;
+  }
+
+  // Only lookup/update calls may carry a trailing block statement.
+  if (!n->block_->stmts_.empty()) {
+    if (n->id_->sub_name_ != "update" && n->id_->sub_name_ != "lookup")
+      return mkstatus_(n, "%s does not allow trailing block statements", n->id_->full_name().c_str());
+    TRY2(n->block_->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// "table[key]": the key's struct type must match the table's key type.
+// With a trailing ".field" the result is that numeric field; otherwise
+// the whole leaf struct is produced by reference.
+StatusTuple TypeCheck::visit_table_index_expr_node(TableIndexExprNode *n) {
+  n->table_ = scopes_->top_table()->lookup(n->id_->name_);
+  if (!n->table_) return mkstatus_(n, "Unknown table name %s", n->id_->c_str());
+  TRY2(n->index_->accept(this));
+  if (n->index_->struct_type_ != n->table_->key_type_)
+    return mkstatus_(n, "Key to table %s lookup must be of type %s", n->id_->c_str(), n->table_->key_id()->c_str());
+
+  if (n->sub_) {
+    n->sub_decl_ = n->table_->leaf_type_->field(n->sub_->name_);
+    if (!n->sub_decl_)
+      return mkstatus_(n, "Field %s is not a member of %s", n->sub_->c_str(), n->table_->leaf_id()->c_str());
+    n->typeof_ = ExprNode::INTEGER;
+  } else {
+    n->typeof_ = ExprNode::STRUCT;
+    n->flags_[ExprNode::IS_REF] = true;
+    n->struct_type_ = n->table_->leaf_type_;
+  }
+  return StatusTuple(0);
+}
+
+// An expression statement just checks its expression.
+StatusTuple TypeCheck::visit_expr_stmt_node(ExprStmtNode *n) {
+  TRY2(n->expr_->accept(this));
+  return StatusTuple(0);
+}
+
+// For an initialized struct variable: resolve its type (protocol scope
+// for "proto::" types, program scope otherwise), synthesize "= 0"
+// initializers for every field not explicitly initialized, then check
+// all initializers.
+StatusTuple TypeCheck::visit_struct_variable_decl_stmt_node(StructVariableDeclStmtNode *n) {
+  //TRY2(n->struct_id_->accept(this));
+  //TRY2(n->id_->accept(this));
+  if (!n->init_.empty()) {
+    StructDeclStmtNode *type;
+    if (n->struct_id_->scope_name_ == "proto")
+      type = proto_scopes_->top_struct()->lookup(n->struct_id_->name_, true);
+    else
+      type = scopes_->top_struct()->lookup(n->struct_id_->name_, true);
+
+    if (!type)
+      return mkstatus_(n, "type %s does not exist", n->struct_id_->full_name().c_str());
+
+    // init remaining fields to 0
+    set<string> used;
+    for (auto i = n->init_.begin(); i != n->init_.end(); ++i) {
+      auto asn = static_cast<AssignExprNode*>(i->get());
+      auto id = static_cast<IdentExprNode *>(asn->lhs_.get());
+      used.insert(id->sub_name_);
+    }
+    for (auto f = type->stmts_.begin(); f != type->stmts_.end(); ++f) {
+      if (used.find((*f)->id_->name_) == used.end()) {
+        auto id = make_unique<IdentExprNode>(n->id_->name_);
+        id->append_dot((*f)->id_->name_);
+        n->init_.push_back(make_unique<AssignExprNode>(move(id), make_unique<IntegerExprNode>("0")));
+      }
+    }
+
+    for (auto it = n->init_.begin(); it != n->init_.end(); ++it) {
+      TRY2((*it)->accept(this));
+    }
+  }
+  return StatusTuple(0);
+}
+
+// An integer variable only needs its (single) initializer checked.
+StatusTuple TypeCheck::visit_integer_variable_decl_stmt_node(IntegerVariableDeclStmtNode *n) {
+  //TRY2(n->id_->accept(this));
+  if (!n->init_.empty()) {
+    TRY2(n->init_[0]->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// Check each field declaration inside a struct declaration.
+StatusTuple TypeCheck::visit_struct_decl_stmt_node(StructDeclStmtNode *n) {
+  //TRY2(n->id_->accept(this));
+  for (auto it = n->stmts_.begin(); it != n->stmts_.end(); ++it) {
+    TRY2((*it)->accept(this));
+  }
+  return StatusTuple(0);
+}
+
+// Synthesized parser-state statements carry nothing to type-check.
+StatusTuple TypeCheck::visit_parser_state_stmt_node(ParserStateStmtNode *n) {
+  return StatusTuple(0);
+}
+
+StatusTuple TypeCheck::visit_state_decl_stmt_node(StateDeclStmtNode *n) {
+  if (!n->id_) {
+    return StatusTuple(0);
+  }
+  auto s1 = proto_scopes_->top_state()->lookup(n->id_->name_, true);
+  if (s1) {
+    const string &name = n->id_->name_;
+    auto offset_var = make_unique<IntegerVariableDeclStmtNode>(make_unique<IdentExprNode>("$" + name), "64");
+    offset_var->init_.push_back(make_unique<AssignExprNode>(offset_var->id_->copy(), make_unique<IntegerExprNode>("0")));
+    scopes_->current_var()->add("$" + name, offset_var.get());
+    s1->subs_[0].block_->scope_->add("$" + name, offset_var.get());
+    n->init_.push_back(move(offset_var));
+
+    n->parser_ = ParserStateStmtNode::make(n->id_);
+    n->parser_->next_state_ = s1->subs_[0].block_.get();
+    n->parser_->scope_id_ = n->scope_id_;
+
+    auto p = proto_scopes_->top_struct()->lookup(n->id_->name_, true);
+    if (!p) return mkstatus_(n, "unable to find struct decl for parser state %s", n->id_->full_name().c_str());
+
+    // $proto = parsed_bytes; parsed_bytes += sizeof($proto);
+    auto asn1 = make_unique<AssignExprNode>(make_unique<IdentExprNode>("$" + n->id_->name_),
+                                            make_unique<IdentExprNode>("parsed_bytes"));
+    n->init_.push_back(make_unique<ExprStmtNode>(move(asn1)));
+    auto add_expr = make_unique<BinopExprNode>(make_unique<IdentExprNode>("parsed_bytes"), Tok::TPLUS,
+                                               make_unique<IntegerExprNode>(std::to_string(p->bit_width_ >> 3), 64));
+    auto asn2 = make_unique<AssignExprNode>(make_unique<IdentExprNode>("parsed_bytes"), move(add_expr));
+    n->init_.push_back(make_unique<ExprStmtNode>(move(asn2)));
+  }
+
+  for (auto it = n->init_.begin(); it != n->init_.end(); ++it) {
+    TRY2((*it)->accept(this));
+  }
+
+  for (auto it = n->subs_.begin(); it != n->subs_.end(); ++it) {
+    scopes_->push_state(it->scope_);
+
+    TRY2(it->block_->accept(this));
+
+    if (s1) {
+      if (it->id_->name_ == "") {
+        it->parser_ = ParserStateStmtNode::make(it->id_);
+        it->parser_->next_state_ = s1->subs_[0].block_.get();
+        it->parser_->scope_id_ = n->scope_id_ + n->id_->name_ + "_";
+      } else if (auto s2 = proto_scopes_->top_state()->lookup(it->id_->name_, true)) {
+        it->parser_ = ParserStateStmtNode::make(it->id_);
+        it->parser_->next_state_ = s2->subs_[0].block_.get();
+        it->parser_->scope_id_ = n->scope_id_ + n->id_->name_ + "_";
+      }
+
+      if (it->parser_) {
+        TRY2(it->parser_->accept(this));
+      }
+    }
+
+    scopes_->pop_state();
+  }
+  return StatusTuple(0);
+}
+
+StatusTuple TypeCheck::visit_match_decl_stmt_node(MatchDeclStmtNode *n) {
+  //TRY2(n->id_->accept(this));
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    TRY2((*it)->accept(this));
+  }
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+StatusTuple TypeCheck::visit_miss_decl_stmt_node(MissDeclStmtNode *n) {
+  //TRY2(n->id_->accept(this));
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    TRY2((*it)->accept(this));
+  }
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+StatusTuple TypeCheck::visit_failure_decl_stmt_node(FailureDeclStmtNode *n) {
+  //TRY2(n->id_->accept(this));
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    TRY2((*it)->accept(this));
+  }
+  TRY2(n->block_->accept(this));
+  return StatusTuple(0);
+}
+
+StatusTuple TypeCheck::visit_table_decl_stmt_node(TableDeclStmtNode *n) {
+  n->key_type_ = scopes_->top_struct()->lookup(n->key_id()->name_, true);
+  if (!n->key_type_)
+    return mkstatus_(n, "Table key type %s undefined", n->key_id()->c_str());
+  n->key_id()->bit_width_ = n->key_type_->bit_width_;
+  n->leaf_type_ = scopes_->top_struct()->lookup(n->leaf_id()->name_, true);
+  if (!n->leaf_type_)
+    return mkstatus_(n, "Table leaf type %s undefined", n->leaf_id()->c_str());
+  n->leaf_id()->bit_width_ = n->leaf_type_->bit_width_;
+  if (n->type_id()->name_ == "INDEXED" && n->policy_id()->name_ != "AUTO") {
+    fprintf(stderr, "Table %s is INDEXED, policy should be AUTO\n", n->id_->c_str());
+    n->policy_id()->name_ = "AUTO";
+  }
+  if (n->policy_id()->name_ != "AUTO" && n->policy_id()->name_ != "NONE")
+    return mkstatus_(n, "Unsupported policy type %s", n->policy_id()->c_str());
+  return StatusTuple(0);
+}
+
+StatusTuple TypeCheck::visit_func_decl_stmt_node(FuncDeclStmtNode *n) {
+  for (auto it = n->formals_.begin(); it != n->formals_.end(); ++it) {
+    VariableDeclStmtNode *var = it->get();
+    TRY2(var->accept(this));
+    if (var->is_struct()) {
+      if (!var->is_pointer())
+        return mkstatus_(n, "Only struct references allowed in function definitions");
+    }
+  }
+  scopes_->push_state(n->scope_);
+  TRY2(n->block_->accept(this));
+  scopes_->pop_state();
+  return StatusTuple(0);
+}
+
+StatusTuple TypeCheck::visit(Node *root) {
+  BlockStmtNode *b = static_cast<BlockStmtNode*>(root);
+
+  scopes_->set_current(scopes_->top_state());
+  scopes_->set_current(scopes_->top_var());
+
+  // // packet data in bpf socket
+  // if (scopes_->top_struct()->lookup("_skbuff", true)) {
+  //   return StatusTuple(-1, "_skbuff already defined");
+  // }
+  // auto skb_type = make_unique<StructDeclStmtNode>(make_unique<IdentExprNode>("_skbuff"));
+  // scopes_->top_struct()->add("_skbuff", skb_type.get());
+  // b->stmts_.push_back(move(skb_type));
+
+  // if (scopes_->current_var()->lookup("skb", true)) {
+  //   return StatusTuple(-1, "skb already defined");
+  // }
+  // auto skb = make_unique<StructVariableDeclStmtNode>(make_unique<IdentExprNode>("_skbuff"),
+  //                                                    make_unique<IdentExprNode>("skb"));
+  // skb->storage_type_ = VariableDeclStmtNode::STRUCT_REFERENCE;
+  // scopes_->current_var()->add("skb", skb.get());
+  // b->stmts_.push_back(move(skb));
+
+  // offset counter
+  auto parsed_bytes = make_unique<IntegerVariableDeclStmtNode>(
+                        make_unique<IdentExprNode>("parsed_bytes"), "64");
+  parsed_bytes->init_.push_back(make_unique<AssignExprNode>(parsed_bytes->id_->copy(), make_unique<IntegerExprNode>("0")));
+  scopes_->current_var()->add("parsed_bytes", parsed_bytes.get());
+  b->stmts_.push_back(move(parsed_bytes));
+
+  TRY2(b->accept(this));
+
+  if (!errors_.empty()) {
+    for (auto it = errors_.begin(); it != errors_.end(); ++it) {
+      fprintf(stderr, "%s\n", it->c_str());
+    }
+    return StatusTuple(-1, errors_.begin()->c_str());
+  }
+  return StatusTuple(0);
+}
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/type_check.h b/src/cc/frontends/b/type_check.h
new file mode 100644
index 0000000..dbf427a
--- /dev/null
+++ b/src/cc/frontends/b/type_check.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include "node.h"
+#include "scope.h"
+
+namespace ebpf {
+namespace cc {
+
+class TypeCheck : public Visitor {
+ public:
+  TypeCheck(Scopes *scopes, Scopes *proto_scopes)
+      : scopes_(scopes), proto_scopes_(proto_scopes) {}
+
+  virtual STATUS_RETURN visit(Node* n);
+  STATUS_RETURN expect_method_arg(MethodCallExprNode* n, size_t num, size_t num_def_args);
+  STATUS_RETURN check_lookup_method(MethodCallExprNode* n);
+  STATUS_RETURN check_update_method(MethodCallExprNode* n);
+  STATUS_RETURN check_delete_method(MethodCallExprNode* n);
+
+#define VISIT(type, func) virtual STATUS_RETURN visit_##func(type* n);
+  EXPAND_NODES(VISIT)
+#undef VISIT
+
+ private:
+  Scopes *scopes_;
+  Scopes *proto_scopes_;
+  vector<string> errors_;
+};
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/b/type_helper.h b/src/cc/frontends/b/type_helper.h
new file mode 100644
index 0000000..ce96cc4
--- /dev/null
+++ b/src/cc/frontends/b/type_helper.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace ebpf {
+namespace cc {
+
+// Represent the numeric type of a protocol field
+enum FieldType {
+  INVALID = 0,
+  UINT8_T,
+  UINT16_T,
+  UINT32_T,
+  UINT64_T,
+#ifdef __SIZEOF_INT128__
+  UINT128_T,
+#endif
+  VOID
+};
+
+static inline size_t enum_to_size(const FieldType t) {
+  switch (t) {
+    case UINT8_T: return sizeof(uint8_t);
+    case UINT16_T: return sizeof(uint16_t);
+    case UINT32_T: return sizeof(uint32_t);
+    case UINT64_T: return sizeof(uint64_t);
+#ifdef __SIZEOF_INT128__
+    case UINT128_T: return sizeof(__uint128_t);
+#endif
+    default:
+      return 0;
+  }
+}
+
+/// Convert a bit size to the next highest power of 2
+static inline int next_base2(int v) {
+  --v;
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  ++v;
+  return v;
+}
+
+static inline const char* bits_to_uint(int v) {
+  v = next_base2(v);
+  if (v <= 8) {
+    return "uint8_t";
+  } else if (v == 16) {
+    return "uint16_t";
+  } else if (v == 32) {
+    return "uint32_t";
+  } else if (v == 64) {
+    return "uint64_t";
+  } else if (v >= 128) {
+    /* in plumlet 128-bit integers should be 8-byte aligned,
+     * all other ints should have natural alignment */
+    return "unsigned __int128 __attribute__((packed, aligned(8)))";
+  }
+  return "void";
+}
+
+static inline FieldType bits_to_enum(int v) {
+  v = next_base2(v);
+  if (v <= 8) {
+    return UINT8_T;
+  } else if (v == 16) {
+    return UINT16_T;
+  } else if (v == 32) {
+    return UINT32_T;
+  } else if (v == 64) {
+    return UINT64_T;
+#ifdef __SIZEOF_INT128__
+  } else if (v >= 128) {
+    return UINT128_T;
+#endif
+  }
+  return VOID;
+}
+
+static inline size_t bits_to_size(int v) {
+  return enum_to_size(bits_to_enum(v));
+}
+
+static inline size_t align_offset(size_t offset, FieldType ft) {
+  switch (ft) {
+    case UINT8_T:
+      return offset % 8 > 0 ? offset + (8 - offset % 8) : offset;
+    case UINT16_T:
+      return offset % 16 > 0 ? offset + (16 - offset % 16) : offset;
+    case UINT32_T:
+      return offset % 32 > 0 ? offset + (32 - offset % 32) : offset;
+    case UINT64_T:
+#ifdef __SIZEOF_INT128__
+    case UINT128_T:
+#endif
+      return offset % 64 > 0 ? offset + (64 - offset % 64) : offset;
+    default:
+      ;
+  }
+  return offset;
+}
+
+}  // namespace cc
+}  // namespace ebpf
diff --git a/src/cc/frontends/clang/CMakeLists.txt b/src/cc/frontends/clang/CMakeLists.txt
new file mode 100644
index 0000000..a6228fc
--- /dev/null
+++ b/src/cc/frontends/clang/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DKERNEL_MODULES_DIR='\"${BCC_KERNEL_MODULES_DIR}\"'")
+if(DEFINED BCC_CUR_CPU_IDENTIFIER)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUR_CPU_IDENTIFIER='\"${BCC_CUR_CPU_IDENTIFIER}\"'")
+endif()
+if(DEFINED BCC_BACKUP_COMPILE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBCC_BACKUP_COMPILE='${BCC_BACKUP_COMPILE}'")
+endif()
+
+add_library(clang_frontend STATIC loader.cc b_frontend_action.cc tp_frontend_action.cc kbuild_helper.cc ../../common.cc)
diff --git a/src/cc/frontends/clang/arch_helper.h b/src/cc/frontends/clang/arch_helper.h
new file mode 100644
index 0000000..76b4651
--- /dev/null
+++ b/src/cc/frontends/clang/arch_helper.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 Google, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+
+typedef enum {
+  BCC_ARCH_PPC,
+  BCC_ARCH_PPC_LE,
+  BCC_ARCH_S390X,
+  BCC_ARCH_ARM64,
+  BCC_ARCH_X86
+} bcc_arch_t;
+
+typedef void *(*arch_callback_t)(bcc_arch_t arch);
+
+static void *run_arch_callback(arch_callback_t fn)
+{
+  const char *archenv = getenv("ARCH");
+
+  /* If ARCH is not set, detect from local arch clang is running on */
+  if (!archenv) {
+#if defined(__powerpc64__)
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+    return fn(BCC_ARCH_PPC_LE);
+#else
+    return fn(BCC_ARCH_PPC);
+#endif
+#elif defined(__s390x__)
+    return fn(BCC_ARCH_S390X);
+#elif defined(__aarch64__)
+    return fn(BCC_ARCH_ARM64);
+#else
+    return fn(BCC_ARCH_X86);
+#endif
+  }
+
+  /* Otherwise read it from ARCH */
+  if (!strcmp(archenv, "powerpc")) {
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+    return fn(BCC_ARCH_PPC_LE);
+#else
+    return fn(BCC_ARCH_PPC);
+#endif
+  } else if (!strcmp(archenv, "s390x")) {
+    return fn(BCC_ARCH_S390X);
+  } else if (!strcmp(archenv, "arm64")) {
+    return fn(BCC_ARCH_ARM64);
+  } else {
+    return fn(BCC_ARCH_X86);
+  }
+}
diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc
new file mode 100644
index 0000000..12095e6
--- /dev/null
+++ b/src/cc/frontends/clang/b_frontend_action.cc
@@ -0,0 +1,1416 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <linux/bpf.h>
+#include <linux/version.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include <clang/AST/ASTConsumer.h>
+#include <clang/AST/ASTContext.h>
+#include <clang/AST/RecordLayout.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/MultiplexConsumer.h>
+#include <clang/Rewrite/Core/Rewriter.h>
+#include <clang/Lex/Lexer.h>
+
+#include "frontend_action_common.h"
+#include "b_frontend_action.h"
+#include "bpf_module.h"
+#include "common.h"
+#include "loader.h"
+#include "table_storage.h"
+#include "arch_helper.h"
+
+#include "libbpf.h"
+
+namespace ebpf {
+
+constexpr int MAX_CALLING_CONV_REGS = 6;
+const char *calling_conv_regs_x86[] = {
+  "di", "si", "dx", "cx", "r8", "r9"
+};
+const char *calling_conv_regs_ppc[] = {"gpr[3]", "gpr[4]", "gpr[5]",
+                                       "gpr[6]", "gpr[7]", "gpr[8]"};
+
+const char *calling_conv_regs_s390x[] = {"gprs[2]", "gprs[3]", "gprs[4]",
+					 "gprs[5]", "gprs[6]" };
+
+const char *calling_conv_regs_arm64[] = {"regs[0]", "regs[1]", "regs[2]",
+                                       "regs[3]", "regs[4]", "regs[5]"};
+
+void *get_call_conv_cb(bcc_arch_t arch)
+{
+  const char **ret;
+
+  switch(arch) {
+    case BCC_ARCH_PPC:
+    case BCC_ARCH_PPC_LE:
+      ret = calling_conv_regs_ppc;
+      break;
+    case BCC_ARCH_S390X:
+      ret = calling_conv_regs_s390x;
+      break;
+    case BCC_ARCH_ARM64:
+      ret = calling_conv_regs_arm64;
+      break;
+    default:
+      ret = calling_conv_regs_x86;
+  }
+
+  return (void *)ret;
+}
+
+const char **get_call_conv(void) {
+  const char **ret;
+
+  ret = (const char **)run_arch_callback(get_call_conv_cb);
+  return ret;
+}
+
+using std::map;
+using std::move;
+using std::set;
+using std::tuple;
+using std::make_tuple;
+using std::string;
+using std::to_string;
+using std::unique_ptr;
+using std::vector;
+using namespace clang;
+
+class ProbeChecker : public RecursiveASTVisitor<ProbeChecker> {
+ public:
+  explicit ProbeChecker(Expr *arg, const set<tuple<Decl *, int>> &ptregs,
+                        bool track_helpers, bool is_assign)
+      : needs_probe_(false), is_transitive_(false), ptregs_(ptregs),
+        track_helpers_(track_helpers), nb_derefs_(0), is_assign_(is_assign) {
+    if (arg) {
+      TraverseStmt(arg);
+      if (arg->getType()->isPointerType())
+        is_transitive_ = needs_probe_;
+    }
+  }
+  explicit ProbeChecker(Expr *arg, const set<tuple<Decl *, int>> &ptregs,
+                        bool is_transitive)
+      : ProbeChecker(arg, ptregs, is_transitive, false) {}
+  bool VisitCallExpr(CallExpr *E) {
+    needs_probe_ = false;
+
+    if (is_assign_) {
+      // We're looking for a function that returns an external pointer,
+      // regardless of the number of dereferences.
+      for(auto p : ptregs_) {
+        if (std::get<0>(p) == E->getDirectCallee()) {
+          needs_probe_ = true;
+          nb_derefs_ += std::get<1>(p);
+          return false;
+        }
+      }
+    } else {
+      tuple<Decl *, int> pt = make_tuple(E->getDirectCallee(), nb_derefs_);
+      if (ptregs_.find(pt) != ptregs_.end())
+        needs_probe_ = true;
+    }
+
+    if (!track_helpers_)
+      return false;
+    if (VarDecl *V = dyn_cast<VarDecl>(E->getCalleeDecl()))
+      needs_probe_ = V->getName() == "bpf_get_current_task";
+    return false;
+  }
+  bool VisitMemberExpr(MemberExpr *M) {
+    tuple<Decl *, int> pt = make_tuple(M->getMemberDecl(), nb_derefs_);
+    if (ptregs_.find(pt) != ptregs_.end()) {
+      needs_probe_ = true;
+      return false;
+    }
+    if (M->isArrow()) {
+      /* In A->b, if A is an external pointer, then A->b should be considered
+       * one too.  However, if we're taking the address of A->b
+       * (nb_derefs_ < 0), we should take it into account for the number of
+       * indirections; &A->b is a pointer to A with an offset. */
+      if (nb_derefs_ >= 0) {
+        ProbeChecker checker = ProbeChecker(M->getBase(), ptregs_,
+                                            track_helpers_, is_assign_);
+        if (checker.needs_probe() && checker.get_nb_derefs() == 0) {
+          needs_probe_ = true;
+          return false;
+        }
+      }
+      nb_derefs_++;
+    }
+    return true;
+  }
+  bool VisitUnaryOperator(UnaryOperator *E) {
+    if (E->getOpcode() == UO_Deref) {
+      /* In *A, if A is an external pointer, then *A should be considered one
+       * too. */
+      ProbeChecker checker = ProbeChecker(E->getSubExpr(), ptregs_,
+                                          track_helpers_, is_assign_);
+      if (checker.needs_probe() && checker.get_nb_derefs() == 0) {
+        needs_probe_ = true;
+        return false;
+      }
+      nb_derefs_++;
+    } else if (E->getOpcode() == UO_AddrOf) {
+      nb_derefs_--;
+    }
+    return true;
+  }
+  bool VisitDeclRefExpr(DeclRefExpr *E) {
+    if (is_assign_) {
+      // We're looking for an external pointer, regardless of the number of
+      // dereferences.
+      for(auto p : ptregs_) {
+        if (std::get<0>(p) == E->getDecl()) {
+          needs_probe_ = true;
+          nb_derefs_ += std::get<1>(p);
+          return false;
+        }
+      }
+    } else {
+      tuple<Decl *, int> pt = make_tuple(E->getDecl(), nb_derefs_);
+      if (ptregs_.find(pt) != ptregs_.end())
+        needs_probe_ = true;
+    }
+    return true;
+  }
+  bool needs_probe() const { return needs_probe_; }
+  bool is_transitive() const { return is_transitive_; }
+  int get_nb_derefs() const { return nb_derefs_; }
+ private:
+  bool needs_probe_;
+  bool is_transitive_;
+  const set<tuple<Decl *, int>> &ptregs_;
+  bool track_helpers_;
+  // Nb of dereferences we go through before finding the external pointer.
+  // A negative number counts the number of addrof.
+  int nb_derefs_;
+  bool is_assign_;
+};
+
+// Visit a piece of the AST and mark it as needing probe reads
+class ProbeSetter : public RecursiveASTVisitor<ProbeSetter> {
+ public:
+  explicit ProbeSetter(set<tuple<Decl *, int>> *ptregs, int nb_addrof)
+      : ptregs_(ptregs), nb_derefs_(-nb_addrof) {}
+  bool VisitDeclRefExpr(DeclRefExpr *E) {
+    tuple<Decl *, int> pt = make_tuple(E->getDecl(), nb_derefs_);
+    ptregs_->insert(pt);
+    return true;
+  }
+  explicit ProbeSetter(set<tuple<Decl *, int>> *ptregs)
+      : ProbeSetter(ptregs, 0) {}
+  bool VisitUnaryOperator(UnaryOperator *E) {
+    if (E->getOpcode() == UO_Deref)
+      nb_derefs_++;
+    return true;
+  }
+  bool VisitMemberExpr(MemberExpr *M) {
+    tuple<Decl *, int> pt = make_tuple(M->getMemberDecl(), nb_derefs_);
+    ptregs_->insert(pt);
+    return false;
+  }
+ private:
+  set<tuple<Decl *, int>> *ptregs_;
+  // Nb of dereferences we go through before getting to the actual variable.
+  int nb_derefs_;
+};
+
+MapVisitor::MapVisitor(set<Decl *> &m) : m_(m) {}
+
+bool MapVisitor::VisitCallExpr(CallExpr *Call) {
+  if (MemberExpr *Memb = dyn_cast<MemberExpr>(Call->getCallee()->IgnoreImplicit())) {
+    StringRef memb_name = Memb->getMemberDecl()->getName();
+    if (DeclRefExpr *Ref = dyn_cast<DeclRefExpr>(Memb->getBase())) {
+      if (SectionAttr *A = Ref->getDecl()->getAttr<SectionAttr>()) {
+        if (!A->getName().startswith("maps"))
+          return true;
+
+        if (memb_name == "update" || memb_name == "insert") {
+          ProbeChecker checker = ProbeChecker(Call->getArg(1), ptregs_, true,
+                                              true);
+          if (checker.needs_probe())
+            m_.insert(Ref->getDecl());
+        }
+      }
+    }
+  }
+  return true;
+}
+
+ProbeVisitor::ProbeVisitor(ASTContext &C, Rewriter &rewriter,
+                           set<Decl *> &m, bool track_helpers) :
+  C(C), rewriter_(rewriter), m_(m), track_helpers_(track_helpers),
+  addrof_stmt_(nullptr), is_addrof_(false) {}
+
+bool ProbeVisitor::assignsExtPtr(Expr *E, int *nbAddrOf) {
+  if (IsContextMemberExpr(E)) {
+    *nbAddrOf = 0;
+    return true;
+  }
+
+  /* If the expression contains a call to another function, we need to visit
+   * that function first to know if a rewrite is necessary (i.e., if the
+   * function returns an external pointer). */
+  if (!TraverseStmt(E))
+    return false;
+
+  ProbeChecker checker = ProbeChecker(E, ptregs_, track_helpers_,
+                                      true);
+  if (checker.is_transitive()) {
+    // The negative of the number of dereferences is the number of addrof.  In
+    // an assignment, if we went through n addrof before getting the external
+    // pointer, then we'll need n dereferences on the left-hand side variable
+    // to get to the external pointer.
+    *nbAddrOf = -checker.get_nb_derefs();
+    return true;
+  }
+
+  if (E->IgnoreParenCasts()->getStmtClass() == Stmt::CallExprClass) {
+    CallExpr *Call = dyn_cast<CallExpr>(E->IgnoreParenCasts());
+    if (MemberExpr *Memb = dyn_cast<MemberExpr>(Call->getCallee()->IgnoreImplicit())) {
+      StringRef memb_name = Memb->getMemberDecl()->getName();
+      if (DeclRefExpr *Ref = dyn_cast<DeclRefExpr>(Memb->getBase())) {
+        if (SectionAttr *A = Ref->getDecl()->getAttr<SectionAttr>()) {
+          if (!A->getName().startswith("maps"))
+            return false;
+
+          if (memb_name == "lookup" || memb_name == "lookup_or_init") {
+            if (m_.find(Ref->getDecl()) != m_.end()) {
+              // Retrieved an ext. pointer from a map, mark LHS as ext. pointer.
+              // Pointers from maps always need a single dereference to get the
+              // actual value.  The value may be an external pointer but cannot
+              // be a pointer to an external pointer as the verifier prohibits
+              // storing known pointers (to map values, context, the stack, or
+              // the packet) in maps.
+              *nbAddrOf = 1;
+              return true;
+            }
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+bool ProbeVisitor::VisitVarDecl(VarDecl *D) {
+  if (Expr *E = D->getInit()) {
+    int nbAddrOf;
+    if (assignsExtPtr(E, &nbAddrOf)) {
+      // The negative of the number of addrof is the number of dereferences.
+      tuple<Decl *, int> pt = make_tuple(D, -nbAddrOf);
+      set_ptreg(pt);
+    }
+  }
+  return true;
+}
+
+bool ProbeVisitor::TraverseStmt(Stmt *S) {
+  if (whitelist_.find(S) != whitelist_.end())
+    return true;
+  auto ret = RecursiveASTVisitor<ProbeVisitor>::TraverseStmt(S);
+  if (addrof_stmt_ == S) {
+    addrof_stmt_ = nullptr;
+    is_addrof_ = false;
+  }
+  return ret;
+}
+
+bool ProbeVisitor::VisitCallExpr(CallExpr *Call) {
+  // Skip bpf_probe_read for the third argument if it is an AddrOf.
+  if (VarDecl *V = dyn_cast<VarDecl>(Call->getCalleeDecl())) {
+    if (V->getName() == "bpf_probe_read" && Call->getNumArgs() >= 3) {
+      const Expr *E = Call->getArg(2)->IgnoreParenCasts();
+      whitelist_.insert(E);
+      return true;
+    }
+  }
+
+  if (FunctionDecl *F = dyn_cast<FunctionDecl>(Call->getCalleeDecl())) {
+    if (F->hasBody()) {
+      unsigned i = 0;
+      for (auto arg : Call->arguments()) {
+        ProbeChecker checker = ProbeChecker(arg, ptregs_, track_helpers_,
+                                            true);
+        if (checker.needs_probe()) {
+          tuple<Decl *, int> pt = make_tuple(F->getParamDecl(i),
+                                             checker.get_nb_derefs());
+          ptregs_.insert(pt);
+        }
+        ++i;
+      }
+      if (fn_visited_.find(F) == fn_visited_.end()) {
+        fn_visited_.insert(F);
+        /* Maintains a stack of the number of dereferences for the external
+         * pointers returned by each function in the call stack or -1 if the
+         * function didn't return an external pointer. */
+        ptregs_returned_.push_back(-1);
+        TraverseDecl(F);
+        int nb_derefs = ptregs_returned_.back();
+        ptregs_returned_.pop_back();
+        if (nb_derefs != -1) {
+          tuple<Decl *, int> pt = make_tuple(F, nb_derefs);
+          ptregs_.insert(pt);
+        }
+      }
+    }
+  }
+  return true;
+}
+bool ProbeVisitor::VisitReturnStmt(ReturnStmt *R) {
+  /* If this function wasn't called by another, there's no need to check the
+   * return statement for external pointers. */
+  if (ptregs_returned_.size() == 0)
+    return true;
+
+  /* Reverse order of traversals.  This is needed if, in the return statement,
+   * we're calling a function that's returning an external pointer: we need to
+   * know what the function is returning to decide what this function is
+   * returning. */
+  if (!TraverseStmt(R->getRetValue()))
+    return false;
+
+  ProbeChecker checker = ProbeChecker(R->getRetValue(), ptregs_,
+                                      track_helpers_, true);
+  if (checker.needs_probe()) {
+    int curr_nb_derefs = ptregs_returned_.back();
+    /* If the function returns external pointers with different levels of
+     * indirection, we handle the case with the highest level of indirection
+     * and leave it to the user to manually handle other cases. */
+    if (checker.get_nb_derefs() > curr_nb_derefs) {
+      ptregs_returned_.pop_back();
+      ptregs_returned_.push_back(checker.get_nb_derefs());
+    }
+  }
+  return true;
+}
+bool ProbeVisitor::VisitBinaryOperator(BinaryOperator *E) {
+  if (!E->isAssignmentOp())
+    return true;
+
+  // copy probe attribute from RHS to LHS if present
+  int nbAddrOf;
+  if (assignsExtPtr(E->getRHS(), &nbAddrOf)) {
+    ProbeSetter setter(&ptregs_, nbAddrOf);
+    setter.TraverseStmt(E->getLHS());
+  }
+  return true;
+}
+bool ProbeVisitor::VisitUnaryOperator(UnaryOperator *E) {
+  if (E->getOpcode() == UO_AddrOf) {
+    addrof_stmt_ = E;
+    is_addrof_ = true;
+  }
+  if (E->getOpcode() != UO_Deref)
+    return true;
+  if (memb_visited_.find(E) != memb_visited_.end())
+    return true;
+  Expr *sub = E->getSubExpr();
+  if (!ProbeChecker(sub, ptregs_, track_helpers_).needs_probe())
+    return true;
+  memb_visited_.insert(E);
+  string pre, post;
+  pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));";
+  pre += " bpf_probe_read(&_val, sizeof(_val), (u64)";
+  post = "); _val; })";
+  rewriter_.ReplaceText(expansionLoc(E->getOperatorLoc()), 1, pre);
+  rewriter_.InsertTextAfterToken(expansionLoc(GET_ENDLOC(sub)), post);
+  return true;
+}
+bool ProbeVisitor::VisitMemberExpr(MemberExpr *E) {
+  if (memb_visited_.find(E) != memb_visited_.end()) return true;
+
+  Expr *base;
+  SourceLocation rhs_start, member;
+  bool found = false;
+  for (MemberExpr *M = E; M; M = dyn_cast<MemberExpr>(M->getBase())) {
+    memb_visited_.insert(M);
+    rhs_start = GET_ENDLOC(M);
+    base = M->getBase();
+    member = M->getMemberLoc();
+    if (M->isArrow()) {
+      found = true;
+      break;
+    }
+  }
+  if (!found)
+    return true;
+  if (member.isInvalid()) {
+    error(GET_ENDLOC(base), "internal error: MemberLoc is invalid while preparing probe rewrite");
+    return false;
+  }
+
+  if (!rewriter_.isRewritable(GET_BEGINLOC(E)))
+    return true;
+
+  // parent expr has addrof, skip the rewrite, set is_addrof_ to false so
+  // it won't affect the next level of indirect address
+  if (is_addrof_) {
+    is_addrof_ = false;
+    return true;
+  }
+
+  /* If the base of the dereference is a call to another function, we need to
+   * visit that function first to know if a rewrite is necessary (i.e., if the
+   * function returns an external pointer). */
+  if (base->IgnoreParenCasts()->getStmtClass() == Stmt::CallExprClass) {
+    CallExpr *Call = dyn_cast<CallExpr>(base->IgnoreParenCasts());
+    if (!TraverseStmt(Call))
+      return false;
+  }
+
+  // Checks to see if the expression references something that needs to be run
+  // through bpf_probe_read.
+  if (!ProbeChecker(base, ptregs_, track_helpers_).needs_probe())
+    return true;
+
+  string rhs = rewriter_.getRewrittenText(expansionRange(SourceRange(rhs_start, GET_ENDLOC(E))));
+  string base_type = base->getType()->getPointeeType().getAsString();
+  string pre, post;
+  pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));";
+  pre += " bpf_probe_read(&_val, sizeof(_val), (u64)&";
+  post = rhs + "); _val; })";
+  rewriter_.InsertText(expansionLoc(GET_BEGINLOC(E)), pre);
+  rewriter_.ReplaceText(expansionRange(SourceRange(member, GET_ENDLOC(E))), post);
+  return true;
+}
+// Rewrite an array subscript on an external pointer into a GNU statement
+// expression that fetches the element via bpf_probe_read:
+//   ({ typeof(elem) _val; memset; bpf_probe_read(&_val, ..., base + idx); _val; })
+bool ProbeVisitor::VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
+  // Skip nodes already consumed by an enclosing rewrite.
+  if (memb_visited_.find(E) != memb_visited_.end()) return true;
+  if (!ProbeChecker(E, ptregs_, track_helpers_).needs_probe())
+    return true;
+
+  // Parent expr has addrof, skip the rewrite.
+  if (is_addrof_)
+    return true;
+
+  if (!rewriter_.isRewritable(GET_BEGINLOC(E)))
+    return true;
+
+  Expr *base = E->getBase();
+  Expr *idx = E->getIdx();
+  memb_visited_.insert(E);
+
+  // Both sub-expressions must live in rewritable (non-macro) text.
+  if (!rewriter_.isRewritable(GET_BEGINLOC(base)))
+    return true;
+  if (!rewriter_.isRewritable(GET_BEGINLOC(idx)))
+    return true;
+
+
+  string pre, lbracket, rbracket;
+  LangOptions opts;
+  SourceLocation lbracket_start, lbracket_end;
+  SourceRange lbracket_range;
+  // Prefix: declare _val, zero it, and open the bpf_probe_read call whose
+  // source address is computed as ((base) + (idx)).
+  pre = "({ typeof(" + E->getType().getAsString() + ") _val; __builtin_memset(&_val, 0, sizeof(_val));";
+  pre += " bpf_probe_read(&_val, sizeof(_val), (u64)((";
+  if (isMemberDereference(base)) {
+    pre += "&";
+    // If the base of the array subscript is a member dereference, we'll rewrite
+    // both at the same time.
+    addrof_stmt_ = base;
+    is_addrof_ = true;
+  }
+  rewriter_.InsertText(expansionLoc(GET_BEGINLOC(base)), pre);
+
+  /* Replace left bracket and any space around it.  Since Clang doesn't provide
+   * a method to retrieve the left bracket, replace everything from the end of
+   * the base to the start of the index. */
+  lbracket = ") + (";
+  lbracket_start = Lexer::getLocForEndOfToken(GET_ENDLOC(base), 1,
+                                              rewriter_.getSourceMgr(),
+                                              opts).getLocWithOffset(1);
+  lbracket_end = GET_BEGINLOC(idx).getLocWithOffset(-1);
+  lbracket_range = expansionRange(SourceRange(lbracket_start, lbracket_end));
+  rewriter_.ReplaceText(lbracket_range, lbracket);
+
+  // Suffix replaces the right bracket: close probe_read and yield _val.
+  rbracket = "))); _val; })";
+  rewriter_.ReplaceText(expansionLoc(E->getRBracketLoc()), 1, rbracket);
+
+  return true;
+}
+
+// Returns true if E is a chain of member accesses containing at least one
+// arrow (->) dereference, e.g. a.b->c or x->y; false otherwise.
+bool ProbeVisitor::isMemberDereference(Expr *E) {
+  Expr *stripped = E->IgnoreParenCasts();
+  if (stripped->getStmtClass() != Stmt::MemberExprClass)
+    return false;
+  // Walk inward through the member chain looking for an arrow access.
+  MemberExpr *memb = dyn_cast<MemberExpr>(stripped);
+  while (memb) {
+    if (memb->isArrow())
+      return true;
+    memb = dyn_cast<MemberExpr>(memb->getBase()->IgnoreParenCasts());
+  }
+  return false;
+}
+// Returns true if E is a pointer-typed member access (through ->) whose
+// ultimate base is the tracked probe context argument ctx_.
+bool ProbeVisitor::IsContextMemberExpr(Expr *E) {
+  if (!E->getType()->isPointerType())
+    return false;
+
+  Expr *base;
+  SourceLocation member;
+  bool found = false;
+  MemberExpr *M;
+  Expr *Ex = E->IgnoreParenCasts();
+  // Peel off nested array subscripts and member accesses until the first
+  // arrow dereference; its base is the candidate context reference.
+  while (Ex->getStmtClass() == Stmt::ArraySubscriptExprClass
+         || Ex->getStmtClass() == Stmt::MemberExprClass) {
+    if (Ex->getStmtClass() == Stmt::ArraySubscriptExprClass) {
+      Ex = dyn_cast<ArraySubscriptExpr>(Ex)->getBase()->IgnoreParenCasts();
+    } else if (Ex->getStmtClass() == Stmt::MemberExprClass) {
+      M = dyn_cast<MemberExpr>(Ex);
+      base = M->getBase()->IgnoreParenCasts();
+      member = M->getMemberLoc();
+      if (M->isArrow()) {
+        found = true;
+        break;
+      }
+      Ex = base;
+    }
+  }
+  if (!found) {
+    return false;
+  }
+  if (member.isInvalid()) {
+    return false;
+  }
+
+  // Only a direct reference to the context parameter itself qualifies.
+  if (DeclRefExpr *base_expr = dyn_cast<DeclRefExpr>(base)) {
+    if (base_expr->getDecl() == ctx_) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Map a source range to its macro-expansion range.  LLVM >= 7 returns a
+// CharSourceRange from getExpansionRange, so convert back to SourceRange.
+SourceRange
+ProbeVisitor::expansionRange(SourceRange range) {
+#if LLVM_MAJOR_VERSION >= 7
+  return rewriter_.getSourceMgr().getExpansionRange(range).getAsRange();
+#else
+  return rewriter_.getSourceMgr().getExpansionRange(range);
+#endif
+}
+
+// Map a (possibly macro-internal) location to its macro-expansion location.
+SourceLocation
+ProbeVisitor::expansionLoc(SourceLocation loc) {
+  SourceManager &sm = rewriter_.getSourceMgr();
+  return sm.getExpansionLoc(loc);
+}
+
+template <unsigned N>
+DiagnosticBuilder ProbeVisitor::error(SourceLocation loc, const char (&fmt)[N]) {
+  unsigned int diag_id = C.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error, fmt);
+  return C.getDiagnostics().Report(loc, diag_id);
+}
+
+// Cache the AST context, its diagnostics engine, the owning frontend action
+// and its rewriter; diagnostic text output goes to llvm::errs().
+BTypeVisitor::BTypeVisitor(ASTContext &C, BFrontendAction &fe)
+    : C(C), diag_(C.getDiagnostics()), fe_(fe), rewriter_(fe.rewriter()), out_(llvm::errs()) {}
+
+// Emit preamble declarations that copy each trace argument directly out of
+// the pt_regs register slot assigned by the calling convention.
+void BTypeVisitor::genParamDirectAssign(FunctionDecl *D, string& preamble,
+                                        const char **calling_conv_regs) {
+  // Argument 0 is the context pointer itself; only args 1..N-1 come from regs.
+  for (size_t idx = 1; idx < fn_args_.size(); idx++) {
+    ParmVarDecl *arg = fn_args_[idx];
+    // Re-declare the parameter in the preamble and initialize it from the
+    // register for position idx - 1.
+    // Todo: this init should be done only when the program requests it.
+    string decl_text = rewriter_.getRewrittenText(expansionRange(arg->getSourceRange()));
+    arg->addAttr(UnavailableAttr::CreateImplicit(C, "ptregs"));
+    preamble += " " + decl_text + " = " + fn_args_[0]->getName().str() + "->" +
+                string(calling_conv_regs[idx - 1]) + ";";
+  }
+}
+
+// Like genParamDirectAssign, but reads each argument with bpf_probe_read via
+// a nested pt_regs pointer — used for syscall-wrapper kprobes where the real
+// pt_regs is itself the first register argument of the outer context.
+void BTypeVisitor::genParamIndirectAssign(FunctionDecl *D, string& preamble,
+                                          const char **calling_conv_regs) {
+  string new_ctx;
+
+  for (size_t idx = 0; idx < fn_args_.size(); idx++) {
+    ParmVarDecl *arg = fn_args_[idx];
+    if (idx == 0) {
+      // Declare the shadow context: the wrapped pt_regs lives in the first
+      // calling-convention register of the outer context.
+      new_ctx = "__" + arg->getName().str();
+      preamble += " struct pt_regs * " + new_ctx + " = " +
+                  arg->getName().str() + "->" +
+                  string(calling_conv_regs[0]) + ";";
+      continue;
+    }
+    // Re-declare the parameter, then populate it with bpf_probe_read from
+    // the matching register slot of the inner pt_regs.
+    // Todo: this init should be done only when the program requests it.
+    string decl_text = rewriter_.getRewrittenText(expansionRange(arg->getSourceRange()));
+    preamble += "\n " + decl_text + ";";
+    preamble += " bpf_probe_read(&" + arg->getName().str() + ", sizeof(" +
+                arg->getName().str() + "), &" + new_ctx + "->" +
+                string(calling_conv_regs[idx - 1]) + ");";
+  }
+}
+
+// Rewrite a BPF program's parameter list: collapse all trace arguments into
+// the single context parameter and inject a preamble at the top of the body
+// that re-creates each argument from pt_regs.
+void BTypeVisitor::rewriteFuncParam(FunctionDecl *D) {
+  const char **calling_conv_regs = get_call_conv();
+
+  string preamble = "{\n";
+  if (D->param_size() > 1) {
+    // If function prefix is "syscall__" or "kprobe____x64_sys_",
+    // the function will attach to a kprobe syscall function.
+    // Guard parameter assignment with CONFIG_ARCH_HAS_SYSCALL_WRAPPER.
+    // For __x64_sys_* syscalls, this is always true, but we guard
+    // it in case of "syscall__" for other architectures.
+    if (strncmp(D->getName().str().c_str(), "syscall__", 9) == 0 ||
+        strncmp(D->getName().str().c_str(), "kprobe____x64_sys_", 18) == 0) {
+      preamble += "#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER\n";
+      genParamIndirectAssign(D, preamble, calling_conv_regs);
+      preamble += "\n#else\n";
+      genParamDirectAssign(D, preamble, calling_conv_regs);
+      preamble += "\n#endif\n";
+    } else {
+      genParamDirectAssign(D, preamble, calling_conv_regs);
+    }
+    // Erase params 1..N-1 from the signature, leaving only the context arg.
+    rewriter_.ReplaceText(
+        expansionRange(SourceRange(GET_ENDLOC(D->getParamDecl(0)),
+                    GET_ENDLOC(D->getParamDecl(D->getNumParams() - 1)))),
+        fn_args_[0]->getName());
+  }
+  // for each trace argument, convert the variable from ptregs to something on stack
+  if (CompoundStmt *S = dyn_cast<CompoundStmt>(D->getBody()))
+    rewriter_.ReplaceText(S->getLBracLoc(), 1, preamble);
+}
+
+bool BTypeVisitor::VisitFunctionDecl(FunctionDecl *D) {
+  // put each non-static non-inline function decl in its own section, to be
+  // extracted by the MemoryManager
+  auto real_start_loc = rewriter_.getSourceMgr().getFileLoc(GET_BEGINLOC(D));
+  if (fe_.is_rewritable_ext_func(D)) {
+    // Record the original source text and range for this BPF program.
+    current_fn_ = D->getName();
+    string bd = rewriter_.getRewrittenText(expansionRange(D->getSourceRange()));
+    fe_.func_src_.set_src(current_fn_, bd);
+    fe_.func_range_[current_fn_] = expansionRange(D->getSourceRange());
+    string attr = string("__attribute__((section(\"") + BPF_FN_PREFIX + D->getName().str() + "\")))\n";
+    rewriter_.InsertText(real_start_loc, attr);
+    // One arg is the context; the rest must fit in calling-convention regs.
+    if (D->param_size() > MAX_CALLING_CONV_REGS + 1) {
+      error(GET_BEGINLOC(D->getParamDecl(MAX_CALLING_CONV_REGS + 1)),
+            "too many arguments, bcc only supports in-register parameters");
+      return false;
+    }
+
+    fn_args_.clear();
+    for (auto arg_it = D->param_begin(); arg_it != D->param_end(); arg_it++) {
+      auto *arg = *arg_it;
+      if (arg->getName() == "") {
+        error(GET_ENDLOC(arg), "arguments to BPF program definition must be named");
+        return false;
+      }
+      fn_args_.push_back(arg);
+    }
+    rewriteFuncParam(D);
+  } else if (D->hasBody() &&
+             rewriter_.getSourceMgr().getFileID(real_start_loc)
+               == rewriter_.getSourceMgr().getMainFileID()) {
+    // rewritable functions that are static should be always treated as helper
+    rewriter_.InsertText(real_start_loc, "__attribute__((always_inline))\n");
+  }
+  return true;
+}
+
+// Reverse the order of call traversal so that parameters inside of
+// function calls will get rewritten before the call itself, otherwise
+// text mangling will result.
+bool BTypeVisitor::TraverseCallExpr(CallExpr *Call) {
+  // First descend into the children (callee and arguments)...
+  for (auto child : Call->children()) {
+    if (!TraverseStmt(child))
+      return false;
+  }
+  // ...then run the visitor callbacks for the call expression itself.
+  return WalkUpFromCallExpr(Call);
+}
+
+// convert calls of the type:
+//  table.foo(&key)
+// to:
+//  bpf_table_foo_elem(bpf_pseudo_fd(table), &key [,&leaf])
+// Also expands the asm("llvm.bpf.extra") pseudo-builtins and rejects calls
+// to non-static helper functions defined in the main file.
+bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
+  // make sure node is a reference to a bpf table, which is assured by the
+  // presence of the section("maps/<typename>") GNU __attribute__
+  if (MemberExpr *Memb = dyn_cast<MemberExpr>(Call->getCallee()->IgnoreImplicit())) {
+    StringRef memb_name = Memb->getMemberDecl()->getName();
+    if (DeclRefExpr *Ref = dyn_cast<DeclRefExpr>(Memb->getBase())) {
+      if (SectionAttr *A = Ref->getDecl()->getAttr<SectionAttr>()) {
+        if (!A->getName().startswith("maps"))
+          return true;
+
+        // Raw source text spanning all call arguments.
+        string args = rewriter_.getRewrittenText(expansionRange(SourceRange(GET_BEGINLOC(Call->getArg(0)),
+                                                   GET_ENDLOC(Call->getArg(Call->getNumArgs() - 1)))));
+
+        // find the table fd, which was opened at declaration time
+        TableStorage::iterator desc;
+        Path local_path({fe_.id(), Ref->getDecl()->getName()});
+        Path global_path({Ref->getDecl()->getName()});
+        if (!fe_.table_storage().Find(local_path, desc)) {
+          if (!fe_.table_storage().Find(global_path, desc)) {
+            error(GET_ENDLOC(Ref), "bpf_table %0 failed to open") << Ref->getDecl()->getName();
+            return false;
+          }
+        }
+        string fd = to_string(desc->second.fd);
+        string prefix, suffix;
+        string txt;
+        auto rewrite_start = GET_BEGINLOC(Call);
+        auto rewrite_end = GET_ENDLOC(Call);
+        if (memb_name == "lookup_or_init") {
+          // lookup; if absent, update with the default then lookup again.
+          string name = Ref->getDecl()->getName();
+          string arg0 = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange()));
+          string arg1 = rewriter_.getRewrittenText(expansionRange(Call->getArg(1)->getSourceRange()));
+          string lookup = "bpf_map_lookup_elem_(bpf_pseudo_fd(1, " + fd + ")";
+          string update = "bpf_map_update_elem_(bpf_pseudo_fd(1, " + fd + ")";
+          txt  = "({typeof(" + name + ".leaf) *leaf = " + lookup + ", " + arg0 + "); ";
+          txt += "if (!leaf) {";
+          txt += " " + update + ", " + arg0 + ", " + arg1 + ", BPF_NOEXIST);";
+          txt += " leaf = " + lookup + ", " + arg0 + ");";
+          txt += " if (!leaf) return 0;";
+          txt += "}";
+          txt += "leaf;})";
+        } else if (memb_name == "increment") {
+          // increment(key [, value]): add value (default 1) to the leaf;
+          // for hash maps, insert a freshly zeroed leaf when missing.
+          string name = Ref->getDecl()->getName();
+          string arg0 = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange()));
+
+          string increment_value = "1";
+          if (Call->getNumArgs() == 2) {
+            increment_value = rewriter_.getRewrittenText(expansionRange(Call->getArg(1)->getSourceRange()));
+
+          }
+
+          string lookup = "bpf_map_lookup_elem_(bpf_pseudo_fd(1, " + fd + ")";
+          string update = "bpf_map_update_elem_(bpf_pseudo_fd(1, " + fd + ")";
+          txt  = "({ typeof(" + name + ".key) _key = " + arg0 + "; ";
+          txt += "typeof(" + name + ".leaf) *_leaf = " + lookup + ", &_key); ";
+
+          txt += "if (_leaf) (*_leaf) += " + increment_value + ";";
+          if (desc->second.type == BPF_MAP_TYPE_HASH) {
+            txt += "else { typeof(" + name + ".leaf) _zleaf; __builtin_memset(&_zleaf, 0, sizeof(_zleaf)); ";
+            txt += "_zleaf += " + increment_value + ";";
+            txt += update + ", &_key, &_zleaf, BPF_NOEXIST); } ";
+          }
+          txt += "})";
+        } else if (memb_name == "perf_submit") {
+          // perf_submit(ctx, data, size) -> bpf_perf_event_output on this fd.
+          string name = Ref->getDecl()->getName();
+          string arg0 = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange()));
+          string args_other = rewriter_.getRewrittenText(expansionRange(SourceRange(GET_BEGINLOC(Call->getArg(1)),
+                                                           GET_ENDLOC(Call->getArg(2)))));
+          txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + ")";
+          txt += ", CUR_CPU_IDENTIFIER, " + args_other + ")";
+        } else if (memb_name == "perf_submit_skb") {
+          // Packet-sample variant: skb_len is packed into the upper 32 bits
+          // of the flags word alongside BPF_F_CURRENT_CPU.
+          string skb = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange()));
+          string skb_len = rewriter_.getRewrittenText(expansionRange(Call->getArg(1)->getSourceRange()));
+          string meta = rewriter_.getRewrittenText(expansionRange(Call->getArg(2)->getSourceRange()));
+          string meta_len = rewriter_.getRewrittenText(expansionRange(Call->getArg(3)->getSourceRange()));
+          txt = "bpf_perf_event_output(" +
+            skb + ", " +
+            "bpf_pseudo_fd(1, " + fd + "), " +
+            "((__u64)" + skb_len + " << 32) | BPF_F_CURRENT_CPU, " +
+            meta + ", " +
+            meta_len + ");";
+        } else if (memb_name == "get_stackid") {
+          if (desc->second.type == BPF_MAP_TYPE_STACK_TRACE) {
+            string arg0 =
+                rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange()));
+            txt = "bcc_get_stackid(";
+            txt += "bpf_pseudo_fd(1, " + fd + "), " + arg0;
+            // Keep the remaining args (flags) as written in the source.
+            rewrite_end = GET_ENDLOC(Call->getArg(0));
+            } else {
+              error(GET_BEGINLOC(Call), "get_stackid only available on stacktrace maps");
+              return false;
+            }
+        } else {
+          // Simple prefix/suffix rewrites: map the method name to the
+          // corresponding bpf helper and keep the argument list verbatim.
+          if (memb_name == "lookup") {
+            prefix = "bpf_map_lookup_elem";
+            suffix = ")";
+          } else if (memb_name == "update") {
+            prefix = "bpf_map_update_elem";
+            suffix = ", BPF_ANY)";
+          } else if (memb_name == "insert") {
+            if (desc->second.type == BPF_MAP_TYPE_ARRAY) {
+              warning(GET_BEGINLOC(Call), "all element of an array already exist; insert() will have no effect");
+            }
+            prefix = "bpf_map_update_elem";
+            suffix = ", BPF_NOEXIST)";
+          } else if (memb_name == "delete") {
+            prefix = "bpf_map_delete_elem";
+            suffix = ")";
+          } else if (memb_name == "call") {
+            prefix = "bpf_tail_call_";
+            suffix = ")";
+          } else if (memb_name == "perf_read") {
+            prefix = "bpf_perf_event_read";
+            suffix = ")";
+          } else if (memb_name == "perf_counter_value") {
+            prefix = "bpf_perf_event_read_value";
+            suffix = ")";
+          } else if (memb_name == "check_current_task") {
+            prefix = "bpf_current_task_under_cgroup";
+            suffix = ")";
+          } else if (memb_name == "redirect_map") {
+            prefix = "bpf_redirect_map";
+            suffix = ")";
+          } else {
+            error(GET_BEGINLOC(Call), "invalid bpf_table operation %0") << memb_name;
+            return false;
+          }
+          prefix += "((void *)bpf_pseudo_fd(1, " + fd + "), ";
+
+          txt = prefix + args + suffix;
+        }
+        if (!rewriter_.isRewritable(rewrite_start) || !rewriter_.isRewritable(rewrite_end)) {
+          error(GET_BEGINLOC(Call), "cannot use map function inside a macro");
+          return false;
+        }
+        rewriter_.ReplaceText(expansionRange(SourceRange(rewrite_start, rewrite_end)), txt);
+        return true;
+      }
+    }
+  } else if (Call->getCalleeDecl()) {
+    NamedDecl *Decl = dyn_cast<NamedDecl>(Call->getCalleeDecl());
+    if (!Decl) return true;
+    if (AsmLabelAttr *A = Decl->getAttr<AsmLabelAttr>()) {
+      // Functions with the tag asm("llvm.bpf.extra") are implemented in the
+      // rewriter rather than as a macro since they may also include nested
+      // rewrites, and clang::Rewriter does not support rewrites in macros,
+      // unless one preprocesses the entire source file.
+      if (A->getLabel() == "llvm.bpf.extra") {
+        if (!rewriter_.isRewritable(GET_BEGINLOC(Call))) {
+          error(GET_BEGINLOC(Call), "cannot use builtin inside a macro");
+          return false;
+        }
+
+        // Capture each argument's (already rewritten) source text.
+        vector<string> args;
+        for (auto arg : Call->arguments())
+          args.push_back(rewriter_.getRewrittenText(expansionRange(arg->getSourceRange())));
+
+        string text;
+        if (Decl->getName() == "incr_cksum_l3") {
+          text = "bpf_l3_csum_replace_(" + fn_args_[0]->getName().str() + ", (u64)";
+          text += args[0] + ", " + args[1] + ", " + args[2] + ", sizeof(" + args[2] + "))";
+          rewriter_.ReplaceText(expansionRange(Call->getSourceRange()), text);
+        } else if (Decl->getName() == "incr_cksum_l4") {
+          text = "bpf_l4_csum_replace_(" + fn_args_[0]->getName().str() + ", (u64)";
+          text += args[0] + ", " + args[1] + ", " + args[2];
+          text += ", ((" + args[3] + " & 0x1) << 4) | sizeof(" + args[2] + "))";
+          rewriter_.ReplaceText(expansionRange(Call->getSourceRange()), text);
+        } else if (Decl->getName() == "bpf_trace_printk") {
+          checkFormatSpecifiers(args[0], GET_BEGINLOC(Call->getArg(0)));
+          //  #define bpf_trace_printk(fmt, args...)
+          //    ({ char _fmt[] = fmt; bpf_trace_printk_(_fmt, sizeof(_fmt), args...); })
+          text = "({ char _fmt[] = " + args[0] + "; bpf_trace_printk_(_fmt, sizeof(_fmt)";
+          if (args.size() <= 1) {
+            text += "); })";
+            rewriter_.ReplaceText(expansionRange(Call->getSourceRange()), text);
+          } else {
+            // Keep the trailing varargs in place; only rewrite the head.
+            rewriter_.ReplaceText(expansionRange(SourceRange(GET_BEGINLOC(Call), GET_ENDLOC(Call->getArg(0)))), text);
+            rewriter_.InsertTextAfter(GET_ENDLOC(Call), "); }");
+          }
+        } else if (Decl->getName() == "bpf_num_cpus") {
+          // Resolved at compile time on the loading host.
+          int numcpu = sysconf(_SC_NPROCESSORS_ONLN);
+          if (numcpu <= 0)
+            numcpu = 1;
+          text = to_string(numcpu);
+          rewriter_.ReplaceText(expansionRange(Call->getSourceRange()), text);
+        } else if (Decl->getName() == "bpf_usdt_readarg_p") {
+          text = "({ u64 __addr = 0x0; ";
+          text += "_bpf_readarg_" + current_fn_ + "_" + args[0] + "(" +
+                  args[1] + ", &__addr, sizeof(__addr));";
+          text += "bpf_probe_read(" + args[2] + ", " + args[3] +
+                  ", (void *)__addr);";
+          text += "})";
+          rewriter_.ReplaceText(expansionRange(Call->getSourceRange()), text);
+        } else if (Decl->getName() == "bpf_usdt_readarg") {
+          text = "_bpf_readarg_" + current_fn_ + "_" + args[0] + "(" + args[1] +
+                 ", " + args[2] + ", sizeof(*(" + args[2] + ")))";
+          rewriter_.ReplaceText(expansionRange(Call->getSourceRange()), text);
+        }
+      }
+    } else if (FunctionDecl *F = dyn_cast<FunctionDecl>(Decl)) {
+      // BPF programs can only call helpers defined outside the main file
+      // (or static ones, which are inlined); reject the rest.
+      if (F->isExternallyVisible() && !F->getBuiltinID()) {
+        auto start_loc = rewriter_.getSourceMgr().getFileLoc(GET_BEGINLOC(Decl));
+        if (rewriter_.getSourceMgr().getFileID(start_loc)
+            == rewriter_.getSourceMgr().getMainFileID()) {
+          error(GET_BEGINLOC(Call), "cannot call non-static helper function");
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+// Validate a bpf_trace_printk format string: printable ASCII only, at most 3
+// conversion specifiers, only %d/%u/%x (with l/ll), %p, and at most one %s.
+// Emits a warning and returns false on the first violation.
+bool BTypeVisitor::checkFormatSpecifiers(const string& fmt, SourceLocation loc) {
+  unsigned nb_specifiers = 0, i, j;
+  bool has_s = false;
+  for (i = 0; i < fmt.length(); i++) {
+    if (!isascii(fmt[i]) || (!isprint(fmt[i]) && !isspace(fmt[i]))) {
+      warning(loc.getLocWithOffset(i), "unrecognized character");
+      return false;
+    }
+    if (fmt[i] != '%')
+      continue;
+    // The kernel's bpf_trace_printk accepts a limited number of args.
+    if (nb_specifiers >= 3) {
+      warning(loc.getLocWithOffset(i), "cannot use more than 3 conversion specifiers");
+      return false;
+    }
+    nb_specifiers++;
+    i++;
+    // NOTE: indexing at fmt.length() reads std::string's terminating NUL,
+    // which the checks below treat as end-of-specifier.
+    if (fmt[i] == 'l') {
+      i++;
+    } else if (fmt[i] == 'p' || fmt[i] == 's') {
+      i++;
+      // %p/%s must be followed by a separator (or end of string).
+      if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) {
+        warning(loc.getLocWithOffset(i - 2),
+                "only %%d %%u %%x %%ld %%lu %%lx %%lld %%llu %%llx %%p %%s conversion specifiers allowed");
+        return false;
+      }
+      if (fmt[i - 1] == 's') {
+        if (has_s) {
+          warning(loc.getLocWithOffset(i - 2), "cannot use several %%s conversion specifiers");
+          return false;
+        }
+        has_s = true;
+      }
+      continue;
+    }
+    // Optional second 'l' (for %lld/%llu/%llx); j tracks offset for warnings.
+    j = 1;
+    if (fmt[i] == 'l') {
+      i++;
+      j++;
+    }
+    if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') {
+      warning(loc.getLocWithOffset(i - j),
+              "only %%d %%u %%x %%ld %%lu %%lx %%lld %%llu %%llx %%p %%s conversion specifiers allowed");
+      return false;
+    }
+  }
+  return true;
+}
+
+// Rewrite assignments to fields of "packet"-annotated structs into
+// bpf_dins_pkt() calls that insert the bitfield directly into the packet.
+bool BTypeVisitor::VisitBinaryOperator(BinaryOperator *E) {
+  if (!E->isAssignmentOp())
+    return true;
+  Expr *LHS = E->getLHS()->IgnoreImplicit();
+  if (MemberExpr *Memb = dyn_cast<MemberExpr>(LHS)) {
+    if (DeclRefExpr *Base = dyn_cast<DeclRefExpr>(Memb->getBase()->IgnoreImplicit())) {
+      // The "packet" marker is carried via a deprecated-attribute message.
+      if (DeprecatedAttr *A = Base->getDecl()->getAttr<DeprecatedAttr>()) {
+        if (A->getMessage() == "packet") {
+          if (FieldDecl *F = dyn_cast<FieldDecl>(Memb->getMemberDecl())) {
+            if (!rewriter_.isRewritable(GET_BEGINLOC(E))) {
+              error(GET_BEGINLOC(E), "cannot use \"packet\" header type inside a macro");
+              return false;
+            }
+            // Field position/width in bits; ofs>>3 is the byte offset and
+            // ofs&0x7 the bit offset within that byte.
+            uint64_t ofs = C.getFieldOffset(F);
+            uint64_t sz = F->isBitField() ? F->getBitWidthValue(C) : C.getTypeSize(F->getType());
+            string base = rewriter_.getRewrittenText(expansionRange(Base->getSourceRange()));
+            string text = "bpf_dins_pkt(" + fn_args_[0]->getName().str() + ", (u64)" + base + "+" + to_string(ofs >> 3)
+                + ", " + to_string(ofs & 0x7) + ", " + to_string(sz) + ",";
+            // Replace "lhs =" with the call head; the RHS becomes the value arg.
+            rewriter_.ReplaceText(expansionRange(SourceRange(GET_BEGINLOC(E), E->getOperatorLoc())), text);
+            rewriter_.InsertTextAfterToken(GET_ENDLOC(E), ")");
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+// Rewrite reads of fields of "packet"-annotated structs into bpf_dext_pkt()
+// calls that extract the bitfield from the packet.
+bool BTypeVisitor::VisitImplicitCastExpr(ImplicitCastExpr *E) {
+  // use dext only for RValues
+  if (E->getCastKind() != CK_LValueToRValue)
+    return true;
+  MemberExpr *Memb = dyn_cast<MemberExpr>(E->IgnoreImplicit());
+  if (!Memb)
+    return true;
+  Expr *Base = Memb->getBase()->IgnoreImplicit();
+  if (DeclRefExpr *Ref = dyn_cast<DeclRefExpr>(Base)) {
+    // The "packet" marker is carried via a deprecated-attribute message.
+    if (DeprecatedAttr *A = Ref->getDecl()->getAttr<DeprecatedAttr>()) {
+      if (A->getMessage() == "packet") {
+        if (FieldDecl *F = dyn_cast<FieldDecl>(Memb->getMemberDecl())) {
+          if (!rewriter_.isRewritable(GET_BEGINLOC(E))) {
+            error(GET_BEGINLOC(E), "cannot use \"packet\" header type inside a macro");
+            return false;
+          }
+          // Field position/width in bits; ofs>>3 is the byte offset and
+          // ofs&0x7 the bit offset within that byte.
+          uint64_t ofs = C.getFieldOffset(F);
+          uint64_t sz = F->isBitField() ? F->getBitWidthValue(C) : C.getTypeSize(F->getType());
+          string text = "bpf_dext_pkt(" + fn_args_[0]->getName().str() + ", (u64)" + Ref->getDecl()->getName().str() + "+"
+              + to_string(ofs >> 3) + ", " + to_string(ofs & 0x7) + ", " + to_string(sz) + ")";
+          rewriter_.ReplaceText(expansionRange(E->getSourceRange()), text);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+// Map a source range to its macro-expansion range.  LLVM >= 7 returns a
+// CharSourceRange from getExpansionRange, so convert back to SourceRange.
+SourceRange
+BTypeVisitor::expansionRange(SourceRange range) {
+#if LLVM_MAJOR_VERSION >= 7
+  return rewriter_.getSourceMgr().getExpansionRange(range).getAsRange();
+#else
+  return rewriter_.getSourceMgr().getExpansionRange(range);
+#endif
+}
+
+template <unsigned N>
+DiagnosticBuilder BTypeVisitor::error(SourceLocation loc, const char (&fmt)[N]) {
+  unsigned int diag_id = C.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error, fmt);
+  return C.getDiagnostics().Report(loc, diag_id);
+}
+
+// Report a custom warning diagnostic with format string fmt at location loc.
+template <unsigned N>
+DiagnosticBuilder BTypeVisitor::warning(SourceLocation loc, const char (&fmt)[N]) {
+  DiagnosticsEngine &diags = C.getDiagnostics();
+  unsigned int id = diags.getCustomDiagID(DiagnosticsEngine::Warning, fmt);
+  return diags.Report(loc, id);
+}
+
+// Open table FDs when bpf tables (as denoted by section("maps*") attribute)
+// are declared.
+// Open table FDs when bpf tables (as denoted by section("maps*") attribute)
+// are declared.  Parses the table struct's key/leaf/max_entries/flags fields,
+// maps the section name to a bpf map type, creates (or resolves, for
+// extern/export/shared) the map, and registers it in table storage.
+// Returns false (aborting the traversal) on any declaration error.
+bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
+  const RecordType *R = Decl->getType()->getAs<RecordType>();
+  if (SectionAttr *A = Decl->getAttr<SectionAttr>()) {
+    if (!A->getName().startswith("maps"))
+      return true;
+    if (!R) {
+      error(GET_ENDLOC(Decl), "invalid type for bpf_table, expect struct");
+      return false;
+    }
+    const RecordDecl *RD = R->getDecl()->getDefinition();
+
+    TableDesc table;
+    TableStorage::iterator table_it;
+    table.name = Decl->getName();
+    Path local_path({fe_.id(), table.name});
+    Path maps_ns_path({"ns", fe_.maps_ns(), table.name});
+    Path global_path({table.name});
+    QualType key_type, leaf_type;
+
+    // Extract key/leaf sizes and the static max_entries/flags initializers
+    // from the table struct's fields.
+    for (auto F : RD->fields()) {
+      if (F->getType().getTypePtr()->isIncompleteType()) {
+        error(GET_BEGINLOC(F), "unknown type");
+        return false;
+      }
+
+      size_t sz = C.getTypeSize(F->getType()) >> 3;  // bits -> bytes
+      if (F->getName() == "key") {
+        if (sz == 0) {
+          // Fixed: previously reported "invalid zero-sized leaf" here.
+          error(GET_BEGINLOC(F), "invalid zero-sized key");
+          return false;
+        }
+        table.key_size = sz;
+        key_type = F->getType();
+      } else if (F->getName() == "leaf") {
+        if (sz == 0) {
+          error(GET_BEGINLOC(F), "invalid zero-sized leaf");
+          return false;
+        }
+        table.leaf_size = sz;
+        leaf_type = F->getType();
+      } else if (F->getName() == "max_entries") {
+        // Evaluate the constant initializer from the declaration, if any.
+        unsigned idx = F->getFieldIndex();
+        if (auto I = dyn_cast_or_null<InitListExpr>(Decl->getInit())) {
+          llvm::APSInt res;
+          if (I->getInit(idx)->EvaluateAsInt(res, C)) {
+            table.max_entries = res.getExtValue();
+          }
+        }
+      } else if (F->getName() == "flags") {
+        unsigned idx = F->getFieldIndex();
+        if (auto I = dyn_cast_or_null<InitListExpr>(Decl->getInit())) {
+          llvm::APSInt res;
+          if (I->getInit(idx)->EvaluateAsInt(res, C)) {
+            table.flags = res.getExtValue();
+          }
+        }
+      }
+    }
+
+    // Map the section suffix to a kernel map type; extern/export/shared are
+    // aliasing pseudo-types resolved against previously registered tables.
+    bpf_map_type map_type = BPF_MAP_TYPE_UNSPEC;
+    if (A->getName() == "maps/hash") {
+      map_type = BPF_MAP_TYPE_HASH;
+    } else if (A->getName() == "maps/array") {
+      map_type = BPF_MAP_TYPE_ARRAY;
+    } else if (A->getName() == "maps/percpu_hash") {
+      map_type = BPF_MAP_TYPE_PERCPU_HASH;
+    } else if (A->getName() == "maps/percpu_array") {
+      map_type = BPF_MAP_TYPE_PERCPU_ARRAY;
+    } else if (A->getName() == "maps/lru_hash") {
+      map_type = BPF_MAP_TYPE_LRU_HASH;
+    } else if (A->getName() == "maps/lru_percpu_hash") {
+      map_type = BPF_MAP_TYPE_LRU_PERCPU_HASH;
+    } else if (A->getName() == "maps/lpm_trie") {
+      map_type = BPF_MAP_TYPE_LPM_TRIE;
+    } else if (A->getName() == "maps/histogram") {
+      // Integer-keyed histograms use an array; others fall back to a hash.
+      map_type = BPF_MAP_TYPE_HASH;
+      if (key_type->isSpecificBuiltinType(BuiltinType::Int))
+        map_type = BPF_MAP_TYPE_ARRAY;
+      if (!leaf_type->isSpecificBuiltinType(BuiltinType::ULongLong))
+        error(GET_BEGINLOC(Decl), "histogram leaf type must be u64, got %0") << leaf_type;
+    } else if (A->getName() == "maps/prog") {
+      map_type = BPF_MAP_TYPE_PROG_ARRAY;
+    } else if (A->getName() == "maps/perf_output") {
+      // One ring per possible CPU.
+      map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
+      int numcpu = get_possible_cpus().size();
+      if (numcpu <= 0)
+        numcpu = 1;
+      table.max_entries = numcpu;
+    } else if (A->getName() == "maps/perf_array") {
+      map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
+    } else if (A->getName() == "maps/cgroup_array") {
+      map_type = BPF_MAP_TYPE_CGROUP_ARRAY;
+    } else if (A->getName() == "maps/stacktrace") {
+      map_type = BPF_MAP_TYPE_STACK_TRACE;
+    } else if (A->getName() == "maps/devmap") {
+      map_type = BPF_MAP_TYPE_DEVMAP;
+    } else if (A->getName() == "maps/cpumap") {
+      map_type = BPF_MAP_TYPE_CPUMAP;
+    } else if (A->getName() == "maps/extern") {
+      // Reference a table defined in another program (namespace or global).
+      if (!fe_.table_storage().Find(maps_ns_path, table_it)) {
+        if (!fe_.table_storage().Find(global_path, table_it)) {
+          error(GET_BEGINLOC(Decl), "reference to undefined table");
+          return false;
+        }
+      }
+      table = table_it->second.dup();
+      table.is_extern = true;
+    } else if (A->getName() == "maps/export") {
+      // Promote a local table (optionally "__"-prefixed) to the global scope.
+      if (table.name.substr(0, 2) == "__")
+        table.name = table.name.substr(2);
+      Path local_path({fe_.id(), table.name});
+      Path global_path({table.name});
+      if (!fe_.table_storage().Find(local_path, table_it)) {
+        error(GET_BEGINLOC(Decl), "reference to undefined table");
+        return false;
+      }
+      fe_.table_storage().Insert(global_path, table_it->second.dup());
+      return true;
+    } else if (A->getName() == "maps/shared") {
+      // Promote a local table to the maps namespace scope.
+      if (table.name.substr(0, 2) == "__")
+        table.name = table.name.substr(2);
+      Path local_path({fe_.id(), table.name});
+      Path maps_ns_path({"ns", fe_.maps_ns(), table.name});
+      if (!fe_.table_storage().Find(local_path, table_it)) {
+        error(GET_BEGINLOC(Decl), "reference to undefined table");
+        return false;
+      }
+      fe_.table_storage().Insert(maps_ns_path, table_it->second.dup());
+      return true;
+    }
+
+    if (!table.is_extern) {
+      if (map_type == BPF_MAP_TYPE_UNSPEC) {
+        error(GET_BEGINLOC(Decl), "unsupported map type: %0") << A->getName();
+        return false;
+      }
+
+      table.type = map_type;
+      table.fd = bpf_create_map(map_type, table.name.c_str(),
+                                table.key_size, table.leaf_size,
+                                table.max_entries, table.flags);
+    }
+    // Extern tables carry a dup'd fd, so this check covers both paths.
+    if (table.fd < 0) {
+      error(GET_BEGINLOC(Decl), "could not open bpf map: %0\nis %1 map type enabled in your kernel?") <<
+          strerror(errno) << A->getName();
+      return false;
+    }
+
+    if (!table.is_extern)
+      fe_.table_storage().VisitMapType(table, C, key_type, leaf_type);
+    fe_.table_storage().Insert(local_path, move(table));
+  } else if (const PointerType *P = Decl->getType()->getAs<PointerType>()) {
+    // if var is a pointer to a packet type, clone the annotation into the var
+    // decl so that the packet dext/dins rewriter can catch it
+    if (const RecordType *RT = P->getPointeeType()->getAs<RecordType>()) {
+      if (const RecordDecl *RD = RT->getDecl()->getDefinition()) {
+        if (DeprecatedAttr *DA = RD->getAttr<DeprecatedAttr>()) {
+          if (DA->getMessage() == "packet") {
+            Decl->addAttr(DA->clone(C));
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+
+// First traversal of AST to retrieve maps with external pointers.
+// Builds the consumer's visitor pipeline. Both ProbeVisitor instances share
+// the same decl set `m`: probe_visitor1_ (track_helpers=true) handles
+// pointers originating from function arguments, probe_visitor2_
+// (track_helpers=false) handles pointers read back out of maps.
+BTypeConsumer::BTypeConsumer(ASTContext &C, BFrontendAction &fe,
+                             Rewriter &rewriter, set<Decl *> &m)
+    : fe_(fe),
+      map_visitor_(m),
+      btype_visitor_(C, fe),
+      probe_visitor1_(C, rewriter, m, true),
+      probe_visitor2_(C, rewriter, m, false) {}
+
+// Entry point invoked by clang once the whole translation unit is parsed;
+// runs the three traversal phases described in the comments below.
+void BTypeConsumer::HandleTranslationUnit(ASTContext &Context) {
+  DeclContext::decl_iterator it;
+  DeclContext *DC = TranslationUnitDecl::castToDeclContext(Context.getTranslationUnitDecl());
+
+  /**
+   * In a first traversal, ProbeVisitor tracks external pointers identified
+   * through each function's arguments and replaces their dereferences with
+   * calls to bpf_probe_read. It also passes all identified pointers to
+   * external addresses to MapVisitor.
+   */
+  for (it = DC->decls_begin(); it != DC->decls_end(); it++) {
+    Decl *D = *it;
+    if (FunctionDecl *F = dyn_cast<FunctionDecl>(D)) {
+      if (fe_.is_rewritable_ext_func(F)) {
+        for (auto arg : F->parameters()) {
+          if (arg == F->getParamDecl(0)) {
+            /**
+             * Limit tracing of pointers from context to tracing contexts.
+             * We're whitelisting instead of blacklisting to avoid issues with
+             * existing programs if new context types are added in the future.
+             */
+            string type = arg->getType().getAsString();
+            if (type == "struct pt_regs *" ||
+                type == "struct bpf_raw_tracepoint_args *" ||
+                type.substr(0, 19) == "struct tracepoint__")
+              probe_visitor1_.set_ctx(arg);
+          } else if (!arg->getType()->isFundamentalType()) {
+            // Every other non-fundamental argument is treated as a potential
+            // external pointer, with zero dereferences seen so far.
+            tuple<Decl *, int> pt = make_tuple(arg, 0);
+            probe_visitor1_.set_ptreg(pt);
+          }
+        }
+
+        probe_visitor1_.TraverseDecl(D);
+        // Hand every external pointer found in this function to MapVisitor,
+        // which looks for maps storing such pointers as values.
+        for (auto ptreg : probe_visitor1_.get_ptregs()) {
+          map_visitor_.set_ptreg(ptreg);
+        }
+      }
+    }
+  }
+
+  /**
+   * MapVisitor uses external pointers identified by the first ProbeVisitor
+   * traversal to identify all maps with external pointers as values.
+   * MapVisitor runs only after ProbeVisitor finished its traversal of the
+   * whole translation unit to clearly separate the role of each ProbeVisitor's
+   * traversal: the first tracks external pointers from function arguments,
+   * whereas the second tracks external pointers from maps. Without this clear
+   * separation, ProbeVisitor might attempt to replace several times the same
+   * dereferences.
+   */
+  for (it = DC->decls_begin(); it != DC->decls_end(); it++) {
+    Decl *D = *it;
+    if (FunctionDecl *F = dyn_cast<FunctionDecl>(D)) {
+      if (fe_.is_rewritable_ext_func(F)) {
+        map_visitor_.TraverseDecl(D);
+      }
+    }
+  }
+
+  /**
+   * In a second traversal, ProbeVisitor tracks pointers passed through the
+   * maps identified by MapVisitor and replaces their dereferences with calls
+   * to bpf_probe_read.
+   * This last traversal runs after MapVisitor went through an entire
+   * translation unit, to ensure maps with external pointers have all been
+   * identified.
+   */
+  for (it = DC->decls_begin(); it != DC->decls_end(); it++) {
+    Decl *D = *it;
+    if (FunctionDecl *F = dyn_cast<FunctionDecl>(D)) {
+      if (fe_.is_rewritable_ext_func(F)) {
+        probe_visitor2_.TraverseDecl(D);
+      }
+    }
+
+    // The B-to-C rewriter visits every top-level decl, not only functions.
+    btype_visitor_.TraverseDecl(D);
+  }
+}
+
+// Captures references to the caller-owned output stream, table storage and
+// source buffers, and allocates the Rewriter shared by all visitors. The
+// body is empty: no work happens until clang calls CreateASTConsumer().
+BFrontendAction::BFrontendAction(llvm::raw_ostream &os, unsigned flags,
+                                 TableStorage &ts, const std::string &id,
+                                 const std::string &main_path,
+                                 FuncSource &func_src, std::string &mod_src,
+                                 const std::string &maps_ns)
+    : os_(os),
+      flags_(flags),
+      ts_(ts),
+      id_(id),
+      maps_ns_(maps_ns),
+      rewriter_(new Rewriter),
+      main_path_(main_path),
+      func_src_(func_src),
+      mod_src_(mod_src) {}
+
+// A function is eligible for rewriting when it is an externally visible
+// definition that lives in the main input file (or carries no file name).
+bool BFrontendAction::is_rewritable_ext_func(FunctionDecl *D) {
+  if (!D->isExternallyVisible() || !D->hasBody())
+    return false;
+  StringRef file_name = rewriter_->getSourceMgr().getFilename(GET_BEGINLOC(D));
+  return file_name.empty() || file_name == main_path_;
+}
+
+void BFrontendAction::DoMiscWorkAround() {
+  // In 4.16 and later, CONFIG_CC_STACKPROTECTOR is moved out of Kconfig and into
+  // Makefile. It will be set depending on CONFIG_CC_STACKPROTECTOR_{AUTO|REGULAR|STRONG}.
+  // CONFIG_CC_STACKPROTECTOR is still used in various places, e.g., struct task_struct,
+  // to guard certain fields. The workaround here intends to define
+  // CONFIG_CC_STACKPROTECTOR properly based on other configs, so it relieved any bpf
+  // program (using task_struct, etc.) of patching the below code.
+  //
+  // The same prologue also rejects a user-supplied BPF_LICENSE macro: the
+  // license must not be injected through cflags.
+  rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()).InsertText(0,
+    "#if defined(BPF_LICENSE)\n"
+    "#error BPF_LICENSE cannot be specified through cflags\n"
+    "#endif\n"
+    "#if !defined(CONFIG_CC_STACKPROTECTOR)\n"
+    "#if defined(CONFIG_CC_STACKPROTECTOR_AUTO) \\\n"
+    "    || defined(CONFIG_CC_STACKPROTECTOR_REGULAR) \\\n"
+    "    || defined(CONFIG_CC_STACKPROTECTOR_STRONG)\n"
+    "#define CONFIG_CC_STACKPROTECTOR\n"
+    "#endif\n"
+    "#endif\n",
+    false);
+
+  // Append the bcc footer include at the very end of the main file buffer.
+  rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()).InsertTextAfter(
+    rewriter_->getSourceMgr().getBuffer(rewriter_->getSourceMgr().getMainFileID())->getBufferSize(),
+    "\n#include <bcc/footer.h>\n");
+}
+
+// Called by clang when the AST has been fully processed: applies the final
+// misc rewrites, optionally dumps the rewritten buffer for debugging,
+// records each traced function's rewritten body, then writes the rewritten
+// main buffer to the output stream and flushes it.
+void BFrontendAction::EndSourceFileAction() {
+  // Additional misc rewrites
+  DoMiscWorkAround();
+
+  if (flags_ & DEBUG_PREPROCESSOR)
+    rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()).write(llvm::errs());
+  if (flags_ & DEBUG_SOURCE) {
+    llvm::raw_string_ostream tmp_os(mod_src_);
+    rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID())
+        .write(tmp_os);
+  }
+
+  // Save the rewritten body of every traced function. Iterate the map's
+  // key/value pairs directly: the original loop copied each pair by value
+  // and then re-looked the key up with func_range_[f] — a redundant second
+  // map lookup per entry.
+  for (const auto &func : func_range_) {
+    func_src_.set_src_rewritten(func.first,
+                                rewriter_->getRewrittenText(func.second));
+  }
+  rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()).write(os_);
+  os_.flush();
+}
+
+// Install the rewriter on the compiler's source manager and hand clang a
+// MultiplexConsumer wrapping our single BTypeConsumer.
+unique_ptr<ASTConsumer> BFrontendAction::CreateASTConsumer(CompilerInstance &Compiler, llvm::StringRef InFile) {
+  rewriter_->setSourceMgr(Compiler.getSourceManager(), Compiler.getLangOpts());
+  vector<unique_ptr<ASTConsumer>> consumers;
+  consumers.emplace_back(new BTypeConsumer(Compiler.getASTContext(), *this, *rewriter_, m_));
+  return unique_ptr<ASTConsumer>(new MultiplexConsumer(std::move(consumers)));
+}
+
+}
diff --git a/src/cc/frontends/clang/b_frontend_action.h b/src/cc/frontends/clang/b_frontend_action.h
new file mode 100644
index 0000000..4559d11
--- /dev/null
+++ b/src/cc/frontends/clang/b_frontend_action.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <clang/AST/RecursiveASTVisitor.h>
+#include <clang/Frontend/FrontendAction.h>
+#include <clang/Rewrite/Core/Rewriter.h>
+
+#include "table_storage.h"
+
+namespace clang {
+class ASTConsumer;
+class ASTContext;
+class CompilerInstance;
+}
+
+namespace llvm {
+class raw_ostream;
+class StringRef;
+}
+
+namespace ebpf {
+
+class BFrontendAction;
+class FuncSource;
+
+// Traces maps with external pointers as values.
+class MapVisitor : public clang::RecursiveASTVisitor<MapVisitor> {
+ public:
+  explicit MapVisitor(std::set<clang::Decl *> &m);
+  bool VisitCallExpr(clang::CallExpr *Call);
+  // Register a (decl, int) pair discovered by a ProbeVisitor pass; the int
+  // appears to count levels of indirection — confirm against ProbeVisitor.
+  void set_ptreg(std::tuple<clang::Decl *, int> &pt) { ptregs_.insert(pt); }
+ private:
+  // Caller-owned set of decls, shared with the ProbeVisitor instances.
+  std::set<clang::Decl *> &m_;
+  // External pointers fed in through set_ptreg().
+  std::set<std::tuple<clang::Decl *, int>> ptregs_;
+};
+
+// Type visitor and rewriter for B programs.
+// It will look for B-specific features and rewrite them into a valid
+// C program. As part of the processing, open the necessary BPF tables
+// and store the open handles in a map of table-to-fd's.
+class BTypeVisitor : public clang::RecursiveASTVisitor<BTypeVisitor> {
+ public:
+  explicit BTypeVisitor(clang::ASTContext &C, BFrontendAction &fe);
+  bool TraverseCallExpr(clang::CallExpr *Call);
+  bool VisitFunctionDecl(clang::FunctionDecl *D);
+  bool VisitCallExpr(clang::CallExpr *Call);
+  bool VisitVarDecl(clang::VarDecl *Decl);
+  bool VisitBinaryOperator(clang::BinaryOperator *E);
+  bool VisitImplicitCastExpr(clang::ImplicitCastExpr *E);
+
+ private:
+  clang::SourceRange expansionRange(clang::SourceRange range);
+  bool checkFormatSpecifiers(const std::string& fmt, clang::SourceLocation loc);
+  void genParamDirectAssign(clang::FunctionDecl *D, std::string& preamble,
+                            const char **calling_conv_regs);
+  void genParamIndirectAssign(clang::FunctionDecl *D, std::string& preamble,
+                              const char **calling_conv_regs);
+  void rewriteFuncParam(clang::FunctionDecl *D);
+  // Emit a clang diagnostic at `loc`; `fmt` uses clang's %0/%1 placeholders
+  // filled via operator<< on the returned DiagnosticBuilder.
+  template <unsigned N>
+  clang::DiagnosticBuilder error(clang::SourceLocation loc, const char (&fmt)[N]);
+  template <unsigned N>
+  clang::DiagnosticBuilder warning(clang::SourceLocation loc, const char (&fmt)[N]);
+
+  clang::ASTContext &C;
+  clang::DiagnosticsEngine &diag_;
+  BFrontendAction &fe_;
+  clang::Rewriter &rewriter_;  /// modifications to the source go into this class
+  llvm::raw_ostream &out_;  /// for debugging
+  // NOTE(review): presumably the parameters of the function currently being
+  // rewritten — confirm in VisitFunctionDecl.
+  std::vector<clang::ParmVarDecl *> fn_args_;
+  std::set<clang::Expr *> visited_;
+  std::string current_fn_;
+};
+
+// Do a depth-first search to rewrite all pointers that need to be probed
+class ProbeVisitor : public clang::RecursiveASTVisitor<ProbeVisitor> {
+ public:
+  explicit ProbeVisitor(clang::ASTContext &C, clang::Rewriter &rewriter,
+                        std::set<clang::Decl *> &m, bool track_helpers);
+  bool VisitVarDecl(clang::VarDecl *Decl);
+  bool TraverseStmt(clang::Stmt *S);
+  bool VisitCallExpr(clang::CallExpr *Call);
+  bool VisitReturnStmt(clang::ReturnStmt *R);
+  bool VisitBinaryOperator(clang::BinaryOperator *E);
+  bool VisitUnaryOperator(clang::UnaryOperator *E);
+  bool VisitMemberExpr(clang::MemberExpr *E);
+  bool VisitArraySubscriptExpr(clang::ArraySubscriptExpr *E);
+  // Seed the visitor with an externally-derived pointer decl.
+  void set_ptreg(std::tuple<clang::Decl *, int> &pt) { ptregs_.insert(pt); }
+  // Record the context-argument decl (the function's first parameter, set by
+  // BTypeConsumer for whitelisted tracing context types).
+  void set_ctx(clang::Decl *D) { ctx_ = D; }
+  // Expose the accumulated external-pointer set. Fix: the original returned
+  // the set by value, copying it on every call, although the only caller
+  // (BTypeConsumer::HandleTranslationUnit) merely iterates it; return it by
+  // const reference instead.
+  const std::set<std::tuple<clang::Decl *, int>> &get_ptregs() const {
+    return ptregs_;
+  }
+ private:
+  bool assignsExtPtr(clang::Expr *E, int *nbAddrOf);
+  bool isMemberDereference(clang::Expr *E);
+  bool IsContextMemberExpr(clang::Expr *E);
+  clang::SourceRange expansionRange(clang::SourceRange range);
+  clang::SourceLocation expansionLoc(clang::SourceLocation loc);
+  template <unsigned N>
+  clang::DiagnosticBuilder error(clang::SourceLocation loc, const char (&fmt)[N]);
+
+  clang::ASTContext &C;
+  clang::Rewriter &rewriter_;
+  std::set<clang::Decl *> fn_visited_;
+  std::set<clang::Expr *> memb_visited_;
+  std::set<const clang::Stmt *> whitelist_;
+  // (decl, indirection count) pairs for pointers to external addresses.
+  std::set<std::tuple<clang::Decl *, int>> ptregs_;
+  std::set<clang::Decl *> &m_;
+  clang::Decl *ctx_;
+  bool track_helpers_;
+  std::list<int> ptregs_returned_;
+  const clang::Stmt *addrof_stmt_;
+  bool is_addrof_;
+};
+
+// A helper class to the frontend action, walks the decls
+class BTypeConsumer : public clang::ASTConsumer {
+ public:
+  explicit BTypeConsumer(clang::ASTContext &C, BFrontendAction &fe,
+                         clang::Rewriter &rewriter, std::set<clang::Decl *> &m);
+  // Runs the phased traversal: ProbeVisitor over function arguments, then
+  // MapVisitor over maps holding external pointers, then ProbeVisitor again
+  // for pointers read from maps, plus BTypeVisitor over every decl.
+  void HandleTranslationUnit(clang::ASTContext &Context) override;
+ private:
+  BFrontendAction &fe_;
+  MapVisitor map_visitor_;
+  BTypeVisitor btype_visitor_;
+  ProbeVisitor probe_visitor1_;  // pass 1: external pointers from function arguments
+  ProbeVisitor probe_visitor2_;  // pass 2: external pointers read from maps
+};
+
+// Create a B program in 2 phases (everything else is normal C frontend):
+// 1. Catch the map declarations and open the fd's
+// 2. Capture the IR
+class BFrontendAction : public clang::ASTFrontendAction {
+ public:
+  // Initialize with the output stream where the new source file contents
+  // should be written.
+  BFrontendAction(llvm::raw_ostream &os, unsigned flags, TableStorage &ts,
+                  const std::string &id, const std::string &main_path,
+                  FuncSource &func_src, std::string &mod_src,
+                  const std::string &maps_ns);
+
+  // Called by clang when the AST has been completed, here the output stream
+  // will be flushed.
+  void EndSourceFileAction() override;
+
+  std::unique_ptr<clang::ASTConsumer>
+      CreateASTConsumer(clang::CompilerInstance &Compiler, llvm::StringRef InFile) override;
+
+  // Accessors used by the visitors.
+  clang::Rewriter &rewriter() const { return *rewriter_; }
+  TableStorage &table_storage() const { return ts_; }
+  std::string id() const { return id_; }
+  std::string maps_ns() const { return maps_ns_; }
+  // True for externally visible function definitions located in the main
+  // input file (or carrying an empty file name).
+  bool is_rewritable_ext_func(clang::FunctionDecl *D);
+  void DoMiscWorkAround();
+
+ private:
+  llvm::raw_ostream &os_;
+  unsigned flags_;  // DEBUG_* bit flags controlling debug output
+  TableStorage &ts_;
+  std::string id_;
+  std::string maps_ns_;
+  std::unique_ptr<clang::Rewriter> rewriter_;
+  friend class BTypeVisitor;
+  // Function name -> source range of its rewritten body; replayed into
+  // func_src_ in EndSourceFileAction().
+  std::map<std::string, clang::SourceRange> func_range_;
+  const std::string &main_path_;
+  FuncSource &func_src_;
+  std::string &mod_src_;
+  // Decl set shared by the Map/Probe visitors (see BTypeConsumer).
+  std::set<clang::Decl *> m_;
+};
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/clang/frontend_action_common.h b/src/cc/frontends/clang/frontend_action_common.h
new file mode 100644
index 0000000..ec819f6
--- /dev/null
+++ b/src/cc/frontends/clang/frontend_action_common.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// clang/LLVM 8 renamed the source-location getters on AST nodes from
+// getLocStart()/getLocEnd() to getBeginLoc()/getEndLoc(); these wrappers let
+// the frontend compile against both old and new APIs.
+#if LLVM_MAJOR_VERSION >= 8
+#define GET_BEGINLOC(E)  ((E)->getBeginLoc())
+#define GET_ENDLOC(E)  ((E)->getEndLoc())
+#else
+#define GET_BEGINLOC(E)  ((E)->getLocStart())
+#define GET_ENDLOC(E)    ((E)->getLocEnd())
+#endif
diff --git a/src/cc/frontends/clang/kbuild_helper.cc b/src/cc/frontends/clang/kbuild_helper.cc
new file mode 100644
index 0000000..63bb7d2
--- /dev/null
+++ b/src/cc/frontends/clang/kbuild_helper.cc
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <fcntl.h>
+#include <stdlib.h>
+#include <iostream>
+#include "kbuild_helper.h"
+
+namespace ebpf {
+
+using std::string;
+using std::vector;
+
+// Remember the kernel directory root and whether its headers are split
+// between source/ and build/ subtrees.
+KBuildHelper::KBuildHelper(const std::string &kdir, bool has_source_dir)
+    : kdir_(kdir), has_source_dir_(has_source_dir) {}
+
+// read the flags from cache or learn
+// Derive the kernel arch name from `uname_machine` (overridable via the
+// ARCH environment variable) and fill `cflags` with the include paths and
+// defines needed to compile BPF programs against kernel headers.
+// Always returns 0.
+int KBuildHelper::get_flags(const char *uname_machine, vector<string> *cflags) {
+  // Mirrors the kernel's uname normalization:
+  //   uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ -e s/sun4u/sparc64/
+  //                  -e s/arm.*/arm/ -e s/sa110/arm/ -e s/s390x/s390/
+  //                  -e s/parisc64/parisc/ -e s/ppc.*/powerpc/
+  //                  -e s/mips.*/mips/ -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/
+  string arch = uname_machine;
+  const char *archenv;
+
+  if (!strncmp(uname_machine, "x86_64", 6)) {
+    arch = "x86";
+  } else if (uname_machine[0] == 'i' && !strncmp(&uname_machine[2], "86", 2)) {
+    arch = "x86";
+  } else if (!strncmp(uname_machine, "arm", 3)) {
+    arch = "arm";
+  } else if (!strncmp(uname_machine, "sa110", 5)) {
+    arch = "arm";
+  } else if (!strncmp(uname_machine, "s390x", 5)) {
+    arch = "s390";
+  } else if (!strncmp(uname_machine, "parisc64", 8)) {
+    arch = "parisc";
+  } else if (!strncmp(uname_machine, "ppc", 3)) {
+    arch = "powerpc";
+  } else if (!strncmp(uname_machine, "mips", 4)) {
+    arch = "mips";
+  } else if (!strncmp(uname_machine, "sh", 2)) {
+    arch = "sh";
+  } else if (!strncmp(uname_machine, "aarch64", 7)) {
+    arch = "arm64";
+  }
+
+  // If ARCH env is defined, use it over uname
+  archenv = getenv("ARCH");
+  if (archenv)
+    arch = string(archenv);
+
+  cflags->push_back("-nostdinc");
+  cflags->push_back("-isystem");
+  cflags->push_back("/virtual/lib/clang/include");
+
+  // some module build directories split headers between source/ and build/
+  // (exact-duplicate -I entries present in the original lists were dropped;
+  // repeated identical -I paths add nothing to the include search order)
+  if (has_source_dir_) {
+    cflags->push_back("-I" + kdir_ + "/build/arch/"+arch+"/include");
+    cflags->push_back("-I" + kdir_ + "/build/arch/"+arch+"/include/generated/uapi");
+    cflags->push_back("-I" + kdir_ + "/build/arch/"+arch+"/include/generated");
+    cflags->push_back("-I" + kdir_ + "/build/include");
+    cflags->push_back("-I" + kdir_ + "/build/./arch/"+arch+"/include/uapi");
+    cflags->push_back("-I" + kdir_ + "/build/include/uapi");
+    cflags->push_back("-I" + kdir_ + "/build/include/generated");
+    cflags->push_back("-I" + kdir_ + "/build/include/generated/uapi");
+  }
+
+  cflags->push_back("-I./arch/"+arch+"/include");
+  cflags->push_back("-Iarch/"+arch+"/include/generated/uapi");
+  cflags->push_back("-Iarch/"+arch+"/include/generated");
+  cflags->push_back("-Iinclude");
+  cflags->push_back("-I./arch/"+arch+"/include/uapi");
+  cflags->push_back("-I./include/uapi");
+  cflags->push_back("-Iinclude/generated/uapi");
+  cflags->push_back("-include");
+  cflags->push_back("./include/linux/kconfig.h");
+  cflags->push_back("-D__KERNEL__");
+  cflags->push_back("-D__HAVE_BUILTIN_BSWAP16__");
+  cflags->push_back("-D__HAVE_BUILTIN_BSWAP32__");
+  cflags->push_back("-D__HAVE_BUILTIN_BSWAP64__");
+
+  // If ARCH env variable is set, pass this along.
+  if (archenv)
+    cflags->push_back("-D__TARGET_ARCH_" + arch);  // tab indent fixed
+
+  cflags->push_back("-Wno-unused-value");
+  cflags->push_back("-Wno-pointer-sign");
+  cflags->push_back("-fno-stack-protector");
+
+  return 0;
+}
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/clang/kbuild_helper.h b/src/cc/frontends/clang/kbuild_helper.h
new file mode 100644
index 0000000..5a271ff
--- /dev/null
+++ b/src/cc/frontends/clang/kbuild_helper.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+#include <unistd.h>
+#include <errno.h>
+#include <ftw.h>
+
+namespace ebpf {
+
+// Custom deleter so FILE* handles can be owned by a unique_ptr (FILEPtr
+// below) and closed automatically on scope exit; unique_ptr only invokes
+// the deleter for non-null pointers, so fclose never sees NULL.
+struct FileDeleter {
+  void operator() (FILE *fp) {
+    fclose(fp);
+  }
+};
+typedef std::unique_ptr<FILE, FileDeleter> FILEPtr;
+
+// Helper with pushd/popd semantics
+class DirStack {
+ public:
+  // Save the current working directory, then chdir into `dst`. ok() reports
+  // whether both steps succeeded; failures are reported on stderr and the
+  // destructor then restores nothing.
+  explicit DirStack(const std::string &dst) : ok_(false) {
+    if (getcwd(cwd_, sizeof(cwd_)) == NULL) {
+      ::perror("getcwd");
+      return;
+    }
+    if (::chdir(dst.c_str())) {
+      fprintf(stderr, "chdir(%s): %s\n", dst.c_str(), strerror(errno));
+      return;
+    }
+    ok_ = true;
+  }
+  // Restore the saved working directory (only if construction succeeded).
+  ~DirStack() {
+    if (!ok_) return;
+    if (::chdir(cwd_)) {
+      fprintf(stderr, "chdir(%s): %s\n", cwd_, strerror(errno));
+    }
+  }
+  bool ok() const { return ok_; }
+  const char * cwd() const { return cwd_; }
+ private:
+  bool ok_;
+  // NOTE(review): fixed 256-byte buffer — getcwd() fails (and ok_ stays
+  // false) for working directories deeper than this; confirm the limit is
+  // acceptable or switch to PATH_MAX.
+  char cwd_[256];
+};
+
+// nftw() callback used by TmpDir's destructor: delete each visited entry
+// (children are visited before their directories thanks to FTW_DEPTH at the
+// call site).
+static int ftw_cb(const char *path, const struct stat *, int, struct FTW *) {
+  return ::remove(path);
+}
+
+// Scoped class to manage the creation/deletion of tmpdirs
+class TmpDir {
+ public:
+  // Create a unique temporary directory named <prefix>XXXXXX; ok() reports
+  // whether mkdtemp succeeded.
+  explicit TmpDir(const std::string &prefix = "/tmp/bcc-")
+      : ok_(false), prefix_(prefix) {
+    prefix_ += "XXXXXX";
+    if (::mkdtemp((char *)prefix_.data()) == NULL)
+      ::perror("mkdtemp");
+    else
+      ok_ = true;
+  }
+  // Recursively delete the tree depth-first, then the directory itself.
+  // Bug fix: the original ran nftw() unconditionally, so a failed mkdtemp()
+  // produced a spurious perror("ftw") for a path that was never created.
+  ~TmpDir() {
+    if (!ok_)
+      return;
+    if (::nftw(prefix_.c_str(), ftw_cb, 20, FTW_DEPTH) < 0)
+      ::perror("ftw");
+    else
+      ::remove(prefix_.c_str());
+  }
+  bool ok() const { return ok_; }
+  const std::string & str() const { return prefix_; }
+ private:
+  bool ok_;             // true iff the directory was actually created
+  std::string prefix_;  // full path of the (attempted) temp directory
+};
+
+// Compute the kbuild flags for the currently running kernel
+// Do this by:
+//   1. Create temp Makefile with stub dummy.c
+//   2. Run module build on that makefile, saving the computed flags to a file
+//   3. Cache the file for fast flag lookup in subsequent runs
+//  Note: Depending on environment, different cache locations may be desired. In
+//  case we eventually support non-root user programs, cache in $HOME.
+class KBuildHelper {
+ public:
+  explicit KBuildHelper(const std::string &kdir, bool has_source_dir);
+  // Fill `cflags` with kernel-header include paths and defines for the
+  // architecture named by `uname_machine` (overridable via the ARCH env
+  // variable). Returns 0.
+  int get_flags(const char *uname_machine, std::vector<std::string> *cflags);
+ private:
+  std::string kdir_;     // kernel build/source directory root
+  bool has_source_dir_;  // headers split between source/ and build/ subtrees
+};
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/clang/loader.cc b/src/cc/frontends/clang/loader.cc
new file mode 100755
index 0000000..f461ded
--- /dev/null
+++ b/src/cc/frontends/clang/loader.cc
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <map>
+#include <string>
+#include <algorithm>
+#include <fcntl.h>
+#include <ftw.h>
+#include <map>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <utility>
+#include <vector>
+#include <iostream>
+#include <linux/bpf.h>
+
+#include <clang/Basic/FileManager.h>
+#include <clang/Basic/TargetInfo.h>
+#include <clang/CodeGen/BackendUtil.h>
+#include <clang/CodeGen/CodeGenAction.h>
+#include <clang/Driver/Compilation.h>
+#include <clang/Driver/Driver.h>
+#include <clang/Driver/Job.h>
+#include <clang/Driver/Tool.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/CompilerInvocation.h>
+#include <clang/Frontend/FrontendActions.h>
+#include <clang/Frontend/FrontendDiagnostic.h>
+#include <clang/Frontend/TextDiagnosticPrinter.h>
+#include <clang/FrontendTool/Utils.h>
+#include <clang/Lex/PreprocessorOptions.h>
+
+#include <llvm/IR/Module.h>
+
+#include "bcc_exception.h"
+#include "bpf_module.h"
+#include "exported_files.h"
+#include "kbuild_helper.h"
+#include "b_frontend_action.h"
+#include "tp_frontend_action.h"
+#include "loader.h"
+#include "arch_helper.h"
+
+using std::map;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+
+namespace ebpf {
+
+// Cache the LLVM context and flags, and pre-build MemoryBuffers for every
+// exported virtual header/footer so compiles can remap them without touching
+// the filesystem. Fix: iterate by const reference — the original's by-value
+// range loops copied every (name, contents) pair on each iteration.
+ClangLoader::ClangLoader(llvm::LLVMContext *ctx, unsigned flags)
+    : ctx_(ctx), flags_(flags)
+{
+  for (const auto &f : ExportedFiles::headers())
+    remapped_headers_[f.first] = llvm::MemoryBuffer::getMemBuffer(f.second);
+  for (const auto &f : ExportedFiles::footers())
+    remapped_footers_[f.first] = llvm::MemoryBuffer::getMemBuffer(f.second);
+}
+
+// Out-of-line destructor kept in this TU; defaulted rather than empty-bodied.
+ClangLoader::~ClangLoader() = default;
+
+namespace
+{
+
+// True when `path` names an existing directory (stat follows symlinks).
+bool is_dir(const string& path)
+{
+  struct stat buf;
+  return ::stat(path.c_str(), &buf) == 0 && S_ISDIR(buf.st_mode);
+}
+
+// Decide which subdirectory of `kdir` holds the kernel headers.
+// Returns {headers_are_split, subdir}: when both build/ and source/ exist
+// the result is {true, "source"}; otherwise the suffix comes from the
+// BCC_KERNEL_MODULES_SUFFIX environment variable when set, defaulting to
+// "build".
+std::pair<bool, string> get_kernel_path_info(const string kdir)
+{
+  if (is_dir(kdir + "/build") && is_dir(kdir + "/source"))
+    return {true, "source"};
+
+  const char *suffix_from_env = ::getenv("BCC_KERNEL_MODULES_SUFFIX");
+  return {false, suffix_from_env ? string(suffix_from_env) : string("build")};
+}
+
+}
+
+// Compile `file` (a path, or in-memory program text when `in_memory`) into an
+// LLVM module via do_compile(). Returns 0 on success, -1 on failure; when
+// BCC_BACKUP_COMPILE is enabled, a failed compile is retried once against the
+// system bpf.h.
+int ClangLoader::parse(unique_ptr<llvm::Module> *mod, TableStorage &ts,
+                       const string &file, bool in_memory, const char *cflags[],
+                       int ncflags, const std::string &id, FuncSource &func_src,
+                       std::string &mod_src,
+                       const std::string &maps_ns) {
+  string main_path = "/virtual/main.c";
+  unique_ptr<llvm::MemoryBuffer> main_buf;
+  struct utsname un;
+  uname(&un);
+  string kdir, kpath;
+  // BCC_KERNEL_SOURCE overrides the default kernel-modules directory lookup;
+  // BCC_LINUX_VERSION_CODE overrides the kernel version macro (see below).
+  const char *kpath_env = ::getenv("BCC_KERNEL_SOURCE");
+  const char *version_override = ::getenv("BCC_LINUX_VERSION_CODE");
+  bool has_kpath_source = false;
+  string vmacro;
+
+  if (kpath_env) {
+    kpath = string(kpath_env);
+  } else {
+    kdir = string(KERNEL_MODULES_DIR) + "/" + un.release;
+    auto kernel_path_info = get_kernel_path_info(kdir);
+    has_kpath_source = kernel_path_info.first;
+    kpath = kdir + "/" + kernel_path_info.second;
+  }
+
+  if (flags_ & DEBUG_PREPROCESSOR)
+    std::cout << "Running from kernel directory at: " << kpath.c_str() << "\n";
+
+  // clang needs to run inside the kernel dir
+  DirStack dstack(kpath);
+  if (!dstack.ok())
+    return -1;
+
+  string abs_file;
+  if (in_memory) {
+    abs_file = main_path;
+    main_buf = llvm::MemoryBuffer::getMemBuffer(file);
+  } else {
+    if (file.substr(0, 1) == "/")
+      abs_file = file;
+    else
+      abs_file = string(dstack.cwd()) + "/" + file;
+  }
+
+  // -fno-color-diagnostics: this is a workaround for a bug in llvm terminalHasColors() as of
+  // 22 Jul 2016. Also see bcc #615.
+  // Enable -O2 for clang. In clang 5.0, -O0 may result in function marking as
+  // noinline and optnone (if not always inlining).
+  // Note that first argument is ignored in clang compilation invocation.
+  // "-D __BPF_TRACING__" below is added to suppress a warning in 4.17+.
+  // It can be removed once clang supports asm-goto or the kernel removes
+  // the warning.
+  vector<const char *> flags_cstr({"-O0", "-O2", "-emit-llvm", "-I", dstack.cwd(),
+                                   "-D", "__BPF_TRACING__",
+                                   "-Wno-deprecated-declarations",
+                                   "-Wno-gnu-variable-sized-type-not-at-end",
+                                   "-Wno-pragma-once-outside-header",
+                                   "-Wno-address-of-packed-member",
+                                   "-Wno-unknown-warning-option",
+                                   "-fno-color-diagnostics",
+                                   "-fno-unwind-tables",
+                                   "-fno-asynchronous-unwind-tables",
+                                   "-x", "c", "-c", abs_file.c_str()});
+
+  KBuildHelper kbuild_helper(kpath_env ? kpath : kdir, has_kpath_source);
+
+  // Append the kernel-derived include paths and defines after the base flags.
+  vector<string> kflags;
+  if (kbuild_helper.get_flags(un.machine, &kflags))
+    return -1;
+  if (flags_ & DEBUG_SOURCE)
+    flags_cstr.push_back("-g");
+  for (auto it = kflags.begin(); it != kflags.end(); ++it)
+    flags_cstr.push_back(it->c_str());
+
+  // Flags appended after the (optional) internal bpf.h include — see
+  // do_compile().
+  vector<const char *> flags_cstr_rem;
+
+  if (version_override) {
+    vmacro = "-DLINUX_VERSION_CODE_OVERRIDE=" + string(version_override);
+
+    std::cout << "WARNING: Linux version for eBPF program is being overridden with: " << version_override << "\n";
+    std::cout << "WARNING: Due to this, the results of the program may be unpredictable\n";
+    flags_cstr_rem.push_back(vmacro.c_str());
+  }
+
+  flags_cstr_rem.push_back("-include");
+  flags_cstr_rem.push_back("/virtual/include/bcc/helpers.h");
+  flags_cstr_rem.push_back("-isystem");
+  flags_cstr_rem.push_back("/virtual/include");
+  if (cflags) {
+    for (auto i = 0; i < ncflags; ++i)
+      flags_cstr_rem.push_back(cflags[i]);
+  }
+#ifdef CUR_CPU_IDENTIFIER
+  string cur_cpu_flag = string("-DCUR_CPU_IDENTIFIER=") + CUR_CPU_IDENTIFIER;
+  flags_cstr_rem.push_back(cur_cpu_flag.c_str());
+#endif
+
+  if (do_compile(mod, ts, in_memory, flags_cstr, flags_cstr_rem, main_path,
+                 main_buf, id, func_src, mod_src, true, maps_ns)) {
+#if BCC_BACKUP_COMPILE != 1
+    return -1;
+#else
+    // try one more time to compile with system bpf.h
+    llvm::errs() << "WARNING: compilation failure, trying with system bpf.h\n";
+
+    // Discard everything the failed attempt recorded before retrying.
+    ts.DeletePrefix(Path({id}));
+    func_src.clear();
+    mod_src.clear();
+    if (do_compile(mod, ts, in_memory, flags_cstr, flags_cstr_rem, main_path,
+                   main_buf, id, func_src, mod_src, false, maps_ns))
+      return -1;
+#endif
+  }
+
+  return 0;
+}
+
+// run_arch_callback() hook: map a bcc_arch_t to the clang target triple for
+// that architecture. The result points to a string literal, returned as
+// void* to satisfy the callback signature.
+void *get_clang_target_cb(bcc_arch_t arch)
+{
+  switch (arch) {
+    case BCC_ARCH_PPC_LE:
+      return (void *)"powerpc64le-unknown-linux-gnu";
+    case BCC_ARCH_PPC:
+      return (void *)"powerpc64-unknown-linux-gnu";
+    case BCC_ARCH_S390X:
+      return (void *)"s390x-ibm-linux-gnu";
+    case BCC_ARCH_ARM64:
+      return (void *)"aarch64-unknown-linux-gnu";
+    default:
+      return (void *)"x86_64-unknown-linux-gnu";
+  }
+}
+
+// Return the clang target triple for the architecture bcc is running on.
+string get_clang_target(void) {
+  return string((const char *)run_arch_callback(get_clang_target_cb));
+}
+
+int ClangLoader::do_compile(unique_ptr<llvm::Module> *mod, TableStorage &ts,
+                            bool in_memory,
+                            const vector<const char *> &flags_cstr_in,
+                            const vector<const char *> &flags_cstr_rem,
+                            const std::string &main_path,
+                            const unique_ptr<llvm::MemoryBuffer> &main_buf,
+                            const std::string &id, FuncSource &func_src,
+                            std::string &mod_src, bool use_internal_bpfh,
+                            const std::string &maps_ns) {
+  using namespace clang;
+
+  vector<const char *> flags_cstr = flags_cstr_in;
+  if (use_internal_bpfh) {
+    flags_cstr.push_back("-include");
+    flags_cstr.push_back("/virtual/include/bcc/bpf.h");
+  }
+  flags_cstr.insert(flags_cstr.end(), flags_cstr_rem.begin(),
+                    flags_cstr_rem.end());
+
+  // set up the error reporting class
+  IntrusiveRefCntPtr<DiagnosticOptions> diag_opts(new DiagnosticOptions());
+  auto diag_client = new TextDiagnosticPrinter(llvm::errs(), &*diag_opts);
+
+  IntrusiveRefCntPtr<DiagnosticIDs> DiagID(new DiagnosticIDs());
+  DiagnosticsEngine diags(DiagID, &*diag_opts, diag_client);
+
+  // set up the command line argument wrapper
+
+  string target_triple = get_clang_target();
+  driver::Driver drv("", target_triple, diags);
+
+  drv.setTitle("bcc-clang-driver");
+  drv.setCheckInputsExist(false);
+
+  unique_ptr<driver::Compilation> compilation(drv.BuildCompilation(flags_cstr));
+  if (!compilation)
+    return -1;
+
+  // expect exactly 1 job, otherwise error
+  const driver::JobList &jobs = compilation->getJobs();
+  if (jobs.size() != 1 || !isa<driver::Command>(*jobs.begin())) {
+    SmallString<256> msg;
+    llvm::raw_svector_ostream os(msg);
+    jobs.Print(os, "; ", true);
+    diags.Report(diag::err_fe_expected_compiler_job) << os.str();
+    return -1;
+  }
+
+  const driver::Command &cmd = cast<driver::Command>(*jobs.begin());
+  if (llvm::StringRef(cmd.getCreator().getName()) != "clang") {
+    diags.Report(diag::err_fe_expected_clang_command);
+    return -1;
+  }
+
+  // Initialize a compiler invocation object from the clang (-cc1) arguments.
+  const llvm::opt::ArgStringList &ccargs = cmd.getArguments();
+
+  if (flags_ & DEBUG_PREPROCESSOR) {
+    llvm::errs() << "clang";
+    for (auto arg : ccargs)
+      llvm::errs() << " " << arg;
+    llvm::errs() << "\n";
+  }
+
+  // pre-compilation pass for generating tracepoint structures
+  CompilerInstance compiler0;
+  CompilerInvocation &invocation0 = compiler0.getInvocation();
+  if (!CompilerInvocation::CreateFromArgs(
+          invocation0, const_cast<const char **>(ccargs.data()),
+          const_cast<const char **>(ccargs.data()) + ccargs.size(), diags))
+    return -1;
+
+  invocation0.getPreprocessorOpts().RetainRemappedFileBuffers = true;
+  for (const auto &f : remapped_headers_)
+    invocation0.getPreprocessorOpts().addRemappedFile(f.first, &*f.second);
+  for (const auto &f : remapped_footers_)
+    invocation0.getPreprocessorOpts().addRemappedFile(f.first, &*f.second);
+
+  if (in_memory) {
+    invocation0.getPreprocessorOpts().addRemappedFile(main_path, &*main_buf);
+    invocation0.getFrontendOpts().Inputs.clear();
+    invocation0.getFrontendOpts().Inputs.push_back(FrontendInputFile(
+        main_path, FrontendOptions::getInputKindForExtension("c")));
+  }
+  invocation0.getFrontendOpts().DisableFree = false;
+
+  compiler0.createDiagnostics(new IgnoringDiagConsumer());
+
+  // capture the rewritten c file
+  string out_str;
+  llvm::raw_string_ostream os(out_str);
+  TracepointFrontendAction tpact(os);
+  compiler0.ExecuteAction(tpact); // ignore errors, they will be reported later
+  unique_ptr<llvm::MemoryBuffer> out_buf = llvm::MemoryBuffer::getMemBuffer(out_str);
+
+  // first pass
+  CompilerInstance compiler1;
+  CompilerInvocation &invocation1 = compiler1.getInvocation();
+  if (!CompilerInvocation::CreateFromArgs(
+          invocation1, const_cast<const char **>(ccargs.data()),
+          const_cast<const char **>(ccargs.data()) + ccargs.size(), diags))
+    return -1;
+
+  // This option instructs clang whether or not to free the file buffers that we
+  // give to it. Since the embedded header files should be copied fewer times
+  // and reused if possible, set this flag to true.
+  invocation1.getPreprocessorOpts().RetainRemappedFileBuffers = true;
+  for (const auto &f : remapped_headers_)
+    invocation1.getPreprocessorOpts().addRemappedFile(f.first, &*f.second);
+  for (const auto &f : remapped_footers_)
+    invocation1.getPreprocessorOpts().addRemappedFile(f.first, &*f.second);
+  invocation1.getPreprocessorOpts().addRemappedFile(main_path, &*out_buf);
+  invocation1.getFrontendOpts().Inputs.clear();
+  invocation1.getFrontendOpts().Inputs.push_back(FrontendInputFile(
+      main_path, FrontendOptions::getInputKindForExtension("c")));
+  invocation1.getFrontendOpts().DisableFree = false;
+
+  compiler1.createDiagnostics();
+
+  // capture the rewritten c file
+  string out_str1;
+  llvm::raw_string_ostream os1(out_str1);
+  BFrontendAction bact(os1, flags_, ts, id, main_path, func_src, mod_src, maps_ns);
+  if (!compiler1.ExecuteAction(bact))
+    return -1;
+  unique_ptr<llvm::MemoryBuffer> out_buf1 = llvm::MemoryBuffer::getMemBuffer(out_str1);
+
+  // second pass, clear input and take rewrite buffer
+  CompilerInstance compiler2;
+  CompilerInvocation &invocation2 = compiler2.getInvocation();
+  if (!CompilerInvocation::CreateFromArgs(
+          invocation2, const_cast<const char **>(ccargs.data()),
+          const_cast<const char **>(ccargs.data()) + ccargs.size(), diags))
+    return -1;
+  invocation2.getPreprocessorOpts().RetainRemappedFileBuffers = true;
+  for (const auto &f : remapped_headers_)
+    invocation2.getPreprocessorOpts().addRemappedFile(f.first, &*f.second);
+  for (const auto &f : remapped_footers_)
+    invocation2.getPreprocessorOpts().addRemappedFile(f.first, &*f.second);
+  invocation2.getPreprocessorOpts().addRemappedFile(main_path, &*out_buf1);
+  invocation2.getFrontendOpts().Inputs.clear();
+  invocation2.getFrontendOpts().Inputs.push_back(FrontendInputFile(
+      main_path, FrontendOptions::getInputKindForExtension("c")));
+  invocation2.getFrontendOpts().DisableFree = false;
+  invocation2.getCodeGenOpts().DisableFree = false;
+  // Resort to normal inlining. In -O0 the default is OnlyAlwaysInlining and
+  // clang might add noinline attribute even for functions with inline hint.
+  invocation2.getCodeGenOpts().setInlining(CodeGenOptions::NormalInlining);
+  // suppress warnings in the 2nd pass, but bail out on errors (our fault)
+  invocation2.getDiagnosticOpts().IgnoreWarnings = true;
+  compiler2.createDiagnostics();
+
+  EmitLLVMOnlyAction ir_act(&*ctx_);
+  if (!compiler2.ExecuteAction(ir_act))
+    return -1;
+  *mod = ir_act.takeModule();
+
+  return 0;
+}
+
+const char * FuncSource::src(const std::string& name) {
+  auto src = funcs_.find(name);
+  if (src == funcs_.end())
+    return "";
+  return src->second.src_.data();
+}
+
+const char * FuncSource::src_rewritten(const std::string& name) {
+  auto src = funcs_.find(name);
+  if (src == funcs_.end())
+    return "";
+  return src->second.src_rewritten_.data();
+}
+
+void FuncSource::set_src(const std::string& name, const std::string& src) {
+  funcs_[name].src_ = src;
+}
+
+void FuncSource::set_src_rewritten(const std::string& name, const std::string& src) {
+  funcs_[name].src_rewritten_ = src;
+}
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/clang/loader.h b/src/cc/frontends/clang/loader.h
new file mode 100644
index 0000000..1aeb652
--- /dev/null
+++ b/src/cc/frontends/clang/loader.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "table_storage.h"
+
+namespace llvm {
+class Module;
+class LLVMContext;
+class MemoryBuffer;
+}
+
+namespace ebpf {
+
+class FuncSource {
+  class SourceCode {
+   public:
+    SourceCode(const std::string& s1 = "", const std::string& s2 = ""): src_(s1), src_rewritten_(s2) {}
+    std::string src_;
+    std::string src_rewritten_;
+  };
+  std::map<std::string, SourceCode> funcs_;
+ public:
+  FuncSource() {}
+  void clear() { funcs_.clear(); }
+  const char * src(const std::string& name);
+  const char * src_rewritten(const std::string& name);
+  void set_src(const std::string& name, const std::string& src);
+  void set_src_rewritten(const std::string& name, const std::string& src);
+};
+
+class ClangLoader {
+ public:
+  explicit ClangLoader(llvm::LLVMContext *ctx, unsigned flags);
+  ~ClangLoader();
+  int parse(std::unique_ptr<llvm::Module> *mod, TableStorage &ts,
+            const std::string &file, bool in_memory, const char *cflags[],
+            int ncflags, const std::string &id, FuncSource &func_src,
+            std::string &mod_src, const std::string &maps_ns);
+
+ private:
+  int do_compile(std::unique_ptr<llvm::Module> *mod, TableStorage &ts,
+                 bool in_memory, const std::vector<const char *> &flags_cstr_in,
+                 const std::vector<const char *> &flags_cstr_rem,
+                 const std::string &main_path,
+                 const std::unique_ptr<llvm::MemoryBuffer> &main_buf,
+                 const std::string &id, FuncSource &func_src,
+                 std::string &mod_src, bool use_internal_bpfh,
+                 const std::string &maps_ns);
+
+ private:
+  std::map<std::string, std::unique_ptr<llvm::MemoryBuffer>> remapped_headers_;
+  std::map<std::string, std::unique_ptr<llvm::MemoryBuffer>> remapped_footers_;
+  llvm::LLVMContext *ctx_;
+  unsigned flags_;
+};
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/clang/tp_frontend_action.cc b/src/cc/frontends/clang/tp_frontend_action.cc
new file mode 100644
index 0000000..d6faf01
--- /dev/null
+++ b/src/cc/frontends/clang/tp_frontend_action.cc
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2016 Sasha Goldshtein
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <linux/bpf.h>
+#include <linux/version.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+
+#include <fstream>
+#include <regex>
+
+#include <clang/AST/ASTConsumer.h>
+#include <clang/AST/ASTContext.h>
+#include <clang/AST/RecordLayout.h>
+#include <clang/Frontend/CompilerInstance.h>
+#include <clang/Frontend/MultiplexConsumer.h>
+#include <clang/Rewrite/Core/Rewriter.h>
+
+#include "frontend_action_common.h"
+#include "tp_frontend_action.h"
+
+namespace ebpf {
+
+using std::map;
+using std::set;
+using std::string;
+using std::to_string;
+using std::unique_ptr;
+using std::vector;
+using std::regex;
+using std::smatch;
+using std::regex_search;
+using std::ifstream;
+using namespace clang;
+
+TracepointTypeVisitor::TracepointTypeVisitor(ASTContext &C, Rewriter &rewriter)
+    : C(C), diag_(C.getDiagnostics()), rewriter_(rewriter), out_(llvm::errs()) {
+}
+
+enum class field_kind_t {
+    common,
+    data_loc,
+    regular,
+    invalid
+};
+
+static inline field_kind_t _get_field_kind(string const& line,
+                                           string& field_type,
+                                           string& field_name) {
+  auto field_pos = line.find("field:");
+  if (field_pos == string::npos)
+    return field_kind_t::invalid;
+
+  auto field_semi_pos = line.find(';', field_pos);
+  if (field_semi_pos == string::npos)
+    return field_kind_t::invalid;
+
+  auto offset_pos = line.find("offset:", field_semi_pos);
+  if (offset_pos == string::npos)
+    return field_kind_t::invalid;
+
+  auto semi_pos = line.find(';', offset_pos);
+  if (semi_pos == string::npos)
+    return field_kind_t::invalid;
+
+  auto size_pos = line.find("size:", semi_pos);
+  if (size_pos == string::npos)
+    return field_kind_t::invalid;
+
+  semi_pos = line.find(';', size_pos);
+  if (semi_pos == string::npos)
+    return field_kind_t::invalid;
+
+  auto size_str = line.substr(size_pos + 5,
+                              semi_pos - size_pos - 5);
+  int size = std::stoi(size_str, nullptr);
+
+  auto field = line.substr(field_pos + 6/*"field:"*/,
+                           field_semi_pos - field_pos - 6);
+  auto pos = field.find_last_of("\t ");
+  if (pos == string::npos)
+    return field_kind_t::invalid;
+
+  field_type = field.substr(0, pos);
+  field_name = field.substr(pos + 1);
+  if (field_type.find("__data_loc") != string::npos)
+    return field_kind_t::data_loc;
+  if (field_name.find("common_") == 0)
+    return field_kind_t::common;
+  // do not change type definition for array
+  if (field_name.find("[") != string::npos)
+    return field_kind_t::regular;
+
+  // adjust the field_type based on the size of field
+  // otherwise, incorrect value may be retrieved for big endian
+  // and the field may have incorrect structure offset.
+  if (size == 2) {
+    if (field_type == "char" || field_type == "int8_t")
+      field_type = "s16";
+    if (field_type == "unsigned char" || field_type == "uint8_t")
+      field_type = "u16";
+  } else if (size == 4) {
+    if (field_type == "char" || field_type == "short" ||
+        field_type == "int8_t" || field_type == "int16_t")
+      field_type = "s32";
+    if (field_type == "unsigned char" || field_type == "unsigned short" ||
+        field_type == "uint8_t" || field_type == "uint16_t")
+      field_type = "u32";
+  } else if (size == 8) {
+    if (field_type == "char" || field_type == "short" || field_type == "int" ||
+        field_type == "int8_t" || field_type == "int16_t" ||
+        field_type == "int32_t" || field_type == "pid_t")
+      field_type = "s64";
+    if (field_type == "unsigned char" || field_type == "unsigned short" ||
+        field_type == "unsigned int" || field_type == "uint8_t" ||
+        field_type == "uint16_t" || field_type == "uint32_t" ||
+        field_type == "unsigned" || field_type == "u32" ||
+        field_type == "uid_t" || field_type == "gid_t")
+      field_type = "u64";
+  }
+
+  return field_kind_t::regular;
+}
+
+string TracepointTypeVisitor::GenerateTracepointStruct(
+    SourceLocation loc, string const& category, string const& event) {
+  string format_file = "/sys/kernel/debug/tracing/events/" +
+    category + "/" + event + "/format";
+  ifstream input(format_file.c_str());
+  if (!input)
+    return "";
+
+  string tp_struct = "struct tracepoint__" + category + "__" + event + " {\n";
+  tp_struct += "\tu64 __do_not_use__;\n";
+  for (string line; getline(input, line); ) {
+    string field_type, field_name;
+    switch (_get_field_kind(line, field_type, field_name)) {
+    case field_kind_t::invalid:
+    case field_kind_t::common:
+        continue;
+    case field_kind_t::data_loc:
+        tp_struct += "\tint data_loc_" + field_name + ";\n";
+        break;
+    case field_kind_t::regular:
+        tp_struct += "\t" + field_type + " " + field_name + ";\n";
+        break;
+    }
+  }
+
+  tp_struct += "};\n";
+  return tp_struct;
+}
+
+static inline bool _is_tracepoint_struct_type(string const& type_name,
+                                              string& tp_category,
+                                              string& tp_event) {
+  // We are looking to roughly match the regex:
+  //    (?:struct|class)\s+tracepoint__(\S+)__(\S+)
+  // Not using std::regex because older versions of GCC don't support it yet.
+  // E.g., the libstdc++ that ships with Ubuntu 14.04.
+
+  auto first_space_pos = type_name.find_first_of("\t ");
+  if (first_space_pos == string::npos)
+    return false;
+  auto first_tok = type_name.substr(0, first_space_pos);
+  if (first_tok != "struct" && first_tok != "class")
+    return false;
+
+  auto non_space_pos = type_name.find_first_not_of("\t ", first_space_pos);
+  auto second_space_pos = type_name.find_first_of("\t ", non_space_pos);
+  auto second_tok = type_name.substr(non_space_pos,
+                                     second_space_pos - non_space_pos);
+  if (second_tok.find("tracepoint__") != 0)
+    return false;
+
+  auto tp_event_pos = second_tok.rfind("__");
+  if (tp_event_pos == string::npos)
+    return false;
+  tp_event = second_tok.substr(tp_event_pos + 2);
+
+  auto tp_category_pos = second_tok.find("__");
+  if (tp_category_pos == tp_event_pos)
+    return false;
+  tp_category = second_tok.substr(tp_category_pos + 2,
+                                  tp_event_pos - tp_category_pos - 2);
+  return true;
+}
+
+
+bool TracepointTypeVisitor::VisitFunctionDecl(FunctionDecl *D) {
+  if (D->isExternallyVisible() && D->hasBody()) {
+    // If this function has a tracepoint structure as an argument,
+    // add that structure declaration based on the structure name.
+    for (auto it = D->param_begin(); it != D->param_end(); ++it) {
+      auto arg = *it;
+      auto type = arg->getType();
+      if (type->isPointerType() &&
+          type->getPointeeType()->isStructureOrClassType()) {
+        auto type_name = type->getPointeeType().getAsString();
+        string tp_cat, tp_evt;
+        if (_is_tracepoint_struct_type(type_name, tp_cat, tp_evt)) {
+          string tp_struct = GenerateTracepointStruct(
+              GET_BEGINLOC(D), tp_cat, tp_evt);
+          // Get the actual function declaration point (the macro instantiation
+          // point if using the TRACEPOINT_PROBE macro instead of the macro
+          // declaration point in bpf_helpers.h).
+          auto insert_loc = GET_BEGINLOC(D);
+          insert_loc = rewriter_.getSourceMgr().getFileLoc(insert_loc);
+          rewriter_.InsertText(insert_loc, tp_struct);
+        }
+      }
+    }
+  }
+  return true;
+}
+
+TracepointTypeConsumer::TracepointTypeConsumer(ASTContext &C, Rewriter &rewriter)
+    : visitor_(C, rewriter) {
+}
+
+bool TracepointTypeConsumer::HandleTopLevelDecl(DeclGroupRef Group) {
+  for (auto D : Group)
+    visitor_.TraverseDecl(D);
+  return true;
+}
+
+TracepointFrontendAction::TracepointFrontendAction(llvm::raw_ostream &os)
+    : os_(os), rewriter_(new Rewriter) {
+}
+
+void TracepointFrontendAction::EndSourceFileAction() {
+  rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()).write(os_);
+  os_.flush();
+}
+
+unique_ptr<ASTConsumer> TracepointFrontendAction::CreateASTConsumer(
+        CompilerInstance &Compiler, llvm::StringRef InFile) {
+  rewriter_->setSourceMgr(Compiler.getSourceManager(), Compiler.getLangOpts());
+  return unique_ptr<ASTConsumer>(new TracepointTypeConsumer(
+              Compiler.getASTContext(), *rewriter_));
+}
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/clang/tp_frontend_action.h b/src/cc/frontends/clang/tp_frontend_action.h
new file mode 100644
index 0000000..fb18c67
--- /dev/null
+++ b/src/cc/frontends/clang/tp_frontend_action.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016 Sasha Goldshtein
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <clang/AST/RecursiveASTVisitor.h>
+#include <clang/Frontend/FrontendAction.h>
+#include <clang/Rewrite/Core/Rewriter.h>
+
+namespace clang {
+class ASTConsumer;
+class ASTContext;
+class CompilerInstance;
+}
+
+namespace llvm {
+class raw_ostream;
+class StringRef;
+}
+
+namespace ebpf {
+
+// Visit functions that have a tracepoint argument structure in their signature
+// and automatically generate the structure on-the-fly.
+class TracepointTypeVisitor :
+  public clang::RecursiveASTVisitor<TracepointTypeVisitor> {
+ public:
+  explicit TracepointTypeVisitor(clang::ASTContext &C,
+                                 clang::Rewriter &rewriter);
+  bool VisitFunctionDecl(clang::FunctionDecl *D);
+
+ private:
+  std::string GenerateTracepointStruct(clang::SourceLocation loc,
+          std::string const& category, std::string const& event);
+
+  clang::ASTContext &C;
+  clang::DiagnosticsEngine &diag_;
+  clang::Rewriter &rewriter_;
+  llvm::raw_ostream &out_;
+};
+
+class TracepointTypeConsumer : public clang::ASTConsumer {
+ public:
+  explicit TracepointTypeConsumer(clang::ASTContext &C,
+                                  clang::Rewriter &rewriter);
+  bool HandleTopLevelDecl(clang::DeclGroupRef Group) override;
+ private:
+  TracepointTypeVisitor visitor_;
+};
+
+class TracepointFrontendAction : public clang::ASTFrontendAction {
+ public:
+  TracepointFrontendAction(llvm::raw_ostream &os);
+
+  void EndSourceFileAction() override;
+
+  std::unique_ptr<clang::ASTConsumer>
+      CreateASTConsumer(clang::CompilerInstance &Compiler, llvm::StringRef InFile) override;
+
+ private:
+  llvm::raw_ostream &os_;
+  std::unique_ptr<clang::Rewriter> rewriter_;
+};
+
+}  // namespace ebpf
diff --git a/src/cc/frontends/p4/README.md b/src/cc/frontends/p4/README.md
new file mode 100644
index 0000000..4c7b50e
--- /dev/null
+++ b/src/cc/frontends/p4/README.md
@@ -0,0 +1,374 @@
+# Compiling P4 to EBPF
+
+Mihai Budiu - mbudiu@barefootnetworks.com
+
+September 22, 2015
+
+## Abstract
+
+This document describes a prototype compiler that translates programs
+written in the P4 programming languages to eBPF programs.  The
+translation is performed by generating programs written in a subset of
+the C programming language, that are converted to EBPF using the BPF
+Compiler Collection tools.
+
+The compiler code is licensed under an [Apache v2.0 license]
+(http://www.apache.org/licenses/LICENSE-2.0.html).
+
+## Preliminaries
+
+In this section we give a brief overview of P4 and EBPF.  A detailed
+treatment of these topics is outside the scope of this text.
+
+### P4
+
+P4 (http://p4.org) is a domain-specific programming language for
+specifying the behavior of the dataplanes of network-forwarding
+elements.  The name of the programming language comes from the title
+of a paper published in the proceedings of SIGCOMM Computer
+Communications Review in 2014:
+http://www.sigcomm.org/ccr/papers/2014/July/0000000.0000004:
+"Programming Protocol-Independent Packet Processors".
+
+P4 itself is protocol-independent but allows programmers to express a
+rich set of data plane behaviors and protocols. The core P4
+abstractions are:
+
+* Header definitions describe the format (the set of fields and their
+  sizes) of each header within a packet.
+
+* Parse graphs (finite-state machines) describe the permitted header
+  sequences within received packets.
+
+* Tables associate keys to actions. P4 tables generalize traditional
+  forwarding tables; they can be used to implement routing tables,
+  flow lookup tables, access-control lists, etc.
+
+* Actions describe how packet header fields and metadata are manipulated.
+
+* Match-action units stitch together tables and actions, and perform
+  the following sequence of operations:
+
+  * Construct lookup keys from packet fields or computed metadata,
+
+  * Use the constructed lookup key to index into tables, choosing an
+  action to execute,
+  
+  * Finally, execute the selected action.
+
+* Control flow is expressed as an imperative program describing the
+  data-dependent packet processing within a pipeline, including the
+  data-dependent sequence of match-action unit invocations.
+
+P4 programs describe the behavior of network-processing dataplanes.  A
+P4 program is designed to operate in concert with a separate *control
+plane* program.  The control plane is responsible for managing at
+runtime the contents of the P4 tables.  P4 cannot be used to specify
+control-planes; however, a P4 program implicitly specifies the
+interface between the data-plane and the control-plane.
+
+The P4 language is under active development; the current stable
+version is 1.0.2 (see http://p4.org/spec); a reference implementation
+of a compiler and associated tools is freely available using an Apache
+2 open-source license (see http://p4.org/code).
+
+### EBPF
+
+#### Safe code
+
+EBPF is an acronym that stands for Extended Berkeley Packet Filters.
+In essence EBPF is a low-level programming language (similar to
+machine code); EBPF programs are traditionally executed by a virtual
+machine that resides in the Linux kernel.  EBPF programs can be
+inserted and removed from a live kernel using dynamic code
+instrumentation.  The main feature of EBPF programs is their *static
+safety*: prior to execution all EBPF programs have to be validated as
+being safe, and unsafe programs cannot be executed.  A safe program
+provably cannot compromise the machine it is running on:
+
+* it can only access a restricted memory region (on the local stack)
+
+* it can run only for a limited amount of time; during execution it
+  cannot block, sleep or take any locks
+
+* it cannot use any kernel resources with the exception of a limited
+  set of kernel services which have been specifically whitelisted,
+  including operations to manipulate tables (described below)
+
+#### Kernel hooks
+
+EBPF programs are inserted into the kernel using *hooks*.  There are
+several types of hooks available:
+
+* any function entry point in the kernel can act as a hook; attaching
+  an EBPF program to a function `foo()` will cause the EBPF program to
+  execute every time some kernel thread executes `foo()`.
+
+* EBPF programs can also be attached using the Linux Traffic Control
+  (TC) subsystem, in the network packet processing datapath.  Such
+  programs can be used as TC classifiers and actions.
+
+* EBPF programs can also be attached to sockets or network interfaces.
+  In this case they can be used for processing packets that flow
+  through the socket/interface.
+
+EBPF programs can be used for many purposes; the main use cases are
+dynamic tracing and monitoring, and packet processing.  We are mostly
+interested in the latter use case in this document.
+
+#### EBPF Tables
+
+The EBPF runtime exposes a bi-directional kernel-userspace data
+communication channel, called *tables* (also called maps in some EBPF
+documents and code samples).  EBPF tables are essentially key-value
+stores, where keys and values are arbitrary fixed-size bitstrings.
+The key width, value width and table size (maximum number of entries
+that can be stored) are declared statically, at table creation time.
+
+In user-space tables handles are exposed as file descriptors.  Both
+user- and kernel-space programs can manipulate tables, by inserting,
+deleting, looking up, modifying, and enumerating entries in a table.
+
+In kernel space the keys and values are exposed as pointers to the raw
+underlying data stored in the table, whereas in user-space the
+pointers point to copies of the data.
+
+#### Concurrency
+
+An important aspect to understand related to EBPF is the execution
+model.  An EBPF program is triggered by a kernel hook; multiple
+instances of the same kernel hook can be running simultaneously on
+different cores.
+
+Each table however has a single instance across all the cores.  A
+single table may be accessed simultaneously by multiple instances of
+the same EBPF program running as separate kernel threads on different
+cores.  EBPF tables are native kernel objects, and access to the table
+contents is protected using the kernel RCU mechanism.  This makes
+access to table entries safe under concurrent execution; for example,
+the memory associated to a value cannot be accidentally freed while an
+EBPF program holds a pointer to the respective value.  However,
+accessing tables is prone to data races; since EBPF programs cannot
+use locks, some of these races often cannot be avoided.
+
+EBPF and the associated tools are also under active development, and
+new capabilities are added frequently.  The P4 compiler generates code
+that can be compiled using the BPF Compiler Collection (BCC)
+(https://github.com/iovisor/bcc)
+
+## Compiling P4 to EBPF
+
+From the above description it is apparent that the P4 and EBPF
+programming languages have different expressive powers.  However,
+there is a significant overlap in their capabilities, in particular,
+in the domain of network packet processing.  The following image
+illustrates the situation:
+
+![P4 and EBPF overlap in capabilities](scope.png)
+
+We expect that the overlapping region will grow in size as both P4 and
+EBPF continue to mature.
+
+The current version of the P4 to EBPF compiler translates programs
+written in the version 1.1 of the P4 programming language to programs
+written in a restricted subset of C.  The subset of C is chosen such
+that it should be compilable to EBPF using BCC.
+
+```
+         --------------              -------
+P4 --->  | P4-to-EBPF | ---> C ----> | BCC | --> EBPF
+         --------------              -------
+```
+
+The P4 program only describes the packet processing *data plane*, that
+runs in the Linux kernel.  The *control plane* must be separately
+implemented by the user.  The BCC tools simplify this task
+considerably, by generating C and/or Python APIs that expose the
+dataplane/control-plane APIs.
+
+### Dependencies
+
+EBPF programs require a Linux kernel with version 4.2 or newer.
+
+In order to use the P4 to EBPF compiler the following software must be installed:
+
+* The compiler itself is written in the Python (v2.x) programming
+  language.
+
+* the P4 compiler front-end: (https://github.com/p4lang/p4-hlir).
+  This is required for parsing the P4 programs.  
+
+* the BCC compiler collection tools: (https://github.com/iovisor/bcc).
+  This is required for compiling the generated code.  Also, BCC comes
+  with a set of Python utilities which can be used to implement
+  control-plane programs that operate in concert with the kernel EBPF
+  datapath.
+
+The P4 to EBPF compiler generates code that is designed for being used
+as a classifier using the Linux TC subsystem.
+
+Furthermore, the test code provided is written using the Python (v3.x)
+programming language and requires several Python packages to be
+installed.
+
+### Supported capabilities
+
+The current version of the P4 to EBPF compiler supports a relatively
+narrow subset of the P4 language, but still powerful enough to write
+very complex packet filters and simple packet forwarding engines.  In
+the spirit of open-source "release early, release often", we expect
+that the compiler's capabilities will improve gradually.
+
+* Packet filtering is performed using the `drop()` action.  Packets
+  that are not dropped will be forwarded.
+
+* Packet forwarding is performed by setting the
+  `standard_metadata.egress_port` to the index of the destination
+  network interface
+
+Here are some limitations imposed on the P4 programs:
+
+* Currently both the ingress and the egress P4 pipelines are executed
+  at the same hook (wherever the user chooses to insert the generated
+  EBPF program).  In the future the compiler should probably generate
+  two separate EBPF programs.
+
+* arbitrary parsers can be compiled, but the BCC compiler will reject
+  parsers that contain cycles
+
+* arithmetic on data wider than 32 bits is not supported
+
+* checksum computations are not implemented.  In consequence, programs
+  that modify IP/TCP/UDP headers will produce incorrect packet headers.
+
+* EBPF does not offer support for ternary or LPM tables
+
+* P4 cloning and recirculation are not supported
+
+* meters and registers are not supported; only direct counters are
+  currently supported.  EBPF can potentially support registers and
+  arbitrary counters, so these may appear in the future.
+
+* learning (i.e. `generate_digest`) is not implemented
+
+### Translating P4 to C
+
+To simplify the translation, the P4 programmer should refrain using
+identifiers whose name starts with `ebpf_`.
+
+The following table provides a brief summary of how each P4 construct
+is mapped to a corresponding C construct:
+
+#### Translating parsers
+
+P4 Construct | C Translation
+----------|------------
+`header_type` | `struct` type
+`header`      | `struct` instance with an additional `valid` bit
+`metadata`    | `struct` instance
+parser state  | code block
+state transition | `goto` statement
+`extract` | load/shift/mask data from packet buffer
+
+#### Translating match-action pipelines
+
+P4 Construct | C Translation
+----------|------------
+table  | 2 EBPF tables: second one used just for the default action
+table key | `struct` type
+table `actions` block | tagged `union` with all possible actions
+`action` arguments | `struct`
+table `reads` | EBPF table access
+`action` body | code block
+table `apply` | `switch` statement
+counters | additional EBPF table
+
+### Code organization
+
+The compiler code is organized in two folders:
+
+* `compiler`: the complete compiler source code, in Python v2.x.
+  The compiler entry point is `p4toEbpf.py`.
+
+* `test`: testing code and data.  There are two testing programs:
+  * `testP4toEbpf.py`: which compiles all P4 files in the testprograms folder
+
+  * `endToEndTest.py`: which compiles and executes the simple.p4
+  program, and includes a simple control plane
+
+Currently the compiler contains no installation capabilities.
+
+### Invoking the compiler
+
+Invoking the compiler is just a matter of invoking the python program
+with a suitable input P4 file:
+
+```
+p4toEbpf.py file.p4 -o file.c
+```
+
+#### Compiler options
+
+The P4 compiler first runs the C preprocessor on the input P4 file.
+Some of the command-line options are passed directly to the
+preprocessor.
+
+The following compiler options are available:
+
+Option | Meaning
+-------|--------
+`-D macro` | Option passed to C preprocessor
+`-I path`  | Option passed to C preprocessor
+`-U macro` | Option passed to C preprocessor
+`-g [router|filter]` | Controls whether the generated code behaves like a router or a filter.
+`-o outputFile` | writes the generated C code to the specified output file.
+
+The `-g` option controls the nature of the generated code:
+
+* `-g filter` generates a filter; the only P4 action that has an
+  effect is the `drop()` action.  Setting metadata in P4 (e.g.,
+  `egress_port`) has no effect.
+
+* `-g router` generates a simple router; both `drop()` and
+  `egress_port` impact packet processing.
+
+#### Using the generated code
+
+The resulting file contains the complete data structures, tables, and
+a C function named `ebpf_filter` that implements the P4-specified
+data-plane.  This C file can be manipulated using the BCC tools;
+please refer to the BCC project documentation and sample test files of
+the P4 to EBPF source code for an in-depth understanding.  A minimal
+Python program that compiles and loads into the kernel the generated
+file into EBPF is:
+
+```
+#!/usr/bin/env python3
+from bcc import BPF
+
+b = BPF(src_file="file.c", debug=0)
+fn = b.load_func("ebpf_filter", BPF.SCHED_CLS)
+```
+
+##### Connecting the generated program with the TC
+
+The EBPF code that is generated is intended to be used as a classifier
+attached to the ingress packet path using the Linux TC subsystem.  The
+same EBPF code should be attached to all interfaces.  Note however
+that all EBPF code instances share a single set of tables, which are
+used to control the program behavior.
+
+The following code fragment illustrates how the EBPF code can be
+hooked up to the `eth0` interface using a Python program.  (The `fn`
+variable is the one produced by the previous code fragment).
+
+```
+from pyroute2 import IPRoute
+
+ipr = IPRoute()
+interface_name="eth0"
+if_index = ipr.link_lookup(ifname=interface_name)[0]
+ipr.tc("add", "ingress", if_index, "ffff:")
+ipr.tc("add-filter", "bpf", if_index, ":1", fd=fn.fd,
+       name=fn.name, parent="ffff:", action="ok", classid=1)
+```
diff --git a/src/cc/frontends/p4/compiler/README.txt b/src/cc/frontends/p4/compiler/README.txt
new file mode 100644
index 0000000..c610240
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/README.txt
@@ -0,0 +1,4 @@
+This folder contains an implementation of a simple compiler that
+translates programs written in a subset of P4 into C that can in
+turn be compiled into EBPF using the IOVisor bcc compiler.
+
diff --git a/src/cc/frontends/p4/compiler/compilationException.py b/src/cc/frontends/p4/compiler/compilationException.py
new file mode 100644
index 0000000..cc0e5ba
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/compilationException.py
@@ -0,0 +1,34 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+class CompilationException(Exception):
+    """Signals an error during compilation.
+
+    format is a str.format-style template string; message holds the
+    positional arguments substituted into it by show().
+    """
+    def __init__(self, isBug, format, *message):
+        # isBug: indicates that this is a compiler bug
+        super(CompilationException, self).__init__()
+
+        assert isinstance(format, str)
+        assert isinstance(isBug, bool)
+        self.message = message
+        self.format = format
+        self.isBug = isBug
+
+    def show(self):
+        # Render the formatted error text for display.
+        # TODO: format this message nicely
+        return self.format.format(*self.message)
+
+
+class NotSupportedException(Exception):
+    """Signals use of a P4 feature that the EBPF backend cannot support."""
+    # Suffix appended to every rendered message.
+    archError = " not supported by EBPF"
+
+    def __init__(self, format, *message):
+        # format: str.format-style template; message: its arguments.
+        super(NotSupportedException, self).__init__()
+
+        assert isinstance(format, str)
+        self.message = message
+        self.format = format
+
+    def show(self):
+        # Render the message with the "not supported by EBPF" suffix.
+        # TODO: format this message nicely
+        return (self.format + NotSupportedException.archError).format(
+            *self.message)
diff --git a/src/cc/frontends/p4/compiler/ebpfAction.py b/src/cc/frontends/p4/compiler/ebpfAction.py
new file mode 100644
index 0000000..99bf145
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfAction.py
@@ -0,0 +1,382 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import p4_action, p4_field
+from p4_hlir.hlir import p4_signature_ref, p4_header_instance
+import ebpfProgram
+from programSerializer import ProgramSerializer
+from compilationException import *
+import ebpfScalarType
+import ebpfCounter
+import ebpfType
+import ebpfInstance
+
+
+class EbpfActionData(object):
+    """Pairs an action parameter name with its EBPF scalar type."""
+    def __init__(self, name, argtype):
+        # name: parameter name from the P4 action signature
+        self.name = name
+        # argtype: EbpfScalarType describing the parameter's width
+        self.argtype = argtype
+
+
+class EbpfActionBase(object):
+    """Base class for compiled P4 actions.
+
+    The default serialize* methods emit only placeholder C comments;
+    subclasses override them to generate real code.
+    """
+    def __init__(self, p4action):
+        self.name = p4action.name
+        self.hliraction = p4action   # the original HLIR action node
+        self.builtin = False         # True for primitive actions (see BuiltinAction)
+        self.arguments = []          # list of EbpfActionData
+
+    def serializeArgumentsAsStruct(self, serializer):
+        # Emit the C declaration of this action's argument struct
+        # (no-op placeholder in the base class).
+        serializer.emitIndent()
+        serializer.appendFormat("/* no arguments for {0} */", self.name)
+        serializer.newline()
+
+    def serializeBody(self, serializer, valueName, program):
+        # Emit the C statements implementing the action body
+        # (no-op placeholder in the base class).
+        serializer.emitIndent()
+        serializer.appendFormat("/* no body for {0} */", self.name)
+        serializer.newline()
+
+    def __str__(self):
+        return "EbpfAction({0})".format(self.name)
+
+
+class EbpfAction(EbpfActionBase):
+    """Compiles a user-defined (compound) P4 action into C.
+
+    Generates a struct holding the action's arguments and a code block
+    translating each primitive action in its flat call sequence.
+    """
+    # Primitive actions the backend rejects with NotSupportedException.
+    unsupported = [
+        # The following cannot be done in EBPF
+        "add_header", "remove_header", "execute_meter",
+        "clone_ingress_pkt_to_egress",
+        "clone_egress_pkt_to_egress", "generate_digest", "resubmit",
+        "modify_field_with_hash_based_offset", "truncate", "push", "pop",
+        # The following could be done, but are not yet implemented
+        # The situation with copy_header is complicated,
+        # because we don't do checksums
+        "copy_header", "count",
+        "register_read", "register_write"]
+
+    # noinspection PyUnresolvedReferences
+    def __init__(self, p4action, program):
+        # p4action: the HLIR p4_action to compile
+        # program: the enclosing EbpfProgram (supplies config/instances)
+        super(EbpfAction, self).__init__(p4action)
+        assert isinstance(p4action, p4_action)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        self.builtin = False
+        self.invalid = False  # a leaf action which is never
+                              # called from a table can be invalid.
+
+        # Wrap each signature parameter in an EbpfActionData carrying a
+        # scalar type of the HLIR-inferred width.
+        for i in range(0, len(p4action.signature)):
+            param = p4action.signature[i]
+            width = p4action.signature_widths[i]
+            if width is None:
+                # Width could not be inferred; mark the whole action
+                # invalid instead of failing immediately.
+                self.invalid = True
+                return
+            argtype = ebpfScalarType.EbpfScalarType(p4action, width,
+                                                    False, program.config)
+            actionData = EbpfActionData(param, argtype)
+            self.arguments.append(actionData)
+
+    def serializeArgumentsAsStruct(self, serializer):
+        # Emit "struct { <arg decls> } <actionName>;" for use inside the
+        # tagged union of a table's actions.
+        if self.invalid:
+            raise CompilationException(True,
+                "{0} Attempting to generate code for an invalid action",
+                                       self.hliraction)
+
+        # Build a struct containing all action arguments.
+        serializer.emitIndent()
+        serializer.append("struct ")
+        serializer.blockStart()
+        assert isinstance(serializer, ProgramSerializer)
+        for arg in self.arguments:
+            assert isinstance(arg, EbpfActionData)
+            serializer.emitIndent()
+            argtype = arg.argtype
+            assert isinstance(argtype, ebpfType.EbpfType)
+            argtype.declare(serializer, arg.name, False)
+            serializer.endOfStatement(True)
+        serializer.blockEnd(False)
+        serializer.space()
+        serializer.append(self.name)
+        serializer.endOfStatement(True)
+
+    def serializeBody(self, serializer, dataContainer, program):
+        # Emit C statements for every primitive call in this action's
+        # flattened call sequence.
+        if self.invalid:
+            raise CompilationException(True,
+                "{0} Attempting to generate code for an invalid action",
+                                       self.hliraction)
+
+        # TODO: generate PARALLEL implementation
+        # dataContainer is a string containing the variable name
+        # containing the action data
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        assert isinstance(dataContainer, str)
+        callee_list = self.hliraction.flat_call_sequence
+        for e in callee_list:
+            # Each entry is (primitive action, argument list, ...).
+            action = e[0]
+            assert isinstance(action, p4_action)
+            arguments = e[1]
+            assert isinstance(arguments, list)
+            self.serializeCallee(self, action, arguments, serializer,
+                                 dataContainer, program)
+
+    def checkSize(self, call, args, program):
+        # Return the common bit-width of args (ignoring None entries),
+        # warning if the widths disagree; None if no width is known.
+        size = None
+        for a in args:
+            if a is None:
+                continue
+            if size is None:
+                size = a
+            elif a != size:
+                program.emitWarning(
+                    "{0}: Arguments do not have the same size {1} and {2}",
+                    call, size, a)
+        return size
+
+    @staticmethod
+    def translateActionToOperator(actionName):
+        # Map a P4 arithmetic/bitwise primitive name to its C operator.
+        if actionName == "add" or actionName == "add_to_field":
+            return "+"
+        elif actionName == "bit_and":
+            return "&"
+        elif actionName == "bit_or":
+            return "|"
+        elif actionName == "bit_xor":
+            return "^"
+        elif actionName == "subtract" or actionName == "subtract_from_field":
+            return "-"
+        else:
+            raise CompilationException(True,
+                                       "Unexpected primitive action {0}",
+                                       actionName)
+
+    def serializeCount(self, caller, arguments, serializer,
+                       dataContainer, program):
+        # Emit code updating an indexed counter; currently unused (the
+        # call site in serializeCallee is commented out).
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        assert isinstance(arguments, list)
+        assert len(arguments) == 2
+
+        counter = arguments[0]
+        index = ArgInfo(arguments[1], caller, dataContainer, program)
+        ctr = program.getCounter(counter.name)
+        assert isinstance(ctr, ebpfCounter.EbpfCounter)
+        serializer.emitIndent()
+        serializer.blockStart()
+
+        # This is actually incorrect, since the key is not always an u32.
+        # This code is currently disabled
+        key = program.reservedPrefix + "index"
+        serializer.emitIndent()
+        serializer.appendFormat("u32 {0} = {1};", key, index.asString)
+        serializer.newline()
+
+        ctr.serializeCode(key, serializer, program)
+
+        serializer.blockEnd(True)
+
+    def serializeCallee(self, caller, callee, arguments,
+                        serializer, dataContainer, program):
+        # Translate one primitive action invocation (callee + arguments)
+        # into a C statement.
+        if self.invalid:
+            raise CompilationException(
+                True,
+                "{0} Attempting to generate code for an invalid action",
+                self.hliraction)
+
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        assert isinstance(callee, p4_action)
+        assert isinstance(arguments, list)
+
+        if callee.name in EbpfAction.unsupported:
+            raise NotSupportedException("{0}", callee)
+
+        # This is not yet ready
+        #if callee.name == "count":
+        #    self.serializeCount(caller, arguments,
+        #                        serializer, dataContainer, program)
+        #    return
+
+        serializer.emitIndent()
+        args = self.transformArguments(arguments, caller,
+                                       dataContainer, program)
+        if callee.name == "modify_field":
+            # Assignment: plain "=" up to 32 bits, memcpy beyond that.
+            dst = args[0]
+            src = args[1]
+
+            size = self.checkSize(callee,
+                                  [a.widthInBits() for a in args],
+                                  program)
+            if size is None:
+                raise CompilationException(
+                    True, "Cannot infer width for arguments {0}",
+                    callee)
+            elif size <= 32:
+                serializer.appendFormat("{0} = {1};",
+                                        dst.asString,
+                                        src.asString)
+            else:
+                # memcpy needs addressable operands, so both sides
+                # must be lvalues.
+                if not dst.isLvalue:
+                    raise NotSupportedException(
+                        "Constants wider than 32-bit: {0}({1})",
+                        dst.caller, dst.asString)
+                if not src.isLvalue:
+                    raise NotSupportedException(
+                        "Constants wider than 32-bit: {0}({1})",
+                        src.caller, src.asString)
+                # NOTE(review): "size / 8" relies on Python 2 integer
+                # division (the compiler is documented as Python 2.x).
+                serializer.appendFormat("memcpy(&{0}, &{1}, {2});",
+                                        dst.asString,
+                                        src.asString,
+                                        size / 8)
+        elif (callee.name == "add" or
+             callee.name == "bit_and" or
+             callee.name == "bit_or" or
+             callee.name == "bit_xor" or
+             callee.name == "subtract"):
+            # Three-operand form: dst = src1 op src2.
+            size = self.checkSize(callee,
+                                  [a.widthInBits() for a in args],
+                                  program)
+            if size is None:
+                raise CompilationException(
+                    True,
+                    "Cannot infer width for arguments {0}",
+                    callee)
+            if size > 32:
+                raise NotSupportedException("{0}: Arithmetic on {1}-bits",
+                                            callee, size)
+            op = EbpfAction.translateActionToOperator(callee.name)
+            serializer.appendFormat("{0} = {1} {2} {3};",
+                                    args[0].asString,
+                                    args[1].asString,
+                                    op,
+                                    args[2].asString)
+        elif (callee.name == "add_to_field" or
+              callee.name == "subtract_from_field"):
+            # Two-operand in-place form: dst = dst op src.
+            size = self.checkSize(callee,
+                                  [a.widthInBits() for a in args],
+                                  program)
+            if size is None:
+                raise CompilationException(
+                    True, "Cannot infer width for arguments {0}", callee)
+            if size > 32:
+                raise NotSupportedException(
+                    "{0}: Arithmetic on {1}-bits", callee, size)
+
+            op = EbpfAction.translateActionToOperator(callee.name)
+            serializer.appendFormat("{0} = {0} {1} {2};",
+                                    args[0].asString,
+                                    op,
+                                    args[1].asString)
+        elif callee.name == "no_op":
+            serializer.append("/* noop */")
+        elif callee.name == "drop":
+            # Setting the program's drop bit causes the packet to be
+            # discarded at the end of the pipeline.
+            serializer.appendFormat("{0} = 1;", program.dropBit)
+        elif callee.name == "push" or callee.name == "pop":
+            raise CompilationException(
+                True, "{0} push/pop not yet implemented", callee)
+        else:
+            raise CompilationException(
+                True, "Unexpected primitive action {0}", callee)
+        serializer.newline()
+
+    def transformArguments(self, arguments, caller, dataContainer, program):
+        # Convert raw HLIR arguments into ArgInfo objects carrying a C
+        # expression string and a bit-width.
+        result = []
+        for a in arguments:
+            t = ArgInfo(a, caller, dataContainer, program)
+            result.append(t)
+        return result
+
+
+class BuiltinAction(EbpfActionBase):
+    """A primitive (built-in) P4 action; only "drop" emits real code."""
+    def __init__(self, p4action):
+        super(BuiltinAction, self).__init__(p4action)
+        self.builtin = True
+
+    def serializeBody(self, serializer, valueName, program):
+        # This is ugly; there should be a better way
+        if self.name == "drop":
+            # Set the program's drop bit; the packet is discarded later.
+            serializer.emitIndent()
+            serializer.appendFormat("{0} = 1;", program.dropBit)
+            serializer.newline()
+        else:
+            # All other builtins currently produce only a placeholder.
+            serializer.emitIndent()
+            serializer.appendFormat("/* no body for {0} */", self.name)
+            serializer.newline()
+
+
+class ArgInfo(object):
+    # noinspection PyUnresolvedReferences
+    # Represents an argument passed to an action
+    def __init__(self, argument, caller, dataContainer, program):
+        # Computes:
+        #   self.asString - the C expression denoting the argument
+        #   self.width    - its width in bits (None if unknown)
+        #   self.isLvalue - False only for integer literals
+        self.width = None
+        self.asString = None
+        self.isLvalue = True
+        self.caller = caller
+
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        assert isinstance(caller, EbpfAction)
+
+        if isinstance(argument, int):
+            # Integer literal: rendered directly, not addressable.
+            self.asString = str(argument)
+            self.isLvalue = False
+            # size is unknown
+        elif isinstance(argument, p4_field):
+            # A header or metadata field reference.
+            if ebpfProgram.EbpfProgram.isArrayElementInstance(
+                    argument.instance):
+                # Field inside a header-stack element: needs an index.
+                if isinstance(argument.instance.index, int):
+                    index = "[" + str(argument.instance.index) + "]"
+                else:
+                    raise CompilationException(
+                        True,
+                        "Unexpected index for array {0}",
+                        argument.instance.index)
+                stackInstance = program.getStackInstance(
+                    argument.instance.base_name)
+                assert isinstance(stackInstance, ebpfInstance.EbpfHeaderStack)
+                fieldtype = stackInstance.basetype.getField(argument.name)
+                self.width = fieldtype.widthInBits()
+                self.asString = "{0}.{1}{3}.{2}".format(
+                    program.headerStructName,
+                    stackInstance.name, argument.name, index)
+            else:
+                # Plain field: headers live in the header struct,
+                # everything else in the metadata struct.
+                instance = program.getInstance(argument.instance.base_name)
+                if isinstance(instance, ebpfInstance.EbpfHeader):
+                    parent = program.headerStructName
+                else:
+                    parent = program.metadataStructName
+                fieldtype = instance.type.getField(argument.name)
+                self.width = fieldtype.widthInBits()
+                self.asString = "{0}.{1}.{2}".format(
+                    parent, instance.name, argument.name)
+        elif isinstance(argument, p4_signature_ref):
+            # Reference to one of the caller action's own parameters;
+            # read from the action-data union in the table entry.
+            refarg = caller.arguments[argument.idx]
+            self.asString = "{0}->u.{1}.{2}".format(
+                dataContainer, caller.name, refarg.name)
+            self.width = caller.arguments[argument.idx].argtype.widthInBits()
+        elif isinstance(argument, p4_header_instance):
+            # This could be a header array element
+            # Unfortunately for push and pop, the user mean the whole array,
+            # but the representation contains just the first element here.
+            # This looks like a bug in the HLIR.
+            if ebpfProgram.EbpfProgram.isArrayElementInstance(argument):
+                if isinstance(argument.index, int):
+                    index = "[" + str(argument.index) + "]"
+                else:
+                    raise CompilationException(
+                        True,
+                        "Unexpected index for array {0}", argument.index)
+                stackInstance = program.getStackInstance(argument.base_name)
+                assert isinstance(stackInstance, ebpfInstance.EbpfHeaderStack)
+                fieldtype = stackInstance.basetype
+                self.width = fieldtype.widthInBits()
+                self.asString = "{0}.{1}{2}".format(
+                    program.headerStructName, stackInstance.name, index)
+            else:
+                # Whole (non-stack) header instance.
+                instance = program.getInstance(argument.name)
+                instancetype = instance.type
+                self.width = instancetype.widthInBits()
+                self.asString = "{0}.{1}".format(
+                    program.headerStructName, argument.name)
+        else:
+            raise CompilationException(
+                True, "Unexpected action argument {0}", argument)
+
+    def widthInBits(self):
+        # Width of the argument in bits; None for integer literals.
+        return self.width
diff --git a/src/cc/frontends/p4/compiler/ebpfConditional.py b/src/cc/frontends/p4/compiler/ebpfConditional.py
new file mode 100644
index 0000000..5c723d2
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfConditional.py
@@ -0,0 +1,118 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import p4_conditional_node, p4_expression
+from p4_hlir.hlir import p4_header_instance, p4_field
+from programSerializer import ProgramSerializer
+from compilationException import CompilationException
+import ebpfProgram
+import ebpfInstance
+
+
+class EbpfConditional(object):
+    """Compiles a P4 conditional pipeline node into a labeled C
+    if/goto construct."""
+    @staticmethod
+    def translate(op):
+        # Map P4 boolean operator names to C operators; other operators
+        # (e.g. ==, <) pass through unchanged.
+        if op == "not":
+            return "!"
+        elif op == "or":
+            return "||"
+        elif op == "and":
+            return "&&"
+        return op
+
+    def __init__(self, p4conditional, program):
+        # p4conditional: the HLIR conditional node to compile
+        # program: the enclosing EbpfProgram
+        assert isinstance(p4conditional, p4_conditional_node)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        self.hlirconditional = p4conditional
+        self.name = p4conditional.name
+
+    def emitNode(self, node, serializer, program):
+        # Emit the C expression for one operand of a condition:
+        # sub-expression, literal int, header instance, or field.
+        if isinstance(node, p4_expression):
+            self.emitExpression(node, serializer, program, False)
+        elif node is None:
+            # Missing operand (e.g. unary "not" has no left side).
+            pass
+        elif isinstance(node, int):
+            serializer.append(node)
+        elif isinstance(node, p4_header_instance):
+            header = program.getInstance(node.name)
+            assert isinstance(header, ebpfInstance.EbpfHeader)
+            # TODO: stacks?
+            serializer.appendFormat(
+                "{0}.{1}", program.headerStructName, header.name)
+        elif isinstance(node, p4_field):
+            # Headers live in the header struct; metadata in the
+            # metadata struct.
+            instance = node.instance
+            einstance = program.getInstance(instance.name)
+            if isinstance(einstance, ebpfInstance.EbpfHeader):
+                base = program.headerStructName
+            else:
+                base = program.metadataStructName
+            serializer.appendFormat(
+                "{0}.{1}.{2}", base, einstance.name, node.name)
+        else:
+            raise CompilationException(True, "{0} Unexpected expression ", node)
+
+    def emitExpression(self, expression, serializer, program, toplevel):
+        # Emit a (possibly nested) boolean expression; non-toplevel
+        # sub-expressions are parenthesized.
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        assert isinstance(expression, p4_expression)
+        assert isinstance(toplevel, bool)
+        left = expression.left
+        op = expression.op
+        right = expression.right
+
+        assert isinstance(op, str)
+
+        if op == "valid":
+            # valid(hdr) maps to the generated struct's .valid bit.
+            self.emitNode(right, serializer, program)
+            serializer.append(".valid")
+            return
+
+        if not toplevel:
+            serializer.append("(")
+        self.emitNode(left, serializer, program)
+        op = EbpfConditional.translate(op)
+        serializer.append(op)
+        self.emitNode(right, serializer, program)
+        if not toplevel:
+            serializer.append(")")
+
+    def generateCode(self, serializer, program, nextNode):
+        # Emit:
+        #   label: if (cond) goto trueLabel; else goto falseLabel;
+        # A missing branch falls through to nextNode's label.
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        serializer.emitIndent()
+        serializer.blockStart()
+
+        trueBranch = self.hlirconditional.next_[True]
+        if trueBranch is None:
+            trueBranch = nextNode
+        falseBranch = self.hlirconditional.next_[False]
+        if falseBranch is None:
+            falseBranch = nextNode
+
+        serializer.emitIndent()
+        serializer.appendFormat("{0}:", program.getLabel(self.hlirconditional))
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.append("if (")
+        self.emitExpression(
+            self.hlirconditional.condition, serializer, program, True)
+        serializer.appendLine(")")
+
+        serializer.increaseIndent()
+        label = program.getLabel(trueBranch)
+        serializer.emitIndent()
+        serializer.appendFormat("goto {0};", label)
+        serializer.newline()
+        serializer.decreaseIndent()
+
+        serializer.emitIndent()
+        serializer.appendLine("else")
+        serializer.increaseIndent()
+        label = program.getLabel(falseBranch)
+        serializer.emitIndent()
+        serializer.appendFormat("goto {0};", label)
+        serializer.newline()
+        serializer.decreaseIndent()
+
+        serializer.blockEnd(True)
diff --git a/src/cc/frontends/p4/compiler/ebpfCounter.py b/src/cc/frontends/p4/compiler/ebpfCounter.py
new file mode 100644
index 0000000..5b5b396
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfCounter.py
@@ -0,0 +1,116 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import p4_counter, P4_DIRECT, P4_COUNTER_BYTES
+from programSerializer import ProgramSerializer
+from compilationException import *
+import ebpfTable
+import ebpfProgram
+
+
+class EbpfCounter(object):
+    """Compiles a P4 counter into an EBPF map plus update code.
+
+    Only direct (table-bound) counters are supported; values are
+    always 64-bit.
+    """
+    # noinspection PyUnresolvedReferences
+    def __init__(self, hlircounter, program):
+        assert isinstance(hlircounter, p4_counter)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        self.name = hlircounter.name
+        self.hlircounter = hlircounter
+
+        width = hlircounter.min_width
+        # ebpf counters only work on 64-bits
+        if width <= 64:
+            self.valueTypeName = program.config.uprefix + "64"
+        else:
+            raise NotSupportedException(
+                "{0}: Counters with {1} bits", hlircounter, width)
+
+        # Name of the generated EBPF map holding the counter values.
+        self.dataMapName = self.name
+
+        # Only direct counters are supported; anything else is rejected.
+        if ((hlircounter.binding is None) or
+            (hlircounter.binding[0] != P4_DIRECT)):
+            raise NotSupportedException(
+                "{0}: counter which is not direct", hlircounter)
+
+        # NOTE(review): given the check above, this is always True here;
+        # kept for when static counters become supported.
+        self.autoIncrement = (hlircounter.binding != None and
+                              hlircounter.binding[0] == P4_DIRECT)
+
+        # Byte counters add the packet length; packet counters add 1.
+        if hlircounter.type is P4_COUNTER_BYTES:
+            self.increment = "{0}->len".format(program.packetName)
+        else:
+            self.increment = "1"
+
+    def getSize(self, program):
+        # Number of entries in the counter map: explicit instance_count,
+        # else the bound table's size, else a warned default of 1024.
+        if self.hlircounter.instance_count is not None:
+            return self.hlircounter.instance_count
+        if self.autoIncrement:
+            return self.getTable(program).size
+        program.emitWarning(
+            "{0} does not specify a max_size; using 1024", self.hlircounter)
+        return 1024
+
+    def getTable(self, program):
+        # The table this direct counter is bound to.
+        table = program.getTable(self.hlircounter.binding[1].name)
+        assert isinstance(table, ebpfTable.EbpfTable)
+        return table
+
+    def serialize(self, serializer, program):
+        # Emit the EBPF map declaration for this counter.
+        assert isinstance(serializer, ProgramSerializer)
+
+        # Direct counters have the same key as the associated table
+        # Static counters have integer keys
+        if self.autoIncrement:
+            keyTypeName = "struct " + self.getTable(program).keyTypeName
+        else:
+            keyTypeName = program.config.uprefix + "32"
+        program.config.serializeTableDeclaration(
+            serializer, self.dataMapName, True, keyTypeName,
+            self.valueTypeName, self.getSize(program))
+
+    def serializeCode(self, keyname, serializer, program):
+        # Emit C code that looks up `keyname` in the counter map and
+        # atomically adds the increment, inserting the entry on a miss.
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        serializer.emitIndent()
+        serializer.appendFormat("/* Update counter {0} */", self.name)
+        serializer.newline()
+
+        valueName = "ctrvalue"
+        initValuename = "init_val"
+
+        serializer.emitIndent()
+        serializer.appendFormat("{0} *{1};", self.valueTypeName, valueName)
+        serializer.newline()
+        serializer.emitIndent()
+        serializer.appendFormat("{0} {1};", self.valueTypeName, initValuename)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendLine("/* perform lookup */")
+        serializer.emitIndent()
+        program.config.serializeLookup(
+            serializer, self.dataMapName, keyname, valueName)
+        serializer.newline()
+
+        # Hit: atomic add on the existing value.
+        serializer.emitIndent()
+        serializer.appendFormat("if ({0} != NULL) ", valueName)
+        serializer.newline()
+        serializer.increaseIndent()
+        serializer.emitIndent()
+        serializer.appendFormat("__sync_fetch_and_add({0}, {1});",
+                                valueName, self.increment)
+        serializer.newline()
+        serializer.decreaseIndent()
+        serializer.emitIndent()
+
+        # Miss: insert a fresh entry initialized to the increment.
+        serializer.append("else ")
+        serializer.blockStart()
+        serializer.emitIndent()
+        serializer.appendFormat("{0} = {1};", initValuename, self.increment)
+        serializer.newline()
+
+        serializer.emitIndent()
+        program.config.serializeUpdate(
+            serializer, self.dataMapName, keyname, initValuename)
+        serializer.newline()
+        serializer.blockEnd(True)
diff --git a/src/cc/frontends/p4/compiler/ebpfDeparser.py b/src/cc/frontends/p4/compiler/ebpfDeparser.py
new file mode 100644
index 0000000..e4ab51f
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfDeparser.py
@@ -0,0 +1,172 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from collections import defaultdict, OrderedDict
+from p4_hlir.hlir import parse_call, p4_field, p4_parse_value_set, \
+    P4_DEFAULT, p4_parse_state, p4_table, \
+    p4_conditional_node, p4_parser_exception, \
+    p4_header_instance, P4_NEXT
+
+import ebpfProgram
+import ebpfInstance
+import ebpfType
+import ebpfStructType
+from topoSorting import Graph
+from programSerializer import ProgramSerializer
+
+def produce_parser_topo_sorting(hlir):
+    # This function is copied from the P4 behavioral model implementation
+    header_graph = Graph()
+
+    def walk_rec(hlir, parse_state, prev_hdr_node, tag_stacks_index):
+        assert(isinstance(parse_state, p4_parse_state))
+        for call in parse_state.call_sequence:
+            call_type = call[0]
+            if call_type == parse_call.extract:
+                hdr = call[1]
+
+                if hdr.virtual:
+                    base_name = hdr.base_name
+                    current_index = tag_stacks_index[base_name]
+                    if current_index > hdr.max_index:
+                        return
+                    tag_stacks_index[base_name] += 1
+                    name = base_name + "[%d]" % current_index
+                    hdr = hlir.p4_header_instances[name]
+
+                if hdr not in header_graph:
+                    header_graph.add_node(hdr)
+                hdr_node = header_graph.get_node(hdr)
+
+                if prev_hdr_node:
+                    prev_hdr_node.add_edge_to(hdr_node)
+                else:
+                    header_graph.root = hdr
+                prev_hdr_node = hdr_node
+
+        for branch_case, next_state in parse_state.branch_to.items():
+            if not next_state:
+                continue
+            if not isinstance(next_state, p4_parse_state):
+                continue
+            walk_rec(hlir, next_state, prev_hdr_node, tag_stacks_index.copy())
+
+    start_state = hlir.p4_parse_states["start"]
+    walk_rec(hlir, start_state, None, defaultdict(int))
+
+    header_topo_sorting = header_graph.produce_topo_sorting()
+
+    return header_topo_sorting
+
+class EbpfDeparser(object):
+    def __init__(self, hlir):
+        header_topo_sorting = produce_parser_topo_sorting(hlir)
+        self.headerOrder = [hdr.name for hdr in header_topo_sorting]
+
+    def serialize(self, serializer, program):
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        serializer.emitIndent()
+        serializer.blockStart()
+        serializer.emitIndent()
+        serializer.appendLine("/* Deparser */")
+        serializer.emitIndent()
+        serializer.appendFormat("{0} = 0;", program.offsetVariableName)
+        serializer.newline()
+        for h in self.headerOrder:
+            header = program.getHeaderInstance(h)
+            self.serializeHeaderEmit(header, serializer, program)
+        serializer.blockEnd(True)
+
+    def serializeHeaderEmit(self, header, serializer, program):
+        assert isinstance(header, ebpfInstance.EbpfHeader)
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        p4header = header.hlirInstance
+        assert isinstance(p4header, p4_header_instance)
+
+        serializer.emitIndent()
+        serializer.appendFormat("if ({0}.{1}.valid) ",
+                                program.headerStructName, header.name)
+        serializer.blockStart()
+
+        if ebpfProgram.EbpfProgram.isArrayElementInstance(p4header):
+            ebpfStack = program.getStackInstance(p4header.base_name)
+            assert isinstance(ebpfStack, ebpfInstance.EbpfHeaderStack)
+
+            if isinstance(p4header.index, int):
+                index = "[" + str(headerInstance.index) + "]"
+            elif p4header.index is P4_NEXT:
+                index = "[" + ebpfStack.indexVar + "]"
+            else:
+                raise CompilationException(
+                    True, "Unexpected index for array {0}",
+                    p4header.index)
+            basetype = ebpfStack.basetype
+        else:
+            ebpfHeader = program.getHeaderInstance(p4header.name)
+            basetype = ebpfHeader.type
+            index = ""
+
+        alignment = 0
+        for field in basetype.fields:
+            assert isinstance(field, ebpfStructType.EbpfField)
+
+            self.serializeFieldEmit(serializer, p4header.base_name,
+                                    index, field, alignment, program)
+            alignment += field.widthInBits()
+            alignment = alignment % 8
+        serializer.blockEnd(True)
+
+    def serializeFieldEmit(self, serializer, name, index,
+                           field, alignment, program):
+        assert isinstance(index, str)
+        assert isinstance(name, str)
+        assert isinstance(field, ebpfStructType.EbpfField)
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(alignment, int)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        if field.name == "valid":
+            return
+
+        fieldToEmit = (program.headerStructName + "." + name +
+                       index + "." + field.name)
+        width = field.widthInBits()
+        if width <= 32:
+            store = self.generatePacketStore(fieldToEmit, 0, alignment,
+                                             width, program)
+            serializer.emitIndent()
+            serializer.appendLine(store)
+        else:
+            # Destination is bigger than 4 bytes and
+            # represented as a byte array.
+            b = (width + 7) / 8
+            for i in range(0, b):
+                serializer.emitIndent()
+                store = self.generatePacketStore(fieldToEmit + "["+str(i)+"]",
+                                                 i,
+                                                 alignment,
+                                                 8, program)
+                serializer.appendLine(store)
+
+        serializer.emitIndent()
+        serializer.appendFormat("{0} += {1};",
+                                program.offsetVariableName, width)
+        serializer.newline()
+
+    def generatePacketStore(self, value, offset, alignment, width, program):
+        assert width > 0
+        assert alignment < 8
+        assert isinstance(width, int)
+        assert isinstance(alignment, int)
+
+        return "bpf_dins_pkt({0}, {1} / 8 + {2}, {3}, {4}, {5});".format(
+            program.packetName,
+            program.offsetVariableName,
+            offset,
+            alignment,
+            width,
+            value
+        )
diff --git a/src/cc/frontends/p4/compiler/ebpfInstance.py b/src/cc/frontends/p4/compiler/ebpfInstance.py
new file mode 100644
index 0000000..822688f
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfInstance.py
@@ -0,0 +1,87 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import p4_header_instance
+from ebpfType import EbpfType
+from compilationException import CompilationException
+from programSerializer import ProgramSerializer
+import typeFactory
+
+
+class EbpfInstanceBase(object):
+    def __init__(self):
+        pass
+
+
+class SimpleInstance(EbpfInstanceBase):
+    # A header or a metadata instance (but not array elements)
+    def __init__(self, hlirInstance, factory, isMetadata):
+        super(SimpleInstance, self).__init__()
+        self.hlirInstance = hlirInstance
+        self.name = hlirInstance.base_name
+        self.type = factory.build(hlirInstance.header_type, isMetadata)
+
+    def declare(self, serializer):
+        assert isinstance(serializer, ProgramSerializer)
+        self.type.declare(serializer, self.name, False)
+
+
+class EbpfHeader(SimpleInstance):
+    """ Represents a header instance from a P4 program """
+    def __init__(self, hlirHeaderInstance, factory):
+        super(EbpfHeader, self).__init__(hlirHeaderInstance, factory, False)
+        if hlirHeaderInstance.metadata:
+            raise CompilationException(True, "Metadata passed to EpbfHeader")
+        if hlirHeaderInstance.index is not None:
+            self.name += "_" + str(hlirHeaderInstance.index)
+
+
+class EbpfMetadata(SimpleInstance):
+    """Represents a metadata instance from a P4 program"""
+    def __init__(self, hlirMetadataInstance, factory):
+        super(EbpfMetadata, self).__init__(hlirMetadataInstance, factory, True)
+        if not hlirMetadataInstance.metadata:
+            raise CompilationException(
+                True, "Header instance passed to EpbfMetadata {0}",
+                hlirMetadataInstance)
+        if hlirMetadataInstance.index is not None:
+            raise CompilationException(
+                True, "Unexpected metadata array {0}", self.hlirInstance)
+        if hasattr(hlirMetadataInstance, "initializer"):
+            self.initializer = hlirMetadataInstance.initializer
+        else:
+            self.initializer = None
+
+    def emitInitializer(self, serializer):
+        assert isinstance(serializer, ProgramSerializer)
+        if self.initializer is None:
+            self.type.emitInitializer(serializer)
+        else:
+            for key in self.initializer.keys():
+                serializer.appendFormat(
+                    ".{0} = {1},", key, self.initializer[key])
+
+
+class EbpfHeaderStack(EbpfInstanceBase):
+    """Represents a header stack instance; there is one instance of
+    this class for each STACK, and not for each
+    element of the stack, as in the HLIR"""
+    def __init__(self, hlirInstance, indexVar, factory):
+        super(EbpfHeaderStack, self).__init__()
+
+        # indexVar: name of the ebpf variable that
+        # holds the current index for this stack
+        assert isinstance(indexVar, str)
+        assert isinstance(factory, typeFactory.EbpfTypeFactory)
+        assert isinstance(hlirInstance, p4_header_instance)
+
+        self.indexVar = indexVar
+        self.name = hlirInstance.base_name
+        self.basetype = factory.build(hlirInstance.header_type, False)
+        assert isinstance(self.basetype, EbpfType)
+        self.arraySize = hlirInstance.max_index + 1
+        self.hlirInstance = hlirInstance
+
+    def declare(self, serializer):
+        assert isinstance(serializer, ProgramSerializer)
+        self.basetype.declareArray(serializer, self.name, self.arraySize)
diff --git a/src/cc/frontends/p4/compiler/ebpfParser.py b/src/cc/frontends/p4/compiler/ebpfParser.py
new file mode 100644
index 0000000..300bc69
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfParser.py
@@ -0,0 +1,427 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import parse_call, p4_field, p4_parse_value_set, \
+    P4_DEFAULT, p4_parse_state, p4_table, \
+    p4_conditional_node, p4_parser_exception, \
+    p4_header_instance, P4_NEXT
+import ebpfProgram
+import ebpfStructType
+import ebpfInstance
+import programSerializer
+from compilationException import *
+
+
+class EbpfParser(object):
+    def __init__(self, hlirParser):  # hlirParser is a P4 parser
+        self.parser = hlirParser
+        self.name = hlirParser.name
+
+    def serialize(self, serializer, program):
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        serializer.emitIndent()
+        serializer.appendFormat("{0}: ", self.name)
+        serializer.blockStart()
+        for op in self.parser.call_sequence:
+            self.serializeOperation(serializer, op, program)
+
+        self.serializeBranch(serializer, self.parser.branch_on,
+                             self.parser.branch_to, program)
+
+        serializer.blockEnd(True)
+
+    def serializeSelect(self, selectVarName, serializer, branch_on, program):
+        # selectVarName - name of temp variable to use for the select expression
+        assert isinstance(selectVarName, str)
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        totalWidth = 0
+        switchValue = ""
+        for e in branch_on:
+            if isinstance(e, p4_field):
+                instance = e.instance
+                assert isinstance(instance, p4_header_instance)
+                index = ""
+
+                if ebpfProgram.EbpfProgram.isArrayElementInstance(instance):
+                    ebpfStack = program.getStackInstance(instance.base_name)
+                    assert isinstance(ebpfStack, ebpfInstance.EbpfHeaderStack)
+
+                    if isinstance(instance.index, int):
+                        index = "[" + str(instance.index) + "]"
+                    elif instance.index is P4_NEXT:
+                        index = "[" + ebpfStack.indexVar + "]"
+                    else:
+                        raise CompilationException(True,
+                            "Unexpected index for array {0}", instance.index)
+                    basetype = ebpfStack.basetype
+                    name = ebpfStack.name
+                else:
+                    ebpfHeader = program.getInstance(instance.name)
+                    assert isinstance(ebpfHeader, ebpfInstance.EbpfHeader)
+                    basetype = ebpfHeader.type
+                    name = ebpfHeader.name
+
+                ebpfField = basetype.getField(e.name)
+                assert isinstance(ebpfField, ebpfStructType.EbpfField)
+
+                totalWidth += ebpfField.widthInBits()
+                fieldReference = (program.headerStructName + "." + name +
+                                  index + "." + ebpfField.name)
+
+                if switchValue == "":
+                    switchValue = fieldReference
+                else:
+                    switchValue = ("(" + switchValue + " << " +
+                                   str(ebpfField.widthInBits()) + ")")
+                    switchValue = switchValue + " | " + fieldReference
+            elif isinstance(e, tuple):
+                switchValue = self.currentReferenceAsString(e, program)
+            else:
+                raise CompilationException(
+                    True, "Unexpected element in match {0}", e)
+
+        if totalWidth > 32:
+            raise NotSupportedException("{0}: Matching on {1}-bit value",
+                                        branch_on, totalWidth)
+        serializer.emitIndent()
+        serializer.appendFormat("{0}32 {1} = {2};",
+                                program.config.uprefix,
+                                selectVarName, switchValue)
+        serializer.newline()
+
+    def generatePacketLoad(self, startBit, width, alignment, program):
+        # Generates an expression that does a load_*, shift and mask
+        # to load 'width' bits starting at startBit from the current
+        # packet offset.
+        # alignment is an integer <= 8 that holds the current alignment
+        # of of the packet offset.
+        assert width > 0
+        assert alignment < 8
+        assert isinstance(startBit, int)
+        assert isinstance(width, int)
+        assert isinstance(alignment, int)
+
+        firstBitIndex = startBit + alignment
+        lastBitIndex = startBit + width + alignment - 1
+        firstWordIndex = firstBitIndex / 8
+        lastWordIndex = lastBitIndex / 8
+
+        wordsToRead = lastWordIndex - firstWordIndex + 1
+        if wordsToRead == 1:
+            load = "load_byte"
+            loadSize = 8
+        elif wordsToRead == 2:
+            load = "load_half"
+            loadSize = 16
+        elif wordsToRead <= 4:
+            load = "load_word"
+            loadSize = 32
+        elif wordsToRead <= 8:
+            load = "load_dword"
+            loadSize = 64
+        else:
+            raise CompilationException(True, "Attempt to load more than 1 word")
+
+        readtype = program.config.uprefix + str(loadSize)
+        loadInstruction = "{0}({1}, ({2} + {3}) / 8)".format(
+            load, program.packetName, program.offsetVariableName, startBit)
+        shift = loadSize - alignment - width
+        load = "(({0}) >> ({1}))".format(loadInstruction, shift)
+        if width != loadSize:
+            mask = " & EBPF_MASK({0}, {1})".format(readtype, width)
+        else:
+            mask = ""
+        return load + mask
+
+    def currentReferenceAsString(self, tpl, program):
+        # a string describing an expression of the form current(position, width)
+        # The assumption is that at this point the packet cursor is ALWAYS
+        # byte aligned.  This should be true because headers are supposed
+        # to have sizes an integral number of bytes.
+        assert isinstance(tpl, tuple)
+        if len(tpl) != 2:
+            raise CompilationException(
+                True, "{0} Expected a tuple with 2 elements", tpl)
+
+        minIndex = tpl[0]
+        totalWidth = tpl[1]
+        result = self.generatePacketLoad(
+            minIndex, totalWidth, 0, program) # alignment is 0
+        return result
+
+    def serializeCases(self, selectVarName, serializer, branch_to, program):
+        assert isinstance(selectVarName, str)
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        branches = 0
+        seenDefault = False
+        for e in branch_to.keys():
+            serializer.emitIndent()
+            value = branch_to[e]
+
+            if isinstance(e, int):
+                serializer.appendFormat("if ({0} == {1})", selectVarName, e)
+            elif isinstance(e, tuple):
+                serializer.appendFormat(
+                    "if (({0} & {1}) == {2})", selectVarName, e[0], e[1])
+            elif isinstance(e, p4_parse_value_set):
+                raise NotSupportedException("{0}: Parser value sets", e)
+            elif e is P4_DEFAULT:
+                seenDefault = True
+                if branches > 0:
+                    serializer.append("else")
+            else:
+                raise CompilationException(
+                    True, "Unexpected element in match case {0}", e)
+
+            branches += 1
+            serializer.newline()
+            serializer.increaseIndent()
+            serializer.emitIndent()
+
+            label = program.getLabel(value)
+
+            if isinstance(value, p4_parse_state):
+                serializer.appendFormat("goto {0};", label)
+            elif isinstance(value, p4_table):
+                serializer.appendFormat("goto {0};", label)
+            elif isinstance(value, p4_conditional_node):
+                serializer.appendFormat("goto {0};", label)
+            elif isinstance(value, p4_parser_exception):
+                raise CompilationException(True, "Not yet implemented")
+            else:
+                raise CompilationException(
+                    True, "Unexpected element in match case {0}", value)
+
+            serializer.decreaseIndent()
+            serializer.newline()
+
+        # Must create default if it is missing
+        if not seenDefault:
+            serializer.emitIndent()
+            serializer.appendFormat(
+                "{0} = p4_pe_unhandled_select;", program.errorName)
+            serializer.newline()
+            serializer.emitIndent()
+            serializer.appendFormat("default: goto end;")
+            serializer.newline()
+
+    def serializeBranch(self, serializer, branch_on, branch_to, program):
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        if branch_on == []:
+            dest = branch_to.values()[0]
+            serializer.emitIndent()
+            name = program.getLabel(dest)
+            serializer.appendFormat("goto {0};", name)
+            serializer.newline()
+        elif isinstance(branch_on, list):
+            tmpvar = program.generateNewName("tmp")
+            self.serializeSelect(tmpvar, serializer, branch_on, program)
+            self.serializeCases(tmpvar, serializer, branch_to, program)
+        else:
+            raise CompilationException(
+                True, "Unexpected branch_on {0}", branch_on)
+
+    def serializeOperation(self, serializer, op, program):
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        operation = op[0]
+        if operation is parse_call.extract:
+            self.serializeExtract(serializer, op[1], program)
+        elif operation is parse_call.set:
+            self.serializeMetadataSet(serializer, op[1], op[2], program)
+        else:
+            raise CompilationException(
+                True, "Unexpected operation in parser {0}", op)
+
+    def serializeFieldExtract(self, serializer, headerInstanceName,
+                              index, field, alignment, program):
+        assert isinstance(index, str)
+        assert isinstance(headerInstanceName, str)
+        assert isinstance(field, ebpfStructType.EbpfField)
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(alignment, int)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        fieldToExtractTo = headerInstanceName + index + "." + field.name
+
+        serializer.emitIndent()
+        width = field.widthInBits()
+        if field.name == "valid":
+            serializer.appendFormat(
+                "{0}.{1} = 1;", program.headerStructName, fieldToExtractTo)
+            serializer.newline()
+            return
+
+        serializer.appendFormat("if ({0}->len < BYTES({1} + {2})) ",
+                                program.packetName,
+                                program.offsetVariableName, width)
+        serializer.blockStart()
+        serializer.emitIndent()
+        serializer.appendFormat("{0} = p4_pe_header_too_short;",
+                                program.errorName)
+        serializer.newline()
+        serializer.emitIndent()
+        serializer.appendLine("goto end;")
+        # TODO: jump to correct exception handler
+        serializer.blockEnd(True)
+
+        if width <= 32:
+            serializer.emitIndent()
+            load = self.generatePacketLoad(0, width, alignment, program)
+
+            serializer.appendFormat("{0}.{1} = {2};",
+                                    program.headerStructName,
+                                    fieldToExtractTo, load)
+            serializer.newline()
+        else:
+            # Destination is bigger than 4 bytes and
+            # represented as a byte array.
+            if alignment == 0:
+                shift = 0
+            else:
+                shift = 8 - alignment
+
+            assert shift >= 0
+            if shift == 0:
+                method = "load_byte"
+            else:
+                method = "load_half"
+            b = (width + 7) / 8
+            for i in range(0, b):
+                serializer.emitIndent()
+                serializer.appendFormat("{0}.{1}[{2}] = ({3}8)",
+                                        program.headerStructName,
+                                        fieldToExtractTo, i,
+                                        program.config.uprefix)
+                serializer.appendFormat("(({0}({1}, ({2} / 8) + {3}) >> {4})",
+                                        method, program.packetName,
+                                        program.offsetVariableName, i, shift)
+                if (i == b - 1) and (width % 8 != 0):
+                    serializer.appendFormat(" & EBPF_MASK({0}8, {1})",
+                                            program.config.uprefix, width % 8)
+                serializer.append(")")
+                serializer.endOfStatement(True)
+
+        serializer.emitIndent()
+        serializer.appendFormat("{0} += {1};",
+                                program.offsetVariableName, width)
+        serializer.newline()
+
+    def serializeExtract(self, serializer, headerInstance, program):
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(headerInstance, p4_header_instance)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        if ebpfProgram.EbpfProgram.isArrayElementInstance(headerInstance):
+            ebpfStack = program.getStackInstance(headerInstance.base_name)
+            assert isinstance(ebpfStack, ebpfInstance.EbpfHeaderStack)
+
+            # write bounds check
+            serializer.emitIndent()
+            serializer.appendFormat("if ({0} >= {1}) ",
+                                    ebpfStack.indexVar, ebpfStack.arraySize)
+            serializer.blockStart()
+            serializer.emitIndent()
+            serializer.appendFormat("{0} = p4_pe_index_out_of_bounds;",
+                                    program.errorName)
+            serializer.newline()
+            serializer.emitIndent()
+            serializer.appendLine("goto end;")
+            serializer.blockEnd(True)
+
+            if isinstance(headerInstance.index, int):
+                index = "[" + str(headerInstance.index) + "]"
+            elif headerInstance.index is P4_NEXT:
+                index = "[" + ebpfStack.indexVar + "]"
+            else:
+                raise CompilationException(
+                    True, "Unexpected index for array {0}",
+                    headerInstance.index)
+            basetype = ebpfStack.basetype
+        else:
+            ebpfHeader = program.getHeaderInstance(headerInstance.name)
+            basetype = ebpfHeader.type
+            index = ""
+
+        # extract all fields
+        alignment = 0
+        for field in basetype.fields:
+            assert isinstance(field, ebpfStructType.EbpfField)
+
+            self.serializeFieldExtract(serializer, headerInstance.base_name,
+                                       index, field, alignment, program)
+            alignment += field.widthInBits()
+            alignment = alignment % 8
+
+        if ebpfProgram.EbpfProgram.isArrayElementInstance(headerInstance):
+            # increment stack index
+            ebpfStack = program.getStackInstance(headerInstance.base_name)
+            assert isinstance(ebpfStack, ebpfInstance.EbpfHeaderStack)
+
+            # write bounds check
+            serializer.emitIndent()
+            serializer.appendFormat("{0}++;", ebpfStack.indexVar)
+            serializer.newline()
+
+    def serializeMetadataSet(self, serializer, field, value, program):
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+        assert isinstance(field, p4_field)
+
+        dest = program.getInstance(field.instance.name)
+        assert isinstance(dest, ebpfInstance.SimpleInstance)
+        destType = dest.type
+        assert isinstance(destType, ebpfStructType.EbpfStructType)
+        destField = destType.getField(field.name)
+
+        if destField.widthInBits() > 32:
+            useMemcpy = True
+            bytesToCopy = destField.widthInBits() / 8
+            if destField.widthInBits() % 8 != 0:
+                raise CompilationException(
+                    True,
+                    "{0}: Not implemented: wide field w. sz not multiple of 8",
+                    field)
+        else:
+            useMemcpy = False
+            bytesToCopy = None # not needed, but compiler is confused
+
+        serializer.emitIndent()
+        destination = "{0}.{1}.{2}".format(
+            program.metadataStructName, dest.name, destField.name)
+        if isinstance(value, int):
+            source = str(value)
+            if useMemcpy:
+                raise CompilationException(
+                    True,
+                    "{0}: Not implemented: copying from wide constant",
+                    value)
+        elif isinstance(value, tuple):
+            source = self.currentReferenceAsString(value, program)
+        elif isinstance(value, p4_field):
+            source = program.getInstance(value.instance.name)
+            if isinstance(source, ebpfInstance.EbpfMetadata):
+                sourceStruct = program.metadataStructName
+            else:
+                sourceStruct = program.headerStructName
+            source = "{0}.{1}.{2}".format(sourceStruct, source.name, value.name)
+        else:
+            raise CompilationException(
+                True, "Unexpected type for parse_call.set {0}", value)
+
+        if useMemcpy:
+            serializer.appendFormat("memcpy(&{0}, &{1}, {2})",
+                                    destination, source, bytesToCopy)
+        else:
+            serializer.appendFormat("{0} = {1}", destination, source)
+
+        serializer.endOfStatement(True)
diff --git a/src/cc/frontends/p4/compiler/ebpfProgram.py b/src/cc/frontends/p4/compiler/ebpfProgram.py
new file mode 100644
index 0000000..1237175
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfProgram.py
@@ -0,0 +1,506 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import p4_header_instance, p4_table, \
+     p4_conditional_node, p4_action, p4_parse_state
+from p4_hlir.main import HLIR
+import typeFactory
+import ebpfTable
+import ebpfParser
+import ebpfAction
+import ebpfInstance
+import ebpfConditional
+import ebpfCounter
+import ebpfDeparser
+import programSerializer
+import target
+from compilationException import *
+
+
+class EbpfProgram(object):
+    def __init__(self, name, hlir, isRouter, config):
+        """Representation of an EbpfProgram (in fact,
+        a C program that is converted to EBPF)"""
+        # name: base name of the generated program.
+        # hlir: parsed P4 high-level IR (p4_hlir.main.HLIR).
+        # isRouter: selects clone/redirect vs. plain drop semantics in toC().
+        # config: target.TargetConfig describing the C dialect to emit.
+        assert isinstance(hlir, HLIR)
+        assert isinstance(isRouter, bool)
+        assert isinstance(config, target.TargetConfig)
+
+        self.hlir = hlir
+        self.name = name
+        self.uniqueNameCounter = 0
+        self.config = config
+        self.isRouter = isRouter
+        # Prefix used for every compiler-generated C identifier, to avoid
+        # clashes with names coming from the P4 program.
+        self.reservedPrefix = "ebpf_"
+
+        # NOTE(review): duplicate of the assert above; harmless but redundant.
+        assert isinstance(config, target.TargetConfig)
+
+        self.packetName = self.reservedPrefix + "packet"
+        self.dropBit = self.reservedPrefix + "drop"
+        self.license = "GPL"
+        self.offsetVariableName = self.reservedPrefix + "packetOffsetInBits"
+        self.zeroKeyName = self.reservedPrefix + "zero"
+        self.arrayIndexType = self.config.uprefix + "32"
+        # all array tables must be indexed with u32 values
+
+        self.errorName = self.reservedPrefix + "error"
+        self.functionName = self.reservedPrefix + "filter"
+        self.egressPortName = "egress_port" # Hardwired in P4 definition
+
+        self.typeFactory = typeFactory.EbpfTypeFactory(config)
+        # Parser/runtime error codes emitted as a C enum by generatePreamble().
+        self.errorCodes = [
+            "p4_pe_no_error",
+            "p4_pe_index_out_of_bounds",
+            "p4_pe_out_of_packet",
+            "p4_pe_header_too_long",
+            "p4_pe_header_too_short",
+            "p4_pe_unhandled_select",
+            "p4_pe_checksum"]
+
+        self.actions = []
+        self.conditionals = []
+        self.tables = []
+        self.headers = []   # header instances
+        self.metadata = []  # metadata instances
+        self.stacks = []    # header stack instances EbpfHeaderStack
+        self.parsers = []   # all parsers
+        self.deparser = None
+        self.entryPoints = []  # control-flow entry points from parser
+        self.counters = []
+        self.entryPointLabels = {}  # maps p4_node from entryPoints
+                                    # to labels in the C program
+        self.egressEntry = None
+
+        # Populates the lists above from the HLIR.
+        self.construct()
+
+        self.headersStructTypeName = self.reservedPrefix + "headers_t"
+        self.headerStructName = self.reservedPrefix + "headers"
+        self.metadataStructTypeName = self.reservedPrefix + "metadata_t"
+        self.metadataStructName = self.reservedPrefix + "metadata"
+
+    def construct(self):
+        """Walks the HLIR and builds the Ebpf* wrapper objects for all
+        headers, metadata, stacks, parsers, actions, counters, tables and
+        conditionals; records ingress entry points and the egress pointer."""
+        # Calculated fields are not supported at all; reject early.
+        # NOTE(review): .values()[0] indexes a dict's values directly,
+        # which is Python 2 only.
+        if len(self.hlir.p4_field_list_calculations) > 0:
+            raise NotSupportedException(
+                "{0} calculated field",
+                self.hlir.p4_field_list_calculations.values()[0].name)
+
+        for h in self.hlir.p4_header_instances.values():
+            if h.max_index is not None:
+                assert isinstance(h, p4_header_instance)
+                if h.index == 0:
+                    # header stack; allocate only for zero-th index
+                    indexVarName = self.generateNewName(h.base_name + "_index")
+                    stack = ebpfInstance.EbpfHeaderStack(
+                        h, indexVarName, self.typeFactory)
+                    self.stacks.append(stack)
+            elif h.metadata:
+                metadata = ebpfInstance.EbpfMetadata(h, self.typeFactory)
+                self.metadata.append(metadata)
+            else:
+                header = ebpfInstance.EbpfHeader(h, self.typeFactory)
+                self.headers.append(header)
+
+        for p in self.hlir.p4_parse_states.values():
+            parser = ebpfParser.EbpfParser(p)
+            self.parsers.append(parser)
+
+        for a in self.hlir.p4_actions.values():
+            # Built-in actions are skipped; see isInternalAction().
+            if self.isInternalAction(a):
+                continue
+            action = ebpfAction.EbpfAction(a, self)
+            self.actions.append(action)
+
+        for c in self.hlir.p4_counters.values():
+            counter = ebpfCounter.EbpfCounter(c, self)
+            self.counters.append(counter)
+
+        for t in self.hlir.p4_tables.values():
+            table = ebpfTable.EbpfTable(t, self, self.config)
+            self.tables.append(table)
+
+        for n in self.hlir.p4_ingress_ptr.keys():
+            self.entryPoints.append(n)
+
+        for n in self.hlir.p4_conditional_nodes.values():
+            conditional = ebpfConditional.EbpfConditional(n, self)
+            self.conditionals.append(conditional)
+
+        self.egressEntry = self.hlir.p4_egress_ptr
+        self.deparser = ebpfDeparser.EbpfDeparser(self.hlir)
+
+    def isInternalAction(self, action):
+        """True if `action` looks like a compiler built-in (no source line)."""
+        # This is a heuristic really to guess which actions are built-in
+        # Unfortunately there seems to be no other way to do this
+        return action.lineno < 0
+
+    @staticmethod
+    def isArrayElementInstance(headerInstance):
+        """True if the header instance is an element of a header stack."""
+        assert isinstance(headerInstance, p4_header_instance)
+        return headerInstance.max_index is not None
+
+    def emitWarning(self, formatString, *message):
+        """Prints a formatted compiler warning to stdout."""
+        assert isinstance(formatString, str)
+        print("WARNING: ", formatString.format(*message))
+
+    def toC(self, serializer):
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        self.generateIncludes(serializer)
+        self.generatePreamble(serializer)
+        self.generateTypes(serializer)
+        self.generateTables(serializer)
+
+        serializer.newline()
+        serializer.emitIndent()
+        self.config.serializeCodeSection(serializer)
+        serializer.newline()
+        serializer.emitIndent()
+        serializer.appendFormat("int {0}(struct __sk_buff* {1}) ",
+                                self.functionName, self.packetName)
+        serializer.blockStart()
+
+        self.generateHeaderInstance(serializer)
+        serializer.append(" = ")
+        self.generateInitializeHeaders(serializer)
+        serializer.endOfStatement(True)
+
+        self.generateMetadataInstance(serializer)
+        serializer.append(" = ")
+        self.generateInitializeMetadata(serializer)
+        serializer.endOfStatement(True)
+
+        self.createLocalVariables(serializer)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendLine("goto start;")
+
+        self.generateParser(serializer)
+        self.generatePipeline(serializer)
+
+        self.generateDeparser(serializer)
+
+        serializer.emitIndent()
+        serializer.appendLine("end:")
+        serializer.emitIndent()
+
+        if isinstance(self.config, target.KernelSamplesConfig):
+            serializer.appendFormat("return {0};", self.dropBit)
+            serializer.newline()
+        elif isinstance(self.config, target.BccConfig):
+            if self.isRouter:
+                serializer.appendFormat("if (!{0})", self.dropBit)
+                serializer.newline()
+                serializer.increaseIndent()
+                serializer.emitIndent()
+                serializer.appendFormat(
+                    "bpf_clone_redirect({0}, {1}.standard_metadata.{2}, 0);",
+                    self.packetName, self.metadataStructName,
+                    self.egressPortName)
+                serializer.newline()
+                serializer.decreaseIndent()
+
+                serializer.emitIndent()
+                serializer.appendLine(
+                    "return TC_ACT_SHOT /* drop packet; clone is forwarded */;")
+            else:
+                serializer.appendFormat(
+                    "return {1} ? TC_ACT_SHOT : TC_ACT_PIPE;",
+                    self.dropBit)
+                serializer.newline()
+        else:
+            raise CompilationException(
+                True, "Unexpected target configuration {0}",
+                self.config.targetName)
+        serializer.blockEnd(True)
+
+        self.generateLicense(serializer)
+
+        serializer.append(self.config.postamble)
+
+    def generateLicense(self, serializer):
+        """Delegates license emission to the target configuration."""
+        self.config.serializeLicense(serializer, self.license)
+
+    # noinspection PyMethodMayBeStatic
+    def generateIncludes(self, serializer):
+        """Writes the target-specific #include block."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        serializer.append(self.config.getIncludes())
+
+    def getLabel(self, p4node):
+        # C label that corresponds to this point in the control-flow
+        # None maps to the shared "end" label; parse states use their own
+        # name; any other node gets a fresh unique label, memoized in
+        # entryPointLabels so repeated calls return the same label.
+        if p4node is None:
+            return "end"
+        elif isinstance(p4node, p4_parse_state):
+            label = p4node.name
+            self.entryPointLabels[p4node.name] = label
+        # For parse states the entry just stored makes this check false.
+        if p4node.name not in self.entryPointLabels:
+            label = self.generateNewName(p4node.name)
+            self.entryPointLabels[p4node.name] = label
+        return self.entryPointLabels[p4node.name]
+
+    # noinspection PyMethodMayBeStatic
+    def generatePreamble(self, serializer):
+        """Emits the ErrorCode enum, helper macros and the target's
+        dword definition at the top of the generated C file."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        serializer.emitIndent()
+        serializer.append("enum ErrorCode ")
+        serializer.blockStart()
+        for error in self.errorCodes:
+            serializer.emitIndent()
+            serializer.appendFormat("{0},", error)
+            serializer.newline()
+        serializer.blockEnd(False)
+        serializer.endOfStatement(True)
+        serializer.newline()
+
+        # NOTE(review): BYTES() does not parenthesize `w`; callers appear to
+        # pass simple expressions, but `w` as e.g. `a | b` would misbind.
+        serializer.appendLine(
+            "#define EBPF_MASK(t, w) ((((t)(1)) << (w)) - (t)1)")
+        serializer.appendLine("#define BYTES(w) ((w + 7) / 8)")
+
+        self.config.generateDword(serializer)
+
+    def generateNewName(self, base):  # base is a string
+        """Generates a fresh name based on the specified base name"""
+        # TODO: this should be made "safer"
+        assert isinstance(base, str)
+
+        # Appends a monotonically increasing counter to guarantee uniqueness
+        # within this program instance.
+        base += "_" + str(self.uniqueNameCounter)
+        self.uniqueNameCounter += 1
+        return base
+
+    def generateTypes(self, serializer):
+        """Emits all struct types from the type factory plus the synthetic
+        per-packet headers struct and metadata struct."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        for t in self.typeFactory.type_map.values():
+            t.serialize(serializer)
+
+        # generate a new struct type for the packet itself
+        serializer.appendFormat("struct {0} ", self.headersStructTypeName)
+        serializer.blockStart()
+        for h in self.headers:
+            serializer.emitIndent()
+            h.declare(serializer)
+            serializer.endOfStatement(True)
+
+        for h in self.stacks:
+            assert isinstance(h, ebpfInstance.EbpfHeaderStack)
+
+            serializer.emitIndent()
+            h.declare(serializer)
+            serializer.endOfStatement(True)
+
+        serializer.blockEnd(False)
+        serializer.endOfStatement(True)
+
+        # generate a new struct type for the metadata
+        serializer.appendFormat("struct {0} ", self.metadataStructTypeName)
+        serializer.blockStart()
+        for h in self.metadata:
+            assert isinstance(h, ebpfInstance.EbpfMetadata)
+
+            serializer.emitIndent()
+            h.declare(serializer)
+            serializer.endOfStatement(True)
+        serializer.blockEnd(False)
+        serializer.endOfStatement(True)
+
+    def generateTables(self, serializer):
+        """Emits map/table definitions for all tables and counters."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        for t in self.tables:
+            t.serialize(serializer, self)
+
+        for c in self.counters:
+            c.serialize(serializer, self)
+
+    def generateHeaderInstance(self, serializer):
+        """Emits the declaration of the local headers struct variable."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        serializer.emitIndent()
+        serializer.appendFormat(
+            "struct {0} {1}", self.headersStructTypeName, self.headerStructName)
+
+    def generateInitializeHeaders(self, serializer):
+        """Emits a designated-initializer block for all header instances."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        serializer.blockStart()
+        for h in self.headers:
+            serializer.emitIndent()
+            serializer.appendFormat(".{0} = ", h.name)
+            h.type.emitInitializer(serializer)
+            serializer.appendLine(",")
+        serializer.blockEnd(False)
+
+    def generateMetadataInstance(self, serializer):
+        """Emits the declaration of the local metadata struct variable."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        serializer.emitIndent()
+        serializer.appendFormat(
+            "struct {0} {1}",
+            self.metadataStructTypeName,
+            self.metadataStructName)
+
+    def generateDeparser(self, serializer):
+        """Delegates deparser emission to the EbpfDeparser built earlier."""
+        self.deparser.serialize(serializer, self)
+
+    def generateInitializeMetadata(self, serializer):
+        """Emits a designated-initializer block for all metadata instances."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        serializer.blockStart()
+        for h in self.metadata:
+            serializer.emitIndent()
+            serializer.appendFormat(".{0} = ", h.name)
+            h.emitInitializer(serializer)
+            serializer.appendLine(",")
+        serializer.blockEnd(False)
+
+    def createLocalVariables(self, serializer):
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+
+        serializer.emitIndent()
+        serializer.appendFormat("unsigned {0} = 0;", self.offsetVariableName)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendFormat(
+            "enum ErrorCode {0} = p4_pe_no_error;", self.errorName)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendFormat(
+            "{0}8 {1} = 0;", self.config.uprefix, self.dropBit)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendFormat(
+            "{0} {1} = 0;", self.arrayIndexType, self.zeroKeyName)
+        serializer.newline()
+
+        for h in self.stacks:
+            serializer.emitIndent()
+            serializer.appendFormat(
+                "{0}8 {0} = 0;", self.config.uprefix, h.indexVar)
+            serializer.newline()
+
+    def getStackInstance(self, name):
+        """Returns the EbpfHeaderStack with this name; raises if absent."""
+        assert isinstance(name, str)
+
+        for h in self.stacks:
+            if h.name == name:
+                assert isinstance(h, ebpfInstance.EbpfHeaderStack)
+                return h
+        raise CompilationException(
+            True, "Could not locate header stack named {0}", name)
+
+    def getHeaderInstance(self, name):
+        """Returns the EbpfHeader instance with this name; raises if absent."""
+        assert isinstance(name, str)
+
+        for h in self.headers:
+            if h.name == name:
+                assert isinstance(h, ebpfInstance.EbpfHeader)
+                return h
+        raise CompilationException(
+            True, "Could not locate header instance named {0}", name)
+
+    def getInstance(self, name):
+        """Returns a header or metadata instance by name; raises if absent."""
+        assert isinstance(name, str)
+
+        for h in self.headers:
+            if h.name == name:
+                return h
+        for h in self.metadata:
+            if h.name == name:
+                return h
+        raise CompilationException(
+            True, "Could not locate instance named {0}", name)
+
+    def getAction(self, p4action):
+        """Returns the EbpfAction wrapping p4action; if none exists, wraps
+        it as a BuiltinAction, registers it and returns the new wrapper."""
+        assert isinstance(p4action, p4_action)
+        for a in self.actions:
+            if a.name == p4action.name:
+                return a
+
+        newAction = ebpfAction.BuiltinAction(p4action)
+        self.actions.append(newAction)
+        return newAction
+
+    def getTable(self, name):
+        """Returns the EbpfTable with this name; raises if absent."""
+        assert isinstance(name, str)
+        for t in self.tables:
+            if t.name == name:
+                return t
+        raise CompilationException(
+            True, "Could not locate table named {0}", name)
+
+    def getCounter(self, name):
+        """Returns the EbpfCounter with this name; raises if absent."""
+        assert isinstance(name, str)
+        for t in self.counters:
+            if t.name == name:
+                return t
+        raise CompilationException(
+            True, "Could not locate counters named {0}", name)
+
+    def getConditional(self, name):
+        """Returns the EbpfConditional with this name; raises if absent."""
+        assert isinstance(name, str)
+        for c in self.conditionals:
+            if c.name == name:
+                return c
+        raise CompilationException(
+            True, "Could not locate conditional named {0}", name)
+
+    def generateParser(self, serializer):
+        """Emits the parser code for all parse states."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        for p in self.parsers:
+            p.serialize(serializer, self)
+
+    def generateIngressPipeline(self, serializer):
+        """Emits a bare C label for each table (labels only, no code)."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        for t in self.tables:
+            assert isinstance(t, ebpfTable.EbpfTable)
+            serializer.emitIndent()
+            serializer.appendFormat("{0}:", t.name)
+            serializer.newline()
+
+    def generateControlFlowNode(self, serializer, node, nextEntryPoint):
+        """Emits code for one control-flow node (table or conditional)."""
+        # nextEntryPoint is used as a target whenever the target is None
+        # nextEntryPoint may also be None
+        if isinstance(node, p4_table):
+            table = self.getTable(node.name)
+            assert isinstance(table, ebpfTable.EbpfTable)
+            table.serializeCode(serializer, self, nextEntryPoint)
+        elif isinstance(node, p4_conditional_node):
+            conditional = self.getConditional(node.name)
+            assert isinstance(conditional, ebpfConditional.EbpfConditional)
+            conditional.generateCode(serializer, self, nextEntryPoint)
+        else:
+            raise CompilationException(
+                True, "{0} Unexpected control flow node ", node)
+
+    def generatePipelineInternal(self, serializer, nodestoadd, nextEntryPoint):
+        """Worklist traversal: emits every control-flow node reachable from
+        nodestoadd exactly once, following each node's next_ successors."""
+        assert isinstance(serializer, programSerializer.ProgramSerializer)
+        assert isinstance(nodestoadd, set)
+
+        done = set()
+        while len(nodestoadd) > 0:
+            todo = nodestoadd.pop()
+            if todo in done:
+                continue
+            if todo is None:
+                continue
+
+            # Progress trace printed to stdout during compilation.
+            print("Generating ", todo.name)
+
+            done.add(todo)
+            self.generateControlFlowNode(serializer, todo, nextEntryPoint)
+
+            for n in todo.next_.values():
+                nodestoadd.add(n)
+
+    def generatePipeline(self, serializer):
+        """Emits the ingress pipeline (falling through to egress), then the
+        egress pipeline (falling through to the end label)."""
+        todo = set()
+        for e in self.entryPoints:
+            todo.add(e)
+        self.generatePipelineInternal(serializer, todo, self.egressEntry)
+        todo = set()
+        todo.add(self.egressEntry)
+        self.generatePipelineInternal(serializer, todo, None)
diff --git a/src/cc/frontends/p4/compiler/ebpfScalarType.py b/src/cc/frontends/p4/compiler/ebpfScalarType.py
new file mode 100644
index 0000000..cb5db21
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfScalarType.py
@@ -0,0 +1,84 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import P4_AUTO_WIDTH
+from ebpfType import *
+from compilationException import *
+from programSerializer import ProgramSerializer
+
+
+class EbpfScalarType(EbpfType):
+    __doc__ = "Represents a scalar type"
+    def __init__(self, parent, widthInBits, isSigned, config):
+        # parent: HLIR object used only for error reporting.
+        # widthInBits: field width; widths > 32 are emitted as char arrays.
+        # config: target config providing the i/u type-name prefixes.
+        super(EbpfScalarType, self).__init__(None)
+        # NOTE(review): if P4_AUTO_WIDTH is not an int this assert fires
+        # before the dedicated check below — confirm intended ordering.
+        assert isinstance(widthInBits, int)
+        assert isinstance(isSigned, bool)
+        self.width = widthInBits
+        self.isSigned = isSigned
+        self.config = config
+        if widthInBits is P4_AUTO_WIDTH:
+            raise NotSupportedException("{0} Variable-width field", parent)
+
+    def widthInBits(self):
+        return self.width
+
+    @staticmethod
+    def bytesRequired(width):
+        # Python 2 integer division: rounds the bit width up to whole bytes.
+        return (width + 7) / 8
+
+    def asString(self):
+        """Returns the C type name: <prefix>8/16/32, or char* for
+        widths over 32 bits."""
+        if self.isSigned:
+            prefix = self.config.iprefix
+        else:
+            prefix = self.config.uprefix
+
+        if self.width <= 8:
+            name = prefix + "8"
+        elif self.width <= 16:
+            name = prefix + "16"
+        elif self.width <= 32:
+            name = prefix + "32"
+        else:
+            name = "char*"
+        return name
+
+    def alignment(self):
+        """Returns the C alignment in bytes for this scalar."""
+        if self.width <= 8:
+            return 1
+        elif self.width <= 16:
+            return 2
+        elif self.width <= 32:
+            return 4
+        else:
+            return 1  # Char array
+
+    def serialize(self, serializer):
+        """Writes the C type name."""
+        assert isinstance(serializer, ProgramSerializer)
+        serializer.append(self.asString())
+
+    def declareArray(self, serializer, identifier, size):
+        raise CompilationException(
+            True, "Arrays of base type not expected in P4")
+
+    def declare(self, serializer, identifier, asPointer):
+        """Declares `identifier` of this type; wide fields (> 32 bits)
+        become char arrays (or char* when asPointer is set)."""
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(asPointer, bool)
+        assert isinstance(identifier, str)
+
+        if self.width <= 32:
+            self.serialize(serializer)
+            if asPointer:
+                serializer.append("*")
+            serializer.space()
+            serializer.append(identifier)
+        else:
+            if asPointer:
+                serializer.append("char*")
+            else:
+                serializer.appendFormat(
+                    "char {0}[{1}]", identifier,
+                    EbpfScalarType.bytesRequired(self.width))
+
+    def emitInitializer(self, serializer):
+        """Scalars are zero-initialized."""
+        assert isinstance(serializer, ProgramSerializer)
+        serializer.append("0")
diff --git a/src/cc/frontends/p4/compiler/ebpfStructType.py b/src/cc/frontends/p4/compiler/ebpfStructType.py
new file mode 100644
index 0000000..8efa2d2
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfStructType.py
@@ -0,0 +1,128 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import P4_SIGNED, P4_SATURATING
+from ebpfScalarType import *
+
+
+class EbpfField(object):
+    __doc__ = "represents a field in a struct type, not in an instance"
+
+    def __init__(self, hlirParentType, name, widthInBits, attributes, config):
+        # attributes: set of P4 field attributes (P4_SIGNED, P4_SATURATING).
+        self.name = name
+        self.width = widthInBits
+        self.hlirType = hlirParentType
+        signed = False
+        if P4_SIGNED in attributes:
+            signed = True
+        if P4_SATURATING in attributes:
+            raise NotSupportedException(
+                "{0}.{1}: Saturated types", self.hlirType, self.name)
+
+        # Re-raise type-construction failures with field context attached.
+        # (Python 2 `except X, e` syntax.)
+        try:
+            self.type = EbpfScalarType(
+                self.hlirType, widthInBits, signed, config)
+        except CompilationException, e:
+            raise CompilationException(
+                e.isBug, "{0}.{1}: {2}", hlirParentType, self.name, e.show())
+
+    def widthInBits(self):
+        return self.width
+
+
+class EbpfStructType(EbpfType):
+    # Abstract base class for HeaderType and MetadataType.
+    # They are both represented by a p4 header_type
+    def __init__(self, hlirHeader, config):
+        super(EbpfStructType, self).__init__(hlirHeader)
+        self.name = hlirHeader.name
+        self.fields = []
+
+        # Build one EbpfField per entry in the header layout, preserving
+        # the layout's field order.
+        for (fieldName, fieldSize) in self.hlirType.layout.items():
+            attributes = self.hlirType.attributes[fieldName]
+            field = EbpfField(
+                hlirHeader, fieldName, fieldSize, attributes, config)
+            self.fields.append(field)
+
+    def serialize(self, serializer):
+        """Emits the C struct definition with one member per field."""
+        assert isinstance(serializer, ProgramSerializer)
+
+        serializer.emitIndent()
+        serializer.appendFormat("struct {0} ", self.name)
+        serializer.blockStart()
+
+        for field in self.fields:
+            serializer.emitIndent()
+            field.type.declare(serializer, field.name, False)
+            serializer.appendFormat("; /* {0} bits */", field.widthInBits())
+            serializer.newline()
+
+        serializer.blockEnd(False)
+        serializer.endOfStatement(True)
+
+    def declare(self, serializer, identifier, asPointer):
+        """Declares `identifier` as a value (or pointer) of this struct."""
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(identifier, str)
+        assert isinstance(asPointer, bool)
+
+        serializer.appendFormat("struct {0} ", self.name)
+        if asPointer:
+            serializer.append("*")
+        serializer.append(identifier)
+
+    def widthInBits(self):
+        # hlirType.length is the header length in bytes.
+        return self.hlirType.length * 8
+
+    def getField(self, name):
+        """Returns the field with this name; raises if absent."""
+        assert isinstance(name, str)
+
+        for f in self.fields:
+            assert isinstance(f, EbpfField)
+            if f.name == name:
+                return f
+        raise CompilationException(
+            True, "Could not locate field {0}.{1}", self, name)
+
+
+class EbpfHeaderType(EbpfStructType):
+    # Header struct type; adds a synthetic 1-bit "valid" field used to
+    # track whether the header was parsed from the packet.
+    def __init__(self, hlirHeader, config):
+        super(EbpfHeaderType, self).__init__(hlirHeader, config)
+        validField = EbpfField(hlirHeader, "valid", 1, set(), config)
+        # check that no "valid" field exists already
+        # (checked before validField is appended below)
+        for f in self.fields:
+            if f.name == "valid":
+                raise CompilationException(
+                    True,
+                    "Header type contains a field named `valid': {0}",
+                    f)
+        self.fields.append(validField)
+
+    def emitInitializer(self, serializer):
+        """Headers are initialized as invalid: { .valid = 0 }."""
+        assert isinstance(serializer, ProgramSerializer)
+        serializer.blockStart()
+        serializer.emitIndent()
+        serializer.appendLine(".valid = 0")
+        serializer.blockEnd(False)
+
+    def declareArray(self, serializer, identifier, size):
+        """Declares a fixed-size array of this header (header stacks)."""
+        assert isinstance(serializer, ProgramSerializer)
+        serializer.appendFormat(
+            "struct {0} {1}[{2}]", self.name, identifier, size)
+
+
+class EbpfMetadataType(EbpfStructType):
+    # Metadata struct type; unlike headers it has no "valid" field and
+    # every member is explicitly initialized.
+    def __init__(self, hlirHeader, config):
+        super(EbpfMetadataType, self).__init__(hlirHeader, config)
+
+    def emitInitializer(self, serializer):
+        """Emits a designated initializer setting every field to its
+        type's default (0 for scalars)."""
+        assert isinstance(serializer, ProgramSerializer)
+
+        serializer.blockStart()
+        for field in self.fields:
+            serializer.emitIndent()
+            serializer.appendFormat(".{0} = ", field.name)
+
+            field.type.emitInitializer(serializer)
+            serializer.append(",")
+            serializer.newline()
+        serializer.blockEnd(False)
diff --git a/src/cc/frontends/p4/compiler/ebpfTable.py b/src/cc/frontends/p4/compiler/ebpfTable.py
new file mode 100644
index 0000000..eb1efd9
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfTable.py
@@ -0,0 +1,404 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import p4_match_type, p4_field, p4_table, p4_header_instance
+from programSerializer import ProgramSerializer
+from compilationException import *
+import ebpfProgram
+import ebpfInstance
+import ebpfCounter
+import ebpfStructType
+import ebpfAction
+
+
+class EbpfTableKeyField(object):
+    def __init__(self, fieldname, instance, field, mask):
+        assert isinstance(instance, ebpfInstance.EbpfInstanceBase)
+        assert isinstance(field, ebpfStructType.EbpfField)
+
+        self.keyFieldName = fieldname
+        self.instance = instance
+        self.field = field
+        self.mask = mask
+
+    def serializeType(self, serializer):
+        assert isinstance(serializer, ProgramSerializer)
+        ftype = self.field.type
+        serializer.emitIndent()
+        ftype.declare(serializer, self.keyFieldName, False)
+        serializer.endOfStatement(True)
+
+    def serializeConstruction(self, keyName, serializer, program):
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(keyName, str)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        if self.mask is not None:
+            maskExpression = " & {0}".format(self.mask)
+        else:
+            maskExpression = ""
+
+        if isinstance(self.instance, ebpfInstance.EbpfMetadata):
+            base = program.metadataStructName
+        else:
+            base = program.headerStructName
+
+        if isinstance(self.instance, ebpfInstance.SimpleInstance):
+            source = "{0}.{1}.{2}".format(
+                base, self.instance.name, self.field.name)
+        else:
+            assert isinstance(self.instance, ebpfInstance.EbpfHeaderStack)
+            source = "{0}.{1}[{2}].{3}".format(
+                base, self.instance.name,
+                self.instance.hlirInstance.index, self.field.name)
+        destination = "{0}.{1}".format(keyName, self.keyFieldName)
+        size = self.field.widthInBits()
+
+        serializer.emitIndent()
+        if size <= 32:
+            serializer.appendFormat("{0} = ({1}){2};",
+                                    destination, source, maskExpression)
+        else:
+            if maskExpression != "":
+                raise NotSupportedException(
+                    "{0} Mask wider than 32 bits", self.field.hlirType)
+            serializer.appendFormat(
+                "memcpy(&{0}, &{1}, {2});", destination, source, size / 8)
+
+        serializer.newline()
+
+
+class EbpfTableKey(object):
+    def __init__(self, match_fields, program):
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        self.expressions = []
+        self.fields = []
+        self.masks = []
+        self.fieldNamePrefix = "key_field_"
+        self.program = program
+
+        fieldNumber = 0
+        for f in match_fields:
+            field = f[0]
+            matchType = f[1]
+            mask = f[2]
+
+            if ((matchType is p4_match_type.P4_MATCH_TERNARY) or
+                (matchType is p4_match_type.P4_MATCH_LPM) or
+                (matchType is p4_match_type.P4_MATCH_RANGE)):
+                raise NotSupportedException(
+                    False, "Match type {0}", matchType)
+
+            if matchType is p4_match_type.P4_MATCH_VALID:
+                # we should really be checking the valid field;
+                # p4_field is a header instance
+                assert isinstance(field, p4_header_instance)
+                instance = field
+                fieldname = "valid"
+            else:
+                assert isinstance(field, p4_field)
+                instance = field.instance
+                fieldname = field.name
+
+            if ebpfProgram.EbpfProgram.isArrayElementInstance(instance):
+                ebpfStack = program.getStackInstance(instance.base_name)
+                assert isinstance(ebpfStack, ebpfInstance.EbpfHeaderStack)
+                basetype = ebpfStack.basetype
+                eInstance = program.getStackInstance(instance.base_name)
+            else:
+                ebpfHeader = program.getInstance(instance.name)
+                assert isinstance(ebpfHeader, ebpfInstance.SimpleInstance)
+                basetype = ebpfHeader.type
+                eInstance = program.getInstance(instance.base_name)
+
+            ebpfField = basetype.getField(fieldname)
+            assert isinstance(ebpfField, ebpfStructType.EbpfField)
+
+            fieldName = self.fieldNamePrefix + str(fieldNumber)
+            fieldNumber += 1
+            keyField = EbpfTableKeyField(fieldName, eInstance, ebpfField, mask)
+
+            self.fields.append(keyField)
+            self.masks.append(mask)
+
+    @staticmethod
+    def fieldRank(field):
+        assert isinstance(field, EbpfTableKeyField)
+        return field.field.type.alignment()
+
+    def serializeType(self, serializer, keyTypeName):
+        assert isinstance(serializer, ProgramSerializer)
+        serializer.emitIndent()
+        serializer.appendFormat("struct {0} ", keyTypeName)
+        serializer.blockStart()
+
+        # Sort fields in decreasing size; this will ensure that
+        # there is no padding.
+        # Padding may cause the ebpf verification to fail,
+        # since padding fields are not initialized
+        fieldOrder = sorted(
+            self.fields, key=EbpfTableKey.fieldRank, reverse=True)
+        for f in fieldOrder:
+            assert isinstance(f, EbpfTableKeyField)
+            f.serializeType(serializer)
+
+        serializer.blockEnd(False)
+        serializer.endOfStatement(True)
+
+    def serializeConstruction(self, serializer, keyName, program):
+        serializer.emitIndent()
+        serializer.appendLine("/* construct key */")
+
+        for f in self.fields:
+            f.serializeConstruction(keyName, serializer, program)
+
+
+class EbpfTable(object):
+    # noinspection PyUnresolvedReferences
+    def __init__(self, hlirtable, program, config):
+        assert isinstance(hlirtable, p4_table)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        self.name = hlirtable.name
+        self.hlirtable = hlirtable
+        self.config = config
+
+        self.defaultActionMapName = (program.reservedPrefix +
+                                     self.name + "_miss")
+        self.key = EbpfTableKey(hlirtable.match_fields, program)
+        self.size = hlirtable.max_size
+        if self.size is None:
+            program.emitWarning(
+                "{0} does not specify a max_size; using 1024", hlirtable)
+            self.size = 1024
+        self.isHash = True  # TODO: try to guess arrays when possible
+        self.dataMapName = self.name
+        self.actionEnumName = program.generateNewName(self.name + "_actions")
+        self.keyTypeName = program.generateNewName(self.name + "_key")
+        self.valueTypeName = program.generateNewName(self.name + "_value")
+        self.actions = []
+
+        if hlirtable.action_profile is not None:
+            raise NotSupportedException("{0}: action_profile tables",
+                                        hlirtable)
+        if hlirtable.support_timeout:
+            program.emitWarning("{0}: table timeout {1}; ignoring",
+                                hlirtable, NotSupportedException.archError)
+
+        self.counters = []
+        if (hlirtable.attached_counters is not None):
+            for c in hlirtable.attached_counters:
+                ctr = program.getCounter(c.name)
+                assert isinstance(ctr, ebpfCounter.EbpfCounter)
+                self.counters.append(ctr)
+
+        if (len(hlirtable.attached_meters) > 0 or
+            len(hlirtable.attached_registers) > 0):
+            program.emitWarning("{0}: meters/registers {1}; ignored",
+                                hlirtable, NotSupportedException.archError)
+
+        for a in hlirtable.actions:
+            action = program.getAction(a)
+            self.actions.append(action)
+
+    def serializeKeyType(self, serializer):
+        assert isinstance(serializer, ProgramSerializer)
+        self.key.serializeType(serializer, self.keyTypeName)
+
+    def serializeActionArguments(self, serializer, action):
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(action, ebpfAction.EbpfActionBase)
+        action.serializeArgumentsAsStruct(serializer)
+
+    def serializeValueType(self, serializer):
+        assert isinstance(serializer, ProgramSerializer)
+        #  create an enum with tags for all actions
+        serializer.emitIndent()
+        serializer.appendFormat("enum {0} ", self.actionEnumName)
+        serializer.blockStart()
+
+        for a in self.actions:
+            name = a.name
+            serializer.emitIndent()
+            serializer.appendFormat("{0}_{1},", self.name, name)
+            serializer.newline()
+
+        serializer.blockEnd(False)
+        serializer.endOfStatement(True)
+
+        # a type-safe union: a struct with a tag and a union
+        serializer.emitIndent()
+        serializer.appendFormat("struct {0} ", self.valueTypeName)
+        serializer.blockStart()
+
+        serializer.emitIndent()
+        #serializer.appendFormat("enum {0} action;", self.actionEnumName)
+        # temporary workaround for a bcc bug
+        serializer.appendFormat("{0}32 action;",
+                                self.config.uprefix)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.append("union ")
+        serializer.blockStart()
+
+        for a in self.actions:
+            self.serializeActionArguments(serializer, a)
+
+        serializer.blockEnd(False)
+        serializer.space()
+        serializer.appendLine("u;")
+        serializer.blockEnd(False)
+        serializer.endOfStatement(True)
+
+    def serialize(self, serializer, program):
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        self.serializeKeyType(serializer)
+        self.serializeValueType(serializer)
+
+        self.config.serializeTableDeclaration(
+            serializer, self.dataMapName, self.isHash,
+            "struct " + self.keyTypeName,
+            "struct " + self.valueTypeName, self.size)
+        self.config.serializeTableDeclaration(
+            serializer, self.defaultActionMapName, False,
+            program.arrayIndexType, "struct " + self.valueTypeName, 1)
+
+    def serializeCode(self, serializer, program, nextNode):
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(program, ebpfProgram.EbpfProgram)
+
+        hitVarName = program.reservedPrefix + "hit"
+        keyname = "key"
+        valueName = "value"
+
+        serializer.newline()
+        serializer.emitIndent()
+        serializer.appendFormat("{0}:", program.getLabel(self))
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.blockStart()
+
+        serializer.emitIndent()
+        serializer.appendFormat("{0}8 {1};", program.config.uprefix, hitVarName)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendFormat("struct {0} {1} = {{}};", self.keyTypeName, keyname)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendFormat(
+            "struct {0} *{1};", self.valueTypeName, valueName)
+        serializer.newline()
+
+        self.key.serializeConstruction(serializer, keyname, program)
+
+        serializer.emitIndent()
+        serializer.appendFormat("{0} = 1;", hitVarName)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendLine("/* perform lookup */")
+        serializer.emitIndent()
+        program.config.serializeLookup(
+            serializer, self.dataMapName, keyname, valueName)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendFormat("if ({0} == NULL) ", valueName)
+        serializer.blockStart()
+
+        serializer.emitIndent()
+        serializer.appendFormat("{0} = 0;", hitVarName)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendLine("/* miss; find default action */")
+        serializer.emitIndent()
+        program.config.serializeLookup(
+            serializer, self.defaultActionMapName,
+            program.zeroKeyName, valueName)
+        serializer.newline()
+        serializer.blockEnd(True)
+
+        if len(self.counters) > 0:
+            serializer.emitIndent()
+            serializer.append("else ")
+            serializer.blockStart()
+            for c in self.counters:
+                assert isinstance(c, ebpfCounter.EbpfCounter)
+                if c.autoIncrement:
+                    serializer.emitIndent()
+                    serializer.blockStart()
+                    c.serializeCode(keyname, serializer, program)
+                    serializer.blockEnd(True)
+            serializer.blockEnd(True)
+
+        serializer.emitIndent()
+        serializer.appendFormat("if ({0} != NULL) ", valueName)
+        serializer.blockStart()
+        serializer.emitIndent()
+        serializer.appendLine("/* run action */")
+        self.runAction(serializer, self.name, valueName, program, nextNode)
+
+        nextNode = self.hlirtable.next_
+        if "hit" in nextNode:
+            node = nextNode["hit"]
+            if node is None:
+                node = nextNode
+            label = program.getLabel(node)
+            serializer.emitIndent()
+            serializer.appendFormat("if (hit) goto {0};", label)
+            serializer.newline()
+
+            node = nextNode["miss"]
+            if node is None:
+                node = nextNode
+            label = program.getLabel(node)
+            serializer.emitIndent()
+            serializer.appendFormat("else goto {0};", label)
+            serializer.newline()
+
+        serializer.blockEnd(True)
+        if not "hit" in nextNode:
+            # Catch-all
+            serializer.emitIndent()
+            serializer.appendFormat("goto end;")
+            serializer.newline()
+
+        serializer.blockEnd(True)
+
+    def runAction(self, serializer, tableName, valueName, program, nextNode):
+        serializer.emitIndent()
+        serializer.appendFormat("switch ({0}->action) ", valueName)
+        serializer.blockStart()
+
+        for a in self.actions:
+            assert isinstance(a, ebpfAction.EbpfActionBase)
+
+            serializer.emitIndent()
+            serializer.appendFormat("case {0}_{1}: ", tableName, a.name)
+            serializer.newline()
+            serializer.emitIndent()
+            serializer.blockStart()
+            a.serializeBody(serializer, valueName, program)
+            serializer.blockEnd(True)
+            serializer.emitIndent()
+
+            nextNodes = self.hlirtable.next_
+            if a.hliraction in nextNodes:
+                node = nextNodes[a.hliraction]
+                if node is None:
+                    node = nextNode
+                label = program.getLabel(node)
+                serializer.appendFormat("goto {0};", label)
+            else:
+                serializer.appendFormat("break;")
+            serializer.newline()
+
+        serializer.blockEnd(True)
diff --git a/src/cc/frontends/p4/compiler/ebpfType.py b/src/cc/frontends/p4/compiler/ebpfType.py
new file mode 100644
index 0000000..a652097
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/ebpfType.py
@@ -0,0 +1,30 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from compilationException import CompilationException
+
+class EbpfType(object):
+    __doc__ = "Base class for representing a P4 type"
+
+    def __init__(self, hlirType):
+        self.hlirType = hlirType
+
+    # Methods to override
+
+    def serialize(self, serializer):
+        # the type itself
+        raise CompilationException(True, "Method must be overridden")
+
+    def declare(self, serializer, identifier, asPointer):
+        # declaration of an identifier with this type
+        # asPointer is a boolean;
+        # if true, the identifier is declared as a pointer
+        raise CompilationException(True, "Method must be overridden")
+
+    def emitInitializer(self, serializer):
+        # A default initializer suitable for this type
+        raise CompilationException(True, "Method must be overridden")
+
+    def declareArray(self, serializer, identifier, size):
+        # Declare an identifier with an array type with the specified size
+        raise CompilationException(True, "Method must be overridden")
diff --git a/src/cc/frontends/p4/compiler/p4toEbpf.py b/src/cc/frontends/p4/compiler/p4toEbpf.py
new file mode 100755
index 0000000..7a5bc42
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/p4toEbpf.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# Compiler from P4 to EBPF
+# (See http://www.slideshare.net/PLUMgrid/ebpf-and-linux-networking).
+# This compiler in fact generates a C source file
+# which can be compiled to EBPF using the LLVM compiler
+# with the ebpf target.
+#
+# Main entry point.
+
+import argparse
+import os
+import traceback
+import sys
+import target
+from p4_hlir.main import HLIR
+from ebpfProgram import EbpfProgram
+from compilationException import *
+from programSerializer import ProgramSerializer
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description='p4toEbpf arguments')
+    parser.add_argument('source', metavar='source', type=str,
+                        help='a P4 source file to compile')
+    parser.add_argument('-g', dest='generated', default="router",
+                        help="kind of output produced: filter or router")
+    parser.add_argument('-o', dest='output_file', default="output.c",
+                        help="generated C file name")
+    return parser
+
+
+def process(input_args):
+    parser = get_parser()
+    args, unparsed_args = parser.parse_known_args(input_args)
+
+    has_remaining_args = False
+    preprocessor_args = []
+    for a in unparsed_args:
+        if a[:2] == "-D" or a[:2] == "-I" or a[:2] == "-U":
+            input_args.remove(a)
+            preprocessor_args.append(a)
+        else:
+            has_remaining_args = True
+
+    # trigger error
+    if has_remaining_args:
+        parser.parse_args(input_args)
+
+    if args.generated == "router":
+        isRouter = True
+    elif args.generated == "filter":
+        isRouter = False
+    else:
+        print("-g should be one of 'filter' or 'router'")
+
+    print("*** Compiling ", args.source)
+    return compileP4(args.source, args.output_file, isRouter, preprocessor_args)
+
+
+class CompileResult(object):
+    def __init__(self, kind, error):
+        self.kind = kind
+        self.error = error
+
+    def __str__(self):
+        if self.kind == "OK":
+            return "Compilation successful"
+        else:
+            return "Compilation failed with error: " + self.error
+
+
+def compileP4(inputFile, gen_file, isRouter, preprocessor_args):
+    h = HLIR(inputFile)
+
+    for parg in preprocessor_args:
+        h.add_preprocessor_args(parg)
+    if not h.build():
+        return CompileResult("HLIR", "Error while building HLIR")
+
+    try:
+        basename = os.path.basename(inputFile)
+        basename = os.path.splitext(basename)[0]
+
+        config = target.BccConfig()
+        e = EbpfProgram(basename, h, isRouter, config)
+        serializer = ProgramSerializer()
+        e.toC(serializer)
+        f = open(gen_file, 'w')
+        f.write(serializer.toString())
+        return CompileResult("OK", "")
+    except CompilationException, e:
+        prefix = ""
+        if e.isBug:
+            prefix = "### Compiler bug: "
+        return CompileResult("bug", prefix + e.show())
+    except NotSupportedException, e:
+        return CompileResult("not supported", e.show())
+    except:
+        return CompileResult("exception", traceback.format_exc())
+
+
+# main entry point
+if __name__ == "__main__":
+    result = process(sys.argv[1:])
+    if result.kind != "OK":
+        print(str(result))
diff --git a/src/cc/frontends/p4/compiler/programSerializer.py b/src/cc/frontends/p4/compiler/programSerializer.py
new file mode 100644
index 0000000..651e019
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/programSerializer.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+# helper for building C program source text
+
+from compilationException import *
+
+
+class ProgramSerializer(object):
+    def __init__(self):
+        self.program = ""
+        self.eol = "\n"
+        self.currentIndent = 0
+        self.INDENT_AMOUNT = 4  # default indent amount
+
+    def __str__(self):
+        return self.program
+
+    def increaseIndent(self):
+        self.currentIndent += self.INDENT_AMOUNT
+
+    def decreaseIndent(self):
+        self.currentIndent -= self.INDENT_AMOUNT
+        if self.currentIndent < 0:
+            raise CompilationException(True, "Negative indentation level")
+
+    def toString(self):
+        return self.program
+
+    def space(self):
+        self.append(" ")
+
+    def newline(self):
+        self.program += self.eol
+
+    def endOfStatement(self, addNewline):
+        self.append(";")
+        if addNewline:
+            self.newline()
+
+    def append(self, string):
+        self.program += str(string)
+
+    def appendFormat(self, format, *args):
+        string = format.format(*args)
+        self.append(string)
+
+    def appendLine(self, string):
+        self.append(string)
+        self.newline()
+
+    def emitIndent(self):
+        self.program += " " * self.currentIndent
+
+    def blockStart(self):
+        self.append("{")
+        self.newline()
+        self.increaseIndent()
+
+    def blockEnd(self, addNewline):
+        self.decreaseIndent()
+        self.emitIndent()
+        self.append("}")
+        if addNewline:
+            self.newline()
diff --git a/src/cc/frontends/p4/compiler/target.py b/src/cc/frontends/p4/compiler/target.py
new file mode 100644
index 0000000..6124dff
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/target.py
@@ -0,0 +1,171 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from programSerializer import ProgramSerializer
+
+# abstraction for isolating target-specific features
+
+# Base class for representing target-specific configuration
+class TargetConfig(object):
+    def __init__(self, target):
+        self.targetName = target
+
+    def getIncludes(self):
+        return ""
+
+    def serializeLookup(self, serializer, tableName, key, value):
+        serializer.appendFormat("{0} = bpf_map_lookup_elem(&{1}, &{2});",
+                                value, tableName, key)
+
+    def serializeUpdate(self, serializer, tableName, key, value):
+        serializer.appendFormat(
+            "bpf_map_update_elem(&{0}, &{1}, &{2}, BPF_ANY);",
+            tableName, key, value)
+
+    def serializeLicense(self, serializer, licenseString):
+        assert isinstance(serializer, ProgramSerializer)
+        serializer.emitIndent()
+        serializer.appendFormat(
+            "char _license[] {0}(\"license\") = \"{1}\";",
+            self.section, licenseString)
+        serializer.newline()
+
+    def serializeCodeSection(self, serializer):
+        assert isinstance(serializer, ProgramSerializer)
+        serializer.appendFormat("{0}(\"{1}\")", self.section, self.entrySection)
+
+    def serializeTableDeclaration(self, serializer, tableName,
+                                  isHash, keyType, valueType, size):
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(tableName, str)
+        assert isinstance(isHash, bool)
+        assert isinstance(keyType, str)
+        assert isinstance(valueType, str)
+        assert isinstance(size, int)
+
+        serializer.emitIndent()
+        serializer.appendFormat("struct {0} {1}(\"maps\") {2} = ",
+                                self.tableName, self.section, tableName)
+        serializer.blockStart()
+
+        serializer.emitIndent()
+        serializer.append(".type = ")
+        if isHash:
+            serializer.appendLine("BPF_MAP_TYPE_HASH,")
+        else:
+            serializer.appendLine("BPF_MAP_TYPE_ARRAY,")
+
+        serializer.emitIndent()
+        serializer.appendFormat(".{0} = sizeof(struct {1}), ",
+                                self.tableKeyAttribute, keyType)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendFormat(".{0} = sizeof(struct {1}), ",
+                                self.tableValueAttribute, valueType)
+        serializer.newline()
+
+        serializer.emitIndent()
+        serializer.appendFormat(".{0} = {1}, ", self.tableSizeAttribute, size)
+        serializer.newline()
+
+        serializer.blockEnd(False)
+        serializer.endOfStatement(True)
+
+    def generateDword(self, serializer):
+        serializer.appendFormat(
+            "static inline {0}64 load_dword(void *skb, {0}64 off)",
+            self.uprefix)
+        serializer.newline()
+        serializer.blockStart()
+        serializer.emitIndent()
+        serializer.appendFormat(
+            ("return (({0}64)load_word(skb, off) << 32) | " +
+             "load_word(skb, off + 4);"),
+            self.uprefix)
+        serializer.newline()
+        serializer.blockEnd(True)
+
+
+# Represents a target that is compiled within the kernel
+# source tree samples folder and which attaches to a socket
+class KernelSamplesConfig(TargetConfig):
+    def __init__(self):
+        super(KernelSamplesConfig, self).__init__("Socket")
+        self.entrySection = "socket1"
+        self.section = "SEC"
+        self.uprefix = "u"
+        self.iprefix = "i"
+        self.tableKeyAttribute = "key_size"
+        self.tableValueAttribute = "value_size"
+        self.tableSizeAttribute = "max_entries"
+        self.tableName = "bpf_map_def"
+        self.postamble = ""
+
+    def getIncludes(self):
+        return """
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include "bpf_helpers.h"
+"""
+
+
+# Represents a target compiled by bcc that uses the TC
+class BccConfig(TargetConfig):
+    def __init__(self):
+        super(BccConfig, self).__init__("BCC")
+        self.uprefix = "u"
+        self.iprefix = "i"
+        self.postamble = ""
+
+    def serializeTableDeclaration(self, serializer, tableName,
+                                  isHash, keyType, valueType, size):
+        assert isinstance(serializer, ProgramSerializer)
+        assert isinstance(tableName, str)
+        assert isinstance(isHash, bool)
+        assert isinstance(keyType, str)
+        assert isinstance(valueType, str)
+        assert isinstance(size, int)
+
+        serializer.emitIndent()
+        if isHash:
+            kind = "hash"
+        else:
+            kind = "array"
+        serializer.appendFormat(
+            "BPF_TABLE(\"{0}\", {1}, {2}, {3}, {4});",
+            kind, keyType, valueType, tableName, size)
+        serializer.newline()
+
+    def serializeLookup(self, serializer, tableName, key, value):
+        serializer.appendFormat("{0} = {1}.lookup(&{2});",
+                                value, tableName, key)
+
+    def serializeUpdate(self, serializer, tableName, key, value):
+        serializer.appendFormat("{0}.update(&{1}, &{2});",
+                                tableName, key, value)
+
+    def generateDword(self, serializer):
+        pass
+
+    def serializeCodeSection(self, serializer):
+        pass
+
+    def getIncludes(self):
+        return """
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/pkt_cls.h>
+"""
+
+    def serializeLicense(self, serializer, licenseString):
+        assert isinstance(serializer, ProgramSerializer)
+        pass
diff --git a/src/cc/frontends/p4/compiler/topoSorting.py b/src/cc/frontends/p4/compiler/topoSorting.py
new file mode 100644
index 0000000..21daba3
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/topoSorting.py
@@ -0,0 +1,89 @@
+# Copyright 2013-present Barefoot Networks, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Antonin Bas (antonin@barefootnetworks.com)
+#
+#
+
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function
+
+class Node(object):
+    def __init__(self, n):
+        self.n = n
+        self.edges = set()
+
+    def add_edge_to(self, other):
+        assert(isinstance(other, Node))
+        self.edges.add(other)
+
+    def __str__(self):
+        return str(self.n)
+
+
+class Graph(object):
+    def __init__(self):
+        self.nodes = {}
+        self.root = None
+
+    def add_node(self, node):
+        assert(node not in self.nodes)
+        self.nodes[node] = Node(node)
+
+    def __contains__(self, node):
+        return node in self.nodes
+
+    def get_node(self, node):
+        return self.nodes[node]
+
+    def produce_topo_sorting(self):
+        def visit(node, topo_sorting, sequence=None):
+            if sequence is not None:
+                sequence += [str(node)]
+            if node._behavioral_topo_sorting_mark == 1:
+                if sequence is not None:
+                    print("cycle", sequence)
+                return False
+            if node._behavioral_topo_sorting_mark != 2:
+                node._behavioral_topo_sorting_mark = 1
+                for next_node in node.edges:
+                    res = visit(next_node, topo_sorting, sequence)
+                    if not res:
+                        return False
+                node._behavioral_topo_sorting_mark = 2
+                topo_sorting.insert(0, node.n)
+            return True
+
+        has_cycle = False
+        topo_sorting = []
+
+        for node in self.nodes.values():
+            # 0 is unmarked, 1 is temp, 2 is permanent
+            node._behavioral_topo_sorting_mark = 0
+        for node in self.nodes.values():
+            if node._behavioral_topo_sorting_mark == 0:
+                if not visit(node, topo_sorting, sequence=[]):
+                    has_cycle = True
+                    break
+        # removing mark
+        for node in self.nodes.values():
+            del node._behavioral_topo_sorting_mark
+
+        if has_cycle:
+            return None
+
+        return topo_sorting
diff --git a/src/cc/frontends/p4/compiler/typeFactory.py b/src/cc/frontends/p4/compiler/typeFactory.py
new file mode 100644
index 0000000..71a0207
--- /dev/null
+++ b/src/cc/frontends/p4/compiler/typeFactory.py
@@ -0,0 +1,33 @@
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from p4_hlir.hlir import p4_header
+from ebpfStructType import *
+
+class EbpfTypeFactory(object):
+    def __init__(self, config):
+        self.type_map = {}
+        self.config = config
+
+    def build(self, hlirType, asMetadata):
+        name = hlirType.name
+        if hlirType.name in self.type_map:
+            retval = self.type_map[name]
+            if ((not asMetadata and isinstance(retval, EbpfMetadataType)) or
+                (asMetadata and isinstance(retval, EbpfHeaderType))):
+                raise CompilationException(
+                    True, "Same type used both as a header and metadata {0}",
+                    hlirType)
+
+        if isinstance(hlirType, p4_header):
+            if asMetadata:
+                type = EbpfMetadataType(hlirType, self.config)
+            else:
+                type = EbpfHeaderType(hlirType, self.config)
+        else:
+            raise CompilationException(True, "Unexpected type {0}", hlirType)
+        self.registerType(name, type)
+        return type
+
+    def registerType(self, name, ebpfType):
+        self.type_map[name] = ebpfType
diff --git a/src/cc/frontends/p4/docs/README.md b/src/cc/frontends/p4/docs/README.md
new file mode 100644
index 0000000..5f94933
--- /dev/null
+++ b/src/cc/frontends/p4/docs/README.md
@@ -0,0 +1,3 @@
+# External references
+
+See [p4toEbpf-bcc.pdf](https://github.com/iovisor/bpf-docs/blob/master/p4/p4toEbpf-bcc.pdf)
diff --git a/src/cc/frontends/p4/scope.png b/src/cc/frontends/p4/scope.png
new file mode 100644
index 0000000..585f8cf
--- /dev/null
+++ b/src/cc/frontends/p4/scope.png
Binary files differ
diff --git a/src/cc/frontends/p4/test/README.txt b/src/cc/frontends/p4/test/README.txt
new file mode 100644
index 0000000..9aace16
--- /dev/null
+++ b/src/cc/frontends/p4/test/README.txt
@@ -0,0 +1,16 @@
+This folder contains tests for the P4->C->EBPF compiler
+
+- cleanup.sh should be run if for some reason endToEndTest.py crashes
+  and leaves garbage namespaces or links
+
+- testP4toEbpf.py compiles all P4 files in the testprograms folder and
+  deposits the corresponding C files in the testoutputs folder
+
+- endToEndTest.py runs a complete end-to-end test compiling the
+  testprograms/simple.p4 program, creating a virtual network with 3
+  boxes (using network namespaces): client, server, switch, loading
+  the EBPF into the kernel of the switch box using the TC, and
+  implementing the forwarding in the switch solely using the P4
+  program.
+  
+  
diff --git a/src/cc/frontends/p4/test/cleanup.sh b/src/cc/frontends/p4/test/cleanup.sh
new file mode 100755
index 0000000..0c14387
--- /dev/null
+++ b/src/cc/frontends/p4/test/cleanup.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Run this script if for some reason the endToEndTest.py crashed
+# and left some garbage state
+
+ip netns del sw
+ip netns del srv
+ip netns del clt
+
+ip link del dev veth-clt-sw
+ip link del dev veth-srv-sw
+
diff --git a/src/cc/frontends/p4/test/endToEndTest.py b/src/cc/frontends/p4/test/endToEndTest.py
new file mode 100755
index 0000000..634a2ec
--- /dev/null
+++ b/src/cc/frontends/p4/test/endToEndTest.py
@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# Testing example for P4->EBPF compiler
+#
+# This program exercises the simple.c EBPF program
+# generated from the simple.p4 source file.
+
+import subprocess
+import ctypes
+import time
+import sys
+import os
+from bcc import BPF
+from pyroute2 import IPRoute, NSPopen, NetNS
+from netaddr import IPAddress
+
+### This part is a simple generic network simulation toolkit
+
+class Base(object):
+    def __init__(self):
+        self.verbose = True
+
+    def message(self, *args):
+        if self.verbose:
+            print(*args)
+
+
+class Endpoint(Base):
+    # a network interface really
+    def __init__(self, ipaddress, ethaddress):
+        Base.__init__(self)
+        self.mac_addr = ethaddress
+        self.ipaddress = ipaddress
+        self.prefixlen = 24
+        self.parent = None
+
+    def __str__(self):
+        return "Endpoint " + str(self.ipaddress)
+
+    def set_parent(self, parent):
+        assert isinstance(parent, Node)
+        self.parent = parent
+
+    def get_ip_address(self):
+        return IPAddress(self.ipaddress)
+
+
+class Node(Base):
+    # Used to represent one of clt, sw, srv
+    # Each lives in its own namespace
+    def __init__(self, name):
+        Base.__init__(self)
+        self.name = name
+        self.endpoints = []
+        self.get_ns()  # as a side-effect creates namespace
+
+    def add_endpoint(self, endpoint):
+        assert isinstance(endpoint, Endpoint)
+        self.endpoints.append(endpoint)
+        endpoint.set_parent(self)
+
+    def __str__(self):
+        return "Node " + self.name
+
+    def get_ns_name(self):
+        return self.name
+
+    def get_ns(self):
+        nsname = self.get_ns_name()
+        ns = NetNS(nsname)
+        return ns
+
+    def remove(self):
+        ns = self.get_ns();
+        ns.close()
+        ns.remove()
+
+    def execute(self, command):
+        # Run a command in the node's namespace
+        # Return the command's exit code
+        self.message(self.name, "Executing", command)
+        nsn = self.get_ns_name()
+        pipe = NSPopen(nsn, command)
+        result = pipe.wait()
+        pipe.release()
+        return result
+
+    def set_arp(self, destination):
+        assert isinstance(destination, Endpoint)
+        command = ["arp", "-s", str(destination.ipaddress),
+                   str(destination.mac_addr)]
+        self.execute(command)
+
+
+class NetworkBase(Base):
+    def __init__(self):
+        Base.__init__(self)
+        self.ipr = IPRoute()
+        self.nodes = []
+
+    def add_node(self, node):
+        assert isinstance(node, Node)
+        self.nodes.append(node)
+
+    def get_interface_name(self, source, dest):
+        assert isinstance(source, Node)
+        assert isinstance(dest, Node)
+        interface_name = "veth-" + source.name + "-" + dest.name
+        return interface_name
+
+    def get_interface(self, ifname):
+        interfaces = self.ipr.link_lookup(ifname=ifname)
+        if len(interfaces) != 1:
+            raise Exception("Could not identify interface " + ifname)
+        ix = interfaces[0]
+        assert isinstance(ix, int)
+        return ix
+
+    def set_interface_ipaddress(self, node, ifname, address, mask):
+        # Ask a node to set the specified interface address
+        if address is None:
+            return
+
+        assert isinstance(node, Node)
+        command = ["ip", "addr", "add", str(address) + "/" + str(mask),
+                   "dev", str(ifname)]
+        result = node.execute(command)
+        assert(result == 0)
+
+    def create_link(self, src, dest):
+        assert isinstance(src, Endpoint)
+        assert isinstance(dest, Endpoint)
+
+        ifname = self.get_interface_name(src.parent, dest.parent)
+        destname = self.get_interface_name(dest.parent, src.parent)
+        self.ipr.link_create(ifname=ifname, kind="veth", peer=destname)
+
+        self.message("Create", ifname, "link")
+
+        # Set source endpoint information
+        ix = self.get_interface(ifname)
+        self.ipr.link("set", index=ix, address=src.mac_addr)
+        # push source endpoint into source namespace
+        self.ipr.link("set", index=ix,
+                      net_ns_fd=src.parent.get_ns_name(), state="up")
+        # Set interface ip address; seems to be
+        # lost if set prior to moving to namespace
+        self.set_interface_ipaddress(
+            src.parent, ifname, src.ipaddress , src.prefixlen)
+
+        # Set destination endpoint information
+        ix = self.get_interface(destname)
+        self.ipr.link("set", index=ix, address=dest.mac_addr)
+        # push destination endpoint into the destination namespace
+        self.ipr.link("set", index=ix,
+                      net_ns_fd=dest.parent.get_ns_name(), state="up")
+        # Set interface ip address
+        self.set_interface_ipaddress(dest.parent, destname,
+                                     dest.ipaddress, dest.prefixlen)
+
+    def show_interfaces(self, node):
+        cmd = ["ip", "addr"]
+        if node is None:
+            # Run with no namespace
+            subprocess.call(cmd)
+        else:
+            # Run in node's namespace
+            assert isinstance(node, Node)
+            self.message("Enumerating all interfaces in ", node.name)
+            node.execute(cmd)
+
+    def delete(self):
+        self.message("Deleting virtual network")
+        for n in self.nodes:
+            n.remove()
+        self.ipr.close()
+
+
+### Here begins the concrete instantiation of the network
+# Network setup:
+# Each of these is a separate namespace.
+#
+#                        62:ce:1b:48:3e:61          a2:59:94:cf:51:09
+#      96:a4:85:fe:2a:11           62:ce:1b:48:3e:60
+#              /------------------\     /-----------------\
+#      ----------                 --------                ---------
+#      |  clt   |                 |  sw  |                |  srv  |
+#      ----------                 --------                ---------
+#       10.0.0.11                                         10.0.0.10
+#
+
+class SimulatedNetwork(NetworkBase):
+    def __init__(self):
+        NetworkBase.__init__(self)
+
+        self.client = Node("clt")
+        self.add_node(self.client)
+        self.client_endpoint = Endpoint("10.0.0.11", "96:a4:85:fe:2a:11")
+        self.client.add_endpoint(self.client_endpoint)
+
+        self.server = Node("srv")
+        self.add_node(self.server)
+        self.server_endpoint = Endpoint("10.0.0.10", "a2:59:94:cf:51:09")
+        self.server.add_endpoint(self.server_endpoint)
+
+        self.switch = Node("sw")
+        self.add_node(self.switch)
+        self.sw_clt_endpoint = Endpoint(None, "62:ce:1b:48:3e:61")
+        self.sw_srv_endpoint = Endpoint(None, "62:ce:1b:48:3e:60")
+        self.switch.add_endpoint(self.sw_clt_endpoint)
+        self.switch.add_endpoint(self.sw_srv_endpoint)
+
+    def run_method_in_node(self, node, method, args):
+        # run a method of the SimulatedNetwork class in a different namespace
+        # return the exit code
+        assert isinstance(node, Node)
+        assert isinstance(args, list)
+        torun = __file__
+        args.insert(0, torun)
+        args.insert(1, method)
+        return node.execute(args)  # runs the command argv[0] method args
+
+    def instantiate(self):
+        # Creates the various namespaces
+        self.message("Creating virtual network")
+
+        self.message("Create client-switch link")
+        self.create_link(self.client_endpoint, self.sw_clt_endpoint)
+
+        self.message("Create server-switch link")
+        self.create_link(self.server_endpoint, self.sw_srv_endpoint)
+
+        self.show_interfaces(self.client)
+        self.show_interfaces(self.server)
+        self.show_interfaces(self.switch)
+
+        self.message("Set ARP mappings")
+        self.client.set_arp(self.server_endpoint)
+        self.server.set_arp(self.client_endpoint)
+
+    def setup_switch(self):
+        # This method is run in the switch namespace.
+        self.message("Compiling and loading BPF program")
+
+        b = BPF(src_file="./simple.c", debug=0)
+        fn = b.load_func("ebpf_filter", BPF.SCHED_CLS)
+
+        self.message("BPF program loaded")
+
+        self.message("Discovering tables")
+        routing_tbl = b.get_table("routing")
+        routing_miss_tbl = b.get_table("ebpf_routing_miss")
+        cnt_tbl = b.get_table("cnt")
+
+        self.message("Hooking up BPF classifiers using TC")
+
+        interfname = self.get_interface_name(self.switch, self.server)
+        sw_srv_idx = self.get_interface(interfname)
+        self.ipr.tc("add", "ingress", sw_srv_idx, "ffff:")
+        self.ipr.tc("add-filter", "bpf", sw_srv_idx, ":1", fd=fn.fd,
+                    name=fn.name, parent="ffff:", action="ok", classid=1)
+
+        interfname = self.get_interface_name(self.switch, self.client)
+        sw_clt_idx = self.get_interface(interfname)
+        self.ipr.tc("add", "ingress", sw_clt_idx, "ffff:")
+        self.ipr.tc("add-filter", "bpf", sw_clt_idx, ":1", fd=fn.fd,
+                    name=fn.name, parent="ffff:", action="ok", classid=1)
+
+        self.message("Populating tables from the control plane")
+        cltip = self.client_endpoint.get_ip_address()
+        srvip = self.server_endpoint.get_ip_address()
+
+        # BCC does not support tbl.Leaf when the type contains a union,
+        # so we have to make up the value type manually.  Unfortunately
+        # these sizes are not portable...
+
+        class Forward(ctypes.Structure):
+            _fields_ = [("port", ctypes.c_ushort)]
+
+        class Nop(ctypes.Structure):
+            _fields_ = []
+
+        class Union(ctypes.Union):
+            _fields_ = [("nop", Nop),
+                        ("forward", Forward)]
+
+        class Value(ctypes.Structure):
+            _fields_ = [("action", ctypes.c_uint),
+                        ("u", Union)]
+
+        if False:
+            # This is how it should ideally be done, but it does not work
+            routing_tbl[routing_tbl.Key(int(cltip))] = routing_tbl.Leaf(
+                1, sw_clt_idx)
+            routing_tbl[routing_tbl.Key(int(srvip))] = routing_tbl.Leaf(
+                1, sw_srv_idx)
+        else:
+            v1 = Value()
+            v1.action = 1
+            v1.u.forward.port = sw_clt_idx
+
+            v2 = Value()
+            v2.action = 1;
+            v2.u.forward.port = sw_srv_idx
+
+            routing_tbl[routing_tbl.Key(int(cltip))] = v1
+            routing_tbl[routing_tbl.Key(int(srvip))] = v2
+
+        self.message("Dumping table contents")
+        for key, leaf in routing_tbl.items():
+            self.message(str(IPAddress(key.key_field_0)),
+                         leaf.action, leaf.u.forward.port)
+
+    def run(self):
+        self.message("Pinging server from client")
+        ping = ["ping", self.server_endpoint.ipaddress, "-c", "2"]
+        result = self.client.execute(ping)
+        if result != 0:
+            raise Exception("Test failed")
+        else:
+            print("Test succeeded!")
+
+    def prepare_switch(self):
+        self.message("Configuring switch")
+        # Re-invokes this script in the switch namespace;
+        # this causes the setup_switch method to be run in that context.
+        # This is the same as running self.setup_switch()
+        # but in the switch namespace
+        self.run_method_in_node(self.switch, "setup_switch", [])
+
+
+def compile(source, destination):
+    try:
+        status = subprocess.call(
+            "../compiler/p4toEbpf.py " + source + " -o " + destination,
+            shell=True)
+        if status < 0:
+            print("Child was terminated by signal", -status, file=sys.stderr)
+        else:
+            print("Child returned", status, file=sys.stderr)
+    except OSError as e:
+        print("Execution failed:", e, file=sys.stderr)
+        raise e
+
+def start_simulation():
+    compile("testprograms/simple.p4", "simple.c")
+    network = SimulatedNetwork()
+    network.instantiate()
+    network.prepare_switch()
+    network.run()
+    network.delete()
+    os.remove("simple.c")
+
+def main(argv):
+    print(str(argv))
+    if len(argv) == 1:
+        # Main entry point: start simulation
+        start_simulation()
+    else:
+        # We are invoked with some arguments (probably in a different namespace)
+        # First argument is a method name, rest are method arguments.
+        # Create a SimulatedNetwork and invoke the specified method with the
+        # specified arguments.
+        network = SimulatedNetwork()
+        methodname = argv[1]
+        arguments = argv[2:]
+        method = getattr(network, methodname)
+        method(*arguments)
+
+if __name__ == '__main__':
+    main(sys.argv)
+
diff --git a/src/cc/frontends/p4/test/testP4toEbpf.py b/src/cc/frontends/p4/test/testP4toEbpf.py
new file mode 100755
index 0000000..5406f59
--- /dev/null
+++ b/src/cc/frontends/p4/test/testP4toEbpf.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+# Copyright (c) Barefoot Networks, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# Runs the compiler on all files in the 'testprograms' folder
+# Writes outputs in the 'testoutputs' folder
+
+from __future__ import print_function
+from bcc import BPF
+import os, sys
+sys.path.append("../compiler") # To get hold of p4toEbpf
+                               # We want to run it without installing it
+import p4toEbpf
+import os
+
+def drop_extension(filename):
+    return os.path.splitext(os.path.basename(filename))[0]
+
+filesFailed = {}  # map error kind -> list[ (file, error) ]
+
+def set_error(kind, file, error):
+    if kind in filesFailed:
+        filesFailed[kind].append((file, error))
+    else:
+        filesFailed[kind] = [(file, error)]
+
+def is_root():
+    # Is this code portable?
+    return os.getuid() == 0
+
+def main():
+    testpath = "testprograms"
+    destFolder = "testoutputs"
+    files = os.listdir(testpath)
+    files.sort()
+    filesDone = 0
+    errors = 0
+
+    if not is_root():
+        print("Loading EBPF programs requires root privilege.")
+        print("Will only test compilation, not loading.")
+        print("(Run with sudo to test program loading.)")
+
+    for f in files:
+        path = os.path.join(testpath, f)
+
+        if not os.path.isfile(path):
+            continue
+        if not path.endswith(".p4"):
+            continue
+
+        destname = drop_extension(path) + ".c"
+        destname = os.path.join(destFolder, destname)
+
+        args = [path, "-o", destname]
+
+        result = p4toEbpf.process(args)
+        if result.kind != "OK":
+            errors += 1
+            print(path, result.error)
+            set_error(result.kind, path, result.error)
+        else:
+            # Try to load the compiled function
+            if is_root():
+                try:
+                    print("Compiling and loading BPF program")
+                    b = BPF(src_file=destname, debug=0)
+                    fn = b.load_func("ebpf_filter", BPF.SCHED_CLS)
+                except Exception as e:
+                    print(e)
+                    set_error("BPF error", path, str(e))
+
+        filesDone += 1
+
+    print("Compiled", filesDone, "files", errors, "errors")
+    for key in sorted(filesFailed):
+        print(key, ":", len(filesFailed[key]), "programs")
+        for v in filesFailed[key]:
+            print("\t", v)
+    exit(len(filesFailed) != 0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/cc/frontends/p4/test/testoutputs/.empty b/src/cc/frontends/p4/test/testoutputs/.empty
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/cc/frontends/p4/test/testoutputs/.empty
diff --git a/src/cc/frontends/p4/test/testprograms/arrayKey.p4 b/src/cc/frontends/p4/test/testprograms/arrayKey.p4
new file mode 100644
index 0000000..cc6f028
--- /dev/null
+++ b/src/cc/frontends/p4/test/testprograms/arrayKey.p4
@@ -0,0 +1,34 @@
+header_type ethernet_t {
+    fields {
+        dstAddr : 48;
+        srcAddr : 48;
+        etherType : 16;
+    }
+}
+
+parser start {
+    return parse_ethernet;
+}
+
+header ethernet_t ethernet;
+
+parser parse_ethernet {
+    extract(ethernet);
+    return ingress;
+}
+
+action nop() 
+{}
+
+table routing {
+   reads {
+      ethernet.dstAddr: exact;
+   }
+   actions { nop; }
+   size : 512;
+}
+
+control ingress
+{
+    apply(routing);
+}
\ No newline at end of file
diff --git a/src/cc/frontends/p4/test/testprograms/basic_routing.p4 b/src/cc/frontends/p4/test/testprograms/basic_routing.p4
new file mode 100644
index 0000000..5644071
--- /dev/null
+++ b/src/cc/frontends/p4/test/testprograms/basic_routing.p4
@@ -0,0 +1,231 @@
+/*
+Copyright 2013-present Barefoot Networks, Inc. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+header_type ethernet_t {
+    fields {
+        dstAddr : 48;
+        srcAddr : 48;
+        etherType : 16;
+    }
+}
+
+header_type ipv4_t {
+    fields {
+        version : 4;
+        ihl : 4;
+        diffserv : 8;
+        totalLen : 16;
+        identification : 16;
+        flags : 3;
+        fragOffset : 13;
+        ttl : 8;
+        protocol : 8;
+        hdrChecksum : 16;
+        srcAddr : 32;
+        dstAddr: 32;
+    }
+}
+
+parser start {
+    return parse_ethernet;
+}
+
+#define ETHERTYPE_IPV4 0x0800
+
+header ethernet_t ethernet;
+
+parser parse_ethernet {
+    extract(ethernet);
+    return select(latest.etherType) {
+        ETHERTYPE_IPV4 : parse_ipv4;
+        default: ingress;
+    }
+}
+
+header ipv4_t ipv4;
+
+/* Not yet supported on EBPF target
+
+field_list ipv4_checksum_list {
+        ipv4.version;
+        ipv4.ihl;
+        ipv4.diffserv;
+        ipv4.totalLen;
+        ipv4.identification;
+        ipv4.flags;
+        ipv4.fragOffset;
+        ipv4.ttl;
+        ipv4.protocol;
+        ipv4.srcAddr;
+        ipv4.dstAddr;
+}
+
+field_list_calculation ipv4_checksum {
+    input {
+        ipv4_checksum_list;
+    }
+    algorithm : csum16;
+    output_width : 16;
+}
+
+calculated_field ipv4.hdrChecksum  {
+    verify ipv4_checksum;
+    update ipv4_checksum;
+}
+*/
+
+parser parse_ipv4 {
+    extract(ipv4);
+    return ingress;
+}
+
+#define PORT_VLAN_TABLE_SIZE                   32768
+#define BD_TABLE_SIZE                          65536
+#define IPV4_LPM_TABLE_SIZE                    16384
+#define IPV4_HOST_TABLE_SIZE                   131072
+#define NEXTHOP_TABLE_SIZE                     32768
+#define REWRITE_MAC_TABLE_SIZE                 32768
+
+#define VRF_BIT_WIDTH                          12
+#define BD_BIT_WIDTH                           16
+#define IFINDEX_BIT_WIDTH                      10
+
+/* METADATA */
+header_type ingress_metadata_t {
+    fields {
+        vrf : VRF_BIT_WIDTH;                   /* VRF */
+        bd : BD_BIT_WIDTH;                     /* ingress BD */
+        nexthop_index : 16;                    /* final next hop index */
+    }
+}
+
+metadata ingress_metadata_t ingress_metadata;
+
+action on_miss() {
+}
+
+action set_bd(bd) {
+    modify_field(ingress_metadata.bd, bd);
+}
+
+table port_mapping {
+    reads {
+        standard_metadata.ingress_port : exact;
+    }
+    actions {
+        set_bd;
+    }
+    size : PORT_VLAN_TABLE_SIZE;
+}
+
+action set_vrf(vrf) {
+    modify_field(ingress_metadata.vrf, vrf);
+}
+
+table bd {
+    reads {
+        ingress_metadata.bd : exact;
+    }
+    actions {
+        set_vrf;
+    }
+    size : BD_TABLE_SIZE;
+}
+
+action fib_hit_nexthop(nexthop_index) {
+    modify_field(ingress_metadata.nexthop_index, nexthop_index);
+    subtract_from_field(ipv4.ttl, 1);
+}
+
+table ipv4_fib {
+    reads {
+        ingress_metadata.vrf : exact;
+        ipv4.dstAddr : exact;
+    }
+    actions {
+        on_miss;
+        fib_hit_nexthop;
+    }
+    size : IPV4_HOST_TABLE_SIZE;
+}
+
+table ipv4_fib_lpm {
+    reads {
+        ingress_metadata.vrf : exact;
+        ipv4.dstAddr : exact; // lpm not supported
+    }
+    actions {
+        on_miss;
+        fib_hit_nexthop;
+    }
+    size : IPV4_LPM_TABLE_SIZE;
+}
+
+action set_egress_details(egress_spec) {
+    modify_field(standard_metadata.egress_spec, egress_spec);
+}
+
+table nexthop {
+    reads {
+        ingress_metadata.nexthop_index : exact;
+    }
+    actions {
+        on_miss;
+        set_egress_details;
+    }
+    size : NEXTHOP_TABLE_SIZE;
+}
+
+control ingress {
+    if (valid(ipv4)) {
+        /* derive ingress_metadata.bd */
+        apply(port_mapping);
+
+        /* derive ingress_metadata.vrf */
+        apply(bd);
+
+        /* fib lookup, set ingress_metadata.nexthop_index */
+        apply(ipv4_fib) {
+            on_miss {
+                apply(ipv4_fib_lpm);
+            }
+        }
+
+        /* derive standard_metadata.egress_spec from ingress_metadata.nexthop_index */
+        apply(nexthop);
+    }
+}
+
+action rewrite_src_dst_mac(smac, dmac) {
+    modify_field(ethernet.srcAddr, smac);
+    modify_field(ethernet.dstAddr, dmac);
+}
+
+table rewrite_mac {
+    reads {
+        ingress_metadata.nexthop_index : exact;
+    }
+    actions {
+        on_miss;
+        rewrite_src_dst_mac;
+    }
+    size : REWRITE_MAC_TABLE_SIZE;
+}
+
+control egress {
+    /* set smac and dmac from ingress_metadata.nexthop_index */
+    apply(rewrite_mac);
+}
\ No newline at end of file
diff --git a/src/cc/frontends/p4/test/testprograms/bitfields.p4 b/src/cc/frontends/p4/test/testprograms/bitfields.p4
new file mode 100644
index 0000000..9123c49
--- /dev/null
+++ b/src/cc/frontends/p4/test/testprograms/bitfields.p4
@@ -0,0 +1,66 @@
+header_type ht
+{
+  fields
+  {
+     f1 : 1;
+     f2 : 2;
+     f3 : 3;
+     f4 : 4;
+     f5 : 5;
+     f6 : 6;
+     f7 : 7;
+     f8 : 8;
+     f9 : 9;
+     f10 : 10;
+     f11 : 11;
+     f12 : 12;
+     f13 : 13;
+     f14 : 14;
+     f15 : 15;
+     f16 : 16;
+     f17 : 17;
+     f18 : 18;
+     f19 : 19;
+     f20 : 20;
+     f21 : 21;
+     f22 : 22;
+     f23 : 23;
+     f24 : 24;
+     f25 : 25;
+     f26 : 26;
+     f27 : 27;
+     f28 : 28;
+     f29 : 29;
+     f30 : 30;
+     f31 : 31;
+     f32 : 32;
+  }
+}
+
+header_type larget
+{
+  fields 
+  {
+    f48 : 48;
+    f1: 1;
+    f49 : 48;
+    f2 : 1;
+    f64 : 64;
+    f3 : 1;
+    f128 : 128;
+  }
+}
+
+header ht h;
+header larget large;
+
+parser start
+{
+	extract(h);
+	extract(large);
+	return ingress;
+}
+
+control ingress
+{
+}
diff --git a/src/cc/frontends/p4/test/testprograms/compositeArray.p4 b/src/cc/frontends/p4/test/testprograms/compositeArray.p4
new file mode 100644
index 0000000..5524042
--- /dev/null
+++ b/src/cc/frontends/p4/test/testprograms/compositeArray.p4
@@ -0,0 +1,46 @@
+header_type ethernet_t {
+    fields {
+        dstAddr : 48;
+    }
+}
+
+header_type ipv4_t {
+    fields {
+        srcAddr : 32;
+    }
+}
+
+parser start {
+    return parse_ethernet;
+}
+
+header ethernet_t ethernet;
+
+parser parse_ethernet {
+    extract(ethernet);
+    return parse_ipv4;
+}
+
+action nop() 
+{}
+
+header ipv4_t ipv4;
+
+parser parse_ipv4 {
+    extract(ipv4);
+    return ingress;
+}
+
+table routing {
+   reads {
+      ethernet.dstAddr: exact;
+      ipv4.srcAddr: exact;
+   }
+   actions { nop; }
+   size : 512;
+}
+
+control ingress
+{
+    apply(routing);
+}
\ No newline at end of file
diff --git a/src/cc/frontends/p4/test/testprograms/compositeKey.p4 b/src/cc/frontends/p4/test/testprograms/compositeKey.p4
new file mode 100644
index 0000000..ed04e9f
--- /dev/null
+++ b/src/cc/frontends/p4/test/testprograms/compositeKey.p4
@@ -0,0 +1,72 @@
+header_type ethernet_t {
+    fields {
+        dstAddr : 48;
+        srcAddr : 48;
+        etherType : 16;
+    }
+}
+
+header_type ipv4_t {
+    fields {
+        version : 4;
+        ihl : 4;
+        diffserv : 8;
+        totalLen : 16;
+        identification : 16;
+        flags : 3;
+        fragOffset : 13;
+        ttl : 8;
+        protocol : 8;
+        hdrChecksum : 16;
+        srcAddr : 32;
+        dstAddr: 32;
+    }
+}
+
+parser start {
+    return parse_ethernet;
+}
+
+header ethernet_t ethernet;
+
+parser parse_ethernet {
+    extract(ethernet);
+    return select(latest.etherType) {
+        0x800 : parse_ipv4;
+        default: ingress;
+    }
+}
+
+action nop() 
+{}
+
+action forward(port)
+{
+   modify_field(standard_metadata.egress_port, port);
+}
+
+header ipv4_t ipv4;
+
+parser parse_ipv4 {
+    extract(ipv4);
+    return ingress;
+}
+
+table routing {
+   reads {
+      ipv4.dstAddr: exact;
+      ipv4.srcAddr: exact;
+   }
+   actions { nop; forward; }
+   size : 512;
+}
+
+counter cnt {
+   type: bytes;
+   direct: routing;
+}
+
+control ingress
+{
+    apply(routing);
+}
\ No newline at end of file
diff --git a/src/cc/frontends/p4/test/testprograms/do_nothing.p4 b/src/cc/frontends/p4/test/testprograms/do_nothing.p4
new file mode 100644
index 0000000..845f8d4
--- /dev/null
+++ b/src/cc/frontends/p4/test/testprograms/do_nothing.p4
@@ -0,0 +1,36 @@
+/* Sample P4 program */
+header_type ethernet_t {
+    fields {
+        dstAddr : 48;
+        srcAddr : 48;
+        etherType : 16;
+    }
+}
+
+parser start {
+    return parse_ethernet;
+}
+
+header ethernet_t ethernet;
+
+parser parse_ethernet {
+    extract(ethernet);
+    return ingress;
+}
+
+action action_0(){
+    no_op();
+}
+
+table table_0 {
+   reads {
+      ethernet.etherType : exact;
+   }
+   actions {
+      action_0;
+   }
+}
+
+control ingress {
+    apply(table_0);
+}
diff --git a/src/cc/frontends/p4/test/testprograms/simple.p4 b/src/cc/frontends/p4/test/testprograms/simple.p4
new file mode 100644
index 0000000..7f28561
--- /dev/null
+++ b/src/cc/frontends/p4/test/testprograms/simple.p4
@@ -0,0 +1,74 @@
+// Routes a packet to an interface based on its IPv4 address
+// Maintains a set of counters on the routing table
+
+header_type ethernet_t {
+    fields {
+        dstAddr : 48;
+        srcAddr : 48;
+        etherType : 16;
+    }
+}
+
+header_type ipv4_t {
+    fields {
+        version : 4;
+        ihl : 4;
+        diffserv : 8;
+        totalLen : 16;
+        identification : 16;
+        flags : 3;
+        fragOffset : 13;
+        ttl : 8;
+        protocol : 8;
+        hdrChecksum : 16;
+        srcAddr : 32;
+        dstAddr: 32;
+    }
+}
+
+parser start {
+    return parse_ethernet;
+}
+
+header ethernet_t ethernet;
+
+parser parse_ethernet {
+    extract(ethernet);
+    return select(latest.etherType) {
+        0x800 : parse_ipv4;
+        default: ingress;
+    }
+}
+
+action nop() 
+{}
+
+action forward(port)
+{
+   modify_field(standard_metadata.egress_port, port);
+}
+
+header ipv4_t ipv4;
+
+parser parse_ipv4 {
+    extract(ipv4);
+    return ingress;
+}
+
+table routing {
+   reads {
+      ipv4.dstAddr: exact;
+   }
+   actions { nop; forward; }
+   size : 512;
+}
+
+counter cnt {
+   type: bytes;
+   direct: routing;
+}
+
+control ingress
+{
+    apply(routing);
+}
\ No newline at end of file
diff --git a/src/cc/includes/bcc_debug.h b/src/cc/includes/bcc_debug.h
new file mode 120000
index 0000000..7354422
--- /dev/null
+++ b/src/cc/includes/bcc_debug.h
@@ -0,0 +1 @@
+../bcc_debug.h
\ No newline at end of file
diff --git a/src/cc/includes/bcc_elf.h b/src/cc/includes/bcc_elf.h
new file mode 120000
index 0000000..85b2dbe
--- /dev/null
+++ b/src/cc/includes/bcc_elf.h
@@ -0,0 +1 @@
+../bcc_elf.h
\ No newline at end of file
diff --git a/src/cc/includes/bcc_exception.h b/src/cc/includes/bcc_exception.h
new file mode 120000
index 0000000..c1106ba
--- /dev/null
+++ b/src/cc/includes/bcc_exception.h
@@ -0,0 +1 @@
+../bcc_exception.h
\ No newline at end of file
diff --git a/src/cc/includes/bcc_perf_map.h b/src/cc/includes/bcc_perf_map.h
new file mode 120000
index 0000000..96db653
--- /dev/null
+++ b/src/cc/includes/bcc_perf_map.h
@@ -0,0 +1 @@
+../bcc_perf_map.h
\ No newline at end of file
diff --git a/src/cc/includes/bcc_proc.h b/src/cc/includes/bcc_proc.h
new file mode 120000
index 0000000..08bc745
--- /dev/null
+++ b/src/cc/includes/bcc_proc.h
@@ -0,0 +1 @@
+../bcc_proc.h
\ No newline at end of file
diff --git a/src/cc/includes/bcc_syms.h b/src/cc/includes/bcc_syms.h
new file mode 120000
index 0000000..fca7fc8
--- /dev/null
+++ b/src/cc/includes/bcc_syms.h
@@ -0,0 +1 @@
+../bcc_syms.h
\ No newline at end of file
diff --git a/src/cc/includes/bcc_usdt.h b/src/cc/includes/bcc_usdt.h
new file mode 120000
index 0000000..027f013
--- /dev/null
+++ b/src/cc/includes/bcc_usdt.h
@@ -0,0 +1 @@
+../bcc_usdt.h
\ No newline at end of file
diff --git a/src/cc/includes/bpf_common.h b/src/cc/includes/bpf_common.h
new file mode 120000
index 0000000..b98d5f3
--- /dev/null
+++ b/src/cc/includes/bpf_common.h
@@ -0,0 +1 @@
+../bpf_common.h
\ No newline at end of file
diff --git a/src/cc/includes/bpf_module.h b/src/cc/includes/bpf_module.h
new file mode 120000
index 0000000..2e79b69
--- /dev/null
+++ b/src/cc/includes/bpf_module.h
@@ -0,0 +1 @@
+../bpf_module.h
\ No newline at end of file
diff --git a/src/cc/includes/common.h b/src/cc/includes/common.h
new file mode 120000
index 0000000..98e8e7c
--- /dev/null
+++ b/src/cc/includes/common.h
@@ -0,0 +1 @@
+../common.h
\ No newline at end of file
diff --git a/src/cc/includes/compat/linux/bpf.h b/src/cc/includes/compat/linux/bpf.h
new file mode 120000
index 0000000..f637ca9
--- /dev/null
+++ b/src/cc/includes/compat/linux/bpf.h
@@ -0,0 +1 @@
+../../../compat/linux/bpf.h
\ No newline at end of file
diff --git a/src/cc/includes/exported_files.h b/src/cc/includes/exported_files.h
new file mode 120000
index 0000000..b9051df
--- /dev/null
+++ b/src/cc/includes/exported_files.h
@@ -0,0 +1 @@
+../exported_files.h
\ No newline at end of file
diff --git a/src/cc/includes/file_desc.h b/src/cc/includes/file_desc.h
new file mode 120000
index 0000000..5dce7e9
--- /dev/null
+++ b/src/cc/includes/file_desc.h
@@ -0,0 +1 @@
+../file_desc.h
\ No newline at end of file
diff --git a/src/cc/includes/libbpf.h b/src/cc/includes/libbpf.h
new file mode 120000
index 0000000..e2ca5aa
--- /dev/null
+++ b/src/cc/includes/libbpf.h
@@ -0,0 +1 @@
+../libbpf.h
\ No newline at end of file
diff --git a/src/cc/includes/ns_guard.h b/src/cc/includes/ns_guard.h
new file mode 120000
index 0000000..4d4919e
--- /dev/null
+++ b/src/cc/includes/ns_guard.h
@@ -0,0 +1 @@
+../ns_guard.h
\ No newline at end of file
diff --git a/src/cc/includes/perf_reader.h b/src/cc/includes/perf_reader.h
new file mode 120000
index 0000000..36b2ca4
--- /dev/null
+++ b/src/cc/includes/perf_reader.h
@@ -0,0 +1 @@
+../perf_reader.h
\ No newline at end of file
diff --git a/src/cc/includes/setns.h b/src/cc/includes/setns.h
new file mode 120000
index 0000000..56bae12
--- /dev/null
+++ b/src/cc/includes/setns.h
@@ -0,0 +1 @@
+../setns.h
\ No newline at end of file
diff --git a/src/cc/includes/syms.h b/src/cc/includes/syms.h
new file mode 120000
index 0000000..c765478
--- /dev/null
+++ b/src/cc/includes/syms.h
@@ -0,0 +1 @@
+../syms.h
\ No newline at end of file
diff --git a/src/cc/includes/table_desc.h b/src/cc/includes/table_desc.h
new file mode 120000
index 0000000..9d471d5
--- /dev/null
+++ b/src/cc/includes/table_desc.h
@@ -0,0 +1 @@
+../table_desc.h
\ No newline at end of file
diff --git a/src/cc/includes/table_storage.h b/src/cc/includes/table_storage.h
new file mode 120000
index 0000000..35ad2b3
--- /dev/null
+++ b/src/cc/includes/table_storage.h
@@ -0,0 +1 @@
+../table_storage.h
\ No newline at end of file
diff --git a/src/cc/includes/table_storage_impl.h b/src/cc/includes/table_storage_impl.h
new file mode 120000
index 0000000..b16a081
--- /dev/null
+++ b/src/cc/includes/table_storage_impl.h
@@ -0,0 +1 @@
+../table_storage_impl.h
\ No newline at end of file
diff --git a/src/cc/includes/usdt.h b/src/cc/includes/usdt.h
new file mode 120000
index 0000000..91af6ed
--- /dev/null
+++ b/src/cc/includes/usdt.h
@@ -0,0 +1 @@
+../usdt.h
\ No newline at end of file
diff --git a/src/cc/json_map_decl_visitor.cc b/src/cc/json_map_decl_visitor.cc
new file mode 100644
index 0000000..c7fe9b8
--- /dev/null
+++ b/src/cc/json_map_decl_visitor.cc
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2017 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <memory>
+#include <string>
+
+#include <clang/AST/ASTContext.h>
+#include <clang/AST/RecordLayout.h>
+#include <clang/AST/RecursiveASTVisitor.h>
+#include "common.h"
+#include "table_desc.h"
+
+namespace ebpf {
+
+using std::string;
+using std::to_string;
+using std::unique_ptr;
+using namespace clang;
+
+// Helper visitor for constructing a string representation of a key/leaf decl
+class BMapDeclVisitor : public clang::RecursiveASTVisitor<BMapDeclVisitor> {
+ public:
+  explicit BMapDeclVisitor(clang::ASTContext &C, std::string &result);
+  bool TraverseRecordDecl(clang::RecordDecl *Decl);
+  bool VisitRecordDecl(clang::RecordDecl *Decl);
+  bool VisitFieldDecl(clang::FieldDecl *Decl);
+  bool VisitBuiltinType(const clang::BuiltinType *T);
+  bool VisitTypedefType(const clang::TypedefType *T);
+  bool VisitTagType(const clang::TagType *T);
+  bool VisitPointerType(const clang::PointerType *T);
+  bool VisitEnumConstantDecl(clang::EnumConstantDecl *D);
+  bool VisitEnumDecl(clang::EnumDecl *D);
+
+ private:
+  bool shouldSkipPadding(const RecordDecl *D);
+  void genJSONForField(FieldDecl *F);
+
+ private:
+  clang::ASTContext &C;
+  std::string &result_;
+};
+
+// Encode the struct layout as a json description
+BMapDeclVisitor::BMapDeclVisitor(ASTContext &C, string &result) : C(C), result_(result) {}
+
+bool BMapDeclVisitor::shouldSkipPadding(const RecordDecl *D) {
+  if (D->isUnion() || D->field_empty())
+    return true;
+  for (auto F : D->getDefinition()->fields()) {
+    if (F->isBitField())
+      return true;
+    QualType Ty = F->getType();
+    if (Ty->isIncompleteArrayType())
+      return true;
+  }
+  return false;
+}
+
+void BMapDeclVisitor::genJSONForField(FieldDecl *F) {
+  if (F->isAnonymousStructOrUnion()) {
+    if (const RecordType *R = dyn_cast<RecordType>(F->getType()))
+      TraverseDecl(R->getDecl());
+    result_ += ", ";
+    return;
+  }
+  result_ += "[";
+  TraverseDecl(F);
+  if (const ConstantArrayType *T = dyn_cast<ConstantArrayType>(F->getType()))
+    result_ += ", [" + T->getSize().toString(10, false) + "]";
+  if (F->isBitField())
+    result_ += ", " + to_string(F->getBitWidthValue(C));
+  result_ += "], ";
+}
+
+bool BMapDeclVisitor::VisitFieldDecl(FieldDecl *D) {
+  result_ += "\"";
+  result_ += D->getName();
+  result_ += "\",";
+  return true;
+}
+
+bool BMapDeclVisitor::VisitEnumConstantDecl(EnumConstantDecl *D) {
+  result_ += "\"";
+  result_ += D->getName();
+  result_ += "\",";
+  return false;
+}
+
+bool BMapDeclVisitor::VisitEnumDecl(EnumDecl *D) {
+  result_ += "[\"";
+  result_ += D->getName();
+  result_ += "\", [";
+  for (auto it = D->enumerator_begin(); it != D->enumerator_end(); ++it) {
+    TraverseDecl(*it);
+  }
+  result_.erase(result_.end() - 1);
+  result_ += "], \"enum\"]";
+  return false;
+}
+
+bool BMapDeclVisitor::TraverseRecordDecl(RecordDecl *D) {
+  // skip children, handled in Visit...
+  if (!WalkUpFromRecordDecl(D))
+    return false;
+  return true;
+}
+
+bool BMapDeclVisitor::VisitRecordDecl(RecordDecl *D) {
+  result_ += "[\"";
+  result_ += D->getName();
+  result_ += "\", [";
+
+  bool SkipPadding = shouldSkipPadding(D);
+  if (SkipPadding) {
+    for (auto F : D->getDefinition()->fields()) {
+      genJSONForField(F);
+    }
+  } else {
+    const ASTRecordLayout &Layout = C.getASTRecordLayout(D);
+    CharUnits Offset = C.toCharUnitsFromBits(Layout.getFieldOffset(0));
+    for (auto F : D->getDefinition()->fields()) {
+      CharUnits FieldSize = C.getTypeSizeInChars(F->getType());
+      auto FieldOffsetBits = Layout.getFieldOffset(F->getFieldIndex());
+      CharUnits FieldOffset = C.toCharUnitsFromBits(FieldOffsetBits);
+
+      uint64_t Padding = (FieldOffset - Offset).getQuantity();
+      if (Padding) {
+        /* Padding before this field with "char __pad_<FieldIndex>[Padding]". */
+        result_ += "[\"__pad_" + to_string(F->getFieldIndex()) + "\",\"char\",["
+                + to_string(Padding) + "]], ";
+      }
+      Offset = FieldOffset + FieldSize;
+      genJSONForField(F);
+    }
+
+    /* Additional Padding after the last field so that the Record Size matches */
+    CharUnits RecordSize = Layout.getSize();
+    if (RecordSize > Offset) {
+        result_ += "[\"__pad_end\",\"char\",["
+                + to_string((RecordSize - Offset).getQuantity()) + "]], ";
+    }
+  }
+
+  if (!D->getDefinition()->field_empty())
+    result_.erase(result_.end() - 2);
+  result_ += "]";
+  if (D->isUnion())
+    result_ += ", \"union\"";
+  else if (D->isStruct()) {
+    if (SkipPadding)
+      result_ += ", \"struct\"";
+    else
+      result_ += ", \"struct_packed\"";
+  }
+  result_ += "]";
+  return true;
+}
+// pointer to anything should be treated as terminal, don't recurse further
+bool BMapDeclVisitor::VisitPointerType(const PointerType *T) {
+  result_ += "\"unsigned long long\"";
+  return false;
+}
+bool BMapDeclVisitor::VisitTagType(const TagType *T) {
+  return TraverseDecl(T->getDecl()->getDefinition());
+}
+bool BMapDeclVisitor::VisitTypedefType(const TypedefType *T) { return TraverseDecl(T->getDecl()); }
+bool BMapDeclVisitor::VisitBuiltinType(const BuiltinType *T) {
+  result_ += "\"";
+  result_ += T->getName(C.getPrintingPolicy());
+  result_ += "\"";
+  return true;
+}
+
+class JsonMapTypesVisitor : public virtual MapTypesVisitor {
+ public:
+  virtual void Visit(TableDesc &desc, clang::ASTContext &C, clang::QualType key_type,
+                     clang::QualType leaf_type) {
+    BMapDeclVisitor v1(C, desc.key_desc), v2(C, desc.leaf_desc);
+    v1.TraverseType(key_type);
+    v2.TraverseType(leaf_type);
+  }
+};
+
+unique_ptr<MapTypesVisitor> createJsonMapTypesVisitor() {
+  return make_unique<JsonMapTypesVisitor>();
+}
+
+}  // namespace ebpf
diff --git a/src/cc/libbcc.pc.in b/src/cc/libbcc.pc.in
new file mode 100644
index 0000000..69b28c1
--- /dev/null
+++ b/src/cc/libbcc.pc.in
@@ -0,0 +1,14 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+datarootdir=${prefix}/share
+
+compatdir=${includedir}/bcc/compat
+
+Name: libbcc
+Version: @REVISION@
+Description: BCC Program library
+Requires:
+Libs: -L${libdir} -lbcc
+Cflags: -I${includedir} -I${compatdir}
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
new file mode 100644
index 0000000..9852f8c
--- /dev/null
+++ b/src/cc/libbpf.c
@@ -0,0 +1,1470 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/bpf.h>
+#include <linux/bpf_common.h>
+#include <linux/if_packet.h>
+#include <linux/perf_event.h>
+#include <linux/pkt_cls.h>
+#include <linux/rtnetlink.h>
+#include <linux/sched.h>
+#include <linux/unistd.h>
+#include <linux/version.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <linux/if_alg.h>
+
+#include "libbpf.h"
+#include "perf_reader.h"
+
+// TODO: Remove this when CentOS 6 support is not needed anymore
+#include "setns.h"
+
+// TODO: remove these defines when linux-libc-dev exports them properly
+
+#ifndef __NR_bpf
+#if defined(__powerpc64__)
+#define __NR_bpf 361
+#elif defined(__s390x__)
+#define __NR_bpf 351
+#elif defined(__aarch64__)
+#define __NR_bpf 280
+#else
+#define __NR_bpf 321
+#endif
+#endif
+
+#ifndef SO_ATTACH_BPF
+#define SO_ATTACH_BPF 50
+#endif
+
+#ifndef PERF_EVENT_IOC_SET_BPF
+#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
+#endif
+
+#ifndef PERF_FLAG_FD_CLOEXEC
+#define PERF_FLAG_FD_CLOEXEC (1UL << 3)
+#endif
+
+// TODO: Remove this when CentOS 6 support is not needed anymore
+#ifndef AF_ALG
+#define AF_ALG 38
+#endif
+
+#define min(x, y) ((x) < (y) ? (x) : (y))
+
+struct bpf_helper {
+  char *name;
+  char *required_version;
+};
+
+static struct bpf_helper helpers[] = {
+  {"map_lookup_elem", "3.19"},
+  {"map_update_elem", "3.19"},
+  {"map_delete_elem", "3.19"},
+  {"probe_read", "4.1"},
+  {"ktime_get_ns", "4.1"},
+  {"trace_printk", "4.1"},
+  {"get_prandom_u32", "4.1"},
+  {"get_smp_processor_id", "4.1"},
+  {"skb_store_bytes", "4.1"},
+  {"l3_csum_replace", "4.1"},
+  {"l4_csum_replace", "4.1"},
+  {"tail_call", "4.2"},
+  {"clone_redirect", "4.2"},
+  {"get_current_pid_tgid", "4.2"},
+  {"get_current_uid_gid", "4.2"},
+  {"get_current_comm", "4.2"},
+  {"get_cgroup_classid", "4.3"},
+  {"skb_vlan_push", "4.3"},
+  {"skb_vlan_pop", "4.3"},
+  {"skb_get_tunnel_key", "4.3"},
+  {"skb_set_tunnel_key", "4.3"},
+  {"perf_event_read", "4.3"},
+  {"redirect", "4.4"},
+  {"get_route_realm", "4.4"},
+  {"perf_event_output", "4.4"},
+  {"skb_load_bytes", "4.5"},
+  {"get_stackid", "4.6"},
+  {"csum_diff", "4.6"},
+  {"skb_get_tunnel_opt", "4.6"},
+  {"skb_set_tunnel_opt", "4.6"},
+  {"skb_change_proto", "4.8"},
+  {"skb_change_type", "4.8"},
+  {"skb_under_cgroup", "4.8"},
+  {"get_hash_recalc", "4.8"},
+  {"get_current_task", "4.8"},
+  {"probe_write_user", "4.8"},
+  {"current_task_under_cgroup", "4.9"},
+  {"skb_change_tail", "4.9"},
+  {"skb_pull_data", "4.9"},
+  {"csum_update", "4.9"},
+  {"set_hash_invalid", "4.9"},
+  {"get_numa_node_id", "4.10"},
+  {"skb_change_head", "4.10"},
+  {"xdp_adjust_head", "4.10"},
+  {"probe_read_str", "4.11"},
+  {"get_socket_cookie", "4.12"},
+  {"get_socket_uid", "4.12"},
+  {"set_hash", "4.13"},
+  {"setsockopt", "4.13"},
+  {"skb_adjust_room", "4.13"},
+  {"redirect_map", "4.14"},
+  {"sk_redirect_map", "4.14"},
+  {"sock_map_update", "4.14"},
+  {"xdp_adjust_meta", "4.15"},
+  {"perf_event_read_value", "4.15"},
+  {"perf_prog_read_value", "4.15"},
+  {"getsockopt", "4.15"},
+  {"override_return", "4.16"},
+  {"sock_ops_cb_flags_set", "4.16"},
+  {"msg_redirect_map", "4.17"},
+  {"msg_apply_bytes", "4.17"},
+  {"msg_cork_bytes", "4.17"},
+  {"msg_pull_data", "4.17"},
+  {"bind", "4.17"},
+  {"xdp_adjust_tail", "4.18"},
+  {"skb_get_xfrm_state", "4.18"},
+  {"get_stack", "4.18"},
+  {"skb_load_bytes_relative", "4.18"},
+  {"fib_lookup", "4.18"},
+  {"sock_hash_update", "4.18"},
+  {"msg_redirect_hash", "4.18"},
+  {"sk_redirect_hash", "4.18"},
+  {"lwt_push_encap", "4.18"},
+  {"lwt_seg6_store_bytes", "4.18"},
+  {"lwt_seg6_adjust_srh", "4.18"},
+  {"lwt_seg6_action", "4.18"},
+  {"rc_repeat", "4.18"},
+  {"rc_keydown", "4.18"},
+  {"skb_cgroup_id", "4.18"},
+  {"get_current_cgroup_id", "4.18"},
+  {"get_local_storage", "4.19"},
+  {"sk_select_reuseport", "4.19"},
+  {"skb_ancestor_cgroup_id", "4.19"},
+  {"sk_lookup_tcp", "4.20"},
+  {"sk_lookup_udp", "4.20"},
+  {"sk_release", "4.20"},
+  {"map_push_elem", "4.20"},
+  {"map_pop_elem", "4.20"},
+  {"map_peek_elem", "4.20"},
+  {"msg_push_data", "4.20"},
+};
+
+static uint64_t ptr_to_u64(void *ptr)
+{
+  return (uint64_t) (unsigned long) ptr;
+}
+
+int bpf_create_map(enum bpf_map_type map_type, const char *name,
+                   int key_size, int value_size,
+                   int max_entries, int map_flags)
+{
+  size_t name_len = name ? strlen(name) : 0;
+  union bpf_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.map_type = map_type;
+  attr.key_size = key_size;
+  attr.value_size = value_size;
+  attr.max_entries = max_entries;
+  attr.map_flags = map_flags;
+  memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));
+
+  int ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+
+  if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
+    memset(attr.map_name, 0, BPF_OBJ_NAME_LEN);
+    ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+  }
+
+  if (ret < 0 && errno == EPERM) {
+    // see note below about the rationale for this retry
+
+    struct rlimit rl = {};
+    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
+      rl.rlim_max = RLIM_INFINITY;
+      rl.rlim_cur = rl.rlim_max;
+      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
+        ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+    }
+  }
+  return ret;
+}
+
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
+{
+  union bpf_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.map_fd = fd;
+  attr.key = ptr_to_u64(key);
+  attr.value = ptr_to_u64(value);
+  attr.flags = flags;
+
+  return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_lookup_elem(int fd, void *key, void *value)
+{
+  union bpf_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.map_fd = fd;
+  attr.key = ptr_to_u64(key);
+  attr.value = ptr_to_u64(value);
+
+  return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_delete_elem(int fd, void *key)
+{
+  union bpf_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.map_fd = fd;
+  attr.key = ptr_to_u64(key);
+
+  return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
+}
+
+int bpf_get_first_key(int fd, void *key, size_t key_size)
+{
+  union bpf_attr attr;
+  int i, res;
+
+  memset(&attr, 0, sizeof(attr));
+  attr.map_fd = fd;
+  attr.key = 0;
+  attr.next_key = ptr_to_u64(key);
+
+  // 4.12 and above kernel supports passing NULL to BPF_MAP_GET_NEXT_KEY
+  // to get first key of the map. For older kernels, the call will fail.
+  res = syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+  if (res < 0 && errno == EFAULT) {
+    // Fall back to try to find a non-existing key.
+    static unsigned char try_values[3] = {0, 0xff, 0x55};
+    attr.key = ptr_to_u64(key);
+    for (i = 0; i < 3; i++) {
+      memset(key, try_values[i], key_size);
+      // We want to check the existence of the key but we don't know the size
+      // of map's value. So we pass an invalid pointer for value, expect
+      // the call to fail and check if the error is ENOENT indicating the
+      // key doesn't exist. If we use NULL for the invalid pointer, it might
+      // trigger a page fault in kernel and affect performance. Hence we use
+      // ~0 which will fail and return fast.
+      // This should fail since we pass an invalid pointer for value.
+      if (bpf_lookup_elem(fd, key, (void *)~0) >= 0)
+        return -1;
+      // This means the key doesn't exist.
+      if (errno == ENOENT)
+        return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+    }
+    return -1;
+  } else {
+    return res;
+  }
+}
+
+int bpf_get_next_key(int fd, void *key, void *next_key)
+{
+  union bpf_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.map_fd = fd;
+  attr.key = ptr_to_u64(key);
+  attr.next_key = ptr_to_u64(next_key);
+
+  return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+}
+
+static void bpf_print_hints(int ret, char *log)
+{
+  if (ret < 0)
+    fprintf(stderr, "bpf: Failed to load program: %s\n", strerror(errno));
+  if (log == NULL)
+    return;
+  else
+    fprintf(stderr, "%s\n", log);
+
+  if (ret >= 0)
+    return;
+
+  // The following error strings will need maintenance to match LLVM.
+
+  // stack busting
+  if (strstr(log, "invalid stack off=-") != NULL) {
+    fprintf(stderr, "HINT: Looks like you exceeded the BPF stack limit. "
+      "This can happen if you allocate too much local variable storage. "
+      "For example, if you allocated a 1 Kbyte struct (maybe for "
+      "BPF_PERF_OUTPUT), busting a max stack of 512 bytes.\n\n");
+  }
+
+  // didn't check NULL on map lookup
+  if (strstr(log, "invalid mem access 'map_value_or_null'") != NULL) {
+    fprintf(stderr, "HINT: The 'map_value_or_null' error can happen if "
+      "you dereference a pointer value from a map lookup without first "
+      "checking if that pointer is NULL.\n\n");
+  }
+
+  // lacking a bpf_probe_read
+  if (strstr(log, "invalid mem access 'inv'") != NULL) {
+    fprintf(stderr, "HINT: The invalid mem access 'inv' error can happen "
+      "if you try to dereference memory without first using "
+      "bpf_probe_read() to copy it to the BPF stack. Sometimes the "
+      "bpf_probe_read is automatic by the bcc rewriter, other times "
+      "you'll need to be explicit.\n\n");
+  }
+
+  // helper function not found in kernel
+  char *helper_str = strstr(log, "invalid func ");
+  if (helper_str != NULL) {
+    helper_str += strlen("invalid func ");
+    char *str = strchr(helper_str, '#');
+    if (str != NULL) {
+      helper_str = str + 1;
+    }
+    unsigned int helper_id = atoi(helper_str);
+    if (helper_id && helper_id < sizeof(helpers) / sizeof(struct bpf_helper)) {
+      struct bpf_helper helper = helpers[helper_id - 1];
+      fprintf(stderr, "HINT: bpf_%s missing (added in Linux %s).\n\n",
+              helper.name, helper.required_version);
+    }
+  }
+}
+#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))
+
+int bpf_obj_get_info(int prog_map_fd, void *info, uint32_t *info_len)
+{
+  union bpf_attr attr;
+  int err;
+
+  memset(&attr, 0, sizeof(attr));
+  attr.info.bpf_fd = prog_map_fd;
+  attr.info.info_len = *info_len;
+  attr.info.info = ptr_to_u64(info);
+
+  err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
+  if (!err)
+          *info_len = attr.info.info_len;
+
+  return err;
+}
+
+int bpf_prog_compute_tag(const struct bpf_insn *insns, int prog_len,
+                         unsigned long long *ptag)
+{
+  struct sockaddr_alg alg = {
+    .salg_family    = AF_ALG,
+    .salg_type      = "hash",
+    .salg_name      = "sha1",
+  };
+  int shafd = socket(AF_ALG, SOCK_SEQPACKET, 0);
+  if (shafd < 0) {
+    fprintf(stderr, "sha1 socket not available %s\n", strerror(errno));
+    return -1;
+  }
+  int ret = bind(shafd, (struct sockaddr *)&alg, sizeof(alg));
+  if (ret < 0) {
+    fprintf(stderr, "sha1 bind fail %s\n", strerror(errno));
+    close(shafd);
+    return ret;
+  }
+  int shafd2 = accept(shafd, NULL, 0);
+  if (shafd2 < 0) {
+    fprintf(stderr, "sha1 accept fail %s\n", strerror(errno));
+    close(shafd);
+    return -1;
+  }
+  struct bpf_insn prog[prog_len / 8];
+  bool map_ld_seen = false;
+  int i;
+  for (i = 0; i < prog_len / 8; i++) {
+    prog[i] = insns[i];
+    if (insns[i].code == (BPF_LD | BPF_DW | BPF_IMM) &&
+        insns[i].src_reg == BPF_PSEUDO_MAP_FD &&
+        !map_ld_seen) {
+      prog[i].imm = 0;
+      map_ld_seen = true;
+    } else if (insns[i].code == 0 && map_ld_seen) {
+      prog[i].imm = 0;
+      map_ld_seen = false;
+    } else {
+      map_ld_seen = false;
+    }
+  }
+  ret = write(shafd2, prog, prog_len);
+  if (ret != prog_len) {
+    fprintf(stderr, "sha1 write fail %s\n", strerror(errno));
+    close(shafd2);
+    close(shafd);
+    return -1;
+  }
+
+  union {
+	  unsigned char sha[20];
+	  unsigned long long tag;
+  } u = {};
+  ret = read(shafd2, u.sha, 20);
+  if (ret != 20) {
+    fprintf(stderr, "sha1 read fail %s\n", strerror(errno));
+    close(shafd2);
+    close(shafd);
+    return -1;
+  }
+  *ptag = __builtin_bswap64(u.tag);
+  close(shafd2);
+  close(shafd);
+  return 0;
+}
+
+int bpf_prog_get_tag(int fd, unsigned long long *ptag)
+{
+  char fmt[64];
+  snprintf(fmt, sizeof(fmt), "/proc/self/fdinfo/%d", fd);
+  FILE * f = fopen(fmt, "r");
+  if (!f) {
+/*    fprintf(stderr, "failed to open fdinfo %s\n", strerror(errno));*/
+    return -1;
+  }
+  fgets(fmt, sizeof(fmt), f); // pos
+  fgets(fmt, sizeof(fmt), f); // flags
+  fgets(fmt, sizeof(fmt), f); // mnt_id
+  fgets(fmt, sizeof(fmt), f); // prog_type
+  fgets(fmt, sizeof(fmt), f); // prog_jited
+  fgets(fmt, sizeof(fmt), f); // prog_tag
+  fclose(f);
+  char *p = strchr(fmt, ':');
+  if (!p) {
+/*    fprintf(stderr, "broken fdinfo %s\n", fmt);*/
+    return -2;
+  }
+  unsigned long long tag = 0;
+  sscanf(p + 1, "%llx", &tag);
+  *ptag = tag;
+  return 0;
+}
+
+int bpf_prog_load(enum bpf_prog_type prog_type, const char *name,
+                  const struct bpf_insn *insns, int prog_len,
+                  const char *license, unsigned kern_version,
+                  int log_level, char *log_buf, unsigned log_buf_size)
+{
+  size_t name_len = name ? strlen(name) : 0;
+  union bpf_attr attr;
+  char *tmp_log_buf = NULL;
+  unsigned tmp_log_buf_size = 0;
+  int ret = 0, name_offset = 0;
+
+  memset(&attr, 0, sizeof(attr));
+
+  attr.prog_type = prog_type;
+  attr.kern_version = kern_version;
+  attr.license = ptr_to_u64((void *)license);
+
+  attr.insns = ptr_to_u64((void *)insns);
+  attr.insn_cnt = prog_len / sizeof(struct bpf_insn);
+  if (attr.insn_cnt > BPF_MAXINSNS) {
+    errno = EINVAL;
+    fprintf(stderr,
+            "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n",
+            strerror(errno), name, attr.insn_cnt, BPF_MAXINSNS);
+    return -1;
+  }
+
+  attr.log_level = log_level;
+  if (attr.log_level > 0) {
+    if (log_buf_size > 0) {
+      // Use user-provided log buffer if available.
+      log_buf[0] = 0;
+      attr.log_buf = ptr_to_u64(log_buf);
+      attr.log_size = log_buf_size;
+    } else {
+      // Create and use temporary log buffer if user didn't provide one.
+      tmp_log_buf_size = LOG_BUF_SIZE;
+      tmp_log_buf = malloc(tmp_log_buf_size);
+      if (!tmp_log_buf) {
+        fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
+                strerror(errno));
+        attr.log_level = 0;
+      } else {
+        tmp_log_buf[0] = 0;
+        attr.log_buf = ptr_to_u64(tmp_log_buf);
+        attr.log_size = tmp_log_buf_size;
+      }
+    }
+  }
+
+  if (strncmp(name, "kprobe__", 8) == 0)
+    name_offset = 8;
+  else if (strncmp(name, "tracepoint__", 12) == 0)
+    name_offset = 12;
+  else if (strncmp(name, "raw_tracepoint__", 16) == 0)
+    name_offset = 16;
+  memcpy(attr.prog_name, name + name_offset,
+         min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1));
+
+  ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+  // BPF object name is not supported on older Kernels.
+  // If we failed due to this, clear the name and try again.
+  if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) {
+    memset(attr.prog_name, 0, BPF_OBJ_NAME_LEN);
+    ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+  }
+
+  if (ret < 0 && errno == EPERM) {
+    // When EPERM is returned, two reasons are possible:
+    //  1. user has no permissions for bpf()
+    //  2. user has insufficient rlimit for locked memory
+    // Unfortunately, there is no api to inspect the current usage of locked
+    // mem for the user, so an accurate calculation of how much memory to lock
+    // for this new program is difficult to calculate. As a hack, bump the limit
+    // to unlimited. If program load fails again, return the error.
+    struct rlimit rl = {};
+    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
+      rl.rlim_max = RLIM_INFINITY;
+      rl.rlim_cur = rl.rlim_max;
+      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
+        ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+    }
+  }
+
+  // The load has failed. Handle log message.
+  if (ret < 0) {
+    // User has provided a log buffer.
+    if (log_buf_size) {
+      // If logging is not already enabled, enable it and do the syscall again.
+      if (attr.log_level == 0) {
+        attr.log_level = 1;
+        attr.log_buf = ptr_to_u64(log_buf);
+        attr.log_size = log_buf_size;
+        ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+      }
+      // Print the log message and return.
+      bpf_print_hints(ret, log_buf);
+      if (errno == ENOSPC)
+        fprintf(stderr, "bpf: log_buf size may be insufficient\n");
+      goto return_result;
+    }
+
+    // User did not provide log buffer. We will try to increase size of
+    // our temporary log buffer to get full error message.
+    if (tmp_log_buf)
+      free(tmp_log_buf);
+    tmp_log_buf_size = LOG_BUF_SIZE;
+    if (attr.log_level == 0)
+      attr.log_level = 1;
+    for (;;) {
+      tmp_log_buf = malloc(tmp_log_buf_size);
+      if (!tmp_log_buf) {
+        fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n",
+                strerror(errno));
+        goto return_result;
+      }
+      tmp_log_buf[0] = 0;
+      attr.log_buf = ptr_to_u64(tmp_log_buf);
+      attr.log_size = tmp_log_buf_size;
+
+      ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+      if (ret < 0 && errno == ENOSPC) {
+        // Temporary buffer size is not enough. Double it and try again.
+        free(tmp_log_buf);
+        tmp_log_buf = NULL;
+        tmp_log_buf_size <<= 1;
+      } else {
+        break;
+      }
+    }
+  }
+
+  // Check if we should print the log message if log_level is not 0,
+  // either specified by user or set due to error.
+  if (attr.log_level > 0) {
+    // Don't print if user enabled logging and provided log buffer,
+    // but there is no error.
+    if (log_buf && ret < 0)
+      bpf_print_hints(ret, log_buf);
+    else if (tmp_log_buf)
+      bpf_print_hints(ret, tmp_log_buf);
+  }
+
+return_result:
+  if (tmp_log_buf)
+    free(tmp_log_buf);
+  return ret;
+}
+
+int bpf_open_raw_sock(const char *name)
+{
+  struct sockaddr_ll sll;
+  int sock;
+
+  sock = socket(PF_PACKET, SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC, htons(ETH_P_ALL));
+  if (sock < 0) {
+    fprintf(stderr, "cannot create raw socket\n");
+    return -1;
+  }
+
+  /* Do not bind on empty interface names */
+  if (!name || *name == '\0')
+    return sock;
+
+  memset(&sll, 0, sizeof(sll));
+  sll.sll_family = AF_PACKET;
+  sll.sll_ifindex = if_nametoindex(name);
+  if (sll.sll_ifindex == 0) {
+    fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
+    close(sock);
+    return -1;
+  }
+  sll.sll_protocol = htons(ETH_P_ALL);
+  if (bind(sock, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
+    fprintf(stderr, "bind to %s: %s\n", name, strerror(errno));
+    close(sock);
+    return -1;
+  }
+
+  return sock;
+}
+
+int bpf_attach_socket(int sock, int prog) {
+  return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+  int fd;
+  int ret;
+  char buf[PATH_MAX];
+
+  ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+  if (ret < 0 || ret >= (int)sizeof(buf))
+    return -1;
+
+  fd = open(buf, O_RDONLY);
+  if (fd < 0)
+    return -1;
+  ret = read(fd, buf, sizeof(buf));
+  close(fd);
+  if (ret < 0 || ret >= (int)sizeof(buf))
+    return -1;
+  errno = 0;
+  ret = (int)strtol(buf, NULL, 10);
+  return errno ? -1 : ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+  int fd;
+  int ret;
+  char buf[PATH_MAX];
+
+  ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+  if (ret < 0 || ret >= (int)sizeof(buf))
+    return -1;
+
+  fd = open(buf, O_RDONLY);
+  if (fd < 0)
+    return -1;
+  ret = read(fd, buf, sizeof(buf));
+  close(fd);
+  if (ret < 0 || ret >= (int)sizeof(buf))
+    return -1;
+  if (strlen(buf) < strlen("config:"))
+    return -1;
+  errno = 0;
+  ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+  return errno ? -1 : ret;
+}
+
/*
 * new kernel API allows creating [k,u]probe with perf_event_open, which
 * makes it easier to clean up the [k,u]probe. This function tries to
 * create pfd with the new API.
 *
 * name: kernel function name (kprobe) or binary path (uprobe).
 * offs: offset within the function / binary.
 * pid: task filter for uprobes; negative means "all processes".
 * event_type: "kprobe" or "uprobe", used for the sysfs PMU lookup.
 * is_return: non-zero requests a [k,u]retprobe.
 * Returns the perf event fd, or -1 if the dynamic PMU is unavailable
 * (older kernel) or perf_event_open fails.
 */
static int bpf_try_perf_event_open_with_probe(const char *name, uint64_t offs,
             int pid, char *event_type, int is_return)
{
  struct perf_event_attr attr = {};
  // Dynamic PMU type id and retprobe bit discovered via sysfs; either one
  // missing means the kernel does not support this API.
  int type = bpf_find_probe_type(event_type);
  int is_return_bit = bpf_get_retprobe_bit(event_type);
  int cpu = 0;

  if (type < 0 || is_return_bit < 0)
    return -1;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  if (is_return)
    attr.config |= 1 << is_return_bit;

  /*
   * struct perf_event_attr in latest perf_event.h has the following
   * extension to config1 and config2. To keep bcc compatible with
   * older perf_event.h, we use config1 and config2 here instead of
   * kprobe_func, uprobe_path, kprobe_addr, and probe_offset.
   *
   * union {
   *  __u64 bp_addr;
   *  __u64 kprobe_func;
   *  __u64 uprobe_path;
   *  __u64 config1;
   * };
   * union {
   *   __u64 bp_len;
   *   __u64 kprobe_addr;
   *   __u64 probe_offset;
   *   __u64 config2;
   * };
   */
  attr.config2 = offs;  /* config2 here is kprobe_addr or probe_offset */
  attr.size = sizeof(attr);
  attr.type = type;
  /* config1 here is kprobe_func or  uprobe_path */
  attr.config1 = ptr_to_u64((void *)name);
  // PID filter is only possible for uprobe events.
  if (pid < 0)
    pid = -1;
  // perf_event_open API doesn't allow both pid and cpu to be -1.
  // So only set it to -1 when PID is not -1.
  // Tracing events do not do CPU filtering in any cases.
  if (pid != -1)
    cpu = -1;
  return syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */,
                 PERF_FLAG_FD_CLOEXEC);
}
+
// When a valid Perf Event FD provided through pfd, it will be used to enable
// and attach BPF program to the event, and event_path will be ignored.
// Otherwise, event_path is expected to contain the path to the event in debugfs
// and it will be used to open the Perf Event FD.
// In either case, if the attach partially failed (such as issue with the
// ioctl operations), the **caller** need to clean up the Perf Event FD, either
// provided by the caller or opened here.
// Returns 0 on success, -1 on failure; *pfd is updated when an fd is opened.
static int bpf_attach_tracing_event(int progfd, const char *event_path, int pid,
                                    int *pfd)
{
  int efd, cpu = 0;
  ssize_t bytes;
  char buf[PATH_MAX];
  struct perf_event_attr attr = {};
  // Caller did not provide a valid Perf Event FD. Create one with the debugfs
  // event path provided.
  if (*pfd < 0) {
    // Read the tracepoint/event id from <event_path>/id.
    snprintf(buf, sizeof(buf), "%s/id", event_path);
    efd = open(buf, O_RDONLY, 0);
    if (efd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      return -1;
    }

    bytes = read(efd, buf, sizeof(buf));
    if (bytes <= 0 || bytes >= (int)sizeof(buf)) {
      // NOTE(review): if read() filled the whole buffer, buf no longer
      // contains the path, so this message would print event data instead.
      fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
      close(efd);
      return -1;
    }
    close(efd);
    buf[bytes] = '\0';
    attr.config = strtol(buf, NULL, 0);
    attr.type = PERF_TYPE_TRACEPOINT;
    attr.sample_period = 1;
    attr.wakeup_events = 1;
    // PID filter is only possible for uprobe events.
    if (pid < 0)
      pid = -1;
    // perf_event_open API doesn't allow both pid and cpu to be -1.
    // So only set it to -1 when PID is not -1.
    // Tracing events do not do CPU filtering in any cases.
    if (pid != -1)
      cpu = -1;
    *pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
    if (*pfd < 0) {
      fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
      return -1;
    }
  }

  // Wire the BPF program to the event and start it firing.
  if (ioctl(*pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF)");
    return -1;
  }
  if (ioctl(*pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    return -1;
  }

  return 0;
}
+
/*
 * Attach BPF program progfd as a kprobe (attach_type == BPF_PROBE_ENTRY) or
 * kretprobe on kernel function fn_name at fn_offset. ev_name is only used
 * to name the debugfs event on the fallback path.
 * Returns the perf event fd that owns the probe, or -1 on failure.
 */
int bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type,
                      const char *ev_name, const char *fn_name, uint64_t fn_offset)
{
  int kfd, pfd = -1;
  char buf[256];
  char event_alias[128];
  static char *event_type = "kprobe";

  // Try to create the kprobe Perf Event with perf_event_open API.
  pfd = bpf_try_perf_event_open_with_probe(fn_name, fn_offset, -1, event_type,
                                           attach_type != BPF_PROBE_ENTRY);
  // If failed, most likely Kernel doesn't support the new perf_event_open API
  // yet. Try to create the event using debugfs.
  if (pfd < 0) {
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
    kfd = open(buf, O_WRONLY | O_APPEND, 0);
    if (kfd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      goto error;
    }

    // Events are named <ev_name>_bcc_<pid> so concurrent bcc processes do
    // not collide and detach can find the entry again.
    snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());

    if (fn_offset > 0 && attach_type == BPF_PROBE_ENTRY)
      snprintf(buf, sizeof(buf), "p:%ss/%s %s+%"PRIu64,
               event_type, event_alias, fn_name, fn_offset);
    else
      snprintf(buf, sizeof(buf), "%c:%ss/%s %s",
               attach_type == BPF_PROBE_ENTRY ? 'p' : 'r',
               event_type, event_alias, fn_name);

    if (write(kfd, buf, strlen(buf)) < 0) {
      if (errno == ENOENT)
         fprintf(stderr, "cannot attach kprobe, probe entry may not exist\n");
      else
         fprintf(stderr, "cannot attach kprobe, %s\n", strerror(errno));
      close(kfd);
      goto error;
    }
    close(kfd);
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
  }
  // If perf_event_open succeeded, bpf_attach_tracing_event will use the created
  // Perf Event FD directly and buf would be empty and unused.
  // Otherwise it will read the event ID from the path in buf, create the
  // Perf Event event using that ID, and updated value of pfd.
  if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
    return pfd;

error:
  bpf_close_perf_event_fd(pfd);
  return -1;
}
+
/*
 * Switch the calling thread into the mount namespace of process pid.
 * Returns an fd referring to the caller's original mount namespace, to be
 * handed to exit_mount_ns() for restoration. Returns -1 on any failure, and
 * also when pid already shares our mount namespace (then no setns is done
 * and nothing needs restoring).
 */
static int enter_mount_ns(int pid) {
  struct stat self_stat, target_stat;
  int self_fd = -1, target_fd = -1;
  char buf[64];

  if (pid < 0)
    return -1;

  // The size_t cast also catches a negative snprintf error return.
  if ((size_t)snprintf(buf, sizeof(buf), "/proc/%d/ns/mnt", pid) >= sizeof(buf))
    return -1;

  self_fd = open("/proc/self/ns/mnt", O_RDONLY);
  if (self_fd < 0) {
    perror("open(/proc/self/ns/mnt)");
    return -1;
  }

  target_fd = open(buf, O_RDONLY);
  if (target_fd < 0) {
    perror("open(/proc/<pid>/ns/mnt)");
    goto error;
  }

  if (fstat(self_fd, &self_stat)) {
    perror("fstat(self_fd)");
    goto error;
  }

  if (fstat(target_fd, &target_stat)) {
    perror("fstat(target_fd)");
    goto error;
  }

  // both target and current ns are same, avoid setns and close all fds
  // (deliberately reuses the error path to return -1 with everything closed)
  if (self_stat.st_ino == target_stat.st_ino)
    goto error;

  if (setns(target_fd, CLONE_NEWNS)) {
    perror("setns(target)");
    goto error;
  }

  close(target_fd);
  return self_fd;

error:
  if (self_fd >= 0)
    close(self_fd);
  if (target_fd >= 0)
    close(target_fd);
  return -1;
}
+
/*
 * Return to the mount namespace saved in fd (as produced by enter_mount_ns)
 * and close the fd. A negative fd is a no-op.
 */
static void exit_mount_ns(int fd) {
  if (fd >= 0) {
    if (setns(fd, CLONE_NEWNS))
      perror("setns");
    close(fd);
  }
}
+
/*
 * Attach BPF program progfd as a uprobe (BPF_PROBE_ENTRY) or uretprobe on
 * binary_path at offset. When pid >= 0, the debugfs fallback writes the
 * probe from inside that process's mount namespace so binary_path resolves
 * in its filesystem view. ev_name names the debugfs event.
 * Returns the perf event fd that owns the probe, or -1 on failure.
 */
int bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type,
                      const char *ev_name, const char *binary_path,
                      uint64_t offset, pid_t pid)
{
  char buf[PATH_MAX];
  char event_alias[PATH_MAX];
  static char *event_type = "uprobe";
  int res, kfd = -1, pfd = -1, ns_fd = -1;
  // Try to create the uprobe Perf Event with perf_event_open API.
  pfd = bpf_try_perf_event_open_with_probe(binary_path, offset, pid, event_type,
                                           attach_type != BPF_PROBE_ENTRY);
  // If failed, most likely Kernel doesn't support the new perf_event_open API
  // yet. Try to create the event using debugfs.
  if (pfd < 0) {
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
    kfd = open(buf, O_WRONLY | O_APPEND, 0);
    if (kfd < 0) {
      fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
      goto error;
    }

    res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
    if (res < 0 || res >= (int)sizeof(event_alias)) {
      fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name);
      goto error;
    }
    res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
                   event_type, event_alias, binary_path, (unsigned long)offset);
    if (res < 0 || res >= (int)sizeof(buf)) {
      fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias);
      goto error;
    }

    // Write the probe definition while inside the target's mount namespace,
    // then switch back before continuing.
    ns_fd = enter_mount_ns(pid);
    if (write(kfd, buf, strlen(buf)) < 0) {
      if (errno == EINVAL)
        fprintf(stderr, "check dmesg output for possible cause\n");
      goto error;
    }
    close(kfd);
    kfd = -1;
    exit_mount_ns(ns_fd);
    ns_fd = -1;

    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
  }
  // If perf_event_open succeeded, bpf_attach_tracing_event will use the created
  // Perf Event FD directly and buf would be empty and unused.
  // Otherwise it will read the event ID from the path in buf, create the
  // Perf Event event using that ID, and updated value of pfd.
  if (bpf_attach_tracing_event(progfd, buf, pid, &pfd) == 0)
    return pfd;

error:
  if (kfd >= 0)
    close(kfd);
  exit_mount_ns(ns_fd);
  bpf_close_perf_event_fd(pfd);
  return -1;
}
+
/*
 * Remove the debugfs [k,u]probe event named <ev_name>_bcc_<pid> for
 * event_type ("kprobe" or "uprobe"), if one exists. Probes created through
 * perf_event_open never appear in [k,u]probe_events, so a missing entry is
 * treated as success. Returns 0 on success, -1 on failure.
 */
static int bpf_detach_probe(const char *ev_name, const char *event_type)
{
  int kfd = -1, res;
  char buf[PATH_MAX];
  int found_event = 0;
  size_t bufsize = 0;
  char *cptr = NULL;
  FILE *fp;

  /*
   * For [k,u]probe created with perf_event_open (on newer kernel), it is
   * not necessary to clean it up in [k,u]probe_events. We first look up
   * the %s_bcc_%d line in [k,u]probe_events. If the event is not found,
   * it is safe to skip the cleaning up process (write -:... to the file).
   */
  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  fp = fopen(buf, "r");
  if (!fp) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  res = snprintf(buf, sizeof(buf), "%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }

  // Scan line by line for our event; getline reallocates cptr as needed.
  while (getline(&cptr, &bufsize, fp) != -1)
    if (strstr(cptr, buf) != NULL) {
      found_event = 1;
      break;
    }
  free(cptr);
  fclose(fp);
  fp = NULL;

  if (!found_event)
    return 0;

  // Event exists in debugfs; ask the kernel to delete it with a "-:" line.
  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  kfd = open(buf, O_WRONLY | O_APPEND, 0);
  if (kfd < 0) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  res = snprintf(buf, sizeof(buf), "-:%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (res < 0 || res >= (int)sizeof(buf)) {
    fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
    goto error;
  }
  if (write(kfd, buf, strlen(buf)) < 0) {
    fprintf(stderr, "write(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  close(kfd);
  return 0;

error:
  if (kfd >= 0)
    close(kfd);
  if (fp)
    fclose(fp);
  return -1;
}
+
// Remove the debugfs kprobe event created by bpf_attach_kprobe for ev_name
// (no-op if the probe was created via perf_event_open instead).
int bpf_detach_kprobe(const char *ev_name)
{
  return bpf_detach_probe(ev_name, "kprobe");
}
+
// Remove the debugfs uprobe event created by bpf_attach_uprobe for ev_name
// (no-op if the probe was created via perf_event_open instead).
int bpf_detach_uprobe(const char *ev_name)
{
  return bpf_detach_probe(ev_name, "uprobe");
}
+
+
/*
 * Attach BPF program progfd to tracepoint tp_category:tp_name by resolving
 * it under /sys/kernel/debug/tracing/events/.
 * Returns the perf event fd owning the attachment, or -1 on failure.
 */
int bpf_attach_tracepoint(int progfd, const char *tp_category,
                          const char *tp_name)
{
  char buf[256];
  int pfd = -1;

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/%s",
           tp_category, tp_name);
  if (bpf_attach_tracing_event(progfd, buf, -1 /* PID */, &pfd) == 0)
    return pfd;

  bpf_close_perf_event_fd(pfd);
  return -1;
}
+
// Detach is currently a no-op for tracepoints: closing the perf event fd
// (done by the caller) is all the cleanup required. The function exists so
// callers symmetrically detach whatever they attach. Always returns 0.
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
  // Idiomatic unused-parameter markers; the original assigned NULL to the
  // parameters, which silences warnings but reads like a logic statement.
  (void)tp_category;
  (void)tp_name;
  return 0;
}
+
+int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
+{
+  union bpf_attr attr;
+  int ret;
+
+  bzero(&attr, sizeof(attr));
+  attr.raw_tracepoint.name = ptr_to_u64(tp_name);
+  attr.raw_tracepoint.prog_fd = progfd;
+
+  ret = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
+  if (ret < 0)
+    fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
+  return ret;
+}
+
/*
 * Create a perf ring-buffer reader for BPF output events on one cpu (pid
 * filters the sampled task, -1 for all). raw_cb / lost_cb are invoked with
 * cb_cookie for each sample / lost-record notification; page_cnt sets the
 * mmap size in pages.
 * Returns an opaque struct perf_reader *, or NULL on failure.
 */
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
                            perf_reader_lost_cb lost_cb, void *cb_cookie,
                            int pid, int cpu, int page_cnt) {
  int pfd;
  struct perf_event_attr attr = {};
  struct perf_reader *reader = NULL;

  reader = perf_reader_new(raw_cb, lost_cb, cb_cookie, page_cnt);
  if (!reader)
    goto error;

  // Literal 10, presumably so this builds against perf_event.h headers that
  // predate the PERF_COUNT_SW_BPF_OUTPUT enum value — confirm.
  attr.config = 10;//PERF_COUNT_SW_BPF_OUTPUT;
  attr.type = PERF_TYPE_SOFTWARE;
  attr.sample_type = PERF_SAMPLE_RAW;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    fprintf(stderr, "   (check your kernel for PERF_COUNT_SW_BPF_OUTPUT support, 4.4 or newer)\n");
    goto error;
  }
  perf_reader_set_fd(reader, pfd);

  if (perf_reader_mmap(reader) < 0)
    goto error;

  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    goto error;
  }

  return reader;

error:
  // perf_reader_free is assumed to close any fd already handed to the
  // reader via perf_reader_set_fd — confirm in perf_reader.c.
  if (reader)
    perf_reader_free(reader);

  return NULL;
}
+
/*
 * Validate a (type, config) pair for perf_event_open. Returns 1 (and prints
 * diagnostics to stderr) when the pair is known-bad, 0 otherwise. Unknown
 * type values are accepted and left for the kernel to reject.
 */
static int invalid_perf_config(uint32_t type, uint64_t config) {
  const char *reason = NULL;

  switch (type) {
  case PERF_TYPE_HARDWARE:
    if (config >= PERF_COUNT_HW_MAX)
      reason = "HARDWARE perf event config out of range\n";
    break;
  case PERF_TYPE_SOFTWARE:
    if (config >= PERF_COUNT_SW_MAX)
      reason = "SOFTWARE perf event config out of range\n";
    else if (config == 10 /* PERF_COUNT_SW_BPF_OUTPUT */)
      reason = "Unable to open or attach perf event for BPF_OUTPUT\n";
    break;
  case PERF_TYPE_HW_CACHE:
    // config packs result(16..) | op(8..15) | cache-id(0..7).
    if (((config >> 16) >= PERF_COUNT_HW_CACHE_RESULT_MAX) ||
        (((config >> 8) & 0xff) >= PERF_COUNT_HW_CACHE_OP_MAX) ||
        ((config & 0xff) >= PERF_COUNT_HW_CACHE_MAX))
      reason = "HW_CACHE perf event config out of range\n";
    break;
  case PERF_TYPE_TRACEPOINT:
  case PERF_TYPE_BREAKPOINT:
    reason = "Unable to open or attach TRACEPOINT or BREAKPOINT events\n";
    break;
  default:
    break;
  }

  if (!reason)
    return 0;

  fprintf(stderr, "%s", reason);
  fprintf(stderr, "Invalid perf event type %" PRIu32 " config %" PRIu64 "\n",
          type, config);
  return 1;
}
+
/*
 * Open and enable a perf event of (type, config) on pid/cpu, after
 * validating the pair with invalid_perf_config().
 * Returns the event fd, or -1 on failure.
 */
int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu) {
  int fd;
  struct perf_event_attr attr = {};

  if (invalid_perf_config(type, config)) {
    return -1;
  }

  // LONG_MAX effectively disables sampling; presumably the event is meant
  // to be read as a counter rather than to generate samples — confirm with
  // callers.
  attr.sample_period = LONG_MAX;
  attr.type = type;
  attr.config = config;

  fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (fd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    return -1;
  }

  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    close(fd);
    return -1;
  }

  return fd;
}
+
/*
 * Attach the XDP program progfd to network device dev_name by sending an
 * RTM_SETLINK netlink message carrying a nested IFLA_XDP attribute.
 * Non-zero flags are forwarded in an IFLA_XDP_FLAGS sub-attribute.
 * Returns 0 on success, -1 on failure (errno set, diagnostic on stderr).
 */
int bpf_attach_xdp(const char *dev_name, int progfd, uint32_t flags) {
    struct sockaddr_nl sa;
    int sock, seq = 0, len, ret = -1;
    char buf[4096];
    struct nlattr *nla, *nla_xdp;
    struct {
        struct nlmsghdr  nh;
        struct ifinfomsg ifinfo;
        char             attrbuf[64];
    } req;
    struct nlmsghdr *nh;
    struct nlmsgerr *err;
    socklen_t addrlen;

    memset(&sa, 0, sizeof(sa));
    sa.nl_family = AF_NETLINK;

    sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
    if (sock < 0) {
        fprintf(stderr, "bpf: opening a netlink socket: %s\n", strerror(errno));
        return -1;
    }

    if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
        fprintf(stderr, "bpf: bind to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    // Learn the kernel-assigned netlink address so replies can be validated
    // against it below.
    addrlen = sizeof(sa);
    if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) {
        fprintf(stderr, "bpf: get sock name of netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    if (addrlen != sizeof(sa)) {
        fprintf(stderr, "bpf: wrong netlink address length: %d\n", addrlen);
        goto cleanup;
    }

    memset(&req, 0, sizeof(req));
    req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
    req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
    req.nh.nlmsg_type = RTM_SETLINK;
    req.nh.nlmsg_pid = 0;
    req.nh.nlmsg_seq = ++seq;
    req.ifinfo.ifi_family = AF_UNSPEC;
    req.ifinfo.ifi_index = if_nametoindex(dev_name);
    if (req.ifinfo.ifi_index == 0) {
        fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
        goto cleanup;
    }

    // Build the nested IFLA_XDP attribute directly after the ifinfo header.
    nla = (struct nlattr *)(((char *)&req)
                            + NLMSG_ALIGN(req.nh.nlmsg_len));
    nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;

    nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
    nla->nla_len = NLA_HDRLEN;

    // we specify the FD passed over by the user
    nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
    nla_xdp->nla_len = NLA_HDRLEN + sizeof(progfd);
    memcpy((char *)nla_xdp + NLA_HDRLEN, &progfd, sizeof(progfd));
    nla->nla_len += nla_xdp->nla_len;

    // parse flags as passed by the user
    if (flags) {
        nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
        nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
        nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
        memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
        nla->nla_len += nla_xdp->nla_len;
    }

    req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);

    if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
        fprintf(stderr, "bpf: send to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    len = recv(sock, buf, sizeof(buf), 0);
    if (len < 0) {
        fprintf(stderr, "bpf: recv from netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    // Walk the reply; an NLMSG_ERROR whose err->error == 0 is the ACK.
    for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, (unsigned int)len);
         nh = NLMSG_NEXT(nh, len)) {
        if (nh->nlmsg_pid != sa.nl_pid) {
            fprintf(stderr, "bpf: Wrong pid %u, expected %u\n",
                   nh->nlmsg_pid, sa.nl_pid);
            errno = EBADMSG;
            goto cleanup;
        }
        if (nh->nlmsg_seq != (unsigned int)seq) {
            fprintf(stderr, "bpf: Wrong seq %d, expected %d\n",
                   nh->nlmsg_seq, seq);
            errno = EBADMSG;
            goto cleanup;
        }
        switch (nh->nlmsg_type) {
            case NLMSG_ERROR:
                err = (struct nlmsgerr *)NLMSG_DATA(nh);
                if (!err->error)
                    continue;
                fprintf(stderr, "bpf: nlmsg error %s\n", strerror(-err->error));
                errno = -err->error;
                goto cleanup;
            case NLMSG_DONE:
                // NOTE(review): this break exits the switch only, not the
                // recv loop; the loop then terminates via NLMSG_OK.
                break;
        }
    }

    ret = 0;

cleanup:
    close(sock);
    return ret;
}
+
// Open a perf event from the caller-populated perf_event_attr (taken as
// void *, presumably to decouple the ABI from the caller's perf_event.h
// version — confirm), attach BPF program progfd to it, and enable it.
// extra_flags is OR-ed into PERF_FLAG_FD_CLOEXEC for perf_event_open.
// Returns the event fd; on any failure the fd is closed and -1 returned.
int bpf_attach_perf_event_raw(int progfd, void *perf_event_attr, pid_t pid,
                              int cpu, int group_fd, unsigned long extra_flags) {
  int fd = syscall(__NR_perf_event_open, perf_event_attr, pid, cpu, group_fd,
                   PERF_FLAG_FD_CLOEXEC | extra_flags);
  if (fd < 0) {
    perror("perf_event_open failed");
    return -1;
  }
  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, progfd) != 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF) failed");
    close(fd);
    return -1;
  }
  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE) failed");
    close(fd);
    return -1;
  }

  return fd;
}
+
/*
 * Create a perf event of (ev_type, ev_config), sampling either every
 * sample_period events or at sample_freq Hz — exactly one of the two must
 * be non-zero — and attach BPF program progfd to it.
 * Returns the perf event fd, or -1 on error.
 */
int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
                          uint64_t sample_period, uint64_t sample_freq,
                          pid_t pid, int cpu, int group_fd) {
  struct perf_event_attr attr = {};

  if (invalid_perf_config(ev_type, ev_config))
    return -1;
  if ((sample_period > 0) == (sample_freq > 0)) {
    fprintf(stderr,
            "Exactly one of sample_period / sample_freq should be set\n");
    return -1;
  }

  attr.type = ev_type;
  attr.config = ev_config;
  // Sample child tasks too when profiling a specific process.
  attr.inherit = (pid > 0);
  if (sample_freq > 0) {
    attr.freq = 1;
    attr.sample_freq = sample_freq;
  } else {
    attr.sample_period = sample_period;
  }

  return bpf_attach_perf_event_raw(progfd, &attr, pid, cpu, group_fd, 0);
}
+
/*
 * Disable and close a perf event fd obtained from the bpf_attach_* /
 * bpf_open_perf_event helpers. Negative fds are silently ignored.
 * Returns 0 on success, otherwise the first failing ioctl/close result.
 */
int bpf_close_perf_event_fd(int fd) {
  int error = 0;

  if (fd < 0)
    return 0;

  int res = ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
  if (res != 0) {
    perror("ioctl(PERF_EVENT_IOC_DISABLE) failed");
    error = res;
  }

  res = close(fd);
  if (res != 0) {
    perror("close perf event FD failed");
    if (!error)
      error = res;
  }

  return error;
}
+
+int bpf_obj_pin(int fd, const char *pathname)
+{
+  union bpf_attr attr;
+
+  memset(&attr, 0, sizeof(attr));
+  attr.pathname = ptr_to_u64((void *)pathname);
+  attr.bpf_fd = fd;
+
+  return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
+}
+
+int bpf_obj_get(const char *pathname)
+{
+  union bpf_attr attr;
+
+  memset(&attr, 0, sizeof(attr));
+  attr.pathname = ptr_to_u64((void *)pathname);
+
+  return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
+}
+
+int bpf_prog_get_next_id(uint32_t start_id, uint32_t *next_id)
+{
+  union bpf_attr attr;
+  int err;
+
+  memset(&attr, 0, sizeof(attr));
+  attr.start_id = start_id;
+
+  err = syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
+  if (!err)
+    *next_id = attr.next_id;
+
+  return err;
+}
+
+int bpf_prog_get_fd_by_id(uint32_t id)
+{
+  union bpf_attr attr;
+
+  memset(&attr, 0, sizeof(attr));
+  attr.prog_id = id;
+
+  return syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
+}
+
+int bpf_map_get_fd_by_id(uint32_t id)
+{
+  union bpf_attr attr;
+
+  memset(&attr, 0, sizeof(attr));
+  attr.map_id = id;
+
+  return syscall(__NR_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
+}
diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h
new file mode 100644
index 0000000..2728b29
--- /dev/null
+++ b/src/cc/libbpf.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* eBPF mini library */
+
+#ifndef LIBBPF_H
+#define LIBBPF_H
+
+#include "compat/linux/bpf.h"
+#include <stdint.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
// Probe attach point: BPF_PROBE_ENTRY fires on function entry
// ([k,u]probe), BPF_PROBE_RETURN on function return ([k,u]retprobe).
enum bpf_probe_attach_type {
	BPF_PROBE_ENTRY,
	BPF_PROBE_RETURN
};
+
+int bpf_create_map(enum bpf_map_type map_type, const char *name,
+                   int key_size, int value_size, int max_entries,
+                   int map_flags);
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
+int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_delete_elem(int fd, void *key);
+int bpf_get_first_key(int fd, void *key, size_t key_size);
+int bpf_get_next_key(int fd, void *key, void *next_key);
+
+/*
+ * Load a BPF program, and return the FD of the loaded program.
+ *
+ * On newer Kernels, the parameter name is used to identify the loaded program
+ * for inspection and debugging. It could be different from the function name.
+ *
+ * If log_level has value greater than 0, or the load failed, it will enable
+ * extra logging of loaded BPF bytecode and register status, and will print the
+ * logging message to stderr. In such cases:
 *   - If log_buf and log_buf_size are provided, it will use and also write the
 *     log messages to the provided log_buf. If log_buf is insufficient in size,
 *     it will not do any additional memory allocation.
 *   - Otherwise, it will allocate an internal temporary buffer for log message
 *     printing, and continue to attempt increasing that allocated buffer size
 *     if the initial attempt was insufficient in size.
+ */
+int bpf_prog_load(enum bpf_prog_type prog_type, const char *name,
+                  const struct bpf_insn *insns, int insn_len,
+                  const char *license, unsigned kern_version,
+                  int log_level, char *log_buf, unsigned log_buf_size);
+
+int bpf_attach_socket(int sockfd, int progfd);
+
+/* create RAW socket. If name is not NULL/a non-empty null-terminated string,
+ * bind the raw socket to the interface 'name' */
+int bpf_open_raw_sock(const char *name);
+
+typedef void (*perf_reader_raw_cb)(void *cb_cookie, void *raw, int raw_size);
+typedef void (*perf_reader_lost_cb)(void *cb_cookie, uint64_t lost);
+
+int bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type,
+                      const char *ev_name, const char *fn_name, uint64_t fn_offset);
+int bpf_detach_kprobe(const char *ev_name);
+
+int bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type,
+                      const char *ev_name, const char *binary_path,
+                      uint64_t offset, pid_t pid);
+int bpf_detach_uprobe(const char *ev_name);
+
+int bpf_attach_tracepoint(int progfd, const char *tp_category,
+                          const char *tp_name);
+int bpf_detach_tracepoint(const char *tp_category, const char *tp_name);
+
+int bpf_attach_raw_tracepoint(int progfd, char *tp_name);
+
+void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
+                            perf_reader_lost_cb lost_cb, void *cb_cookie,
+                            int pid, int cpu, int page_cnt);
+
/* attach a prog expressed by progfd to the device specified in dev_name */
+int bpf_attach_xdp(const char *dev_name, int progfd, uint32_t flags);
+
+// attach a prog expressed by progfd to run on a specific perf event. The perf
+// event will be created using the perf_event_attr pointer provided.
+int bpf_attach_perf_event_raw(int progfd, void *perf_event_attr, pid_t pid,
+                              int cpu, int group_fd, unsigned long extra_flags);
+// attach a prog expressed by progfd to run on a specific perf event, with
+// certain sample period or sample frequency
+int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
+                          uint64_t sample_period, uint64_t sample_freq,
+                          pid_t pid, int cpu, int group_fd);
+
+int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu);
+
+int bpf_close_perf_event_fd(int fd);
+
+int bpf_obj_pin(int fd, const char *pathname);
+int bpf_obj_get(const char *pathname);
+int bpf_obj_get_info(int prog_map_fd, void *info, uint32_t *info_len);
+int bpf_prog_compute_tag(const struct bpf_insn *insns, int prog_len,
+                         unsigned long long *tag);
+int bpf_prog_get_tag(int fd, unsigned long long *tag);
+int bpf_prog_get_next_id(uint32_t start_id, uint32_t *next_id);
+int bpf_prog_get_fd_by_id(uint32_t id);
+int bpf_map_get_fd_by_id(uint32_t id);
+
+#define LOG_BUF_SIZE 65536
+
+// Put non-static/inline functions in their own section with this prefix +
+// fn_name to enable discovery by the bcc library.
+#define BPF_FN_PREFIX ".bpf.fn."
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_OP(OP) | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU | BPF_OP(OP) | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
+#define BPF_LD_IMM64(DST, IMM)					\
+	BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_LD | BPF_DW | BPF_IMM,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = 0,					\
+		.imm   = (__u32) (IMM) }),			\
+	((struct bpf_insn) {					\
+		.code  = 0, /* zero is reserved opcode */	\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = ((__u64) (IMM)) >> 32 })
+
+#define BPF_PSEUDO_MAP_FD	1
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD)				\
+	BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM)					\
+	((struct bpf_insn) {					\
+		.code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,	\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = IMM })
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,	\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_OP(OP) | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_OP(OP) | BPF_K,		\
+		.dst_reg = DST,					\
+		.src_reg = 0,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)			\
+	((struct bpf_insn) {					\
+		.code  = CODE,					\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN()						\
+	((struct bpf_insn) {					\
+		.code  = BPF_JMP | BPF_EXIT,			\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/cc/link_all.cc b/src/cc/link_all.cc
new file mode 100644
index 0000000..e03ea76
--- /dev/null
+++ b/src/cc/link_all.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2017 VMware, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <cstdlib>
+
+#include "bcc_usdt.h"
+
+namespace {
+  // Take this trick from llvm for forcing exported functions in helper
+  // libraries to be included in the final .so
+  struct LinkAll {
+    LinkAll() {
+      // getenv never returns -1, but compiler doesn't know!
+      if (::getenv("bar") != (char *)-1)
+        return;
+
+      (void)bcc_usdt_new_frompid(-1, nullptr);
+      (void)bcc_usdt_new_frompath(nullptr);
+      (void)bcc_usdt_close(nullptr);
+    }
+  } LinkAll;  // declare one instance to invoke the constructor
+}
diff --git a/src/cc/ns_guard.cc b/src/cc/ns_guard.cc
new file mode 100644
index 0000000..c5baf5a
--- /dev/null
+++ b/src/cc/ns_guard.cc
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2017 Facebook, Inc.
+ * Copyright (c) 2017 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <string>
+
+#include "ns_guard.h"
+
+// TODO: Remove this when CentOS 6 support is not needed anymore
+#include "setns.h"
+
+ProcMountNS::ProcMountNS(int pid) : target_ino_(0) {
+  if (pid < 0)
+    return;
+
+  std::string target_path = "/proc/" + std::to_string(pid) + "/ns/mnt";
+  ebpf::FileDesc target_fd(open(target_path.c_str(), O_RDONLY));
+  ebpf::FileDesc self_fd(open("/proc/self/ns/mnt", O_RDONLY));
+
+  if (self_fd < 0 || target_fd < 0)
+    return;
+
+  struct stat self_stat, target_stat;
+  if (fstat(self_fd, &self_stat) != 0)
+    return;
+  if (fstat(target_fd, &target_stat) != 0)
+    return;
+
+  target_ino_ = target_stat.st_ino;
+  if (self_stat.st_ino == target_stat.st_ino)
+    // Both the current and target processes are in the same mount namespace
+    return;
+
+  self_fd_ = std::move(self_fd);
+  target_fd_ = std::move(target_fd);
+}
+
+ProcMountNSGuard::ProcMountNSGuard(ProcMountNS *mount_ns)
+    : mount_ns_instance_(nullptr), mount_ns_(mount_ns), entered_(false) {
+  init();
+}
+
+ProcMountNSGuard::ProcMountNSGuard(int pid)
+    : mount_ns_instance_(pid > 0 ? new ProcMountNS(pid) : nullptr),
+      mount_ns_(mount_ns_instance_.get()),
+      entered_(false) {
+  init();
+}
+
+void ProcMountNSGuard::init() {
+  if (!mount_ns_ || mount_ns_->self() < 0 || mount_ns_->target() < 0)
+    return;
+
+  if (setns(mount_ns_->target(), CLONE_NEWNS) == 0)
+    entered_ = true;
+}
+
+ProcMountNSGuard::~ProcMountNSGuard() {
+  if (mount_ns_ && entered_ && mount_ns_->self() >= 0)
+    setns(mount_ns_->self(), CLONE_NEWNS);
+}
diff --git a/src/cc/ns_guard.h b/src/cc/ns_guard.h
new file mode 100644
index 0000000..ce4b61b
--- /dev/null
+++ b/src/cc/ns_guard.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2017 Facebook, Inc.
+ * Copyright (c) 2017 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <sys/types.h>
+
+#include "file_desc.h"
+
+class ProcMountNSGuard;
+
+// ProcMountNS opens an fd corresponding to the current mount namespace and the
+// mount namespace of the target process.
+// The fds will remain uninitialized (<0) if the open fails, or if the current
+// and target namespaces are identical.
+class ProcMountNS {
+ public:
+  explicit ProcMountNS(int pid);
+  int self() const { return self_fd_; }
+  int target() const { return target_fd_; }
+  ino_t target_ino() const { return target_ino_; }
+
+ private:
+  ebpf::FileDesc self_fd_;
+  ebpf::FileDesc target_fd_;
+  ino_t target_ino_;
+};
+
+// ProcMountNSGuard switches to the target mount namespace and restores the
+// original upon going out of scope.
+class ProcMountNSGuard {
+ public:
+  explicit ProcMountNSGuard(ProcMountNS *mount_ns);
+  explicit ProcMountNSGuard(int pid);
+
+  ~ProcMountNSGuard();
+
+ private:
+  void init();
+
+  std::unique_ptr<ProcMountNS> mount_ns_instance_;
+  ProcMountNS *mount_ns_;
+  bool entered_;
+};
diff --git a/src/cc/perf_reader.c b/src/cc/perf_reader.c
new file mode 100644
index 0000000..3cab015
--- /dev/null
+++ b/src/cc/perf_reader.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <inttypes.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+
+#include "libbpf.h"
+#include "perf_reader.h"
+
+enum {
+  RB_NOT_USED = 0, // ring buffer not used
+  RB_USED_IN_MUNMAP = 1, // used in munmap
+  RB_USED_IN_READ = 2, // used in read
+};
+
+struct perf_reader {
+  perf_reader_raw_cb raw_cb;
+  perf_reader_lost_cb lost_cb;
+  void *cb_cookie; // to be returned in the cb
+  void *buf; // for keeping segmented data
+  size_t buf_size;
+  void *base;
+  int rb_use_state;
+  pid_t rb_read_tid;
+  int page_size;
+  int page_cnt;
+  int fd;
+};
+
+struct perf_reader * perf_reader_new(perf_reader_raw_cb raw_cb,
+                                     perf_reader_lost_cb lost_cb,
+                                     void *cb_cookie, int page_cnt) {
+  struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
+  if (!reader)
+    return NULL;
+  reader->raw_cb = raw_cb;
+  reader->lost_cb = lost_cb;
+  reader->cb_cookie = cb_cookie;
+  reader->fd = -1;
+  reader->page_size = getpagesize();
+  reader->page_cnt = page_cnt;
+  return reader;
+}
+
+void perf_reader_free(void *ptr) {
+  if (ptr) {
+    struct perf_reader *reader = ptr;
+    pid_t tid = syscall(__NR_gettid);
+    while (!__sync_bool_compare_and_swap(&reader->rb_use_state, RB_NOT_USED, RB_USED_IN_MUNMAP)) {
+      // If it is the same thread, we are being called from the callback handler; no locking needed
+      if (tid == reader->rb_read_tid)
+        break;
+    }
+    munmap(reader->base, reader->page_size * (reader->page_cnt + 1));
+    if (reader->fd >= 0) {
+      ioctl(reader->fd, PERF_EVENT_IOC_DISABLE, 0);
+      close(reader->fd);
+    }
+    free(reader->buf);
+    free(ptr);
+  }
+}
+
+int perf_reader_mmap(struct perf_reader *reader) {
+  int mmap_size = reader->page_size * (reader->page_cnt + 1);
+
+  if (reader->fd < 0) {
+    fprintf(stderr, "%s: reader fd is not set\n", __FUNCTION__);
+    return -1;
+  }
+
+  reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE , MAP_SHARED, reader->fd, 0);
+  if (reader->base == MAP_FAILED) {
+    perror("mmap");
+    return -1;
+  }
+
+  return 0;
+}
+
+struct perf_sample_trace_common {
+  uint16_t id;
+  uint8_t flags;
+  uint8_t preempt_count;
+  int pid;
+};
+
+struct perf_sample_trace_kprobe {
+  struct perf_sample_trace_common common;
+  uint64_t ip;
+};
+
+static void parse_sw(struct perf_reader *reader, void *data, int size) {
+  uint8_t *ptr = data;
+  struct perf_event_header *header = (void *)data;
+
+  struct {
+      uint32_t size;
+      char data[0];
+  } *raw = NULL;
+
+  ptr += sizeof(*header);
+  if (ptr > (uint8_t *)data + size) {
+    fprintf(stderr, "%s: corrupt sample header\n", __FUNCTION__);
+    return;
+  }
+
+  raw = (void *)ptr;
+  ptr += sizeof(raw->size) + raw->size;
+  if (ptr > (uint8_t *)data + size) {
+    fprintf(stderr, "%s: corrupt raw sample\n", __FUNCTION__);
+    return;
+  }
+
+  // sanity check
+  if (ptr != (uint8_t *)data + size) {
+    fprintf(stderr, "%s: extra data at end of sample\n", __FUNCTION__);
+    return;
+  }
+
+  if (reader->raw_cb)
+    reader->raw_cb(reader->cb_cookie, raw->data, raw->size);
+}
+
+static uint64_t read_data_head(volatile struct perf_event_mmap_page *perf_header) {
+  uint64_t data_head = perf_header->data_head;
+  asm volatile("" ::: "memory");
+  return data_head;
+}
+
+static void write_data_tail(volatile struct perf_event_mmap_page *perf_header, uint64_t data_tail) {
+  asm volatile("" ::: "memory");
+  perf_header->data_tail = data_tail;
+}
+
+void perf_reader_event_read(struct perf_reader *reader) {
+  volatile struct perf_event_mmap_page *perf_header = reader->base;
+  uint64_t buffer_size = (uint64_t)reader->page_size * reader->page_cnt;
+  uint64_t data_head;
+  uint8_t *base = (uint8_t *)reader->base + reader->page_size;
+  uint8_t *sentinel = (uint8_t *)reader->base + buffer_size + reader->page_size;
+  uint8_t *begin, *end;
+
+  reader->rb_read_tid = syscall(__NR_gettid);
+  if (!__sync_bool_compare_and_swap(&reader->rb_use_state, RB_NOT_USED, RB_USED_IN_READ))
+    return;
+
+  // Consume all the events on this ring, calling the cb function for each one.
+  // The message may fall on the ring boundary, in which case copy the message
+  // into a malloced buffer.
+  for (data_head = read_data_head(perf_header); perf_header->data_tail != data_head;
+      data_head = read_data_head(perf_header)) {
+    uint64_t data_tail = perf_header->data_tail;
+    uint8_t *ptr;
+
+    begin = base + data_tail % buffer_size;
+    // event header is u64, won't wrap
+    struct perf_event_header *e = (void *)begin;
+    ptr = begin;
+    end = base + (data_tail + e->size) % buffer_size;
+    if (end < begin) {
+      // perf event wraps around the ring, make a contiguous copy
+      reader->buf = realloc(reader->buf, e->size);
+      size_t len = sentinel - begin;
+      memcpy(reader->buf, begin, len);
+      memcpy((void *)((unsigned long)reader->buf + len), base, e->size - len);
+      ptr = reader->buf;
+    }
+
+    if (e->type == PERF_RECORD_LOST) {
+      /*
+       * struct {
+       *    struct perf_event_header    header;
+       *    u64                id;
+       *    u64                lost;
+       *    struct sample_id        sample_id;
+       * };
+       */
+      uint64_t lost = *(uint64_t *)(ptr + sizeof(*e) + sizeof(uint64_t));
+      if (reader->lost_cb) {
+        reader->lost_cb(reader->cb_cookie, lost);
+      } else {
+        fprintf(stderr, "Possibly lost %" PRIu64 " samples\n", lost);
+      }
+    } else if (e->type == PERF_RECORD_SAMPLE) {
+      parse_sw(reader, ptr, e->size);
+    } else {
+      fprintf(stderr, "%s: unknown sample type %d\n", __FUNCTION__, e->type);
+    }
+
+    write_data_tail(perf_header, perf_header->data_tail + e->size);
+  }
+  reader->rb_use_state = RB_NOT_USED;
+  __sync_synchronize();
+  reader->rb_read_tid = 0;
+}
+
+int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout) {
+  struct pollfd pfds[num_readers];
+  int i;
+
+  for (i = 0; i <num_readers; ++i) {
+    pfds[i].fd = readers[i]->fd;
+    pfds[i].events = POLLIN;
+  }
+
+  if (poll(pfds, num_readers, timeout) > 0) {
+    for (i = 0; i < num_readers; ++i) {
+      if (pfds[i].revents & POLLIN)
+        perf_reader_event_read(readers[i]);
+    }
+  }
+  return 0;
+}
+
+void perf_reader_set_fd(struct perf_reader *reader, int fd) {
+  reader->fd = fd;
+}
+
+int perf_reader_fd(struct perf_reader *reader) {
+  return reader->fd;
+}
diff --git a/src/cc/perf_reader.h b/src/cc/perf_reader.h
new file mode 100644
index 0000000..dbe9cfb
--- /dev/null
+++ b/src/cc/perf_reader.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PERF_READER_H
+#define PERF_READER_H
+
+#include "libbpf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct perf_reader;
+
+struct perf_reader * perf_reader_new(perf_reader_raw_cb raw_cb,
+                                     perf_reader_lost_cb lost_cb,
+                                     void *cb_cookie, int page_cnt);
+void perf_reader_free(void *ptr);
+int perf_reader_mmap(struct perf_reader *reader);
+void perf_reader_event_read(struct perf_reader *reader);
+int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
+int perf_reader_fd(struct perf_reader *reader);
+void perf_reader_set_fd(struct perf_reader *reader, int fd);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/cc/setns.h b/src/cc/setns.h
new file mode 100644
index 0000000..e198303
--- /dev/null
+++ b/src/cc/setns.h
@@ -0,0 +1,12 @@
+// This file is only needed to support build for CentOS 6
+// Remove it when no longer needed.
+// File is trivial and therefore is in public domain.
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#define setns(FD, NSTYPE) syscall(__NR_setns, (int)(FD), (int)(NSTYPE))
diff --git a/src/cc/shared_table.cc b/src/cc/shared_table.cc
new file mode 100644
index 0000000..29744a5
--- /dev/null
+++ b/src/cc/shared_table.cc
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <unistd.h>
+#include <iostream>
+
+#include "common.h"
+#include "compat/linux/bpf.h"
+#include "table_storage.h"
+#include "table_storage_impl.h"
+
+namespace ebpf {
+
+using std::string;
+using std::unique_ptr;
+
+/// A process-wide singleton of shared tables
+class SharedTableStorage : public TableStorageImpl {
+ public:
+  class iterator : public TableStorageIteratorImpl {
+    std::map<string, TableDesc>::iterator it_;
+
+   public:
+    explicit iterator(const std::map<string, TableDesc>::iterator &it) : it_(it) {}
+    virtual ~iterator() {}
+    virtual unique_ptr<self_type> clone() const override { return make_unique<iterator>(it_); }
+    virtual self_type &operator++() override {
+      ++it_;
+      return *this;
+    }
+    virtual value_type &operator*() const override { return *it_; }
+    virtual pointer operator->() const override { return &*it_; }
+  };
+  virtual ~SharedTableStorage() {}
+  virtual bool Find(const string &name, TableStorage::iterator &result) const override;
+  virtual bool Insert(const string &name, TableDesc &&desc) override;
+  virtual bool Delete(const string &name) override;
+  virtual unique_ptr<TableStorageIteratorImpl> begin() override;
+  virtual unique_ptr<TableStorageIteratorImpl> end() override;
+  virtual unique_ptr<TableStorageIteratorImpl> lower_bound(const string &k) override;
+  virtual unique_ptr<TableStorageIteratorImpl> upper_bound(const string &k) override;
+  virtual unique_ptr<TableStorageIteratorImpl> erase(const TableStorageIteratorImpl &it) override;
+
+ private:
+  static std::map<string, TableDesc> tables_;
+};
+
+bool SharedTableStorage::Find(const string &name, TableStorage::iterator &result) const {
+  auto it = tables_.find(name);
+  if (it == tables_.end())
+    return false;
+  result = TableStorage::iterator(make_unique<iterator>(it));
+  return true;
+}
+
+bool SharedTableStorage::Insert(const string &name, TableDesc &&desc) {
+  auto it = tables_.find(name);
+  if (it != tables_.end())
+    return false;
+  tables_[name] = std::move(desc);
+  return true;
+}
+
+bool SharedTableStorage::Delete(const string &name) {
+  auto it = tables_.find(name);
+  if (it == tables_.end())
+    return false;
+  tables_.erase(it);
+  return true;
+}
+
+unique_ptr<TableStorageIteratorImpl> SharedTableStorage::begin() {
+  return make_unique<iterator>(tables_.begin());
+}
+unique_ptr<TableStorageIteratorImpl> SharedTableStorage::end() {
+  return make_unique<iterator>(tables_.end());
+}
+
+unique_ptr<TableStorageIteratorImpl> SharedTableStorage::lower_bound(const string &k) {
+  return make_unique<iterator>(tables_.lower_bound(k));
+}
+unique_ptr<TableStorageIteratorImpl> SharedTableStorage::upper_bound(const string &k) {
+  return make_unique<iterator>(tables_.upper_bound(k));
+}
+unique_ptr<TableStorageIteratorImpl> SharedTableStorage::erase(const TableStorageIteratorImpl &it) {
+  auto i = tables_.find((*it).first);
+  if (i == tables_.end())
+    return unique_ptr<iterator>();
+  return make_unique<iterator>(tables_.erase(i));
+}
+
+// All maps for this process are kept in global static storage.
+std::map<string, TableDesc> SharedTableStorage::tables_;
+
+unique_ptr<TableStorage> createSharedTableStorage() {
+  auto t = make_unique<TableStorage>();
+  t->Init(make_unique<SharedTableStorage>());
+  t->AddMapTypesVisitor(createJsonMapTypesVisitor());
+  return t;
+}
+}
diff --git a/src/cc/syms.h b/src/cc/syms.h
new file mode 100644
index 0000000..d7dabfa
--- /dev/null
+++ b/src/cc/syms.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <sys/types.h>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "bcc_syms.h"
+#include "file_desc.h"
+#include "ns_guard.h"
+
+class ProcStat {
+  std::string procfs_;
+  ino_t inode_;
+  ino_t getinode_();
+
+public:
+  ProcStat(int pid);
+  bool is_stale();
+  void reset() { inode_ = getinode_(); }
+};
+
+class SymbolCache {
+public:
+  virtual ~SymbolCache() = default;
+
+  virtual void refresh() = 0;
+  virtual bool resolve_addr(uint64_t addr, struct bcc_symbol *sym, bool demangle = true) = 0;
+  virtual bool resolve_name(const char *module, const char *name,
+                            uint64_t *addr) = 0;
+};
+
+class KSyms : SymbolCache {
+  struct Symbol {
+    Symbol(const char *name, uint64_t addr) : name(name), addr(addr) {}
+    std::string name;
+    uint64_t addr;
+
+    bool operator<(const Symbol &rhs) const { return addr < rhs.addr; }
+  };
+
+  std::vector<Symbol> syms_;
+  std::unordered_map<std::string, uint64_t> symnames_;
+  static void _add_symbol(const char *, uint64_t, void *);
+
+public:
+  virtual bool resolve_addr(uint64_t addr, struct bcc_symbol *sym, bool demangle = true);
+  virtual bool resolve_name(const char *unused, const char *name,
+                            uint64_t *addr);
+  virtual void refresh();
+};
+
+class ProcSyms : SymbolCache {
+  struct Symbol {
+    Symbol(const std::string *name, uint64_t start, uint64_t size)
+        : name(name), start(start), size(size) {}
+    const std::string *name;
+    uint64_t start;
+    uint64_t size;
+
+    bool operator<(const struct Symbol& rhs) const {
+      return start < rhs.start;
+    }
+  };
+
+  enum class ModuleType {
+    UNKNOWN,
+    EXEC,
+    SO,
+    PERF_MAP,
+    VDSO
+  };
+
+  struct Module {
+    struct Range {
+      uint64_t start;
+      uint64_t end;
+      uint64_t file_offset;
+      Range(uint64_t s, uint64_t e, uint64_t f)
+          : start(s), end(e), file_offset(f) {}
+    };
+
+    Module(const char *name, ProcMountNS *mount_ns,
+           struct bcc_symbol_option *option);
+
+    std::string name_;
+    std::vector<Range> ranges_;
+    bool loaded_;
+    ProcMountNS *mount_ns_;
+    bcc_symbol_option *symbol_option_;
+    ModuleType type_;
+
+    // The file offset within the ELF of the SO's first text section.
+    uint64_t elf_so_offset_;
+    uint64_t elf_so_addr_;
+
+    std::unordered_set<std::string> symnames_;
+    std::vector<Symbol> syms_;
+
+    void load_sym_table();
+
+    bool contains(uint64_t addr, uint64_t &offset) const;
+    uint64_t start() const { return ranges_.begin()->start; }
+
+    bool find_addr(uint64_t offset, struct bcc_symbol *sym);
+    bool find_name(const char *symname, uint64_t *addr);
+
+    static int _add_symbol(const char *symname, uint64_t start, uint64_t size,
+                           void *p);
+  };
+
+  int pid_;
+  std::vector<Module> modules_;
+  ProcStat procstat_;
+  std::unique_ptr<ProcMountNS> mount_ns_instance_;
+  bcc_symbol_option symbol_option_;
+
+  static int _add_load_sections(uint64_t v_addr, uint64_t mem_sz,
+                                uint64_t file_offset, void *payload);
+  static int _add_module(const char *, uint64_t, uint64_t, uint64_t, bool,
+                         void *);
+  void load_exe();
+  void load_modules();
+
+public:
+  ProcSyms(int pid, struct bcc_symbol_option *option = nullptr);
+  virtual void refresh();
+  virtual bool resolve_addr(uint64_t addr, struct bcc_symbol *sym, bool demangle = true);
+  virtual bool resolve_name(const char *module, const char *name,
+                            uint64_t *addr);
+};
diff --git a/src/cc/table_desc.h b/src/cc/table_desc.h
new file mode 100644
index 0000000..da0927f
--- /dev/null
+++ b/src/cc/table_desc.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <unistd.h>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "bcc_exception.h"
+#include "file_desc.h"
+
+namespace clang {
+class ASTContext;
+class QualType;
+}
+
+namespace ebpf {
+
+typedef std::function<StatusTuple(const char *, void *)> sscanf_fn;
+typedef std::function<StatusTuple(char *, size_t, const void *)> snprintf_fn;
+
+/// TableDesc uniquely stores all of the runtime state for an active bpf table.
+/// The copy constructor/assign operator are disabled since the file handles
+/// owned by this table are not implicitly copyable. One should call the dup()
+/// method if an explicit new handle is required. We define the move operators
+/// so that objects of this class can reside in stl containers.
+class TableDesc {
+ private:
+  TableDesc(const TableDesc &that)
+      : name(that.name),
+        fd(that.fd.dup()),
+        type(that.type),
+        key_size(that.key_size),
+        leaf_size(that.leaf_size),
+        max_entries(that.max_entries),
+        flags(that.flags),
+        key_desc(that.key_desc),
+        leaf_desc(that.leaf_desc),
+        key_sscanf(that.key_sscanf),
+        leaf_sscanf(that.leaf_sscanf),
+        key_snprintf(that.key_snprintf),
+        leaf_snprintf(that.leaf_snprintf),
+        is_shared(that.is_shared),
+        is_extern(that.is_extern) {}
+
+ public:
+  TableDesc()
+      : type(0),
+        key_size(0),
+        leaf_size(0),
+        max_entries(0),
+        flags(0),
+        is_shared(false),
+        is_extern(false) {}
+  TableDesc(const std::string &name, FileDesc &&fd, int type, size_t key_size,
+            size_t leaf_size, size_t max_entries, int flags)
+      : name(name),
+        fd(std::move(fd)),
+        type(type),
+        key_size(key_size),
+        leaf_size(leaf_size),
+        max_entries(max_entries),
+        flags(flags),
+        is_shared(false),
+        is_extern(false) {}
+  TableDesc(TableDesc &&that) = default;
+
+  TableDesc &operator=(TableDesc &&that) = default;
+  TableDesc &operator=(const TableDesc &that) = delete;
+
+  TableDesc dup() const { return TableDesc(*this); }
+
+  std::string name;
+  FileDesc fd;
+  int type;
+  size_t key_size;  // sizes are in bytes
+  size_t leaf_size;
+  size_t max_entries;
+  int flags;
+  std::string key_desc;
+  std::string leaf_desc;
+  sscanf_fn key_sscanf;
+  sscanf_fn leaf_sscanf;
+  snprintf_fn key_snprintf;
+  snprintf_fn leaf_snprintf;
+  bool is_shared;
+  bool is_extern;
+};
+
+/// MapTypesVisitor gets notified of new bpf tables, and has a chance to parse
+/// the key and leaf types for their own usage. Subclass this abstract class and
+/// implement the Visit method, then add an instance of this class to the
+/// StorageTable instance to be notified of each new key/leaf type.
+class MapTypesVisitor {
+ public:
+  virtual ~MapTypesVisitor() {}
+  virtual void Visit(TableDesc &desc, clang::ASTContext &C, clang::QualType key_type,
+                     clang::QualType leaf_type) = 0;
+};
+
+std::unique_ptr<MapTypesVisitor> createJsonMapTypesVisitor();
+
+}  // namespace ebpf
diff --git a/src/cc/table_storage.cc b/src/cc/table_storage.cc
new file mode 100644
index 0000000..2211401
--- /dev/null
+++ b/src/cc/table_storage.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <unistd.h>
+
+#include <clang/AST/Type.h>
+
+#include "table_storage_impl.h"
+
+namespace ebpf {
+
+using std::move;
+using std::string;
+using std::unique_ptr;
+
+const string Path::DELIM = "/";
+
+TableStorage::TableStorage() {}
+TableStorage::~TableStorage() {}
+void TableStorage::Init(unique_ptr<TableStorageImpl> impl) { impl_ = move(impl); }
+bool TableStorage::Find(const Path &path, TableStorage::iterator &result) const {
+  return impl_->Find(path.to_string(), result);
+}
+bool TableStorage::Insert(const Path &path, TableDesc &&desc) {
+  return impl_->Insert(path.to_string(), move(desc));
+}
+bool TableStorage::Delete(const Path &path) { return impl_->Delete(path.to_string()); }
+size_t TableStorage::DeletePrefix(const Path &path) {
+  size_t i = 0;
+  auto it = lower_bound(path);
+  auto upper = upper_bound(path);
+  while (it != upper) {
+    it = impl_->erase(*it.impl_);
+    ++i;
+  }
+  return i;
+}
+
+void TableStorage::AddMapTypesVisitor(unique_ptr<MapTypesVisitor> visitor) {
+  visitors_.push_back(move(visitor));
+}
+void TableStorage::VisitMapType(TableDesc &desc, clang::ASTContext &C, clang::QualType key_type,
+                                clang::QualType leaf_type) {
+  for (auto &v : visitors_)
+    v->Visit(desc, C, key_type, leaf_type);
+}
+
+TableStorage::iterator TableStorage::begin() { return impl_->begin(); }
+TableStorage::iterator TableStorage::end() { return impl_->end(); }
+TableStorage::iterator TableStorage::lower_bound(const Path &p) {
+  return impl_->lower_bound(p.to_string());
+}
+TableStorage::iterator TableStorage::upper_bound(const Path &p) {
+  return impl_->upper_bound(p.to_string() + "\x7f");
+}
+
+/// TableStorage::iterator implementation
+TableStorage::iterator::iterator() {}
+TableStorage::iterator::iterator(unique_ptr<TableStorageIteratorImpl> impl) : impl_(move(impl)) {}
+TableStorage::iterator::iterator(const iterator &that) : impl_(that.impl_->clone()) {}
+TableStorage::iterator::~iterator() {}
+TableStorage::iterator::iterator(iterator &&that) { *this = move(that); }
+TableStorage::iterator &TableStorage::iterator::operator=(iterator &&that) {
+  impl_ = move(that.impl_);
+  return *this;
+}
+
+TableStorage::iterator &TableStorage::iterator::operator++() {
+  ++*impl_;
+  return *this;
+}
+TableStorage::iterator TableStorage::iterator::operator++(int) {
+  iterator tmp(*this);
+  operator++();
+  return tmp;
+}
+bool TableStorage::iterator::operator==(const iterator &rhs) const {
+  // assumes that the underlying pair is stored in only one place
+  return &**impl_ == &**rhs.impl_;
+}
+bool TableStorage::iterator::operator!=(const iterator &rhs) const {
+  return &**impl_ != &**rhs.impl_;
+}
+TableStorage::iterator::reference TableStorage::iterator::operator*() const { return **impl_; }
+TableStorage::iterator::pointer TableStorage::iterator::operator->() const { return &**impl_; }
+
+}  // namespace ebpf
diff --git a/src/cc/table_storage.h b/src/cc/table_storage.h
new file mode 100644
index 0000000..87aaa33
--- /dev/null
+++ b/src/cc/table_storage.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2017 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "table_desc.h"
+
+namespace ebpf {
+
+class TableStorageImpl;
+class TableStorageIteratorImpl;
+
+// Hierarchical key used to address entries in a TableStorage. The parts are
+// joined with DELIM and every part is preceded by a delimiter, so
+// {"a", "b"} serializes to DELIM + "a" + DELIM + "b".
+class Path {
+ public:
+  static const std::string DELIM;
+  Path() = default;
+  Path(const Path &other) = default;
+  Path &operator=(const Path &other) = default;
+  Path(std::initializer_list<std::string> parts) {
+    // Reserve the exact final length: one delimiter per part plus the sum of
+    // the part lengths, avoiding reallocation during the append loop.
+    size_t len = parts.size() * DELIM.size();
+    for (const auto &s : parts)
+      len += s.size();
+    path_.reserve(len);
+    for (const auto &s : parts)
+      path_ += DELIM + s;
+  }
+  const std::string &to_string() const { return path_; }
+
+ private:
+  std::string path_;
+};
+
+// Facade over a pluggable (path -> TableDesc) key/value store. The concrete
+// backend is injected via Init(); see the factory functions below for the
+// available implementations.
+class TableStorage {
+ public:
+  /// iterator is an abstract class for traversing the map entries in a table
+  /// storage object.
+  class iterator {
+   private:
+    friend class TableStorage;
+    iterator(const iterator &);
+
+   public:
+    typedef std::pair<const std::string, TableDesc> value_type;
+    typedef std::ptrdiff_t difference_type;
+    typedef value_type *pointer;
+    typedef value_type &reference;
+    typedef std::forward_iterator_tag iterator_category;
+    typedef iterator self_type;
+
+    iterator();
+    iterator(std::unique_ptr<TableStorageIteratorImpl>);
+    ~iterator();
+    iterator(iterator &&);
+    iterator &operator=(iterator &&);
+    self_type &operator++();
+    self_type operator++(int);
+    bool operator==(const self_type &) const;
+    bool operator!=(const self_type &) const;
+    value_type &operator*() const;
+    pointer operator->() const;
+
+   private:
+    std::unique_ptr<TableStorageIteratorImpl> impl_;
+  };
+
+  TableStorage();
+  ~TableStorage();
+  // Install the storage backend; must be called before any other operation.
+  void Init(std::unique_ptr<TableStorageImpl>);
+
+  // Find returns true and fills `result` when `path` exists. DeletePrefix
+  // returns the number of entries removed.
+  bool Find(const Path &path, TableStorage::iterator &result) const;
+  bool Insert(const Path &path, TableDesc &&desc);
+  bool Delete(const Path &path);
+  size_t DeletePrefix(const Path &path);
+
+  // Visitors registered here are applied by VisitMapType (see .cc file).
+  void AddMapTypesVisitor(std::unique_ptr<MapTypesVisitor>);
+  void VisitMapType(TableDesc &desc, clang::ASTContext &C, clang::QualType key_type,
+                    clang::QualType leaf_type);
+  iterator begin();
+  iterator end();
+  iterator lower_bound(const Path &p);
+  iterator upper_bound(const Path &p);
+
+ private:
+  std::unique_ptr<TableStorageImpl> impl_;
+  std::vector<std::unique_ptr<MapTypesVisitor>> visitors_;
+};
+
+// Factory functions for the two storage backends: an in-process shared map
+// and a BPF-filesystem-backed store (implementations elsewhere).
+std::unique_ptr<TableStorage> createSharedTableStorage();
+std::unique_ptr<TableStorage> createBpfFsTableStorage();
+}
diff --git a/src/cc/table_storage_impl.h b/src/cc/table_storage_impl.h
new file mode 100644
index 0000000..401b404
--- /dev/null
+++ b/src/cc/table_storage_impl.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2017 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "table_storage.h"
+
+namespace ebpf {
+
+// Backend interface behind TableStorage::iterator: a clonable forward
+// iterator over (path string, TableDesc) pairs. Concrete subclasses are
+// provided by each storage backend.
+class TableStorageIteratorImpl {
+ public:
+  typedef std::pair<const std::string, TableDesc> value_type;
+  typedef value_type *pointer;
+  typedef value_type &reference;
+  typedef TableStorageIteratorImpl self_type;
+  virtual ~TableStorageIteratorImpl() {}
+  // Deep copy preserving the current position; used by iterator's copy ctor.
+  virtual std::unique_ptr<self_type> clone() const = 0;
+  virtual self_type &operator++() = 0;
+  virtual value_type &operator*() const = 0;
+  virtual pointer operator->() const = 0;
+
+ private:
+};
+
+// Backend interface behind TableStorage; keys are fully-serialized path
+// strings (see Path::to_string).
+class TableStorageImpl {
+ public:
+  // (fixed: removed the stray ';' that followed the empty destructor body —
+  // it formed a redundant empty declaration.)
+  virtual ~TableStorageImpl() {}
+  // Find returns true and fills `result` when `name` exists.
+  virtual bool Find(const std::string &name, TableStorage::iterator &result) const = 0;
+  virtual bool Insert(const std::string &name, TableDesc &&desc) = 0;
+  virtual bool Delete(const std::string &name) = 0;
+  virtual std::unique_ptr<TableStorageIteratorImpl> begin() = 0;
+  virtual std::unique_ptr<TableStorageIteratorImpl> end() = 0;
+  virtual std::unique_ptr<TableStorageIteratorImpl> lower_bound(const std::string &k) = 0;
+  virtual std::unique_ptr<TableStorageIteratorImpl> upper_bound(const std::string &k) = 0;
+  // Remove the entry `it` points at; NOTE(review): presumably returns an
+  // iterator to the successor — confirm against the implementations.
+  virtual std::unique_ptr<TableStorageIteratorImpl> erase(const TableStorageIteratorImpl &it) = 0;
+};
+
+}  // namespace ebpf
diff --git a/src/cc/usdt.h b/src/cc/usdt.h
new file mode 100644
index 0000000..6d89fd6
--- /dev/null
+++ b/src/cc/usdt.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "ns_guard.h"
+#include "syms.h"
+#include "vendor/optional.hpp"
+
+struct bcc_usdt;
+
+namespace ebpf {
+  class BPF;
+  class USDT;
+}
+
+namespace USDT {
+
+using std::experimental::optional;
+using std::experimental::nullopt;
+class ArgumentParser;
+
+static const std::string USDT_PROGRAM_HEADER =
+    "#include <uapi/linux/ptrace.h>\n";
+
+static const std::string COMPILER_BARRIER =
+    "__asm__ __volatile__(\"\": : :\"memory\");";
+
+// One operand of a USDT probe, decoded from the ELF note's argument format
+// string. Every field is optional: absent fields simply do not apply to the
+// operand (e.g. a pure constant has no registers).
+class Argument {
+private:
+  // Size in bytes; negative values denote signed types (see ctype()).
+  optional<int> arg_size_;
+  optional<int> constant_;
+  optional<int> deref_offset_;
+  // Symbol name dereferenced for global accesses (e.g. "sym" in "sym(%rip)").
+  optional<std::string> deref_ident_;
+  optional<std::string> base_register_name_;
+  optional<std::string> index_register_name_;
+  optional<int> scale_;
+
+  bool get_global_address(uint64_t *address, const std::string &binpath,
+                          const optional<int> &pid) const;
+
+public:
+  Argument();
+  ~Argument();
+
+  // Emit C code into `stream` assigning this operand's value to `local_name`
+  // (see usdt_args.cc). Returns false when the operand can't be resolved.
+  bool assign_to_local(std::ostream &stream, const std::string &local_name,
+                       const std::string &binpath,
+                       const optional<int> &pid = nullopt) const;
+
+  // Defaults to the native pointer width when no size was recorded.
+  int arg_size() const { return arg_size_.value_or(sizeof(void *)); }
+  std::string ctype() const;
+
+  const optional<std::string> &deref_ident() const { return deref_ident_; }
+  const optional<std::string> &base_register_name() const {
+    return base_register_name_;
+  }
+  const optional<std::string> &index_register_name() const {
+    return index_register_name_;
+  }
+  const optional<int> scale() const { return scale_; }
+  const optional<int> constant() const { return constant_; }
+  const optional<int> deref_offset() const { return deref_offset_; }
+
+  // Parsers populate the private fields directly.
+  friend class ArgumentParser;
+  friend class ArgumentParser_aarch64;
+  friend class ArgumentParser_powerpc64;
+  friend class ArgumentParser_x64;
+};
+
+// Base class for per-architecture argument-format parsers. Holds the raw
+// format string and a cursor; subclasses implement parse() to decode one
+// Argument per call until done().
+class ArgumentParser {
+ protected:
+  const char *arg_;
+  // Cursor into arg_; a negative value marks the parser as finished.
+  ssize_t cur_pos_;
+
+  void skip_whitespace_from(size_t pos);
+  void skip_until_whitespace_from(size_t pos);
+  void print_error(ssize_t pos);
+  // Parse an integer literal at `pos` (any base, per strtol). Sets *result
+  // only if at least one digit was consumed; returns the position after the
+  // literal. NOTE(review): strtol returns long, narrowed to int here —
+  // values outside int range would be truncated silently.
+  ssize_t parse_number(ssize_t pos, optional<int> *result) {
+    char *endp;
+    int number = strtol(arg_ + pos, &endp, 0);
+    if (endp > arg_ + pos)
+      *result = number;
+    return endp - arg_;
+  }
+  // Report an error at `error_start`, resynchronize the cursor at the next
+  // whitespace after `skip_start`, and return false for easy tail-calling.
+  bool error_return(ssize_t error_start, ssize_t skip_start) {
+    print_error(error_start);
+    skip_until_whitespace_from(skip_start);
+    return false;
+  }
+
+ public:
+  // Decode the next operand into *dest; returns false on a malformed token
+  // (the cursor is advanced past it so parsing can continue).
+  virtual bool parse(Argument *dest) = 0;
+  bool done() { return cur_pos_ < 0 || arg_[cur_pos_] == '\0'; }
+
+  ArgumentParser(const char *arg) : arg_(arg), cur_pos_(0) {}
+};
+
+// aarch64 argument-format parser: handles register, size and memory
+// ([reg, offset]) operand syntax.
+class ArgumentParser_aarch64 : public ArgumentParser {
+ private:
+  bool parse_register(ssize_t pos, ssize_t &new_pos, optional<int> *reg_num);
+  bool parse_size(ssize_t pos, ssize_t &new_pos, optional<int> *arg_size);
+  bool parse_mem(ssize_t pos, ssize_t &new_pos, optional<int> *reg_num,
+                 optional<int> *offset);
+
+ public:
+  bool parse(Argument *dest);
+  ArgumentParser_aarch64(const char *arg) : ArgumentParser(arg) {}
+};
+
+// powerpc64 argument-format parser.
+class ArgumentParser_powerpc64 : public ArgumentParser {
+public:
+  bool parse(Argument *dest);
+  ArgumentParser_powerpc64(const char *arg) : ArgumentParser(arg) {}
+};
+
+// x86-64 argument-format parser: decodes AT&T-style operands such as
+// "-4@$5", "8@%rax" or "8@16(%rbp,%rax,8)" into Argument fields.
+class ArgumentParser_x64 : public ArgumentParser {
+private:
+  // Canonical register identities, used to fold the many aliases of one
+  // physical register (e.g. al/ax/eax/rax) onto a single value.
+  enum Register {
+    REG_A,
+    REG_B,
+    REG_C,
+    REG_D,
+    REG_SI,
+    REG_DI,
+    REG_BP,
+    REG_SP,
+    REG_8,
+    REG_9,
+    REG_10,
+    REG_11,
+    REG_12,
+    REG_13,
+    REG_14,
+    REG_15,
+    REG_RIP,
+  };
+
+  // A register alias's canonical identity plus its access width in bytes.
+  struct RegInfo {
+    Register reg;
+    int size;
+  };
+
+  // Lookup table from register-name alias to RegInfo (defined in the .cc).
+  static const std::unordered_map<std::string, RegInfo> registers_;
+  bool normalize_register(std::string *reg, int *reg_size);
+  void reg_to_name(std::string *norm, Register reg);
+  // parse_* helpers return the cursor position after the consumed token.
+  ssize_t parse_register(ssize_t pos, std::string &name, int &size);
+  ssize_t parse_identifier(ssize_t pos, optional<std::string> *ident);
+  ssize_t parse_base_register(ssize_t pos, Argument *dest);
+  ssize_t parse_index_register(ssize_t pos, Argument *dest);
+  ssize_t parse_scale(ssize_t pos, Argument *dest);
+  ssize_t parse_expr(ssize_t pos, Argument *dest);
+  ssize_t parse_1(ssize_t pos, Argument *dest);
+
+public:
+  bool parse(Argument *dest);
+  ArgumentParser_x64(const char *arg) : ArgumentParser(arg) {}
+};
+
+// One instrumentation point of a probe: the address inside a binary plus the
+// operands recorded there (parsed from `arg_fmt` by the constructor).
+struct Location {
+  uint64_t address_;
+  std::string bin_path_;
+  std::vector<Argument> arguments_;
+  Location(uint64_t addr, const std::string &bin_path, const char *arg_fmt);
+};
+
+// A single USDT probe (provider:name), possibly present at several locations
+// across one or more binaries. Tracks enable/disable state and, when the
+// probe has a semaphore, maintains its counter in the target process.
+class Probe {
+  std::string bin_path_; // initial bin_path when Probe is created
+  std::string provider_;
+  std::string name_;
+  // Address of the semaphore word in the binary; 0 means "no semaphore".
+  uint64_t semaphore_;
+
+  std::vector<Location> locations_;
+
+  optional<int> pid_;
+  ProcMountNS *mount_ns_;
+  std::unordered_map<std::string, bool> object_type_map_; // bin_path => is shared lib?
+
+  // Set while enabled: the bcc function attached to the probe and the
+  // resolved runtime address of the semaphore.
+  optional<std::string> attached_to_;
+  optional<uint64_t> attached_semaphore_;
+
+  std::string largest_arg_type(size_t arg_n);
+
+  bool add_to_semaphore(int16_t val);
+  bool resolve_global_address(uint64_t *global, const std::string &bin_path,
+                              const uint64_t addr);
+  bool lookup_semaphore_addr(uint64_t *address);
+  void add_location(uint64_t addr, const std::string &bin_path, const char *fmt);
+
+public:
+  Probe(const char *bin_path, const char *provider, const char *name,
+        uint64_t semaphore, const optional<int> &pid, ProcMountNS *ns);
+
+  size_t num_locations() const { return locations_.size(); }
+  // NOTE(review): reads locations_.front() unchecked — assumes at least one
+  // location has been added.
+  size_t num_arguments() const { return locations_.front().arguments_.size(); }
+  uint64_t semaphore()   const { return semaphore_; }
+
+  uint64_t address(size_t n = 0) const { return locations_[n].address_; }
+  const char *location_bin_path(size_t n = 0) const { return locations_[n].bin_path_.c_str(); }
+  const Location &location(size_t n) const { return locations_[n]; }
+
+  // Emit the generated _bpf_readarg_* helper functions (see usdt.cc).
+  bool usdt_getarg(std::ostream &stream);
+  bool usdt_getarg(std::ostream &stream, const std::string& probe_func);
+  std::string get_arg_ctype(int arg_index) {
+    return largest_arg_type(arg_index);
+  }
+
+  void finalize_locations();
+  bool need_enable() const { return semaphore_ != 0x0; }
+  bool enable(const std::string &fn_name);
+  bool disable();
+  bool enabled() const { return !!attached_to_; }
+
+  bool in_shared_object(const std::string &bin_path);
+  const std::string &name() { return name_; }
+  const std::string &bin_path() { return bin_path_; }
+  const std::string &provider() { return provider_; }
+
+  friend class Context;
+
+  friend class ::ebpf::BPF;
+  friend class ::ebpf::USDT;
+};
+
+// The set of USDT probes discovered in one binary or one running process.
+// Construction scans the ELF notes (optionally across all modules of a pid)
+// and populates probes_; loaded() reports whether the scan succeeded.
+class Context {
+  std::vector<std::unique_ptr<Probe>> probes_;
+  // Module paths already scanned, to skip duplicate mapping reports.
+  std::unordered_set<std::string> modules_;
+
+  optional<int> pid_;
+  // Snapshot of the target process's stat, used to detect pid reuse.
+  optional<ProcStat> pid_stat_;
+  std::unique_ptr<ProcMountNS> mount_ns_instance_;
+  std::string cmd_bin_path_;
+  bool loaded_;
+
+  // C-style callbacks handed to the bcc ELF/proc iteration helpers; `p` is
+  // the Context being populated.
+  static void _each_probe(const char *binpath, const struct bcc_elf_usdt *probe,
+                          void *p);
+  static int _each_module(const char *modpath, uint64_t, uint64_t, uint64_t,
+                          bool, void *p);
+
+  void add_probe(const char *binpath, const struct bcc_elf_usdt *probe);
+  std::string resolve_bin_path(const std::string &bin_path);
+
+public:
+  Context(const std::string &bin_path);
+  Context(int pid);
+  Context(int pid, const std::string &bin_path);
+  ~Context();
+
+  optional<int> pid() const { return pid_; }
+  bool loaded() const { return loaded_; }
+  size_t num_probes() const { return probes_.size(); }
+  const std::string & cmd_bin_path() const { return cmd_bin_path_; }
+  ino_t inode() const { return mount_ns_instance_->target_ino(); }
+
+  Probe *get(const std::string &probe_name);
+  Probe *get(const std::string &provider_name, const std::string &probe_name);
+  // NOTE(review): unchecked index — caller must keep pos < num_probes().
+  Probe *get(int pos) { return probes_[pos].get(); }
+
+  bool enable_probe(const std::string &probe_name, const std::string &fn_name);
+
+  typedef void (*each_cb)(struct bcc_usdt *);
+  void each(each_cb callback);
+
+  typedef void (*each_uprobe_cb)(const char *, const char *, uint64_t, int);
+  void each_uprobe(each_uprobe_cb callback);
+
+  friend class ::ebpf::BPF;
+  friend class ::ebpf::USDT;
+};
diff --git a/src/cc/usdt/CMakeLists.txt b/src/cc/usdt/CMakeLists.txt
new file mode 100644
index 0000000..6a2e895
--- /dev/null
+++ b/src/cc/usdt/CMakeLists.txt
@@ -0,0 +1 @@
+add_library(usdt-static STATIC usdt_args.cc usdt.cc)
diff --git a/src/cc/usdt/usdt.cc b/src/cc/usdt/usdt.cc
new file mode 100644
index 0000000..7408d2f
--- /dev/null
+++ b/src/cc/usdt/usdt.cc
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <cstring>
+#include <sstream>
+#include <unordered_set>
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "bcc_elf.h"
+#include "bcc_proc.h"
+#include "common.h"
+#include "usdt.h"
+#include "vendor/tinyformat.hpp"
+#include "bcc_usdt.h"
+
+namespace USDT {
+
+// Decode all operands from the note's argument format string using the
+// parser for the build architecture. A malformed token is reported and
+// skipped (parse() resynchronizes at the next whitespace), so a bad operand
+// drops that argument rather than aborting the whole location.
+Location::Location(uint64_t addr, const std::string &bin_path, const char *arg_fmt)
+    : address_(addr),
+      bin_path_(bin_path) {
+
+#ifdef __aarch64__
+  ArgumentParser_aarch64 parser(arg_fmt);
+#elif __powerpc64__
+  ArgumentParser_powerpc64 parser(arg_fmt);
+#else
+  ArgumentParser_x64 parser(arg_fmt);
+#endif
+  while (!parser.done()) {
+    Argument arg;
+    if (!parser.parse(&arg))
+      continue;
+    arguments_.push_back(std::move(arg));
+  }
+}
+
+// Record the probe's identity and context; `ns` is borrowed from the owning
+// Context (not owned by the Probe). Locations are added separately via
+// add_location().
+Probe::Probe(const char *bin_path, const char *provider, const char *name,
+             uint64_t semaphore, const optional<int> &pid, ProcMountNS *ns)
+    : bin_path_(bin_path),
+      provider_(provider),
+      name_(name),
+      semaphore_(semaphore),
+      pid_(pid),
+      mount_ns_(ns) {}
+
+// Report whether |bin_path| is a shared object (vs. an executable), caching
+// the answer per path. The ELF check runs under ProcMountNSGuard so the path
+// resolves within the target's mount namespace.
+bool Probe::in_shared_object(const std::string &bin_path) {
+  auto cached = object_type_map_.find(bin_path);
+  if (cached != object_type_map_.end())
+    return cached->second;
+
+  ProcMountNSGuard g(mount_ns_);
+  bool is_shared = bcc_elf_is_shared_obj(bin_path.c_str());
+  object_type_map_[bin_path] = is_shared;
+  return is_shared;
+}
+
+// Translate a link-time address into a runtime address. For shared objects
+// this needs a pid to compute the load bias (fails without one); for
+// executables the address is used verbatim.
+bool Probe::resolve_global_address(uint64_t *global, const std::string &bin_path,
+                                   const uint64_t addr) {
+  if (in_shared_object(bin_path)) {
+    return (pid_ &&
+            !bcc_resolve_global_addr(*pid_, bin_path.c_str(), addr, global));
+  }
+
+  *global = addr;
+  return true;
+}
+
+// Adjust the probe's 16-bit USDT semaphore counter in the target process by
+// `val` (+1 on enable, -1 on disable) through /proc/<pid>/mem.
+// Returns false on any resolution or I/O failure. Requires an attached pid.
+bool Probe::add_to_semaphore(int16_t val) {
+  assert(pid_);
+
+  // Resolve (and cache) the semaphore's runtime address, accounting for
+  // shared-object load bias.
+  if (!attached_semaphore_) {
+    uint64_t addr;
+    if (!resolve_global_address(&addr, bin_path_, semaphore_))
+      return false;
+    attached_semaphore_ = addr;
+  }
+
+  off_t address = static_cast<off_t>(attached_semaphore_.value());
+
+  std::string procmem = tfm::format("/proc/%d/mem", pid_.value());
+  int memfd = ::open(procmem.c_str(), O_RDWR);
+  if (memfd < 0)
+    return false;
+
+  // Read-modify-write the counter. pread/pwrite take the offset explicitly,
+  // replacing the original lseek+read / lseek+write pairs (one syscall fewer
+  // each, and no dependence on the shared file offset); sizeof(original)
+  // replaces the magic constant 2.
+  int16_t original;
+
+  if (::pread(memfd, &original, sizeof(original), address) !=
+      (ssize_t)sizeof(original)) {
+    ::close(memfd);
+    return false;
+  }
+
+  original = original + val;
+
+  if (::pwrite(memfd, &original, sizeof(original), address) !=
+      (ssize_t)sizeof(original)) {
+    ::close(memfd);
+    return false;
+  }
+
+  ::close(memfd);
+  return true;
+}
+
+// Mark the probe as attached to bcc function `fn_name`, bumping the target's
+// semaphore first when the probe has one (which requires a pid). Fails if
+// already enabled.
+bool Probe::enable(const std::string &fn_name) {
+  if (attached_to_)
+    return false;
+
+  if (need_enable()) {
+    if (!pid_)
+      return false;
+
+    if (!add_to_semaphore(+1))
+      return false;
+  }
+
+  attached_to_ = fn_name;
+  return true;
+}
+
+// Undo enable(): clear the attachment and decrement the semaphore if the
+// probe has one. Fails if not currently enabled.
+bool Probe::disable() {
+  if (!attached_to_)
+    return false;
+
+  attached_to_ = nullopt;
+
+  if (need_enable()) {
+    assert(pid_);
+    return add_to_semaphore(-1);
+  }
+  return true;
+}
+
+// Pick the widest C type seen for argument `arg_n` across all locations, so
+// one generated helper can hold the value at every site. abs() is needed
+// because a negative arg_size encodes a signed type (see Argument::ctype).
+// NOTE(review): asserts at least one location, and assumes every location
+// has more than arg_n arguments — TODO confirm callers guarantee this.
+std::string Probe::largest_arg_type(size_t arg_n) {
+  Argument *largest = nullptr;
+  for (Location &location : locations_) {
+    Argument *candidate = &location.arguments_[arg_n];
+    if (!largest ||
+        std::abs(candidate->arg_size()) > std::abs(largest->arg_size()))
+      largest = candidate;
+  }
+
+  assert(largest);
+  return largest->ctype();
+}
+
+// Convenience overload: generate readarg helpers named after the currently
+// attached bcc function. Fails when the probe is not enabled.
+bool Probe::usdt_getarg(std::ostream &stream) {
+  if (!attached_to_ || attached_to_->empty())
+    return false;
+
+  return usdt_getarg(stream, attached_to_.value());
+}
+
+// Emit one C helper per probe argument:
+//   _bpf_readarg_<probe_func>_<n>(ctx, dest, len)
+// Each helper writes argument n (1-based) into *dest, returning 0 on success
+// and -1 on a size mismatch or unknown instrumentation point. With a single
+// location the assignment is emitted inline; with several, a switch on
+// PT_REGS_IP(ctx) dispatches on the resolved runtime address of each site.
+bool Probe::usdt_getarg(std::ostream &stream, const std::string& probe_func) {
+  const size_t arg_count = locations_[0].arguments_.size();
+
+  if (arg_count == 0)
+    return true;
+
+  for (size_t arg_n = 0; arg_n < arg_count; ++arg_n) {
+    // Use the widest type seen at any location so the value always fits.
+    std::string ctype = largest_arg_type(arg_n);
+    std::string cptr = tfm::format("*((%s *)dest)", ctype);
+
+    tfm::format(stream,
+                "static __always_inline int _bpf_readarg_%s_%d("
+                "struct pt_regs *ctx, void *dest, size_t len) {\n"
+                "  if (len != sizeof(%s)) return -1;\n",
+                probe_func, arg_n + 1, ctype);
+
+    if (locations_.size() == 1) {
+      Location &location = locations_.front();
+      stream << "  ";
+      if (!location.arguments_[arg_n].assign_to_local(stream, cptr, location.bin_path_,
+                                                      pid_))
+        return false;
+      stream << "\n  return 0;\n}\n";
+    } else {
+      stream << "  switch(PT_REGS_IP(ctx)) {\n";
+      for (Location &location : locations_) {
+        uint64_t global_address;
+
+        if (!resolve_global_address(&global_address, location.bin_path_,
+                                    location.address_))
+          return false;
+
+        tfm::format(stream, "  case 0x%xULL: ", global_address);
+        if (!location.arguments_[arg_n].assign_to_local(stream, cptr, location.bin_path_,
+                                                        pid_))
+          return false;
+
+        stream << " return 0;\n";
+      }
+      stream << "  }\n";
+      stream << "  return -1;\n}\n";
+    }
+  }
+  return true;
+}
+
+// Register one instrumentation point; `fmt` is the raw argument format
+// string parsed by Location's constructor.
+void Probe::add_location(uint64_t addr, const std::string &bin_path, const char *fmt) {
+  locations_.emplace_back(addr, bin_path, fmt);
+}
+
+// Sort locations by (bin_path, address) and drop exact duplicates, which
+// arise when the same probe site is discovered through multiple mappings.
+void Probe::finalize_locations() {
+  // BUG FIX: the original comparator
+  //   a.bin_path_ < b.bin_path_ || a.address_ < b.address_
+  // is not a strict weak ordering (it can report both a<b and b<a), which
+  // makes std::sort undefined behavior. Compare addresses only when the
+  // bin_paths are equal.
+  std::sort(locations_.begin(), locations_.end(),
+            [](const Location &a, const Location &b) {
+              if (a.bin_path_ != b.bin_path_)
+                return a.bin_path_ < b.bin_path_;
+              return a.address_ < b.address_;
+            });
+  auto last = std::unique(locations_.begin(), locations_.end(),
+                          [](const Location &a, const Location &b) {
+                            return a.bin_path_ == b.bin_path_ && a.address_ == b.address_;
+                          });
+  locations_.erase(last, locations_.end());
+}
+
+// Trampoline for bcc_elf_foreach_usdt: `p` is the Context under population.
+void Context::_each_probe(const char *binpath, const struct bcc_elf_usdt *probe,
+                          void *p) {
+  Context *ctx = static_cast<Context *>(p);
+  ctx->add_probe(binpath, probe);
+}
+
+// Trampoline for bcc_procutils_each_module: scans each newly-seen module for
+// USDT notes. Returns 0 to continue iteration.
+int Context::_each_module(const char *modpath, uint64_t, uint64_t, uint64_t,
+                          bool, void *p) {
+  Context *ctx = static_cast<Context *>(p);
+  // Modules may be reported multiple times if they contain more than one
+  // executable region. We are going to parse the ELF on disk anyway, so we
+  // don't need these duplicates.
+  if (ctx->modules_.insert(modpath).second /*inserted new?*/) {
+    ProcMountNSGuard g(ctx->mount_ns_instance_.get());
+    bcc_elf_foreach_usdt(modpath, _each_probe, p);
+  }
+  return 0;
+}
+
+// Record one discovered probe site. Sites sharing (provider, name) are
+// merged into a single Probe with multiple locations; otherwise a new Probe
+// is created.
+void Context::add_probe(const char *binpath, const struct bcc_elf_usdt *probe) {
+  for (auto &p : probes_) {
+    if (p->provider_ == probe->provider && p->name_ == probe->name) {
+      p->add_location(probe->pc, binpath, probe->arg_fmt);
+      return;
+    }
+  }
+
+  probes_.emplace_back(
+      new Probe(binpath, probe->provider, probe->name, probe->semaphore, pid_,
+	mount_ns_instance_.get()));
+  probes_.back()->add_location(probe->pc, binpath, probe->arg_fmt);
+}
+
+// Expand a bare name to a full path: first as an executable on $PATH, then
+// as a shared library. Returns "" when neither lookup succeeds. The helpers
+// return malloc'd C strings, hence the explicit ::free calls.
+std::string Context::resolve_bin_path(const std::string &bin_path) {
+  std::string result;
+
+  if (char *which = bcc_procutils_which(bin_path.c_str())) {
+    result = which;
+    ::free(which);
+  } else if (char *which_so = bcc_procutils_which_so(bin_path.c_str(), 0)) {
+    result = which_so;
+    ::free(which_so);
+  }
+
+  return result;
+}
+
+// Look up a probe by name alone; returns the first match, or nullptr when
+// no probe has that name.
+Probe *Context::get(const std::string &probe_name) {
+  auto it = std::find_if(probes_.begin(), probes_.end(),
+                         [&](const std::unique_ptr<Probe> &p) {
+                           return p->name_ == probe_name;
+                         });
+  return it == probes_.end() ? nullptr : it->get();
+}
+
+// Look up a probe by (provider, name); returns the first match, or nullptr
+// when none matches.
+Probe *Context::get(const std::string &provider_name,
+                    const std::string &probe_name) {
+  auto it = std::find_if(probes_.begin(), probes_.end(),
+                         [&](const std::unique_ptr<Probe> &p) {
+                           return p->provider_ == provider_name &&
+                                  p->name_ == probe_name;
+                         });
+  return it == probes_.end() ? nullptr : it->get();
+}
+
+// Enable the uniquely-named probe `probe_name`, attaching it to bcc function
+// `fn_name`. Refuses when the target pid has been reused (stale stat), when
+// no probe matches, or when the name is ambiguous across providers.
+bool Context::enable_probe(const std::string &probe_name,
+                           const std::string &fn_name) {
+  if (pid_stat_ && pid_stat_->is_stale())
+    return false;
+
+  // FIXME: we may have issues here if the context has two same probes's
+  // but different providers. For example, libc:setjmp and rtld:setjmp,
+  // libc:lll_futex_wait and rtld:lll_futex_wait.
+  Probe *found_probe = nullptr;
+  for (auto &p : probes_) {
+    if (p->name_ == probe_name) {
+      // A second match means the name is ambiguous: bail out rather than
+      // guessing which provider the caller meant.
+      if (found_probe != nullptr) {
+         fprintf(stderr, "Two same-name probes (%s) but different providers\n",
+                 probe_name.c_str());
+         return false;
+      }
+      found_probe = p.get();
+    }
+  }
+
+  if (found_probe != nullptr)
+    return found_probe->enable(fn_name);
+
+  return false;
+}
+
+// Invoke `callback` once per probe with a summary struct. The struct's
+// string fields point into the Probe's own storage and are only valid for
+// the duration of the callback.
+void Context::each(each_cb callback) {
+  for (const auto &probe : probes_) {
+    struct bcc_usdt info = {0};
+    info.provider = probe->provider().c_str();
+    info.bin_path = probe->bin_path().c_str();
+    info.name = probe->name().c_str();
+    info.semaphore = probe->semaphore();
+    info.num_locations = probe->num_locations();
+    info.num_arguments = probe->num_arguments();
+    callback(&info);
+  }
+}
+
+// Invoke `callback` for every location of every *enabled* probe, passing
+// the data needed to attach a uprobe: binary path, attached bcc function,
+// address, and pid (-1 when no pid is associated).
+void Context::each_uprobe(each_uprobe_cb callback) {
+  for (auto &p : probes_) {
+    if (!p->enabled())
+      continue;
+
+    for (Location &loc : p->locations_) {
+      callback(loc.bin_path_.c_str(), p->attached_to_->c_str(), loc.address_,
+               pid_.value_or(-1));
+    }
+  }
+}
+
+// Path-only context: scan a single binary (resolved via $PATH / library
+// search) for USDT notes. loaded_ stays false when resolution or the ELF
+// scan fails.
+Context::Context(const std::string &bin_path)
+    : mount_ns_instance_(new ProcMountNS(-1)), loaded_(false) {
+  std::string full_path = resolve_bin_path(bin_path);
+  if (!full_path.empty()) {
+    if (bcc_elf_foreach_usdt(full_path.c_str(), _each_probe, this) == 0) {
+      cmd_bin_path_ = full_path;
+      loaded_ = true;
+    }
+  }
+  for (const auto &probe : probes_)
+    probe->finalize_locations();
+}
+
+// Pid context: scan every module mapped by the process.
+Context::Context(int pid) : pid_(pid), pid_stat_(pid),
+  mount_ns_instance_(new ProcMountNS(pid)), loaded_(false) {
+  if (bcc_procutils_each_module(pid, _each_module, this) == 0) {
+    cmd_bin_path_ = ebpf::get_pid_exe(pid);
+    // NOTE(review): this early return also skips the finalize_locations()
+    // loop below, leaving any discovered probes unsorted/undeduplicated —
+    // TODO confirm that is intended when the exe path can't be resolved.
+    if (cmd_bin_path_.empty())
+      return;
+
+    loaded_ = true;
+  }
+  for (const auto &probe : probes_)
+    probe->finalize_locations();
+}
+
+// Pid + path context: scan one binary but resolve runtime addresses and
+// semaphores against the given process.
+Context::Context(int pid, const std::string &bin_path)
+    : pid_(pid), pid_stat_(pid),
+      mount_ns_instance_(new ProcMountNS(pid)), loaded_(false) {
+  std::string full_path = resolve_bin_path(bin_path);
+  if (!full_path.empty()) {
+    if (bcc_elf_foreach_usdt(full_path.c_str(), _each_probe, this) == 0) {
+      cmd_bin_path_ = ebpf::get_pid_exe(pid);
+      // NOTE(review): same early-return-skips-finalize caveat as above.
+      if (cmd_bin_path_.empty())
+        return;
+      loaded_ = true;
+    }
+  }
+  for (const auto &probe : probes_)
+    probe->finalize_locations();
+}
+
+// Disable all probes (decrementing any semaphores) — but only if the target
+// process is still the one we attached to; a stale stat means the pid was
+// reused and the memory must not be touched.
+Context::~Context() {
+  if (pid_stat_ && !pid_stat_->is_stale()) {
+    for (auto &p : probes_) p->disable();
+  }
+}
+}
+
+extern "C" {
+
+// C API: build a USDT context for a running process, optionally restricted
+// to one binary. `path`, when given, must be an absolute path to an existing
+// file. Returns an opaque Context handle, or nullptr on failure.
+void *bcc_usdt_new_frompid(int pid, const char *path) {
+  USDT::Context *ctx;
+
+  if (!path) {
+    ctx = new USDT::Context(pid);
+  } else {
+    struct stat buffer;
+    if (strlen(path) >= 1 && path[0] != '/') {
+      fprintf(stderr, "HINT: Binary path should be absolute.\n\n");
+      return nullptr;
+    } else if (stat(path, &buffer) == -1) {
+      fprintf(stderr, "HINT: Specified binary doesn't exist.\n\n");
+      return nullptr;
+    }
+    ctx = new USDT::Context(pid, path);
+  }
+  // A context that failed to load is useless to callers; destroy it here so
+  // they only ever see nullptr or a valid handle.
+  if (!ctx->loaded()) {
+    delete ctx;
+    return nullptr;
+  }
+  return static_cast<void *>(ctx);
+}
+
+// C API: build a USDT context from a binary path alone. Returns an opaque
+// Context handle, or nullptr when no probes could be loaded.
+void *bcc_usdt_new_frompath(const char *path) {
+  auto *ctx = new USDT::Context(path);
+  if (ctx->loaded())
+    return static_cast<void *>(ctx);
+  delete ctx;
+  return nullptr;
+}
+
+// C API: destroy a context created by bcc_usdt_new_*. Safe on nullptr
+// (deleting a null pointer is a no-op).
+void bcc_usdt_close(void *usdt) {
+  delete static_cast<USDT::Context *>(usdt);
+}
+
+// C API: enable `probe_name`, attaching it to bcc function `fn_name`.
+// Returns 0 on success, -1 on failure (C-style status convention).
+int bcc_usdt_enable_probe(void *usdt, const char *probe_name,
+                          const char *fn_name) {
+  USDT::Context *ctx = static_cast<USDT::Context *>(usdt);
+  return ctx->enable_probe(probe_name, fn_name) ? 0 : -1;
+}
+
+// C API: generate the argument-reader C program for all enabled probes in
+// `usdt_array`. Returns "" for an empty array, nullptr on generation
+// failure, otherwise a pointer into function-local static storage — valid
+// only until the next call, and not safe for concurrent callers.
+const char *bcc_usdt_genargs(void **usdt_array, int len) {
+  static std::string storage_;
+  std::ostringstream stream;
+
+  if (!len)
+    return "";
+
+  stream << USDT::USDT_PROGRAM_HEADER;
+  // Generate genargs codes for an array of USDT Contexts.
+  //
+  // Each mnt_point + cmd_bin_path + probe_provider + probe_name
+  // uniquely identifies a probe.
+  std::unordered_set<std::string> generated_probes;
+  for (int i = 0; i < len; i++) {
+    USDT::Context *ctx = static_cast<USDT::Context *>(usdt_array[i]);
+
+    for (size_t j = 0; j < ctx->num_probes(); j++) {
+      USDT::Probe *p = ctx->get(j);
+      if (p->enabled()) {
+        // Deduplicate: the same probe may appear via several contexts.
+        std::string key = std::to_string(ctx->inode()) + "*"
+          + ctx->cmd_bin_path() + "*" + p->provider() + "*" + p->name();
+        if (generated_probes.find(key) != generated_probes.end())
+          continue;
+        if (!p->usdt_getarg(stream))
+          return nullptr;
+        generated_probes.insert(key);
+      }
+    }
+  }
+
+  storage_ = stream.str();
+  return storage_.c_str();
+}
+
+// C API: return the C type name (e.g. "uint64_t") of argument `arg_index`
+// of the named probe, or "" when the probe is unknown. The returned pointer
+// is valid until the next call and is not safe for concurrent callers.
+const char *bcc_usdt_get_probe_argctype(
+  void *ctx, const char* probe_name, const int arg_index
+) {
+  USDT::Probe *p = static_cast<USDT::Context *>(ctx)->get(probe_name);
+  if (!p)
+    return "";
+  // BUG FIX: get_arg_ctype() returns a std::string by value; taking c_str()
+  // of that temporary handed the caller a dangling pointer. Keep the string
+  // alive in function-local static storage (same lifetime contract as
+  // bcc_usdt_genargs' storage_).
+  static std::string type_storage;
+  type_storage = p->get_arg_ctype(arg_index);
+  return type_storage.c_str();
+}
+
+// C API: invoke `callback` once per probe with its summary info.
+void bcc_usdt_foreach(void *usdt, bcc_usdt_cb callback) {
+  USDT::Context *ctx = static_cast<USDT::Context *>(usdt);
+  ctx->each(callback);
+}
+
+// C API: fill `location` with the index-th instrumentation point of the
+// given probe. Returns 0 on success, -1 when the probe is unknown or the
+// index is out of range.
+int bcc_usdt_get_location(void *usdt, const char *provider_name,
+                          const char *probe_name,
+                          int index, struct bcc_usdt_location *location) {
+  USDT::Context *ctx = static_cast<USDT::Context *>(usdt);
+  USDT::Probe *probe = ctx->get(provider_name, probe_name);
+  if (!probe)
+    return -1;
+  if (index < 0 || (size_t)index >= probe->num_locations())
+    return -1;
+  location->address = probe->address(index);
+  location->bin_path = probe->location_bin_path(index);
+  return 0;
+}
+
+// C API: describe one argument of one location of a probe. Returns 0 on
+// success, -1 on unknown probe or out-of-range indices. `argument->valid`
+// is a bitmask naming which optional fields below were populated; string
+// fields point into the Probe's storage and share its lifetime.
+int bcc_usdt_get_argument(void *usdt, const char *provider_name,
+                          const char *probe_name,
+                          int location_index, int argument_index,
+                          struct bcc_usdt_argument *argument) {
+  USDT::Context *ctx = static_cast<USDT::Context *>(usdt);
+  USDT::Probe *probe = ctx->get(provider_name, probe_name);
+  if (!probe)
+    return -1;
+  if (argument_index < 0 || (size_t)argument_index >= probe->num_arguments())
+    return -1;
+  if (location_index < 0 || (size_t)location_index >= probe->num_locations())
+    return -1;
+  auto const &location = probe->location(location_index);
+  auto const &arg = location.arguments_[argument_index];
+  argument->size = arg.arg_size();
+  argument->valid = BCC_USDT_ARGUMENT_NONE;
+  // Copy out each optional Argument field that is present, marking it in
+  // the validity bitmask.
+  if (arg.constant()) {
+    argument->valid |= BCC_USDT_ARGUMENT_CONSTANT;
+    argument->constant = *(arg.constant());
+  }
+  if (arg.deref_offset()) {
+    argument->valid |= BCC_USDT_ARGUMENT_DEREF_OFFSET;
+    argument->deref_offset = *(arg.deref_offset());
+  }
+  if (arg.deref_ident()) {
+    argument->valid |= BCC_USDT_ARGUMENT_DEREF_IDENT;
+    argument->deref_ident = arg.deref_ident()->c_str();
+  }
+  if (arg.base_register_name()) {
+    argument->valid |= BCC_USDT_ARGUMENT_BASE_REGISTER_NAME;
+    argument->base_register_name = arg.base_register_name()->c_str();
+  }
+  if (arg.index_register_name()) {
+    argument->valid |= BCC_USDT_ARGUMENT_INDEX_REGISTER_NAME;
+    argument->index_register_name = arg.index_register_name()->c_str();
+  }
+  if (arg.scale()) {
+    argument->valid |= BCC_USDT_ARGUMENT_SCALE;
+    argument->scale = *(arg.scale());
+  }
+  return 0;
+}
+
+// C API: invoke `callback` for every location of every enabled probe, with
+// the data needed to attach a uprobe.
+void bcc_usdt_foreach_uprobe(void *usdt, bcc_usdt_uprobe_cb callback) {
+  USDT::Context *ctx = static_cast<USDT::Context *>(usdt);
+  ctx->each_uprobe(callback);
+}
+}
diff --git a/src/cc/usdt/usdt_args.cc b/src/cc/usdt/usdt_args.cc
new file mode 100644
index 0000000..b27e515
--- /dev/null
+++ b/src/cc/usdt/usdt_args.cc
@@ -0,0 +1,535 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <unordered_map>
+#include <regex>
+
+#include "syms.h"
+#include "usdt.h"
+#include "vendor/tinyformat.hpp"
+
+#include "bcc_elf.h"
+#include "bcc_syms.h"
+
+namespace USDT {
+
+// Defaulted out of line; equivalent to the previous empty-bodied definitions.
+Argument::Argument() = default;
+Argument::~Argument() = default;
+
+// C type name matching this argument's size and signedness: a negative
+// arg_size() encodes a signed value, e.g. -4 -> "int32_t", 8 -> "uint64_t".
+std::string Argument::ctype() const {
+  const int bits = arg_size() * 8;
+  if (bits < 0)
+    return tfm::format("int%d_t", -bits);
+  return tfm::format("uint%d_t", bits);
+}
+
+// Resolve the global address of the symbol named by *deref_ident_ inside
+// `binpath`.  With a pid, resolution goes through that process' symbol
+// tables (ProcSyms); without one, only a non-shared-object binary can be
+// resolved, since a shared object's load address is unknown here.
+// Returns true and stores the address on success.
+bool Argument::get_global_address(uint64_t *address, const std::string &binpath,
+                                  const optional<int> &pid) const {
+  if (pid) {
+    static struct bcc_symbol_option default_option = {
+      .use_debug_file = 1,
+      .check_debug_file_crc = 1,
+      .use_symbol_type = BCC_SYM_ALL_TYPES
+    };
+    return ProcSyms(*pid, &default_option)
+        .resolve_name(binpath.c_str(), deref_ident_->c_str(), address);
+  }
+
+  if (!bcc_elf_is_shared_obj(binpath.c_str())) {
+    struct bcc_symbol sym;
+    if (bcc_resolve_symname(binpath.c_str(), deref_ident_->c_str(), 0x0, -1, nullptr, &sym) == 0) {
+      *address = sym.offset;
+      // sym.module is heap-allocated by the resolver; release it here.
+      if (sym.module)
+        ::free(const_cast<char*>(sym.module));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Emit C source into `stream` that assigns this argument's runtime value to
+// the local variable `local_name` in a generated probe.  Handles, in order:
+// a literal constant, a plain register read, a register+offset (optionally
+// with a scaled index register) memory dereference, and an ip-relative
+// global-symbol dereference.  Returns false for unsupported argument forms
+// or when global-symbol resolution fails.
+bool Argument::assign_to_local(std::ostream &stream,
+                               const std::string &local_name,
+                               const std::string &binpath,
+                               const optional<int> &pid) const {
+  // Case 1: literal constant.
+  if (constant_) {
+    tfm::format(stream, "%s = %d;", local_name, *constant_);
+    return true;
+  }
+
+  // Case 2: direct register value.
+  if (!deref_offset_) {
+    tfm::format(stream, "%s = ctx->%s;", local_name, *base_register_name_);
+    // Put a compiler barrier to prevent optimization
+    // like llvm SimplifyCFG SinkThenElseCodeToEnd
+    // Volatile marking is not sufficient to prevent such optimization.
+    tfm::format(stream, " %s", COMPILER_BARRIER);
+    return true;
+  }
+
+  // Case 3: memory operand relative to a register, optionally plus a scaled
+  // index register.
+  if (deref_offset_ && !deref_ident_) {
+    tfm::format(stream, "{ u64 __addr = ctx->%s + %d",
+                *base_register_name_, *deref_offset_);
+    if (index_register_name_) {
+      int scale = scale_.value_or(1);
+      tfm::format(stream, " + (ctx->%s * %d);", *index_register_name_, scale);
+    } else {
+      tfm::format(stream, ";");
+    }
+    // Theoretically, llvm SimplifyCFG SinkThenElseCodeToEnd may still
+    // sink bpf_probe_read call, so put a barrier here to prevent sinking
+    // of ctx->#fields.
+    tfm::format(stream, " %s ", COMPILER_BARRIER);
+    tfm::format(stream,
+                "%s __res = 0x0; "
+                "bpf_probe_read(&__res, sizeof(__res), (void *)__addr); "
+                "%s = __res; }",
+                ctype(), local_name);
+    return true;
+  }
+
+  // Case 4: global symbol ("ip"-based): resolve the symbol's address at
+  // code-generation time and read from it at runtime.
+  if (deref_offset_ && deref_ident_ && *base_register_name_ == "ip") {
+    uint64_t global_address;
+    if (!get_global_address(&global_address, binpath, pid))
+      return false;
+
+    tfm::format(stream,
+                "{ u64 __addr = 0x%xull + %d; %s __res = 0x0; "
+                "bpf_probe_read(&__res, sizeof(__res), (void *)__addr); "
+                "%s = __res; }",
+                global_address, *deref_offset_, ctype(), local_name);
+    return true;
+  }
+
+  return false;
+}
+
+// Report a parse failure on stderr, printing arg_ and a caret under the
+// offending position.
+void ArgumentParser::print_error(ssize_t pos) {
+  fprintf(stderr, "Parse error:\n    %s\n", arg_);
+  // The 4 extra dashes account for the "    " indentation printed above.
+  ssize_t dashes = pos + 4;
+  while (dashes-- > 0)
+    fputc('-', stderr);
+  fputc('^', stderr);
+  fputc('\n', stderr);
+}
+
+// Advance cur_pos_ past any whitespace starting at `pos`.
+void ArgumentParser::skip_whitespace_from(size_t pos) {
+    // Cast to unsigned char: passing a negative char to isspace() is UB.
+    while (isspace((unsigned char)arg_[pos])) pos++;
+    cur_pos_ = pos;
+}
+
+// Advance cur_pos_ to the next whitespace character (or NUL) at/after `pos`.
+void ArgumentParser::skip_until_whitespace_from(size_t pos) {
+    // Cast to unsigned char: passing a negative char to isspace() is UB.
+    while (arg_[pos] != '\0' && !isspace((unsigned char)arg_[pos]))
+        pos++;
+    cur_pos_ = pos;
+}
+
+// Parse a register number at `pos`; valid aarch64 GPR indices are 0..31.
+// Stores the number in *reg_num and the position past it in new_pos.
+bool ArgumentParser_aarch64::parse_register(ssize_t pos, ssize_t &new_pos,
+                                            optional<int> *reg_num) {
+  new_pos = parse_number(pos, reg_num);
+  if (new_pos == pos)
+    return error_return(pos, pos);
+  if (*reg_num < 0 || *reg_num > 31)
+    return error_return(pos, pos);
+  return true;
+}
+
+// Parse the [-]<size> prefix of an argument spec into *arg_size, leaving the
+// position past it in new_pos.  The sign encodes signedness; the magnitude
+// must be a legal operand width.
+bool ArgumentParser_aarch64::parse_size(ssize_t pos, ssize_t &new_pos,
+                                        optional<int> *arg_size) {
+  new_pos = parse_number(pos, arg_size);
+  if (new_pos == pos)
+    return error_return(pos, pos);
+
+  // Only 1/2/4/8-byte operands exist.
+  switch (abs(arg_size->value())) {
+  case 1:
+  case 2:
+  case 4:
+  case 8:
+    return true;
+  default:
+    return error_return(pos, pos);
+  }
+}
+
+// Parse the inside of a memory operand, "x<reg>]" or "x<reg>,<offset>]",
+// where the caller has already consumed the opening '[' and `pos` points at
+// the mandatory 'x' register prefix.  On success *reg_num (and *offset, when
+// present) are filled and new_pos is one past the closing ']'.
+bool ArgumentParser_aarch64::parse_mem(ssize_t pos, ssize_t &new_pos,
+                                       optional<int> *reg_num,
+                                       optional<int> *offset) {
+  if (arg_[pos] != 'x')
+    return error_return(pos, pos);
+  if (parse_register(pos + 1, new_pos, reg_num) == false)
+    return false;
+
+  // Optional ",<offset>" component.
+  if (arg_[new_pos] == ',') {
+    pos = new_pos + 1;
+    new_pos = parse_number(pos, offset);
+    if (new_pos == pos)
+      return error_return(pos, pos);
+  }
+  if (arg_[new_pos] != ']')
+    return error_return(new_pos, new_pos);
+  new_pos++;
+  return true;
+}
+
+// Parse one aarch64 USDT argument starting at cur_pos_ into *dest.
+// Returns false when there is nothing left to parse or on a syntax error.
+bool ArgumentParser_aarch64::parse(Argument *dest) {
+  if (done())
+    return false;
+
+  // Support the following argument patterns:
+  //   [-]<size>@<value>, [-]<size>@<reg>, [-]<size>@[<reg>], or
+  //   [-]<size>@[<reg>,<offset>]
+  ssize_t cur_pos = cur_pos_, new_pos;
+  optional<int> arg_size;
+
+  // Parse [-]<size>
+  if (parse_size(cur_pos, new_pos, &arg_size) == false)
+    return false;
+  dest->arg_size_ = arg_size;
+
+  // Make sure '@' present
+  if (arg_[new_pos] != '@')
+    return error_return(new_pos, new_pos);
+  cur_pos = new_pos + 1;
+
+  if (arg_[cur_pos] == 'x') {
+    // Parse ...@<reg>
+    optional<int> reg_num;
+    if (parse_register(cur_pos + 1, new_pos, &reg_num) == false)
+      return false;
+    cur_pos_ = new_pos;
+    dest->base_register_name_ = "regs[" + std::to_string(reg_num.value()) + "]";
+  } else if (arg_[cur_pos] == '[') {
+    // Parse ...@[<reg>] and ...@[<reg,<offset>]
+    optional<int> reg_num, offset = 0;
+    if (parse_mem(cur_pos + 1, new_pos, &reg_num, &offset) == false)
+      return false;
+    cur_pos_ = new_pos;
+    dest->base_register_name_ = "regs[" + std::to_string(reg_num.value()) + "]";
+    dest->deref_offset_ = offset;
+  } else {
+    // Parse ...@<value>
+    optional<int> val;
+    new_pos = parse_number(cur_pos, &val);
+    if (cur_pos == new_pos)
+      return error_return(cur_pos, cur_pos);
+    cur_pos_ = new_pos;
+    dest->constant_ = val;
+  }
+
+  // Consume trailing whitespace so the next parse() starts at the next arg.
+  skip_whitespace_from(cur_pos_);
+  return true;
+}
+
+// Parse one powerpc64 USDT argument starting at cur_pos_ into *dest.
+// An argument is "<size>@<operand>" where the operand is a constant,
+// a register, a base-register+offset, or a base+index register pair.
+// Returns false when done or on a syntax error (after printing a
+// diagnostic and resynchronizing at the next token).
+bool ArgumentParser_powerpc64::parse(Argument *dest) {
+  if (done())
+    return false;
+
+  bool matched;
+  std::smatch matches;
+  std::string arg_str(&arg_[cur_pos_]);
+  // The patterns never change, so compile each std::regex once (regex
+  // construction is expensive) instead of rebuilding them on every call.
+  static const std::regex arg_n_regex("^(\\-?[1248])\\@");
+  // Operands with constants of form iNUM or i-NUM
+  static const std::regex arg_op_regex_const("^i(\\-?[0-9]+)( +|$)");
+  // Operands with register only of form REG or %rREG
+  static const std::regex arg_op_regex_reg("^(?:%r)?([1-2]?[0-9]|3[0-1])( +|$)");
+  // Operands with a base register and an offset of form
+  // NUM(REG) or -NUM(REG) or NUM(%rREG) or -NUM(%rREG)
+  static const std::regex arg_op_regex_breg_off(
+        "^(\\-?[0-9]+)\\((?:%r)?([1-2]?[0-9]|3[0-1])\\)( +|$)");
+  // Operands with a base register and an index register
+  // of form REG,REG or %rREG,%rREG
+  static const std::regex arg_op_regex_breg_ireg(
+        "^(?:%r)?([1-2]?[0-9]|3[0-1])\\,(?:%r)?([1-2]?[0-9]|3[0-1])( +|$)");
+
+  // First pull off the "<size>@" prefix, then try each operand form in turn.
+  matched = std::regex_search(arg_str, matches, arg_n_regex);
+  if (matched) {
+    dest->arg_size_ = stoi(matches.str(1));
+    cur_pos_ += matches.length(0);
+    arg_str = &arg_[cur_pos_];
+
+    if (std::regex_search(arg_str, matches, arg_op_regex_const)) {
+      dest->constant_ = stoi(matches.str(1));
+    } else if (std::regex_search(arg_str, matches, arg_op_regex_reg)) {
+      dest->base_register_name_ = "gpr[" + matches.str(1) + "]";
+    } else if (std::regex_search(arg_str, matches, arg_op_regex_breg_off)) {
+      dest->deref_offset_ = stoi(matches.str(1));
+      dest->base_register_name_ = "gpr[" + matches.str(2) + "]";
+    } else if (std::regex_search(arg_str, matches, arg_op_regex_breg_ireg)) {
+      dest->deref_offset_ = 0; // In powerpc64, such operands contain a base
+                               // register and an index register which are
+                               // part of an indexed load/store operation.
+                               // Even if no offset value is present, this
+                               // is required by Argument::assign_to_local()
+                               // in order to generate code for reading the
+                               // argument. So, this is set to zero.
+      dest->base_register_name_ = "gpr[" + matches.str(1) + "]";
+      dest->index_register_name_ = "gpr[" + matches.str(2) + "]";
+      dest->scale_ = abs(*dest->arg_size_);
+    } else {
+      matched = false;
+    }
+  }
+
+  if (!matched) {
+    // Report the bad operand, then resynchronize at the next token so any
+    // following arguments can still be parsed.
+    print_error(cur_pos_);
+    skip_until_whitespace_from(cur_pos_);
+    skip_whitespace_from(cur_pos_);
+    return false;
+  }
+
+  cur_pos_ += matches.length(0);
+  skip_whitespace_from(cur_pos_);
+  return true;
+}
+
+// Parse a C-style identifier ([A-Za-z_][A-Za-z0-9_]*) starting at `pos`.
+// On success the identifier is stored into *result and the returned position
+// is past its last character; otherwise `pos` is returned unchanged.
+ssize_t ArgumentParser_x64::parse_identifier(ssize_t pos,
+                                             optional<std::string> *result) {
+  if (!isalpha(arg_[pos]) && arg_[pos] != '_')
+    return pos;
+  const ssize_t start = pos;
+  do {
+    pos++;
+  } while (isalnum(arg_[pos]) || arg_[pos] == '_');
+  result->emplace(arg_ + start, pos - start);
+  return pos;
+}
+
+// Parse "%<reg>" at `pos`.  On success, `name` receives the normalized
+// register name, `size` its width in bytes, and the returned position is
+// past the register.  On failure the negated error position is returned.
+ssize_t ArgumentParser_x64::parse_register(ssize_t pos, std::string &name,
+                                           int &size) {
+  if (arg_[pos] != '%')
+    return -(pos + 1);
+
+  const ssize_t start = pos + 1;
+  ssize_t end = start;
+  while (isalnum(arg_[end])) end++;
+
+  std::string regname(arg_ + start, end - start);
+  if (!normalize_register(&regname, &size))
+    return -start;
+
+  name = regname;
+  return end;
+}
+
+// Parse the base register of an operand into dest->base_register_name_.
+// When no explicit size prefix was given, the register width doubles as the
+// argument size.  Returns the next position, or a negative error position.
+ssize_t ArgumentParser_x64::parse_base_register(ssize_t pos, Argument *dest) {
+  std::string reg;
+  int width;
+  const ssize_t next = parse_register(pos, reg, width);
+  if (next >= 0) {
+    dest->base_register_name_ = reg;
+    if (!dest->arg_size_)
+      dest->arg_size_ = width;
+  }
+  return next;
+}
+
+// Parse the index register of a memory operand into
+// dest->index_register_name_.  Returns the next position, or a negative
+// error position.  The register width is parsed but not recorded here.
+ssize_t ArgumentParser_x64::parse_index_register(ssize_t pos, Argument *dest) {
+  std::string reg;
+  int width;
+  const ssize_t next = parse_register(pos, reg, width);
+  if (next >= 0)
+    dest->index_register_name_ = reg;
+  return next;
+}
+
+// Parse the numeric scale component of a memory operand into dest->scale_.
+// The return value comes straight from parse_number.
+ssize_t ArgumentParser_x64::parse_scale(ssize_t pos, Argument *dest) {
+  return parse_number(pos, &dest->scale_);
+}
+
+// Parse one x86-64 operand expression (after any size prefix):
+//   $<imm>                          -> constant
+//   %<reg>                          -> register value
+//   <num>(<base>[,<idx>[,<scale>]]) -> memory operand
+//   [<num>+]<ident>[+/-<num>](...)  -> global-symbol-relative memory operand
+// Returns the position past the expression, or a negated position on error.
+ssize_t ArgumentParser_x64::parse_expr(ssize_t pos, Argument *dest) {
+  if (arg_[pos] == '$')
+    return parse_number(pos + 1, &dest->constant_);
+
+  if (arg_[pos] == '%')
+    return parse_base_register(pos, dest);
+
+  // Numeric offset first, optionally followed by "+<ident>" ...
+  if (isdigit(arg_[pos]) || arg_[pos] == '-') {
+    pos = parse_number(pos, &dest->deref_offset_);
+    if (arg_[pos] == '+') {
+      pos = parse_identifier(pos + 1, &dest->deref_ident_);
+      if (!dest->deref_ident_)
+        return -pos;
+    }
+  } else {
+    // ... or an identifier first, optionally followed by "+/-<num>".
+    dest->deref_offset_ = 0;
+    pos = parse_identifier(pos, &dest->deref_ident_);
+    if (arg_[pos] == '+' || arg_[pos] == '-') {
+      pos = parse_number(pos, &dest->deref_offset_);
+    }
+  }
+
+  // Every remaining form requires a parenthesized base register.
+  if (arg_[pos] != '(')
+    return -pos;
+
+  pos = parse_base_register(pos + 1, dest);
+  if (pos < 0)
+    return pos;
+
+  // Optional ",<index>[,<scale>]" inside the parentheses.
+  if (arg_[pos] == ',') {
+    pos = parse_index_register(pos + 1, dest);
+    if (pos < 0)
+      return pos;
+
+    if (arg_[pos] == ',') {
+      pos = parse_scale(pos + 1, dest);
+      if (pos < 0)
+        return pos;
+    }
+  }
+
+  return (arg_[pos] == ')') ? pos + 1 : -pos;
+}
+
+// Parse one argument: an optional "[-]<size>@" prefix followed by an operand
+// expression.  Returns the next position, or a negated position on error.
+ssize_t ArgumentParser_x64::parse_1(ssize_t pos, Argument *dest) {
+  if (isdigit(arg_[pos]) || arg_[pos] == '-') {
+    optional<int> size_spec;
+    const ssize_t after = parse_number(pos, &size_spec);
+    // Only treat the number as a size prefix when '@' follows it; otherwise
+    // it was the start of the expression itself.
+    if (size_spec && arg_[after] == '@') {
+      dest->arg_size_ = size_spec;
+      return parse_expr(after + 1, dest);
+    }
+  }
+  return parse_expr(pos, dest);
+}
+
+// Parse a single x86-64 USDT argument starting at cur_pos_ into *dest.
+// Advances past trailing whitespace on success; prints a caret diagnostic
+// and returns false on failure (or when there is nothing left to parse).
+bool ArgumentParser_x64::parse(Argument *dest) {
+  if (done())
+    return false;
+
+  ssize_t res = parse_1(cur_pos_, dest);
+  if (res < 0)
+    return error_return(-res, -res + 1);
+  // Cast to unsigned char: isspace() on a negative char value is UB.
+  if (!isspace((unsigned char)arg_[res]) && arg_[res] != '\0')
+    return error_return(res, res);
+  skip_whitespace_from(res);
+  return true;
+}
+
+// Table mapping every x86-64 GPR alias (64/32/16/8-bit spellings) to its
+// canonical register identity and its width in bytes.
+const std::unordered_map<std::string, ArgumentParser_x64::RegInfo>
+    ArgumentParser_x64::registers_ = {
+        {"rax", {REG_A, 8}},   {"eax", {REG_A, 4}},
+        {"ax", {REG_A, 2}},    {"al", {REG_A, 1}},
+
+        {"rbx", {REG_B, 8}},   {"ebx", {REG_B, 4}},
+        {"bx", {REG_B, 2}},    {"bl", {REG_B, 1}},
+
+        {"rcx", {REG_C, 8}},   {"ecx", {REG_C, 4}},
+        {"cx", {REG_C, 2}},    {"cl", {REG_C, 1}},
+
+        {"rdx", {REG_D, 8}},   {"edx", {REG_D, 4}},
+        {"dx", {REG_D, 2}},    {"dl", {REG_D, 1}},
+
+        {"rsi", {REG_SI, 8}},  {"esi", {REG_SI, 4}},
+        {"si", {REG_SI, 2}},   {"sil", {REG_SI, 1}},
+
+        {"rdi", {REG_DI, 8}},  {"edi", {REG_DI, 4}},
+        {"di", {REG_DI, 2}},   {"dil", {REG_DI, 1}},
+
+        {"rbp", {REG_BP, 8}},  {"ebp", {REG_BP, 4}},
+        {"bp", {REG_BP, 2}},   {"bpl", {REG_BP, 1}},
+
+        {"rsp", {REG_SP, 8}},  {"esp", {REG_SP, 4}},
+        {"sp", {REG_SP, 2}},   {"spl", {REG_SP, 1}},
+
+        {"r8", {REG_8, 8}},    {"r8d", {REG_8, 4}},
+        {"r8w", {REG_8, 2}},   {"r8b", {REG_8, 1}},
+
+        {"r9", {REG_9, 8}},    {"r9d", {REG_9, 4}},
+        {"r9w", {REG_9, 2}},   {"r9b", {REG_9, 1}},
+
+        {"r10", {REG_10, 8}},  {"r10d", {REG_10, 4}},
+        {"r10w", {REG_10, 2}}, {"r10b", {REG_10, 1}},
+
+        {"r11", {REG_11, 8}},  {"r11d", {REG_11, 4}},
+        {"r11w", {REG_11, 2}}, {"r11b", {REG_11, 1}},
+
+        {"r12", {REG_12, 8}},  {"r12d", {REG_12, 4}},
+        {"r12w", {REG_12, 2}}, {"r12b", {REG_12, 1}},
+
+        {"r13", {REG_13, 8}},  {"r13d", {REG_13, 4}},
+        {"r13w", {REG_13, 2}}, {"r13b", {REG_13, 1}},
+
+        {"r14", {REG_14, 8}},  {"r14d", {REG_14, 4}},
+        {"r14w", {REG_14, 2}}, {"r14b", {REG_14, 1}},
+
+        {"r15", {REG_15, 8}},  {"r15d", {REG_15, 4}},
+        {"r15w", {REG_15, 2}}, {"r15b", {REG_15, 1}},
+
+        {"rip", {REG_RIP, 8}},
+};
+
+// Map a canonical Register enumerator to the field name used for it in the
+// generated "ctx->%s" accesses.  Registers not covered by a case leave *norm
+// untouched.
+void ArgumentParser_x64::reg_to_name(std::string *norm, Register reg) {
+  switch (reg) {
+  case REG_A:
+    *norm = "ax";
+    break;
+  case REG_B:
+    *norm = "bx";
+    break;
+  case REG_C:
+    *norm = "cx";
+    break;
+  case REG_D:
+    *norm = "dx";
+    break;
+
+  case REG_SI:
+    *norm = "si";
+    break;
+  case REG_DI:
+    *norm = "di";
+    break;
+  case REG_BP:
+    *norm = "bp";
+    break;
+  case REG_SP:
+    *norm = "sp";
+    break;
+
+  case REG_8:
+    *norm = "r8";
+    break;
+  case REG_9:
+    *norm = "r9";
+    break;
+  case REG_10:
+    *norm = "r10";
+    break;
+  case REG_11:
+    *norm = "r11";
+    break;
+  case REG_12:
+    *norm = "r12";
+    break;
+  case REG_13:
+    *norm = "r13";
+    break;
+  case REG_14:
+    *norm = "r14";
+    break;
+  case REG_15:
+    *norm = "r15";
+    break;
+
+  case REG_RIP:
+    *norm = "ip";
+    break;
+  }
+}
+
+// Replace *reg (any size alias such as "eax") with its canonical name
+// ("ax") and report its width in bytes via *reg_size.  Returns false when
+// the name is not a known x86-64 register.
+bool ArgumentParser_x64::normalize_register(std::string *reg, int *reg_size) {
+  const auto it = registers_.find(*reg);
+  if (it == registers_.end())
+    return false;
+
+  const RegInfo &info = it->second;
+  *reg_size = info.size;
+  reg_to_name(reg, info.reg);
+  return true;
+}
+}
diff --git a/src/cc/vendor/optional.hpp b/src/cc/vendor/optional.hpp
new file mode 100644
index 0000000..b28ca63
--- /dev/null
+++ b/src/cc/vendor/optional.hpp
@@ -0,0 +1,1042 @@
+// Copyright (C) 2011 - 2012 Andrzej Krzemienski.
+//
+// Use, modification, and distribution is subject to the Boost Software
+// License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+//
+// The idea and interface is based on Boost.Optional library
+// authored by Fernando Luis Cacciola Carballal
+
+# ifndef ___OPTIONAL_HPP___
+# define ___OPTIONAL_HPP___
+
+# include <utility>
+# include <type_traits>
+# include <initializer_list>
+# include <cassert>
+# include <functional>
+# include <string>
+# include <stdexcept>
+
+# define TR2_OPTIONAL_REQUIRES(...) typename enable_if<__VA_ARGS__::value, bool>::type = false
+
+# if defined __GNUC__ // NOTE: GNUC is also defined for Clang
+#   if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)
+#     define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___
+#   elif (__GNUC__ > 4)
+#     define TR2_OPTIONAL_GCC_4_8_AND_HIGHER___
+#   endif
+#
+#   if (__GNUC__ == 4) && (__GNUC_MINOR__ >= 7)
+#     define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___
+#   elif (__GNUC__ > 4)
+#     define TR2_OPTIONAL_GCC_4_7_AND_HIGHER___
+#   endif
+#
+#   if (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) && (__GNUC_PATCHLEVEL__ >= 1)
+#     define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___
+#   elif (__GNUC__ == 4) && (__GNUC_MINOR__ >= 9)
+#     define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___
+#   elif (__GNUC__ > 4)
+#     define TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___
+#   endif
+# endif
+#
+# if defined __clang_major__
+#   if (__clang_major__ == 3 && __clang_minor__ >= 5)
+#     define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_
+#   elif (__clang_major__ > 3)
+#     define TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_
+#   endif
+#   if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_
+#     define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_
+#   elif (__clang_major__ == 3 && __clang_minor__ == 4 && __clang_patchlevel__ >= 2)
+#     define TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_
+#   endif
+# endif
+#
+# if defined _MSC_VER
+#   if (_MSC_VER >= 1900)
+#     define TR2_OPTIONAL_MSVC_2015_AND_HIGHER___
+#   endif
+# endif
+
+# if defined __clang__
+#   if (__clang_major__ > 2) || (__clang_major__ == 2) && (__clang_minor__ >= 9)
+#     define OPTIONAL_HAS_THIS_RVALUE_REFS 1
+#   else
+#     define OPTIONAL_HAS_THIS_RVALUE_REFS 0
+#   endif
+# elif defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___
+#   define OPTIONAL_HAS_THIS_RVALUE_REFS 1
+# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___
+#   define OPTIONAL_HAS_THIS_RVALUE_REFS 1
+# else
+#   define OPTIONAL_HAS_THIS_RVALUE_REFS 0
+# endif
+
+
+# if defined TR2_OPTIONAL_GCC_4_8_1_AND_HIGHER___
+#   define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 1
+#   define OPTIONAL_CONSTEXPR_INIT_LIST constexpr
+# else
+#   define OPTIONAL_HAS_CONSTEXPR_INIT_LIST 0
+#   define OPTIONAL_CONSTEXPR_INIT_LIST
+# endif
+
+# if defined TR2_OPTIONAL_CLANG_3_5_AND_HIGHTER_ && (defined __cplusplus) && (__cplusplus != 201103L)
+#   define OPTIONAL_HAS_MOVE_ACCESSORS 1
+# else
+#   define OPTIONAL_HAS_MOVE_ACCESSORS 0
+# endif
+
+# // In C++11 constexpr implies const, so we need to make non-const members also non-constexpr
+# if (defined __cplusplus) && (__cplusplus == 201103L)
+#   define OPTIONAL_MUTABLE_CONSTEXPR
+# else
+#   define OPTIONAL_MUTABLE_CONSTEXPR constexpr
+# endif
+
+namespace std{
+
+namespace experimental{
+
+// BEGIN workaround for missing is_trivially_destructible
+# if defined TR2_OPTIONAL_GCC_4_8_AND_HIGHER___
+    // leave it: it is already there
+# elif defined TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_
+    // leave it: it is already there
+# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___
+    // leave it: it is already there
+# elif defined TR2_OPTIONAL_DISABLE_EMULATION_OF_TYPE_TRAITS
+    // leave it: the user doesn't want it
+# else
+	template <typename T>
+	using is_trivially_destructible = std::has_trivial_destructor<T>;
+# endif
+// END workaround for missing is_trivially_destructible
+
+# if (defined TR2_OPTIONAL_GCC_4_7_AND_HIGHER___)
+    // leave it; our metafunctions are already defined.
+# elif defined TR2_OPTIONAL_CLANG_3_4_2_AND_HIGHER_
+    // leave it; our metafunctions are already defined.
+# elif defined TR2_OPTIONAL_MSVC_2015_AND_HIGHER___
+    // leave it: it is already there
+# elif defined TR2_OPTIONAL_DISABLE_EMULATION_OF_TYPE_TRAITS
+    // leave it: the user doesn't want it
+# else
+
+
+// workaround for missing traits in GCC and CLANG
+// Fallback: true iff T can be nothrow-constructed from an rvalue of T.
+template <class T>
+struct is_nothrow_move_constructible
+{
+  constexpr static bool value = std::is_nothrow_constructible<T, T&&>::value;
+};
+
+
+// Fallback is_assignable: SFINAE-probes whether
+// `declval<X>() = declval<Y>()` is a well-formed expression.
+template <class T, class U>
+struct is_assignable
+{
+  template <class X, class Y>
+  constexpr static bool has_assign(...) { return false; }
+
+  template <class X, class Y, size_t S = sizeof((std::declval<X>() = std::declval<Y>(), true)) >
+  // the comma operator is necessary for the cases where operator= returns void
+  constexpr static bool has_assign(bool) { return true; }
+
+  constexpr static bool value = has_assign<T, U>(true);
+};
+
+
+// Fallback: true iff T's move assignment exists and is noexcept.
+template <class T>
+struct is_nothrow_move_assignable
+{
+  template <class X, bool has_any_move_assign>
+  struct has_nothrow_move_assign {
+    constexpr static bool value = false;
+  };
+
+  template <class X>
+  struct has_nothrow_move_assign<X, true> {
+    constexpr static bool value = noexcept( std::declval<X&>() = std::declval<X&&>() );
+  };
+
+  constexpr static bool value = has_nothrow_move_assign<T, is_assignable<T&, T&&>::value>::value;
+};
+// end workaround
+
+
+# endif
+
+
+
+// 20.5.4, optional for object types
+template <class T> class optional;
+
+// 20.5.5, optional for lvalue reference types
+template <class T> class optional<T&>;
+
+
+// workaround: std utility functions aren't constexpr yet
+// constexpr std::forward replacement (lvalue overload).
+template <class T> inline constexpr T&& constexpr_forward(typename std::remove_reference<T>::type& t) noexcept
+{
+  return static_cast<T&&>(t);
+}
+
+// constexpr std::forward replacement (rvalue overload).
+template <class T> inline constexpr T&& constexpr_forward(typename std::remove_reference<T>::type&& t) noexcept
+{
+    static_assert(!std::is_lvalue_reference<T>::value, "!!");
+    return static_cast<T&&>(t);
+}
+
+// constexpr std::move replacement.
+template <class T> inline constexpr typename std::remove_reference<T>::type&& constexpr_move(T&& t) noexcept
+{
+    return static_cast<typename std::remove_reference<T>::type&&>(t);
+}
+
+
+#if defined NDEBUG
+# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) (EXPR)
+#else
+# define TR2_OPTIONAL_ASSERTED_EXPRESSION(CHECK, EXPR) ((CHECK) ? (EXPR) : ([]{assert(!#CHECK);}(), (EXPR)))
+#endif
+
+
+namespace detail_
+{
+
+// static_addressof: a constexpr version of addressof
+// SFINAE-detects whether T provides an overloaded operator&.
+template <typename T>
+struct has_overloaded_addressof
+{
+  template <class X>
+  constexpr static bool has_overload(...) { return false; }
+
+  template <class X, size_t S = sizeof(std::declval<X&>().operator&()) >
+  constexpr static bool has_overload(bool) { return true; }
+
+  constexpr static bool value = has_overload<T>(true);
+};
+
+// For types without an overloaded operator&, plain & is constexpr-usable.
+template <typename T, TR2_OPTIONAL_REQUIRES(!has_overloaded_addressof<T>)>
+constexpr T* static_addressof(T& ref)
+{
+  return &ref;
+}
+
+// Otherwise fall back to std::addressof (not constexpr here).
+template <typename T, TR2_OPTIONAL_REQUIRES(has_overloaded_addressof<T>)>
+T* static_addressof(T& ref)
+{
+  return std::addressof(ref);
+}
+
+
+// the call to convert<A>(b) has return type A and converts b to type A iff b decltype(b) is implicitly convertible to A
+template <class U>
+U convert(U v) { return v; }
+
+} // namespace detail
+
+
+constexpr struct trivial_init_t{} trivial_init{};
+
+
+// 20.5.6, In-place construction
+constexpr struct in_place_t{} in_place{};
+
+
+// 20.5.7, Disengaged state indicator
+// Tag type for a disengaged optional; the `init` token prevents accidental
+// implicit construction from e.g. {}.
+struct nullopt_t
+{
+  struct init{};
+  constexpr explicit nullopt_t(init){}
+};
+constexpr nullopt_t nullopt{nullopt_t::init()};
+
+
+// 20.5.8, class bad_optional_access
+// Thrown by optional<T>::value() when the optional is disengaged.
+class bad_optional_access : public logic_error {
+public:
+  explicit bad_optional_access(const string& what_arg) : logic_error{what_arg} {}
+  explicit bad_optional_access(const char* what_arg) : logic_error{what_arg} {}
+};
+
+
+// Raw storage for T: the union defers construction, and the empty (no-op)
+// destructor leaves destruction of value_ to the owning optional_base —
+// needed when T has a non-trivial destructor.
+template <class T>
+union storage_t
+{
+  unsigned char dummy_;
+  T value_;
+
+  constexpr storage_t( trivial_init_t ) noexcept : dummy_() {};
+
+  template <class... Args>
+  constexpr storage_t( Args&&... args ) : value_(constexpr_forward<Args>(args)...) {}
+
+  ~storage_t(){}
+};
+
+
+// Same idea for trivially-destructible T: the defaulted destructor keeps the
+// union itself trivially destructible, enabling constexpr optionals.
+template <class T>
+union constexpr_storage_t
+{
+    unsigned char dummy_;
+    T value_;
+
+    constexpr constexpr_storage_t( trivial_init_t ) noexcept : dummy_() {};
+
+    template <class... Args>
+    constexpr constexpr_storage_t( Args&&... args ) : value_(constexpr_forward<Args>(args)...) {}
+
+    ~constexpr_storage_t() = default;
+};
+
+
+// Engaged-flag + storage pair for non-trivially-destructible T; the
+// destructor runs T's destructor only when the optional is engaged.
+template <class T>
+struct optional_base
+{
+    bool init_;
+    storage_t<T> storage_;
+
+    constexpr optional_base() noexcept : init_(false), storage_(trivial_init) {};
+
+    explicit constexpr optional_base(const T& v) : init_(true), storage_(v) {}
+
+    explicit constexpr optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {}
+
+    template <class... Args> explicit optional_base(in_place_t, Args&&... args)
+        : init_(true), storage_(constexpr_forward<Args>(args)...) {}
+
+    template <class U, class... Args, TR2_OPTIONAL_REQUIRES(is_constructible<T, std::initializer_list<U>>)>
+    explicit optional_base(in_place_t, std::initializer_list<U> il, Args&&... args)
+        : init_(true), storage_(il, std::forward<Args>(args)...) {}
+
+    ~optional_base() { if (init_) storage_.value_.T::~T(); }
+};
+
+
+// Constexpr-friendly variant for trivially-destructible T (defaulted dtor).
+template <class T>
+struct constexpr_optional_base
+{
+    bool init_;
+    constexpr_storage_t<T> storage_;
+
+    constexpr constexpr_optional_base() noexcept : init_(false), storage_(trivial_init) {};
+
+    explicit constexpr constexpr_optional_base(const T& v) : init_(true), storage_(v) {}
+
+    explicit constexpr constexpr_optional_base(T&& v) : init_(true), storage_(constexpr_move(v)) {}
+
+    template <class... Args> explicit constexpr constexpr_optional_base(in_place_t, Args&&... args)
+      : init_(true), storage_(constexpr_forward<Args>(args)...) {}
+
+    template <class U, class... Args, TR2_OPTIONAL_REQUIRES(is_constructible<T, std::initializer_list<U>>)>
+    OPTIONAL_CONSTEXPR_INIT_LIST explicit constexpr_optional_base(in_place_t, std::initializer_list<U> il, Args&&... args)
+      : init_(true), storage_(il, std::forward<Args>(args)...) {}
+
+    ~constexpr_optional_base() = default;
+};
+
+// Select the base class by whether T's destructor is trivial.
+template <class T>
+using OptionalBase = typename std::conditional<
+    is_trivially_destructible<T>::value,
+    constexpr_optional_base<T>,
+    optional_base<T>
+>::type;
+
+
+
+template <class T>
+class optional : private OptionalBase<T>
+{
+  static_assert( !std::is_same<typename std::decay<T>::type, nullopt_t>::value, "bad T" );
+  static_assert( !std::is_same<typename std::decay<T>::type, in_place_t>::value, "bad T" );
+
+
+  constexpr bool initialized() const noexcept { return OptionalBase<T>::init_; }
+  T* dataptr() {  return std::addressof(OptionalBase<T>::storage_.value_); }
+  constexpr const T* dataptr() const { return detail_::static_addressof(OptionalBase<T>::storage_.value_); }
+
+# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1
+  constexpr const T& contained_val() const& { return OptionalBase<T>::storage_.value_; }
+#   if OPTIONAL_HAS_MOVE_ACCESSORS == 1
+  OPTIONAL_MUTABLE_CONSTEXPR T&& contained_val() && { return std::move(OptionalBase<T>::storage_.value_); }
+  OPTIONAL_MUTABLE_CONSTEXPR T& contained_val() & { return OptionalBase<T>::storage_.value_; }
+#   else
+  T& contained_val() & { return OptionalBase<T>::storage_.value_; }
+  T&& contained_val() && { return std::move(OptionalBase<T>::storage_.value_); }
+#   endif
+# else
+  constexpr const T& contained_val() const { return OptionalBase<T>::storage_.value_; }
+  T& contained_val() { return OptionalBase<T>::storage_.value_; }
+# endif
+
+  void clear() noexcept {
+    if (initialized()) dataptr()->T::~T();
+    OptionalBase<T>::init_ = false;
+  }
+
+  template <class... Args>
+  void initialize(Args&&... args) noexcept(noexcept(T(std::forward<Args>(args)...)))
+  {
+    assert(!OptionalBase<T>::init_);
+    ::new (static_cast<void*>(dataptr())) T(std::forward<Args>(args)...);
+    OptionalBase<T>::init_ = true;
+  }
+
+  template <class U, class... Args>
+  void initialize(std::initializer_list<U> il, Args&&... args) noexcept(noexcept(T(il, std::forward<Args>(args)...)))
+  {
+    assert(!OptionalBase<T>::init_);
+    ::new (static_cast<void*>(dataptr())) T(il, std::forward<Args>(args)...);
+    OptionalBase<T>::init_ = true;
+  }
+
+public:
+  typedef T value_type;
+
+  // 20.5.5.1, constructors
+  constexpr optional() noexcept : OptionalBase<T>()  {};
+  constexpr optional(nullopt_t) noexcept : OptionalBase<T>() {};
+
+  optional(const optional& rhs)
+  : OptionalBase<T>()
+  {
+    if (rhs.initialized()) {
+        ::new (static_cast<void*>(dataptr())) T(*rhs);
+        OptionalBase<T>::init_ = true;
+    }
+  }
+
+  optional(optional&& rhs) noexcept(is_nothrow_move_constructible<T>::value)
+  : OptionalBase<T>()
+  {
+    if (rhs.initialized()) {
+        ::new (static_cast<void*>(dataptr())) T(std::move(*rhs));
+        OptionalBase<T>::init_ = true;
+    }
+  }
+
+  constexpr optional(const T& v) : OptionalBase<T>(v) {}
+
+  constexpr optional(T&& v) : OptionalBase<T>(constexpr_move(v)) {}
+
+  template <class... Args>
+  explicit constexpr optional(in_place_t, Args&&... args)
+  : OptionalBase<T>(in_place_t{}, constexpr_forward<Args>(args)...) {}
+
+  template <class U, class... Args, TR2_OPTIONAL_REQUIRES(is_constructible<T, std::initializer_list<U>>)>
+  OPTIONAL_CONSTEXPR_INIT_LIST explicit optional(in_place_t, std::initializer_list<U> il, Args&&... args)
+  : OptionalBase<T>(in_place_t{}, il, constexpr_forward<Args>(args)...) {}
+
+  // 20.5.4.2, Destructor
+  ~optional() = default;
+
+  // 20.5.4.3, assignment
+  optional& operator=(nullopt_t) noexcept
+  {
+    clear();
+    return *this;
+  }
+
+  optional& operator=(const optional& rhs)
+  {
+    if      (initialized() == true  && rhs.initialized() == false) clear();
+    else if (initialized() == false && rhs.initialized() == true)  initialize(*rhs);
+    else if (initialized() == true  && rhs.initialized() == true)  contained_val() = *rhs;
+    return *this;
+  }
+
+  optional& operator=(optional&& rhs)
+  noexcept(is_nothrow_move_assignable<T>::value && is_nothrow_move_constructible<T>::value)
+  {
+    if      (initialized() == true  && rhs.initialized() == false) clear();
+    else if (initialized() == false && rhs.initialized() == true)  initialize(std::move(*rhs));
+    else if (initialized() == true  && rhs.initialized() == true)  contained_val() = std::move(*rhs);
+    return *this;
+  }
+
+  template <class U>
+  auto operator=(U&& v)
+  -> typename enable_if
+  <
+    is_same<typename decay<U>::type, T>::value,
+    optional&
+  >::type
+  {
+    if (initialized()) { contained_val() = std::forward<U>(v); }
+    else               { initialize(std::forward<U>(v));  }
+    return *this;
+  }
+
+
+  template <class... Args>
+  void emplace(Args&&... args)
+  {
+    clear();
+    initialize(std::forward<Args>(args)...);
+  }
+
+  template <class U, class... Args>
+  void emplace(initializer_list<U> il, Args&&... args)
+  {
+    clear();
+    initialize<U, Args...>(il, std::forward<Args>(args)...);
+  }
+
+  // 20.5.4.4, Swap
+  void swap(optional<T>& rhs) noexcept(is_nothrow_move_constructible<T>::value && noexcept(swap(declval<T&>(), declval<T&>())))
+  {
+    if      (initialized() == true  && rhs.initialized() == false) { rhs.initialize(std::move(**this)); clear(); }
+    else if (initialized() == false && rhs.initialized() == true)  { initialize(std::move(*rhs)); rhs.clear(); }
+    else if (initialized() == true  && rhs.initialized() == true)  { using std::swap; swap(**this, *rhs); }
+  }
+
+  // 20.5.4.5, Observers
+
+  explicit constexpr operator bool() const noexcept { return initialized(); }
+
+  constexpr T const* operator ->() const {
+    return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), dataptr());
+  }
+
+# if OPTIONAL_HAS_MOVE_ACCESSORS == 1
+
+  OPTIONAL_MUTABLE_CONSTEXPR T* operator ->() {
+    assert (initialized());
+    return dataptr();
+  }
+
+  constexpr T const& operator *() const& {
+    return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val());
+  }
+
+  OPTIONAL_MUTABLE_CONSTEXPR T& operator *() & {
+    assert (initialized());
+    return contained_val();
+  }
+
+  OPTIONAL_MUTABLE_CONSTEXPR T&& operator *() && {
+    assert (initialized());
+    return constexpr_move(contained_val());
+  }
+
+  constexpr T const& value() const& {
+    return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val());
+  }
+
+  OPTIONAL_MUTABLE_CONSTEXPR T& value() & {
+    return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val());
+  }
+
+  OPTIONAL_MUTABLE_CONSTEXPR T&& value() && {
+    if (!initialized()) throw bad_optional_access("bad optional access");
+    return std::move(contained_val());
+  }
+
+# else
+
+  T* operator ->() {
+    assert (initialized());
+    return dataptr();
+  }
+
+  constexpr T const& operator *() const {
+    return TR2_OPTIONAL_ASSERTED_EXPRESSION(initialized(), contained_val());
+  }
+
+  T& operator *() {
+    assert (initialized());
+    return contained_val();
+  }
+
+  constexpr T const& value() const {
+    return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val());
+  }
+
+  T& value() {
+    return initialized() ? contained_val() : (throw bad_optional_access("bad optional access"), contained_val());
+  }
+
+# endif
+
+# if OPTIONAL_HAS_THIS_RVALUE_REFS == 1
+
+  template <class V>
+  constexpr T value_or(V&& v) const&
+  {
+    return *this ? **this : detail_::convert<T>(constexpr_forward<V>(v));
+  }
+
+#   if OPTIONAL_HAS_MOVE_ACCESSORS == 1
+
+  template <class V>
+  OPTIONAL_MUTABLE_CONSTEXPR T value_or(V&& v) &&
+  {
+    return *this ? constexpr_move(const_cast<optional<T>&>(*this).contained_val()) : detail_::convert<T>(constexpr_forward<V>(v));
+  }
+
+#   else
+
+  template <class V>
+  T value_or(V&& v) &&
+  {
+    return *this ? constexpr_move(const_cast<optional<T>&>(*this).contained_val()) : detail_::convert<T>(constexpr_forward<V>(v));
+  }
+
+#   endif
+
+# else
+
+  template <class V>
+  constexpr T value_or(V&& v) const
+  {
+    return *this ? **this : detail_::convert<T>(constexpr_forward<V>(v));
+  }
+
+# endif
+
+};
+
+
+template <class T>
+class optional<T&>
+{
+  static_assert( !std::is_same<T, nullopt_t>::value, "bad T" );
+  static_assert( !std::is_same<T, in_place_t>::value, "bad T" );
+  T* ref;
+
+public:
+
+  // 20.5.5.1, construction/destruction
+  constexpr optional() noexcept : ref(nullptr) {}
+
+  constexpr optional(nullopt_t) noexcept : ref(nullptr) {}
+
+  constexpr optional(T& v) noexcept : ref(detail_::static_addressof(v)) {}
+
+  optional(T&&) = delete;
+
+  constexpr optional(const optional& rhs) noexcept : ref(rhs.ref) {}
+
+  explicit constexpr optional(in_place_t, T& v) noexcept : ref(detail_::static_addressof(v)) {}
+
+  explicit optional(in_place_t, T&&) = delete;
+
+  ~optional() = default;
+
+  // 20.5.5.2, mutation
+  optional& operator=(nullopt_t) noexcept {
+    ref = nullptr;
+    return *this;
+  }
+
+  // optional& operator=(const optional& rhs) noexcept {
+    // ref = rhs.ref;
+    // return *this;
+  // }
+
+  // optional& operator=(optional&& rhs) noexcept {
+    // ref = rhs.ref;
+    // return *this;
+  // }
+
+  template <typename U>
+  auto operator=(U&& rhs) noexcept
+  -> typename enable_if
+  <
+    is_same<typename decay<U>::type, optional<T&>>::value,
+    optional&
+  >::type
+  {
+    ref = rhs.ref;
+    return *this;
+  }
+
+  template <typename U>
+  auto operator=(U&& rhs) noexcept
+  -> typename enable_if
+  <
+    !is_same<typename decay<U>::type, optional<T&>>::value,
+    optional&
+  >::type
+  = delete;
+
+  void emplace(T& v) noexcept {
+    ref = detail_::static_addressof(v);
+  }
+
+  void emplace(T&&) = delete;
+
+
+  void swap(optional<T&>& rhs) noexcept
+  {
+    std::swap(ref, rhs.ref);
+  }
+
+  // 20.5.5.3, observers
+  constexpr T* operator->() const {
+    return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, ref);
+  }
+
+  constexpr T& operator*() const {
+    return TR2_OPTIONAL_ASSERTED_EXPRESSION(ref, *ref);
+  }
+
+  constexpr T& value() const {
+    return ref ? *ref : (throw bad_optional_access("bad optional access"), *ref);
+  }
+
+  explicit constexpr operator bool() const noexcept {
+    return ref != nullptr;
+  }
+
+  template <class V>
+  constexpr typename decay<T>::type value_or(V&& v) const
+  {
+    return *this ? **this : detail_::convert<typename decay<T>::type>(constexpr_forward<V>(v));
+  }
+};
+
+
+template <class T>
+class optional<T&&>
+{
+  static_assert( sizeof(T) == 0, "optional rvalue references disallowed" );
+};
+
+
+// 20.5.8, Relational operators
+template <class T> constexpr bool operator==(const optional<T>& x, const optional<T>& y)
+{
+  return bool(x) != bool(y) ? false : bool(x) == false ? true : *x == *y;
+}
+
+template <class T> constexpr bool operator!=(const optional<T>& x, const optional<T>& y)
+{
+  return !(x == y);
+}
+
+template <class T> constexpr bool operator<(const optional<T>& x, const optional<T>& y)
+{
+  return (!y) ? false : (!x) ? true : *x < *y;
+}
+
+template <class T> constexpr bool operator>(const optional<T>& x, const optional<T>& y)
+{
+  return (y < x);
+}
+
+template <class T> constexpr bool operator<=(const optional<T>& x, const optional<T>& y)
+{
+  return !(y < x);
+}
+
+template <class T> constexpr bool operator>=(const optional<T>& x, const optional<T>& y)
+{
+  return !(x < y);
+}
+
+
+// 20.5.9, Comparison with nullopt
+template <class T> constexpr bool operator==(const optional<T>& x, nullopt_t) noexcept
+{
+  return (!x);
+}
+
+template <class T> constexpr bool operator==(nullopt_t, const optional<T>& x) noexcept
+{
+  return (!x);
+}
+
+template <class T> constexpr bool operator!=(const optional<T>& x, nullopt_t) noexcept
+{
+  return bool(x);
+}
+
+template <class T> constexpr bool operator!=(nullopt_t, const optional<T>& x) noexcept
+{
+  return bool(x);
+}
+
+template <class T> constexpr bool operator<(const optional<T>&, nullopt_t) noexcept
+{
+  return false;
+}
+
+template <class T> constexpr bool operator<(nullopt_t, const optional<T>& x) noexcept
+{
+  return bool(x);
+}
+
+template <class T> constexpr bool operator<=(const optional<T>& x, nullopt_t) noexcept
+{
+  return (!x);
+}
+
+template <class T> constexpr bool operator<=(nullopt_t, const optional<T>&) noexcept
+{
+  return true;
+}
+
+template <class T> constexpr bool operator>(const optional<T>& x, nullopt_t) noexcept
+{
+  return bool(x);
+}
+
+template <class T> constexpr bool operator>(nullopt_t, const optional<T>&) noexcept
+{
+  return false;
+}
+
+template <class T> constexpr bool operator>=(const optional<T>&, nullopt_t) noexcept
+{
+  return true;
+}
+
+template <class T> constexpr bool operator>=(nullopt_t, const optional<T>& x) noexcept
+{
+  return (!x);
+}
+
+
+
+// 20.5.10, Comparison with T
+template <class T> constexpr bool operator==(const optional<T>& x, const T& v)
+{
+  return bool(x) ? *x == v : false;
+}
+
+template <class T> constexpr bool operator==(const T& v, const optional<T>& x)
+{
+  return bool(x) ? v == *x : false;
+}
+
+template <class T> constexpr bool operator!=(const optional<T>& x, const T& v)
+{
+  return bool(x) ? *x != v : true;
+}
+
+template <class T> constexpr bool operator!=(const T& v, const optional<T>& x)
+{
+  return bool(x) ? v != *x : true;
+}
+
+template <class T> constexpr bool operator<(const optional<T>& x, const T& v)
+{
+  return bool(x) ? *x < v : true;
+}
+
+template <class T> constexpr bool operator>(const T& v, const optional<T>& x)
+{
+  return bool(x) ? v > *x : true;
+}
+
+template <class T> constexpr bool operator>(const optional<T>& x, const T& v)
+{
+  return bool(x) ? *x > v : false;
+}
+
+template <class T> constexpr bool operator<(const T& v, const optional<T>& x)
+{
+  return bool(x) ? v < *x : false;
+}
+
+template <class T> constexpr bool operator>=(const optional<T>& x, const T& v)
+{
+  return bool(x) ? *x >= v : false;
+}
+
+template <class T> constexpr bool operator<=(const T& v, const optional<T>& x)
+{
+  return bool(x) ? v <= *x : false;
+}
+
+template <class T> constexpr bool operator<=(const optional<T>& x, const T& v)
+{
+  return bool(x) ? *x <= v : true;
+}
+
+template <class T> constexpr bool operator>=(const T& v, const optional<T>& x)
+{
+  return bool(x) ? v >= *x : true;
+}
+
+
+// Comparison of optional<T&> with T
+template <class T> constexpr bool operator==(const optional<T&>& x, const T& v)
+{
+  return bool(x) ? *x == v : false;
+}
+
+template <class T> constexpr bool operator==(const T& v, const optional<T&>& x)
+{
+  return bool(x) ? v == *x : false;
+}
+
+template <class T> constexpr bool operator!=(const optional<T&>& x, const T& v)
+{
+  return bool(x) ? *x != v : true;
+}
+
+template <class T> constexpr bool operator!=(const T& v, const optional<T&>& x)
+{
+  return bool(x) ? v != *x : true;
+}
+
+template <class T> constexpr bool operator<(const optional<T&>& x, const T& v)
+{
+  return bool(x) ? *x < v : true;
+}
+
+template <class T> constexpr bool operator>(const T& v, const optional<T&>& x)
+{
+  return bool(x) ? v > *x : true;
+}
+
+template <class T> constexpr bool operator>(const optional<T&>& x, const T& v)
+{
+  return bool(x) ? *x > v : false;
+}
+
+template <class T> constexpr bool operator<(const T& v, const optional<T&>& x)
+{
+  return bool(x) ? v < *x : false;
+}
+
+template <class T> constexpr bool operator>=(const optional<T&>& x, const T& v)
+{
+  return bool(x) ? *x >= v : false;
+}
+
+template <class T> constexpr bool operator<=(const T& v, const optional<T&>& x)
+{
+  return bool(x) ? v <= *x : false;
+}
+
+template <class T> constexpr bool operator<=(const optional<T&>& x, const T& v)
+{
+  return bool(x) ? *x <= v : true;
+}
+
+template <class T> constexpr bool operator>=(const T& v, const optional<T&>& x)
+{
+  return bool(x) ? v >= *x : true;
+}
+
+// Comparison of optional<T const&> with T
+template <class T> constexpr bool operator==(const optional<const T&>& x, const T& v)
+{
+  return bool(x) ? *x == v : false;
+}
+
+template <class T> constexpr bool operator==(const T& v, const optional<const T&>& x)
+{
+  return bool(x) ? v == *x : false;
+}
+
+template <class T> constexpr bool operator!=(const optional<const T&>& x, const T& v)
+{
+  return bool(x) ? *x != v : true;
+}
+
+template <class T> constexpr bool operator!=(const T& v, const optional<const T&>& x)
+{
+  return bool(x) ? v != *x : true;
+}
+
+template <class T> constexpr bool operator<(const optional<const T&>& x, const T& v)
+{
+  return bool(x) ? *x < v : true;
+}
+
+template <class T> constexpr bool operator>(const T& v, const optional<const T&>& x)
+{
+  return bool(x) ? v > *x : true;
+}
+
+template <class T> constexpr bool operator>(const optional<const T&>& x, const T& v)
+{
+  return bool(x) ? *x > v : false;
+}
+
+template <class T> constexpr bool operator<(const T& v, const optional<const T&>& x)
+{
+  return bool(x) ? v < *x : false;
+}
+
+template <class T> constexpr bool operator>=(const optional<const T&>& x, const T& v)
+{
+  return bool(x) ? *x >= v : false;
+}
+
+template <class T> constexpr bool operator<=(const T& v, const optional<const T&>& x)
+{
+  return bool(x) ? v <= *x : false;
+}
+
+template <class T> constexpr bool operator<=(const optional<const T&>& x, const T& v)
+{
+  return bool(x) ? *x <= v : true;
+}
+
+template <class T> constexpr bool operator>=(const T& v, const optional<const T&>& x)
+{
+  return bool(x) ? v >= *x : true;
+}
+
+
+// 20.5.12, Specialized algorithms
+template <class T>
+void swap(optional<T>& x, optional<T>& y) noexcept(noexcept(x.swap(y)))
+{
+  x.swap(y);
+}
+
+
+template <class T>
+constexpr optional<typename decay<T>::type> make_optional(T&& v)
+{
+  return optional<typename decay<T>::type>(constexpr_forward<T>(v));
+}
+
+template <class X>
+constexpr optional<X&> make_optional(reference_wrapper<X> v)
+{
+  return optional<X&>(v.get());
+}
+
+
+} // namespace experimental
+} // namespace std
+
+namespace std
+{
+  template <typename T>
+  struct hash<std::experimental::optional<T>>
+  {
+    typedef typename hash<T>::result_type result_type;
+    typedef std::experimental::optional<T> argument_type;
+
+    constexpr result_type operator()(argument_type const& arg) const {
+      return arg ? std::hash<T>{}(*arg) : result_type{};
+    }
+  };
+
+  template <typename T>
+  struct hash<std::experimental::optional<T&>>
+  {
+    typedef typename hash<T>::result_type result_type;
+    typedef std::experimental::optional<T&> argument_type;
+
+    constexpr result_type operator()(argument_type const& arg) const {
+      return arg ? std::hash<T>{}(*arg) : result_type{};
+    }
+  };
+}
+
+# undef TR2_OPTIONAL_REQUIRES
+# undef TR2_OPTIONAL_ASSERTED_EXPRESSION
+
+# endif //___OPTIONAL_HPP___
diff --git a/src/cc/vendor/tinyformat.hpp b/src/cc/vendor/tinyformat.hpp
new file mode 100644
index 0000000..3273ad6
--- /dev/null
+++ b/src/cc/vendor/tinyformat.hpp
@@ -0,0 +1,1039 @@
+// tinyformat.h
+// Copyright (C) 2011, Chris Foster [chris42f (at) gmail (d0t) com]
+//
+// Boost Software License - Version 1.0
+//
+// Permission is hereby granted, free of charge, to any person or organization
+// obtaining a copy of the software and accompanying documentation covered by
+// this license (the "Software") to use, reproduce, display, distribute,
+// execute, and transmit the Software, and to prepare derivative works of the
+// Software, and to permit third-parties to whom the Software is furnished to
+// do so, all subject to the following:
+//
+// The copyright notices in the Software and this entire statement, including
+// the above license grant, this restriction and the following disclaimer,
+// must be included in all copies of the Software, in whole or in part, and
+// all derivative works of the Software, unless such copies or derivative
+// works are solely in the form of machine-executable object code generated by
+// a source language processor.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+//------------------------------------------------------------------------------
+// Tinyformat: A minimal type safe printf replacement
+//
+// tinyformat.h is a type safe printf replacement library in a single C++
+// header file.  Design goals include:
+//
+// * Type safety and extensibility for user defined types.
+// * C99 printf() compatibility, to the extent possible using std::ostream
+// * Simplicity and minimalism.  A single header file to include and distribute
+//   with your projects.
+// * Augment rather than replace the standard stream formatting mechanism
+// * C++98 support, with optional C++11 niceties
+//
+//
+// Main interface example usage
+// ----------------------------
+//
+// To print a date to std::cout:
+//
+//   std::string weekday = "Wednesday";
+//   const char* month = "July";
+//   size_t day = 27;
+//   long hour = 14;
+//   int min = 44;
+//
+//   tfm::printf("%s, %s %d, %.2d:%.2d\n", weekday, month, day, hour, min);
+//
+// The strange types here emphasize the type safety of the interface; it is
+// possible to print a std::string using the "%s" conversion, and a
+// size_t using the "%d" conversion.  A similar result could be achieved
+// using either of the tfm::format() functions.  One prints on a user provided
+// stream:
+//
+//   tfm::format(std::cerr, "%s, %s %d, %.2d:%.2d\n",
+//               weekday, month, day, hour, min);
+//
+// The other returns a std::string:
+//
+//   std::string date = tfm::format("%s, %s %d, %.2d:%.2d\n",
+//                                  weekday, month, day, hour, min);
+//   std::cout << date;
+//
+// These are the three primary interface functions.  There is also a
+// convenience function printfln() which appends a newline to the usual result
+// of printf() for super simple logging.
+//
+//
+// User defined format functions
+// -----------------------------
+//
+// Simulating variadic templates in C++98 is pretty painful since it requires
+// writing out the same function for each desired number of arguments.  To make
+// this bearable tinyformat comes with a set of macros which are used
+// internally to generate the API, but which may also be used in user code.
+//
+// The three macros TINYFORMAT_ARGTYPES(n), TINYFORMAT_VARARGS(n) and
+// TINYFORMAT_PASSARGS(n) will generate a list of n argument types,
+// type/name pairs and argument names respectively when called with an integer
+// n between 1 and 16.  We can use these to define a macro which generates the
+// desired user defined function with n arguments.  To generate all 16 user
+// defined function bodies, use the macro TINYFORMAT_FOREACH_ARGNUM.  For an
+// example, see the implementation of printf() at the end of the source file.
+//
+// Sometimes it's useful to be able to pass a list of format arguments through
+// to a non-template function.  The FormatList class is provided as a way to do
+// this by storing the argument list in a type-opaque way.  Continuing the
+// example from above, we construct a FormatList using makeFormatList():
+//
+//   FormatListRef formatList = tfm::makeFormatList(weekday, month, day, hour, min);
+//
+// The format list can now be passed into any non-template function and used
+// via a call to the vformat() function:
+//
+//   tfm::vformat(std::cout, "%s, %s %d, %.2d:%.2d\n", formatList);
+//
+//
+// Additional API information
+// --------------------------
+//
+// Error handling: Define TINYFORMAT_ERROR to customize the error handling for
+// format strings which are unsupported or have the wrong number of format
+// specifiers (calls assert() by default).
+//
+// User defined types: Uses operator<< for user defined types by default.
+// Overload formatValue() for more control.
+
+
+#ifndef TINYFORMAT_H_INCLUDED
+#define TINYFORMAT_H_INCLUDED
+
+namespace tinyformat {}
+//------------------------------------------------------------------------------
+// Config section.  Customize to your liking!
+
+// Namespace alias to encourage brevity
+namespace tfm = tinyformat;
+
+// Error handling; calls assert() by default.
+// #define TINYFORMAT_ERROR(reasonString) your_error_handler(reasonString)
+
+// Define for C++11 variadic templates which make the code shorter & more
+// general.  If you don't define this, C++11 support is autodetected below.
+// #define TINYFORMAT_USE_VARIADIC_TEMPLATES
+
+
+//------------------------------------------------------------------------------
+// Implementation details.
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <sstream>
+
+#ifndef TINYFORMAT_ERROR
+#   define TINYFORMAT_ERROR(reason) assert(0 && reason)
+#endif
+
+#if !defined(TINYFORMAT_USE_VARIADIC_TEMPLATES) && !defined(TINYFORMAT_NO_VARIADIC_TEMPLATES)
+#   ifdef __GXX_EXPERIMENTAL_CXX0X__
+#       define TINYFORMAT_USE_VARIADIC_TEMPLATES
+#   endif
+#endif
+
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20080201
+//  std::showpos is broken on old libstdc++ as provided with OSX.  See
+//  http://gcc.gnu.org/ml/libstdc++/2007-11/msg00075.html
+#   define TINYFORMAT_OLD_LIBSTDCPLUSPLUS_WORKAROUND
+#endif
+
+#ifdef __APPLE__
+// Workaround OSX linker warning: xcode uses different default symbol
+// visibilities for static libs vs executables (see issue #25)
+#   define TINYFORMAT_HIDDEN __attribute__((visibility("hidden")))
+#else
+#   define TINYFORMAT_HIDDEN
+#endif
+
+namespace tinyformat {
+
+//------------------------------------------------------------------------------
+namespace detail {
+
+// Test whether type T1 is convertible to type T2
+template <typename T1, typename T2>
+struct is_convertible
+{
+    private:
+        // two types of different size
+        struct fail { char dummy[2]; };
+        struct succeed { char dummy; };
+        // Try to convert a T1 to a T2 by plugging into tryConvert
+        static fail tryConvert(...);
+        static succeed tryConvert(const T2&);
+        static const T1& makeT1();
+    public:
+#       ifdef _MSC_VER
+        // Disable spurious loss of precision warnings in tryConvert(makeT1())
+#       pragma warning(push)
+#       pragma warning(disable:4244)
+#       pragma warning(disable:4267)
+#       endif
+        // Standard trick: the (...) version of tryConvert will be chosen from
+        // the overload set only if the version taking a T2 doesn't match.
+        // Then we compare the sizes of the return types to check which
+        // function matched.  Very neat, in a disgusting kind of way :)
+        static const bool value =
+            sizeof(tryConvert(makeT1())) == sizeof(succeed);
+#       ifdef _MSC_VER
+#       pragma warning(pop)
+#       endif
+};
+
+
+// Detect when a type is not a wchar_t string
+template<typename T> struct is_wchar { typedef int tinyformat_wchar_is_not_supported; };
+template<> struct is_wchar<wchar_t*> {};
+template<> struct is_wchar<const wchar_t*> {};
+template<int n> struct is_wchar<const wchar_t[n]> {};
+template<int n> struct is_wchar<wchar_t[n]> {};
+
+
+// Format the value by casting to type fmtT.  This default implementation
+// should never be called.
+template<typename T, typename fmtT, bool convertible = is_convertible<T, fmtT>::value>
+struct formatValueAsType
+{
+    static void invoke(std::ostream& /*out*/, const T& /*value*/) { assert(0); }
+};
+// Specialized version for types that can actually be converted to fmtT, as
+// indicated by the "convertible" template parameter.
+template<typename T, typename fmtT>
+struct formatValueAsType<T,fmtT,true>
+{
+    static void invoke(std::ostream& out, const T& value)
+        { out << static_cast<fmtT>(value); }
+};
+
+#ifdef TINYFORMAT_OLD_LIBSTDCPLUSPLUS_WORKAROUND
+template<typename T, bool convertible = is_convertible<T, int>::value>
+struct formatZeroIntegerWorkaround
+{
+    static bool invoke(std::ostream& /**/, const T& /**/) { return false; }
+};
+template<typename T>
+struct formatZeroIntegerWorkaround<T,true>
+{
+    static bool invoke(std::ostream& out, const T& value)
+    {
+        if (static_cast<int>(value) == 0 && out.flags() & std::ios::showpos)
+        {
+            out << "+0";
+            return true;
+        }
+        return false;
+    }
+};
+#endif // TINYFORMAT_OLD_LIBSTDCPLUSPLUS_WORKAROUND
+
+// Convert an arbitrary type to integer.  The version with convertible=false
+// throws an error.
+template<typename T, bool convertible = is_convertible<T,int>::value>
+struct convertToInt
+{
+    static int invoke(const T& /*value*/)
+    {
+        TINYFORMAT_ERROR("tinyformat: Cannot convert from argument type to "
+                         "integer for use as variable width or precision");
+        return 0;
+    }
+};
+// Specialization for convertToInt when conversion is possible
+template<typename T>
+struct convertToInt<T,true>
+{
+    static int invoke(const T& value) { return static_cast<int>(value); }
+};
+
+// Format at most ntrunc characters to the given stream.
+template<typename T>
+inline void formatTruncated(std::ostream& out, const T& value, int ntrunc)
+{
+    std::ostringstream tmp;
+    tmp << value;
+    std::string result = tmp.str();
+    out.write(result.c_str(), std::min(ntrunc, static_cast<int>(result.size())));
+}
+#define TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(type)       \
+inline void formatTruncated(std::ostream& out, type* value, int ntrunc) \
+{                                                           \
+    std::streamsize len = 0;                                \
+    while(len < ntrunc && value[len] != 0)                  \
+        ++len;                                              \
+    out.write(value, len);                                  \
+}
+// Overload for const char* and char*.  Could overload for signed & unsigned
+// char too, but these are technically unneeded for printf compatibility.
+TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(const char)
+TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char)
+#undef TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR
+
+} // namespace detail
+
+
+//------------------------------------------------------------------------------
+// Variable formatting functions.  May be overridden for user-defined types if
+// desired.
+
+
+/// Format a value into a stream, delegating to operator<< by default.
+///
+/// Users may override this for their own types.  When this function is called,
+/// the stream flags will have been modified according to the format string.
+/// The format specification is provided in the range [fmtBegin, fmtEnd).  For
+/// truncating conversions, ntrunc is set to the desired maximum number of
+/// characters, for example "%.7s" calls formatValue with ntrunc = 7.
+///
+/// By default, formatValue() uses the usual stream insertion operator
+/// operator<< to format the type T, with special cases for the %c and %p
+/// conversions.
+template<typename T>
+inline void formatValue(std::ostream& out, const char* /*fmtBegin*/,
+                        const char* fmtEnd, int ntrunc, const T& value)
+{
+#ifndef TINYFORMAT_ALLOW_WCHAR_STRINGS
+    // Since we don't support printing of wchar_t using "%ls", make it fail at
+    // compile time in preference to printing as a void* at runtime.
+    typedef typename detail::is_wchar<T>::tinyformat_wchar_is_not_supported DummyType;
+    (void) DummyType(); // avoid unused type warning with gcc-4.8
+#endif
+    // The mess here is to support the %c and %p conversions: if these
+    // conversions are active we try to convert the type to a char or const
+    // void* respectively and format that instead of the value itself.  For the
+    // %p conversion it's important to avoid dereferencing the pointer, which
+    // could otherwise lead to a crash when printing a dangling (const char*).
+    bool canConvertToChar = detail::is_convertible<T,char>::value;
+    bool canConvertToVoidPtr = detail::is_convertible<T, const void*>::value;
+    if(canConvertToChar && *(fmtEnd-1) == 'c')
+        detail::formatValueAsType<T, char>::invoke(out, value);
+    else if(canConvertToVoidPtr && *(fmtEnd-1) == 'p')
+        detail::formatValueAsType<T, const void*>::invoke(out, value);
+#ifdef TINYFORMAT_OLD_LIBSTDCPLUSPLUS_WORKAROUND
+    else if(detail::formatZeroIntegerWorkaround<T>::invoke(out, value)) /**/;
+#endif
+    else if(ntrunc >= 0)
+    {
+        // Take care not to overread C strings in truncating conversions like
+        // "%.4s" where at most 4 characters may be read.
+        detail::formatTruncated(out, value, ntrunc);
+    }
+    else
+        out << value;
+}
+
+
+// Overloaded version for char types to support printing as an integer
+#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType)                  \
+inline void formatValue(std::ostream& out, const char* /*fmtBegin*/,  \
+                        const char* fmtEnd, int /**/, charType value) \
+{                                                                     \
+    switch(*(fmtEnd-1))                                               \
+    {                                                                 \
+        case 'u': case 'd': case 'i': case 'o': case 'X': case 'x':   \
+            out << static_cast<int>(value); break;                    \
+        default:                                                      \
+            out << value;                   break;                    \
+    }                                                                 \
+}
+// per 3.9.1: char, signed char and unsigned char are all distinct types
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char)
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(signed char)
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(unsigned char)
+#undef TINYFORMAT_DEFINE_FORMATVALUE_CHAR
+
+
+//------------------------------------------------------------------------------
+// Tools for emulating variadic templates in C++98.  The basic idea here is
+// stolen from the boost preprocessor metaprogramming library and cut down to
+// be just general enough for what we need.
+
+#define TINYFORMAT_ARGTYPES(n) TINYFORMAT_ARGTYPES_ ## n
+#define TINYFORMAT_VARARGS(n) TINYFORMAT_VARARGS_ ## n
+#define TINYFORMAT_PASSARGS(n) TINYFORMAT_PASSARGS_ ## n
+#define TINYFORMAT_PASSARGS_TAIL(n) TINYFORMAT_PASSARGS_TAIL_ ## n
+
+// To keep it as transparent as possible, the macros below have been generated
+// using python via the excellent cog.py code generation script.  This avoids
+// the need for a bunch of complex (but more general) preprocessor tricks as
+// used in boost.preprocessor.
+//
+// To rerun the code generation in place, use `cog.py -r tinyformat.h`
+// (see http://nedbatchelder.com/code/cog).  Alternatively you can just create
+// extra versions by hand.
+
+/*[[[cog
+maxParams = 16
+
+def makeCommaSepLists(lineTemplate, elemTemplate, startInd=1):
+    for j in range(startInd,maxParams+1):
+        list = ', '.join([elemTemplate % {'i':i} for i in range(startInd,j+1)])
+        cog.outl(lineTemplate % {'j':j, 'list':list})
+
+makeCommaSepLists('#define TINYFORMAT_ARGTYPES_%(j)d %(list)s',
+                  'class T%(i)d')
+
+cog.outl()
+makeCommaSepLists('#define TINYFORMAT_VARARGS_%(j)d %(list)s',
+                  'const T%(i)d& v%(i)d')
+
+cog.outl()
+makeCommaSepLists('#define TINYFORMAT_PASSARGS_%(j)d %(list)s', 'v%(i)d')
+
+cog.outl()
+cog.outl('#define TINYFORMAT_PASSARGS_TAIL_1')
+makeCommaSepLists('#define TINYFORMAT_PASSARGS_TAIL_%(j)d , %(list)s',
+                  'v%(i)d', startInd = 2)
+
+cog.outl()
+cog.outl('#define TINYFORMAT_FOREACH_ARGNUM(m) \\\n    ' +
+         ' '.join(['m(%d)' % (j,) for j in range(1,maxParams+1)]))
+]]]*/
+#define TINYFORMAT_ARGTYPES_1 class T1
+#define TINYFORMAT_ARGTYPES_2 class T1, class T2
+#define TINYFORMAT_ARGTYPES_3 class T1, class T2, class T3
+#define TINYFORMAT_ARGTYPES_4 class T1, class T2, class T3, class T4
+#define TINYFORMAT_ARGTYPES_5 class T1, class T2, class T3, class T4, class T5
+#define TINYFORMAT_ARGTYPES_6 class T1, class T2, class T3, class T4, class T5, class T6
+#define TINYFORMAT_ARGTYPES_7 class T1, class T2, class T3, class T4, class T5, class T6, class T7
+#define TINYFORMAT_ARGTYPES_8 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8
+#define TINYFORMAT_ARGTYPES_9 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9
+#define TINYFORMAT_ARGTYPES_10 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9, class T10
+#define TINYFORMAT_ARGTYPES_11 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9, class T10, class T11
+#define TINYFORMAT_ARGTYPES_12 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9, class T10, class T11, class T12
+#define TINYFORMAT_ARGTYPES_13 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9, class T10, class T11, class T12, class T13
+#define TINYFORMAT_ARGTYPES_14 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9, class T10, class T11, class T12, class T13, class T14
+#define TINYFORMAT_ARGTYPES_15 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9, class T10, class T11, class T12, class T13, class T14, class T15
+#define TINYFORMAT_ARGTYPES_16 class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9, class T10, class T11, class T12, class T13, class T14, class T15, class T16
+
+#define TINYFORMAT_VARARGS_1 const T1& v1
+#define TINYFORMAT_VARARGS_2 const T1& v1, const T2& v2
+#define TINYFORMAT_VARARGS_3 const T1& v1, const T2& v2, const T3& v3
+#define TINYFORMAT_VARARGS_4 const T1& v1, const T2& v2, const T3& v3, const T4& v4
+#define TINYFORMAT_VARARGS_5 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5
+#define TINYFORMAT_VARARGS_6 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6
+#define TINYFORMAT_VARARGS_7 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7
+#define TINYFORMAT_VARARGS_8 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8
+#define TINYFORMAT_VARARGS_9 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8, const T9& v9
+#define TINYFORMAT_VARARGS_10 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8, const T9& v9, const T10& v10
+#define TINYFORMAT_VARARGS_11 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8, const T9& v9, const T10& v10, const T11& v11
+#define TINYFORMAT_VARARGS_12 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8, const T9& v9, const T10& v10, const T11& v11, const T12& v12
+#define TINYFORMAT_VARARGS_13 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8, const T9& v9, const T10& v10, const T11& v11, const T12& v12, const T13& v13
+#define TINYFORMAT_VARARGS_14 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8, const T9& v9, const T10& v10, const T11& v11, const T12& v12, const T13& v13, const T14& v14
+#define TINYFORMAT_VARARGS_15 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8, const T9& v9, const T10& v10, const T11& v11, const T12& v12, const T13& v13, const T14& v14, const T15& v15
+#define TINYFORMAT_VARARGS_16 const T1& v1, const T2& v2, const T3& v3, const T4& v4, const T5& v5, const T6& v6, const T7& v7, const T8& v8, const T9& v9, const T10& v10, const T11& v11, const T12& v12, const T13& v13, const T14& v14, const T15& v15, const T16& v16
+
+#define TINYFORMAT_PASSARGS_1 v1
+#define TINYFORMAT_PASSARGS_2 v1, v2
+#define TINYFORMAT_PASSARGS_3 v1, v2, v3
+#define TINYFORMAT_PASSARGS_4 v1, v2, v3, v4
+#define TINYFORMAT_PASSARGS_5 v1, v2, v3, v4, v5
+#define TINYFORMAT_PASSARGS_6 v1, v2, v3, v4, v5, v6
+#define TINYFORMAT_PASSARGS_7 v1, v2, v3, v4, v5, v6, v7
+#define TINYFORMAT_PASSARGS_8 v1, v2, v3, v4, v5, v6, v7, v8
+#define TINYFORMAT_PASSARGS_9 v1, v2, v3, v4, v5, v6, v7, v8, v9
+#define TINYFORMAT_PASSARGS_10 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10
+#define TINYFORMAT_PASSARGS_11 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+#define TINYFORMAT_PASSARGS_12 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
+#define TINYFORMAT_PASSARGS_13 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
+#define TINYFORMAT_PASSARGS_14 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
+#define TINYFORMAT_PASSARGS_15 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+#define TINYFORMAT_PASSARGS_16 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
+
+#define TINYFORMAT_PASSARGS_TAIL_1
+#define TINYFORMAT_PASSARGS_TAIL_2 , v2
+#define TINYFORMAT_PASSARGS_TAIL_3 , v2, v3
+#define TINYFORMAT_PASSARGS_TAIL_4 , v2, v3, v4
+#define TINYFORMAT_PASSARGS_TAIL_5 , v2, v3, v4, v5
+#define TINYFORMAT_PASSARGS_TAIL_6 , v2, v3, v4, v5, v6
+#define TINYFORMAT_PASSARGS_TAIL_7 , v2, v3, v4, v5, v6, v7
+#define TINYFORMAT_PASSARGS_TAIL_8 , v2, v3, v4, v5, v6, v7, v8
+#define TINYFORMAT_PASSARGS_TAIL_9 , v2, v3, v4, v5, v6, v7, v8, v9
+#define TINYFORMAT_PASSARGS_TAIL_10 , v2, v3, v4, v5, v6, v7, v8, v9, v10
+#define TINYFORMAT_PASSARGS_TAIL_11 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+#define TINYFORMAT_PASSARGS_TAIL_12 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
+#define TINYFORMAT_PASSARGS_TAIL_13 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
+#define TINYFORMAT_PASSARGS_TAIL_14 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
+#define TINYFORMAT_PASSARGS_TAIL_15 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+#define TINYFORMAT_PASSARGS_TAIL_16 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
+
+#define TINYFORMAT_FOREACH_ARGNUM(m) \
+    m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) m(10) m(11) m(12) m(13) m(14) m(15) m(16)
+//[[[end]]]
+
+
+
+namespace detail {
+
+// Type-opaque holder for an argument to format(), with associated actions on
+// the type held as explicit function pointers.  This allows FormatArg's for
+// each argument to be allocated as a homogeneous array inside FormatList
+// whereas a naive implementation based on inheritance does not.
+class FormatArg
+{
+    public:
+        FormatArg() {}
+
+        template<typename T>
+        FormatArg(const T& value)
+            : m_value(static_cast<const void*>(&value)),
+            m_formatImpl(&formatImpl<T>),
+            m_toIntImpl(&toIntImpl<T>)
+        { }
+
+        void format(std::ostream& out, const char* fmtBegin,
+                    const char* fmtEnd, int ntrunc) const
+        {
+            m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value);
+        }
+
+        int toInt() const
+        {
+            return m_toIntImpl(m_value);
+        }
+
+    private:
+        template<typename T>
+        TINYFORMAT_HIDDEN static void formatImpl(std::ostream& out, const char* fmtBegin,
+                        const char* fmtEnd, int ntrunc, const void* value)
+        {
+            formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T*>(value));
+        }
+
+        template<typename T>
+        TINYFORMAT_HIDDEN static int toIntImpl(const void* value)
+        {
+            return convertToInt<T>::invoke(*static_cast<const T*>(value));
+        }
+
+        const void* m_value;
+        void (*m_formatImpl)(std::ostream& out, const char* fmtBegin,
+                             const char* fmtEnd, int ntrunc, const void* value);
+        int (*m_toIntImpl)(const void* value);
+};
+
+
+// Parse and return an integer from the string c, as atoi()
+// On return, c is set to one past the end of the integer.
+inline int parseIntAndAdvance(const char*& c)
+{
+    int i = 0;
+    for(;*c >= '0' && *c <= '9'; ++c)
+        i = 10*i + (*c - '0');
+    return i;
+}
+
+// Print literal part of format string and return next format spec
+// position.
+//
+// Skips over any occurrences of '%%', printing a literal '%' to the
+// output.  The position of the first % character of the next
+// nontrivial format spec is returned, or the end of string.
+inline const char* printFormatStringLiteral(std::ostream& out, const char* fmt)
+{
+    const char* c = fmt;
+    for(;; ++c)
+    {
+        switch(*c)
+        {
+            case '\0':
+                out.write(fmt, c - fmt);
+                return c;
+            case '%':
+                out.write(fmt, c - fmt);
+                if(*(c+1) != '%')
+                    return c;
+                // for "%%", tack trailing % onto next literal section.
+                fmt = ++c;
+                break;
+            default:
+                break;
+        }
+    }
+}
+
+
+// Parse a format string and set the stream state accordingly.
+//
+// The format mini-language recognized here is meant to be the one from C99,
+// with the form "%[flags][width][.precision][length]type".
+//
+// Formatting options which can't be natively represented using the ostream
+// state are returned in spacePadPositive (for space padded positive numbers)
+// and ntrunc (for truncating conversions).  argIndex is incremented if
+// necessary to pull out variable width and precision.  The function returns a
+// pointer to the character after the end of the current format spec.
+inline const char* streamStateFromFormat(std::ostream& out, bool& spacePadPositive,
+                                         int& ntrunc, const char* fmtStart,
+                                         const detail::FormatArg* formatters,
+                                         int& argIndex, int numFormatters)
+{
+    if(*fmtStart != '%')
+    {
+        TINYFORMAT_ERROR("tinyformat: Not enough conversion specifiers in format string");
+        return fmtStart;
+    }
+    // Reset stream state to defaults.
+    out.width(0);
+    out.precision(6);
+    out.fill(' ');
+    // Reset most flags; ignore irrelevant unitbuf & skipws.
+    out.unsetf(std::ios::adjustfield | std::ios::basefield |
+               std::ios::floatfield | std::ios::showbase | std::ios::boolalpha |
+               std::ios::showpoint | std::ios::showpos | std::ios::uppercase);
+    bool precisionSet = false;
+    bool widthSet = false;
+    int widthExtra = 0;
+    const char* c = fmtStart + 1;
+    // 1) Parse flags
+    for(;; ++c)
+    {
+        switch(*c)
+        {
+            case '#':
+                out.setf(std::ios::showpoint | std::ios::showbase);
+                continue;
+            case '0':
+                // overridden by left alignment ('-' flag)
+                if(!(out.flags() & std::ios::left))
+                {
+                    // Use internal padding so that numeric values are
+                    // formatted correctly, eg -00010 rather than 000-10
+                    out.fill('0');
+                    out.setf(std::ios::internal, std::ios::adjustfield);
+                }
+                continue;
+            case '-':
+                out.fill(' ');
+                out.setf(std::ios::left, std::ios::adjustfield);
+                continue;
+            case ' ':
+                // overridden by show positive sign, '+' flag.
+                if(!(out.flags() & std::ios::showpos))
+                    spacePadPositive = true;
+                continue;
+            case '+':
+                out.setf(std::ios::showpos);
+                spacePadPositive = false;
+                widthExtra = 1;
+                continue;
+            default:
+                break;
+        }
+        break;
+    }
+    // 2) Parse width
+    if(*c >= '0' && *c <= '9')
+    {
+        widthSet = true;
+        out.width(parseIntAndAdvance(c));
+    }
+    if(*c == '*')
+    {
+        widthSet = true;
+        int width = 0;
+        if(argIndex < numFormatters)
+            width = formatters[argIndex++].toInt();
+        else
+            TINYFORMAT_ERROR("tinyformat: Not enough arguments to read variable width");
+        if(width < 0)
+        {
+            // negative widths correspond to '-' flag set
+            out.fill(' ');
+            out.setf(std::ios::left, std::ios::adjustfield);
+            width = -width;
+        }
+        out.width(width);
+        ++c;
+    }
+    // 3) Parse precision
+    if(*c == '.')
+    {
+        ++c;
+        int precision = 0;
+        if(*c == '*')
+        {
+            ++c;
+            if(argIndex < numFormatters)
+                precision = formatters[argIndex++].toInt();
+            else
+                TINYFORMAT_ERROR("tinyformat: Not enough arguments to read variable precision");
+        }
+        else
+        {
+            if(*c >= '0' && *c <= '9')
+                precision = parseIntAndAdvance(c);
+            else if(*c == '-') // negative precisions ignored, treated as zero.
+                parseIntAndAdvance(++c);
+        }
+        out.precision(precision);
+        precisionSet = true;
+    }
+    // 4) Ignore any C99 length modifier
+    while(*c == 'l' || *c == 'h' || *c == 'L' ||
+          *c == 'j' || *c == 'z' || *c == 't')
+        ++c;
+    // 5) We're up to the conversion specifier character.
+    // Set stream flags based on conversion specifier (thanks to the
+    // boost::format class for forging the way here).
+    bool intConversion = false;
+    switch(*c)
+    {
+        case 'u': case 'd': case 'i':
+            out.setf(std::ios::dec, std::ios::basefield);
+            intConversion = true;
+            break;
+        case 'o':
+            out.setf(std::ios::oct, std::ios::basefield);
+            intConversion = true;
+            break;
+        case 'X':
+            out.setf(std::ios::uppercase);
+        case 'x': case 'p':
+            out.setf(std::ios::hex, std::ios::basefield);
+            intConversion = true;
+            break;
+        case 'E':
+            out.setf(std::ios::uppercase);
+        case 'e':
+            out.setf(std::ios::scientific, std::ios::floatfield);
+            out.setf(std::ios::dec, std::ios::basefield);
+            break;
+        case 'F':
+            out.setf(std::ios::uppercase);
+        case 'f':
+            out.setf(std::ios::fixed, std::ios::floatfield);
+            break;
+        case 'G':
+            out.setf(std::ios::uppercase);
+        case 'g':
+            out.setf(std::ios::dec, std::ios::basefield);
+            // As in boost::format, let stream decide float format.
+            out.flags(out.flags() & ~std::ios::floatfield);
+            break;
+        case 'a': case 'A':
+            TINYFORMAT_ERROR("tinyformat: the %a and %A conversion specs "
+                             "are not supported");
+            break;
+        case 'c':
+            // Handled as special case inside formatValue()
+            break;
+        case 's':
+            if(precisionSet)
+                ntrunc = static_cast<int>(out.precision());
+            // Make %s print booleans as "true" and "false"
+            out.setf(std::ios::boolalpha);
+            break;
+        case 'n':
+            // Not supported - will cause problems!
+            TINYFORMAT_ERROR("tinyformat: %n conversion spec not supported");
+            break;
+        case '\0':
+            TINYFORMAT_ERROR("tinyformat: Conversion spec incorrectly "
+                             "terminated by end of string");
+            return c;
+        default:
+            break;
+    }
+    if(intConversion && precisionSet && !widthSet)
+    {
+        // "precision" for integers gives the minimum number of digits (to be
+        // padded with zeros on the left).  This isn't really supported by the
+        // iostreams, but we can approximately simulate it with the width if
+        // the width isn't otherwise used.
+        out.width(out.precision() + widthExtra);
+        out.setf(std::ios::internal, std::ios::adjustfield);
+        out.fill('0');
+    }
+    return c+1;
+}
+
+
+//------------------------------------------------------------------------------
+inline void formatImpl(std::ostream& out, const char* fmt,
+                       const detail::FormatArg* formatters,
+                       int numFormatters)
+{
+    // Saved stream state
+    std::streamsize origWidth = out.width();
+    std::streamsize origPrecision = out.precision();
+    std::ios::fmtflags origFlags = out.flags();
+    char origFill = out.fill();
+
+    for (int argIndex = 0; argIndex < numFormatters; ++argIndex)
+    {
+        // Parse the format string
+        fmt = printFormatStringLiteral(out, fmt);
+        bool spacePadPositive = false;
+        int ntrunc = -1;
+        const char* fmtEnd = streamStateFromFormat(out, spacePadPositive, ntrunc, fmt,
+                                                   formatters, argIndex, numFormatters);
+        if (argIndex >= numFormatters)
+        {
+            // Check args remain after reading any variable width/precision
+            TINYFORMAT_ERROR("tinyformat: Not enough format arguments");
+            return;
+        }
+        const FormatArg& arg = formatters[argIndex];
+        // Format the arg into the stream.
+        if(!spacePadPositive)
+            arg.format(out, fmt, fmtEnd, ntrunc);
+        else
+        {
+            // The following is a special case with no direct correspondence
+            // between stream formatting and the printf() behaviour.  Simulate
+            // it crudely by formatting into a temporary string stream and
+            // munging the resulting string.
+            std::ostringstream tmpStream;
+            tmpStream.copyfmt(out);
+            tmpStream.setf(std::ios::showpos);
+            arg.format(tmpStream, fmt, fmtEnd, ntrunc);
+            std::string result = tmpStream.str(); // allocates... yuck.
+            for(size_t i = 0, iend = result.size(); i < iend; ++i)
+                if(result[i] == '+') result[i] = ' ';
+            out << result;
+        }
+        fmt = fmtEnd;
+    }
+
+    // Print remaining part of format string.
+    fmt = printFormatStringLiteral(out, fmt);
+    if(*fmt != '\0')
+        TINYFORMAT_ERROR("tinyformat: Too many conversion specifiers in format string");
+
+    // Restore stream state
+    out.width(origWidth);
+    out.precision(origPrecision);
+    out.flags(origFlags);
+    out.fill(origFill);
+}
+
+} // namespace detail
+
+
+/// List of template arguments to format(), held in a type-opaque way.
+///
+/// A const reference to FormatList (typedef'd as FormatListRef) may be
+/// conveniently used to pass arguments to non-template functions: All type
+/// information has been stripped from the arguments, leaving just enough of a
+/// common interface to perform formatting as required.
+class FormatList
+{
+    public:
+        FormatList(detail::FormatArg* formatters, int N)
+            : m_formatters(formatters), m_N(N) { }
+
+        friend void vformat(std::ostream& out, const char* fmt,
+                            const FormatList& list);
+
+    private:
+        const detail::FormatArg* m_formatters;
+        int m_N;
+};
+
+/// Reference to type-opaque format list for passing to vformat()
+typedef const FormatList& FormatListRef;
+
+
+namespace detail {
+
+// Format list subclass with fixed storage to avoid dynamic allocation
+template<int N>
+class FormatListN : public FormatList
+{
+    public:
+#ifdef TINYFORMAT_USE_VARIADIC_TEMPLATES
+        template<typename... Args>
+        FormatListN(const Args&... args)
+            : FormatList(&m_formatterStore[0], N),
+            m_formatterStore { FormatArg(args)... }
+        { static_assert(sizeof...(args) == N, "Number of args must be N"); }
+#else // C++98 version
+        void init(int) {}
+#       define TINYFORMAT_MAKE_FORMATLIST_CONSTRUCTOR(n)       \
+                                                               \
+        template<TINYFORMAT_ARGTYPES(n)>                       \
+        FormatListN(TINYFORMAT_VARARGS(n))                     \
+            : FormatList(&m_formatterStore[0], n)              \
+        { assert(n == N); init(0, TINYFORMAT_PASSARGS(n)); }   \
+                                                               \
+        template<TINYFORMAT_ARGTYPES(n)>                       \
+        void init(int i, TINYFORMAT_VARARGS(n))                \
+        {                                                      \
+            m_formatterStore[i] = FormatArg(v1);               \
+            init(i+1 TINYFORMAT_PASSARGS_TAIL(n));             \
+        }
+
+        TINYFORMAT_FOREACH_ARGNUM(TINYFORMAT_MAKE_FORMATLIST_CONSTRUCTOR)
+#       undef TINYFORMAT_MAKE_FORMATLIST_CONSTRUCTOR
+#endif
+
+    private:
+        FormatArg m_formatterStore[N];
+};
+
+// Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard
+template<> class FormatListN<0> : public FormatList
+{
+    public: FormatListN() : FormatList(0, 0) {}
+};
+
+} // namespace detail
+
+
+//------------------------------------------------------------------------------
+// Primary API functions
+
+#ifdef TINYFORMAT_USE_VARIADIC_TEMPLATES
+
+/// Make type-agnostic format list from list of template arguments.
+///
+/// The exact return type of this function is an implementation detail and
+/// shouldn't be relied upon.  Instead it should be stored as a FormatListRef:
+///
+///   FormatListRef formatList = makeFormatList( /*...*/ );
+template<typename... Args>
+detail::FormatListN<sizeof...(Args)> makeFormatList(const Args&... args)
+{
+    return detail::FormatListN<sizeof...(args)>(args...);
+}
+
+#else // C++98 version
+
+inline detail::FormatListN<0> makeFormatList()
+{
+    return detail::FormatListN<0>();
+}
+#define TINYFORMAT_MAKE_MAKEFORMATLIST(n)                     \
+template<TINYFORMAT_ARGTYPES(n)>                              \
+detail::FormatListN<n> makeFormatList(TINYFORMAT_VARARGS(n))  \
+{                                                             \
+    return detail::FormatListN<n>(TINYFORMAT_PASSARGS(n));    \
+}
+TINYFORMAT_FOREACH_ARGNUM(TINYFORMAT_MAKE_MAKEFORMATLIST)
+#undef TINYFORMAT_MAKE_MAKEFORMATLIST
+
+#endif
+
+/// Format list of arguments to the stream according to the given format string.
+///
+/// The name vformat() is chosen for the semantic similarity to vprintf(): the
+/// list of format arguments is held in a single function argument.
+inline void vformat(std::ostream& out, const char* fmt, FormatListRef list)
+{
+    detail::formatImpl(out, fmt, list.m_formatters, list.m_N);
+}
+
+
+#ifdef TINYFORMAT_USE_VARIADIC_TEMPLATES
+
+/// Format list of arguments to the stream according to given format string.
+template<typename... Args>
+void format(std::ostream& out, const char* fmt, const Args&... args)
+{
+    vformat(out, fmt, makeFormatList(args...));
+}
+
+/// Format list of arguments according to the given format string and return
+/// the result as a string.
+template<typename... Args>
+std::string format(const char* fmt, const Args&... args)
+{
+    std::ostringstream oss;
+    format(oss, fmt, args...);
+    return oss.str();
+}
+
+/// Format list of arguments to std::cout, according to the given format string
+template<typename... Args>
+void printf(const char* fmt, const Args&... args)
+{
+    format(std::cout, fmt, args...);
+}
+
+template<typename... Args>
+void printfln(const char* fmt, const Args&... args)
+{
+    format(std::cout, fmt, args...);
+    std::cout << '\n';
+}
+
+
+#else // C++98 version
+
+inline void format(std::ostream& out, const char* fmt)
+{
+    vformat(out, fmt, makeFormatList());
+}
+
+inline std::string format(const char* fmt)
+{
+    std::ostringstream oss;
+    format(oss, fmt);
+    return oss.str();
+}
+
+inline void printf(const char* fmt)
+{
+    format(std::cout, fmt);
+}
+
+inline void printfln(const char* fmt)
+{
+    format(std::cout, fmt);
+    std::cout << '\n';
+}
+
+#define TINYFORMAT_MAKE_FORMAT_FUNCS(n)                                   \
+                                                                          \
+template<TINYFORMAT_ARGTYPES(n)>                                          \
+void format(std::ostream& out, const char* fmt, TINYFORMAT_VARARGS(n))    \
+{                                                                         \
+    vformat(out, fmt, makeFormatList(TINYFORMAT_PASSARGS(n)));            \
+}                                                                         \
+                                                                          \
+template<TINYFORMAT_ARGTYPES(n)>                                          \
+std::string format(const char* fmt, TINYFORMAT_VARARGS(n))                \
+{                                                                         \
+    std::ostringstream oss;                                               \
+    format(oss, fmt, TINYFORMAT_PASSARGS(n));                             \
+    return oss.str();                                                     \
+}                                                                         \
+                                                                          \
+template<TINYFORMAT_ARGTYPES(n)>                                          \
+void printf(const char* fmt, TINYFORMAT_VARARGS(n))                       \
+{                                                                         \
+    format(std::cout, fmt, TINYFORMAT_PASSARGS(n));                       \
+}                                                                         \
+                                                                          \
+template<TINYFORMAT_ARGTYPES(n)>                                          \
+void printfln(const char* fmt, TINYFORMAT_VARARGS(n))                     \
+{                                                                         \
+    format(std::cout, fmt, TINYFORMAT_PASSARGS(n));                       \
+    std::cout << '\n';                                                    \
+}
+
+TINYFORMAT_FOREACH_ARGNUM(TINYFORMAT_MAKE_FORMAT_FUNCS)
+#undef TINYFORMAT_MAKE_FORMAT_FUNCS
+
+#endif
+
+
+} // namespace tinyformat
+
+#endif // TINYFORMAT_H_INCLUDED
diff --git a/src/lua/.busted b/src/lua/.busted
new file mode 100644
index 0000000..8ffdc19
--- /dev/null
+++ b/src/lua/.busted
@@ -0,0 +1,9 @@
+-- Configuration for unit tests
+-- See: http://olivinelabs.com/busted/
+return {
+	default = {
+		lpath = "./?.lua;./?/init.lua",
+		helper = "./bpf/spec/helper.lua",
+		["auto-insulate"] = false,
+	}
+}
diff --git a/src/lua/.luacheckrc b/src/lua/.luacheckrc
new file mode 100644
index 0000000..18645ab
--- /dev/null
+++ b/src/lua/.luacheckrc
@@ -0,0 +1,21 @@
+std = 'luajit'
+
+new_read_globals = {
+	'assert',
+	'describe',
+	'it',
+}
+new_globals = {
+	'math',
+}
+
+-- Luacheck < 0.18 doesn't support new_read_globals
+for _, v in ipairs(new_read_globals) do
+	table.insert(new_globals, v)
+end
+
+-- Ignore some pedantic checks
+ignore = {
+	'4.1/err', -- Shadowing err
+	'4.1/.',   -- Shadowing one letter variables
+}
diff --git a/src/lua/CMakeLists.txt b/src/lua/CMakeLists.txt
new file mode 100644
index 0000000..7541d48
--- /dev/null
+++ b/src/lua/CMakeLists.txt
@@ -0,0 +1,32 @@
+find_package(LuaJIT)
+find_program(LUAJIT luajit)
+
+if (LUAJIT_LIBRARIES AND LUAJIT)
+	FILE(GLOB_RECURSE SRC_LUA
+		${CMAKE_CURRENT_SOURCE_DIR}/bcc/*.lua
+		${CMAKE_CURRENT_SOURCE_DIR}/bcc/vendor/*.lua
+		${CMAKE_CURRENT_SOURCE_DIR}/bpf/*.lua)
+
+	ADD_CUSTOM_COMMAND(
+		OUTPUT bcc.lua
+		COMMAND ${LUAJIT} ${CMAKE_CURRENT_SOURCE_DIR}/src/squish.lua ${CMAKE_CURRENT_SOURCE_DIR}
+		DEPENDS ${SRC_LUA} ${CMAKE_CURRENT_SOURCE_DIR}/squishy
+	)
+
+	ADD_CUSTOM_COMMAND(
+		OUTPUT bcc.o
+		COMMAND ${LUAJIT} -bg bcc.lua bcc.o
+		DEPENDS bcc.lua
+	)
+
+	include_directories(${LUAJIT_INCLUDE_DIR})
+	add_executable(bcc-lua src/main.c bcc.o)
+	set_target_properties(bcc-lua PROPERTIES LINKER_LANGUAGE C)
+	target_link_libraries(bcc-lua ${LUAJIT_LIBRARIES})
+	target_link_libraries(bcc-lua ${bcc-lua-static})
+	if (NOT COMPILER_NOPIE_FLAG STREQUAL "")
+		target_link_libraries(bcc-lua ${COMPILER_NOPIE_FLAG})
+	endif()
+
+	install(TARGETS bcc-lua RUNTIME DESTINATION bin)
+endif()
diff --git a/src/lua/README.md b/src/lua/README.md
new file mode 100644
index 0000000..6057176
--- /dev/null
+++ b/src/lua/README.md
@@ -0,0 +1,156 @@
+Lua Tools for BCC
+-----------------
+
+This directory contains Lua tooling for [BCC][bcc]
+(the BPF Compiler Collection).
+
+BCC is a toolkit for creating userspace and kernel tracing programs. By
+default, it comes with a library `libbcc`, some example tooling and a Python
+frontend for the library.
+
+Here we present an alternate frontend for `libbcc` implemented in LuaJIT. This
+lets you write the userspace part of your tracer in Lua instead of Python.
+
+Since LuaJIT is a JIT compiled language, tracers implemented in `bcc-lua`
+exhibit significantly reduced overhead compared to their Python equivalents.
+This is particularly noticeable in tracers that actively use the table APIs to
+get information from the kernel.
+
+If your tracer makes extensive use of `BPF_MAP_TYPE_PERF_EVENT_ARRAY` or
+`BPF_MAP_TYPE_HASH`, you may find the performance characteristics of this
+implementation very appealing, as LuaJIT can compile to native code a lot of
+the callchain to process the events, and this wrapper has been designed to
+benefit from such JIT compilation.
+
+## Quickstart Guide
+
+The following instructions assume Ubuntu 14.04 LTS.
+
+1. Install a **very new kernel**. It has to be new and shiny for this to work. 4.3+
+
+    ```
+    VER=4.4.2-040402
+    PREFIX=http://kernel.ubuntu.com/~kernel-ppa/mainline/v4.4.2-wily/
+    REL=201602171633
+    wget ${PREFIX}/linux-headers-${VER}-generic_${VER}.${REL}_amd64.deb
+    wget ${PREFIX}/linux-headers-${VER}_${VER}.${REL}_all.deb
+    wget ${PREFIX}/linux-image-${VER}-generic_${VER}.${REL}_amd64.deb
+    sudo dpkg -i linux-*${VER}.${REL}*.deb
+    ```
+
+2. Install the `libbcc` binary packages and `luajit`
+
+    ```
+    sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys D4284CDD
+    echo "deb https://repo.iovisor.org/apt trusty main" | sudo tee /etc/apt/sources.list.d/iovisor.list
+    sudo apt-get update
+    sudo apt-get install libbcc luajit
+    ```
+
+3. Test one of the examples to ensure `libbcc` is properly installed
+
+    ```
+    sudo ./bcc-probe examples/lua/task_switch.lua
+    ```
+
+## LuaJIT BPF compiler
+
+Now it is also possible to write Lua functions and compile them transparently to BPF bytecode, here is a simple socket filter example:
+
+```lua
+local S = require('syscall')
+local bpf = require('bpf')
+local map = bpf.map('array', 256)
+-- Kernel-space part of the program
+local prog = assert(bpf(function ()
+    local proto = pkt.ip.proto  -- Get byte (ip.proto) from frame at [23]
+    xadd(map[proto], 1)         -- Increment packet count
+end))
+-- User-space part of the program
+local sock = assert(bpf.socket('lo', prog))
+for i=1,10 do
+    local icmp, udp, tcp = map[1], map[17], map[6]
+    print('TCP', tcp, 'UDP', udp, 'ICMP', icmp, 'packets')
+    S.sleep(1)
+end
+```
+
+The other application of BPF programs is attaching to probes for [perf event tracing][tracing]. That means you can trace events inside the kernel (or user-space), and then collect results - for example a histogram of `sendto()` latency, off-cpu time stack traces, syscall latency, and so on. While kernel probes and perf events have an unstable ABI, with a dynamic language we can create and use a proper type based on the tracepoint ABI at runtime.
+
+The runtime automatically recognizes reads that need a helper to be accessed. The type casts denote the source of the objects, for example the [bashreadline][bashreadline] example that prints entered bash commands from all running shells:
+
+```lua
+local ffi = require('ffi')
+local bpf = require('bpf')
+-- Perf event map
+local sample_t = 'struct { uint64_t pid; char str[80]; }'
+local events = bpf.map('perf_event_array')
+-- Kernel-space part of the program
+bpf.uprobe('/bin/bash:readline', function (ptregs)
+    local sample = ffi.new(sample_t)
+    sample.pid = pid_tgid()
+    ffi.copy(sample.str, ffi.cast('char *', ptregs.ax)) -- Cast `ax` to string pointer and copy to buffer
+    perf_submit(events, sample)                         -- Write sample to perf event map
+end, true, -1, 0)
+-- User-space part of the program
+local log = events:reader(nil, 0, sample_t) -- Must specify PID or CPU_ID to observe
+while true do
+    log:block()               -- Wait until event reader is readable
+    for _,e in log:read() do  -- Collect available reader events
+        print(tonumber(e.pid), ffi.string(e.str))
+    end
+end
+```
+
+Where a cast to `struct pt_regs` flags the source of data as probe arguments, which means any pointer derived
+from this structure points to the kernel and a helper is needed to access it. Casting `ptregs.ax` to a pointer is then required for `ffi.copy` semantics, otherwise it would be treated as a `u64` and only its value would be
+copied. The type detection is automatic most of the time (socket filters and `bpf.tracepoint`), but not with uprobes and kprobes.
+
+### Installation
+
+```bash
+$ luarocks install bpf
+```
+
+### Examples
+
+See `examples/lua` directory.
+
+### Helpers
+
+* `print(...)` is a wrapper for `bpf_trace_printk`, the output is captured in `cat /sys/kernel/debug/tracing/trace_pipe`
+* `bit.*` library **is** supported (`lshift, rshift, arshift, bnot, band, bor, bxor`)
+* `math.*` library *partially* supported (`log2, log, log10`)
+* `ffi.cast()` is implemented (including structures and arrays)
+* `ffi.new(...)` allocates memory on stack, initializers are NYI
+* `ffi.copy(...)` copies memory (possibly using helpers) between stack/kernel/registers
+* `ntoh(x[, width])` - convert from network to host byte order.
+* `hton(x[, width])` - convert from host to network byte order.
+* `xadd(dst, inc)` - exclusive add, a synchronous `*dst += b` if Lua had `+=` operator
+
+Below is a list of BPF-specific helpers:
+
+* `time()` - return current monotonic time in nanoseconds (uses `bpf_ktime_get_ns`)
+* `cpu()` - return current CPU number (uses `bpf_get_smp_processor_id`)
+* `pid_tgid()` - return caller `tgid << 32 | pid` (uses `bpf_get_current_pid_tgid`)
+* `uid_gid()` - return caller `gid << 32 | uid` (uses `bpf_get_current_uid_gid`)
+* `comm(var)` - write current process name (uses `bpf_get_current_comm`)
+* `perf_submit(map, var)` - submit variable to perf event array BPF map
+* `stack_id(map, flags)` - return stack trace identifier from stack trace BPF map
+* `load_bytes(off, var)` - helper for direct packet access with `skb_load_bytes()`
+
+### Current state
+
+* Not all LuaJIT bytecode opcodes are supported *(notable mentions below)*
+* Closures `UCLO` will probably never be supported, although you can use upvalues inside a compiled function.
+* Type narrowing is opportunistic. Numbers are 64-bit by default, but 64-bit immediate loads are not supported (e.g. `local x = map[ffi.cast('uint64_t', 1000)]`)
+* Tail calls `CALLT`, and iterators `ITERI` are NYI (as of now)
+* Arbitrary ctype **is** supported both for map keys and values
+* Basic optimisations like: constant propagation, partial DCE, liveness analysis and speculative register allocation are implemented, but there's no control flow analysis yet. This means the compiler has visibility into when things are used and dead-stores occur, but there's no rewriter pass to eliminate them.
+* No register sub-allocations, no aggressive use of caller-saved `R1-5`, no aggressive narrowing (this would require variable range assertions and variable relationships)
+* Slices with not 1/2/4/8 length are NYI (requires allocating a memory on stack and using pointer type)
+
+
+[bcc]: https://github.com/iovisor/bcc
+[tracing]: http://www.brendangregg.com/blog/2016-03-05/linux-bpf-superpowers.html
+[bashreadline]: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
\ No newline at end of file
diff --git a/src/lua/bcc-probe b/src/lua/bcc-probe
new file mode 100755
index 0000000..6ab1871
--- /dev/null
+++ b/src/lua/bcc-probe
@@ -0,0 +1,20 @@
+#!/usr/bin/env luajit
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local str = require("debug").getinfo(1, "S").source:sub(2) -- strip the leading "@" from the chunk source path
+local script_path = str:match("(.*/)").."?.lua;" -- the match keeps the trailing "/", so no extra separator (was "/?.lua;", producing "dir//?.lua;")
+package.path = script_path..package.path -- search this script's own directory first for the bcc.* modules
+require("bcc.run")()
diff --git a/src/lua/bcc/bpf.lua b/src/lua/bcc/bpf.lua
new file mode 100644
index 0000000..fa987f3
--- /dev/null
+++ b/src/lua/bcc/bpf.lua
@@ -0,0 +1,301 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require("ffi")
+local libbcc = require("bcc.libbcc")
+
+local TracerPipe = require("bcc.tracerpipe")
+local Table = require("bcc.table")
+local Sym = require("bcc.sym")
+
+local Bpf = class("BPF")
+
+Bpf.static.open_kprobes = {}
+Bpf.static.open_uprobes = {}
+Bpf.static.perf_buffers = {}
+Bpf.static.KPROBE_LIMIT = 1000
+Bpf.static.tracer_pipe = nil
+Bpf.static.DEFAULT_CFLAGS = {
+  '-D__HAVE_BUILTIN_BSWAP16__',
+  '-D__HAVE_BUILTIN_BSWAP32__',
+  '-D__HAVE_BUILTIN_BSWAP64__',
+}
+
+function Bpf.static.check_probe_quota(n) -- assert that attaching n more probes stays within KPROBE_LIMIT
+  local cur = table.count(Bpf.static.open_kprobes) + table.count(Bpf.static.open_uprobes) -- table.count is added by bcc.vendor.helpers
+  assert(cur + n <= Bpf.static.KPROBE_LIMIT, "number of open probes would exceed quota")
+end
+
+function Bpf.static.cleanup() -- detach all probes, free all perf buffers, close the tracer pipe
+  local function detach_all(probe_type, all_probes)
+    for key, fd in pairs(all_probes) do
+      libbcc.bpf_close_perf_event_fd(fd) -- close the perf event fd before detaching the probe itself
+      -- skip bcc-specific kprobes
+      if not key:starts("bcc:") then
+        if probe_type == "kprobes" then
+          libbcc.bpf_detach_kprobe(key)
+        elseif probe_type == "uprobes" then
+          libbcc.bpf_detach_uprobe(key)
+        end
+      end
+      all_probes[key] = nil -- drop from the registry as we go
+    end
+  end
+
+  detach_all("kprobes", Bpf.static.open_kprobes)
+  detach_all("uprobes", Bpf.static.open_uprobes)
+
+  for key, perf_buffer in pairs(Bpf.static.perf_buffers) do
+    libbcc.perf_reader_free(perf_buffer)
+    Bpf.static.perf_buffers[key] = nil
+  end
+
+  if Bpf.static.tracer_pipe ~= nil then
+    Bpf.static.tracer_pipe:close()
+  end
+end
+
+function Bpf.static.SymbolCache(pid) -- symbol-resolution cache; pid nil is mapped to -1 inside Sym
+  return Sym.create_cache(pid)
+end
+
+function Bpf.static.num_open_uprobes() -- number of currently attached uprobes
+  return table.count(Bpf.static.open_uprobes)
+end
+
+function Bpf.static.num_open_kprobes() -- number of currently attached kprobes
+  return table.count(Bpf.static.open_kprobes)
+end
+
+Bpf.static.SCRIPT_ROOT = "./"
+function Bpf.static.script_root(root) -- remember the traced script's directory for relative file lookups
+  local dir = root:match'(.*/)' -- keep only the directory part (the filename capture was unused)
+  Bpf.static.SCRIPT_ROOT = dir or "./"
+  return Bpf
+end
+
+local function _find_file(script_root, filename) -- resolve a filename absolutely or relative to script_root; errors if not found
+  if filename == nil then
+    return nil -- optional files (e.g. hdr_file) may legitimately be absent
+  end
+
+  if os.exists(filename) then -- os.exists is provided by bcc.vendor.helpers
+    return filename
+  end
+
+  if not filename:starts("/") then
+    filename = script_root .. filename -- retry relative to the traced script's directory
+    if os.exists(filename) then
+      return filename
+    end
+  end
+
+  assert(nil, "failed to find file "..filename.." (root="..script_root..")")
+end
+
+function Bpf:initialize(args) -- compile a BPF module from args.text or args.src_file (with optional USDT context)
+  self.funcs = {}
+  self.tables = {}
+
+  if args.usdt and args.text then
+    args.text = args.usdt:_get_text() .. args.text -- prepend USDT-generated argument-access code
+  end
+
+  local cflags = table.join(Bpf.DEFAULT_CFLAGS, args.cflags) -- table.join comes from bcc.vendor.helpers
+  local cflags_ary = ffi.new("const char *[?]", #cflags, cflags)
+
+  local llvm_debug = rawget(_G, "LIBBCC_LLVM_DEBUG") or args.debug or 0 -- global set by the --llvm-debug CLI flag
+  assert(type(llvm_debug) == "number")
+
+  if args.text then -- inline source string takes precedence over src_file
+    log.info("\n%s\n", args.text)
+    self.module = libbcc.bpf_module_create_c_from_string(args.text, llvm_debug, cflags_ary, #cflags)
+  elseif args.src_file then
+    local src = _find_file(Bpf.SCRIPT_ROOT, args.src_file)
+
+    if src:ends(".b") then -- legacy .b frontend takes a separate header file
+      local hdr = _find_file(Bpf.SCRIPT_ROOT, args.hdr_file)
+      self.module = libbcc.bpf_module_create_b(src, hdr, llvm_debug)
+    else
+      self.module = libbcc.bpf_module_create_c(src, llvm_debug, cflags_ary, #cflags)
+    end
+  end
+
+  assert(self.module ~= nil, "failed to compile BPF module")
+
+  if args.usdt then
+    args.usdt:_attach_uprobes(self) -- wire up the USDT probe points against this module
+  end
+end
+
+function Bpf:load_funcs(prog_type) -- load every function in the module; returns a list of {bpf,name,fd} records
+  prog_type = prog_type or "BPF_PROG_TYPE_KPROBE"
+
+  local result = {}
+  local fn_count = tonumber(libbcc.bpf_num_functions(self.module))
+
+  for i = 0,fn_count-1 do -- libbcc function indices are 0-based
+    local name = ffi.string(libbcc.bpf_function_name(self.module, i))
+    table.insert(result, self:load_func(name, prog_type))
+  end
+
+  return result
+end
+
+function Bpf:load_func(fn_name, prog_type) -- load one program into the kernel; memoized per function name
+  if self.funcs[fn_name] ~= nil then
+    return self.funcs[fn_name]
+  end
+
+  assert(libbcc.bpf_function_start(self.module, fn_name) ~= nil,
+    "unknown program: "..fn_name)
+
+  local fd = libbcc.bpf_prog_load(prog_type,
+    fn_name,
+    libbcc.bpf_function_start(self.module, fn_name),
+    libbcc.bpf_function_size(self.module, fn_name),
+    libbcc.bpf_module_license(self.module),
+    libbcc.bpf_module_kern_version(self.module),
+    0, nil, 0) -- log_level 0, no verifier log buffer
+
+  assert(fd >= 0, "failed to load BPF program "..fn_name)
+  log.info("loaded %s (%d)", fn_name, fd)
+
+  local fn = {bpf=self, name=fn_name, fd=fd} -- record handed back to attach_* callers
+  self.funcs[fn_name] = fn
+  return fn
+end
+
+function Bpf:dump_func(fn_name) -- return a program's compiled instructions as a raw Lua byte string
+  local start = libbcc.bpf_function_start(self.module, fn_name)
+  assert(start ~= nil, "unknown program")
+
+  local len = libbcc.bpf_function_size(self.module, fn_name)
+  return ffi.string(start, tonumber(len))
+end
+
+function Bpf:attach_uprobe(args) -- attach a program to a user-space probe; args: name/sym/addr/pid/fn_name/retprobe
+  Bpf.check_probe_quota(1)
+
+  local path, addr = Sym.check_path_symbol(args.name, args.sym, args.addr, args.pid)
+  local fn = self:load_func(args.fn_name, 'BPF_PROG_TYPE_KPROBE')
+  local ptype = args.retprobe and "r" or "p"
+  local ev_name = string.format("%s_%s_0x%p", ptype, path:gsub("[^%a%d]", "_"), addr) -- sanitize the path for use in an event name
+  local retprobe = args.retprobe and 1 or 0
+
+  local res = libbcc.bpf_attach_uprobe(fn.fd, retprobe, ev_name, path, addr,
+    args.pid or -1) -- -1 = attach for all processes
+
+  assert(res >= 0, "failed to attach BPF to uprobe")
+  self:probe_store("uprobe", ev_name, res)
+  return self -- chainable
+end
+
+function Bpf:attach_kprobe(args) -- attach a program to a kernel probe; args: event/fn_name/retprobe
+  -- TODO: allow the caller to glob multiple functions together
+  Bpf.check_probe_quota(1)
+
+  local fn = self:load_func(args.fn_name, 'BPF_PROG_TYPE_KPROBE')
+  local event = args.event or ""
+  local ptype = args.retprobe and "r" or "p"
+  local ev_name = string.format("%s_%s", ptype, event:gsub("[%+%.]", "_")) -- "+" and "." are not valid in event names
+  local retprobe = args.retprobe and 1 or 0
+
+  local res = libbcc.bpf_attach_kprobe(fn.fd, retprobe, ev_name, event)
+
+  assert(res >= 0, "failed to attach BPF to kprobe")
+  self:probe_store("kprobe", ev_name, res)
+  return self -- chainable
+end
+
+function Bpf:pipe() -- lazily-created shared reader over the kernel trace_pipe
+  if Bpf.tracer_pipe == nil then
+    Bpf.tracer_pipe = TracerPipe:new()
+  end
+  return Bpf.tracer_pipe
+end
+
+function Bpf:get_table(name, key_type, leaf_type) -- memoized accessor for a named BPF map
+  if self.tables[name] == nil then
+    self.tables[name] = Table(self, name, key_type, leaf_type)
+  end
+  return self.tables[name]
+end
+
+function Bpf:probe_store(t, id, fd) -- record an attached probe fd so cleanup() can detach it later
+  if t == "kprobe" then
+    Bpf.open_kprobes[id] = fd
+  elseif t == "uprobe" then
+    Bpf.open_uprobes[id] = fd
+  else
+    error("unknown probe type '%s'" % t) -- string % formatting comes from bcc.vendor.helpers
+  end
+
+  log.info("%s -> %s", id, fd)
+end
+
+function Bpf:perf_buffer_store(id, reader) -- record an open perf reader for polling and cleanup
+    Bpf.perf_buffers[id] = reader
+
+    log.info("%s -> %s", id, reader)
+end
+
+function Bpf:probe_lookup(t, id) -- fd of a previously stored probe, or nil if unknown
+  if t == "kprobe" then
+    return Bpf.open_kprobes[id]
+  elseif t == "uprobe" then
+    return Bpf.open_uprobes[id]
+  else
+    return nil
+  end
+end
+
+function Bpf:_perf_buffer_array() -- gather all open perf readers into a C array for perf_reader_poll
+  local perf_buffer_count = table.count(Bpf.perf_buffers)
+  local readers = ffi.new("struct perf_reader*[?]", perf_buffer_count)
+  local n = 0
+
+  for _, r in pairs(Bpf.perf_buffers) do
+    readers[n] = r
+    n = n + 1
+  end
+
+  assert(n == perf_buffer_count)
+  return readers, n
+end
+
+function Bpf:perf_buffer_poll_loop() -- poll forever; wrapped in pcall so errors unwind into (false, err)
+  local perf_buffers, perf_buffer_count = self:_perf_buffer_array()
+  return pcall(function()
+    while true do
+      libbcc.perf_reader_poll(perf_buffer_count, perf_buffers, -1) -- -1 = block indefinitely
+    end
+  end)
+end
+
+function Bpf:kprobe_poll_loop() -- legacy alias for perf_buffer_poll_loop
+  return self:perf_buffer_poll_loop()
+end
+
+function Bpf:perf_buffer_poll(timeout) -- single poll pass; timeout in ms, default blocks
+  local perf_buffers, perf_buffer_count = self:_perf_buffer_array()
+  libbcc.perf_reader_poll(perf_buffer_count, perf_buffers, timeout or -1)
+end
+
+function Bpf:kprobe_poll(timeout) -- legacy alias for perf_buffer_poll
+  self:perf_buffer_poll(timeout)
+end
+
+return Bpf
diff --git a/src/lua/bcc/init.lua b/src/lua/bcc/init.lua
new file mode 100644
index 0000000..0abbb6d
--- /dev/null
+++ b/src/lua/bcc/init.lua
@@ -0,0 +1,17 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+require("bcc.vendor.helpers") -- installs global helpers (class, log, table.count, string.starts, ...)
+return { BPF = require("bcc.bpf") } -- public module surface
diff --git a/src/lua/bcc/libbcc.lua b/src/lua/bcc/libbcc.lua
new file mode 100644
index 0000000..c518a89
--- /dev/null
+++ b/src/lua/bcc/libbcc.lua
@@ -0,0 +1,151 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require("ffi")
+
+ffi.cdef[[
+enum bpf_prog_type {
+  BPF_PROG_TYPE_UNSPEC,
+  BPF_PROG_TYPE_SOCKET_FILTER,
+  BPF_PROG_TYPE_KPROBE,
+  BPF_PROG_TYPE_SCHED_CLS,
+  BPF_PROG_TYPE_SCHED_ACT,
+};
+
+int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, int map_flags);
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
+int bpf_lookup_elem(int fd, void *key, void *value);
+int bpf_delete_elem(int fd, void *key);
+int bpf_get_next_key(int fd, void *key, void *next_key);
+
+int bpf_prog_load(enum bpf_prog_type prog_type, const char *name,
+  const struct bpf_insn *insns, int insn_len,
+  const char *license, unsigned kern_version,
+  int log_level, char *log_buf, unsigned log_buf_size);
+int bpf_attach_socket(int sockfd, int progfd);
+
+/* create RAW socket and bind to interface 'name' */
+int bpf_open_raw_sock(const char *name);
+
+typedef void (*perf_reader_raw_cb)(void *cb_cookie, void *raw, int raw_size);
+typedef void (*perf_reader_lost_cb)(void *cb_cookie, uint64_t lost);
+
+int bpf_attach_kprobe(int progfd, int attach_type, const char *ev_name,
+                      const char *fn_name);
+
+int bpf_detach_kprobe(const char *ev_name);
+
+int bpf_attach_uprobe(int progfd, int attach_type, const char *ev_name,
+                      const char *binary_path, uint64_t offset, int pid);
+
+int bpf_detach_uprobe(const char *ev_name);
+
+void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, perf_reader_lost_cb lost_cb, void *cb_cookie, int pid, int cpu, int page_cnt);
+
+int bpf_close_perf_event_fd(int fd);
+]]
+
+ffi.cdef[[
+void * bpf_module_create_b(const char *filename, const char *proto_filename, unsigned flags);
+void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], int ncflags);
+void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], int ncflags);
+void bpf_module_destroy(void *program);
+char * bpf_module_license(void *program);
+unsigned bpf_module_kern_version(void *program);
+size_t bpf_num_functions(void *program);
+const char * bpf_function_name(void *program, size_t id);
+void * bpf_function_start_id(void *program, size_t id);
+void * bpf_function_start(void *program, const char *name);
+size_t bpf_function_size_id(void *program, size_t id);
+size_t bpf_function_size(void *program, const char *name);
+size_t bpf_num_tables(void *program);
+size_t bpf_table_id(void *program, const char *table_name);
+int bpf_table_fd(void *program, const char *table_name);
+int bpf_table_fd_id(void *program, size_t id);
+int bpf_table_type(void *program, const char *table_name);
+int bpf_table_type_id(void *program, size_t id);
+size_t bpf_table_max_entries(void *program, const char *table_name);
+size_t bpf_table_max_entries_id(void *program, size_t id);
+int bpf_table_flags(void *program, const char *table_name);
+int bpf_table_flags_id(void *program, size_t id);
+const char * bpf_table_name(void *program, size_t id);
+const char * bpf_table_key_desc(void *program, const char *table_name);
+const char * bpf_table_key_desc_id(void *program, size_t id);
+const char * bpf_table_leaf_desc(void *program, const char *table_name);
+const char * bpf_table_leaf_desc_id(void *program, size_t id);
+size_t bpf_table_key_size(void *program, const char *table_name);
+size_t bpf_table_key_size_id(void *program, size_t id);
+size_t bpf_table_leaf_size(void *program, const char *table_name);
+size_t bpf_table_leaf_size_id(void *program, size_t id);
+int bpf_table_key_snprintf(void *program, size_t id, char *buf, size_t buflen, const void *key);
+int bpf_table_leaf_snprintf(void *program, size_t id, char *buf, size_t buflen, const void *leaf);
+int bpf_table_key_sscanf(void *program, size_t id, const char *buf, void *key);
+int bpf_table_leaf_sscanf(void *program, size_t id, const char *buf, void *leaf);
+]]
+
+ffi.cdef[[
+struct perf_reader;
+
+void perf_reader_free(void *ptr);
+int perf_reader_mmap(struct perf_reader *reader);
+int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
+int perf_reader_fd(struct perf_reader *reader);
+void perf_reader_set_fd(struct perf_reader *reader, int fd);
+]]
+
+ffi.cdef[[
+struct bcc_symbol {
+	const char *name;
+	const char *demangle_name;
+	const char *module;
+	uint64_t offset;
+};
+
+struct bcc_symbol_option {
+  int use_debug_file;
+  int check_debug_file_crc;
+  uint32_t use_symbol_type;
+};
+
+int bcc_resolve_symname(const char *module, const char *symname, const uint64_t addr,
+                        int pid, struct bcc_symbol_option *option,
+                        struct bcc_symbol *sym);
+void bcc_procutils_free(const char *ptr);
+void *bcc_symcache_new(int pid, struct bcc_symbol_option *option);
+void bcc_symbol_free_demangle_name(struct bcc_symbol *sym);
+int bcc_symcache_resolve(void *symcache, uint64_t addr, struct bcc_symbol *sym);
+void bcc_symcache_refresh(void *resolver);
+]]
+
+ffi.cdef[[
+void *bcc_usdt_new_frompid(int pid);
+void *bcc_usdt_new_frompath(const char *path);
+void bcc_usdt_close(void *usdt);
+
+int bcc_usdt_enable_probe(void *, const char *, const char *);
+char *bcc_usdt_genargs(void *);
+
+typedef void (*bcc_usdt_uprobe_cb)(const char *, const char *, uint64_t, int);
+void bcc_usdt_foreach_uprobe(void *usdt, bcc_usdt_uprobe_cb callback);
+]]
+
+if rawget(_G, "BCC_STANDALONE") then
+  return ffi.C -- standalone build: libbcc symbols are already linked into the executable
+else
+  return ffi.load(
+    os.getenv("LIBBCC_SO_PATH") or
+    rawget(_G, "LIBBCC_SO_PATH") or
+    "bcc") -- fall back to the system library search for libbcc
+end
diff --git a/src/lua/bcc/run.lua b/src/lua/bcc/run.lua
new file mode 100644
index 0000000..7f09f43
--- /dev/null
+++ b/src/lua/bcc/run.lua
@@ -0,0 +1,77 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+
+return function() -- CLI entry point: parse leading flags, then run the given trace script with (BPF, utils)
+  require("bcc.vendor.helpers")
+  local standalone = rawget(_G, "BCC_STANDALONE") -- set when bundled as a standalone executable
+  local progname = standalone or "bcc-probe"
+
+  local function print_usage() -- usage text to stderr, then exit(1)
+    io.stderr:write(string.format(
+      "usage: %s [[--version|--verbose] --] path_to_script.lua [...]\n",
+      progname))
+    os.exit(1)
+  end
+
+  local function print_version() -- version banner with LuaJIT details, then exit(0)
+    local jit = require("jit")
+    print(string.format("%s %s -- Running on %s (%s/%s)",
+      progname, rawget(_G, "BCC_VERSION") or "HEAD",
+      jit.version, jit.os, jit.arch))
+    os.exit(0)
+  end
+
+  while arg[1] and string.starts(arg[1], "-") do -- consume leading option arguments
+    local k = table.remove(arg, 1)
+    if k == "--" then
+      break -- explicit end of options
+    elseif standalone == nil and string.starts(k, "--so-path=") then
+      rawset(_G, "LIBBCC_SO_PATH", string.lstrip(k, "--so-path=")) -- tells bcc.libbcc which shared object to load
+    elseif k == "--llvm-debug" then
+      rawset(_G, "LIBBCC_LLVM_DEBUG", 1)
+    elseif k == "-V" or k == "--verbose" then
+      log.enabled = true
+    elseif k == "-v" or k == "--version" then
+      print_version()
+    else
+      print_usage()
+    end
+  end
+
+  local tracefile = table.remove(arg, 1) -- first non-option argument is the script to run
+  if not tracefile then print_usage() end
+
+  local BPF = require("bcc.bpf")
+  BPF.script_root(tracefile) -- resolve relative src_file/hdr_file paths against the script's directory
+
+  local USDT = require("bcc.usdt")
+  local utils = { -- toolbox handed to the trace script as its second argument
+    argparse = require("bcc.vendor.argparse"),
+    posix = require("bcc.vendor.posix"),
+    USDT = USDT,
+  }
+
+  local command = dofile(tracefile)
+  local res, err = xpcall(command, debug.traceback, BPF, utils) -- run with a traceback attached on error
+
+  if not res and err ~= "interrupted!" then
+    io.stderr:write("[ERROR] "..err.."\n")
+  end
+
+  BPF.cleanup() -- always detach probes and free buffers, even after an error
+  USDT.cleanup()
+  return res, err
+end
diff --git a/src/lua/bcc/sym.lua b/src/lua/bcc/sym.lua
new file mode 100644
index 0000000..d30546a
--- /dev/null
+++ b/src/lua/bcc/sym.lua
@@ -0,0 +1,53 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require("ffi")
+local libbcc = require("bcc.libbcc")
+local SYM = ffi.typeof("struct bcc_symbol[1]")
+
+local function create_cache(pid) -- build a symbol-resolution cache object; pid nil maps to -1
+  return {
+    _CACHE = libbcc.bcc_symcache_new(pid or -1, nil),
+    resolve = function(self, addr) -- returns (name, offset); ("[unknown]", 0) when unresolved
+      local sym = SYM()
+      if libbcc.bcc_symcache_resolve(self._CACHE, addr, sym) < 0 then
+        return "[unknown]", 0x0
+      end
+      local name_res = ffi.string(sym[0].demangle_name) -- copy before freeing the C-owned string
+      libbcc.bcc_symbol_free_demangle_name(sym);
+      return name_res, sym[0].offset
+    end
+  }
+end
+
+local function check_path_symbol(module, symname, addr, pid) -- resolve (module, symbol[, addr]) into (library path, offset)
+  local sym = SYM()
+  local module_path
+  if libbcc.bcc_resolve_symname(module, symname, addr or 0x0, pid or 0, nil, sym) < 0 then -- NOTE(review): pid defaults to 0 here but -1 elsewhere -- confirm intended
+    if sym[0].module == nil then
+      error("could not find library '%s' in the library path" % module)
+    else
+      module_path = ffi.string(sym[0].module) -- copy the path before freeing the C buffer
+      libbcc.bcc_procutils_free(sym[0].module)
+      error("failed to resolve symbol '%s' in '%s'" % {
+        symname, module_path})
+    end
+  end
+  module_path = ffi.string(sym[0].module)
+  libbcc.bcc_procutils_free(sym[0].module)
+  return module_path, sym[0].offset
+end
+
+return { create_cache=create_cache, check_path_symbol=check_path_symbol }
diff --git a/src/lua/bcc/table.lua b/src/lua/bcc/table.lua
new file mode 100644
index 0000000..9729751
--- /dev/null
+++ b/src/lua/bcc/table.lua
@@ -0,0 +1,402 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require("ffi")
+local libbcc = require("bcc.libbcc")
+local Posix = require("bcc.vendor.posix")
+
+local BaseTable = class("BaseTable")
+
+BaseTable.static.BPF_MAP_TYPE_HASH = 1
+BaseTable.static.BPF_MAP_TYPE_ARRAY = 2
+BaseTable.static.BPF_MAP_TYPE_PROG_ARRAY = 3
+BaseTable.static.BPF_MAP_TYPE_PERF_EVENT_ARRAY = 4
+BaseTable.static.BPF_MAP_TYPE_PERCPU_HASH = 5
+BaseTable.static.BPF_MAP_TYPE_PERCPU_ARRAY = 6
+BaseTable.static.BPF_MAP_TYPE_STACK_TRACE = 7
+BaseTable.static.BPF_MAP_TYPE_CGROUP_ARRAY = 8
+BaseTable.static.BPF_MAP_TYPE_LRU_HASH = 9
+BaseTable.static.BPF_MAP_TYPE_LRU_PERCPU_HASH = 10
+BaseTable.static.BPF_MAP_TYPE_LPM_TRIE = 11
+
+function BaseTable:initialize(t_type, bpf, map_id, map_fd, key_type, leaf_type) -- bind to an existing map and build key/leaf ctypes
+  assert(t_type == libbcc.bpf_table_type_id(bpf.module, map_id)) -- sanity: declared type must match the module's
+
+  self.t_type = t_type
+  self.bpf = bpf
+  self.map_id = map_id
+  self.map_fd = map_fd
+  self.c_key = ffi.typeof(key_type.."[1]") -- one-element arrays so values can be passed by pointer
+  self.c_leaf = ffi.typeof(leaf_type.."[1]")
+end
+
+function BaseTable:key_sprintf(key) -- render a key to its textual form via libbcc
+  local pkey = self.c_key(key)
+  local buf_len = ffi.sizeof(self.c_key) * 8 -- heuristic: 8 output chars per key byte
+  local pbuf = ffi.new("char[?]", buf_len)
+
+  local res = libbcc.bpf_table_key_snprintf(
+    self.bpf.module, self.map_id, pbuf, buf_len, pkey)
+  assert(res == 0, "could not print key")
+
+  return ffi.string(pbuf)
+end
+
+function BaseTable:leaf_sprintf(leaf) -- render a leaf (value) to its textual form via libbcc
+  local pleaf = self.c_leaf(leaf)
+  local buf_len = ffi.sizeof(self.c_leaf) * 8 -- same sizing heuristic as key_sprintf
+  local pbuf = ffi.new("char[?]", buf_len)
+
+  local res = libbcc.bpf_table_leaf_snprintf(
+    self.bpf.module, self.map_id, pbuf, buf_len, pleaf)
+  assert(res == 0, "could not print leaf")
+
+  return ffi.string(pbuf)
+end
+
+function BaseTable:key_scanf(key_str) -- parse a textual key back into its binary form
+  local pkey = self.c_key()
+  local res = libbcc.bpf_table_key_sscanf(
+    self.bpf.module, self.map_id, key_str, pkey)
+  assert(res == 0, "could not scanf key")
+  return pkey[0]
+end
+
+function BaseTable:leaf_scanf(leaf_str) -- parse a textual leaf back into its binary form
+  local pleaf = self.c_leaf()
+  local res = libbcc.bpf_table_leaf_sscanf(
+    self.bpf.module, self.map_id, leaf_str, pleaf)
+  assert(res == 0, "could not scanf leaf")
+  return pleaf[0]
+end
+
+function BaseTable:get(key) -- lookup; returns the leaf value, or nil when the key is absent
+  local pkey = self.c_key(key)
+  local pvalue = self.c_leaf()
+
+  if libbcc.bpf_lookup_elem(self.map_fd, pkey, pvalue) < 0 then
+    return nil
+  end
+
+  return pvalue[0]
+end
+
+function BaseTable:set(key, value) -- insert or update; flags = 0 -- NOTE(review): presumably BPF_ANY semantics, confirm against libbcc
+  local pkey = self.c_key(key)
+  local pvalue = self.c_leaf(value)
+  assert(libbcc.bpf_update_elem(self.map_fd, pkey, pvalue, 0) == 0, "could not update table")
+end
+
+function BaseTable:_empty_key() -- find a key value NOT present in the map, to seed get_next_key iteration
+  local pkey = self.c_key()
+  local pvalue = self.c_leaf()
+
+  for _, v in ipairs({0x0, 0x55, 0xff}) do -- try a few byte-fill patterns until one misses
+    ffi.fill(pkey, ffi.sizeof(pkey[0]), v)
+    if libbcc.bpf_lookup_elem(self.map_fd, pkey, pvalue) < 0 then
+      return pkey
+    end
+  end
+
+  error("failed to find an empty key for table iteration")
+end
+
+function BaseTable:keys() -- iterator over keys, seeded from an absent key (see _empty_key)
+  local pkey = self:_empty_key()
+
+  return function()
+    local pkey_next = self.c_key()
+
+    if libbcc.bpf_get_next_key(self.map_fd, pkey, pkey_next) < 0 then
+      return nil -- end of table
+    end
+
+    pkey = pkey_next -- advance the cursor
+    return pkey[0]
+  end
+end
+
+function BaseTable:items() -- iterator over (key, value) pairs
+  local pkey = self:_empty_key()
+
+  return function()
+    local pkey_next = self.c_key()
+    local pvalue = self.c_leaf()
+
+    if libbcc.bpf_get_next_key(self.map_fd, pkey, pkey_next) < 0 then
+      return nil -- end of table
+    end
+
+    pkey = pkey_next
+    assert(libbcc.bpf_lookup_elem(self.map_fd, pkey, pvalue) == 0) -- key was just enumerated, so the lookup is expected to succeed
+    return pkey[0], pvalue[0]
+  end
+end
+
+
+
+local HashTable = class("HashTable", BaseTable) -- wrapper for BPF_MAP_TYPE_HASH maps
+
+function HashTable:initialize(bpf, map_id, map_fd, key_type, leaf_type)
+  BaseTable.initialize(self, BaseTable.BPF_MAP_TYPE_HASH, bpf, map_id, map_fd, key_type, leaf_type)
+end
+
+function HashTable:delete(key) -- remove a key; returns true on success
+  local pkey = self.c_key(key)
+  return libbcc.bpf_delete_elem(self.map_fd, pkey) == 0
+end
+
+function HashTable:size() -- O(n): counts entries by full iteration -- NOTE(review): relies on an each() helper not visible in this chunk -- confirm
+  local n = 0
+  self:each(function() n = n + 1 end)
+  return n
+end
+
+
+
+local BaseArray = class("BaseArray", BaseTable) -- shared base for fixed-size array-type maps
+
+function BaseArray:initialize(t_type, bpf, map_id, map_fd, key_type, leaf_type)
+  BaseTable.initialize(self, t_type, bpf, map_id, map_fd, key_type, leaf_type)
+  self.max_entries = tonumber(libbcc.bpf_table_max_entries_id(self.bpf.module, self.map_id))
+end
+
+function BaseArray:_normalize_key(key) -- map negative indices onto the array end, then bounds-check
+  assert(type(key) == "number", "invalid key (expected a number)") -- fixed: message was missing its closing paren
+  if key < 0 then
+    key = self.max_entries + key -- python-style negative indexing
+  end
+  assert(key >= 0 and key < self.max_entries, string.format("out of range (%d, max %d)", key, self.max_entries)) -- also reject indices that remain negative
+  return key
+end
+
+function BaseArray:get(key) -- lookup after index normalization
+  return BaseTable.get(self, self:_normalize_key(key))
+end
+
+function BaseArray:set(key, value) -- update after index normalization
+  return BaseTable.set(self, self:_normalize_key(key), value)
+end
+
+function BaseArray:delete(key) -- BPF arrays have fixed slots; elements cannot be deleted
+  assert(nil, "unsupported")
+end
+
+function BaseArray:items(with_index) -- iterator over values (or 1-based index,value pairs) in slot order
+  local pkey = self.c_key()
+  local max = self.max_entries
+  local n = 0
+
+  -- TODO
+  return function()
+    local pvalue = self.c_leaf()
+
+    if n == max then
+      return nil -- all slots visited
+    end
+
+    pkey[0] = n
+    n = n + 1
+
+    if libbcc.bpf_lookup_elem(self.map_fd, pkey, pvalue) ~= 0 then
+      return nil -- stop on the first failed lookup
+    end
+
+    if with_index then
+      return n, pvalue[0] -- return 1-based index
+    else
+      return pvalue[0]
+    end
+  end
+end
+
+
+
+local Array = class("Array", BaseArray) -- plain BPF_MAP_TYPE_ARRAY wrapper
+
+function Array:initialize(bpf, map_id, map_fd, key_type, leaf_type)
+  BaseArray.initialize(self, BaseTable.BPF_MAP_TYPE_ARRAY, bpf, map_id, map_fd, key_type, leaf_type)
+end
+
+
+
+local PerfEventArray = class("PerfEventArray", BaseArray) -- per-cpu perf ring buffers exposed as an array map
+
+function PerfEventArray:initialize(bpf, map_id, map_fd, key_type, leaf_type)
+  BaseArray.initialize(self, BaseTable.BPF_MAP_TYPE_PERF_EVENT_ARRAY, bpf, map_id, map_fd, key_type, leaf_type)
+  self._callbacks = {} -- anchor ffi callback objects so they are not garbage-collected
+end
+
+local function _perf_id(id, cpu) -- registry key per (map, cpu); the "bcc:" prefix marks internal entries
+  return string.format("bcc:perf_event_array:%d:%d", tonumber(id), cpu or 0)
+end
+
+function PerfEventArray:_open_perf_buffer(cpu, callback, ctype, page_cnt, lost_cb) -- open one ring buffer bound to a cpu
+  local _cb = ffi.cast("perf_reader_raw_cb",
+    function (cookie, data, size)
+      callback(cpu, ctype(data)[0]) -- reinterpret the raw bytes as the user's struct type
+    end)
+
+  local _lost_cb = nil
+  if lost_cb then
+    _lost_cb = ffi.cast("perf_reader_lost_cb",
+      function (cookie, lost)
+        lost_cb(cookie, lost)
+      end)
+  end
+
+  -- default to 8 pages per buffer
+  local reader = libbcc.bpf_open_perf_buffer(_cb, _lost_cb, nil, -1, cpu, page_cnt or 8)
+  assert(reader, "failed to open perf buffer")
+
+  local fd = libbcc.perf_reader_fd(reader)
+  self:set(cpu, fd) -- publish the fd into this cpu's map slot so BPF programs can submit to it
+  self.bpf:perf_buffer_store(_perf_id(self.map_id, cpu), reader)
+  self._callbacks[cpu] = _cb
+end
+
+function PerfEventArray:open_perf_buffer(callback, data_type, data_params, page_cnt, lost_cb) -- open a buffer on every cpu
+  assert(data_type, "a data type is needed for callback conversion")
+  local ctype = ffi.typeof(data_type.."*", unpack(data_params or {}))
+  for i = 0, Posix.cpu_count() - 1 do
+    self:_open_perf_buffer(i, callback, ctype, page_cnt, lost_cb)
+  end
+end
+
+
+local StackTrace = class("StackTrace", BaseTable) -- wrapper for BPF_MAP_TYPE_STACK_TRACE maps
+
+StackTrace.static.MAX_STACK = 127 -- maximum frames walked per stack id
+
+function StackTrace:initialize(bpf, map_id, map_fd, key_type, leaf_type)
+  BaseTable.initialize(self, BaseTable.BPF_MAP_TYPE_STACK_TRACE, bpf, map_id, map_fd, key_type, leaf_type)
+  self._stackp = self.c_leaf() -- FIXME: not threadsafe
+end
+
+function StackTrace:walk(id) -- iterator over the frame addresses of stack trace `id`; nil if the id is absent
+  local pkey = self.c_key(id)
+  local pstack = self._stackp -- shared scratch buffer (see FIXME above)
+  local i = 0
+
+  if libbcc.bpf_lookup_elem(self.map_fd, pkey, pstack) < 0 then
+    return nil
+  end
+
+  return function()
+    if i >= StackTrace.MAX_STACK then
+      return nil
+    end
+
+    local addr = pstack[0].ip[i]
+    if addr == 0 then
+      return nil -- a zero address terminates the stack
+    end
+
+    i = i + 1
+    return addr
+  end
+end
+
+function StackTrace:get(id, resolver) -- collect the whole stack as a list, optionally symbolized via resolver(addr) -- NOTE(review): errors if id is absent (walk returns nil) -- confirm callers guard
+  local stack = {}
+  for addr in self:walk(id) do
+    table.insert(stack, resolver and resolver(addr) or addr)
+  end
+  return stack
+end
+
+local function _decode_table_type(desc) -- translate libbcc's JSON type descriptor into a C declaration string
+  local json = require("bcc.vendor.json")
+  local json_desc = ffi.string(desc)
+
+  local function _dec(t) -- strings are already plain C type names; tables describe composite types
+    if type(t) == "string" then
+      return t
+    end
+
+    local fields = {}
+    local struct = t[3] or "struct" -- third element selects struct / struct_packed / union
+
+    for _, value in ipairs(t[2]) do -- t[2] is the field list: {name, type[, array-size or bitfield-width]}
+      local f = nil
+
+      if #value == 2 then
+        f = string.format("%s %s;", _dec(value[2]), value[1])
+      elseif #value == 3 then
+        if type(value[3]) == "table" then -- array dimension
+          f = string.format("%s %s[%d];", _dec(value[2]), value[1], value[3][1])
+        elseif type(value[3]) == "number" then -- bitfield width
+          local t = _dec(value[2])
+          assert(t == "int" or t == "unsigned int",
+            "bitfields can only appear in [unsigned] int types")
+          f = string.format("%s %s:%d;", t, value[1], value[3])
+        end
+      end
+
+      assert(f ~= nil, "failed to decode type "..json_desc)
+      table.insert(fields, f)
+    end
+
+    assert(struct == "struct" or struct == "struct_packed" or struct == "union",
+           "unknown complex type: "..struct)
+    if struct == "union" then
+      return string.format("union { %s }", table.concat(fields, " "))
+    else
+      return string.format("struct { %s }", table.concat(fields, " ")) -- NOTE(review): struct_packed emits a plain struct here -- confirm packing is handled elsewhere
+    end
+  end
+  return _dec(json.parse(json_desc))
+end
+
+local function NewTable(bpf, name, key_type, leaf_type) -- factory: wrap the named BPF map in the matching table class
+  local id = libbcc.bpf_table_id(bpf.module, name)
+  local fd = libbcc.bpf_table_fd(bpf.module, name)
+
+  if fd < 0 then
+    return nil -- no such table in this module
+  end
+
+  local t_type = libbcc.bpf_table_type_id(bpf.module, id)
+  local table = nil -- class to instantiate (shadows the global `table` library within this function)
+
+  if t_type == BaseTable.BPF_MAP_TYPE_HASH then
+    table = HashTable
+  elseif t_type == BaseTable.BPF_MAP_TYPE_ARRAY then
+    table = Array
+  elseif t_type == BaseTable.BPF_MAP_TYPE_PERF_EVENT_ARRAY then
+    table = PerfEventArray
+  elseif t_type == BaseTable.BPF_MAP_TYPE_STACK_TRACE then
+    table = StackTrace
+  end
+
+  assert(table, "unsupported table type %d" % t_type)
+
+  if key_type == nil then -- no caller override: decode the compiled-in key description
+    local desc = libbcc.bpf_table_key_desc(bpf.module, name)
+    assert(desc, "Failed to load BPF table description for "..name)
+    key_type = _decode_table_type(desc)
+  end
+
+  if leaf_type == nil then -- likewise for the value type
+    local desc = libbcc.bpf_table_leaf_desc(bpf.module, name)
+    assert(desc, "Failed to load BPF table description for "..name)
+    leaf_type = _decode_table_type(desc)
+  end
+
+  log.info("key = %s value = %s", key_type, leaf_type)
+  return table:new(bpf, id, fd, key_type, leaf_type)
+end
+
+return NewTable
diff --git a/src/lua/bcc/tracerpipe.lua b/src/lua/bcc/tracerpipe.lua
new file mode 100644
index 0000000..c40267b
--- /dev/null
+++ b/src/lua/bcc/tracerpipe.lua
@@ -0,0 +1,58 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local TracerPipe = class("TracerPipe") -- line-oriented reader for the kernel's ftrace trace_pipe
+
+TracerPipe.static.TRACEFS = "/sys/kernel/debug/tracing" -- assumes debugfs is mounted here -- TODO confirm on target systems
+TracerPipe.static.fields = "%s+(.-)%-(%d+)%s+%[(%d+)%]%s+(....)%s+([%d%.]+):.-:%s+(.+)" -- captures: task, pid, cpu, flags, timestamp, message
+
+function TracerPipe:close()
+  -- Release the underlying file handle, if one was ever opened.
+  local pipe = self.pipe
+  if pipe ~= nil then pipe:close() end
+end
+
+function TracerPipe:open()
+  -- Lazily open trace_pipe on first use; later calls reuse the cached handle.
+  self.pipe = self.pipe or
+    assert(io.open(TracerPipe.TRACEFS .. "/trace_pipe"))
+  return self.pipe
+end
+
+function TracerPipe:readline()
+  local pipe = self:open(); return pipe:read() -- one trace line; nil on EOF
+end
+
+function TracerPipe:trace_fields()
+  -- Next parseable trace line as task, pid, cpu, flags, ts, msg; nil on EOF.
+  while true do
+    local line = self:readline()
+    if not line then
+      return nil -- fix: a nil line in blocking mode used to crash on line:starts(nil)
+    end
+    if not line:starts("CPU:") then -- skip the "CPU:..." marker lines
+      local task, pid, cpu, flags, ts, msg = line:match(TracerPipe.fields)
+      if task ~= nil then
+        return task, tonumber(pid), tonumber(cpu), flags, tonumber(ts), msg
+      end
+    end
+  end
+end
+
+function TracerPipe:initialize(nonblocking)
+  self.nonblocking = nonblocking -- when truthy, trace_fields() bails out with nil once readline() yields no data
+end
+
+return TracerPipe
diff --git a/src/lua/bcc/usdt.lua b/src/lua/bcc/usdt.lua
new file mode 100644
index 0000000..e9788da
--- /dev/null
+++ b/src/lua/bcc/usdt.lua
@@ -0,0 +1,76 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require("ffi")
+local libbcc = require("bcc.libbcc")
+local Usdt = class("USDT") -- thin wrapper around libbcc's USDT probe context
+
+Usdt.static.open_contexts = {} -- every live context, so Usdt.cleanup() can close them all
+
+function Usdt.static.cleanup()
+  -- Tear down every context registered by Usdt:initialize().
+  local contexts = Usdt.static.open_contexts
+  for i = 1, #contexts do contexts[i]:_cleanup() end
+end
+
+function Usdt:initialize(args)
+  assert(args.pid or args.path) -- need either a target process or an on-disk binary
+
+  if args.pid then
+    self.pid = args.pid
+    self.context = libbcc.bcc_usdt_new_frompid(args.pid)
+  elseif args.path then
+    self.path = args.path
+    self.context = libbcc.bcc_usdt_new_frompath(args.path)
+  end
+
+  assert(self.context ~= nil, "failed to create USDT context") -- a NULL cdata pointer compares equal to nil under LuaJIT, so this catches C-side failure
+  table.insert(Usdt.open_contexts, self) -- register for Usdt.cleanup()
+end
+
+function Usdt:enable_probe(args)
+  assert(args.probe and args.fn_name)
+  local rc = libbcc.bcc_usdt_enable_probe(self.context, args.probe, args.fn_name)
+  assert(rc == 0)
+end
+
+function Usdt:_cleanup()
+  libbcc.bcc_usdt_close(self.context)
+  self.context = nil -- clear the pointer so later code cannot touch the freed context
+end
+
+function Usdt:_get_text()
+  local argc = libbcc.bcc_usdt_genargs(self.context) -- C string; presumably generated USDT argument-reader code (name `argc` is misleading) -- verify against libbcc
+  assert(argc ~= nil)
+  return ffi.string(argc)
+end
+
+function Usdt:_attach_uprobes(bpf)
+  local uprobes = {} -- collect first; attaching is deferred until after the C iteration finishes
+  local cb = ffi.cast("bcc_usdt_uprobe_cb",
+    function(binpath, fn_name, addr, pid)
+      table.insert(uprobes, {name=ffi.string(binpath),
+        addr=addr, fn_name=ffi.string(fn_name), pid=pid})
+    end)
+
+  libbcc.bcc_usdt_foreach_uprobe(self.context, cb)
+  cb:free() -- release the FFI callback slot (a limited LuaJIT resource)
+
+  for _, args in ipairs(uprobes) do
+    bpf:attach_uprobe(args)
+  end
+end
+
+return Usdt
diff --git a/src/lua/bcc/vendor/argparse.lua b/src/lua/bcc/vendor/argparse.lua
new file mode 100644
index 0000000..53ef173
--- /dev/null
+++ b/src/lua/bcc/vendor/argparse.lua
@@ -0,0 +1,1180 @@
+-- The MIT License (MIT)
+
+-- Copyright (c) 2013 - 2015 Peter Melnichenko
+
+-- Permission is hereby granted, free of charge, to any person obtaining a copy of
+-- this software and associated documentation files (the "Software"), to deal in
+-- the Software without restriction, including without limitation the rights to
+-- use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+-- the Software, and to permit persons to whom the Software is furnished to do so,
+-- subject to the following conditions:
+
+-- The above copyright notice and this permission notice shall be included in all
+-- copies or substantial portions of the Software.
+
+-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+-- FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+-- COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+-- IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+-- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+local function deep_update(t1, t2)
+   for k, v in pairs(t2) do
+      if type(v) == "table" then
+         v = deep_update({}, v)
+      end
+
+      t1[k] = v
+   end
+
+   return t1
+end
+
+-- A property is a tuple {name, callback}.
+-- properties.args is number of properties that can be set as arguments
+-- when calling an object.
+local function class(prototype, properties, parent)
+   -- Class is the metatable of its instances.
+   local cl = {}
+   cl.__index = cl
+
+   if parent then
+      cl.__prototype = deep_update(deep_update({}, parent.__prototype), prototype)
+   else
+      cl.__prototype = prototype
+   end
+
+   if properties then
+      local names = {}
+
+      -- Create setter methods and fill set of property names.
+      for _, property in ipairs(properties) do
+         local name, callback = property[1], property[2]
+
+         cl[name] = function(self, value)
+            if not callback(self, value) then
+               self["_" .. name] = value
+            end
+
+            return self
+         end
+
+         names[name] = true
+      end
+
+      function cl.__call(self, ...)
+         -- When calling an object, if the first argument is a table,
+         -- interpret keys as property names, else delegate arguments
+         -- to corresponding setters in order.
+         if type((...)) == "table" then
+            for name, value in pairs((...)) do
+               if names[name] then
+                  self[name](self, value)
+               end
+            end
+         else
+            local nargs = select("#", ...)
+
+            for i, property in ipairs(properties) do
+               if i > nargs or i > properties.args then
+                  break
+               end
+
+               local arg = select(i, ...)
+
+               if arg ~= nil then
+                  self[property[1]](self, arg)
+               end
+            end
+         end
+
+         return self
+      end
+   end
+
+   -- If indexing class fails, fallback to its parent.
+   local class_metatable = {}
+   class_metatable.__index = parent
+
+   function class_metatable.__call(self, ...)
+      -- Calling a class returns its instance.
+      -- Arguments are delegated to the instance.
+      local object = deep_update({}, self.__prototype)
+      setmetatable(object, self)
+      return object(...)
+   end
+
+   return setmetatable(cl, class_metatable)
+end
+
+local function typecheck(name, types, value)
+   for _, type_ in ipairs(types) do
+      if type(value) == type_ then
+         return true
+      end
+   end
+
+   error(("bad property '%s' (%s expected, got %s)"):format(name, table.concat(types, " or "), type(value)))
+end
+
+local function typechecked(name, ...)
+   local types = {...}
+   return {name, function(_, value) typecheck(name, types, value) end}
+end
+
+local multiname = {"name", function(self, value)
+   typecheck("name", {"string"}, value)
+
+   for alias in value:gmatch("%S+") do
+      self._name = self._name or alias
+      table.insert(self._aliases, alias)
+   end
+
+   -- Do not set _name as with other properties.
+   return true
+end}
+
+local function parse_boundaries(str)
+   if tonumber(str) then
+      return tonumber(str), tonumber(str)
+   end
+
+   if str == "*" then
+      return 0, math.huge
+   end
+
+   if str == "+" then
+      return 1, math.huge
+   end
+
+   if str == "?" then
+      return 0, 1
+   end
+
+   if str:match "^%d+%-%d+$" then
+      local min, max = str:match "^(%d+)%-(%d+)$"
+      return tonumber(min), tonumber(max)
+   end
+
+   if str:match "^%d+%+$" then
+      local min = str:match "^(%d+)%+$"
+      return tonumber(min), math.huge
+   end
+end
+
+local function boundaries(name)
+   return {name, function(self, value)
+      typecheck(name, {"number", "string"}, value)
+
+      local min, max = parse_boundaries(value)
+
+      if not min then
+         error(("bad property '%s'"):format(name))
+      end
+
+      self["_min" .. name], self["_max" .. name] = min, max
+   end}
+end
+
+local actions = {}
+
+local option_action = {"action", function(_, value)
+   typecheck("action", {"function", "string"}, value)
+
+   if type(value) == "string" and not actions[value] then
+      error(("unknown action '%s'"):format(value))
+   end
+end}
+
+local option_init = {"init", function(self)
+   self._has_init = true
+end}
+
+local option_default = {"default", function(self, value)
+   if type(value) ~= "string" then
+      self._init = value
+      self._has_init = true
+      return true
+   end
+end}
+
+local add_help = {"add_help", function(self, value)
+   typecheck("add_help", {"boolean", "string", "table"}, value)
+
+   if self._has_help then
+      table.remove(self._options)
+      self._has_help = false
+   end
+
+   if value then
+      local help = self:flag()
+         :description "Show this help message and exit."
+         :action(function()
+            print(self:get_help())
+            os.exit(0)
+         end)
+
+      if value ~= true then
+         help = help(value)
+      end
+
+      if not help._name then
+         help "-h" "--help"
+      end
+
+      self._has_help = true
+   end
+end}
+
+local Parser = class({
+   _arguments = {},
+   _options = {},
+   _commands = {},
+   _mutexes = {},
+   _require_command = true,
+   _handle_options = true
+}, {
+   args = 3,
+   typechecked("name", "string"),
+   typechecked("description", "string"),
+   typechecked("epilog", "string"),
+   typechecked("usage", "string"),
+   typechecked("help", "string"),
+   typechecked("require_command", "boolean"),
+   typechecked("handle_options", "boolean"),
+   typechecked("action", "function"),
+   typechecked("command_target", "string"),
+   add_help
+})
+
+local Command = class({
+   _aliases = {}
+}, {
+   args = 3,
+   multiname,
+   typechecked("description", "string"),
+   typechecked("epilog", "string"),
+   typechecked("target", "string"),
+   typechecked("usage", "string"),
+   typechecked("help", "string"),
+   typechecked("require_command", "boolean"),
+   typechecked("handle_options", "boolean"),
+   typechecked("action", "function"),
+   typechecked("command_target", "string"),
+   add_help
+}, Parser)
+
+local Argument = class({
+   _minargs = 1,
+   _maxargs = 1,
+   _mincount = 1,
+   _maxcount = 1,
+   _defmode = "unused",
+   _show_default = true
+}, {
+   args = 5,
+   typechecked("name", "string"),
+   typechecked("description", "string"),
+   option_default,
+   typechecked("convert", "function", "table"),
+   boundaries("args"),
+   typechecked("target", "string"),
+   typechecked("defmode", "string"),
+   typechecked("show_default", "boolean"),
+   typechecked("argname", "string", "table"),
+   option_action,
+   option_init
+})
+
+local Option = class({
+   _aliases = {},
+   _mincount = 0,
+   _overwrite = true
+}, {
+   args = 6,
+   multiname,
+   typechecked("description", "string"),
+   option_default,
+   typechecked("convert", "function", "table"),
+   boundaries("args"),
+   boundaries("count"),
+   typechecked("target", "string"),
+   typechecked("defmode", "string"),
+   typechecked("show_default", "boolean"),
+   typechecked("overwrite", "boolean"),
+   typechecked("argname", "string", "table"),
+   option_action,
+   option_init
+}, Argument)
+
+function Argument:_get_argument_list()
+   local buf = {}
+   local i = 1
+
+   while i <= math.min(self._minargs, 3) do
+      local argname = self:_get_argname(i)
+
+      if self._default and self._defmode:find "a" then
+         argname = "[" .. argname .. "]"
+      end
+
+      table.insert(buf, argname)
+      i = i+1
+   end
+
+   while i <= math.min(self._maxargs, 3) do
+      table.insert(buf, "[" .. self:_get_argname(i) .. "]")
+      i = i+1
+
+      if self._maxargs == math.huge then
+         break
+      end
+   end
+
+   if i < self._maxargs then
+      table.insert(buf, "...")
+   end
+
+   return buf
+end
+
+function Argument:_get_usage()
+   local usage = table.concat(self:_get_argument_list(), " ")
+
+   if self._default and self._defmode:find "u" then
+      if self._maxargs > 1 or (self._minargs == 1 and not self._defmode:find "a") then
+         usage = "[" .. usage .. "]"
+      end
+   end
+
+   return usage
+end
+
+function actions.store_true(result, target)
+   result[target] = true
+end
+
+function actions.store_false(result, target)
+   result[target] = false
+end
+
+function actions.store(result, target, argument)
+   result[target] = argument
+end
+
+function actions.count(result, target, _, overwrite)
+   if not overwrite then
+      result[target] = result[target] + 1
+   end
+end
+
+function actions.append(result, target, argument, overwrite)
+   result[target] = result[target] or {}
+   table.insert(result[target], argument)
+
+   if overwrite then
+      table.remove(result[target], 1)
+   end
+end
+
+function actions.concat(result, target, arguments, overwrite)
+   if overwrite then
+      error("'concat' action can't handle too many invocations")
+   end
+
+   result[target] = result[target] or {}
+
+   for _, argument in ipairs(arguments) do
+      table.insert(result[target], argument)
+   end
+end
+
+function Argument:_get_action()
+   local action, init
+
+   if self._maxcount == 1 then
+      if self._maxargs == 0 then
+         action, init = "store_true", nil
+      else
+         action, init = "store", nil
+      end
+   else
+      if self._maxargs == 0 then
+         action, init = "count", 0
+      else
+         action, init = "append", {}
+      end
+   end
+
+   if self._action then
+      action = self._action
+   end
+
+   if self._has_init then
+      init = self._init
+   end
+
+   if type(action) == "string" then
+      action = actions[action]
+   end
+
+   return action, init
+end
+
+-- Returns placeholder for `narg`-th argument.
+function Argument:_get_argname(narg)
+   local argname = self._argname or self:_get_default_argname()
+
+   if type(argname) == "table" then
+      return argname[narg]
+   else
+      return argname
+   end
+end
+
+function Argument:_get_default_argname()
+   return "<" .. self._name .. ">"
+end
+
+function Option:_get_default_argname()
+   return "<" .. self:_get_default_target() .. ">"
+end
+
+-- Returns label to be shown in the help message.
+function Argument:_get_label()
+   return self._name
+end
+
+function Option:_get_label()
+   local variants = {}
+   local argument_list = self:_get_argument_list()
+   table.insert(argument_list, 1, nil)
+
+   for _, alias in ipairs(self._aliases) do
+      argument_list[1] = alias
+      table.insert(variants, table.concat(argument_list, " "))
+   end
+
+   return table.concat(variants, ", ")
+end
+
+function Command:_get_label()
+   return table.concat(self._aliases, ", ")
+end
+
+function Argument:_get_description()
+   if self._default and self._show_default then
+      if self._description then
+         return ("%s (default: %s)"):format(self._description, self._default)
+      else
+         return ("default: %s"):format(self._default)
+      end
+   else
+      return self._description or ""
+   end
+end
+
+function Command:_get_description()
+   return self._description or ""
+end
+
+function Option:_get_usage()
+   local usage = self:_get_argument_list()
+   table.insert(usage, 1, self._name)
+   usage = table.concat(usage, " ")
+
+   if self._mincount == 0 or self._default then
+      usage = "[" .. usage .. "]"
+   end
+
+   return usage
+end
+
+function Argument:_get_default_target()
+   return self._name
+end
+
+function Option:_get_default_target()
+   local res
+
+   for _, alias in ipairs(self._aliases) do
+      if alias:sub(1, 1) == alias:sub(2, 2) then
+         res = alias:sub(3)
+         break
+      end
+   end
+
+   res = res or self._name:sub(2)
+   return (res:gsub("-", "_"))
+end
+
+function Option:_is_vararg()
+   return self._maxargs ~= self._minargs
+end
+
+function Parser:_get_fullname()
+   local parent = self._parent
+   local buf = {self._name}
+
+   while parent do
+      table.insert(buf, 1, parent._name)
+      parent = parent._parent
+   end
+
+   return table.concat(buf, " ")
+end
+
+function Parser:_update_charset(charset)
+   charset = charset or {}
+
+   for _, command in ipairs(self._commands) do
+      command:_update_charset(charset)
+   end
+
+   for _, option in ipairs(self._options) do
+      for _, alias in ipairs(option._aliases) do
+         charset[alias:sub(1, 1)] = true
+      end
+   end
+
+   return charset
+end
+
+function Parser:argument(...)
+   local argument = Argument(...)
+   table.insert(self._arguments, argument)
+   return argument
+end
+
+function Parser:option(...)
+   local option = Option(...)
+
+   if self._has_help then
+      table.insert(self._options, #self._options, option)
+   else
+      table.insert(self._options, option)
+   end
+
+   return option
+end
+
+function Parser:flag(...)
+   return self:option():args(0)(...)
+end
+
+function Parser:command(...)
+   local command = Command():add_help(true)(...)
+   command._parent = self
+   table.insert(self._commands, command)
+   return command
+end
+
+function Parser:mutex(...)
+   local options = {...}
+
+   for i, option in ipairs(options) do
+      assert(getmetatable(option) == Option, ("bad argument #%d to 'mutex' (Option expected)"):format(i))
+   end
+
+   table.insert(self._mutexes, options)
+   return self
+end
+
+local max_usage_width = 70
+local usage_welcome = "Usage: "
+
+function Parser:get_usage()
+   if self._usage then
+      return self._usage
+   end
+
+   local lines = {usage_welcome .. self:_get_fullname()}
+
+   local function add(s)
+      if #lines[#lines]+1+#s <= max_usage_width then
+         lines[#lines] = lines[#lines] .. " " .. s
+      else
+         lines[#lines+1] = (" "):rep(#usage_welcome) .. s
+      end
+   end
+
+   -- This can definitely be refactored into something cleaner
+   local mutex_options = {}
+   local vararg_mutexes = {}
+
+   -- First, put mutexes which do not contain vararg options and remember those which do
+   for _, mutex in ipairs(self._mutexes) do
+      local buf = {}
+      local is_vararg = false
+
+      for _, option in ipairs(mutex) do
+         if option:_is_vararg() then
+            is_vararg = true
+         end
+
+         table.insert(buf, option:_get_usage())
+         mutex_options[option] = true
+      end
+
+      local repr = "(" .. table.concat(buf, " | ") .. ")"
+
+      if is_vararg then
+         table.insert(vararg_mutexes, repr)
+      else
+         add(repr)
+      end
+   end
+
+   -- Second, put regular options
+   for _, option in ipairs(self._options) do
+      if not mutex_options[option] and not option:_is_vararg() then
+         add(option:_get_usage())
+      end
+   end
+
+   -- Put positional arguments
+   for _, argument in ipairs(self._arguments) do
+      add(argument:_get_usage())
+   end
+
+   -- Put mutexes containing vararg options
+   for _, mutex_repr in ipairs(vararg_mutexes) do
+      add(mutex_repr)
+   end
+
+   for _, option in ipairs(self._options) do
+      if not mutex_options[option] and option:_is_vararg() then
+         add(option:_get_usage())
+      end
+   end
+
+   if #self._commands > 0 then
+      if self._require_command then
+         add("<command>")
+      else
+         add("[<command>]")
+      end
+
+      add("...")
+   end
+
+   return table.concat(lines, "\n")
+end
+
+local margin_len = 3
+local margin_len2 = 25
+local margin = (" "):rep(margin_len)
+local margin2 = (" "):rep(margin_len2)
+
+local function make_two_columns(s1, s2)
+   if s2 == "" then
+      return margin .. s1
+   end
+
+   s2 = s2:gsub("\n", "\n" .. margin2)
+
+   if #s1 < (margin_len2-margin_len) then
+      return margin .. s1 .. (" "):rep(margin_len2-margin_len-#s1) .. s2
+   else
+      return margin .. s1 .. "\n" .. margin2 .. s2
+   end
+end
+
+function Parser:get_help()
+   if self._help then
+      return self._help
+   end
+
+   local blocks = {self:get_usage()}
+
+   if self._description then
+      table.insert(blocks, self._description)
+   end
+
+   local labels = {"Arguments:", "Options:", "Commands:"}
+
+   for i, elements in ipairs{self._arguments, self._options, self._commands} do
+      if #elements > 0 then
+         local buf = {labels[i]}
+
+         for _, element in ipairs(elements) do
+            table.insert(buf, make_two_columns(element:_get_label(), element:_get_description()))
+         end
+
+         table.insert(blocks, table.concat(buf, "\n"))
+      end
+   end
+
+   if self._epilog then
+      table.insert(blocks, self._epilog)
+   end
+
+   return table.concat(blocks, "\n\n")
+end
+
+local function get_tip(context, wrong_name)
+   local context_pool = {}
+   local possible_name
+   local possible_names = {}
+
+   for name in pairs(context) do
+      if type(name) == "string" then
+         for i = 1, #name do
+            possible_name = name:sub(1, i - 1) .. name:sub(i + 1)
+
+            if not context_pool[possible_name] then
+               context_pool[possible_name] = {}
+            end
+
+            table.insert(context_pool[possible_name], name)
+         end
+      end
+   end
+
+   for i = 1, #wrong_name + 1 do
+      possible_name = wrong_name:sub(1, i - 1) .. wrong_name:sub(i + 1)
+
+      if context[possible_name] then
+         possible_names[possible_name] = true
+      elseif context_pool[possible_name] then
+         for _, name in ipairs(context_pool[possible_name]) do
+            possible_names[name] = true
+         end
+      end
+   end
+
+   local first = next(possible_names)
+
+   if first then
+      if next(possible_names, first) then
+         local possible_names_arr = {}
+
+         for name in pairs(possible_names) do
+            table.insert(possible_names_arr, "'" .. name .. "'")
+         end
+
+         table.sort(possible_names_arr)
+         return "\nDid you mean one of these: " .. table.concat(possible_names_arr, " ") .. "?"
+      else
+         return "\nDid you mean '" .. first .. "'?"
+      end
+   else
+      return ""
+   end
+end
+
+local ElementState = class({
+   invocations = 0
+})
+
+function ElementState:__call(state, element)
+   self.state = state
+   self.result = state.result
+   self.element = element
+   self.target = element._target or element:_get_default_target()
+   self.action, self.result[self.target] = element:_get_action()
+   return self
+end
+
+function ElementState:error(fmt, ...)
+   self.state:error(fmt, ...)
+end
+
+function ElementState:convert(argument)
+   local converter = self.element._convert
+
+   if converter then
+      local ok, err
+
+      if type(converter) == "function" then
+         ok, err = converter(argument)
+      else
+         ok = converter[argument]
+      end
+
+      if ok == nil then
+         self:error(err and "%s" or "malformed argument '%s'", err or argument)
+      end
+
+      argument = ok
+   end
+
+   return argument
+end
+
+function ElementState:default(mode)
+   return self.element._defmode:find(mode) and self.element._default
+end
+
+local function bound(noun, min, max, is_max)
+   local res = ""
+
+   if min ~= max then
+      res = "at " .. (is_max and "most" or "least") .. " "
+   end
+
+   local number = is_max and max or min
+   return res .. tostring(number) .. " " .. noun ..  (number == 1 and "" or "s")
+end
+
+function ElementState:invoke(alias)
+   self.open = true
+   self.name = ("%s '%s'"):format(alias and "option" or "argument", alias or self.element._name)
+   self.overwrite = false
+
+   if self.invocations >= self.element._maxcount then
+      if self.element._overwrite then
+         self.overwrite = true
+      else
+         self:error("%s must be used %s", self.name, bound("time", self.element._mincount, self.element._maxcount, true))
+      end
+   else
+      self.invocations = self.invocations + 1
+   end
+
+   self.args = {}
+
+   if self.element._maxargs <= 0 then
+      self:close()
+   end
+
+   return self.open
+end
+
+function ElementState:pass(argument)
+   argument = self:convert(argument)
+   table.insert(self.args, argument)
+
+   if #self.args >= self.element._maxargs then
+      self:close()
+   end
+
+   return self.open
+end
+
+function ElementState:complete_invocation()
+   while #self.args < self.element._minargs do
+      self:pass(self.element._default)
+   end
+end
+
+function ElementState:close()
+   if self.open then
+      self.open = false
+
+      if #self.args < self.element._minargs then
+         if self:default("a") then
+            self:complete_invocation()
+         else
+            if #self.args == 0 then
+               if getmetatable(self.element) == Argument then
+                  self:error("missing %s", self.name)
+               elseif self.element._maxargs == 1 then
+                  self:error("%s requires an argument", self.name)
+               end
+            end
+
+            self:error("%s requires %s", self.name, bound("argument", self.element._minargs, self.element._maxargs))
+         end
+      end
+
+      local args = self.args
+
+      if self.element._maxargs <= 1 then
+         args = args[1]
+      end
+
+      if self.element._maxargs == 1 and self.element._minargs == 0 and self.element._mincount ~= self.element._maxcount then
+         args = self.args
+      end
+
+      self.action(self.result, self.target, args, self.overwrite)
+   end
+end
+
+local ParseState = class({
+   result = {},
+   options = {},
+   arguments = {},
+   argument_i = 1,
+   element_to_mutexes = {},
+   mutex_to_used_option = {},
+   command_actions = {}
+})
+
+function ParseState:__call(parser, error_handler)
+   self.parser = parser
+   self.error_handler = error_handler
+   self.charset = parser:_update_charset()
+   self:switch(parser)
+   return self
+end
+
+function ParseState:error(fmt, ...)
+   self.error_handler(self.parser, fmt:format(...))
+end
+
+function ParseState:switch(parser)
+   self.parser = parser
+
+   if parser._action then
+      table.insert(self.command_actions, {action = parser._action, name = parser._name})
+   end
+
+   for _, option in ipairs(parser._options) do
+      option = ElementState(self, option)
+      table.insert(self.options, option)
+
+      for _, alias in ipairs(option.element._aliases) do
+         self.options[alias] = option
+      end
+   end
+
+   for _, mutex in ipairs(parser._mutexes) do
+      for _, option in ipairs(mutex) do
+         if not self.element_to_mutexes[option] then
+            self.element_to_mutexes[option] = {}
+         end
+
+         table.insert(self.element_to_mutexes[option], mutex)
+      end
+   end
+
+   for _, argument in ipairs(parser._arguments) do
+      argument = ElementState(self, argument)
+      table.insert(self.arguments, argument)
+      argument:invoke()
+   end
+
+   self.handle_options = parser._handle_options
+   self.argument = self.arguments[self.argument_i]
+   self.commands = parser._commands
+
+   for _, command in ipairs(self.commands) do
+      for _, alias in ipairs(command._aliases) do
+         self.commands[alias] = command
+      end
+   end
+end
+
+function ParseState:get_option(name)
+   local option = self.options[name]
+
+   if not option then
+      self:error("unknown option '%s'%s", name, get_tip(self.options, name))
+   else
+      return option
+   end
+end
+
+function ParseState:get_command(name)
+   local command = self.commands[name]
+
+   if not command then
+      if #self.commands > 0 then
+         self:error("unknown command '%s'%s", name, get_tip(self.commands, name))
+      else
+         self:error("too many arguments")
+      end
+   else
+      return command
+   end
+end
+
+function ParseState:invoke(option, name)
+   self:close()
+
+   if self.element_to_mutexes[option.element] then
+      for _, mutex in ipairs(self.element_to_mutexes[option.element]) do
+         local used_option = self.mutex_to_used_option[mutex]
+
+         if used_option and used_option ~= option then
+            self:error("option '%s' can not be used together with %s", name, used_option.name)
+         else
+            self.mutex_to_used_option[mutex] = option
+         end
+      end
+   end
+
+   if option:invoke(name) then
+      self.option = option
+   end
+end
+
+function ParseState:pass(arg)
+   if self.option then
+      if not self.option:pass(arg) then
+         self.option = nil
+      end
+   elseif self.argument then
+      if not self.argument:pass(arg) then
+         self.argument_i = self.argument_i + 1
+         self.argument = self.arguments[self.argument_i]
+      end
+   else
+      local command = self:get_command(arg)
+      self.result[command._target or command._name] = true
+
+      if self.parser._command_target then
+         self.result[self.parser._command_target] = command._name
+      end
+
+      self:switch(command)
+   end
+end
+
+function ParseState:close()
+   if self.option then
+      self.option:close()
+      self.option = nil
+   end
+end
+
+function ParseState:finalize()
+   self:close()
+
+   for i = self.argument_i, #self.arguments do
+      local argument = self.arguments[i]
+      if #argument.args == 0 and argument:default("u") then
+         argument:complete_invocation()
+      else
+         argument:close()
+      end
+   end
+
+   if self.parser._require_command and #self.commands > 0 then
+      self:error("a command is required")
+   end
+
+   for _, option in ipairs(self.options) do
+      local name = option.name or ("option '%s'"):format(option.element._name)
+
+      if option.invocations == 0 then
+         if option:default("u") then
+            option:invoke(name)
+            option:complete_invocation()
+            option:close()
+         end
+      end
+
+      local mincount = option.element._mincount
+
+      if option.invocations < mincount then
+         if option:default("a") then
+            while option.invocations < mincount do
+               option:invoke(name)
+               option:close()
+            end
+         elseif option.invocations == 0 then
+            self:error("missing %s", name)
+         else
+            self:error("%s must be used %s", name, bound("time", mincount, option.element._maxcount))
+         end
+      end
+   end
+
+   for i = #self.command_actions, 1, -1 do
+      self.command_actions[i].action(self.result, self.command_actions[i].name)
+   end
+end
+
+function ParseState:parse(args)
+   for _, arg in ipairs(args) do
+      local plain = true
+
+      if self.handle_options then
+         local first = arg:sub(1, 1)
+
+         if self.charset[first] then
+            if #arg > 1 then
+               plain = false
+
+               if arg:sub(2, 2) == first then
+                  if #arg == 2 then
+                     self:close()
+                     self.handle_options = false
+                  else
+                     local equals = arg:find "="
+                     if equals then
+                        local name = arg:sub(1, equals - 1)
+                        local option = self:get_option(name)
+
+                        if option.element._maxargs <= 0 then
+                           self:error("option '%s' does not take arguments", name)
+                        end
+
+                        self:invoke(option, name)
+                        self:pass(arg:sub(equals + 1))
+                     else
+                        local option = self:get_option(arg)
+                        self:invoke(option, arg)
+                     end
+                  end
+               else
+                  for i = 2, #arg do
+                     local name = first .. arg:sub(i, i)
+                     local option = self:get_option(name)
+                     self:invoke(option, name)
+
+                     if i ~= #arg and option.element._maxargs > 0 then
+                        self:pass(arg:sub(i + 1))
+                        break
+                     end
+                  end
+               end
+            end
+         end
+      end
+
+      if plain then
+         self:pass(arg)
+      end
+   end
+
+   self:finalize()
+   return self.result
+end
+
+function Parser:error(msg)
+   io.stderr:write(("%s\n\nError: %s\n"):format(self:get_usage(), msg))
+   os.exit(1)
+end
+
+-- Compatibility with strict.lua and other checkers:
+local default_cmdline = rawget(_G, "arg") or {}
+
+function Parser:_parse(args, error_handler)
+   return ParseState(self, error_handler):parse(args or default_cmdline)
+end
+
+function Parser:parse(args)
+   return self:_parse(args, self.error)
+end
+
+local function xpcall_error_handler(err)
+   return tostring(err) .. "\noriginal " .. debug.traceback("", 2):sub(2)
+end
+
+function Parser:pparse(args)
+   local parse_error
+
+   local ok, result = xpcall(function()
+      return self:_parse(args, function(_, err)
+         parse_error = err
+         error(err, 0)
+      end)
+   end, xpcall_error_handler)
+
+   if ok then
+      return true, result
+   elseif not parse_error then
+      error(result, 0)
+   else
+      return false, parse_error
+   end
+end
+
+return function(...)
+   return Parser(default_cmdline[0]):add_help(true)(...)
+end
diff --git a/src/lua/bcc/vendor/helpers.lua b/src/lua/bcc/vendor/helpers.lua
new file mode 100644
index 0000000..28d5a0c
--- /dev/null
+++ b/src/lua/bcc/vendor/helpers.lua
@@ -0,0 +1,199 @@
+do
+  -- Extend string.format with a '%p' verb for 64-bit FFI values: a
+  -- uint64_t argument is rendered as lowercase hex (no 0x prefix) and the
+  -- verb is rewritten to '%s'. Every other format falls through unchanged.
+  local ffi = require("ffi")
+  local ptrtype = ffi.typeof("uint64_t")
+  local strformat = string.format
+  function string.format(format, ...)
+    local args = {...}
+    local match_no = 1
+    local newfmt, count = string.gsub(format, "()%%(.-)(%a)",
+      function(_, mods, t)
+        local n = match_no
+        match_no = match_no + 1
+        if t == 'p' and ffi.istype(ptrtype, args[n]) then
+          -- Split the 64-bit value into two 32-bit halves so each can be
+          -- printed with plain '%x' (Lua numbers are doubles).
+          local lo = tonumber(args[n] % 4294967296ULL)
+          local hi = tonumber(args[n] / 4294967296ULL)
+          args[n] = (hi == 0) and strformat("%x", lo) or strformat("%x%08x", hi, lo)
+          return "%"..mods.."s"
+        end
+      end)
+    if count == 0 then
+      -- No '%p' rewrites happened; delegate to the stock formatter.
+      return strformat(format, ...)
+    else
+      return strformat(newfmt, unpack(args,1,select('#',...)))
+    end
+  end
+end
+
+-- Return true when string s begins with prefix p.
+function string.starts(s, p)
+  return s:sub(1, #p) == p
+end
+
+-- Return s with its first #p characters removed (strip a prefix-sized chunk).
+function string.lstrip(s, p)
+  return s:sub(#p + 1)
+end
+
+-- Return true when string s ends with suffix e (every string ends with '').
+function string.ends(s, e)
+  if e == '' then
+    return true
+  end
+  return s:sub(-#e) == e
+end
+
+-- Escape all Lua pattern magic characters in s so it can be embedded in a
+-- pattern as a literal. NOTE: returns gsub's two values (string, count).
+function string.escape(s)
+  return s:gsub('[%-%.%+%[%]%(%)%$%^%%%?%*]','%%%1')
+end
+
+--- split a string into a list of strings separated by a delimiter.
+-- @param s The input string
+-- @param re A Lua string pattern; defaults to '%s+'
+-- @param plain don't use Lua patterns
+-- @param n optional maximum number of splits
+-- @return a list-like table
+-- @raise error if s is not a string
+function string.split(s,re,plain,n)
+  local find,sub,append = string.find, string.sub, table.insert
+  local i1,ls = 1,{}
+  if not re then re = '%s+' end
+  if re == '' then return {s} end
+  while true do
+    local i2,i3 = find(s,re,i1,plain)
+    if not i2 then
+      -- No more delimiters: keep any trailing text.
+      local last = sub(s,i1)
+      if last ~= '' then append(ls,last) end
+      -- A lone empty capture means the input was empty or all-delimiter;
+      -- normalise that case to an empty list.
+      if #ls == 1 and ls[1] == '' then
+        return {}
+      else
+        return ls
+      end
+    end
+    append(ls,sub(s,i1,i2-1))
+    if n and #ls == n then
+      -- Hit the split cap: the final element swallows the rest of s.
+      ls[#ls] = sub(s,i1)
+      return ls
+    end
+    i1 = i3+1
+  end
+end
+
+-- Number of entries in T over all keys (unlike #, which only covers the
+-- array part).
+function table.count(T)
+  local n = 0
+  for _ in pairs(T) do
+    n = n + 1
+  end
+  return n
+end
+
+-- Binary search over a sorted list. Returns the index of value, or the
+-- index of the rightmost element smaller than value (0 when value sorts
+-- before the first element). mkval, when given, maps an element to its
+-- comparison key.
+function table.bsearch(list, value, mkval)
+  local low = 1
+  local high = #list
+  while low <= high do
+    local mid = math.floor((low+high)/2)
+    local this = mkval and mkval(list[mid]) or list[mid]
+    if this > value then
+      high = mid - 1
+    elseif this < value then
+      low = mid + 1
+    else
+      return mid
+    end
+  end
+  -- Not found: low has moved one past the insertion point.
+  return low - 1
+end
+
+-- Concatenate the array parts of a and b into a fresh list. When b is nil
+-- or empty, a itself is returned unchanged (no copy is made).
+function table.join(a, b)
+  assert(a)
+  if b == nil or #b == 0 then
+    return a
+  end
+
+  local joined = {}
+  for _, item in ipairs(a) do
+    joined[#joined + 1] = item
+  end
+  for _, item in ipairs(b) do
+    joined[#joined + 1] = item
+  end
+  return joined
+end
+
+-- Drain an iterator into a list. Each iteration's return values are
+-- packed into a table and passed through build_fn (identity by default);
+-- iteration stops when the first returned value is nil.
+function table.build(iterator_fn, build_fn)
+  build_fn = (build_fn or function(arg) return arg end)
+  local res = {}
+  while true do
+    local vars = {iterator_fn()}
+    if vars[1] == nil then break end
+    table.insert(res, build_fn(vars))
+  end
+  return res
+end
+
+-- Collect all values of T (in pairs() order) into a new list.
+function table.values(T)
+  local vals = {}
+  for _, v in pairs(T) do
+    vals[#vals + 1] = v
+  end
+  return vals
+end
+
+-- Iterate a list of {key, value} pairs, yielding key, value per call.
+-- Fixes the original, which read the undefined global `t` instead of the
+-- parameter T (an error under the strict _G metatable installed below)
+-- and used the deprecated table.getn.
+function table.tuples(T)
+  local i = 0
+  local n = #T
+  return function ()
+    i = i + 1
+    if i <= n then return T[i][1], T[i][2] end
+  end
+end
+
+-- Python-style interpolation: "fmt" % arg or "fmt" % {args...}. A table
+-- operand supplies all format arguments; a scalar is passed as the single
+-- argument; nil returns the format string unchanged.
+getmetatable("").__mod = function(a, b)
+  if not b then
+    return a
+  elseif type(b) == "table" then
+    return string.format(a, unpack(b))
+  else
+    return string.format(a, b)
+  end
+end
+
+-- Check whether a file at path exists and is readable.
+function os.exists(path)
+  local f = io.open(path, "r")
+  if f == nil then
+    return false
+  end
+  io.close(f)
+  return true
+end
+
+-- Run a shell command built with string.format(...) and return its entire
+-- standard output as a string.
+function os.spawn(...)
+  local cmd = string.format(...)
+  local proc = assert(io.popen(cmd))
+  local out = proc:read("*a")
+  proc:close()
+  return out
+end
+
+-- Write a timestamped, colorized log line to stderr; no-op unless
+-- log.enabled is set. Accepts string.format-style arguments.
+local function logline(...)
+  if not log.enabled then
+    return
+  end
+
+  local c_green = "\27[32m"
+  local c_grey = "\27[1;30m"
+  local c_clear = "\27[0m"
+
+  local msg = string.format(...)
+  local info = debug.getinfo(2, "Sln")
+  -- Robustness: short_src with no '/' makes the match return nil, and
+  -- info.name is nil for main chunks; either would crash string.format.
+  local src = info.short_src:match("^.+/(.+)$") or info.short_src
+  local line = string.format("%s[%s:%s]%s %s", c_grey,
+    src, info.currentline, c_clear, info.name or "?")
+
+  io.stderr:write(
+    string.format("%s[%s]%s %s: %s\n", c_green,
+      os.date("%H:%M:%S"), c_clear, line, msg))
+end
+
+-- Strict-mode globals: any read or write of an undeclared global raises
+-- instead of silently creating or returning nil. Intentional globals are
+-- installed below via rawset, which bypasses this guard.
+setmetatable(_G, {
+  __newindex = function (_, n)
+    error("attempt to write to undeclared variable "..n, 2)
+  end,
+  __index = function (_, n)
+    error("attempt to read undeclared variable "..n, 2)
+  end,
+})
+
+rawset(_G, "log", { info = logline, enabled = false })
+rawset(_G, "class", require("bcc.vendor.middleclass"))
diff --git a/src/lua/bcc/vendor/json.lua b/src/lua/bcc/vendor/json.lua
new file mode 100644
index 0000000..3e4648d
--- /dev/null
+++ b/src/lua/bcc/vendor/json.lua
@@ -0,0 +1,198 @@
+--[[ json.lua
+
+A compact pure-Lua JSON library.
+
+This code is in the public domain:
+https://gist.github.com/tylerneylon/59f4bcf316be525b30ab
+
+The main functions are: json.stringify, json.parse.
+
+## json.stringify:
+
+This expects the following to be true of any tables being encoded:
+ * They only have string or number keys. Number keys must be represented as
+   strings in json; this is part of the json spec.
+ * They are not recursive. Such a structure cannot be specified in json.
+
+A Lua table is considered to be an array if and only if its set of keys is a
+consecutive sequence of positive integers starting at 1. Arrays are encoded like
+so: `[2, 3, false, "hi"]`. Any other type of Lua table is encoded as a json
+object, encoded like so: `{"key1": 2, "key2": false}`.
+
+Because the Lua nil value cannot be a key, and as a table value is considered
+equivalent to a missing key, there is no way to express the json "null" value in
+a Lua table. The only way this will output "null" is if your entire input obj is
+nil itself.
+
+An empty Lua table, {}, could be considered either a json object or array -
+it's an ambiguous edge case. We choose to treat this as an object as it is the
+more general type.
+
+To be clear, none of the above considerations is a limitation of this code.
+Rather, it is what we get when we completely observe the json specification for
+as arbitrary a Lua object as json is capable of expressing.
+
+## json.parse:
+
+This function parses json, with the exception that it does not pay attention to
+\u-escaped unicode code points in strings.
+
+It is difficult for Lua to return null as a value. In order to prevent the loss
+of keys with a null value in a json string, this function uses the one-off
+table value json.null (which is just an empty table) to indicate null values.
+This way you can check if a value is null with the conditional
+`val == json.null`.
+
+If you have control over the data and are using Lua, I would recommend just
+avoiding null values in your data to begin with.
+
+--]]
+
+
+local json = {}
+
+
+-- Internal functions.
+
+-- Classify obj for the encoder: 'array' for tables whose keys are exactly
+-- 1..n, 'table' for any other table (including {}), type(obj) otherwise.
+local function kind_of(obj)
+  if type(obj) ~= 'table' then return type(obj) end
+  local i = 1
+  for _ in pairs(obj) do
+    if obj[i] ~= nil then i = i + 1 else return 'table' end
+  end
+  if i == 1 then return 'table' else return 'array' end
+end
+
+-- Backslash-escape the characters JSON requires escaped inside string
+-- values: backslash, quote, slash and the control whitespace characters.
+local function escape_str(s)
+  local in_char  = {'\\', '"', '/', '\b', '\f', '\n', '\r', '\t'}
+  local out_char = {'\\', '"', '/',  'b',  'f',  'n',  'r',  't'}
+  for i, c in ipairs(in_char) do
+    s = s:gsub(c, '\\' .. out_char[i])
+  end
+  return s
+end
+
+-- Returns pos, did_find; there are two cases:
+-- 1. Delimiter found: pos = pos after leading space + delim; did_find = true.
+-- 2. Delimiter not found: pos = pos after leading space;     did_find = false.
+-- This throws an error if err_if_missing is true and the delim is not found.
+local function skip_delim(str, pos, delim, err_if_missing)
+  -- Advance past any leading whitespace before testing for the delimiter.
+  pos = pos + #str:match('^%s*', pos)
+  if str:sub(pos, pos) ~= delim then
+    if err_if_missing then
+      error('Expected ' .. delim .. ' near position ' .. pos)
+    end
+    return pos, false
+  end
+  return pos + 1, true
+end
+
+-- Expects the given pos to be the first character after the opening quote.
+-- Returns val, pos; the returned pos is after the closing quote character.
+local function parse_str_val(str, pos, val)
+  val = val or ''
+  local early_end_error = 'End of input found while parsing string.'
+  if pos > #str then error(early_end_error) end
+  local c = str:sub(pos, pos)
+  if c == '"'  then return val, pos + 1 end
+  if c ~= '\\' then return parse_str_val(str, pos + 1, val .. c) end
+  -- We must have a \ character.
+  local esc_map = {b = '\b', f = '\f', n = '\n', r = '\r', t = '\t'}
+  local nextc = str:sub(pos + 1, pos + 1)
+  -- string.sub never returns nil: at end-of-input it returns '', so the
+  -- original `if not nextc` check was dead code. Compare against ''.
+  if nextc == '' then error(early_end_error) end
+  return parse_str_val(str, pos + 2, val .. (esc_map[nextc] or nextc))
+end
+
+-- Parse a JSON number starting at pos. Returns val, pos; the returned pos
+-- is just past the number's final character.
+local function parse_num_val(str, pos)
+  local num_str = str:match('^-?%d+%.?%d*[eE]?[+-]?%d*', pos)
+  local parsed = tonumber(num_str)
+  if parsed == nil then
+    error('Error parsing number at position ' .. pos .. '.')
+  end
+  return parsed, pos + #num_str
+end
+
+
+-- Public values and functions.
+
+-- Encode obj as a JSON string. as_key is internal: when true the value is
+-- being used as an object key, so only strings and numbers are legal
+-- (numbers are quoted, as the JSON spec requires string keys).
+function json.stringify(obj, as_key)
+  local s = {}  -- We'll build the string as an array of strings to be concatenated.
+  local kind = kind_of(obj)  -- This is 'array' if it's an array or type(obj) otherwise.
+  if kind == 'array' then
+    if as_key then error('Can\'t encode array as key.') end
+    s[#s + 1] = '['
+    for i, val in ipairs(obj) do
+      if i > 1 then s[#s + 1] = ', ' end
+      s[#s + 1] = json.stringify(val)
+    end
+    s[#s + 1] = ']'
+  elseif kind == 'table' then
+    if as_key then error('Can\'t encode table as key.') end
+    s[#s + 1] = '{'
+    for k, v in pairs(obj) do
+      -- #s > 1 means at least one key was already emitted after '{'.
+      if #s > 1 then s[#s + 1] = ', ' end
+      s[#s + 1] = json.stringify(k, true)
+      s[#s + 1] = ':'
+      s[#s + 1] = json.stringify(v)
+    end
+    s[#s + 1] = '}'
+  elseif kind == 'string' then
+    return '"' .. escape_str(obj) .. '"'
+  elseif kind == 'number' then
+    if as_key then return '"' .. tostring(obj) .. '"' end
+    return tostring(obj)
+  elseif kind == 'boolean' then
+    return tostring(obj)
+  elseif kind == 'nil' then
+    return 'null'
+  else
+    error('Unjsonifiable type: ' .. kind .. '.')
+  end
+  return table.concat(s)
+end
+
+json.null = {}  -- This is a one-off table to represent the null value.
+
+-- Decode JSON from str starting at pos (default 1). Returns value,
+-- next_position. end_delim is internal: while parsing inside {} or [] it
+-- holds the closing character, and reaching it returns nil, pos + 1 to
+-- signal "container finished". JSON null decodes to the json.null sentinel.
+function json.parse(str, pos, end_delim)
+  pos = pos or 1
+  if pos > #str then error('Reached unexpected end of input.') end
+  local pos = pos + #str:match('^%s*', pos)  -- Skip whitespace.
+  local first = str:sub(pos, pos)
+  if first == '{' then  -- Parse an object.
+    local obj, key, delim_found = {}, true, true
+    pos = pos + 1
+    while true do
+      key, pos = json.parse(str, pos, '}')
+      if key == nil then return obj, pos end
+      if not delim_found then error('Comma missing between object items.') end
+      pos = skip_delim(str, pos, ':', true)  -- true -> error if missing.
+      obj[key], pos = json.parse(str, pos)
+      pos, delim_found = skip_delim(str, pos, ',')
+    end
+  elseif first == '[' then  -- Parse an array.
+    local arr, val, delim_found = {}, true, true
+    pos = pos + 1
+    while true do
+      val, pos = json.parse(str, pos, ']')
+      if val == nil then return arr, pos end
+      if not delim_found then error('Comma missing between array items.') end
+      arr[#arr + 1] = val
+      pos, delim_found = skip_delim(str, pos, ',')
+    end
+  elseif first == '"' then  -- Parse a string.
+    return parse_str_val(str, pos + 1)
+  elseif first == '-' or first:match('%d') then  -- Parse a number.
+    return parse_num_val(str, pos)
+  elseif first == end_delim then  -- End of an object or array.
+    return nil, pos + 1
+  else  -- Parse true, false, or null.
+    local literals = {['true'] = true, ['false'] = false, ['null'] = json.null}
+    for lit_str, lit_val in pairs(literals) do
+      local lit_end = pos + #lit_str - 1
+      if str:sub(pos, lit_end) == lit_str then return lit_val, lit_end + 1 end
+    end
+    local pos_info_str = 'position ' .. pos .. ': ' .. str:sub(pos, pos + 10)
+    error('Invalid json syntax starting at ' .. pos_info_str)
+  end
+end
+
+return json
diff --git a/src/lua/bcc/vendor/middleclass.lua b/src/lua/bcc/vendor/middleclass.lua
new file mode 100644
index 0000000..90fda19
--- /dev/null
+++ b/src/lua/bcc/vendor/middleclass.lua
@@ -0,0 +1,178 @@
+local middleclass = {
+  _VERSION     = 'middleclass v4.0.0',
+  _DESCRIPTION = 'Object Orientation for Lua',
+  _URL         = 'https://github.com/kikito/middleclass',
+  _LICENSE     = [[
+    MIT LICENSE
+
+    Copyright (c) 2011 Enrique García Cota
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of this software and associated documentation files (the
+    "Software"), to deal in the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to
+    the following conditions:
+
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  ]]
+}
+
+-- Build the __index handler for instances of aClass. With no custom f,
+-- instances look up methods directly in the class's instance dictionary;
+-- with a function or table f, f acts as a fallback after the dictionary.
+local function _createIndexWrapper(aClass, f)
+  if f == nil then
+    return aClass.__instanceDict
+  else
+    return function(self, name)
+      local value = aClass.__instanceDict[name]
+
+      if value ~= nil then
+        return value
+      elseif type(f) == "function" then
+        return (f(self, name))
+      else
+        return f[name]
+      end
+    end
+  end
+end
+
+-- Install method f under name in aClass's instance dictionary and push it
+-- down to every subclass that has not declared its own override.
+local function _propagateInstanceMethod(aClass, name, f)
+  -- __index needs wrapping so dictionary lookups still win over fallbacks.
+  f = name == "__index" and _createIndexWrapper(aClass, f) or f
+  aClass.__instanceDict[name] = f
+
+  for subclass in pairs(aClass.subclasses) do
+    if rawget(subclass.__declaredMethods, name) == nil then
+      _propagateInstanceMethod(subclass, name, f)
+    end
+  end
+end
+
+-- Record f as declared on aClass. A nil f un-declares the method and
+-- re-inherits the superclass implementation before propagating.
+local function _declareInstanceMethod(aClass, name, f)
+  aClass.__declaredMethods[name] = f
+
+  if f == nil and aClass.super then
+    f = aClass.super.__instanceDict[name]
+  end
+
+  _propagateInstanceMethod(aClass, name, f)
+end
+
+-- Class metamethods: printable class name, and `Class(...)` as new(...).
+local function _tostring(self) return "class " .. self.name end
+local function _call(self, ...) return self:new(...) end
+
+-- Allocate the raw class object: instance dictionary, static table
+-- (falling back to superclass statics when present), and a weak-keyed
+-- subclass set so dead subclasses can be collected.
+local function _createClass(name, super)
+  local dict = {}
+  dict.__index = dict
+
+  local aClass = { name = name, super = super, static = {},
+                   __instanceDict = dict, __declaredMethods = {},
+                   subclasses = setmetatable({}, {__mode='k'})  }
+
+  if super then
+    setmetatable(aClass.static, { __index = function(_,k) return rawget(dict,k) or super.static[k] end })
+  else
+    setmetatable(aClass.static, { __index = function(_,k) return rawget(dict,k) end })
+  end
+
+  -- Writing Class.method = f routes through _declareInstanceMethod.
+  setmetatable(aClass, { __index = aClass.static, __tostring = _tostring,
+                         __call = _call, __newindex = _declareInstanceMethod })
+
+  return aClass
+end
+
+-- Copy a mixin's instance methods and statics into aClass, then run its
+-- optional included() hook. Returns aClass for chaining.
+local function _includeMixin(aClass, mixin)
+  assert(type(mixin) == 'table', "mixin must be a table")
+
+  for name,method in pairs(mixin) do
+    if name ~= "included" and name ~= "static" then aClass[name] = method end
+  end
+
+  for name,method in pairs(mixin.static or {}) do
+    aClass.static[name] = method
+  end
+
+  if type(mixin.included)=="function" then mixin:included(aClass) end
+  return aClass
+end
+
+-- Baseline behavior mixed into every root class: instance tostring, a
+-- no-op initializer, isInstanceOf, and the static allocate/new/subclass/
+-- subclassed/isSubclassOf/include machinery.
+local DefaultMixin = {
+  __tostring   = function(self) return "instance of " .. tostring(self.class) end,
+
+  initialize   = function(self, ...) end,
+
+  -- True when self is an instance of aClass or of any of its subclasses.
+  isInstanceOf = function(self, aClass)
+    return type(self)       == 'table' and
+           type(self.class) == 'table' and
+           type(aClass)     == 'table' and
+           ( aClass == self.class or
+             type(aClass.isSubclassOf) == 'function' and
+             self.class:isSubclassOf(aClass) )
+  end,
+
+  static = {
+    -- Create an uninitialized instance (no initialize() call).
+    allocate = function(self)
+      assert(type(self) == 'table', "Make sure that you are using 'Class:allocate' instead of 'Class.allocate'")
+      return setmetatable({ class = self }, self.__instanceDict)
+    end,
+
+    -- allocate + initialize(...): the normal constructor path.
+    new = function(self, ...)
+      assert(type(self) == 'table', "Make sure that you are using 'Class:new' instead of 'Class.new'")
+      local instance = self:allocate()
+      instance:initialize(...)
+      return instance
+    end,
+
+    -- Derive a named subclass, inheriting all current instance methods.
+    subclass = function(self, name)
+      assert(type(self) == 'table', "Make sure that you are using 'Class:subclass' instead of 'Class.subclass'")
+      assert(type(name) == "string", "You must provide a name(string) for your class")
+
+      local subclass = _createClass(name, self)
+
+      for methodName, f in pairs(self.__instanceDict) do
+        _propagateInstanceMethod(subclass, methodName, f)
+      end
+      subclass.initialize = function(instance, ...) return self.initialize(instance, ...) end
+
+      self.subclasses[subclass] = true
+      self:subclassed(subclass)
+
+      return subclass
+    end,
+
+    -- Hook invoked after a subclass is created; override to observe.
+    subclassed = function(self, other) end,
+
+    isSubclassOf = function(self, other)
+      return type(other)      == 'table' and
+             type(self)       == 'table' and
+             type(self.super) == 'table' and
+             ( self.super == other or
+               type(self.super.isSubclassOf) == 'function' and
+               self.super:isSubclassOf(other) )
+    end,
+
+    -- Mix one or more mixin tables into this class.
+    include = function(self, ...)
+      assert(type(self) == 'table', "Make sure you that you are using 'Class:include' instead of 'Class.include'")
+      for _,mixin in ipairs({...}) do _includeMixin(self, mixin) end
+      return self
+    end
+  }
+}
+
+-- Public constructor: class(name) creates a root class carrying
+-- DefaultMixin; class(name, super) derives a subclass of super.
+function middleclass.class(name, super)
+  assert(type(name) == 'string', "A name (string) is needed for the new class")
+  return super and super:subclass(name) or _includeMixin(_createClass(name), DefaultMixin)
+end
+
+setmetatable(middleclass, { __call = function(_, ...) return middleclass.class(...) end })
+
+return middleclass
diff --git a/src/lua/bcc/vendor/posix.lua b/src/lua/bcc/vendor/posix.lua
new file mode 100644
index 0000000..8e46713
--- /dev/null
+++ b/src/lua/bcc/vendor/posix.lua
@@ -0,0 +1,82 @@
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require("ffi")
+
+-- Avoid duplicate declarations if syscall library is present
+-- (re-declaring the same types in LuaJIT's FFI raises an error).
+local has_syscall, _ = pcall(require, "syscall")
+if not has_syscall then
+  ffi.cdef [[
+  typedef int clockid_t;
+  typedef long time_t;
+
+  struct timespec {
+    time_t tv_sec;
+    long tv_nsec;
+  };
+
+  int clock_gettime(clockid_t clk_id, struct timespec *tp);
+  int clock_nanosleep(clockid_t clock_id, int flags,
+    const struct timespec *request, struct timespec *remain);
+  ]]
+end
+-- Declared unconditionally: only the clock API above is guarded, as that
+-- is presumably what ljsyscall also declares -- verify if upgrading it.
+ffi.cdef [[
+int get_nprocs(void);
+uint64_t strtoull(const char *nptr, char **endptr, int base);
+]]
+
+-- Clock ids accepted by clock_gettime/clock_nanosleep (values match
+-- Linux's CLOCK_* constants).
+local CLOCK = {
+  REALTIME                  = 0,
+  MONOTONIC                 = 1,
+  PROCESS_CPUTIME_ID        = 2,
+  THREAD_CPUTIME_ID         = 3,
+  MONOTONIC_RAW             = 4,
+  REALTIME_COARSE           = 5,
+  MONOTONIC_COARSE          = 6,
+}
+
+-- Current time of the given clock (default CLOCK.MONOTONIC_RAW) in
+-- nanoseconds, as a Lua number. NOTE(review): the result is a double, so
+-- precision degrades beyond 2^53 ns -- confirm acceptable for callers.
+local function time_ns(clock)
+  local ts = ffi.new("struct timespec[1]")
+  assert(ffi.C.clock_gettime(clock or CLOCK.MONOTONIC_RAW, ts) == 0,
+    "clock_gettime() failed: "..ffi.errno())
+  return tonumber(ts[0].tv_sec * 1e9 + ts[0].tv_nsec)
+end
+
+-- Sleep for a (possibly fractional) number of seconds via
+-- clock_nanosleep; clock defaults to CLOCK.MONOTONIC.
+local function sleep(seconds, clock)
+  local s, frac = math.modf(seconds)
+  local ts = ffi.new("struct timespec[1]")
+
+  ts[0].tv_sec = s
+  -- Convert the fractional second to nanoseconds. The original divided by
+  -- 1e9, which truncated tv_nsec to 0 and made sub-second sleeps no-ops.
+  ts[0].tv_nsec = frac * 1e9
+
+  ffi.C.clock_nanosleep(clock or CLOCK.MONOTONIC, 0, ts, nil)
+end
+
+-- Number of processors reported by glibc's get_nprocs().
+local function cpu_count()
+  return tonumber(ffi.C.get_nprocs())
+end
+
+-- Parse string n as an unsigned 64-bit integer (base 10 by default),
+-- returning a uint64_t cdata rather than a lossy Lua double.
+local function tonumber64(n, base)
+  assert(type(n) == "string")
+  return ffi.C.strtoull(n, nil, base or 10)
+end
+
+return {
+  time_ns=time_ns,
+  sleep=sleep,
+  CLOCK=CLOCK,
+  cpu_count=cpu_count,
+  tonumber64=tonumber64,
+}
diff --git a/src/lua/bpf-scm-1.rockspec b/src/lua/bpf-scm-1.rockspec
new file mode 100644
index 0000000..7f6ba63
--- /dev/null
+++ b/src/lua/bpf-scm-1.rockspec
@@ -0,0 +1,37 @@
+package = "bpf"
+version = "scm-1"
+source = {
+   url = "git://github.com/iovisor/bcc.git"
+}
+description = {
+   summary = "BCC - LuaJIT to BPF compiler.",
+   detailed = [[
+   ]],
+   homepage = "https://github.com/iovisor/bcc",
+   license = "BSD"
+}
+dependencies = {
+   "lua >= 5.1",
+   "ljsyscall >= 0.12",
+}
+external_dependencies = {
+    LIBELF = {
+       library = "elf"
+    }
+}
+build = {
+  type = "builtin",
+  install = {
+    bin = {
+    }
+  },
+  modules = {
+    bpf = "src/lua/bpf/bpf.lua",
+    ["bpf.builtins"] = "src/lua/bpf/builtins.lua",
+    ["bpf.cdef"] = "src/lua/bpf/cdef.lua",
+    ["bpf.elf"] = "src/lua/bpf/elf.lua",
+    ["bpf.init"] = "src/lua/bpf/init.lua",
+    ["bpf.ljbytecode"] = "src/lua/bpf/ljbytecode.lua",
+    ["bpf.proto"] = "src/lua/bpf/proto.lua",
+  }
+}
diff --git a/src/lua/bpf/bpf.lua b/src/lua/bpf/bpf.lua
new file mode 100644
index 0000000..215fb73
--- /dev/null
+++ b/src/lua/bpf/bpf.lua
@@ -0,0 +1,1630 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- LuaJIT to BPF bytecode compiler.
+--
+-- The code generation phase is currently one-pass and produces:
+-- * Compiled code in BPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt)
+-- * Variables with liveness analysis and other meta (spill information, compile-time value)
+--
+-- The code generator optimises as much as possible in single pass:
+-- * Fold compile-time expressions and constant propagation
+-- * Basic control flow analysis with dead code elimination (based on compile-time expressions)
+-- * Single-pass optimistic register allocation
+--
+-- The first pass doesn't have variable lifetime visibility yet, so it relies on rewriter for further
+-- optimisations such as:
+-- * Dead store elimination (first-pass doesn't know if/when the variable is going to be used)
+-- * Common sub-expression elimination (relies on DCE and liveness analysis)
+-- * Orphan JMP elimination (removing this in first pass would break previous JMP targets)
+-- * Better register allocation (needs to be recomputed after optimisations)
+
+local ffi = require('ffi')
+local bit = require('bit')
+local S = require('syscall')
+local bytecode = require('bpf.ljbytecode')
+local cdef = require('bpf.cdef')
+local proto = require('bpf.proto')
+local builtins = require('bpf.builtins')
+
+-- Constants
+local ALWAYS, NEVER = -1, -2
+local BPF = ffi.typeof('struct bpf')
+local HELPER = ffi.typeof('struct bpf_func_id')
+
+-- Symbolic table of constant expressions over numbers
+local const_expr = {
+	ADD = function (a, b) return a + b end,
+	SUB = function (a, b) return a - b end,
+	DIV = function (a, b) return a / b end,
+	MOD = function (a, b) return a % b end,
+	JEQ = function (a, b) return a == b end,
+	JNE = function (a, b) return a ~= b end,
+	JGE = function (a, b) return a >= b end,
+	JGT = function (a, b) return a > b end,
+}
+
+local const_width = {
+	[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
+}
+
+-- Built-ins that are strict only (never compile-time expandable)
+local builtins_strict = {
+	[ffi.new] = true,
+	[print]   = true,
+}
+
+-- Recursively clone table t: table values are deep-copied; keys and
+-- non-table values are copied as-is.
+local function table_copy(t)
+	local dup = {}
+	for key, val in pairs(t) do
+		if type(val) == 'table' then
+			dup[key] = table_copy(val)
+		else
+			dup[key] = val
+		end
+	end
+	return dup
+end
+
+-- Return true (well, truthy) when a variable's constant part is a
+-- compiler proxy object (a dissector, map, or stack-base reference)
+-- rather than a plain compile-time value.
+local function is_proxy(x)
+	return type(x) == 'table' and (x.__dissector or x.__map or x.__base)
+end
+
+-- Create compiler closure
+local function create_emitter(env, stackslots, params, param_types)
+
+local V = {}   -- Variable tracking / register allocator
+local code = { -- Generated code
+	pc = 0, bc_pc = 0,
+	insn = ffi.new('struct bpf_insn[4096]'),
+	fixup = {},
+	reachable = true,
+	seen_cmp = nil,
+}
+local Vstate = {} -- Track variable layout at basic block exits
+
+-- Anything below this stack offset is free to use by caller
+-- @note: There is no tracking memory allocator, so the caller may
+-- lower it for persistent objects, but such memory will never
+-- be reclaimed and the caller is responsible for resetting stack
+-- top whenever the memory below is free to be reused
+local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t')
+
+-- Append one BPF instruction (opcode, dst/src registers, signed offset,
+-- immediate) to the code buffer and advance the program counter.
+local function emit(op, dst, src, off, imm)
+	local ins = code.insn[code.pc]
+	ins.code = op
+	ins.dst_reg = dst
+	ins.src_reg = src
+	ins.off = off
+	ins.imm = imm
+	code.pc = code.pc + 1
+end
+
+-- Spill variable var from its register to its dedicated stack slot
+-- (slot offset derived from the variable number) and free the register.
+local function reg_spill(var)
+	local vinfo = V[var]
+	assert(vinfo.reg, 'attempt to spill VAR that doesn\'t have an allocated register')
+	vinfo.spill = (var + 1) * ffi.sizeof('uint64_t') -- Index by (variable number) * (register width)
+	emit(BPF.MEM + BPF.STX + BPF.DW, 10, vinfo.reg, -vinfo.spill, 0)
+	vinfo.reg = nil
+end
+
+-- Reload a previously spilled variable from its stack slot into reg and
+-- clear its spill state.
+local function reg_fill(var, reg)
+	local vinfo = V[var]
+	assert(reg, 'attempt to fill variable to register but not register is allocated')
+	assert(vinfo.spill, 'attempt to fill register with a VAR that isn\'t spilled')
+	emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -vinfo.spill, 0)
+	vinfo.reg = reg
+	vinfo.spill = nil
+end
+
+-- Allocate a register (lazy simple allocator).
+-- With an explicit reg, any live variable occupying it is spilled first
+-- and that register is returned. Otherwise prefer a free callee-saved
+-- register (R7-R9); failing that, spill the least-recently-used variable
+-- and reuse its register.
+local function reg_alloc(var, reg)
+	-- Specific register requested, must spill/move existing variable
+	if reg then
+		for k,v in pairs(V) do -- Spill any variable that has this register
+			if v.reg == reg and not v.shadow then
+				reg_spill(k)
+				break
+			end
+		end
+		return reg
+	end
+	-- Find free or least recently used slot
+	local last, last_seen, used = nil, 0xffff, 0
+	for k,v in pairs(V) do
+		if v.reg then
+			if not v.live_to or v.live_to < last_seen then
+				last, last_seen = k, v.live_to or last_seen
+			end
+			used = bit.bor(used, bit.lshift(1, v.reg))
+		end
+	end
+	-- Attempt to select a free register from R7-R9 (callee saved)
+	local free = bit.bnot(used)
+	if     bit.band(free, 0x80) ~= 0 then reg = 7
+	elseif bit.band(free,0x100) ~= 0 then reg = 8
+	elseif bit.band(free,0x200) ~= 0 then reg = 9
+	end
+	-- Select another variable to be spilled
+	if not reg then
+		assert(last)
+		reg = V[last].reg
+		reg_spill(last)
+	end
+	-- Fixed assert message: original lacked the space before 'fill'.
+	assert(reg, 'VAR '..var..' fill/spill failed')
+	return reg
+end
+
+-- Set new variable: bind slot var to a register and/or compile-time
+-- constant, with optional explicit ctype. Variables shadowing this slot
+-- are handed back their concrete register first, as the slot's contents
+-- are about to be overwritten.
+local function vset(var, reg, const, vtype)
+	-- Must materialise all variables shadowing this variable slot, as it will be overwritten
+	if V[var] and V[var].reg then
+		for _, vinfo in pairs(V) do
+			-- Shadowing variable MUST share the same type and attributes,
+			-- but the register assignment may have changed
+			if vinfo.shadow == var then
+				vinfo.reg = V[var].reg
+				vinfo.shadow = nil
+			end
+		end
+	end
+	-- Get precise type for CDATA or attempt to narrow numeric constant
+	if not vtype and type(const) == 'cdata' then
+		vtype = ffi.typeof(const)
+	end
+	V[var] = {reg=reg, const=const, type=vtype}
+	-- Track variable source
+	if V[var].const and type(const) == 'table' then
+		V[var].source = V[var].const.source
+	end
+end
+
+-- Materialise variable var into a register. When reg is nil an already
+-- assigned register is kept; otherwise the value is moved or loaded into
+-- the requested register. Constants become immediate loads here, at which
+-- point the variable goes live (its const is dropped). Returns the
+-- register number.
+local function vreg(var, reg, reserve, vtype)
+	local vinfo = V[var]
+	assert(vinfo, 'VAR '..var..' not registered')
+	vinfo.live_to = code.pc-1
+	if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end
+	reg = reg_alloc(var, reg)
+	-- Materialize variable shadow copy
+	local src = vinfo
+	while src.shadow do src = V[src.shadow] end
+	if reserve then -- luacheck: ignore
+		-- No load to register occurs
+	elseif src.reg then
+		emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0)
+	elseif src.spill then
+		vinfo.spill = src.spill
+		reg_fill(var, reg)
+	elseif src.const then
+		vtype = vtype or src.type
+		if type(src.const) == 'table' and src.const.__base then
+			-- Load pointer type
+			emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0)
+			emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base)
+		elseif type(src.const) == 'table' and src.const.__dissector then
+			-- Load dissector offset (imm32), but keep the constant part (dissector proxy)
+			emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0)
+		elseif vtype and ffi.sizeof(vtype) == 8 then
+			-- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
+			emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const))
+			emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16)))
+			vinfo.const = nil -- The variable is live
+		else
+			emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const)
+			vinfo.const = nil -- The variable is live
+		end
+	else assert(false, 'VAR '..var..' has neither register nor constant value') end
+	vinfo.reg = reg
+	vinfo.shadow = nil
+	vinfo.live_from = code.pc-1
+	vinfo.type = vtype or vinfo.type
+	return reg
+end
+
+-- Copy variable src into slot dst as a shadow: both share the register
+-- and constant until either is materialised or overwritten.
+local function vcopy(dst, src)
+	if dst == src then return end
+	V[dst] = {reg=V[src].reg, const=V[src].const, shadow=src, source=V[src].source, type=V[src].type}
+end
+
+-- Dereference variable of pointer type: emit dst = *src with a guarding
+-- NULL check (skipped when the value's source proves it non-NULL).
+-- BPF forbids pointer arithmetic, so only whole-entry loads are emitted.
+local function vderef(dst_reg, src_reg, vinfo)
+	-- Dereference map pointers for primitive types
+	-- BPF doesn't allow pointer arithmetics, so use the entry value
+	assert(type(vinfo.const) == 'table' and vinfo.const.__dissector, 'cannot dereference a non-pointer variable')
+	local vtype = vinfo.const.__dissector
+	local w = ffi.sizeof(vtype)
+	assert(const_width[w], 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes')
+	if dst_reg ~= src_reg then
+		emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0)    -- dst = src
+	end
+	-- Optimize the NULL check away if provably not NULL
+	if not vinfo.source or vinfo.source:find('_or_null', 1, true) then
+		emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0)            -- if (src != NULL)
+	end
+	emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, 0, 0) --     dst = *src;
+end
+
+-- Allocate a space for variable: reserve size bytes on the BPF stack
+-- (8-byte aligned) and return the new stack top, which is the variable's
+-- base offset. blank as a string pre-fills the area with its bytes; blank
+-- as true zero-fills it.
+local function valloc(size, blank)
+	local base = stack_top
+	assert(stack_top + size < 512 * 1024, 'exceeded maximum stack size of 512kB')
+	stack_top = stack_top + size
+	-- Align to 8 byte boundary
+	stack_top = math.ceil(stack_top/8)*8
+	-- Current kernel version doesn't support ARG_PTR_TO_RAW_STACK
+	-- so we always need to have memory initialized, remove this when supported
+	if blank then
+		if type(blank) == 'string' then
+			local sp = 0
+			while sp < size do
+				-- TODO: no BPF_ST + BPF_DW instruction yet
+				local as_u32 = ffi.new('uint32_t [1]')
+				local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32))
+				ffi.copy(as_u32, sub, #sub)
+				emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0])
+				sp = sp + ffi.sizeof(as_u32)
+			end
+		elseif type(blank) == 'boolean' then
+			reg_alloc(stackslots, 0)
+			emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
+			for sp = base+8,stack_top,8 do
+				emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0)
+			end
+		-- Fixed typo in the error message: 'will' -> 'fill'.
+		else error('NYI: fill with unknown type '..type(blank)) end
+	end
+	return stack_top
+end
+
+-- Turn variable into scalar in register (or constant)
+-- Returns (src_reg, nil) when the value was materialized in a register,
+-- or (nil, const) when the value is an imm32 constant that needs no
+-- register load. `w` is the scalar width in bytes (1/2/4/8).
+local function vscalar(a, w)
+	assert(const_width[w], 'sizeof(scalar variable) must be 1/2/4/8')
+	local src_reg
+	-- If source is a pointer, we must dereference it first
+	if cdef.isptr(V[a].type) then
+		src_reg = vreg(a)
+		local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register
+		emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0)
+		vderef(tmp_reg, tmp_reg, V[a])
+		src_reg = tmp_reg -- Materialize and dereference it
+	-- Source is a value on stack, we must load it first
+	elseif type(V[a].const) == 'table' and V[a].const.__base > 0 then
+		src_reg = vreg(a)
+		emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0)
+		V[a].type = V[a].const.__dissector
+		V[a].const = nil -- Value is dereferenced
+	-- If source is an imm32 number, avoid register load
+	elseif type(V[a].const) == 'number' and w < 8 then
+		return nil, V[a].const
+	-- Load variable from any other source
+	else
+		src_reg = vreg(a)
+	end
+
+	return src_reg, nil
+end
+
+-- Emit compensation code at the end of a basic block so the variable set
+-- layout matches the reference layout Vcomp on every exit of the block:
+-- 1. spill registers that the reference layout keeps spilled
+-- 2. fill registers to match other exits from this BB
+local function bb_end(Vcomp)
+	-- Pass 1: spill everything the reference layout has on the stack
+	for slot, cur in pairs(V) do
+		local ref = Vcomp[slot]
+		if ref and ref.spill and not cur.spill then
+			-- Constants and shadow copies have no register yet; materialize
+			-- them first so there is something to spill
+			if not cur.reg and (cur.shadow or cdef.isimmconst(cur)) then
+				vreg(slot)
+			end
+			reg_spill(slot)
+		end
+	end
+	-- Pass 2: refill registers and propagate metadata changes
+	for slot, cur in pairs(V) do
+		local ref = Vcomp[slot]
+		if ref and ref.reg and not cur.reg then
+			vreg(slot, ref.reg)
+		end
+		-- Compensate variable metadata change
+		if ref and ref.source then
+			V[slot].source = ref.source
+		end
+	end
+end
+
+-- Compare stack-resident string in slot `a` against Lua string constant `b`.
+-- Only equality/inequality is supported (op must be 'JEQ' or 'JNE').
+local function CMP_STR(a, b, op)
+	assert(op == 'JEQ' or op == 'JNE', 'NYI: only equivallence stack/string only supports == or ~=')
+	-- I have no better idea how to implement it than unrolled XOR loop, as we can fixup only one JMP
+	-- So: X(a,b) = a[0] ^ b[0] | a[1] ^ b[1] | ...
+	--     EQ(a,b) <=> X == 0
+	-- This could be optimised by placing early exits by rewriter in second phase for long strings
+	local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type))
+	local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1)
+	local sp = 0
+	emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0) -- acc = 0
+	while sp < size do
+		-- Load string chunk as imm32
+		local as_u32 = ffi.new('uint32_t [1]')
+		local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32))
+		ffi.copy(as_u32, sub, #sub)
+		-- TODO: make this faster by interleaved load/compare steps with DW length
+		emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0) -- tmp = stack chunk
+		emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0]) -- tmp ^= const chunk
+		emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0)        -- acc |= tmp
+		sp = sp + ffi.sizeof(as_u32)
+	end
+	-- 0xffff is a jump-target placeholder fixed up later (see CMP_REG/CMP_IMM)
+	emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0)
+	code.seen_cmp = code.pc-1
+end
+
+-- Compare two variable slots; folds to ALWAYS/NEVER when both are
+-- compile-time constants, otherwise emits a fused CMP+JMP instruction.
+local function CMP_REG(a, b, op)
+	-- Fold compile-time expressions
+	if V[a].const and V[b].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
+		code.seen_cmp = const_expr[op](V[a].const, V[b].const) and ALWAYS or NEVER
+	else
+		-- Comparison against compile-time string or stack memory
+		if V[b].const and type(V[b].const) == 'string' then
+			return CMP_STR(a, V[b].const, op)
+		end
+		-- The 0xFFFF target here has no significance, it's just a placeholder for
+		-- compiler to replace its absolute offset to LJ bytecode insn with a relative
+		-- offset in BPF program code, verifier will accept only programs with valid JMP targets
+		local a_reg, b_reg = vreg(a), vreg(b)
+		emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0)
+		code.seen_cmp = code.pc-1
+	end
+end
+
+-- Compare variable slot `a` against immediate `b` (number or short string);
+-- folds when `a` is a compile-time constant. Also records NULL-pointer
+-- guards so repeated pointer checks do not trip the kernel verifier.
+local function CMP_IMM(a, b, op)
+	local c = V[a].const
+	if c and not is_proxy(c) then -- Fold compile-time expressions
+		code.seen_cmp = const_expr[op](c, b) and ALWAYS or NEVER
+	else
+		-- Convert imm32 to number
+		if type(b) == 'string' then
+			if     #b == 1 then b = b:byte()
+			elseif cdef.isptr(V[a].type) then
+				-- String comparison between stack/constant string
+				return CMP_STR(a, b, op)
+			elseif #b <= 4 then
+				-- Convert to u32 with network byte order
+				local imm = ffi.new('uint32_t[1]')
+				ffi.copy(imm, b, #b)
+				b = builtins.hton(imm[0])
+			else error('NYI: compare register with string, where #string > sizeof(u32)') end
+		end
+		-- The 0xFFFF target here has no significance, it's just a placeholder for
+		-- compiler to replace its absolute offset to LJ bytecode insn with a relative
+		-- offset in BPF program code, verifier will accept only programs with valid JMP targets
+		local reg = vreg(a)
+		emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b)
+		code.seen_cmp = code.pc-1
+		-- Remember NULL pointer checks as BPF prohibits pointer comparisons
+		-- and repeated checks wouldn't pass the verifier, only comparisons
+		-- against constants are checked.
+		if op == 'JEQ' and tonumber(b) == 0 and V[a].source then
+			local pos = V[a].source:find('_or_null', 1, true)
+			if pos then
+				code.seen_null_guard = a
+			end
+		-- Inverse NULL pointer check (if a ~= nil)
+		elseif op == 'JNE' and tonumber(b) == 0 and V[a].source then
+			local pos = V[a].source:find('_or_null', 1, true)
+			if pos then
+				code.seen_null_guard = a
+				code.seen_null_guard_inverse = true
+			end
+		end
+	end
+end
+
+-- DST = VAR(a) op IMM(b); folds when `a` is a compile-time constant,
+-- otherwise materializes (and dereferences, if pointer) the value and
+-- emits a 64-bit ALU instruction with an immediate operand.
+local function ALU_IMM(dst, a, b, op)
+	-- Fold compile-time expressions
+	if V[a].const and not is_proxy(V[a].const) then
+			assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric')
+			vset(dst, nil, const_expr[op](V[a].const, b))
+	-- Now we need to materialize dissected value at DST, and add it
+	else
+		vcopy(dst, a)
+		local dst_reg = vreg(dst)
+		if cdef.isptr(V[a].type) then
+			vderef(dst_reg, dst_reg, V[a])
+			V[dst].type = V[a].const.__dissector
+		else
+			V[dst].type = V[a].type
+		end
+		emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b)
+	end
+end
+
+-- DST = VAR(a) op VAR(b); folds when both operands are compile-time
+-- constants, otherwise emits a register-register ALU instruction.
+-- `b` may be nil/absent for unary operations. `op` is a BPF op name
+-- ('ADD', 'SUB', ...) or, on the folding path, a function.
+local function ALU_REG(dst, a, b, op)
+	-- Fold compile-time expressions
+	-- NOTE(review): is_proxy(V[b].const) below indexes V[b] without a nil
+	-- check; presumably the folding branch is only reached when slot b is
+	-- populated -- confirm against callers (unary ops pass b == nil).
+	if V[a].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
+		assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric')
+		assert(cdef.isimmconst(V[b]), 'VAR '..b..' must be numeric')
+		if type(op) == 'string' then op = const_expr[op] end
+		vcopy(dst, a)
+		V[dst].const = op(V[a].const, V[b].const)
+	else
+		local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations
+		if b and cdef.isptr(V[b].type) then
+			-- We have to allocate a temporary register for dereferencing to preserve
+			-- pointer in source variable that MUST NOT be altered
+			reg_alloc(stackslots, 2)
+			vderef(2, src_reg, V[b])
+			src_reg = 2
+		end
+		vcopy(dst, a) -- DST may alias B, so copy must occur after we materialize B
+		local dst_reg = vreg(dst)
+		if cdef.isptr(V[a].type) then
+			vderef(dst_reg, dst_reg, V[a])
+			V[dst].type = V[a].const.__dissector
+		end
+		emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0)
+		V[stackslots].reg = nil  -- Free temporary registers
+	end
+end
+
+-- DST = IMM(a) op VAR(b) for non-commutative ops. Registers are u64 while
+-- immediates are only u32, so the operands cannot be swapped with
+-- complement arithmetics; instead the immediate is materialized into a
+-- temporary variable slot and a register-register ALU op is emitted.
+local function ALU_IMM_NV(dst, a, b, op)
+	local tmp = stackslots + 1
+	vset(tmp, nil, a)
+	ALU_REG(dst, tmp, b, op)
+end
+
+-- Load `w` bytes from the packet at absolute offset `off` into DST
+-- using BPF_LD|ABS. Since LD_ABS/IND prohibit DW, an 8-byte load is
+-- synthesized from two word loads combined with shift+or.
+local function LD_ABS(dst, w, off)
+	assert(off, 'LD_ABS called without offset')
+	if w < 8 then
+		local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
+		emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off)
+		if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
+			emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8)
+		end
+	elseif w == 8 then
+		-- LD_ABS|IND prohibits DW, we need to do two W loads and combine them
+		local tmp_reg = vreg(stackslots, 0, true, builtins.width_type(w)) -- Reserve R0
+		emit(BPF.LD + BPF.ABS + const_width[4], tmp_reg, 0, 0, off + 4)
+		if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
+			emit(BPF.ALU + BPF.END + BPF.TO_BE, tmp_reg, 0, 0, 32)
+		end
+		-- tmp = high half shifted into place
+		ALU_IMM(stackslots, stackslots, 32, 'LSH')
+		local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0, spill tmp variable
+		emit(BPF.LD + BPF.ABS + const_width[4], dst_reg, 0, 0, off)
+		if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
+			emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, 32)
+		end
+		-- dst = low half | (high half << 32)
+		ALU_REG(dst, dst, stackslots, 'OR')
+		V[stackslots].reg = nil -- Free temporary registers
+	else
+		assert(w < 8, 'NYI: only LD_ABS of 1/2/4/8 is supported')
+	end
+end
+
+-- Load `w` bytes from the packet at (register of src + off) into DST
+-- using BPF_LD|IND.
+local function LD_IND(dst, src, w, off)
+	-- Materialize the index register before touching dst (dst may alias src)
+	local rsrc = vreg(src)
+	-- LD_IND implicitly uses R0, so reserve it for dst
+	local rdst = vreg(dst, 0, true, builtins.width_type(w))
+	emit(BPF.LD + BPF.IND + const_width[w], rdst, rsrc, 0, off or 0)
+	-- LD_ABS/IND have htonl() semantics; convert back on little-endian hosts
+	if w > 1 and ffi.abi('le') then
+		emit(BPF.ALU + BPF.END + BPF.TO_BE, rdst, 0, 0, w * 8)
+	end
+end
+
+-- Load `w` bytes from memory at (register of src + off) into DST
+-- using BPF_LDX|MEM.
+local function LD_MEM(dst, src, w, off)
+	-- Materialize the source register first in case dst aliases src
+	local rsrc = vreg(src)
+	local rdst = vreg(dst, nil, true, builtins.width_type(w))
+	emit(BPF.MEM + BPF.LDX + const_width[w], rdst, rsrc, off or 0, 0)
+end
+
+-- @note: This is specific now as it expects registers reserved
+-- Load immediate `imm` into dst_reg. For w == 8 the BPF_LD_IMM64
+-- two-instruction encoding is used: the first insn carries lo(imm32),
+-- the second carries hi(imm32). src_type selects the pseudo source
+-- (e.g. BPF.PSEUDO_MAP_FD when imm is a map file descriptor).
+local function LD_IMM_X(dst_reg, src_type, imm, w)
+	if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
+		emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm))
+		-- Must shift in two steps as bit.rshift supports [0..31]
+		-- Use rshift to extract the high word (matches the IMM64 path in vreg);
+		-- lshift here would discard the high word and store a bogus hi(imm32)
+		emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(imm, 16), 16)))
+	else
+		emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm)
+	end
+end
+
+-- Invoke a builtin code generator, passing it a table of compiler
+-- primitives so it can emit instructions and manipulate variable slots.
+local function BUILTIN(func, ...)
+	func({
+		-- Compiler primitives (work with variable slots, emit instructions)
+		V = V,
+		vreg = vreg,
+		vset = vset,
+		vcopy = vcopy,
+		vderef = vderef,
+		valloc = valloc,
+		emit = emit,
+		reg_alloc = reg_alloc,
+		reg_spill = reg_spill,
+		tmpvar = stackslots,
+		const_width = const_width,
+		-- Extensions and helpers (use with care)
+		LD_IMM_X = LD_IMM_X,
+	}, ...)
+end
+
+-- DST = *(SRC + off), dispatching on the provenance of SRC's pointer
+-- (packet, context, skb, map value, or probe memory). `vtype` optionally
+-- overrides the dissector type of the loaded value.
+local function LOAD(dst, src, off, vtype)
+	local base = V[src].const
+	assert(base and base.__dissector, 'NYI: load() on variable that doesn\'t have dissector')
+	assert(V[src].source, 'NYI: load() on variable with unknown source')
+	-- Cast to different type if requested
+	vtype = vtype or base.__dissector
+	local w = ffi.sizeof(vtype)
+	assert(const_width[w], 'NYI: load() supports 1/2/4/8 bytes at a time only, wanted ' .. tostring(w))
+	-- Packet access with a dissector (use BPF_LD)
+	if V[src].source:find('ptr_to_pkt', 1, true) then
+		if base.off then -- Absolute address to payload
+			LD_ABS(dst, w, off + base.off)
+		else -- Indirect address to payload
+			LD_IND(dst, src, w, off)
+		end
+	-- Direct access to first argument (skb fields, pt regs, ...)
+	elseif V[src].source:find('ptr_to_ctx', 1, true) then
+		LD_MEM(dst, src, w, off)
+	-- Direct skb access with a dissector (use BPF_MEM)
+	elseif V[src].source:find('ptr_to_skb', 1, true) then
+		LD_MEM(dst, src, w, off)
+	-- Pointer to map-backed memory (use BPF_MEM)
+	elseif V[src].source:find('ptr_to_map_value', 1, true) then
+		LD_MEM(dst, src, w, off)
+	-- Indirect read using probe (uprobe or kprobe, uses helper)
+	elseif V[src].source:find('ptr_to_probe', 1, true) then
+		BUILTIN(builtins[builtins.probe_read], nil, dst, src, vtype, off)
+		V[dst].source = V[src].source -- Builtin handles everything
+	else
+		error('NYI: load() on variable from ' .. V[src].source)
+	end
+	V[dst].type = vtype
+	V[dst].const = nil -- Dissected value is not constant anymore
+end
+
+-- Translate a LuaJIT CALL: slot `a` holds the callee, `b`-1 is the number
+-- of return values, `d`-1 the number of arguments (slots a+1..a+d-1).
+-- Dispatches to: builtin code generators, ALU fallbacks for operator-like
+-- callees, dissector slice() retrieval, or full compile-time expansion
+-- when every argument is a non-proxy constant.
+local function CALL(a, b, d)
+	assert(b-1 <= 1, 'NYI: CALL with >1 return values')
+	-- Perform either compile-time, helper, or builtin
+	local func = V[a].const
+	-- Gather all arguments and check if they're constant
+	local args, const, nargs = {}, true, d - 1
+	for i = a+1, a+d-1 do
+		table.insert(args, V[i].const)
+		if not V[i].const or is_proxy(V[i].const) then const = false end
+	end
+	local builtin = builtins[func]
+	if not const or nargs == 0 then
+		if builtin and type(builtin) == 'function' then
+			-- Builtin receives slot numbers, not values
+			args = {a}
+			for i = a+1, a+nargs do table.insert(args, i) end
+			BUILTIN(builtin, unpack(args))
+		elseif V[a+2] and V[a+2].const then -- var OP imm
+			ALU_IMM(a, a+1, V[a+2].const, builtin)
+		elseif nargs <= 2 then              -- var OP var
+			ALU_REG(a, a+1, V[a+2] and a+2, builtin)
+		else
+			error('NYI: CALL non-builtin with 3 or more arguments')
+		end
+	-- Call on dissector implies slice retrieval
+	elseif type(func) == 'table' and func.__dissector then
+		assert(nargs >= 2, 'NYI: <dissector>.slice(a, b) must have at least two arguments')
+		assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant')
+		local off = V[a+1].const
+		local vtype = builtins.width_type(V[a+2].const - off)
+		-- Access to packet via packet (use BPF_LD)
+		if V[a].source and V[a].source:find('ptr_to_', 1, true) then
+			LOAD(a, a, off, vtype)
+		else
+			error('NYI: <dissector>.slice(a, b) on non-pointer memory ' .. (V[a].source or 'unknown'))
+		end
+	-- Strict builtins cannot be expanded on compile-time
+	elseif builtins_strict[func] and builtin then
+		args = {a}
+		for i = a+1, a+nargs do table.insert(args, i) end
+		BUILTIN(builtin, unpack(args))
+	-- Attempt compile-time call expansion (expects all argument compile-time known)
+	else
+		assert(const, 'NYI: CALL attempted on constant arguments, but at least one argument is not constant')
+		V[a].const = func(unpack(args))
+	end
+end
+
+-- Prepare argument registers for a BPF map helper call:
+--   R1 = map fd (loaded as BPF_PSEUDO_MAP_FD imm64)
+--   R2 = pointer to the key (materialized on the stack when necessary)
+-- `key` is a variable slot (may be nil when `imm` provides the key value).
+local function MAP_INIT(map_var, key, imm)
+	local map = V[map_var].const
+	vreg(map_var, 1, true, ffi.typeof('uint64_t'))
+	-- Reserve R1 and load ptr for process-local map fd
+	LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type))
+	V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation
+	-- Reserve R2 and load R2 = key pointer
+	local key_size = ffi.sizeof(map.key_type)
+	local w = const_width[key_size] or BPF.DW
+	local pod_type = const_width[key_size]
+	local sp = stack_top + key_size -- Must use stack below spill slots
+	-- Store immediate value on stack
+	reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable)
+	local key_base = key and V[key].const
+	imm = imm or key_base
+	if imm and (not key or not is_proxy(key_base)) then
+		assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8')
+		emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm)
+	-- Key is in register, spill it
+	elseif V[key].reg and pod_type then
+		if cdef.isptr(V[key].type) then
+			-- There is already pointer in register, dereference before spilling
+			emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0)
+			emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0)
+		else -- Variable in register is POD, spill it on the stack
+			emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0)
+		end
+	-- Key is spilled from register to stack
+	elseif V[key].spill then
+		sp = V[key].spill
+	-- Key is already on stack, write to base-relative address
+	elseif key_base.__base then
+		assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type')
+		sp = key_base.__base
+	else
+		error('VAR '..key..' is neither const-expr/register/stack/spilled')
+	end
+	-- If [FP+K] addressing, emit it
+	-- (R2 = R10 - sp, i.e. a frame-pointer-relative key address)
+	if sp then
+		emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0)
+		emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp)
+	end
+end
+
+-- DST = MAP[KEY]: emit a map_lookup_elem() helper call and tag DST as a
+-- (possibly NULL) pointer to the map value.
+local function MAP_GET(dst, map_var, key, imm)
+	local map = V[map_var].const
+	-- Set up R1 (map fd) and R2 (key pointer) for the helper
+	MAP_INIT(map_var, key, imm)
+	-- The helper returns in R0; reserve it for DST and flag the variable as
+	-- a pointer carrying the map value-type dissector
+	vreg(dst, 0, true, ffi.typeof('uint8_t *'))
+	local dvar = V[dst]
+	dvar.const = {__dissector=map.val_type}
+	dvar.source = 'ptr_to_map_value_or_null' -- Lookup may fail; must be NULL-checked
+	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem)
+	V[stackslots].reg = nil -- Free temporary registers
+end
+
+-- MAP[KEY] = nil: emit a map_delete_elem() helper call for the given
+-- map variable and key (slot or immediate).
+local function MAP_DEL(map_var, key, key_imm)
+	-- Set R0, R1 (map fd, preempt R0)
+	reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
+	MAP_INIT(map_var, key, key_imm)
+	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem)
+	V[stackslots].reg = nil -- Free temporary registers
+end
+
+-- MAP[KEY] = SRC: emit a map_update_elem() helper call (BPF_ANY flags).
+-- Setting a void-typed SRC (Lua nil) deletes the element instead.
+-- The value pointer in R3 is built from whichever location SRC lives in:
+-- immediate constant, register, spill slot, or stack-resident table.
+local function MAP_SET(map_var, key, key_imm, src)
+	local map = V[map_var].const
+	-- Delete when setting nil
+	if V[src].type == ffi.typeof('void') then
+		return MAP_DEL(map_var, key, key_imm)
+	end
+	-- Set R0, R1 (map fd, preempt R0)
+	reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
+	MAP_INIT(map_var, key, key_imm)
+	reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable)
+	emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing
+	-- Reserve R3 for value pointer
+	reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable)
+	local val_size = ffi.sizeof(map.val_type)
+	local w = const_width[val_size] or BPF.DW
+	local pod_type = const_width[val_size]
+	-- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value)
+	local sp = stack_top + ffi.sizeof(map.key_type) + val_size
+	sp = sp + (sp % val_size)
+	local base = V[src].const
+	if base and not is_proxy(base) then
+		-- Value is an immediate constant, store it directly on the stack
+		assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8')
+		emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base)
+	-- Value is in register, spill it
+	elseif V[src].reg and pod_type then
+		-- Value is a pointer, dereference it and spill it
+		if cdef.isptr(V[src].type) then
+			vderef(3, V[src].reg, V[src])
+			emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0)
+		else
+			emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0)
+		end
+	-- We get a pointer to spilled register on stack
+	elseif V[src].spill then
+		-- If variable is a pointer, we can load it to R3 directly (save "LEA")
+		if cdef.isptr(V[src].type) then
+			reg_fill(src, 3)
+			-- If variable is a stack pointer, we don't have to check it
+			if base.__base then
+				emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
+				return
+			end
+			vderef(3, V[src].reg, V[src])
+			emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0)
+		else
+			sp = V[src].spill
+		end
+	-- Value is already on stack, write to base-relative address
+	elseif base.__base then
+		if val_size ~= ffi.sizeof(V[src].type) then
+			local err = string.format('VAR %d type (%s) incompatible with BPF map value type (%s): expected %d, got %d',
+				src, V[src].type, map.val_type, val_size, ffi.sizeof(V[src].type))
+			error(err)
+		end
+		sp = base.__base
+	-- Value is constant, materialize it on stack
+	else
+		error('VAR '.. src ..' is neither const-expr/register/stack/spilled')
+	end
+	-- R3 = R10 - sp (frame-pointer-relative value address), then call helper
+	emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0)
+	emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp)
+	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
+	V[stackslots].reg = nil -- Free temporary registers
+end
+
+-- Finally - this table translates LuaJIT bytecode into code emitter actions.
+-- Each handler receives the decoded instruction operands (A, B, C, D).
+local BC = {
+	-- Constants
+	KNUM = function(a, _, c, _) -- KNUM
+		-- Numbers below 2^31 fit a signed 32-bit immediate; wider values
+		-- are tracked as unsigned 64-bit
+		if c < 2147483648 then
+			vset(a, nil, c, ffi.typeof('int32_t'))
+		else
+			vset(a, nil, c, ffi.typeof('uint64_t'))
+		end
+	end,
+	KSHORT = function(a, _, _, d) -- KSHORT
+		vset(a, nil, d, ffi.typeof('int16_t'))
+	end,
+	KCDATA = function(a, _, c, _) -- KCDATA
+		-- Coerce numeric types if possible
+		local ct = ffi.typeof(c)
+		if ffi.istype(ct, ffi.typeof('uint64_t')) or ffi.istype(ct, ffi.typeof('int64_t')) then
+			vset(a, nil, c, ct)
+		elseif tonumber(c) ~= nil then
+			-- TODO: this should not be possible
+			vset(a, nil, tonumber(c), ct)
+		else
+			error('NYI: cannot use CDATA constant of type ' .. ct)
+		end
+	end,
+	KPRI = function(a, _, _, d) -- KPRI (nil/false/true primitive)
+		-- KNIL is 0, must create a special type to identify it
+		local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t')
+		vset(a, nil, (d < 2) and 0 or 1, vtype)
+	end,
+	KSTR = function(a, _, c, _) -- KSTR
+		vset(a, nil, c, ffi.typeof('const char[?]'))
+	end,
+	MOV = function(a, _, _, d) -- MOV var, var
+		vcopy(a, d)
+	end,
+
+	-- Comparison ops
+	-- Note: comparisons are always followed by JMP opcode, that
+	--       will fuse following JMP to JMP+CMP instruction in BPF
+	-- Note:  we're narrowed to integers, so operand/operator inversion is legit
+	ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted)
+	ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d)
+	ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d)
+	ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d)
+	ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d)
+	ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c))
+	ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c))
+	ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c)
+	ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c)
+	IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d)
+	ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d)
+	ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c)
+	-- Binary operations with RHS constants
+	ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end,
+	SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end,
+	MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end,
+	DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end,
+	MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end,
+	-- Binary operations with LHS constants
+	-- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative
+	ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV
+	MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV
+	-- SUB/DIV are not commutative, so the immediate must go through a
+	-- temporary slot (ALU_IMM_NV) instead of swapping operands
+	SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV
+	DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV
+	-- Binary operations between registers
+	ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end,
+	SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end,
+	MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end,
+	DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end,
+	MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end,
+	-- Strings
+	CAT = function(a, b, _, d) -- CAT A = B ~ D
+		-- Concatenation is compile-time only: both operands must be constants
+		assert(V[b].const and V[d].const, 'NYI: CAT only works on compile-time expressions')
+		assert(type(V[b].const) == 'string' and type(V[d].const) == 'string',
+			'NYI: CAT only works on compile-time strings')
+		vset(a, nil, V[b].const .. V[d].const)
+	end,
+	-- Tables
+	GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c])
+		-- Globals are resolved from the captured environment at compile time
+		if env[c] ~= nil then
+			vset(a, nil, env[c])
+		else error(string.format("undefined global '%s'", c)) end
+	end,
+	UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c])
+		if env[c] ~= nil then
+			vset(a, nil, env[c])
+		else error(string.format("undefined upvalue '%s'", c)) end
+	end,
+	TSETB = function (a, b, _, d) -- TSETB (B[D] = A, D is a byte literal)
+		assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
+		local vinfo = V[b].const
+		if vinfo.__map then -- BPF map read (constant)
+			return MAP_SET(b, nil, d, a) -- D is literal
+		elseif vinfo.__dissector then
+			assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size')
+			local w = ffi.sizeof(vinfo.__dissector)
+			-- TODO: support vectorized moves larger than register width
+			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
+			local src_reg, const = vscalar(a, w)
+			-- If changing map value, write to absolute address + offset
+			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
+				local dst_reg = vreg(b)
+				-- Optimization: immediate values (imm32) can be stored directly
+				if type(const) == 'number' then
+					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, d, const)
+				else
+					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, d, 0)
+				end
+			-- Table is already on stack, write to vinfo-relative address
+			elseif vinfo.__base then
+				-- Optimization: immediate values (imm32) can be stored directly
+				if type(const) == 'number' then
+					emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -vinfo.__base + (d * w), const)
+				else
+					emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -vinfo.__base + (d * w), 0)
+				end
+			else
+				error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
+			end
+		elseif vinfo and V[a].const then -- Plain Lua table, fold at compile time
+			-- NOTE(review): D is a byte literal in TSETB, so indexing through a
+			-- variable slot (V[d].const) looks copy-pasted from TSETV; confirm
+			-- whether this should be vinfo[d] instead.
+			vinfo[V[d].const] = V[a].const
+		else
+			error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
+		end
+	end,
+	TSETV = function (a, b, _, d) -- TSETV (B[D] = A, D is a variable slot)
+		assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
+		local vinfo = V[b].const
+		if vinfo.__map then -- BPF map read (constant)
+			return MAP_SET(b, d, nil, a) -- D is variable
+		elseif vinfo.__dissector then
+			assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size')
+			local w = ffi.sizeof(vinfo.__dissector)
+			-- TODO: support vectorized moves larger than register width
+			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
+			local src_reg, const = vscalar(a, w)
+			-- If changing map value, write to absolute address + offset
+			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
+				-- Calculate variable address from two registers
+				local tmp_var = stackslots + 1
+				vset(tmp_var, nil, d)
+				ALU_REG(tmp_var, tmp_var, b, 'ADD')
+				local dst_reg = vreg(tmp_var)
+				V[tmp_var].reg = nil -- Only temporary allocation
+				-- Optimization: immediate values (imm32) can be stored directly
+				if type(const) == 'number' and w < 8 then
+					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, 0, const)
+				else
+					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, 0, 0)
+				end
+			-- Table is already on stack, write to vinfo-relative address
+			elseif vinfo.__base then
+				-- Calculate variable address from two registers
+				local tmp_var = stackslots + 1
+				vcopy(tmp_var, d)                       -- Element position
+				if w > 1 then
+					ALU_IMM(tmp_var, tmp_var, w, 'MUL') -- multiply by element size
+				end
+				local dst_reg = vreg(tmp_var)           -- add R10 (stack pointer)
+				emit(BPF.ALU64 + BPF.ADD + BPF.X, dst_reg, 10, 0, 0)
+				V[tmp_var].reg = nil -- Only temporary allocation
+				-- Optimization: immediate values (imm32) can be stored directly
+				if type(const) == 'number' and w < 8 then
+					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, -vinfo.__base, const)
+				else
+					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, -vinfo.__base, 0)
+				end
+			else
+				error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
+			end
+		elseif vinfo and V[d].const and V[a].const then
+			-- Plain Lua table with constant key/value: fold at compile time
+			vinfo[V[d].const] = V[a].const
+		else
+			error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
+		end
+	end,
+	TSETS = function (a, b, c, _) -- TSETS (B[C] = A, C is a string key)
+		assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
+		local base = V[b].const
+		if base.__dissector then
+			-- Resolve the struct member offset; bitfield stores are NYI
+			local ofs,bpos = ffi.offsetof(base.__dissector, c)
+			assert(not bpos, 'NYI: B[C] = A, where C is a bitfield')
+			local w = builtins.sizeofattr(base.__dissector, c)
+			-- TODO: support vectorized moves larger than register width
+			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
+			local src_reg, const = vscalar(a, w)
+			-- If changing map value, write to absolute address + offset
+			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
+				local dst_reg = vreg(b)
+				-- Optimization: immediate values (imm32) can be stored directly
+				if type(const) == 'number' and w < 8 then
+					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, ofs, const)
+				else
+					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, ofs, 0)
+				end
+			-- Table is already on stack, write to base-relative address
+			elseif base.__base then
+				-- Optimization: immediate values (imm32) can be stored directly
+				if type(const) == 'number' and w < 8 then
+					emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -base.__base + ofs, const)
+				else
+					emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0)
+				end
+			else
+				error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
+			end
+		elseif V[a].const then
+			-- Plain Lua table with constant value: fold at compile time
+			base[c] = V[a].const
+		else
+			error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
+		end
+	end,
+	TGETB = function (a, b, _, d) -- TGETB (A = B[D], D is a byte literal)
+		local base = V[b].const
+		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
+		if a ~= b then vset(a) end
+		if base.__map then -- BPF map read (constant)
+			MAP_GET(a, b, nil, d)
+		-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
+		elseif V[b].source and V[b].source:find('ptr_to_') then
+			local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t')
+			LOAD(a, b, d, vtype)
+		-- Specialise PTR[0] as dereference operator
+		elseif cdef.isptr(V[b].type) and d == 0 then
+			vcopy(a, b)
+			local dst_reg = vreg(a)
+			vderef(dst_reg, dst_reg, V[a])
+			V[a].type = V[a].const.__dissector
+		else
+			error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference')
+		end
+	end,
+	TGETV = function (a, b, _, d) -- TGETV (A = B[D], D is a variable slot)
+		local base = V[b].const
+		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
+		if a ~= b then vset(a) end
+		if base.__map then -- BPF map read
+			MAP_GET(a, b, d)
+		-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
+		elseif V[b].source and V[b].source:find('ptr_to_') then
+			local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t')
+			LOAD(a, b, d, vtype)
+		-- Constant dereference (plain Lua table with a constant numeric key)
+		elseif type(V[d].const) == 'number' then
+			V[a].const = base[V[d].const]
+		else
+			error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference')
+		end
+	end,
+	TGETS = function (a, b, c, _) -- TGETS (A = B[C])
+		-- Table get with a string key: A = B.C.
+		-- For dissector-backed tables this emits a load of the struct member C;
+		-- for plain Lua tables it constant-folds the lookup.
+		local base = V[b].const
+		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
+		if a ~= b then vset(a) end
+		if base.__dissector then
+			local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
+			-- Resolve table key using metatable
+			if not ofs and type(base.__dissector[c]) == 'string' then
+				c = base.__dissector[c]
+				ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
+			end
+			if not ofs and proto[c] then -- Load new dissector on given offset
+				BUILTIN(proto[c], a, b, c)
+			else
+				-- Loading register from offset is a little bit tricky as there are
+				-- several data sources and value loading modes with different restrictions
+				-- such as checking pointer values for NULL compared to using stack.
+				assert(ofs, tostring(base.__dissector)..'.'..c..' attribute not exists')
+				if a ~= b then vset(a) end
+				-- Dissected value is probably not constant anymore
+				local new_const = nil
+				local w, atype = builtins.sizeofattr(base.__dissector, c)
+				-- [SP+K] addressing using R10 (stack pointer)
+				-- Doesn't need to be checked for NULL
+				if base.__base and base.__base > 0 then
+					if cdef.isptr(atype) then -- If the member is pointer type, update base pointer with offset
+						new_const = {__base = base.__base-ofs}
+					else
+						local dst_reg = vreg(a, nil, true)
+						emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0)
+					end
+				-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
+				elseif V[b].source and V[b].source:find('ptr_to_') then
+					LOAD(a, b, ofs, atype)
+				else
+					error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
+				end
+				-- Bitfield, must be further narrowed with a bitmask/shift
+				if bpos then
+					-- Build a mask selecting bits [bpos, bpos+bsize) of the loaded word
+					local mask = 0
+					for i=bpos+1,bpos+bsize do
+						mask = bit.bor(mask, bit.lshift(1, w*8-i))
+					end
+					emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask)
+					-- Free optimization: single-bit values need just boolean result
+					if bsize > 1 then
+						local shift = w*8-bsize-bpos
+						if shift > 0 then
+							emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift)
+						end
+					end
+				end
+				V[a].type = atype
+				V[a].const = new_const
+				V[a].source = V[b].source
+				-- Track direct access to skb data
+				-- see https://www.kernel.org/doc/Documentation/networking/filter.txt "Direct packet access"
+				if ffi.istype(base.__dissector, ffi.typeof('struct sk_buff')) then
+					-- Direct access to skb uses skb->data and skb->data_end
+					-- which are encoded as u32, but are actually pointers
+					if c == 'data' or c == 'data_end' then
+						V[a].const = {__dissector = ffi.typeof('uint8_t')}
+						V[a].source = 'ptr_to_skb'
+					end
+				end
+			end
+		else
+			-- Plain Lua table: constant-fold the attribute lookup
+			V[a].const = base[c]
+		end
+	end,
+	-- Loops and branches
+	CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES)
+		-- NYI: Support single result only
+		-- (MULTRES is assumed to be 1, hence the D+2 argument count)
+		CALL(a, b, d+2)
+	end,
+	CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1)
+		-- Delegate to the emitter's CALL helper with the fixed argument count
+		CALL(a, b, d)
+	end,
+	JMP = function (a, _, c, _) -- JMP
+		-- Conditional/unconditional jump to bytecode target `c`.
+		-- This is the basic-block boundary: it materializes constants, records or
+		-- replays variable state for the target block, and fuses the preceding
+		-- CMP instruction (if any) with the emitted BPF jump.
+		-- Discard unused slots after jump
+		for i, _ in pairs(V) do
+			if i >= a and i < stackslots then
+				V[i] = nil
+			end
+		end
+		-- Cross basic block boundary if the jump target isn't provably unreachable
+		local val = code.fixup[c] or {}
+		if code.seen_cmp and code.seen_cmp ~= ALWAYS then
+			if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup
+				-- Store previous CMP insn for reemitting after compensation code
+				local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1])
+				code.pc = code.pc - 1
+				-- First branch point, emit compensation code
+				local Vcomp = Vstate[c]
+				if not Vcomp then
+					-- Select scratch register (R0-5) that isn't used as operand
+					-- in the CMP instruction, as the variable may not be live, after
+					-- the JMP, but it may be used in the JMP+CMP instruction itself
+					local tmp_reg = 0
+					for reg = 0, 5 do
+						if reg ~= jmpi.dst_reg and reg ~= jmpi.src_reg then
+							tmp_reg = reg
+							break
+						end
+					end
+					-- Force materialization of constants at the end of BB
+					for i, v in pairs(V) do
+						if not v.reg and cdef.isimmconst(v) then
+							vreg(i, tmp_reg) -- Load to TMP register (not saved)
+							reg_spill(i) -- Spill caller-saved registers
+						end
+					end
+					-- Record variable state
+					Vstate[c] = V
+					Vcomp = V
+					V = table_copy(V)
+				-- Variable state already set, emit specific compensation code
+				else
+					bb_end(Vcomp)
+				end
+				-- Record pointer NULL check from condition
+				-- If the condition checks pointer variable against NULL,
+				-- we can assume it will not be NULL in the fall-through block
+				if code.seen_null_guard then
+					local var = code.seen_null_guard
+					-- The null guard can have two forms:
+					--   if x == nil then goto
+					--   if x ~= nil then goto
+					-- First form guarantees that the variable will be non-nil on the following instruction
+					-- Second form guarantees that the variable will be non-nil at the jump target
+					local vinfo = code.seen_null_guard_inverse and Vcomp[var] or V[var]
+					if vinfo.source then
+						local pos = vinfo.source:find('_or_null', 1, true)
+						if pos then
+							vinfo.source = vinfo.source:sub(1, pos - 1)
+						end
+					end
+				end
+				-- Reemit CMP insn
+				emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm)
+				-- Fuse JMP into previous CMP opcode, mark JMP target for fixup
+				-- as we don't know the relative offset in generated code yet
+				table.insert(val, code.pc-1)
+				code.fixup[c] = val
+			end
+			code.seen_cmp = nil
+			code.seen_null_guard = nil
+			code.seen_null_guard_inverse = nil
+		elseif c == code.bc_pc + 1 then -- luacheck: ignore 542
+			-- Eliminate jumps to next immediate instruction
+			-- e.g. 0002    JMP      1 => 0003
+		else
+			-- We need to synthesise a condition that's always true, however
+			-- BPF prohibits pointer arithmetic to prevent pointer leaks
+			-- so we have to clear out one register and use it for cmp that's always true
+			local dst_reg = reg_alloc(stackslots)
+			V[stackslots].reg = nil -- Only temporary allocation
+			-- First branch point, emit compensation code
+			local Vcomp = Vstate[c]
+			if not Vcomp then
+				-- Force materialization of constants at the end of BB
+				for i, v in pairs(V) do
+					if not v.reg and cdef.isimmconst(v) then
+						vreg(i, dst_reg) -- Load to TMP register (not saved)
+						reg_spill(i) -- Spill caller-saved registers
+					end
+				end
+				-- Record variable state
+				Vstate[c] = V
+				V = table_copy(V)
+			-- Variable state already set, emit specific compensation code
+			else
+				bb_end(Vcomp)
+			end
+			-- `R = 0; if R != 0xffff goto target` is an always-taken branch
+			emit(BPF.ALU64 + BPF.MOV + BPF.K, dst_reg, 0, 0, 0)
+			emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 0xffff, 0)
+			table.insert(val, code.pc-1) -- Fixup JMP target
+			code.reachable = false -- Code following the JMP is not reachable
+			code.fixup[c] = val
+		end
+	end,
+	RET1 = function (a, _, _, _) -- RET1
+		-- Return slot A: materialize it in R0 and emit EXIT.
+		-- Free optimisation: spilled variable will not be filled again
+		for i, v in pairs(V) do
+			if i ~= a then v.reg = nil end
+		end
+		if V[a].reg ~= 0 then vreg(a, 0) end
+		-- Convenience: dereference pointer variables
+		-- e.g. 'return map[k]' will return actual map value, not pointer
+		if cdef.isptr(V[a].type) then
+			vderef(0, 0, V[a])
+		end
+		emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
+		code.reachable = false
+	end,
+	RET0 = function (_, _, _, _) -- RET0
+		-- Return with no value: programs must still set R0, so return 0
+		emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
+		emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
+		code.reachable = false
+	end,
+	compile = function ()
+		-- Return the generated code object (instruction buffer, pc, fixups)
+		return code
+	end
+}
+
+-- Composite instructions
+function BC.CALLT(a, _, _, d) -- Tailcall: return A(A+1, ..., A+D-1)
+	-- A tail call is lowered as a regular call followed by returning slot A
+	CALL(a, 1, d)
+	BC.RET1(a)
+end
+
+-- Always initialize R6 with R1 context
+-- (R1-R5 are clobbered by helper calls, so the context is kept in R6;
+-- see the kernel BPF calling convention in Documentation/networking/filter.txt)
+emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0)
+-- Register R6 as context variable (first argument)
+if params and params > 0 then
+	vset(0, 6, param_types[1] or proto.skb)
+	assert(V[0].source == V[0].const.source) -- Propagate source annotation from typeinfo
+end
+-- Register tmpvars
+-- (two scratch slots above the function's own stack slots)
+vset(stackslots)
+vset(stackslots+1)
+return setmetatable(BC, {
+	__index = function (_, k, _)
+		-- Unknown numeric opcode: resolve its mnemonic from the VM definition
+		-- so the NYI error is readable
+		if type(k) == 'number' then
+			local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6)
+			error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str))
+		end
+	end,
+	__call = function (t, op, a, b, c, d)
+		-- Per-bytecode driver: advance pc, reconcile basic-block state,
+		-- patch pending jumps targeting this instruction, then dispatch.
+		code.bc_pc = code.bc_pc + 1
+		-- Exiting BB straight through, emit compensation code
+		if Vstate[code.bc_pc] then
+			if code.reachable then
+				-- Instruction is reachable from previous line
+				-- so we must make the variable allocation consistent
+				-- with the variable allocation at the jump source
+				-- e.g. 0001 x:R0 = 5
+				--      0002 if rand() then goto 0005
+				--      0003 x:R0 -> x:stack
+				--      0004 y:R0 = 5
+				--      0005 x:? = 10 <-- x was in R0 before jump, and stack after jump
+				bb_end(Vstate[code.bc_pc])
+			else
+				-- Instruction isn't reachable from previous line, restore variable layout
+				-- e.g. RET or condition-less JMP on previous line
+				V = table_copy(Vstate[code.bc_pc])
+			end
+		end
+		-- Perform fixup of jump targets
+		-- We need to do this because the number of consumed and emitted
+		-- bytecode instructions is different
+		local fixup = code.fixup[code.bc_pc]
+		if fixup ~= nil then
+			-- Patch JMP source insn with relative offset
+			for _,pc in ipairs(fixup) do
+				code.insn[pc].off = code.pc - 1 - pc
+			end
+			code.fixup[code.bc_pc] = nil
+			code.reachable = true
+		end
+		-- Execute
+		if code.reachable then
+			assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d))
+			return t[op](a, b, c, d)
+		end
+	end,
+})
+end
+
+-- Emitted code dump
+-- Disassemble one memory-class instruction (LD/LDX/ST/STX/XADD) into text.
+-- `fuse` suppresses the name/dst columns for the second half of a fused LDDW pair.
+local function dump_mem(cls, ins, _, fuse)
+	-- This is a very dense MEM instruction decoder without much explanation
+	-- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format
+	local mode = bit.band(ins.code, 0xe0)
+	if mode == BPF.XADD then cls = 5 end -- The only mode
+	local op_1 = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'}
+	local op_2 = {[0]='W', [8]='H', [16]='B', [24]='DW'}
+	local name = op_1[cls+1] .. op_2[bit.band(ins.code, 0x18)]
+	local off = tonumber(ffi.cast('int16_t', ins.off)) -- Reinterpret as signed
+	local dst = cls < 2 and 'R'..ins.dst_reg or string.format('[R%d%+d]', ins.dst_reg, off)
+	local src = cls % 2 == 0 and '#'..ins.imm or 'R'..ins.src_reg
+	if cls == BPF.LDX then src = string.format('[R%d%+d]', ins.src_reg, off) end
+	if mode == BPF.ABS then src = string.format('skb[%d]', ins.imm) end
+	if mode == BPF.IND then src = string.format('skb[R%d%+d]', ins.src_reg, ins.imm) end
+	return string.format('%s\t%s\t%s', fuse and '' or name, fuse and '' or dst, src)
+end
+
+-- Disassemble one ALU/ALU64/JMP-class instruction into text.
+-- For CALL instructions the helper name is resolved from the id table below.
+local function dump_alu(cls, ins, pc)
+	local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END' }
+	local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'}
+	local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns',
+					'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes',
+					'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid',
+					'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop',
+					'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm',
+					'perf_event_output', 'skb_load_bytes'}
+	local op = 0
+	-- This is a very dense ALU instruction decoder without much explanation
+	-- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format
+	-- Scan the opcode field (high nibble of code) for the mnemonic index
+	for i = 0,13 do if 0x10 * i == bit.band(ins.code, 0xf0) then op = i + 1 break end end
+	local name = (cls == 5) and jmp[op] or alu[op]
+	local src = (bit.band(ins.code, 0x08) == BPF.X) and 'R'..ins.src_reg or '#'..ins.imm
+	-- Branches (op < 9) get an absolute target annotation; CALL (op == 9) a helper name
+	local target = (cls == 5 and op < 9) and string.format('\t=> %04d', pc + ins.off + 1) or ''
+	if cls == 5 and op == 9 then target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm)) end
+	return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target)
+end
+
+-- Render the emitted program as assembly-like text.
+-- `off` is the first instruction index to dump (default 0);
+-- `hide_counter` drops the leading pc column from each line.
+local function dump_string(code, off, hide_counter)
+	if not code then return end
+	-- Dispatch per instruction class: 0-3 are memory ops, 4/5/7 ALU/JMP
+	local cls_map = {
+		[0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem,
+		[4] = dump_alu, [5] = dump_alu, [7] = dump_alu,
+	}
+	local result = {}
+	local fused = false
+	for i = off or 0, code.pc - 1 do
+		local ins = code.insn[i]
+		local cls = bit.band(ins.code, 0x07)
+		local line = cls_map[cls](cls, ins, i, fused)
+		if hide_counter then
+			table.insert(result, line)
+		else
+			table.insert(result, string.format('%04u\t%s', i, line))
+		end
+		-- LDDW occupies two instruction slots: flag so the second half is fused
+		fused = string.find(line, 'LDDW', 1)
+	end
+	return table.concat(result, '\n')
+end
+
+-- Print a disassembly of the generated program to stdout, prefixed
+-- with a header naming the instruction buffer and its length.
+local function dump(code)
+	if not code then return end
+	local header = string.format('-- BPF %s:0-%u', code.insn, code.pc)
+	print(header)
+	print(dump_string(code))
+end
+
+-- Compile a Lua function (or source string) into a BPF program.
+-- `params` is an optional list of parameter type annotations.
+-- Returns the code object on success, or nil plus an error message.
+local function compile(prog, params)
+	-- Create code emitter sandbox, include caller locals
+	local env = { pkt=proto.pkt, eth=proto.pkt, BPF=BPF, ffi=ffi }
+	-- Include upvalues up to 4 nested scopes back
+	-- the narrower scope overrides broader scope
+	for k = 5, 2, -1 do
+		local i = 1
+		while true do
+			-- pcall guards against invalid stack levels
+			local ok, n, v = pcall(debug.getlocal, k, i)
+			if not ok or not n then break end
+			env[n] = v
+			i = i + 1
+		end
+	end
+	-- Unresolved names fall back to protocol dissectors, builtins, then globals
+	setmetatable(env, {
+		__index = function (_, k)
+			return proto[k] or builtins[k] or _G[k]
+		end
+	})
+	-- Create code emitter and compile LuaJIT bytecode
+	if type(prog) == 'string' then prog = loadstring(prog) end
+	-- Create error handler to print traceback
+	local funci, pc = bytecode.funcinfo(prog), 0
+	local E = create_emitter(env, funci.stackslots, funci.params, params or {})
+	local on_err = function (e)
+			-- Locate and print the offending source line before the traceback
+			funci = bytecode.funcinfo(prog, pc)
+			local from, to = 0, 0
+			for _ = 1, funci.currentline do
+				from = to
+				to = string.find(funci.source, '\n', from+1, true) or 0
+			end
+			print(funci.loc..':'..string.sub(funci.source, from+1, to-1))
+			print('error: '..e)
+			print(debug.traceback())
+	end
+	-- Feed each decoded bytecode instruction to the emitter
+	for _,op,a,b,c,d in bytecode.decoder(prog) do
+		local ok, _, err = xpcall(E,on_err,op,a,b,c,d)
+		if not ok then
+			return nil, err
+		end
+	end
+	return E:compile()
+end
+
+-- BPF map interface
+-- Shared metatable for BPF map proxy objects created by bpf.map().
+-- Indexing with a key performs a MAP_LOOKUP_ELEM syscall, assignment performs
+-- MAP_UPDATE_ELEM (or MAP_DELETE_ELEM when the value is nil); the string keys
+-- 'pairs' and 'reader' return special accessors.
+local bpf_map_mt = {
+	-- NOTE(review): __gc on plain tables only fires on LuaJIT when 5.2 compat
+	-- is enabled -- the map fd may otherwise leak until process exit; confirm.
+	__gc = function (map) S.close(map.fd) end,
+	__len = function(map) return map.max_entries end,
+	__index = function (map, k)
+		if type(k) == 'string' then
+			-- Return iterator
+			if k == 'pairs' then
+				return function(t, key)
+					-- Get next key (MAP_GET_NEXT_KEY starts from a zeroed key when
+					-- no previous key is given)
+					local next_key = ffi.new(ffi.typeof(t.key))
+					local cur_key
+					if key then
+						cur_key = t.key
+						t.key[0] = key
+					else
+						cur_key = ffi.new(ffi.typeof(t.key))
+					end
+					local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key)
+					if not ok then return nil, err end
+					-- Get next value
+					assert(S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val))
+					return next_key[0], map.val[0]
+				end, map, nil
+			-- Read for perf event map
+			elseif k == 'reader' then
+				return function (pmap, pid, cpu, event_type)
+					-- Caller must either specify PID or CPU
+					if not pid or pid < 0 then
+						assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs')
+						pid = -1
+					end
+					-- Create BPF output reader
+					local pe = S.t.perf_event_attr1()
+					pe[0].type = 'software'
+					pe[0].config = 'sw_bpf_output'
+					pe[0].sample_type = 'raw'
+					pe[0].sample_period = 1
+					pe[0].wakeup_events = 1
+					local reader, err = S.t.perf_reader(S.perf_event_open(pe, pid, cpu or -1))
+					if not reader then return nil, tostring(err) end
+					-- Register event reader fd in BPF map
+					assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu))
+					pmap[cpu] = reader.fd
+					-- Open memory map and start reading
+					local ok, err = reader:start()
+					assert(ok, tostring(err))
+					ok, err = reader:mmap()
+					assert(ok, tostring(err))
+					return cdef.event_reader(reader, event_type)
+				end
+			-- Signalise this is a map type
+			end
+			return k == '__map'
+		end
+		-- Retrieve key
+		map.key[0] = k
+		local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val)
+		if not ok then return nil, err end
+		return ffi.new(map.val_type, map.val[0])
+	end,
+	__newindex = function (map, k, v)
+		map.key[0] = k
+		if v == nil then
+			-- BUGFIX: the command and fd arguments were swapped (map.fd was
+			-- passed as the command). bpf_map_op takes the command first and
+			-- the fd second, as in every other call site in this file, so
+			-- deletes could never succeed.
+			return S.bpf_map_op(S.c.BPF_CMD.MAP_DELETE_ELEM, map.fd, map.key, nil)
+		end
+		map.val[0] = v
+		return S.bpf_map_op(S.c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val)
+	end,
+}
+
+-- Linux tracing interface
+-- Check that the tracing debugfs is mounted and accessible.
+-- Returns true on success, otherwise nil plus a hint on how to mount it.
+local function trace_check_enabled(path)
+	local tracing_root = path or '/sys/kernel/debug/tracing'
+	if S.statfs(tracing_root) then
+		return true
+	end
+	return nil, 'debugfs not accessible: "mount -t debugfs nodev /sys/kernel/debug"? missing sudo?'
+end
+
+-- Tracepoint interface
+local tracepoint_mt = {
+	__index = {
+		bpf = function (t, prog)
+			if type(prog) ~= 'table' then
+				-- Create protocol parser with source probe
+				prog = compile(prog, {proto.type(t.type, {source='ptr_to_probe'})})
+			end
+			-- Load the BPF program
+			local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc)
+			assert(prog_fd, tostring(err)..': '..tostring(log))
+			-- Open tracepoint and attach
+			t.reader:setbpf(prog_fd:getfd())
+			table.insert(t.progs, prog_fd)
+			return prog_fd
+		end,
+	}
+}
+-- Open tracepoint
+local function tracepoint_open(path, pid, cpu, group_fd)
+	-- Open tracepoint and compile tracepoint type
+	local tp = assert(S.perf_tracepoint('/sys/kernel/debug/tracing/events/'..path))
+	local tp_type = assert(cdef.tracepoint_type(path))
+	-- Open tracepoint reader and create interface
+	local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd))
+	return setmetatable({tp=tp,type=tp_type,reader=reader,progs={}}, tracepoint_mt)
+end
+
+-- Create a kprobe/uprobe (`ptype`), attach a perf reader to it and load the
+-- given KPROBE program onto the reader. Cleans up partially-created resources
+-- on every failure path. Returns a table with the reader, program fd and
+-- probe identification on success, or nil plus an error string.
+local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd)
+	-- Load BPF program
+	if type(prog) ~= 'table' then
+		prog = compile(prog, {proto.pt_regs})
+	end
+	local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc)
+	assert(prog_fd, tostring(err)..': '..tostring(log))
+	-- Open tracepoint and attach
+	local tp, err = S.perf_probe(ptype, pname, pdef, retprobe)
+	if not tp then
+		prog_fd:close()
+		return nil, tostring(err)
+	end
+	local reader, err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'})
+	if not reader then
+		prog_fd:close()
+		-- NOTE(review): a false third argument appears to remove the probe
+		-- again (same call in the GC path below) -- confirm against ljsyscall
+		S.perf_probe(ptype, pname, false)
+		return nil, tostring(err)
+	end
+	local ok, err = reader:setbpf(prog_fd:getfd())
+	if not ok then
+		prog_fd:close()
+		reader:close()
+		S.perf_probe(ptype, pname, false)
+		return nil, tostring(err)..' (kernel version should be at least 4.1)'
+	end
+	-- Create GC closure for reader to close BPF program
+	-- and detach probe in correct order
+	ffi.gc(reader, function ()
+		prog_fd:close()
+		reader:close()
+		S.perf_probe(ptype, pname, false)
+	end)
+	return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype}
+end
+
+-- Module interface
+return setmetatable({
+	new = create_emitter,
+	dump = dump,
+	dump_string = dump_string,
+	maps = {},
+	map = function (type, max_entries, key_ctype, val_ctype)
+		if not key_ctype then key_ctype = ffi.typeof('uint32_t') end
+		if not val_ctype then val_ctype = ffi.typeof('uint32_t') end
+		if not max_entries then max_entries = 4096 end
+		-- Special case for BPF_MAP_STACK_TRACE
+		if S.c.BPF_MAP[type] == S.c.BPF_MAP.STACK_TRACE then
+			key_ctype = ffi.typeof('int32_t')
+			val_ctype = ffi.typeof('struct bpf_stacktrace')
+		end
+		local fd, err = S.bpf_map_create(S.c.BPF_MAP[type], ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries)
+		if not fd then return nil, tostring(err) end
+		local map = setmetatable({
+			max_entries = max_entries,
+			key = ffi.new(ffi.typeof('$ [1]', key_ctype)),
+			val = ffi.new(ffi.typeof('$ [1]', val_ctype)),
+			map_type = S.c.BPF_MAP[type],
+			key_type = key_ctype,
+			val_type = val_ctype,
+			fd = fd:nogc():getfd(),
+		}, bpf_map_mt)
+		return map
+	end,
+	socket = function (sock, prog)
+		-- Expect socket type, if sock is string then assume it's
+		-- an interface name (e.g. 'lo'), if it's a number then typecast it as a socket
+		local ok, err
+		if type(sock) == 'string' then
+			local iface = assert(S.nl.getlink())[sock]
+			assert(iface, sock..' is not interface name')
+			sock, err = S.socket('packet', 'raw')
+			assert(sock, tostring(err))
+			ok, err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index}))
+			assert(ok, tostring(err))
+		elseif type(sock) == 'number' then
+			sock = S.t.fd(sock):nogc()
+		elseif ffi.istype(S.t.fd, sock) then -- luacheck: ignore
+			-- No cast required
+		else
+			return nil, 'socket must either be an fd number, an interface name, or an ljsyscall socket'
+		end
+		-- Load program and attach it to socket
+		if type(prog) ~= 'table' then
+			prog = compile(prog, {proto.skb})
+		end
+		local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc)
+		assert(prog_fd, tostring(err)..': '..tostring(log))
+		assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd()))
+		return prog_fd, err
+	end,
+	tracepoint = function(tp, prog, pid, cpu, group_fd)
+		assert(trace_check_enabled())
+		-- Return tracepoint instance if no program specified
+		-- this allows free specialisation of arg0 to tracepoint type
+		local probe = tracepoint_open(tp, pid, cpu, group_fd)
+		-- Load the BPF program
+		if prog then
+			probe:bpf(prog)
+		end
+		return probe
+	end,
+	kprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
+		assert(trace_check_enabled())
+		-- Open tracepoint and attach
+		local pname, pdef = tp:match('([^:]+):(.+)')
+		return trace_bpf('kprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
+	end,
+	uprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
+		assert(trace_check_enabled())
+		-- Translate symbol to address
+		local obj, sym_want = tp:match('([^:]+):(.+)')
+		if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end
+		-- Resolve Elf object (no support for anything else)
+		local elf = require('bpf.elf').open(obj)
+		local sym = elf:resolve(sym_want)
+		if not sym then return nil, 'no such symbol' end
+		sym = sym.st_value - elf:loadaddr()
+		local sym_addr = string.format('%x%04x', tonumber(bit.rshift(sym, 32)),
+		                                         tonumber(ffi.cast('uint32_t', sym)))
+		-- Convert it to expected uprobe format
+		local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr)
+		local pdef = obj..':0x'..sym_addr
+		return trace_bpf('uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
+	end,
+	tracelog = function(path)
+		assert(trace_check_enabled())
+		path = path or '/sys/kernel/debug/tracing/trace_pipe'
+		return io.open(path, 'r')
+	end,
+	ntoh = builtins.ntoh, hton = builtins.hton,
+}, {
+	__call = function (_, prog) return compile(prog) end,
+})
diff --git a/src/lua/bpf/builtins.lua b/src/lua/bpf/builtins.lua
new file mode 100644
index 0000000..161f7e5
--- /dev/null
+++ b/src/lua/bpf/builtins.lua
@@ -0,0 +1,454 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require('ffi')
+local bit = require('bit')
+local cdef = require('bpf.cdef')
+
+-- Instruction-class constants and BPF helper ids
+-- (presumably declared as ctypes when 'bpf.cdef' is loaded above -- verify)
+local BPF, HELPER = ffi.typeof('struct bpf'), ffi.typeof('struct bpf_func_id')
+-- Byte width -> BPF size modifier for load/store instructions
+local const_width = {
+	[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
+}
+-- Byte width -> unsigned integer ctype of the same size
+local const_width_type = {
+	[1] = ffi.typeof('uint8_t'), [2] = ffi.typeof('uint16_t'), [4] = ffi.typeof('uint32_t'), [8] = ffi.typeof('uint64_t'),
+}
+
+-- Built-ins that will be translated into BPF instructions
+-- i.e. bit.bor(0xf0, 0x0f) becomes {'alu64, or, k', reg(0xf0), reg(0x0f), 0, 0}
+-- The table is keyed by function VALUE so the compiler can recognise calls
+-- to these functions in traced bytecode and substitute a native translation.
+local builtins = {
+	[bit.lshift]  = 'LSH',
+	[bit.rshift]  = 'RSH',
+	[bit.band]    = 'AND',
+	[bit.bnot]    = 'NEG',
+	[bit.bor]     = 'OR',
+	[bit.bxor]    = 'XOR',
+	[bit.arshift] = 'ARSH',
+	-- Extensions and intrinsics
+}
+
+-- Map a byte width to an unsigned integer ctype of that size; widths without
+-- a native integer type fall back to a byte array of the requested length.
+local function width_type(w)
+	-- Note: ffi.typeof doesn't accept '?' as template
+	local native = const_width_type[w]
+	if native then
+		return native
+	end
+	return ffi.typeof(string.format('uint8_t [%d]', w))
+end
+builtins.width_type = width_type
+
+-- Return struct member size/type (requires LuaJIT 2.1+)
+-- I am ashamed that there's no easier way around it.
+-- Walks the ctype's sibling chain until the member named `name` is found,
+-- then derives its size from the offset of the following member (or the
+-- struct's total size for the last member).
+local function sizeofattr(ct, name)
+	if not ffi.typeinfo then error('LuaJIT 2.1+ is required for ffi.typeinfo') end
+	local cinfo = ffi.typeinfo(ct)
+	while true do
+		cinfo = ffi.typeinfo(cinfo.sib)
+		if not cinfo then return end
+		if cinfo.name == name then break end
+	end
+	local size = math.max(1, ffi.typeinfo(cinfo.sib or ct).size - cinfo.size)
+	-- Guess type name
+	return size, builtins.width_type(size)
+end
+builtins.sizeofattr = sizeofattr
+
+-- Byte-order conversions for little endian
+-- Native (non-BPF) implementations: cast to the w-bit type, then byte-swap.
+-- On LE hosts ntoh and hton perform the same swap.
+local function ntoh(x, w)
+	if w then x = ffi.cast(const_width_type[w/8], x) end
+	return bit.bswap(x)
+end
+local function hton(x, w) return ntoh(x, w) end
+builtins.ntoh = ntoh
+builtins.hton = hton
+-- Compile-time translation of ntoh(a[, width]) inside BPF code
+builtins[ntoh] = function (e, dst, a, w)
+	-- This is trickery, but TO_LE means cpu_to_le(),
+	-- and we want exactly the opposite as network is always 'be'
+	w = w or ffi.sizeof(e.V[a].type)*8
+	if w == 8 then return end -- NOOP
+	assert(w <= 64, 'NYI: hton(a[, width]) - operand larger than register width')
+	-- Allocate registers and execute
+	e.vcopy(dst, a)
+	e.emit(BPF.ALU + BPF.END + BPF.TO_BE, e.vreg(dst), 0, 0, w)
+end
+-- Compile-time translation of hton(a[, width]) inside BPF code
+builtins[hton] = function (e, dst, a, w)
+	w = w or ffi.sizeof(e.V[a].type)*8
+	if w == 8 then return end -- NOOP
+	assert(w <= 64, 'NYI: hton(a[, width]) - operand larger than register width')
+	-- Allocate registers and execute
+	e.vcopy(dst, a)
+	e.emit(BPF.ALU + BPF.END + BPF.TO_LE, e.vreg(dst), 0, 0, w)
+end
+-- Byte-order conversions for big endian are no-ops
+if ffi.abi('be') then
+	ntoh = function (x, w)
+		return w and ffi.cast(const_width_type[w/8], x) or x
+	end
+	hton = ntoh
+	-- BUGFIX: refresh the exported module entries too. They were bound to the
+	-- little-endian (byte-swapping) implementations above, so on BE hosts
+	-- builtins.ntoh/builtins.hton would still swap bytes.
+	builtins.ntoh = ntoh
+	builtins.hton = hton
+	-- Inside BPF code the conversions compile to nothing on BE
+	builtins[ntoh] = function(_, _, _) return end
+	builtins[hton] = function(_, _, _) return end
+end
+-- Other built-ins
+-- Marker function: xadd() is only meaningful inside BPF code, where the
+-- handler below translates it to an atomic XADD instruction.
+local function xadd() error('NYI') end
+builtins.xadd = xadd
+-- Emit xadd(a, b[, off]): atomically add b to the memory a points to (+off);
+-- `off` may also be a struct field name resolved against a's dissector.
+builtins[xadd] = function (e, ret, a, b, off)
+	local vinfo = e.V[a].const
+	assert(vinfo and vinfo.__dissector, 'xadd(a, b[, offset]) called on non-pointer')
+	local w = ffi.sizeof(vinfo.__dissector)
+	-- Calculate structure attribute offsets
+	if e.V[off] and type(e.V[off].const) == 'string' then
+		local ct, field = vinfo.__dissector, e.V[off].const
+		off = ffi.offsetof(ct, field)
+		assert(off, 'xadd(a, b, offset) - offset is not valid in given structure')
+		w = sizeofattr(ct, field)
+	end
+	assert(w == 4 or w == 8, 'NYI: xadd() - 1 and 2 byte atomic increments are not supported')
+	-- Allocate registers and execute
+	local src_reg = e.vreg(b)
+	local dst_reg = e.vreg(a)
+	-- Set variable for return value and call
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+	-- Optimize the NULL check away if provably not NULL
+	if not e.V[a].source or e.V[a].source:find('_or_null', 1, true) then
+		e.emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 1, 0) -- if (dst != NULL)
+	end
+	e.emit(BPF.XADD + BPF.STX + const_width[w], dst_reg, src_reg, off or 0, 0)
+end
+
+-- Marker function: probe_read() must only be called in BPF code, where it is
+-- translated to the bpf_probe_read helper; calling it natively is an error.
+local function probe_read() error('NYI') end
+builtins.probe_read = probe_read
+-- Emit a bpf_probe_read helper call: read `vtype`-sized memory at src(+ofs)
+-- into stack-backed dst; the helper status ends up in ret (or a tmpvar).
+builtins[probe_read] = function (e, ret, dst, src, vtype, ofs)
+	e.reg_alloc(e.tmpvar, 1)
+	-- Load stack pointer to dst, since only load to stack memory is supported
+	-- we have to use allocated stack memory or create a new allocation and convert
+	-- to pointer type
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
+	-- BUGFIX: the original condition read `not e.V[dst].const.__base > 0`,
+	-- which Lua parses as `(not __base) > 0` -- comparing a boolean against a
+	-- number raises a runtime error whenever dst has a const. Allocate stack
+	-- memory when dst has no const or no positive stack base.
+	if not e.V[dst].const or not (e.V[dst].const.__base and e.V[dst].const.__base > 0) then
+		builtins[ffi.new](e, dst, vtype) -- Allocate stack memory
+	end
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
+	-- Set stack memory maximum size bound
+	e.reg_alloc(e.tmpvar, 2)
+	if not vtype then
+		vtype = cdef.typename(e.V[dst].type)
+		-- Dereference pointer type to pointed type for size calculation
+		if vtype:sub(-1) == '*' then vtype = vtype:sub(0, -2) end
+	end
+	local w = ffi.sizeof(vtype)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, w)
+	-- Set source pointer
+	if e.V[src].reg then
+		e.reg_alloc(e.tmpvar, 3) -- Copy from original register
+		e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, e.V[src].reg, 0, 0)
+	else
+		e.vreg(src, 3)
+		e.reg_spill(src) -- Spill to avoid overwriting
+	end
+	if ofs and ofs > 0 then
+		e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, ofs)
+	end
+	-- Call probe read helper
+	ret = ret or e.tmpvar
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.probe_read)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- Compile-time translation of ffi.cast(ct, x): retype variable x as ct,
+-- updating its dissector and data-source annotation.
+builtins[ffi.cast] = function (e, dst, ct, x)
+	assert(e.V[ct].const, 'ffi.cast(ctype, x) called with bad ctype')
+	e.vcopy(dst, x)
+	if e.V[x].const and type(e.V[x].const) == 'table' then
+		e.V[dst].const.__dissector = ffi.typeof(e.V[ct].const)
+	end
+	e.V[dst].type = ffi.typeof(e.V[ct].const)
+	-- Specific types also encode source of the data
+	-- This is because BPF has different helpers for reading
+	-- different data sources, so variables must track origins.
+	-- struct pt_regs - source of the data is probe
+	-- struct skb     - source of the data is socket buffer
+	-- struct X       - source of the data is probe/tracepoint
+	if ffi.typeof(e.V[ct].const) == ffi.typeof('struct pt_regs') then
+		e.V[dst].source = 'ptr_to_probe'
+	end
+end
+
+-- Compile-time translation of ffi.new(ct): allocate zeroed stack memory for
+-- a value of ctype ct and bind dst to it (as a stack-backed pointer).
+builtins[ffi.new] = function (e, dst, ct, x)
+	if type(ct) == 'number' then
+		ct = ffi.typeof(e.V[ct].const) -- Get ctype from variable
+	end
+	assert(not x, 'NYI: ffi.new(ctype, ...) - initializer is not supported')
+	assert(not cdef.isptr(ct, true), 'NYI: ffi.new(ctype, ...) - ctype MUST NOT be a pointer')
+	e.vset(dst, nil, ct)
+	e.V[dst].source = 'ptr_to_stack'
+	e.V[dst].const = {__base = e.valloc(ffi.sizeof(ct), true), __dissector = ct}
+	-- Set array dissector if created an array
+	-- e.g. if ct is 'char [2]', then dissector is 'char'
+	local elem_type = tostring(ct):match('ctype<(.+)%s%[(%d+)%]>')
+	if elem_type then
+		e.V[dst].const.__dissector = ffi.typeof(elem_type)
+	end
+end
+
+-- Compile-time translation of ffi.copy(dst, src): currently only supports
+-- copying from probe memory into stack-backed dst via the probe_read helper.
+builtins[ffi.copy] = function (e, ret, dst, src)
+	assert(cdef.isptr(e.V[dst].type), 'ffi.copy(dst, src) - dst MUST be a pointer type')
+	assert(cdef.isptr(e.V[src].type), 'ffi.copy(dst, src) - src MUST be a pointer type')
+	-- Specific types also encode source of the data
+	-- struct pt_regs - source of the data is probe
+	-- struct skb     - source of the data is socket buffer
+	if e.V[src].source and e.V[src].source:find('ptr_to_probe', 1, true) then
+		e.reg_alloc(e.tmpvar, 1)
+		-- Load stack pointer to dst, since only load to stack memory is supported
+		-- we have to either use spilled variable or allocated stack memory offset
+		e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
+		if e.V[dst].spill then
+			e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].spill)
+		elseif e.V[dst].const.__base then
+			e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
+		else error('ffi.copy(dst, src) - can\'t get stack offset of dst') end
+		-- Set stack memory maximum size bound
+		-- (strip the trailing '*' to size the pointed-to type)
+		local dst_tname = cdef.typename(e.V[dst].type)
+		if dst_tname:sub(-1) == '*' then dst_tname = dst_tname:sub(0, -2) end
+		e.reg_alloc(e.tmpvar, 2)
+		e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(dst_tname))
+		-- Set source pointer
+		if e.V[src].reg then
+			e.reg_alloc(e.tmpvar, 3) -- Copy from original register
+			e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, e.V[src].reg, 0, 0)
+		else
+			e.vreg(src, 3)
+			e.reg_spill(src) -- Spill to avoid overwriting
+		end
+		-- Call probe read helper
+		e.vset(ret)
+		e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+		e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.probe_read)
+		e.V[e.tmpvar].reg = nil  -- Free temporary registers
+	elseif e.V[src].const and e.V[src].const.__map then
+		error('NYI: ffi.copy(dst, src) - src is backed by BPF map')
+	elseif e.V[src].const and e.V[src].const.__dissector then
+		error('NYI: ffi.copy(dst, src) - src is backed by socket buffer')
+	else
+		-- TODO: identify cheap register move
+		-- TODO: identify copy to/from stack
+		error('NYI: ffi.copy(dst, src) - src is neither BPF map/socket buffer or probe')
+	end
+end
+-- print(format, ...) builtin changes semantics from Lua print(...)
+-- the first parameter has to be format and only reduced set of conversion specificers
+-- is allowed: %d %u %x %ld %lu %lx %lld %llu %llx %p %s
+-- Lowered to the bpf_trace_printk() helper with at most 3 extra arguments.
+builtins[print] = function (e, ret, fmt, a1, a2, a3)
+	-- Load format string and length
+	-- NOTE(review): both tmpvars are allocated to register 1 here; the second
+	-- call looks like it was meant for R2 (the length register) - confirm upstream.
+	e.reg_alloc(e.V[e.tmpvar], 1)
+	e.reg_alloc(e.V[e.tmpvar+1], 1)
+	if type(e.V[fmt].const) == 'string' then
+		-- Format is a Lua string literal: materialize it in stack memory first
+		local src = e.V[fmt].const
+		local len = #src + 1
+		local dst = e.valloc(len, src)
+		-- TODO: this is materialize step
+		e.V[fmt].const = {__base=dst}
+		e.V[fmt].type = ffi.typeof('char ['..len..']')
+	elseif e.V[fmt].const.__base then -- luacheck: ignore
+		-- NOP: format already lives in stack memory
+	else error('NYI: print(fmt, ...) - format variable is not literal/stack memory') end
+	-- Prepare helper call: R1 = fmt pointer (stack base - offset), R2 = fmt size
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[fmt].const.__base)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(e.V[fmt].type))
+	if a1 then
+		local args = {a1, a2, a3}
+		assert(#args <= 3, 'print(fmt, ...) - maximum of 3 arguments supported')
+		for i, arg in ipairs(args) do
+			e.vcopy(e.tmpvar, arg)  -- Copy variable
+			e.vreg(e.tmpvar, 3+i-1) -- Materialize it in arg register (R3..R5)
+		end
+	end
+	-- Call helper
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t')) -- Return is integer
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.trace_printk)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- Implements bpf_perf_event_output(ctx, map, flags, var, vlen) on perf event map
+-- dst receives the helper's int32_t return value; src MUST be stack-backed.
+local function perf_submit(e, dst, map_var, src)
+	-- Set R2 = map fd (indirect load)
+	local map = e.V[map_var].const
+	e.vcopy(e.tmpvar, map_var)
+	e.vreg(e.tmpvar, 2, true, ffi.typeof('uint64_t'))
+	e.LD_IMM_X(2, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof('uint64_t'))
+	-- Set R1 = ctx
+	e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
+	-- Set R3 = flags
+	e.vset(e.tmpvar, nil, 0) -- BPF_F_CURRENT_CPU
+	e.vreg(e.tmpvar, 3, false, ffi.typeof('uint64_t'))
+	-- Set R4 = pointer to src on stack
+	assert(e.V[src].const.__base, 'NYI: submit(map, var) - variable is not on stack')
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 4, 10, 0, 0)
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 4, 0, 0, -e.V[src].const.__base)
+	-- Set R5 = src length
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 5, 0, 0, ffi.sizeof(e.V[src].type))
+	-- Set R0 = ret and call
+	e.vset(dst)
+	e.vreg(dst, 0, true, ffi.typeof('int32_t')) -- Return is integer
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.perf_event_output)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- Implements bpf_skb_load_bytes(ctx, off, var, vlen) on skb->data
+-- Copies sizeof(var) bytes from packet offset `off` into the stack-backed
+-- variable `var`; dst receives the helper's int32_t return value.
+local function load_bytes(e, dst, off, var)
+	-- Set R2 = offset
+	e.vset(e.tmpvar, nil, off)
+	e.vreg(e.tmpvar, 2, false, ffi.typeof('uint64_t'))
+	-- Set R1 = ctx
+	e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
+	-- Set R3 = pointer to var on stack
+	assert(e.V[var].const.__base, 'NYI: load_bytes(off, var, len) - variable is not on stack')
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0)
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -e.V[var].const.__base)
+	-- Set R4 = var length
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, ffi.sizeof(e.V[var].type))
+	-- Set R0 = ret and call
+	e.vset(dst)
+	e.vreg(dst, 0, true, ffi.typeof('int32_t')) -- Return is integer
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.skb_load_bytes)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- Implements bpf_get_stack_id()
+-- ret = get_stackid(ctx, map, flags); `key` MUST hold a constant number
+-- which is passed as the flags argument in R3.
+local function stack_id(e, ret, map_var, key)
+	-- Set R2 = map fd (indirect load)
+	local map = e.V[map_var].const
+	e.vcopy(e.tmpvar, map_var)
+	e.vreg(e.tmpvar, 2, true, ffi.typeof('uint64_t'))
+	e.LD_IMM_X(2, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof('uint64_t'))
+	-- Set R1 = ctx
+	e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
+	-- Load flags in R3 (immediate value only)
+	local imm = e.V[key].const
+	assert(tonumber(imm), 'NYI: stack_id(map, var), var must be constant number')
+	e.reg_alloc(e.tmpvar, 3) -- Spill anything in R3 (unnamed tmp variable)
+	e.LD_IMM_X(3, 0, imm, 8)
+	-- Return R0 as signed integer
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.get_stackid)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- table.insert(map, value) keeps Lua semantics with the exception of BPF maps:
+-- inserting into a `perf_event` map submits the value via perf_submit().
+builtins[table.insert] = function (e, dst, map_var, value)
+	local map = e.V[map_var].const
+	assert(map.__map, 'NYI: table.insert() supported only on BPF maps')
+	return perf_submit(e, dst, map_var, value)
+end
+
+-- bpf_get_current_comm(buffer) - write current process name to byte buffer
+-- `dst` MUST be a stack-backed buffer; ret receives the int32_t result.
+builtins[comm] = function (e, ret, dst)
+	-- Set R1 = buffer
+	assert(e.V[dst].const.__base, 'NYI: comm(buffer) - buffer variable is not on stack')
+	e.reg_alloc(e.tmpvar, 1) -- Spill
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
+	-- Set R2 = length
+	e.reg_alloc(e.tmpvar, 2) -- Spill
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(e.V[dst].type))
+	-- Return is integer
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.get_current_comm)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- Math library built-ins
+-- math.log2 has no Lua counterpart; the stub only serves as a builtin key.
+math.log2 = function () error('NYI') end
+-- Computes dst = floor(log2(x)) using branch-reduced bit tests.
+builtins[math.log2] = function (e, dst, x)
+	-- Classic integer bits subdivison algorithm to find the position
+	-- of the highest bit set, adapted for BPF bytecode-friendly operations.
+	-- https://graphics.stanford.edu/~seander/bithacks.html
+	-- r = 0
+	local r = e.vreg(dst, nil, true)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, r, 0, 0, 0)
+	-- v = x
+	e.vcopy(e.tmpvar, x)
+	local v = e.vreg(e.tmpvar, 2)
+	if cdef.isptr(e.V[x].const) then -- No pointer arithmetics, dereference
+		e.vderef(v, v, {const = {__dissector=ffi.typeof('uint64_t')}})
+	end
+	-- Invert value to invert all tests, otherwise we would need and+jnz
+	e.emit(BPF.ALU64 + BPF.NEG + BPF.K, v, 0, 0, 0)        -- v = ~v
+	-- Unrolled test cases, converted masking to arithmetic as we don't have "if !(a & b)"
+	-- As we're testing inverted value, we have to use arithmetic shift to copy MSB
+	for i=4,0,-1 do
+		local k = bit.lshift(1, i)                                             -- k = 16, 8, 4, 2, 1
+		e.emit(BPF.JMP + BPF.JGT + BPF.K, v, 0, 2, bit.bnot(bit.lshift(1, k))) -- if !upper_half(x)
+		e.emit(BPF.ALU64 + BPF.ARSH + BPF.K, v, 0, 0, k)                       --     v >>= k
+		e.emit(BPF.ALU64 + BPF.OR + BPF.K, r, 0, 0, k)                         --     r |= k
+	end
+	-- No longer constant, cleanup tmpvars
+	e.V[dst].const = nil
+	e.V[e.tmpvar].reg = nil
+end
+-- Computes dst = floor(log10(x)) via integer log2 and fixed-point scaling.
+builtins[math.log10] = function (e, dst, x)
+	-- Compute log2(x) and transform
+	builtins[math.log2](e, dst, x)
+	-- Relationship: log10(v) = log2(v) / log2(10) = log2(v) * log10(2)
+	local r = e.V[dst].reg
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, r, 0, 0, 1)    -- Compensate round-down
+	e.emit(BPF.ALU64 + BPF.MUL + BPF.K, r, 0, 0, 1233) -- log10(2) ~ 1233/4096
+	e.emit(BPF.ALU64 + BPF.RSH + BPF.K, r, 0, 0, 12)   -- Divide by 2^12 scale
+end
+-- Computes dst = floor(ln(x)) via integer log2 and fixed-point scaling.
+builtins[math.log] = function (e, dst, x)
+	-- Compute log2(x) and transform
+	builtins[math.log2](e, dst, x)
+	-- Relationship: ln(v) = log2(v) / log2(e) = log2(v) * ln(2)
+	local r = e.V[dst].reg
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, r, 0, 0, 1)    -- Compensate round-down
+	e.emit(BPF.ALU64 + BPF.MUL + BPF.K, r, 0, 0, 2839) -- ln(2) ~ 2839/4096
+	e.emit(BPF.ALU64 + BPF.RSH + BPF.K, r, 0, 0, 12)   -- Divide by 2^12 scale
+end
+
+-- Call-type helpers
+-- Emit a plain helper call: materialize `dst` in R0 with the requested
+-- return type (uint64_t when `vtype` is omitted) and invoke helper id `h`.
+local function call_helper(e, dst, h, vtype)
+	local rtype = vtype or ffi.typeof('uint64_t')
+	e.vset(dst)
+	e.vreg(dst, 0, true, rtype)
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, h)
+	e.V[dst].const = nil -- Target is not a function anymore
+end
+-- Stub functions used only as unique builtin keys; calling them outside
+-- of compiled BPF code raises 'NYI'.
+local function cpu() error('NYI') end
+local function rand() error('NYI') end
+local function time() error('NYI') end
+local function pid_tgid() error('NYI') end
+local function uid_gid() error('NYI') end
+
+-- Export helpers and builtin variants
+builtins.cpu = cpu
+builtins.time = time
+builtins.pid_tgid = pid_tgid
+builtins.uid_gid = uid_gid
+builtins.comm = comm
+builtins.perf_submit = perf_submit
+builtins.stack_id = stack_id
+builtins.load_bytes = load_bytes
+builtins[cpu] = function (e, dst) return call_helper(e, dst, HELPER.get_smp_processor_id) end
+builtins[rand] = function (e, dst) return call_helper(e, dst, HELPER.get_prandom_u32, ffi.typeof('uint32_t')) end
+builtins[time] = function (e, dst) return call_helper(e, dst, HELPER.ktime_get_ns) end
+builtins[pid_tgid] = function (e, dst) return call_helper(e, dst, HELPER.get_current_pid_tgid) end
+builtins[uid_gid] = function (e, dst) return call_helper(e, dst, HELPER.get_current_uid_gid) end
+builtins[perf_submit] = function (e, dst, map, value) return perf_submit(e, dst, map, value) end
+builtins[stack_id] = function (e, dst, map, key) return stack_id(e, dst, map, key) end
+-- NOTE(review): load_bytes() takes (e, dst, off, var); the extra `len`
+-- argument below is passed but ignored - confirm whether it was meant to be used.
+builtins[load_bytes] = function (e, dst, off, var, len) return load_bytes(e, dst, off, var, len) end
+
+return builtins
diff --git a/src/lua/bpf/cdef.lua b/src/lua/bpf/cdef.lua
new file mode 100644
index 0000000..6be8bdf
--- /dev/null
+++ b/src/lua/bpf/cdef.lua
@@ -0,0 +1,280 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require('ffi')
+local bit = require('bit')
+local has_syscall, S = pcall(require, 'syscall')
+local M = {}
+
+ffi.cdef [[
+struct bpf {
+	/* Instruction classes */
+	static const int LD   = 0x00;
+	static const int LDX  = 0x01;
+	static const int ST   = 0x02;
+	static const int STX  = 0x03;
+	static const int ALU  = 0x04;
+	static const int JMP  = 0x05;
+	static const int ALU64 = 0x07;
+	/* ld/ldx fields */
+	static const int W    = 0x00;
+	static const int H    = 0x08;
+	static const int B    = 0x10;
+	static const int ABS  = 0x20;
+	static const int IND  = 0x40;
+	static const int MEM  = 0x60;
+	static const int LEN  = 0x80;
+	static const int MSH  = 0xa0;
+	/* alu/jmp fields */
+	static const int ADD  = 0x00;
+	static const int SUB  = 0x10;
+	static const int MUL  = 0x20;
+	static const int DIV  = 0x30;
+	static const int OR   = 0x40;
+	static const int AND  = 0x50;
+	static const int LSH  = 0x60;
+	static const int RSH  = 0x70;
+	static const int NEG  = 0x80;
+	static const int MOD  = 0x90;
+	static const int XOR  = 0xa0;
+	static const int JA   = 0x00;
+	static const int JEQ  = 0x10;
+	static const int JGT  = 0x20;
+	static const int JGE  = 0x30;
+	static const int JSET = 0x40;
+	static const int K    = 0x00;
+	static const int X    = 0x08;
+	static const int JNE  = 0x50;	/* jump != */
+	static const int JSGT = 0x60;	/* SGT is signed '>', GT in x86 */
+	static const int JSGE = 0x70;	/* SGE is signed '>=', GE in x86 */
+	static const int CALL = 0x80;	/* function call */
+	static const int EXIT = 0x90;	/* function return */
+	/* ld/ldx fields */
+	static const int DW    = 0x18;	/* double word */
+	static const int XADD  = 0xc0;	/* exclusive add */
+	/* alu/jmp fields */
+	static const int MOV   = 0xb0;	/* mov reg to reg */
+	static const int ARSH  = 0xc0;	/* sign extending arithmetic shift right */
+	/* change endianness of a register */
+	static const int END   = 0xd0;	/* flags for endianness conversion: */
+	static const int TO_LE = 0x00;	/* convert to little-endian */
+	static const int TO_BE = 0x08;	/* convert to big-endian */
+	/* misc */
+	static const int PSEUDO_MAP_FD = 0x01;
+	/* helper functions */
+	static const int F_CURRENT_CPU    = 0xffffffff;
+	static const int F_USER_STACK     = 1 << 8;
+	static const int F_FAST_STACK_CMP = 1 << 9;
+	static const int F_REUSE_STACKID  = 1 << 10;
+	/* special offsets for ancillary data */
+	static const int NET_OFF          = -0x100000;
+	static const int LL_OFF           = -0x200000;
+};
+/* eBPF commands */
+struct bpf_cmd {
+	static const int MAP_CREATE       = 0;
+	static const int MAP_LOOKUP_ELEM  = 1;
+	static const int MAP_UPDATE_ELEM  = 2;
+	static const int MAP_DELETE_ELEM  = 3;
+	static const int MAP_GET_NEXT_KEY = 4;
+	static const int PROG_LOAD        = 5;
+	static const int OBJ_PIN          = 6;
+	static const int OBJ_GET          = 7;
+};
+/* eBPF helpers */
+struct bpf_func_id {
+	static const int unspec               = 0;
+	static const int map_lookup_elem      = 1;
+	static const int map_update_elem      = 2;
+	static const int map_delete_elem      = 3;
+	static const int probe_read           = 4;
+	static const int ktime_get_ns         = 5;
+	static const int trace_printk         = 6;
+	static const int get_prandom_u32      = 7;
+	static const int get_smp_processor_id = 8;
+	static const int skb_store_bytes      = 9;
+	static const int l3_csum_replace      = 10;
+	static const int l4_csum_replace      = 11;
+	static const int tail_call            = 12;
+	static const int clone_redirect       = 13;
+	static const int get_current_pid_tgid = 14;
+	static const int get_current_uid_gid  = 15;
+	static const int get_current_comm     = 16;
+	static const int get_cgroup_classid   = 17;
+	static const int skb_vlan_push        = 18;
+	static const int skb_vlan_pop         = 19;
+	static const int skb_get_tunnel_key   = 20;
+	static const int skb_set_tunnel_key   = 21;
+	static const int perf_event_read      = 22;
+	static const int redirect             = 23;
+	static const int get_route_realm      = 24;
+	static const int perf_event_output    = 25;
+	static const int skb_load_bytes       = 26;
+	static const int get_stackid          = 27;
+};
+/* BPF_MAP_STACK_TRACE structures and constants */
+static const int BPF_MAX_STACK_DEPTH = 127;
+struct bpf_stacktrace {
+	uint64_t ip[BPF_MAX_STACK_DEPTH];
+};
+]]
+
+-- Compatibility: ljsyscall doesn't have support for BPF syscall
+-- Backfill BPF map/program type constants for older ljsyscall releases so
+-- the rest of the compiler can refer to them symbolically.
+if not has_syscall or not S.bpf then
+	error("ljsyscall doesn't support bpf(), must be updated")
+else
+	local strflag = require('syscall.helpers').strflag
+	-- Compatibility: ljsyscall<=0.12
+	if not S.c.BPF_MAP.LRU_HASH then
+		S.c.BPF_MAP = strflag {
+			UNSPEC           = 0,
+			HASH             = 1,
+			ARRAY            = 2,
+			PROG_ARRAY       = 3,
+			PERF_EVENT_ARRAY = 4,
+			PERCPU_HASH      = 5,
+			PERCPU_ARRAY     = 6,
+			STACK_TRACE      = 7,
+			CGROUP_ARRAY     = 8,
+			LRU_HASH         = 9,
+			LRU_PERCPU_HASH  = 10,
+			LPM_TRIE         = 11,
+			ARRAY_OF_MAPS    = 12,
+			HASH_OF_MAPS     = 13,
+			DEVMAP           = 14,
+			SOCKMAP          = 15,
+			CPUMAP           = 16,
+		}
+	end
+	-- Same backfill for program types (values mirror enum bpf_prog_type)
+	if not S.c.BPF_PROG.TRACEPOINT then
+		S.c.BPF_PROG = strflag {
+			UNSPEC           = 0,
+			SOCKET_FILTER    = 1,
+			KPROBE           = 2,
+			SCHED_CLS        = 3,
+			SCHED_ACT        = 4,
+			TRACEPOINT       = 5,
+			XDP              = 6,
+			PERF_EVENT       = 7,
+			CGROUP_SKB       = 8,
+			CGROUP_SOCK      = 9,
+			LWT_IN           = 10,
+			LWT_OUT          = 11,
+			LWT_XMIT         = 12,
+			SOCK_OPS         = 13,
+			SK_SKB           = 14,
+			CGROUP_DEVICE    = 15,
+			SK_MSG           = 16,
+			RAW_TRACEPOINT   = 17,
+			CGROUP_SOCK_ADDR = 18,
+		}
+	end
+end
+
+-- Compatibility: metatype for stacktrace
+-- Iterator over captured instruction pointers; stops at the first zero
+-- entry or when the end of the fixed-size ip[] array is reached.
+local function stacktrace_iter(t, i)
+	i = i + 1
+	if i < #t and t.ip[i] > 0 then
+		return i, t.ip[i]
+	end
+end
+-- __len reports the ip[] capacity; __ipairs walks only the valid frames
+ffi.metatype('struct bpf_stacktrace', {
+	__len = function (t) return ffi.sizeof(t.ip) / ffi.sizeof(t.ip[0]) end,
+	__ipairs = function (t) return stacktrace_iter, t, -1 end,
+})
+
+-- Reflect the declared C type name of a cdata value.
+-- Returns e.g. 'struct pt_regs *' for cdata, or nil for any non-cdata input.
+function M.typename(v)
+	if v == nil or type(v) ~= 'cdata' then
+		return nil
+	end
+	local ct = tostring(ffi.typeof(v))
+	return ct:match('<([^>]+)')
+end
+
+-- Reflect whether a cdata type can act as a pointer.
+-- Accepts pointer types; array types also count unless `noarray` is set.
+function M.isptr(v, noarray)
+	local tname = M.typename(v)
+	if not tname then return tname end
+	local last = tname:sub(-1)
+	return last == '*' or (not noarray and last == ']')
+end
+
+-- Return true if variable is a non-nil constant that can be used as immediate value
+-- e.g. result of KSHORT and KNUM
+function M.isimmconst(v)
+	local t = type(v.const)
+	if t == 'number' then
+		return not ffi.istype(v.type, ffi.typeof('void'))
+	end
+	-- Lua numbers are at most 52 bits; 64-bit constants arrive as cdata
+	return t == 'cdata' and (ffi.istype(v.type, ffi.typeof('uint64_t'))
+		or ffi.istype(v.type, ffi.typeof('int64_t')))
+end
+
+-- Return the running kernel version packed as 0xXXYYZZ (major/minor/patch).
+function M.osversion()
+	-- We have no better way to extract current kernel hex-string other
+	-- than parsing headers, compiling a helper function or reading /proc
+	-- Anchor the dots with %. so e.g. '4x15y0' cannot match as a version
+	local ver_str, count = S.sysctl('kernel.version'):match('%d+%.%d+%.%d+'), 2
+	if not ver_str then -- kernel.version is freeform, fallback to kernel.osrelease
+		ver_str = S.sysctl('kernel.osrelease'):match('%d+%.%d+%.%d+')
+	end
+	-- Fail with a clear message instead of indexing nil below
+	assert(ver_str, 'cannot parse kernel version from sysctl')
+	local version = 0
+	for i in ver_str:gmatch('%d+') do -- Convert 'X.Y.Z' to 0xXXYYZZ
+		version = bit.bor(version, bit.lshift(tonumber(i), 8*count))
+		count = count - 1
+	end
+	return version
+end
+
+-- Wrap a perf event reader in an interface that decodes read messages.
+-- `event_type` (optional) is a C type declaration string; sample payloads
+-- are cast to a pointer of that type. Returns an object with
+-- block()/next()/read() methods.
+function M.event_reader(reader, event_type)
+	-- Caller can specify event message binary format
+	if event_type then
+		assert(type(event_type) == 'string' and ffi.typeof(event_type), 'not a valid type for event reader')
+		event_type = ffi.typeof(event_type .. '*') -- Convert type to pointer-to-type
+	end
+	-- Wrap reader in interface that can interpret read event messages
+	return setmetatable({reader=reader,type=event_type}, {__index = {
+		-- Block until the reader fd becomes readable
+		block = function(_ --[[self]])
+			return S.select { readfds = {reader.fd} }
+		end,
+		-- Fetch next sample frame, skipping non-SAMPLE records
+		next = function(_ --[[self]], k)
+			local len, ev = reader:next(k)
+			-- Filter out only sample frames
+			while ev and ev.type ~= S.c.PERF_RECORD.SAMPLE do
+				len, ev = reader:next(len)
+			end
+			if ev and event_type then
+				-- The perf event reader returns framed data with header and variable length
+				-- This is going skip the frame header and cast data to given type
+				ev = ffi.cast(event_type, ffi.cast('char *', ev) + ffi.sizeof('struct perf_event_header') + ffi.sizeof('uint32_t'))
+			end
+			return len, ev
+		end,
+		-- Generic-for iterator: for len, ev in rdr:read() do ... end
+		read = function(self)
+			return self.next, self, nil
+		end,
+	}})
+end
+
+-- Build an anonymous C struct declaration describing a tracepoint's record
+-- layout, parsed from the kernel's debugfs format file for tracepoint `tp`.
+function M.tracepoint_type(tp)
+	-- Read tracepoint format string
+	local path = '/sys/kernel/debug/tracing/events/' .. tp .. '/format'
+	local fp = assert(io.open(path, 'r'))
+	local fmt = fp:read('*a')
+	fp:close()
+	-- Collect every 'field:...;' declaration into a struct body
+	local fields = {}
+	for f in fmt:gmatch('field:([^;]+;)') do
+		fields[#fields + 1] = f
+	end
+	return string.format('struct { %s }', table.concat(fields))
+end
+
+return M
diff --git a/src/lua/bpf/elf.lua b/src/lua/bpf/elf.lua
new file mode 100644
index 0000000..533fe2f
--- /dev/null
+++ b/src/lua/bpf/elf.lua
@@ -0,0 +1,261 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- This is a tiny wrapper over libelf to extract load address
+-- and offsets of dynamic symbols
+
+local S = require('syscall')
+local ffi = require('ffi')
+ffi.cdef [[
+/* Type for a 16-bit quantity.  */
+typedef uint16_t Elf32_Half;
+typedef uint16_t Elf64_Half;
+
+/* Types for signed and unsigned 32-bit quantities.  */
+typedef uint32_t Elf32_Word;
+typedef int32_t  Elf32_Sword;
+typedef uint32_t Elf64_Word;
+typedef int32_t  Elf64_Sword;
+
+/* Types for signed and unsigned 64-bit quantities.  */
+typedef uint64_t Elf32_Xword;
+typedef int64_t  Elf32_Sxword;
+typedef uint64_t Elf64_Xword;
+typedef int64_t  Elf64_Sxword;
+
+/* Type of addresses.  */
+typedef uint32_t Elf32_Addr;
+typedef uint64_t Elf64_Addr;
+
+/* Type of file offsets.  */
+typedef uint32_t Elf32_Off;
+typedef uint64_t Elf64_Off;
+
+/* Type for section indices, which are 16-bit quantities.  */
+typedef uint16_t Elf32_Section;
+typedef uint16_t Elf64_Section;
+
+/* Constants */
+struct Elf_Cmd
+{
+  static const int READ              = 1;
+  static const int RDWR              = 2;
+  static const int WRITE             = 3;
+  static const int CLR               = 4;
+  static const int SET               = 5;
+  static const int FDDONE            = 6;
+  static const int FDREAD            = 7;
+  static const int READ_MMAP         = 8;
+  static const int RDWR_MMAP         = 9;
+  static const int WRITE_MMAP        =10;
+  static const int READ_MMAP_PRIVATE =11;
+  static const int EMPTY             =12;
+  static const int NUM               =13;
+};
+
+/* Descriptor for the ELF file.  */
+typedef struct Elf Elf;
+/* Descriptor for ELF file section.  */
+typedef struct Elf_Scn Elf_Scn;
+/* Container type for metatable */
+struct Elf_object { int fd; Elf *elf; };
+/* Program segment header.  */
+typedef struct
+{
+  Elf64_Word    p_type;                 /* Segment type */
+  Elf64_Word    p_flags;                /* Segment flags */
+  Elf64_Off     p_offset;               /* Segment file offset */
+  Elf64_Addr    p_vaddr;                /* Segment virtual address */
+  Elf64_Addr    p_paddr;                /* Segment physical address */
+  Elf64_Xword   p_filesz;               /* Segment size in file */
+  Elf64_Xword   p_memsz;                /* Segment size in memory */
+  Elf64_Xword   p_align;                /* Segment alignment */
+} Elf64_Phdr;
+typedef Elf64_Phdr GElf_Phdr;
+/* Section header.  */
+typedef struct
+{
+  Elf64_Word    sh_name;                /* Section name (string tbl index) */
+  Elf64_Word    sh_type;                /* Section type */
+  Elf64_Xword   sh_flags;               /* Section flags */
+  Elf64_Addr    sh_addr;                /* Section virtual addr at execution */
+  Elf64_Off     sh_offset;              /* Section file offset */
+  Elf64_Xword   sh_size;                /* Section size in bytes */
+  Elf64_Word    sh_link;                /* Link to another section */
+  Elf64_Word    sh_info;                /* Additional section information */
+  Elf64_Xword   sh_addralign;           /* Section alignment */
+  Elf64_Xword   sh_entsize;             /* Entry size if section holds table */
+} Elf64_Shdr;
+typedef Elf64_Shdr GElf_Shdr;
+/* Descriptor for data to be converted to or from memory format.  */
+typedef struct
+{
+  void *d_buf;                  /* Pointer to the actual data.  */
+  int d_type;                   /* Type of this piece of data.  */
+  unsigned int d_version;       /* ELF version.  */
+  size_t d_size;                /* Size in bytes.  */
+  uint64_t d_off;               /* Offset into section.  */
+  size_t d_align;               /* Alignment in section.  */
+} Elf_Data;
+/* Symbol table entry.  */
+typedef struct
+{
+  Elf64_Word    st_name;                /* Symbol name (string tbl index) */
+  unsigned char st_info;                /* Symbol type and binding */
+  unsigned char st_other;               /* Symbol visibility */
+  Elf64_Section st_shndx;               /* Section index */
+  Elf64_Addr    st_value;               /* Symbol value */
+  Elf64_Xword   st_size;                /* Symbol size */
+} Elf64_Sym;
+typedef Elf64_Sym GElf_Sym;
+
+/* Coordinate ELF library and application versions.  */
+unsigned int elf_version (unsigned int __version);
+/* Return descriptor for ELF file to work according to CMD.  */
+Elf *elf_begin (int __fildes, int __cmd, Elf *__ref);
+/* Free resources allocated for ELF.  */
+int elf_end (Elf *__elf);
+/* Get the number of program headers in the ELF file.  If the file uses
+   more headers than can be represented in the e_phnum field of the ELF
+   header the information from the sh_info field in the zeroth section
+   header is used.  */
+int elf_getphdrnum (Elf *__elf, size_t *__dst);
+/* Retrieve program header table entry.  */
+GElf_Phdr *gelf_getphdr (Elf *__elf, int __ndx, GElf_Phdr *__dst);
+/* Retrieve section header.  */
+GElf_Shdr *gelf_getshdr (Elf_Scn *__scn, GElf_Shdr *__dst);
+/* Retrieve symbol information from the symbol table at the given index.  */
+GElf_Sym *gelf_getsym (Elf_Data *__data, int __ndx, GElf_Sym *__dst);
+/* Get section with next section index.  */
+Elf_Scn *elf_nextscn (Elf *__elf, Elf_Scn *__scn);
+/* Get data from section while translating from file representation
+   to memory representation.  */
+Elf_Data *elf_getdata (Elf_Scn *__scn, Elf_Data *__data);
+/* Return pointer to string at OFFSET in section INDEX.  */
+char *elf_strptr (Elf *__elf, size_t __index, size_t __offset);
+]]
+
+local elf = ffi.load('elf')
+-- libelf version constants, program header types and section header types
+-- (numeric values mirror the ELF specification)
+local EV = { NONE=0, CURRENT=1, NUM=2 }
+local PT = { NULL=0, LOAD=1, DYNAMIC=2, INTERP=3, NOTE=4, SHLIB=5, PHDR=6, TLS=7, NUM=8 }
+local SHT = { NULL=0, PROGBITS=1, SYMTAB=2, STRTAB=3, RELA=4, HASH=5, DYNAMIC=6, NOTE=7,
+              NOBITS=8, REL=9, SHLIB=10, DYNSYM=11, INIT_ARRAY=14, FINI_ARRAY=15, PREINIT_ARRAY=16,
+              GROUP=17, SYMTAB_SHNDX=18, NUM=19 }
+local ELF_C = ffi.new('struct Elf_Cmd')
+local M = {}
+
+-- Optional poor man's C++ demangler
+-- Prefer $CPP_DEMANGLER; otherwise search PATH for a c++filt binary.
+local cpp_demangler = os.getenv('CPP_DEMANGLER')
+if not cpp_demangler then
+	for prefix in string.gmatch(os.getenv('PATH'), '[^;:]+') do
+		if S.statfs(prefix..'/c++filt') then
+			cpp_demangler = prefix..'/c++filt'
+			break
+		end
+	end
+end
+-- Identity fallback when no demangler was found
+local cpp_demangle = function (name) return name end
+if cpp_demangler then
+	cpp_demangle = function (name)
+		local cmd = string.format('%s -p %s', cpp_demangler, name)
+		local fp = assert(io.popen(cmd, 'r'))
+		local output = fp:read('*all')
+		fp:close()
+		-- Trim trailing whitespace/newline from the tool output
+		return output:match '^(.-)%s*$'
+	end
+end
+
+-- Metatable for ELF object
+-- Wraps an open libelf handle + fd; closed automatically on GC.
+ffi.metatype('struct Elf_object', {
+	__gc = function (t) t:close() end,
+	__index = {
+		-- Release the libelf handle and underlying fd (idempotent)
+		close = function (t)
+			if t.elf ~= nil then
+				elf.elf_end(t.elf)
+				S.close(t.fd)
+				t.elf = nil
+			end
+		end,
+		-- Load library load address: virtual address of the first PT_LOAD
+		-- program header, or nil when none is found
+		loadaddr = function(t)
+			local phnum = ffi.new('size_t [1]')
+			if elf.elf_getphdrnum(t.elf, phnum) == nil then
+				return nil, 'cannot get phdrnum'
+			end
+			local header = ffi.new('GElf_Phdr [1]')
+			for i = 0, tonumber(phnum[0])-1 do
+				if elf.gelf_getphdr(t.elf, i, header) ~= nil
+				   and header[0].p_type == PT.LOAD then
+				   return header[0].p_vaddr
+				end
+			end
+		end,
+		-- Resolve symbol address: scan SYMTAB/DYNSYM sections for `k`
+		-- (exact name, or Lua pattern when `pattern` is truthy); C++ names
+		-- are demangled before matching. Returns the GElf_Sym on success.
+		resolve = function (t, k, pattern)
+			local section = elf.elf_nextscn(t.elf, nil)
+			while section ~= nil do
+				local header = ffi.new('GElf_Shdr [1]')
+				if elf.gelf_getshdr(section, header) ~= nil then
+					if header[0].sh_type == SHT.SYMTAB or header[0].sh_type == SHT.DYNSYM then
+						local data = elf.elf_getdata(section, nil)
+						while data ~= nil do
+							if data.d_size % header[0].sh_entsize > 0 then
+								return nil, 'bad section header entity size'
+							end
+							local symcount = tonumber(data.d_size / header[0].sh_entsize)
+							local sym = ffi.new('GElf_Sym [1]')
+							for i = 0, symcount - 1 do
+								if elf.gelf_getsym(data, i, sym) ~= nil then
+									local name = elf.elf_strptr(t.elf, header[0].sh_link, sym[0].st_name)
+									if name ~= nil then
+										-- Demangle C++ symbols if necessary
+										name = ffi.string(name)
+										if name:sub(1,2) == '_Z' then
+											name = cpp_demangle(name)
+										end
+										-- Match symbol name against pattern
+										if pattern and string.match(name, k) or k == name then
+											return sym[0]
+										end
+									end
+								end
+							end
+							data = elf.elf_getdata(section, data)
+						end
+					end
+				end
+				section = elf.elf_nextscn(t.elf, section)
+			end
+		end,
+	}
+})
+
+-- Open an ELF object
+-- Returns a garbage-collected 'struct Elf_object' handle on success,
+-- or nil plus an error message on failure.
+function M.open(path)
+	if elf.elf_version(EV.CURRENT) == EV.NONE then
+		return nil, 'bad version'
+	end
+	local fd, err = S.open(path, 'rdonly')
+	if not fd then return nil, err end
+	local pt = ffi.new('Elf *')
+	pt = elf.elf_begin(fd:getfd(), ELF_C.READ, pt)
+	-- NULL cdata pointers are truthy in LuaJIT, so `not pt` never fires;
+	-- a NULL pointer must be detected by comparing against nil
+	if pt == nil then
+		fd:close()
+		return nil, 'cannot open elf object'
+	end
+	return ffi.new('struct Elf_object', fd:nogc():getfd(), pt)
+end
+
+return M
\ No newline at end of file
diff --git a/src/lua/bpf/init.lua b/src/lua/bpf/init.lua
new file mode 100644
index 0000000..1cccbd7
--- /dev/null
+++ b/src/lua/bpf/init.lua
@@ -0,0 +1,16 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Module entry point: delegate to the compiler implementation in bpf/bpf.lua
+return require('bpf.bpf')
diff --git a/src/lua/bpf/ljbytecode.lua b/src/lua/bpf/ljbytecode.lua
new file mode 100644
index 0000000..c4516d0
--- /dev/null
+++ b/src/lua/bpf/ljbytecode.lua
@@ -0,0 +1,74 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local jutil = require("jit.util")
+local vmdef = require("jit.vmdef")
+local bit = require('bit')
+local shr, band = bit.rshift, bit.band
+
+-- Decode LuaJIT 2.0 Bytecode Format
+-- Reference: http://wiki.luajit.org/Bytecode-2.0
+-- Thanks to LJ, we get code in portable bytecode with constants folded, basic
+-- virtual registers allocated etc.
+-- No SSA IR, type inference or advanced optimizations because the code wasn't traced yet.
+local function decode_ins(func, pc)
+	local ins, m = jutil.funcbc(func, pc)
+	if not ins then return nil end
+	local op, ma, mb, mc = band(ins, 0xff), band(m, 7), band(m, 15*8), band(m, 15*128)
+	local a, b, c, d = band(shr(ins, 8), 0xff), nil, nil, shr(ins, 16)
+	if mb ~= 0 then
+		d = band(d, 0xff)
+		b = shr(ins, 24)
+	end
+	if ma == 5 then          -- BCMuv
+	    a = jutil.funcuvname(func, a)
+	end
+	if mc == 13*128 then     -- BCMjump
+		c = pc+d-0x7fff
+	elseif mc == 14*128 then -- BCMcdata
+		c = jutil.funck(func, -d-1)
+	elseif mc == 9*128 then  -- BCMint
+		c = jutil.funck(func, d)
+	elseif mc == 10*128 then -- BCMstr
+		c = jutil.funck(func, -d-1)
+	elseif mc == 5*128 then  -- BCMuv
+	    c = jutil.funcuvname(func, d)
+	end
+	-- Convert version-specific opcode to string
+	op = 6*op
+	op = string.sub(vmdef.bcnames, op+1, op+6):match('[^%s]+')
+	return pc, op, a, b, c, d
+end
+
+-- Decoder closure
+local function decoder(func)
+	local pc = 0
+	return function ()
+		pc = pc + 1
+		return decode_ins(func, pc)
+	end
+end
+
+-- Hexdump generated code
+local function dump(func)
+	return require('jit.bc').dump(func)
+end
+
+return {
+	decode = decode_ins,
+	decoder = decoder,
+	dump = dump,
+	funcinfo = function (...) return jutil.funcinfo(...) end,
+}
\ No newline at end of file
diff --git a/src/lua/bpf/proto.lua b/src/lua/bpf/proto.lua
new file mode 100644
index 0000000..01a201d
--- /dev/null
+++ b/src/lua/bpf/proto.lua
@@ -0,0 +1,542 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require('ffi')
+local BPF = ffi.typeof('struct bpf')
+
+ffi.cdef [[
+struct sk_buff {
+	uint32_t len;
+	uint32_t pkt_type;
+	uint32_t mark;
+	uint32_t queue_mapping;
+	uint32_t protocol;
+	uint32_t vlan_present;
+	uint32_t vlan_tci;
+	uint32_t vlan_proto;
+	uint32_t priority;
+	uint32_t ingress_ifindex;
+	uint32_t ifindex;
+	uint32_t tc_index;
+	uint32_t cb[5];
+	uint32_t hash;
+	uint32_t tc_classid;
+	uint32_t data;
+	uint32_t data_end;
+	uint32_t napi_id;
+
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
+	uint32_t family;
+	uint32_t remote_ip4;	/* Stored in network byte order */
+	uint32_t local_ip4;	/* Stored in network byte order */
+	uint32_t remote_ip6[4];	/* Stored in network byte order */
+	uint32_t local_ip6[4];	/* Stored in network byte order */
+	uint32_t remote_port;	/* Stored in network byte order */
+	uint32_t local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	uint32_t data_meta;
+};
+
+struct net_off_t {
+	uint8_t  ver:4;
+} __attribute__((packed));
+
+struct eth_t {
+	uint8_t  dst[6];
+	uint8_t  src[6];
+	uint16_t type;
+} __attribute__((packed));
+
+struct dot1q_t {
+	uint16_t pri:3;
+	uint16_t cfi:1;
+	uint16_t vlanid:12;
+	uint16_t type;
+} __attribute__((packed));
+
+struct arp_t {
+	uint16_t htype;
+	uint16_t ptype;
+	uint8_t  hlen;
+	uint8_t  plen;
+	uint16_t oper;
+	uint8_t  sha[6];
+	uint32_t spa;
+	uint8_t  tha[6];
+	uint32_t tpa;
+} __attribute__((packed));
+
+struct ip_t {
+	uint8_t  ver:4;
+	uint8_t  hlen:4;
+	uint8_t  tos;
+	uint16_t tlen;
+	uint16_t identification;
+	uint16_t ffo_unused:1;
+	uint16_t df:1;
+	uint16_t mf:1;
+	uint16_t foffset:13;
+	uint8_t  ttl;
+	uint8_t  proto;
+	uint16_t hchecksum;
+	uint32_t src;
+	uint32_t dst;
+} __attribute__((packed));
+
+struct icmp_t {
+	uint8_t  type;
+	uint8_t  code;
+	uint16_t checksum;
+} __attribute__((packed));
+
+struct ip6_t {
+	uint32_t ver:4;
+	uint32_t priority:8;
+	uint32_t flow_label:20;
+	uint16_t payload_len;
+	uint8_t  next_header;
+	uint8_t  hop_limit;
+	uint64_t src_hi;
+	uint64_t src_lo;
+	uint64_t dst_hi;
+	uint64_t dst_lo;
+} __attribute__((packed));
+
+struct ip6_opt_t {
+	uint8_t  next_header;
+	uint8_t  ext_len;
+	uint8_t  pad[6];
+} __attribute__((packed));
+
+struct icmp6_t {
+	uint8_t  type;
+	uint8_t  code;
+	uint16_t checksum;
+} __attribute__((packed));
+
+struct udp_t {
+	uint16_t src_port;
+	uint16_t dst_port;
+	uint16_t length;
+	uint16_t crc;
+} __attribute__((packed));
+
+struct tcp_t {
+	uint16_t src_port;
+	uint16_t dst_port;
+	uint32_t seq_num;
+	uint32_t ack_num;
+	uint8_t  offset:4;
+	uint8_t  reserved:4;
+	uint8_t  flag_cwr:1;
+	uint8_t  flag_ece:1;
+	uint8_t  flag_urg:1;
+	uint8_t  flag_ack:1;
+	uint8_t  flag_psh:1;
+	uint8_t  flag_rst:1;
+	uint8_t  flag_syn:1;
+	uint8_t  flag_fin:1;
+	uint16_t rcv_wnd;
+	uint16_t cksum;
+	uint16_t urg_ptr;
+} __attribute__((packed));
+
+struct vxlan_t {
+	uint32_t rsv1:4;
+	uint32_t iflag:1;
+	uint32_t rsv2:3;
+	uint32_t rsv3:24;
+	uint32_t key:24;
+	uint32_t rsv4:8;
+} __attribute__((packed));
+]]
+
+
+-- Architecture-specific ptrace register layout
+local S = require('syscall')
+local arch = S.abi.arch
+local parm_to_reg = {}
+if arch == 'x64' then
+	ffi.cdef [[
+	struct pt_regs {
+		unsigned long r15;
+		unsigned long r14;
+		unsigned long r13;
+		unsigned long r12;
+		unsigned long bp;
+		unsigned long bx;
+		unsigned long r11;
+		unsigned long r10;
+		unsigned long r9;
+		unsigned long r8;
+		unsigned long ax;
+		unsigned long cx;
+		unsigned long dx;
+		unsigned long si;
+		unsigned long di;
+		unsigned long orig_ax;
+		unsigned long ip;
+		unsigned long cs;
+		unsigned long flags;
+		unsigned long sp;
+		unsigned long ss;
+	};]]
+	parm_to_reg = {parm1='di', parm2='si', parm3='dx', parm4='cx', parm5='r8', ret='sp', fp='bp'}
+else
+	ffi.cdef 'struct pt_regs {};'
+end
+-- Map symbolic registers to architecture ABI
+ffi.metatype('struct pt_regs', {
+		__index = function (_ --[[t]],k)
+			return assert(parm_to_reg[k], 'no such register: '..k)
+		end,
+})
+
+local M = {}
+
+-- Dissector interface
+local function dissector(type, e, dst, src, field)
+	local parent = e.V[src].const
+	-- Create new dissector variable
+	e.vcopy(dst, src)
+	-- Compute and materialize new dissector offset from parent
+	e.V[dst].const = {off=e.V[src].const.off, __dissector=e.V[src].const.__dissector}
+	parent.__dissector[field](e, dst)
+	e.V[dst].const.__dissector = type
+end
+M.dissector = dissector
+
+-- Get current effective offset, load field value at an offset relative to it and
+-- add its value to compute next effective offset (e.g. udp_off = ip_off + pkt[ip_off].hlen)
+local function next_offset(e, var, type, off, mask, shift)
+	local d = e.V[var].const
+	-- Materialize relative offset value in R0
+	local dst_reg, tmp_reg
+	if d.off then
+		dst_reg = e.vreg(var, 0, true)
+		tmp_reg = dst_reg -- Use target register to avoid copy
+		e.emit(BPF.LD + BPF.ABS + e.const_width[ffi.sizeof(type)], tmp_reg, 0, 0, d.off + off or 0)
+	else
+		tmp_reg = e.vreg(e.tmpvar, 0, true, type) -- Reserve R0 for temporary relative offset
+		dst_reg = e.vreg(var) -- Must rematerialize (if it was spilled by tmp var)
+		e.emit(BPF.LD + BPF.IND + e.const_width[ffi.sizeof(type)], tmp_reg, dst_reg, 0, off or 0)
+	end
+	-- Finalize relative offset
+	if mask then
+		e.emit(BPF.ALU + BPF.AND + BPF.K, tmp_reg, 0, 0, mask)
+	end
+	if shift and shift ~= 0 then
+		local op = BPF.LSH
+		if shift < 0 then
+			op = BPF.RSH
+			shift = -shift
+		end
+		e.emit(BPF.ALU + op + BPF.K, tmp_reg, 0, 0, shift)
+	end
+	-- Add to base offset to turn it into effective address
+	if dst_reg ~= tmp_reg then
+		e.emit(BPF.ALU + BPF.ADD + BPF.X, dst_reg, tmp_reg, 0, 0)
+	else
+		e.emit(BPF.ALU + BPF.ADD + BPF.K, dst_reg, 0, 0, d.off)
+	end
+	-- Discard temporary allocations
+	d.off = nil
+	e.V[e.tmpvar].reg = nil
+end
+
+local function next_skip(e, var, off)
+	local d = e.V[var].const
+	if not d.off then
+		local dst_reg = e.vreg(var)
+		e.emit(BPF.ALU64 + BPF.ADD + BPF.K, dst_reg, 0, 0, off)
+	else
+		d.off = d.off + off
+	end
+end
+
+local function skip_eth(e, dst)
+	-- IP starts right after ETH header (fixed size)
+	local d = e.V[dst].const
+	d.off = d.off + ffi.sizeof('struct eth_t')
+end
+
+-- Export types
+M.type = function(typestr, t)
+	t = t or {}
+	t.__dissector=ffi.typeof(typestr)
+	return t
+end
+M.skb     = M.type('struct sk_buff', {source='ptr_to_ctx'})
+M.pt_regs = M.type('struct pt_regs', {source='ptr_to_probe'})
+M.pkt     = M.type('struct eth_t',   {off=0, source='ptr_to_pkt'}) -- skb needs special accessors
+-- M.eth     = function (...) return dissector(ffi.typeof('struct eth_t'), ...) end
+M.dot1q   = function (...) return dissector(ffi.typeof('struct dot1q_t'), ...) end
+M.arp     = function (...) return dissector(ffi.typeof('struct arp_t'), ...) end
+M.icmp    = function (...) return dissector(ffi.typeof('struct icmp_t'), ...) end
+M.ip      = function (...) return dissector(ffi.typeof('struct ip_t'), ...) end
+M.icmp6   = function (...) return dissector(ffi.typeof('struct icmp6_t'), ...) end
+M.ip6     = function (...) return dissector(ffi.typeof('struct ip6_t'), ...) end
+M.ip6_opt = function (...) return dissector(ffi.typeof('struct ip6_opt_t'), ...) end
+M.udp     = function (...) return dissector(ffi.typeof('struct udp_t'), ...) end
+M.tcp     = function (...) return dissector(ffi.typeof('struct tcp_t'), ...) end
+M.vxlan   = function (...) return dissector(ffi.typeof('struct vxlan_t'), ...) end
+M.data    = function (...) return dissector(ffi.typeof('uint8_t'), ...) end
+M.net_off = function (...) return dissector(ffi.typeof('struct net_off_t'), ...) end
+
+-- Metatables
+ffi.metatype(ffi.typeof('struct eth_t'), {
+	__index = {
+		ip = skip_eth,
+		ip6 = skip_eth,
+		net_off = function (e, dst)
+			next_skip(e, dst, BPF.NET_OFF)
+		end,
+	}
+})
+
+ffi.metatype(ffi.typeof('struct net_off_t'), {
+	__index = {
+		ip = function () end,
+		ip6 = function () end,
+	}
+})
+
+ffi.metatype(ffi.typeof('struct ip_t'), {
+	__index = {
+		-- Skip IP header length (stored as number of words)
+		-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
+		-- Mask first nibble and shift by 2 (multiplication by 4)
+		icmp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
+		udp  = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
+		tcp  = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
+	}
+})
+
+ffi.metatype(ffi.typeof('struct ip6_t'), {
+	__index = {
+		-- Skip fixed IPv6 header length (40 bytes)
+		-- The caller must check the value of `next_header` to skip any extension headers
+		icmp6 = function(e, dst) next_skip(e, dst, ffi.sizeof('struct ip6_t'), 0) end,
+		udp  = function(e, dst) next_skip(e, dst, ffi.sizeof('struct ip6_t'), 0) end,
+		tcp  = function(e, dst) next_skip(e, dst, ffi.sizeof('struct ip6_t'), 0) end,
+		ip6_opt = function(e, dst) next_skip(e, dst, ffi.sizeof('struct ip6_t'), 0) end,
+	}
+})
+
+local ip6_opt_ext_len_off = ffi.offsetof('struct ip6_opt_t', 'ext_len')
+ffi.metatype(ffi.typeof('struct ip6_opt_t'), {
+	__index = {
+		-- Skip IPv6 extension header length (field `ext_len`)
+		icmp6 = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off) end,
+		udp  = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off) end,
+		tcp  = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off) end,
+		ip6_opt = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off) end,
+	}
+})
+
+ffi.metatype(ffi.typeof('struct tcp_t'), {
+	__index = {
+		-- Skip TCP header length (stored as number of words)
+		-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
+		data = function(e, dst)
+			next_offset(e, dst, ffi.typeof('uint8_t'), ffi.offsetof('struct tcp_t', 'offset'), 0xf0, -2)
+		end,
+	}
+})
+
+ffi.metatype(ffi.typeof('struct udp_t'), {
+	__index = {
+		-- Skip UDP header length (8 octets)
+		data = function(e, dst)
+			next_skip(e, dst, ffi.sizeof('struct udp_t'))
+		end,
+	}
+})
+
+-- Constants
+M.c = {
+	eth = { -- Constants http://standards.ieee.org/regauth/ethertype
+		ip     = 0x0800, -- IP (v4) protocol
+		ip6    = 0x86dd, -- IP (v6) protocol
+		arp    = 0x0806, -- Address resolution protocol
+		revarp = 0x8035, -- Reverse addr resolution protocol
+		vlan   = 0x8100, -- IEEE 802.1Q VLAN tagging
+	},
+	ip = {
+		-- Reserved Addresses
+		addr_any         = 0x00000000, -- 0.0.0.0
+		addr_broadcast   = 0xffffffff, -- 255.255.255.255
+		addr_loopback    = 0x7f000001, -- 127.0.0.1
+		addr_mcast_all   = 0xe0000001, -- 224.0.0.1
+		addr_mcast_local = 0xe00000ff, -- 224.0.0.255
+		-- Type of service (ip_tos), RFC 1349 ("obsoleted by RFC 2474")
+		tos_default      = 0x00, -- default
+		tos_lowdelay     = 0x10, -- low delay
+		tos_throughput   = 0x08, -- high throughput
+		tos_reliability  = 0x04, -- high reliability
+		tos_lowcost      = 0x02, -- low monetary cost - XXX
+		tos_ect          = 0x02, -- ECN-capable transport
+		tos_ce           = 0x01, -- congestion experienced
+		-- Fragmentation flags (ip_off)
+		rf = 0x8000, -- reserved
+		df = 0x4000, -- don't fragment
+		mf = 0x2000, -- more fragments (not last frag)
+		offmask  = 0x1fff, -- mask for fragment offset
+		-- Time-to-live (ip_ttl), seconds
+		ttl_default = 64,  -- default ttl, RFC 1122, RFC 1340
+		ttl_max     = 255, -- maximum ttl
+		-- Protocol (ip_p) - http://www.iana.org/assignments/protocol-numbers
+		proto_ip      = 0,  -- dummy for IP
+		proto_hopopts = 0,  -- IPv6 hop-by-hop options
+		proto_icmp    = 1,  -- ICMP
+		proto_igmp    = 2,  -- IGMP
+		proto_ggp     = 3,  -- gateway-gateway protocol
+		proto_ipip    = 4,  -- IP in IP
+		proto_st      = 5,  -- ST datagram mode
+		proto_tcp     = 6,  -- TCP
+		proto_cbt     = 7,  -- CBT
+		proto_egp     = 8,  -- exterior gateway protocol
+		proto_igp     = 9,  -- interior gateway protocol
+		proto_bbnrcc  = 10,  -- BBN RCC monitoring
+		proto_nvp     = 11,  -- Network Voice Protocol
+		proto_pup     = 12,  -- PARC universal packet
+		proto_argus   = 13,  -- ARGUS
+		proto_emcon   = 14,  -- EMCON
+		proto_xnet    = 15,  -- Cross Net Debugger
+		proto_chaos   = 16,  -- Chaos
+		proto_udp     = 17,  -- UDP
+		proto_mux     = 18,  -- multiplexing
+		proto_dcnmeas = 19,  -- DCN measurement
+		proto_hmp     = 20,  -- Host Monitoring Protocol
+		proto_prm     = 21,  -- Packet Radio Measurement
+		proto_idp     = 22,  -- Xerox NS IDP
+		proto_trunk1  = 23,  -- Trunk-1
+		proto_trunk2  = 24,  -- Trunk-2
+		proto_leaf1   = 25,  -- Leaf-1
+		proto_leaf2   = 26,  -- Leaf-2
+		proto_rdp     = 27,  -- "Reliable Datagram" proto
+		proto_irtp    = 28,  -- Inet Reliable Transaction
+		proto_tp      = 29,  -- ISO TP class 4
+		proto_netblt  = 30,  -- Bulk Data Transfer
+		proto_mfpnsp  = 31,  -- MFE Network Services
+		proto_meritinp= 32,  -- Merit Internodal Protocol
+		proto_sep     = 33,  -- Sequential Exchange proto
+		proto_3pc     = 34,  -- Third Party Connect proto
+		proto_idpr    = 35,  -- Interdomain Policy Route
+		proto_xtp     = 36,  -- Xpress Transfer Protocol
+		proto_ddp     = 37,  -- Datagram Delivery Proto
+		proto_cmtp    = 38,  -- IDPR Ctrl Message Trans
+		proto_tppp    = 39,  -- TP++ Transport Protocol
+		proto_il      = 40,  -- IL Transport Protocol
+		proto_ip6     = 41,  -- IPv6
+		proto_sdrp    = 42,  -- Source Demand Routing
+		proto_routing = 43,  -- IPv6 routing header
+		proto_fragment= 44,  -- IPv6 fragmentation header
+		proto_rsvp    = 46,  -- Reservation protocol
+		proto_gre     = 47,  -- General Routing Encap
+		proto_mhrp    = 48,  -- Mobile Host Routing
+		proto_ena     = 49,  -- ENA
+		proto_esp     = 50,  -- Encap Security Payload
+		proto_ah      = 51,  -- Authentication Header
+		proto_inlsp   = 52,  -- Integrated Net Layer Sec
+		proto_swipe   = 53,  -- SWIPE
+		proto_narp    = 54,  -- NBMA Address Resolution
+		proto_mobile  = 55,  -- Mobile IP, RFC 2004
+		proto_tlsp    = 56,  -- Transport Layer Security
+		proto_skip    = 57,  -- SKIP
+		proto_icmp6   = 58,  -- ICMP for IPv6
+		proto_none    = 59,  -- IPv6 no next header
+		proto_dstopts = 60,  -- IPv6 destination options
+		proto_anyhost = 61,  -- any host internal proto
+		proto_cftp    = 62,  -- CFTP
+		proto_anynet  = 63,  -- any local network
+		proto_expak   = 64,  -- SATNET and Backroom EXPAK
+		proto_kryptolan = 65,  -- Kryptolan
+		proto_rvd     = 66,  -- MIT Remote Virtual Disk
+		proto_ippc    = 67,  -- Inet Pluribus Packet Core
+		proto_distfs  = 68,  -- any distributed fs
+		proto_satmon  = 69,  -- SATNET Monitoring
+		proto_visa    = 70,  -- VISA Protocol
+		proto_ipcv    = 71,  -- Inet Packet Core Utility
+		proto_cpnx    = 72,  -- Comp Proto Net Executive
+		proto_cphb    = 73,  -- Comp Protocol Heart Beat
+		proto_wsn     = 74,  -- Wang Span Network
+		proto_pvp     = 75,  -- Packet Video Protocol
+		proto_brsatmon= 76,  -- Backroom SATNET Monitor
+		proto_sunnd   = 77,  -- SUN ND Protocol
+		proto_wbmon   = 78,  -- WIDEBAND Monitoring
+		proto_wbexpak = 79,  -- WIDEBAND EXPAK
+		proto_eon     = 80,  -- ISO CNLP
+		proto_vmtp    = 81,  -- Versatile Msg Transport
+		proto_svmtp   = 82,  -- Secure VMTP
+		proto_vines   = 83,  -- VINES
+		proto_ttp     = 84,  -- TTP
+		proto_nsfigp  = 85,  -- NSFNET-IGP
+		proto_dgp     = 86,  -- Dissimilar Gateway Proto
+		proto_tcf     = 87,  -- TCF
+		proto_eigrp   = 88,  -- EIGRP
+		proto_ospf    = 89,  -- Open Shortest Path First
+		proto_spriterpc= 90,  -- Sprite RPC Protocol
+		proto_larp    = 91,  -- Locus Address Resolution
+		proto_mtp     = 92,  -- Multicast Transport Proto
+		proto_ax25    = 93,  -- AX.25 Frames
+		proto_ipipencap= 94,  -- yet-another IP encap
+		proto_micp    = 95,  -- Mobile Internet Ctrl
+		proto_sccsp   = 96,  -- Semaphore Comm Sec Proto
+		proto_etherip = 97,  -- Ethernet in IPv4
+		proto_encap   = 98,  -- encapsulation header
+		proto_anyenc  = 99,  -- private encryption scheme
+		proto_gmtp    = 100,  -- GMTP
+		proto_ifmp    = 101,  -- Ipsilon Flow Mgmt Proto
+		proto_pnni    = 102,  -- PNNI over IP
+		proto_pim     = 103,  -- Protocol Indep Multicast
+		proto_aris    = 104,  -- ARIS
+		proto_scps    = 105,  -- SCPS
+		proto_qnx     = 106,  -- QNX
+		proto_an      = 107,  -- Active Networks
+		proto_ipcomp  = 108,  -- IP Payload Compression
+		proto_snp     = 109,  -- Sitara Networks Protocol
+		proto_compaqpeer= 110,  -- Compaq Peer Protocol
+		proto_ipxip   = 111,  -- IPX in IP
+		proto_vrrp    = 112,  -- Virtual Router Redundancy
+		proto_pgm     = 113,  -- PGM Reliable Transport
+		proto_any0hop = 114,  -- 0-hop protocol
+		proto_l2tp    = 115,  -- Layer 2 Tunneling Proto
+		proto_ddx     = 116,  -- D-II Data Exchange (DDX)
+		proto_iatp    = 117,  -- Interactive Agent Xfer
+		proto_stp     = 118,  -- Schedule Transfer Proto
+		proto_srp     = 119,  -- SpectraLink Radio Proto
+		proto_uti     = 120,  -- UTI
+		proto_smp     = 121,  -- Simple Message Protocol
+		proto_sm      = 122,  -- SM
+		proto_ptp     = 123,  -- Performance Transparency
+		proto_isis    = 124,  -- ISIS over IPv4
+		proto_fire    = 125,  -- FIRE
+		proto_crtp    = 126,  -- Combat Radio Transport
+		proto_crudp   = 127,  -- Combat Radio UDP
+		proto_sscopmce= 128,  -- SSCOPMCE
+		proto_iplt    = 129,  -- IPLT
+		proto_sps     = 130,  -- Secure Packet Shield
+		proto_pipe    = 131,  -- Private IP Encap in IP
+		proto_sctp    = 132,  -- Stream Ctrl Transmission
+		proto_fc      = 133,  -- Fibre Channel
+		proto_rsvpign = 134,  -- RSVP-E2E-IGNORE
+		proto_raw     = 255,  -- Raw IP packets
+		proto_reserved= 255,  -- Reserved
+	},
+}
+
+return M
\ No newline at end of file
diff --git a/src/lua/bpf/spec/README.md b/src/lua/bpf/spec/README.md
new file mode 100644
index 0000000..e19305c
--- /dev/null
+++ b/src/lua/bpf/spec/README.md
@@ -0,0 +1,5 @@
+# Unit test specs
+
+This directory contains spec files for Lua BPF in [Busted] unit test format.
+
+[Busted]: http://olivinelabs.com/busted/
diff --git a/src/lua/bpf/spec/codegen_spec.lua b/src/lua/bpf/spec/codegen_spec.lua
new file mode 100644
index 0000000..05450bd
--- /dev/null
+++ b/src/lua/bpf/spec/codegen_spec.lua
@@ -0,0 +1,1035 @@
+local ffi = require('ffi')
+local S = require('syscall')
+
+-- Normalize whitespace and remove empty lines
+local function normalize_code(c)
+	local res = {}
+	for line in string.gmatch(c,'[^\r\n]+') do
+		local op, d, s, t = line:match('(%S+)%s+(%S+)%s+(%S+)%s*([^-]*)')
+		if op then
+			t = t and t:match('^%s*(.-)%s*$')
+			table.insert(res, string.format('%s\t%s %s %s', op, d, s, t))
+		end
+	end
+	return table.concat(res, '\n')
+end
+
+-- Compile code and check result
+local function compile(t)
+	local bpf = require('bpf')
+	-- require('jit.bc').dump(t.input)
+	local code, err = bpf(t.input)
+	assert.truthy(code)
+	assert.falsy(err)
+	if code then
+		if t.expect then
+			local got = normalize_code(bpf.dump_string(code, 1, true))
+			-- if normalize_code(t.expect) ~= got then print(bpf.dump_string(code, 1)) end
+			assert.same(normalize_code(t.expect), got)
+		end
+	end
+end
+
+-- Make a mock map variable
+local function makemap(type, max_entries, key_ctype, val_ctype)
+	if not key_ctype then key_ctype = ffi.typeof('uint32_t') end
+	if not val_ctype then val_ctype = ffi.typeof('uint32_t') end
+	if not max_entries then max_entries = 4096 end
+	return {
+		__map = true,
+		max_entries = max_entries,
+		key = ffi.new(ffi.typeof('$ [1]', key_ctype)),
+		val = ffi.new(ffi.typeof('$ [1]', val_ctype)),
+		map_type = S.c.BPF_MAP[type],
+		key_type = key_ctype,
+		val_type = val_ctype,
+		fd = 42,
+	}
+end
+
+describe('codegen', function()
+	-- luacheck: ignore 113 211 212 311 511
+
+	describe('constants', function()
+		it('remove dead constant store', function()
+			compile {
+				input = function ()
+					local proto = 5
+				end,
+				expect = [[
+					MOV		R0	#0
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('materialize constant', function()
+			compile {
+				input = function ()
+					return 5
+				end,
+				expect = [[
+					MOV		R0	#5
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('materialize constant longer than i32', function()
+			compile {
+				input = function ()
+					return 4294967295
+				end,
+				expect = [[
+					LDDW	R0	#4294967295
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('materialize cdata constant', function()
+			compile {
+				input = function ()
+					return 5ULL
+				end,
+				expect = [[
+					LDDW	R0	#5 -- composed instruction
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('materialize signed cdata constant', function()
+			compile {
+				input = function ()
+					return 5LL
+				end,
+				expect = [[
+					LDDW	R0	#5 -- composed instruction
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('materialize coercible numeric cdata constant', function()
+			compile {
+				input = function ()
+					return 0x00005
+				end,
+				expect = [[
+					MOV		R0	#5
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('materialize constant through variable', function()
+		compile {
+			input = function ()
+				local proto = 5
+				return proto
+			end,
+			expect = [[
+				MOV		R0	#5
+				EXIT	R0	#0
+			]]
+		}
+		end)
+		it('eliminate constant expressions', function()
+			compile {
+				input = function ()
+					return 2 + 3 - 0
+				end,
+				expect = [[
+					MOV		R0	#5
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('eliminate constant expressions (if block)', function()
+			compile {
+				input = function ()
+					local proto = 5
+					if proto == 5 then
+						proto = 1
+					end
+					return proto
+				end,
+				expect = [[
+					MOV		R0	#1
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('eliminate negative constant expressions (if block) NYI', function()
+			-- always negative condition is not fully eliminated
+			compile {
+				input = function ()
+					local proto = 5
+					if false then
+						proto = 1
+					end
+					return proto
+				end,
+				expect = [[
+					MOV		R7		#5
+					STXDW	[R10-8] R7
+					MOV		R7		#0
+					JEQ		R7		#0 => 0005
+					LDXDW	R0 		[R10-8]
+					EXIT	R0		#0
+				]]
+			}
+		end)
+	end)
+
+	describe('variables', function()
+		it('classic packet access (fold constant offset)', function()
+			compile {
+				input = function (skb)
+					return eth.ip.tos -- constant expression will fold
+				end,
+				expect = [[
+					LDB		R0	skb[15]
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('classic packet access (load non-constant offset)', function()
+			compile {
+				input = function (skb)
+					return eth.ip.udp.src_port -- need to skip variable-length header
+				end,
+				expect = [[
+					LDB		R0			skb[14]
+					AND		R0			#15
+					LSH		R0			#2
+					ADD		R0 			#14
+					STXDW	[R10-16]	R0 -- NYI: erase dead store
+					LDH		R0 			skb[R0+0]
+					END		R0 			R0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('classic packet access (manipulate dissector offset)', function()
+			compile {
+				input = function (skb)
+					local ptr = eth.ip.udp.data + 1
+					return ptr[0] -- dereference dissector pointer
+				end,
+				expect = [[
+					LDB		R0			skb[14]
+					AND		R0			#15
+					LSH		R0			#2
+					ADD		R0			#14 -- NYI: fuse commutative operations in second pass
+					ADD		R0			#8
+					ADD		R0			#1
+					STXDW	[R10-16] 	R0
+					LDB		R0			skb[R0+0]
+					EXIT	R0			#0
+				]]
+			}
+		end)
+		it('classic packet access (multi-byte load)', function()
+			compile {
+				input = function (skb)
+					local ptr = eth.ip.udp.data
+					return ptr(1, 5) -- load 4 bytes
+				end,
+				expect = [[
+					LDB		R0			skb[14]
+					AND		R0			#15
+					LSH		R0			#2
+					ADD		R0			#14
+					ADD		R0			#8
+					MOV		R7			R0
+					STXDW	[R10-16]	R0 -- NYI: erase dead store
+					LDW		R0			skb[R7+1]
+					END		R0			R0
+					EXIT	R0			#0
+				]]
+			}
+		end)
+		it('direct skb field access', function()
+			compile {
+				input = function (skb)
+					return skb.len
+				end,
+				expect = [[
+					LDXW	R7	[R6+0]
+					MOV		R0	R7
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('direct skb data access (manipulate offset)', function()
+			compile {
+				input = function (skb)
+					local ptr = skb.data + 5
+					return ptr[0]
+				end,
+				expect = [[
+					LDXW	R7	[R6+76]
+					ADD		R7	#5
+					LDXB	R8 	[R7+0] -- NYI: transform LD + ADD to LD + offset addressing
+					MOV		R0 	R8
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('direct skb data access (offset boundary check)', function()
+			compile {
+				input = function (skb)
+					local ptr = skb.data + 5
+					if ptr < skb.data_end then
+						return ptr[0]
+					end
+				end,
+				expect = [[
+					LDXW	R7	[R6+76]
+					ADD		R7	#5
+					LDXW	R8	[R6+80]
+					JGE		R7	R8 => 0008
+					LDXB	R8	[R7+0]
+					MOV		R0 	R8
+					EXIT	R0	#0
+					MOV		R0	#0
+					EXIT	R0	#0
+				]]
+			}
+		end)
+		it('access stack memory (array, const load, const store)', function()
+			compile {
+				input = function (skb)
+					local mem = ffi.new('uint8_t [16]')
+					mem[0] = 5
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-40] 	R0
+					STXDW	[R10-48] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					STB		[R10-48] 	#5
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('access stack memory (array, const load, packet store)', function()
+			compile {
+				input = function (skb)
+					local mem = ffi.new('uint8_t [7]')
+					mem[0] = eth.ip.tos
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-40] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					LDB		R0 			skb[15]
+					STXB	[R10-40] 	R0
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('access stack memory (array, packet load, const store)', function()
+			compile {
+				input = function (skb)
+					local mem = ffi.new('uint8_t [1]')
+					mem[eth.ip.tos] = 5
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-48] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					LDB		R0 			skb[15]
+					MOV		R7 			R0
+					ADD		R7 			R10
+					STB		[R7-48] 	#5
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('access stack memory (array, packet load, packet store)', function()
+			compile {
+				input = function (skb)
+					local mem = ffi.new('uint8_t [7]')
+					local v = eth.ip.tos
+					mem[v] = v
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-40] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					LDB		R0 			skb[15]
+					MOV		R7 			R0
+					ADD		R7 			R10
+					STXB	[R7-40] 	R0
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('access stack memory (struct, const/packet store)', function()
+			local kv_t = 'struct { uint64_t a; uint64_t b; }'
+			compile {
+				input = function (skb)
+					local mem = ffi.new(kv_t)
+					mem.a = 5
+					mem.b = eth.ip.tos
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-40] 	R0
+					STXDW	[R10-48] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					MOV		R7 			#5
+					STXDW	[R10-48] 	R7
+					LDB		R0 			skb[15]
+					STXDW	[R10-40] 	R0
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('access stack memory (struct, const/stack store)', function()
+			local kv_t = 'struct { uint64_t a; uint64_t b; }'
+			compile {
+				input = function (skb)
+					local m1 = ffi.new(kv_t)
+					local m2 = ffi.new(kv_t)
+					m1.a = 5
+					m2.b = m1.a
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-48] 	R0
+					STXDW	[R10-56] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					MOV		R0 			#0
+					STXDW	[R10-64] 	R0
+					STXDW	[R10-72] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					MOV		R7 			#5
+					STXDW	[R10-56] 	R7
+					LDXDW	R7 			[R10-56]
+					STXDW	[R10-64] 	R7
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (u32, const key load)', function()
+			local array_map = makemap('array', 256)
+			compile {
+				input = function (skb)
+					return array_map[0]
+				end,
+				expect = [[
+					LDDW	R1			#42
+					STW		[R10-28]	#0
+					MOV		R2			R10
+					ADD		R2			#4294967268
+					CALL	R0			#1 ; map_lookup_elem
+					JEQ		R0			#0 => 0009
+					LDXW	R0			[R0+0]
+					EXIT	R0			#0
+				]]
+			}
+		end)
+		it('array map (u32, packet key load)', function()
+			local array_map = makemap('array', 256)
+			compile {
+				input = function (skb)
+					return array_map[eth.ip.tos]
+				end,
+				expect = [[
+					LDB 	R0 			skb[15]
+					LDDW	R1			#42
+					STXW	[R10-36] 	R0
+					MOV		R2			R10
+					ADD		R2			#4294967260
+					STXDW	[R10-24] 	R0 -- NYI: erase dead store
+					CALL	R0			#1 ; map_lookup_elem
+					JEQ		R0			#0 => 0011
+					LDXW	R0			[R0+0]
+					EXIT	R0			#0
+				]]
+			}
+		end)
+		it('array map (u32, const key store, const value)', function()
+			local array_map = makemap('array', 256)
+			compile {
+				input = function (skb)
+					array_map[0] = 5
+				end,
+				expect = [[
+					LDDW	R1 			#42
+					STW		[R10-36] 	#0
+					MOV		R2 			R10
+					ADD		R2 			#4294967260
+					MOV		R4 			#0
+					STW		[R10-40] 	#5
+					MOV		R3 			R10
+					ADD		R3 			#4294967256
+					CALL	R0 			#2 ; map_update_elem
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (u32, const key store, packet value)', function()
+			local array_map = makemap('array', 256)
+			compile {
+				input = function (skb)
+					array_map[0] = eth.ip.tos
+				end,
+				expect = [[
+					LDB		R0 			skb[15]
+					STXDW	[R10-24] 	R0
+					LDDW	R1 			#42
+					STW		[R10-36] 	#0
+					MOV		R2 			R10
+					ADD		R2 			#4294967260
+					MOV		R4 			#0
+					MOV		R3 			R10
+					ADD		R3 			#4294967272
+					CALL	R0 			#2 ; map_update_elem
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (u32, const key store, map value)', function()
+			local array_map = makemap('array', 256)
+			compile {
+				input = function (skb)
+					array_map[0] = array_map[1]
+				end,
+				expect = [[
+					LDDW	R1 			#42
+					STW		[R10-36] 	#1
+					MOV		R2 			R10
+					ADD		R2 			#4294967260
+					CALL	R0 			#1 ; map_lookup_elem
+					STXDW	[R10-24] 	R0
+					LDDW	R1 			#42
+					STW		[R10-36]	#0
+					MOV		R2			R10
+					ADD		R2			#4294967260
+					MOV		R4			#0
+					LDXDW	R3			[R10-24]
+					JEQ		R3			#0 => 0017
+					LDXW	R3			[R3+0]
+					STXW	[R10-40]	R3
+					MOV		R3 			R10
+					ADD		R3 			#4294967256
+					CALL	R0 			#2 ; map_update_elem
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (u32, const key replace, const value)', function()
+			local array_map = makemap('array', 256)
+			compile {
+				input = function (skb)
+					local val = array_map[0]
+					if val then
+						val[0] = val[0] + 1
+					else
+						array_map[0] = 5
+					end
+				end,
+				expect = [[
+					LDDW	R1 			#42
+					STW		[R10-44] 	#0
+					MOV		R2 			R10
+					ADD		R2 			#4294967252
+					CALL	R0 			#1 ; map_lookup_elem
+					JEQ		R0 			#0 => 0013 -- if (map_value ~= NULL)
+					LDXW	R7 			[R0+0]
+					ADD		R7 			#1
+					STXW	[R0+0] 		R7
+					MOV		R7 			#0
+					JEQ		R7 			#0 => 0025 -- skip false branch
+					STXDW	[R10-16] 	R0
+					LDDW	R1 			#42
+					STW		[R10-44] 	#0
+					MOV		R2 			R10
+					ADD		R2 			#4294967252
+					MOV		R4 			#0
+					STW		[R10-48] 	#5
+					MOV		R3 			R10
+					ADD		R3 			#4294967248
+					CALL	R0 			#2 ; map_update_elem
+					LDXDW	R0 			[R10-16]
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (u32, const key replace xadd, const value)', function()
+			local array_map = makemap('array', 256)
+			compile {
+				input = function (skb)
+					local val = array_map[0]
+					if val then
+						xadd(val, 1)
+					else
+						array_map[0] = 5
+					end
+				end,
+				expect = [[
+					LDDW	R1 			#42
+					STW		[R10-52] 	#0
+					MOV		R2 			R10
+					ADD		R2 			#4294967244
+					CALL	R0 			#1 ; map_lookup_elem
+					JEQ		R0 			#0 => 0014 -- if (map_value ~= NULL)
+					MOV		R7 			#1
+					MOV		R8 			R0
+					STXDW	[R10-16] 	R0
+					XADDW	[R8+0] 		R7
+					MOV		R7 			#0
+					JEQ		R7 			#0 => 0025 -- skip false branch
+					STXDW	[R10-16] 	R0
+					LDDW	R1 			#42
+					STW		[R10-52] 	#0
+					MOV		R2 			R10
+					ADD		R2 			#4294967244
+					MOV		R4 			#0
+					STW		[R10-56] 	#5
+					MOV		R3 			R10
+					ADD		R3 			#4294967240
+					CALL	R0 			#2 ; map_update_elem
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (u32, const key replace xadd, const value) inverse nil check', function()
+			local array_map = makemap('array', 256)
+			compile {
+				input = function (skb)
+					local val = array_map[0]
+					if not val then
+						array_map[0] = 5
+					else
+						xadd(val, 1)
+					end
+				end,
+				expect = [[
+					LDDW	R1 			#42
+					STW		[R10-52] 	#0
+					MOV		R2 			R10
+					ADD		R2 			#4294967244
+					CALL	R0 			#1 ; map_lookup_elem
+					JNE		R0 			#0 => 0021
+					STXDW	[R10-16] 	R0
+					LDDW	R1 			#42
+					STW		[R10-52] 	#0
+					MOV		R2 			R10
+					ADD		R2 			#4294967244
+					MOV		R4 			#0
+					STW		[R10-56] 	#5
+					MOV		R3 			R10
+					ADD		R3 			#4294967240
+					CALL	R0 			#2 ; map_update_elem
+					MOV		R7 			#0
+					JEQ		R7 			#0 => 0025
+					MOV		R7 			#1
+					MOV		R8 			R0
+					STXDW	[R10-16] 	R0
+					XADDW	[R8+0] 		R7
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (struct, stack key load)', function()
+			local kv_t = 'struct { uint64_t a; uint64_t b; }'
+			local array_map = makemap('array', 256, ffi.typeof(kv_t), ffi.typeof(kv_t))
+			compile {
+				input = function (skb)
+					local key = ffi.new(kv_t)
+					key.a = 2
+					key.b = 3
+					local val = array_map[key] -- Use composite key from stack memory
+					if val then
+						return val.a
+					end
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-48] 	R0
+					STXDW	[R10-56] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					MOV		R7 			#2
+					STXDW	[R10-56] 	R7
+					MOV		R7 			#3
+					STXDW	[R10-48] 	R7
+					LDDW	R1			#42
+					MOV		R2			R10
+					ADD		R2			#4294967240
+					CALL	R0			#1 ; map_lookup_elem
+					JEQ		R0 			#0 => 0017
+					LDXDW	R7 			[R0+0]
+					MOV		R0 			R7
+					EXIT	R0 			#0
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (struct, stack key store)', function()
+			local kv_t = 'struct { uint64_t a; uint64_t b; }'
+			local array_map = makemap('array', 256, ffi.typeof(kv_t), ffi.typeof(kv_t))
+			compile {
+				input = function (skb)
+					local key = ffi.new(kv_t)
+					key.a = 2
+					key.b = 3
+					array_map[key] = key -- Use composite key from stack memory
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-40] 	R0
+					STXDW	[R10-48] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					MOV		R7 			#2
+					STXDW	[R10-48] 	R7
+					MOV		R7 			#3
+					STXDW	[R10-40] 	R7
+					LDDW	R1 			#42
+					MOV		R2 			R10
+					ADD		R2 			#4294967248
+					MOV		R4 			#0
+					MOV		R3 			R10
+					ADD		R3 			#4294967248
+					CALL	R0 			#2 ; map_update_elem
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (struct, stack/packet key update, const value)', function()
+			local kv_t = 'struct { uint64_t a; uint64_t b; }'
+			local array_map = makemap('array', 256, ffi.typeof(kv_t), ffi.typeof(kv_t))
+			compile {
+				input = function (skb)
+					local key = ffi.new(kv_t)
+					key.a = eth.ip.tos   -- Load key part from dissector
+					local val = array_map[key]
+					if val then
+						val.a = 5
+					end
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-48] 	R0
+					STXDW	[R10-56] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					LDB		R0 			skb[15]
+					STXDW	[R10-56] 	R0
+					LDDW	R1			#42
+					MOV		R2			R10
+					ADD		R2			#4294967240
+					CALL	R0			#1 ; map_lookup_elem
+					JEQ		R0 			#0 => 0014
+					MOV		R7 			#5
+					STXDW	[R0+0] 		R7
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (struct, stack/packet key update, map value)', function()
+			local kv_t = 'struct { uint64_t a; uint64_t b; }'
+			local array_map = makemap('array', 256, ffi.typeof(kv_t), ffi.typeof(kv_t))
+			compile {
+				input = function (skb)
+					local key = ffi.new(kv_t)
+					key.a = eth.ip.tos   -- Load key part from dissector
+					local val = array_map[key]
+					if val then
+						val.a = val.b
+					end
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-48] 	R0
+					STXDW	[R10-56] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					LDB		R0 			skb[15]
+					STXDW	[R10-56] 	R0
+					LDDW	R1			#42
+					MOV		R2			R10
+					ADD		R2			#4294967240
+					CALL	R0			#1 ; map_lookup_elem
+					JEQ		R0 			#0 => 0014
+					LDXDW	R7 			[R0+8]
+					STXDW	[R0+0] 		R7
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (struct, stack/packet key update, stack value)', function()
+			local kv_t = 'struct { uint64_t a; uint64_t b; }'
+			local array_map = makemap('array', 256, ffi.typeof(kv_t), ffi.typeof(kv_t))
+			compile {
+				input = function (skb)
+					local key = ffi.new(kv_t)
+					key.a = eth.ip.tos   -- Load key part from dissector
+					local val = array_map[key]
+					if val then
+						val.a = key.b
+					end
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-48] 	R0
+					STXDW	[R10-56] 	R0 -- NYI: erase zero-fill on allocation when it's loaded later
+					LDB		R0 			skb[15]
+					STXDW	[R10-56] 	R0
+					LDDW	R1			#42
+					MOV		R2			R10
+					ADD		R2			#4294967240
+					CALL	R0			#1 ; map_lookup_elem
+					JEQ		R0 			#0 => 0014
+					LDXDW	R7 			[R10-48]
+					STXDW	[R0+0] 		R7
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('array map (struct, stack/packet key replace, stack value)', function()
+			local kv_t = 'struct { uint64_t a; uint64_t b; }'
+			local array_map = makemap('array', 256, ffi.typeof(kv_t), ffi.typeof(kv_t))
+			compile {
+				input = function (skb)
+					local key = ffi.new(kv_t)
+					key.a = eth.ip.tos   -- Load key part from dissector
+					local val = array_map[key]
+					if val then
+						val.a = key.b
+					else
+						array_map[key] = key
+					end
+				end,
+				expect = [[
+					MOV		R0 			#0
+					STXDW	[R10-48] 	R0
+					STXDW	[R10-56] 	R0
+					LDB		R0 			skb[15]
+					STXDW	[R10-56] 	R0
+					LDDW	R1 			#42
+					MOV		R2 			R10
+					ADD		R2 			#4294967240
+					CALL	R0 			#1 ; map_lookup_elem
+					JEQ		R0 			#0 => 0016 -- if (map_value ~= NULL)
+					LDXDW	R7 			[R10-48]
+					STXDW	[R0+0] 		R7
+					MOV		R7 			#0
+					JEQ		R7 			#0 => 0026 -- jump over false branch
+					STXDW	[R10-24] 	R0
+					LDDW	R1 			#42
+					MOV		R2 			R10
+					ADD		R2 			#4294967240
+					MOV		R4 			#0
+					MOV		R3 			R10
+					ADD		R3 			#4294967240
+					CALL	R0 			#2 ; map_update_elem
+					LDXDW	R0 			[R10-24]
+					MOV		R0 			#0
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+	end)
+	describe('control flow', function()
+		it('condition with constant return', function()
+			compile {
+				input = function (skb)
+					local v = eth.ip.tos
+					if v then
+						return 1
+					else
+						return 0
+					end
+				end,
+				expect = [[
+					LDB		R0 			skb[15]
+					JEQ		R0 			#0 => 0005
+					MOV		R0 			#1
+					EXIT	R0 			#0
+					MOV		R0 			#0 -- 0005 jump target
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('condition with cdata constant return', function()
+			local cdata = 2ULL
+			compile {
+				input = function (skb)
+					local v = eth.ip.tos
+					if v then
+						return cdata + 1
+					else
+						return 0
+					end
+				end,
+				expect = [[
+					LDB		R0 			skb[15]
+					JEQ		R0 			#0 => 0006
+					LDDW	R0 			#3
+					EXIT	R0 			#0
+					MOV		R0 			#0 -- 0006 jump target
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('condition with constant return (inversed)', function()
+			compile {
+				input = function (skb)
+					local v = eth.ip.tos
+					if not v then
+						return 1
+					else
+						return 0
+					end
+				end,
+				expect = [[
+					LDB		R0 			skb[15]
+					JNE		R0 			#0 => 0005
+					MOV		R0 			#1
+					EXIT	R0 			#0
+					MOV		R0 			#0 -- 0005 jump target
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('condition with variable mutation', function()
+			compile {
+				input = function (skb)
+					local v = 0
+					if eth.ip.tos then
+						v = 1
+					end
+					return v
+				end,
+				expect = [[
+					LDB		R0 			skb[15]
+					MOV		R1 			#0
+					STXDW	[R10-16] 	R1
+					JEQ		R0 			#0 => 0007
+					MOV		R7 			#1
+					STXDW	[R10-16] 	R7
+					LDXDW	R0 			[R10-16]
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('condition with nil variable mutation', function()
+			compile {
+				input = function (skb)
+					local v -- nil, will be elided
+					if eth.ip.tos then
+						v = 1
+					else
+						v = 0
+					end
+					return v
+				end,
+				expect = [[
+					LDB		R0 			skb[15]
+					JEQ		R0 			#0 => 0007
+					MOV		R7 			#1
+					STXDW	[R10-16] 	R7
+					MOV		R7 			#0
+					JEQ		R7 			#0 => 0009
+					MOV		R7 			#0
+					STXDW	[R10-16] 	R7
+					LDXDW	R0 			[R10-16]
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('nested condition with variable mutation', function()
+			compile {
+				input = function (skb)
+					local v = 0
+					local tos = eth.ip.tos
+					if tos then
+						if tos > 5 then
+							v = 5
+						else
+							v = 1
+						end
+					end
+					return v
+				end,
+				expect = [[
+					LDB		R0 			skb[15]
+					MOV		R1 			#0
+					STXDW	[R10-16] 	R1 -- materialize v = 0
+					JEQ		R0 			#0 => 0013 -- if not tos
+					MOV		R7 			#5
+					JGE		R7 			R0 => 0011 -- if 5 > tos
+					MOV		R7 			#5
+					STXDW	[R10-16] 	R7 -- materialize v = 5
+					MOV		R7 			#0
+					JEQ		R7 			#0 => 0013
+					MOV		R7 			#1 -- 0011 jump target
+					STXDW	[R10-16]	R7 -- materialize v = 1
+					LDXDW	R0 			[R10-16]
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('nested condition with variable shadowing', function()
+			compile {
+				input = function (skb)
+					local v = 0
+					local tos = eth.ip.tos
+					if tos then
+						local v = 0 -- luacheck: ignore 231
+						if tos > 5 then
+							v = 5 -- changing shadowing variable
+						end
+					else
+						v = 1
+					end
+					return v
+				end,
+				expect = [[
+					LDB		R0 			skb[15]
+					MOV		R1 			#0
+					STXDW	[R10-16] 	R1 -- materialize v = 0
+					JEQ		R0 			#0 => 0011 -- if not tos
+					MOV		R7 			#5
+					MOV		R1 			#0
+					STXDW	[R10-32] 	R1 -- materialize shadowing variable
+					JGE		R7 			R0 => 0013 -- if 5 > tos
+					MOV		R7 			#0 -- erased 'v = 5' dead store
+					JEQ		R7 			#0 => 0013
+					MOV		R7 			#1 -- 0011 jump target
+					STXDW	[R10-16]	R7 -- materialize v = 1
+					LDXDW	R0 			[R10-16] -- 0013 jump target
+					EXIT	R0 			#0
+				]]
+			}
+		end)
+		it('condition materializes shadowing variable at the end of BB', function()
+			compile {
+				input = function (skb)
+					local v = time()
+					local v1 = 0 -- luacheck: ignore 231
+					if eth.ip.tos then
+						v1 = v
+					end
+				end,
+				expect = [[
+					CALL	R0 			#5 ; ktime_get_ns
+					STXDW	[R10-16] 	R0
+					LDB		R0 			skb[15]
+					MOV		R1 			#0
+					STXDW	[R10-24] 	R1 -- materialize v1 = 0
+					JEQ		R0 			#0 => 0009
+					LDXDW	R7 			[R10-16]
+					STXDW	[R10-24] 	R7 -- v1 = v0
+					MOV		R0 #0
+					EXIT	R0 #0
+				]]
+			}
+		end)
+
+	end)
+end)
diff --git a/src/lua/bpf/spec/compile_spec.lua b/src/lua/bpf/spec/compile_spec.lua
new file mode 100644
index 0000000..823a2e5
--- /dev/null
+++ b/src/lua/bpf/spec/compile_spec.lua
@@ -0,0 +1,23 @@
+describe('compile', function()
+	local ffi = require('ffi')
+	local bpf = require('bpf')
+
+	it('can compile socket filter', function()
+		-- Create mock BPF map
+		local mock_map = {
+			max_entries = 16,
+			key_type = ffi.typeof('uint64_t [1]'),
+			val_type = ffi.typeof('uint64_t [1]'),
+			fd = 1,
+			__map = true,
+		}
+		-- Compile small code example
+		local code = bpf(function ()
+		   local proto = pkt.ip.proto
+		   xadd(mock_map[proto], 1)
+		end)
+		assert.truthy(code)
+		assert.same(type(code), 'table')
+		assert.same(code.pc, 15)
+	end)
+end)
diff --git a/src/lua/bpf/spec/decoder_spec.lua b/src/lua/bpf/spec/decoder_spec.lua
new file mode 100644
index 0000000..a175879
--- /dev/null
+++ b/src/lua/bpf/spec/decoder_spec.lua
@@ -0,0 +1,31 @@
+describe('decoder', function()
+
+	-- Decode simple function
+	local bytecode = require('bpf.ljbytecode')
+	local f = function (x) return x + 1 end
+
+	it('should decode functions', function()
+		-- Make sure it calls LJ decoder
+		local bc = bytecode.decoder(f)
+		assert.truthy(bc)
+	-- Decode bytecode to instructions
+		local jutil = require("jit.util")
+		spy.on(jutil, 'funcbc')
+		local pc, op = bc()
+		-- Check bytecode for sanity (starts with ADDVN(x, 1))
+		assert.equal(pc, 1)
+		assert.equal(op, 'ADDVN')
+		for pc, op in bc do
+			assert.truthy(pc and op)
+		end
+		assert.spy(jutil.funcbc).was.called()
+	end)
+	it('should fail on bad input', function()
+		assert.has_error(function() bytecode.decoder(nil)() end)
+		assert.has_error(function() bytecode.decoder(5)() end)
+		assert.has_error(function() bytecode.decoder('test')() end)
+	end)
+	it('should dump bytecode', function()
+		bytecode.dump(f)
+	end)
+end)
diff --git a/src/lua/bpf/spec/elf_spec.lua b/src/lua/bpf/spec/elf_spec.lua
new file mode 100644
index 0000000..0be050d
--- /dev/null
+++ b/src/lua/bpf/spec/elf_spec.lua
@@ -0,0 +1,24 @@
+describe('elf reader', function()
+
+	local ok, elf = pcall(require, 'bpf.elf')
+	if not ok then return end
+
+	it('should handle C library', function()
+		-- Open the system shell binary (an ELF executable)
+		local sh = elf.open('/bin/sh')
+		assert.truthy(sh)
+		-- Find load address
+		local base = sh:loadaddr()
+		assert.truthy(base)
+		-- Find something from ISO C
+		local malloc_addr = sh:resolve('malloc')
+		assert.truthy(malloc_addr)
+		-- Find something that doesn't exist
+		local bad_addr = sh:resolve('thisnotexists')
+		assert.falsy(bad_addr)
+	end)
+	it('should fail on bad input', function()
+		assert.falsy(elf.open(nil))
+		assert.falsy(elf.open('/tmp'):loadaddr())
+	end)
+end)
diff --git a/src/lua/bpf/spec/helper.lua b/src/lua/bpf/spec/helper.lua
new file mode 100644
index 0000000..63ee0b1
--- /dev/null
+++ b/src/lua/bpf/spec/helper.lua
@@ -0,0 +1,35 @@
+local ffi = require('ffi')
+
+-- Define basic ctypes
+ffi.cdef [[
+	struct bpf_insn {
+	  uint8_t code;   /* opcode */
+	  uint8_t dst_reg:4;  /* dest register */
+	  uint8_t src_reg:4;  /* source register */
+	  uint16_t off;   /* signed offset */
+	  uint32_t imm;   /* signed immediate constant */
+	};
+]]
+
+-- Inject mock ljsyscall for tests
+package.loaded['syscall'] = {
+	bpf = function() error('mock') end,
+	c = { BPF_MAP = {}, BPF_PROG = {} },
+	abi = { arch = 'x64' },
+}
+
+package.loaded['syscall.helpers'] = {
+	strflag = function (tab)
+		local function flag(cache, str)
+			if type(str) ~= "string" then return str end
+			if #str == 0 then return 0 end
+			local s = str:upper()
+			if #s == 0 then return 0 end
+			local val = rawget(tab, s)
+			if not val then return nil end
+			cache[str] = val
+			return val
+		end
+		return setmetatable(tab, {__index = setmetatable({}, {__index = flag}), __call = function(t, a) return t[a] end})
+	end
+}
\ No newline at end of file
diff --git a/src/lua/squishy b/src/lua/squishy
new file mode 100644
index 0000000..a642005
--- /dev/null
+++ b/src/lua/squishy
@@ -0,0 +1,25 @@
+Module "bcc.vendor.argparse" "bcc/vendor/argparse.lua"
+Module "bcc.vendor.posix" "bcc/vendor/posix.lua"
+Module "bcc.vendor.middleclass" "bcc/vendor/middleclass.lua"
+Module "bcc.vendor.json" "bcc/vendor/json.lua"
+Module "bcc.vendor.helpers" "bcc/vendor/helpers.lua"
+
+Module "bcc.init" "bcc/init.lua"
+Module "bcc.run" "bcc/run.lua"
+Module "bcc.bpf" "bcc/bpf.lua"
+Module "bcc.sym" "bcc/sym.lua"
+Module "bcc.libbcc" "bcc/libbcc.lua"
+Module "bcc.tracerpipe" "bcc/tracerpipe.lua"
+Module "bcc.table" "bcc/table.lua"
+Module "bcc.usdt" "bcc/usdt.lua"
+
+Module "bpf" "bpf/init.lua"
+Module "bpf.bpf" "bpf/bpf.lua"
+Module "bpf.builtins" "bpf/builtins.lua"
+Module "bpf.cdef" "bpf/cdef.lua"
+Module "bpf.elf" "bpf/elf.lua"
+Module "bpf.ljbytecode" "bpf/ljbytecode.lua"
+Module "bpf.proto" "bpf/proto.lua"
+
+Main "bcc/run.lua"
+Output "bcc.lua"
diff --git a/src/lua/src/main.c b/src/lua/src/main.c
new file mode 100644
index 0000000..bec3767
--- /dev/null
+++ b/src/lua/src/main.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright 2016 GitHub, Inc
+ *
+ * Based on lua.c, the Lua C Interpreter
+ * Copyright (C) 1994-2012 Lua.org, PUC-Rio.  All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "lauxlib.h"
+#include "lua.h"
+#include "lualib.h"
+
+static lua_State *globalL = NULL;
+static const char *progname = NULL;
+
+static void lstop(lua_State *L, lua_Debug *ar) {
+  (void)ar; /* unused arg. */
+  lua_sethook(L, NULL, 0, 0);
+  luaL_error(L, "interrupted!");
+}
+
+static void laction(int i) {
+  signal(i, SIG_DFL);
+  lua_sethook(globalL, lstop, LUA_MASKCALL | LUA_MASKRET | LUA_MASKCOUNT, 1);
+}
+
+static void l_message(const char *pname, const char *msg) {
+  if (pname)
+    fprintf(stderr, "%s: ", pname);
+  fprintf(stderr, "%s\n", msg);
+  fflush(stderr);
+}
+
+static int report(lua_State *L, int status) {
+  if (status && !lua_isnil(L, -1)) {
+    const char *msg = lua_tostring(L, -1);
+    if (msg == NULL)
+      msg = "(error object is not a string)";
+    l_message(progname, msg);
+    lua_pop(L, 1);
+  }
+  return status;
+}
+
+static int traceback(lua_State *L) {
+  if (!lua_isstring(L, 1)) /* 'message' not a string? */
+    return 1;              /* keep it intact */
+  lua_getglobal(L, "debug");
+  if (!lua_istable(L, -1)) {
+    lua_pop(L, 1);
+    return 1;
+  }
+  lua_getfield(L, -1, "traceback");
+  if (!lua_isfunction(L, -1)) {
+    lua_pop(L, 2);
+    return 1;
+  }
+  lua_pushvalue(L, 1);   /* pass error message */
+  lua_pushinteger(L, 2); /* skip this function and traceback */
+  lua_call(L, 2, 1);     /* call debug.traceback */
+  return 1;
+}
+
+static int docall(lua_State *L, int narg, int clear) {
+  int status;
+  int base = lua_gettop(L) - narg; /* function index */
+  lua_pushcfunction(L, traceback); /* push traceback function */
+  lua_insert(L, base);             /* put it under chunk and args */
+  signal(SIGINT, laction);
+  status = lua_pcall(L, narg, (clear ? 0 : LUA_MULTRET), base);
+  signal(SIGINT, SIG_DFL);
+  lua_remove(L, base); /* remove traceback function */
+  /* force a complete garbage collection in case of errors */
+  if (status != 0)
+    lua_gc(L, LUA_GCCOLLECT, 0);
+  return status;
+}
+
+static int dolibrary(lua_State *L, const char *name, int clear) {
+  lua_getglobal(L, "require");
+  lua_pushstring(L, name);
+  return report(L, docall(L, 1, clear));
+}
+
+struct Smain {
+  int argc;
+  char **argv;
+  int status;
+};
+
+static void pushargv(lua_State *L, char **argv, int argc, int offset) {
+  int i, j;
+  lua_createtable(L, argc, 0);
+  for (i = offset, j = 1; i < argc; i++, j++) {
+    lua_pushstring(L, argv[i]);
+    lua_rawseti(L, -2, j);
+  }
+}
+
+static int pmain(lua_State *L) {
+  struct Smain *s = (struct Smain *)lua_touserdata(L, 1);
+  globalL = L;
+
+  lua_gc(L, LUA_GCSTOP, 0);
+  luaL_openlibs(L);
+  lua_gc(L, LUA_GCRESTART, 0);
+
+  s->status = dolibrary(L, "bcc", 0);
+  if (s->status)
+    return 0;
+
+  lua_pushstring(L, progname);
+  lua_setglobal(L, "BCC_STANDALONE");
+
+  pushargv(L, s->argv, s->argc, 1);
+  lua_setglobal(L, "arg");
+
+  s->status = report(L, docall(L, 0, 1));
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int status;
+  struct Smain s;
+  lua_State *L = lua_open(); /* create state */
+
+  if (L == NULL) {
+    l_message(argv[0], "cannot create state: not enough memory");
+    return EXIT_FAILURE;
+  }
+
+  if (geteuid() != 0) {
+    l_message(argv[0], "bcc-lua must be ran as root");
+    return EXIT_FAILURE;
+  }
+
+  progname = argv[0];
+  s.argc = argc;
+  s.argv = argv;
+  s.status = 0;
+
+  status = lua_cpcall(L, &pmain, &s);
+  report(L, status);
+  lua_close(L);
+
+  return (status || s.status) ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/src/lua/src/squish.lua b/src/lua/src/squish.lua
new file mode 100755
index 0000000..132fb1a
--- /dev/null
+++ b/src/lua/src/squish.lua
@@ -0,0 +1,314 @@
+#!/usr/bin/env lua
+
+local short_opts = { v = "verbose", vv = "very_verbose", o = "output", q = "quiet", qq = "very_quiet", g = "debug" }
+local opts = { use_http = false };
+
+for _, opt in ipairs(arg) do
+	if opt:match("^%-") then
+		local name = opt:match("^%-%-?([^%s=]+)()")
+		name = (short_opts[name] or name):gsub("%-+", "_");
+		if name:match("^no_") then
+			name = name:sub(4, -1);
+			opts[name] = false;
+		else
+			opts[name] = opt:match("=(.*)$") or true;
+		end
+	else
+		base_path = opt;
+	end
+end
+
+if opts.very_verbose then opts.verbose = true; end
+if opts.very_quiet then opts.quiet = true; end
+
+local noprint = function () end
+local print_err, print_info, print_verbose, print_debug = noprint, noprint, noprint, noprint;
+
+if not opts.very_quiet then print_err = print; end
+if not opts.quiet then print_info = print; end
+if opts.verbose or opts.very_verbose then print_verbose = print; end
+if opts.very_verbose then print_debug = print; end
+
+print = print_verbose;
+
+local modules, main_files, resources = {}, {}, {};
+
+--  Functions to be called from squishy file  --
+
+function Module(name)
+	if modules[name] then
+		print_verbose("Ignoring duplicate module definition for "..name);
+		return function () end
+	end
+	local i = #modules+1;
+	modules[i] = { name = name, url = ___fetch_url };
+	modules[name] = modules[i];
+	return function (path)
+		modules[i].path = path;
+	end
+end
+
+function Resource(name, path)
+	local i = #resources+1;
+	resources[i] = { name = name, path = path or name };
+	return function (path)
+		resources[i].path = path;
+	end
+end
+
+function AutoFetchURL(url)
+	___fetch_url = url;
+end
+
+function Main(fn)
+	table.insert(main_files, fn);
+end
+
+function Output(fn)
+	if opts.output == nil then
+		out_fn = fn;
+	end
+end
+
+function Option(name)
+	name = name:gsub("%-", "_");
+	if opts[name] == nil then
+		opts[name] = true;
+		return function (value)
+			opts[name] = value;
+		end
+	else
+		return function () end;
+	end
+end
+
+function GetOption(name)
+	return opts[name:gsub('%-', '_')];
+end
+
+function Message(message)
+	if not opts.quiet then
+		print_info(message);
+	end
+end
+
+function Error(message)
+	if not opts.very_quiet then
+		print_err(message);
+	end
+end
+
+function Exit()
+	os.exit(1);
+end
+-- -- -- -- -- -- -- --- -- -- -- -- -- -- -- --
+
+base_path = (base_path or "."):gsub("/$", "").."/"
+squishy_file = base_path .. "squishy";
+out_fn = opts.output;
+
+local ok, err = pcall(dofile, squishy_file);
+
+if not ok then
+	print_err("Couldn't read squishy file: "..err);
+	os.exit(1);
+end
+
+if not out_fn then
+	print_err("No output file specified by user or squishy file");
+	os.exit(1);
+elseif #main_files == 0 and #modules == 0 and #resources == 0 then
+	print_err("No files, modules or resources. Not going to generate an empty file.");
+	os.exit(1);
+end
+
+local fetch = {};
+function fetch.filesystem(path)
+	local f, err = io.open(path);
+	if not f then return false, err; end
+
+	local data = f:read("*a");
+	f:close();
+
+	return data;
+end
+
+if opts.use_http then
+	function fetch.http(url)
+		local http = require "socket.http";
+
+		local body, status = http.request(url);
+		if status == 200 then
+			return body;
+		end
+		return false, "HTTP status code: "..tostring(status);
+	end
+else
+	function fetch.http(url)
+		return false, "Module not found. Re-squish with --use-http option to fetch it from "..url;
+	end
+end
+
+print_info("Writing "..out_fn.."...");
+local f, err = io.open(out_fn, "w+");
+if not f then
+	print_err("Couldn't open output file: "..tostring(err));
+	os.exit(1);
+end
+
+if opts.executable then
+	if opts.executable == true then
+		f:write("#!/usr/bin/env lua\n");
+	else
+		f:write(opts.executable, "\n");
+	end
+end
+
+if opts.debug then
+	f:write(require_resource("squish.debug"));
+end
+
+print_verbose("Resolving modules...");
+do
+	local LUA_DIRSEP = package.config:sub(1,1);
+	local LUA_PATH_MARK = package.config:sub(5,5);
+
+	local package_path = package.path:gsub("[^;]+", function (path)
+			if not path:match("^%"..LUA_DIRSEP) then
+				return base_path..path;
+			end
+		end):gsub("/%./", "/");
+	local package_cpath = package.cpath:gsub("[^;]+", function (path)
+			if not path:match("^%"..LUA_DIRSEP) then
+				return base_path..path;
+			end
+		end):gsub("/%./", "/");
+
+	function resolve_module(name, path)
+	        name = name:gsub("%.", LUA_DIRSEP);
+	        for c in path:gmatch("[^;]+") do
+	                c = c:gsub("%"..LUA_PATH_MARK, name);
+	                print_debug("Looking for "..c)
+	                local f = io.open(c);
+	                if f then
+				print_debug("Found!");
+	                        f:close();
+                        return c;
+			end
+		end
+		return nil; -- not found
+	end
+
+	for i, module in ipairs(modules) do
+		if not module.path then
+			module.path = resolve_module(module.name, package_path);
+			if not module.path then
+				print_err("Couldn't resolve module: "..module.name);
+			else
+				-- Strip base_path from resolved path
+				module.path = module.path:gsub("^"..base_path:gsub("%p", "%%%1"), "");
+			end
+		end
+	end
+end
+
+
+print_verbose("Packing modules...");
+for _, module in ipairs(modules) do
+	local modulename, path = module.name, module.path;
+	if module.path:sub(1,1) ~= "/" then
+		path = base_path..module.path;
+	end
+	print_debug("Packing "..modulename.." ("..path..")...");
+	local data, err = fetch.filesystem(path);
+	if (not data) and module.url then
+		print_debug("Fetching: ".. module.url:gsub("%?", module.path))
+		data, err = fetch.http(module.url:gsub("%?", module.path));
+	end
+	if data then
+		f:write("package.preload['", modulename, "'] = (function (...)\n");
+		f:write(data);
+		f:write(" end)\n");
+		if opts.debug then
+			f:write(string.format("package.preload[%q] = ___adjust_chunk(package.preload[%q], %q);\n\n",
+				modulename, modulename, "@"..path));
+		end
+	else
+		print_err("Couldn't pack module '"..modulename.."': "..(err or "unknown error... path to module file correct?"));
+		os.exit(1);
+	end
+end
+
+if #resources > 0 then
+	print_verbose("Packing resources...")
+	f:write("do local resources = {};\n");
+	for _, resource in ipairs(resources) do
+		local name, path = resource.name, resource.path;
+		local res_file, err = io.open(base_path..path, "rb");
+		if not res_file then
+			print_err("Couldn't load resource: "..tostring(err));
+			os.exit(1);
+		end
+		local data = res_file:read("*a");
+		local maxequals = 0;
+		data:gsub("(=+)", function (equals_string) maxequals = math.max(maxequals, #equals_string); end);
+
+		f:write(("resources[%q] = %q"):format(name, data));
+--[[		f:write(("resources[%q] = ["):format(name), string.rep("=", maxequals+1), "[");
+		f:write(data);
+		f:write("]", string.rep("=", maxequals+1), "];"); ]]
+	end
+	if opts.virtual_io then
+		local vio = require_resource("vio");
+		if not vio then
+			print_err("Virtual IO requested but is not enabled in this build of squish");
+		else
+			-- Insert vio library
+			f:write(vio, "\n")
+			-- Override standard functions to use vio if opening a resource
+			f:write[[local io_open, io_lines = io.open, io.lines; function io.open(fn, mode)
+					if not resources[fn] then
+						return io_open(fn, mode);
+					else
+						return vio.open(resources[fn]);
+				end end
+				function io.lines(fn)
+					if not resources[fn] then
+						return io_lines(fn);
+					else
+						return vio.open(resources[fn]):lines()
+				end end
+				local _dofile = dofile;
+				function dofile(fn)
+					if not resources[fn] then
+						return _dofile(fn);
+					else
+						return assert(loadstring(resources[fn]))();
+				end end
+				local _loadfile = loadfile;
+				function loadfile(fn)
+					if not resources[fn] then
+						return _loadfile(fn);
+					else
+						return loadstring(resources[fn], "@"..fn);
+				end end ]]
+		end
+	end
+	f:write[[function require_resource(name) return resources[name] or error("resource '"..tostring(name).."' not found"); end end ]]
+end
+
+print_debug("Finalising...")
+for _, fn in pairs(main_files) do
+	local fin, err = io.open(base_path..fn);
+	if not fin then
+		print_err("Failed to open "..fn..": "..err);
+		os.exit(1);
+	else
+		f:write((fin:read("*a"):gsub("^#.-\n", "")));
+		fin:close();
+	end
+end
+
+f:close();
+
+print_info("OK!");
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
new file mode 100644
index 0000000..7ce8366
--- /dev/null
+++ b/src/python/CMakeLists.txt
@@ -0,0 +1,42 @@
# Copyright (c) PLUMgrid, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")

# Pure-python modules of the bcc package, staged into the build tree so that
# setup.py can build an sdist from a single directory.
set(PYTHON_SRC __init__.py perf.py tcp.py utils.py libbcc.py table.py usdt.py)

foreach(PY_SRC ${PYTHON_SRC})
  configure_file(bcc/${PY_SRC} ${CMAKE_CURRENT_BINARY_DIR}/bcc/${PY_SRC} COPYONLY)
endforeach()

# PYTHON_CMD may be a ;-separated list of interpreters (e.g. "python3;python2");
# fall back to the system "python" when the caller did not specify one.
if(NOT PYTHON_CMD)
  set(PYTHON_CMD "python")
endif()

configure_file(setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py @ONLY)
configure_file(bcc/version.py.in ${CMAKE_CURRENT_BINARY_DIR}/bcc/version.py @ONLY)
if(EXISTS "/etc/debian_version")
  # Debian derivatives require distutils' "deb" install layout (dist-packages).
  set(PYTHON_FLAGS "${PYTHON_FLAGS} --install-layout deb")
endif()

# Build (and install) one pip-installable sdist per requested interpreter.
foreach(PY_CMD ${PYTHON_CMD})
  # The interpreter may be given as an absolute path; quote the expansion and
  # sanitize slashes so it can be embedded in target/directory names.
  string(REPLACE "/" "-" PY_CMD_ESCAPED "${PY_CMD}")

  set(PY_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/bcc/__init__.py ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
  # Chain each sdist build on the previous one: all interpreters share the same
  # working directory, so the builds must not run concurrently.
  if(PREVIOUS_PY)
    list(APPEND PY_DEPENDS ${PREVIOUS_PY})
  endif()

  set(PIP_INSTALLABLE "${CMAKE_CURRENT_BINARY_DIR}/dist-${PY_CMD_ESCAPED}/bcc-${REVISION}.tar.gz")
  # Build the pip installable. VERBATIM added so argument escaping is
  # platform-independent.
  add_custom_command(OUTPUT ${PIP_INSTALLABLE}
    COMMAND ${PY_CMD} setup.py sdist --dist-dir dist-${PY_CMD_ESCAPED}
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
    DEPENDS ${PY_DEPENDS}
    VERBATIM
    )
  add_custom_target(bcc_py_${PY_CMD_ESCAPED} ALL DEPENDS ${PIP_INSTALLABLE})

  # NOTE: PYTHON_FLAGS is expanded unquoted on purpose so that each flag
  # becomes a separate argument to setup.py.
  install(CODE "execute_process(COMMAND ${PY_CMD} setup.py install -f ${PYTHON_FLAGS}
    --prefix=${CMAKE_INSTALL_PREFIX} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})"
    COMPONENT python)

  set(PREVIOUS_PY ${PIP_INSTALLABLE})
endforeach()
diff --git a/src/python/MANIFEST b/src/python/MANIFEST
new file mode 100644
index 0000000..f6e1add
--- /dev/null
+++ b/src/python/MANIFEST
@@ -0,0 +1,5 @@
+# file GENERATED by distutils, do NOT edit
+setup.py
+bcc/__init__.py
+bcc/table.py
+bcc/libbcc.py
diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py
new file mode 100644
index 0000000..1dfd830
--- /dev/null
+++ b/src/python/bcc/__init__.py
@@ -0,0 +1,1269 @@
+# Copyright 2015 PLUMgrid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import atexit
+import ctypes as ct
+import fcntl
+import json
+import os
+import re
+import struct
+import errno
+import sys
+basestring = (unicode if sys.version_info[0] < 3 else str)
+
+from .libbcc import lib, bcc_symbol, bcc_symbol_option, _SYM_CB_TYPE
+from .table import Table, PerfEventArray
+from .perf import Perf
+from .utils import get_online_cpus, printb, _assert_is_bytes, ArgString
+from .version import __version__
+
# Process-wide probe quota shared by every BPF instance in this interpreter.
_probe_limit = 1000
# Number of probes currently attached process-wide; maintained by the
# _add_*/_del_* fd helpers on BPF.
_num_open_probes = 0
+
+# for tests
def _get_num_open_probes():
    """Return the process-wide count of open probes (used by the tests).

    Note: `global` is unnecessary for a read-only access, so it is omitted.
    """
    return _num_open_probes
+
# Mount point of the kernel trace filesystem (tracefs, via debugfs).
TRACEFS = "/sys/kernel/debug/tracing"

# Debug flags (bitmask values OR'd into BPF(debug=...))

# Debug output compiled LLVM IR.
DEBUG_LLVM_IR = 0x1
# Debug output loaded BPF bytecode and register state on branches.
DEBUG_BPF = 0x2
# Debug output pre-processor result.
DEBUG_PREPROCESSOR = 0x4
# Debug output ASM instructions embedded with source.
DEBUG_SOURCE = 0x8
# Debug output register state on all instructions in addition to DEBUG_BPF.
DEBUG_BPF_REGISTER_STATE = 0x10
+
class SymbolCache(object):
    """Thin wrapper around libbcc's per-process symbol cache.

    pid == -1 addresses the kernel symbol table (see callers in BPF below).
    """
    def __init__(self, pid):
        # A NULL bcc_symbol_option pointer selects libbcc's default options.
        self.cache = lib.bcc_symcache_new(
                pid, ct.cast(None, ct.POINTER(bcc_symbol_option)))

    def resolve(self, addr, demangle):
        """
        Return a tuple of the symbol (function), its offset from the beginning
        of the function, and the module in which it lies. For example:
            ("start_thread", 0x202, "/usr/lib/.../libpthread-2.24.so")
        If the symbol cannot be found but we know which module it is in,
        return the module name and the offset from the beginning of the
        module. If we don't even know the module, return the absolute
        address as the offset.
        """
        sym = bcc_symbol()
        if demangle:
            res = lib.bcc_symcache_resolve(self.cache, addr, ct.byref(sym))
        else:
            res = lib.bcc_symcache_resolve_no_demangle(self.cache, addr,
                                                       ct.byref(sym))
        if res < 0:
            # Partial result: module known but symbol not resolved.
            if sym.module and sym.offset:
                return (None, sym.offset,
                        ct.cast(sym.module, ct.c_char_p).value)
            return (None, addr, None)
        if demangle:
            name_res = sym.demangle_name
            # The demangled name is malloc'd by libbcc; free it once copied.
            lib.bcc_symbol_free_demangle_name(ct.byref(sym))
        else:
            name_res = sym.name
        return (name_res, sym.offset, ct.cast(sym.module, ct.c_char_p).value)

    def resolve_name(self, module, name):
        """Resolve a symbol name (optionally within `module`) to an address;
        returns -1 when the symbol is unknown."""
        module = _assert_is_bytes(module)
        name = _assert_is_bytes(name)
        addr = ct.c_ulonglong()
        if lib.bcc_symcache_resolve_name(self.cache, module, name,
                ct.byref(addr)) < 0:
            return -1
        return addr.value
+
class PerfType:
    """Mirror of perf_type_id values (uapi/linux/perf_event.h)."""
    # From perf_type_id in uapi/linux/perf_event.h
    HARDWARE = 0
    SOFTWARE = 1

class PerfHWConfig:
    """Mirror of perf_hw_id values (uapi/linux/perf_event.h)."""
    # From perf_hw_id in uapi/linux/perf_event.h
    CPU_CYCLES = 0
    INSTRUCTIONS = 1
    CACHE_REFERENCES = 2
    CACHE_MISSES = 3
    BRANCH_INSTRUCTIONS = 4
    BRANCH_MISSES = 5
    BUS_CYCLES = 6
    STALLED_CYCLES_FRONTEND = 7
    STALLED_CYCLES_BACKEND = 8
    REF_CPU_CYCLES = 9

class PerfSWConfig:
    """Mirror of perf_sw_id values (uapi/linux/perf_event.h)."""
    # From perf_sw_id in uapi/linux/perf_event.h
    CPU_CLOCK = 0
    TASK_CLOCK = 1
    PAGE_FAULTS = 2
    CONTEXT_SWITCHES = 3
    CPU_MIGRATIONS = 4
    PAGE_FAULTS_MIN = 5
    PAGE_FAULTS_MAJ = 6
    ALIGNMENT_FAULTS = 7
    EMULATION_FAULTS = 8
    DUMMY = 9
    BPF_OUTPUT = 10
+
+class BPF(object):
    # From bpf_prog_type in uapi/linux/bpf.h
    SOCKET_FILTER = 1
    KPROBE = 2
    SCHED_CLS = 3
    SCHED_ACT = 4
    TRACEPOINT = 5
    XDP = 6
    PERF_EVENT = 7
    CGROUP_SKB = 8
    CGROUP_SOCK = 9
    LWT_IN = 10
    LWT_OUT = 11
    LWT_XMIT = 12
    SOCK_OPS = 13
    SK_SKB = 14
    CGROUP_DEVICE = 15
    SK_MSG = 16
    RAW_TRACEPOINT = 17
    CGROUP_SOCK_ADDR = 18

    # from xdp_action uapi/linux/bpf.h
    XDP_ABORTED = 0
    XDP_DROP = 1
    XDP_PASS = 2
    XDP_TX = 3
    XDP_REDIRECT = 4

    # Characters not allowed in auto-generated probe event names.
    _probe_repl = re.compile(b"[^a-zA-Z0-9_]")
    # Shared SymbolCache instances, keyed by pid (see _sym_cache users below).
    _sym_caches = {}

    # Kernel headers auto-included when any of the listed keywords appears
    # as a substring of a word in the BPF program text.
    _auto_includes = {
        "linux/time.h": ["time"],
        "linux/fs.h": ["fs", "file"],
        "linux/blkdev.h": ["bio", "request"],
        "linux/slab.h": ["alloc"],
        "linux/netdevice.h": ["sk_buff", "net_device"]
    }

    # Candidate syscall entry-point prefixes, ordered by preference; which one
    # is live depends on the running kernel (see get_syscall_prefix()).
    _syscall_prefixes = [
        b"sys_",
        b"__x64_sys_",
        b"__x32_compat_sys_",
        b"__ia32_compat_sys_",
    ]
+
    # BPF timestamps come from the monotonic clock. To be able to filter
    # and compare them from Python, we need to invoke clock_gettime.
    # Adapted from http://stackoverflow.com/a/1205762
    CLOCK_MONOTONIC = 1         # see <linux/time.h>

    class timespec(ct.Structure):
        # Mirrors struct timespec for the clock_gettime(2) call below.
        _fields_ = [('tv_sec', ct.c_long), ('tv_nsec', ct.c_long)]

    _librt = ct.CDLL('librt.so.1', use_errno=True)
    _clock_gettime = _librt.clock_gettime
    _clock_gettime.argtypes = [ct.c_int, ct.POINTER(timespec)]
+
+    @classmethod
+    def monotonic_time(cls):
+        """monotonic_time()
+        Returns the system monotonic time from clock_gettime, using the
+        CLOCK_MONOTONIC constant. The time returned is in nanoseconds.
+        """
+        t = cls.timespec()
+        if cls._clock_gettime(cls.CLOCK_MONOTONIC, ct.byref(t)) != 0:
+            errno = ct.get_errno()
+            raise OSError(errno, os.strerror(errno))
+        return t.tv_sec * 1e9 + t.tv_nsec
+
+    @classmethod
+    def generate_auto_includes(cls, program_words):
+        """
+        Generates #include statements automatically based on a set of
+        recognized types such as sk_buff and bio. The input is all the words
+        that appear in the BPF program, and the output is a (possibly empty)
+        string of #include statements, such as "#include <linux/fs.h>".
+        """
+        headers = ""
+        for header, keywords in cls._auto_includes.items():
+            for keyword in keywords:
+                for word in program_words:
+                    if keyword in word and header not in headers:
+                        headers += "#include <%s>\n" % header
+        return headers
+
+    # defined for compatibility reasons, to be removed
+    Table = Table
+
    class Function(object):
        """Handle for a loaded BPF program: owning module, name and prog fd."""
        def __init__(self, bpf, name, fd):
            self.bpf = bpf    # owning BPF instance
            self.name = name  # function name (bytes)
            self.fd = fd      # fd of the loaded BPF program
+
+    @staticmethod
+    def _find_file(filename):
+        """ If filename is invalid, search in ./ of argv[0] """
+        if filename:
+            if not os.path.isfile(filename):
+                argv0 = ArgString(sys.argv[0])
+                t = b"/".join([os.path.abspath(os.path.dirname(argv0.__str__())), filename])
+                if os.path.isfile(t):
+                    filename = t
+                else:
+                    raise Exception("Could not find file %s" % filename)
+        return filename
+
+    @staticmethod
+    def find_exe(bin_path):
+        """
+        find_exe(bin_path)
+
+        Traverses the PATH environment variable, looking for the first
+        directory that contains an executable file named bin_path, and
+        returns the full path to that file, or None if no such file
+        can be found. This is meant to replace invocations of the
+        "which" shell utility, which doesn't have portable semantics
+        for skipping aliases.
+        """
+        # Source: http://stackoverflow.com/a/377028
+        def is_exe(fpath):
+            return os.path.isfile(fpath) and \
+                os.access(fpath, os.X_OK)
+
+        fpath, fname = os.path.split(bin_path)
+        if fpath:
+            if is_exe(bin_path):
+                return bin_path
+        else:
+            for path in os.environ["PATH"].split(os.pathsep):
+                path = path.strip('"')
+                exe_file = os.path.join(path, bin_path)
+                if is_exe(exe_file):
+                    return exe_file
+        return None
+
    def __init__(self, src_file=b"", hdr_file=b"", text=None, debug=0,
            cflags=[], usdt_contexts=[]):
        """Create a new BPF module with the given source code.

        Note:
            All fields are marked as optional, but either `src_file` or `text`
            must be supplied, and not both.

        Args:
            src_file (Optional[str]): Path to a source file for the module
            hdr_file (Optional[str]): Path to a helper header file for the `src_file`
            text (Optional[str]): Contents of a source file for the module
            debug (Optional[int]): Flags used for debug prints, can be |'d together
                                   See "Debug flags" for explanation
            cflags (Optional[list]): Extra compiler flags passed to libbcc
            usdt_contexts (Optional[list]): USDT contexts whose generated
                                            argument-readers are prepended to
                                            `text` and whose uprobes are
                                            attached after compilation
        """
        # NOTE(review): the mutable defaults cflags=[]/usdt_contexts=[] are
        # only iterated here, never mutated, so the shared-default hazard does
        # not bite in this visible code.

        src_file = _assert_is_bytes(src_file)
        hdr_file = _assert_is_bytes(hdr_file)
        text = _assert_is_bytes(text)

        # Bookkeeping for every attachment this module owns; cleanup() (defined
        # elsewhere in this class) tears these down at exit.
        self.kprobe_fds = {}
        self.uprobe_fds = {}
        self.tracepoint_fds = {}
        self.raw_tracepoint_fds = {}
        self.perf_buffers = {}
        self.open_perf_events = {}
        self.tracefile = None
        atexit.register(self.cleanup)

        self.debug = debug
        self.funcs = {}
        self.tables = {}
        self.module = None
        # Marshal cflags into a ctypes char* array for libbcc.
        cflags_array = (ct.c_char_p * len(cflags))()
        for i, s in enumerate(cflags): cflags_array[i] = bytes(ArgString(s))
        if text:
            # Prepend generated USDT argument-reader code to the program text.
            ctx_array = (ct.c_void_p * len(usdt_contexts))()
            for i, usdt in enumerate(usdt_contexts):
                ctx_array[i] = ct.c_void_p(usdt.get_context())
            usdt_text = lib.bcc_usdt_genargs(ctx_array, len(usdt_contexts))
            if usdt_text is None:
                raise Exception("can't generate USDT probe arguments; " +
                                "possible cause is missing pid when a " +
                                "probe in a shared object has multiple " +
                                "locations")
            text = usdt_text + text

        if text:
            self.module = lib.bpf_module_create_c_from_string(text,
                    self.debug, cflags_array, len(cflags_array))
            if not self.module:
                raise Exception("Failed to compile BPF text")
        else:
            src_file = BPF._find_file(src_file)
            hdr_file = BPF._find_file(hdr_file)
            # .b files use the legacy B front-end; everything else is C.
            if src_file.endswith(b".b"):
                self.module = lib.bpf_module_create_b(src_file, hdr_file,
                        self.debug)
            else:
                self.module = lib.bpf_module_create_c(src_file, self.debug,
                        cflags_array, len(cflags_array))
            if not self.module:
                raise Exception("Failed to compile BPF module %s" % src_file)

        for usdt_context in usdt_contexts:
            usdt_context.attach_uprobes(self)

        # If any "kprobe__" or "tracepoint__" or "raw_tracepoint__"
        # prefixed functions were defined,
        # they will be loaded and attached here.
        self._trace_autoload()
+
+    def load_funcs(self, prog_type=KPROBE):
+        """load_funcs(prog_type=KPROBE)
+
+        Load all functions in this BPF module with the given type.
+        Returns a list of the function handles."""
+
+        fns = []
+        for i in range(0, lib.bpf_num_functions(self.module)):
+            func_name = lib.bpf_function_name(self.module, i)
+            fns.append(self.load_func(func_name, prog_type))
+
+        return fns
+
    def load_func(self, func_name, prog_type):
        """Load (and cache) the named function from this module as a BPF
        program of `prog_type`; returns a BPF.Function handle.

        Raises:
            Exception: if the function is unknown, the caller lacks
                privileges (EPERM), or the kernel rejects the program.
        """
        func_name = _assert_is_bytes(func_name)
        # Already loaded: return the cached handle.
        if func_name in self.funcs:
            return self.funcs[func_name]
        if not lib.bpf_function_start(self.module, func_name):
            raise Exception("Unknown program %s" % func_name)
        # Map debug flags onto the kernel verifier's log verbosity.
        log_level = 0
        if (self.debug & DEBUG_BPF_REGISTER_STATE):
            log_level = 2
        elif (self.debug & DEBUG_BPF):
            log_level = 1
        fd = lib.bpf_prog_load(prog_type, func_name,
                lib.bpf_function_start(self.module, func_name),
                lib.bpf_function_size(self.module, func_name),
                lib.bpf_module_license(self.module),
                lib.bpf_module_kern_version(self.module),
                log_level, None, 0);

        if fd < 0:
            # Replace cleanup with a no-op: nothing was attached, and cleanup
            # would otherwise run against a partially initialized module.
            atexit.register(self.donothing)
            if ct.get_errno() == errno.EPERM:
                raise Exception("Need super-user privileges to run")

            errstr = os.strerror(ct.get_errno())
            raise Exception("Failed to load BPF program %s: %s" %
                            (func_name, errstr))

        fn = BPF.Function(self, func_name, fd)
        self.funcs[func_name] = fn

        return fn
+
+    def dump_func(self, func_name):
+        """
+        Return the eBPF bytecodes for the specified function as a string
+        """
+        func_name = _assert_is_bytes(func_name)
+        if not lib.bpf_function_start(self.module, func_name):
+            raise Exception("Unknown program %s" % func_name)
+
+        start, = lib.bpf_function_start(self.module, func_name),
+        size, = lib.bpf_function_size(self.module, func_name),
+        return ct.string_at(start, size)
+
    # Maps a C scalar type name, as it appears in the JSON table descriptors
    # emitted by libbcc, to the corresponding ctypes type (see
    # _decode_table_type below).
    str2ctype = {
        u"_Bool": ct.c_bool,
        u"char": ct.c_char,
        u"wchar_t": ct.c_wchar,
        u"unsigned char": ct.c_ubyte,
        u"short": ct.c_short,
        u"unsigned short": ct.c_ushort,
        u"int": ct.c_int,
        u"unsigned int": ct.c_uint,
        u"long": ct.c_long,
        u"unsigned long": ct.c_ulong,
        u"long long": ct.c_longlong,
        u"unsigned long long": ct.c_ulonglong,
        u"float": ct.c_float,
        u"double": ct.c_double,
        u"long double": ct.c_longdouble,
        # 128-bit integers have no ctypes equivalent; model them as two 64s.
        u"__int128": ct.c_int64 * 2,
        u"unsigned __int128": ct.c_uint64 * 2,
    }
    @staticmethod
    def _decode_table_type(desc):
        """Recursively build a ctypes type from a table type descriptor.

        A descriptor is either a scalar type name (looked up in str2ctype) or
        a sequence [name, fields, kind?] where each field is [name, type],
        [name, type, array-size-list], [name, type, bit-width], or a nested
        struct/union descriptor; kind selects struct/union/struct_packed.
        """
        if isinstance(desc, basestring):
            return BPF.str2ctype[desc]
        anon = []
        fields = []
        for t in desc[1]:
            if len(t) == 2:
                # [name, type] — plain field.
                fields.append((t[0], BPF._decode_table_type(t[1])))
            elif len(t) == 3:
                if isinstance(t[2], list):
                    # [name, type, [n]] — fixed-size array of n elements.
                    fields.append((t[0], BPF._decode_table_type(t[1]) * t[2][0]))
                elif isinstance(t[2], int):
                    # [name, type, width] — bitfield.
                    fields.append((t[0], BPF._decode_table_type(t[1]), t[2]))
                elif isinstance(t[2], basestring) and (
                        t[2] == u"union" or t[2] == u"struct" or
                        t[2] == u"struct_packed"):
                    # Nested aggregate; unnamed members become ctypes
                    # anonymous fields.
                    name = t[0]
                    if name == "":
                        name = "__anon%d" % len(anon)
                        anon.append(name)
                    fields.append((name, BPF._decode_table_type(t)))
                else:
                    raise Exception("Failed to decode type %s" % str(t))
            else:
                raise Exception("Failed to decode type %s" % str(t))
        base = ct.Structure
        is_packed = False
        if len(desc) > 2:
            if desc[2] == u"union":
                base = ct.Union
            elif desc[2] == u"struct":
                base = ct.Structure
            elif desc[2] == u"struct_packed":
                base = ct.Structure
                is_packed = True
        # Synthesize the ctypes class; _pack_=1 removes padding for packed
        # structs.
        if is_packed:
            cls = type(str(desc[0]), (base,), dict(_anonymous_=anon, _pack_=1,
                _fields_=fields))
        else:
            cls = type(str(desc[0]), (base,), dict(_anonymous_=anon,
                _fields_=fields))
        return cls
+
+    def get_table(self, name, keytype=None, leaftype=None, reducer=None):
+        name = _assert_is_bytes(name)
+        map_id = lib.bpf_table_id(self.module, name)
+        map_fd = lib.bpf_table_fd(self.module, name)
+        if map_fd < 0:
+            raise KeyError
+        if not keytype:
+            key_desc = lib.bpf_table_key_desc(self.module, name).decode("utf-8")
+            if not key_desc:
+                raise Exception("Failed to load BPF Table %s key desc" % name)
+            keytype = BPF._decode_table_type(json.loads(key_desc))
+        if not leaftype:
+            leaf_desc = lib.bpf_table_leaf_desc(self.module, name).decode("utf-8")
+            if not leaf_desc:
+                raise Exception("Failed to load BPF Table %s leaf desc" % name)
+            leaftype = BPF._decode_table_type(json.loads(leaf_desc))
+        return Table(self, map_id, map_fd, keytype, leaftype, reducer=reducer)
+
    # Mapping protocol over this module's BPF tables: bpf["name"] lazily
    # constructs and caches a Table via get_table().
    def __getitem__(self, key):
        if key not in self.tables:
            self.tables[key] = self.get_table(key)
        return self.tables[key]

    def __setitem__(self, key, leaf):
        self.tables[key] = leaf

    def __len__(self):
        return len(self.tables)

    def __delitem__(self, key):
        # Drops only the cached wrapper; the underlying map is unaffected.
        del self.tables[key]

    def __iter__(self):
        return self.tables.__iter__()
+
+    @staticmethod
+    def attach_raw_socket(fn, dev):
+        dev = _assert_is_bytes(dev)
+        if not isinstance(fn, BPF.Function):
+            raise Exception("arg 1 must be of type BPF.Function")
+        sock = lib.bpf_open_raw_sock(dev)
+        if sock < 0:
+            errstr = os.strerror(ct.get_errno())
+            raise Exception("Failed to open raw device %s: %s" % (dev, errstr))
+        res = lib.bpf_attach_socket(sock, fn.fd)
+        if res < 0:
+            errstr = os.strerror(ct.get_errno())
+            raise Exception("Failed to attach BPF to device %s: %s"
+                    % (dev, errstr))
+        fn.sock = sock
+
    @staticmethod
    def get_kprobe_functions(event_re):
        """Return the set of kprobe-able kernel function names matching
        event_re, excluding blacklisted and __init-section symbols."""
        # Symbols the kernel forbids kprobes on.
        with open("%s/../kprobes/blacklist" % TRACEFS, "rb") as blacklist_f:
            blacklist = set([line.rstrip().split()[1] for line in blacklist_f])
        fns = []

        # 0: before __init_begin, 1: inside init section, 2: past __init_end.
        in_init_section = 0
        with open("/proc/kallsyms", "rb") as avail_file:
            for line in avail_file:
                (t, fn) = line.rstrip().split()[1:3]
                if in_init_section == 0:
                    if fn == b'__init_begin':
                        in_init_section = 1
                        continue
                elif in_init_section == 1:
                    # Skip init-section symbols; they are freed after boot.
                    if fn == b'__init_end':
                        in_init_section = 2
                    continue
                # Keep text ('t'/'T') and weak ('w'/'W') symbols only.
                if (t.lower() in [b't', b'w']) and re.match(event_re, fn) \
                    and fn not in blacklist:
                    fns.append(fn)
        return set(fns)     # Some functions may appear more than once
+
    def _check_probe_quota(self, num_new_probes):
        # Raises when attaching num_new_probes more would exceed the
        # process-wide _probe_limit.
        global _num_open_probes
        if _num_open_probes + num_new_probes > _probe_limit:
            raise Exception("Number of open probes would exceed global quota")

    # The _add_*/_del_* helpers keep the per-instance fd maps and the global
    # probe counter in sync; all attach/detach paths must go through them.
    def _add_kprobe_fd(self, name, fd):
        global _num_open_probes
        self.kprobe_fds[name] = fd
        _num_open_probes += 1

    def _del_kprobe_fd(self, name):
        global _num_open_probes
        del self.kprobe_fds[name]
        _num_open_probes -= 1

    def _add_uprobe_fd(self, name, fd):
        global _num_open_probes
        self.uprobe_fds[name] = fd
        _num_open_probes += 1

    def _del_uprobe_fd(self, name):
        global _num_open_probes
        del self.uprobe_fds[name]
        _num_open_probes -= 1
+
    # Find current system's syscall prefix by testing on the BPF syscall.
    # If no valid value found, will return the first possible value which
    # would probably lead to error in later API calls.
    def get_syscall_prefix(self):
        """Return this kernel's syscall entry-point prefix (bytes)."""
        for prefix in self._syscall_prefixes:
            # Probe with the bpf() syscall, which every candidate kernel has.
            if self.ksymname(b"%sbpf" % prefix) != -1:
                return prefix
        return self._syscall_prefixes[0]

    # Given a syscall's name, return the full Kernel function name with current
    # system's syscall prefix. For example, given "clone" the helper would
    # return "sys_clone" or "__x64_sys_clone".
    def get_syscall_fnname(self, name):
        """Prefix bare syscall `name` (bytes) for the running kernel."""
        name = _assert_is_bytes(name)
        return self.get_syscall_prefix() + name

    # Given a Kernel function name that represents a syscall but already has a
    # prefix included, transform it to current system's prefix. For example,
    # if "sys_clone" provided, the helper may translate it to "__x64_sys_clone".
    def fix_syscall_fnname(self, name):
        """Re-prefix an already-prefixed syscall name for this kernel;
        names with no recognized prefix are returned unchanged."""
        name = _assert_is_bytes(name)
        for prefix in self._syscall_prefixes:
            if name.startswith(prefix):
                return self.get_syscall_fnname(name[len(prefix):])
        return name
+       
+    def attach_kprobe(self, event=b"", event_off=0, fn_name=b"", event_re=b""):
+        event = _assert_is_bytes(event)
+        fn_name = _assert_is_bytes(fn_name)
+        event_re = _assert_is_bytes(event_re)
+
+        # allow the caller to glob multiple functions together
+        if event_re:
+            matches = BPF.get_kprobe_functions(event_re)
+            self._check_probe_quota(len(matches))
+            for line in matches:
+                try:
+                    self.attach_kprobe(event=line, fn_name=fn_name)
+                except:
+                    pass
+            return
+
+        self._check_probe_quota(1)
+        fn = self.load_func(fn_name, BPF.KPROBE)
+        ev_name = b"p_" + event.replace(b"+", b"_").replace(b".", b"_")
+        fd = lib.bpf_attach_kprobe(fn.fd, 0, ev_name, event, event_off)
+        if fd < 0:
+            raise Exception("Failed to attach BPF to kprobe")
+        self._add_kprobe_fd(ev_name, fd)
+        return self
+
+    def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b""):
+        event = _assert_is_bytes(event)
+        fn_name = _assert_is_bytes(fn_name)
+        event_re = _assert_is_bytes(event_re)
+
+        # allow the caller to glob multiple functions together
+        if event_re:
+            for line in BPF.get_kprobe_functions(event_re):
+                try:
+                    self.attach_kretprobe(event=line, fn_name=fn_name)
+                except:
+                    pass
+            return
+
+        self._check_probe_quota(1)
+        fn = self.load_func(fn_name, BPF.KPROBE)
+        ev_name = b"r_" + event.replace(b"+", b"_").replace(b".", b"_")
+        fd = lib.bpf_attach_kprobe(fn.fd, 1, ev_name, event, 0)
+        if fd < 0:
+            raise Exception("Failed to attach BPF to kretprobe")
+        self._add_kprobe_fd(ev_name, fd)
+        return self
+
+    def detach_kprobe_event(self, ev_name):
+        if ev_name not in self.kprobe_fds:
+            raise Exception("Kprobe %s is not attached" % event)
+        res = lib.bpf_close_perf_event_fd(self.kprobe_fds[ev_name])
+        if res < 0:
+            raise Exception("Failed to close kprobe FD")
+        res = lib.bpf_detach_kprobe(ev_name)
+        if res < 0:
+            raise Exception("Failed to detach BPF from kprobe")
+        self._del_kprobe_fd(ev_name)
+
+    def detach_kprobe(self, event):
+        event = _assert_is_bytes(event)
+        ev_name = b"p_" + event.replace(b"+", b"_").replace(b".", b"_")
+        self.detach_kprobe_event(ev_name)
+
+    def detach_kretprobe(self, event):
+        event = _assert_is_bytes(event)
+        ev_name = b"r_" + event.replace(b"+", b"_").replace(b".", b"_")
+        self.detach_kprobe_event(ev_name)
+
+    @staticmethod
+    def attach_xdp(dev, fn, flags=0):
+        '''
+            This function attaches a BPF function to a device on the device
+            driver level (XDP)
+        '''
+        dev = _assert_is_bytes(dev)
+        if not isinstance(fn, BPF.Function):
+            raise Exception("arg 1 must be of type BPF.Function")
+        res = lib.bpf_attach_xdp(dev, fn.fd, flags)
+        if res < 0:
+            err_no = ct.get_errno()
+            if err_no == errno.EBADMSG:
+                raise Exception("Internal error while attaching BPF to device,"+
+                    " try increasing the debug level!")
+            else:
+                errstr = os.strerror(err_no)
+                raise Exception("Failed to attach BPF to device %s: %s"
+                            % (dev, errstr))
+
+    @staticmethod
+    def remove_xdp(dev, flags=0):
+        '''
+            This function removes any BPF function from a device on the
+            device driver level (XDP)
+        '''
+        dev = _assert_is_bytes(dev)
+        res = lib.bpf_attach_xdp(dev, -1, flags)
+        if res < 0:
+            errstr = os.strerror(ct.get_errno())
+            raise Exception("Failed to detach BPF from device %s: %s"
+                            % (dev, errstr))
+
+
+
    @classmethod
    def _check_path_symbol(cls, module, symname, addr, pid):
        """Resolve (module, symname[, addr]) to (module_path, offset) for a
        uprobe attach, via libbcc's bcc_resolve_symname.

        Raises:
            Exception: if the symbol's address cannot be determined.
        """
        module = _assert_is_bytes(module)
        symname = _assert_is_bytes(symname)
        sym = bcc_symbol()
        # pid -1 means "any process"; libbcc expects 0 for that case.
        c_pid = 0 if pid == -1 else pid
        if lib.bcc_resolve_symname(
            module, symname,
            addr or 0x0, c_pid,
            ct.cast(None, ct.POINTER(bcc_symbol_option)),
            ct.byref(sym),
        ) < 0:
            raise Exception("could not determine address of symbol %s" % symname)
        # Copy the path before releasing the buffer libbcc allocated.
        module_path = ct.cast(sym.module, ct.c_char_p).value
        lib.bcc_procutils_free(sym.module)
        return module_path, sym.offset
+
+    @staticmethod
+    def find_library(libname):
+        libname = _assert_is_bytes(libname)
+        res = lib.bcc_procutils_which_so(libname, 0)
+        if not res:
+            return None
+        libpath = ct.cast(res, ct.c_char_p).value
+        lib.bcc_procutils_free(res)
+        return libpath
+
+    @staticmethod
+    def get_tracepoints(tp_re):
+        results = []
+        events_dir = os.path.join(TRACEFS, "events")
+        for category in os.listdir(events_dir):
+            cat_dir = os.path.join(events_dir, category)
+            if not os.path.isdir(cat_dir):
+                continue
+            for event in os.listdir(cat_dir):
+                evt_dir = os.path.join(cat_dir, event)
+                if os.path.isdir(evt_dir):
+                    tp = ("%s:%s" % (category, event))
+                    if re.match(tp_re, tp):
+                        results.append(tp)
+        return results
+
+    @staticmethod
+    def tracepoint_exists(category, event):
+        evt_dir = os.path.join(TRACEFS, "events", category, event)
+        return os.path.isdir(evt_dir)
+
    def attach_tracepoint(self, tp=b"", tp_re=b"", fn_name=b""):
        """attach_tracepoint(tp="", tp_re="", fn_name="")

        Run the bpf function denoted by fn_name every time the kernel tracepoint
        specified by 'tp' is hit. The optional parameters pid, cpu, and group_fd
        can be used to filter the probe. The tracepoint specification is simply
        the tracepoint category and the tracepoint name, separated by a colon.
        For example: sched:sched_switch, syscalls:sys_enter_bind, etc.

        Instead of a tracepoint name, a regular expression can be provided in
        tp_re. The program will then attach to tracepoints that match the
        provided regular expression.

        To obtain a list of kernel tracepoints, use the tplist tool or cat the
        file /sys/kernel/debug/tracing/available_events.

        Examples:
            BPF(text).attach_tracepoint(tp="sched:sched_switch", fn_name="on_switch")
            BPF(text).attach_tracepoint(tp_re="sched:.*", fn_name="on_switch")
        """

        tp = _assert_is_bytes(tp)
        tp_re = _assert_is_bytes(tp_re)
        fn_name = _assert_is_bytes(fn_name)
        # Regex form: recurse once per matching tracepoint name.
        if tp_re:
            for tp in BPF.get_tracepoints(tp_re):
                self.attach_tracepoint(tp=tp, fn_name=fn_name)
            return

        fn = self.load_func(fn_name, BPF.TRACEPOINT)
        # "category:event" -> separate arguments for libbcc.
        (tp_category, tp_name) = tp.split(b':')
        fd = lib.bpf_attach_tracepoint(fn.fd, tp_category, tp_name)
        if fd < 0:
            raise Exception("Failed to attach BPF to tracepoint")
        # Track the perf event fd so detach_tracepoint()/cleanup can close it.
        self.tracepoint_fds[tp] = fd
        return self
+
+    def attach_raw_tracepoint(self, tp=b"", fn_name=b""):
+        """attach_raw_tracepoint(self, tp=b"", fn_name=b"")
+
+        Run the bpf function denoted by fn_name every time the kernel tracepoint
+        specified by 'tp' is hit. The bpf function should be loaded as a
+        RAW_TRACEPOINT type. The fn_name is the kernel tracepoint name,
+        e.g., sched_switch, sys_enter_bind, etc.
+
+        Examples:
+            BPF(text).attach_raw_tracepoint(tp="sched_switch", fn_name="on_switch")
+        """
+
+        tp = _assert_is_bytes(tp)
+        if tp in self.raw_tracepoint_fds:
+            raise Exception("Raw tracepoint %s has been attached" % tp)
+
+        fn_name = _assert_is_bytes(fn_name)
+        fn = self.load_func(fn_name, BPF.RAW_TRACEPOINT)
+        fd = lib.bpf_attach_raw_tracepoint(fn.fd, tp)
+        if fd < 0:
+            raise Exception("Failed to attach BPF to raw tracepoint")
+        self.raw_tracepoint_fds[tp] = fd;
+        return self
+
+    def detach_raw_tracepoint(self, tp=b""):
+        """detach_raw_tracepoint(tp="")
+
+        Stop running the bpf function that is attached to the kernel tracepoint
+        specified by 'tp'.
+
+        Example: bpf.detach_raw_tracepoint("sched_switch")
+        """
+
+        tp = _assert_is_bytes(tp)
+        if tp not in self.raw_tracepoint_fds:
+            raise Exception("Raw tracepoint %s is not attached" % tp)
+        os.close(self.raw_tracepoint_fds[tp])
+        del self.raw_tracepoint_fds[tp]
+
+    @staticmethod
+    def support_raw_tracepoint():
+        # kernel symbol "bpf_find_raw_tracepoint" indicates raw_tracepoint support
+        if BPF.ksymname("bpf_find_raw_tracepoint") != -1:
+            return True
+        return False
+
+    def detach_tracepoint(self, tp=b""):
+        """detach_tracepoint(tp="")
+
+        Stop running a bpf function that is attached to the kernel tracepoint
+        specified by 'tp'.
+
+        Example: bpf.detach_tracepoint("sched:sched_switch")
+        """
+
+        tp = _assert_is_bytes(tp)
+        if tp not in self.tracepoint_fds:
+            raise Exception("Tracepoint %s is not attached" % tp)
+        res = lib.bpf_close_perf_event_fd(self.tracepoint_fds[tp])
+        if res < 0:
+            raise Exception("Failed to detach BPF from tracepoint")
+        (tp_category, tp_name) = tp.split(b':')
+        res = lib.bpf_detach_tracepoint(tp_category, tp_name)
+        if res < 0:
+            raise Exception("Failed to detach BPF from tracepoint")
+        del self.tracepoint_fds[tp]
+
+    def _attach_perf_event(self, progfd, ev_type, ev_config,
+            sample_period, sample_freq, pid, cpu, group_fd):
+        res = lib.bpf_attach_perf_event(progfd, ev_type, ev_config,
+                sample_period, sample_freq, pid, cpu, group_fd)
+        if res < 0:
+            raise Exception("Failed to attach BPF to perf event")
+        return res
+
+    def attach_perf_event(self, ev_type=-1, ev_config=-1, fn_name=b"",
+            sample_period=0, sample_freq=0, pid=-1, cpu=-1, group_fd=-1):
+        fn_name = _assert_is_bytes(fn_name)
+        fn = self.load_func(fn_name, BPF.PERF_EVENT)
+        res = {}
+        if cpu >= 0:
+            res[cpu] = self._attach_perf_event(fn.fd, ev_type, ev_config,
+                    sample_period, sample_freq, pid, cpu, group_fd)
+        else:
+            for i in get_online_cpus():
+                res[i] = self._attach_perf_event(fn.fd, ev_type, ev_config,
+                        sample_period, sample_freq, pid, i, group_fd)
+        self.open_perf_events[(ev_type, ev_config)] = res
+
+    def detach_perf_event(self, ev_type=-1, ev_config=-1):
+        try:
+            fds = self.open_perf_events[(ev_type, ev_config)]
+        except KeyError:
+            raise Exception("Perf event type {} config {} not attached".format(
+                ev_type, ev_config))
+
+        res = 0
+        for fd in fds.values():
+            res = lib.bpf_close_perf_event_fd(fd) or res
+        if res != 0:
+            raise Exception("Failed to detach BPF from perf event")
+        del self.open_perf_events[(ev_type, ev_config)]
+
+    @staticmethod
+    def get_user_functions(name, sym_re):
+        return set([name for (name, _) in
+                    BPF.get_user_functions_and_addresses(name, sym_re)])
+
+    @staticmethod
+    def get_user_addresses(name, sym_re):
+        """
+        We are returning addresses here instead of symbol names because it
+        turns out that the same name may appear multiple times with different
+        addresses, and the same address may appear multiple times with the same
+        name. We can't attach a uprobe to the same address more than once, so
+        it makes sense to return the unique set of addresses that are mapped to
+        a symbol that matches the provided regular expression.
+        """
+        return set([address for (_, address) in
+                    BPF.get_user_functions_and_addresses(name, sym_re)])
+
+    @staticmethod
+    def get_user_functions_and_addresses(name, sym_re):
+        name = _assert_is_bytes(name)
+        sym_re = _assert_is_bytes(sym_re)
+        addresses = []
+        def sym_cb(sym_name, addr):
+            dname = sym_name
+            if re.match(sym_re, dname):
+                addresses.append((dname, addr))
+            return 0
+
+        res = lib.bcc_foreach_function_symbol(name, _SYM_CB_TYPE(sym_cb))
+        if res < 0:
+            raise Exception("Error %d enumerating symbols in %s" % (res, name))
+        return addresses
+
+    def _get_uprobe_evname(self, prefix, path, addr, pid):
+        if pid == -1:
+            return b"%s_%s_0x%x" % (prefix, self._probe_repl.sub(b"_", path), addr)
+        else:
+            # if pid is valid, put pid in the name, so different pid
+            # can have different event names
+            return b"%s_%s_0x%x_%d" % (prefix, self._probe_repl.sub(b"_", path), addr, pid)
+
+    def attach_uprobe(self, name=b"", sym=b"", sym_re=b"", addr=None,
+            fn_name=b"", pid=-1):
+        """attach_uprobe(name="", sym="", sym_re="", addr=None, fn_name="",
+                         pid=-1)
+
+        Run the bpf function denoted by fn_name every time the symbol sym in
+        the library or binary 'name' is encountered. The real address addr may
+        be supplied in place of sym. The optional parameter pid can be used
+        to filter the probe to a specific process.
+
+        Instead of a symbol name, a regular expression can be provided in
+        sym_re. The uprobe will then attach to symbols that match the provided
+        regular expression.
+
+        Libraries can be given in the name argument without the lib prefix, or
+        with the full path (/usr/lib/...). Binaries can be given only with the
+        full path (/bin/sh). If a PID is given, the uprobe will attach to the
+        version of the library used by the process.
+
+        Example: BPF(text).attach_uprobe("c", "malloc")
+                 BPF(text).attach_uprobe("/usr/bin/python", "main")
+        """
+
+        name = _assert_is_bytes(name)
+        sym = _assert_is_bytes(sym)
+        sym_re = _assert_is_bytes(sym_re)
+        fn_name = _assert_is_bytes(fn_name)
+
+        if sym_re:
+            addresses = BPF.get_user_addresses(name, sym_re)
+            self._check_probe_quota(len(addresses))
+            for sym_addr in addresses:
+                self.attach_uprobe(name=name, addr=sym_addr,
+                                   fn_name=fn_name, pid=pid)
+            return
+
+        (path, addr) = BPF._check_path_symbol(name, sym, addr, pid)
+
+        self._check_probe_quota(1)
+        fn = self.load_func(fn_name, BPF.KPROBE)
+        ev_name = self._get_uprobe_evname(b"p", path, addr, pid)
+        fd = lib.bpf_attach_uprobe(fn.fd, 0, ev_name, path, addr, pid)
+        if fd < 0:
+            raise Exception("Failed to attach BPF to uprobe")
+        self._add_uprobe_fd(ev_name, fd)
+        return self
+
+    def attach_uretprobe(self, name=b"", sym=b"", sym_re=b"", addr=None,
+            fn_name=b"", pid=-1):
+        """attach_uretprobe(name="", sym="", sym_re="", addr=None, fn_name="",
+                            pid=-1)
+
+        Run the bpf function denoted by fn_name every time the symbol sym in
+        the library or binary 'name' finishes execution. See attach_uprobe for
+        meaning of additional parameters.
+        """
+
+        name = _assert_is_bytes(name)
+        sym = _assert_is_bytes(sym)
+        sym_re = _assert_is_bytes(sym_re)
+        fn_name = _assert_is_bytes(fn_name)
+
+        if sym_re:
+            for sym_addr in BPF.get_user_addresses(name, sym_re):
+                self.attach_uretprobe(name=name, addr=sym_addr,
+                                      fn_name=fn_name, pid=pid)
+            return
+
+        (path, addr) = BPF._check_path_symbol(name, sym, addr, pid)
+
+        self._check_probe_quota(1)
+        fn = self.load_func(fn_name, BPF.KPROBE)
+        ev_name = self._get_uprobe_evname(b"r", path, addr, pid)
+        fd = lib.bpf_attach_uprobe(fn.fd, 1, ev_name, path, addr, pid)
+        if fd < 0:
+            raise Exception("Failed to attach BPF to uretprobe")
+        self._add_uprobe_fd(ev_name, fd)
+        return self
+
+    def detach_uprobe_event(self, ev_name):
+        if ev_name not in self.uprobe_fds:
+            raise Exception("Uprobe %s is not attached" % ev_name)
+        res = lib.bpf_close_perf_event_fd(self.uprobe_fds[ev_name])
+        if res < 0:
+            raise Exception("Failed to detach BPF from uprobe")
+        res = lib.bpf_detach_uprobe(ev_name)
+        if res < 0:
+            raise Exception("Failed to detach BPF from uprobe")
+        self._del_uprobe_fd(ev_name)
+
+    def detach_uprobe(self, name=b"", sym=b"", addr=None, pid=-1):
+        """detach_uprobe(name="", sym="", addr=None, pid=-1)
+
+        Stop running a bpf function that is attached to symbol 'sym' in library
+        or binary 'name'.
+        """
+
+        name = _assert_is_bytes(name)
+        sym = _assert_is_bytes(sym)
+        (path, addr) = BPF._check_path_symbol(name, sym, addr, pid)
+        ev_name = self._get_uprobe_evname(b"p", path, addr, pid)
+        self.detach_uprobe_event(ev_name)
+
+    def detach_uretprobe(self, name=b"", sym=b"", addr=None, pid=-1):
+        """detach_uretprobe(name="", sym="", addr=None, pid=-1)
+
+        Stop running a bpf function that is attached to symbol 'sym' in library
+        or binary 'name'.
+        """
+
+        name = _assert_is_bytes(name)
+        sym = _assert_is_bytes(sym)
+
+        (path, addr) = BPF._check_path_symbol(name, sym, addr, pid)
+        ev_name = self._get_uprobe_evname(b"r", path, addr, pid)
+        self.detach_uprobe_event(ev_name)
+
+    def _trace_autoload(self):
+        for i in range(0, lib.bpf_num_functions(self.module)):
+            func_name = lib.bpf_function_name(self.module, i)
+            if func_name.startswith(b"kprobe__"):
+                fn = self.load_func(func_name, BPF.KPROBE)
+                self.attach_kprobe(
+                    event=self.fix_syscall_fnname(func_name[8:]),
+                    fn_name=fn.name)
+            elif func_name.startswith(b"kretprobe__"):
+                fn = self.load_func(func_name, BPF.KPROBE)
+                self.attach_kretprobe(
+                    event=self.fix_syscall_fnname(func_name[11:]),
+                    fn_name=fn.name)
+            elif func_name.startswith(b"tracepoint__"):
+                fn = self.load_func(func_name, BPF.TRACEPOINT)
+                tp = fn.name[len(b"tracepoint__"):].replace(b"__", b":")
+                self.attach_tracepoint(tp=tp, fn_name=fn.name)
+            elif func_name.startswith(b"raw_tracepoint__"):
+                fn = self.load_func(func_name, BPF.RAW_TRACEPOINT)
+                tp = fn.name[len(b"raw_tracepoint__"):]
+                self.attach_raw_tracepoint(tp=tp, fn_name=fn.name)
+
+    def trace_open(self, nonblocking=False):
+        """trace_open(nonblocking=False)
+
+        Open the trace_pipe if not already open
+        """
+        if not self.tracefile:
+            self.tracefile = open("%s/trace_pipe" % TRACEFS, "rb")
+            if nonblocking:
+                fd = self.tracefile.fileno()
+                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
+                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
+        return self.tracefile
+
+    def trace_fields(self, nonblocking=False):
+        """trace_fields(nonblocking=False)
+
+        Read from the kernel debug trace pipe and return a tuple of the
+        fields (task, pid, cpu, flags, timestamp, msg) or None if no
+        line was read (nonblocking=True)
+        """
+        while True:
+            line = self.trace_readline(nonblocking)
+            if not line and nonblocking: return (None,) * 6
+            # don't print messages related to lost events
+            if line.startswith(b"CPU:"): continue
+            task = line[:16].lstrip()
+            line = line[17:]
+            ts_end = line.find(b":")
+            pid, cpu, flags, ts = line[:ts_end].split()
+            cpu = cpu[1:-1]
+            # line[ts_end:] will have ": [sym_or_addr]: msgs"
+            # For trace_pipe debug output, the addr typically
+            # is invalid (e.g., 0x1). For kernel 4.12 or earlier,
+            # if address is not able to match a kernel symbol,
+            # nothing will be printed out. For kernel 4.13 and later,
+            # however, the illegal address will be printed out.
+            # Hence, both cases are handled here.
+            line = line[ts_end + 1:]
+            sym_end = line.find(b":")
+            msg = line[sym_end + 2:]
+            return (task, int(pid), int(cpu), flags, float(ts), msg)
+
+    def trace_readline(self, nonblocking=False):
+        """trace_readline(nonblocking=False)
+
+        Read from the kernel debug trace pipe and return one line
+        If nonblocking is False, this will block until ctrl-C is pressed.
+        """
+
+        trace = self.trace_open(nonblocking)
+
+        line = None
+        try:
+            line = trace.readline(1024).rstrip()
+        except IOError:
+            pass
+        return line
+
+    def trace_print(self, fmt=None):
+        """trace_print(self, fmt=None)
+
+        Read from the kernel debug trace pipe and print on stdout.
+        If fmt is specified, apply as a format string to the output. See
+        trace_fields for the members of the tuple
+        example: trace_print(fmt="pid {1}, msg = {5}")
+        """
+
+        while True:
+            if fmt:
+                fields = self.trace_fields(nonblocking=False)
+                if not fields: continue
+                line = fmt.format(*fields)
+            else:
+                line = self.trace_readline(nonblocking=False)
+            print(line)
+            sys.stdout.flush()
+
+    @staticmethod
+    def _sym_cache(pid):
+        """_sym_cache(pid)
+
+        Returns a symbol cache for the specified PID.
+        The kernel symbol cache is accessed by providing any PID less than zero.
+        """
+        if pid < 0 and pid != -1:
+            pid = -1
+        if not pid in BPF._sym_caches:
+            BPF._sym_caches[pid] = SymbolCache(pid)
+        return BPF._sym_caches[pid]
+
+    @staticmethod
+    def sym(addr, pid, show_module=False, show_offset=False, demangle=True):
+        """sym(addr, pid, show_module=False, show_offset=False, demangle=True)
+
+        Translate a memory address into a function name for a pid, which is
+        returned. When show_module is True, the module name is also included.
+        When show_offset is True, the instruction offset as a hexadecimal
+        number is also included in the string.
+
+        A pid of less than zero will access the kernel symbol cache.
+
+        Example output when both show_module and show_offset are True:
+            "start_thread+0x202 [libpthread-2.24.so]"
+
+        Example output when both show_module and show_offset are False:
+            "start_thread"
+        """
+        name, offset, module = BPF._sym_cache(pid).resolve(addr, demangle)
+        offset = b"+0x%x" % offset if show_offset and name is not None else b""
+        name = name or b"[unknown]"
+        name = name + offset
+        module = b" [%s]" % os.path.basename(module) \
+            if show_module and module is not None else b""
+        return name + module
+
+    @staticmethod
+    def ksym(addr, show_module=False, show_offset=False):
+        """ksym(addr)
+
+        Translate a kernel memory address into a kernel function name, which is
+        returned. When show_module is True, the module name ("kernel") is also
+        included. When show_offset is true, the instruction offset as a
+        hexadecimal number is also included in the string.
+
+        Example output when both show_module and show_offset are True:
+            "default_idle+0x0 [kernel]"
+        """
+        return BPF.sym(addr, -1, show_module, show_offset, False)
+
+    @staticmethod
+    def ksymname(name):
+        """ksymname(name)
+
+        Translate a kernel name into an address. This is the reverse of
+        ksym. Returns -1 when the function name is unknown."""
+        return BPF._sym_cache(-1).resolve_name(None, name)
+
+    def num_open_kprobes(self):
+        """num_open_kprobes()
+
+        Get the number of open K[ret]probes. Can be useful for scenarios where
+        event_re is used while attaching and detaching probes.
+        """
+        return len(self.kprobe_fds)
+
+    def num_open_uprobes(self):
+        """num_open_uprobes()
+
+        Get the number of open U[ret]probes.
+        """
+        return len(self.uprobe_fds)
+
+    def num_open_tracepoints(self):
+        """num_open_tracepoints()
+
+        Get the number of open tracepoints.
+        """
+        return len(self.tracepoint_fds)
+
+    def perf_buffer_poll(self, timeout = -1):
+        """perf_buffer_poll(self)
+
+        Poll from all open perf ring buffers, calling the callback that was
+        provided when calling open_perf_buffer for each entry.
+        """
+        readers = (ct.c_void_p * len(self.perf_buffers))()
+        for i, v in enumerate(self.perf_buffers.values()):
+            readers[i] = v
+        lib.perf_reader_poll(len(readers), readers, timeout)
+
+    def kprobe_poll(self, timeout = -1):
+        """kprobe_poll(self)
+
+        Deprecated. Use perf_buffer_poll instead.
+        """
+        self.perf_buffer_poll(timeout)
+
+    def donothing(self):
+        """the do nothing exit handler"""
+
+    def cleanup(self):
+        # Clean up opened probes
+        for k, v in list(self.kprobe_fds.items()):
+            self.detach_kprobe_event(k)
+        for k, v in list(self.uprobe_fds.items()):
+            self.detach_uprobe_event(k)
+        for k, v in list(self.tracepoint_fds.items()):
+            self.detach_tracepoint(k)
+        for k, v in list(self.raw_tracepoint_fds.items()):
+            self.detach_raw_tracepoint(k)
+
+        # Clean up opened perf ring buffer and perf events
+        table_keys = list(self.tables.keys())
+        for key in table_keys:
+            if isinstance(self.tables[key], PerfEventArray):
+                del self.tables[key]
+        for (ev_type, ev_config) in list(self.open_perf_events.keys()):
+            self.detach_perf_event(ev_type, ev_config)
+        if self.tracefile:
+            self.tracefile.close()
+            self.tracefile = None
+        if self.module:
+            lib.bpf_module_destroy(self.module)
+            self.module = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.cleanup()
+
+
+from .usdt import USDT, USDTException
diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py
new file mode 100644
index 0000000..e61227e
--- /dev/null
+++ b/src/python/bcc/libbcc.py
@@ -0,0 +1,251 @@
+# Copyright 2015 PLUMgrid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes as ct
+
+lib = ct.CDLL("libbcc.so.0", use_errno=True)
+
+# keep in sync with bpf_common.h
+lib.bpf_module_create_b.restype = ct.c_void_p
+lib.bpf_module_create_b.argtypes = [ct.c_char_p, ct.c_char_p, ct.c_uint]
+lib.bpf_module_create_c.restype = ct.c_void_p
+lib.bpf_module_create_c.argtypes = [ct.c_char_p, ct.c_uint,
+        ct.POINTER(ct.c_char_p), ct.c_int]
+lib.bpf_module_create_c_from_string.restype = ct.c_void_p
+lib.bpf_module_create_c_from_string.argtypes = [ct.c_char_p, ct.c_uint,
+        ct.POINTER(ct.c_char_p), ct.c_int]
+lib.bpf_module_destroy.restype = None
+lib.bpf_module_destroy.argtypes = [ct.c_void_p]
+lib.bpf_module_license.restype = ct.c_char_p
+lib.bpf_module_license.argtypes = [ct.c_void_p]
+lib.bpf_module_kern_version.restype = ct.c_uint
+lib.bpf_module_kern_version.argtypes = [ct.c_void_p]
+lib.bpf_num_functions.restype = ct.c_ulonglong
+lib.bpf_num_functions.argtypes = [ct.c_void_p]
+lib.bpf_function_name.restype = ct.c_char_p
+lib.bpf_function_name.argtypes = [ct.c_void_p, ct.c_ulonglong]
+lib.bpf_function_start.restype = ct.c_void_p
+lib.bpf_function_start.argtypes = [ct.c_void_p, ct.c_char_p]
+lib.bpf_function_size.restype = ct.c_size_t
+lib.bpf_function_size.argtypes = [ct.c_void_p, ct.c_char_p]
+lib.bpf_table_id.restype = ct.c_ulonglong
+lib.bpf_table_id.argtypes = [ct.c_void_p, ct.c_char_p]
+lib.bpf_table_fd.restype = ct.c_int
+lib.bpf_table_fd.argtypes = [ct.c_void_p, ct.c_char_p]
+lib.bpf_table_type_id.restype = ct.c_int
+lib.bpf_table_type_id.argtypes = [ct.c_void_p, ct.c_ulonglong]
+lib.bpf_table_max_entries_id.restype = ct.c_ulonglong
+lib.bpf_table_max_entries_id.argtypes = [ct.c_void_p, ct.c_ulonglong]
+lib.bpf_table_flags_id.restype = ct.c_int
+lib.bpf_table_flags_id.argtypes = [ct.c_void_p, ct.c_ulonglong]
+lib.bpf_table_key_desc.restype = ct.c_char_p
+lib.bpf_table_key_desc.argtypes = [ct.c_void_p, ct.c_char_p]
+lib.bpf_table_leaf_desc.restype = ct.c_char_p
+lib.bpf_table_leaf_desc.argtypes = [ct.c_void_p, ct.c_char_p]
+lib.bpf_table_key_snprintf.restype = ct.c_int
+lib.bpf_table_key_snprintf.argtypes = [ct.c_void_p, ct.c_ulonglong,
+        ct.c_char_p, ct.c_ulonglong, ct.c_void_p]
+lib.bpf_table_leaf_snprintf.restype = ct.c_int
+lib.bpf_table_leaf_snprintf.argtypes = [ct.c_void_p, ct.c_ulonglong,
+        ct.c_char_p, ct.c_ulonglong, ct.c_void_p]
+lib.bpf_table_key_sscanf.restype = ct.c_int
+lib.bpf_table_key_sscanf.argtypes = [ct.c_void_p, ct.c_ulonglong,
+        ct.c_char_p, ct.c_void_p]
+lib.bpf_table_leaf_sscanf.restype = ct.c_int
+lib.bpf_table_leaf_sscanf.argtypes = [ct.c_void_p, ct.c_ulonglong,
+        ct.c_char_p, ct.c_void_p]
+
+# keep in sync with libbpf.h
+lib.bpf_get_next_key.restype = ct.c_int
+lib.bpf_get_next_key.argtypes = [ct.c_int, ct.c_void_p, ct.c_void_p]
+lib.bpf_get_first_key.restype = ct.c_int
+lib.bpf_get_first_key.argtypes = [ct.c_int, ct.c_void_p, ct.c_uint]
+lib.bpf_lookup_elem.restype = ct.c_int
+lib.bpf_lookup_elem.argtypes = [ct.c_int, ct.c_void_p, ct.c_void_p]
+lib.bpf_update_elem.restype = ct.c_int
+lib.bpf_update_elem.argtypes = [ct.c_int, ct.c_void_p, ct.c_void_p,
+        ct.c_ulonglong]
+lib.bpf_delete_elem.restype = ct.c_int
+lib.bpf_delete_elem.argtypes = [ct.c_int, ct.c_void_p]
+lib.bpf_open_raw_sock.restype = ct.c_int
+lib.bpf_open_raw_sock.argtypes = [ct.c_char_p]
+lib.bpf_attach_socket.restype = ct.c_int
+lib.bpf_attach_socket.argtypes = [ct.c_int, ct.c_int]
+lib.bpf_prog_load.restype = ct.c_int
+lib.bpf_prog_load.argtypes = [ct.c_int, ct.c_char_p, ct.c_void_p,
+        ct.c_size_t, ct.c_char_p, ct.c_uint, ct.c_int, ct.c_char_p, ct.c_uint]
+_RAW_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_void_p, ct.c_int)
+_LOST_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_ulonglong)
+lib.bpf_attach_kprobe.restype = ct.c_int
+lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_int, ct.c_char_p, ct.c_char_p]
+lib.bpf_detach_kprobe.restype = ct.c_int
+lib.bpf_detach_kprobe.argtypes = [ct.c_char_p]
+lib.bpf_attach_uprobe.restype = ct.c_int
+lib.bpf_attach_uprobe.argtypes = [ct.c_int, ct.c_int, ct.c_char_p, ct.c_char_p,
+        ct.c_ulonglong, ct.c_int]
+lib.bpf_detach_uprobe.restype = ct.c_int
+lib.bpf_detach_uprobe.argtypes = [ct.c_char_p]
+lib.bpf_attach_tracepoint.restype = ct.c_int
+lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p]
+lib.bpf_detach_tracepoint.restype = ct.c_int
+lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p]
+lib.bpf_attach_raw_tracepoint.restype = ct.c_int
+lib.bpf_attach_raw_tracepoint.argtypes = [ct.c_int, ct.c_char_p]
+lib.bpf_open_perf_buffer.restype = ct.c_void_p
+lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, _LOST_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int]
+lib.bpf_open_perf_event.restype = ct.c_int
+lib.bpf_open_perf_event.argtypes = [ct.c_uint, ct.c_ulonglong, ct.c_int, ct.c_int]
+lib.perf_reader_poll.restype = ct.c_int
+lib.perf_reader_poll.argtypes = [ct.c_int, ct.POINTER(ct.c_void_p), ct.c_int]
+lib.perf_reader_free.restype = None
+lib.perf_reader_free.argtypes = [ct.c_void_p]
+lib.perf_reader_fd.restype = ct.c_int
+lib.perf_reader_fd.argtypes = [ct.c_void_p]
+
+lib.bpf_attach_xdp.restype = ct.c_int
+lib.bpf_attach_xdp.argtypes = [ct.c_char_p, ct.c_int, ct.c_uint]
+
+lib.bpf_attach_perf_event.restype = ct.c_int
+lib.bpf_attach_perf_event.argtypes = [ct.c_int, ct.c_uint, ct.c_uint, ct.c_ulonglong, ct.c_ulonglong,
+        ct.c_int, ct.c_int, ct.c_int]
+
+lib.bpf_close_perf_event_fd.restype = ct.c_int
+lib.bpf_close_perf_event_fd.argtypes = [ct.c_int]
+
+# bcc symbol helpers
+class bcc_symbol(ct.Structure):
+    _fields_ = [
+            ('name', ct.c_char_p),
+            ('demangle_name', ct.c_char_p),
+            ('module', ct.POINTER(ct.c_char)),
+            ('offset', ct.c_ulonglong),
+        ]
+
+class bcc_symbol_option(ct.Structure):
+    _fields_ = [
+            ('use_debug_file', ct.c_int),
+            ('check_debug_file_crc', ct.c_int),
+            ('use_symbol_type', ct.c_uint),
+        ]
+
+lib.bcc_procutils_which_so.restype = ct.POINTER(ct.c_char)
+lib.bcc_procutils_which_so.argtypes = [ct.c_char_p, ct.c_int]
+lib.bcc_procutils_free.restype = None
+lib.bcc_procutils_free.argtypes = [ct.c_void_p]
+lib.bcc_procutils_language.restype = ct.POINTER(ct.c_char)
+lib.bcc_procutils_language.argtypes = [ct.c_int]
+
+lib.bcc_resolve_symname.restype = ct.c_int
+lib.bcc_resolve_symname.argtypes = [
+    ct.c_char_p, ct.c_char_p, ct.c_ulonglong, ct.c_int, ct.POINTER(bcc_symbol_option), ct.POINTER(bcc_symbol)]
+
+_SYM_CB_TYPE = ct.CFUNCTYPE(ct.c_int, ct.c_char_p, ct.c_ulonglong)
+lib.bcc_foreach_function_symbol.restype = ct.c_int
+lib.bcc_foreach_function_symbol.argtypes = [ct.c_char_p, _SYM_CB_TYPE]
+
+lib.bcc_symcache_new.restype = ct.c_void_p
+lib.bcc_symcache_new.argtypes = [ct.c_int, ct.POINTER(bcc_symbol_option)]
+
+lib.bcc_free_symcache.restype = ct.c_void_p
+lib.bcc_free_symcache.argtypes = [ct.c_void_p, ct.c_int]
+
+lib.bcc_symbol_free_demangle_name.restype = ct.c_void_p
+lib.bcc_symbol_free_demangle_name.argtypes = [ct.POINTER(bcc_symbol)]
+
+lib.bcc_symcache_resolve.restype = ct.c_int
+lib.bcc_symcache_resolve.argtypes = [ct.c_void_p, ct.c_ulonglong, ct.POINTER(bcc_symbol)]
+
+lib.bcc_symcache_resolve_no_demangle.restype = ct.c_int
+lib.bcc_symcache_resolve_no_demangle.argtypes = [ct.c_void_p, ct.c_ulonglong, ct.POINTER(bcc_symbol)]
+
+lib.bcc_symcache_resolve_name.restype = ct.c_int
+lib.bcc_symcache_resolve_name.argtypes = [
+    ct.c_void_p, ct.c_char_p, ct.c_char_p, ct.POINTER(ct.c_ulonglong)]
+
+lib.bcc_symcache_refresh.restype = None
+lib.bcc_symcache_refresh.argtypes = [ct.c_void_p]
+
+lib.bcc_usdt_new_frompid.restype = ct.c_void_p
+lib.bcc_usdt_new_frompid.argtypes = [ct.c_int, ct.c_char_p]
+
+lib.bcc_usdt_new_frompath.restype = ct.c_void_p
+lib.bcc_usdt_new_frompath.argtypes = [ct.c_char_p]
+
+lib.bcc_usdt_close.restype = None
+lib.bcc_usdt_close.argtypes = [ct.c_void_p]
+
+lib.bcc_usdt_enable_probe.restype = ct.c_int
+lib.bcc_usdt_enable_probe.argtypes = [ct.c_void_p, ct.c_char_p, ct.c_char_p]
+
+lib.bcc_usdt_genargs.restype = ct.c_char_p
+lib.bcc_usdt_genargs.argtypes = [ct.POINTER(ct.c_void_p), ct.c_int]
+
+lib.bcc_usdt_get_probe_argctype.restype = ct.c_char_p
+lib.bcc_usdt_get_probe_argctype.argtypes = [ct.c_void_p, ct.c_char_p, ct.c_int]
+
+class bcc_usdt(ct.Structure):
+    _fields_ = [
+            ('provider', ct.c_char_p),
+            ('name', ct.c_char_p),
+            ('bin_path', ct.c_char_p),
+            ('semaphore', ct.c_ulonglong),
+            ('num_locations', ct.c_int),
+            ('num_arguments', ct.c_int),
+        ]
+
+class bcc_usdt_location(ct.Structure):
+    _fields_ = [
+            ('address', ct.c_ulonglong),
+            ('bin_path', ct.c_char_p),
+        ]
+
+class BCC_USDT_ARGUMENT_FLAGS(object):
+    NONE = 0x0
+    CONSTANT = 0x1
+    DEREF_OFFSET = 0x2
+    DEREF_IDENT = 0x4
+    BASE_REGISTER_NAME = 0x8
+    INDEX_REGISTER_NAME = 0x10
+    SCALE = 0x20
+
+class bcc_usdt_argument(ct.Structure):
+    _fields_ = [
+            ('size', ct.c_int),
+            ('valid', ct.c_int),
+            ('constant', ct.c_int),
+            ('deref_offset', ct.c_int),
+            ('deref_ident', ct.c_char_p),
+            ('base_register_name', ct.c_char_p),
+            ('index_register_name', ct.c_char_p),
+            ('scale', ct.c_int)
+        ]
+
+_USDT_CB = ct.CFUNCTYPE(None, ct.POINTER(bcc_usdt))
+
+lib.bcc_usdt_foreach.restype = None
+lib.bcc_usdt_foreach.argtypes = [ct.c_void_p, _USDT_CB]
+
+lib.bcc_usdt_get_location.restype = ct.c_int
+lib.bcc_usdt_get_location.argtypes = [ct.c_void_p, ct.c_char_p, ct.c_char_p, ct.c_int,
+                                      ct.POINTER(bcc_usdt_location)]
+
+lib.bcc_usdt_get_argument.restype = ct.c_int
+lib.bcc_usdt_get_argument.argtypes = [ct.c_void_p, ct.c_char_p, ct.c_char_p, ct.c_int,
+                                      ct.c_int, ct.POINTER(bcc_usdt_argument)]
+
+_USDT_PROBE_CB = ct.CFUNCTYPE(None, ct.c_char_p, ct.c_char_p,
+                              ct.c_ulonglong, ct.c_int)
+
+lib.bcc_usdt_foreach_uprobe.restype = None
+lib.bcc_usdt_foreach_uprobe.argtypes = [ct.c_void_p, _USDT_PROBE_CB]
diff --git a/src/python/bcc/perf.py b/src/python/bcc/perf.py
new file mode 100644
index 0000000..44b0128
--- /dev/null
+++ b/src/python/bcc/perf.py
@@ -0,0 +1,109 @@
+# Copyright 2016 Sasha Goldshtein
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes as ct
+import os
+from .utils import get_online_cpus
+
class Perf(object):
        """ctypes helpers around the perf_event_open(2) syscall, used to
        open and enable perf events (tracepoints, software events) on each
        online cpu."""

        class perf_event_attr(ct.Structure):
                # Partial mirror of struct perf_event_attr; fields past
                # wakeup_events are unused placeholders (IGNORE*). Note that
                # the `pid` set by perf_event_open() below is NOT a struct
                # field -- it is a plain python attribute carried on the
                # instance for _open_for_cpu() to read.
                _fields_ = [
                        ('type', ct.c_uint),
                        ('size', ct.c_uint),
                        ('config', ct.c_ulong),
                        ('sample_period', ct.c_ulong),
                        ('sample_type', ct.c_ulong),
                        ('read_format', ct.c_ulong),
                        ('flags', ct.c_ulong),
                        ('wakeup_events', ct.c_uint),
                        ('IGNORE3', ct.c_uint),
                        ('IGNORE4', ct.c_ulong),
                        ('IGNORE5', ct.c_ulong),
                        ('IGNORE6', ct.c_ulong),
                        ('IGNORE7', ct.c_uint),
                        ('IGNORE8', ct.c_int),
                        ('IGNORE9', ct.c_ulong),
                        ('IGNORE10', ct.c_uint),
                        ('IGNORE11', ct.c_uint)
                ]

        # x86 specific, from arch/x86/include/generated/uapi/asm/unistd_64.h
        NR_PERF_EVENT_OPEN = 298

        #
        # Selected constants from include/uapi/linux/perf_event.h.
        # Values copied during Linux 4.7 series.
        #

        # perf_type_id
        PERF_TYPE_HARDWARE = 0
        PERF_TYPE_SOFTWARE = 1
        PERF_TYPE_TRACEPOINT = 2
        PERF_TYPE_HW_CACHE = 3

        # perf_event_sample_format
        PERF_SAMPLE_RAW = 1024      # it's a u32; could also try zero args

        # perf_event_attr
        PERF_ATTR_FLAG_FREQ = 1024

        # perf_event.h
        PERF_FLAG_FD_CLOEXEC = 8
        PERF_EVENT_IOC_SET_FILTER = 1074275334
        PERF_EVENT_IOC_ENABLE = 9216

        # fetch syscall routines
        libc = ct.CDLL('libc.so.6', use_errno=True)
        syscall = libc.syscall          # not declaring vararg types
        ioctl = libc.ioctl              # not declaring vararg types

        @staticmethod
        def _open_for_cpu(cpu, attr):
                """Open and enable one perf event on `cpu`. Raises OSError
                carrying the syscall errno on any failure."""
                pfd = Perf.syscall(Perf.NR_PERF_EVENT_OPEN, ct.byref(attr),
                                   attr.pid, cpu, -1,
                                   Perf.PERF_FLAG_FD_CLOEXEC)
                if pfd < 0:
                        errno_ = ct.get_errno()
                        raise OSError(errno_, os.strerror(errno_))

                # For tracepoints, install a filter that matches no real pid
                # (-17) -- NOTE(review): presumably to suppress perf-side
                # samples while the event stays usable; confirm intent.
                if attr.type == Perf.PERF_TYPE_TRACEPOINT:
                    if Perf.ioctl(pfd, Perf.PERF_EVENT_IOC_SET_FILTER,
                                  "common_pid == -17") < 0:
                            errno_ = ct.get_errno()
                            raise OSError(errno_, os.strerror(errno_))

                # we don't setup the perf ring buffers, as we won't read them

                if Perf.ioctl(pfd, Perf.PERF_EVENT_IOC_ENABLE, 0) < 0:
                        errno_ = ct.get_errno()
                        raise OSError(errno_, os.strerror(errno_))

        @staticmethod
        def perf_event_open(tpoint_id, pid=-1, ptype=PERF_TYPE_TRACEPOINT,
                            freq=0):
                """Open the event (`ptype`, config=`tpoint_id`) on every
                online cpu; freq > 0 enables frequency-based sampling."""
                attr = Perf.perf_event_attr()
                attr.config = tpoint_id
                # `pid` is not a struct field; it rides on the instance and
                # is passed as the syscall's pid argument (see _open_for_cpu).
                attr.pid = pid
                attr.type = ptype
                attr.sample_type = Perf.PERF_SAMPLE_RAW
                if freq > 0:
                    # setup sampling
                    attr.flags = Perf.PERF_ATTR_FLAG_FREQ   # no mmap or comm
                    attr.sample_period = freq
                else:
                    attr.sample_period = 1
                attr.wakeup_events = 9999999                # don't wake up

                for cpu in get_online_cpus():
                        Perf._open_for_cpu(cpu, attr)
diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py
new file mode 100644
index 0000000..301afac
--- /dev/null
+++ b/src/python/bcc/table.py
@@ -0,0 +1,783 @@
+# Copyright 2015 PLUMgrid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import ctypes as ct
import errno
import multiprocessing
import os
from functools import reduce
from subprocess import check_output

# MutableMapping moved to collections.abc in Python 3.3 and was removed
# from the collections top level entirely in Python 3.10.
try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping

from .libbcc import lib, _RAW_CB_TYPE, _LOST_CB_TYPE
from .perf import Perf
from .utils import get_online_cpus
from .utils import get_possible_cpus
+
# Map type ids, mirroring enum bpf_map_type in include/uapi/linux/bpf.h.
BPF_MAP_TYPE_HASH = 1
BPF_MAP_TYPE_ARRAY = 2
BPF_MAP_TYPE_PROG_ARRAY = 3
BPF_MAP_TYPE_PERF_EVENT_ARRAY = 4
BPF_MAP_TYPE_PERCPU_HASH = 5
BPF_MAP_TYPE_PERCPU_ARRAY = 6
BPF_MAP_TYPE_STACK_TRACE = 7
BPF_MAP_TYPE_CGROUP_ARRAY = 8
BPF_MAP_TYPE_LRU_HASH = 9
BPF_MAP_TYPE_LRU_PERCPU_HASH = 10
BPF_MAP_TYPE_LPM_TRIE = 11
BPF_MAP_TYPE_ARRAY_OF_MAPS = 12
BPF_MAP_TYPE_HASH_OF_MAPS = 13
BPF_MAP_TYPE_DEVMAP = 14
BPF_MAP_TYPE_SOCKMAP = 15
BPF_MAP_TYPE_CPUMAP = 16
BPF_MAP_TYPE_XSKMAP = 17
BPF_MAP_TYPE_SOCKHASH = 18

# Histogram rendering limits used by the print helpers below.
stars_max = 40           # widest star bar drawn by _stars()
log2_index_max = 65      # slots needed for any 64-bit log2 histogram
linear_index_max = 1025  # max slots accepted by print_linear_hist()
+
+# helper functions, consider moving these to a utils module
+def _stars(val, val_max, width):
+    i = 0
+    text = ""
+    while (1):
+        if (i > (width * val / val_max) - 1) or (i > width - 1):
+            break
+        text += "*"
+        i += 1
+    if val > val_max:
+        text = text[:-1] + "+"
+    return text
+
+
def _print_log2_hist(vals, val_type, strip_leading_zero):
    """Print a log2 histogram to stdout.

    vals: bucket counts indexed by log2 slot (slot i covers 2^(i-1)..2^i-1).
    val_type: label used in the column header.
    strip_leading_zero: if truthy, suppress empty buckets that precede the
    first non-zero one (interior zero buckets still print).

    Nothing is printed when no slot above index 0 is occupied.
    """
    # (fixed: dropped unused `log2_dist_max` local and the unnecessary
    # `global stars_max` -- reading a module global needs no declaration)
    idx_max = -1
    val_max = 0

    # Find the highest occupied slot and the largest count (for scaling).
    for i, v in enumerate(vals):
        if v > 0: idx_max = i
        if v > val_max: val_max = v

    # Wider range columns (and a narrower bar) once buckets exceed 2^32.
    if idx_max <= 32:
        header = "     %-19s : count     distribution"
        body = "%10d -> %-10d : %-8d |%-*s|"
        stars = stars_max
    else:
        header = "               %-29s : count     distribution"
        body = "%20d -> %-20d : %-8d |%-*s|"
        stars = int(stars_max / 2)

    if idx_max > 0:
        print(header % val_type)

    for i in range(1, idx_max + 1):
        low = (1 << i) >> 1
        high = (1 << i) - 1
        if (low == high):
            low -= 1
        val = vals[i]

        if strip_leading_zero:
            # Still in the leading-zero run: print only once we reach the
            # first non-zero bucket, then stop stripping.
            if val:
                print(body % (low, high, val, stars,
                              _stars(val, val_max, stars)))
                strip_leading_zero = False
        else:
            print(body % (low, high, val, stars,
                          _stars(val, val_max, stars)))
+
def _print_linear_hist(vals, val_type):
    """Print a linear histogram to stdout: one row per index from 0 to the
    highest occupied slot, each with a count and a scaled star bar.

    vals: counts indexed directly by integer slot.
    val_type: label used in the column header.
    """
    # (fixed: dropped unused `log2_dist_max` local, the unnecessary
    # `global stars_max` declaration, and a stray trailing semicolon)
    idx_max = -1
    val_max = 0

    # Find the highest occupied slot and the largest count (for scaling).
    for i, v in enumerate(vals):
        if v > 0: idx_max = i
        if v > val_max: val_max = v

    header = "     %-13s : count     distribution"
    body = "        %-10d : %-8d |%-*s|"
    stars = stars_max

    if idx_max >= 0:
        print(header % val_type)
    for i in range(0, idx_max + 1):
        val = vals[i]
        print(body % (i, val, stars,
                      _stars(val, val_max, stars)))
+
+
def Table(bpf, map_id, map_fd, keytype, leaftype, **kwargs):
    """Table(bpf, map_id, map_fd, keytype, leaftype, **kwargs)

    Create a python object out of a reference to a bpf table handle.

    The map type is queried from libbcc and dispatched to the matching
    wrapper class; **kwargs (e.g. `reducer`) are forwarded only to the
    per-cpu table types. Raises Exception for an unrecognized map type.
    """

    ttype = lib.bpf_table_type_id(bpf.module, map_id)
    t = None
    if ttype == BPF_MAP_TYPE_HASH:
        t = HashTable(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_ARRAY:
        t = Array(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_PROG_ARRAY:
        t = ProgArray(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_PERF_EVENT_ARRAY:
        t = PerfEventArray(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_PERCPU_HASH:
        t = PerCpuHash(bpf, map_id, map_fd, keytype, leaftype, **kwargs)
    elif ttype == BPF_MAP_TYPE_PERCPU_ARRAY:
        t = PerCpuArray(bpf, map_id, map_fd, keytype, leaftype, **kwargs)
    elif ttype == BPF_MAP_TYPE_LPM_TRIE:
        t = LpmTrie(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_STACK_TRACE:
        t = StackTrace(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_LRU_HASH:
        t = LruHash(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_LRU_PERCPU_HASH:
        t = LruPerCpuHash(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_CGROUP_ARRAY:
        t = CgroupArray(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_DEVMAP:
        t = DevMap(bpf, map_id, map_fd, keytype, leaftype)
    elif ttype == BPF_MAP_TYPE_CPUMAP:
        t = CpuMap(bpf, map_id, map_fd, keytype, leaftype)
    # Fixed: compare against None with `is`, not `==` (PEP 8; `==` would
    # invoke a table class's __eq__ if one were ever defined).
    if t is None:
        raise Exception("Unknown table type %d" % ttype)
    return t
+
+
class TableBase(MutableMapping):
    """Dict-like base class wrapping one loaded BPF map.

    Keys and values are ctypes instances of self.Key / self.Leaf; all
    element accesses go through libbcc on self.map_fd. Iteration yields
    keys via the kernel's get_next_key cursor.
    """

    def __init__(self, bpf, map_id, map_fd, keytype, leaftype):
        self.bpf = bpf
        self.map_id = map_id
        self.map_fd = map_fd
        self.Key = keytype
        self.Leaf = leaftype
        self.ttype = lib.bpf_table_type_id(self.bpf.module, self.map_id)
        self.flags = lib.bpf_table_flags_id(self.bpf.module, self.map_id)
        # Keeps ctypes callback objects alive (used by PerfEventArray).
        self._cbs = {}

    def key_sprintf(self, key):
        """Format a ctypes key as a printable bytes string via libbcc."""
        buf = ct.create_string_buffer(ct.sizeof(self.Key) * 8)
        res = lib.bpf_table_key_snprintf(self.bpf.module, self.map_id, buf,
                                         len(buf), ct.byref(key))
        if res < 0:
            raise Exception("Could not printf key")
        return buf.value

    def leaf_sprintf(self, leaf):
        """Format a ctypes leaf as a printable bytes string via libbcc."""
        buf = ct.create_string_buffer(ct.sizeof(self.Leaf) * 8)
        res = lib.bpf_table_leaf_snprintf(self.bpf.module, self.map_id, buf,
                                          len(buf), ct.byref(leaf))
        if res < 0:
            raise Exception("Could not printf leaf")
        return buf.value

    def key_scanf(self, key_str):
        """Parse a string produced by key_sprintf back into a ctypes key."""
        key = self.Key()
        res = lib.bpf_table_key_sscanf(self.bpf.module, self.map_id, key_str,
                                       ct.byref(key))
        if res < 0:
            raise Exception("Could not scanf key")
        return key

    def leaf_scanf(self, leaf_str):
        """Parse a string produced by leaf_sprintf back into a ctypes leaf."""
        leaf = self.Leaf()
        res = lib.bpf_table_leaf_sscanf(self.bpf.module, self.map_id, leaf_str,
                                        ct.byref(leaf))
        if res < 0:
            raise Exception("Could not scanf leaf")
        return leaf

    def __getitem__(self, key):
        leaf = self.Leaf()
        res = lib.bpf_lookup_elem(self.map_fd, ct.byref(key), ct.byref(leaf))
        if res < 0:
            raise KeyError
        return leaf

    def __setitem__(self, key, leaf):
        res = lib.bpf_update_elem(self.map_fd, ct.byref(key), ct.byref(leaf), 0)
        if res < 0:
            errstr = os.strerror(ct.get_errno())
            raise Exception("Could not update table: %s" % errstr)

    def __delitem__(self, key):
        res = lib.bpf_delete_elem(self.map_fd, ct.byref(key))
        if res < 0:
            raise KeyError

    # override the MutableMapping's implementation of these since they
    # don't handle KeyError nicely
    def itervalues(self):
        for key in self:
            # a map entry may be deleted in between discovering the key and
            # fetching the value, suppress such errors
            try:
                yield self[key]
            except KeyError:
                pass

    def iteritems(self):
        for key in self:
            try:
                yield (key, self[key])
            except KeyError:
                pass

    def items(self):
        return [item for item in self.iteritems()]

    def values(self):
        return [value for value in self.itervalues()]

    def clear(self):
        # default clear uses popitem, which can race with the bpf prog
        for k in self.keys():
            self.__delitem__(k)

    def zero(self):
        # Even though this is not very efficient, we grab the entire list of
        # keys before enumerating it. This helps avoid a potential race where
        # the leaf assignment changes a hash table bucket that is being
        # enumerated by the same loop, and may lead to a hang.
        for k in list(self.keys()):
            self[k] = self.Leaf()

    def __iter__(self):
        return TableBase.Iter(self)

    def iter(self): return self.__iter__()
    def keys(self): return self.__iter__()

    class Iter(object):
        """Key iterator backed by the kernel's get_next_key cursor."""
        def __init__(self, table):
            self.table = table
            self.key = None
        def __iter__(self):
            return self
        def __next__(self):
            return self.next()
        def next(self):
            self.key = self.table.next(self.key)
            return self.key

    def next(self, key):
        """Return the key following `key`, or the first key when `key` is
        None; raises StopIteration at the end of the map."""
        next_key = self.Key()

        if key is None:
            res = lib.bpf_get_first_key(self.map_fd, ct.byref(next_key),
                                        ct.sizeof(self.Key))
        else:
            res = lib.bpf_get_next_key(self.map_fd, ct.byref(key),
                                       ct.byref(next_key))

        if res < 0:
            raise StopIteration()
        return next_key

    def print_log2_hist(self, val_type="value", section_header="Bucket ptr",
            section_print_fn=None, bucket_fn=None, strip_leading_zero=None,
            bucket_sort_fn=None):
        """print_log2_hist(val_type="value", section_header="Bucket ptr",
                           section_print_fn=None, bucket_fn=None,
                           strip_leading_zero=None, bucket_sort_fn=None):

        Prints a table as a log2 histogram. The table must be stored as
        log2. The val_type argument is optional, and is a column header.
        If the histogram has a secondary key, multiple tables will print
        and section_header can be used as a header description for each.
        If section_print_fn is not None, it will be passed the bucket value
        to format into a string as it sees fit. If bucket_fn is not None,
        it will be used to produce a bucket value for the histogram keys.
        If the value of strip_leading_zero is not False, prints a histogram
        that is omitted leading zeros from the beginning.
        If bucket_sort_fn is not None, it will be used to sort the buckets
        before iterating them, and it is useful when there are multiple fields
        in the secondary key.
        The maximum index allowed is log2_index_max (65), which will
        accommodate any 64-bit integer in the histogram.
        """
        if isinstance(self.Key(), ct.Structure):
            # Two-field key: first field selects the section (bucket),
            # second is the log2 slot within that section.
            tmp = {}
            f1 = self.Key._fields_[0][0]
            f2 = self.Key._fields_[1][0]
            for k, v in self.items():
                bucket = getattr(k, f1)
                if bucket_fn:
                    bucket = bucket_fn(bucket)
                vals = tmp[bucket] = tmp.get(bucket, [0] * log2_index_max)
                slot = getattr(k, f2)
                vals[slot] = v.value

            buckets = list(tmp.keys())
            if bucket_sort_fn:
                buckets = bucket_sort_fn(buckets)

            for bucket in buckets:
                vals = tmp[bucket]
                if section_print_fn:
                    print("\n%s = %s" % (section_header,
                        section_print_fn(bucket)))
                else:
                    print("\n%s = %r" % (section_header, bucket))
                _print_log2_hist(vals, val_type, strip_leading_zero)
        else:
            vals = [0] * log2_index_max
            for k, v in self.items():
                vals[k.value] = v.value
            _print_log2_hist(vals, val_type, strip_leading_zero)

    def print_linear_hist(self, val_type="value", section_header="Bucket ptr",
            section_print_fn=None, bucket_fn=None, bucket_sort_fn=None):
        """print_linear_hist(val_type="value", section_header="Bucket ptr",
                           section_print_fn=None, bucket_fn=None,
                           bucket_sort_fn=None)

        Prints a table as a linear histogram. This is intended to span integer
        ranges, eg, from 0 to 100. The val_type argument is optional, and is a
        column header.  If the histogram has a secondary key, multiple tables
        will print and section_header can be used as a header description for
        each.  If section_print_fn is not None, it will be passed the bucket
        value to format into a string as it sees fit. If bucket_fn is not None,
        it will be used to produce a bucket value for the histogram keys.
        If bucket_sort_fn is not None, it will be used to sort the buckets
        before iterating them, and it is useful when there are multiple fields
        in the secondary key.
        The maximum index allowed is linear_index_max (1025), which is hoped
        to be sufficient for integer ranges spanned.
        """
        if isinstance(self.Key(), ct.Structure):
            # Two-field key: first field selects the section (bucket),
            # second is the linear slot within that section.
            tmp = {}
            f1 = self.Key._fields_[0][0]
            f2 = self.Key._fields_[1][0]
            for k, v in self.items():
                bucket = getattr(k, f1)
                if bucket_fn:
                    bucket = bucket_fn(bucket)
                vals = tmp[bucket] = tmp.get(bucket, [0] * linear_index_max)
                slot = getattr(k, f2)
                vals[slot] = v.value

            # Fixed: materialize as a list, matching print_log2_hist above,
            # so bucket_sort_fn always receives a list (not a dict view).
            buckets = list(tmp.keys())
            if bucket_sort_fn:
                buckets = bucket_sort_fn(buckets)

            for bucket in buckets:
                vals = tmp[bucket]
                if section_print_fn:
                    print("\n%s = %s" % (section_header,
                        section_print_fn(bucket)))
                else:
                    print("\n%s = %r" % (section_header, bucket))
                _print_linear_hist(vals, val_type)
        else:
            vals = [0] * linear_index_max
            for k, v in self.items():
                try:
                    vals[k.value] = v.value
                except IndexError:
                    # Improve error text. If the limit proves a nuisance, this
                    # function can be rewritten to avoid having one.
                    raise IndexError(("Index in print_linear_hist() of %d " +
                        "exceeds max of %d.") % (k.value, linear_index_max))
            _print_linear_hist(vals, val_type)
+
+
class HashTable(TableBase):
    """General-purpose hash map wrapper (BPF_MAP_TYPE_HASH)."""

    def __init__(self, *args, **kwargs):
        super(HashTable, self).__init__(*args, **kwargs)

    def __len__(self):
        # BPF maps expose no size API, so count by walking the keys.
        return sum(1 for _ in self)
+
class LruHash(HashTable):
    # BPF_MAP_TYPE_LRU_HASH: same python interface as HashTable; the
    # kernel-side map evicts least-recently-used entries when full.
    def __init__(self, *args, **kwargs):
        super(LruHash, self).__init__(*args, **kwargs)
+
class ArrayBase(TableBase):
    """Base class for array-type maps: fixed capacity, integer-indexed.

    Supports negative indices counted from the end, python-list style.
    """
    def __init__(self, *args, **kwargs):
        super(ArrayBase, self).__init__(*args, **kwargs)
        # Array maps have a fixed capacity, queried once from libbcc.
        self.max_entries = int(lib.bpf_table_max_entries_id(self.bpf.module,
                self.map_id))

    def _normalize_key(self, key):
        """Convert a plain int (possibly negative) into a bounds-checked
        ctypes key; pass through keys that are already ctypes scalars."""
        if isinstance(key, int):
            if key < 0:
                key = len(self) + key
            key = self.Key(key)
        if not isinstance(key, ct._SimpleCData):
            raise IndexError("Array index must be an integer type")
        if key.value >= len(self):
            raise IndexError("Array index out of range")
        return key

    def __len__(self):
        return self.max_entries

    def __getitem__(self, key):
        key = self._normalize_key(key)
        return super(ArrayBase, self).__getitem__(key)

    def __setitem__(self, key, leaf):
        key = self._normalize_key(key)
        super(ArrayBase, self).__setitem__(key, leaf)

    def __delitem__(self, key):
        key = self._normalize_key(key)
        super(ArrayBase, self).__delitem__(key)

    def clearitem(self, key):
        """Overwrite the entry at `key` with a zeroed leaf (arrays cannot
        truly delete elements)."""
        key = self._normalize_key(key)
        leaf = self.Leaf()
        res = lib.bpf_update_elem(self.map_fd, ct.byref(key), ct.byref(leaf), 0)
        if res < 0:
            raise Exception("Could not clear item")

    def __iter__(self):
        return ArrayBase.Iter(self, self.Key)

    class Iter(object):
        """Iterator yielding each index 0..max_entries-1 as a ctypes key."""
        def __init__(self, table, keytype):
            self.Key = keytype
            self.table = table
            self.i = -1

        def __iter__(self):
            return self
        def __next__(self):
            return self.next()
        def next(self):
            self.i += 1
            if self.i == len(self.table):
                raise StopIteration()
            return self.Key(self.i)
+
class Array(ArrayBase):
    # BPF_MAP_TYPE_ARRAY: plain fixed-size array map.
    def __init__(self, *args, **kwargs):
        super(Array, self).__init__(*args, **kwargs)

    def __delitem__(self, key):
        # Delete in Array type does not have an effect, so zero out instead
        self.clearitem(key)
+
class ProgArray(ArrayBase):
    # BPF_MAP_TYPE_PROG_ARRAY: array whose leaves are bpf program fds.
    def __init__(self, *args, **kwargs):
        super(ProgArray, self).__init__(*args, **kwargs)

    def __setitem__(self, key, leaf):
        # Accept a raw fd (int) or a loaded bpf Function object; either way
        # the stored leaf is the program's file descriptor.
        if isinstance(leaf, int):
            leaf = self.Leaf(leaf)
        if isinstance(leaf, self.bpf.Function):
            leaf = self.Leaf(leaf.fd)
        super(ProgArray, self).__setitem__(key, leaf)
+
class FileDesc:
    """RAII-style owner of a raw OS file descriptor.

    The descriptor is closed by clean_up(), by context-manager exit, or at
    garbage collection -- whichever happens first; double-close is a no-op.
    Raises Exception if constructed with None or a negative fd.
    """
    def __init__(self, fd):
        # Fixed: set self.fd before validating, so __del__ -> clean_up()
        # does not hit AttributeError when the constructor raises.
        self.fd = None
        if (fd is None) or (fd < 0):
            raise Exception("Invalid file descriptor")
        self.fd = fd

    def clean_up(self):
        # Idempotent: fd is cleared after closing, so later calls no-op.
        if (self.fd is not None) and (self.fd >= 0):
            os.close(self.fd)
            self.fd = None

    def __del__(self):
        self.clean_up()

    def __enter__(self, *args, **kwargs):
        return self

    def __exit__(self, *args, **kwargs):
        self.clean_up()
+
class CgroupArray(ArrayBase):
    # BPF_MAP_TYPE_CGROUP_ARRAY: leaves are cgroup file descriptors.
    def __init__(self, *args, **kwargs):
        super(CgroupArray, self).__init__(*args, **kwargs)

    def __setitem__(self, key, leaf):
        # Accept either an already-open fd (int) or a cgroup directory path
        # (str); a path is opened read-only just long enough to store the fd.
        if isinstance(leaf, int):
            super(CgroupArray, self).__setitem__(key, self.Leaf(leaf))
        elif isinstance(leaf, str):
            # TODO: Add os.O_CLOEXEC once we move to Python version >3.3
            with FileDesc(os.open(leaf, os.O_RDONLY)) as f:
                super(CgroupArray, self).__setitem__(key, self.Leaf(f.fd))
        else:
            raise Exception("Cgroup array key must be either FD or cgroup path")
+
class PerfEventArray(ArrayBase):
    """Map of per-cpu perf event fds (BPF_MAP_TYPE_PERF_EVENT_ARRAY).

    Each entry, keyed by cpu number, is either a perf ring buffer opened
    via open_perf_buffer() or a readable perf counter opened via
    open_perf_event().
    """

    def __init__(self, *args, **kwargs):
        super(PerfEventArray, self).__init__(*args, **kwargs)
        # cpu -> fd for entries from _open_perf_event; -1 when the fd is
        # owned by a perf reader (ring-buffer case) instead.
        self._open_key_fds = {}

    def __del__(self):
        # Release every opened entry (snapshot keys: __delitem__ mutates).
        keys = list(self._open_key_fds.keys())
        for key in keys:
            del self[key]

    def __delitem__(self, key):
        if key not in self._open_key_fds:
            return
        # Delete entry from the array
        super(PerfEventArray, self).__delitem__(key)
        key_id = (id(self), key)
        if key_id in self.bpf.perf_buffers:
            # The key is opened for perf ring buffer
            lib.perf_reader_free(self.bpf.perf_buffers[key_id])
            del self.bpf.perf_buffers[key_id]
            del self._cbs[key]
        else:
            # The key is opened for perf event read
            lib.bpf_close_perf_event_fd(self._open_key_fds[key])
        del self._open_key_fds[key]

    def open_perf_buffer(self, callback, page_cnt=8, lost_cb=None):
        """open_perf_buffers(callback)

        Opens a set of per-cpu ring buffer to receive custom perf event
        data from the bpf program. The callback will be invoked for each
        event submitted from the kernel, up to millions per second. Use
        page_cnt to change the size of the per-cpu ring buffer. The value
        must be a power of two and defaults to 8.
        """

        if page_cnt & (page_cnt - 1) != 0:
            raise Exception("Perf buffer page_cnt must be a power of two")

        for i in get_online_cpus():
            self._open_perf_buffer(i, callback, page_cnt, lost_cb)

    def _open_perf_buffer(self, cpu, callback, page_cnt, lost_cb):
        # Wrap the user callbacks so a broken-pipe IOError (e.g. output
        # piped into `head`) exits quietly instead of raising.
        def raw_cb_(_, data, size):
            try:
                callback(cpu, data, size)
            except IOError as e:
                if e.errno == errno.EPIPE:
                    exit()
                else:
                    raise e
        def lost_cb_(_, lost):
            try:
                lost_cb(lost)
            except IOError as e:
                if e.errno == errno.EPIPE:
                    exit()
                else:
                    raise e
        fn = _RAW_CB_TYPE(raw_cb_)
        lost_fn = _LOST_CB_TYPE(lost_cb_) if lost_cb else ct.cast(None, _LOST_CB_TYPE)
        reader = lib.bpf_open_perf_buffer(fn, lost_fn, None, -1, cpu, page_cnt)
        if not reader:
            raise Exception("Could not open perf buffer")
        fd = lib.perf_reader_fd(reader)
        self[self.Key(cpu)] = self.Leaf(fd)
        self.bpf.perf_buffers[(id(self), cpu)] = reader
        # keep a refcnt
        self._cbs[cpu] = (fn, lost_fn)
        # The actual fd is held by the perf reader, add to track opened keys
        self._open_key_fds[cpu] = -1

    def _open_perf_event(self, cpu, typ, config):
        # Open a readable perf counter on one cpu and store its fd.
        fd = lib.bpf_open_perf_event(typ, config, -1, cpu)
        if fd < 0:
            raise Exception("bpf_open_perf_event failed")
        self[self.Key(cpu)] = self.Leaf(fd)
        self._open_key_fds[cpu] = fd

    def open_perf_event(self, typ, config):
        """open_perf_event(typ, config)

        Configures the table such that calls from the bpf program to
        table.perf_read(CUR_CPU_IDENTIFIER) will return the hardware
        counter denoted by event ev on the local cpu.
        """
        for i in get_online_cpus():
            self._open_perf_event(i, typ, config)
+
+
class PerCpuHash(HashTable):
    """Hash map holding one value slot per possible cpu
    (BPF_MAP_TYPE_PERCPU_HASH).

    __getitem__ returns the per-cpu value array, or a single reduced value
    when a `reducer` callable is supplied at construction.
    """
    def __init__(self, *args, **kwargs):
        self.reducer = kwargs.pop("reducer", None)
        super(PerCpuHash, self).__init__(*args, **kwargs)
        self.sLeaf = self.Leaf
        self.total_cpu = len(get_possible_cpus())
        # This needs to be 8 as hard coded into the linux kernel.
        self.alignment = ct.sizeof(self.sLeaf) % 8
        # Fixed: compare with `==`, not `is` -- identity on ints is a
        # CPython small-int-cache artifact (SyntaxWarning on Python >= 3.8).
        if self.alignment == 0:
            self.Leaf = self.sLeaf * self.total_cpu
        else:
            # Currently Float, Char, un-aligned structs are not supported
            if self.sLeaf == ct.c_uint:
                self.Leaf = ct.c_uint64 * self.total_cpu
            elif self.sLeaf == ct.c_int:
                self.Leaf = ct.c_int64 * self.total_cpu
            else:
                raise IndexError("Leaf must be aligned to 8 bytes")

    def getvalue(self, key):
        """Return the per-cpu values as an array of the original leaf type."""
        result = super(PerCpuHash, self).__getitem__(key)
        if self.alignment == 0:
            ret = result
        else:
            # Narrow the kernel's 8-byte padded values back to sLeaf width.
            ret = (self.sLeaf * self.total_cpu)()
            for i in range(0, self.total_cpu):
                ret[i] = result[i]
        return ret

    def __getitem__(self, key):
        if self.reducer:
            return reduce(self.reducer, self.getvalue(key))
        else:
            return self.getvalue(key)

    def __setitem__(self, key, leaf):
        super(PerCpuHash, self).__setitem__(key, leaf)

    def sum(self, key):
        """Sum of the per-cpu values, wrapped in the scalar leaf type."""
        if isinstance(self.Leaf(), ct.Structure):
            raise IndexError("Leaf must be an integer type for default sum functions")
        return self.sLeaf(sum(self.getvalue(key)))

    def max(self, key):
        """Maximum of the per-cpu values, wrapped in the scalar leaf type."""
        if isinstance(self.Leaf(), ct.Structure):
            raise IndexError("Leaf must be an integer type for default max functions")
        return self.sLeaf(max(self.getvalue(key)))

    def average(self, key):
        """Mean of the per-cpu values (averaged over possible cpus)."""
        result = self.sum(key)
        return result.value / self.total_cpu
+
class LruPerCpuHash(PerCpuHash):
    # BPF_MAP_TYPE_LRU_PERCPU_HASH: per-cpu hash with kernel-side LRU
    # eviction; same python interface as PerCpuHash.
    def __init__(self, *args, **kwargs):
        super(LruPerCpuHash, self).__init__(*args, **kwargs)
+
class PerCpuArray(ArrayBase):
    """Array map holding one value slot per possible cpu
    (BPF_MAP_TYPE_PERCPU_ARRAY).

    __getitem__ returns the per-cpu value array, or a single reduced value
    when a `reducer` callable is supplied at construction.
    """
    def __init__(self, *args, **kwargs):
        self.reducer = kwargs.pop("reducer", None)
        super(PerCpuArray, self).__init__(*args, **kwargs)
        self.sLeaf = self.Leaf
        self.total_cpu = len(get_possible_cpus())
        # This needs to be 8 as hard coded into the linux kernel.
        self.alignment = ct.sizeof(self.sLeaf) % 8
        # Fixed: compare with `==`, not `is` -- identity on ints is a
        # CPython small-int-cache artifact (SyntaxWarning on Python >= 3.8).
        if self.alignment == 0:
            self.Leaf = self.sLeaf * self.total_cpu
        else:
            # Currently Float, Char, un-aligned structs are not supported
            if self.sLeaf == ct.c_uint:
                self.Leaf = ct.c_uint64 * self.total_cpu
            elif self.sLeaf == ct.c_int:
                self.Leaf = ct.c_int64 * self.total_cpu
            else:
                raise IndexError("Leaf must be aligned to 8 bytes")

    def getvalue(self, key):
        """Return the per-cpu values as an array of the original leaf type."""
        result = super(PerCpuArray, self).__getitem__(key)
        if self.alignment == 0:
            ret = result
        else:
            # Narrow the kernel's 8-byte padded values back to sLeaf width.
            ret = (self.sLeaf * self.total_cpu)()
            for i in range(0, self.total_cpu):
                ret[i] = result[i]
        return ret

    def __getitem__(self, key):
        if self.reducer:
            return reduce(self.reducer, self.getvalue(key))
        else:
            return self.getvalue(key)

    def __setitem__(self, key, leaf):
        super(PerCpuArray, self).__setitem__(key, leaf)

    def __delitem__(self, key):
        # Delete in this type does not have an effect, so zero out instead
        self.clearitem(key)

    def sum(self, key):
        """Sum of the per-cpu values, wrapped in the scalar leaf type."""
        if isinstance(self.Leaf(), ct.Structure):
            raise IndexError("Leaf must be an integer type for default sum functions")
        return self.sLeaf(sum(self.getvalue(key)))

    def max(self, key):
        """Maximum of the per-cpu values, wrapped in the scalar leaf type."""
        if isinstance(self.Leaf(), ct.Structure):
            raise IndexError("Leaf must be an integer type for default max functions")
        return self.sLeaf(max(self.getvalue(key)))

    def average(self, key):
        """Mean of the per-cpu values (averaged over possible cpus)."""
        result = self.sum(key)
        return result.value / self.total_cpu
+
class LpmTrie(TableBase):
    # BPF_MAP_TYPE_LPM_TRIE: longest-prefix-match trie.
    def __init__(self, *args, **kwargs):
        super(LpmTrie, self).__init__(*args, **kwargs)

    def __len__(self):
        # Size enumeration is intentionally unsupported for tries.
        raise NotImplementedError
+
+
class StackTrace(TableBase):
    """Stack trace map (BPF_MAP_TYPE_STACK_TRACE): maps stack ids to
    fixed-depth arrays of instruction-pointer addresses."""
    # Maximum number of frames stored per stack.
    MAX_DEPTH = 127

    def __init__(self, *args, **kwargs):
        super(StackTrace, self).__init__(*args, **kwargs)

    class StackWalker(object):
        """Iterator over the addresses of one stack leaf, optionally
        passing each address through a `resolve` callable (e.g. for
        symbolization)."""
        def __init__(self, stack, resolve=None):
            self.stack = stack
            self.n = -1
            self.resolve = resolve

        def __iter__(self):
            return self

        # Python 3 iterator protocol; delegates to the Python 2 style next().
        def __next__(self):
            return self.next()

        def next(self):
            self.n += 1
            if self.n == StackTrace.MAX_DEPTH:
                raise StopIteration()

            # A zero address marks the end of a shorter-than-max stack.
            addr = self.stack.ip[self.n]
            if addr == 0 :
                raise StopIteration()

            return self.resolve(addr) if self.resolve else addr

    def walk(self, stack_id, resolve=None):
        """Return a StackWalker over the stack stored under `stack_id`."""
        return StackTrace.StackWalker(self[self.Key(stack_id)], resolve)

    def __len__(self):
        # No size API on BPF maps; count by walking the keys.
        i = 0
        for k in self: i += 1
        return i

    def clear(self):
        # Intentionally a no-op for stack trace maps.
        pass
+
class DevMap(ArrayBase):
    # BPF_MAP_TYPE_DEVMAP: plain array semantics on the python side.
    def __init__(self, *args, **kwargs):
        super(DevMap, self).__init__(*args, **kwargs)
+
class CpuMap(ArrayBase):
    # BPF_MAP_TYPE_CPUMAP: plain array semantics on the python side.
    def __init__(self, *args, **kwargs):
        super(CpuMap, self).__init__(*args, **kwargs)
diff --git a/src/python/bcc/tcp.py b/src/python/bcc/tcp.py
new file mode 100644
index 0000000..0d25348
--- /dev/null
+++ b/src/python/bcc/tcp.py
@@ -0,0 +1,58 @@
+# Copyright 2018 Netflix, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# from include/net/tcp_states.h:
+tcpstate = {}
+tcpstate[1] = 'ESTABLISHED'
+tcpstate[2] = 'SYN_SENT'
+tcpstate[3] = 'SYN_RECV'
+tcpstate[4] = 'FIN_WAIT1'
+tcpstate[5] = 'FIN_WAIT2'
+tcpstate[6] = 'TIME_WAIT'
+tcpstate[7] = 'CLOSE'
+tcpstate[8] = 'CLOSE_WAIT'
+tcpstate[9] = 'LAST_ACK'
+tcpstate[10] = 'LISTEN'
+tcpstate[11] = 'CLOSING'
+tcpstate[12] = 'NEW_SYN_RECV'
+
+# from include/net/tcp.h:
+TCPHDR_FIN = 0x01
+TCPHDR_SYN = 0x02
+TCPHDR_RST = 0x04
+TCPHDR_PSH = 0x08
+TCPHDR_ACK = 0x10
+TCPHDR_URG = 0x20
+TCPHDR_ECE = 0x40
+TCPHDR_CWR = 0x80
+
+def flags2str(flags):
+    arr = []
+    if flags & TCPHDR_FIN:
+        arr.append("FIN")
+    if flags & TCPHDR_SYN:
+        arr.append("SYN")
+    if flags & TCPHDR_RST:
+        arr.append("RST")
+    if flags & TCPHDR_PSH:
+        arr.append("PSH")
+    if flags & TCPHDR_ACK:
+        arr.append("ACK")
+    if flags & TCPHDR_URG:
+        arr.append("URG")
+    if flags & TCPHDR_ECE:
+        arr.append("ECE")
+    if flags & TCPHDR_CWR:
+        arr.append("CWR")
+    return "|".join(arr)
diff --git a/src/python/bcc/usdt.py b/src/python/bcc/usdt.py
new file mode 100644
index 0000000..5fb1cda
--- /dev/null
+++ b/src/python/bcc/usdt.py
@@ -0,0 +1,202 @@
+# Copyright 2016 Sasha Goldshtein
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes as ct
+import os, sys
+from .libbcc import lib, _USDT_CB, _USDT_PROBE_CB, \
+                    bcc_usdt_location, bcc_usdt_argument, \
+                    BCC_USDT_ARGUMENT_FLAGS
+
+class USDTException(Exception):
+    pass
+
+class USDTProbeArgument(object):
+    def __init__(self, argument):
+        self.signed = argument.size < 0
+        self.size = abs(argument.size)
+        self.valid = argument.valid
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.CONSTANT != 0:
+            self.constant = argument.constant
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_OFFSET != 0:
+            self.deref_offset = argument.deref_offset
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_IDENT != 0:
+            self.deref_ident = argument.deref_ident
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.BASE_REGISTER_NAME != 0:
+            self.base_register_name = argument.base_register_name
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.INDEX_REGISTER_NAME != 0:
+            self.index_register_name = argument.index_register_name
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.SCALE != 0:
+            self.scale = argument.scale
+
+    def _size_prefix(self):
+        return "%d %s bytes" % \
+                (self.size, "signed  " if self.signed else "unsigned")
+
+    def _format(self):
+        # This mimics the logic in cc/usdt_args.cc that gives meaning to the
+        # various argument settings. A change there will require a change here.
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.CONSTANT != 0:
+            return "%d" % self.constant
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_OFFSET == 0:
+            return "%s" % self.base_register_name
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_OFFSET != 0 and \
+           self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_IDENT == 0:
+            if self.valid & BCC_USDT_ARGUMENT_FLAGS.INDEX_REGISTER_NAME != 0:
+                index_offset = " + %s" % self.index_register_name
+                if self.valid & BCC_USDT_ARGUMENT_FLAGS.SCALE != 0:
+                    index_offset += " * %d" % self.scale
+            else:
+                index_offset = ""
+            sign = '+' if self.deref_offset >= 0 else '-'
+            return "*(%s %s %d%s)" % (self.base_register_name,
+                                    sign, abs(self.deref_offset), index_offset)
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_OFFSET != 0 and \
+           self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_IDENT != 0 and \
+           self.valid & BCC_USDT_ARGUMENT_FLAGS.BASE_REGISTER_NAME != 0 and \
+           self.base_register_name == "ip":
+            sign = '+' if self.deref_offset >= 0 else '-'
+            return "*(&%s %s %d)" % (self.deref_ident,
+                                     sign, abs(self.deref_offset))
+        # If we got here, this is an unrecognized case. Doesn't mean it's
+        # necessarily bad, so just provide the raw data. It just means that
+        # other tools won't be able to work with this argument.
+        return "unrecognized argument format, flags %d" % self.valid
+
+    def __str__(self):
+        return "%s @ %s" % (self._size_prefix(), self._format())
+
+class USDTProbeLocation(object):
+    def __init__(self, probe, index, location):
+        self.probe = probe
+        self.index = index
+        self.num_arguments = probe.num_arguments
+        self.address = location.address
+        self.bin_path = location.bin_path
+
+    def __str__(self):
+        return "%s 0x%x" % (self.bin_path, self.address)
+
+    def get_argument(self, index):
+        arg = bcc_usdt_argument()
+        res = lib.bcc_usdt_get_argument(self.probe.context, self.probe.provider,
+                                        self.probe.name,
+                                        self.index, index, ct.byref(arg))
+        if res != 0:
+            raise USDTException(
+                    "error retrieving probe argument %d location %d" %
+                    (index, self.index))
+        return USDTProbeArgument(arg)
+
+class USDTProbe(object):
+    def __init__(self, context, probe):
+        self.context = context
+        self.provider = probe.provider
+        self.name = probe.name
+        self.bin_path = probe.bin_path
+        self.semaphore = probe.semaphore
+        self.num_locations = probe.num_locations
+        self.num_arguments = probe.num_arguments
+
+    def __str__(self):
+        return "%s:%s [sema 0x%x]" % \
+               (self.provider, self.name, self.semaphore)
+
+    def short_name(self):
+        return "%s:%s" % (self.provider, self.name)
+
+    def get_location(self, index):
+        loc = bcc_usdt_location()
+        res = lib.bcc_usdt_get_location(self.context, self.provider, self.name,
+                                        index, ct.byref(loc))
+        if res != 0:
+            raise USDTException("error retrieving probe location %d" % index)
+        return USDTProbeLocation(self, index, loc)
+
+class USDT(object):
+    def __init__(self, pid=None, path=None):
+        if pid and pid != -1:
+            self.pid = pid
+            if path:
+                self.context = lib.bcc_usdt_new_frompid(pid, path.encode('ascii'))
+            else:
+                self.context = lib.bcc_usdt_new_frompid(pid, ct.c_char_p(0))
+            if self.context is None:
+                raise USDTException("USDT failed to instrument PID %d" % pid)
+        elif path:
+            self.path = path
+            self.context = lib.bcc_usdt_new_frompath(path.encode('ascii'))
+            if self.context is None:
+                raise USDTException("USDT failed to instrument path %s" % path)
+        else:
+            raise USDTException(
+                    "either a pid or a binary path must be specified")
+
+    def __del__(self):
+        lib.bcc_usdt_close(self.context)
+
+    def enable_probe(self, probe, fn_name):
+        if lib.bcc_usdt_enable_probe(self.context, probe.encode('ascii'),
+                fn_name.encode('ascii')) != 0:
+            raise USDTException(
+                    ("failed to enable probe '%s'; a possible cause " +
+                     "can be that the probe requires a pid to enable") %
+                     probe
+                  )
+
+    def enable_probe_or_bail(self, probe, fn_name):
+        if lib.bcc_usdt_enable_probe(self.context, probe.encode('ascii'),
+                fn_name.encode('ascii')) != 0:
+            print(
+"""Error attaching USDT probes: the specified pid might not contain the
+given language's runtime, or the runtime was not built with the required
+USDT probes. Look for a configure flag similar to --with-dtrace or
+--enable-dtrace. To check which probes are present in the process, use the
+tplist tool.""")
+            sys.exit(1)
+
+    def get_context(self):
+        return self.context
+
+    def get_text(self):
+        ctx_array = (ct.c_void_p * 1)()
+        ctx_array[0] = ct.c_void_p(self.context)
+        return lib.bcc_usdt_genargs(ctx_array, 1).decode()
+
+    def get_probe_arg_ctype(self, probe_name, arg_index):
+        return lib.bcc_usdt_get_probe_argctype(
+            self.context, probe_name.encode('ascii'), arg_index).decode()
+
+    def enumerate_probes(self):
+        probes = []
+        def _add_probe(probe):
+            probes.append(USDTProbe(self.context, probe.contents))
+
+        lib.bcc_usdt_foreach(self.context, _USDT_CB(_add_probe))
+        return probes
+
+    # This is called by the BPF module's __init__ when it realizes that there
+    # is a USDT context and probes need to be attached.
+    def attach_uprobes(self, bpf):
+        probes = self.enumerate_active_probes()
+        for (binpath, fn_name, addr, pid) in probes:
+            bpf.attach_uprobe(name=binpath.decode(), fn_name=fn_name.decode(),
+                              addr=addr, pid=pid)
+
+    def enumerate_active_probes(self):
+        probes = []
+        def _add_probe(binpath, fn_name, addr, pid):
+            probes.append((binpath, fn_name, addr, pid))
+
+        lib.bcc_usdt_foreach_uprobe(self.context, _USDT_PROBE_CB(_add_probe))
+        return probes
diff --git a/src/python/bcc/utils.py b/src/python/bcc/utils.py
new file mode 100644
index 0000000..ef6f81d
--- /dev/null
+++ b/src/python/bcc/utils.py
@@ -0,0 +1,99 @@
+# Copyright 2016 Catalysts GmbH
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import ctypes as ct
+import sys
+import traceback
+import warnings
+
+from .libbcc import lib
+
+def _read_cpu_range(path):
+    cpus = []
+    with open(path, 'r') as f:
+        cpus_range_str = f.read()
+        for cpu_range in cpus_range_str.split(','):
+            rangeop = cpu_range.find('-')
+            if rangeop == -1:
+                cpus.append(int(cpu_range))
+            else:
+                start = int(cpu_range[:rangeop])
+                end = int(cpu_range[rangeop+1:])
+                cpus.extend(range(start, end+1))
+    return cpus
+
+def get_online_cpus():
+    return _read_cpu_range('/sys/devices/system/cpu/online')
+
+def get_possible_cpus():
+    return _read_cpu_range('/sys/devices/system/cpu/possible')
+
+def detect_language(candidates, pid):
+    res = lib.bcc_procutils_language(pid)
+    language = ct.cast(res, ct.c_char_p).value.decode()
+    return language if language in candidates else None
+
+FILESYSTEMENCODING = sys.getfilesystemencoding()
+
+def printb(s, file=sys.stdout, nl=1):
+    """
+    printb(s)
+
+    print a bytes object to stdout and flush
+    """
+    buf = file.buffer if hasattr(file, "buffer") else file
+
+    buf.write(s)
+    if nl:
+        buf.write(b"\n")
+    file.flush()
+
+class ArgString(object):
+    """
+    ArgString(arg)
+
+    encapsulate a system argument that can be easily coerced to a bytes()
+    object, which is better for comparing to kernel or probe data (which should
+    never be en/decode()'ed).
+    """
+    def __init__(self, arg):
+        if sys.version_info[0] >= 3:
+            self.s = arg
+        else:
+            self.s = arg.decode(FILESYSTEMENCODING)
+
+    def __bytes__(self):
+        return self.s.encode(FILESYSTEMENCODING)
+
+    def __str__(self):
+        return self.s if sys.version_info[0] >= 3 else self.__bytes__()
+
+def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
+    log = file if hasattr(file, "write") else sys.stderr
+    traceback.print_stack(f=sys._getframe(2), file=log)
+    log.write(warnings.formatwarning(message, category, filename, lineno, line))
+
+# uncomment to get full tracebacks for invalid uses of python3+str in arguments
+#warnings.showwarning = warn_with_traceback
+
+_strict_bytes = False
+def _assert_is_bytes(arg):
+    if arg is None:
+        return arg
+    if _strict_bytes:
+        assert type(arg) is bytes, "not a bytes object: %r" % arg
+    elif type(arg) is not bytes:
+        warnings.warn("not a bytes object: %r" % arg, DeprecationWarning, 2)
+        return ArgString(arg).__bytes__()
+    return arg
+
diff --git a/src/python/bcc/version.py.in b/src/python/bcc/version.py.in
new file mode 100644
index 0000000..a9764a7
--- /dev/null
+++ b/src/python/bcc/version.py.in
@@ -0,0 +1 @@
+__version__ = '@REVISION@'
diff --git a/src/python/setup.py.in b/src/python/setup.py.in
new file mode 100644
index 0000000..8e05d4f
--- /dev/null
+++ b/src/python/setup.py.in
@@ -0,0 +1,17 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+from distutils.core import setup
+import os
+import sys
+
+if os.environ.get('DESTDIR'):
+    sys.argv += ['--root', os.environ['DESTDIR']]
+
+setup(name='bcc',
+      version='@REVISION@',
+      description='BPF Loader Library',
+      author='Brenden Blanco',
+      author_email='bblanco@plumgrid.com',
+      url='https://github.com/iovisor/bcc',
+      packages=['bcc'],
+      platforms=['Linux'])
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..86abec9
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+configure_file(wrapper.sh.in "${CMAKE_CURRENT_BINARY_DIR}/wrapper.sh" @ONLY)
+
+set(TEST_WRAPPER ${CMAKE_CURRENT_BINARY_DIR}/wrapper.sh)
+
+add_test(NAME style-check COMMAND ${CMAKE_SOURCE_DIR}/scripts/style-check.sh)
+set_tests_properties(style-check PROPERTIES PASS_REGULAR_EXPRESSION ".*")
+
+if(ENABLE_CLANG_JIT)
+add_subdirectory(cc)
+add_subdirectory(python)
+add_subdirectory(lua)
+endif()
diff --git a/tests/cc/CMakeLists.txt b/tests/cc/CMakeLists.txt
new file mode 100644
index 0000000..335b428
--- /dev/null
+++ b/tests/cc/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+include_directories(${CMAKE_SOURCE_DIR}/src/cc)
+include_directories(${CMAKE_SOURCE_DIR}/src/cc/api)
+
+add_executable(test_static test_static.c)
+target_link_libraries(test_static bcc-static)
+
+add_test(NAME c_test_static COMMAND ${TEST_WRAPPER} c_test_static sudo ${CMAKE_CURRENT_BINARY_DIR}/test_static)
+
+if(ENABLE_USDT)
+add_executable(test_libbcc
+	test_libbcc.cc
+	test_c_api.cc
+	test_array_table.cc
+	test_bpf_table.cc
+	test_hash_table.cc
+	test_perf_event.cc
+	test_prog_table.cc
+	test_shared_table.cc
+	test_usdt_args.cc
+	test_usdt_probes.cc
+	utils.cc)
+
+target_link_libraries(test_libbcc bcc-shared dl)
+add_test(NAME test_libbcc COMMAND ${TEST_WRAPPER} c_test_all sudo ${CMAKE_CURRENT_BINARY_DIR}/test_libbcc)
+
+find_path(SDT_HEADER NAMES "sys/sdt.h")
+if (SDT_HEADER)
+	target_compile_definitions(test_libbcc PRIVATE HAVE_SDT_HEADER=1)
+endif()
+endif()
diff --git a/tests/cc/catch.hpp b/tests/cc/catch.hpp
new file mode 100644
index 0000000..5e98ddd
--- /dev/null
+++ b/tests/cc/catch.hpp
@@ -0,0 +1,10444 @@
+/*
+ *  Catch v1.4.0
+ *  Generated: 2016-03-15 07:23:12.623111
+ *  ----------------------------------------------------------
+ *  This file has been merged from multiple headers. Please don't edit it directly
+ *  Copyright (c) 2012 Two Blue Cubes Ltd. All rights reserved.
+ *
+ *  Distributed under the Boost Software License, Version 1.0. (See accompanying
+ *  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+ */
+#ifndef TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED
+#define TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED
+
+#define TWOBLUECUBES_CATCH_HPP_INCLUDED
+
+#ifdef __clang__
+#    pragma clang system_header
+#elif defined __GNUC__
+#    pragma GCC system_header
+#endif
+
+// #included from: internal/catch_suppress_warnings.h
+
+#ifdef __clang__
+#   ifdef __ICC // icpc defines the __clang__ macro
+#       pragma warning(push)
+#       pragma warning(disable: 161 1682)
+#   else // __ICC
+#       pragma clang diagnostic ignored "-Wglobal-constructors"
+#       pragma clang diagnostic ignored "-Wvariadic-macros"
+#       pragma clang diagnostic ignored "-Wc99-extensions"
+#       pragma clang diagnostic ignored "-Wunused-variable"
+#       pragma clang diagnostic push
+#       pragma clang diagnostic ignored "-Wpadded"
+#       pragma clang diagnostic ignored "-Wc++98-compat"
+#       pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+#       pragma clang diagnostic ignored "-Wswitch-enum"
+#       pragma clang diagnostic ignored "-Wcovered-switch-default"
+#    endif
+#elif defined __GNUC__
+#    pragma GCC diagnostic ignored "-Wvariadic-macros"
+#    pragma GCC diagnostic ignored "-Wunused-variable"
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wpadded"
+#endif
+#if defined(CATCH_CONFIG_MAIN) || defined(CATCH_CONFIG_RUNNER)
+#  define CATCH_IMPL
+#endif
+
+#ifdef CATCH_IMPL
+#  ifndef CLARA_CONFIG_MAIN
+#    define CLARA_CONFIG_MAIN_NOT_DEFINED
+#    define CLARA_CONFIG_MAIN
+#  endif
+#endif
+
+// #included from: internal/catch_notimplemented_exception.h
+#define TWOBLUECUBES_CATCH_NOTIMPLEMENTED_EXCEPTION_H_INCLUDED
+
+// #included from: catch_common.h
+#define TWOBLUECUBES_CATCH_COMMON_H_INCLUDED
+
+#define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line
+#define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line )
+#ifdef CATCH_CONFIG_COUNTER
+#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __COUNTER__ )
+#else
+#  define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ )
+#endif
+
+#define INTERNAL_CATCH_STRINGIFY2( expr ) #expr
+#define INTERNAL_CATCH_STRINGIFY( expr ) INTERNAL_CATCH_STRINGIFY2( expr )
+
+#include <sstream>
+#include <stdexcept>
+#include <algorithm>
+
+// #included from: catch_compiler_capabilities.h
+#define TWOBLUECUBES_CATCH_COMPILER_CAPABILITIES_HPP_INCLUDED
+
+// Detect a number of compiler features - mostly C++11/14 conformance - by compiler
+// The following features are defined:
+//
+// CATCH_CONFIG_CPP11_NULLPTR : is nullptr supported?
+// CATCH_CONFIG_CPP11_NOEXCEPT : is noexcept supported?
+// CATCH_CONFIG_CPP11_GENERATED_METHODS : The delete and default keywords for compiler generated methods
+// CATCH_CONFIG_CPP11_IS_ENUM : std::is_enum is supported?
+// CATCH_CONFIG_CPP11_TUPLE : std::tuple is supported
+// CATCH_CONFIG_CPP11_LONG_LONG : is long long supported?
+// CATCH_CONFIG_CPP11_OVERRIDE : is override supported?
+// CATCH_CONFIG_CPP11_UNIQUE_PTR : is unique_ptr supported (otherwise use auto_ptr)
+
+// CATCH_CONFIG_CPP11_OR_GREATER : Is C++11 supported?
+
+// CATCH_CONFIG_VARIADIC_MACROS : are variadic macros supported?
+// CATCH_CONFIG_COUNTER : is the __COUNTER__ macro supported?
+// ****************
+// Note to maintainers: if new toggles are added please document them
+// in configuration.md, too
+// ****************
+
+// In general each macro has a _NO_<feature name> form
+// (e.g. CATCH_CONFIG_CPP11_NO_NULLPTR) which disables the feature.
+// Many features, at point of detection, define an _INTERNAL_ macro, so they
+// can be combined, en-mass, with the _NO_ forms later.
+
+// All the C++11 features can be disabled with CATCH_CONFIG_NO_CPP11
+
+#if defined(__cplusplus) && __cplusplus >= 201103L
+#  define CATCH_CPP11_OR_GREATER
+#endif
+
+#ifdef __clang__
+
+#  if __has_feature(cxx_nullptr)
+#    define CATCH_INTERNAL_CONFIG_CPP11_NULLPTR
+#  endif
+
+#  if __has_feature(cxx_noexcept)
+#    define CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT
+#  endif
+
+#   if defined(CATCH_CPP11_OR_GREATER)
+#       define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS _Pragma( "clang diagnostic ignored \"-Wparentheses\"" )
+#   endif
+
+#endif // __clang__
+
+////////////////////////////////////////////////////////////////////////////////
+// Borland
+#ifdef __BORLANDC__
+
+#endif // __BORLANDC__
+
+////////////////////////////////////////////////////////////////////////////////
+// EDG
+#ifdef __EDG_VERSION__
+
+#endif // __EDG_VERSION__
+
+////////////////////////////////////////////////////////////////////////////////
+// Digital Mars
+#ifdef __DMC__
+
+#endif // __DMC__
+
+////////////////////////////////////////////////////////////////////////////////
+// GCC
+#ifdef __GNUC__
+
+#   if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && defined(__GXX_EXPERIMENTAL_CXX0X__)
+#       define CATCH_INTERNAL_CONFIG_CPP11_NULLPTR
+#   endif
+
+#   if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS) && defined(CATCH_CPP11_OR_GREATER)
+#       define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS _Pragma( "GCC diagnostic ignored \"-Wparentheses\"" )
+#   endif
+
+// - otherwise more recent versions define __cplusplus >= 201103L
+// and will get picked up below
+
+#endif // __GNUC__
+
+////////////////////////////////////////////////////////////////////////////////
+// Visual C++
+#ifdef _MSC_VER
+
+#if (_MSC_VER >= 1600)
+#   define CATCH_INTERNAL_CONFIG_CPP11_NULLPTR
+#   define CATCH_INTERNAL_CONFIG_CPP11_UNIQUE_PTR
+#endif
+
+#if (_MSC_VER >= 1900 ) // (VC++ 13 (VS2015))
+#define CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT
+#define CATCH_INTERNAL_CONFIG_CPP11_GENERATED_METHODS
+#endif
+
+#endif // _MSC_VER
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Use variadic macros if the compiler supports them
+#if ( defined _MSC_VER && _MSC_VER > 1400 && !defined __EDGE__) || \
+    ( defined __WAVE__ && __WAVE_HAS_VARIADICS ) || \
+    ( defined __GNUC__ && __GNUC__ >= 3 ) || \
+    ( !defined __cplusplus && __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L )
+
+#define CATCH_INTERNAL_CONFIG_VARIADIC_MACROS
+
+#endif
+
+// Use __COUNTER__ if the compiler supports it
+#if ( defined _MSC_VER && _MSC_VER >= 1300 ) || \
+    ( defined __GNUC__  && __GNUC__ >= 4 && __GNUC_MINOR__ >= 3 ) || \
+    ( defined __clang__ && __clang_major__ >= 3 )
+
+#define CATCH_INTERNAL_CONFIG_COUNTER
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// C++ language feature support
+
+// catch all support for C++11
+#if defined(CATCH_CPP11_OR_GREATER)
+
+#  if !defined(CATCH_INTERNAL_CONFIG_CPP11_NULLPTR)
+#    define CATCH_INTERNAL_CONFIG_CPP11_NULLPTR
+#  endif
+
+#  ifndef CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT
+#    define CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT
+#  endif
+
+#  ifndef CATCH_INTERNAL_CONFIG_CPP11_GENERATED_METHODS
+#    define CATCH_INTERNAL_CONFIG_CPP11_GENERATED_METHODS
+#  endif
+
+#  ifndef CATCH_INTERNAL_CONFIG_CPP11_IS_ENUM
+#    define CATCH_INTERNAL_CONFIG_CPP11_IS_ENUM
+#  endif
+
+#  ifndef CATCH_INTERNAL_CONFIG_CPP11_TUPLE
+#    define CATCH_INTERNAL_CONFIG_CPP11_TUPLE
+#  endif
+
+#  ifndef CATCH_INTERNAL_CONFIG_VARIADIC_MACROS
+#    define CATCH_INTERNAL_CONFIG_VARIADIC_MACROS
+#  endif
+
+#  if !defined(CATCH_INTERNAL_CONFIG_CPP11_LONG_LONG)
+#    define CATCH_INTERNAL_CONFIG_CPP11_LONG_LONG
+#  endif
+
+#  if !defined(CATCH_INTERNAL_CONFIG_CPP11_OVERRIDE)
+#    define CATCH_INTERNAL_CONFIG_CPP11_OVERRIDE
+#  endif
+#  if !defined(CATCH_INTERNAL_CONFIG_CPP11_UNIQUE_PTR)
+#    define CATCH_INTERNAL_CONFIG_CPP11_UNIQUE_PTR
+#  endif
+
+#endif // __cplusplus >= 201103L
+
+// Now set the actual defines based on the above + anything the user has configured
+#if defined(CATCH_INTERNAL_CONFIG_CPP11_NULLPTR) && !defined(CATCH_CONFIG_CPP11_NO_NULLPTR) && !defined(CATCH_CONFIG_CPP11_NULLPTR) && !defined(CATCH_CONFIG_NO_CPP11)
+#   define CATCH_CONFIG_CPP11_NULLPTR
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_CPP11_NOEXCEPT) && !defined(CATCH_CONFIG_CPP11_NO_NOEXCEPT) && !defined(CATCH_CONFIG_CPP11_NOEXCEPT) && !defined(CATCH_CONFIG_NO_CPP11)
+#   define CATCH_CONFIG_CPP11_NOEXCEPT
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_CPP11_GENERATED_METHODS) && !defined(CATCH_CONFIG_CPP11_NO_GENERATED_METHODS) && !defined(CATCH_CONFIG_CPP11_GENERATED_METHODS) && !defined(CATCH_CONFIG_NO_CPP11)
+#   define CATCH_CONFIG_CPP11_GENERATED_METHODS
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_CPP11_IS_ENUM) && !defined(CATCH_CONFIG_CPP11_NO_IS_ENUM) && !defined(CATCH_CONFIG_CPP11_IS_ENUM) && !defined(CATCH_CONFIG_NO_CPP11)
+#   define CATCH_CONFIG_CPP11_IS_ENUM
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_CPP11_TUPLE) && !defined(CATCH_CONFIG_CPP11_NO_TUPLE) && !defined(CATCH_CONFIG_CPP11_TUPLE) && !defined(CATCH_CONFIG_NO_CPP11)
+#   define CATCH_CONFIG_CPP11_TUPLE
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_VARIADIC_MACROS) && !defined(CATCH_CONFIG_NO_VARIADIC_MACROS) && !defined(CATCH_CONFIG_VARIADIC_MACROS)
+#   define CATCH_CONFIG_VARIADIC_MACROS
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_CPP11_LONG_LONG) && !defined(CATCH_CONFIG_NO_LONG_LONG) && !defined(CATCH_CONFIG_CPP11_LONG_LONG) && !defined(CATCH_CONFIG_NO_CPP11)
+#   define CATCH_CONFIG_CPP11_LONG_LONG
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_CPP11_OVERRIDE) && !defined(CATCH_CONFIG_NO_OVERRIDE) && !defined(CATCH_CONFIG_CPP11_OVERRIDE) && !defined(CATCH_CONFIG_NO_CPP11)
+#   define CATCH_CONFIG_CPP11_OVERRIDE
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_CPP11_UNIQUE_PTR) && !defined(CATCH_CONFIG_NO_UNIQUE_PTR) && !defined(CATCH_CONFIG_CPP11_UNIQUE_PTR) && !defined(CATCH_CONFIG_NO_CPP11)
+#   define CATCH_CONFIG_CPP11_UNIQUE_PTR
+#endif
+#if defined(CATCH_INTERNAL_CONFIG_COUNTER) && !defined(CATCH_CONFIG_NO_COUNTER) && !defined(CATCH_CONFIG_COUNTER)
+#   define CATCH_CONFIG_COUNTER
+#endif
+
+#if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS)
+#   define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS
+#endif
+
+// noexcept support:
+#if defined(CATCH_CONFIG_CPP11_NOEXCEPT) && !defined(CATCH_NOEXCEPT)
+#  define CATCH_NOEXCEPT noexcept
+#  define CATCH_NOEXCEPT_IS(x) noexcept(x)
+#else
+#  define CATCH_NOEXCEPT throw()
+#  define CATCH_NOEXCEPT_IS(x)
+#endif
+
+// nullptr support
+#ifdef CATCH_CONFIG_CPP11_NULLPTR
+#   define CATCH_NULL nullptr
+#else
+#   define CATCH_NULL NULL
+#endif
+
+// override support
+#ifdef CATCH_CONFIG_CPP11_OVERRIDE
+#   define CATCH_OVERRIDE override
+#else
+#   define CATCH_OVERRIDE
+#endif
+
+// unique_ptr support
+#ifdef CATCH_CONFIG_CPP11_UNIQUE_PTR
+#   define CATCH_AUTO_PTR( T ) std::unique_ptr<T>
+#else
+#   define CATCH_AUTO_PTR( T ) std::auto_ptr<T>
+#endif
+
+namespace Catch {
+
+    struct IConfig;
+
+    struct CaseSensitive { enum Choice {
+        Yes,
+        No
+    }; };
+
+    class NonCopyable {
+#ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+        NonCopyable( NonCopyable const& )              = delete;
+        NonCopyable( NonCopyable && )                  = delete;
+        NonCopyable& operator = ( NonCopyable const& ) = delete;
+        NonCopyable& operator = ( NonCopyable && )     = delete;
+#else
+        NonCopyable( NonCopyable const& info );
+        NonCopyable& operator = ( NonCopyable const& );
+#endif
+
+    protected:
+        NonCopyable() {}
+        virtual ~NonCopyable();
+    };
+
+    class SafeBool {
+    public:
+        typedef void (SafeBool::*type)() const;
+
+        static type makeSafe( bool value ) {
+            return value ? &SafeBool::trueValue : 0;
+        }
+    private:
+        void trueValue() const {}
+    };
+
+    template<typename ContainerT>
+    inline void deleteAll( ContainerT& container ) {
+        typename ContainerT::const_iterator it = container.begin();
+        typename ContainerT::const_iterator itEnd = container.end();
+        for(; it != itEnd; ++it )
+            delete *it;
+    }
+    template<typename AssociativeContainerT>
+    inline void deleteAllValues( AssociativeContainerT& container ) {
+        typename AssociativeContainerT::const_iterator it = container.begin();
+        typename AssociativeContainerT::const_iterator itEnd = container.end();
+        for(; it != itEnd; ++it )
+            delete it->second;
+    }
+
+    bool startsWith( std::string const& s, std::string const& prefix );
+    bool endsWith( std::string const& s, std::string const& suffix );
+    bool contains( std::string const& s, std::string const& infix );
+    void toLowerInPlace( std::string& s );
+    std::string toLower( std::string const& s );
+    std::string trim( std::string const& str );
+    bool replaceInPlace( std::string& str, std::string const& replaceThis, std::string const& withThis );
+
+    struct pluralise {
+        pluralise( std::size_t count, std::string const& label );
+
+        friend std::ostream& operator << ( std::ostream& os, pluralise const& pluraliser );
+
+        std::size_t m_count;
+        std::string m_label;
+    };
+
+    struct SourceLineInfo {
+
+        SourceLineInfo();
+        SourceLineInfo( char const* _file, std::size_t _line );
+        SourceLineInfo( SourceLineInfo const& other );
+#  ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+        SourceLineInfo( SourceLineInfo && )                  = default;
+        SourceLineInfo& operator = ( SourceLineInfo const& ) = default;
+        SourceLineInfo& operator = ( SourceLineInfo && )     = default;
+#  endif
+        bool empty() const;
+        bool operator == ( SourceLineInfo const& other ) const;
+        bool operator < ( SourceLineInfo const& other ) const;
+
+        std::string file;
+        std::size_t line;
+    };
+
+    std::ostream& operator << ( std::ostream& os, SourceLineInfo const& info );
+
+    // This is just here to avoid compiler warnings with macro constants and boolean literals
+    inline bool isTrue( bool value ){ return value; }
+    inline bool alwaysTrue() { return true; }
+    inline bool alwaysFalse() { return false; }
+
+    void throwLogicError( std::string const& message, SourceLineInfo const& locationInfo );
+
+    void seedRng( IConfig const& config );
+    unsigned int rngSeed();
+
+    // Use this in variadic streaming macros to allow
+    //    >> +StreamEndStop
+    // as well as
+    //    >> stuff +StreamEndStop
+    struct StreamEndStop {
+        std::string operator+() {
+            return std::string();
+        }
+    };
+    template<typename T>
+    T const& operator + ( T const& value, StreamEndStop ) {
+        return value;
+    }
+}
+
+#define CATCH_INTERNAL_LINEINFO ::Catch::SourceLineInfo( __FILE__, static_cast<std::size_t>( __LINE__ ) )
+#define CATCH_INTERNAL_ERROR( msg ) ::Catch::throwLogicError( msg, CATCH_INTERNAL_LINEINFO );
+
+#include <ostream>
+
+namespace Catch {
+
+    // Thrown by CATCH_NOT_IMPLEMENTED to mark unfinished functionality,
+    // carrying the source location where it was raised.
+    class NotImplementedException : public std::exception
+    {
+    public:
+        NotImplementedException( SourceLineInfo const& lineInfo );
+        // NOTE(review): the copy constructor leaves m_what/m_lineInfo
+        // default-constructed (the body is empty), so the what() text is lost
+        // on copy — this matches upstream Catch but looks intentional only
+        // for rethrow-by-value plumbing; confirm before relying on it.
+        NotImplementedException( NotImplementedException const& ) {}
+
+        virtual ~NotImplementedException() CATCH_NOEXCEPT {}
+
+        // Returns the formatted "not implemented" message (built in the ctor).
+        virtual const char* what() const CATCH_NOEXCEPT;
+
+    private:
+        std::string m_what;         // cached message returned by what()
+        SourceLineInfo m_lineInfo;  // where CATCH_NOT_IMPLEMENTED was expanded
+    };
+
+} // end namespace Catch
+
+///////////////////////////////////////////////////////////////////////////////
+#define CATCH_NOT_IMPLEMENTED throw Catch::NotImplementedException( CATCH_INTERNAL_LINEINFO )
+
+// #included from: internal/catch_context.h
+#define TWOBLUECUBES_CATCH_CONTEXT_H_INCLUDED
+
+// #included from: catch_interfaces_generators.h
+#define TWOBLUECUBES_CATCH_INTERFACES_GENERATORS_H_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    // One generator's iteration state: moveNext() advances it and reports
+    // whether more values remain; getCurrentIndex() is the current position.
+    struct IGeneratorInfo {
+        virtual ~IGeneratorInfo();
+        virtual bool moveNext() = 0;
+        virtual std::size_t getCurrentIndex() const = 0;
+    };
+
+    // All generators registered for a single test; keyed by a file-info
+    // string plus the generator's size.
+    struct IGeneratorsForTest {
+        virtual ~IGeneratorsForTest();
+
+        virtual IGeneratorInfo& getGeneratorInfo( std::string const& fileInfo, std::size_t size ) = 0;
+        // Advances the generator set as a whole; semantics defined by the implementation.
+        virtual bool moveNext() = 0;
+    };
+
+    // Factory; ownership passes to the caller (raw pointer, pre-C++11 style).
+    IGeneratorsForTest* createGeneratorsForTest();
+
+} // end namespace Catch
+
+// #included from: catch_ptr.hpp
+#define TWOBLUECUBES_CATCH_PTR_HPP_INCLUDED
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+namespace Catch {
+
+    // An intrusive reference counting smart pointer.
+    // T must implement addRef() and release() methods
+    // typically implementing the IShared interface
+    template<typename T>
+    class Ptr {
+    public:
+        Ptr() : m_p( CATCH_NULL ){}
+        // Adopting a raw pointer bumps the refcount (shared, not exclusive, ownership).
+        Ptr( T* p ) : m_p( p ){
+            if( m_p )
+                m_p->addRef();
+        }
+        Ptr( Ptr const& other ) : m_p( other.m_p ){
+            if( m_p )
+                m_p->addRef();
+        }
+        ~Ptr(){
+            if( m_p )
+                m_p->release();
+        }
+        // Drops this reference and goes null.
+        void reset() {
+            if( m_p )
+                m_p->release();
+            m_p = CATCH_NULL;
+        }
+        // Copy-and-swap assignment: the temporary's destructor releases the
+        // old pointee, which also makes self-assignment safe.
+        Ptr& operator = ( T* p ){
+            Ptr temp( p );
+            swap( temp );
+            return *this;
+        }
+        Ptr& operator = ( Ptr const& other ){
+            Ptr temp( other );
+            swap( temp );
+            return *this;
+        }
+        void swap( Ptr& other ) { std::swap( m_p, other.m_p ); }
+        T* get() const{ return m_p; }
+        T& operator*() const { return *m_p; }
+        T* operator->() const { return m_p; }
+        bool operator !() const { return m_p == CATCH_NULL; }
+        // Safe-bool idiom (pre-C++11 replacement for explicit operator bool).
+        operator SafeBool::type() const { return SafeBool::makeSafe( m_p != CATCH_NULL ); }
+
+    private:
+        T* m_p;
+    };
+
+    // Interface for intrusively ref-counted objects managed via Ptr<T>.
+    struct IShared : NonCopyable {
+        virtual ~IShared();
+        virtual void addRef() const = 0;
+        virtual void release() const = 0;
+    };
+
+    // Mixin providing the refcount for any IShared-derived interface T.
+    // Starts at 0: the first Ptr taking ownership performs the first addRef().
+    template<typename T = IShared>
+    struct SharedImpl : T {
+
+        SharedImpl() : m_rc( 0 ){}
+
+        virtual void addRef() const {
+            ++m_rc;
+        }
+        // Self-deletes when the last reference is dropped.
+        // NOTE(review): plain unsigned counter — not atomic, so not
+        // thread-safe; nothing in view synchronizes it.
+        virtual void release() const {
+            if( --m_rc == 0 )
+                delete this;
+        }
+
+        mutable unsigned int m_rc;  // mutable so const objects can be shared
+    };
+
+} // end namespace Catch
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#include <memory>
+#include <vector>
+#include <stdlib.h>
+
+namespace Catch {
+
+    class TestCase;
+    class Stream;
+    struct IResultCapture;
+    struct IRunner;
+    struct IGeneratorsForTest;
+    struct IConfig;
+
+    // Read-only view of the global per-run context: who captures results,
+    // who runs tests, generator positions, and the active configuration.
+    struct IContext
+    {
+        virtual ~IContext();
+
+        virtual IResultCapture* getResultCapture() = 0;
+        virtual IRunner* getRunner() = 0;
+        // Current index for the generator identified by fileInfo (of totalSize values).
+        virtual size_t getGeneratorIndex( std::string const& fileInfo, size_t totalSize ) = 0;
+        virtual bool advanceGeneratorsForCurrentTest() = 0;
+        virtual Ptr<IConfig const> getConfig() const = 0;
+    };
+
+    // Mutable extension used by the framework to install the collaborators.
+    struct IMutableContext : IContext
+    {
+        virtual ~IMutableContext();
+        virtual void setResultCapture( IResultCapture* resultCapture ) = 0;
+        virtual void setRunner( IRunner* runner ) = 0;
+        virtual void setConfig( Ptr<IConfig const> const& config ) = 0;
+    };
+
+    // Accessors for the process-wide singleton context (defined elsewhere).
+    IContext& getCurrentContext();
+    IMutableContext& getCurrentMutableContext();
+    void cleanUpContext();
+    Stream createStream( std::string const& streamName );
+
+}
+
+// #included from: internal/catch_test_registry.hpp
+#define TWOBLUECUBES_CATCH_TEST_REGISTRY_HPP_INCLUDED
+
+// #included from: catch_interfaces_testcase.h
+#define TWOBLUECUBES_CATCH_INTERFACES_TESTCASE_H_INCLUDED
+
+#include <vector>
+
+namespace Catch {
+
+    class TestSpec;
+
+    // A runnable test body. Ref-counted (IShared); destroyed via release(),
+    // hence the protected destructor.
+    struct ITestCase : IShared {
+        virtual void invoke () const = 0;
+    protected:
+        virtual ~ITestCase();
+    };
+
+    class TestCase;
+    struct IConfig;
+
+    // Registry of all auto-registered test cases.
+    struct ITestCaseRegistry {
+        virtual ~ITestCaseRegistry();
+        virtual std::vector<TestCase> const& getAllTests() const = 0;
+        // Same set, ordered per the run configuration (e.g. declared/lexical/random).
+        virtual std::vector<TestCase> const& getAllTestsSorted( IConfig const& config ) const = 0;
+    };
+
+    // Free-function helpers over the registry (definitions elsewhere).
+    bool matchTest( TestCase const& testCase, TestSpec const& testSpec, IConfig const& config );
+    std::vector<TestCase> filterTests( std::vector<TestCase> const& testCases, TestSpec const& testSpec, IConfig const& config );
+    std::vector<TestCase> const& getAllTestCasesSorted( IConfig const& config );
+
+}
+
+namespace Catch {
+
+// Adapts a member function into a test case: each invoke() default-constructs
+// a fresh fixture C and calls the method on it, so tests never share state.
+template<typename C>
+class MethodTestCase : public SharedImpl<ITestCase> {
+
+public:
+    MethodTestCase( void (C::*method)() ) : m_method( method ) {}
+
+    virtual void invoke() const {
+        C obj;
+        (obj.*m_method)();
+    }
+
+private:
+    // Private dtor: lifetime is managed by the SharedImpl refcount (release()).
+    virtual ~MethodTestCase() {}
+
+    void (C::*m_method)();
+};
+
+typedef void(*TestFunction)();
+
+// Name/description (tags) pair passed from the TEST_CASE macros.
+struct NameAndDesc {
+    NameAndDesc( const char* _name = "", const char* _description= "" )
+    : name( _name ), description( _description )
+    {}
+
+    const char* name;
+    const char* description;
+};
+
+// Registers a test case with the global registry; takes ownership of testCase
+// via its refcount.
+void registerTestCase
+    (   ITestCase* testCase,
+        char const* className,
+        NameAndDesc const& nameAndDesc,
+        SourceLineInfo const& lineInfo );
+
+// RAII registrar: a namespace-scope AutoReg instance registers its test at
+// static-initialization time. Instantiated by the INTERNAL_CATCH_* macros.
+struct AutoReg {
+
+    // Free-function test case.
+    AutoReg
+        (   TestFunction function,
+            SourceLineInfo const& lineInfo,
+            NameAndDesc const& nameAndDesc );
+
+    // Method-based (fixture) test case; wraps the method in a MethodTestCase.
+    template<typename C>
+    AutoReg
+        (   void (C::*method)(),
+            char const* className,
+            NameAndDesc const& nameAndDesc,
+            SourceLineInfo const& lineInfo ) {
+
+        registerTestCase
+            (   new MethodTestCase<C>( method ),
+                className,
+                nameAndDesc,
+                lineInfo );
+    }
+
+    ~AutoReg();
+
+private:
+    // Non-copyable (pre-C++11 idiom: declared, never defined).
+    AutoReg( AutoReg const& );
+    void operator= ( AutoReg const& );
+};
+
+// Non-template counterpart used by the function-pointer AutoReg constructor.
+void registerTestCaseFunction
+    (   TestFunction function,
+        SourceLineInfo const& lineInfo,
+        NameAndDesc const& nameAndDesc );
+
+} // end namespace Catch
+
+// Test-registration macros. Two parallel sets: variadic (name + optional
+// tags forwarded via __VA_ARGS__) and fixed two-argument (Name, Desc) for
+// compilers without variadic-macro support. Each defines/declares the test
+// function and drops an AutoReg into an anonymous namespace so registration
+// happens during static initialization.
+#ifdef CATCH_CONFIG_VARIADIC_MACROS
+    ///////////////////////////////////////////////////////////////////////////////
+    // Free-function test case: declares TestName, registers it, then opens its body.
+    #define INTERNAL_CATCH_TESTCASE2( TestName, ... ) \
+        static void TestName(); \
+        namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( &TestName, CATCH_INTERNAL_LINEINFO, Catch::NameAndDesc( __VA_ARGS__ ) ); }\
+        static void TestName()
+    #define INTERNAL_CATCH_TESTCASE( ... ) \
+        INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), __VA_ARGS__ )
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Registers an existing qualified member function as a test case.
+    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, ... ) \
+        namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( &QualifiedMethod, "&" #QualifiedMethod, Catch::NameAndDesc( __VA_ARGS__ ), CATCH_INTERNAL_LINEINFO ); }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Fixture test case: derives a unique struct from ClassName and registers its test().
+    #define INTERNAL_CATCH_TEST_CASE_METHOD2( TestName, ClassName, ... )\
+        namespace{ \
+            struct TestName : ClassName{ \
+                void test(); \
+            }; \
+            Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar ) ( &TestName::test, #ClassName, Catch::NameAndDesc( __VA_ARGS__ ), CATCH_INTERNAL_LINEINFO ); \
+        } \
+        void TestName::test()
+    #define INTERNAL_CATCH_TEST_CASE_METHOD( ClassName, ... ) \
+        INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), ClassName, __VA_ARGS__ )
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Imperative registration of an existing function (no body follows).
+    #define INTERNAL_CATCH_REGISTER_TESTCASE( Function, ... ) \
+        Catch::AutoReg( Function, CATCH_INTERNAL_LINEINFO, Catch::NameAndDesc( __VA_ARGS__ ) );
+
+#else
+    ///////////////////////////////////////////////////////////////////////////////
+    // Non-variadic fallbacks: identical structure with explicit (Name, Desc).
+    #define INTERNAL_CATCH_TESTCASE2( TestName, Name, Desc ) \
+        static void TestName(); \
+        namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( &TestName, CATCH_INTERNAL_LINEINFO, Catch::NameAndDesc( Name, Desc ) ); }\
+        static void TestName()
+    #define INTERNAL_CATCH_TESTCASE( Name, Desc ) \
+        INTERNAL_CATCH_TESTCASE2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), Name, Desc )
+
+    ///////////////////////////////////////////////////////////////////////////////
+    #define INTERNAL_CATCH_METHOD_AS_TEST_CASE( QualifiedMethod, Name, Desc ) \
+        namespace{ Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar )( &QualifiedMethod, "&" #QualifiedMethod, Catch::NameAndDesc( Name, Desc ), CATCH_INTERNAL_LINEINFO ); }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    #define INTERNAL_CATCH_TEST_CASE_METHOD2( TestCaseName, ClassName, TestName, Desc )\
+        namespace{ \
+            struct TestCaseName : ClassName{ \
+                void test(); \
+            }; \
+            Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME( autoRegistrar ) ( &TestCaseName::test, #ClassName, Catch::NameAndDesc( TestName, Desc ), CATCH_INTERNAL_LINEINFO ); \
+        } \
+        void TestCaseName::test()
+    #define INTERNAL_CATCH_TEST_CASE_METHOD( ClassName, TestName, Desc )\
+        INTERNAL_CATCH_TEST_CASE_METHOD2( INTERNAL_CATCH_UNIQUE_NAME( ____C_A_T_C_H____T_E_S_T____ ), ClassName, TestName, Desc )
+
+    ///////////////////////////////////////////////////////////////////////////////
+    #define INTERNAL_CATCH_REGISTER_TESTCASE( Function, Name, Desc ) \
+        Catch::AutoReg( Function, CATCH_INTERNAL_LINEINFO, Catch::NameAndDesc( Name, Desc ) );
+#endif
+
+// #included from: internal/catch_capture.hpp
+#define TWOBLUECUBES_CATCH_CAPTURE_HPP_INCLUDED
+
+// #included from: catch_result_builder.h
+#define TWOBLUECUBES_CATCH_RESULT_BUILDER_H_INCLUDED
+
+// #included from: catch_result_type.h
+#define TWOBLUECUBES_CATCH_RESULT_TYPE_H_INCLUDED
+
+namespace Catch {
+
+    // ResultWas::OfType enum
+    // Outcome classification for a single assertion. Bit layout:
+    //   0x10  (FailureBit)  — set on every failing outcome
+    //   0x100 (Exception)   — set on exception-related failures (plus FailureBit)
+    // so isOk() below reduces to a single bit test.
+    struct ResultWas { enum OfType {
+        Unknown = -1,
+        Ok = 0,
+        Info = 1,
+        Warning = 2,
+
+        FailureBit = 0x10,
+
+        ExpressionFailed = FailureBit | 1,
+        ExplicitFailure = FailureBit | 2,
+
+        Exception = 0x100 | FailureBit,
+
+        ThrewException = Exception | 1,
+        DidntThrowException = Exception | 2,
+
+        FatalErrorCondition = 0x200 | FailureBit
+
+    }; };
+
+    // True for any non-failing outcome (Ok, Info, Warning, Unknown).
+    inline bool isOk( ResultWas::OfType resultType ) {
+        return ( resultType & ResultWas::FailureBit ) == 0;
+    }
+    inline bool isJustInfo( int flags ) {
+        return flags == ResultWas::Info;
+    }
+
+    // ResultDisposition::Flags enum
+    // Per-assertion behavior flags set by the assertion macros (e.g. CHECK
+    // vs REQUIRE vs *_FALSE variants).
+    struct ResultDisposition { enum Flags {
+        Normal = 0x01,
+
+        ContinueOnFailure = 0x02,   // Failures fail test, but execution continues
+        FalseTest = 0x04,           // Prefix expression with !
+        SuppressFail = 0x08         // Failures are reported but do not fail the test
+    }; };
+
+    // Bitwise-or for the flag enum (enums don't combine implicitly in C++).
+    inline ResultDisposition::Flags operator | ( ResultDisposition::Flags lhs, ResultDisposition::Flags rhs ) {
+        return static_cast<ResultDisposition::Flags>( static_cast<int>( lhs ) | static_cast<int>( rhs ) );
+    }
+
+    // Flag queries used throughout the assertion machinery.
+    inline bool shouldContinueOnFailure( int flags )    { return ( flags & ResultDisposition::ContinueOnFailure ) != 0; }
+    inline bool isFalseTest( int flags )                { return ( flags & ResultDisposition::FalseTest ) != 0; }
+    inline bool shouldSuppressFailure( int flags )      { return ( flags & ResultDisposition::SuppressFail ) != 0; }
+
+} // end namespace Catch
+
+// #included from: catch_assertionresult.h
+#define TWOBLUECUBES_CATCH_ASSERTIONRESULT_H_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    // Static facts about an assertion site: which macro, where, the captured
+    // expression text, and how the result should be treated.
+    struct AssertionInfo
+    {
+        AssertionInfo() {}
+        AssertionInfo(  std::string const& _macroName,
+                        SourceLineInfo const& _lineInfo,
+                        std::string const& _capturedExpression,
+                        ResultDisposition::Flags _resultDisposition );
+
+        std::string macroName;              // e.g. "REQUIRE", "CHECK"
+        SourceLineInfo lineInfo;            // assertion location
+        std::string capturedExpression;     // stringized expression text
+        ResultDisposition::Flags resultDisposition;
+    };
+
+    // Dynamic outcome of evaluating an assertion.
+    struct AssertionResultData
+    {
+        AssertionResultData() : resultType( ResultWas::Unknown ) {}
+
+        std::string reconstructedExpression;    // expression with operands expanded
+        std::string message;                    // any streamed/captured message
+        ResultWas::OfType resultType;
+    };
+
+    // Combines the static info and dynamic data for one evaluated assertion;
+    // this is what reporters consume.
+    class AssertionResult {
+    public:
+        AssertionResult();
+        AssertionResult( AssertionInfo const& info, AssertionResultData const& data );
+        ~AssertionResult();
+#  ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+         AssertionResult( AssertionResult const& )              = default;
+         AssertionResult( AssertionResult && )                  = default;
+         AssertionResult& operator = ( AssertionResult const& ) = default;
+         AssertionResult& operator = ( AssertionResult && )     = default;
+#  endif
+
+        // Outcome queries (definitions elsewhere in the header).
+        bool isOk() const;
+        bool succeeded() const;
+        ResultWas::OfType getResultType() const;
+        bool hasExpression() const;
+        bool hasMessage() const;
+        // Expression accessors: raw, as-written-in-macro, and operand-expanded forms.
+        std::string getExpression() const;
+        std::string getExpressionInMacro() const;
+        bool hasExpandedExpression() const;
+        std::string getExpandedExpression() const;
+        std::string getMessage() const;
+        SourceLineInfo getSourceInfo() const;
+        std::string getTestMacroName() const;
+
+    protected:
+        AssertionInfo m_info;
+        AssertionResultData m_resultData;
+    };
+
+} // end namespace Catch
+
+// #included from: catch_matchers.hpp
+#define TWOBLUECUBES_CATCH_MATCHERS_HPP_INCLUDED
+
+namespace Catch {
+namespace Matchers {
+    namespace Impl {
+
+    namespace Generic {
+        template<typename ExpressionT> class AllOf;
+        template<typename ExpressionT> class AnyOf;
+        template<typename ExpressionT> class Not;
+    }
+
+    // Base interface for all matchers over values of type ExpressionT.
+    // Ref-counted so composite matchers (AllOf/AnyOf/Not) can hold clones
+    // by Ptr. operator&&/||/! build composites (defined after Generic below).
+    template<typename ExpressionT>
+    struct Matcher : SharedImpl<IShared>
+    {
+        typedef ExpressionT ExpressionType;
+
+        virtual ~Matcher() {}
+        virtual Ptr<Matcher> clone() const = 0;
+        virtual bool match( ExpressionT const& expr ) const = 0;
+        virtual std::string toString() const = 0;
+
+        Generic::AllOf<ExpressionT> operator && ( Matcher<ExpressionT> const& other ) const;
+        Generic::AnyOf<ExpressionT> operator || ( Matcher<ExpressionT> const& other ) const;
+        Generic::Not<ExpressionT> operator ! () const;
+    };
+
+    // CRTP helper supplying clone() via the derived type's copy constructor.
+    template<typename DerivedT, typename ExpressionT>
+    struct MatcherImpl : Matcher<ExpressionT> {
+
+        virtual Ptr<Matcher<ExpressionT> > clone() const {
+            return Ptr<Matcher<ExpressionT> >( new DerivedT( static_cast<DerivedT const&>( *this ) ) );
+        }
+    };
+
+    // Composite matchers. Each stores clones of its operands (via Ptr), so
+    // composites own their children independently of the originals.
+    namespace Generic {
+        // Inverts another matcher; toString() prefixes "not ".
+        template<typename ExpressionT>
+        class Not : public MatcherImpl<Not<ExpressionT>, ExpressionT> {
+        public:
+            explicit Not( Matcher<ExpressionT> const& matcher ) : m_matcher(matcher.clone()) {}
+            Not( Not const& other ) : m_matcher( other.m_matcher ) {}
+
+            virtual bool match( ExpressionT const& expr ) const CATCH_OVERRIDE {
+                return !m_matcher->match( expr );
+            }
+
+            virtual std::string toString() const CATCH_OVERRIDE {
+                return "not " + m_matcher->toString();
+            }
+        private:
+            Ptr< Matcher<ExpressionT> > m_matcher;
+        };
+
+        // Conjunction: matches only if every child matches (short-circuits
+        // on the first failure).
+        template<typename ExpressionT>
+        class AllOf : public MatcherImpl<AllOf<ExpressionT>, ExpressionT> {
+        public:
+
+            AllOf() {}
+            AllOf( AllOf const& other ) : m_matchers( other.m_matchers ) {}
+
+            // Appends a clone of matcher; returns *this for chaining.
+            AllOf& add( Matcher<ExpressionT> const& matcher ) {
+                m_matchers.push_back( matcher.clone() );
+                return *this;
+            }
+            virtual bool match( ExpressionT const& expr ) const
+            {
+                for( std::size_t i = 0; i < m_matchers.size(); ++i )
+                    if( !m_matchers[i]->match( expr ) )
+                        return false;
+                return true;
+            }
+            // Renders as "( a and b and ... )".
+            virtual std::string toString() const {
+                std::ostringstream oss;
+                oss << "( ";
+                for( std::size_t i = 0; i < m_matchers.size(); ++i ) {
+                    if( i != 0 )
+                        oss << " and ";
+                    oss << m_matchers[i]->toString();
+                }
+                oss << " )";
+                return oss.str();
+            }
+
+            // Chaining form: (a && b) && c extends the existing conjunction.
+            AllOf operator && ( Matcher<ExpressionT> const& other ) const {
+                AllOf allOfExpr( *this );
+                allOfExpr.add( other );
+                return allOfExpr;
+            }
+
+        private:
+            std::vector<Ptr<Matcher<ExpressionT> > > m_matchers;
+        };
+
+        // Disjunction: matches if any child matches (short-circuits on the
+        // first success). Mirrors AllOf.
+        template<typename ExpressionT>
+        class AnyOf : public MatcherImpl<AnyOf<ExpressionT>, ExpressionT> {
+        public:
+
+            AnyOf() {}
+            AnyOf( AnyOf const& other ) : m_matchers( other.m_matchers ) {}
+
+            AnyOf& add( Matcher<ExpressionT> const& matcher ) {
+                m_matchers.push_back( matcher.clone() );
+                return *this;
+            }
+            virtual bool match( ExpressionT const& expr ) const
+            {
+                for( std::size_t i = 0; i < m_matchers.size(); ++i )
+                    if( m_matchers[i]->match( expr ) )
+                        return true;
+                return false;
+            }
+            // Renders as "( a or b or ... )".
+            virtual std::string toString() const {
+                std::ostringstream oss;
+                oss << "( ";
+                for( std::size_t i = 0; i < m_matchers.size(); ++i ) {
+                    if( i != 0 )
+                        oss << " or ";
+                    oss << m_matchers[i]->toString();
+                }
+                oss << " )";
+                return oss.str();
+            }
+
+            AnyOf operator || ( Matcher<ExpressionT> const& other ) const {
+                AnyOf anyOfExpr( *this );
+                anyOfExpr.add( other );
+                return anyOfExpr;
+            }
+
+        private:
+            std::vector<Ptr<Matcher<ExpressionT> > > m_matchers;
+        };
+
+    } // namespace Generic
+
+    // Out-of-class definitions of Matcher's composition operators (declared
+    // above, defined here because they need the complete Generic types).
+    // "a && b" builds an AllOf of clones of both operands.
+    template<typename ExpressionT>
+    Generic::AllOf<ExpressionT> Matcher<ExpressionT>::operator && ( Matcher<ExpressionT> const& other ) const {
+        Generic::AllOf<ExpressionT> allOfExpr;
+        allOfExpr.add( *this );
+        allOfExpr.add( other );
+        return allOfExpr;
+    }
+
+    // "a || b" builds an AnyOf of clones of both operands.
+    template<typename ExpressionT>
+    Generic::AnyOf<ExpressionT> Matcher<ExpressionT>::operator || ( Matcher<ExpressionT> const& other ) const {
+        Generic::AnyOf<ExpressionT> anyOfExpr;
+        anyOfExpr.add( *this );
+        anyOfExpr.add( other );
+        return anyOfExpr;
+    }
+
+    // "!a" wraps a clone in a Not.
+    template<typename ExpressionT>
+    Generic::Not<ExpressionT> Matcher<ExpressionT>::operator ! () const {
+        return Generic::Not<ExpressionT>( *this );
+    }
+
+    namespace StdString {
+
+        // Normalizes C strings to std::string; a null pointer becomes "".
+        inline std::string makeString( std::string const& str ) { return str; }
+        inline std::string makeString( const char* str ) { return str ? std::string( str ) : std::string(); }
+
+        // A string paired with a case-sensitivity policy. The stored string
+        // is pre-lowercased when the policy is CaseSensitive::No, and
+        // adjustString() applies the same normalization to candidates so
+        // comparisons are consistent.
+        struct CasedString
+        {
+            CasedString( std::string const& str, CaseSensitive::Choice caseSensitivity )
+            :   m_caseSensitivity( caseSensitivity ),
+                m_str( adjustString( str ) )
+            {}
+            // Lowercases str iff matching case-insensitively; otherwise identity.
+            std::string adjustString( std::string const& str ) const {
+                return m_caseSensitivity == CaseSensitive::No
+                    ? toLower( str )
+                    : str;
+
+            }
+            // Suffix appended to matcher descriptions, e.g. "(case insensitive)".
+            std::string toStringSuffix() const
+            {
+                return m_caseSensitivity == CaseSensitive::No
+                    ? " (case insensitive)"
+                    : "";
+            }
+            CaseSensitive::Choice m_caseSensitivity;
+            std::string m_str;      // stored (possibly lowercased) reference string
+        };
+
+        // Matcher: passes when the candidate string, after case adjustment,
+        // is exactly equal to the stored string. Case handling is delegated
+        // to CasedString (both sides are normalized identically).
+        struct Equals : MatcherImpl<Equals, std::string> {
+            Equals( std::string const& str, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes )
+            :   m_data( str, caseSensitivity )
+            {}
+            Equals( Equals const& other ) : m_data( other.m_data ){}
+
+            virtual ~Equals();
+
+            virtual bool match( std::string const& expr ) const {
+                // Fixed: removed a stray duplicate semicolon after the return.
+                return m_data.m_str == m_data.adjustString( expr );
+            }
+            // Describes the matcher, e.g. equals: "foo" (case insensitive).
+            virtual std::string toString() const {
+                return "equals: \"" + m_data.m_str + "\"" + m_data.toStringSuffix();
+            }
+
+            CasedString m_data;
+        };
+
+        // Matcher: passes when the (case-adjusted) candidate contains the
+        // stored substring anywhere.
+        struct Contains : MatcherImpl<Contains, std::string> {
+            Contains( std::string const& substr, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes )
+            : m_data( substr, caseSensitivity ){}
+            Contains( Contains const& other ) : m_data( other.m_data ){}
+
+            virtual ~Contains();
+
+            virtual bool match( std::string const& expr ) const {
+                return m_data.adjustString( expr ).find( m_data.m_str ) != std::string::npos;
+            }
+            virtual std::string toString() const {
+                return "contains: \"" + m_data.m_str  + "\"" + m_data.toStringSuffix();
+            }
+
+            CasedString m_data;
+        };
+
+        // Matcher: passes when the (case-adjusted) candidate begins with the
+        // stored prefix (delegates to Catch's startsWith helper).
+        struct StartsWith : MatcherImpl<StartsWith, std::string> {
+            StartsWith( std::string const& substr, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes )
+            : m_data( substr, caseSensitivity ){}
+
+            StartsWith( StartsWith const& other ) : m_data( other.m_data ){}
+
+            virtual ~StartsWith();
+
+            virtual bool match( std::string const& expr ) const {
+                return startsWith( m_data.adjustString( expr ), m_data.m_str );
+            }
+            virtual std::string toString() const {
+                return "starts with: \"" + m_data.m_str + "\"" + m_data.toStringSuffix();
+            }
+
+            CasedString m_data;
+        };
+
+        // Matcher: passes when the (case-adjusted) candidate ends with the
+        // stored suffix (delegates to Catch's endsWith helper).
+        struct EndsWith : MatcherImpl<EndsWith, std::string> {
+            EndsWith( std::string const& substr, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes )
+            : m_data( substr, caseSensitivity ){}
+            EndsWith( EndsWith const& other ) : m_data( other.m_data ){}
+
+            virtual ~EndsWith();
+
+            virtual bool match( std::string const& expr ) const {
+                return endsWith( m_data.adjustString( expr ), m_data.m_str );
+            }
+            virtual std::string toString() const {
+                return "ends with: \"" + m_data.m_str + "\"" + m_data.toStringSuffix();
+            }
+
+            CasedString m_data;
+        };
+    } // namespace StdString
+    } // namespace Impl
+
+    // The following functions create the actual matcher objects.
+    // This allows the types to be inferred
+    // (users write e.g. Contains("x") instead of naming Impl types).
+    template<typename ExpressionT>
+    inline Impl::Generic::Not<ExpressionT> Not( Impl::Matcher<ExpressionT> const& m ) {
+        return Impl::Generic::Not<ExpressionT>( m );
+    }
+
+    // AllOf/AnyOf overloads for two and three operands.
+    template<typename ExpressionT>
+    inline Impl::Generic::AllOf<ExpressionT> AllOf( Impl::Matcher<ExpressionT> const& m1,
+                                                    Impl::Matcher<ExpressionT> const& m2 ) {
+        return Impl::Generic::AllOf<ExpressionT>().add( m1 ).add( m2 );
+    }
+    template<typename ExpressionT>
+    inline Impl::Generic::AllOf<ExpressionT> AllOf( Impl::Matcher<ExpressionT> const& m1,
+                                                    Impl::Matcher<ExpressionT> const& m2,
+                                                    Impl::Matcher<ExpressionT> const& m3 ) {
+        return Impl::Generic::AllOf<ExpressionT>().add( m1 ).add( m2 ).add( m3 );
+    }
+    template<typename ExpressionT>
+    inline Impl::Generic::AnyOf<ExpressionT> AnyOf( Impl::Matcher<ExpressionT> const& m1,
+                                                    Impl::Matcher<ExpressionT> const& m2 ) {
+        return Impl::Generic::AnyOf<ExpressionT>().add( m1 ).add( m2 );
+    }
+    template<typename ExpressionT>
+    inline Impl::Generic::AnyOf<ExpressionT> AnyOf( Impl::Matcher<ExpressionT> const& m1,
+                                                    Impl::Matcher<ExpressionT> const& m2,
+                                                    Impl::Matcher<ExpressionT> const& m3 ) {
+        return Impl::Generic::AnyOf<ExpressionT>().add( m1 ).add( m2 ).add( m3 );
+    }
+
+    // String-matcher factories; const char* overloads route through
+    // makeString so null pointers become empty strings.
+    inline Impl::StdString::Equals      Equals( std::string const& str, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes ) {
+        return Impl::StdString::Equals( str, caseSensitivity );
+    }
+    inline Impl::StdString::Equals      Equals( const char* str, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes ) {
+        return Impl::StdString::Equals( Impl::StdString::makeString( str ), caseSensitivity );
+    }
+    inline Impl::StdString::Contains    Contains( std::string const& substr, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes ) {
+        return Impl::StdString::Contains( substr, caseSensitivity );
+    }
+    inline Impl::StdString::Contains    Contains( const char* substr, CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes ) {
+        return Impl::StdString::Contains( Impl::StdString::makeString( substr ), caseSensitivity );
+    }
+    // NOTE(review): StartsWith/EndsWith take no case-sensitivity parameter
+    // here, unlike Equals/Contains — matches upstream Catch v1.
+    inline Impl::StdString::StartsWith  StartsWith( std::string const& substr ) {
+        return Impl::StdString::StartsWith( substr );
+    }
+    inline Impl::StdString::StartsWith  StartsWith( const char* substr ) {
+        return Impl::StdString::StartsWith( Impl::StdString::makeString( substr ) );
+    }
+    inline Impl::StdString::EndsWith    EndsWith( std::string const& substr ) {
+        return Impl::StdString::EndsWith( substr );
+    }
+    inline Impl::StdString::EndsWith    EndsWith( const char* substr ) {
+        return Impl::StdString::EndsWith( Impl::StdString::makeString( substr ) );
+    }
+
+} // namespace Matchers
+
+using namespace Matchers;
+
+} // namespace Catch
+
+namespace Catch {
+
+    // Thrown to unwind out of a test when a REQUIRE-style assertion fails.
+    struct TestFailureException{};
+
+    template<typename T> class ExpressionLhs;
+
+    // Deliberately incomplete type: naming it in a return type (see
+    // ResultBuilder::operator&&/||) produces a compile error steering users
+    // away from un-decomposable compound assertion expressions.
+    struct STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison;
+
+    // std::ostringstream is not copyable; this wrapper gives copy/assign
+    // semantics by transferring the buffered text.
+    struct CopyableStream {
+        CopyableStream() {}
+        CopyableStream( CopyableStream const& other ) {
+            oss << other.oss.str();
+        }
+        CopyableStream& operator=( CopyableStream const& other ) {
+            oss.str("");    // clear before copying the other buffer's contents
+            oss << other.oss.str();
+            return *this;
+        }
+        std::ostringstream oss;
+    };
+
+    // Collects everything about a single assertion as its macro expands:
+    // the decomposed expression (via operator<=), streamed messages, the
+    // outcome, and finally builds/handles the AssertionResult.
+    class ResultBuilder {
+    public:
+        ResultBuilder(  char const* macroName,
+                        SourceLineInfo const& lineInfo,
+                        char const* capturedExpression,
+                        ResultDisposition::Flags resultDisposition,
+                        char const* secondArg = "" );
+
+        // Expression decomposition entry point: the macros write
+        // "builder <= expr", capturing the left-hand operand in an
+        // ExpressionLhs ( <= binds looser than comparison operators).
+        template<typename T>
+        ExpressionLhs<T const&> operator <= ( T const& operand );
+        ExpressionLhs<bool> operator <= ( bool value );
+
+        // Streams a user message (INFO/assertion message text).
+        template<typename T>
+        ResultBuilder& operator << ( T const& value ) {
+            m_stream.oss << value;
+            return *this;
+        }
+
+        // Returning an incomplete type makes "a && b" / "a || b" inside an
+        // assertion a compile error (must be a single binary comparison).
+        template<typename RhsT> STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison& operator && ( RhsT const& );
+        template<typename RhsT> STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison& operator || ( RhsT const& );
+
+        // Fluent setters filling in the result and decomposed operand texts.
+        ResultBuilder& setResultType( ResultWas::OfType result );
+        ResultBuilder& setResultType( bool result );
+        ResultBuilder& setLhs( std::string const& lhs );
+        ResultBuilder& setRhs( std::string const& rhs );
+        ResultBuilder& setOp( std::string const& op );
+
+        void endExpression();
+
+        // Rebuilds "lhs op rhs" from the captured components.
+        std::string reconstructExpression() const;
+        AssertionResult build() const;
+
+        // Capture/handling hooks (definitions elsewhere in the header).
+        void useActiveException( ResultDisposition::Flags resultDisposition = ResultDisposition::Normal );
+        void captureResult( ResultWas::OfType resultType );
+        void captureExpression();
+        void captureExpectedException( std::string const& expectedMessage );
+        void captureExpectedException( Matchers::Impl::Matcher<std::string> const& matcher );
+        void handleResult( AssertionResult const& result );
+        void react();
+        bool shouldDebugBreak() const;
+        bool allowThrows() const;
+
+    private:
+        AssertionInfo m_assertionInfo;      // static facts about the assertion site
+        AssertionResultData m_data;         // outcome being accumulated
+        struct ExprComponents {
+            ExprComponents() : testFalse( false ) {}
+            bool testFalse;                 // true for *_FALSE assertion variants
+            std::string lhs, rhs, op;       // stringized operands and operator
+        } m_exprComponents;
+        CopyableStream m_stream;            // user-streamed message text
+
+        bool m_shouldDebugBreak;
+        bool m_shouldThrow;
+    };
+
+} // namespace Catch
+
+// Include after due to circular dependency:
+// #included from: catch_expression_lhs.hpp
+#define TWOBLUECUBES_CATCH_EXPRESSION_LHS_HPP_INCLUDED
+
+// #included from: catch_evaluate.hpp
+#define TWOBLUECUBES_CATCH_EVALUATE_HPP_INCLUDED
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4389) // '==' : signed/unsigned mismatch
+#endif
+
+#include <cstddef>
+
+namespace Catch {
+namespace Internal {
+
+    // Comparison operator conveyed as a compile-time enum so one set of
+    // machinery handles all six binary comparisons.
+    enum Operator {
+        IsEqualTo,
+        IsNotEqualTo,
+        IsLessThan,
+        IsGreaterThan,
+        IsLessThanOrEqualTo,
+        IsGreaterThanOrEqualTo
+    };
+
+    // Maps each Operator to its printable token for expression reconstruction.
+    template<Operator Op> struct OperatorTraits             { static const char* getName(){ return "*error*"; } };
+    template<> struct OperatorTraits<IsEqualTo>             { static const char* getName(){ return "=="; } };
+    template<> struct OperatorTraits<IsNotEqualTo>          { static const char* getName(){ return "!="; } };
+    template<> struct OperatorTraits<IsLessThan>            { static const char* getName(){ return "<"; } };
+    template<> struct OperatorTraits<IsGreaterThan>         { static const char* getName(){ return ">"; } };
+    template<> struct OperatorTraits<IsLessThanOrEqualTo>   { static const char* getName(){ return "<="; } };
+    template<> struct OperatorTraits<IsGreaterThanOrEqualTo>{ static const char* getName(){ return ">="; } };
+
+    // Strips const so comparisons resolve against non-const operator
+    // overloads too (upstream Catch workaround).
+    template<typename T>
+    inline T& opCast(T const& t) { return const_cast<T&>(t); }
+
+// nullptr_t support based on pull request #154 from Konstantin Baumann
+#ifdef CATCH_CONFIG_CPP11_NULLPTR
+    inline std::nullptr_t opCast(std::nullptr_t) { return nullptr; }
+#endif // CATCH_CONFIG_CPP11_NULLPTR
+
+    // So the compare overloads can be operator agnostic we convey the operator as a template
+    // enum, which is used to specialise an Evaluator for doing the comparison.
+    template<typename T1, typename T2, Operator Op>
+    class Evaluator{};
+
+    // One specialization per operator; each applies the raw comparison and
+    // coerces the result to bool (handles types with expression-template
+    // comparison results).
+    template<typename T1, typename T2>
+    struct Evaluator<T1, T2, IsEqualTo> {
+        static bool evaluate( T1 const& lhs, T2 const& rhs) {
+            return bool( opCast( lhs ) ==  opCast( rhs ) );
+        }
+    };
+    template<typename T1, typename T2>
+    struct Evaluator<T1, T2, IsNotEqualTo> {
+        static bool evaluate( T1 const& lhs, T2 const& rhs ) {
+            return bool( opCast( lhs ) != opCast( rhs ) );
+        }
+    };
+    template<typename T1, typename T2>
+    struct Evaluator<T1, T2, IsLessThan> {
+        static bool evaluate( T1 const& lhs, T2 const& rhs ) {
+            return bool( opCast( lhs ) < opCast( rhs ) );
+        }
+    };
+    template<typename T1, typename T2>
+    struct Evaluator<T1, T2, IsGreaterThan> {
+        static bool evaluate( T1 const& lhs, T2 const& rhs ) {
+            return bool( opCast( lhs ) > opCast( rhs ) );
+        }
+    };
+    template<typename T1, typename T2>
+    struct Evaluator<T1, T2, IsGreaterThanOrEqualTo> {
+        static bool evaluate( T1 const& lhs, T2 const& rhs ) {
+            return bool( opCast( lhs ) >= opCast( rhs ) );
+        }
+    };
+    template<typename T1, typename T2>
+    struct Evaluator<T1, T2, IsLessThanOrEqualTo> {
+        static bool evaluate( T1 const& lhs, T2 const& rhs ) {
+            return bool( opCast( lhs ) <= opCast( rhs ) );
+        }
+    };
+
+    template<Operator Op, typename T1, typename T2>
+    bool applyEvaluator( T1 const& lhs, T2 const& rhs ) {
+        return Evaluator<T1, T2, Op>::evaluate( lhs, rhs );
+    }
+
+    // This level of indirection allows us to specialise for integer types
+    // to avoid signed/ unsigned warnings
+
+    // "base" overload
+    template<Operator Op, typename T1, typename T2>
+    bool compare( T1 const& lhs, T2 const& rhs ) {
+        return Evaluator<T1, T2, Op>::evaluate( lhs, rhs );
+    }
+
+    // unsigned X to int
+    template<Operator Op> bool compare( unsigned int lhs, int rhs ) {
+        return applyEvaluator<Op>( lhs, static_cast<unsigned int>( rhs ) );
+    }
+    template<Operator Op> bool compare( unsigned long lhs, int rhs ) {
+        return applyEvaluator<Op>( lhs, static_cast<unsigned int>( rhs ) );
+    }
+    template<Operator Op> bool compare( unsigned char lhs, int rhs ) {
+        return applyEvaluator<Op>( lhs, static_cast<unsigned int>( rhs ) );
+    }
+
+    // unsigned X to long
+    template<Operator Op> bool compare( unsigned int lhs, long rhs ) {
+        return applyEvaluator<Op>( lhs, static_cast<unsigned long>( rhs ) );
+    }
+    template<Operator Op> bool compare( unsigned long lhs, long rhs ) {
+        return applyEvaluator<Op>( lhs, static_cast<unsigned long>( rhs ) );
+    }
+    template<Operator Op> bool compare( unsigned char lhs, long rhs ) {
+        return applyEvaluator<Op>( lhs, static_cast<unsigned long>( rhs ) );
+    }
+
+    // int to unsigned X
+    template<Operator Op> bool compare( int lhs, unsigned int rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned int>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( int lhs, unsigned long rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned int>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( int lhs, unsigned char rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned int>( lhs ), rhs );
+    }
+
+    // long to unsigned X
+    template<Operator Op> bool compare( long lhs, unsigned int rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned long>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( long lhs, unsigned long rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned long>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( long lhs, unsigned char rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned long>( lhs ), rhs );
+    }
+
+    // pointer to long (when comparing against NULL)
+    template<Operator Op, typename T> bool compare( long lhs, T* rhs ) {
+        return Evaluator<T*, T*, Op>::evaluate( reinterpret_cast<T*>( lhs ), rhs );
+    }
+    template<Operator Op, typename T> bool compare( T* lhs, long rhs ) {
+        return Evaluator<T*, T*, Op>::evaluate( lhs, reinterpret_cast<T*>( rhs ) );
+    }
+
+    // pointer to int (when comparing against NULL)
+    template<Operator Op, typename T> bool compare( int lhs, T* rhs ) {
+        return Evaluator<T*, T*, Op>::evaluate( reinterpret_cast<T*>( lhs ), rhs );
+    }
+    template<Operator Op, typename T> bool compare( T* lhs, int rhs ) {
+        return Evaluator<T*, T*, Op>::evaluate( lhs, reinterpret_cast<T*>( rhs ) );
+    }
+
+#ifdef CATCH_CONFIG_CPP11_LONG_LONG
+    // long long to unsigned X
+    template<Operator Op> bool compare( long long lhs, unsigned int rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned long>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( long long lhs, unsigned long rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned long>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( long long lhs, unsigned long long rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned long>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( long long lhs, unsigned char rhs ) {
+        return applyEvaluator<Op>( static_cast<unsigned long>( lhs ), rhs );
+    }
+
+    // unsigned long long to X
+    template<Operator Op> bool compare( unsigned long long lhs, int rhs ) {
+        return applyEvaluator<Op>( static_cast<long>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( unsigned long long lhs, long rhs ) {
+        return applyEvaluator<Op>( static_cast<long>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( unsigned long long lhs, long long rhs ) {
+        return applyEvaluator<Op>( static_cast<long>( lhs ), rhs );
+    }
+    template<Operator Op> bool compare( unsigned long long lhs, char rhs ) {
+        return applyEvaluator<Op>( static_cast<long>( lhs ), rhs );
+    }
+
+    // pointer to long long (when comparing against NULL)
+    template<Operator Op, typename T> bool compare( long long lhs, T* rhs ) {
+        return Evaluator<T*, T*, Op>::evaluate( reinterpret_cast<T*>( lhs ), rhs );
+    }
+    template<Operator Op, typename T> bool compare( T* lhs, long long rhs ) {
+        return Evaluator<T*, T*, Op>::evaluate( lhs, reinterpret_cast<T*>( rhs ) );
+    }
+#endif // CATCH_CONFIG_CPP11_LONG_LONG
+
+#ifdef CATCH_CONFIG_CPP11_NULLPTR
+    // pointer to nullptr_t (when comparing against nullptr)
+    template<Operator Op, typename T> bool compare( std::nullptr_t, T* rhs ) {
+        return Evaluator<T*, T*, Op>::evaluate( nullptr, rhs );
+    }
+    template<Operator Op, typename T> bool compare( T* lhs, std::nullptr_t ) {
+        return Evaluator<T*, T*, Op>::evaluate( lhs, nullptr );
+    }
+#endif // CATCH_CONFIG_CPP11_NULLPTR
+
+} // end of namespace Internal
+} // end of namespace Catch
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// #included from: catch_tostring.h
+#define TWOBLUECUBES_CATCH_TOSTRING_H_INCLUDED
+
+#include <sstream>
+#include <iomanip>
+#include <limits>
+#include <vector>
+#include <cstddef>
+
+#ifdef __OBJC__
+// #included from: catch_objc_arc.hpp
+#define TWOBLUECUBES_CATCH_OBJC_ARC_HPP_INCLUDED
+
+#import <Foundation/Foundation.h>
+
+// CATCH_ARC_ENABLED is 1 when compiling under Objective-C ARC, else 0.
+#ifdef __has_feature
+#define CATCH_ARC_ENABLED __has_feature(objc_arc)
+#else
+#define CATCH_ARC_ENABLED 0
+#endif
+
+// Helpers that behave correctly whether or not ARC is enabled:
+// arcSafeRelease releases obj only under manual reference counting;
+// performOptionalSelector invokes sel on obj if it responds, else returns nil.
+void arcSafeRelease( NSObject* obj );
+id performOptionalSelector( id obj, SEL sel );
+
+#if !CATCH_ARC_ENABLED
+// Manual reference counting: an explicit release is required.
+inline void arcSafeRelease( NSObject* obj ) {
+    [obj release];
+}
+inline id performOptionalSelector( id obj, SEL sel ) {
+    if( [obj respondsToSelector: sel] )
+        return [obj performSelector: sel];
+    return nil;
+}
+#define CATCH_UNSAFE_UNRETAINED
+#define CATCH_ARC_STRONG
+#else
+// ARC manages object lifetimes, so releasing is a no-op.
+inline void arcSafeRelease( NSObject* ){}
+inline id performOptionalSelector( id obj, SEL sel ) {
+// Suppress clang's warning about performSelector with an unknown selector
+// potentially leaking under ARC.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Warc-performSelector-leaks"
+#endif
+    if( [obj respondsToSelector: sel] )
+        return [obj performSelector: sel];
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+    return nil;
+}
+#define CATCH_UNSAFE_UNRETAINED __unsafe_unretained
+#define CATCH_ARC_STRONG __strong
+#endif
+
+#endif
+
+#ifdef CATCH_CONFIG_CPP11_TUPLE
+#include <tuple>
+#endif
+
+#ifdef CATCH_CONFIG_CPP11_IS_ENUM
+#include <type_traits>
+#endif
+
+namespace Catch {
+
+// Why we're here.
+// Converts any value to a string for assertion/report output; falls back to
+// "{?}" (Detail::unprintableString) when no conversion is available.
+template<typename T>
+std::string toString( T const& value );
+
+// Built in overloads
+
+std::string toString( std::string const& value );
+std::string toString( std::wstring const& value );
+std::string toString( const char* const value );
+std::string toString( char* const value );
+std::string toString( const wchar_t* const value );
+std::string toString( wchar_t* const value );
+std::string toString( int value );
+std::string toString( unsigned long value );
+std::string toString( unsigned int value );
+std::string toString( const double value );
+std::string toString( const float value );
+std::string toString( bool value );
+std::string toString( char value );
+std::string toString( signed char value );
+std::string toString( unsigned char value );
+
+#ifdef CATCH_CONFIG_CPP11_LONG_LONG
+std::string toString( long long value );
+std::string toString( unsigned long long value );
+#endif
+
+#ifdef CATCH_CONFIG_CPP11_NULLPTR
+std::string toString( std::nullptr_t );
+#endif
+
+#ifdef __OBJC__
+    std::string toString( NSString const * const& nsstring );
+    std::string toString( NSString * CATCH_ARC_STRONG const& nsstring );
+    std::string toString( NSObject* const& nsObject );
+#endif
+
+namespace Detail {
+
+    // Placeholder text emitted when a value has no usable string conversion.
+    extern const std::string unprintableString;
+
+    // Implicitly convertible from anything: used as the worst-possible
+    // overload match in the stream-insertability detection below.
+    struct BorgType {
+        template<typename T> BorgType( T const& );
+    };
+
+    // Distinguishable-by-size result types for the sizeof-based detection.
+    struct TrueType { char sizer[1]; };
+    struct FalseType { char sizer[2]; };
+
+    // Never defined; used only inside sizeof(). If (s << t) resolves to a real
+    // ostream operator<< it yields std::ostream&, selecting the TrueType
+    // overload; otherwise the BorgType fallback below yields FalseType.
+    TrueType& testStreamable( std::ostream& );
+    FalseType testStreamable( FalseType );
+
+    FalseType operator<<( std::ostream const&, BorgType const& );
+
+    // Compile-time trait: value is true iff `ostream << T` is well-formed.
+    template<typename T>
+    struct IsStreamInsertable {
+        static std::ostream &s;
+        static T  const&t;
+        enum { value = sizeof( testStreamable(s << t) ) == sizeof( TrueType ) };
+    };
+
+#if defined(CATCH_CONFIG_CPP11_IS_ENUM)
+    // For enums without their own operator<<: convert via the underlying
+    // integral type; non-enums fall back to the unprintable placeholder.
+    template<typename T,
+             bool IsEnum = std::is_enum<T>::value
+             >
+    struct EnumStringMaker
+    {
+        static std::string convert( T const& ) { return unprintableString; }
+    };
+
+    template<typename T>
+    struct EnumStringMaker<T,true>
+    {
+        static std::string convert( T const& v )
+        {
+            return ::Catch::toString(
+                static_cast<typename std::underlying_type<T>::type>(v)
+                );
+        }
+    };
+#endif
+    // Fallback when T is not stream-insertable (C == false).
+    template<bool C>
+    struct StringMakerBase {
+#if defined(CATCH_CONFIG_CPP11_IS_ENUM)
+        template<typename T>
+        static std::string convert( T const& v )
+        {
+            return EnumStringMaker<T>::convert( v );
+        }
+#else
+        template<typename T>
+        static std::string convert( T const& ) { return unprintableString; }
+#endif
+    };
+
+    // When T is stream-insertable, format it through an ostringstream.
+    template<>
+    struct StringMakerBase<true> {
+        template<typename T>
+        static std::string convert( T const& _value ) {
+            std::ostringstream oss;
+            oss << _value;
+            return oss.str();
+        }
+    };
+
+    // Renders `size` bytes starting at `object` (e.g. as a hex dump);
+    // defined elsewhere in the library.
+    std::string rawMemoryToString( const void *object, std::size_t size );
+
+    template<typename T>
+    inline std::string rawMemoryToString( const T& object ) {
+      return rawMemoryToString( &object, sizeof(object) );
+    }
+
+} // end namespace Detail
+
+// Customisation point: specialise StringMaker<T> (or overload toString)
+// to control how a user type is printed.
+template<typename T>
+struct StringMaker :
+    Detail::StringMakerBase<Detail::IsStreamInsertable<T>::value> {};
+
+// Pointers print as "NULL" or as raw memory of the pointer value itself.
+template<typename T>
+struct StringMaker<T*> {
+    template<typename U>
+    static std::string convert( U* p ) {
+        if( !p )
+            return "NULL";
+        else
+            return Detail::rawMemoryToString( p );
+    }
+};
+
+// Pointers-to-member likewise print as "NULL" or raw memory.
+template<typename R, typename C>
+struct StringMaker<R C::*> {
+    static std::string convert( R C::* p ) {
+        if( !p )
+            return "NULL";
+        else
+            return Detail::rawMemoryToString( p );
+    }
+};
+
+namespace Detail {
+    template<typename InputIterator>
+    std::string rangeToString( InputIterator first, InputIterator last );
+}
+
+//template<typename T, typename Allocator>
+//struct StringMaker<std::vector<T, Allocator> > {
+//    static std::string convert( std::vector<T,Allocator> const& v ) {
+//        return Detail::rangeToString( v.begin(), v.end() );
+//    }
+//};
+
+// Vectors print element-wise as "{ a, b, c }".
+template<typename T, typename Allocator>
+std::string toString( std::vector<T,Allocator> const& v ) {
+    return Detail::rangeToString( v.begin(), v.end() );
+}
+
+#ifdef CATCH_CONFIG_CPP11_TUPLE
+
+// toString for tuples
+namespace TupleDetail {
+  // Recursively prints tuple element N then N+1...; the partial
+  // specialisation below terminates the recursion past the last element.
+  template<
+      typename Tuple,
+      std::size_t N = 0,
+      bool = (N < std::tuple_size<Tuple>::value)
+      >
+  struct ElementPrinter {
+      static void print( const Tuple& tuple, std::ostream& os )
+      {
+          os << ( N ? ", " : " " )
+             << Catch::toString(std::get<N>(tuple));
+          ElementPrinter<Tuple,N+1>::print(tuple,os);
+      }
+  };
+
+  template<
+      typename Tuple,
+      std::size_t N
+      >
+  struct ElementPrinter<Tuple,N,false> {
+      static void print( const Tuple&, std::ostream& ) {}
+  };
+
+}
+
+// Tuples print as "{ a, b, c }" using the element printer above.
+template<typename ...Types>
+struct StringMaker<std::tuple<Types...>> {
+
+    static std::string convert( const std::tuple<Types...>& tuple )
+    {
+        std::ostringstream os;
+        os << '{';
+        TupleDetail::ElementPrinter<std::tuple<Types...>>::print( tuple, os );
+        os << " }";
+        return os.str();
+    }
+};
+#endif // CATCH_CONFIG_CPP11_TUPLE
+
+namespace Detail {
+    template<typename T>
+    std::string makeString( T const& value ) {
+        return StringMaker<T>::convert( value );
+    }
+} // end namespace Detail
+
+/// \brief converts any type to a string
+///
+/// The default template forwards on to ostringstream - except when an
+/// ostringstream overload does not exist - in which case it attempts to detect
+/// that and writes {?}.
+/// Overload (not specialise) this template for custom types that you don't want
+/// to provide an ostream overload for.
+template<typename T>
+std::string toString( T const& value ) {
+    return StringMaker<T>::convert( value );
+}
+
+    namespace Detail {
+    // Prints [first, last) as "{ a, b, c }", or "{ }" for an empty range.
+    template<typename InputIterator>
+    std::string rangeToString( InputIterator first, InputIterator last ) {
+        std::ostringstream oss;
+        oss << "{ ";
+        if( first != last ) {
+            oss << Catch::toString( *first );
+            for( ++first ; first != last ; ++first )
+                oss << ", " << Catch::toString( *first );
+        }
+        oss << " }";
+        return oss.str();
+    }
+}
+
+} // end namespace Catch
+
+namespace Catch {
+
+// Wraps the LHS of an expression and captures the operator and RHS (if any) -
+// wrapping them all in a ResultBuilder object
+template<typename T>
+class ExpressionLhs {
+    // Non-copy-assignable: instances are transient decomposition helpers.
+    ExpressionLhs& operator = ( ExpressionLhs const& );
+#  ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+    ExpressionLhs& operator = ( ExpressionLhs && ) = delete;
+#  endif
+
+public:
+    // Holds a reference to the owning ResultBuilder and the LHS value
+    // (T may itself be a reference type - see ResultBuilder::operator<=).
+    ExpressionLhs( ResultBuilder& rb, T lhs ) : m_rb( rb ), m_lhs( lhs ) {}
+#  ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+    ExpressionLhs( ExpressionLhs const& ) = default;
+    ExpressionLhs( ExpressionLhs && )     = default;
+#  endif
+
+    // Each comparison operator records the RHS, the operator name and the
+    // comparison result into the ResultBuilder via captureExpression.
+    template<typename RhsT>
+    ResultBuilder& operator == ( RhsT const& rhs ) {
+        return captureExpression<Internal::IsEqualTo>( rhs );
+    }
+
+    template<typename RhsT>
+    ResultBuilder& operator != ( RhsT const& rhs ) {
+        return captureExpression<Internal::IsNotEqualTo>( rhs );
+    }
+
+    template<typename RhsT>
+    ResultBuilder& operator < ( RhsT const& rhs ) {
+        return captureExpression<Internal::IsLessThan>( rhs );
+    }
+
+    template<typename RhsT>
+    ResultBuilder& operator > ( RhsT const& rhs ) {
+        return captureExpression<Internal::IsGreaterThan>( rhs );
+    }
+
+    template<typename RhsT>
+    ResultBuilder& operator <= ( RhsT const& rhs ) {
+        return captureExpression<Internal::IsLessThanOrEqualTo>( rhs );
+    }
+
+    template<typename RhsT>
+    ResultBuilder& operator >= ( RhsT const& rhs ) {
+        return captureExpression<Internal::IsGreaterThanOrEqualTo>( rhs );
+    }
+
+    // Non-template bool overloads so comparisons against literal true/false
+    // resolve without extra conversions.
+    ResultBuilder& operator == ( bool rhs ) {
+        return captureExpression<Internal::IsEqualTo>( rhs );
+    }
+
+    ResultBuilder& operator != ( bool rhs ) {
+        return captureExpression<Internal::IsNotEqualTo>( rhs );
+    }
+
+    // Called when the expression had no binary operator: the LHS alone is
+    // evaluated in a boolean context and recorded.
+    void endExpression() {
+        bool value = m_lhs ? true : false;
+        m_rb
+            .setLhs( Catch::toString( value ) )
+            .setResultType( value )
+            .endExpression();
+    }
+
+    // Only simple binary expressions are allowed on the LHS.
+    // If more complex compositions are required then place the sub expression in parentheses
+    // (these operators are declared but never defined, so using them produces
+    // a link/compile error whose type name spells out the message).
+    template<typename RhsT> STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison& operator + ( RhsT const& );
+    template<typename RhsT> STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison& operator - ( RhsT const& );
+    template<typename RhsT> STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison& operator / ( RhsT const& );
+    template<typename RhsT> STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison& operator * ( RhsT const& );
+    template<typename RhsT> STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison& operator && ( RhsT const& );
+    template<typename RhsT> STATIC_ASSERT_Expression_Too_Complex_Please_Rewrite_As_Binary_Comparison& operator || ( RhsT const& );
+
+private:
+    // Evaluates lhs <Op> rhs via Internal::compare (which handles mixed
+    // signedness and NULL comparisons) and records result, operand strings
+    // and operator name in the ResultBuilder.
+    template<Internal::Operator Op, typename RhsT>
+    ResultBuilder& captureExpression( RhsT const& rhs ) {
+        return m_rb
+            .setResultType( Internal::compare<Op>( m_lhs, rhs ) )
+            .setLhs( Catch::toString( m_lhs ) )
+            .setRhs( Catch::toString( rhs ) )
+            .setOp( Internal::OperatorTraits<Op>::getName() );
+    }
+
+private:
+    ResultBuilder& m_rb;
+    T m_lhs;
+};
+
+} // end namespace Catch
+
+
+namespace Catch {
+
+    // Entry point of expression decomposition: `__catchResult <= expr` binds
+    // the first operand of expr (operator<= has very low precedence) into an
+    // ExpressionLhs, whose comparison operators then capture the rest.
+    template<typename T>
+    inline ExpressionLhs<T const&> ResultBuilder::operator <= ( T const& operand ) {
+        return ExpressionLhs<T const&>( *this, operand );
+    }
+
+    // bool is taken by value to avoid binding a reference to a temporary
+    // produced by implicit conversion.
+    inline ExpressionLhs<bool> ResultBuilder::operator <= ( bool value ) {
+        return ExpressionLhs<bool>( *this, value );
+    }
+
+} // namespace Catch
+
+// #included from: catch_message.h
+#define TWOBLUECUBES_CATCH_MESSAGE_H_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    // A message (e.g. from INFO/WARN) attached to assertion results.
+    // Ordered and compared by a monotonically increasing sequence number.
+    struct MessageInfo {
+        MessageInfo(    std::string const& _macroName,
+                        SourceLineInfo const& _lineInfo,
+                        ResultWas::OfType _type );
+
+        std::string macroName;
+        SourceLineInfo lineInfo;
+        ResultWas::OfType type;
+        std::string message;
+        unsigned int sequence;
+
+        bool operator == ( MessageInfo const& other ) const {
+            return sequence == other.sequence;
+        }
+        bool operator < ( MessageInfo const& other ) const {
+            return sequence < other.sequence;
+        }
+    private:
+        // Source of the per-message sequence numbers (defined elsewhere).
+        static unsigned int globalCount;
+    };
+
+    // Accumulates streamed values into a MessageInfo; used by the message
+    // macros as `MessageBuilder(...) << a << b`.
+    struct MessageBuilder {
+        MessageBuilder( std::string const& macroName,
+                        SourceLineInfo const& lineInfo,
+                        ResultWas::OfType type )
+        : m_info( macroName, lineInfo, type )
+        {}
+
+        template<typename T>
+        MessageBuilder& operator << ( T const& value ) {
+            m_stream << value;
+            return *this;
+        }
+
+        MessageInfo m_info;
+        std::ostringstream m_stream;
+    };
+
+    // RAII wrapper: registers the message on construction and removes it on
+    // destruction (see ctor/dtor definitions elsewhere in the library).
+    class ScopedMessage {
+    public:
+        ScopedMessage( MessageBuilder const& builder );
+        ScopedMessage( ScopedMessage const& other );
+        ~ScopedMessage();
+
+        MessageInfo m_info;
+    };
+
+} // end namespace Catch
+
+// #included from: catch_interfaces_capture.h
+#define TWOBLUECUBES_CATCH_INTERFACES_CAPTURE_H_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    class TestCase;
+    class AssertionResult;
+    struct AssertionInfo;
+    struct SectionInfo;
+    struct SectionEndInfo;
+    struct MessageInfo;
+    class ScopedMessageBuilder;
+    struct Counts;
+
+    // Abstract sink for everything that happens while a test runs:
+    // assertion results, section boundaries and scoped messages.
+    struct IResultCapture {
+
+        virtual ~IResultCapture();
+
+        virtual void assertionEnded( AssertionResult const& result ) = 0;
+        // Returns whether the section should be entered; fills in the
+        // assertion counts at the point the section starts.
+        virtual bool sectionStarted(    SectionInfo const& sectionInfo,
+                                        Counts& assertions ) = 0;
+        virtual void sectionEnded( SectionEndInfo const& endInfo ) = 0;
+        virtual void sectionEndedEarly( SectionEndInfo const& endInfo ) = 0;
+        virtual void pushScopedMessage( MessageInfo const& message ) = 0;
+        virtual void popScopedMessage( MessageInfo const& message ) = 0;
+
+        virtual std::string getCurrentTestName() const = 0;
+        virtual const AssertionResult* getLastResult() const = 0;
+
+        virtual void handleFatalErrorCondition( std::string const& message ) = 0;
+    };
+
+    // Accessor for the currently active capture instance (defined elsewhere).
+    IResultCapture& getResultCapture();
+}
+
+// #included from: catch_debugger.h
+#define TWOBLUECUBES_CATCH_DEBUGGER_H_INCLUDED
+
+// #included from: catch_platform.h
+#define TWOBLUECUBES_CATCH_PLATFORM_H_INCLUDED
+
+// Platform detection: exactly one CATCH_PLATFORM_* macro is defined
+// (or none, on other platforms).
+#if defined(__MAC_OS_X_VERSION_MIN_REQUIRED)
+#define CATCH_PLATFORM_MAC
+#elif  defined(__IPHONE_OS_VERSION_MIN_REQUIRED)
+#define CATCH_PLATFORM_IPHONE
+#elif defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER)
+#define CATCH_PLATFORM_WINDOWS
+#endif
+
+#include <string>
+
+namespace Catch{
+
+    // Platform-specific; definitions live elsewhere in the library.
+    bool isDebuggerActive();
+    void writeToDebugConsole( std::string const& text );
+}
+
+// CATCH_BREAK_INTO_DEBUGGER(): traps into an attached debugger at the point
+// of a failed assertion, so the break lands in user code rather than inside
+// Catch. Falls back to a no-op expression when no mechanism is available.
+#ifdef CATCH_PLATFORM_MAC
+
+    // The following code snippet based on:
+    // http://cocoawithlove.com/2008/03/break-into-debugger.html
+    #ifdef DEBUG
+        #if defined(__ppc64__) || defined(__ppc__)
+            #define CATCH_BREAK_INTO_DEBUGGER() \
+                if( Catch::isDebuggerActive() ) { \
+                    __asm__("li r0, 20\nsc\nnop\nli r0, 37\nli r4, 2\nsc\nnop\n" \
+                    : : : "memory","r0","r3","r4" ); \
+                }
+        #else
+            #define CATCH_BREAK_INTO_DEBUGGER() if( Catch::isDebuggerActive() ) {__asm__("int $3\n" : : );}
+        #endif
+    #endif
+
+#elif defined(_MSC_VER)
+    #define CATCH_BREAK_INTO_DEBUGGER() if( Catch::isDebuggerActive() ) { __debugbreak(); }
+#elif defined(__MINGW32__)
+    extern "C" __declspec(dllimport) void __stdcall DebugBreak();
+    #define CATCH_BREAK_INTO_DEBUGGER() if( Catch::isDebuggerActive() ) { DebugBreak(); }
+#endif
+
+#ifndef CATCH_BREAK_INTO_DEBUGGER
+#define CATCH_BREAK_INTO_DEBUGGER() Catch::alwaysTrue();
+#endif
+
+// #included from: catch_interfaces_runner.h
+#define TWOBLUECUBES_CATCH_INTERFACES_RUNNER_H_INCLUDED
+
+namespace Catch {
+    class TestCase;
+
+    // Minimal interface to the active test runner; aborting() reports
+    // whether the run is being cut short.
+    struct IRunner {
+        virtual ~IRunner();
+        virtual bool aborting() const = 0;
+    };
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// In the event of a failure works out if the debugger needs to be invoked
+// and/or an exception thrown and takes appropriate action.
+// This needs to be done as a macro so the debugger will stop in the user
+// source code rather than in Catch library code
+#define INTERNAL_CATCH_REACT( resultBuilder ) \
+    if( resultBuilder.shouldDebugBreak() ) CATCH_BREAK_INTO_DEBUGGER(); \
+    resultBuilder.react();
+
+///////////////////////////////////////////////////////////////////////////////
+// Core of REQUIRE/CHECK: decomposes expr via `__catchResult <= expr`,
+// records the outcome, and converts any exception thrown while evaluating
+// expr into a failure.
+#define INTERNAL_CATCH_TEST( expr, resultDisposition, macroName ) \
+    do { \
+        Catch::ResultBuilder __catchResult( macroName, CATCH_INTERNAL_LINEINFO, #expr, resultDisposition ); \
+        try { \
+            CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \
+            ( __catchResult <= expr ).endExpression(); \
+        } \
+        catch( ... ) { \
+            __catchResult.useActiveException( Catch::ResultDisposition::Normal ); \
+        } \
+        INTERNAL_CATCH_REACT( __catchResult ) \
+    } while( Catch::isTrue( false && static_cast<bool>(expr) ) ) // expr here is never evaluated at runtime but it forces the compiler to give it a look
+
+///////////////////////////////////////////////////////////////////////////////
+// CHECKED_IF: runs the following block only if the assertion passed.
+#define INTERNAL_CATCH_IF( expr, resultDisposition, macroName ) \
+    INTERNAL_CATCH_TEST( expr, resultDisposition, macroName ); \
+    if( Catch::getResultCapture().getLastResult()->succeeded() )
+
+///////////////////////////////////////////////////////////////////////////////
+// CHECKED_ELSE: runs the following block only if the assertion failed.
+#define INTERNAL_CATCH_ELSE( expr, resultDisposition, macroName ) \
+    INTERNAL_CATCH_TEST( expr, resultDisposition, macroName ); \
+    if( !Catch::getResultCapture().getLastResult()->succeeded() )
+
+///////////////////////////////////////////////////////////////////////////////
+// REQUIRE_NOTHROW/CHECK_NOTHROW: passes iff evaluating expr does not throw.
+#define INTERNAL_CATCH_NO_THROW( expr, resultDisposition, macroName ) \
+    do { \
+        Catch::ResultBuilder __catchResult( macroName, CATCH_INTERNAL_LINEINFO, #expr, resultDisposition ); \
+        try { \
+            expr; \
+            __catchResult.captureResult( Catch::ResultWas::Ok ); \
+        } \
+        catch( ... ) { \
+            __catchResult.useActiveException( resultDisposition ); \
+        } \
+        INTERNAL_CATCH_REACT( __catchResult ) \
+    } while( Catch::alwaysFalse() )
+
+///////////////////////////////////////////////////////////////////////////////
+// REQUIRE_THROWS[_WITH]: passes iff expr throws and the exception satisfies
+// matcher. If throwing is disabled (allowThrows() false) the assertion is
+// recorded as Ok without evaluating expr.
+#define INTERNAL_CATCH_THROWS( expr, resultDisposition, matcher, macroName ) \
+    do { \
+        Catch::ResultBuilder __catchResult( macroName, CATCH_INTERNAL_LINEINFO, #expr, resultDisposition, #matcher ); \
+        if( __catchResult.allowThrows() ) \
+            try { \
+                expr; \
+                __catchResult.captureResult( Catch::ResultWas::DidntThrowException ); \
+            } \
+            catch( ... ) { \
+                __catchResult.captureExpectedException( matcher ); \
+            } \
+        else \
+            __catchResult.captureResult( Catch::ResultWas::Ok ); \
+        INTERNAL_CATCH_REACT( __catchResult ) \
+    } while( Catch::alwaysFalse() )
+
+///////////////////////////////////////////////////////////////////////////////
+// REQUIRE_THROWS_AS: passes iff expr throws exactly an exceptionType;
+// any other exception type is reported via the active-exception path.
+#define INTERNAL_CATCH_THROWS_AS( expr, exceptionType, resultDisposition, macroName ) \
+    do { \
+        Catch::ResultBuilder __catchResult( macroName, CATCH_INTERNAL_LINEINFO, #expr, resultDisposition ); \
+        if( __catchResult.allowThrows() ) \
+            try { \
+                expr; \
+                __catchResult.captureResult( Catch::ResultWas::DidntThrowException ); \
+            } \
+            catch( exceptionType ) { \
+                __catchResult.captureResult( Catch::ResultWas::Ok ); \
+            } \
+            catch( ... ) { \
+                __catchResult.useActiveException( resultDisposition ); \
+            } \
+        else \
+            __catchResult.captureResult( Catch::ResultWas::Ok ); \
+        INTERNAL_CATCH_REACT( __catchResult ) \
+    } while( Catch::alwaysFalse() )
+
+///////////////////////////////////////////////////////////////////////////////
+// INFO/WARN/FAIL style macros: streams the message(s) into the result.
+// StreamEndStop makes a trailing `+` with no message well-formed.
+#ifdef CATCH_CONFIG_VARIADIC_MACROS
+    #define INTERNAL_CATCH_MSG( messageType, resultDisposition, macroName, ... ) \
+        do { \
+            Catch::ResultBuilder __catchResult( macroName, CATCH_INTERNAL_LINEINFO, "", resultDisposition ); \
+            __catchResult << __VA_ARGS__ + ::Catch::StreamEndStop(); \
+            __catchResult.captureResult( messageType ); \
+            INTERNAL_CATCH_REACT( __catchResult ) \
+        } while( Catch::alwaysFalse() )
+#else
+    #define INTERNAL_CATCH_MSG( messageType, resultDisposition, macroName, log ) \
+        do { \
+            Catch::ResultBuilder __catchResult( macroName, CATCH_INTERNAL_LINEINFO, "", resultDisposition ); \
+            __catchResult << log + ::Catch::StreamEndStop(); \
+            __catchResult.captureResult( messageType ); \
+            INTERNAL_CATCH_REACT( __catchResult ) \
+        } while( Catch::alwaysFalse() )
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// INFO: attaches a scoped message that lives until the end of the enclosing
+// scope and is appended to any assertion failing within it.
+#define INTERNAL_CATCH_INFO( log, macroName ) \
+    Catch::ScopedMessage INTERNAL_CATCH_UNIQUE_NAME( scopedMessage ) = Catch::MessageBuilder( macroName, CATCH_INTERNAL_LINEINFO, Catch::ResultWas::Info ) << log;
+
+///////////////////////////////////////////////////////////////////////////////
+// CHECK_THAT/REQUIRE_THAT: applies a matcher to arg and reports
+// `<arg> matches <matcher>` with the matcher's own description.
+#define INTERNAL_CHECK_THAT( arg, matcher, resultDisposition, macroName ) \
+    do { \
+        Catch::ResultBuilder __catchResult( macroName, CATCH_INTERNAL_LINEINFO, #arg ", " #matcher, resultDisposition ); \
+        try { \
+            std::string matcherAsString = (matcher).toString(); \
+            __catchResult \
+                .setLhs( Catch::toString( arg ) ) \
+                .setRhs( matcherAsString == Catch::Detail::unprintableString ? #matcher : matcherAsString ) \
+                .setOp( "matches" ) \
+                .setResultType( (matcher).match( arg ) ); \
+            __catchResult.captureExpression(); \
+        } catch( ... ) { \
+            __catchResult.useActiveException( resultDisposition | Catch::ResultDisposition::ContinueOnFailure ); \
+        } \
+        INTERNAL_CATCH_REACT( __catchResult ) \
+    } while( Catch::alwaysFalse() )
+
+// #included from: internal/catch_section.h
+#define TWOBLUECUBES_CATCH_SECTION_H_INCLUDED
+
+// #included from: catch_section_info.h
+#define TWOBLUECUBES_CATCH_SECTION_INFO_H_INCLUDED
+
+// #included from: catch_totals.hpp
+#define TWOBLUECUBES_CATCH_TOTALS_HPP_INCLUDED
+
+#include <cstddef>
+
+namespace Catch {
+
+    // Tally of outcomes (for assertions or test cases). failedButOk records
+    // failures that do not make the run not-OK: allOk() ignores them, while
+    // allPassed() does not.
+    struct Counts {
+        Counts() : passed( 0 ), failed( 0 ), failedButOk( 0 ) {}
+
+        // Element-wise difference between two tallies.
+        Counts operator - ( Counts const& other ) const {
+            Counts result;
+            result.passed      = passed      - other.passed;
+            result.failed      = failed      - other.failed;
+            result.failedButOk = failedButOk - other.failedButOk;
+            return result;
+        }
+        // Element-wise accumulation.
+        Counts& operator += ( Counts const& other ) {
+            passed      += other.passed;
+            failed      += other.failed;
+            failedButOk += other.failedButOk;
+            return *this;
+        }
+
+        // Grand total of all recorded outcomes.
+        std::size_t total() const { return passed + failed + failedButOk; }
+        // True only when nothing failed at all, tolerated or not.
+        bool allPassed() const { return 0 == failed && 0 == failedButOk; }
+        // True when there were no hard failures (failedButOk is ignored).
+        bool allOk() const { return 0 == failed; }
+
+        std::size_t passed;
+        std::size_t failed;
+        std::size_t failedButOk;
+    };
+
+    // Aggregates assertion and test-case tallies for (part of) a run.
+    struct Totals {
+
+        // Element-wise difference of both tallies.
+        Totals operator - ( Totals const& other ) const {
+            Totals result;
+            result.assertions = assertions - other.assertions;
+            result.testCases  = testCases  - other.testCases;
+            return result;
+        }
+
+        // Change since prevTotals, with exactly one test case added to the
+        // result, classified by the worst assertion outcome in the interval:
+        // failed > failedButOk > passed.
+        Totals delta( Totals const& prevTotals ) const {
+            Totals result = *this - prevTotals;
+            if( result.assertions.failed > 0 )
+                ++result.testCases.failed;
+            else if( result.assertions.failedButOk > 0 )
+                ++result.testCases.failedButOk;
+            else
+                ++result.testCases.passed;
+            return result;
+        }
+
+        // Element-wise accumulation of both tallies.
+        Totals& operator += ( Totals const& other ) {
+            assertions += other.assertions;
+            testCases  += other.testCases;
+            return *this;
+        }
+
+        Counts assertions;
+        Counts testCases;
+    };
+}
+
+namespace Catch {
+
+    // Identity of a SECTION: its name, optional description, and where in
+    // the source it was declared (constructor implemented elsewhere).
+    struct SectionInfo {
+        SectionInfo
+            (   SourceLineInfo const& _lineInfo,
+                std::string const& _name,
+                std::string const& _description = std::string() );
+
+        std::string name;
+        std::string description;
+        SourceLineInfo lineInfo;
+    };
+
+    // Data reported when a section finishes: which section it was, the
+    // assertion counts recorded before it started, and how long it ran.
+    struct SectionEndInfo {
+        SectionEndInfo( SectionInfo const& _sectionInfo, Counts const& _prevAssertions, double _durationInSeconds )
+        : sectionInfo( _sectionInfo ), prevAssertions( _prevAssertions ), durationInSeconds( _durationInSeconds )
+        {}
+
+        SectionInfo sectionInfo;
+        Counts prevAssertions;
+        double durationInSeconds;
+    };
+
+} // end namespace Catch
+
+// #included from: catch_timer.h
+#define TWOBLUECUBES_CATCH_TIMER_H_INCLUDED
+
+#ifdef CATCH_PLATFORM_WINDOWS
+typedef unsigned long long uint64_t;
+#else
+#include <stdint.h>
+#endif
+
+namespace Catch {
+
+    // Simple elapsed-time measurement: start() records a tick count and the
+    // getElapsed* accessors report time since then. All member functions
+    // except the constructor are implemented elsewhere (per platform).
+    class Timer {
+    public:
+        Timer() : m_ticks( 0 ) {}
+        void start();
+        unsigned int getElapsedMicroseconds() const;
+        unsigned int getElapsedMilliseconds() const;
+        double getElapsedSeconds() const;
+
+    private:
+        uint64_t m_ticks; // tick count captured by start()
+    };
+
+} // namespace Catch
+
+#include <string>
+
+namespace Catch {
+
+    // RAII object behind the SECTION macro. Member functions are implemented
+    // elsewhere; judging from the members, construction presumably queries
+    // whether this section should run on this pass and starts the timer —
+    // TODO(review): confirm against catch_section implementation.
+    class Section : NonCopyable {
+    public:
+        Section( SectionInfo const& info );
+        ~Section();
+
+        // This indicates whether the section should be executed or not
+        operator bool() const;
+
+    private:
+        SectionInfo m_info;
+
+        std::string m_name;
+        Counts m_assertions;    // assertion counts snapshot for this section
+        bool m_sectionIncluded; // whether this section runs on this pass
+        Timer m_timer;          // measures the section's duration
+    };
+
+} // end namespace Catch
+
+#ifdef CATCH_CONFIG_VARIADIC_MACROS
+    #define INTERNAL_CATCH_SECTION( ... ) \
+        if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, __VA_ARGS__ ) )
+#else
+    #define INTERNAL_CATCH_SECTION( name, desc ) \
+        if( Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME( catch_internal_Section ) = Catch::SectionInfo( CATCH_INTERNAL_LINEINFO, name, desc ) )
+#endif
+
+// #included from: internal/catch_generators.hpp
+#define TWOBLUECUBES_CATCH_GENERATORS_HPP_INCLUDED
+
+#include <iterator>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+
+namespace Catch {
+
+// Interface for a generator producing one value per index in [0, size()).
+template<typename T>
+struct IGenerator {
+    virtual ~IGenerator() {}
+    // Returns the value at the given position (index must be < size()).
+    virtual T getValue( std::size_t index ) const = 0;
+    virtual std::size_t size () const = 0;
+};
+
+// Generates the inclusive range [from, to]: value at index i is from + i.
+template<typename T>
+class BetweenGenerator : public IGenerator<T> {
+public:
+    BetweenGenerator( T from, T to ) : m_first( from ), m_last( to ){}
+
+    virtual T getValue( std::size_t index ) const {
+        return m_first+static_cast<int>( index );
+    }
+
+    // Count of values in the inclusive range.
+    virtual std::size_t size() const {
+        return static_cast<std::size_t>( 1+m_last-m_first );
+    }
+
+private:
+
+    T m_first;
+    T m_last;
+};
+
+// Generator backed by an explicit list of values appended via add().
+template<typename T>
+class ValuesGenerator : public IGenerator<T> {
+public:
+    ValuesGenerator(){}
+
+    // Appends one value to the sequence.
+    void add( T value ) {
+        m_storage.push_back( value );
+    }
+
+    virtual T getValue( std::size_t index ) const {
+        return m_storage[index];
+    }
+
+    virtual std::size_t size() const {
+        return m_storage.size();
+    }
+
+private:
+    std::vector<T> m_storage;
+};
+
+// Owns a sequence of IGenerator instances and exposes them as one flat
+// range; the conversion operator yields the value for the current overall
+// index, as maintained by the active context keyed on m_fileInfo.
+template<typename T>
+class CompositeGenerator {
+public:
+    CompositeGenerator() : m_totalSize( 0 ) {}
+
+    // *** Move semantics, similar to auto_ptr ***
+    // "Copying" transfers ownership of the composed generators from other,
+    // leaving other empty (its m_composed is cleared by move()).
+    CompositeGenerator( CompositeGenerator& other )
+    :   m_fileInfo( other.m_fileInfo ),
+        m_totalSize( 0 )
+    {
+        move( other );
+    }
+
+    // Records the file/line string used as the key for the generator index.
+    CompositeGenerator& setFileInfo( const char* fileInfo ) {
+        m_fileInfo = fileInfo;
+        return *this;
+    }
+
+    // Deletes all owned generators.
+    ~CompositeGenerator() {
+        deleteAll( m_composed );
+    }
+
+    // Picks the value for the current overall index by walking the composed
+    // generators and locating the one whose sub-range contains it.
+    operator T () const {
+        size_t overallIndex = getCurrentContext().getGeneratorIndex( m_fileInfo, m_totalSize );
+
+        typename std::vector<const IGenerator<T>*>::const_iterator it = m_composed.begin();
+        typename std::vector<const IGenerator<T>*>::const_iterator itEnd = m_composed.end();
+        for( size_t index = 0; it != itEnd; ++it )
+        {
+            const IGenerator<T>* generator = *it;
+            if( overallIndex >= index && overallIndex < index + generator->size() )
+            {
+                return generator->getValue( overallIndex-index );
+            }
+            index += generator->size();
+        }
+        CATCH_INTERNAL_ERROR( "Indexed past end of generated range" );
+        return T(); // Suppress spurious "not all control paths return a value" warning in Visual Studio - if you know how to fix this please do so
+    }
+
+    // Takes ownership of generator and extends the flat range by its size.
+    void add( const IGenerator<T>* generator ) {
+        m_totalSize += generator->size();
+        m_composed.push_back( generator );
+    }
+
+    // Appends (and takes ownership of) all generators held by other.
+    CompositeGenerator& then( CompositeGenerator& other ) {
+        move( other );
+        return *this;
+    }
+
+    // Appends a single literal value to the range.
+    CompositeGenerator& then( T value ) {
+        ValuesGenerator<T>* valuesGen = new ValuesGenerator<T>();
+        valuesGen->add( value );
+        add( valuesGen );
+        return *this;
+    }
+
+private:
+
+    // Transfers ownership of other's generators into this composite.
+    void move( CompositeGenerator& other ) {
+        std::copy( other.m_composed.begin(), other.m_composed.end(), std::back_inserter( m_composed ) );
+        m_totalSize += other.m_totalSize;
+        other.m_composed.clear();
+    }
+
+    std::vector<const IGenerator<T>*> m_composed; // owned; freed in dtor
+    std::string m_fileInfo;
+    size_t m_totalSize;
+};
+
+namespace Generators
+{
+    // Creates a composite generator covering the inclusive range [from, to].
+    template<typename T>
+    CompositeGenerator<T> between( T from, T to ) {
+        CompositeGenerator<T> composite;
+        composite.add( new BetweenGenerator<T>( from, to ) );
+        return composite;
+    }
+
+    // Creates a composite generator over two explicit values.
+    template<typename T>
+    CompositeGenerator<T> values( T val1, T val2 ) {
+        CompositeGenerator<T> composite;
+        ValuesGenerator<T>* listGen = new ValuesGenerator<T>();
+        listGen->add( val1 );
+        listGen->add( val2 );
+        composite.add( listGen );
+        return composite;
+    }
+
+    // Creates a composite generator over three explicit values.
+    template<typename T>
+    CompositeGenerator<T> values( T val1, T val2, T val3 ){
+        CompositeGenerator<T> composite;
+        ValuesGenerator<T>* listGen = new ValuesGenerator<T>();
+        listGen->add( val1 );
+        listGen->add( val2 );
+        listGen->add( val3 );
+        composite.add( listGen );
+        return composite;
+    }
+
+    // Creates a composite generator over four explicit values.
+    template<typename T>
+    CompositeGenerator<T> values( T val1, T val2, T val3, T val4 ) {
+        CompositeGenerator<T> composite;
+        ValuesGenerator<T>* listGen = new ValuesGenerator<T>();
+        listGen->add( val1 );
+        listGen->add( val2 );
+        listGen->add( val3 );
+        listGen->add( val4 );
+        composite.add( listGen );
+        return composite;
+    }
+
+} // end namespace Generators
+
+using namespace Generators;
+
+} // end namespace Catch
+
+#define INTERNAL_CATCH_LINESTR2( line ) #line
+#define INTERNAL_CATCH_LINESTR( line ) INTERNAL_CATCH_LINESTR2( line )
+
+#define INTERNAL_CATCH_GENERATE( expr ) expr.setFileInfo( __FILE__ "(" INTERNAL_CATCH_LINESTR( __LINE__ ) ")" )
+
+// #included from: internal/catch_interfaces_exception.h
+#define TWOBLUECUBES_CATCH_INTERFACES_EXCEPTION_H_INCLUDED
+
+#include <string>
+#include <vector>
+
+// #included from: catch_interfaces_registry_hub.h
+#define TWOBLUECUBES_CATCH_INTERFACES_REGISTRY_HUB_H_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    class TestCase;
+    struct ITestCaseRegistry;
+    struct IExceptionTranslatorRegistry;
+    struct IExceptionTranslator;
+    struct IReporterRegistry;
+    struct IReporterFactory;
+
+    // Read-only access to the global registries (reporters, test cases,
+    // exception translators).
+    struct IRegistryHub {
+        virtual ~IRegistryHub();
+
+        virtual IReporterRegistry const& getReporterRegistry() const = 0;
+        virtual ITestCaseRegistry const& getTestCaseRegistry() const = 0;
+        virtual IExceptionTranslatorRegistry& getExceptionTranslatorRegistry() = 0;
+    };
+
+    // Mutating interface used by the static auto-registration objects.
+    struct IMutableRegistryHub {
+        virtual ~IMutableRegistryHub();
+        virtual void registerReporter( std::string const& name, Ptr<IReporterFactory> const& factory ) = 0;
+        virtual void registerListener( Ptr<IReporterFactory> const& factory ) = 0;
+        virtual void registerTest( TestCase const& testInfo ) = 0;
+        virtual void registerTranslator( const IExceptionTranslator* translator ) = 0;
+    };
+
+    // Accessors for the process-wide registry hub (implemented elsewhere).
+    IRegistryHub& getRegistryHub();
+    IMutableRegistryHub& getMutableRegistryHub();
+    void cleanUp();
+    std::string translateActiveException();
+
+}
+
+namespace Catch {
+
+    typedef std::string(*exceptionTranslateFunction)();
+
+    struct IExceptionTranslator;
+    typedef std::vector<const IExceptionTranslator*> ExceptionTranslators;
+
+    // One link in the exception-translation chain: given the remaining
+    // translators, produce a string for the currently active exception.
+    struct IExceptionTranslator {
+        virtual ~IExceptionTranslator();
+        virtual std::string translate( ExceptionTranslators::const_iterator it, ExceptionTranslators::const_iterator itEnd ) const = 0;
+    };
+
+    struct IExceptionTranslatorRegistry {
+        virtual ~IExceptionTranslatorRegistry();
+
+        virtual std::string translateActiveException() const = 0;
+    };
+
+    // Registers a user-supplied translate function for exception type T.
+    // Constructed at static-initialisation time by the
+    // INTERNAL_CATCH_TRANSLATE_EXCEPTION macros below.
+    class ExceptionTranslatorRegistrar {
+        template<typename T>
+        class ExceptionTranslator : public IExceptionTranslator {
+        public:
+
+            ExceptionTranslator( std::string(*translateFunction)( T& ) )
+            : m_translateFunction( translateFunction )
+            {}
+
+            // Chain-of-responsibility: rethrow the active exception (or
+            // delegate to the next translator); if it is caught here as a T,
+            // format it with the registered function.
+            virtual std::string translate( ExceptionTranslators::const_iterator it, ExceptionTranslators::const_iterator itEnd ) const CATCH_OVERRIDE {
+                try {
+                    if( it == itEnd )
+                        throw;
+                    else
+                        return (*it)->translate( it+1, itEnd );
+                }
+                catch( T& ex ) {
+                    return m_translateFunction( ex );
+                }
+            }
+
+        protected:
+            std::string(*m_translateFunction)( T& );
+        };
+
+    public:
+        template<typename T>
+        ExceptionTranslatorRegistrar( std::string(*translateFunction)( T& ) ) {
+            getMutableRegistryHub().registerTranslator
+                ( new ExceptionTranslator<T>( translateFunction ) );
+        }
+    };
+}
+
+///////////////////////////////////////////////////////////////////////////////
+#define INTERNAL_CATCH_TRANSLATE_EXCEPTION2( translatorName, signature ) \
+    static std::string translatorName( signature ); \
+    namespace{ Catch::ExceptionTranslatorRegistrar INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionRegistrar )( &translatorName ); }\
+    static std::string translatorName( signature )
+
+#define INTERNAL_CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION2( INTERNAL_CATCH_UNIQUE_NAME( catch_internal_ExceptionTranslator ), signature )
+
+// #included from: internal/catch_approx.hpp
+#define TWOBLUECUBES_CATCH_APPROX_HPP_INCLUDED
+
+#include <cmath>
+#include <limits>
+
+namespace Catch {
+namespace Detail {
+
+    // Approximate floating-point comparison helper: `x == Approx( y )`
+    // succeeds when |x - y| is within epsilon * (scale + max(|x|, |y|)).
+    class Approx {
+    public:
+        // Defaults: epsilon = 100 * float epsilon, scale = 1.0.
+        explicit Approx ( double value )
+        :   m_epsilon( std::numeric_limits<float>::epsilon()*100 ),
+            m_scale( 1.0 ),
+            m_value( value )
+        {}
+
+        Approx( Approx const& other )
+        :   m_epsilon( other.m_epsilon ),
+            m_scale( other.m_scale ),
+            m_value( other.m_value )
+        {}
+
+        // A template for creating customised Approx instances via operator().
+        static Approx custom() {
+            return Approx( 0 );
+        }
+
+        // Builds a new Approx for `value` inheriting this one's settings.
+        Approx operator()( double value ) {
+            Approx approx( value );
+            approx.epsilon( m_epsilon );
+            approx.scale( m_scale );
+            return approx;
+        }
+
+        friend bool operator == ( double lhs, Approx const& rhs ) {
+            // Thanks to Richard Harris for his help refining this formula
+            return fabs( lhs - rhs.m_value ) < rhs.m_epsilon * (rhs.m_scale + (std::max)( fabs(lhs), fabs(rhs.m_value) ) );
+        }
+
+        friend bool operator == ( Approx const& lhs, double rhs ) {
+            return operator==( rhs, lhs );
+        }
+
+        friend bool operator != ( double lhs, Approx const& rhs ) {
+            return !operator==( lhs, rhs );
+        }
+
+        friend bool operator != ( Approx const& lhs, double rhs ) {
+            return !operator==( rhs, lhs );
+        }
+
+        // Setters return *this to allow chaining, e.g. Approx(x).epsilon(e).
+        Approx& epsilon( double newEpsilon ) {
+            m_epsilon = newEpsilon;
+            return *this;
+        }
+
+        Approx& scale( double newScale ) {
+            m_scale = newScale;
+            return *this;
+        }
+
+        std::string toString() const {
+            std::ostringstream oss;
+            oss << "Approx( " << Catch::toString( m_value ) << " )";
+            return oss.str();
+        }
+
+    private:
+        double m_epsilon;
+        double m_scale;
+        double m_value;
+    };
+}
+
+// Makes Approx print as "Approx( value )" in assertion messages.
+template<>
+inline std::string toString<Detail::Approx>( Detail::Approx const& value ) {
+    return value.toString();
+}
+
+} // end namespace Catch
+
+// #included from: internal/catch_interfaces_tag_alias_registry.h
+#define TWOBLUECUBES_CATCH_INTERFACES_TAG_ALIAS_REGISTRY_H_INCLUDED
+
+// #included from: catch_tag_alias.h
+#define TWOBLUECUBES_CATCH_TAG_ALIAS_H_INCLUDED
+
+#include <string>
+
+namespace Catch {
+
+    // A tag expression plus the source location where the alias was defined.
+    struct TagAlias {
+        TagAlias( std::string _tag, SourceLineInfo _lineInfo ) : tag( _tag ), lineInfo( _lineInfo ) {}
+
+        std::string tag;
+        SourceLineInfo lineInfo;
+    };
+
+    // Registers a tag alias at static-initialisation time via the
+    // CATCH_REGISTER_TAG_ALIAS macro (constructor implemented elsewhere).
+    struct RegistrarForTagAliases {
+        RegistrarForTagAliases( char const* alias, char const* tag, SourceLineInfo const& lineInfo );
+    };
+
+} // end namespace Catch
+
+#define CATCH_REGISTER_TAG_ALIAS( alias, spec ) namespace{ Catch::RegistrarForTagAliases INTERNAL_CATCH_UNIQUE_NAME( AutoRegisterTagAlias )( alias, spec, CATCH_INTERNAL_LINEINFO ); }
+// #included from: catch_option.hpp
+#define TWOBLUECUBES_CATCH_OPTION_HPP_INCLUDED
+
+namespace Catch {
+
+    // An optional type
+    // Holds either a T (constructed in-place into `storage`) or nothing.
+    // NOTE(review): `char storage[sizeof(T)]` carries no alignment
+    // guarantee for T — presumably fine on the platforms Catch targets,
+    // but strictly the buffer may be under-aligned; confirm if porting.
+    template<typename T>
+    class Option {
+    public:
+        Option() : nullableValue( CATCH_NULL ) {}
+        Option( T const& _value )
+        : nullableValue( new( storage ) T( _value ) )
+        {}
+        Option( Option const& _other )
+        : nullableValue( _other ? new( storage ) T( *_other ) : CATCH_NULL )
+        {}
+
+        ~Option() {
+            reset();
+        }
+
+        Option& operator= ( Option const& _other ) {
+            if( &_other != this ) {
+                reset();
+                if( _other )
+                    nullableValue = new( storage ) T( *_other );
+            }
+            return *this;
+        }
+        Option& operator = ( T const& _value ) {
+            reset();
+            nullableValue = new( storage ) T( _value );
+            return *this;
+        }
+
+        // Destroys the held value (if any) and returns to the empty state.
+        void reset() {
+            if( nullableValue )
+                nullableValue->~T();
+            nullableValue = CATCH_NULL;
+        }
+
+        // Accessors; behaviour is undefined when the Option is empty.
+        T& operator*() { return *nullableValue; }
+        T const& operator*() const { return *nullableValue; }
+        T* operator->() { return nullableValue; }
+        const T* operator->() const { return nullableValue; }
+
+        // Returns the held value, or defaultValue when empty.
+        T valueOr( T const& defaultValue ) const {
+            return nullableValue ? *nullableValue : defaultValue;
+        }
+
+        bool some() const { return nullableValue != CATCH_NULL; }
+        bool none() const { return nullableValue == CATCH_NULL; }
+
+        bool operator !() const { return nullableValue == CATCH_NULL; }
+        // Safe-bool idiom (pre-C++11 style): truthy when a value is held.
+        operator SafeBool::type() const {
+            return SafeBool::makeSafe( some() );
+        }
+
+    private:
+        T* nullableValue;             // points into `storage`, or null
+        char storage[sizeof(T)];      // in-place buffer for the held T
+    };
+
+} // end namespace Catch
+
+namespace Catch {
+
+    // Lookup/expansion interface for registered tag aliases.
+    struct ITagAliasRegistry {
+        virtual ~ITagAliasRegistry();
+        // Returns the alias registered under `alias`, or an empty Option.
+        virtual Option<TagAlias> find( std::string const& alias ) const = 0;
+        // Replaces any aliases in the spec with their registered expansions.
+        virtual std::string expandAliases( std::string const& unexpandedTestSpec ) const = 0;
+
+        static ITagAliasRegistry const& get();
+    };
+
+} // end namespace Catch
+
+// These files are included here so the single_include script doesn't put them
+// in the conditionally compiled sections
+// #included from: internal/catch_test_case_info.h
+#define TWOBLUECUBES_CATCH_TEST_CASE_INFO_H_INCLUDED
+
+#include <string>
+#include <set>
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+namespace Catch {
+
+    struct ITestCase;
+
+    // Metadata describing one test case: its name, tags, source location and
+    // special properties derived from the tags.
+    struct TestCaseInfo {
+        // Bit flags set from special tags such as [.], [!shouldfail],
+        // [!mayfail] and [!throws].
+        enum SpecialProperties{
+            None = 0,
+            IsHidden = 1 << 1,
+            ShouldFail = 1 << 2,
+            MayFail = 1 << 3,
+            Throws = 1 << 4
+        };
+
+        TestCaseInfo(   std::string const& _name,
+                        std::string const& _className,
+                        std::string const& _description,
+                        std::set<std::string> const& _tags,
+                        SourceLineInfo const& _lineInfo );
+
+        TestCaseInfo( TestCaseInfo const& other );
+
+        friend void setTags( TestCaseInfo& testCaseInfo, std::set<std::string> const& tags );
+
+        // Queries over `properties` (implemented elsewhere).
+        bool isHidden() const;
+        bool throws() const;
+        bool okToFail() const;
+        bool expectedToFail() const;
+
+        std::string name;
+        std::string className;
+        std::string description;
+        std::set<std::string> tags;
+        std::set<std::string> lcaseTags;   // lower-cased copy used for matching
+        std::string tagsAsString;
+        SourceLineInfo lineInfo;
+        SpecialProperties properties;
+    };
+
+    // A runnable test case: its metadata plus a ref-counted pointer to the
+    // invokable test body.
+    class TestCase : public TestCaseInfo {
+    public:
+
+        TestCase( ITestCase* testCase, TestCaseInfo const& info );
+        TestCase( TestCase const& other );
+
+        // Returns a copy of this test case under a different name.
+        TestCase withName( std::string const& _newName ) const;
+
+        // Runs the test body.
+        void invoke() const;
+
+        TestCaseInfo const& getTestCaseInfo() const;
+
+        void swap( TestCase& other );
+        bool operator == ( TestCase const& other ) const;
+        bool operator < ( TestCase const& other ) const;
+        TestCase& operator = ( TestCase const& other );
+
+    private:
+        Ptr<ITestCase> test;
+    };
+
+    // Factory building a TestCase from its parts (implemented elsewhere).
+    TestCase makeTestCase(  ITestCase* testCase,
+                            std::string const& className,
+                            std::string const& name,
+                            std::string const& description,
+                            SourceLineInfo const& lineInfo );
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+
+#ifdef __OBJC__
+// #included from: internal/catch_objc.hpp
+#define TWOBLUECUBES_CATCH_OBJC_HPP_INCLUDED
+
+#import <objc/runtime.h>
+
+#include <string>
+
+// NB. Any general catch headers included here must be included
+// in catch.hpp first to make sure they are included by the single
+// header for non obj-usage
+
+///////////////////////////////////////////////////////////////////////////////
+// This protocol is really only here for (self) documenting purposes, since
+// all its methods are optional.
+@protocol OcFixture
+
+@optional
+
+-(void) setUp;
+-(void) tearDown;
+
+@end
+
+namespace Catch {
+
+    // Adapts an Objective-C test method (class + selector) to Catch's
+    // ITestCase interface.
+    class OcMethod : public SharedImpl<ITestCase> {
+
+    public:
+        OcMethod( Class cls, SEL sel ) : m_cls( cls ), m_sel( sel ) {}
+
+        // Runs one test: alloc/init an instance, invoke optional setUp,
+        // the test selector itself, then optional tearDown, then release.
+        virtual void invoke() const {
+            id obj = [[m_cls alloc] init];
+
+            performOptionalSelector( obj, @selector(setUp)  );
+            performOptionalSelector( obj, m_sel );
+            performOptionalSelector( obj, @selector(tearDown)  );
+
+            arcSafeRelease( obj );
+        }
+    private:
+        // Private dtor: lifetime is managed by SharedImpl reference counting.
+        virtual ~OcMethod() {}
+
+        Class m_cls;
+        SEL m_sel;
+    };
+
+    namespace Detail{
+
+        // Looks up an annotation (e.g. "Name" or "Description") for a test
+        // case by invoking the optional class method
+        // +Catch_<annotationName>_<testCaseName>; returns its string value,
+        // or "" when the selector is not implemented.
+        inline std::string getAnnotation(   Class cls,
+                                            std::string const& annotationName,
+                                            std::string const& testCaseName ) {
+            NSString* selStr = [[NSString alloc] initWithFormat:@"Catch_%s_%s", annotationName.c_str(), testCaseName.c_str()];
+            SEL sel = NSSelectorFromString( selStr );
+            arcSafeRelease( selStr );
+            id value = performOptionalSelector( cls, sel );
+            if( value )
+                return [(NSString*)value UTF8String];
+            return "";
+        }
+    }
+
+    // Scans every Objective-C class in the runtime for methods whose name
+    // starts with "Catch_TestCase_" (generated by OC_TEST_CASE) and
+    // registers each as a Catch test case. Returns the number registered.
+    inline size_t registerTestMethods() {
+        size_t noTestMethods = 0;
+        int noClasses = objc_getClassList( CATCH_NULL, 0 );
+
+        Class* classes = (CATCH_UNSAFE_UNRETAINED Class *)malloc( sizeof(Class) * noClasses);
+        objc_getClassList( classes, noClasses );
+
+        for( int c = 0; c < noClasses; c++ ) {
+            Class cls = classes[c];
+            {
+                u_int count;
+                Method* methods = class_copyMethodList( cls, &count );
+                for( u_int m = 0; m < count ; m++ ) {
+                    SEL selector = method_getName(methods[m]);
+                    std::string methodName = sel_getName(selector);
+                    if( startsWith( methodName, "Catch_TestCase_" ) ) {
+                        // Strip the "Catch_TestCase_" prefix (15 chars).
+                        std::string testCaseName = methodName.substr( 15 );
+                        std::string name = Detail::getAnnotation( cls, "Name", testCaseName );
+                        std::string desc = Detail::getAnnotation( cls, "Description", testCaseName );
+                        const char* className = class_getName( cls );
+
+                        getMutableRegistryHub().registerTest( makeTestCase( new OcMethod( cls, selector ), className, name.c_str(), desc.c_str(), SourceLineInfo() ) );
+                        noTestMethods++;
+                    }
+                }
+                free(methods);
+            }
+        }
+        return noTestMethods;
+    }
+
+    namespace Matchers {
+        namespace Impl {
+        namespace NSStringMatchers {
+
+            template<typename MatcherT>
+            struct StringHolder : MatcherImpl<MatcherT, NSString*>{
+                StringHolder( NSString* substr ) : m_substr( [substr copy] ){}
+                StringHolder( StringHolder const& other ) : m_substr( [other.m_substr copy] ){}
+                StringHolder() {
+                    arcSafeRelease( m_substr );
+                }
+
+                NSString* m_substr;
+            };
+
+            // Matches when str is exactly equal to the held string.
+            struct Equals : StringHolder<Equals> {
+                Equals( NSString* substr ) : StringHolder( substr ){}
+
+                virtual bool match( ExpressionType const& str ) const {
+                    // Guard: isEqualToString: on nil would be false/no-op.
+                    return  (str != nil || m_substr == nil ) &&
+                            [str isEqualToString:m_substr];
+                }
+
+                virtual std::string toString() const {
+                    return "equals string: " + Catch::toString( m_substr );
+                }
+            };
+
+            // Matches when str contains the held string as a substring.
+            struct Contains : StringHolder<Contains> {
+                Contains( NSString* substr ) : StringHolder( substr ){}
+
+                virtual bool match( ExpressionType const& str ) const {
+                    return  (str != nil || m_substr == nil ) &&
+                            [str rangeOfString:m_substr].location != NSNotFound;
+                }
+
+                virtual std::string toString() const {
+                    return "contains string: " + Catch::toString( m_substr );
+                }
+            };
+
+            // Matches when str begins with the held string.
+            struct StartsWith : StringHolder<StartsWith> {
+                StartsWith( NSString* substr ) : StringHolder( substr ){}
+
+                virtual bool match( ExpressionType const& str ) const {
+                    return  (str != nil || m_substr == nil ) &&
+                            [str rangeOfString:m_substr].location == 0;
+                }
+
+                virtual std::string toString() const {
+                    return "starts with: " + Catch::toString( m_substr );
+                }
+            };
+            // Matches when str ends with the held string.
+            struct EndsWith : StringHolder<EndsWith> {
+                EndsWith( NSString* substr ) : StringHolder( substr ){}
+
+                virtual bool match( ExpressionType const& str ) const {
+                    // NOTE(review): when m_substr is longer than str the
+                    // NSUInteger subtraction wraps around; rangeOfString:
+                    // then yields NSNotFound so the comparison presumably
+                    // fails as intended — confirm on edge cases.
+                    return  (str != nil || m_substr == nil ) &&
+                            [str rangeOfString:m_substr].location == [str length] - [m_substr length];
+                }
+
+                virtual std::string toString() const {
+                    return "ends with: " + Catch::toString( m_substr );
+                }
+            };
+
+        } // namespace NSStringMatchers
+        } // namespace Impl
+
+        // Convenience factories so test code can write e.g. Equals(@"foo")
+        // without naming the Impl::NSStringMatchers namespace.
+        inline Impl::NSStringMatchers::Equals
+            Equals( NSString* substr ){ return Impl::NSStringMatchers::Equals( substr ); }
+
+        inline Impl::NSStringMatchers::Contains
+            Contains( NSString* substr ){ return Impl::NSStringMatchers::Contains( substr ); }
+
+        inline Impl::NSStringMatchers::StartsWith
+            StartsWith( NSString* substr ){ return Impl::NSStringMatchers::StartsWith( substr ); }
+
+        inline Impl::NSStringMatchers::EndsWith
+            EndsWith( NSString* substr ){ return Impl::NSStringMatchers::EndsWith( substr ); }
+
+    } // namespace Matchers
+
+    using namespace Matchers;
+
+} // namespace Catch
+
+///////////////////////////////////////////////////////////////////////////////
+#define OC_TEST_CASE( name, desc )\
++(NSString*) INTERNAL_CATCH_UNIQUE_NAME( Catch_Name_test ) \
+{\
+return @ name; \
+}\
++(NSString*) INTERNAL_CATCH_UNIQUE_NAME( Catch_Description_test ) \
+{ \
+return @ desc; \
+} \
+-(void) INTERNAL_CATCH_UNIQUE_NAME( Catch_TestCase_test )
+
+#endif
+
+#ifdef CATCH_IMPL
+// #included from: internal/catch_impl.hpp
+#define TWOBLUECUBES_CATCH_IMPL_HPP_INCLUDED
+
+// Collect all the implementation files together here
+// These are the equivalent of what would usually be cpp files
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wweak-vtables"
+#endif
+
+// #included from: ../catch_session.hpp
+#define TWOBLUECUBES_CATCH_RUNNER_HPP_INCLUDED
+
+// #included from: internal/catch_commandline.hpp
+#define TWOBLUECUBES_CATCH_COMMANDLINE_HPP_INCLUDED
+
+// #included from: catch_config.hpp
+#define TWOBLUECUBES_CATCH_CONFIG_HPP_INCLUDED
+
+// #included from: catch_test_spec_parser.hpp
+#define TWOBLUECUBES_CATCH_TEST_SPEC_PARSER_HPP_INCLUDED
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+// #included from: catch_test_spec.hpp
+#define TWOBLUECUBES_CATCH_TEST_SPEC_HPP_INCLUDED
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+// #included from: catch_wildcard_pattern.hpp
+#define TWOBLUECUBES_CATCH_WILDCARD_PATTERN_HPP_INCLUDED
+
+namespace Catch
+{
+    // Matches strings against a pattern that may carry a '*' wildcard at
+    // the start and/or the end. The wildcards are stripped from the stored
+    // pattern and their positions recorded in m_wildcard.
+    class WildcardPattern {
+        enum WildcardPosition {
+            NoWildcard = 0,
+            WildcardAtStart = 1,
+            WildcardAtEnd = 2,
+            WildcardAtBothEnds = WildcardAtStart | WildcardAtEnd
+        };
+
+    public:
+
+        WildcardPattern( std::string const& pattern, CaseSensitive::Choice caseSensitivity )
+        :   m_caseSensitivity( caseSensitivity ),
+            m_wildcard( NoWildcard ),
+            m_pattern( adjustCase( pattern ) )
+        {
+            // Strip a leading '*' and remember it.
+            if( startsWith( m_pattern, "*" ) ) {
+                m_pattern = m_pattern.substr( 1 );
+                m_wildcard = WildcardAtStart;
+            }
+            // Strip a trailing '*' and remember it (may combine with start).
+            if( endsWith( m_pattern, "*" ) ) {
+                m_pattern = m_pattern.substr( 0, m_pattern.size()-1 );
+                m_wildcard = static_cast<WildcardPosition>( m_wildcard | WildcardAtEnd );
+            }
+        }
+        virtual ~WildcardPattern();
+        // Dispatches on the recorded wildcard position: exact match,
+        // suffix match, prefix match, or substring match respectively.
+        virtual bool matches( std::string const& str ) const {
+            switch( m_wildcard ) {
+                case NoWildcard:
+                    return m_pattern == adjustCase( str );
+                case WildcardAtStart:
+                    return endsWith( adjustCase( str ), m_pattern );
+                case WildcardAtEnd:
+                    return startsWith( adjustCase( str ), m_pattern );
+                case WildcardAtBothEnds:
+                    return contains( adjustCase( str ), m_pattern );
+            }
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wunreachable-code"
+#endif
+            throw std::logic_error( "Unknown enum" );
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+        }
+    private:
+        // Lower-cases str when matching case-insensitively.
+        std::string adjustCase( std::string const& str ) const {
+            return m_caseSensitivity == CaseSensitive::No ? toLower( str ) : str;
+        }
+        CaseSensitive::Choice m_caseSensitivity;
+        WildcardPosition m_wildcard;
+        std::string m_pattern;  // pattern with wildcard chars stripped
+    };
+}
+
+#include <string>
+#include <vector>
+
+namespace Catch {
+
+    // A parsed test specification: a disjunction of filters, where each
+    // filter is a conjunction of name/tag patterns.
+    class TestSpec {
+        struct Pattern : SharedImpl<> {
+            virtual ~Pattern();
+            virtual bool matches( TestCaseInfo const& testCase ) const = 0;
+        };
+        // Case-insensitive (possibly wildcarded) match on the test name.
+        class NamePattern : public Pattern {
+        public:
+            NamePattern( std::string const& name )
+            : m_wildcardPattern( toLower( name ), CaseSensitive::No )
+            {}
+            virtual ~NamePattern();
+            virtual bool matches( TestCaseInfo const& testCase ) const {
+                return m_wildcardPattern.matches( toLower( testCase.name ) );
+            }
+        private:
+            WildcardPattern m_wildcardPattern;
+        };
+
+        // Matches when the test case carries the given (lower-cased) tag.
+        class TagPattern : public Pattern {
+        public:
+            TagPattern( std::string const& tag ) : m_tag( toLower( tag ) ) {}
+            virtual ~TagPattern();
+            virtual bool matches( TestCaseInfo const& testCase ) const {
+                return testCase.lcaseTags.find( m_tag ) != testCase.lcaseTags.end();
+            }
+        private:
+            std::string m_tag;
+        };
+
+        // Inverts another pattern (used for exclusions introduced with '~').
+        class ExcludedPattern : public Pattern {
+        public:
+            ExcludedPattern( Ptr<Pattern> const& underlyingPattern ) : m_underlyingPattern( underlyingPattern ) {}
+            virtual ~ExcludedPattern();
+            virtual bool matches( TestCaseInfo const& testCase ) const { return !m_underlyingPattern->matches( testCase ); }
+        private:
+            Ptr<Pattern> m_underlyingPattern;
+        };
+
+        struct Filter {
+            std::vector<Ptr<Pattern> > m_patterns;
+
+            bool matches( TestCaseInfo const& testCase ) const {
+                // All patterns in a filter must match for the filter to be a match.
+                // (Braces added: the original indented `return true;` as if it
+                // were inside the loop, although it actually ran after it.)
+                for( std::vector<Ptr<Pattern> >::const_iterator it = m_patterns.begin(), itEnd = m_patterns.end(); it != itEnd; ++it ) {
+                    if( !(*it)->matches( testCase ) )
+                        return false;
+                }
+                return true;
+            }
+        };
+
+    public:
+        // True when at least one filter has been added.
+        bool hasFilters() const {
+            return !m_filters.empty();
+        }
+        // A TestSpec matches if any one of its filters matches.
+        bool matches( TestCaseInfo const& testCase ) const {
+            for( std::vector<Filter>::const_iterator it = m_filters.begin(), itEnd = m_filters.end(); it != itEnd; ++it )
+                if( it->matches( testCase ) )
+                    return true;
+            return false;
+        }
+
+    private:
+        std::vector<Filter> m_filters;
+
+        friend class TestSpecParser;
+    };
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+namespace Catch {
+
    // Parses test-spec command line arguments (name wildcards, "quoted names",
    // [tag] expressions, optionally negated with '~' or an "exclude:" prefix)
    // into a TestSpec. Implemented as a per-character state machine; tag
    // aliases are expanded before parsing.
    class TestSpecParser {
        enum Mode{ None, Name, QuotedName, Tag };
        Mode m_mode;                        // current lexer state
        bool m_exclusion;                   // true when the next pattern is negated
        std::size_t m_start, m_pos;         // start of current token / scan position in m_arg
        std::string m_arg;                  // argument being parsed (after alias expansion)
        TestSpec::Filter m_currentFilter;   // patterns accumulated for the filter in progress
        TestSpec m_testSpec;                // accumulated result
        ITagAliasRegistry const* m_tagAliases; // used to expand aliases in parse()

    public:
        TestSpecParser( ITagAliasRegistry const& tagAliases ) : m_tagAliases( &tagAliases ) {}

        // Parse one argument string. May be called repeatedly on the same
        // parser; each call can contribute one or more filters to the spec.
        // Returns *this for chaining.
        TestSpecParser& parse( std::string const& arg ) {
            m_mode = None;
            m_exclusion = false;
            m_start = std::string::npos;
            m_arg = m_tagAliases->expandAliases( arg );
            for( m_pos = 0; m_pos < m_arg.size(); ++m_pos )
                visitChar( m_arg[m_pos] );
            // A trailing, unterminated name still becomes a pattern.
            if( m_mode == Name )
                addPattern<TestSpec::NamePattern>();
            return *this;
        }
        // Flush the filter in progress and return the accumulated spec.
        TestSpec testSpec() {
            addFilter();
            return m_testSpec;
        }
    private:
        // State-machine transition for a single character of the argument.
        void visitChar( char c ) {
            if( m_mode == None ) {
                switch( c ) {
                case ' ': return;                     // skip spaces between tokens
                case '~': m_exclusion = true; return; // negate the next pattern
                case '[': return startNewMode( Tag, ++m_pos );        // tag text begins after '['
                case '"': return startNewMode( QuotedName, ++m_pos ); // name begins after '"'
                default: startNewMode( Name, m_pos ); break; // no return: this char is also the name's first char, handled below
                }
            }
            if( m_mode == Name ) {
                if( c == ',' ) {
                    // ',' separates alternative filters within one argument
                    addPattern<TestSpec::NamePattern>();
                    addFilter();
                }
                else if( c == '[' ) {
                    // "exclude:[...]" negates the following tag instead of
                    // emitting "exclude:" as a name pattern
                    if( subString() == "exclude:" )
                        m_exclusion = true;
                    else
                        addPattern<TestSpec::NamePattern>();
                    startNewMode( Tag, ++m_pos );
                }
            }
            else if( m_mode == QuotedName && c == '"' )
                addPattern<TestSpec::NamePattern>();
            else if( m_mode == Tag && c == ']' )
                addPattern<TestSpec::TagPattern>();
        }
        void startNewMode( Mode mode, std::size_t start ) {
            m_mode = mode;
            m_start = start;
        }
        // Text of the token currently being scanned: [m_start, m_pos).
        std::string subString() const { return m_arg.substr( m_start, m_pos - m_start ); }
        // Turn the current token into a pattern of type T, wrapping it in an
        // ExcludedPattern when an exclusion is pending, then reset the lexer.
        template<typename T>
        void addPattern() {
            std::string token = subString();
            if( startsWith( token, "exclude:" ) ) {
                m_exclusion = true;
                token = token.substr( 8 ); // drop the "exclude:" prefix
            }
            if( !token.empty() ) {
                Ptr<TestSpec::Pattern> pattern = new T( token );
                if( m_exclusion )
                    pattern = new TestSpec::ExcludedPattern( pattern );
                m_currentFilter.m_patterns.push_back( pattern );
            }
            m_exclusion = false;
            m_mode = None;
        }
        // Move the patterns collected so far into a completed filter on the spec.
        void addFilter() {
            if( !m_currentFilter.m_patterns.empty() ) {
                m_testSpec.m_filters.push_back( m_currentFilter );
                m_currentFilter = TestSpec::Filter();
            }
        }
    };
+    inline TestSpec parseTestSpec( std::string const& arg ) {
+        return TestSpecParser( ITagAliasRegistry::get() ).parse( arg ).testSpec();
+    }
+
+} // namespace Catch
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+// #included from: catch_interfaces_config.h
+#define TWOBLUECUBES_CATCH_INTERFACES_CONFIG_H_INCLUDED
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace Catch {
+
    // How much output to produce (struct-wrapped enum, pre-C++11 style).
    struct Verbosity { enum Level {
        NoOutput = 0,
        Quiet,
        Normal
    }; };
+
    // Bit flags selecting which conditions should produce warnings.
    struct WarnAbout { enum What {
        Nothing = 0x00,
        NoAssertions = 0x01
    }; };
+
    // Whether to report per-test durations.
    struct ShowDurations { enum OrNot {
        DefaultForReporter,
        Always,
        Never
    }; };
    // Order in which to run test cases.
    struct RunTests { enum InWhatOrder {
        InDeclarationOrder,
        InLexicographicalOrder,
        InRandomOrder
    }; };
    // Whether to colourise console output.
    struct UseColour { enum YesOrNo {
        Auto,
        Yes,
        No
    }; };
+
+    class TestSpec;
+
    // Read-only view of the runtime configuration, shared (ref-counted via
    // IShared) between the runner and reporters. The canonical implementation
    // is the Config class further down this file.
    struct IConfig : IShared {

        virtual ~IConfig();

        virtual bool allowThrows() const = 0;
        virtual std::ostream& stream() const = 0;               // destination for report output
        virtual std::string name() const = 0;
        virtual bool includeSuccessfulResults() const = 0;
        virtual bool shouldDebugBreak() const = 0;
        virtual bool warnAboutMissingAssertions() const = 0;
        virtual int abortAfter() const = 0;
        virtual bool showInvisibles() const = 0;
        virtual ShowDurations::OrNot showDurations() const = 0;
        virtual TestSpec const& testSpec() const = 0;           // which tests to run
        virtual RunTests::InWhatOrder runOrder() const = 0;
        virtual unsigned int rngSeed() const = 0;               // seed for InRandomOrder
        virtual UseColour::YesOrNo useColour() const = 0;
    };
+}
+
+// #included from: catch_stream.h
+#define TWOBLUECUBES_CATCH_STREAM_H_INCLUDED
+
+// #included from: catch_streambuf.h
+#define TWOBLUECUBES_CATCH_STREAMBUF_H_INCLUDED
+
+#include <streambuf>
+
+namespace Catch {
+
    // Common base for Catch's custom stream buffers; exists so concrete
    // buffers can be owned polymorphically and destroyed via a virtual dtor.
    class StreamBufBase : public std::streambuf {
    public:
        virtual ~StreamBufBase() CATCH_NOEXCEPT;
    };
+}
+
+#include <streambuf>
+#include <ostream>
+#include <fstream>
+
+namespace Catch {
+
+    std::ostream& cout();
+    std::ostream& cerr();
+
    // Abstract handle to an output stream; the concrete types below select
    // stdout, a named file, or a debug channel.
    struct IStream {
        virtual ~IStream() CATCH_NOEXCEPT;
        virtual std::ostream& stream() const = 0;
    };
+
    // IStream backed by a file opened by name (members defined elsewhere).
    class FileStream : public IStream {
        mutable std::ofstream m_ofs; // mutable: stream() is const but returns a writable stream
    public:
        FileStream( std::string const& filename );
        virtual ~FileStream() CATCH_NOEXCEPT;
    public: // IStream
        virtual std::ostream& stream() const CATCH_OVERRIDE;
    };
+
    // IStream writing to standard output (members defined elsewhere;
    // presumably routed through Catch::cout() above — implementation not
    // visible here).
    class CoutStream : public IStream {
        mutable std::ostream m_os;
    public:
        CoutStream();
        virtual ~CoutStream() CATCH_NOEXCEPT;

    public: // IStream
        virtual std::ostream& stream() const CATCH_OVERRIDE;
    };
+
    // IStream writing through a custom debug-output stream buffer (selected
    // with the "%debug" output filename; see Config::openStream below).
    // std::auto_ptr is retained for pre-C++11 compatibility.
    class DebugOutStream : public IStream {
        std::auto_ptr<StreamBufBase> m_streamBuf;
        mutable std::ostream m_os;
    public:
        DebugOutStream();
        virtual ~DebugOutStream() CATCH_NOEXCEPT;

    public: // IStream
        virtual std::ostream& stream() const CATCH_OVERRIDE;
    };
+}
+
+#include <memory>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <ctime>
+
+#ifndef CATCH_CONFIG_CONSOLE_WIDTH
+#define CATCH_CONFIG_CONSOLE_WIDTH 80
+#endif
+
+namespace Catch {
+
    // Plain value type holding every configurable setting, as populated by
    // the command line parser; consumed (and interpreted) by Config below.
    struct ConfigData {

        ConfigData()
        :   listTests( false ),
            listTags( false ),
            listReporters( false ),
            listTestNamesOnly( false ),
            showSuccessfulTests( false ),
            shouldDebugBreak( false ),
            noThrow( false ),
            showHelp( false ),
            showInvisibles( false ),
            filenamesAsTags( false ),
            abortAfter( -1 ),
            rngSeed( 0 ),
            verbosity( Verbosity::Normal ),
            warnings( WarnAbout::Nothing ),
            showDurations( ShowDurations::DefaultForReporter ),
            runOrder( RunTests::InDeclarationOrder ),
            useColour( UseColour::Auto )
        {}

        // "List X instead of running tests" modes
        bool listTests;
        bool listTags;
        bool listReporters;
        bool listTestNamesOnly;

        // Behaviour toggles
        bool showSuccessfulTests;
        bool shouldDebugBreak;
        bool noThrow;
        bool showHelp;
        bool showInvisibles;
        bool filenamesAsTags;

        int abortAfter;       // stop after this many failures; -1 = never
        unsigned int rngSeed; // seed for RunTests::InRandomOrder

        Verbosity::Level verbosity;
        WarnAbout::What warnings;
        ShowDurations::OrNot showDurations;
        RunTests::InWhatOrder runOrder;
        UseColour::YesOrNo useColour;

        std::string outputFilename; // "" = stdout; "%debug" = debug stream; else a file path
        std::string name;
        std::string processName;

        std::vector<std::string> reporterNames;
        std::vector<std::string> testsOrTags; // raw spec strings, parsed into a TestSpec by Config
    };
+
    // Concrete IConfig implementation: wraps a ConfigData value, compiles its
    // testsOrTags strings into a TestSpec, and owns the chosen output stream.
    class Config : public SharedImpl<IConfig> {
    private:
        // Non-copyable: copy operations declared but not defined.
        Config( Config const& other );
        Config& operator = ( Config const& other );
        virtual void dummy();
    public:

        Config()
        {}

        // Note: m_data is declared before m_stream, so it is fully initialised
        // by the time openStream() (which reads m_data) runs in the init list.
        Config( ConfigData const& data )
        :   m_data( data ),
            m_stream( openStream() )
        {
            // Fold all test-name/tag spec strings into a single TestSpec.
            if( !data.testsOrTags.empty() ) {
                TestSpecParser parser( ITagAliasRegistry::get() );
                for( std::size_t i = 0; i < data.testsOrTags.size(); ++i )
                    parser.parse( data.testsOrTags[i] );
                m_testSpec = parser.testSpec();
            }
        }

        virtual ~Config() {
        }

        std::string const& getFilename() const {
            return m_data.outputFilename ;
        }

        // Pass-through accessors over the wrapped ConfigData.
        bool listTests() const { return m_data.listTests; }
        bool listTestNamesOnly() const { return m_data.listTestNamesOnly; }
        bool listTags() const { return m_data.listTags; }
        bool listReporters() const { return m_data.listReporters; }

        std::string getProcessName() const { return m_data.processName; }

        bool shouldDebugBreak() const { return m_data.shouldDebugBreak; }

        std::vector<std::string> getReporterNames() const { return m_data.reporterNames; }

        int abortAfter() const { return m_data.abortAfter; }

        TestSpec const& testSpec() const { return m_testSpec; }

        bool showHelp() const { return m_data.showHelp; }
        bool showInvisibles() const { return m_data.showInvisibles; }

        // IConfig interface
        virtual bool allowThrows() const        { return !m_data.noThrow; }
        virtual std::ostream& stream() const    { return m_stream->stream(); }
        virtual std::string name() const        { return m_data.name.empty() ? m_data.processName : m_data.name; }
        virtual bool includeSuccessfulResults() const   { return m_data.showSuccessfulTests; }
        virtual bool warnAboutMissingAssertions() const { return m_data.warnings & WarnAbout::NoAssertions; }
        virtual ShowDurations::OrNot showDurations() const { return m_data.showDurations; }
        virtual RunTests::InWhatOrder runOrder() const  { return m_data.runOrder; }
        virtual unsigned int rngSeed() const    { return m_data.rngSeed; }
        virtual UseColour::YesOrNo useColour() const { return m_data.useColour; }

    private:

        // Select the output stream from outputFilename:
        //   ""        -> stdout
        //   "%debug"  -> debug output stream
        //   "%..."    -> anything else starting with '%' is rejected
        //   otherwise -> a file of that name
        IStream const* openStream() {
            if( m_data.outputFilename.empty() )
                return new CoutStream();
            else if( m_data.outputFilename[0] == '%' ) {
                if( m_data.outputFilename == "%debug" )
                    return new DebugOutStream();
                else
                    throw std::domain_error( "Unrecognised stream: " + m_data.outputFilename );
            }
            else
                return new FileStream( m_data.outputFilename );
        }
        ConfigData m_data;

        // auto_ptr retained for pre-C++11 compatibility; owns the stream.
        std::auto_ptr<IStream const> m_stream;
        TestSpec m_testSpec;
    };
+
+} // end namespace Catch
+
+// #included from: catch_clara.h
+#define TWOBLUECUBES_CATCH_CLARA_H_INCLUDED
+
+// Use Catch's value for console width (store Clara's off to the side, if present)
+#ifdef CLARA_CONFIG_CONSOLE_WIDTH
+#define CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH CLARA_CONFIG_CONSOLE_WIDTH
+#undef CLARA_CONFIG_CONSOLE_WIDTH
+#endif
+#define CLARA_CONFIG_CONSOLE_WIDTH CATCH_CONFIG_CONSOLE_WIDTH
+
+// Declare Clara inside the Catch namespace
+#define STITCH_CLARA_OPEN_NAMESPACE namespace Catch {
+// #included from: ../external/clara.h
+
+// Version 0.0.1.1
+
+// Only use header guard if we are not using an outer namespace
+#if !defined(TWOBLUECUBES_CLARA_H_INCLUDED) || defined(STITCH_CLARA_OPEN_NAMESPACE)
+
+#ifndef STITCH_CLARA_OPEN_NAMESPACE
+#define TWOBLUECUBES_CLARA_H_INCLUDED
+#define STITCH_CLARA_OPEN_NAMESPACE
+#define STITCH_CLARA_CLOSE_NAMESPACE
+#else
+#define STITCH_CLARA_CLOSE_NAMESPACE }
+#endif
+
+#define STITCH_TBC_TEXT_FORMAT_OPEN_NAMESPACE STITCH_CLARA_OPEN_NAMESPACE
+
+// ----------- #included from tbc_text_format.h -----------
+
+// Only use header guard if we are not using an outer namespace
+#if !defined(TBC_TEXT_FORMAT_H_INCLUDED) || defined(STITCH_TBC_TEXT_FORMAT_OUTER_NAMESPACE)
+#ifndef STITCH_TBC_TEXT_FORMAT_OUTER_NAMESPACE
+#define TBC_TEXT_FORMAT_H_INCLUDED
+#endif
+
+#include <string>
+#include <vector>
+#include <sstream>
+#include <algorithm>
+
+// Use optional outer namespace
+#ifdef STITCH_TBC_TEXT_FORMAT_OUTER_NAMESPACE
+namespace STITCH_TBC_TEXT_FORMAT_OUTER_NAMESPACE {
+#endif
+
+namespace Tbc {
+
+#ifdef TBC_TEXT_FORMAT_CONSOLE_WIDTH
+    const unsigned int consoleWidth = TBC_TEXT_FORMAT_CONSOLE_WIDTH;
+#else
+    const unsigned int consoleWidth = 80;
+#endif
+
    // Formatting options for the Text word-wrapper below. The setters are
    // fluent (return *this) so attributes can be chained at the call site.
    struct TextAttributes {
        TextAttributes()
        :   initialIndent( std::string::npos ),
            indent( 0 ),
            width( consoleWidth-1 ),
            tabChar( '\t' )
        {}

        TextAttributes& setInitialIndent( std::size_t _value )  { initialIndent = _value; return *this; }
        TextAttributes& setIndent( std::size_t _value )         { indent = _value; return *this; }
        TextAttributes& setWidth( std::size_t _value )          { width = _value; return *this; }
        TextAttributes& setTabChar( char _value )               { tabChar = _value; return *this; }

        std::size_t initialIndent;  // indent of first line, or npos
        std::size_t indent;         // indent of subsequent lines, or all if initialIndent is npos
        std::size_t width;          // maximum width of text, including indent. Longer text will wrap
        char tabChar;               // If this char is seen the indent is changed to current pos
    };
+
    // Wraps a string into lines no wider than attr.width, preferring to break
    // at one of the "wrappable" characters, honouring embedded '\n', and
    // hyphenating when no break point exists. The configurable tab character
    // marks the column to which continuation lines are indented.
    class Text {
    public:
        Text( std::string const& _str, TextAttributes const& _attr = TextAttributes() )
        : attr( _attr )
        {
            std::string wrappableChars = " [({.,/|\\-";
            // First line may use a distinct indent (initialIndent) when set.
            std::size_t indent = _attr.initialIndent != std::string::npos
                ? _attr.initialIndent
                : _attr.indent;
            std::string remainder = _str;

            while( !remainder.empty() ) {
                // Safety valve against pathological/huge input.
                if( lines.size() >= 1000 ) {
                    lines.push_back( "... message truncated due to excessive size" );
                    return;
                }
                std::size_t tabPos = std::string::npos;
                std::size_t width = (std::min)( remainder.size(), _attr.width - indent );
                std::size_t pos = remainder.find_first_of( '\n' );
                if( pos <= width ) {
                    width = pos; // hard break at the embedded newline
                }
                // A tab char within range sets the continuation indent and is
                // removed from the text.
                pos = remainder.find_last_of( _attr.tabChar, width );
                if( pos != std::string::npos ) {
                    tabPos = pos;
                    if( remainder[width] == '\n' )
                        width--;
                    remainder = remainder.substr( 0, tabPos ) + remainder.substr( tabPos+1 );
                }

                if( width == remainder.size() ) {
                    // Everything left fits on one line.
                    spliceLine( indent, remainder, width );
                }
                else if( remainder[width] == '\n' ) {
                    spliceLine( indent, remainder, width );
                    if( width <= 1 || remainder.size() != 1 )
                        remainder = remainder.substr( 1 ); // consume the '\n'
                    indent = _attr.indent;
                }
                else {
                    // Soft wrap: break at the last wrappable char if there is
                    // one, otherwise cut at the width limit and hyphenate.
                    pos = remainder.find_last_of( wrappableChars, width );
                    if( pos != std::string::npos && pos > 0 ) {
                        spliceLine( indent, remainder, pos );
                        if( remainder[0] == ' ' )
                            remainder = remainder.substr( 1 );
                    }
                    else {
                        spliceLine( indent, remainder, width-1 );
                        lines.back() += "-";
                    }
                    if( lines.size() == 1 )
                        indent = _attr.indent;
                    if( tabPos != std::string::npos )
                        indent += tabPos;
                }
            }
        }

        // Cut the first _pos chars of _remainder into a new, indented line.
        void spliceLine( std::size_t _indent, std::string& _remainder, std::size_t _pos ) {
            lines.push_back( std::string( _indent, ' ' ) + _remainder.substr( 0, _pos ) );
            _remainder = _remainder.substr( _pos );
        }

        typedef std::vector<std::string>::const_iterator const_iterator;

        const_iterator begin() const { return lines.begin(); }
        const_iterator end() const { return lines.end(); }
        std::string const& last() const { return lines.back(); }
        std::size_t size() const { return lines.size(); }
        std::string const& operator[]( std::size_t _index ) const { return lines[_index]; }
        // All wrapped lines joined with '\n' (via operator<< below).
        std::string toString() const {
            std::ostringstream oss;
            oss << *this;
            return oss.str();
        }

        inline friend std::ostream& operator << ( std::ostream& _stream, Text const& _text ) {
            for( Text::const_iterator it = _text.begin(), itEnd = _text.end();
                it != itEnd; ++it ) {
                if( it != _text.begin() )
                    _stream << "\n";
                _stream << *it;
            }
            return _stream;
        }

    private:
        std::string str; // NOTE(review): never written by the code shown here — confirm whether it is dead
        TextAttributes attr;
        std::vector<std::string> lines;
    };
+
+} // end namespace Tbc
+
+#ifdef STITCH_TBC_TEXT_FORMAT_OUTER_NAMESPACE
+} // end outer namespace
+#endif
+
+#endif // TBC_TEXT_FORMAT_H_INCLUDED
+
+// ----------- end of #include from tbc_text_format.h -----------
+// ........... back in clara.h
+
+#undef STITCH_TBC_TEXT_FORMAT_OPEN_NAMESPACE
+
+// ----------- #included from clara_compilers.h -----------
+
+#ifndef TWOBLUECUBES_CLARA_COMPILERS_H_INCLUDED
+#define TWOBLUECUBES_CLARA_COMPILERS_H_INCLUDED
+
+// Detect a number of compiler features - mostly C++11/14 conformance - by compiler
+// The following features are defined:
+//
+// CLARA_CONFIG_CPP11_NULLPTR : is nullptr supported?
+// CLARA_CONFIG_CPP11_NOEXCEPT : is noexcept supported?
+// CLARA_CONFIG_CPP11_GENERATED_METHODS : The delete and default keywords for compiler generated methods
+// CLARA_CONFIG_CPP11_OVERRIDE : is override supported?
+// CLARA_CONFIG_CPP11_UNIQUE_PTR : is unique_ptr supported (otherwise use auto_ptr)
+
+// CLARA_CONFIG_CPP11_OR_GREATER : Is C++11 supported?
+
+// CLARA_CONFIG_VARIADIC_MACROS : are variadic macros supported?
+
+// In general each macro has a _NO_<feature name> form
+// (e.g. CLARA_CONFIG_CPP11_NO_NULLPTR) which disables the feature.
+// Many features, at point of detection, define an _INTERNAL_ macro, so they
// can be combined, en masse, with the _NO_ forms later.
+
+// All the C++11 features can be disabled with CLARA_CONFIG_NO_CPP11
+
+#ifdef __clang__
+
+#if __has_feature(cxx_nullptr)
+#define CLARA_INTERNAL_CONFIG_CPP11_NULLPTR
+#endif
+
+#if __has_feature(cxx_noexcept)
+#define CLARA_INTERNAL_CONFIG_CPP11_NOEXCEPT
+#endif
+
+#endif // __clang__
+
+////////////////////////////////////////////////////////////////////////////////
+// GCC
+#ifdef __GNUC__
+
+#if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && defined(__GXX_EXPERIMENTAL_CXX0X__)
+#define CLARA_INTERNAL_CONFIG_CPP11_NULLPTR
+#endif
+
+// - otherwise more recent versions define __cplusplus >= 201103L
+// and will get picked up below
+
+#endif // __GNUC__
+
+////////////////////////////////////////////////////////////////////////////////
+// Visual C++
+#ifdef _MSC_VER
+
+#if (_MSC_VER >= 1600)
+#define CLARA_INTERNAL_CONFIG_CPP11_NULLPTR
+#define CLARA_INTERNAL_CONFIG_CPP11_UNIQUE_PTR
+#endif
+
#if (_MSC_VER >= 1900 ) // (VC++ 14 (VS2015))
+#define CLARA_INTERNAL_CONFIG_CPP11_NOEXCEPT
+#define CLARA_INTERNAL_CONFIG_CPP11_GENERATED_METHODS
+#endif
+
+#endif // _MSC_VER
+
+////////////////////////////////////////////////////////////////////////////////
+// C++ language feature support
+
+// catch all support for C++11
+#if defined(__cplusplus) && __cplusplus >= 201103L
+
+#define CLARA_CPP11_OR_GREATER
+
+#if !defined(CLARA_INTERNAL_CONFIG_CPP11_NULLPTR)
+#define CLARA_INTERNAL_CONFIG_CPP11_NULLPTR
+#endif
+
+#ifndef CLARA_INTERNAL_CONFIG_CPP11_NOEXCEPT
+#define CLARA_INTERNAL_CONFIG_CPP11_NOEXCEPT
+#endif
+
+#ifndef CLARA_INTERNAL_CONFIG_CPP11_GENERATED_METHODS
+#define CLARA_INTERNAL_CONFIG_CPP11_GENERATED_METHODS
+#endif
+
+#if !defined(CLARA_INTERNAL_CONFIG_CPP11_OVERRIDE)
+#define CLARA_INTERNAL_CONFIG_CPP11_OVERRIDE
+#endif
+#if !defined(CLARA_INTERNAL_CONFIG_CPP11_UNIQUE_PTR)
+#define CLARA_INTERNAL_CONFIG_CPP11_UNIQUE_PTR
+#endif
+
+#endif // __cplusplus >= 201103L
+
+// Now set the actual defines based on the above + anything the user has configured
+#if defined(CLARA_INTERNAL_CONFIG_CPP11_NULLPTR) && !defined(CLARA_CONFIG_CPP11_NO_NULLPTR) && !defined(CLARA_CONFIG_CPP11_NULLPTR) && !defined(CLARA_CONFIG_NO_CPP11)
+#define CLARA_CONFIG_CPP11_NULLPTR
+#endif
+#if defined(CLARA_INTERNAL_CONFIG_CPP11_NOEXCEPT) && !defined(CLARA_CONFIG_CPP11_NO_NOEXCEPT) && !defined(CLARA_CONFIG_CPP11_NOEXCEPT) && !defined(CLARA_CONFIG_NO_CPP11)
+#define CLARA_CONFIG_CPP11_NOEXCEPT
+#endif
+#if defined(CLARA_INTERNAL_CONFIG_CPP11_GENERATED_METHODS) && !defined(CLARA_CONFIG_CPP11_NO_GENERATED_METHODS) && !defined(CLARA_CONFIG_CPP11_GENERATED_METHODS) && !defined(CLARA_CONFIG_NO_CPP11)
+#define CLARA_CONFIG_CPP11_GENERATED_METHODS
+#endif
+#if defined(CLARA_INTERNAL_CONFIG_CPP11_OVERRIDE) && !defined(CLARA_CONFIG_NO_OVERRIDE) && !defined(CLARA_CONFIG_CPP11_OVERRIDE) && !defined(CLARA_CONFIG_NO_CPP11)
+#define CLARA_CONFIG_CPP11_OVERRIDE
+#endif
+#if defined(CLARA_INTERNAL_CONFIG_CPP11_UNIQUE_PTR) && !defined(CLARA_CONFIG_NO_UNIQUE_PTR) && !defined(CLARA_CONFIG_CPP11_UNIQUE_PTR) && !defined(CLARA_CONFIG_NO_CPP11)
+#define CLARA_CONFIG_CPP11_UNIQUE_PTR
+#endif
+
+// noexcept support:
+#if defined(CLARA_CONFIG_CPP11_NOEXCEPT) && !defined(CLARA_NOEXCEPT)
+#define CLARA_NOEXCEPT noexcept
+#  define CLARA_NOEXCEPT_IS(x) noexcept(x)
+#else
+#define CLARA_NOEXCEPT throw()
+#  define CLARA_NOEXCEPT_IS(x)
+#endif
+
+// nullptr support
+#ifdef CLARA_CONFIG_CPP11_NULLPTR
+#define CLARA_NULL nullptr
+#else
+#define CLARA_NULL NULL
+#endif
+
+// override support
+#ifdef CLARA_CONFIG_CPP11_OVERRIDE
+#define CLARA_OVERRIDE override
+#else
+#define CLARA_OVERRIDE
+#endif
+
+// unique_ptr support
+#ifdef CLARA_CONFIG_CPP11_UNIQUE_PTR
+#   define CLARA_AUTO_PTR( T ) std::unique_ptr<T>
+#else
+#   define CLARA_AUTO_PTR( T ) std::auto_ptr<T>
+#endif
+
+#endif // TWOBLUECUBES_CLARA_COMPILERS_H_INCLUDED
+
+// ----------- end of #include from clara_compilers.h -----------
+// ........... back in clara.h
+
+#include <map>
+#include <stdexcept>
+#include <memory>
+
+// Use optional outer namespace
+#ifdef STITCH_CLARA_OPEN_NAMESPACE
+STITCH_CLARA_OPEN_NAMESPACE
+#endif
+
+namespace Clara {
+
    // Tag type for the '_' placeholder used to mark arguments as unpositional.
    struct UnpositionalTag {};

    extern UnpositionalTag _;

#ifdef CLARA_CONFIG_MAIN
    UnpositionalTag _; // the one definition, emitted by the "main" TU only
#endif
+
+    namespace Detail {
+
// Console width used for Clara's help formatting. Honour the configured
// CLARA_CONFIG_CONSOLE_WIDTH when defined (the Catch stitching above sets it
// to CATCH_CONFIG_CONSOLE_WIDTH), otherwise fall back to 80 columns.
// Fix: the guard previously tested CLARA_CONSOLE_WIDTH — a macro nothing
// defines — while the value used CLARA_CONFIG_CONSOLE_WIDTH, so a configured
// width was never picked up.
#ifdef CLARA_CONFIG_CONSOLE_WIDTH
    const unsigned int consoleWidth = CLARA_CONFIG_CONSOLE_WIDTH;
#else
    const unsigned int consoleWidth = 80;
#endif
+
        // Identity function used to launder compile-time constants so the
        // compiler does not warn about conditionally-unreachable code.
        inline bool isTrue( bool value ) { return value; }
+
+        using namespace Tbc;
+
+        inline bool startsWith( std::string const& str, std::string const& prefix ) {
+            return str.size() >= prefix.size() && str.substr( 0, prefix.size() ) == prefix;
+        }
+
        // Strip reference and top-level const from a type — a pre-C++11
        // stand-in for std::remove_const / std::remove_reference.
        template<typename T> struct RemoveConstRef{ typedef T type; };
        template<typename T> struct RemoveConstRef<T&>{ typedef T type; };
        template<typename T> struct RemoveConstRef<T const&>{ typedef T type; };
        template<typename T> struct RemoveConstRef<T const>{ typedef T type; };
+
        // Compile-time check for "is this exactly bool" — used by the binders
        // below to decide whether an option takes a value (see takesArg()).
        template<typename T>    struct IsBool       { static const bool value = false; };
        template<>              struct IsBool<bool> { static const bool value = true; };
+
+        template<typename T>
+        void convertInto( std::string const& _source, T& _dest ) {
+            std::stringstream ss;
+            ss << _source;
+            ss >> _dest;
+            if( ss.fail() )
+                throw std::runtime_error( "Unable to convert " + _source + " to destination type" );
+        }
+        inline void convertInto( std::string const& _source, std::string& _dest ) {
+            _dest = _source;
+        }
+        inline void convertInto( std::string const& _source, bool& _dest ) {
+            std::string sourceLC = _source;
+            std::transform( sourceLC.begin(), sourceLC.end(), sourceLC.begin(), ::tolower );
+            if( sourceLC == "y" || sourceLC == "1" || sourceLC == "true" || sourceLC == "yes" || sourceLC == "on" )
+                _dest = true;
+            else if( sourceLC == "n" || sourceLC == "0" || sourceLC == "false" || sourceLC == "no" || sourceLC == "off" )
+                _dest = false;
+            else
+                throw std::runtime_error( "Expected a boolean value but did not recognise:\n  '" + _source + "'" );
+        }
        // Flag (bool) sources copy straight through to bool destinations.
        inline void convertInto( bool _source, bool& _dest ) {
            _dest = _source;
        }
        // Catch-all: setting a non-bool destination from a flag is an error.
        // isTrue() hides the constant condition from unreachable-code warnings.
        template<typename T>
        inline void convertInto( bool, T& ) {
            if( isTrue( true ) )
                throw std::runtime_error( "Invalid conversion" );
        }
+
        // Interface for applying a parsed argument to the user's config
        // object. clone() enables the value semantics of BoundArgFunction.
        template<typename ConfigT>
        struct IArgFunction {
            virtual ~IArgFunction() {}
#ifdef CLARA_CONFIG_CPP11_GENERATED_METHODS
            IArgFunction()                      = default;
            IArgFunction( IArgFunction const& ) = default;
#endif
            virtual void set( ConfigT& config, std::string const& value ) const = 0; // apply a value argument
            virtual void setFlag( ConfigT& config ) const = 0;                       // apply a flag (no value)
            virtual bool takesArg() const = 0;                                       // does this option expect a value?
            virtual IArgFunction* clone() const = 0;
        };
+
        // Value-semantics wrapper owning a heap-allocated IArgFunction,
        // deep-copied via clone() on copy and assignment.
        template<typename ConfigT>
        class BoundArgFunction {
        public:
            BoundArgFunction() : functionObj( CLARA_NULL ) {}
            // Takes ownership of _functionObj.
            BoundArgFunction( IArgFunction<ConfigT>* _functionObj ) : functionObj( _functionObj ) {}
            BoundArgFunction( BoundArgFunction const& other ) : functionObj( other.functionObj ? other.functionObj->clone() : CLARA_NULL ) {}
            BoundArgFunction& operator = ( BoundArgFunction const& other ) {
                // Clone first, then delete — safe even for self-assignment.
                IArgFunction<ConfigT>* newFunctionObj = other.functionObj ? other.functionObj->clone() : CLARA_NULL;
                delete functionObj;
                functionObj = newFunctionObj;
                return *this;
            }
            ~BoundArgFunction() { delete functionObj; }

            // set/setFlag/takesArg dereference functionObj — callers must
            // check isSet() first.
            void set( ConfigT& config, std::string const& value ) const {
                functionObj->set( config, value );
            }
            void setFlag( ConfigT& config ) const {
                functionObj->setFlag( config );
            }
            bool takesArg() const { return functionObj->takesArg(); }

            bool isSet() const {
                return functionObj != CLARA_NULL;
            }
        private:
            IArgFunction<ConfigT>* functionObj; // owned; may be null
        };
+
        // Binder that ignores whatever it is given (no-op placeholder).
        template<typename C>
        struct NullBinder : IArgFunction<C>{
            virtual void set( C&, std::string const& ) const {}
            virtual void setFlag( C& ) const {}
            virtual bool takesArg() const { return true; }
            virtual IArgFunction<C>* clone() const { return new NullBinder( *this ); }
        };
+
        // Binds a data member of the config: string values are converted into
        // the member's type; a flag converts 'true'. Bool members take no
        // value argument (takesArg is false for them).
        template<typename C, typename M>
        struct BoundDataMember : IArgFunction<C>{
            BoundDataMember( M C::* _member ) : member( _member ) {}
            virtual void set( C& p, std::string const& stringValue ) const {
                convertInto( stringValue, p.*member );
            }
            virtual void setFlag( C& p ) const {
                convertInto( true, p.*member );
            }
            virtual bool takesArg() const { return !IsBool<M>::value; }
            virtual IArgFunction<C>* clone() const { return new BoundDataMember( *this ); }
            M C::* member;
        };
        // Binds a one-argument member function: the string (or flag) is
        // converted to the parameter type, then the method is invoked.
        template<typename C, typename M>
        struct BoundUnaryMethod : IArgFunction<C>{
            BoundUnaryMethod( void (C::*_member)( M ) ) : member( _member ) {}
            virtual void set( C& p, std::string const& stringValue ) const {
                typename RemoveConstRef<M>::type value;
                convertInto( stringValue, value );
                (p.*member)( value );
            }
            virtual void setFlag( C& p ) const {
                typename RemoveConstRef<M>::type value;
                convertInto( true, value );
                (p.*member)( value );
            }
            virtual bool takesArg() const { return !IsBool<M>::value; }
            virtual IArgFunction<C>* clone() const { return new BoundUnaryMethod( *this ); }
            void (C::*member)( M );
        };
        // Binds a zero-argument member function: a flag calls it directly; a
        // string value calls it only when the value parses as a truthy bool.
        template<typename C>
        struct BoundNullaryMethod : IArgFunction<C>{
            BoundNullaryMethod( void (C::*_member)() ) : member( _member ) {}
            virtual void set( C& p, std::string const& stringValue ) const {
                bool value;
                convertInto( stringValue, value );
                if( value )
                    (p.*member)();
            }
            virtual void setFlag( C& p ) const {
                (p.*member)();
            }
            virtual bool takesArg() const { return false; }
            virtual IArgFunction<C>* clone() const { return new BoundNullaryMethod( *this ); }
            void (C::*member)();
        };
+
        // Binds a free function taking the config by reference; like
        // BoundNullaryMethod, string values gate the call on a truthy bool.
        template<typename C>
        struct BoundUnaryFunction : IArgFunction<C>{
            BoundUnaryFunction( void (*_function)( C& ) ) : function( _function ) {}
            virtual void set( C& obj, std::string const& stringValue ) const {
                bool value;
                convertInto( stringValue, value );
                if( value )
                    function( obj );
            }
            virtual void setFlag( C& p ) const {
                function( p );
            }
            virtual bool takesArg() const { return false; }
            virtual IArgFunction<C>* clone() const { return new BoundUnaryFunction( *this ); }
            void (*function)( C& );
        };
+
        // Binds a free function taking the config and a converted value; a
        // flag passes a converted 'true'. Bool-valued functions take no
        // value argument.
        template<typename C, typename T>
        struct BoundBinaryFunction : IArgFunction<C>{
            BoundBinaryFunction( void (*_function)( C&, T ) ) : function( _function ) {}
            virtual void set( C& obj, std::string const& stringValue ) const {
                typename RemoveConstRef<T>::type value;
                convertInto( stringValue, value );
                function( obj, value );
            }
            virtual void setFlag( C& obj ) const {
                typename RemoveConstRef<T>::type value;
                convertInto( true, value );
                function( obj, value );
            }
            virtual bool takesArg() const { return !IsBool<T>::value; }
            virtual IArgFunction<C>* clone() const { return new BoundBinaryFunction( *this ); }
            void (*function)( C&, T );
        };
+
+    } // namespace Detail
+
+    // Tokenizer for raw argv: classifies each argument as Positional,
+    // ShortOpt or LongOpt. A bare "--" stops option parsing; bundled short
+    // options ("-abc") are unpacked one character at a time; an option may be
+    // glued to its value with space, tab, '=' or ':'.
+    struct Parser {
+        Parser() : separators( " \t=:" ) {}
+
+        struct Token {
+            enum Type { Positional, ShortOpt, LongOpt };
+            Token( Type _type, std::string const& _data ) : type( _type ), data( _data ) {}
+            Type type;
+            std::string data;
+        };
+
+        // Tokenises argv[1..argc), stopping early at a literal "--"
+        void parseIntoTokens( int argc, char const* const argv[], std::vector<Parser::Token>& tokens ) const {
+            const std::string doubleDash = "--";
+            for( int i = 1; i < argc && argv[i] != doubleDash; ++i )
+                parseIntoTokens( argv[i] , tokens);
+        }
+        // Tokenises a single argument; may emit several tokens (e.g. "-ab")
+        void parseIntoTokens( std::string arg, std::vector<Parser::Token>& tokens ) const {
+            while( !arg.empty() ) {
+                Parser::Token token( Parser::Token::Positional, arg );
+                arg = "";
+                if( token.data[0] == '-' ) {
+                    if( token.data.size() > 1 && token.data[1] == '-' ) {
+                        token = Parser::Token( Parser::Token::LongOpt, token.data.substr( 2 ) );
+                    }
+                    else {
+                        token = Parser::Token( Parser::Token::ShortOpt, token.data.substr( 1 ) );
+                        // Bundled short opts: keep the first char, requeue the
+                        // rest (with a fresh '-') for the next loop iteration
+                        if( token.data.size() > 1 && separators.find( token.data[1] ) == std::string::npos ) {
+                            arg = "-" + token.data.substr( 1 );
+                            token.data = token.data.substr( 0, 1 );
+                        }
+                    }
+                }
+                if( token.type != Parser::Token::Positional ) {
+                    // Split "opt=value" (or ':'/space/tab); the value part is
+                    // reprocessed as a new argument on the next pass
+                    std::size_t pos = token.data.find_first_of( separators );
+                    if( pos != std::string::npos ) {
+                        arg = token.data.substr( pos+1 );
+                        token.data = token.data.substr( 0, pos );
+                    }
+                }
+                tokens.push_back( token );
+            }
+        }
+        std::string separators; // chars that may separate an option from its value
+    };
+
+    // Properties shared by every argument kind: the bound setter plus the
+    // user-facing description/placeholder text shown in usage output.
+    template<typename ConfigT>
+    struct CommonArgProperties {
+        CommonArgProperties() {}
+        CommonArgProperties( Detail::BoundArgFunction<ConfigT> const& _boundField ) : boundField( _boundField ) {}
+
+        Detail::BoundArgFunction<ConfigT> boundField;
+        std::string description;
+        std::string detail;
+        std::string placeholder; // Only valid if boundField takes an arg
+
+        // A value token is expected on the command line iff a placeholder
+        // string was supplied when the field was bound
+        bool takesArg() const {
+            return !placeholder.empty();
+        }
+        // Throws if the builder never bound this arg to a field/method
+        void validate() const {
+            if( !boundField.isSet() )
+                throw std::logic_error( "option not bound" );
+        }
+    };
+    // Names under which an option may be spelled: any number of short names
+    // (-x) and at most one long name (--xyz). Names are stored without dashes.
+    struct OptionArgProperties {
+        std::vector<std::string> shortNames;
+        std::string longName;
+
+        bool hasShortName( std::string const& shortName ) const {
+            return std::find( shortNames.begin(), shortNames.end(), shortName ) != shortNames.end();
+        }
+        bool hasLongName( std::string const& _longName ) const {
+            return _longName == longName;
+        }
+    };
+    // 1-based position of a fixed positional argument, or -1 when the arg is
+    // non-positional (floating).
+    struct PositionalArgProperties {
+        PositionalArgProperties() : position( -1 ) {}
+        int position; // -1 means non-positional (floating)
+
+        bool isFixedPositional() const {
+            return position != -1;
+        }
+    };
+
+    // Fluent builder + parser for a command line bound to a ConfigT object.
+    // Options are declared with operator[]("-x"), fixed positional args with
+    // operator[](int), and a single floating (unpositional) arg with
+    // operator[](_). parse()/parseInto() then populate a ConfigT instance,
+    // returning any tokens that were not consumed.
+    template<typename ConfigT>
+    class CommandLine {
+
+        // One argument specification: the bound setter (CommonArgProperties)
+        // combined with option names and positional info.
+        struct Arg : CommonArgProperties<ConfigT>, OptionArgProperties, PositionalArgProperties {
+            Arg() {}
+            Arg( Detail::BoundArgFunction<ConfigT> const& _boundField ) : CommonArgProperties<ConfigT>( _boundField ) {}
+
+            using CommonArgProperties<ConfigT>::placeholder; // !TBD
+
+            // Best human-readable name for error messages
+            std::string dbgName() const {
+                if( !longName.empty() )
+                    return "--" + longName;
+                if( !shortNames.empty() )
+                    return "-" + shortNames[0];
+                return "positional args";
+            }
+            // Formats the full option spelling for usage output,
+            // e.g. "-a, -b, --abort <placeholder>"
+            std::string commands() const {
+                std::ostringstream oss;
+                bool first = true;
+                std::vector<std::string>::const_iterator it = shortNames.begin(), itEnd = shortNames.end();
+                for(; it != itEnd; ++it ) {
+                    if( first )
+                        first = false;
+                    else
+                        oss << ", ";
+                    oss << "-" << *it;
+                }
+                if( !longName.empty() ) {
+                    if( !first )
+                        oss << ", ";
+                    oss << "--" << longName;
+                }
+                if( !placeholder.empty() )
+                    oss << " <" << placeholder << ">";
+                return oss.str();
+            }
+        };
+
+        typedef CLARA_AUTO_PTR( Arg ) ArgAutoPtr;
+
+        // Registers optName ("-x" or "--xyz") on arg; only one long name is
+        // allowed per arg, any number of short names
+        friend void addOptName( Arg& arg, std::string const& optName )
+        {
+            if( optName.empty() )
+                return;
+            if( Detail::startsWith( optName, "--" ) ) {
+                if( !arg.longName.empty() )
+                    throw std::logic_error( "Only one long opt may be specified. '"
+                        + arg.longName
+                        + "' already specified, now attempting to add '"
+                        + optName + "'" );
+                arg.longName = optName.substr( 2 );
+            }
+            else if( Detail::startsWith( optName, "-" ) )
+                arg.shortNames.push_back( optName.substr( 1 ) );
+            else
+                throw std::logic_error( "option must begin with - or --. Option was: '" + optName + "'" );
+        }
+        friend void setPositionalArg( Arg& arg, int position )
+        {
+            arg.position = position;
+        }
+
+        // Fluent helper returned by operator[]: binds config fields, methods
+        // or free functions to the Arg under construction and attaches
+        // description text. Holds a non-owning pointer into this CommandLine.
+        class ArgBuilder {
+        public:
+            ArgBuilder( Arg* arg ) : m_arg( arg ) {}
+
+            // Bind a non-boolean data member (requires placeholder string)
+            template<typename C, typename M>
+            void bind( M C::* field, std::string const& placeholder ) {
+                m_arg->boundField = new Detail::BoundDataMember<C,M>( field );
+                m_arg->placeholder = placeholder;
+            }
+            // Bind a boolean data member (no placeholder required)
+            template<typename C>
+            void bind( bool C::* field ) {
+                m_arg->boundField = new Detail::BoundDataMember<C,bool>( field );
+            }
+
+            // Bind a method taking a single, non-boolean argument (requires a placeholder string)
+            template<typename C, typename M>
+            void bind( void (C::* unaryMethod)( M ), std::string const& placeholder ) {
+                m_arg->boundField = new Detail::BoundUnaryMethod<C,M>( unaryMethod );
+                m_arg->placeholder = placeholder;
+            }
+
+            // Bind a method taking a single, boolean argument (no placeholder string required)
+            template<typename C>
+            void bind( void (C::* unaryMethod)( bool ) ) {
+                m_arg->boundField = new Detail::BoundUnaryMethod<C,bool>( unaryMethod );
+            }
+
+            // Bind a method that takes no arguments (will be called if opt is present)
+            template<typename C>
+            void bind( void (C::* nullaryMethod)() ) {
+                m_arg->boundField = new Detail::BoundNullaryMethod<C>( nullaryMethod );
+            }
+
+            // Bind a free function taking a single argument - the object to operate on (no placeholder string required)
+            template<typename C>
+            void bind( void (* unaryFunction)( C& ) ) {
+                m_arg->boundField = new Detail::BoundUnaryFunction<C>( unaryFunction );
+            }
+
+            // Bind a free function taking a single argument - the object to operate on (requires a placeholder string)
+            template<typename C, typename T>
+            void bind( void (* binaryFunction)( C&, T ), std::string const& placeholder ) {
+                m_arg->boundField = new Detail::BoundBinaryFunction<C, T>( binaryFunction );
+                m_arg->placeholder = placeholder;
+            }
+
+            ArgBuilder& describe( std::string const& description ) {
+                m_arg->description = description;
+                return *this;
+            }
+            ArgBuilder& detail( std::string const& detail ) {
+                m_arg->detail = detail;
+                return *this;
+            }
+
+        protected:
+            Arg* m_arg;
+        };
+
+        // ArgBuilder that additionally accepts further option name aliases
+        // via chained operator[] calls
+        class OptBuilder : public ArgBuilder {
+        public:
+            OptBuilder( Arg* arg ) : ArgBuilder( arg ) {}
+            OptBuilder( OptBuilder& other ) : ArgBuilder( other ) {}
+
+            OptBuilder& operator[]( std::string const& optName ) {
+                addOptName( *ArgBuilder::m_arg, optName );
+                return *this;
+            }
+        };
+
+    public:
+
+        CommandLine()
+        :   m_boundProcessName( new Detail::NullBinder<ConfigT>() ),
+            m_highestSpecifiedArgPosition( 0 ),
+            m_throwOnUnrecognisedTokens( false )
+        {}
+        // Copy constructor: deep-copies the floating arg so the two parsers
+        // do not share ownership of it
+        CommandLine( CommandLine const& other )
+        :   m_boundProcessName( other.m_boundProcessName ),
+            m_options ( other.m_options ),
+            m_positionalArgs( other.m_positionalArgs ),
+            m_highestSpecifiedArgPosition( other.m_highestSpecifiedArgPosition ),
+            m_throwOnUnrecognisedTokens( other.m_throwOnUnrecognisedTokens )
+        {
+            if( other.m_floatingArg.get() )
+                m_floatingArg.reset( new Arg( *other.m_floatingArg ) );
+        }
+
+        CommandLine& setThrowOnUnrecognisedTokens( bool shouldThrow = true ) {
+            m_throwOnUnrecognisedTokens = shouldThrow;
+            return *this;
+        }
+
+        // Starts declaring a new option, e.g. cli["-h"]["--help"]
+        OptBuilder operator[]( std::string const& optName ) {
+            m_options.push_back( Arg() );
+            addOptName( m_options.back(), optName );
+            OptBuilder builder( &m_options.back() );
+            return builder;
+        }
+
+        // Starts declaring a fixed positional argument at (1-based) position
+        ArgBuilder operator[]( int position ) {
+            m_positionalArgs.insert( std::make_pair( position, Arg() ) );
+            if( position > m_highestSpecifiedArgPosition )
+                m_highestSpecifiedArgPosition = position;
+            setPositionalArg( m_positionalArgs[position], position );
+            ArgBuilder builder( &m_positionalArgs[position] );
+            return builder;
+        }
+
+        // Invoke this with the _ instance
+        ArgBuilder operator[]( UnpositionalTag ) {
+            if( m_floatingArg.get() )
+                throw std::logic_error( "Only one unpositional argument can be added" );
+            m_floatingArg.reset( new Arg() );
+            ArgBuilder builder( m_floatingArg.get() );
+            return builder;
+        }
+
+        // Bind where the (basename of the) process name should be stored
+        template<typename C, typename M>
+        void bindProcessName( M C::* field ) {
+            m_boundProcessName = new Detail::BoundDataMember<C,M>( field );
+        }
+        template<typename C, typename M>
+        void bindProcessName( void (C::*_unaryMethod)( M ) ) {
+            m_boundProcessName = new Detail::BoundUnaryMethod<C,M>( _unaryMethod );
+        }
+
+        // Writes the option table in two columns (spellings | description),
+        // wrapping both columns to fit within 'width'
+        void optUsage( std::ostream& os, std::size_t indent = 0, std::size_t width = Detail::consoleWidth ) const {
+            typename std::vector<Arg>::const_iterator itBegin = m_options.begin(), itEnd = m_options.end(), it;
+            std::size_t maxWidth = 0;
+            for( it = itBegin; it != itEnd; ++it )
+                maxWidth = (std::max)( maxWidth, it->commands().size() );
+
+            for( it = itBegin; it != itEnd; ++it ) {
+                Detail::Text usage( it->commands(), Detail::TextAttributes()
+                                                        .setWidth( maxWidth+indent )
+                                                        .setIndent( indent ) );
+                Detail::Text desc( it->description, Detail::TextAttributes()
+                                                        .setWidth( width - maxWidth - 3 ) );
+
+                // Emit the wrapped columns side by side, line by line
+                for( std::size_t i = 0; i < (std::max)( usage.size(), desc.size() ); ++i ) {
+                    std::string usageCol = i < usage.size() ? usage[i] : "";
+                    os << usageCol;
+
+                    if( i < desc.size() && !desc[i].empty() )
+                        os  << std::string( indent + 2 + maxWidth - usageCol.size(), ' ' )
+                            << desc[i];
+                    os << "\n";
+                }
+            }
+        }
+        std::string optUsage() const {
+            std::ostringstream oss;
+            optUsage( oss );
+            return oss.str();
+        }
+
+        // Writes the "<arg1> <arg2> [<floating> ...]" part of the usage line
+        void argSynopsis( std::ostream& os ) const {
+            for( int i = 1; i <= m_highestSpecifiedArgPosition; ++i ) {
+                if( i > 1 )
+                    os << " ";
+                typename std::map<int, Arg>::const_iterator it = m_positionalArgs.find( i );
+                if( it != m_positionalArgs.end() )
+                    os << "<" << it->second.placeholder << ">";
+                else if( m_floatingArg.get() )
+                    os << "<" << m_floatingArg->placeholder << ">";
+                else
+                    throw std::logic_error( "non consecutive positional arguments with no floating args" );
+            }
+            // !TBD No indication of mandatory args
+            if( m_floatingArg.get() ) {
+                if( m_highestSpecifiedArgPosition > 1 )
+                    os << " ";
+                os << "[<" << m_floatingArg->placeholder << "> ...]";
+            }
+        }
+        std::string argSynopsis() const {
+            std::ostringstream oss;
+            argSynopsis( oss );
+            return oss.str();
+        }
+
+        // Writes the complete usage banner for the given process name
+        void usage( std::ostream& os, std::string const& procName ) const {
+            validate();
+            os << "usage:\n  " << procName << " ";
+            argSynopsis( os );
+            if( !m_options.empty() ) {
+                os << " [options]\n\nwhere options are: \n";
+                optUsage( os, 2 );
+            }
+            os << "\n";
+        }
+        std::string usage( std::string const& procName ) const {
+            std::ostringstream oss;
+            usage( oss, procName );
+            return oss.str();
+        }
+
+        // Convenience: default-constructs a config and parses into it
+        ConfigT parse( int argc, char const* const argv[] ) const {
+            ConfigT config;
+            parseInto( argc, argv, config );
+            return config;
+        }
+
+        // Extracts the process basename from argv[0], tokenises the rest and
+        // populates config; returns the tokens that were not consumed.
+        // NOTE(review): takes char const* argv[] while parse() takes
+        // char const* const argv[] — presumably unintended asymmetry; confirm
+        // before relying on either signature.
+        std::vector<Parser::Token> parseInto( int argc, char const* argv[], ConfigT& config ) const {
+            std::string processName = argv[0];
+            std::size_t lastSlash = processName.find_last_of( "/\\" );
+            if( lastSlash != std::string::npos )
+                processName = processName.substr( lastSlash+1 );
+            m_boundProcessName.set( config, processName );
+            std::vector<Parser::Token> tokens;
+            Parser parser;
+            parser.parseIntoTokens( argc, argv, tokens );
+            return populate( tokens, config );
+        }
+
+        // Applies tokens in three passes: options, then fixed positionals,
+        // then the floating arg; each pass consumes what it recognises
+        std::vector<Parser::Token> populate( std::vector<Parser::Token> const& tokens, ConfigT& config ) const {
+            validate();
+            std::vector<Parser::Token> unusedTokens = populateOptions( tokens, config );
+            unusedTokens = populateFixedArgs( unusedTokens, config );
+            unusedTokens = populateFloatingArgs( unusedTokens, config );
+            return unusedTokens;
+        }
+
+        // Matches option tokens against the declared options; errors are
+        // collected and thrown together as one runtime_error at the end
+        std::vector<Parser::Token> populateOptions( std::vector<Parser::Token> const& tokens, ConfigT& config ) const {
+            std::vector<Parser::Token> unusedTokens;
+            std::vector<std::string> errors;
+            for( std::size_t i = 0; i < tokens.size(); ++i ) {
+                Parser::Token const& token = tokens[i];
+                typename std::vector<Arg>::const_iterator it = m_options.begin(), itEnd = m_options.end();
+                for(; it != itEnd; ++it ) {
+                    Arg const& arg = *it;
+
+                    try {
+                        if( ( token.type == Parser::Token::ShortOpt && arg.hasShortName( token.data ) ) ||
+                            ( token.type == Parser::Token::LongOpt && arg.hasLongName( token.data ) ) ) {
+                            if( arg.takesArg() ) {
+                                // The value is the following Positional token
+                                if( i == tokens.size()-1 || tokens[i+1].type != Parser::Token::Positional )
+                                    errors.push_back( "Expected argument to option: " + token.data );
+                                else
+                                    arg.boundField.set( config, tokens[++i].data );
+                            }
+                            else {
+                                arg.boundField.setFlag( config );
+                            }
+                            break;
+                        }
+                    }
+                    catch( std::exception& ex ) {
+                        errors.push_back( std::string( ex.what() ) + "\n- while parsing: (" + arg.commands() + ")" );
+                    }
+                }
+                // No option matched: keep the token, or record an error for
+                // unrecognised options when configured to be strict
+                if( it == itEnd ) {
+                    if( token.type == Parser::Token::Positional || !m_throwOnUnrecognisedTokens )
+                        unusedTokens.push_back( token );
+                    else if( errors.empty() && m_throwOnUnrecognisedTokens )
+                        errors.push_back( "unrecognised option: " + token.data );
+                }
+            }
+            if( !errors.empty() ) {
+                std::ostringstream oss;
+                for( std::vector<std::string>::const_iterator it = errors.begin(), itEnd = errors.end();
+                        it != itEnd;
+                        ++it ) {
+                    if( it != errors.begin() )
+                        oss << "\n";
+                    oss << *it;
+                }
+                throw std::runtime_error( oss.str() );
+            }
+            return unusedTokens;
+        }
+        // Feeds remaining tokens to the fixed positional args, in order.
+        // Note the position counter only advances on Positional tokens.
+        std::vector<Parser::Token> populateFixedArgs( std::vector<Parser::Token> const& tokens, ConfigT& config ) const {
+            std::vector<Parser::Token> unusedTokens;
+            int position = 1;
+            for( std::size_t i = 0; i < tokens.size(); ++i ) {
+                Parser::Token const& token = tokens[i];
+                typename std::map<int, Arg>::const_iterator it = m_positionalArgs.find( position );
+                if( it != m_positionalArgs.end() )
+                    it->second.boundField.set( config, token.data );
+                else
+                    unusedTokens.push_back( token );
+                if( token.type == Parser::Token::Positional )
+                    position++;
+            }
+            return unusedTokens;
+        }
+        // Feeds any remaining Positional tokens to the floating arg (if any)
+        std::vector<Parser::Token> populateFloatingArgs( std::vector<Parser::Token> const& tokens, ConfigT& config ) const {
+            if( !m_floatingArg.get() )
+                return tokens;
+            std::vector<Parser::Token> unusedTokens;
+            for( std::size_t i = 0; i < tokens.size(); ++i ) {
+                Parser::Token const& token = tokens[i];
+                if( token.type == Parser::Token::Positional )
+                    m_floatingArg->boundField.set( config, token.data );
+                else
+                    unusedTokens.push_back( token );
+            }
+            return unusedTokens;
+        }
+
+        // Ensures at least one arg exists and every option is actually bound
+        void validate() const
+        {
+            if( m_options.empty() && m_positionalArgs.empty() && !m_floatingArg.get() )
+                throw std::logic_error( "No options or arguments specified" );
+
+            for( typename std::vector<Arg>::const_iterator  it = m_options.begin(),
+                                                            itEnd = m_options.end();
+                    it != itEnd; ++it )
+                it->validate();
+        }
+
+    private:
+        Detail::BoundArgFunction<ConfigT> m_boundProcessName; // receives basename(argv[0])
+        std::vector<Arg> m_options;                           // named options
+        std::map<int, Arg> m_positionalArgs;                  // fixed positionals, by position
+        ArgAutoPtr m_floatingArg;                             // the single unpositional arg, if any
+        int m_highestSpecifiedArgPosition;
+        bool m_throwOnUnrecognisedTokens;
+    };
+
+} // end namespace Clara
+
+STITCH_CLARA_CLOSE_NAMESPACE
+#undef STITCH_CLARA_OPEN_NAMESPACE
+#undef STITCH_CLARA_CLOSE_NAMESPACE
+
+#endif // TWOBLUECUBES_CLARA_H_INCLUDED
+#undef STITCH_CLARA_OPEN_NAMESPACE
+
+// Restore Clara's value for console width, if present
+#ifdef CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH
+#define CLARA_CONFIG_CONSOLE_WIDTH CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH
+#undef CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH
+#endif
+
+#include <fstream>
+
+namespace Catch {
+
+    // Small free functions bound to command-line options by
+    // makeCommandLineParser(); each mutates one field of ConfigData.
+    inline void abortAfterFirst( ConfigData& config ) { config.abortAfter = 1; }
+    inline void abortAfterX( ConfigData& config, int x ) {
+        // Zero or negative would mean "never abort"; reject it explicitly
+        if( x < 1 )
+            throw std::runtime_error( "Value after -x or --abortAfter must be greater than zero" );
+        config.abortAfter = x;
+    }
+    inline void addTestOrTags( ConfigData& config, std::string const& _testSpec ) { config.testsOrTags.push_back( _testSpec ); }
+    inline void addReporterName( ConfigData& config, std::string const& _reporterName ) { config.reporterNames.push_back( _reporterName ); }
+
+    // Enables a named warning category; "NoAssertions" is the only name
+    // recognised here, anything else is rejected
+    inline void addWarning( ConfigData& config, std::string const& _warning ) {
+        if( _warning == "NoAssertions" )
+            config.warnings = static_cast<WarnAbout::What>( config.warnings | WarnAbout::NoAssertions );
+        else
+            throw std::runtime_error( "Unrecognised warning: '" + _warning + "'" );
+    }
+    // Sets the test run order. Note the argument order of startsWith: the
+    // user's string must be a prefix of the full keyword, so abbreviations
+    // such as "decl", "lex" and "rand" are accepted.
+    inline void setOrder( ConfigData& config, std::string const& order ) {
+        if( startsWith( "declared", order ) )
+            config.runOrder = RunTests::InDeclarationOrder;
+        else if( startsWith( "lexical", order ) )
+            config.runOrder = RunTests::InLexicographicalOrder;
+        else if( startsWith( "random", order ) )
+            config.runOrder = RunTests::InRandomOrder;
+        else
+            throw std::runtime_error( "Unrecognised ordering: '" + order + "'" );
+    }
+    // Sets config.rngSeed from the --rng-seed value: the word "time" seeds
+    // from the current wall clock, anything else must parse as a number.
+    // Throws std::runtime_error (with a usage hint) on invalid input.
+    inline void setRngSeed( ConfigData& config, std::string const& seed ) {
+        if( seed == "time" ) {
+            config.rngSeed = static_cast<unsigned int>( std::time(0) );
+        }
+        else {
+            std::stringstream ss;
+            ss << seed;
+            ss >> config.rngSeed;
+            // fail() alone would silently accept trailing junk ("123abc"
+            // parsed as 123); also require that the whole token was consumed.
+            if( ss.fail() || !ss.eof() )
+                throw std::runtime_error( "Argument to --rng-seed should be the word 'time' or a number" );
+        }
+    }
+    // Maps a raw integer verbosity level straight onto the Verbosity enum;
+    // no range checking is performed here
+    inline void setVerbosity( ConfigData& config, int level ) {
+        // !TBD: accept strings?
+        config.verbosity = static_cast<Verbosity::Level>( level );
+    }
+    // Translates the boolean --durations flag into the ShowDurations enum
+    inline void setShowDurations( ConfigData& config, bool _showDurations ) {
+        config.showDurations = _showDurations
+            ? ShowDurations::Always
+            : ShowDurations::Never;
+    }
+    // Sets the colour mode from a case-insensitive yes/no/auto string
+    inline void setUseColour( ConfigData& config, std::string const& value ) {
+        std::string mode = toLower( value );
+
+        if( mode == "yes" )
+            config.useColour = UseColour::Yes;
+        else if( mode == "no" )
+            config.useColour = UseColour::No;
+        else if( mode == "auto" )
+            config.useColour = UseColour::Auto;
+        else
+            throw std::runtime_error( "colour mode must be one of: auto, yes or no" );
+    }
+    // Handler for --force-colour (described as deprecated in the CLI spec);
+    // equivalent to --use-colour yes
+    inline void forceColour( ConfigData& config ) {
+        config.useColour = UseColour::Yes;
+    }
+    // Reads test names/specs from a file, one per line; blank lines and
+    // lines starting with '#' are skipped. Throws domain_error if the file
+    // cannot be opened.
+    inline void loadTestNamesFromFile( ConfigData& config, std::string const& _filename ) {
+        std::ifstream f( _filename.c_str() );
+        if( !f.is_open() )
+            throw std::domain_error( "Unable to load input file: " + _filename );
+
+        std::string line;
+        while( std::getline( f, line ) ) {
+            line = trim(line);
+            if( !line.empty() && !startsWith( line, "#" ) )
+                // Each entry is quoted and comma-terminated — presumably to
+                // suit the test-spec tokenizer; TODO confirm against parser
+                addTestOrTags( config, "\"" + line + "\"," );
+        }
+    }
+
+    // Builds the Clara parser describing all of Catch's command-line
+    // options, binding each one to a ConfigData field or to one of the
+    // setter functions above. Returned by value; Clara::CommandLine's copy
+    // constructor deep-copies the floating arg.
+    inline Clara::CommandLine<ConfigData> makeCommandLineParser() {
+
+        using namespace Clara;
+        CommandLine<ConfigData> cli;
+
+        cli.bindProcessName( &ConfigData::processName );
+
+        cli["-?"]["-h"]["--help"]
+            .describe( "display usage information" )
+            .bind( &ConfigData::showHelp );
+
+        cli["-l"]["--list-tests"]
+            .describe( "list all/matching test cases" )
+            .bind( &ConfigData::listTests );
+
+        cli["-t"]["--list-tags"]
+            .describe( "list all/matching tags" )
+            .bind( &ConfigData::listTags );
+
+        cli["-s"]["--success"]
+            .describe( "include successful tests in output" )
+            .bind( &ConfigData::showSuccessfulTests );
+
+        cli["-b"]["--break"]
+            .describe( "break into debugger on failure" )
+            .bind( &ConfigData::shouldDebugBreak );
+
+        cli["-e"]["--nothrow"]
+            .describe( "skip exception tests" )
+            .bind( &ConfigData::noThrow );
+
+        cli["-i"]["--invisibles"]
+            .describe( "show invisibles (tabs, newlines)" )
+            .bind( &ConfigData::showInvisibles );
+
+        cli["-o"]["--out"]
+            .describe( "output filename" )
+            .bind( &ConfigData::outputFilename, "filename" );
+
+        cli["-r"]["--reporter"]
+//            .placeholder( "name[:filename]" )
+            .describe( "reporter to use (defaults to console)" )
+            .bind( &addReporterName, "name" );
+
+        cli["-n"]["--name"]
+            .describe( "suite name" )
+            .bind( &ConfigData::name, "name" );
+
+        cli["-a"]["--abort"]
+            .describe( "abort at first failure" )
+            .bind( &abortAfterFirst );
+
+        cli["-x"]["--abortx"]
+            .describe( "abort after x failures" )
+            .bind( &abortAfterX, "no. failures" );
+
+        cli["-w"]["--warn"]
+            .describe( "enable warnings" )
+            .bind( &addWarning, "warning name" );
+
+// - needs updating if reinstated
+//        cli.into( &setVerbosity )
+//            .describe( "level of verbosity (0=no output)" )
+//            .shortOpt( "v")
+//            .longOpt( "verbosity" )
+//            .placeholder( "level" );
+
+        // The single floating (unpositional) argument: test specs
+        cli[_]
+            .describe( "which test or tests to use" )
+            .bind( &addTestOrTags, "test name, pattern or tags" );
+
+        cli["-d"]["--durations"]
+            .describe( "show test durations" )
+            .bind( &setShowDurations, "yes|no" );
+
+        cli["-f"]["--input-file"]
+            .describe( "load test names to run from a file" )
+            .bind( &loadTestNamesFromFile, "filename" );
+
+        cli["-#"]["--filenames-as-tags"]
+            .describe( "adds a tag for the filename" )
+            .bind( &ConfigData::filenamesAsTags );
+
+        // Less common commands which don't have a short form
+        cli["--list-test-names-only"]
+            .describe( "list all/matching test cases names only" )
+            .bind( &ConfigData::listTestNamesOnly );
+
+        cli["--list-reporters"]
+            .describe( "list all reporters" )
+            .bind( &ConfigData::listReporters );
+
+        cli["--order"]
+            .describe( "test case order (defaults to decl)" )
+            .bind( &setOrder, "decl|lex|rand" );
+
+        cli["--rng-seed"]
+            .describe( "set a specific seed for random numbers" )
+            .bind( &setRngSeed, "'time'|number" );
+
+        cli["--force-colour"]
+            .describe( "force colourised output (deprecated)" )
+            .bind( &forceColour );
+
+        cli["--use-colour"]
+            .describe( "should output be colourised" )
+            .bind( &setUseColour, "yes|no" );
+
+        return cli;
+    }
+
+} // end namespace Catch
+
+// #included from: internal/catch_list.hpp
+#define TWOBLUECUBES_CATCH_LIST_HPP_INCLUDED
+
+// #included from: catch_text.h
+#define TWOBLUECUBES_CATCH_TEXT_H_INCLUDED
+
+#define TBC_TEXT_FORMAT_CONSOLE_WIDTH CATCH_CONFIG_CONSOLE_WIDTH
+
+#define CLICHE_TBC_TEXT_FORMAT_OUTER_NAMESPACE Catch
+// #included from: ../external/tbc_text_format.h
+// Only use header guard if we are not using an outer namespace
+#ifndef CLICHE_TBC_TEXT_FORMAT_OUTER_NAMESPACE
+# ifdef TWOBLUECUBES_TEXT_FORMAT_H_INCLUDED
+#  ifndef TWOBLUECUBES_TEXT_FORMAT_H_ALREADY_INCLUDED
+#   define TWOBLUECUBES_TEXT_FORMAT_H_ALREADY_INCLUDED
+#  endif
+# else
+#  define TWOBLUECUBES_TEXT_FORMAT_H_INCLUDED
+# endif
+#endif
+#ifndef TWOBLUECUBES_TEXT_FORMAT_H_ALREADY_INCLUDED
+#include <string>
+#include <vector>
+#include <sstream>
+
+// Use optional outer namespace
+#ifdef CLICHE_TBC_TEXT_FORMAT_OUTER_NAMESPACE
+namespace CLICHE_TBC_TEXT_FORMAT_OUTER_NAMESPACE {
+#endif
+
+namespace Tbc {
+
+#ifdef TBC_TEXT_FORMAT_CONSOLE_WIDTH
+    const unsigned int consoleWidth = TBC_TEXT_FORMAT_CONSOLE_WIDTH;
+#else
+    const unsigned int consoleWidth = 80;
+#endif
+
+    // Layout parameters consumed by Text: wrap width, first-line and
+    // subsequent-line indents, and the tab-marker character. Setters return
+    // *this so attributes can be chained fluently.
+    struct TextAttributes {
+        TextAttributes()
+        :   initialIndent( std::string::npos ),
+            indent( 0 ),
+            width( consoleWidth-1 ),
+            tabChar( '\t' )
+        {}
+
+        TextAttributes& setInitialIndent( std::size_t _value )  { initialIndent = _value; return *this; }
+        TextAttributes& setIndent( std::size_t _value )         { indent = _value; return *this; }
+        TextAttributes& setWidth( std::size_t _value )          { width = _value; return *this; }
+        TextAttributes& setTabChar( char _value )               { tabChar = _value; return *this; }
+
+        std::size_t initialIndent;  // indent of first line, or npos
+        std::size_t indent;         // indent of subsequent lines, or all if initialIndent is npos
+        std::size_t width;          // maximum width of text, including indent. Longer text will wrap
+        char tabChar;               // If this char is seen the indent is changed to current pos
+    };
+
+    // Greedy word-wrapper: splits a string into lines no wider than
+    // attr.width, honouring initialIndent/indent and an optional tab
+    // marker character (see TextAttributes above).
+    class Text {
+    public:
+        Text( std::string const& _str, TextAttributes const& _attr = TextAttributes() )
+        : attr( _attr )
+        {
+            // Characters after which a line may legally be broken.
+            std::string wrappableChars = " [({.,/|\\-";
+            // First line uses initialIndent when set (npos means "unset").
+            std::size_t indent = _attr.initialIndent != std::string::npos
+                ? _attr.initialIndent
+                : _attr.indent;
+            std::string remainder = _str;
+
+            while( !remainder.empty() ) {
+                // Hard cap on output to avoid runaway wrapping of huge messages.
+                if( lines.size() >= 1000 ) {
+                    lines.push_back( "... message truncated due to excessive size" );
+                    return;
+                }
+                std::size_t tabPos = std::string::npos;
+                std::size_t width = (std::min)( remainder.size(), _attr.width - indent );
+                std::size_t pos = remainder.find_first_of( '\n' );
+                if( pos <= width ) {
+                    width = pos;
+                }
+                // A tab char moves the indent for subsequent wrapped lines to
+                // the tab's position; the tab itself is removed from the text.
+                pos = remainder.find_last_of( _attr.tabChar, width );
+                if( pos != std::string::npos ) {
+                    tabPos = pos;
+                    if( remainder[width] == '\n' )
+                        width--;
+                    remainder = remainder.substr( 0, tabPos ) + remainder.substr( tabPos+1 );
+                }
+
+                if( width == remainder.size() ) {
+                    // Everything left fits on a single line.
+                    spliceLine( indent, remainder, width );
+                }
+                else if( remainder[width] == '\n' ) {
+                    // Explicit newline: emit up to it, then drop the '\n'.
+                    spliceLine( indent, remainder, width );
+                    if( width <= 1 || remainder.size() != 1 )
+                        remainder = remainder.substr( 1 );
+                    indent = _attr.indent;
+                }
+                else {
+                    // Wrap at the last wrappable char before width, or
+                    // hyphenate mid-word when none is available.
+                    pos = remainder.find_last_of( wrappableChars, width );
+                    if( pos != std::string::npos && pos > 0 ) {
+                        spliceLine( indent, remainder, pos );
+                        if( remainder[0] == ' ' )
+                            remainder = remainder.substr( 1 );
+                    }
+                    else {
+                        spliceLine( indent, remainder, width-1 );
+                        lines.back() += "-";
+                    }
+                    // After the first emitted line, switch to the regular indent.
+                    if( lines.size() == 1 )
+                        indent = _attr.indent;
+                    if( tabPos != std::string::npos )
+                        indent += tabPos;
+                }
+            }
+        }
+
+        // Emits one output line (_indent spaces + first _pos chars of
+        // _remainder) and strips the emitted prefix from _remainder.
+        void spliceLine( std::size_t _indent, std::string& _remainder, std::size_t _pos ) {
+            lines.push_back( std::string( _indent, ' ' ) + _remainder.substr( 0, _pos ) );
+            _remainder = _remainder.substr( _pos );
+        }
+
+        typedef std::vector<std::string>::const_iterator const_iterator;
+
+        const_iterator begin() const { return lines.begin(); }
+        const_iterator end() const { return lines.end(); }
+        std::string const& last() const { return lines.back(); }
+        std::size_t size() const { return lines.size(); }
+        std::string const& operator[]( std::size_t _index ) const { return lines[_index]; }
+        std::string toString() const {
+            std::ostringstream oss;
+            oss << *this;
+            return oss.str();
+        }
+
+        // Streams the wrapped lines separated by '\n' (no trailing newline).
+        inline friend std::ostream& operator << ( std::ostream& _stream, Text const& _text ) {
+            for( Text::const_iterator it = _text.begin(), itEnd = _text.end();
+                it != itEnd; ++it ) {
+                if( it != _text.begin() )
+                    _stream << "\n";
+                _stream << *it;
+            }
+            return _stream;
+        }
+
+    private:
+        std::string str;    // NOTE(review): never assigned in this view — confirm against upstream
+        TextAttributes attr;
+        std::vector<std::string> lines;
+    };
+
+} // end namespace Tbc
+
+#ifdef CLICHE_TBC_TEXT_FORMAT_OUTER_NAMESPACE
+} // end outer namespace
+#endif
+
+#endif // TWOBLUECUBES_TEXT_FORMAT_H_ALREADY_INCLUDED
+#undef CLICHE_TBC_TEXT_FORMAT_OUTER_NAMESPACE
+
+// Pull the Tbc text-formatting helpers into the Catch namespace.
+namespace Catch {
+    using Tbc::Text;
+    using Tbc::TextAttributes;
+}
+
+// #included from: catch_console_colour.hpp
+#define TWOBLUECUBES_CATCH_CONSOLE_COLOUR_HPP_INCLUDED
+
+namespace Catch {
+
+    // RAII console-colour guard: constructing a Colour switches the console
+    // colour; the destructor restores it (platform implementation elsewhere).
+    struct Colour {
+        enum Code {
+            None = 0,
+
+            White,
+            Red,
+            Green,
+            Blue,
+            Cyan,
+            Yellow,
+            Grey,
+
+            Bright = 0x10,  // flag bit OR-ed with a base colour
+
+            BrightRed = Bright | Red,
+            BrightGreen = Bright | Green,
+            LightGrey = Bright | Grey,
+            BrightWhite = Bright | White,
+
+            // By intention
+            FileName = LightGrey,
+            Warning = Yellow,
+            ResultError = BrightRed,
+            ResultSuccess = BrightGreen,
+            ResultExpectedFailure = Warning,
+
+            Error = BrightRed,
+            Success = Green,
+
+            OriginalExpression = Cyan,
+            ReconstructedExpression = Yellow,
+
+            SecondaryText = LightGrey,
+            Headers = White
+        };
+
+        // Use constructed object for RAII guard
+        Colour( Code _colourCode );
+        Colour( Colour const& other );
+        ~Colour();
+
+        // Use static method for one-shot changes
+        static void use( Code _colourCode );
+
+    private:
+        bool m_moved;  // set on copy so only one guard restores the colour
+    };
+
+    // Streaming a Colour changes console state as a side effect of its
+    // construction; nothing is written to the stream itself.
+    inline std::ostream& operator << ( std::ostream& os, Colour const& ) { return os; }
+
+} // end namespace Catch
+
+// #included from: catch_interfaces_reporter.h
+#define TWOBLUECUBES_CATCH_INTERFACES_REPORTER_H_INCLUDED
+
+#include <string>
+#include <ostream>
+#include <map>
+#include <assert.h>
+
+namespace Catch
+{
+    // Per-reporter configuration: the output stream plus the full config.
+    struct ReporterConfig {
+        explicit ReporterConfig( Ptr<IConfig const> const& _fullConfig )
+        :   m_stream( &_fullConfig->stream() ), m_fullConfig( _fullConfig ) {}
+
+        ReporterConfig( Ptr<IConfig const> const& _fullConfig, std::ostream& _stream )
+        :   m_stream( &_stream ), m_fullConfig( _fullConfig ) {}
+
+        std::ostream& stream() const    { return *m_stream; }
+        Ptr<IConfig const> fullConfig() const { return m_fullConfig; }
+
+    private:
+        std::ostream* m_stream;
+        Ptr<IConfig const> m_fullConfig;
+    };
+
+    // Capabilities a reporter advertises to the runner.
+    struct ReporterPreferences {
+        ReporterPreferences()
+        : shouldRedirectStdOut( false )
+        {}
+
+        bool shouldRedirectStdOut;
+    };
+
+    // Option<T> that additionally remembers whether the value has been
+    // consumed since it was last assigned.
+    template<typename T>
+    struct LazyStat : Option<T> {
+        LazyStat() : used( false ) {}
+        LazyStat& operator=( T const& _value ) {
+            Option<T>::operator=( _value );
+            used = false;
+            return *this;
+        }
+        void reset() {
+            Option<T>::reset();
+            used = false;
+        }
+        bool used;
+    };
+
+    // Identifies a whole test run by name.
+    struct TestRunInfo {
+        TestRunInfo( std::string const& _name ) : name( _name ) {}
+        std::string name;
+    };
+    // Identifies one test group within a run (index out of total count).
+    struct GroupInfo {
+        GroupInfo(  std::string const& _name,
+                    std::size_t _groupIndex,
+                    std::size_t _groupsCount )
+        :   name( _name ),
+            groupIndex( _groupIndex ),
+            groupsCounts( _groupsCount )
+        {}
+
+        std::string name;
+        std::size_t groupIndex;
+        std::size_t groupsCounts;
+    };
+
+    // Result of a single assertion plus any attached info messages.
+    struct AssertionStats {
+        AssertionStats( AssertionResult const& _assertionResult,
+                        std::vector<MessageInfo> const& _infoMessages,
+                        Totals const& _totals )
+        :   assertionResult( _assertionResult ),
+            infoMessages( _infoMessages ),
+            totals( _totals )
+        {
+            if( assertionResult.hasMessage() ) {
+                // Copy message into messages list.
+                // !TBD This should have been done earlier, somewhere
+                MessageBuilder builder( assertionResult.getTestMacroName(), assertionResult.getSourceInfo(), assertionResult.getResultType() );
+                builder << assertionResult.getMessage();
+                builder.m_info.message = builder.m_stream.str();
+
+                infoMessages.push_back( builder.m_info );
+            }
+        }
+        virtual ~AssertionStats();
+
+#  ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+        AssertionStats( AssertionStats const& )              = default;
+        AssertionStats( AssertionStats && )                  = default;
+        AssertionStats& operator = ( AssertionStats const& ) = default;
+        AssertionStats& operator = ( AssertionStats && )     = default;
+#  endif
+
+        AssertionResult assertionResult;
+        std::vector<MessageInfo> infoMessages;
+        Totals totals;
+    };
+
+    // Aggregated results for one SECTION.
+    struct SectionStats {
+        SectionStats(   SectionInfo const& _sectionInfo,
+                        Counts const& _assertions,
+                        double _durationInSeconds,
+                        bool _missingAssertions )
+        :   sectionInfo( _sectionInfo ),
+            assertions( _assertions ),
+            durationInSeconds( _durationInSeconds ),
+            missingAssertions( _missingAssertions )
+        {}
+        virtual ~SectionStats();
+#  ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+        SectionStats( SectionStats const& )              = default;
+        SectionStats( SectionStats && )                  = default;
+        SectionStats& operator = ( SectionStats const& ) = default;
+        SectionStats& operator = ( SectionStats && )     = default;
+#  endif
+
+        SectionInfo sectionInfo;
+        Counts assertions;
+        double durationInSeconds;
+        bool missingAssertions;
+    };
+
+    // Aggregated results for one test case, including captured stdout/stderr.
+    struct TestCaseStats {
+        TestCaseStats(  TestCaseInfo const& _testInfo,
+                        Totals const& _totals,
+                        std::string const& _stdOut,
+                        std::string const& _stdErr,
+                        bool _aborting )
+        : testInfo( _testInfo ),
+            totals( _totals ),
+            stdOut( _stdOut ),
+            stdErr( _stdErr ),
+            aborting( _aborting )
+        {}
+        virtual ~TestCaseStats();
+
+#  ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+        TestCaseStats( TestCaseStats const& )              = default;
+        TestCaseStats( TestCaseStats && )                  = default;
+        TestCaseStats& operator = ( TestCaseStats const& ) = default;
+        TestCaseStats& operator = ( TestCaseStats && )     = default;
+#  endif
+
+        TestCaseInfo testInfo;
+        Totals totals;
+        std::string stdOut;
+        std::string stdErr;
+        bool aborting;
+    };
+
+    // Aggregated results for one test group.
+    struct TestGroupStats {
+        TestGroupStats( GroupInfo const& _groupInfo,
+                        Totals const& _totals,
+                        bool _aborting )
+        :   groupInfo( _groupInfo ),
+            totals( _totals ),
+            aborting( _aborting )
+        {}
+        TestGroupStats( GroupInfo const& _groupInfo )
+        :   groupInfo( _groupInfo ),
+            aborting( false )
+        {}
+        virtual ~TestGroupStats();
+
+#  ifdef CATCH_CONFIG_CPP11_GENERATED_METHODS
+        TestGroupStats( TestGroupStats const& )              = default;
+        TestGroupStats( TestGroupStats && )                  = default;
+        TestGroupStats& operator = ( TestGroupStats const& ) = default;
+        TestGroupStats& operator = ( TestGroupStats && )     = default;
+#  endif
+
+        GroupInfo groupInfo;
+        Totals totals;
+        bool aborting;
+    };
+
+    // Aggregated results for the whole run.
+    struct TestRunStats {
+        TestRunStats(   TestRunInfo const& _runInfo,
+                        Totals const& _totals,
+                        bool _aborting )
+        :   runInfo( _runInfo ),
+            totals( _totals ),
+            aborting( _aborting )
+        {}
+        virtual ~TestRunStats();
+
+#  ifndef CATCH_CONFIG_CPP11_GENERATED_METHODS
+        TestRunStats( TestRunStats const& _other )
+        :   runInfo( _other.runInfo ),
+            totals( _other.totals ),
+            aborting( _other.aborting )
+        {}
+#  else
+        TestRunStats( TestRunStats const& )              = default;
+        TestRunStats( TestRunStats && )                  = default;
+        TestRunStats& operator = ( TestRunStats const& ) = default;
+        TestRunStats& operator = ( TestRunStats && )     = default;
+#  endif
+
+        TestRunInfo runInfo;
+        Totals totals;
+        bool aborting;
+    };
+
+    // Event interface implemented by all reporters; the runner calls these
+    // callbacks in strict nesting order (run > group > case > section).
+    struct IStreamingReporter : IShared {
+        virtual ~IStreamingReporter();
+
+        // Implementing class must also provide the following static method:
+        // static std::string getDescription();
+
+        virtual ReporterPreferences getPreferences() const = 0;
+
+        virtual void noMatchingTestCases( std::string const& spec ) = 0;
+
+        virtual void testRunStarting( TestRunInfo const& testRunInfo ) = 0;
+        virtual void testGroupStarting( GroupInfo const& groupInfo ) = 0;
+
+        virtual void testCaseStarting( TestCaseInfo const& testInfo ) = 0;
+        virtual void sectionStarting( SectionInfo const& sectionInfo ) = 0;
+
+        virtual void assertionStarting( AssertionInfo const& assertionInfo ) = 0;
+
+        // The return value indicates if the messages buffer should be cleared:
+        virtual bool assertionEnded( AssertionStats const& assertionStats ) = 0;
+
+        virtual void sectionEnded( SectionStats const& sectionStats ) = 0;
+        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) = 0;
+        virtual void testGroupEnded( TestGroupStats const& testGroupStats ) = 0;
+        virtual void testRunEnded( TestRunStats const& testRunStats ) = 0;
+
+        virtual void skipTest( TestCaseInfo const& testInfo ) = 0;
+    };
+
+    // Factory that creates a reporter instance from a ReporterConfig.
+    struct IReporterFactory : IShared {
+        virtual ~IReporterFactory();
+        virtual IStreamingReporter* create( ReporterConfig const& config ) const = 0;
+        virtual std::string getDescription() const = 0;
+    };
+
+    // Registry of named reporter factories plus always-on listeners.
+    struct IReporterRegistry {
+        typedef std::map<std::string, Ptr<IReporterFactory> > FactoryMap;
+        typedef std::vector<Ptr<IReporterFactory> > Listeners;
+
+        virtual ~IReporterRegistry();
+        virtual IStreamingReporter* create( std::string const& name, Ptr<IConfig const> const& config ) const = 0;
+        virtual FactoryMap const& getFactories() const = 0;
+        virtual Listeners const& getListeners() const = 0;
+    };
+
+    Ptr<IStreamingReporter> addReporter( Ptr<IStreamingReporter> const& existingReporter, Ptr<IStreamingReporter> const& additionalReporter );
+
+}
+
+#include <limits>
+#include <algorithm>
+
+namespace Catch {
+
+    // Prints all test cases matching the configured spec (or all tests when
+    // no filter is given) and returns the number of matches.
+    inline std::size_t listTests( Config const& config ) {
+
+        TestSpec testSpec = config.testSpec();
+        if( config.testSpec().hasFilters() )
+            Catch::cout() << "Matching test cases:\n";
+        else {
+            Catch::cout() << "All available test cases:\n";
+            testSpec = TestSpecParser( ITagAliasRegistry::get() ).parse( "*" ).testSpec();
+        }
+
+        std::size_t matchedTests = 0;
+        TextAttributes nameAttr, tagsAttr;
+        nameAttr.setInitialIndent( 2 ).setIndent( 4 );
+        tagsAttr.setIndent( 6 );
+
+        std::vector<TestCase> matchedTestCases = filterTests( getAllTestCasesSorted( config ), testSpec, config );
+        for( std::vector<TestCase>::const_iterator it = matchedTestCases.begin(), itEnd = matchedTestCases.end();
+                it != itEnd;
+                ++it ) {
+            matchedTests++;
+            TestCaseInfo const& testCaseInfo = it->getTestCaseInfo();
+            // Hidden tests are dimmed; guard restores colour at scope exit.
+            Colour::Code colour = testCaseInfo.isHidden()
+                ? Colour::SecondaryText
+                : Colour::None;
+            Colour colourGuard( colour );
+
+            Catch::cout() << Text( testCaseInfo.name, nameAttr ) << std::endl;
+            if( !testCaseInfo.tags.empty() )
+                Catch::cout() << Text( testCaseInfo.tagsAsString, tagsAttr ) << std::endl;
+        }
+
+        if( !config.testSpec().hasFilters() )
+            Catch::cout() << pluralise( matchedTests, "test case" ) << "\n" << std::endl;
+        else
+            Catch::cout() << pluralise( matchedTests, "matching test case" ) << "\n" << std::endl;
+        return matchedTests;
+    }
+
+    // Prints only the names of matching test cases (one per line) and
+    // returns the number of matches.
+    inline std::size_t listTestsNamesOnly( Config const& config ) {
+        TestSpec testSpec = config.testSpec();
+        if( !config.testSpec().hasFilters() )
+            testSpec = TestSpecParser( ITagAliasRegistry::get() ).parse( "*" ).testSpec();
+        std::size_t matchedTests = 0;
+        std::vector<TestCase> matchedTestCases = filterTests( getAllTestCasesSorted( config ), testSpec, config );
+        for( std::vector<TestCase>::const_iterator it = matchedTestCases.begin(), itEnd = matchedTestCases.end();
+                it != itEnd;
+                ++it ) {
+            matchedTests++;
+            TestCaseInfo const& testCaseInfo = it->getTestCaseInfo();
+            Catch::cout() << testCaseInfo.name << std::endl;
+        }
+        return matchedTests;
+    }
+
+    // Accumulates a usage count and the set of distinct spellings for one
+    // tag (spellings differ only by case; counted under the lowercase key).
+    struct TagInfo {
+        TagInfo() : count ( 0 ) {}
+        void add( std::string const& spelling ) {
+            ++count;
+            spellings.insert( spelling );
+        }
+        // All spellings rendered as "[a][b]...".
+        std::string all() const {
+            std::string out;
+            for( std::set<std::string>::const_iterator it = spellings.begin(), itEnd = spellings.end();
+                        it != itEnd;
+                        ++it )
+                out += "[" + *it + "]";
+            return out;
+        }
+        std::set<std::string> spellings;
+        std::size_t count;
+    };
+
+    // Prints every tag used by matching test cases with its usage count;
+    // returns the number of distinct (case-insensitive) tags.
+    inline std::size_t listTags( Config const& config ) {
+        TestSpec testSpec = config.testSpec();
+        if( config.testSpec().hasFilters() )
+            Catch::cout() << "Tags for matching test cases:\n";
+        else {
+            Catch::cout() << "All available tags:\n";
+            testSpec = TestSpecParser( ITagAliasRegistry::get() ).parse( "*" ).testSpec();
+        }
+
+        std::map<std::string, TagInfo> tagCounts;
+
+        std::vector<TestCase> matchedTestCases = filterTests( getAllTestCasesSorted( config ), testSpec, config );
+        for( std::vector<TestCase>::const_iterator it = matchedTestCases.begin(), itEnd = matchedTestCases.end();
+                it != itEnd;
+                ++it ) {
+            for( std::set<std::string>::const_iterator  tagIt = it->getTestCaseInfo().tags.begin(),
+                                                        tagItEnd = it->getTestCaseInfo().tags.end();
+                    tagIt != tagItEnd;
+                    ++tagIt ) {
+                std::string tagName = *tagIt;
+                // Group case-variant spellings under a lowercase key.
+                std::string lcaseTagName = toLower( tagName );
+                std::map<std::string, TagInfo>::iterator countIt = tagCounts.find( lcaseTagName );
+                if( countIt == tagCounts.end() )
+                    countIt = tagCounts.insert( std::make_pair( lcaseTagName, TagInfo() ) ).first;
+                countIt->second.add( tagName );
+            }
+        }
+
+        for( std::map<std::string, TagInfo>::const_iterator countIt = tagCounts.begin(),
+                                                            countItEnd = tagCounts.end();
+                countIt != countItEnd;
+                ++countIt ) {
+            std::ostringstream oss;
+            oss << "  " << std::setw(2) << countIt->second.count << "  ";
+            // Wrap tag list so continuation lines align under the first.
+            Text wrapper( countIt->second.all(), TextAttributes()
+                                                    .setInitialIndent( 0 )
+                                                    .setIndent( oss.str().size() )
+                                                    .setWidth( CATCH_CONFIG_CONSOLE_WIDTH-10 ) );
+            Catch::cout() << oss.str() << wrapper << "\n";
+        }
+        Catch::cout() << pluralise( tagCounts.size(), "tag" ) << "\n" << std::endl;
+        return tagCounts.size();
+    }
+
+    // Prints every registered reporter with its description, aligned in a
+    // column; returns the number of reporters.
+    inline std::size_t listReporters( Config const& /*config*/ ) {
+        Catch::cout() << "Available reporters:\n";
+        IReporterRegistry::FactoryMap const& factories = getRegistryHub().getReporterRegistry().getFactories();
+        IReporterRegistry::FactoryMap::const_iterator itBegin = factories.begin(), itEnd = factories.end(), it;
+        std::size_t maxNameLen = 0;
+        for(it = itBegin; it != itEnd; ++it )
+            maxNameLen = (std::max)( maxNameLen, it->first.size() );
+
+        for(it = itBegin; it != itEnd; ++it ) {
+            Text wrapper( it->second->getDescription(), TextAttributes()
+                                                        .setInitialIndent( 0 )
+                                                        .setIndent( 7+maxNameLen )
+                                                        .setWidth( CATCH_CONFIG_CONSOLE_WIDTH - maxNameLen-8 ) );
+            Catch::cout() << "  "
+                    << it->first
+                    << ":"
+                    << std::string( maxNameLen - it->first.size() + 2, ' ' )
+                    << wrapper << "\n";
+        }
+        Catch::cout() << std::endl;
+        return factories.size();
+    }
+
+    // Runs whichever listings the config requests; returns the combined
+    // count, or an empty Option when no listing was requested.
+    inline Option<std::size_t> list( Config const& config ) {
+        Option<std::size_t> listedCount;
+        if( config.listTests() )
+            listedCount = listedCount.valueOr(0) + listTests( config );
+        if( config.listTestNamesOnly() )
+            listedCount = listedCount.valueOr(0) + listTestsNamesOnly( config );
+        if( config.listTags() )
+            listedCount = listedCount.valueOr(0) + listTags( config );
+        if( config.listReporters() )
+            listedCount = listedCount.valueOr(0) + listReporters( config );
+        return listedCount;
+    }
+
+} // end namespace Catch
+
+// #included from: internal/catch_run_context.hpp
+#define TWOBLUECUBES_CATCH_RUNNER_IMPL_HPP_INCLUDED
+
+// #included from: catch_test_case_tracker.hpp
+#define TWOBLUECUBES_CATCH_TEST_CASE_TRACKER_HPP_INCLUDED
+
+#include <map>
+#include <string>
+#include <assert.h>
+#include <vector>
+
+namespace Catch {
+namespace TestCaseTracking {
+
+    // Node in the tree used to track which SECTIONs/indices have run so a
+    // test case can be re-entered until every leaf has been visited.
+    struct ITracker : SharedImpl<> {
+        virtual ~ITracker();
+
+        // static queries
+        virtual std::string name() const = 0;
+
+        // dynamic queries
+        virtual bool isComplete() const = 0; // Successfully completed or failed
+        virtual bool isSuccessfullyCompleted() const = 0;
+        virtual bool isOpen() const = 0; // Started but not complete
+        virtual bool hasChildren() const = 0;
+
+        virtual ITracker& parent() = 0;
+
+        // actions
+        virtual void close() = 0; // Successfully complete
+        virtual void fail() = 0;
+        virtual void markAsNeedingAnotherRun() = 0;
+
+        virtual void addChild( Ptr<ITracker> const& child ) = 0;
+        virtual ITracker* findChild( std::string const& name ) = 0;
+        virtual void openChild() = 0;
+    };
+
+    // Owns the root of the tracker tree and a cursor into it; drives the
+    // run/cycle lifecycle (one "cycle" = one pass through the test body).
+    class TrackerContext {
+
+        enum RunState {
+            NotStarted,
+            Executing,
+            CompletedCycle
+        };
+
+        Ptr<ITracker> m_rootTracker;
+        ITracker* m_currentTracker;
+        RunState m_runState;
+
+    public:
+
+        // Process-wide singleton instance.
+        static TrackerContext& instance() {
+            static TrackerContext s_instance;
+            return s_instance;
+        }
+
+        TrackerContext()
+        :   m_currentTracker( CATCH_NULL ),
+            m_runState( NotStarted )
+        {}
+
+        ITracker& startRun();
+
+        // Discards the tree and resets to the initial state.
+        void endRun() {
+            m_rootTracker.reset();
+            m_currentTracker = CATCH_NULL;
+            m_runState = NotStarted;
+        }
+
+        // Rewinds the cursor to the root for another pass.
+        void startCycle() {
+            m_currentTracker = m_rootTracker.get();
+            m_runState = Executing;
+        }
+        void completeCycle() {
+            m_runState = CompletedCycle;
+        }
+
+        bool completedCycle() const {
+            return m_runState == CompletedCycle;
+        }
+        // NOTE(review): dereferences m_currentTracker without a null check —
+        // callers must only use this between startCycle() and completeCycle().
+        ITracker& currentTracker() {
+            return *m_currentTracker;
+        }
+        void setCurrentTracker( ITracker* tracker ) {
+            m_currentTracker = tracker;
+        }
+    };
+
+    // Common ITracker implementation: holds the name, parent/children links
+    // and the per-cycle run-state machine shared by Section/Index trackers.
+    class TrackerBase : public ITracker {
+    protected:
+        enum CycleState {
+            NotStarted,
+            Executing,
+            ExecutingChildren,
+            NeedsAnotherRun,
+            CompletedSuccessfully,
+            Failed
+        };
+        // Predicate for find_if: matches a tracker by name.
+        class TrackerHasName {
+            std::string m_name;
+        public:
+            TrackerHasName( std::string const& name ) : m_name( name ) {}
+            bool operator ()( Ptr<ITracker> const& tracker ) {
+                return tracker->name() == m_name;
+            }
+        };
+        typedef std::vector<Ptr<ITracker> > Children;
+        std::string m_name;
+        TrackerContext& m_ctx;
+        ITracker* m_parent;       // null only for the root tracker
+        Children m_children;
+        CycleState m_runState;
+    public:
+        TrackerBase( std::string const& name, TrackerContext& ctx, ITracker* parent )
+        :   m_name( name ),
+            m_ctx( ctx ),
+            m_parent( parent ),
+            m_runState( NotStarted )
+        {}
+        virtual ~TrackerBase();
+
+        virtual std::string name() const CATCH_OVERRIDE {
+            return m_name;
+        }
+        virtual bool isComplete() const CATCH_OVERRIDE {
+            return m_runState == CompletedSuccessfully || m_runState == Failed;
+        }
+        virtual bool isSuccessfullyCompleted() const CATCH_OVERRIDE {
+            return m_runState == CompletedSuccessfully;
+        }
+        virtual bool isOpen() const CATCH_OVERRIDE {
+            return m_runState != NotStarted && !isComplete();
+        }
+        virtual bool hasChildren() const CATCH_OVERRIDE {
+            return !m_children.empty();
+        }
+
+        virtual void addChild( Ptr<ITracker> const& child ) CATCH_OVERRIDE {
+            m_children.push_back( child );
+        }
+
+        // Returns the named child, or null when none exists yet.
+        virtual ITracker* findChild( std::string const& name ) CATCH_OVERRIDE {
+            Children::const_iterator it = std::find_if( m_children.begin(), m_children.end(), TrackerHasName( name ) );
+            return( it != m_children.end() )
+                ? it->get()
+                : CATCH_NULL;
+        }
+        virtual ITracker& parent() CATCH_OVERRIDE {
+            assert( m_parent ); // Should always be non-null except for root
+            return *m_parent;
+        }
+
+        // A descendant started executing; propagate the state up the tree.
+        virtual void openChild() CATCH_OVERRIDE {
+            if( m_runState != ExecutingChildren ) {
+                m_runState = ExecutingChildren;
+                if( m_parent )
+                    m_parent->openChild();
+            }
+        }
+        // Begin executing this tracker and make it the context's cursor.
+        void open() {
+            m_runState = Executing;
+            moveToThis();
+            if( m_parent )
+                m_parent->openChild();
+        }
+
+        virtual void close() CATCH_OVERRIDE {
+
+            // Close any still open children (e.g. generators)
+            while( &m_ctx.currentTracker() != this )
+                m_ctx.currentTracker().close();
+
+            switch( m_runState ) {
+                case NotStarted:
+                case CompletedSuccessfully:
+                case Failed:
+                    throw std::logic_error( "Illogical state" );
+
+                case NeedsAnotherRun:
+                    break;  // fixed: was "break;;" (stray empty statement)
+
+                case Executing:
+                    m_runState = CompletedSuccessfully;
+                    break;
+                case ExecutingChildren:
+                    // Only complete once the last child has finished.
+                    if( m_children.empty() || m_children.back()->isComplete() )
+                        m_runState = CompletedSuccessfully;
+                    break;
+
+                default:
+                    throw std::logic_error( "Unexpected state" );
+            }
+            moveToParent();
+            m_ctx.completeCycle();
+        }
+        virtual void fail() CATCH_OVERRIDE {
+            m_runState = Failed;
+            // A failed section means the test case must run again to reach
+            // the remaining sections.
+            if( m_parent )
+                m_parent->markAsNeedingAnotherRun();
+            moveToParent();
+            m_ctx.completeCycle();
+        }
+        virtual void markAsNeedingAnotherRun() CATCH_OVERRIDE {
+            m_runState = NeedsAnotherRun;
+        }
+    private:
+        void moveToParent() {
+            assert( m_parent );
+            m_ctx.setCurrentTracker( m_parent );
+        }
+        void moveToThis() {
+            m_ctx.setCurrentTracker( this );
+        }
+    };
+
+    // Tracker for a SECTION: finds or creates the node for `name` under the
+    // current tracker and opens it if it still needs to run this cycle.
+    class SectionTracker : public TrackerBase {
+    public:
+        SectionTracker( std::string const& name, TrackerContext& ctx, ITracker* parent )
+        :   TrackerBase( name, ctx, parent )
+        {}
+        virtual ~SectionTracker();
+
+        static SectionTracker& acquire( TrackerContext& ctx, std::string const& name ) {
+            SectionTracker* section = CATCH_NULL;
+
+            ITracker& currentTracker = ctx.currentTracker();
+            if( ITracker* childTracker = currentTracker.findChild( name ) ) {
+                // Re-entering an existing section on a later cycle.
+                section = dynamic_cast<SectionTracker*>( childTracker );
+                assert( section );
+            }
+            else {
+                section = new SectionTracker( name, ctx, &currentTracker );
+                currentTracker.addChild( section );
+            }
+            // Only open if this cycle hasn't already completed elsewhere.
+            if( !ctx.completedCycle() && !section->isComplete() ) {
+
+                section->open();
+            }
+            return *section;
+        }
+    };
+
+    // Tracker for an indexed loop (e.g. generators): iterates m_index from
+    // 0 to size-1 across successive cycles, re-opening until exhausted.
+    class IndexTracker : public TrackerBase {
+        int m_size;     // total number of iterations
+        int m_index;    // current iteration; -1 before the first moveNext()
+    public:
+        IndexTracker( std::string const& name, TrackerContext& ctx, ITracker* parent, int size )
+        :   TrackerBase( name, ctx, parent ),
+            m_size( size ),
+            m_index( -1 )
+        {}
+        virtual ~IndexTracker();
+
+        static IndexTracker& acquire( TrackerContext& ctx, std::string const& name, int size ) {
+            IndexTracker* tracker = CATCH_NULL;
+
+            ITracker& currentTracker = ctx.currentTracker();
+            if( ITracker* childTracker = currentTracker.findChild( name ) ) {
+                tracker = dynamic_cast<IndexTracker*>( childTracker );
+                assert( tracker );
+            }
+            else {
+                tracker = new IndexTracker( name, ctx, &currentTracker, size );
+                currentTracker.addChild( tracker );
+            }
+
+            if( !ctx.completedCycle() && !tracker->isComplete() ) {
+                // Advance the index unless we're resuming mid-iteration.
+                if( tracker->m_runState != ExecutingChildren && tracker->m_runState != NeedsAnotherRun )
+                    tracker->moveNext();
+                tracker->open();
+            }
+
+            return *tracker;
+        }
+
+        int index() const { return m_index; }
+
+        // Advances to the next iteration, discarding child state.
+        void moveNext() {
+            m_index++;
+            m_children.clear();
+        }
+
+        virtual void close() CATCH_OVERRIDE {
+            TrackerBase::close();
+            // More iterations remain: revert to Executing so acquire()
+            // re-opens this tracker on the next cycle.
+            if( m_runState == CompletedSuccessfully && m_index < m_size-1 )
+                m_runState = Executing;
+        }
+    };
+
+    // Creates a fresh root tracker for a new run; the cursor stays null
+    // until startCycle() is called.
+    inline ITracker& TrackerContext::startRun() {
+        m_rootTracker = new SectionTracker( "{root}", *this, CATCH_NULL );
+        m_currentTracker = CATCH_NULL;
+        m_runState = Executing;
+        return *m_rootTracker;
+    }
+
+} // namespace TestCaseTracking
+
+using TestCaseTracking::ITracker;
+using TestCaseTracking::TrackerContext;
+using TestCaseTracking::SectionTracker;
+using TestCaseTracking::IndexTracker;
+
+} // namespace Catch
+
+// #included from: catch_fatal_condition.hpp
+#define TWOBLUECUBES_CATCH_FATAL_CONDITION_H_INCLUDED
+
+namespace Catch {
+
+    // Report the error condition then exit the process
+    // NOTE(review): resultCapture is dereferenced without a null check —
+    // assumes a capture is always registered by the time a fatal error fires.
+    inline void fatal( std::string const& message, int exitCode ) {
+        IContext& context = Catch::getCurrentContext();
+        IResultCapture* resultCapture = context.getResultCapture();
+        resultCapture->handleFatalErrorCondition( message );
+
+		if( Catch::alwaysTrue() ) // avoids "no return" warnings
+            exit( exitCode );
+    }
+
+} // namespace Catch
+
+#if defined ( CATCH_PLATFORM_WINDOWS ) /////////////////////////////////////////
+
+namespace Catch {
+
+    // No-op stub: fatal-signal handling is not implemented on Windows here.
+    struct FatalConditionHandler {
+		void reset() {}
+	};
+
+} // namespace Catch
+
+#else // Not Windows - assumed to be POSIX compatible //////////////////////////
+
+#include <signal.h>
+
+namespace Catch {
+
+    // Table of POSIX signals treated as fatal, with human-readable names.
+    struct SignalDefs { int id; const char* name; };
+    extern SignalDefs signalDefs[];
+    SignalDefs signalDefs[] = {
+            { SIGINT,  "SIGINT - Terminal interrupt signal" },
+            { SIGILL,  "SIGILL - Illegal instruction signal" },
+            { SIGFPE,  "SIGFPE - Floating point error signal" },
+            { SIGSEGV, "SIGSEGV - Segmentation violation signal" },
+            { SIGTERM, "SIGTERM - Termination request signal" },
+            { SIGABRT, "SIGABRT - Abort (abnormal termination) signal" }
+        };
+
+    // RAII guard: installs handleSignal for every signal in signalDefs on
+    // construction and restores SIG_DFL on destruction (or reset()).
+    struct FatalConditionHandler {
+
+        // Reports the matching signal name and exits; fatal() does not
+        // return, so at most one fatal() call executes.
+        static void handleSignal( int sig ) {
+            for( std::size_t i = 0; i < sizeof(signalDefs)/sizeof(SignalDefs); ++i )
+                if( sig == signalDefs[i].id )
+                    fatal( signalDefs[i].name, -sig );
+            fatal( "<unknown signal>", -sig );
+        }
+
+        FatalConditionHandler() : m_isSet( true ) {
+            for( std::size_t i = 0; i < sizeof(signalDefs)/sizeof(SignalDefs); ++i )
+                signal( signalDefs[i].id, handleSignal );
+        }
+        ~FatalConditionHandler() {
+            reset();
+        }
+        // Restores default handlers; idempotent via m_isSet.
+        void reset() {
+            if( m_isSet ) {
+                for( std::size_t i = 0; i < sizeof(signalDefs)/sizeof(SignalDefs); ++i )
+                    signal( signalDefs[i].id, SIG_DFL );
+                m_isSet = false;
+            }
+        }
+
+        bool m_isSet;
+    };
+
+} // namespace Catch
+
+#endif // not Windows
+
+#include <set>
+#include <string>
+
+namespace Catch {
+
+    // RAII helper that temporarily captures everything written to a stream.
+    // While an instance is alive, output to `stream` is diverted into an
+    // internal buffer; on destruction the captured text is appended to
+    // `targetString` and the stream's original buffer is restored.
+    class StreamRedirect {
+
+    public:
+        StreamRedirect( std::ostream& stream, std::string& targetString )
+        :   m_stream( stream ),
+            m_prevBuf( stream.rdbuf() ),
+            m_targetString( targetString ) {
+            // Divert all subsequent writes into our local string stream
+            m_stream.rdbuf( m_oss.rdbuf() );
+        }
+
+        ~StreamRedirect() {
+            // Hand the captured text back, then reinstate the original buffer
+            m_targetString += m_oss.str();
+            m_stream.rdbuf( m_prevBuf );
+        }
+
+    private:
+        std::ostream& m_stream;        // the stream being redirected
+        std::streambuf* m_prevBuf;     // buffer to restore on destruction
+        std::ostringstream m_oss;      // capture sink
+        std::string& m_targetString;   // receives captured text
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Drives the execution of test cases. Owns the per-run state — totals,
+    // the tracker context used to revisit SECTIONs/generators across repeated
+    // invocations of a test body, scoped messages and the last assertion's
+    // info — and forwards all lifecycle events to the configured reporter.
+    // Implements IResultCapture (assertion/section callbacks) and IRunner.
+    class RunContext : public IResultCapture, public IRunner {
+
+        // non-copyable
+        RunContext( RunContext const& );
+        void operator =( RunContext const& );
+
+    public:
+
+        explicit RunContext( Ptr<IConfig const> const& _config, Ptr<IStreamingReporter> const& reporter )
+        :   m_runInfo( _config->name() ),
+            m_context( getCurrentMutableContext() ),
+            m_activeTestCase( CATCH_NULL ),
+            m_config( _config ),
+            m_reporter( reporter )
+        {
+            // Register this instance as the ambient runner/config/capture
+            m_context.setRunner( this );
+            m_context.setConfig( m_config );
+            m_context.setResultCapture( this );
+            m_reporter->testRunStarting( m_runInfo );
+        }
+
+        virtual ~RunContext() {
+            m_reporter->testRunEnded( TestRunStats( m_runInfo, m_totals, aborting() ) );
+        }
+
+        void testGroupStarting( std::string const& testSpec, std::size_t groupIndex, std::size_t groupsCount ) {
+            m_reporter->testGroupStarting( GroupInfo( testSpec, groupIndex, groupsCount ) );
+        }
+        void testGroupEnded( std::string const& testSpec, Totals const& totals, std::size_t groupIndex, std::size_t groupsCount ) {
+            m_reporter->testGroupEnded( TestGroupStats( GroupInfo( testSpec, groupIndex, groupsCount ), totals, aborting() ) );
+        }
+
+        // Runs one test case — repeatedly, until every SECTION/generator
+        // combination has been visited or the run aborts — and returns the
+        // totals this test contributed.
+        Totals runTest( TestCase const& testCase ) {
+            Totals prevTotals = m_totals;
+
+            std::string redirectedCout;
+            std::string redirectedCerr;
+
+            TestCaseInfo testInfo = testCase.getTestCaseInfo();
+
+            m_reporter->testCaseStarting( testInfo );
+
+            m_activeTestCase = &testCase;
+
+            do {
+                m_trackerContext.startRun();
+                do {
+                    m_trackerContext.startCycle();
+                    m_testCaseTracker = &SectionTracker::acquire( m_trackerContext, testInfo.name );
+                    runCurrentTest( redirectedCout, redirectedCerr );
+                }
+                while( !m_testCaseTracker->isSuccessfullyCompleted() && !aborting() );
+            }
+            // !TBD: deprecated - this will be replaced by indexed trackers
+            while( getCurrentContext().advanceGeneratorsForCurrentTest() && !aborting() );
+
+            Totals deltaTotals = m_totals.delta( prevTotals );
+            // A test expected to fail that nonetheless passed counts as a failure
+            if( testInfo.expectedToFail() && deltaTotals.testCases.passed > 0 ) {
+                deltaTotals.assertions.failed++;
+                deltaTotals.testCases.passed--;
+                deltaTotals.testCases.failed++;
+            }
+            m_totals.testCases += deltaTotals.testCases;
+            m_reporter->testCaseEnded( TestCaseStats(   testInfo,
+                                                        deltaTotals,
+                                                        redirectedCout,
+                                                        redirectedCerr,
+                                                        aborting() ) );
+
+            m_activeTestCase = CATCH_NULL;
+            m_testCaseTracker = CATCH_NULL;
+
+            return deltaTotals;
+        }
+
+        Ptr<IConfig const> config() const {
+            return m_config;
+        }
+
+    private: // IResultCapture
+
+        // Records an assertion outcome in the totals and forwards it (with any
+        // scoped messages) to the reporter, then resets per-assertion state.
+        virtual void assertionEnded( AssertionResult const& result ) {
+            if( result.getResultType() == ResultWas::Ok ) {
+                m_totals.assertions.passed++;
+            }
+            else if( !result.isOk() ) {
+                m_totals.assertions.failed++;
+            }
+
+            // The reporter returning true means the messages were consumed
+            if( m_reporter->assertionEnded( AssertionStats( result, m_messages, m_totals ) ) )
+                m_messages.clear();
+
+            // Reset working state
+            m_lastAssertionInfo = AssertionInfo( "", m_lastAssertionInfo.lineInfo, "{Unknown expression after the reported line}" , m_lastAssertionInfo.resultDisposition );
+            m_lastResult = result;
+        }
+
+        // Asks the tracker whether this SECTION should run in the current
+        // cycle; returns false (section skipped) when it is not open.
+        virtual bool sectionStarted (
+            SectionInfo const& sectionInfo,
+            Counts& assertions
+        )
+        {
+            // Name + line uniquely identifies the section within the tracker
+            std::ostringstream oss;
+            oss << sectionInfo.name << "@" << sectionInfo.lineInfo;
+
+            ITracker& sectionTracker = SectionTracker::acquire( m_trackerContext, oss.str() );
+            if( !sectionTracker.isOpen() )
+                return false;
+            m_activeSections.push_back( &sectionTracker );
+
+            m_lastAssertionInfo.lineInfo = sectionInfo.lineInfo;
+
+            m_reporter->sectionStarting( sectionInfo );
+
+            // Snapshot so the caller can compute this section's delta
+            assertions = m_totals.assertions;
+
+            return true;
+        }
+        // When warnings are enabled, counts a leaf section/test with no
+        // assertions as a failure. Returns true if a failure was recorded.
+        bool testForMissingAssertions( Counts& assertions ) {
+            if( assertions.total() != 0 )
+                return false;
+            if( !m_config->warnAboutMissingAssertions() )
+                return false;
+            if( m_trackerContext.currentTracker().hasChildren() )
+                return false;
+            m_totals.assertions.failed++;
+            assertions.failed++;
+            return true;
+        }
+
+        virtual void sectionEnded( SectionEndInfo const& endInfo ) {
+            Counts assertions = m_totals.assertions - endInfo.prevAssertions;
+            bool missingAssertions = testForMissingAssertions( assertions );
+
+            if( !m_activeSections.empty() ) {
+                m_activeSections.back()->close();
+                m_activeSections.pop_back();
+            }
+
+            m_reporter->sectionEnded( SectionStats( endInfo.sectionInfo, assertions, endInfo.durationInSeconds, missingAssertions ) );
+            m_messages.clear();
+        }
+
+        // Called when a section is unwound by an exception: the innermost
+        // section is failed (others just closed) and its end info is queued so
+        // the teardown can run outside the unwind, in handleUnfinishedSections.
+        virtual void sectionEndedEarly( SectionEndInfo const& endInfo ) {
+            if( m_unfinishedSections.empty() )
+                m_activeSections.back()->fail();
+            else
+                m_activeSections.back()->close();
+            m_activeSections.pop_back();
+
+            m_unfinishedSections.push_back( endInfo );
+        }
+
+        virtual void pushScopedMessage( MessageInfo const& message ) {
+            m_messages.push_back( message );
+        }
+
+        virtual void popScopedMessage( MessageInfo const& message ) {
+            m_messages.erase( std::remove( m_messages.begin(), m_messages.end(), message ), m_messages.end() );
+        }
+
+        virtual std::string getCurrentTestName() const {
+            return m_activeTestCase
+                ? m_activeTestCase->getTestCaseInfo().name
+                : "";
+        }
+
+        virtual const AssertionResult* getLastResult() const {
+            return &m_lastResult;
+        }
+
+        // Invoked from the fatal-signal path (see fatal() above): records the
+        // failure and synthesises the section/test-case/group/run "ended"
+        // events that will never happen normally, since the process exits.
+        virtual void handleFatalErrorCondition( std::string const& message ) {
+            ResultBuilder resultBuilder = makeUnexpectedResultBuilder();
+            resultBuilder.setResultType( ResultWas::FatalErrorCondition );
+            resultBuilder << message;
+            resultBuilder.captureExpression();
+
+            handleUnfinishedSections();
+
+            // Recreate section for test case (as we will lose the one that was in scope)
+            TestCaseInfo const& testCaseInfo = m_activeTestCase->getTestCaseInfo();
+            SectionInfo testCaseSection( testCaseInfo.lineInfo, testCaseInfo.name, testCaseInfo.description );
+
+            Counts assertions;
+            assertions.failed = 1;
+            SectionStats testCaseSectionStats( testCaseSection, assertions, 0, false );
+            m_reporter->sectionEnded( testCaseSectionStats );
+
+            TestCaseInfo testInfo = m_activeTestCase->getTestCaseInfo();
+
+            Totals deltaTotals;
+            deltaTotals.testCases.failed = 1;
+            m_reporter->testCaseEnded( TestCaseStats(   testInfo,
+                                                        deltaTotals,
+                                                        "",
+                                                        "",
+                                                        false ) );
+            m_totals.testCases.failed++;
+            testGroupEnded( "", m_totals, 1, 1 );
+            m_reporter->testRunEnded( TestRunStats( m_runInfo, m_totals, false ) );
+        }
+
+    public:
+        // !TBD We need to do this another way!
+        // NOTE(review): exact equality, not >= — with the (presumed) default
+        // abortAfter of -1 the cast yields a huge size_t so this never fires,
+        // but a failure count that jumps past the threshold would also be
+        // missed. Verify against abortAfter semantics.
+        bool aborting() const {
+            return m_totals.assertions.failed == static_cast<std::size_t>( m_config->abortAfter() );
+        }
+
+    private:
+
+        // One pass over the active test case's body: reports the implicit
+        // test-case-level section, times and invokes the body (optionally with
+        // stdout/stderr redirected), and folds the results into the totals.
+        void runCurrentTest( std::string& redirectedCout, std::string& redirectedCerr ) {
+            TestCaseInfo const& testCaseInfo = m_activeTestCase->getTestCaseInfo();
+            SectionInfo testCaseSection( testCaseInfo.lineInfo, testCaseInfo.name, testCaseInfo.description );
+            m_reporter->sectionStarting( testCaseSection );
+            Counts prevAssertions = m_totals.assertions;
+            double duration = 0;
+            try {
+                m_lastAssertionInfo = AssertionInfo( "TEST_CASE", testCaseInfo.lineInfo, "", ResultDisposition::Normal );
+
+                seedRng( *m_config );
+
+                Timer timer;
+                timer.start();
+                if( m_reporter->getPreferences().shouldRedirectStdOut ) {
+                    StreamRedirect coutRedir( Catch::cout(), redirectedCout );
+                    StreamRedirect cerrRedir( Catch::cerr(), redirectedCerr );
+                    invokeActiveTestCase();
+                }
+                else {
+                    invokeActiveTestCase();
+                }
+                duration = timer.getElapsedSeconds();
+            }
+            catch( TestFailureException& ) {
+                // This just means the test was aborted due to failure
+            }
+            catch(...) {
+                makeUnexpectedResultBuilder().useActiveException();
+            }
+            m_testCaseTracker->close();
+            handleUnfinishedSections();
+            m_messages.clear();
+
+            Counts assertions = m_totals.assertions - prevAssertions;
+            bool missingAssertions = testForMissingAssertions( assertions );
+
+            // For tests marked ok-to-fail, reclassify failures as failedButOk
+            if( testCaseInfo.okToFail() ) {
+                std::swap( assertions.failedButOk, assertions.failed );
+                m_totals.assertions.failed -= assertions.failedButOk;
+                m_totals.assertions.failedButOk += assertions.failedButOk;
+            }
+
+            SectionStats testCaseSectionStats( testCaseSection, assertions, duration, missingAssertions );
+            m_reporter->sectionEnded( testCaseSectionStats );
+        }
+
+        void invokeActiveTestCase() {
+            FatalConditionHandler fatalConditionHandler; // Handle signals
+            m_activeTestCase->invoke();
+            fatalConditionHandler.reset();
+        }
+
+    private:
+
+        // Builds a ResultBuilder primed with the last known assertion's
+        // position, for reporting failures that occur between assertions.
+        ResultBuilder makeUnexpectedResultBuilder() const {
+            return ResultBuilder(   m_lastAssertionInfo.macroName.c_str(),
+                                    m_lastAssertionInfo.lineInfo,
+                                    m_lastAssertionInfo.capturedExpression.c_str(),
+                                    m_lastAssertionInfo.resultDisposition );
+        }
+
+        void handleUnfinishedSections() {
+            // If sections ended prematurely due to an exception we stored their
+            // infos here so we can tear them down outside the unwind process.
+            for( std::vector<SectionEndInfo>::const_reverse_iterator it = m_unfinishedSections.rbegin(),
+                        itEnd = m_unfinishedSections.rend();
+                    it != itEnd;
+                    ++it )
+                sectionEnded( *it );
+            m_unfinishedSections.clear();
+        }
+
+        TestRunInfo m_runInfo;
+        IMutableContext& m_context;
+        TestCase const* m_activeTestCase;         // non-null only during runTest
+        ITracker* m_testCaseTracker;
+        // NOTE(review): m_currentSectionTracker is never assigned or read in
+        // the code visible here — possibly dead; confirm before removing.
+        ITracker* m_currentSectionTracker;
+        AssertionResult m_lastResult;
+
+        Ptr<IConfig const> m_config;
+        Totals m_totals;
+        Ptr<IStreamingReporter> m_reporter;
+        std::vector<MessageInfo> m_messages;
+        AssertionInfo m_lastAssertionInfo;
+        std::vector<SectionEndInfo> m_unfinishedSections;
+        std::vector<ITracker*> m_activeSections;  // innermost section is back()
+        TrackerContext m_trackerContext;
+    };
+
+    // Returns the currently registered IResultCapture, failing loudly when
+    // no RunContext has installed one.
+    IResultCapture& getResultCapture() {
+        IResultCapture* capture = getCurrentContext().getResultCapture();
+        if( !capture )
+            throw std::logic_error( "No result capture instance" );
+        return *capture;
+    }
+
+} // end namespace Catch
+
+// #included from: internal/catch_version.h
+#define TWOBLUECUBES_CATCH_VERSION_H_INCLUDED
+
+namespace Catch {
+
+    // Versioning information. Declaration only — the constructor, the
+    // stream operator and libraryVersion are defined elsewhere.
+    struct Version {
+        Version(    unsigned int _majorVersion,
+                    unsigned int _minorVersion,
+                    unsigned int _patchNumber,
+                    std::string const& _branchName,
+                    unsigned int _buildNumber );
+
+        unsigned int const majorVersion;
+        unsigned int const minorVersion;
+        unsigned int const patchNumber;
+
+        // buildNumber is only used if branchName is not null
+        std::string const branchName;
+        unsigned int const buildNumber;
+
+        friend std::ostream& operator << ( std::ostream& os, Version const& version );
+
+    private:
+        // Non-assignable (members are const anyway)
+        void operator=( Version const& );
+    };
+
+    // The library-wide version instance, defined out of line
+    extern Version libraryVersion;
+}
+
+#include <fstream>
+#include <stdlib.h>
+#include <limits>
+
+namespace Catch {
+
+    // Instantiates the reporter registered under reporterName, throwing
+    // std::domain_error when no reporter with that name exists.
+    Ptr<IStreamingReporter> createReporter( std::string const& reporterName, Ptr<Config> const& config ) {
+        Ptr<IStreamingReporter> created = getRegistryHub().getReporterRegistry().create( reporterName, config.get() );
+        if( created )
+            return created;
+        std::ostringstream oss;
+        oss << "No reporter registered with name: '" << reporterName << "'";
+        throw std::domain_error( oss.str() );
+    }
+
+    // Builds the reporter chain requested by the config, defaulting to the
+    // "console" reporter when none was specified.
+    Ptr<IStreamingReporter> makeReporter( Ptr<Config> const& config ) {
+        std::vector<std::string> names = config->getReporterNames();
+        if( names.empty() )
+            names.push_back( "console" );
+
+        Ptr<IStreamingReporter> result;
+        std::vector<std::string>::const_iterator it = names.begin();
+        std::vector<std::string>::const_iterator itEnd = names.end();
+        for( ; it != itEnd; ++it )
+            result = addReporter( result, createReporter( *it, config ) );
+        return result;
+    }
+    // Adds every globally registered listener to the given reporter chain
+    // and returns the combined chain.
+    Ptr<IStreamingReporter> addListeners( Ptr<IConfig const> const& config, Ptr<IStreamingReporter> reporters ) {
+        IReporterRegistry::Listeners listeners = getRegistryHub().getReporterRegistry().getListeners();
+        IReporterRegistry::Listeners::const_iterator it = listeners.begin();
+        IReporterRegistry::Listeners::const_iterator itEnd = listeners.end();
+        for( ; it != itEnd; ++it )
+            reporters = addReporter( reporters, (*it)->create( ReporterConfig( config ) ) );
+        return reporters;
+    }
+
+    // Top-level run: builds the reporter chain, wires up a RunContext, runs
+    // every test case matching the config's test spec (defaulting to all
+    // non-hidden tests) and returns the accumulated totals. Once aborting()
+    // becomes true, remaining tests are reported as skipped instead of run.
+    Totals runTests( Ptr<Config> const& config ) {
+
+        Ptr<IConfig const> iconfig = config.get();
+
+        Ptr<IStreamingReporter> reporter = makeReporter( config );
+        reporter = addListeners( iconfig, reporter );
+
+        RunContext context( iconfig, reporter );
+
+        Totals totals;
+
+        context.testGroupStarting( config->name(), 1, 1 );
+
+        TestSpec testSpec = config->testSpec();
+        if( !testSpec.hasFilters() )
+            testSpec = TestSpecParser( ITagAliasRegistry::get() ).parse( "~[.]" ).testSpec(); // All not hidden tests
+
+        std::vector<TestCase> const& allTestCases = getAllTestCasesSorted( *iconfig );
+        for( std::vector<TestCase>::const_iterator it = allTestCases.begin(), itEnd = allTestCases.end();
+                it != itEnd;
+                ++it ) {
+            if( !context.aborting() && matchTest( *it, testSpec, *iconfig ) )
+                totals += context.runTest( *it );
+            else
+                reporter->skipTest( *it );
+        }
+
+        context.testGroupEnded( iconfig->name(), totals, 1, 1 );
+        return totals;
+    }
+
+    // Adds a "#<basename>" tag (source filename stripped of directories and
+    // extension) to every registered test case, so tests can be selected by
+    // the file they live in.
+    void applyFilenamesAsTags( IConfig const& config ) {
+        std::vector<TestCase> const& tests = getAllTestCasesSorted( config );
+        for( std::size_t i = 0; i < tests.size(); ++i ) {
+            TestCase& test = const_cast<TestCase&>( tests[i] );
+            std::set<std::string> tags = test.tags;
+
+            std::string filename = test.lineInfo.file;
+
+            // Strip any leading directory components (either separator style)
+            std::string::size_type lastSlash = filename.find_last_of( "\\/" );
+            if( lastSlash != std::string::npos )
+                filename.erase( 0, lastSlash+1 );
+
+            // Drop the extension, if there is one
+            std::string::size_type lastDot = filename.find_last_of( "." );
+            if( lastDot != std::string::npos )
+                filename.erase( lastDot );
+
+            tags.insert( "#" + filename );
+            setTags( test, tags );
+        }
+    }
+
+    // Public entry point for running Catch from main(): parses the command
+    // line into ConfigData, lazily builds the Config, and runs the tests.
+    // Only one instance may ever be created per process.
+    class Session : NonCopyable {
+        static bool alreadyInstantiated;
+
+    public:
+
+        struct OnUnusedOptions { enum DoWhat { Ignore, Fail }; };
+
+        Session()
+        : m_cli( makeCommandLineParser() ) {
+            // Enforce the one-instance-per-process rule
+            if( alreadyInstantiated ) {
+                std::string msg = "Only one instance of Catch::Session can ever be used";
+                Catch::cerr() << msg << std::endl;
+                throw std::logic_error( msg );
+            }
+            alreadyInstantiated = true;
+        }
+        ~Session() {
+            Catch::cleanUp();
+        }
+
+        // Prints version banner and command-line usage to stdout
+        void showHelp( std::string const& processName ) {
+            Catch::cout() << "\nCatch v" << libraryVersion << "\n";
+
+            m_cli.usage( Catch::cout(), processName );
+            Catch::cout() << "For more detail usage please see the project docs\n" << std::endl;
+        }
+
+        // Parses argv into the config data. Returns 0 on success; on a parse
+        // error prints usage and returns INT_MAX (parenthesised max to dodge
+        // the Windows max() macro).
+        int applyCommandLine( int argc, char const* argv[], OnUnusedOptions::DoWhat unusedOptionBehaviour = OnUnusedOptions::Fail ) {
+            try {
+                m_cli.setThrowOnUnrecognisedTokens( unusedOptionBehaviour == OnUnusedOptions::Fail );
+                m_unusedTokens = m_cli.parseInto( argc, argv, m_configData );
+                if( m_configData.showHelp )
+                    showHelp( m_configData.processName );
+                // Invalidate any previously built Config so it is rebuilt
+                // from the fresh ConfigData on next access
+                m_config.reset();
+            }
+            catch( std::exception& ex ) {
+                {
+                    Colour colourGuard( Colour::Red );
+                    Catch::cerr()
+                        << "\nError(s) in input:\n"
+                        << Text( ex.what(), TextAttributes().setIndent(2) )
+                        << "\n\n";
+                }
+                m_cli.usage( Catch::cout(), m_configData.processName );
+                return (std::numeric_limits<int>::max)();
+            }
+            return 0;
+        }
+
+        // Replaces the config data wholesale (invalidates the built Config)
+        void useConfigData( ConfigData const& _configData ) {
+            m_configData = _configData;
+            m_config.reset();
+        }
+
+        // Convenience: parse the command line then run
+        int run( int argc, char const* argv[] ) {
+
+            int returnCode = applyCommandLine( argc, argv );
+            if( returnCode == 0 )
+                returnCode = run();
+            return returnCode;
+        }
+        int run( int argc, char* argv[] ) {
+            return run( argc, const_cast<char const**>( argv ) );
+        }
+
+        // Runs the tests with the current config. Returns the number of
+        // failed assertions (0 on success, or after --help / a list request),
+        // or INT_MAX on an unexpected exception.
+        int run() {
+            if( m_configData.showHelp )
+                return 0;
+
+            try
+            {
+                config(); // Force config to be constructed
+
+                seedRng( *m_config );
+
+                if( m_configData.filenamesAsTags )
+                    applyFilenamesAsTags( *m_config );
+
+                // Handle list request
+                if( Option<std::size_t> listed = list( config() ) )
+                    return static_cast<int>( *listed );
+
+                return static_cast<int>( runTests( m_config ).assertions.failed );
+            }
+            catch( std::exception& ex ) {
+                Catch::cerr() << ex.what() << std::endl;
+                return (std::numeric_limits<int>::max)();
+            }
+        }
+
+        Clara::CommandLine<ConfigData> const& cli() const {
+            return m_cli;
+        }
+        std::vector<Clara::Parser::Token> const& unusedTokens() const {
+            return m_unusedTokens;
+        }
+        ConfigData& configData() {
+            return m_configData;
+        }
+        // Lazily builds (and caches) the Config from the current ConfigData
+        Config& config() {
+            if( !m_config )
+                m_config = new Config( m_configData );
+            return *m_config;
+        }
+    private:
+        Clara::CommandLine<ConfigData> m_cli;
+        std::vector<Clara::Parser::Token> m_unusedTokens;
+        ConfigData m_configData;
+        Ptr<Config> m_config; // built on demand from m_configData
+    };
+
+    bool Session::alreadyInstantiated = false;
+
+} // end namespace Catch
+
+// #included from: catch_registry_hub.hpp
+#define TWOBLUECUBES_CATCH_REGISTRY_HUB_HPP_INCLUDED
+
+// #included from: catch_test_case_registry_impl.hpp
+#define TWOBLUECUBES_CATCH_TEST_CASE_REGISTRY_IMPL_HPP_INCLUDED
+
+#include <vector>
+#include <set>
+#include <sstream>
+#include <iostream>
+#include <algorithm>
+
+namespace Catch {
+
+    // Comparator ordering TestCases via their operator< (used for the
+    // lexicographic run order below).
+    struct LexSort {
+        bool operator() (TestCase i,TestCase j) const { return (i<j);}
+    };
+    // Adapts std::rand to the callable shape std::random_shuffle expects.
+    struct RandomNumberGenerator {
+        int operator()( int n ) const { return std::rand() % n; }
+    };
+
+    // Returns a copy of unsortedTestCases ordered per config.runOrder():
+    // lexicographically, randomly shuffled (after seeding via seedRng), or
+    // left in declaration order.
+    inline std::vector<TestCase> sortTests( IConfig const& config, std::vector<TestCase> const& unsortedTestCases ) {
+
+        std::vector<TestCase> sorted = unsortedTestCases;
+
+        switch( config.runOrder() ) {
+            case RunTests::InLexicographicalOrder:
+                std::sort( sorted.begin(), sorted.end(), LexSort() );
+                break;
+            case RunTests::InRandomOrder:
+                {
+                    seedRng( config );
+
+                    // NOTE(review): std::random_shuffle was removed in C++17;
+                    // fine for the pre-C++11 dialect this header targets.
+                    RandomNumberGenerator rng;
+                    std::random_shuffle( sorted.begin(), sorted.end(), rng );
+                }
+                break;
+            case RunTests::InDeclarationOrder:
+                // already in declaration order
+                break;
+        }
+        return sorted;
+    }
+    // A test case matches when the spec selects it and, if the test is
+    // tagged as throwing, the config allows throwing tests.
+    bool matchTest( TestCase const& testCase, TestSpec const& testSpec, IConfig const& config ) {
+        if( !testSpec.matches( testCase ) )
+            return false;
+        return config.allowThrows() || !testCase.throws();
+    }
+
+    // Aborts the process (exit(1)) on the first duplicate test-case name,
+    // reporting where it was first seen and where it was redefined.
+    void enforceNoDuplicateTestCases( std::vector<TestCase> const& functions ) {
+        std::set<TestCase> seenFunctions;
+        for( std::vector<TestCase>::const_iterator it = functions.begin(), itEnd = functions.end();
+            it != itEnd;
+            ++it ) {
+            // insert() reports via .second whether the name was already present
+            std::pair<std::set<TestCase>::const_iterator, bool> prev = seenFunctions.insert( *it );
+            if( !prev.second ){
+                Catch::cerr()
+                << Colour( Colour::Red )
+                << "error: TEST_CASE( \"" << it->name << "\" ) already defined.\n"
+                << "\tFirst seen at " << prev.first->getTestCaseInfo().lineInfo << "\n"
+                << "\tRedefined at " << it->getTestCaseInfo().lineInfo << std::endl;
+                exit(1);
+            }
+        }
+    }
+
+    // Returns the subset of testCases selected by testSpec under config.
+    std::vector<TestCase> filterTests( std::vector<TestCase> const& testCases, TestSpec const& testSpec, IConfig const& config ) {
+        std::vector<TestCase> filtered;
+        filtered.reserve( testCases.size() );
+        std::vector<TestCase>::const_iterator it = testCases.begin();
+        std::vector<TestCase>::const_iterator itEnd = testCases.end();
+        while( it != itEnd ) {
+            if( matchTest( *it, testSpec, config ) )
+                filtered.push_back( *it );
+            ++it;
+        }
+        return filtered;
+    }
+    // Convenience accessor: all registered test cases, sorted per config.
+    std::vector<TestCase> const& getAllTestCasesSorted( IConfig const& config ) {
+        return getRegistryHub().getTestCaseRegistry().getAllTestsSorted( config );
+    }
+
+    // Holds all registered test cases, in declaration order, plus a cached
+    // sorted view rebuilt whenever the requested run order changes.
+    class TestRegistry : public ITestCaseRegistry {
+    public:
+        TestRegistry()
+        :   m_currentSortOrder( RunTests::InDeclarationOrder ),
+            m_unnamedCount( 0 )
+        {}
+        virtual ~TestRegistry();
+
+        // Registers a test case, synthesising a unique "Anonymous test case
+        // N" name for unnamed ones.
+        virtual void registerTest( TestCase const& testCase ) {
+            std::string name = testCase.getTestCaseInfo().name;
+            if( name == "" ) {
+                std::ostringstream oss;
+                oss << "Anonymous test case " << ++m_unnamedCount;
+                return registerTest( testCase.withName( oss.str() ) );
+            }
+            m_functions.push_back( testCase );
+        }
+
+        virtual std::vector<TestCase> const& getAllTests() const {
+            return m_functions;
+        }
+        // Lazily sorts per the requested order; duplicate-name enforcement
+        // runs once, before the first sort.
+        virtual std::vector<TestCase> const& getAllTestsSorted( IConfig const& config ) const {
+            if( m_sortedFunctions.empty() )
+                enforceNoDuplicateTestCases( m_functions );
+
+            if(  m_currentSortOrder != config.runOrder() || m_sortedFunctions.empty() ) {
+                m_sortedFunctions = sortTests( config, m_functions );
+                m_currentSortOrder = config.runOrder();
+            }
+            return m_sortedFunctions;
+        }
+
+    private:
+        std::vector<TestCase> m_functions;                  // declaration order
+        mutable RunTests::InWhatOrder m_currentSortOrder;   // order of the cache
+        mutable std::vector<TestCase> m_sortedFunctions;    // cached sorted view
+        size_t m_unnamedCount;
+        std::ios_base::Init m_ostreamInit; // Forces cout/ cerr to be initialised
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Adapts a plain function pointer to the ITestCase interface, so free
+    // functions can be registered as test cases.
+    class FreeFunctionTestCase : public SharedImpl<ITestCase> {
+    public:
+
+        FreeFunctionTestCase( TestFunction fun ) : m_fun( fun ) {}
+
+        virtual void invoke() const {
+            m_fun();
+        }
+
+    private:
+        // Destructor is private: lifetime is managed via ref counting
+        virtual ~FreeFunctionTestCase();
+
+        TestFunction m_fun;
+    };
+
+    // Given a qualified method-pointer spelling like "&Ns::Class::method",
+    // extracts the "Class" component; input not starting with '&' is
+    // returned unchanged.
+    inline std::string extractClassName( std::string const& classOrQualifiedMethodName ) {
+        std::string className = classOrQualifiedMethodName;
+        if( !startsWith( className, "&" ) )
+            return className;
+        std::size_t lastColons = className.rfind( "::" );
+        std::size_t penultimateColons = className.rfind( "::", lastColons-1 );
+        if( penultimateColons == std::string::npos )
+            penultimateColons = 1; // just past the leading '&'
+        return className.substr( penultimateColons, lastColons-penultimateColons );
+    }
+
+    // Wraps a raw ITestCase with its name/description/location metadata and
+    // hands it to the global registry hub.
+    void registerTestCase
+        (   ITestCase* testCase,
+            char const* classOrQualifiedMethodName,
+            NameAndDesc const& nameAndDesc,
+            SourceLineInfo const& lineInfo ) {
+
+        getMutableRegistryHub().registerTest
+            ( makeTestCase
+                (   testCase,
+                    extractClassName( classOrQualifiedMethodName ),
+                    nameAndDesc.name,
+                    nameAndDesc.description,
+                    lineInfo ) );
+    }
+    // Convenience overload for free functions: wraps the function in a
+    // FreeFunctionTestCase (ref-counted, so no explicit delete here).
+    void registerTestCaseFunction
+        (   TestFunction function,
+            SourceLineInfo const& lineInfo,
+            NameAndDesc const& nameAndDesc ) {
+        registerTestCase( new FreeFunctionTestCase( function ), "", nameAndDesc, lineInfo );
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Self-registration hook used by the TEST_CASE macros: constructing an
+    // AutoReg at namespace scope registers the test function during static
+    // initialisation, before main() runs.
+    AutoReg::AutoReg
+        (   TestFunction function,
+            SourceLineInfo const& lineInfo,
+            NameAndDesc const& nameAndDesc ) {
+        registerTestCaseFunction( function, lineInfo, nameAndDesc );
+    }
+
+    AutoReg::~AutoReg() {}
+
+} // end namespace Catch
+
+// #included from: catch_reporter_registry.hpp
+#define TWOBLUECUBES_CATCH_REPORTER_REGISTRY_HPP_INCLUDED
+
+#include <map>
+
+namespace Catch {
+
+    // Maps reporter names to their factories and keeps the list of
+    // always-on listener factories.
+    class ReporterRegistry : public IReporterRegistry {
+
+    public:
+
+        virtual ~ReporterRegistry() CATCH_OVERRIDE {}
+
+        // Creates the reporter registered under `name`, or returns null if
+        // the name is unknown.
+        virtual IStreamingReporter* create( std::string const& name, Ptr<IConfig const> const& config ) const CATCH_OVERRIDE {
+            FactoryMap::const_iterator it =  m_factories.find( name );
+            if( it == m_factories.end() )
+                return CATCH_NULL;
+            return it->second->create( ReporterConfig( config ) );
+        }
+
+        void registerReporter( std::string const& name, Ptr<IReporterFactory> const& factory ) {
+            m_factories.insert( std::make_pair( name, factory ) );
+        }
+        void registerListener( Ptr<IReporterFactory> const& factory ) {
+            m_listeners.push_back( factory );
+        }
+
+        virtual FactoryMap const& getFactories() const CATCH_OVERRIDE {
+            return m_factories;
+        }
+        virtual Listeners const& getListeners() const CATCH_OVERRIDE {
+            return m_listeners;
+        }
+
+    private:
+        FactoryMap m_factories; // name -> reporter factory
+        Listeners m_listeners;  // listener factories, in registration order
+    };
+}
+
+// #included from: catch_exception_translator_registry.hpp
+#define TWOBLUECUBES_CATCH_EXCEPTION_TRANSLATOR_REGISTRY_HPP_INCLUDED
+
+#ifdef __OBJC__
+#import "Foundation/Foundation.h"
+#endif
+
+namespace Catch {
+
+    // Owns the registered exception translators and converts the currently
+    // active exception into a message string for reporting.
+    class ExceptionTranslatorRegistry : public IExceptionTranslatorRegistry {
+    public:
+        ~ExceptionTranslatorRegistry() {
+            // Translators are registered as raw pointers; we own them
+            deleteAll( m_translators );
+        }
+
+        virtual void registerTranslator( const IExceptionTranslator* translator ) {
+            m_translators.push_back( translator );
+        }
+
+        // Must be called from within a catch block: rethrows the active
+        // exception through the translator chain, falling back to the
+        // standard exception types below.
+        virtual std::string translateActiveException() const {
+            try {
+#ifdef __OBJC__
+                // In Objective-C try objective-c exceptions first
+                @try {
+                    return tryTranslators();
+                }
+                @catch (NSException *exception) {
+                    return Catch::toString( [exception description] );
+                }
+#else
+                return tryTranslators();
+#endif
+            }
+            catch( TestFailureException& ) {
+                // Not an "unexpected" exception - let it propagate
+                throw;
+            }
+            catch( std::exception& ex ) {
+                return ex.what();
+            }
+            catch( std::string& msg ) {
+                return msg;
+            }
+            catch( const char* msg ) {
+                return msg;
+            }
+            catch(...) {
+                return "Unknown exception";
+            }
+        }
+
+        std::string tryTranslators() const {
+            // With no translators, the bare throw rethrows the active
+            // exception so the catch clauses above can classify it
+            if( m_translators.empty() )
+                throw;
+            else
+                return m_translators[0]->translate( m_translators.begin()+1, m_translators.end() );
+        }
+
+    private:
+        std::vector<const IExceptionTranslator*> m_translators;
+    };
+}
+
+namespace Catch {
+
+    namespace {
+
+        // Central hub aggregating the reporter, test-case and
+        // exception-translator registries behind the read-only and mutable
+        // hub interfaces. Non-copyable (copy ops declared, never defined).
+        class RegistryHub : public IRegistryHub, public IMutableRegistryHub {
+
+            RegistryHub( RegistryHub const& );
+            void operator=( RegistryHub const& );
+
+        public: // IRegistryHub
+            RegistryHub() {
+            }
+            virtual IReporterRegistry const& getReporterRegistry() const CATCH_OVERRIDE {
+                return m_reporterRegistry;
+            }
+            virtual ITestCaseRegistry const& getTestCaseRegistry() const CATCH_OVERRIDE {
+                return m_testCaseRegistry;
+            }
+            virtual IExceptionTranslatorRegistry& getExceptionTranslatorRegistry() CATCH_OVERRIDE {
+                return m_exceptionTranslatorRegistry;
+            }
+
+        public: // IMutableRegistryHub
+            virtual void registerReporter( std::string const& name, Ptr<IReporterFactory> const& factory ) CATCH_OVERRIDE {
+                m_reporterRegistry.registerReporter( name, factory );
+            }
+            virtual void registerListener( Ptr<IReporterFactory> const& factory ) CATCH_OVERRIDE {
+                m_reporterRegistry.registerListener( factory );
+            }
+            virtual void registerTest( TestCase const& testInfo ) CATCH_OVERRIDE {
+                m_testCaseRegistry.registerTest( testInfo );
+            }
+            virtual void registerTranslator( const IExceptionTranslator* translator ) CATCH_OVERRIDE {
+                m_exceptionTranslatorRegistry.registerTranslator( translator );
+            }
+
+        private:
+            TestRegistry m_testCaseRegistry;
+            ReporterRegistry m_reporterRegistry;
+            ExceptionTranslatorRegistry m_exceptionTranslatorRegistry;
+        };
+
+        // Single, global, instance
+        // Returned by reference-to-pointer so cleanUp() can both delete the
+        // instance and null the pointer. Lazily created on first access.
+        inline RegistryHub*& getTheRegistryHub() {
+            static RegistryHub* theRegistryHub = CATCH_NULL;
+            if( !theRegistryHub )
+                theRegistryHub = new RegistryHub();
+            return theRegistryHub;
+        }
+    }
+
+    IRegistryHub& getRegistryHub() {
+        return *getTheRegistryHub();
+    }
+    IMutableRegistryHub& getMutableRegistryHub() {
+        return *getTheRegistryHub();
+    }
+    // Destroys the registry hub and the current context; a later access would
+    // lazily recreate them.
+    void cleanUp() {
+        delete getTheRegistryHub();
+        getTheRegistryHub() = CATCH_NULL;
+        cleanUpContext();
+    }
+    // Convenience forwarder to the exception translator registry.
+    std::string translateActiveException() {
+        return getRegistryHub().getExceptionTranslatorRegistry().translateActiveException();
+    }
+
+} // end namespace Catch
+
+// #included from: catch_notimplemented_exception.hpp
+#define TWOBLUECUBES_CATCH_NOTIMPLEMENTED_EXCEPTION_HPP_INCLUDED
+
+#include <ostream>
+
+namespace Catch {
+
+    // Builds the what() message ("<file:line>: function not implemented")
+    // eagerly so that what() itself can stay noexcept.
+    NotImplementedException::NotImplementedException( SourceLineInfo const& lineInfo )
+    :   m_lineInfo( lineInfo ) {
+        std::ostringstream messageStream;
+        messageStream << lineInfo << ": function " << "not implemented";
+        m_what = messageStream.str();
+    }
+
+    // Returns the pre-built message; valid for the lifetime of the exception.
+    const char* NotImplementedException::what() const CATCH_NOEXCEPT {
+        return m_what.c_str();
+    }
+
+} // end namespace Catch
+
+// #included from: catch_context_impl.hpp
+#define TWOBLUECUBES_CATCH_CONTEXT_IMPL_HPP_INCLUDED
+
+// #included from: catch_stream.hpp
+#define TWOBLUECUBES_CATCH_STREAM_HPP_INCLUDED
+
+#include <stdexcept>
+#include <cstdio>
+#include <iostream>
+
+namespace Catch {
+
+    // Fixed-size streambuf that forwards buffered characters to a WriterF
+    // functor (called with a std::string) on sync/overflow.
+    template<typename WriterF, size_t bufferSize=256>
+    class StreamBufImpl : public StreamBufBase {
+        char data[bufferSize];
+        WriterF m_writer;
+
+    public:
+        StreamBufImpl() {
+            // Establish the whole local array as the put area.
+            setp( data, data + sizeof(data) );
+        }
+
+        ~StreamBufImpl() CATCH_NOEXCEPT {
+            // Flush anything still buffered.
+            sync();
+        }
+
+    private:
+        int overflow( int c ) {
+            sync();
+
+            if( c != EOF ) {
+                if( pbase() == epptr() )
+                    // Zero-capacity buffer: write the character straight out.
+                    m_writer( std::string( 1, static_cast<char>( c ) ) );
+                else
+                    sputc( static_cast<char>( c ) );
+            }
+            return 0;
+        }
+
+        // Hand any pending characters to the writer and reset the put area.
+        int sync() {
+            if( pbase() != pptr() ) {
+                m_writer( std::string( pbase(), static_cast<std::string::size_type>( pptr() - pbase() ) ) );
+                setp( pbase(), epptr() );
+            }
+            return 0;
+        }
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Opens the named file for writing; throws std::domain_error on failure.
+    FileStream::FileStream( std::string const& filename ) {
+        m_ofs.open( filename.c_str() );
+        if( m_ofs.fail() ) {
+            std::ostringstream oss;
+            oss << "Unable to open file: '" << filename << "'";
+            throw std::domain_error( oss.str() );
+        }
+    }
+
+    std::ostream& FileStream::stream() const {
+        return m_ofs;
+    }
+
+    // Writer functor that routes text to the platform debug console.
+    struct OutputDebugWriter {
+
+        void operator()( std::string const&str ) {
+            writeToDebugConsole( str );
+        }
+    };
+
+    // Stream whose output is forwarded to the debugger/debug console via
+    // the buffering StreamBufImpl above.
+    DebugOutStream::DebugOutStream()
+    :   m_streamBuf( new StreamBufImpl<OutputDebugWriter>() ),
+        m_os( m_streamBuf.get() )
+    {}
+
+    std::ostream& DebugOutStream::stream() const {
+        return m_os;
+    }
+
+    // Store the streambuf from cout up-front because
+    // cout may get redirected when running tests
+    CoutStream::CoutStream()
+    :   m_os( Catch::cout().rdbuf() )
+    {}
+
+    std::ostream& CoutStream::stream() const {
+        return m_os;
+    }
+
+#ifndef CATCH_CONFIG_NOSTDOUT // If you #define this you must implement these functions
+    std::ostream& cout() {
+        return std::cout;
+    }
+    std::ostream& cerr() {
+        return std::cerr;
+    }
+#endif
+}
+
+namespace Catch {
+
+    // Process-wide mutable context: holds the active config, runner and
+    // result capture, plus per-test generator state. Non-copyable singleton,
+    // only constructible via getCurrentMutableContext().
+    class Context : public IMutableContext {
+
+        Context() : m_config( CATCH_NULL ), m_runner( CATCH_NULL ), m_resultCapture( CATCH_NULL ) {}
+        Context( Context const& );
+        void operator=( Context const& );
+
+    public: // IContext
+        virtual IResultCapture* getResultCapture() {
+            return m_resultCapture;
+        }
+        virtual IRunner* getRunner() {
+            return m_runner;
+        }
+        // Current index of the generator identified by fileInfo (creating it
+        // with totalSize entries on first use) for the running test.
+        virtual size_t getGeneratorIndex( std::string const& fileInfo, size_t totalSize ) {
+            return getGeneratorsForCurrentTest()
+            .getGeneratorInfo( fileInfo, totalSize )
+            .getCurrentIndex();
+        }
+        // True if the current test has generators and they produced a new
+        // combination (i.e. the test should run again).
+        virtual bool advanceGeneratorsForCurrentTest() {
+            IGeneratorsForTest* generators = findGeneratorsForCurrentTest();
+            return generators && generators->moveNext();
+        }
+
+        virtual Ptr<IConfig const> getConfig() const {
+            return m_config;
+        }
+
+    public: // IMutableContext
+        virtual void setResultCapture( IResultCapture* resultCapture ) {
+            m_resultCapture = resultCapture;
+        }
+        virtual void setRunner( IRunner* runner ) {
+            m_runner = runner;
+        }
+        virtual void setConfig( Ptr<IConfig const> const& config ) {
+            m_config = config;
+        }
+
+        friend IMutableContext& getCurrentMutableContext();
+
+    private:
+        // Looks up generator state for the currently-running test by name;
+        // CATCH_NULL if the test has none yet.
+        IGeneratorsForTest* findGeneratorsForCurrentTest() {
+            std::string testName = getResultCapture()->getCurrentTestName();
+
+            std::map<std::string, IGeneratorsForTest*>::const_iterator it =
+                m_generatorsByTestName.find( testName );
+            return it != m_generatorsByTestName.end()
+                ? it->second
+                : CATCH_NULL;
+        }
+
+        // As above, but creates (and caches) the generator state on demand.
+        IGeneratorsForTest& getGeneratorsForCurrentTest() {
+            IGeneratorsForTest* generators = findGeneratorsForCurrentTest();
+            if( !generators ) {
+                std::string testName = getResultCapture()->getCurrentTestName();
+                generators = createGeneratorsForTest();
+                m_generatorsByTestName.insert( std::make_pair( testName, generators ) );
+            }
+            return *generators;
+        }
+
+    private:
+        Ptr<IConfig const> m_config;
+        IRunner* m_runner;
+        IResultCapture* m_resultCapture;
+        std::map<std::string, IGeneratorsForTest*> m_generatorsByTestName;
+    };
+
+    namespace {
+        Context* currentContext = CATCH_NULL;
+    }
+    // Lazily creates the singleton context on first access.
+    IMutableContext& getCurrentMutableContext() {
+        if( !currentContext )
+            currentContext = new Context();
+        return *currentContext;
+    }
+    IContext& getCurrentContext() {
+        return getCurrentMutableContext();
+    }
+
+    // Destroys the singleton; a later access lazily recreates it.
+    void cleanUpContext() {
+        delete currentContext;
+        currentContext = CATCH_NULL;
+    }
+}
+
+// #included from: catch_console_colour_impl.hpp
+#define TWOBLUECUBES_CATCH_CONSOLE_COLOUR_IMPL_HPP_INCLUDED
+
+namespace Catch {
+    namespace {
+
+        // Minimal interface for platform console-colour backends.
+        struct IColourImpl {
+            virtual ~IColourImpl() {}
+            virtual void use( Colour::Code _colourCode ) = 0;
+        };
+
+        // Backend used when colour is disabled or unsupported: every colour
+        // request is a no-op.
+        struct NoColourImpl : IColourImpl {
+            void use( Colour::Code ) {}
+
+            static IColourImpl* instance() {
+                static NoColourImpl s_instance;
+                return &s_instance;
+            }
+        };
+
+    } // anon namespace
+} // namespace Catch
+
+#if !defined( CATCH_CONFIG_COLOUR_NONE ) && !defined( CATCH_CONFIG_COLOUR_WINDOWS ) && !defined( CATCH_CONFIG_COLOUR_ANSI )
+#   ifdef CATCH_PLATFORM_WINDOWS
+#       define CATCH_CONFIG_COLOUR_WINDOWS
+#   else
+#       define CATCH_CONFIG_COLOUR_ANSI
+#   endif
+#endif
+
+#if defined ( CATCH_CONFIG_COLOUR_WINDOWS ) /////////////////////////////////////////
+
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+
+#ifdef __AFXDLL
+#include <AfxWin.h>
+#else
+#include <windows.h>
+#endif
+
+namespace Catch {
+namespace {
+
+    // Console-colour backend using the Win32 SetConsoleTextAttribute API.
+    class Win32ColourImpl : public IColourImpl {
+    public:
+        Win32ColourImpl() : stdoutHandle( GetStdHandle(STD_OUTPUT_HANDLE) )
+        {
+            // Capture the console's initial attributes so Colour::None can
+            // restore them; foreground and background bits are separated by
+            // masking out the opposite group.
+            // NOTE(review): the GetConsoleScreenBufferInfo return value is not
+            // checked - csbiInfo may be uninitialised if stdout is not a
+            // console; confirm whether callers guarantee a console here.
+            CONSOLE_SCREEN_BUFFER_INFO csbiInfo;
+            GetConsoleScreenBufferInfo( stdoutHandle, &csbiInfo );
+            originalForegroundAttributes = csbiInfo.wAttributes & ~( BACKGROUND_GREEN | BACKGROUND_RED | BACKGROUND_BLUE | BACKGROUND_INTENSITY );
+            originalBackgroundAttributes = csbiInfo.wAttributes & ~( FOREGROUND_GREEN | FOREGROUND_RED | FOREGROUND_BLUE | FOREGROUND_INTENSITY );
+        }
+
+        virtual void use( Colour::Code _colourCode ) {
+            switch( _colourCode ) {
+                case Colour::None:      return setTextAttribute( originalForegroundAttributes );
+                case Colour::White:     return setTextAttribute( FOREGROUND_GREEN | FOREGROUND_RED | FOREGROUND_BLUE );
+                case Colour::Red:       return setTextAttribute( FOREGROUND_RED );
+                case Colour::Green:     return setTextAttribute( FOREGROUND_GREEN );
+                case Colour::Blue:      return setTextAttribute( FOREGROUND_BLUE );
+                case Colour::Cyan:      return setTextAttribute( FOREGROUND_BLUE | FOREGROUND_GREEN );
+                case Colour::Yellow:    return setTextAttribute( FOREGROUND_RED | FOREGROUND_GREEN );
+                case Colour::Grey:      return setTextAttribute( 0 );
+
+                case Colour::LightGrey:     return setTextAttribute( FOREGROUND_INTENSITY );
+                case Colour::BrightRed:     return setTextAttribute( FOREGROUND_INTENSITY | FOREGROUND_RED );
+                case Colour::BrightGreen:   return setTextAttribute( FOREGROUND_INTENSITY | FOREGROUND_GREEN );
+                case Colour::BrightWhite:   return setTextAttribute( FOREGROUND_INTENSITY | FOREGROUND_GREEN | FOREGROUND_RED | FOREGROUND_BLUE );
+
+                // Bright is a modifier, not a standalone colour.
+                case Colour::Bright: throw std::logic_error( "not a colour" );
+            }
+        }
+
+    private:
+        // Applies the foreground attribute while preserving the console's
+        // original background bits.
+        void setTextAttribute( WORD _textAttribute ) {
+            SetConsoleTextAttribute( stdoutHandle, _textAttribute | originalBackgroundAttributes );
+        }
+        HANDLE stdoutHandle;
+        WORD originalForegroundAttributes;
+        WORD originalBackgroundAttributes;
+    };
+
+    // Chooses the Win32 backend or the no-op backend, honouring the
+    // configured colour mode; Auto enables colour unless a debugger is
+    // attached (debugger output windows do not interpret the attributes).
+    IColourImpl* platformColourInstance() {
+        static Win32ColourImpl s_instance;
+
+        Ptr<IConfig const> config = getCurrentContext().getConfig();
+        UseColour::YesOrNo colourMode = config
+            ? config->useColour()
+            : UseColour::Auto;
+        if( colourMode == UseColour::Auto )
+            colourMode = !isDebuggerActive()
+                ? UseColour::Yes
+                : UseColour::No;
+        return colourMode == UseColour::Yes
+            ? &s_instance
+            : NoColourImpl::instance();
+    }
+
+} // end anon namespace
+} // end namespace Catch
+
+#elif defined( CATCH_CONFIG_COLOUR_ANSI ) //////////////////////////////////////
+
+#include <unistd.h>
+
+namespace Catch {
+namespace {
+
+    // use POSIX/ ANSI console terminal codes
+    // Thanks to Adam Strzelecki for original contribution
+    // (http://github.com/nanoant)
+    // https://github.com/philsquared/Catch/pull/131
+    //
+    // Each colour maps to an SGR sequence; setColour() prefixes the ESC byte.
+    class PosixColourImpl : public IColourImpl {
+    public:
+        virtual void use( Colour::Code _colourCode ) {
+            switch( _colourCode ) {
+                case Colour::None:
+                case Colour::White:     return setColour( "[0m" );
+                case Colour::Red:       return setColour( "[0;31m" );
+                case Colour::Green:     return setColour( "[0;32m" );
+                // Fixed: was "[0:34m" - ':' is not a valid SGR parameter
+                // separator, so blue text never rendered; ANSI uses ';'.
+                case Colour::Blue:      return setColour( "[0;34m" );
+                case Colour::Cyan:      return setColour( "[0;36m" );
+                case Colour::Yellow:    return setColour( "[0;33m" );
+                case Colour::Grey:      return setColour( "[1;30m" );
+
+                case Colour::LightGrey:     return setColour( "[0;37m" );
+                case Colour::BrightRed:     return setColour( "[1;31m" );
+                case Colour::BrightGreen:   return setColour( "[1;32m" );
+                case Colour::BrightWhite:   return setColour( "[1;37m" );
+
+                // Bright is a modifier, not a standalone colour.
+                case Colour::Bright: throw std::logic_error( "not a colour" );
+            }
+        }
+        static IColourImpl* instance() {
+            static PosixColourImpl s_instance;
+            return &s_instance;
+        }
+
+    private:
+        // Emits ESC followed by the SGR sequence on Catch's cout.
+        void setColour( const char* _escapeCode ) {
+            Catch::cout() << '\033' << _escapeCode;
+        }
+    };
+
+    // Chooses the ANSI backend or the no-op backend, honouring the configured
+    // colour mode; Auto enables colour only when stdout is a TTY and no
+    // debugger is attached.
+    IColourImpl* platformColourInstance() {
+        Ptr<IConfig const> config = getCurrentContext().getConfig();
+        UseColour::YesOrNo colourMode = config
+            ? config->useColour()
+            : UseColour::Auto;
+        if( colourMode == UseColour::Auto )
+            colourMode = (!isDebuggerActive() && isatty(STDOUT_FILENO) )
+                ? UseColour::Yes
+                : UseColour::No;
+        return colourMode == UseColour::Yes
+            ? PosixColourImpl::instance()
+            : NoColourImpl::instance();
+    }
+
+} // end anon namespace
+} // end namespace Catch
+
+#else  // not Windows or ANSI ///////////////////////////////////////////////
+
+namespace Catch {
+
+    // Colour support compiled out entirely: always use the no-op backend.
+    static IColourImpl* platformColourInstance() { return NoColourImpl::instance(); }
+
+} // end namespace Catch
+
+#endif // Windows/ ANSI/ None
+
+namespace Catch {
+
+    Colour::Colour( Code _colourCode ) : m_moved( false ) { use( _colourCode ); }
+    // C++98-era move emulation: "copying" flags the source as moved-from so
+    // that only the surviving object resets the colour on destruction.
+    Colour::Colour( Colour const& _other ) : m_moved( false ) { const_cast<Colour&>( _other ).m_moved = true; }
+    Colour::~Colour(){ if( !m_moved ) use( None ); }
+
+    void Colour::use( Code _colourCode ) {
+        // Platform backend is resolved once, on first use.
+        static IColourImpl* impl = platformColourInstance();
+        impl->use( _colourCode );
+    }
+
+} // end namespace Catch
+
+// #included from: catch_generators_impl.hpp
+#define TWOBLUECUBES_CATCH_GENERATORS_IMPL_HPP_INCLUDED
+
+#include <vector>
+#include <string>
+#include <map>
+
+namespace Catch {
+
+    // Tracks the position of a single generator: an index that wraps back to
+    // zero after m_size steps.
+    struct GeneratorInfo : IGeneratorInfo {
+
+        GeneratorInfo( std::size_t size )
+        :   m_size( size ),
+            m_currentIndex( 0 )
+        {}
+
+        // Advances the index; returns false (after resetting to 0) when the
+        // increment wraps past the end - i.e. a carry for the odometer below.
+        bool moveNext() {
+            if( ++m_currentIndex == m_size ) {
+                m_currentIndex = 0;
+                return false;
+            }
+            return true;
+        }
+
+        std::size_t getCurrentIndex() const {
+            return m_currentIndex;
+        }
+
+        std::size_t m_size;
+        std::size_t m_currentIndex;
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // All generator state for one test case, keyed by the source-location
+    // string that identifies each generator expression.
+    class GeneratorsForTest : public IGeneratorsForTest {
+
+    public:
+        ~GeneratorsForTest() {
+            // m_generatorsInOrder owns the GeneratorInfo objects;
+            // m_generatorsByName holds non-owning aliases.
+            deleteAll( m_generatorsInOrder );
+        }
+
+        // Returns the info for fileInfo, creating one with the given size on
+        // first use.
+        IGeneratorInfo& getGeneratorInfo( std::string const& fileInfo, std::size_t size ) {
+            std::map<std::string, IGeneratorInfo*>::const_iterator it = m_generatorsByName.find( fileInfo );
+            if( it == m_generatorsByName.end() ) {
+                IGeneratorInfo* info = new GeneratorInfo( size );
+                m_generatorsByName.insert( std::make_pair( fileInfo, info ) );
+                m_generatorsInOrder.push_back( info );
+                return *info;
+            }
+            return *it->second;
+        }
+
+        // Odometer-style advance across all generators, in registration
+        // order: stop at the first generator that advances without wrapping;
+        // false once every combination has been produced.
+        bool moveNext() {
+            std::vector<IGeneratorInfo*>::const_iterator it = m_generatorsInOrder.begin();
+            std::vector<IGeneratorInfo*>::const_iterator itEnd = m_generatorsInOrder.end();
+            for(; it != itEnd; ++it ) {
+                if( (*it)->moveNext() )
+                    return true;
+            }
+            return false;
+        }
+
+    private:
+        std::map<std::string, IGeneratorInfo*> m_generatorsByName;
+        std::vector<IGeneratorInfo*> m_generatorsInOrder;
+    };
+
+    // Factory used by the Context; caller takes ownership.
+    IGeneratorsForTest* createGeneratorsForTest()
+    {
+        return new GeneratorsForTest();
+    }
+
+} // end namespace Catch
+
+// #included from: catch_assertionresult.hpp
+#define TWOBLUECUBES_CATCH_ASSERTIONRESULT_HPP_INCLUDED
+
+namespace Catch {
+
+    AssertionInfo::AssertionInfo(   std::string const& _macroName,
+                                    SourceLineInfo const& _lineInfo,
+                                    std::string const& _capturedExpression,
+                                    ResultDisposition::Flags _resultDisposition )
+    :   macroName( _macroName ),
+        lineInfo( _lineInfo ),
+        capturedExpression( _capturedExpression ),
+        resultDisposition( _resultDisposition )
+    {}
+
+    AssertionResult::AssertionResult() {}
+
+    AssertionResult::AssertionResult( AssertionInfo const& info, AssertionResultData const& data )
+    :   m_info( info ),
+        m_resultData( data )
+    {}
+
+    AssertionResult::~AssertionResult() {}
+
+    // Result was a success
+    bool AssertionResult::succeeded() const {
+        return Catch::isOk( m_resultData.resultType );
+    }
+
+    // Result was a success, or failure is suppressed
+    bool AssertionResult::isOk() const {
+        return Catch::isOk( m_resultData.resultType ) || shouldSuppressFailure( m_info.resultDisposition );
+    }
+
+    ResultWas::OfType AssertionResult::getResultType() const {
+        return m_resultData.resultType;
+    }
+
+    bool AssertionResult::hasExpression() const {
+        return !m_info.capturedExpression.empty();
+    }
+
+    bool AssertionResult::hasMessage() const {
+        return !m_resultData.message.empty();
+    }
+
+    // The captured expression text, prefixed with "!" for CHECK_FALSE-style
+    // (negated) assertions.
+    std::string AssertionResult::getExpression() const {
+        if( isFalseTest( m_info.resultDisposition ) )
+            return "!" + m_info.capturedExpression;
+        else
+            return m_info.capturedExpression;
+    }
+    // The expression wrapped in its originating macro, e.g. "CHECK( x == y )";
+    // just the expression if no macro name was recorded.
+    std::string AssertionResult::getExpressionInMacro() const {
+        if( m_info.macroName.empty() )
+            return m_info.capturedExpression;
+        else
+            return m_info.macroName + "( " + m_info.capturedExpression + " )";
+    }
+
+    // True when the reconstructed (value-substituted) expression adds
+    // information beyond the source text.
+    bool AssertionResult::hasExpandedExpression() const {
+        return hasExpression() && getExpandedExpression() != getExpression();
+    }
+
+    std::string AssertionResult::getExpandedExpression() const {
+        return m_resultData.reconstructedExpression;
+    }
+
+    std::string AssertionResult::getMessage() const {
+        return m_resultData.message;
+    }
+    SourceLineInfo AssertionResult::getSourceInfo() const {
+        return m_info.lineInfo;
+    }
+
+    std::string AssertionResult::getTestMacroName() const {
+        return m_info.macroName;
+    }
+
+} // end namespace Catch
+
+// #included from: catch_test_case_info.hpp
+#define TWOBLUECUBES_CATCH_TEST_CASE_INFO_HPP_INCLUDED
+
+namespace Catch {
+
+    // Maps a tag name (without brackets) to the special property it encodes,
+    // or None for an ordinary user tag.
+    inline TestCaseInfo::SpecialProperties parseSpecialTag( std::string const& tag ) {
+        if( startsWith( tag, "." ) ||
+            tag == "hide" ||
+            tag == "!hide" )
+            return TestCaseInfo::IsHidden;
+        else if( tag == "!throws" )
+            return TestCaseInfo::Throws;
+        else if( tag == "!shouldfail" )
+            return TestCaseInfo::ShouldFail;
+        else if( tag == "!mayfail" )
+            return TestCaseInfo::MayFail;
+        else
+            return TestCaseInfo::None;
+    }
+    // A tag whose first character is non-alphanumeric is reserved for Catch,
+    // unless it is one of the recognised special tags above.
+    // Fixed: isalnum() has undefined behaviour for negative char values (e.g.
+    // UTF-8 lead bytes where char is signed), so widen via unsigned char.
+    inline bool isReservedTag( std::string const& tag ) {
+        return parseSpecialTag( tag ) == TestCaseInfo::None && tag.size() > 0 && !isalnum( static_cast<unsigned char>( tag[0] ) );
+    }
+    // Reports a reserved-tag violation in red with its source location, then
+    // terminates the process.
+    inline void enforceNotReservedTag( std::string const& tag, SourceLineInfo const& _lineInfo ) {
+        if( isReservedTag( tag ) ) {
+            {
+                Colour colourGuard( Colour::Red );
+                Catch::cerr()
+                    << "Tag name [" << tag << "] not allowed.\n"
+                    << "Tag names starting with non alpha-numeric characters are reserved\n";
+            }
+            {
+                Colour colourGuard( Colour::FileName );
+                Catch::cerr() << _lineInfo << std::endl;
+            }
+            exit(1);
+        }
+    }
+
+    // Builds a TestCase from the raw registration data, splitting _descOrTags
+    // into a free-text description and the "[tag]" entries embedded in it.
+    TestCase makeTestCase(  ITestCase* _testCase,
+                            std::string const& _className,
+                            std::string const& _name,
+                            std::string const& _descOrTags,
+                            SourceLineInfo const& _lineInfo )
+    {
+        bool isHidden( startsWith( _name, "./" ) ); // Legacy support
+
+        // Parse out tags
+        std::set<std::string> tags;
+        std::string desc, tag;
+        bool inTag = false;
+        for( std::size_t i = 0; i < _descOrTags.size(); ++i ) {
+            char c = _descOrTags[i];
+            if( !inTag ) {
+                if( c == '[' )
+                    inTag = true;
+                else
+                    desc += c;
+            }
+            else {
+                if( c == ']' ) {
+                    // Closed a tag: classify it, reject reserved names, and
+                    // record it.
+                    TestCaseInfo::SpecialProperties prop = parseSpecialTag( tag );
+                    if( prop == TestCaseInfo::IsHidden )
+                        isHidden = true;
+                    else if( prop == TestCaseInfo::None )
+                        enforceNotReservedTag( tag, _lineInfo );
+
+                    tags.insert( tag );
+                    tag.clear();
+                    inTag = false;
+                }
+                else
+                    tag += c;
+            }
+        }
+        if( isHidden ) {
+            // Hidden tests carry both spellings of the hide tag.
+            tags.insert( "hide" );
+            tags.insert( "." );
+        }
+
+        TestCaseInfo info( _name, _className, desc, tags, _lineInfo );
+        return TestCase( _testCase, info );
+    }
+
+    // Re-derives all tag-dependent fields of testCaseInfo from tags:
+    // the lower-cased tag set, the "[a][b]" display string and the combined
+    // special-property flags.
+    void setTags( TestCaseInfo& testCaseInfo, std::set<std::string> const& tags )
+    {
+        testCaseInfo.tags = tags;
+        testCaseInfo.lcaseTags.clear();
+
+        std::ostringstream oss;
+        for( std::set<std::string>::const_iterator it = tags.begin(), itEnd = tags.end(); it != itEnd; ++it ) {
+            oss << "[" << *it << "]";
+            std::string lcaseTag = toLower( *it );
+            testCaseInfo.properties = static_cast<TestCaseInfo::SpecialProperties>( testCaseInfo.properties | parseSpecialTag( lcaseTag ) );
+            testCaseInfo.lcaseTags.insert( lcaseTag );
+        }
+        testCaseInfo.tagsAsString = oss.str();
+    }
+
+    TestCaseInfo::TestCaseInfo( std::string const& _name,
+                                std::string const& _className,
+                                std::string const& _description,
+                                std::set<std::string> const& _tags,
+                                SourceLineInfo const& _lineInfo )
+    :   name( _name ),
+        className( _className ),
+        description( _description ),
+        lineInfo( _lineInfo ),
+        properties( None )
+    {
+        // Populates tags, lcaseTags, tagsAsString and properties.
+        setTags( *this, _tags );
+    }
+
+    TestCaseInfo::TestCaseInfo( TestCaseInfo const& other )
+    :   name( other.name ),
+        className( other.className ),
+        description( other.description ),
+        tags( other.tags ),
+        lcaseTags( other.lcaseTags ),
+        tagsAsString( other.tagsAsString ),
+        lineInfo( other.lineInfo ),
+        properties( other.properties )
+    {}
+
+    bool TestCaseInfo::isHidden() const {
+        return ( properties & IsHidden ) != 0;
+    }
+    bool TestCaseInfo::throws() const {
+        return ( properties & Throws ) != 0;
+    }
+    bool TestCaseInfo::okToFail() const {
+        return ( properties & (ShouldFail | MayFail ) ) != 0;
+    }
+    bool TestCaseInfo::expectedToFail() const {
+        return ( properties & (ShouldFail ) ) != 0;
+    }
+
+    TestCase::TestCase( ITestCase* testCase, TestCaseInfo const& info ) : TestCaseInfo( info ), test( testCase ) {}
+
+    TestCase::TestCase( TestCase const& other )
+    :   TestCaseInfo( other ),
+        test( other.test )
+    {}
+
+    // Copy of this test case under a different name (used for name
+    // uniquification).
+    TestCase TestCase::withName( std::string const& _newName ) const {
+        TestCase other( *this );
+        other.name = _newName;
+        return other;
+    }
+
+    void TestCase::swap( TestCase& other ) {
+        test.swap( other.test );
+        name.swap( other.name );
+        className.swap( other.className );
+        description.swap( other.description );
+        tags.swap( other.tags );
+        lcaseTags.swap( other.lcaseTags );
+        tagsAsString.swap( other.tagsAsString );
+        std::swap( TestCaseInfo::properties, static_cast<TestCaseInfo&>( other ).properties );
+        std::swap( lineInfo, other.lineInfo );
+    }
+
+    void TestCase::invoke() const {
+        test->invoke();
+    }
+
+    // Identity: same underlying test object, name and class name.
+    bool TestCase::operator == ( TestCase const& other ) const {
+        return  test.get() == other.test.get() &&
+                name == other.name &&
+                className == other.className;
+    }
+
+    // Ordering by name only (used for sorting test lists).
+    bool TestCase::operator < ( TestCase const& other ) const {
+        return name < other.name;
+    }
+    // Copy-and-swap assignment.
+    TestCase& TestCase::operator = ( TestCase const& other ) {
+        TestCase temp( other );
+        swap( temp );
+        return *this;
+    }
+
+    TestCaseInfo const& TestCase::getTestCaseInfo() const
+    {
+        return *this;
+    }
+
+} // end namespace Catch
+
+// #included from: catch_version.hpp
+#define TWOBLUECUBES_CATCH_VERSION_HPP_INCLUDED
+
+namespace Catch {
+
+    Version::Version
+        (   unsigned int _majorVersion,
+            unsigned int _minorVersion,
+            unsigned int _patchNumber,
+            std::string const& _branchName,
+            unsigned int _buildNumber )
+    :   majorVersion( _majorVersion ),
+        minorVersion( _minorVersion ),
+        patchNumber( _patchNumber ),
+        branchName( _branchName ),
+        buildNumber( _buildNumber )
+    {}
+
+    // Formats as "major.minor.patch", with "-branch.build" appended for
+    // non-release (branch) builds.
+    std::ostream& operator << ( std::ostream& os, Version const& version ) {
+        os  << version.majorVersion << "."
+            << version.minorVersion << "."
+            << version.patchNumber;
+
+        if( !version.branchName.empty() ) {
+            os  << "-" << version.branchName
+                << "." << version.buildNumber;
+        }
+        return os;
+    }
+
+    // The version of Catch this header was generated from.
+    Version libraryVersion( 1, 4, 0, "", 0 );
+
+}
+
+// #included from: catch_message.hpp
+#define TWOBLUECUBES_CATCH_MESSAGE_HPP_INCLUDED
+
+namespace Catch {
+
+    MessageInfo::MessageInfo(   std::string const& _macroName,
+                                SourceLineInfo const& _lineInfo,
+                                ResultWas::OfType _type )
+    :   macroName( _macroName ),
+        lineInfo( _lineInfo ),
+        type( _type ),
+        // Monotonic sequence number used to order messages by creation.
+        sequence( ++globalCount )
+    {}
+
+    // This may need protecting if threading support is added
+    unsigned int MessageInfo::globalCount = 0;
+
+    ////////////////////////////////////////////////////////////////////////////
+
+    // RAII: pushes the built message onto the active result capture on
+    // construction and pops it on destruction.
+    ScopedMessage::ScopedMessage( MessageBuilder const& builder )
+    : m_info( builder.m_info )
+    {
+        m_info.message = builder.m_stream.str();
+        getResultCapture().pushScopedMessage( m_info );
+    }
+    // NOTE(review): the copy does not push a second message, yet both the
+    // copy and the original pop on destruction - presumably copies only occur
+    // where the source is immediately destroyed; confirm against callers.
+    ScopedMessage::ScopedMessage( ScopedMessage const& other )
+    : m_info( other.m_info )
+    {}
+
+    ScopedMessage::~ScopedMessage() {
+        getResultCapture().popScopedMessage( m_info );
+    }
+
+} // end namespace Catch
+
+// #included from: catch_legacy_reporter_adapter.hpp
+#define TWOBLUECUBES_CATCH_LEGACY_REPORTER_ADAPTER_HPP_INCLUDED
+
+// #included from: catch_legacy_reporter_adapter.h
+#define TWOBLUECUBES_CATCH_LEGACY_REPORTER_ADAPTER_H_INCLUDED
+
+namespace Catch
+{
+    // Deprecated
+    // Pre-streaming reporter interface, kept for backward compatibility and
+    // driven through LegacyReporterAdapter below.
+    struct IReporter : IShared {
+        virtual ~IReporter();
+
+        virtual bool shouldRedirectStdout() const = 0;
+
+        virtual void StartTesting() = 0;
+        virtual void EndTesting( Totals const& totals ) = 0;
+        virtual void StartGroup( std::string const& groupName ) = 0;
+        virtual void EndGroup( std::string const& groupName, Totals const& totals ) = 0;
+        virtual void StartTestCase( TestCaseInfo const& testInfo ) = 0;
+        virtual void EndTestCase( TestCaseInfo const& testInfo, Totals const& totals, std::string const& stdOut, std::string const& stdErr ) = 0;
+        virtual void StartSection( std::string const& sectionName, std::string const& description ) = 0;
+        virtual void EndSection( std::string const& sectionName, Counts const& assertions ) = 0;
+        virtual void NoAssertionsInSection( std::string const& sectionName ) = 0;
+        virtual void NoAssertionsInTestCase( std::string const& testName ) = 0;
+        virtual void Aborted() = 0;
+        virtual void Result( AssertionResult const& result ) = 0;
+    };
+
+    // Adapts a legacy IReporter to the modern IStreamingReporter interface by
+    // translating each streaming event into the corresponding legacy call.
+    class LegacyReporterAdapter : public SharedImpl<IStreamingReporter>
+    {
+    public:
+        LegacyReporterAdapter( Ptr<IReporter> const& legacyReporter );
+        virtual ~LegacyReporterAdapter();
+
+        virtual ReporterPreferences getPreferences() const;
+        virtual void noMatchingTestCases( std::string const& );
+        virtual void testRunStarting( TestRunInfo const& );
+        virtual void testGroupStarting( GroupInfo const& groupInfo );
+        virtual void testCaseStarting( TestCaseInfo const& testInfo );
+        virtual void sectionStarting( SectionInfo const& sectionInfo );
+        virtual void assertionStarting( AssertionInfo const& );
+        virtual bool assertionEnded( AssertionStats const& assertionStats );
+        virtual void sectionEnded( SectionStats const& sectionStats );
+        virtual void testCaseEnded( TestCaseStats const& testCaseStats );
+        virtual void testGroupEnded( TestGroupStats const& testGroupStats );
+        virtual void testRunEnded( TestRunStats const& testRunStats );
+        virtual void skipTest( TestCaseInfo const& );
+
+    private:
+        Ptr<IReporter> m_legacyReporter;
+    };
+}
+
+namespace Catch
+{
+    // --- LegacyReporterAdapter implementation ---
+    // Each streaming-reporter event maps 1:1 onto a legacy IReporter call;
+    // events with no legacy equivalent are no-ops.
+    LegacyReporterAdapter::LegacyReporterAdapter( Ptr<IReporter> const& legacyReporter )
+    :   m_legacyReporter( legacyReporter )
+    {}
+    LegacyReporterAdapter::~LegacyReporterAdapter() {}
+
+    // Translate the single preference the legacy interface exposes.
+    ReporterPreferences LegacyReporterAdapter::getPreferences() const {
+        ReporterPreferences prefs;
+        prefs.shouldRedirectStdOut = m_legacyReporter->shouldRedirectStdout();
+        return prefs;
+    }
+
+    // Not representable on the legacy interface - intentionally a no-op.
+    void LegacyReporterAdapter::noMatchingTestCases( std::string const& ) {}
+    void LegacyReporterAdapter::testRunStarting( TestRunInfo const& ) {
+        m_legacyReporter->StartTesting();
+    }
+    void LegacyReporterAdapter::testGroupStarting( GroupInfo const& groupInfo ) {
+        m_legacyReporter->StartGroup( groupInfo.name );
+    }
+    void LegacyReporterAdapter::testCaseStarting( TestCaseInfo const& testInfo ) {
+        m_legacyReporter->StartTestCase( testInfo );
+    }
+    void LegacyReporterAdapter::sectionStarting( SectionInfo const& sectionInfo ) {
+        m_legacyReporter->StartSection( sectionInfo.name, sectionInfo.description );
+    }
+    void LegacyReporterAdapter::assertionStarting( AssertionInfo const& ) {
+        // Not on legacy interface
+    }
+
+    // For failed assertions, first replay any buffered Info messages as
+    // individual Info results (the legacy interface has no message buffer),
+    // then forward the assertion result itself. Returning true indicates the
+    // caller's messages buffer may be cleared.
+    bool LegacyReporterAdapter::assertionEnded( AssertionStats const& assertionStats ) {
+        if( assertionStats.assertionResult.getResultType() != ResultWas::Ok ) {
+            for( std::vector<MessageInfo>::const_iterator it = assertionStats.infoMessages.begin(), itEnd = assertionStats.infoMessages.end();
+                    it != itEnd;
+                    ++it ) {
+                if( it->type == ResultWas::Info ) {
+                    ResultBuilder rb( it->macroName.c_str(), it->lineInfo, "", ResultDisposition::Normal );
+                    rb << it->message;
+                    rb.setResultType( ResultWas::Info );
+                    AssertionResult result = rb.build();
+                    m_legacyReporter->Result( result );
+                }
+            }
+        }
+        m_legacyReporter->Result( assertionStats.assertionResult );
+        return true;
+    }
+    void LegacyReporterAdapter::sectionEnded( SectionStats const& sectionStats ) {
+        // Report a section that ran without any assertions before closing it.
+        if( sectionStats.missingAssertions )
+            m_legacyReporter->NoAssertionsInSection( sectionStats.sectionInfo.name );
+        m_legacyReporter->EndSection( sectionStats.sectionInfo.name, sectionStats.assertions );
+    }
+    void LegacyReporterAdapter::testCaseEnded( TestCaseStats const& testCaseStats ) {
+        m_legacyReporter->EndTestCase
+            (   testCaseStats.testInfo,
+                testCaseStats.totals,
+                testCaseStats.stdOut,
+                testCaseStats.stdErr );
+    }
+    void LegacyReporterAdapter::testGroupEnded( TestGroupStats const& testGroupStats ) {
+        // Signal an aborted run (e.g. abort-after limit hit) before closing the group.
+        if( testGroupStats.aborting )
+            m_legacyReporter->Aborted();
+        m_legacyReporter->EndGroup( testGroupStats.groupInfo.name, testGroupStats.totals );
+    }
+    void LegacyReporterAdapter::testRunEnded( TestRunStats const& testRunStats ) {
+        m_legacyReporter->EndTesting( testRunStats.totals );
+    }
+    // Not representable on the legacy interface - intentionally a no-op.
+    void LegacyReporterAdapter::skipTest( TestCaseInfo const& ) {
+    }
+}
+
+// #included from: catch_timer.hpp
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-long-long"
+#endif
+
+#ifdef CATCH_PLATFORM_WINDOWS
+#include <windows.h>
+#else
+#include <sys/time.h>
+#endif
+
+namespace Catch {
+
+    namespace {
+#ifdef CATCH_PLATFORM_WINDOWS
+        // Microseconds since an arbitrary epoch, derived from the Win32
+        // high-resolution performance counter, normalised by its frequency.
+        // NOTE(review): the lazy init of the static hz/hzo pair is not
+        // guarded; presumably single-threaded first use - confirm callers.
+        uint64_t getCurrentTicks() {
+            static uint64_t hz=0, hzo=0;
+            if (!hz) {
+                QueryPerformanceFrequency( reinterpret_cast<LARGE_INTEGER*>( &hz ) );
+                QueryPerformanceCounter( reinterpret_cast<LARGE_INTEGER*>( &hzo ) );
+            }
+            uint64_t t;
+            QueryPerformanceCounter( reinterpret_cast<LARGE_INTEGER*>( &t ) );
+            return ((t-hzo)*1000000)/hz;
+        }
+#else
+        // Microseconds since the Unix epoch, via gettimeofday.
+        uint64_t getCurrentTicks() {
+            timeval t;
+            gettimeofday(&t,CATCH_NULL);
+            return static_cast<uint64_t>( t.tv_sec ) * 1000000ull + static_cast<uint64_t>( t.tv_usec );
+        }
+#endif
+    }
+
+    // Record the current tick count as the reference point for elapsed queries.
+    void Timer::start() {
+        m_ticks = getCurrentTicks();
+    }
+    // Elapsed time since start(). NOTE(review): the 64->unsigned-int cast
+    // truncates very long intervals - presumably acceptable for test timing.
+    unsigned int Timer::getElapsedMicroseconds() const {
+        return static_cast<unsigned int>(getCurrentTicks() - m_ticks);
+    }
+    unsigned int Timer::getElapsedMilliseconds() const {
+        return static_cast<unsigned int>(getElapsedMicroseconds()/1000);
+    }
+    double Timer::getElapsedSeconds() const {
+        return getElapsedMicroseconds()/1000000.0;
+    }
+
+} // namespace Catch
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+// #included from: catch_common.hpp
+#define TWOBLUECUBES_CATCH_COMMON_HPP_INCLUDED
+
+namespace Catch {
+
+    // --- Small string/utility helpers shared across the framework ---
+
+    bool startsWith( std::string const& s, std::string const& prefix ) {
+        return s.size() >= prefix.size() && s.substr( 0, prefix.size() ) == prefix;
+    }
+    bool endsWith( std::string const& s, std::string const& suffix ) {
+        return s.size() >= suffix.size() && s.substr( s.size()-suffix.size(), suffix.size() ) == suffix;
+    }
+    bool contains( std::string const& s, std::string const& infix ) {
+        return s.find( infix ) != std::string::npos;
+    }
+    void toLowerInPlace( std::string& s ) {
+        std::transform( s.begin(), s.end(), s.begin(), ::tolower );
+    }
+    std::string toLower( std::string const& s ) {
+        std::string lc = s;
+        toLowerInPlace( lc );
+        return lc;
+    }
+    // Strip leading/trailing whitespace (newline, CR, tab, space); returns ""
+    // for an all-whitespace input (start == npos).
+    std::string trim( std::string const& str ) {
+        static char const* whitespaceChars = "\n\r\t ";
+        std::string::size_type start = str.find_first_not_of( whitespaceChars );
+        std::string::size_type end = str.find_last_not_of( whitespaceChars );
+
+        return start != std::string::npos ? str.substr( start, 1+end-start ) : "";
+    }
+
+    // Replace every occurrence of replaceThis with withThis, in place.
+    // Returns true if at least one replacement happened. The search resumes
+    // after the inserted text so replacements cannot recurse into themselves.
+    // NOTE(review): the guard below relies on unsigned arithmetic; when
+    // withThis is longer than the remaining string the subtraction wraps,
+    // which still behaves correctly because find() then returns npos.
+    bool replaceInPlace( std::string& str, std::string const& replaceThis, std::string const& withThis ) {
+        bool replaced = false;
+        std::size_t i = str.find( replaceThis );
+        while( i != std::string::npos ) {
+            replaced = true;
+            str = str.substr( 0, i ) + withThis + str.substr( i+replaceThis.size() );
+            if( i < str.size()-withThis.size() )
+                i = str.find( replaceThis, i+withThis.size() );
+            else
+                i = std::string::npos;
+        }
+        return replaced;
+    }
+
+    pluralise::pluralise( std::size_t count, std::string const& label )
+    :   m_count( count ),
+        m_label( label )
+    {}
+
+    // Streams "<count> <label>" with a plural 's' appended unless count == 1.
+    std::ostream& operator << ( std::ostream& os, pluralise const& pluraliser ) {
+        os << pluraliser.m_count << " " << pluraliser.m_label;
+        if( pluraliser.m_count != 1 )
+            os << "s";
+        return os;
+    }
+
+    SourceLineInfo::SourceLineInfo() : line( 0 ){}
+    SourceLineInfo::SourceLineInfo( char const* _file, std::size_t _line )
+    :   file( _file ),
+        line( _line )
+    {}
+    SourceLineInfo::SourceLineInfo( SourceLineInfo const& other )
+    :   file( other.file ),
+        line( other.line )
+    {}
+    // "Empty" is defined purely by the absence of a file name.
+    bool SourceLineInfo::empty() const {
+        return file.empty();
+    }
+    bool SourceLineInfo::operator == ( SourceLineInfo const& other ) const {
+        return line == other.line && file == other.file;
+    }
+    // Ordering: line number first, then file name as the tie-breaker.
+    bool SourceLineInfo::operator < ( SourceLineInfo const& other ) const {
+        return line < other.line || ( line == other.line  && file < other.file );
+    }
+
+    // Seed the C RNG from config; a seed of 0 means "do not seed".
+    void seedRng( IConfig const& config ) {
+        if( config.rngSeed() != 0 )
+            std::srand( config.rngSeed() );
+    }
+    unsigned int rngSeed() {
+        return getCurrentContext().getConfig()->rngSeed();
+    }
+
+    // Print as "file(line)" on MSVC-style compilers, "file:line" on GCC-style,
+    // matching each toolchain's native diagnostic format (clickable in IDEs).
+    std::ostream& operator << ( std::ostream& os, SourceLineInfo const& info ) {
+#ifndef __GNUG__
+        os << info.file << "(" << info.line << ")";
+#else
+        os << info.file << ":" << info.line;
+#endif
+        return os;
+    }
+
+    // NOTE(review): the alwaysTrue() guard presumably exists to suppress
+    // unreachable-code / missing-return warnings around the throw - confirm.
+    void throwLogicError( std::string const& message, SourceLineInfo const& locationInfo ) {
+        std::ostringstream oss;
+        oss << locationInfo << ": Internal Catch error: '" << message << "'";
+        if( alwaysTrue() )
+            throw std::logic_error( oss.str() );
+    }
+}
+
+// #included from: catch_section.hpp
+#define TWOBLUECUBES_CATCH_SECTION_HPP_INCLUDED
+
+namespace Catch {
+
+    SectionInfo::SectionInfo
+        (   SourceLineInfo const& _lineInfo,
+            std::string const& _name,
+            std::string const& _description )
+    :   name( _name ),
+        description( _description ),
+        lineInfo( _lineInfo )
+    {}
+
+    // RAII guard for a SECTION block. Registers the section with the result
+    // capture; sectionStarted() decides whether this section runs on the
+    // current pass, and the timer starts immediately after.
+    Section::Section( SectionInfo const& info )
+    :   m_info( info ),
+        m_sectionIncluded( getResultCapture().sectionStarted( m_info, m_assertions ) )
+    {
+        m_timer.start();
+    }
+
+    // Close out the section (if it actually ran), reporting its assertion
+    // counts and elapsed time. When unwinding due to an exception, the early
+    // variant is used so the runner knows the section did not end normally.
+    Section::~Section() {
+        if( m_sectionIncluded ) {
+            SectionEndInfo endInfo( m_info, m_assertions, m_timer.getElapsedSeconds() );
+            if( std::uncaught_exception() )
+                getResultCapture().sectionEndedEarly( endInfo );
+            else
+                getResultCapture().sectionEnded( endInfo );
+        }
+    }
+
+    // This indicates whether the section should be executed or not
+    Section::operator bool() const {
+        return m_sectionIncluded;
+    }
+
+} // end namespace Catch
+
+// #included from: catch_debugger.hpp
+#define TWOBLUECUBES_CATCH_DEBUGGER_HPP_INCLUDED
+
+#include <iostream>
+
+// Platform-specific detection of an attached debugger. One definition of
+// Catch::isDebuggerActive() is selected per platform; unsupported platforms
+// fall through to a stub that always reports false.
+#ifdef CATCH_PLATFORM_MAC
+
+    #include <assert.h>
+    #include <stdbool.h>
+    #include <sys/types.h>
+    #include <unistd.h>
+    #include <sys/sysctl.h>
+
+    namespace Catch{
+
+        // The following function is taken directly from the following technical note:
+        // http://developer.apple.com/library/mac/#qa/qa2004/qa1361.html
+
+        // Returns true if the current process is being debugged (either
+        // running under the debugger or has a debugger attached post facto).
+        bool isDebuggerActive(){
+
+            int                 mib[4];
+            struct kinfo_proc   info;
+            size_t              size;
+
+            // Initialize the flags so that, if sysctl fails for some bizarre
+            // reason, we get a predictable result.
+
+            info.kp_proc.p_flag = 0;
+
+            // Initialize mib, which tells sysctl the info we want, in this case
+            // we're looking for information about a specific process ID.
+
+            mib[0] = CTL_KERN;
+            mib[1] = KERN_PROC;
+            mib[2] = KERN_PROC_PID;
+            mib[3] = getpid();
+
+            // Call sysctl.
+
+            size = sizeof(info);
+            if( sysctl(mib, sizeof(mib) / sizeof(*mib), &info, &size, CATCH_NULL, 0) != 0 ) {
+                Catch::cerr() << "\n** Call to sysctl failed - unable to determine if debugger is active **\n" << std::endl;
+                return false;
+            }
+
+            // We're being debugged if the P_TRACED flag is set.
+
+            return ( (info.kp_proc.p_flag & P_TRACED) != 0 );
+        }
+    } // namespace Catch
+
+#elif defined(_MSC_VER)
+    // Declared by hand to avoid pulling in <windows.h> just for this symbol.
+    extern "C" __declspec(dllimport) int __stdcall IsDebuggerPresent();
+    namespace Catch {
+        bool isDebuggerActive() {
+            return IsDebuggerPresent() != 0;
+        }
+    }
+#elif defined(__MINGW32__)
+    // Same Win32 API as above, but MinGW needs its own declaration path.
+    extern "C" __declspec(dllimport) int __stdcall IsDebuggerPresent();
+    namespace Catch {
+        bool isDebuggerActive() {
+            return IsDebuggerPresent() != 0;
+        }
+    }
+#else
+    // No detection available on this platform: conservatively report "no".
+    namespace Catch {
+       inline bool isDebuggerActive() { return false; }
+    }
+#endif // Platform
+
+// Route text to the debugger's output window on Windows; elsewhere fall back
+// to Catch's standard output stream.
+#ifdef CATCH_PLATFORM_WINDOWS
+    extern "C" __declspec(dllimport) void __stdcall OutputDebugStringA( const char* );
+    namespace Catch {
+        void writeToDebugConsole( std::string const& text ) {
+            ::OutputDebugStringA( text.c_str() );
+        }
+    }
+#else
+    namespace Catch {
+        void writeToDebugConsole( std::string const& text ) {
+            // !TBD: Need a version for Mac/ XCode and other IDEs
+            Catch::cout() << text;
+        }
+    }
+#endif // Platform
+
+// #included from: catch_tostring.hpp
+#define TWOBLUECUBES_CATCH_TOSTRING_HPP_INCLUDED
+
+namespace Catch {
+
+namespace Detail {
+
+    // Placeholder printed for values with no usable string conversion.
+    const std::string unprintableString = "{?}";
+
+    namespace {
+        // Integers above this also get a parenthesised hex rendering.
+        const int hexThreshold = 255;
+
+        // Runtime endianness probe: write 1 into an int and inspect which
+        // byte it landed in.
+        struct Endianness {
+            enum Arch { Big, Little };
+
+            static Arch which() {
+                union _{
+                    int asInt;
+                    char asChar[sizeof (int)];
+                } u;
+
+                u.asInt = 1;
+                return ( u.asChar[sizeof(int)-1] == 1 ) ? Big : Little;
+            }
+        };
+    }
+
+    // Hex-dump an arbitrary object's bytes as "0x....", iterating the bytes
+    // in reverse on little-endian machines so the output reads most-
+    // significant byte first either way.
+    std::string rawMemoryToString( const void *object, std::size_t size )
+    {
+        // Reverse order for little endian architectures
+        int i = 0, end = static_cast<int>( size ), inc = 1;
+        if( Endianness::which() == Endianness::Little ) {
+            i = end-1;
+            end = inc = -1;
+        }
+
+        unsigned char const *bytes = static_cast<unsigned char const *>(object);
+        std::ostringstream os;
+        os << "0x" << std::setfill('0') << std::hex;
+        for( ; i != end; i += inc )
+             os << std::setw(2) << static_cast<unsigned>(bytes[i]);
+       return os.str();
+    }
+}
+
+// Quote the string; if the "show invisibles" option is on, make newlines and
+// tabs visible as the escape sequences \n and \t.
+std::string toString( std::string const& value ) {
+    std::string s = value;
+    if( getCurrentContext().getConfig()->showInvisibles() ) {
+        for(size_t i = 0; i < s.size(); ++i ) {
+            std::string subs;
+            switch( s[i] ) {
+            case '\n': subs = "\\n"; break;
+            case '\t': subs = "\\t"; break;
+            default: break;
+            }
+            if( !subs.empty() ) {
+                s = s.substr( 0, i ) + subs + s.substr( i+1 );
+                ++i;
+            }
+        }
+    }
+    return "\"" + s + "\"";
+}
+// Lossy wide-to-narrow conversion: characters above 0xff become '?'.
+std::string toString( std::wstring const& value ) {
+
+    std::string s;
+    s.reserve( value.size() );
+    for(size_t i = 0; i < value.size(); ++i )
+        s += value[i] <= 0xff ? static_cast<char>( value[i] ) : '?';
+    return Catch::toString( s );
+}
+
+std::string toString( const char* const value ) {
+    return value ? Catch::toString( std::string( value ) ) : std::string( "{null string}" );
+}
+
+std::string toString( char* const value ) {
+    return Catch::toString( static_cast<const char*>( value ) );
+}
+
+std::string toString( const wchar_t* const value )
+{
+	return value ? Catch::toString( std::wstring(value) ) : std::string( "{null string}" );
+}
+
+std::string toString( wchar_t* const value )
+{
+	return Catch::toString( static_cast<const wchar_t*>( value ) );
+}
+
+// Integral overloads: decimal, plus " (0x..)" for values above hexThreshold.
+std::string toString( int value ) {
+    std::ostringstream oss;
+    oss << value;
+    if( value > Detail::hexThreshold )
+        oss << " (0x" << std::hex << value << ")";
+    return oss.str();
+}
+
+std::string toString( unsigned long value ) {
+    std::ostringstream oss;
+    oss << value;
+    if( value > Detail::hexThreshold )
+        oss << " (0x" << std::hex << value << ")";
+    return oss.str();
+}
+
+std::string toString( unsigned int value ) {
+    return Catch::toString( static_cast<unsigned long>( value ) );
+}
+
+// Fixed-precision float formatting with trailing zeros trimmed (but at least
+// one digit kept after the decimal point).
+template<typename T>
+std::string fpToString( T value, int precision ) {
+    std::ostringstream oss;
+    oss << std::setprecision( precision )
+        << std::fixed
+        << value;
+    std::string d = oss.str();
+    std::size_t i = d.find_last_not_of( '0' );
+    if( i != std::string::npos && i != d.size()-1 ) {
+        if( d[i] == '.' )
+            i++;
+        d = d.substr( 0, i+1 );
+    }
+    return d;
+}
+
+std::string toString( const double value ) {
+    return fpToString( value, 10 );
+}
+std::string toString( const float value ) {
+    return fpToString( value, 5 ) + "f";
+}
+
+std::string toString( bool value ) {
+    return value ? "true" : "false";
+}
+
+// Control characters (below space) are shown numerically; printable chars go
+// through the quoting path.
+std::string toString( char value ) {
+    return value < ' '
+        ? toString( static_cast<unsigned int>( value ) )
+        : Detail::makeString( value );
+}
+
+std::string toString( signed char value ) {
+    return toString( static_cast<char>( value ) );
+}
+
+std::string toString( unsigned char value ) {
+    return toString( static_cast<char>( value ) );
+}
+
+#ifdef CATCH_CONFIG_CPP11_LONG_LONG
+std::string toString( long long value ) {
+    std::ostringstream oss;
+    oss << value;
+    if( value > Detail::hexThreshold )
+        oss << " (0x" << std::hex << value << ")";
+    return oss.str();
+}
+std::string toString( unsigned long long value ) {
+    std::ostringstream oss;
+    oss << value;
+    if( value > Detail::hexThreshold )
+        oss << " (0x" << std::hex << value << ")";
+    return oss.str();
+}
+#endif
+
+#ifdef CATCH_CONFIG_CPP11_NULLPTR
+std::string toString( std::nullptr_t ) {
+    return "nullptr";
+}
+#endif
+
+#ifdef __OBJC__
+    // Objective-C string/object conversions, only when compiled as ObjC++.
+    std::string toString( NSString const * const& nsstring ) {
+        if( !nsstring )
+            return "nil";
+        return "@" + toString([nsstring UTF8String]);
+    }
+    std::string toString( NSString * CATCH_ARC_STRONG const& nsstring ) {
+        if( !nsstring )
+            return "nil";
+        return "@" + toString([nsstring UTF8String]);
+    }
+    std::string toString( NSObject* const& nsObject ) {
+        return toString( [nsObject description] );
+    }
+#endif
+
+} // end namespace Catch
+
+// #included from: catch_result_builder.hpp
+#define TWOBLUECUBES_CATCH_RESULT_BUILDER_HPP_INCLUDED
+
+namespace Catch {
+
+    // Join a macro's captured expression with its optional second argument
+    // (e.g. the matcher text), unless the second argument is absent or the
+    // literal empty string token "".
+    std::string capturedExpressionWithSecondArgument( std::string const& capturedExpression, std::string const& secondArg ) {
+        return secondArg.empty() || secondArg == "\"\""
+            ? capturedExpression
+            : capturedExpression + ", " + secondArg;
+    }
+    ResultBuilder::ResultBuilder(   char const* macroName,
+                                    SourceLineInfo const& lineInfo,
+                                    char const* capturedExpression,
+                                    ResultDisposition::Flags resultDisposition,
+                                    char const* secondArg )
+    :   m_assertionInfo( macroName, lineInfo, capturedExpressionWithSecondArgument( capturedExpression, secondArg ), resultDisposition ),
+        m_shouldDebugBreak( false ),
+        m_shouldThrow( false )
+    {}
+
+    // --- Fluent setters used while an assertion expression is decomposed ---
+    ResultBuilder& ResultBuilder::setResultType( ResultWas::OfType result ) {
+        m_data.resultType = result;
+        return *this;
+    }
+    ResultBuilder& ResultBuilder::setResultType( bool result ) {
+        m_data.resultType = result ? ResultWas::Ok : ResultWas::ExpressionFailed;
+        return *this;
+    }
+    ResultBuilder& ResultBuilder::setLhs( std::string const& lhs ) {
+        m_exprComponents.lhs = lhs;
+        return *this;
+    }
+    ResultBuilder& ResultBuilder::setRhs( std::string const& rhs ) {
+        m_exprComponents.rhs = rhs;
+        return *this;
+    }
+    ResultBuilder& ResultBuilder::setOp( std::string const& op ) {
+        m_exprComponents.op = op;
+        return *this;
+    }
+
+    // Called when expression decomposition is complete: record whether this
+    // is a negated (CHECK_FALSE-style) test, then capture the result.
+    void ResultBuilder::endExpression() {
+        m_exprComponents.testFalse = isFalseTest( m_assertionInfo.resultDisposition );
+        captureExpression();
+    }
+
+    // Convert the currently-active C++ exception into a ThrewException result,
+    // using the registered exception translators for the message text.
+    void ResultBuilder::useActiveException( ResultDisposition::Flags resultDisposition ) {
+        m_assertionInfo.resultDisposition = resultDisposition;
+        m_stream.oss << Catch::translateActiveException();
+        captureResult( ResultWas::ThrewException );
+    }
+
+    void ResultBuilder::captureResult( ResultWas::OfType resultType ) {
+        setResultType( resultType );
+        captureExpression();
+    }
+    // An empty expected message means "any exception is fine" (match-all).
+    void ResultBuilder::captureExpectedException( std::string const& expectedMessage ) {
+        if( expectedMessage.empty() )
+            captureExpectedException( Matchers::Impl::Generic::AllOf<std::string>() );
+        else
+            captureExpectedException( Matchers::Equals( expectedMessage ) );
+    }
+
+    // Check the active exception's translated message against a matcher:
+    // Ok when it matches, otherwise ExpressionFailed with the actual message
+    // substituted as the "reconstructed expression" for display.
+    void ResultBuilder::captureExpectedException( Matchers::Impl::Matcher<std::string> const& matcher ) {
+
+        assert( m_exprComponents.testFalse == false );
+        AssertionResultData data = m_data;
+        data.resultType = ResultWas::Ok;
+        data.reconstructedExpression = m_assertionInfo.capturedExpression;
+
+        std::string actualMessage = Catch::translateActiveException();
+        if( !matcher.match( actualMessage ) ) {
+            data.resultType = ResultWas::ExpressionFailed;
+            data.reconstructedExpression = actualMessage;
+        }
+        AssertionResult result( m_assertionInfo, data );
+        handleResult( result );
+    }
+
+    void ResultBuilder::captureExpression() {
+        AssertionResult result = build();
+        handleResult( result );
+    }
+    // Hand the finished result to the capture layer, then latch whether the
+    // macro should break into the debugger and/or abort the test by throwing
+    // (react() performs the actual throw).
+    void ResultBuilder::handleResult( AssertionResult const& result )
+    {
+        getResultCapture().assertionEnded( result );
+
+        if( !result.isOk() ) {
+            if( getCurrentContext().getConfig()->shouldDebugBreak() )
+                m_shouldDebugBreak = true;
+            if( getCurrentContext().getRunner()->aborting() || (m_assertionInfo.resultDisposition & ResultDisposition::Normal) )
+                m_shouldThrow = true;
+        }
+    }
+    void ResultBuilder::react() {
+        if( m_shouldThrow )
+            throw Catch::TestFailureException();
+    }
+
+    bool ResultBuilder::shouldDebugBreak() const { return m_shouldDebugBreak; }
+    bool ResultBuilder::allowThrows() const { return getCurrentContext().getConfig()->allowThrows(); }
+
+    // Assemble the final AssertionResult: invert Ok/Failed for negated tests,
+    // attach any streamed message text, and reconstruct the expression string
+    // (wrapping negated compound expressions in "!(...)").
+    AssertionResult ResultBuilder::build() const
+    {
+        assert( m_data.resultType != ResultWas::Unknown );
+
+        AssertionResultData data = m_data;
+
+        // Flip bool results if testFalse is set
+        if( m_exprComponents.testFalse ) {
+            if( data.resultType == ResultWas::Ok )
+                data.resultType = ResultWas::ExpressionFailed;
+            else if( data.resultType == ResultWas::ExpressionFailed )
+                data.resultType = ResultWas::Ok;
+        }
+
+        data.message = m_stream.oss.str();
+        data.reconstructedExpression = reconstructExpression();
+        if( m_exprComponents.testFalse ) {
+            if( m_exprComponents.op == "" )
+                data.reconstructedExpression = "!" + data.reconstructedExpression;
+            else
+                data.reconstructedExpression = "!(" + data.reconstructedExpression + ")";
+        }
+        return AssertionResult( m_assertionInfo, data );
+    }
+    // Rebuild a displayable expression from the decomposed lhs/op/rhs parts.
+    // Short single-line expressions render on one line; long or multi-line
+    // operands are split across lines. A leading "!" cannot be decomposed, so
+    // a diagnostic suggesting the _FALSE macro variant is emitted instead.
+    std::string ResultBuilder::reconstructExpression() const {
+        if( m_exprComponents.op == "" )
+            return m_exprComponents.lhs.empty() ? m_assertionInfo.capturedExpression : m_exprComponents.op + m_exprComponents.lhs;
+        else if( m_exprComponents.op == "matches" )
+            return m_exprComponents.lhs + " " + m_exprComponents.rhs;
+        else if( m_exprComponents.op != "!" ) {
+            if( m_exprComponents.lhs.size() + m_exprComponents.rhs.size() < 40 &&
+                m_exprComponents.lhs.find("\n") == std::string::npos &&
+                m_exprComponents.rhs.find("\n") == std::string::npos )
+                return m_exprComponents.lhs + " " + m_exprComponents.op + " " + m_exprComponents.rhs;
+            else
+                return m_exprComponents.lhs + "\n" + m_exprComponents.op + "\n" + m_exprComponents.rhs;
+        }
+        else
+            return "{can't expand - use " + m_assertionInfo.macroName + "_FALSE( " + m_assertionInfo.capturedExpression.substr(1) + " ) instead of " + m_assertionInfo.macroName + "( " + m_assertionInfo.capturedExpression + " ) for better diagnostics}";
+    }
+
+} // end namespace Catch
+
+// #included from: catch_tag_alias_registry.hpp
+#define TWOBLUECUBES_CATCH_TAG_ALIAS_REGISTRY_HPP_INCLUDED
+
+// #included from: catch_tag_alias_registry.h
+#define TWOBLUECUBES_CATCH_TAG_ALIAS_REGISTRY_H_INCLUDED
+
+#include <map>
+
+namespace Catch {
+
+    // Singleton registry mapping tag aliases of the form "[@name]" to the
+    // real tag expressions they expand to (implementation follows below).
+    class TagAliasRegistry : public ITagAliasRegistry {
+    public:
+        virtual ~TagAliasRegistry();
+        virtual Option<TagAlias> find( std::string const& alias ) const;
+        virtual std::string expandAliases( std::string const& unexpandedTestSpec ) const;
+        void add( char const* alias, char const* tag, SourceLineInfo const& lineInfo );
+        static TagAliasRegistry& get();
+
+    private:
+        std::map<std::string, TagAlias> m_registry; // alias text -> tag + registration location
+    };
+
+} // end namespace Catch
+
+#include <map>
+#include <iostream>
+
+namespace Catch {
+
+    TagAliasRegistry::~TagAliasRegistry() {}
+
+    // Look up an alias; an empty Option signals "not registered".
+    Option<TagAlias> TagAliasRegistry::find( std::string const& alias ) const {
+        std::map<std::string, TagAlias>::const_iterator it = m_registry.find( alias );
+        if( it != m_registry.end() )
+            return it->second;
+        else
+            return Option<TagAlias>();
+    }
+
+    // Substitute registered aliases in a test spec with their tags.
+    // NOTE(review): only the first occurrence of each alias is replaced
+    // (a single find() per registry entry) - confirm this is intended.
+    std::string TagAliasRegistry::expandAliases( std::string const& unexpandedTestSpec ) const {
+        std::string expandedTestSpec = unexpandedTestSpec;
+        for( std::map<std::string, TagAlias>::const_iterator it = m_registry.begin(), itEnd = m_registry.end();
+                it != itEnd;
+                ++it ) {
+            std::size_t pos = expandedTestSpec.find( it->first );
+            if( pos != std::string::npos ) {
+                expandedTestSpec =  expandedTestSpec.substr( 0, pos ) +
+                                    it->second.tag +
+                                    expandedTestSpec.substr( pos + it->first.size() );
+            }
+        }
+        return expandedTestSpec;
+    }
+
+    // Register an alias. Throws std::domain_error if the alias is not of the
+    // form "[@name]" or is already registered (map::insert returning false).
+    void TagAliasRegistry::add( char const* alias, char const* tag, SourceLineInfo const& lineInfo ) {
+
+        if( !startsWith( alias, "[@" ) || !endsWith( alias, "]" ) ) {
+            std::ostringstream oss;
+            oss << "error: tag alias, \"" << alias << "\" is not of the form [@alias name].\n" << lineInfo;
+            throw std::domain_error( oss.str().c_str() );
+        }
+        if( !m_registry.insert( std::make_pair( alias, TagAlias( tag, lineInfo ) ) ).second ) {
+            std::ostringstream oss;
+            oss << "error: tag alias, \"" << alias << "\" already registered.\n"
+                << "\tFirst seen at " << find(alias)->lineInfo << "\n"
+                << "\tRedefined at " << lineInfo;
+            throw std::domain_error( oss.str().c_str() );
+        }
+    }
+
+    // Meyers-style singleton accessor.
+    TagAliasRegistry& TagAliasRegistry::get() {
+        static TagAliasRegistry instance;
+        return instance;
+
+    }
+
+    ITagAliasRegistry::~ITagAliasRegistry() {}
+    ITagAliasRegistry const& ITagAliasRegistry::get() { return TagAliasRegistry::get(); }
+
+    // Static registrar used by the REGISTER_TAG_ALIAS-style macros: a bad
+    // alias aborts the process with a red error message rather than throwing
+    // (this runs during static initialisation, before main()).
+    RegistrarForTagAliases::RegistrarForTagAliases( char const* alias, char const* tag, SourceLineInfo const& lineInfo ) {
+        try {
+            TagAliasRegistry::get().add( alias, tag, lineInfo );
+        }
+        catch( std::exception& ex ) {
+            Colour colourGuard( Colour::Red );
+            Catch::cerr() << ex.what() << std::endl;
+            exit(1);
+        }
+    }
+
+} // end namespace Catch
+
+// #included from: ../reporters/catch_reporter_multi.hpp
+#define TWOBLUECUBES_CATCH_REPORTER_MULTI_HPP_INCLUDED
+
+namespace Catch {
+
+// Fan-out reporter: forwards every IStreamingReporter event to each reporter
+// it holds. Preferences are taken from the first reporter only.
+class MultipleReporters : public SharedImpl<IStreamingReporter> {
+    typedef std::vector<Ptr<IStreamingReporter> > Reporters;
+    Reporters m_reporters;
+
+public:
+    void add( Ptr<IStreamingReporter> const& reporter ) {
+        m_reporters.push_back( reporter );
+    }
+
+public: // IStreamingReporter
+
+    // NOTE(review): assumes at least one reporter has been added.
+    virtual ReporterPreferences getPreferences() const CATCH_OVERRIDE {
+        return m_reporters[0]->getPreferences();
+    }
+
+    virtual void noMatchingTestCases( std::string const& spec ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->noMatchingTestCases( spec );
+    }
+
+    virtual void testRunStarting( TestRunInfo const& testRunInfo ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->testRunStarting( testRunInfo );
+    }
+
+    virtual void testGroupStarting( GroupInfo const& groupInfo ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->testGroupStarting( groupInfo );
+    }
+
+    virtual void testCaseStarting( TestCaseInfo const& testInfo ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->testCaseStarting( testInfo );
+    }
+
+    virtual void sectionStarting( SectionInfo const& sectionInfo ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->sectionStarting( sectionInfo );
+    }
+
+    virtual void assertionStarting( AssertionInfo const& assertionInfo ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->assertionStarting( assertionInfo );
+    }
+
+    // The return value indicates if the messages buffer should be cleared:
+    // OR-ed across all child reporters (clear if any one of them says so).
+    virtual bool assertionEnded( AssertionStats const& assertionStats ) CATCH_OVERRIDE {
+        bool clearBuffer = false;
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            clearBuffer |= (*it)->assertionEnded( assertionStats );
+        return clearBuffer;
+    }
+
+    virtual void sectionEnded( SectionStats const& sectionStats ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->sectionEnded( sectionStats );
+    }
+
+    virtual void testCaseEnded( TestCaseStats const& testCaseStats ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->testCaseEnded( testCaseStats );
+    }
+
+    virtual void testGroupEnded( TestGroupStats const& testGroupStats ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->testGroupEnded( testGroupStats );
+    }
+
+    virtual void testRunEnded( TestRunStats const& testRunStats ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->testRunEnded( testRunStats );
+    }
+
+    virtual void skipTest( TestCaseInfo const& testInfo ) CATCH_OVERRIDE {
+        for( Reporters::const_iterator it = m_reporters.begin(), itEnd = m_reporters.end();
+                it != itEnd;
+                ++it )
+            (*it)->skipTest( testInfo );
+    }
+};
+
+// Combine two reporters. With no existing reporter the additional one is
+// returned as-is; otherwise the result is a MultipleReporters holding both
+// (reusing an existing MultipleReporters rather than nesting them).
+Ptr<IStreamingReporter> addReporter( Ptr<IStreamingReporter> const& existingReporter, Ptr<IStreamingReporter> const& additionalReporter ) {
+    Ptr<IStreamingReporter> resultingReporter;
+
+    if( existingReporter ) {
+        MultipleReporters* multi = dynamic_cast<MultipleReporters*>( existingReporter.get() );
+        if( !multi ) {
+            multi = new MultipleReporters;
+            resultingReporter = Ptr<IStreamingReporter>( multi );
+            // NOTE(review): this inner check is redundant - the enclosing
+            // branch already established existingReporter is non-null.
+            if( existingReporter )
+                multi->add( existingReporter );
+        }
+        else
+            resultingReporter = existingReporter;
+        multi->add( additionalReporter );
+    }
+    else
+        resultingReporter = additionalReporter;
+
+    return resultingReporter;
+}
+
+} // end namespace Catch
+
+// #included from: ../reporters/catch_reporter_xml.hpp
+#define TWOBLUECUBES_CATCH_REPORTER_XML_HPP_INCLUDED
+
+// #included from: catch_reporter_bases.hpp
+#define TWOBLUECUBES_CATCH_REPORTER_BASES_HPP_INCLUDED
+
+#include <cstring>
+
+namespace Catch {
+
+    struct StreamingReporterBase : SharedImpl<IStreamingReporter> { // base for reporters that emit output as events arrive
+
+        StreamingReporterBase( ReporterConfig const& _config )
+        :   m_config( _config.fullConfig() ),
+            stream( _config.stream() )
+        {
+            m_reporterPrefs.shouldRedirectStdOut = false; // derived reporters may opt in
+        }
+
+        virtual ReporterPreferences getPreferences() const CATCH_OVERRIDE {
+            return m_reporterPrefs;
+        }
+
+        virtual ~StreamingReporterBase() CATCH_OVERRIDE;
+
+        virtual void noMatchingTestCases( std::string const& ) CATCH_OVERRIDE {}
+
+        // The *Starting hooks record the current context so derived reporters can lazily emit headers
+        virtual void testRunStarting( TestRunInfo const& _testRunInfo ) CATCH_OVERRIDE {
+            currentTestRunInfo = _testRunInfo;
+        }
+        virtual void testGroupStarting( GroupInfo const& _groupInfo ) CATCH_OVERRIDE {
+            currentGroupInfo = _groupInfo;
+        }
+
+        virtual void testCaseStarting( TestCaseInfo const& _testInfo ) CATCH_OVERRIDE {
+            currentTestCaseInfo = _testInfo;
+        }
+        virtual void sectionStarting( SectionInfo const& _sectionInfo ) CATCH_OVERRIDE {
+            m_sectionStack.push_back( _sectionInfo ); // sections nest; stack mirrors the nesting
+        }
+
+        virtual void sectionEnded( SectionStats const& /* _sectionStats */ ) CATCH_OVERRIDE {
+            m_sectionStack.pop_back();
+        }
+        virtual void testCaseEnded( TestCaseStats const& /* _testCaseStats */ ) CATCH_OVERRIDE {
+            currentTestCaseInfo.reset();
+        }
+        virtual void testGroupEnded( TestGroupStats const& /* _testGroupStats */ ) CATCH_OVERRIDE {
+            currentGroupInfo.reset();
+        }
+        virtual void testRunEnded( TestRunStats const& /* _testRunStats */ ) CATCH_OVERRIDE {
+            currentTestCaseInfo.reset();
+            currentGroupInfo.reset();
+            currentTestRunInfo.reset();
+        }
+
+        virtual void skipTest( TestCaseInfo const& ) CATCH_OVERRIDE {
+            // Don't do anything with this by default.
+            // It can optionally be overridden in the derived class.
+        }
+
+        Ptr<IConfig const> m_config;
+        std::ostream& stream; // destination for the reporter's output
+
+        LazyStat<TestRunInfo> currentTestRunInfo;
+        LazyStat<GroupInfo> currentGroupInfo;
+        LazyStat<TestCaseInfo> currentTestCaseInfo;
+
+        std::vector<SectionInfo> m_sectionStack; // innermost open section is at the back
+        ReporterPreferences m_reporterPrefs;
+    };
+
+    struct CumulativeReporterBase : SharedImpl<IStreamingReporter> { // base for reporters that buffer all results and emit once, at end of run
+        template<typename T, typename ChildNodeT>
+        struct Node : SharedImpl<> { // generic tree node: a stats value plus child nodes
+            explicit Node( T const& _value ) : value( _value ) {}
+            virtual ~Node() {}
+
+            typedef std::vector<Ptr<ChildNodeT> > ChildNodes;
+            T value;
+            ChildNodes children;
+        };
+        struct SectionNode : SharedImpl<> { // section tree node; sections are matched across runs by source line
+            explicit SectionNode( SectionStats const& _stats ) : stats( _stats ) {}
+            virtual ~SectionNode();
+
+            bool operator == ( SectionNode const& other ) const {
+                return stats.sectionInfo.lineInfo == other.stats.sectionInfo.lineInfo;
+            }
+            bool operator == ( Ptr<SectionNode> const& other ) const {
+                return operator==( *other );
+            }
+
+            SectionStats stats;
+            typedef std::vector<Ptr<SectionNode> > ChildSections;
+            typedef std::vector<AssertionStats> Assertions;
+            ChildSections childSections;
+            Assertions assertions;
+            std::string stdOut;
+            std::string stdErr;
+        };
+
+        struct BySectionInfo { // find_if predicate: matches a SectionNode by source line
+            BySectionInfo( SectionInfo const& other ) : m_other( other ) {}
+            BySectionInfo( BySectionInfo const& other ) : m_other( other.m_other ) {}
+            bool operator() ( Ptr<SectionNode> const& node ) const {
+                return node->stats.sectionInfo.lineInfo == m_other.lineInfo;
+            }
+        private:
+            void operator=( BySectionInfo const& ); // non-assignable: holds a reference
+            SectionInfo const& m_other;
+        };
+
+        typedef Node<TestCaseStats, SectionNode> TestCaseNode;
+        typedef Node<TestGroupStats, TestCaseNode> TestGroupNode;
+        typedef Node<TestRunStats, TestGroupNode> TestRunNode;
+
+        CumulativeReporterBase( ReporterConfig const& _config )
+        :   m_config( _config.fullConfig() ),
+            stream( _config.stream() )
+        {
+            m_reporterPrefs.shouldRedirectStdOut = false;
+        }
+        ~CumulativeReporterBase();
+
+        virtual ReporterPreferences getPreferences() const CATCH_OVERRIDE {
+            return m_reporterPrefs;
+        }
+
+        virtual void testRunStarting( TestRunInfo const& ) CATCH_OVERRIDE {}
+        virtual void testGroupStarting( GroupInfo const& ) CATCH_OVERRIDE {}
+
+        virtual void testCaseStarting( TestCaseInfo const& ) CATCH_OVERRIDE {}
+
+        virtual void sectionStarting( SectionInfo const& sectionInfo ) CATCH_OVERRIDE {
+            SectionStats incompleteStats( sectionInfo, Counts(), 0, false ); // placeholder; real stats filled in on sectionEnded
+            Ptr<SectionNode> node;
+            if( m_sectionStack.empty() ) {
+                if( !m_rootSection )
+                    m_rootSection = new SectionNode( incompleteStats );
+                node = m_rootSection;
+            }
+            else {
+                SectionNode& parentNode = *m_sectionStack.back();
+                SectionNode::ChildSections::const_iterator it =
+                    std::find_if(   parentNode.childSections.begin(),
+                                    parentNode.childSections.end(),
+                                    BySectionInfo( sectionInfo ) );
+                if( it == parentNode.childSections.end() ) {
+                    node = new SectionNode( incompleteStats );
+                    parentNode.childSections.push_back( node );
+                }
+                else
+                    node = *it; // re-entering a section seen on a previous pass: reuse its node
+            }
+            m_sectionStack.push_back( node );
+            m_deepestSection = node;
+        }
+
+        virtual void assertionStarting( AssertionInfo const& ) CATCH_OVERRIDE {}
+
+        virtual bool assertionEnded( AssertionStats const& assertionStats ) CATCH_OVERRIDE {
+            assert( !m_sectionStack.empty() );
+            SectionNode& sectionNode = *m_sectionStack.back();
+            sectionNode.assertions.push_back( assertionStats );
+            return true;
+        }
+        virtual void sectionEnded( SectionStats const& sectionStats ) CATCH_OVERRIDE {
+            assert( !m_sectionStack.empty() );
+            SectionNode& node = *m_sectionStack.back();
+            node.stats = sectionStats; // replace the placeholder stats with the final ones
+            m_sectionStack.pop_back();
+        }
+        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) CATCH_OVERRIDE {
+            Ptr<TestCaseNode> node = new TestCaseNode( testCaseStats );
+            assert( m_sectionStack.empty() );
+            node->children.push_back( m_rootSection );
+            m_testCases.push_back( node );
+            m_rootSection.reset();
+
+            assert( m_deepestSection );
+            m_deepestSection->stdOut = testCaseStats.stdOut;
+            m_deepestSection->stdErr = testCaseStats.stdErr;
+        }
+        virtual void testGroupEnded( TestGroupStats const& testGroupStats ) CATCH_OVERRIDE {
+            Ptr<TestGroupNode> node = new TestGroupNode( testGroupStats );
+            node->children.swap( m_testCases ); // move accumulated test cases under this group
+            m_testGroups.push_back( node );
+        }
+        virtual void testRunEnded( TestRunStats const& testRunStats ) CATCH_OVERRIDE {
+            Ptr<TestRunNode> node = new TestRunNode( testRunStats );
+            node->children.swap( m_testGroups );
+            m_testRuns.push_back( node );
+            testRunEndedCumulative(); // derived reporter emits everything now
+        }
+        virtual void testRunEndedCumulative() = 0;
+
+        virtual void skipTest( TestCaseInfo const& ) CATCH_OVERRIDE {}
+
+        Ptr<IConfig const> m_config;
+        std::ostream& stream;
+        std::vector<AssertionStats> m_assertions;
+        std::vector<std::vector<Ptr<SectionNode> > > m_sections;
+        std::vector<Ptr<TestCaseNode> > m_testCases;
+        std::vector<Ptr<TestGroupNode> > m_testGroups;
+
+        std::vector<Ptr<TestRunNode> > m_testRuns;
+
+        Ptr<SectionNode> m_rootSection;
+        Ptr<SectionNode> m_deepestSection;
+        std::vector<Ptr<SectionNode> > m_sectionStack; // innermost open section at the back
+        ReporterPreferences m_reporterPrefs;
+
+    };
+
+    template<char C>
+    char const* getLineOfChars() { // returns a console-width-minus-one line of C; built lazily, cached per instantiation
+        static char line[CATCH_CONFIG_CONSOLE_WIDTH] = {0};
+        if( !*line ) { // first call for this C: fill the buffer once
+            memset( line, C, CATCH_CONFIG_CONSOLE_WIDTH-1 );
+            line[CATCH_CONFIG_CONSOLE_WIDTH-1] = 0;
+        }
+        return line;
+    }
+
+    struct TestEventListenerBase : StreamingReporterBase { // convenience base for listeners: observes events, produces no report
+        TestEventListenerBase( ReporterConfig const& _config )
+        :   StreamingReporterBase( _config )
+        {}
+
+        virtual void assertionStarting( AssertionInfo const& ) CATCH_OVERRIDE {}
+        virtual bool assertionEnded( AssertionStats const& ) CATCH_OVERRIDE {
+            return false; // false: this listener does not consume the assertion result
+        }
+    };
+
+} // end namespace Catch
+
+// #included from: ../internal/catch_reporter_registrars.hpp
+#define TWOBLUECUBES_CATCH_REPORTER_REGISTRARS_HPP_INCLUDED
+
+namespace Catch {
+
+    template<typename T>
+    class LegacyReporterRegistrar { // registers an old-style IReporter, wrapped in an adapter, under a name
+
+        class ReporterFactory : public IReporterFactory {
+            virtual IStreamingReporter* create( ReporterConfig const& config ) const {
+                return new LegacyReporterAdapter( new T( config ) ); // adapt IReporter to the IStreamingReporter interface
+            }
+
+            virtual std::string getDescription() const {
+                return T::getDescription();
+            }
+        };
+
+    public:
+
+        LegacyReporterRegistrar( std::string const& name ) {
+            getMutableRegistryHub().registerReporter( name, new ReporterFactory() );
+        }
+    };
+
+    template<typename T>
+    class ReporterRegistrar { // registers a native IStreamingReporter type under a name at static-init time
+
+        class ReporterFactory : public SharedImpl<IReporterFactory> {
+
+            // *** Please Note ***:
+            // - If you end up here looking at a compiler error because it's trying to register
+            // your custom reporter class be aware that the native reporter interface has changed
+            // to IStreamingReporter. The "legacy" interface, IReporter, is still supported via
+            // an adapter. Just use REGISTER_LEGACY_REPORTER to take advantage of the adapter.
+            // However please consider updating to the new interface as the old one is now
+            // deprecated and will probably be removed quite soon!
+            // Please contact me via github if you have any questions at all about this.
+            // In fact, ideally, please contact me anyway to let me know you've hit this - as I have
+            // no idea who is actually using custom reporters at all (possibly no-one!).
+            // The new interface is designed to minimise exposure to interface changes in the future.
+            virtual IStreamingReporter* create( ReporterConfig const& config ) const {
+                return new T( config );
+            }
+
+            virtual std::string getDescription() const {
+                return T::getDescription();
+            }
+        };
+
+    public:
+
+        ReporterRegistrar( std::string const& name ) {
+            getMutableRegistryHub().registerReporter( name, new ReporterFactory() );
+        }
+    };
+
+    template<typename T>
+    class ListenerRegistrar { // registers an event listener; listeners are unnamed and always active
+
+        class ListenerFactory : public SharedImpl<IReporterFactory> {
+
+            virtual IStreamingReporter* create( ReporterConfig const& config ) const {
+                return new T( config );
+            }
+            virtual std::string getDescription() const {
+                return ""; // listeners have no user-visible description
+            }
+        };
+
+    public:
+
+        ListenerRegistrar() {
+            getMutableRegistryHub().registerListener( new ListenerFactory() );
+        }
+    };
+}
+
+#define INTERNAL_CATCH_REGISTER_LEGACY_REPORTER( name, reporterType ) \
+    namespace{ Catch::LegacyReporterRegistrar<reporterType> catch_internal_RegistrarFor##reporterType( name ); }
+
+#define INTERNAL_CATCH_REGISTER_REPORTER( name, reporterType ) \
+    namespace{ Catch::ReporterRegistrar<reporterType> catch_internal_RegistrarFor##reporterType( name ); }
+
+#define INTERNAL_CATCH_REGISTER_LISTENER( listenerType ) \
+    namespace{ Catch::ListenerRegistrar<listenerType> catch_internal_RegistrarFor##listenerType; }
+
+// #included from: ../internal/catch_xmlwriter.hpp
+#define TWOBLUECUBES_CATCH_XMLWRITER_HPP_INCLUDED
+
+#include <sstream>
+#include <string>
+#include <vector>
+#include <iomanip>
+
+namespace Catch {
+
+    class XmlEncode { // stream-insertable XML escaper for text nodes and attribute values
+    public:
+        enum ForWhat { ForTextNodes, ForAttributes };
+
+        XmlEncode( std::string const& str, ForWhat forWhat = ForTextNodes )
+        :   m_str( str ),
+            m_forWhat( forWhat )
+        {}
+
+        void encodeTo( std::ostream& os ) const {
+
+            // Apostrophe escaping not necessary if we always use " to write attributes
+            // (see: http://www.w3.org/TR/xml/#syntax)
+
+            for( std::size_t i = 0; i < m_str.size(); ++ i ) {
+                char c = m_str[i];
+                switch( c ) {
+                    case '<':   os << "&lt;"; break;
+                    case '&':   os << "&amp;"; break;
+
+                    case '>':
+                        // See: http://www.w3.org/TR/xml/#syntax
+                        // i > 1 (not i > 2): "]]>" at positions 0..2 has i == 2 and must be escaped
+                        if( i > 1 && m_str[i-1] == ']' && m_str[i-2] == ']' )
+                            os << "&gt;";
+                        else
+                            os << c;
+                        break;
+
+                    case '\"':
+                        if( m_forWhat == ForAttributes )
+                            os << "&quot;";
+                        else
+                            os << c;
+                        break;
+
+                    default:
+                        // Escape control chars - based on contribution by @espenalb in PR #465
+                        // c >= 0 guard: negative (signed) chars are UTF-8 multi-byte units, pass them through.
+                        // Terminate the char-ref with ';' and restore the stream's hex/uppercase flags.
+                        if ( ( c >= 0 && c < '\x09' ) || ( c > '\x0D' && c < '\x20') || c=='\x7F' )
+                            os << "&#x" << std::uppercase << std::hex << static_cast<int>( c ) << std::nouppercase << std::dec << ';';
+                        else
+                            os << c;
+                }
+            }
+        }
+
+        friend std::ostream& operator << ( std::ostream& os, XmlEncode const& xmlEncode ) {
+            xmlEncode.encodeTo( os );
+            return os;
+        }
+
+    private:
+        std::string m_str;
+        ForWhat m_forWhat;
+    };
+
+    class XmlWriter { // incremental XML emitter; tracks open tags, indentation and pending newlines
+    public:
+
+        class ScopedElement { // RAII handle: closes its element on destruction
+        public:
+            ScopedElement( XmlWriter* writer )
+            :   m_writer( writer )
+            {}
+
+            ScopedElement( ScopedElement const& other )
+            :   m_writer( other.m_writer ){
+                other.m_writer = CATCH_NULL; // transfer ownership (pre-C++11 move); source no longer closes the element
+            }
+
+            ~ScopedElement() {
+                if( m_writer )
+                    m_writer->endElement();
+            }
+
+            ScopedElement& writeText( std::string const& text, bool indent = true ) {
+                m_writer->writeText( text, indent );
+                return *this;
+            }
+
+            template<typename T>
+            ScopedElement& writeAttribute( std::string const& name, T const& attribute ) {
+                m_writer->writeAttribute( name, attribute );
+                return *this;
+            }
+
+        private:
+            mutable XmlWriter* m_writer; // mutable so the copy ctor can null the source
+        };
+
+        XmlWriter()
+        :   m_tagIsOpen( false ),
+            m_needsNewline( false ),
+            m_os( &Catch::cout() )
+        {}
+
+        XmlWriter( std::ostream& os )
+        :   m_tagIsOpen( false ),
+            m_needsNewline( false ),
+            m_os( &os )
+        {}
+
+        ~XmlWriter() {
+            while( !m_tags.empty() ) // close any elements still open at destruction
+                endElement();
+        }
+
+        XmlWriter& startElement( std::string const& name ) {
+            ensureTagClosed();
+            newlineIfNecessary();
+            stream() << m_indent << "<" << name;
+            m_tags.push_back( name );
+            m_indent += "  ";
+            m_tagIsOpen = true; // left open so attributes can still be appended
+            return *this;
+        }
+
+        ScopedElement scopedElement( std::string const& name ) {
+            ScopedElement scoped( this );
+            startElement( name );
+            return scoped;
+        }
+
+        XmlWriter& endElement() {
+            newlineIfNecessary();
+            m_indent = m_indent.substr( 0, m_indent.size()-2 );
+            if( m_tagIsOpen ) {
+                stream() << "/>\n"; // no content was written: self-close
+                m_tagIsOpen = false;
+            }
+            else {
+                stream() << m_indent << "</" << m_tags.back() << ">\n";
+            }
+            m_tags.pop_back();
+            return *this;
+        }
+
+        XmlWriter& writeAttribute( std::string const& name, std::string const& attribute ) {
+            if( !name.empty() && !attribute.empty() ) // note: empty-valued attributes are silently dropped
+                stream() << " " << name << "=\"" << XmlEncode( attribute, XmlEncode::ForAttributes ) << "\"";
+            return *this;
+        }
+
+        XmlWriter& writeAttribute( std::string const& name, bool attribute ) {
+            stream() << " " << name << "=\"" << ( attribute ? "true" : "false" ) << "\"";
+            return *this;
+        }
+
+        template<typename T>
+        XmlWriter& writeAttribute( std::string const& name, T const& attribute ) {
+            std::ostringstream oss; // stringify via operator<<, then route through the string overload for escaping
+            oss << attribute;
+            return writeAttribute( name, oss.str() );
+        }
+
+        XmlWriter& writeText( std::string const& text, bool indent = true ) {
+            if( !text.empty() ){
+                bool tagWasOpen = m_tagIsOpen;
+                ensureTagClosed();
+                if( tagWasOpen && indent )
+                    stream() << m_indent;
+                stream() << XmlEncode( text ); // escaped as text-node content
+                m_needsNewline = true;
+            }
+            return *this;
+        }
+
+        XmlWriter& writeComment( std::string const& text ) {
+            ensureTagClosed();
+            stream() << m_indent << "<!--" << text << "-->";
+            m_needsNewline = true;
+            return *this;
+        }
+
+        XmlWriter& writeBlankLine() {
+            ensureTagClosed();
+            stream() << "\n";
+            return *this;
+        }
+
+        void setStream( std::ostream& os ) {
+            m_os = &os;
+        }
+
+    private:
+        XmlWriter( XmlWriter const& ); // non-copyable
+        void operator=( XmlWriter const& );
+
+        std::ostream& stream() {
+            return *m_os;
+        }
+
+        void ensureTagClosed() {
+            if( m_tagIsOpen ) {
+                stream() << ">\n"; // commit the pending start-tag before writing content
+                m_tagIsOpen = false;
+            }
+        }
+
+        void newlineIfNecessary() {
+            if( m_needsNewline ) {
+                stream() << "\n";
+                m_needsNewline = false;
+            }
+        }
+
+        bool m_tagIsOpen;     // start-tag emitted but not yet closed with '>'
+        bool m_needsNewline;  // text/comment written without a trailing newline
+        std::vector<std::string> m_tags; // stack of currently open element names
+        std::string m_indent;
+        std::ostream* m_os;
+    };
+
+}
+// #included from: catch_reenable_warnings.h
+
+#define TWOBLUECUBES_CATCH_REENABLE_WARNINGS_H_INCLUDED
+
+#ifdef __clang__
+#    ifdef __ICC // icpc defines the __clang__ macro
+#        pragma warning(pop)
+#    else
+#        pragma clang diagnostic pop
+#    endif
+#elif defined __GNUC__
+#    pragma GCC diagnostic pop
+#endif
+
+
+namespace Catch {
+    class XmlReporter : public StreamingReporterBase { // streams results as Catch's native XML format
+    public:
+        XmlReporter( ReporterConfig const& _config )
+        :   StreamingReporterBase( _config ),
+            m_sectionDepth( 0 )
+        {
+            m_reporterPrefs.shouldRedirectStdOut = true; // capture stdout so it doesn't interleave with the XML
+        }
+
+        virtual ~XmlReporter() CATCH_OVERRIDE;
+
+        static std::string getDescription() {
+            return "Reports test results as an XML document";
+        }
+
+    public: // StreamingReporterBase
+
+        virtual void noMatchingTestCases( std::string const& s ) CATCH_OVERRIDE {
+            StreamingReporterBase::noMatchingTestCases( s );
+        }
+
+        virtual void testRunStarting( TestRunInfo const& testInfo ) CATCH_OVERRIDE {
+            StreamingReporterBase::testRunStarting( testInfo );
+            m_xml.setStream( stream );
+            m_xml.startElement( "Catch" ); // document root
+            if( !m_config->name().empty() )
+                m_xml.writeAttribute( "name", m_config->name() );
+        }
+
+        virtual void testGroupStarting( GroupInfo const& groupInfo ) CATCH_OVERRIDE {
+            StreamingReporterBase::testGroupStarting( groupInfo );
+            m_xml.startElement( "Group" )
+                .writeAttribute( "name", groupInfo.name );
+        }
+
+        virtual void testCaseStarting( TestCaseInfo const& testInfo ) CATCH_OVERRIDE {
+            StreamingReporterBase::testCaseStarting(testInfo);
+            m_xml.startElement( "TestCase" ).writeAttribute( "name", trim( testInfo.name ) );
+
+            if ( m_config->showDurations() == ShowDurations::Always )
+                m_testCaseTimer.start();
+        }
+
+        virtual void sectionStarting( SectionInfo const& sectionInfo ) CATCH_OVERRIDE {
+            StreamingReporterBase::sectionStarting( sectionInfo );
+            if( m_sectionDepth++ > 0 ) { // depth 0 is the implicit test-case-level section: no <Section> element for it
+                m_xml.startElement( "Section" )
+                    .writeAttribute( "name", trim( sectionInfo.name ) )
+                    .writeAttribute( "description", sectionInfo.description );
+            }
+        }
+
+        virtual void assertionStarting( AssertionInfo const& ) CATCH_OVERRIDE { }
+
+        virtual bool assertionEnded( AssertionStats const& assertionStats ) CATCH_OVERRIDE {
+            const AssertionResult& assertionResult = assertionStats.assertionResult;
+
+            // Print any info messages in <Info> tags.
+            if( assertionStats.assertionResult.getResultType() != ResultWas::Ok ) {
+                for( std::vector<MessageInfo>::const_iterator it = assertionStats.infoMessages.begin(), itEnd = assertionStats.infoMessages.end();
+                        it != itEnd;
+                        ++it ) {
+                    if( it->type == ResultWas::Info ) {
+                        m_xml.scopedElement( "Info" )
+                            .writeText( it->message );
+                    } else if ( it->type == ResultWas::Warning ) {
+                        m_xml.scopedElement( "Warning" )
+                            .writeText( it->message );
+                    }
+                }
+            }
+
+            // Drop out if result was successful but we're not printing them.
+            if( !m_config->includeSuccessfulResults() && isOk(assertionResult.getResultType()) )
+                return true;
+
+            // Print the expression if there is one.
+            if( assertionResult.hasExpression() ) {
+                m_xml.startElement( "Expression" )
+                    .writeAttribute( "success", assertionResult.succeeded() )
+                    .writeAttribute( "type", assertionResult.getTestMacroName() )
+                    .writeAttribute( "filename", assertionResult.getSourceInfo().file )
+                    .writeAttribute( "line", assertionResult.getSourceInfo().line );
+
+                m_xml.scopedElement( "Original" )
+                    .writeText( assertionResult.getExpression() );
+                m_xml.scopedElement( "Expanded" )
+                    .writeText( assertionResult.getExpandedExpression() );
+            }
+
+            // And... Print a result applicable to each result type.
+            switch( assertionResult.getResultType() ) {
+                case ResultWas::ThrewException:
+                    m_xml.scopedElement( "Exception" )
+                        .writeAttribute( "filename", assertionResult.getSourceInfo().file )
+                        .writeAttribute( "line", assertionResult.getSourceInfo().line )
+                        .writeText( assertionResult.getMessage() );
+                    break;
+                case ResultWas::FatalErrorCondition:
+                    // Element renamed from "Fatal Error Condition": spaces are illegal in XML element names
+                    m_xml.scopedElement( "FatalErrorCondition" )
+                        .writeAttribute( "filename", assertionResult.getSourceInfo().file )
+                        .writeAttribute( "line", assertionResult.getSourceInfo().line )
+                        .writeText( assertionResult.getMessage() );
+                    break;
+                case ResultWas::Info:
+                    m_xml.scopedElement( "Info" )
+                        .writeText( assertionResult.getMessage() );
+                    break;
+                case ResultWas::Warning:
+                    // Warning will already have been written
+                    break;
+                case ResultWas::ExplicitFailure:
+                    m_xml.scopedElement( "Failure" )
+                        .writeText( assertionResult.getMessage() );
+                    break;
+                default:
+                    break;
+            }
+
+            if( assertionResult.hasExpression() )
+                m_xml.endElement();
+
+            return true;
+        }
+
+        virtual void sectionEnded( SectionStats const& sectionStats ) CATCH_OVERRIDE {
+            StreamingReporterBase::sectionEnded( sectionStats );
+            if( --m_sectionDepth > 0 ) {
+                XmlWriter::ScopedElement e = m_xml.scopedElement( "OverallResults" );
+                e.writeAttribute( "successes", sectionStats.assertions.passed );
+                e.writeAttribute( "failures", sectionStats.assertions.failed );
+                e.writeAttribute( "expectedFailures", sectionStats.assertions.failedButOk );
+
+                if ( m_config->showDurations() == ShowDurations::Always )
+                    e.writeAttribute( "durationInSeconds", sectionStats.durationInSeconds );
+
+                m_xml.endElement(); // closes the <Section> opened in sectionStarting
+            }
+        }
+
+        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) CATCH_OVERRIDE {
+            StreamingReporterBase::testCaseEnded( testCaseStats );
+            XmlWriter::ScopedElement e = m_xml.scopedElement( "OverallResult" );
+            e.writeAttribute( "success", testCaseStats.totals.assertions.allOk() );
+
+            if ( m_config->showDurations() == ShowDurations::Always )
+                e.writeAttribute( "durationInSeconds", m_testCaseTimer.getElapsedSeconds() );
+
+            m_xml.endElement(); // closes <TestCase>
+        }
+
+        virtual void testGroupEnded( TestGroupStats const& testGroupStats ) CATCH_OVERRIDE {
+            StreamingReporterBase::testGroupEnded( testGroupStats );
+            // TODO: Check testGroupStats.aborting and act accordingly.
+            m_xml.scopedElement( "OverallResults" )
+                .writeAttribute( "successes", testGroupStats.totals.assertions.passed )
+                .writeAttribute( "failures", testGroupStats.totals.assertions.failed )
+                .writeAttribute( "expectedFailures", testGroupStats.totals.assertions.failedButOk );
+            m_xml.endElement(); // closes <Group>
+        }
+
+        virtual void testRunEnded( TestRunStats const& testRunStats ) CATCH_OVERRIDE {
+            StreamingReporterBase::testRunEnded( testRunStats );
+            m_xml.scopedElement( "OverallResults" )
+                .writeAttribute( "successes", testRunStats.totals.assertions.passed )
+                .writeAttribute( "failures", testRunStats.totals.assertions.failed )
+                .writeAttribute( "expectedFailures", testRunStats.totals.assertions.failedButOk );
+            m_xml.endElement(); // closes the root <Catch> element
+        }
+
+    private:
+        Timer m_testCaseTimer;
+        XmlWriter m_xml;
+        int m_sectionDepth;
+    };
+
+     INTERNAL_CATCH_REGISTER_REPORTER( "xml", XmlReporter )
+
+} // end namespace Catch
+
+// #included from: ../reporters/catch_reporter_junit.hpp
+#define TWOBLUECUBES_CATCH_REPORTER_JUNIT_HPP_INCLUDED
+
+#include <assert.h>
+
+namespace Catch {
+
+    class JunitReporter : public CumulativeReporterBase {
+    public:
+        JunitReporter( ReporterConfig const& _config )
+        :   CumulativeReporterBase( _config ),
+            xml( _config.stream() )
+        {
+            m_reporterPrefs.shouldRedirectStdOut = true;
+        }
+
+        virtual ~JunitReporter() CATCH_OVERRIDE;
+
+        static std::string getDescription() {
+            return "Reports test results in an XML format that looks like Ant's junitreport target";
+        }
+
+        virtual void noMatchingTestCases( std::string const& /*spec*/ ) CATCH_OVERRIDE {}
+
+        virtual void testRunStarting( TestRunInfo const& runInfo ) CATCH_OVERRIDE {
+            CumulativeReporterBase::testRunStarting( runInfo );
+            xml.startElement( "testsuites" );
+        }
+
+        virtual void testGroupStarting( GroupInfo const& groupInfo ) CATCH_OVERRIDE {
+            suiteTimer.start();
+            stdOutForSuite.str("");
+            stdErrForSuite.str("");
+            unexpectedExceptions = 0;
+            CumulativeReporterBase::testGroupStarting( groupInfo );
+        }
+
+        virtual bool assertionEnded( AssertionStats const& assertionStats ) CATCH_OVERRIDE {
+            if( assertionStats.assertionResult.getResultType() == ResultWas::ThrewException )
+                unexpectedExceptions++;
+            return CumulativeReporterBase::assertionEnded( assertionStats );
+        }
+
+        virtual void testCaseEnded( TestCaseStats const& testCaseStats ) CATCH_OVERRIDE {
+            stdOutForSuite << testCaseStats.stdOut;
+            stdErrForSuite << testCaseStats.stdErr;
+            CumulativeReporterBase::testCaseEnded( testCaseStats );
+        }
+
+        virtual void testGroupEnded( TestGroupStats const& testGroupStats ) CATCH_OVERRIDE {
+            double suiteTime = suiteTimer.getElapsedSeconds();
+            CumulativeReporterBase::testGroupEnded( testGroupStats );
+            writeGroup( *m_testGroups.back(), suiteTime );
+        }
+
+        virtual void testRunEndedCumulative() CATCH_OVERRIDE {
+            xml.endElement();
+        }
+
+        void writeGroup( TestGroupNode const& groupNode, double suiteTime ) {
+            // Emit one <testsuite> element covering every test case in the group.
+            XmlWriter::ScopedElement suiteElement = xml.scopedElement( "testsuite" );
+            TestGroupStats const& groupStats = groupNode.value;
+            xml.writeAttribute( "name", groupStats.groupInfo.name );
+            // JUnit separates "errors" (unexpected exceptions) from ordinary
+            // "failures", so the exception count is carved out of the total.
+            xml.writeAttribute( "errors", unexpectedExceptions );
+            xml.writeAttribute( "failures", groupStats.totals.assertions.failed - unexpectedExceptions );
+            xml.writeAttribute( "tests", groupStats.totals.assertions.total() );
+            xml.writeAttribute( "hostname", "tbd" ); // !TBD
+            if( m_config->showDurations() != ShowDurations::Never )
+                xml.writeAttribute( "time", suiteTime );
+            else
+                xml.writeAttribute( "time", "" );
+            xml.writeAttribute( "timestamp", "tbd" ); // !TBD
+
+            // One entry per child test case node.
+            for( TestGroupNode::ChildNodes::const_iterator
+                    childIt = groupNode.children.begin(), childEnd = groupNode.children.end();
+                    childIt != childEnd;
+                    ++childIt )
+                writeTestCase( **childIt );
+
+            // Suite-level captured output, whitespace-trimmed.
+            xml.scopedElement( "system-out" ).writeText( trim( stdOutForSuite.str() ), false );
+            xml.scopedElement( "system-err" ).writeText( trim( stdErrForSuite.str() ), false );
+        }
+
+        void writeTestCase( TestCaseNode const& testCaseNode ) {
+            TestCaseStats const& caseStats = testCaseNode.value;
+
+            // Every test case node carries exactly one child section, which
+            // represents the test case itself; nested SECTIONs hang off it.
+            assert( testCaseNode.children.size() == 1 );
+            SectionNode const& rootSection = *testCaseNode.children.front();
+
+            std::string className = caseStats.testInfo.className;
+
+            // A case with no class name and no nested sections is reported
+            // under the synthetic "global" class.
+            if( className.empty() && rootSection.childSections.empty() )
+                className = "global";
+            writeSection( className, "", rootSection );
+        }
+
+        void writeSection(  std::string const& className,
+                            std::string const& rootName,
+                            SectionNode const& sectionNode ) {
+            // Compose a path-style name: ancestor names joined with '/'.
+            std::string name = trim( sectionNode.stats.sectionInfo.name );
+            if( !rootName.empty() )
+                name = rootName + "/" + name;
+
+            // Only emit a <testcase> when this section actually produced
+            // something; sections that merely contain child sections get no
+            // element of their own.
+            bool const hasContent = !sectionNode.assertions.empty() ||
+                                    !sectionNode.stdOut.empty() ||
+                                    !sectionNode.stdErr.empty();
+            if( hasContent ) {
+                XmlWriter::ScopedElement e = xml.scopedElement( "testcase" );
+                if( className.empty() ) {
+                    // Without a class name the section path doubles as the
+                    // classname attribute.
+                    xml.writeAttribute( "classname", name );
+                    xml.writeAttribute( "name", "root" );
+                }
+                else {
+                    xml.writeAttribute( "classname", className );
+                    xml.writeAttribute( "name", name );
+                }
+                xml.writeAttribute( "time", Catch::toString( sectionNode.stats.durationInSeconds ) );
+
+                writeAssertions( sectionNode );
+
+                if( !sectionNode.stdOut.empty() )
+                    xml.scopedElement( "system-out" ).writeText( trim( sectionNode.stdOut ), false );
+                if( !sectionNode.stdErr.empty() )
+                    xml.scopedElement( "system-err" ).writeText( trim( sectionNode.stdErr ), false );
+            }
+            // Recurse: with no class name this section's path becomes the
+            // children's root; otherwise the class name is carried down and
+            // the name path keeps growing.
+            for( SectionNode::ChildSections::const_iterator
+                    childIt = sectionNode.childSections.begin(),
+                    childEnd = sectionNode.childSections.end();
+                    childIt != childEnd;
+                    ++childIt ) {
+                if( className.empty() )
+                    writeSection( name, "", **childIt );
+                else
+                    writeSection( className, name, **childIt );
+            }
+        }
+
+        // Serialize every assertion recorded in this section, in order.
+        void writeAssertions( SectionNode const& sectionNode ) {
+            for( SectionNode::Assertions::const_iterator
+                    assertionIt = sectionNode.assertions.begin(),
+                    assertionEnd = sectionNode.assertions.end();
+                    assertionIt != assertionEnd;
+                    ++assertionIt )
+                writeAssertion( *assertionIt );
+        }
+        void writeAssertion( AssertionStats const& stats ) {
+            AssertionResult const& result = stats.assertionResult;
+            // Passing assertions produce no output in the JUnit format.
+            if( !result.isOk() ) {
+                // Map Catch's result kind onto JUnit's element vocabulary:
+                // "error" for unexpected/fatal conditions, "failure" for
+                // ordinary assertion failures.
+                std::string elementName;
+                switch( result.getResultType() ) {
+                    case ResultWas::ThrewException:
+                    case ResultWas::FatalErrorCondition:
+                        elementName = "error";
+                        break;
+                    case ResultWas::ExplicitFailure:
+                    case ResultWas::ExpressionFailed:
+                    case ResultWas::DidntThrowException:
+                        elementName = "failure";
+                        break;
+
+                    // We should never see these here:
+                    case ResultWas::Info:
+                    case ResultWas::Warning:
+                    case ResultWas::Ok:
+                    case ResultWas::Unknown:
+                    case ResultWas::FailureBit:
+                    case ResultWas::Exception:
+                        elementName = "internalError";
+                        break;
+                }
+
+                XmlWriter::ScopedElement e = xml.scopedElement( elementName );
+
+                xml.writeAttribute( "message", result.getExpandedExpression() );
+                xml.writeAttribute( "type", result.getTestMacroName() );
+
+                // Element text: the assertion's own message, any attached INFO
+                // messages, then the source location.
+                std::ostringstream text;
+                if( !result.getMessage().empty() )
+                    text << result.getMessage() << "\n";
+                for( std::vector<MessageInfo>::const_iterator
+                        msgIt = stats.infoMessages.begin(),
+                        msgEnd = stats.infoMessages.end();
+                            msgIt != msgEnd;
+                            ++msgIt )
+                    if( msgIt->type == ResultWas::Info )
+                        text << msgIt->message << "\n";
+
+                text << "at " << result.getSourceInfo();
+                xml.writeText( text.str(), false );
+            }
+        }
+
+        XmlWriter xml;
+        Timer suiteTimer;
+        std::ostringstream stdOutForSuite;
+        std::ostringstream stdErrForSuite;
+        unsigned int unexpectedExceptions;
+    };
+
+    // Register this reporter under the name "junit" so it can be selected at
+    // run time by name.
+    INTERNAL_CATCH_REGISTER_REPORTER( "junit", JunitReporter )
+
+} // end namespace Catch
+
+// #included from: ../reporters/catch_reporter_console.hpp
+#define TWOBLUECUBES_CATCH_REPORTER_CONSOLE_HPP_INCLUDED
+
+namespace Catch {
+
+    // Streams human-readable plain-text results as the run progresses.
+    // Headers (run / group / test case / section) are printed lazily — only
+    // once something under them actually needs to be reported — via
+    // lazyPrint() and the m_headerPrinted flag. Output formatting here relies
+    // on the RAII Colour guards' exact lifetimes; do not reorder statements.
+    struct ConsoleReporter : StreamingReporterBase {
+        ConsoleReporter( ReporterConfig const& _config )
+        :   StreamingReporterBase( _config ),
+            m_headerPrinted( false )
+        {}
+
+        virtual ~ConsoleReporter() CATCH_OVERRIDE;
+        static std::string getDescription() {
+            return "Reports test results as plain lines of text";
+        }
+
+        virtual void noMatchingTestCases( std::string const& spec ) CATCH_OVERRIDE {
+            stream << "No test cases matched '" << spec << "'" << std::endl;
+        }
+
+        virtual void assertionStarting( AssertionInfo const& ) CATCH_OVERRIDE {
+        }
+
+        // Returns false when the result is filtered out (successful and not
+        // requested), true when something was printed.
+        virtual bool assertionEnded( AssertionStats const& _assertionStats ) CATCH_OVERRIDE {
+            AssertionResult const& result = _assertionStats.assertionResult;
+
+            bool printInfoMessages = true;
+
+            // Drop out if result was successful and we're not printing those
+            if( !m_config->includeSuccessfulResults() && result.isOk() ) {
+                if( result.getResultType() != ResultWas::Warning )
+                    return false;
+                // Warnings are still shown, but without their INFO messages.
+                printInfoMessages = false;
+            }
+
+            // Make sure any pending run/group/test-case headers are out first.
+            lazyPrint();
+
+            AssertionPrinter printer( stream, _assertionStats, printInfoMessages );
+            printer.print();
+            stream << std::endl;
+            return true;
+        }
+
+        virtual void sectionStarting( SectionInfo const& _sectionInfo ) CATCH_OVERRIDE {
+            // Force the header to be reprinted for the new section.
+            m_headerPrinted = false;
+            StreamingReporterBase::sectionStarting( _sectionInfo );
+        }
+        virtual void sectionEnded( SectionStats const& _sectionStats ) CATCH_OVERRIDE {
+            // Warn (in error colour) when a section/test case made no assertions.
+            if( _sectionStats.missingAssertions ) {
+                lazyPrint();
+                Colour colour( Colour::ResultError );
+                if( m_sectionStack.size() > 1 )
+                    stream << "\nNo assertions in section";
+                else
+                    stream << "\nNo assertions in test case";
+                stream << " '" << _sectionStats.sectionInfo.name << "'\n" << std::endl;
+            }
+            // Durations: if the header was printed the section name is already
+            // on screen, otherwise include it with the timing line.
+            if( m_headerPrinted ) {
+                if( m_config->showDurations() == ShowDurations::Always )
+                    stream << "Completed in " << _sectionStats.durationInSeconds << "s" << std::endl;
+                m_headerPrinted = false;
+            }
+            else {
+                if( m_config->showDurations() == ShowDurations::Always )
+                    stream << _sectionStats.sectionInfo.name << " completed in " << _sectionStats.durationInSeconds << "s" << std::endl;
+            }
+            StreamingReporterBase::sectionEnded( _sectionStats );
+        }
+
+        virtual void testCaseEnded( TestCaseStats const& _testCaseStats ) CATCH_OVERRIDE {
+            StreamingReporterBase::testCaseEnded( _testCaseStats );
+            m_headerPrinted = false;
+        }
+        virtual void testGroupEnded( TestGroupStats const& _testGroupStats ) CATCH_OVERRIDE {
+            // Only summarize groups whose header was actually shown.
+            if( currentGroupInfo.used ) {
+                printSummaryDivider();
+                stream << "Summary for group '" << _testGroupStats.groupInfo.name << "':\n";
+                printTotals( _testGroupStats.totals );
+                stream << "\n" << std::endl;
+            }
+            StreamingReporterBase::testGroupEnded( _testGroupStats );
+        }
+        virtual void testRunEnded( TestRunStats const& _testRunStats ) CATCH_OVERRIDE {
+            printTotalsDivider( _testRunStats.totals );
+            printTotals( _testRunStats.totals );
+            stream << std::endl;
+            StreamingReporterBase::testRunEnded( _testRunStats );
+        }
+
+    private:
+
+        // Formats a single assertion result. Classifies the result type in the
+        // constructor (colour, pass/fail label, message label); print() then
+        // emits source info, result type, expressions and messages.
+        class AssertionPrinter {
+            void operator= ( AssertionPrinter const& );
+        public:
+            AssertionPrinter( std::ostream& _stream, AssertionStats const& _stats, bool _printInfoMessages )
+            :   stream( _stream ),
+                stats( _stats ),
+                result( _stats.assertionResult ),
+                colour( Colour::None ),
+                message( result.getMessage() ),
+                messages( _stats.infoMessages ),
+                printInfoMessages( _printInfoMessages )
+            {
+                switch( result.getResultType() ) {
+                    case ResultWas::Ok:
+                        colour = Colour::Success;
+                        passOrFail = "PASSED";
+                        //if( result.hasMessage() )
+                        if( _stats.infoMessages.size() == 1 )
+                            messageLabel = "with message";
+                        if( _stats.infoMessages.size() > 1 )
+                            messageLabel = "with messages";
+                        break;
+                    case ResultWas::ExpressionFailed:
+                        // isOk() here means the failure was expected (e.g. the
+                        // assertion is allowed to fail), so it renders green.
+                        if( result.isOk() ) {
+                            colour = Colour::Success;
+                            passOrFail = "FAILED - but was ok";
+                        }
+                        else {
+                            colour = Colour::Error;
+                            passOrFail = "FAILED";
+                        }
+                        if( _stats.infoMessages.size() == 1 )
+                            messageLabel = "with message";
+                        if( _stats.infoMessages.size() > 1 )
+                            messageLabel = "with messages";
+                        break;
+                    case ResultWas::ThrewException:
+                        colour = Colour::Error;
+                        passOrFail = "FAILED";
+                        messageLabel = "due to unexpected exception with message";
+                        break;
+                    case ResultWas::FatalErrorCondition:
+                        colour = Colour::Error;
+                        passOrFail = "FAILED";
+                        messageLabel = "due to a fatal error condition";
+                        break;
+                    case ResultWas::DidntThrowException:
+                        colour = Colour::Error;
+                        passOrFail = "FAILED";
+                        messageLabel = "because no exception was thrown where one was expected";
+                        break;
+                    case ResultWas::Info:
+                        messageLabel = "info";
+                        break;
+                    case ResultWas::Warning:
+                        messageLabel = "warning";
+                        break;
+                    case ResultWas::ExplicitFailure:
+                        passOrFail = "FAILED";
+                        colour = Colour::Error;
+                        if( _stats.infoMessages.size() == 1 )
+                            messageLabel = "explicitly with message";
+                        if( _stats.infoMessages.size() > 1 )
+                            messageLabel = "explicitly with messages";
+                        break;
+                    // These cases are here to prevent compiler warnings
+                    case ResultWas::Unknown:
+                    case ResultWas::FailureBit:
+                    case ResultWas::Exception:
+                        passOrFail = "** internal error **";
+                        colour = Colour::Error;
+                        break;
+                }
+            }
+
+            void print() const {
+                printSourceInfo();
+                if( stats.totals.assertions.total() > 0 ) {
+                    if( result.isOk() )
+                        stream << "\n";
+                    printResultType();
+                    printOriginalExpression();
+                    printReconstructedExpression();
+                }
+                else {
+                    stream << "\n";
+                }
+                printMessage();
+            }
+
+        private:
+            void printResultType() const {
+                if( !passOrFail.empty() ) {
+                    Colour colourGuard( colour );
+                    stream << passOrFail << ":\n";
+                }
+            }
+            void printOriginalExpression() const {
+                if( result.hasExpression() ) {
+                    Colour colourGuard( Colour::OriginalExpression );
+                    stream  << "  ";
+                    stream << result.getExpressionInMacro();
+                    stream << "\n";
+                }
+            }
+            void printReconstructedExpression() const {
+                if( result.hasExpandedExpression() ) {
+                    stream << "with expansion:\n";
+                    Colour colourGuard( Colour::ReconstructedExpression );
+                    stream << Text( result.getExpandedExpression(), TextAttributes().setIndent(2) ) << "\n";
+                }
+            }
+            void printMessage() const {
+                if( !messageLabel.empty() )
+                    stream << messageLabel << ":" << "\n";
+                for( std::vector<MessageInfo>::const_iterator it = messages.begin(), itEnd = messages.end();
+                        it != itEnd;
+                        ++it ) {
+                    // If this assertion is a warning ignore any INFO messages
+                    if( printInfoMessages || it->type != ResultWas::Info )
+                        stream << Text( it->message, TextAttributes().setIndent(2) ) << "\n";
+                }
+            }
+            void printSourceInfo() const {
+                Colour colourGuard( Colour::FileName );
+                stream << result.getSourceInfo() << ": ";
+            }
+
+            std::ostream& stream;
+            AssertionStats const& stats;
+            AssertionResult const& result;
+            Colour::Code colour;
+            std::string passOrFail;
+            std::string messageLabel;
+            std::string message;
+            std::vector<MessageInfo> messages;
+            bool printInfoMessages;
+        };
+
+        // Print any run/group/test-case headers that haven't been shown yet.
+        void lazyPrint() {
+
+            if( !currentTestRunInfo.used )
+                lazyPrintRunInfo();
+            if( !currentGroupInfo.used )
+                lazyPrintGroupInfo();
+
+            if( !m_headerPrinted ) {
+                printTestCaseAndSectionHeader();
+                m_headerPrinted = true;
+            }
+        }
+        void lazyPrintRunInfo() {
+            stream  << "\n" << getLineOfChars<'~'>() << "\n";
+            Colour colour( Colour::SecondaryText );
+            stream  << currentTestRunInfo->name
+                    << " is a Catch v"  << libraryVersion << " host application.\n"
+                    << "Run with -? for options\n\n";
+
+            // Seed 0 means "not seeded", so nothing worth reporting.
+            if( m_config->rngSeed() != 0 )
+                stream << "Randomness seeded to: " << m_config->rngSeed() << "\n\n";
+
+            currentTestRunInfo.used = true;
+        }
+        void lazyPrintGroupInfo() {
+            // Only show a group header when there is more than one group.
+            if( !currentGroupInfo->name.empty() && currentGroupInfo->groupsCounts > 1 ) {
+                printClosedHeader( "Group: " + currentGroupInfo->name );
+                currentGroupInfo.used = true;
+            }
+        }
+        void printTestCaseAndSectionHeader() {
+            assert( !m_sectionStack.empty() );
+            printOpenHeader( currentTestCaseInfo->name );
+
+            if( m_sectionStack.size() > 1 ) {
+                Colour colourGuard( Colour::Headers );
+
+                std::vector<SectionInfo>::const_iterator
+                    it = m_sectionStack.begin()+1, // Skip first section (test case)
+                    itEnd = m_sectionStack.end();
+                for( ; it != itEnd; ++it )
+                    printHeaderString( it->name, 2 );
+            }
+
+            SourceLineInfo lineInfo = m_sectionStack.front().lineInfo;
+
+            if( !lineInfo.empty() ){
+                stream << getLineOfChars<'-'>() << "\n";
+                Colour colourGuard( Colour::FileName );
+                stream << lineInfo << "\n";
+            }
+            stream << getLineOfChars<'.'>() << "\n" << std::endl;
+        }
+
+        void printClosedHeader( std::string const& _name ) {
+            printOpenHeader( _name );
+            stream << getLineOfChars<'.'>() << "\n";
+        }
+        void printOpenHeader( std::string const& _name ) {
+            stream  << getLineOfChars<'-'>() << "\n";
+            {
+                Colour colourGuard( Colour::Headers );
+                printHeaderString( _name );
+            }
+        }
+
+        // if string has a : in first line will set indent to follow it on
+        // subsequent lines
+        void printHeaderString( std::string const& _string, std::size_t indent = 0 ) {
+            std::size_t i = _string.find( ": " );
+            if( i != std::string::npos )
+                i+=2;
+            else
+                i = 0;
+            stream << Text( _string, TextAttributes()
+                                        .setIndent( indent+i)
+                                        .setInitialIndent( indent ) ) << "\n";
+        }
+
+        // One column of the final totals table: a label, a colour, and
+        // right-aligned rows (alignment maintained by addRow's padding).
+        struct SummaryColumn {
+
+            SummaryColumn( std::string const& _label, Colour::Code _colour )
+            :   label( _label ),
+                colour( _colour )
+            {}
+            // Returns a copy, enabling the chained .addRow(...).addRow(...)
+            // construction style used in printTotals.
+            SummaryColumn addRow( std::size_t count ) {
+                std::ostringstream oss;
+                oss << count;
+                std::string row = oss.str();
+                // Pad either the existing rows or the new one so all rows in
+                // this column share the same width.
+                for( std::vector<std::string>::iterator it = rows.begin(); it != rows.end(); ++it ) {
+                    while( it->size() < row.size() )
+                        *it = " " + *it;
+                    while( it->size() > row.size() )
+                        row = " " + row;
+                }
+                rows.push_back( row );
+                return *this;
+            }
+
+            std::string label;
+            Colour::Code colour;
+            std::vector<std::string> rows;
+
+        };
+
+        void printTotals( Totals const& totals ) {
+            if( totals.testCases.total() == 0 ) {
+                stream << Colour( Colour::Warning ) << "No tests ran\n";
+            }
+            else if( totals.assertions.total() > 0 && totals.testCases.allPassed() ) {
+                stream << Colour( Colour::ResultSuccess ) << "All tests passed";
+                stream << " ("
+                        << pluralise( totals.assertions.passed, "assertion" ) << " in "
+                        << pluralise( totals.testCases.passed, "test case" ) << ")"
+                        << "\n";
+            }
+            else {
+
+                // Row 0: test cases; row 1: assertions (see printSummaryRow calls).
+                std::vector<SummaryColumn> columns;
+                columns.push_back( SummaryColumn( "", Colour::None )
+                                        .addRow( totals.testCases.total() )
+                                        .addRow( totals.assertions.total() ) );
+                columns.push_back( SummaryColumn( "passed", Colour::Success )
+                                        .addRow( totals.testCases.passed )
+                                        .addRow( totals.assertions.passed ) );
+                columns.push_back( SummaryColumn( "failed", Colour::ResultError )
+                                        .addRow( totals.testCases.failed )
+                                        .addRow( totals.assertions.failed ) );
+                columns.push_back( SummaryColumn( "failed as expected", Colour::ResultExpectedFailure )
+                                        .addRow( totals.testCases.failedButOk )
+                                        .addRow( totals.assertions.failedButOk ) );
+
+                printSummaryRow( "test cases", columns, 0 );
+                printSummaryRow( "assertions", columns, 1 );
+            }
+        }
+        // Print one row of the totals table, skipping zero-valued columns.
+        void printSummaryRow( std::string const& label, std::vector<SummaryColumn> const& cols, std::size_t row ) {
+            for( std::vector<SummaryColumn>::const_iterator it = cols.begin(); it != cols.end(); ++it ) {
+                std::string value = it->rows[row];
+                if( it->label.empty() ) {
+                    stream << label << ": ";
+                    if( value != "0" )
+                        stream << value;
+                    else
+                        stream << Colour( Colour::Warning ) << "- none -";
+                }
+                else if( value != "0" ) {
+                    stream  << Colour( Colour::LightGrey ) << " | ";
+                    stream  << Colour( it->colour )
+                            << value << " " << it->label;
+                }
+            }
+            stream << "\n";
+        }
+
+        // Scale number/total onto the console width; never rounds a non-zero
+        // count down to an invisible zero-width segment.
+        static std::size_t makeRatio( std::size_t number, std::size_t total ) {
+            std::size_t ratio = total > 0 ? CATCH_CONFIG_CONSOLE_WIDTH * number/ total : 0;
+            return ( ratio == 0 && number > 0 ) ? 1 : ratio;
+        }
+        // Returns a reference so the caller can adjust the largest value in place.
+        static std::size_t& findMax( std::size_t& i, std::size_t& j, std::size_t& k ) {
+            if( i > j && i > k )
+                return i;
+            else if( j > k )
+                return j;
+            else
+                return k;
+        }
+
+        // Draw a full-width '=' bar whose coloured segments are proportional
+        // to failed / failed-but-ok / passed test cases.
+        void printTotalsDivider( Totals const& totals ) {
+            if( totals.testCases.total() > 0 ) {
+                std::size_t failedRatio = makeRatio( totals.testCases.failed, totals.testCases.total() );
+                std::size_t failedButOkRatio = makeRatio( totals.testCases.failedButOk, totals.testCases.total() );
+                std::size_t passedRatio = makeRatio( totals.testCases.passed, totals.testCases.total() );
+                // Nudge the largest segment until the bar is exactly full width.
+                while( failedRatio + failedButOkRatio + passedRatio < CATCH_CONFIG_CONSOLE_WIDTH-1 )
+                    findMax( failedRatio, failedButOkRatio, passedRatio )++;
+                while( failedRatio + failedButOkRatio + passedRatio > CATCH_CONFIG_CONSOLE_WIDTH-1 )
+                    findMax( failedRatio, failedButOkRatio, passedRatio )--;
+
+                stream << Colour( Colour::Error ) << std::string( failedRatio, '=' );
+                stream << Colour( Colour::ResultExpectedFailure ) << std::string( failedButOkRatio, '=' );
+                if( totals.testCases.allPassed() )
+                    stream << Colour( Colour::ResultSuccess ) << std::string( passedRatio, '=' );
+                else
+                    stream << Colour( Colour::Success ) << std::string( passedRatio, '=' );
+            }
+            else {
+                stream << Colour( Colour::Warning ) << std::string( CATCH_CONFIG_CONSOLE_WIDTH-1, '=' );
+            }
+            stream << "\n";
+        }
+        void printSummaryDivider() {
+            stream << getLineOfChars<'-'>() << "\n";
+        }
+
+    private:
+        // True once the current test-case/section header has been emitted.
+        bool m_headerPrinted;
+    };
+
+    // Register this reporter under the name "console" so it can be selected
+    // at run time by name.
+    INTERNAL_CATCH_REGISTER_REPORTER( "console", ConsoleReporter )
+
+} // end namespace Catch
+
+// #included from: ../reporters/catch_reporter_compact.hpp
+#define TWOBLUECUBES_CATCH_REPORTER_COMPACT_HPP_INCLUDED
+
+namespace Catch {
+
+    struct CompactReporter : StreamingReporterBase {
+
+        CompactReporter( ReporterConfig const& _config )
+        : StreamingReporterBase( _config )
+        {}
+
+        virtual ~CompactReporter();
+
+        static std::string getDescription() {
+            return "Reports test results on a single line, suitable for IDEs";
+        }
+
+        virtual ReporterPreferences getPreferences() const {
+            ReporterPreferences prefs;
+            prefs.shouldRedirectStdOut = false;
+            return prefs;
+        }
+
+        virtual void noMatchingTestCases( std::string const& spec ) {
+            stream << "No test cases matched '" << spec << "'" << std::endl;
+        }
+
+        virtual void assertionStarting( AssertionInfo const& ) {
+        }
+
+        virtual bool assertionEnded( AssertionStats const& _assertionStats ) {
+            AssertionResult const& result = _assertionStats.assertionResult;
+
+            bool printInfoMessages = true;
+
+            // Drop out if result was successful and we're not printing those
+            if( !m_config->includeSuccessfulResults() && result.isOk() ) {
+                if( result.getResultType() != ResultWas::Warning )
+                    return false;
+                printInfoMessages = false;
+            }
+
+            AssertionPrinter printer( stream, _assertionStats, printInfoMessages );
+            printer.print();
+
+            stream << std::endl;
+            return true;
+        }
+
+        virtual void testRunEnded( TestRunStats const& _testRunStats ) {
+            printTotals( _testRunStats.totals );
+            stream << "\n" << std::endl;
+            StreamingReporterBase::testRunEnded( _testRunStats );
+        }
+
+    private:
+        class AssertionPrinter {
+            void operator= ( AssertionPrinter const& );
+        public:
+            AssertionPrinter( std::ostream& _stream, AssertionStats const& _stats, bool _printInfoMessages )
+            : stream( _stream )
+            , stats( _stats )
+            , result( _stats.assertionResult )
+            , messages( _stats.infoMessages )
+            , itMessage( _stats.infoMessages.begin() )
+            , printInfoMessages( _printInfoMessages )
+            {}
+
+            void print() {
+                printSourceInfo();
+
+                itMessage = messages.begin();
+
+                switch( result.getResultType() ) {
+                    case ResultWas::Ok:
+                        printResultType( Colour::ResultSuccess, passedString() );
+                        printOriginalExpression();
+                        printReconstructedExpression();
+                        if ( ! result.hasExpression() )
+                            printRemainingMessages( Colour::None );
+                        else
+                            printRemainingMessages();
+                        break;
+                    case ResultWas::ExpressionFailed:
+                        if( result.isOk() )
+                            printResultType( Colour::ResultSuccess, failedString() + std::string( " - but was ok" ) );
+                        else
+                            printResultType( Colour::Error, failedString() );
+                        printOriginalExpression();
+                        printReconstructedExpression();
+                        printRemainingMessages();
+                        break;
+                    case ResultWas::ThrewException:
+                        printResultType( Colour::Error, failedString() );
+                        printIssue( "unexpected exception with message:" );
+                        printMessage();
+                        printExpressionWas();
+                        printRemainingMessages();
+                        break;
+                    case ResultWas::FatalErrorCondition:
+                        printResultType( Colour::Error, failedString() );
+                        printIssue( "fatal error condition with message:" );
+                        printMessage();
+                        printExpressionWas();
+                        printRemainingMessages();
+                        break;
+                    case ResultWas::DidntThrowException:
+                        printResultType( Colour::Error, failedString() );
+                        printIssue( "expected exception, got none" );
+                        printExpressionWas();
+                        printRemainingMessages();
+                        break;
+                    case ResultWas::Info:
+                        printResultType( Colour::None, "info" );
+                        printMessage();
+                        printRemainingMessages();
+                        break;
+                    case ResultWas::Warning:
+                        printResultType( Colour::None, "warning" );
+                        printMessage();
+                        printRemainingMessages();
+                        break;
+                    case ResultWas::ExplicitFailure:
+                        printResultType( Colour::Error, failedString() );
+                        printIssue( "explicitly" );
+                        printRemainingMessages( Colour::None );
+                        break;
+                    // These cases are here to prevent compiler warnings
+                    case ResultWas::Unknown:
+                    case ResultWas::FailureBit:
+                    case ResultWas::Exception:
+                        printResultType( Colour::Error, "** internal error **" );
+                        break;
+                }
+            }
+
+        private:
+            // Colour::LightGrey
+
+            static Colour::Code dimColour() { return Colour::FileName; }
+
+#ifdef CATCH_PLATFORM_MAC
+            static const char* failedString() { return "FAILED"; }
+            static const char* passedString() { return "PASSED"; }
+#else
+            static const char* failedString() { return "failed"; }
+            static const char* passedString() { return "passed"; }
+#endif
+
+            void printSourceInfo() const {
+                Colour colourGuard( Colour::FileName );
+                stream << result.getSourceInfo() << ":";
+            }
+
+            void printResultType( Colour::Code colour, std::string passOrFail ) const {
+                if( !passOrFail.empty() ) {
+                    {
+                        Colour colourGuard( colour );
+                        stream << " " << passOrFail;
+                    }
+                    stream << ":";
+                }
+            }
+
+            void printIssue( std::string issue ) const {
+                stream << " " << issue;
+            }
+
+            void printExpressionWas() {
+                if( result.hasExpression() ) {
+                    stream << ";";
+                    {
+                        Colour colour( dimColour() );
+                        stream << " expression was:";
+                    }
+                    printOriginalExpression();
+                }
+            }
+
+            void printOriginalExpression() const {
+                if( result.hasExpression() ) {
+                    stream << " " << result.getExpression();
+                }
+            }
+
+            void printReconstructedExpression() const {
+                if( result.hasExpandedExpression() ) {
+                    {
+                        Colour colour( dimColour() );
+                        stream << " for: ";
+                    }
+                    stream << result.getExpandedExpression();
+                }
+            }
+
+            void printMessage() {
+                if ( itMessage != messages.end() ) {
+                    stream << " '" << itMessage->message << "'";
+                    ++itMessage;
+                }
+            }
+
+            void printRemainingMessages( Colour::Code colour = dimColour() ) {
+                if ( itMessage == messages.end() )
+                    return;
+
+                // using messages.end() directly yields compilation error:
+                std::vector<MessageInfo>::const_iterator itEnd = messages.end();
+                const std::size_t N = static_cast<std::size_t>( std::distance( itMessage, itEnd ) );
+
+                {
+                    Colour colourGuard( colour );
+                    stream << " with " << pluralise( N, "message" ) << ":";
+                }
+
+                for(; itMessage != itEnd; ) {
+                    // If this assertion is a warning ignore any INFO messages
+                    if( printInfoMessages || itMessage->type != ResultWas::Info ) {
+                        stream << " '" << itMessage->message << "'";
+                        if ( ++itMessage != itEnd ) {
+                            Colour colourGuard( dimColour() );
+                            stream << " and";
+                        }
+                    }
+                }
+            }
+
+        private:
+            std::ostream& stream;
+            AssertionStats const& stats;
+            AssertionResult const& result;
+            std::vector<MessageInfo> messages;
+            std::vector<MessageInfo>::const_iterator itMessage;
+            bool printInfoMessages;
+        };
+
+        // Colour, message variants:
+        // - white: No tests ran.
+        // -   red: Failed [both/all] N test cases, failed [both/all] M assertions.
+        // - white: Passed [both/all] N test cases (no assertions).
+        // -   red: Failed N tests cases, failed M assertions.
+        // - green: Passed [both/all] N tests cases with M assertions.
+
+        std::string bothOrAll( std::size_t count ) const {
+            return count == 1 ? "" : count == 2 ? "both " : "all " ;
+        }
+
+        void printTotals( const Totals& totals ) const {
+            if( totals.testCases.total() == 0 ) {
+                stream << "No tests ran.";
+            }
+            else if( totals.testCases.failed == totals.testCases.total() ) {
+                Colour colour( Colour::ResultError );
+                const std::string qualify_assertions_failed =
+                    totals.assertions.failed == totals.assertions.total() ?
+                        bothOrAll( totals.assertions.failed ) : "";
+                stream <<
+                    "Failed " << bothOrAll( totals.testCases.failed )
+                              << pluralise( totals.testCases.failed, "test case"  ) << ", "
+                    "failed " << qualify_assertions_failed <<
+                                 pluralise( totals.assertions.failed, "assertion" ) << ".";
+            }
+            else if( totals.assertions.total() == 0 ) {
+                stream <<
+                    "Passed " << bothOrAll( totals.testCases.total() )
+                              << pluralise( totals.testCases.total(), "test case" )
+                              << " (no assertions).";
+            }
+            else if( totals.assertions.failed ) {
+                Colour colour( Colour::ResultError );
+                stream <<
+                    "Failed " << pluralise( totals.testCases.failed, "test case"  ) << ", "
+                    "failed " << pluralise( totals.assertions.failed, "assertion" ) << ".";
+            }
+            else {
+                Colour colour( Colour::ResultSuccess );
+                stream <<
+                    "Passed " << bothOrAll( totals.testCases.passed )
+                              << pluralise( totals.testCases.passed, "test case"  ) <<
+                    " with "  << pluralise( totals.assertions.passed, "assertion" ) << ".";
+            }
+        }
+    };
+
+    INTERNAL_CATCH_REGISTER_REPORTER( "compact", CompactReporter )
+
+} // end namespace Catch
+
+namespace Catch {
+    // These are all here to avoid warnings about not having any out of line
+    // virtual methods
+    NonCopyable::~NonCopyable() {}
+    IShared::~IShared() {}
+    IStream::~IStream() CATCH_NOEXCEPT {}
+    FileStream::~FileStream() CATCH_NOEXCEPT {}
+    CoutStream::~CoutStream() CATCH_NOEXCEPT {}
+    DebugOutStream::~DebugOutStream() CATCH_NOEXCEPT {}
+    StreamBufBase::~StreamBufBase() CATCH_NOEXCEPT {}
+    IContext::~IContext() {}
+    IResultCapture::~IResultCapture() {}
+    ITestCase::~ITestCase() {}
+    ITestCaseRegistry::~ITestCaseRegistry() {}
+    IRegistryHub::~IRegistryHub() {}
+    IMutableRegistryHub::~IMutableRegistryHub() {}
+    IExceptionTranslator::~IExceptionTranslator() {}
+    IExceptionTranslatorRegistry::~IExceptionTranslatorRegistry() {}
+    IReporter::~IReporter() {}
+    IReporterFactory::~IReporterFactory() {}
+    IReporterRegistry::~IReporterRegistry() {}
+    IStreamingReporter::~IStreamingReporter() {}
+    AssertionStats::~AssertionStats() {}
+    SectionStats::~SectionStats() {}
+    TestCaseStats::~TestCaseStats() {}
+    TestGroupStats::~TestGroupStats() {}
+    TestRunStats::~TestRunStats() {}
+    CumulativeReporterBase::SectionNode::~SectionNode() {}
+    CumulativeReporterBase::~CumulativeReporterBase() {}
+
+    StreamingReporterBase::~StreamingReporterBase() {}
+    ConsoleReporter::~ConsoleReporter() {}
+    CompactReporter::~CompactReporter() {}
+    IRunner::~IRunner() {}
+    IMutableContext::~IMutableContext() {}
+    IConfig::~IConfig() {}
+    XmlReporter::~XmlReporter() {}
+    JunitReporter::~JunitReporter() {}
+    TestRegistry::~TestRegistry() {}
+    FreeFunctionTestCase::~FreeFunctionTestCase() {}
+    IGeneratorInfo::~IGeneratorInfo() {}
+    IGeneratorsForTest::~IGeneratorsForTest() {}
+    WildcardPattern::~WildcardPattern() {}
+    TestSpec::Pattern::~Pattern() {}
+    TestSpec::NamePattern::~NamePattern() {}
+    TestSpec::TagPattern::~TagPattern() {}
+    TestSpec::ExcludedPattern::~ExcludedPattern() {}
+
+    Matchers::Impl::StdString::Equals::~Equals() {}
+    Matchers::Impl::StdString::Contains::~Contains() {}
+    Matchers::Impl::StdString::StartsWith::~StartsWith() {}
+    Matchers::Impl::StdString::EndsWith::~EndsWith() {}
+
+    void Config::dummy() {}
+
+    namespace TestCaseTracking {
+        ITracker::~ITracker() {}
+        TrackerBase::~TrackerBase() {}
+        SectionTracker::~SectionTracker() {}
+        IndexTracker::~IndexTracker() {}
+    }
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#endif
+
+#ifdef CATCH_CONFIG_MAIN
+// #included from: internal/catch_default_main.hpp
+#define TWOBLUECUBES_CATCH_DEFAULT_MAIN_HPP_INCLUDED
+
+#ifndef __OBJC__
+
+// Standard C/C++ main entry point
+int main (int argc, char * argv[]) {
+    return Catch::Session().run( argc, argv );
+}
+
+#else // __OBJC__
+
+// Objective-C entry point
+int main (int argc, char * const argv[]) {
+#if !CATCH_ARC_ENABLED
+    NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init];
+#endif
+
+    Catch::registerTestMethods();
+    int result = Catch::Session().run( argc, (char* const*)argv );
+
+#if !CATCH_ARC_ENABLED
+    [pool drain];
+#endif
+
+    return result;
+}
+
+#endif // __OBJC__
+
+#endif
+
+#ifdef CLARA_CONFIG_MAIN_NOT_DEFINED
+#  undef CLARA_CONFIG_MAIN
+#endif
+
+//////
+
+// If this config identifier is defined then all CATCH macros are prefixed with CATCH_
+#ifdef CATCH_CONFIG_PREFIX_ALL
+
+#define CATCH_REQUIRE( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::Normal, "CATCH_REQUIRE" )
+#define CATCH_REQUIRE_FALSE( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::Normal | Catch::ResultDisposition::FalseTest, "CATCH_REQUIRE_FALSE" )
+
+#define CATCH_REQUIRE_THROWS( expr ) INTERNAL_CATCH_THROWS( expr, Catch::ResultDisposition::Normal, "", "CATCH_REQUIRE_THROWS" )
+#define CATCH_REQUIRE_THROWS_AS( expr, exceptionType ) INTERNAL_CATCH_THROWS_AS( expr, exceptionType, Catch::ResultDisposition::Normal, "CATCH_REQUIRE_THROWS_AS" )
+#define CATCH_REQUIRE_THROWS_WITH( expr, matcher ) INTERNAL_CATCH_THROWS( expr, Catch::ResultDisposition::Normal, matcher, "CATCH_REQUIRE_THROWS_WITH" )
+#define CATCH_REQUIRE_NOTHROW( expr ) INTERNAL_CATCH_NO_THROW( expr, Catch::ResultDisposition::Normal, "CATCH_REQUIRE_NOTHROW" )
+
+#define CATCH_CHECK( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::ContinueOnFailure, "CATCH_CHECK" )
+#define CATCH_CHECK_FALSE( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::ContinueOnFailure | Catch::ResultDisposition::FalseTest, "CATCH_CHECK_FALSE" )
+#define CATCH_CHECKED_IF( expr ) INTERNAL_CATCH_IF( expr, Catch::ResultDisposition::ContinueOnFailure, "CATCH_CHECKED_IF" )
+#define CATCH_CHECKED_ELSE( expr ) INTERNAL_CATCH_ELSE( expr, Catch::ResultDisposition::ContinueOnFailure, "CATCH_CHECKED_ELSE" )
+#define CATCH_CHECK_NOFAIL( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::ContinueOnFailure | Catch::ResultDisposition::SuppressFail, "CATCH_CHECK_NOFAIL" )
+
+#define CATCH_CHECK_THROWS( expr )  INTERNAL_CATCH_THROWS( expr, Catch::ResultDisposition::ContinueOnFailure, "CATCH_CHECK_THROWS" )
+#define CATCH_CHECK_THROWS_AS( expr, exceptionType ) INTERNAL_CATCH_THROWS_AS( expr, exceptionType, Catch::ResultDisposition::ContinueOnFailure, "CATCH_CHECK_THROWS_AS" )
+#define CATCH_CHECK_THROWS_WITH( expr, matcher ) INTERNAL_CATCH_THROWS( expr, Catch::ResultDisposition::ContinueOnFailure, matcher, "CATCH_CHECK_THROWS_WITH" )
+#define CATCH_CHECK_NOTHROW( expr ) INTERNAL_CATCH_NO_THROW( expr, Catch::ResultDisposition::ContinueOnFailure, "CATCH_CHECK_NOTHROW" )
+
+// Bug fix: inside the CATCH_CONFIG_PREFIX_ALL section every macro must carry
+// the CATCH_ prefix; defining a bare CHECK_THAT here leaked an unprefixed name
+// into user code. The stringified label on this line ("CATCH_CHECK_THAT")
+// shows the intended name; this was corrected upstream in Catch 1.6.1.
+#define CATCH_CHECK_THAT( arg, matcher ) INTERNAL_CHECK_THAT( arg, matcher, Catch::ResultDisposition::ContinueOnFailure, "CATCH_CHECK_THAT" )
+#define CATCH_REQUIRE_THAT( arg, matcher ) INTERNAL_CHECK_THAT( arg, matcher, Catch::ResultDisposition::Normal, "CATCH_REQUIRE_THAT" )
+
+#define CATCH_INFO( msg ) INTERNAL_CATCH_INFO( msg, "CATCH_INFO" )
+#define CATCH_WARN( msg ) INTERNAL_CATCH_MSG( Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, "CATCH_WARN", msg )
+#define CATCH_SCOPED_INFO( msg ) INTERNAL_CATCH_INFO( msg, "CATCH_INFO" )
+#define CATCH_CAPTURE( msg ) INTERNAL_CATCH_INFO( #msg " := " << msg, "CATCH_CAPTURE" )
+#define CATCH_SCOPED_CAPTURE( msg ) INTERNAL_CATCH_INFO( #msg " := " << msg, "CATCH_CAPTURE" )
+
+#ifdef CATCH_CONFIG_VARIADIC_MACROS
+    #define CATCH_TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE( __VA_ARGS__ )
+    #define CATCH_TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEST_CASE_METHOD( className, __VA_ARGS__ )
+    #define CATCH_METHOD_AS_TEST_CASE( method, ... ) INTERNAL_CATCH_METHOD_AS_TEST_CASE( method, __VA_ARGS__ )
+    #define CATCH_REGISTER_TEST_CASE( Function, ... ) INTERNAL_CATCH_REGISTER_TESTCASE( Function, __VA_ARGS__ )
+    #define CATCH_SECTION( ... ) INTERNAL_CATCH_SECTION( __VA_ARGS__ )
+    #define CATCH_FAIL( ... ) INTERNAL_CATCH_MSG( Catch::ResultWas::ExplicitFailure, Catch::ResultDisposition::Normal, "CATCH_FAIL", __VA_ARGS__ )
+    #define CATCH_SUCCEED( ... ) INTERNAL_CATCH_MSG( Catch::ResultWas::Ok, Catch::ResultDisposition::ContinueOnFailure, "CATCH_SUCCEED", __VA_ARGS__ )
+#else
+    #define CATCH_TEST_CASE( name, description ) INTERNAL_CATCH_TESTCASE( name, description )
+    #define CATCH_TEST_CASE_METHOD( className, name, description ) INTERNAL_CATCH_TEST_CASE_METHOD( className, name, description )
+    #define CATCH_METHOD_AS_TEST_CASE( method, name, description ) INTERNAL_CATCH_METHOD_AS_TEST_CASE( method, name, description )
+    #define CATCH_REGISTER_TEST_CASE( function, name, description ) INTERNAL_CATCH_REGISTER_TESTCASE( function, name, description )
+    #define CATCH_SECTION( name, description ) INTERNAL_CATCH_SECTION( name, description )
+    #define CATCH_FAIL( msg ) INTERNAL_CATCH_MSG( Catch::ResultWas::ExplicitFailure, Catch::ResultDisposition::Normal, "CATCH_FAIL", msg )
+    #define CATCH_SUCCEED( msg ) INTERNAL_CATCH_MSG( Catch::ResultWas::Ok, Catch::ResultDisposition::ContinueOnFailure, "CATCH_SUCCEED", msg )
+#endif
+#define CATCH_ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE( "", "" )
+
+#define CATCH_REGISTER_REPORTER( name, reporterType ) INTERNAL_CATCH_REGISTER_REPORTER( name, reporterType )
+#define CATCH_REGISTER_LEGACY_REPORTER( name, reporterType ) INTERNAL_CATCH_REGISTER_LEGACY_REPORTER( name, reporterType )
+
+#define CATCH_GENERATE( expr) INTERNAL_CATCH_GENERATE( expr )
+
+// "BDD-style" convenience wrappers
+#ifdef CATCH_CONFIG_VARIADIC_MACROS
+#define CATCH_SCENARIO( ... ) CATCH_TEST_CASE( "Scenario: " __VA_ARGS__ )
+#define CATCH_SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TEST_CASE_METHOD( className, "Scenario: " __VA_ARGS__ )
+#else
+#define CATCH_SCENARIO( name, tags ) CATCH_TEST_CASE( "Scenario: " name, tags )
+#define CATCH_SCENARIO_METHOD( className, name, tags ) INTERNAL_CATCH_TEST_CASE_METHOD( className, "Scenario: " name, tags )
+#endif
+#define CATCH_GIVEN( desc )    CATCH_SECTION( std::string( "Given: ") + desc, "" )
+#define CATCH_WHEN( desc )     CATCH_SECTION( std::string( " When: ") + desc, "" )
+#define CATCH_AND_WHEN( desc ) CATCH_SECTION( std::string( "  And: ") + desc, "" )
+#define CATCH_THEN( desc )     CATCH_SECTION( std::string( " Then: ") + desc, "" )
+#define CATCH_AND_THEN( desc ) CATCH_SECTION( std::string( "  And: ") + desc, "" )
+
+// If CATCH_CONFIG_PREFIX_ALL is not defined then the CATCH_ prefix is not required
+#else
+
+#define REQUIRE( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::Normal, "REQUIRE" )
+#define REQUIRE_FALSE( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::Normal | Catch::ResultDisposition::FalseTest, "REQUIRE_FALSE" )
+
+#define REQUIRE_THROWS( expr ) INTERNAL_CATCH_THROWS( expr, Catch::ResultDisposition::Normal, "", "REQUIRE_THROWS" )
+#define REQUIRE_THROWS_AS( expr, exceptionType ) INTERNAL_CATCH_THROWS_AS( expr, exceptionType, Catch::ResultDisposition::Normal, "REQUIRE_THROWS_AS" )
+#define REQUIRE_THROWS_WITH( expr, matcher ) INTERNAL_CATCH_THROWS( expr, Catch::ResultDisposition::Normal, matcher, "REQUIRE_THROWS_WITH" )
+#define REQUIRE_NOTHROW( expr ) INTERNAL_CATCH_NO_THROW( expr, Catch::ResultDisposition::Normal, "REQUIRE_NOTHROW" )
+
+#define CHECK( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::ContinueOnFailure, "CHECK" )
+#define CHECK_FALSE( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::ContinueOnFailure | Catch::ResultDisposition::FalseTest, "CHECK_FALSE" )
+#define CHECKED_IF( expr ) INTERNAL_CATCH_IF( expr, Catch::ResultDisposition::ContinueOnFailure, "CHECKED_IF" )
+#define CHECKED_ELSE( expr ) INTERNAL_CATCH_ELSE( expr, Catch::ResultDisposition::ContinueOnFailure, "CHECKED_ELSE" )
+#define CHECK_NOFAIL( expr ) INTERNAL_CATCH_TEST( expr, Catch::ResultDisposition::ContinueOnFailure | Catch::ResultDisposition::SuppressFail, "CHECK_NOFAIL" )
+
+#define CHECK_THROWS( expr )  INTERNAL_CATCH_THROWS( expr, Catch::ResultDisposition::ContinueOnFailure, "", "CHECK_THROWS" )
+#define CHECK_THROWS_AS( expr, exceptionType ) INTERNAL_CATCH_THROWS_AS( expr, exceptionType, Catch::ResultDisposition::ContinueOnFailure, "CHECK_THROWS_AS" )
+#define CHECK_THROWS_WITH( expr, matcher ) INTERNAL_CATCH_THROWS( expr, Catch::ResultDisposition::ContinueOnFailure, matcher, "CHECK_THROWS_WITH" )
+#define CHECK_NOTHROW( expr ) INTERNAL_CATCH_NO_THROW( expr, Catch::ResultDisposition::ContinueOnFailure, "CHECK_NOTHROW" )
+
+#define CHECK_THAT( arg, matcher ) INTERNAL_CHECK_THAT( arg, matcher, Catch::ResultDisposition::ContinueOnFailure, "CHECK_THAT" )
+#define REQUIRE_THAT( arg, matcher ) INTERNAL_CHECK_THAT( arg, matcher, Catch::ResultDisposition::Normal, "REQUIRE_THAT" )
+
+#define INFO( msg ) INTERNAL_CATCH_INFO( msg, "INFO" )
+#define WARN( msg ) INTERNAL_CATCH_MSG( Catch::ResultWas::Warning, Catch::ResultDisposition::ContinueOnFailure, "WARN", msg )
+#define SCOPED_INFO( msg ) INTERNAL_CATCH_INFO( msg, "INFO" )
+#define CAPTURE( msg ) INTERNAL_CATCH_INFO( #msg " := " << msg, "CAPTURE" )
+#define SCOPED_CAPTURE( msg ) INTERNAL_CATCH_INFO( #msg " := " << msg, "CAPTURE" )
+
+#ifdef CATCH_CONFIG_VARIADIC_MACROS
+    #define TEST_CASE( ... ) INTERNAL_CATCH_TESTCASE( __VA_ARGS__ )
+    #define TEST_CASE_METHOD( className, ... ) INTERNAL_CATCH_TEST_CASE_METHOD( className, __VA_ARGS__ )
+    #define METHOD_AS_TEST_CASE( method, ... ) INTERNAL_CATCH_METHOD_AS_TEST_CASE( method, __VA_ARGS__ )
+    #define REGISTER_TEST_CASE( Function, ... ) INTERNAL_CATCH_REGISTER_TESTCASE( Function, __VA_ARGS__ )
+    #define SECTION( ... ) INTERNAL_CATCH_SECTION( __VA_ARGS__ )
+    #define FAIL( ... ) INTERNAL_CATCH_MSG( Catch::ResultWas::ExplicitFailure, Catch::ResultDisposition::Normal, "FAIL", __VA_ARGS__ )
+    #define SUCCEED( ... ) INTERNAL_CATCH_MSG( Catch::ResultWas::Ok, Catch::ResultDisposition::ContinueOnFailure, "SUCCEED", __VA_ARGS__ )
+#else
+    #define TEST_CASE( name, description ) INTERNAL_CATCH_TESTCASE( name, description )
+    #define TEST_CASE_METHOD( className, name, description ) INTERNAL_CATCH_TEST_CASE_METHOD( className, name, description )
+    #define METHOD_AS_TEST_CASE( method, name, description ) INTERNAL_CATCH_METHOD_AS_TEST_CASE( method, name, description )
+    #define REGISTER_TEST_CASE( method, name, description ) INTERNAL_CATCH_REGISTER_TESTCASE( method, name, description )
+    #define SECTION( name, description ) INTERNAL_CATCH_SECTION( name, description )
+    #define FAIL( msg ) INTERNAL_CATCH_MSG( Catch::ResultWas::ExplicitFailure, Catch::ResultDisposition::Normal, "FAIL", msg )
+    #define SUCCEED( msg ) INTERNAL_CATCH_MSG( Catch::ResultWas::Ok, Catch::ResultDisposition::ContinueOnFailure, "SUCCEED", msg )
+#endif
+#define ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE( "", "" )
+
+#define REGISTER_REPORTER( name, reporterType ) INTERNAL_CATCH_REGISTER_REPORTER( name, reporterType )
+#define REGISTER_LEGACY_REPORTER( name, reporterType ) INTERNAL_CATCH_REGISTER_LEGACY_REPORTER( name, reporterType )
+
+#define GENERATE( expr) INTERNAL_CATCH_GENERATE( expr )
+
+#endif
+
+#define CATCH_TRANSLATE_EXCEPTION( signature ) INTERNAL_CATCH_TRANSLATE_EXCEPTION( signature )
+
+// "BDD-style" convenience wrappers
+#ifdef CATCH_CONFIG_VARIADIC_MACROS
+#define SCENARIO( ... ) TEST_CASE( "Scenario: " __VA_ARGS__ )
+#define SCENARIO_METHOD( className, ... ) INTERNAL_CATCH_TEST_CASE_METHOD( className, "Scenario: " __VA_ARGS__ )
+#else
+#define SCENARIO( name, tags ) TEST_CASE( "Scenario: " name, tags )
+#define SCENARIO_METHOD( className, name, tags ) INTERNAL_CATCH_TEST_CASE_METHOD( className, "Scenario: " name, tags )
+#endif
+#define GIVEN( desc )    SECTION( std::string("   Given: ") + desc, "" )
+#define WHEN( desc )     SECTION( std::string("    When: ") + desc, "" )
+#define AND_WHEN( desc ) SECTION( std::string("And when: ") + desc, "" )
+#define THEN( desc )     SECTION( std::string("    Then: ") + desc, "" )
+#define AND_THEN( desc ) SECTION( std::string("     And: ") + desc, "" )
+
+using Catch::Detail::Approx;
+
+#endif // TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED
diff --git a/tests/cc/test_array_table.cc b/tests/cc/test_array_table.cc
new file mode 100644
index 0000000..190f927
--- /dev/null
+++ b/tests/cc/test_array_table.cc
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2017 Politecnico di Torino
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BPF.h"
+
+#include "catch.hpp"
+
+#include <random>
+#include <iostream>
+
+#include <linux/version.h>
+
+// Exercises ebpf::BPFArrayTable<int>: rejects a wrong-type lookup, checks
+// basic get/update semantics, and dumps the full table offline, comparing it
+// against a shadow std::vector filled with random values.
+TEST_CASE("test array table", "[array_table]") {
+  const std::string BPF_PROGRAM = R"(
+    BPF_TABLE("hash", int, int, myhash, 128);
+    BPF_TABLE("array", int, int, myarray, 128);
+  )";
+
+  // turn off the rw_engine
+  ebpf::BPF bpf(0, nullptr, false);
+  ebpf::StatusTuple res(0);
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+
+  ebpf::BPFArrayTable<int> t = bpf.get_array_table<int>("myarray");
+
+  SECTION("bad table type") {
+    // try to get table of wrong type: "myhash" is a hash map, so asking for
+    // it as an array table must throw.
+    auto f1 = [&](){
+      bpf.get_array_table<int>("myhash");
+    };
+
+    REQUIRE_THROWS(f1());
+  }
+
+  SECTION("standard methods") {
+    int i, v1, v2;
+    i = 1;
+    v1 = 42;
+    // update element, then read it back through a separate variable
+    res = t.update_value(i, v1);
+    REQUIRE(res.code() == 0);
+    res = t.get_value(i, v2);
+    REQUIRE(res.code() == 0);
+    REQUIRE(v2 == 42);
+
+    // update another element
+    i = 2;
+    v1 = 69;
+    res = t.update_value(i, v1);
+    REQUIRE(res.code() == 0);
+    res = t.get_value(i, v2);
+    REQUIRE(res.code() == 0);
+    REQUIRE(v2 == 69);
+
+    // get non existing element: index 1024 is beyond the declared size of
+    // 128, so the lookup must report an error.
+    i = 1024;
+    res = t.get_value(i, v2);
+    REQUIRE(res.code() != 0);
+  }
+
+  SECTION("full table") {
+    // random number generator
+    std::mt19937 rng;
+    rng.seed(std::random_device()());
+    std::uniform_int_distribution<int> dist;
+
+    std::vector<int> localtable(128);
+
+    // fill every slot of the BPF array with a random value
+    for(int i = 0; i < 128; i++) {
+      int v = dist(rng);
+
+      res = t.update_value(i, v);
+      REQUIRE(res.code() == 0);
+
+      // save it in the local table to compare later on
+      localtable[i] = v;
+    }
+
+    // the offline dump must reproduce the shadow copy element-for-element
+    std::vector<int> offlinetable = t.get_table_offline();
+    REQUIRE(localtable == offlinetable);
+  }
+}
+
+// Per-CPU array maps require kernel >= 4.6 (BPF_MAP_TYPE_PERCPU_ARRAY).
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
+// Exercises ebpf::BPFPercpuArrayTable<uint64_t>: wrong-type lookup rejection
+// and per-CPU update/get, where each element is a vector with one slot per
+// possible CPU.
+TEST_CASE("percpu array table", "[percpu_array_table]") {
+  const std::string BPF_PROGRAM = R"(
+    BPF_TABLE("percpu_hash", int, u64, myhash, 128);
+    BPF_TABLE("percpu_array", int, u64, myarray, 64);
+  )";
+
+  ebpf::BPF bpf;
+  ebpf::StatusTuple res(0);
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+
+  ebpf::BPFPercpuArrayTable<uint64_t> t = bpf.get_percpu_array_table<uint64_t>("myarray");
+  size_t ncpus = ebpf::BPFTable::get_possible_cpu_count();
+
+  SECTION("bad table type") {
+    // try to get table of wrong type: "myhash" is a percpu hash, not a
+    // percpu array, so this accessor must throw.
+    auto f1 = [&](){
+      bpf.get_percpu_array_table<uint64_t>("myhash");
+    };
+
+    REQUIRE_THROWS(f1());
+  }
+
+  SECTION("standard methods") {
+    int i;
+    // one value slot per possible CPU, distinguishable per CPU (42 * j)
+    std::vector<uint64_t> v1(ncpus);
+    std::vector<uint64_t> v2;
+
+    for (size_t j = 0; j < ncpus; j++) {
+      v1[j] = 42 * j;
+    }
+
+    i = 1;
+    // update element, then read it back and verify every per-CPU slot
+    res = t.update_value(i, v1);
+    REQUIRE(res.code() == 0);
+    res = t.get_value(i, v2);
+    REQUIRE(res.code() == 0);
+    REQUIRE(v2.size() == ncpus);
+    for (size_t j = 0; j < ncpus; j++) {
+      REQUIRE(v2.at(j) == 42 * j);
+    }
+
+    // get non existing element: index 1024 exceeds the declared size of 64
+    i = 1024;
+    res = t.get_value(i, v2);
+    REQUIRE(res.code() != 0);
+  }
+}
+#endif
diff --git a/tests/cc/test_bpf_table.cc b/tests/cc/test_bpf_table.cc
new file mode 100644
index 0000000..40ee0af
--- /dev/null
+++ b/tests/cc/test_bpf_table.cc
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017 Politecnico di Torino
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <linux/version.h>
+#include <unistd.h>
+#include <string>
+
+#include "BPF.h"
+#include "catch.hpp"
+
+// End-to-end coverage of the string-typed ebpf::BPFTable wrapper on a hash
+// map: update/get/remove, offline dump, non-atomic clear, and verification
+// that table accessors fail cleanly after the owning BPF module is deleted.
+TEST_CASE("test bpf table", "[bpf_table]") {
+  const std::string BPF_PROGRAM = R"(
+    BPF_TABLE("hash", int, int, myhash, 128);
+  )";
+
+  // heap-allocate the module so it can be destroyed mid-test below
+  ebpf::BPF *bpf(new ebpf::BPF);
+  ebpf::StatusTuple res(0);
+  std::vector<std::pair<std::string, std::string>> elements;
+  res = bpf->init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+
+  ebpf::BPFTable t = bpf->get_table("myhash");
+
+  // update element ("0x07" and "0x7" parse to the same int key)
+  std::string value;
+  res = t.update_value("0x07", "0x42");
+  REQUIRE(res.code() == 0);
+  res = t.get_value("0x7", value);
+  REQUIRE(res.code() == 0);
+  REQUIRE(value == "0x42");
+
+  // update another element
+  res = t.update_value("0x11", "0x777");
+  REQUIRE(res.code() == 0);
+  res = t.get_value("0x11", value);
+  REQUIRE(res.code() == 0);
+  REQUIRE(value == "0x777");
+
+  // remove value; a subsequent lookup must fail
+  res = t.remove_value("0x11");
+  REQUIRE(res.code() == 0);
+  res = t.get_value("0x11", value);
+  REQUIRE(res.code() != 0);
+
+  res = t.update_value("0x15", "0x888");
+  REQUIRE(res.code() == 0);
+  res = t.get_table_offline(elements);
+  REQUIRE(res.code() == 0);
+  REQUIRE(elements.size() == 2);
+
+  // check that elements match what is in the table
+  for (auto &it : elements) {
+    if (it.first == "0x15") {
+      REQUIRE(it.second == "0x888");
+    } else if (it.first == "0x7") {
+      REQUIRE(it.second == "0x42");
+    } else {
+      // Bug fix: FAIL takes a single message argument; the stray second
+      // argument (`, it.first`) was dead code under variadic macros and a
+      // compile error without CATCH_CONFIG_VARIADIC_MACROS.
+      FAIL("Element " + it.first + " should not be on the table");
+    }
+  }
+
+  // clearing the table must leave an empty offline dump
+  res = t.clear_table_non_atomic();
+  REQUIRE(res.code() == 0);
+  res = t.get_table_offline(elements);
+  REQUIRE(res.code() == 0);
+  REQUIRE(elements.size() == 0);
+
+  // delete bpf_module, call to key/leaf printf/scanf must fail
+  delete bpf;
+
+  res = t.update_value("0x07", "0x42");
+  REQUIRE(res.code() != 0);
+
+  res = t.get_value("0x07", value);
+  REQUIRE(res.code() != 0);
+
+  res = t.remove_value("0x07");
+  REQUIRE(res.code() != 0);
+}
+
+// Per-CPU hash maps require kernel >= 4.6 (BPF_MAP_TYPE_PERCPU_HASH).
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
+// Exercises the string-typed BPFTable interface on a percpu hash: one string
+// value per possible CPU is written and read back.
+TEST_CASE("test bpf percpu tables", "[bpf_percpu_table]") {
+  const std::string BPF_PROGRAM = R"(
+    BPF_TABLE("percpu_hash", int, u64, myhash, 128);
+  )";
+
+  ebpf::BPF bpf;
+  ebpf::StatusTuple res(0);
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+
+  ebpf::BPFTable t = bpf.get_table("myhash");
+  size_t ncpus = ebpf::BPFTable::get_possible_cpu_count();
+
+  // one distinguishable value per possible CPU, serialized as decimal text
+  std::vector<std::string> v1(ncpus);
+  for (size_t i = 0; i < ncpus; i++) {
+    v1.at(i) = std::to_string(42 * i);
+  }
+
+  // update element
+  std::vector<std::string> value;
+  res = t.update_value("0x07", v1);
+  REQUIRE(res.code() == 0);
+  res = t.get_value("0x07", value);
+  REQUIRE(res.code() == 0);
+  // NOTE(review): values are written via std::to_string (decimal) but parsed
+  // back here with base-16 stoul — this relies on BPFTable's leaf codec
+  // accepting decimal on write and emitting hex on read; confirm against the
+  // table's snprintf/sscanf formats.
+  for (size_t i = 0; i < ncpus; i++) {
+    REQUIRE(42 * i == std::stoul(value.at(i), nullptr, 16));
+  }
+}
+#endif
+
+// Exercises the typed ebpf::BPFHashTable<int, int> wrapper: update/get via
+// both operator[] and get_value, remove, offline dump, and non-atomic clear.
+TEST_CASE("test bpf hash table", "[bpf_hash_table]") {
+  const std::string BPF_PROGRAM = R"(
+    BPF_HASH(myhash, int, int, 128);
+  )";
+
+  ebpf::BPF bpf;
+  ebpf::StatusTuple res(0);
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+
+  auto t = bpf.get_hash_table<int, int>("myhash");
+
+  int key, value;
+
+  // update element, read it back via operator[]
+  key = 0x08;
+  value = 0x43;
+  res = t.update_value(key, value);
+  REQUIRE(res.code() == 0);
+  REQUIRE(t[key] == value);
+
+  // update another element
+  key = 0x12;
+  value = 0x778;
+  res = t.update_value(key, value);
+  REQUIRE(res.code() == 0);
+  key = 0x31;
+  value = 0x123;
+  res = t.update_value(key, value);
+  REQUIRE(res.code() == 0);
+  key = 0x12;
+  value = 0;
+  res = t.get_value(key, value);
+  REQUIRE(res.code() == 0);
+  REQUIRE(value == 0x778);
+
+  // remove value and dump table: 3 inserts - 1 removal = 2 remaining
+  key = 0x12;
+  res = t.remove_value(key);
+  REQUIRE(res.code() == 0);
+  auto values = t.get_table_offline();
+  REQUIRE(values.size() == 2);
+
+  // clear table
+  res = t.clear_table_non_atomic();
+  REQUIRE(res.code() == 0);
+  values = t.get_table_offline();
+  REQUIRE(values.size() == 0);
+}
+
+// Attaches a kprobe on the getuid syscall that records a stack trace id in a
+// hash map, then resolves that id to addresses and symbols via the stack
+// table, and finally verifies clearing the stack table works.
+TEST_CASE("test bpf stack table", "[bpf_stack_table]") {
+// BPF_STACK_TRACE / BPF_F_REUSE_STACKID require kernel >= 4.6
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
+  const std::string BPF_PROGRAM = R"(
+    BPF_HASH(id, int, int, 1);
+    BPF_STACK_TRACE(stack_traces, 8);
+
+    int on_sys_getuid(void *ctx) {
+      int stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
+      int zero = 0, *val;
+      val = id.lookup_or_init(&zero, &stack_id);
+      (*val) = stack_id;
+
+      return 0;
+    }
+  )";
+
+  ebpf::BPF bpf;
+  ebpf::StatusTuple res(0);
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+  std::string getuid_fnname = bpf.get_syscall_fnname("getuid");
+  res = bpf.attach_kprobe(getuid_fnname, "on_sys_getuid");
+  REQUIRE(res.code() == 0);
+  // invoke getuid() so the attached kprobe fires at least once
+  // NOTE(review): uid_t is unsigned, so ">= 0" is trivially true — the call
+  // itself is what matters here, not the comparison.
+  REQUIRE(getuid() >= 0);
+  res = bpf.detach_kprobe(getuid_fnname);
+  REQUIRE(res.code() == 0);
+
+  auto id = bpf.get_hash_table<int, int>("id");
+  auto stack_traces = bpf.get_stack_table("stack_traces");
+
+  // the probe stored the captured stack id under key 0
+  int stack_id = id[0];
+  REQUIRE(stack_id >= 0);
+
+  // resolve the stack id to addresses and symbols; counts must agree, and
+  // the getuid syscall handler must appear somewhere in the symbolized stack
+  auto addrs = stack_traces.get_stack_addr(stack_id);
+  auto symbols = stack_traces.get_stack_symbol(stack_id, -1);
+  REQUIRE(addrs.size() > 0);
+  REQUIRE(addrs.size() == symbols.size());
+  bool found = false;
+  for (const auto &symbol : symbols)
+    if (symbol.find("sys_getuid") != std::string::npos) {
+      found = true;
+      break;
+    }
+  REQUIRE(found);
+
+  // clearing the stack table must invalidate the previously valid id
+  stack_traces.clear_table_non_atomic();
+  addrs = stack_traces.get_stack_addr(stack_id);
+  REQUIRE(addrs.size() == 0);
+#endif
+}
diff --git a/tests/cc/test_c_api.cc b/tests/cc/test_c_api.cc
new file mode 100644
index 0000000..60804a0
--- /dev/null
+++ b/tests/cc/test_c_api.cc
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <fcntl.h>
+#include <dlfcn.h>
+#include <stdint.h>
+#include <string.h>
+#include <link.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "bcc_elf.h"
+#include "bcc_perf_map.h"
+#include "bcc_proc.h"
+#include "bcc_syms.h"
+#include "common.h"
+#include "vendor/tinyformat.hpp"
+
+#include "catch.hpp"
+
+using namespace std;
+
+static pid_t spawn_child(void *, bool, bool, int (*)(void *));
+
+TEST_CASE("language detection", "[c_api]") {
+  const char *c = bcc_procutils_language(getpid());
+  REQUIRE(c);
+  REQUIRE(string(c).compare("c") == 0);
+}
+
+TEST_CASE("shared object resolution", "[c_api]") {
+  char *libm = bcc_procutils_which_so("m", 0);
+  REQUIRE(libm);
+  REQUIRE(libm[0] == '/');
+  REQUIRE(string(libm).find("libm.so") != string::npos);
+  free(libm);
+}
+
+TEST_CASE("shared object resolution using loaded libraries", "[c_api]") {
+  char *libelf = bcc_procutils_which_so("elf", getpid());
+  REQUIRE(libelf);
+  REQUIRE(libelf[0] == '/');
+  REQUIRE(string(libelf).find("libelf") != string::npos);
+  free(libelf);
+}
+
+TEST_CASE("binary resolution with `which`", "[c_api]") {
+  char *ld = bcc_procutils_which("ld");
+  REQUIRE(ld);
+  REQUIRE(ld[0] == '/');
+  free(ld);
+}
+
+// Callback for bcc_procutils_each_ksym: the x86-64 kernel entry symbol
+// startup_64 must have a non-zero address when listed.
+static void _test_ksym(const char *sym, uint64_t addr, void *_) {
+  if (!strcmp(sym, "startup_64"))
+    REQUIRE(addr != 0x0ull);
+}
+
+TEST_CASE("list all kernel symbols", "[c_api]") {
+  if (geteuid() != 0)
+    return;
+  bcc_procutils_each_ksym(_test_ksym, NULL);
+}
+
+TEST_CASE("file-backed mapping identification") {
+  CHECK(bcc_mapping_is_file_backed("/bin/ls") == 1);
+  CHECK(bcc_mapping_is_file_backed("") == 0);
+  CHECK(bcc_mapping_is_file_backed("//anon") == 0);
+  CHECK(bcc_mapping_is_file_backed("/dev/zero") == 0);
+  CHECK(bcc_mapping_is_file_backed("/anon_hugepage") == 0);
+  CHECK(bcc_mapping_is_file_backed("/anon_hugepage (deleted)") == 0);
+  CHECK(bcc_mapping_is_file_backed("[stack") == 0);
+  CHECK(bcc_mapping_is_file_backed("/SYSV") == 0);
+  CHECK(bcc_mapping_is_file_backed("[heap]") == 0);
+}
+
+TEST_CASE("resolve symbol name in external library", "[c_api]") {
+  struct bcc_symbol sym;
+
+  REQUIRE(bcc_resolve_symname("c", "malloc", 0x0, 0, nullptr, &sym) == 0);
+  REQUIRE(string(sym.module).find("libc.so") != string::npos);
+  REQUIRE(sym.module[0] == '/');
+  REQUIRE(sym.offset != 0);
+  bcc_procutils_free(sym.module);
+}
+
+TEST_CASE("resolve symbol name in external library using loaded libraries", "[c_api]") {
+  struct bcc_symbol sym;
+
+  REQUIRE(bcc_resolve_symname("bcc", "bcc_procutils_which", 0x0, getpid(), nullptr, &sym) == 0);
+  REQUIRE(string(sym.module).find("libbcc.so") != string::npos);
+  REQUIRE(sym.module[0] == '/');
+  REQUIRE(sym.offset != 0);
+  bcc_procutils_free(sym.module);
+}
+
+extern "C" int _a_test_function(const char *a_string) {
+  int i;
+  for (i = 0; a_string[i]; ++i)
+    ;
+  return i;
+}
+
+// Makes this process's mount namespace self-contained: marks the whole tree
+// private (so our mounts don't propagate back) and overlays /tmp with a
+// fresh tmpfs. Returns 0 on success, -1 with a message on stderr otherwise.
+// Intended to be called from a clone(CLONE_NEWNS) child.
+static int setup_tmp_mnts(void) {
+  // Disconnect this mount namespace from its parent
+  if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0) {
+    fprintf(stderr, "unable to mark / PRIVATE: %s\n", strerror(errno));
+    return -1;
+  }
+  // create a new tmpfs mounted on /tmp
+  if (mount("tmpfs", "/tmp", "tmpfs", 0, NULL) < 0) {
+    fprintf(stderr, "unable to mount /tmp in mntns: %s\n", strerror(errno));
+    return -1;
+  }
+
+  return 0;
+}
+
+// Child body run inside a private mount namespace: locates the real
+// libz.so.1 via the dynamic loader, copies it into the namespace-private
+// /tmp, dlopens the copy, and sleeps so the parent can symbolize against
+// this namespace. Returns 0 on success, -1 on any failure.
+static int mntns_func(void *arg) {
+  int in_fd, out_fd;
+  char buf[4096];
+  char libpath[1024];
+  ssize_t rb;
+  void *dlhdl;
+  struct link_map *lm;
+
+  if (setup_tmp_mnts() < 0) {
+    return -1;
+  }
+
+  // Find libz.so.1, if it's installed
+  dlhdl = dlopen("libz.so.1", RTLD_LAZY);
+  if (dlhdl == NULL) {
+    fprintf(stderr, "Unable to dlopen libz.so.1: %s\n", dlerror());
+    return -1;
+  }
+
+  if (dlinfo(dlhdl, RTLD_DI_LINKMAP, &lm) < 0) {
+    fprintf(stderr, "Unable to find origin of libz.so.1: %s\n", dlerror());
+    dlclose(dlhdl);  // was leaked on this path
+    return -1;
+  }
+
+  // snprintf always NUL-terminates; strncpy would leave libpath
+  // unterminated if l_name is >= sizeof(libpath).
+  snprintf(libpath, sizeof(libpath), "%s", lm->l_name);
+  dlclose(dlhdl);
+  dlhdl = NULL;
+
+  // Copy the shared library from the shared mntns to the private /tmp.
+  in_fd = open(libpath, O_RDONLY);
+  if (in_fd < 0) {
+    fprintf(stderr, "Unable to open %s: %s\n", libpath, strerror(errno));
+    return -1;
+  }
+
+  out_fd = open("/tmp/libz.so.1", O_RDWR|O_CREAT|O_EXCL,
+      S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
+  if (out_fd < 0) {
+    fprintf(stderr, "Unable to open /tmp/libz.so.1: %s\n", strerror(errno));
+    close(in_fd);  // was leaked on this path
+    return -1;
+  }
+  memset(buf, 0, sizeof(buf));
+  while ((rb = read(in_fd, buf, sizeof(buf))) > 0) {
+    if (write(out_fd, buf, rb) < 0) {
+      fprintf(stderr, "Write error: %s\n", strerror(errno));
+      close(in_fd);
+      close(out_fd);
+      return -1;
+    }
+  }
+  close(in_fd);
+  close(out_fd);
+
+  dlhdl = dlopen("/tmp/libz.so.1", RTLD_NOW);
+  if (dlhdl == NULL) {
+    fprintf(stderr, "dlopen error: %s\n", dlerror());
+    return -1;
+  }
+
+  // Stay alive long enough for the parent test to resolve our symbols.
+  sleep(5);
+  dlclose(dlhdl);
+
+  return 0;
+}
+
+extern int cmd_scanf(const char *cmd, const char *fmt, ...);
+
+TEST_CASE("resolve symbol addresses for a given PID", "[c_api]") {
+  struct bcc_symbol sym;
+  void *resolver = bcc_symcache_new(getpid(), nullptr);
+
+  REQUIRE(resolver);
+
+  SECTION("resolve in our own binary memory space") {
+    REQUIRE(bcc_symcache_resolve(resolver, (uint64_t)&_a_test_function, &sym) ==
+            0);
+
+    char *this_exe = realpath("/proc/self/exe", NULL);
+    REQUIRE(string(this_exe) == sym.module);
+    free(this_exe);
+
+    REQUIRE(string("_a_test_function") == sym.name);
+  }
+
+  SECTION("resolve in libbcc.so") {
+    void *libbcc = dlopen("libbcc.so", RTLD_LAZY | RTLD_NOLOAD);
+    REQUIRE(libbcc);
+
+    void *libbcc_fptr = dlsym(libbcc, "bcc_resolve_symname");
+    REQUIRE(libbcc_fptr);
+
+    REQUIRE(bcc_symcache_resolve(resolver, (uint64_t)libbcc_fptr, &sym) == 0);
+    REQUIRE(string(sym.module).find("libbcc.so") != string::npos);
+    REQUIRE(string("bcc_resolve_symname") == sym.name);
+  }
+
+  SECTION("resolve in libc") {
+    void *libc_fptr = dlsym(NULL, "strtok");
+    REQUIRE(libc_fptr);
+
+    REQUIRE(bcc_symcache_resolve(resolver, (uint64_t)libc_fptr, &sym) == 0);
+    REQUIRE(sym.module);
+    REQUIRE(sym.module[0] == '/');
+    REQUIRE(string(sym.module).find("libc") != string::npos);
+
+    // In some cases, a symbol may have multiple aliases. Since
+    // bcc_symcache_resolve() returns only the first alias of a
+    // symbol, this may not always be "strtok" even if it points
+    // to the same address.
+    bool sym_match = (string("strtok") == sym.name);
+    if (!sym_match) {
+      uint64_t exp_addr, sym_addr;
+      char cmd[256];
+      const char *cmdfmt = "nm %s | grep \" %s$\" | cut -f 1 -d \" \"";
+
+      // Find address of symbol by the expected name
+      sprintf(cmd, cmdfmt, sym.module, "strtok");
+      REQUIRE(cmd_scanf(cmd, "%lx", &exp_addr) == 0);
+
+      // Find address of symbol by the name that was
+      // returned by bcc_symcache_resolve()
+      sprintf(cmd, cmdfmt, sym.module, sym.name);
+      REQUIRE(cmd_scanf(cmd, "%lx", &sym_addr) == 0);
+
+      // If both addresses match, they are definitely
+      // aliases of the same symbol
+      sym_match = (exp_addr == sym_addr);
+    }
+
+    REQUIRE(sym_match);
+  }
+
+  SECTION("resolve in separate mount namespace") {
+    pid_t child;
+    uint64_t addr = 0;
+
+    child = spawn_child(0, true, true, mntns_func);
+    REQUIRE(child > 0);
+
+    void *resolver = bcc_symcache_new(child, nullptr);
+    REQUIRE(resolver);
+
+    REQUIRE(bcc_symcache_resolve_name(resolver, "/tmp/libz.so.1", "zlibVersion",
+        &addr) == 0);
+    REQUIRE(addr != 0);
+  }
+}
+
+#define STACK_SIZE (1024 * 1024)
+static char child_stack[STACK_SIZE];
+
+// Builds the conventional JIT perf map path, "/tmp/perf-<pid>.map".
+static string perf_map_path(pid_t pid) {
+  return "/tmp/perf-" + std::to_string(pid) + ".map";
+}
+
+// Writes a minimal perf map file at `path` with two fake 0x10-byte symbols:
+// dummy_fn at map_addr and right_next_door_fn at map_addr + 0x10.
+// Each line follows the perf map convention "<hex addr> <hex size> <name>".
+// Returns 0 on success, -1 if the file cannot be created.
+static int make_perf_map_file(string &path, unsigned long long map_addr) {
+  FILE *file = fopen(path.c_str(), "w");
+  if (file == NULL) {
+    return -1;
+  }
+  fprintf(file, "%llx 10 dummy_fn\n", map_addr);
+  fprintf(file, "%llx 10 right_next_door_fn\n", map_addr + 0x10);
+  fclose(file);
+
+  return 0;
+}
+
+// Child body: writes a perf map for its own pid (arg carries the mapped
+// address), sleeps so the parent can resolve against it, then cleans up.
+static int perf_map_func(void *arg) {
+  string path = perf_map_path(getpid());
+  if (make_perf_map_file(path, (unsigned long long)arg) < 0)
+    return -1;
+
+  sleep(5);
+
+  unlink(path.c_str());
+  return 0;
+}
+
+// Like perf_map_func, but first detaches into a private mount namespace so
+// the perf map file only exists inside the child's own /tmp. Note the path
+// is computed before the tmpfs remount, but written after it.
+static int perf_map_func_mntns(void *arg) {
+  string path = perf_map_path(getpid());
+
+  if (setup_tmp_mnts() < 0) {
+    return -1;
+  }
+
+  if (make_perf_map_file(path, (unsigned long long)arg) < 0)
+    return -1;
+
+  sleep(5);
+
+  unlink(path.c_str());
+  return 0;
+}
+
+// Child body that only sets up a private mount namespace and sleeps; the
+// parent writes the perf map on the host side for this scenario.
+static int perf_map_func_noop(void *arg) {
+  if (setup_tmp_mnts() < 0) {
+    return -1;
+  }
+
+  sleep(5);
+
+  return 0;
+}
+
+// Clones a child running child_func(map_addr) on a static stack, optionally
+// in fresh PID and/or mount namespaces. SIGCHLD keeps the child reapable
+// with waitpid. Returns the child's pid in OUR namespace, or -1 on failure.
+// The fixed sleep(1) is a best-effort wait for the child's setup to finish.
+static pid_t spawn_child(void *map_addr, bool own_pidns, bool own_mntns,
+    int (*child_func)(void *)) {
+  int flags = SIGCHLD;
+  if (own_pidns)
+    flags |= CLONE_NEWPID;
+  if (own_mntns)
+    flags |= CLONE_NEWNS;
+
+  pid_t child = clone(child_func,
+      /* stack grows down */ child_stack + STACK_SIZE, flags, (void*)map_addr);
+  if (child < 0)
+    return -1;
+
+  sleep(1); // let the child get set up
+  return child;
+}
+
+TEST_CASE("resolve symbols using /tmp/perf-pid.map", "[c_api]") {
+  const int map_sz = 4096;
+  void *map_addr = mmap(NULL, map_sz, PROT_READ | PROT_EXEC,
+    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  REQUIRE(map_addr != MAP_FAILED);
+
+  struct bcc_symbol sym;
+  pid_t child = -1;
+
+  SECTION("same namespace") {
+    child = spawn_child(map_addr, /* own_pidns */ false, false, perf_map_func);
+    REQUIRE(child > 0);
+
+    void *resolver = bcc_symcache_new(child, nullptr);
+    REQUIRE(resolver);
+
+    REQUIRE(bcc_symcache_resolve(resolver, (unsigned long long)map_addr,
+        &sym) == 0);
+    REQUIRE(sym.module);
+    REQUIRE(string(sym.module) == perf_map_path(child));
+    REQUIRE(string("dummy_fn") == sym.name);
+
+    REQUIRE(bcc_symcache_resolve(resolver, (unsigned long long)map_addr + 0x10,
+        &sym) == 0);
+    REQUIRE(sym.module);
+    REQUIRE(string(sym.module) == perf_map_path(child));
+    REQUIRE(string("right_next_door_fn") == sym.name);
+  }
+
+  SECTION("separate namespace") {
+    child = spawn_child(map_addr, /* own_pidns */ true, false, perf_map_func);
+    REQUIRE(child > 0);
+
+    void *resolver = bcc_symcache_new(child, nullptr);
+    REQUIRE(resolver);
+
+    REQUIRE(bcc_symcache_resolve(resolver, (unsigned long long)map_addr,
+        &sym) == 0);
+    REQUIRE(sym.module);
+    // child is PID 1 in its namespace
+    REQUIRE(string(sym.module) == perf_map_path(1));
+    REQUIRE(string("dummy_fn") == sym.name);
+    unlink("/tmp/perf-1.map");
+  }
+
+  SECTION("separate pid and mount namespace") {
+    child = spawn_child(map_addr, /* own_pidns */ true, true,
+        perf_map_func_mntns);
+    REQUIRE(child > 0);
+
+    void *resolver = bcc_symcache_new(child, nullptr);
+    REQUIRE(resolver);
+
+    REQUIRE(bcc_symcache_resolve(resolver, (unsigned long long)map_addr,
+        &sym) == 0);
+    REQUIRE(sym.module);
+    // child is PID 1 in its namespace
+    REQUIRE(string(sym.module) == perf_map_path(1));
+    REQUIRE(string("dummy_fn") == sym.name);
+  }
+
+  SECTION("separate pid and mount namespace, perf-map in host") {
+    child = spawn_child(map_addr, /* own_pidns */ true, true,
+        perf_map_func_noop);
+    REQUIRE(child > 0);
+
+    string path = perf_map_path(child);
+    REQUIRE(make_perf_map_file(path, (unsigned long long)map_addr) == 0);
+
+    void *resolver = bcc_symcache_new(child, nullptr);
+    REQUIRE(resolver);
+
+    REQUIRE(bcc_symcache_resolve(resolver, (unsigned long long)map_addr,
+        &sym) == 0);
+    REQUIRE(sym.module);
+    // child is PID 1 in its namespace
+    REQUIRE(string(sym.module) == perf_map_path(child));
+    REQUIRE(string("dummy_fn") == sym.name);
+
+    unlink(path.c_str());
+  }
+
+
+
+  munmap(map_addr, map_sz);
+}
+
+
+TEST_CASE("get online CPUs", "[c_api]") {
+	std::vector<int> cpus = ebpf::get_online_cpus();
+	int num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	REQUIRE(cpus.size() == num_cpus);
+}
diff --git a/tests/cc/test_hash_table.cc b/tests/cc/test_hash_table.cc
new file mode 100644
index 0000000..1bcc306
--- /dev/null
+++ b/tests/cc/test_hash_table.cc
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2017 Politecnico di Torino
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BPF.h"
+#include <linux/version.h>
+
+#include "catch.hpp"
+
+TEST_CASE("test hash table", "[hash_table]") {
+  const std::string BPF_PROGRAM = R"(
+    BPF_TABLE("hash", int, int, myhash, 1024);
+    BPF_TABLE("array", int, int, myarray, 1024);
+  )";
+
+  ebpf::BPF bpf;
+  ebpf::StatusTuple res(0);
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+
+  ebpf::BPFHashTable<int, int> t = bpf.get_hash_table<int, int>("myhash");
+
+  SECTION("bad table type") {
+    // try to get table of wrong type
+    auto f1 = [&](){
+      bpf.get_hash_table<int, int>("myarray");
+    };
+
+    REQUIRE_THROWS(f1());
+  }
+
+  SECTION("standard methods") {
+    int k, v1, v2;
+    k = 1;
+    v1 = 42;
+    // create new element
+    res = t.update_value(k, v1);
+    REQUIRE(res.code() == 0);
+    res = t.get_value(k, v2);
+    REQUIRE(res.code() == 0);
+    REQUIRE(v2 == 42);
+
+    // update existing element
+    v1 = 69;
+    res = t.update_value(k, v1);
+    REQUIRE(res.code() == 0);
+    res = t.get_value(k, v2);
+    REQUIRE(res.code() == 0);
+    REQUIRE(v2 == 69);
+
+    // remove existing element
+    res = t.remove_value(k);
+    REQUIRE(res.code() == 0);
+
+    // remove non existing element
+    res = t.remove_value(k);
+    REQUIRE(res.code() != 0);
+
+    // get non existing element
+    res = t.get_value(k, v2);
+    REQUIRE(res.code() != 0);
+  }
+
+  SECTION("walk table") {
+    for (int i = 1; i <= 10; i++) {
+      res = t.update_value(i * 3, i);
+      REQUIRE(res.code() == 0);
+    }
+    auto offline = t.get_table_offline();
+    REQUIRE(offline.size() == 10);
+    for (const auto &pair : offline) {
+      REQUIRE(pair.first % 3 == 0);
+      REQUIRE(pair.first / 3 == pair.second);
+    }
+
+    // clear table
+    t.clear_table_non_atomic();
+    REQUIRE(t.get_table_offline().size() == 0);
+  }
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0)
+// Same lifecycle coverage as the plain hash table test, but for per-CPU
+// maps: every value is a vector with one slot per possible CPU. Gated on
+// Linux >= 4.6 (per-CPU hash map support).
+TEST_CASE("percpu hash table", "[percpu_hash_table]") {
+  const std::string BPF_PROGRAM = R"(
+    BPF_TABLE("percpu_hash", int, u64, myhash, 128);
+    BPF_TABLE("percpu_array", int, u64, myarray, 64);
+  )";
+
+  ebpf::BPF bpf;
+  ebpf::StatusTuple res(0);
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+
+  ebpf::BPFPercpuHashTable<int, uint64_t> t =
+    bpf.get_percpu_hash_table<int, uint64_t>("myhash");
+  // Per-CPU values are sized by possible (not online) CPUs.
+  size_t ncpus = ebpf::BPFTable::get_possible_cpu_count();
+
+  SECTION("bad table type") {
+    // try to get table of wrong type
+    auto f1 = [&](){
+      bpf.get_percpu_hash_table<int, uint64_t>("myarray");
+    };
+
+    REQUIRE_THROWS(f1());
+  }
+
+  SECTION("standard methods") {
+    int k;
+    std::vector<uint64_t> v1(ncpus);
+    std::vector<uint64_t> v2;
+
+    // Give each CPU slot a distinct value so cross-slot mixups would show.
+    for (size_t j = 0; j < ncpus; j++) {
+      v1[j] = 42 * j;
+    }
+
+    k = 1;
+
+    // create new element
+    res = t.update_value(k, v1);
+    REQUIRE(res.code() == 0);
+    res = t.get_value(k, v2);
+    REQUIRE(res.code() == 0);
+    for (size_t j = 0; j < ncpus; j++) {
+      REQUIRE(v2.at(j) == 42 * j);
+    }
+
+    // update existing element
+    for (size_t j = 0; j < ncpus; j++) {
+      v1[j] = 69 * j;
+    }
+    res = t.update_value(k, v1);
+    REQUIRE(res.code() == 0);
+    res = t.get_value(k, v2);
+    REQUIRE(res.code() == 0);
+    for (size_t j = 0; j < ncpus; j++) {
+      REQUIRE(v2.at(j) == 69 * j);
+    }
+
+    // remove existing element
+    res = t.remove_value(k);
+    REQUIRE(res.code() == 0);
+
+    // remove non existing element
+    res = t.remove_value(k);
+    REQUIRE(res.code() != 0);
+
+    // get non existing element
+    res = t.get_value(k, v2);
+    REQUIRE(res.code() != 0);
+  }
+
+  SECTION("walk table") {
+    std::vector<uint64_t> v(ncpus);
+
+    // Keys 3,6,...,30; per-CPU value is key * cpu index.
+    for (int k = 3; k <= 30; k+=3) {
+      for (size_t cpu = 0; cpu < ncpus; cpu++) {
+        v[cpu] = k * cpu;
+      }
+      res = t.update_value(k, v);
+      REQUIRE(res.code() == 0);
+    }
+
+    // get whole table
+    auto offline = t.get_table_offline();
+    REQUIRE(offline.size() == 10);
+    for (int i = 0; i < 10; i++) {
+      // check the key
+      REQUIRE(offline.at(i).first % 3 == 0);
+
+      // check value
+      for (size_t cpu = 0; cpu < ncpus; cpu++) {
+        REQUIRE(offline.at(i).second.at(cpu) == cpu * offline.at(i).first);
+      }
+    }
+
+    // clear table
+    t.clear_table_non_atomic();
+    REQUIRE(t.get_table_offline().size() == 0);
+  }
+}
+#endif
diff --git a/tests/cc/test_libbcc.cc b/tests/cc/test_libbcc.cc
new file mode 100644
index 0000000..0c7c351
--- /dev/null
+++ b/tests/cc/test_libbcc.cc
@@ -0,0 +1,2 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
diff --git a/tests/cc/test_perf_event.cc b/tests/cc/test_perf_event.cc
new file mode 100644
index 0000000..76d2d17
--- /dev/null
+++ b/tests/cc/test_perf_event.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/version.h>
+#include <unistd.h>
+#include <string>
+
+#include "BPF.h"
+#include "catch.hpp"
+
+TEST_CASE("test read perf event", "[bpf_perf_event]") {
+// The basic bpf_perf_event_read is supported since Kernel 4.3. However in that
+// version it only supported HARDWARE and RAW events. On the other hand, our
+// tests running on Jenkins won't have availiable HARDWARE counters since they
+// are running on VMs. The support of other types of events such as SOFTWARE are
+// only added since Kernel 4.13, hence we can only run the test since that.
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0)
+  const std::string BPF_PROGRAM = R"(
+    BPF_PERF_ARRAY(cnt, NUM_CPUS);
+    BPF_HASH(val, int, u64, 1);
+    BPF_HASH(ret, int, int, 1);
+    BPF_HASH(counter, int, struct bpf_perf_event_value, 1);
+
+    int on_sys_getuid(void *ctx) {
+      int zero = 0;
+
+      u64 v = cnt.perf_read(CUR_CPU_IDENTIFIER);
+      if (((s64)v < 0) && ((s64)v > -256))
+        return 0;
+      val.update(&zero, &v);
+    #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
+      u32 cpu = bpf_get_smp_processor_id();
+      struct bpf_perf_event_value c = {0};
+      int r = cnt.perf_counter_value(cpu, &c, sizeof(c));
+      ret.update(&zero, &r);
+      counter.update(&zero, &c);
+    #endif
+      return 0;
+    }
+  )";
+
+  ebpf::BPF bpf;
+  ebpf::StatusTuple res(0);
+  res = bpf.init(
+      BPF_PROGRAM,
+      {"-DNUM_CPUS=" + std::to_string(sysconf(_SC_NPROCESSORS_ONLN))}, {});
+  REQUIRE(res.code() == 0);
+  res =
+      bpf.open_perf_event("cnt", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK);
+  REQUIRE(res.code() == 0);
+  std::string getuid_fnname = bpf.get_syscall_fnname("getuid");
+  res = bpf.attach_kprobe(getuid_fnname, "on_sys_getuid");
+  REQUIRE(res.code() == 0);
+  REQUIRE(getuid() >= 0);
+  res = bpf.detach_kprobe(getuid_fnname);
+  REQUIRE(res.code() == 0);
+  res = bpf.close_perf_event("cnt");
+  REQUIRE(res.code() == 0);
+
+  auto val = bpf.get_hash_table<int, uint64_t>("val");
+  REQUIRE(val[0] >= 0);
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
+  auto counter_table =
+      bpf.get_hash_table<int, struct bpf_perf_event_value>("counter");
+  auto counter = counter_table[0];
+  auto ret = bpf.get_hash_table<int, int>("ret");
+  REQUIRE(ret[0] == 0);
+  REQUIRE(counter.counter >= 0);
+  REQUIRE(counter.enabled > 0);
+  REQUIRE(counter.running >= 0);
+  REQUIRE(counter.running <= counter.enabled);
+#endif
+}
+
+TEST_CASE("test attach perf event", "[bpf_perf_event]") {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)
+  const std::string BPF_PROGRAM = R"(
+    BPF_HASH(pid, int, u64, 1);
+    BPF_HASH(ret, int, int, 1);
+    BPF_HASH(counter, int, struct bpf_perf_event_value, 1);
+
+    int on_event(void *ctx) {
+      int zero = 0;
+      
+      u64 p = bpf_get_current_pid_tgid();
+      pid.update(&zero, &p);
+    #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
+      struct bpf_perf_event_value c = {0};
+      int r = bpf_perf_prog_read_value(ctx, &c, sizeof(c));
+      ret.update(&zero, &r);
+      counter.update(&zero, &c);
+    #endif
+      return 0;
+    }
+  )";
+
+  ebpf::BPF bpf;
+  ebpf::StatusTuple res(0);
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+  res = bpf.attach_perf_event(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK,
+                              "on_event", 0, 1000);
+  REQUIRE(res.code() == 0);
+  sleep(1);
+  res = bpf.detach_perf_event(PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CPU_CLOCK);
+  REQUIRE(res.code() == 0);
+
+  auto pid = bpf.get_hash_table<int, uint64_t>("pid");
+  REQUIRE(pid[0] >= 0);
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
+  auto counter_table =
+      bpf.get_hash_table<int, struct bpf_perf_event_value>("counter");
+  auto counter = counter_table[0];
+  auto ret = bpf.get_hash_table<int, int>("ret");
+  REQUIRE(ret[0] == 0);
+  REQUIRE(counter.counter >= 0);
+  // the program slept one second between perf_event attachment and detachment
+  // in the above, so the enabled counter should be 1000000000ns or
+  // more. But in reality, most of counters (if not all) are 9xxxxxxxx,
+  // and I also saw one 8xxxxxxxx. So let us a little bit conservative here.
+  REQUIRE(counter.enabled >= 800000000);
+  REQUIRE(counter.running >= 0);
+  REQUIRE(counter.running <= counter.enabled);
+#endif
+}
diff --git a/tests/cc/test_prog_table.cc b/tests/cc/test_prog_table.cc
new file mode 100644
index 0000000..138db3e
--- /dev/null
+++ b/tests/cc/test_prog_table.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 Politecnico di Torino
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BPF.h"
+
+#include "catch.hpp"
+
+TEST_CASE("test prog table", "[prog_table]") {
+  const std::string BPF_PROGRAM = R"(
+    BPF_TABLE("prog", int, int, myprog, 16);
+  )";
+
+  const std::string BPF_PROGRAM2 = R"(
+    int hello(struct __sk_buff *skb) {
+      return 1;
+    }
+  )";
+
+  ebpf::StatusTuple res(0);
+
+  ebpf::BPF bpf;
+  res = bpf.init(BPF_PROGRAM);
+  REQUIRE(res.code() == 0);
+
+  ebpf::BPFProgTable t = bpf.get_prog_table("myprog");
+
+  ebpf::BPF bpf2;
+  res = bpf2.init(BPF_PROGRAM2);
+  REQUIRE(res.code() == 0);
+
+  int fd;
+  res = bpf2.load_func("hello", BPF_PROG_TYPE_SCHED_CLS, fd);
+  REQUIRE(res.code() == 0);
+
+  SECTION("update and remove") {
+    // update element
+    res = t.update_value(0, fd);
+    REQUIRE(res.code() == 0);
+
+    // remove element
+    res = t.remove_value(0);
+    REQUIRE(res.code() == 0);
+
+    // update out of range element
+    res = t.update_value(17, fd);
+    REQUIRE(res.code() != 0);
+
+    // remove out of range element
+    res = t.remove_value(17);
+    REQUIRE(res.code() != 0);
+  }
+}
diff --git a/tests/cc/test_shared_table.cc b/tests/cc/test_shared_table.cc
new file mode 100644
index 0000000..a638cb5
--- /dev/null
+++ b/tests/cc/test_shared_table.cc
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018 Politecnico di Torino
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BPF.h"
+#include "catch.hpp"
+
+// Program 1 declares the table as shared within its namespace; program 2
+// imports it with "extern".
+const std::string BPF_PROGRAM1 = R"(
+BPF_TABLE_SHARED("array", int, int, mysharedtable, 1024);
+)";
+
+const std::string BPF_PROGRAM2 = R"(
+BPF_TABLE("extern", int, int, mysharedtable, 1024);
+)";
+
+// Verifies that BPF_TABLE_SHARED tables are shared between BPF objects in
+// the same namespace but isolated between different namespaces.
+TEST_CASE("test shared table", "[shared_table]") {
+  // deploy 4 ebpf programs: _ns1_a and _ns1_b are in ns1, _ns2_a and _ns2_b in ns2
+  ebpf::BPF bpf_ns1_a(0, nullptr, false, "ns1");
+  ebpf::BPF bpf_ns1_b(0, nullptr, false, "ns1");
+  ebpf::BPF bpf_ns2_a(0, nullptr, false, "ns2");
+  ebpf::BPF bpf_ns2_b(0, nullptr, false, "ns2");
+
+  ebpf::StatusTuple res(0);
+
+  res = bpf_ns1_a.init(BPF_PROGRAM1);
+  REQUIRE(res.code() == 0);
+
+  res = bpf_ns1_b.init(BPF_PROGRAM2);
+  REQUIRE(res.code() == 0);
+
+  res = bpf_ns2_a.init(BPF_PROGRAM1);
+  REQUIRE(res.code() == 0);
+
+  res = bpf_ns2_b.init(BPF_PROGRAM2);
+  REQUIRE(res.code() == 0);
+
+  // get references to all tables
+  ebpf::BPFArrayTable<int> t_ns1_a = bpf_ns1_a.get_array_table<int>("mysharedtable");
+  ebpf::BPFArrayTable<int> t_ns1_b = bpf_ns1_b.get_array_table<int>("mysharedtable");
+  ebpf::BPFArrayTable<int> t_ns2_a = bpf_ns2_a.get_array_table<int>("mysharedtable");
+  ebpf::BPFArrayTable<int> t_ns2_b = bpf_ns2_b.get_array_table<int>("mysharedtable");
+
+  // test that tables within the same ns are shared: a write through _ns1_a
+  // is visible through _ns1_b
+  int v1, v2, v3;
+  res = t_ns1_a.update_value(13, 42);
+  REQUIRE(res.code() == 0);
+
+  res = t_ns1_b.get_value(13, v1);
+  REQUIRE(res.code() == 0);
+  REQUIRE(v1 == 42);
+
+  // test that tables are isolated within different ns
+  res = t_ns2_a.update_value(13, 69);
+  REQUIRE(res.code() == 0);
+
+  res = t_ns2_b.get_value(13, v2);
+  REQUIRE(res.code() == 0);
+  REQUIRE(v2 == 69);
+
+  // the ns2 write must not have leaked into ns1
+  res = t_ns1_b.get_value(13, v3);
+  REQUIRE(res.code() == 0);
+  REQUIRE(v3 == 42);  // value should still be 42
+}
diff --git a/tests/cc/test_static.c b/tests/cc/test_static.c
new file mode 100644
index 0000000..4af8b93
--- /dev/null
+++ b/tests/cc/test_static.c
@@ -0,0 +1,6 @@
+#include "bpf_common.h"
+
+// Smoke test for static linking against libbcc's C API: compiling a trivial
+// BPF program must yield a non-NULL module. Exits 0 on success, 1 otherwise.
+int main(int argc, char **argv) {
+  void *mod = bpf_module_create_c_from_string("BPF_TABLE(\"array\", int, int, stats, 10);\n", 4, NULL, 0);
+  if (mod == NULL)
+    return 1;
+  return 0;
+}
diff --git a/tests/cc/test_usdt_args.cc b/tests/cc/test_usdt_args.cc
new file mode 100644
index 0000000..3a96c5a
--- /dev/null
+++ b/tests/cc/test_usdt_args.cc
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <iostream>
+#include <string>
+
+#include "catch.hpp"
+#include "usdt.h"
+
+using std::experimental::optional;
+using std::experimental::nullopt;
+
+// Parses the next argument from `parser` and asserts it is a literal
+// constant with the given signed size and value.
+static void verify_register(USDT::ArgumentParser &parser, int arg_size,
+                            int constant) {
+  USDT::Argument arg;
+  REQUIRE(parser.parse(&arg));
+  REQUIRE(arg.arg_size() == arg_size);
+
+  REQUIRE(arg.constant());
+  REQUIRE(arg.constant() == constant);
+}
+
+// Parses the next argument from `parser` and asserts its register form:
+// base register name plus optional dereference offset, dereferenced symbol
+// name, index register and scale. nullopt means "component not present",
+// and the optionals must match exactly (absent expectation == absent field).
+static void verify_register(USDT::ArgumentParser &parser, int arg_size,
+                            const std::string &regname,
+                            optional<int> deref_offset = nullopt,
+                            optional<std::string> deref_ident = nullopt,
+                            optional<std::string> index_regname = nullopt,
+                            optional<int> scale = nullopt) {
+  USDT::Argument arg;
+  REQUIRE(parser.parse(&arg));
+  REQUIRE(arg.arg_size() == arg_size);
+
+  REQUIRE(arg.base_register_name());
+  REQUIRE(arg.base_register_name() == regname);
+
+  REQUIRE(arg.deref_offset() == deref_offset);
+  REQUIRE(arg.deref_ident() == deref_ident);
+
+  REQUIRE(arg.index_register_name() == index_regname);
+  REQUIRE(arg.scale() == scale);
+}
+
+TEST_CASE("test usdt argument parsing", "[usdt]") {
+  SECTION("parse failure") {
+#ifdef __aarch64__
+    USDT::ArgumentParser_aarch64 parser("4@[x32,200]");
+#elif __powerpc64__
+    USDT::ArgumentParser_powerpc64 parser("4@-12(42)");
+#elif defined(__x86_64__)
+    USDT::ArgumentParser_x64 parser("4@i%ra+1r");
+#endif
+    USDT::Argument arg;
+    REQUIRE(!parser.parse(&arg));
+    int i;
+    for (i = 0; i < 10 && !parser.done(); ++i) {
+      parser.parse(&arg);
+    }
+    // Make sure we reach termination
+    REQUIRE(i < 10);
+  }
+  SECTION("argument examples from the Python implementation") {
+#ifdef __aarch64__
+    USDT::ArgumentParser_aarch64 parser("-1@x0 4@5 8@[x12] -4@[x31,-40]");
+    verify_register(parser, -1, "regs[0]");
+    verify_register(parser, 4, 5);
+    verify_register(parser, 8, "regs[12]", 0);
+    verify_register(parser, -4, "regs[31]", -40);
+#elif __powerpc64__
+    USDT::ArgumentParser_powerpc64 parser(
+        "-4@0 8@%r0 8@i0 4@0(%r0) -2@0(0) "
+        "1@0 -2@%r3 -8@i9 -1@0(%r4) -4@16(6) "
+        "2@7 4@%r11 4@i-67 8@-16(%r17) 1@-52(11) "
+        "-8@13 -8@%r25 2@i-11 -2@14(%r26) -8@-32(24) "
+        "4@29 2@%r17 -8@i-693 -1@-23(%r31) 4@28(30) "
+        "-2@31 -4@%r30 2@i1097 4@108(%r30) -2@-4(31)");
+
+    verify_register(parser, -4, "gpr[0]");
+    verify_register(parser, 8, "gpr[0]");
+    verify_register(parser, 8, 0);
+    verify_register(parser, 4, "gpr[0]", 0);
+    verify_register(parser, -2, "gpr[0]", 0);
+
+    verify_register(parser, 1, "gpr[0]");
+    verify_register(parser, -2, "gpr[3]");
+    verify_register(parser, -8, 9);
+    verify_register(parser, -1, "gpr[4]", 0);
+    verify_register(parser, -4, "gpr[6]", 16);
+
+    verify_register(parser, 2, "gpr[7]");
+    verify_register(parser, 4, "gpr[11]");
+    verify_register(parser, 4, -67);
+    verify_register(parser, 8, "gpr[17]", -16);
+    verify_register(parser, 1, "gpr[11]", -52);
+
+    verify_register(parser, -8, "gpr[13]");
+    verify_register(parser, -8, "gpr[25]");
+    verify_register(parser, 2, -11);
+    verify_register(parser, -2, "gpr[26]", 14);
+    verify_register(parser, -8, "gpr[24]", -32);
+
+    verify_register(parser, 4, "gpr[29]");
+    verify_register(parser, 2, "gpr[17]");
+    verify_register(parser, -8, -693);
+    verify_register(parser, -1, "gpr[31]", -23);
+    verify_register(parser, 4, "gpr[30]", 28);
+
+    verify_register(parser, -2, "gpr[31]");
+    verify_register(parser, -4, "gpr[30]");
+    verify_register(parser, 2, 1097);
+    verify_register(parser, 4, "gpr[30]", 108);
+    verify_register(parser, -2, "gpr[31]", -4);
+#elif defined(__x86_64__)
+    USDT::ArgumentParser_x64 parser(
+        "-4@$0 8@$1234 %rdi %rax %rsi "
+        "-8@%rbx 4@%r12 8@-8(%rbp) 4@(%rax) "
+        "-4@global_max_action(%rip) "
+        "8@24+mp_(%rip) "
+        "-4@CheckpointStats+40(%rip) "
+        "4@glob-2(%rip) "
+        "8@(%rax,%rdx,8) "
+        "4@(%rbx,%rcx)");
+
+    verify_register(parser, -4, 0);
+    verify_register(parser, 8, 1234);
+
+    verify_register(parser, 8, "di");
+    verify_register(parser, 8, "ax");
+    verify_register(parser, 8, "si");
+    verify_register(parser, -8, "bx");
+    verify_register(parser, 4, "r12");
+
+    verify_register(parser, 8, "bp", -8);
+    verify_register(parser, 4, "ax", 0);
+
+    verify_register(parser, -4, "ip", 0, std::string("global_max_action"));
+    verify_register(parser, 8, "ip", 24, std::string("mp_"));
+    verify_register(parser, -4, "ip", 40, std::string("CheckpointStats"));
+    verify_register(parser, 4, "ip", -2, std::string("glob"));
+
+    verify_register(parser, 8, "ax", 0, nullopt, std::string("dx"), 8);
+    verify_register(parser, 4, "bx", 0, nullopt, std::string("cx"));
+#endif
+
+    REQUIRE(parser.done());
+  }
+}
diff --git a/tests/cc/test_usdt_probes.cc b/tests/cc/test_usdt_probes.cc
new file mode 100644
index 0000000..3e42633
--- /dev/null
+++ b/tests/cc/test_usdt_probes.cc
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2016 GitHub, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "catch.hpp"
+#include "usdt.h"
+#include "api/BPF.h"
+
+#ifdef HAVE_SDT_HEADER
+/* required to insert USDT probes on this very executable --
+ * we're gonna be testing them live! */
+#include <sys/sdt.h>
+
+static int a_probed_function() {
+  int an_int = 23 + getpid();
+  void *a_pointer = malloc(4);
+  DTRACE_PROBE2(libbcc_test, sample_probe_1, an_int, a_pointer);
+  free(a_pointer);
+  return an_int;
+}
+
+TEST_CASE("test finding a probe in our own process", "[usdt]") {
+  USDT::Context ctx(getpid());
+  REQUIRE(ctx.num_probes() >= 1);
+
+  SECTION("our test probe") {
+    auto probe = ctx.get("sample_probe_1");
+    REQUIRE(probe);
+
+    REQUIRE(probe->in_shared_object(probe->bin_path()) == false);
+    REQUIRE(probe->name() == "sample_probe_1");
+    REQUIRE(probe->provider() == "libbcc_test");
+    REQUIRE(probe->bin_path().find("/test_libbcc") != std::string::npos);
+
+    REQUIRE(probe->num_locations() == 1);
+    REQUIRE(probe->num_arguments() == 2);
+    REQUIRE(probe->need_enable() == false);
+
+    REQUIRE(a_probed_function() != 0);
+  }
+}
+
+TEST_CASE("test find a probe in our own binary with C++ API", "[usdt]") {
+    ebpf::BPF bpf;
+    ebpf::USDT u("/proc/self/exe", "libbcc_test", "sample_probe_1", "on_event");
+
+    auto res = bpf.init("int on_event() { return 0; }", {}, {u});
+    REQUIRE(res.code() == 0);
+
+    res = bpf.attach_usdt(u);
+    REQUIRE(res.code() == 0);
+
+    res = bpf.detach_usdt(u);
+    REQUIRE(res.code() == 0);
+}
+
+TEST_CASE("test find a probe in our Process with C++ API", "[usdt]") {
+    ebpf::BPF bpf;
+    ebpf::USDT u(::getpid(), "libbcc_test", "sample_probe_1", "on_event");
+
+    auto res = bpf.init("int on_event() { return 0; }", {}, {u});
+    REQUIRE(res.code() == 0);
+
+    res = bpf.attach_usdt(u);
+    REQUIRE(res.code() == 0);
+
+    res = bpf.detach_usdt(u);
+    REQUIRE(res.code() == 0);
+}
+#endif  // HAVE_SDT_HEADER
+
+class ChildProcess {
+  pid_t pid_;
+
+public:
+  ChildProcess(const char *name, char *const argv[]) {
+    pid_ = fork();
+    if (pid_ == 0) {
+      execvp(name, argv);
+      exit(0);
+    }
+    if (spawned()) {
+      usleep(250000);
+      if (kill(pid_, 0) < 0)
+        pid_ = -1;
+    }
+  }
+
+  ~ChildProcess() {
+    if (spawned()) {
+      int status;
+      kill(pid_, SIGKILL);
+      if (waitpid(pid_, &status, 0) != pid_)
+        abort();
+    }
+  }
+
+  bool spawned() const { return pid_ > 0; }
+  pid_t pid() const { return pid_; }
+};
+
+extern int cmd_scanf(const char *cmd, const char *fmt, ...);
+
+static int probe_num_locations(const char *bin_path, const char *func_name) {
+  int num_locations;
+  char cmd[512];
+  const char *cmdfmt = "readelf -n %s | grep -c \"Name: %s$\"";
+
+  sprintf(cmd, cmdfmt, bin_path, func_name);
+  if (cmd_scanf(cmd, "%d", &num_locations) != 0) {
+    return -1;
+  }
+
+  return num_locations;
+}
+
+static int probe_num_arguments(const char *bin_path, const char *func_name) {
+  int num_arguments;
+  char cmd[512];
+  const char *cmdfmt = "readelf -n %s | grep -m 1 -A 2 \" %s$\" | " \
+                       "tail -1 | cut -d \" \" -f 6- | wc -w";
+
+  sprintf(cmd, cmdfmt, bin_path, func_name);
+  if (cmd_scanf(cmd, "%d", &num_arguments) != 0) {
+    return -1;
+  }
+
+  return num_arguments;
+}
+
+TEST_CASE("test listing all USDT probes in Ruby/MRI", "[usdt]") {
+  size_t mri_probe_count = 0;
+
+  SECTION("without a running Ruby process") {
+    USDT::Context ctx("ruby");
+
+    if (!ctx.loaded())
+      return;
+
+    REQUIRE(ctx.num_probes() > 10);
+    mri_probe_count = ctx.num_probes();
+
+    SECTION("GC static probe") {
+      auto name = "gc__mark__begin";
+      auto probe = ctx.get(name);
+      REQUIRE(probe);
+
+      REQUIRE(probe->in_shared_object(probe->bin_path()) == true);
+      REQUIRE(probe->name() == name);
+      REQUIRE(probe->provider() == "ruby");
+
+      auto bin_path = probe->bin_path();
+      bool bin_path_match =
+            (bin_path.find("/ruby") != std::string::npos) ||
+            (bin_path.find("/libruby") != std::string::npos);
+      REQUIRE(bin_path_match);
+
+      int exp_locations, exp_arguments;
+      exp_locations = probe_num_locations(bin_path.c_str(), name);
+      exp_arguments = probe_num_arguments(bin_path.c_str(), name);
+      REQUIRE(probe->num_locations() == exp_locations);
+      REQUIRE(probe->num_arguments() == exp_arguments);
+      REQUIRE(probe->need_enable() == true);
+    }
+
+    SECTION("object creation probe") {
+      auto name = "object__create";
+      auto probe = ctx.get(name);
+      REQUIRE(probe);
+
+      REQUIRE(probe->in_shared_object(probe->bin_path()) == true);
+      REQUIRE(probe->name() == name);
+      REQUIRE(probe->provider() == "ruby");
+
+      auto bin_path = probe->bin_path();
+      bool bin_path_match =
+            (bin_path.find("/ruby") != std::string::npos) ||
+            (bin_path.find("/libruby") != std::string::npos);
+      REQUIRE(bin_path_match);
+
+      int exp_locations, exp_arguments;
+      exp_locations = probe_num_locations(bin_path.c_str(), name);
+      exp_arguments = probe_num_arguments(bin_path.c_str(), name);
+      REQUIRE(probe->num_locations() == exp_locations);
+      REQUIRE(probe->num_arguments() == exp_arguments);
+      REQUIRE(probe->need_enable() == true);
+    }
+
+    SECTION("array creation probe") {
+      auto name = "array__create";
+      auto probe = ctx.get(name);
+      REQUIRE(probe);
+      REQUIRE(probe->name() == name);
+
+      auto bin_path = probe->bin_path().c_str();
+      int exp_locations, exp_arguments;
+      exp_locations = probe_num_locations(bin_path, name);
+      exp_arguments = probe_num_arguments(bin_path, name);
+      REQUIRE(probe->num_locations() == exp_locations);
+      REQUIRE(probe->num_arguments() == exp_arguments);
+      REQUIRE(probe->need_enable() == true);
+    }
+  }
+
+  SECTION("with a running Ruby process") {
+    static char _ruby[] = "ruby";
+    char *const argv[2] = {_ruby, NULL};
+
+    ChildProcess ruby(argv[0], argv);
+    if (!ruby.spawned())
+      return;
+
+    USDT::Context ctx(ruby.pid());
+    REQUIRE(ctx.num_probes() >= mri_probe_count);
+
+    SECTION("get probe in running process") {
+      auto name = "gc__mark__begin";
+      auto probe = ctx.get(name);
+      REQUIRE(probe);
+
+      REQUIRE(probe->in_shared_object(probe->bin_path()) == true);
+      REQUIRE(probe->name() == name);
+      REQUIRE(probe->provider() == "ruby");
+
+      auto bin_path = probe->bin_path();
+      bool bin_path_match =
+            (bin_path.find("/ruby") != std::string::npos) ||
+            (bin_path.find("/libruby") != std::string::npos);
+      REQUIRE(bin_path_match);
+
+      int exp_locations, exp_arguments;
+      exp_locations = probe_num_locations(bin_path.c_str(), name);
+      exp_arguments = probe_num_arguments(bin_path.c_str(), name);
+      REQUIRE(probe->num_locations() == exp_locations);
+      REQUIRE(probe->num_arguments() == exp_arguments);
+      REQUIRE(probe->need_enable() == true);
+    }
+  }
+}
diff --git a/tests/cc/utils.cc b/tests/cc/utils.cc
new file mode 100644
index 0000000..ba8a3d4
--- /dev/null
+++ b/tests/cc/utils.cc
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2017 IBM Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdarg.h>
+#include <stdio.h>
+
+int cmd_scanf(const char *cmd, const char *fmt, ...) {
+  va_list args;
+  FILE *pipe;
+
+  va_start(args, fmt);
+  pipe = popen(cmd, "r");
+  if (pipe == NULL) {
+    va_end(args);
+    return -1;
+  }
+
+  vfscanf(pipe, fmt, args);
+  va_end(args);
+  pclose(pipe);
+  return 0;
+}
diff --git a/tests/lua/.busted b/tests/lua/.busted
new file mode 100644
index 0000000..5a83208
--- /dev/null
+++ b/tests/lua/.busted
@@ -0,0 +1,8 @@
+-- Configuration for unit tests
+-- See: http://olivinelabs.com/busted/ 
+return {
+	default = {
+		lpath = "./?.lua",
+		["auto-insulate"] = false,
+	}
+}
\ No newline at end of file
diff --git a/tests/lua/.luacheckrc b/tests/lua/.luacheckrc
new file mode 100644
index 0000000..407cdbe
--- /dev/null
+++ b/tests/lua/.luacheckrc
@@ -0,0 +1,12 @@
+std = "luajit"
+ignore = { "211", "212", "411", "412", "421", "431", "542" }
+files["examples"] = {
+	new_globals = { "pkt", "time", "xadd", "c" }
+}
+files["bpf/builtins.lua"] = {
+	ignore = { "122" }
+}
+files["spec"] = {
+	std = "+busted",
+	new_globals = { "pkt", "time", "xadd", "c" }
+}
\ No newline at end of file
diff --git a/tests/lua/CMakeLists.txt b/tests/lua/CMakeLists.txt
new file mode 100644
index 0000000..d3d7298
--- /dev/null
+++ b/tests/lua/CMakeLists.txt
@@ -0,0 +1,21 @@
+find_program(LUAJIT luajit)
+find_program(BUSTED busted)
+
+if(LUAJIT)
+	add_test(NAME lua_test_clang WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+		COMMAND ${TEST_WRAPPER} lua_test_clang sudo ${LUAJIT} test_clang.lua)
+
+	add_test(NAME lua_test_uprobes WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+		COMMAND ${TEST_WRAPPER} lua_test_uprobes sudo ${LUAJIT} test_uprobes.lua)
+
+	add_test(NAME lua_test_dump WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+		COMMAND ${TEST_WRAPPER} lua_test_dump sudo ${LUAJIT} test_dump.lua)
+
+	add_test(NAME lua_test_standalone WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+		COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/test_standalone.sh)
+
+	if(BUSTED)
+		add_test(NAME lua_test_busted WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+			COMMAND busted --lua=${LUAJIT} -m "${CMAKE_CURRENT_SOURCE_DIR}/../../src/lua/?.lua" -m "${CMAKE_CURRENT_SOURCE_DIR}/../../src/lua/?/init.lua;")
+	endif()
+endif()
diff --git a/tests/lua/luaunit.lua b/tests/lua/luaunit.lua
new file mode 100644
index 0000000..03316d3
--- /dev/null
+++ b/tests/lua/luaunit.lua
@@ -0,0 +1,2150 @@
+--[[
+        luaunit.lua
+
+Description: A unit testing framework
+Homepage: https://github.com/bluebird75/luaunit
+Development by Philippe Fremy <phil@freehackers.org>
+Based on initial work of Ryu, Gwang (http://www.gpgstudy.com/gpgiki/LuaUnit)
+License: BSD License, see LICENSE.txt
+Version: 3.2
+]]--
+
+require("math")
+local M={}
+
+-- private exported functions (for testing)
+M.private = {}
+
+M.VERSION='3.2'
+
+--[[ Some people like assertEquals( actual, expected ) and some people prefer
+assertEquals( expected, actual ).
+]]--
+M.ORDER_ACTUAL_EXPECTED = true
+M.PRINT_TABLE_REF_IN_ERROR_MSG = false
+M.TABLE_EQUALS_KEYBYCONTENT = true
+M.LINE_LENGTH=80
+
+-- set this to false to debug luaunit
+local STRIP_LUAUNIT_FROM_STACKTRACE=true
+
+M.VERBOSITY_DEFAULT = 10
+M.VERBOSITY_LOW     = 1
+M.VERBOSITY_QUIET   = 0
+M.VERBOSITY_VERBOSE = 20
+
+-- set EXPORT_ASSERT_TO_GLOBALS to have all asserts visible as global values
+-- EXPORT_ASSERT_TO_GLOBALS = true
+
+-- we need to keep a copy of the script args before it is overridden
+local cmdline_argv = rawget(_G, "arg")
+
+M.FAILURE_PREFIX = 'LuaUnit test FAILURE: ' -- prefix string for failed tests
+
+M.USAGE=[[Usage: lua <your_test_suite.lua> [options] [testname1 [testname2] ... ]
+Options:
+  -h, --help:             Print this help
+  --version:              Print version information
+  -v, --verbose:          Increase verbosity
+  -q, --quiet:            Set verbosity to minimum
+  -e, --error:            Stop on first error
+  -f, --failure:          Stop on first failure or error
+  -o, --output OUTPUT:    Set output type to OUTPUT
+                          Possible values: text, tap, junit, nil
+  -n, --name NAME:        For junit only, mandatory name of xml file
+  -p, --pattern PATTERN:  Execute all test names matching the Lua PATTERN
+                          May be repeated to include several patterns
+                          Make sure you escape magic chars like +? with %
+  testname1, testname2, ... : tests to run in the form of testFunction,
+                              TestClass or TestClass.testMethod
+]]
+
+----------------------------------------------------------------
+--
+--                 general utility functions
+--
+----------------------------------------------------------------
+
+local crossTypeOrdering = {
+    number = 1,
+    boolean = 2,
+    string = 3,
+    table = 4,
+    other = 5
+}
+local crossTypeComparison = {
+    number = function(a, b) return a < b end,
+    string = function(a, b) return a < b end,
+    other = function(a, b) return tostring(a) < tostring(b) end,
+}
+
+local function crossTypeSort(a, b)
+    local type_a, type_b = type(a), type(b)
+    if type_a == type_b then
+        local func = crossTypeComparison[type_a] or crossTypeComparison.other
+        return func(a, b)
+    end
+    type_a = crossTypeOrdering[type_a] or crossTypeOrdering.other
+    type_b = crossTypeOrdering[type_b] or crossTypeOrdering.other
+    return type_a < type_b
+end
+
+local function __genSortedIndex( t )
+    -- Returns a sequence consisting of t's keys, sorted.
+    local sortedIndex = {}
+
+    for key,_ in pairs(t) do
+        table.insert(sortedIndex, key)
+    end
+
+    table.sort(sortedIndex, crossTypeSort)
+    return sortedIndex
+end
+M.private.__genSortedIndex = __genSortedIndex
+
+local function sortedNext(state, control)
+    -- Equivalent of the next() function of table iteration, but returns the
+    -- keys in sorted order (see __genSortedIndex and crossTypeSort).
+    -- The state is a temporary variable during iteration and contains the
+    -- sorted key table (state.sortedIdx). It also stores the last index (into
+    -- the keys) used by the iteration, to find the next one quickly.
+    local key
+
+    --print("sortedNext: control = "..tostring(control) )
+    if control == nil then
+        -- start of iteration
+        state.lastIdx = 1
+        key = state.sortedIdx[1]
+        return key, state.t[key]
+    end
+
+    -- normally, we expect the control variable to match the last key used
+    if control ~= state.sortedIdx[state.lastIdx] then
+        -- strange, we have to find the next value by ourselves
+        -- the key table is sorted in crossTypeSort() order! -> use bisection
+        local count = #state.sortedIdx
+        local lower, upper = 1, count
+        repeat
+            state.lastIdx = math.modf((lower + upper) / 2)
+            key = state.sortedIdx[state.lastIdx]
+            if key == control then break; end -- key found (and thus prev index)
+            if crossTypeSort(key, control) then
+                -- key < control, continue search "right" (towards upper bound)
+                lower = state.lastIdx + 1
+            else
+                -- key > control, continue search "left" (towards lower bound)
+                upper = state.lastIdx - 1
+            end
+        until lower > upper
+        if lower > upper then -- only true if the key wasn't found, ...
+            state.lastIdx = count -- ... so ensure no match for the code below
+        end
+    end
+
+    -- proceed by retrieving the next value (or nil) from the sorted keys
+    state.lastIdx = state.lastIdx + 1
+    key = state.sortedIdx[state.lastIdx]
+    if key then
+        return key, state.t[key]
+    end
+
+    -- getting here means returning `nil`, which will end the iteration
+end
+
+local function sortedPairs(tbl)
+    -- Equivalent of the pairs() function on tables. Allows to iterate in
+    -- sorted order. As required by "generic for" loops, this will return the
+    -- iterator (function), an "invariant state", and the initial control value.
+    -- (see http://www.lua.org/pil/7.2.html)
+    return sortedNext, {t = tbl, sortedIdx = __genSortedIndex(tbl)}, nil
+end
+M.private.sortedPairs = sortedPairs
+
+local function strsplit(delimiter, text)
+-- Split text into a list consisting of the strings in text,
+-- separated by strings matching delimiter (which may be a pattern).
+-- example: strsplit(",%s*", "Anna, Bob, Charlie,Dolores")
+    if string.find("", delimiter, 1, true) then -- this would result in endless loops
+        error("delimiter matches empty string!")
+    end
+    local list, pos, first, last = {}, 1
+    while true do
+        first, last = text:find(delimiter, pos, true)
+        if first then -- found?
+            table.insert(list, text:sub(pos, first - 1))
+            pos = last + 1
+        else
+            table.insert(list, text:sub(pos))
+            break
+        end
+    end
+    return list
+end
+M.private.strsplit = strsplit
+
+local function hasNewLine( s )
+    -- return true if s has a newline
+    return (string.find(s, '\n', 1, true) ~= nil)
+end
+M.private.hasNewLine = hasNewLine
+
+local function prefixString( prefix, s )
+    -- Prefix all the lines of s with prefix
+    return prefix .. table.concat(strsplit('\n', s), '\n' .. prefix)
+end
+M.private.prefixString = prefixString
+
+local function strMatch(s, pattern, start, final )
+    -- return true if s matches completely the pattern from index start to index end
+    -- return false in every other cases
+    -- if start is nil, matches from the beginning of the string
+    -- if final is nil, matches to the end of the string
+    start = start or 1
+    final = final or string.len(s)
+
+    local foundStart, foundEnd = string.find(s, pattern, start, false)
+    return foundStart == start and foundEnd == final
+end
+M.private.strMatch = strMatch
+
+local function xmlEscape( s )
+    -- Return s escaped for XML attributes
+    -- escapes table:
+    -- "   &quot;
+    -- '   &apos;
+    -- <   &lt;
+    -- >   &gt;
+    -- &   &amp;
+
+    return string.gsub( s, '.', {
+        ['&'] = "&amp;",
+        ['"'] = "&quot;",
+        ["'"] = "&apos;",
+        ['<'] = "&lt;",
+        ['>'] = "&gt;",
+    } )
+end
+M.private.xmlEscape = xmlEscape
+
+local function xmlCDataEscape( s )
+    -- Return s escaped for CData section, escapes: "]]>"
+    return string.gsub( s, ']]>', ']]&gt;' )
+end
+M.private.xmlCDataEscape = xmlCDataEscape
+
+local function stripLuaunitTrace( stackTrace )
+    --[[
+    -- Example of  a traceback:
+    <<stack traceback:
+        example_with_luaunit.lua:130: in function 'test2_withFailure'
+        ./luaunit.lua:1449: in function <./luaunit.lua:1449>
+        [C]: in function 'xpcall'
+        ./luaunit.lua:1449: in function 'protectedCall'
+        ./luaunit.lua:1508: in function 'execOneFunction'
+        ./luaunit.lua:1596: in function 'runSuiteByInstances'
+        ./luaunit.lua:1660: in function 'runSuiteByNames'
+        ./luaunit.lua:1736: in function 'runSuite'
+        example_with_luaunit.lua:140: in main chunk
+        [C]: in ?>>
+
+        Other example:
+    <<stack traceback:
+        ./luaunit.lua:545: in function 'assertEquals'
+        example_with_luaunit.lua:58: in function 'TestToto.test7'
+        ./luaunit.lua:1517: in function <./luaunit.lua:1517>
+        [C]: in function 'xpcall'
+        ./luaunit.lua:1517: in function 'protectedCall'
+        ./luaunit.lua:1578: in function 'execOneFunction'
+        ./luaunit.lua:1677: in function 'runSuiteByInstances'
+        ./luaunit.lua:1730: in function 'runSuiteByNames'
+        ./luaunit.lua:1806: in function 'runSuite'
+        example_with_luaunit.lua:140: in main chunk
+        [C]: in ?>>
+
+    <<stack traceback:
+        luaunit2/example_with_luaunit.lua:124: in function 'test1_withFailure'
+        luaunit2/luaunit.lua:1532: in function <luaunit2/luaunit.lua:1532>
+        [C]: in function 'xpcall'
+        luaunit2/luaunit.lua:1532: in function 'protectedCall'
+        luaunit2/luaunit.lua:1591: in function 'execOneFunction'
+        luaunit2/luaunit.lua:1679: in function 'runSuiteByInstances'
+        luaunit2/luaunit.lua:1743: in function 'runSuiteByNames'
+        luaunit2/luaunit.lua:1819: in function 'runSuite'
+        luaunit2/example_with_luaunit.lua:140: in main chunk
+        [C]: in ?>>
+
+
+    -- first line is "stack traceback": KEEP
+    -- next line may be luaunit line: REMOVE
+    -- next lines are call in the program under testOk: REMOVE
+    -- next lines are calls from luaunit to call the program under test: KEEP
+
+    -- Strategy:
+    -- keep first line
+    -- remove lines that are part of luaunit
+    -- keep lines until we hit a luaunit line
+    ]]
+
+    local function isLuaunitInternalLine( s )
+        -- return true if line of stack trace comes from inside luaunit
+        return s:find('[/\\]luaunit%.lua:%d+: ') ~= nil
+    end
+
+    -- print( '<<'..stackTrace..'>>' )
+
+    local t = strsplit( '\n', stackTrace )
+    -- print( prettystr(t) )
+
+    local idx = 2
+
+    -- remove lines that are still part of luaunit
+    while t[idx] and isLuaunitInternalLine( t[idx] ) do
+        -- print('Removing : '..t[idx] )
+        table.remove(t, idx)
+    end
+
+    -- keep lines until we hit luaunit again
+    while t[idx] and (not isLuaunitInternalLine(t[idx])) do
+        -- print('Keeping : '..t[idx] )
+        idx = idx + 1
+    end
+
+    -- remove remaining luaunit lines
+    while t[idx] do
+        -- print('Removing : '..t[idx] )
+        table.remove(t, idx)
+    end
+
+    -- print( prettystr(t) )
+    return table.concat( t, '\n')
+
+end
+M.private.stripLuaunitTrace = stripLuaunitTrace
+
+
+local function prettystr_sub(v, indentLevel, keeponeline, printTableRefs, recursionTable )
+    local type_v = type(v)
+    if "string" == type_v  then
+        if keeponeline then v = v:gsub("\n", "\\n") end
+
+        -- use clever delimiters according to content:
+        -- enclose with single quotes if string contains ", but no '
+        if v:find('"', 1, true) and not v:find("'", 1, true) then
+            return "'" .. v .. "'"
+        end
+        -- use double quotes otherwise, escape embedded "
+        return '"' .. v:gsub('"', '\\"') .. '"'
+
+    elseif "table" == type_v then
+        --if v.__class__ then
+        --    return string.gsub( tostring(v), 'table', v.__class__ )
+        --end
+        return M.private._table_tostring(v, indentLevel, printTableRefs, recursionTable)
+    end
+
+    return tostring(v)
+end
+
+local function prettystr( v, keeponeline )
+    --[[ Better string conversion, to display nice variable content:
+    For strings, if keeponeline is set to true, string is displayed on one line, with visible \n
+    * string are enclosed with " by default, or with ' if string contains a "
+    * if table is a class, display class name
+    * tables are expanded
+    ]]--
+    local recursionTable = {}
+    local s = prettystr_sub(v, 1, keeponeline, M.PRINT_TABLE_REF_IN_ERROR_MSG, recursionTable)
+    if recursionTable.recursionDetected and not M.PRINT_TABLE_REF_IN_ERROR_MSG then
+        -- some table contain recursive references,
+        -- so we must recompute the value by including all table references
+        -- else the result looks like crap
+        recursionTable = {}
+        s = prettystr_sub(v, 1, keeponeline, true, recursionTable)
+    end
+    return s
+end
+M.prettystr = prettystr
+
+local function prettystrPadded(value1, value2, suffix_a, suffix_b)
+    --[[
+    This function helps with the recurring task of constructing the "expected
+    vs. actual" error messages. It takes two arbitrary values and formats
+    corresponding strings with prettystr().
+
+    To keep the (possibly complex) output more readable in case the resulting
+    strings contain line breaks, they get automatically prefixed with additional
+    newlines. Both suffixes are optional (default to empty strings), and get
+    appended to the "value1" string. "suffix_a" is used if line breaks were
+    encountered, "suffix_b" otherwise.
+
+    Returns the two formatted strings (including padding/newlines).
+    ]]
+    local str1, str2 = prettystr(value1), prettystr(value2)
+    if hasNewLine(str1) or hasNewLine(str2) then
+        -- line break(s) detected, add padding
+        return "\n" .. str1 .. (suffix_a or ""), "\n" .. str2
+    end
+    return str1 .. (suffix_b or ""), str2
+end
+M.private.prettystrPadded = prettystrPadded
+
+local function _table_keytostring(k)
+    -- like prettystr but do not enclose with "" if the string is just alphanumerical
+    -- this is better for displaying table keys who are often simple strings
+    if "string" == type(k) and k:match("^[_%a][_%w]*$") then
+        return k
+    end
+    return prettystr(k)
+end
+M.private._table_keytostring = _table_keytostring
+
+local TABLE_TOSTRING_SEP = ", "
+local TABLE_TOSTRING_SEP_LEN = string.len(TABLE_TOSTRING_SEP)
+
+local function _table_tostring( tbl, indentLevel, printTableRefs, recursionTable )
+    printTableRefs = printTableRefs or M.PRINT_TABLE_REF_IN_ERROR_MSG
+    recursionTable = recursionTable or {}
+    recursionTable[tbl] = true
+
+    local result, dispOnMultLines = {}, false
+
+    local entry, count, seq_index = nil, 0, 1
+    for k, v in sortedPairs( tbl ) do
+        if k == seq_index then
+            -- for the sequential part of tables, we'll skip the "<key>=" output
+            entry = ''
+            seq_index = seq_index + 1
+        else
+            entry = _table_keytostring( k ) .. "="
+        end
+        if recursionTable[v] then -- recursion detected!
+            recursionTable.recursionDetected = true
+            entry = entry .. "<"..tostring(v)..">"
+        else
+            entry = entry ..
+                prettystr_sub( v, indentLevel+1, true, printTableRefs, recursionTable )
+        end
+        count = count + 1
+        result[count] = entry
+    end
+
+    -- set dispOnMultLines if the maximum LINE_LENGTH would be exceeded
+    local totalLength = 0
+    for k, v in ipairs( result ) do
+        totalLength = totalLength + string.len( v )
+        if totalLength >= M.LINE_LENGTH then
+            dispOnMultLines = true
+            break
+        end
+    end
+
+    if not dispOnMultLines then
+        -- adjust with length of separator(s):
+        -- two items need 1 sep, three items two seps, ... plus len of '{}'
+        if count > 0 then
+            totalLength = totalLength + TABLE_TOSTRING_SEP_LEN * (count - 1)
+        end
+        dispOnMultLines = totalLength + 2 >= M.LINE_LENGTH
+    end
+
+    -- now reformat the result table (currently holding element strings)
+    if dispOnMultLines then
+        local indentString = string.rep("    ", indentLevel - 1)
+        result = {"{\n    ", indentString,
+                  table.concat(result, ",\n    " .. indentString), "\n",
+                  indentString, "}"}
+    else
+        result = {"{", table.concat(result, TABLE_TOSTRING_SEP), "}"}
+    end
+    if printTableRefs then
+        table.insert(result, 1, "<"..tostring(tbl).."> ") -- prepend table ref
+    end
+    return table.concat(result)
+end
+M.private._table_tostring = _table_tostring -- prettystr_sub() needs it
+
+local function _table_contains(t, element)
+    if t then
+        for _, value in pairs(t) do
+            if type(value) == type(element) then
+                if type(element) == 'table' then
+                    -- if we wanted recursive items content comparison, we could use
+                    -- _is_table_items_equals(v, expected) but one level of just comparing
+                    -- items is sufficient
+                    if M.private._is_table_equals( value, element ) then
+                        return true
+                    end
+                else
+                    if value == element then
+                        return true
+                    end
+                end
+            end
+        end
+    end
+    return false
+end
+
+local function _is_table_items_equals(actual, expected )
+    if (type(actual) == 'table') and (type(expected) == 'table') then
+        for k,v in pairs(actual) do
+            if not _table_contains(expected, v) then
+                return false
+            end
+        end
+        for k,v in pairs(expected) do
+            if not _table_contains(actual, v) then
+                return false
+            end
+        end
+        return true
+    elseif type(actual) ~= type(expected) then
+        return false
+    elseif actual == expected then
+        return true
+    end
+    return false
+end
+
+local function _is_table_equals(actual, expected)
+    if (type(actual) == 'table') and (type(expected) == 'table') then
+        if (#actual ~= #expected) then
+            return false
+        end
+
+        local actualTableKeys = {}
+        for k,v in pairs(actual) do
+            if M.TABLE_EQUALS_KEYBYCONTENT and type(k) == "table" then
+                -- If the keys are tables, things get a bit tricky here as we
+                -- can have _is_table_equals(k1, k2) and t[k1] ~= t[k2]. So we
+                -- collect actual's table keys, group them by length for
+                -- performance, and then for each table key in expected we look
+                -- it up in actualTableKeys.
+                if not actualTableKeys[#k] then actualTableKeys[#k] = {} end
+                table.insert(actualTableKeys[#k], k)
+            else
+                if not _is_table_equals(v, expected[k]) then
+                    return false
+                end
+            end
+        end
+
+        for k,v in pairs(expected) do
+            if M.TABLE_EQUALS_KEYBYCONTENT and type(k) == "table" then
+                local candidates = actualTableKeys[#k]
+                if not candidates then return false end
+                local found
+                for i, candidate in pairs(candidates) do
+                    if _is_table_equals(candidate, k) then
+                        found = candidate
+                        -- Remove the candidate we matched against from the list
+                        -- of candidates, so each key in actual can only match
+                        -- one key in expected.
+                        candidates[i] = nil
+                        break
+                    end
+                end
+                if not(found and _is_table_equals(actual[found], v)) then return false end
+            else
+                if not _is_table_equals(v, actual[k]) then
+                    return false
+                end
+            end
+        end
+
+        if M.TABLE_EQUALS_KEYBYCONTENT then
+            for _, keys in pairs(actualTableKeys) do
+                -- if there are any keys left in any actualTableKeys[i] then
+                -- that is a key in actual with no matching key in expected,
+                -- and so the tables aren't equal.
+                if next(keys) then return false end
+            end
+        end
+
+        return true
+    elseif type(actual) ~= type(expected) then
+        return false
+    elseif actual == expected then
+        return true
+    end
+    return false
+end
+M.private._is_table_equals = _is_table_equals
+
+local function failure(msg, level)
+    -- raise an error indicating a test failure
+    -- for error() compatibility we adjust "level" here (by +1), to report the
+    -- calling context
+    -- NOTE(review): M.FAILURE_PREFIX presumably lets the runner distinguish
+    -- assertion failures from plain Lua errors -- confirm where it is stripped.
+    error(M.FAILURE_PREFIX .. msg, (level or 1) + 1)
+end
+
+local function fail_fmt(level, ...)
+     -- failure with printf-style formatted message and given error level
+     -- (note: "level" is a separate first argument; the varargs go to format())
+    failure(string.format(...), (level or 1) + 1)
+end
+M.private.fail_fmt = fail_fmt
+
+local function error_fmt(level, ...)
+     -- printf-style error()
+    error(string.format(...), (level or 1) + 1)
+end
+
+----------------------------------------------------------------
+--
+--                     assertions
+--
+----------------------------------------------------------------
+
+local function errorMsgEquality(actual, expected)
+    -- Build the standard "expected vs. actual" message, honouring the
+    -- configured argument order (M.ORDER_ACTUAL_EXPECTED).
+    if not M.ORDER_ACTUAL_EXPECTED then
+        actual, expected = expected, actual
+    end
+    local expType = type(expected)
+    if expType == 'string' or expType == 'table' then
+        -- potentially multi-line values: put each on its own line
+        local expStr, actStr = prettystrPadded(expected, actual)
+        return string.format("expected: %s\nactual: %s", expStr, actStr)
+    end
+    return string.format("expected: %s, actual: %s",
+                         prettystr(expected), prettystr(actual))
+end
+
+function M.assertError(f, ...)
+    -- Assert that f(...) raises an error.
+    -- example: assertError( f, 1, 2 ) => f(1,2) should generate an error
+    local callSucceeded = pcall( f, ... )
+    if callSucceeded then
+        failure( "Expected an error when calling function but no error generated", 2 )
+    end
+end
+
+function M.assertTrue(value)
+    -- Fail unless value is truthy (any value except false/nil).
+    if value then return end
+    failure("expected: true, actual: " ..prettystr(value), 2)
+end
+
+function M.assertFalse(value)
+    -- Fail unless value is falsy (false or nil).
+    if not value then return end
+    failure("expected: false, actual: " ..prettystr(value), 2)
+end
+
+function M.assertIsNil(value)
+    -- Fail unless value is exactly nil (false does not count).
+    if value == nil then return end
+    failure("expected: nil, actual: " ..prettystr(value), 2)
+end
+
+function M.assertNotIsNil(value)
+    -- Fail when value is nil; any non-nil value (including false) passes.
+    if value ~= nil then return end
+    failure("expected non nil value, received nil", 2)
+end
+
+function M.assertEquals(actual, expected)
+    -- Equality assertion: deep comparison when both values are tables,
+    -- plain == (after a type check) otherwise.
+    local equal
+    if type(actual) == 'table' and type(expected) == 'table' then
+        equal = _is_table_equals(actual, expected)
+    else
+        equal = (type(actual) == type(expected)) and (actual == expected)
+    end
+    if not equal then
+        failure( errorMsgEquality(actual, expected), 2 )
+    end
+end
+
+-- Help Lua in corner cases like almostEquals(1.1, 1.0, 0.1), which by default
+-- may not work. We need to give margin a small boost; EPSILON defines the
+-- default value to use for this:
+local EPSILON = 0.00000000001
+function M.almostEquals( actual, expected, margin, margin_boost )
+    -- True when |expected - actual| <= margin (+ small boost for float noise).
+    -- All three mandatory arguments must be numbers; margin must be > 0.
+    local allNumbers = type(actual) == 'number'
+                       and type(expected) == 'number'
+                       and type(margin) == 'number'
+    if not allNumbers then
+        error_fmt(3, 'almostEquals: must supply only number arguments.\nArguments supplied: %s, %s, %s',
+            prettystr(actual), prettystr(expected), prettystr(margin))
+    end
+    if margin <= 0 then
+        error('almostEquals: margin must be positive, current value is ' .. margin, 3)
+    end
+    return math.abs(expected - actual) <= margin + (margin_boost or EPSILON)
+end
+
+function M.assertAlmostEquals( actual, expected, margin )
+    -- check that two floats are close by margin
+    if M.almostEquals(actual, expected, margin) then return end
+    if not M.ORDER_ACTUAL_EXPECTED then
+        actual, expected = expected, actual
+    end
+    fail_fmt(2, 'Values are not almost equal\nExpected: %s with margin of %s, received: %s',
+             expected, margin, actual)
+end
+
+function M.assertNotEquals(actual, expected)
+    -- Fail only when the two values compare equal (deep for tables).
+    -- Differing types pass immediately.
+    local equal = false
+    if type(actual) == type(expected) then
+        if type(actual) == 'table' then
+            equal = _is_table_equals(actual, expected)
+        else
+            equal = (actual == expected)
+        end
+    end
+    if equal then
+        fail_fmt(2, 'Received the not expected value: %s', prettystr(actual))
+    end
+end
+
+function M.assertNotAlmostEquals( actual, expected, margin )
+    -- check that two floats are not close by margin
+    if not M.almostEquals(actual, expected, margin) then return end
+    if not M.ORDER_ACTUAL_EXPECTED then
+        actual, expected = expected, actual
+    end
+    fail_fmt(2, 'Values are almost equal\nExpected: %s with a difference above margin of %s, received: %s',
+             expected, margin, actual)
+end
+
+function M.assertStrContains( str, sub, useRe )
+    -- string.find does the work: plain-text search unless useRe is set.
+    -- Note: a string always contains the empty string.
+    local plainSearch = not useRe
+    if string.find(str, sub, 1, plainSearch) then return end
+    sub, str = prettystrPadded(sub, str, '\n')
+    fail_fmt(2, 'Error, %s %s was not found in string %s',
+             useRe and 'regexp' or 'substring', sub, str)
+end
+
+function M.assertStrIContains( str, sub )
+    -- Case-insensitive plain-text containment, via string.find on
+    -- lowercased copies. A string always contains the empty string.
+    if string.find(str:lower(), sub:lower(), 1, true) then return end
+    sub, str = prettystrPadded(sub, str, '\n')
+    fail_fmt(2, 'Error, substring %s was not found (case insensitively) in string %s',
+             sub, str)
+end
+
+function M.assertNotStrContains( str, sub, useRe )
+    -- Inverse of assertStrContains: fail when sub IS found in str.
+    -- Plain-text search unless useRe is set.
+    local plainSearch = not useRe
+    if not string.find(str, sub, 1, plainSearch) then return end
+    sub, str = prettystrPadded(sub, str, '\n')
+    fail_fmt(2, 'Error, %s %s was found in string %s',
+             useRe and 'regexp' or 'substring', sub, str)
+end
+
+function M.assertNotStrIContains( str, sub )
+    -- Inverse of assertStrIContains: fail when sub is found in str,
+    -- ignoring case (plain-text search on lowercased copies).
+    if not string.find(str:lower(), sub:lower(), 1, true) then return end
+    sub, str = prettystrPadded(sub, str, '\n')
+    fail_fmt(2, 'Error, substring %s was found (case insensitively) in string %s',
+             sub, str)
+end
+
+function M.assertStrMatches( str, pattern, start, final )
+    -- Verify a full match for the string
+    -- for a partial match, simply use assertStrContains with useRe set to true
+    if strMatch( str, pattern, start, final ) then return end
+    pattern, str = prettystrPadded(pattern, str, '\n')
+    fail_fmt(2, 'Error, pattern %s was not matched by string %s',
+             pattern, str)
+end
+
+function M.assertErrorMsgEquals( expectedMsg, func, ... )
+    -- func(...) must raise an error whose message equals expectedMsg exactly.
+    local no_error, error_msg = pcall( func, ... )
+    if no_error then
+        failure( 'No error generated when calling function but expected error: "'..expectedMsg..'"', 2 )
+    end
+    if error_msg == expectedMsg then return end
+    error_msg, expectedMsg = prettystrPadded(error_msg, expectedMsg)
+    fail_fmt(2, 'Exact error message expected: %s\nError message received: %s\n',
+             expectedMsg, error_msg)
+end
+
+function M.assertErrorMsgContains( partialMsg, func, ... )
+    -- func(...) must raise an error whose message contains partialMsg
+    -- (plain-text string.find, no patterns).
+    local no_error, error_msg = pcall( func, ... )
+    if no_error then
+        failure( 'No error generated when calling function but expected error containing: '..prettystr(partialMsg), 2 )
+    end
+    if string.find( error_msg, partialMsg, nil, true ) then return end
+    error_msg, partialMsg = prettystrPadded(error_msg, partialMsg)
+    fail_fmt(2, 'Error message does not contain: %s\nError message received: %s\n',
+             partialMsg, error_msg)
+end
+
+function M.assertErrorMsgMatches( expectedMsg, func, ... )
+    -- func(...) must raise an error whose message fully matches the
+    -- Lua pattern expectedMsg (via strMatch).
+    local no_error, error_msg = pcall( func, ... )
+    if no_error then
+        failure( 'No error generated when calling function but expected error matching: "'..expectedMsg..'"', 2 )
+    end
+    if strMatch( error_msg, expectedMsg ) then return end
+    expectedMsg, error_msg = prettystrPadded(expectedMsg, error_msg)
+    fail_fmt(2, 'Error message does not match: %s\nError message received: %s\n',
+             expectedMsg, error_msg)
+end
+
+--[[
+Add type assertion functions to the module table M. Each of these functions
+takes a single parameter "value", and checks that its Lua type matches the
+expected string (derived from the function name):
+
+M.assertIsXxx(value) -> ensure that type(value) conforms to "xxx"
+]]
+for _, funcName in ipairs(
+    {'assertIsNumber', 'assertIsString', 'assertIsTable', 'assertIsBoolean',
+     'assertIsFunction', 'assertIsUserdata', 'assertIsThread'}
+) do
+    local typeExpected = funcName:match("^assertIs([A-Z]%a*)$")
+    -- Lua type() always returns lowercase, also make sure the match() succeeded
+    -- (the and/or chain falls through to error() when match() returned nil,
+    -- i.e. a list entry does not follow the assertIsXxx naming scheme)
+    typeExpected = typeExpected and typeExpected:lower()
+                   or error("bad function name '"..funcName.."' for type assertion")
+
+    -- closure captures typeExpected for this particular assertion
+    M[funcName] = function(value)
+        if type(value) ~= typeExpected then
+            fail_fmt(2, 'Expected: a %s value, actual: type %s, value %s',
+                     typeExpected, type(value), prettystrPadded(value))
+        end
+    end
+end
+
+--[[
+Add non-type assertion functions to the module table M. Each of these functions
+takes a single parameter "value", and checks that its Lua type differs from the
+expected string (derived from the function name):
+
+M.assertNotIsXxx(value) -> ensure that type(value) is not "xxx"
+]]
+for _, funcName in ipairs(
+    {'assertNotIsNumber', 'assertNotIsString', 'assertNotIsTable', 'assertNotIsBoolean',
+     'assertNotIsFunction', 'assertNotIsUserdata', 'assertNotIsThread'}
+) do
+    local typeUnexpected = funcName:match("^assertNotIs([A-Z]%a*)$")
+    -- Lua type() always returns lowercase, also make sure the match() succeeded
+    -- (the and/or chain falls through to error() when match() returned nil)
+    typeUnexpected = typeUnexpected and typeUnexpected:lower()
+                   or error("bad function name '"..funcName.."' for type assertion")
+
+    -- closure captures typeUnexpected for this particular assertion
+    M[funcName] = function(value)
+        if type(value) == typeUnexpected then
+            fail_fmt(2, 'Not expected: a %s type, actual: value %s',
+                     typeUnexpected, prettystrPadded(value))
+        end
+    end
+end
+
+function M.assertIs(actual, expected)
+    -- Assert that both arguments are the same value/object (per ==),
+    -- e.g. the very same table rather than a structurally equal copy.
+    if actual == expected then return end
+    if not M.ORDER_ACTUAL_EXPECTED then
+        actual, expected = expected, actual
+    end
+    expected, actual = prettystrPadded(expected, actual, '\n', ', ')
+    fail_fmt(2, 'Expected object and actual object are not the same\nExpected: %sactual: %s',
+             expected, actual)
+end
+
+function M.assertNotIs(actual, expected)
+    -- Assert that the two arguments are NOT the same value/object (per ==).
+    if actual ~= expected then return end
+    if not M.ORDER_ACTUAL_EXPECTED then
+        -- both values compare equal here, so this swap is effectively a no-op
+        expected = actual
+    end
+    fail_fmt(2, 'Expected object and actual object are the same object: %s',
+             prettystrPadded(expected))
+end
+
+function M.assertItemsEquals(actual, expected)
+    -- checks that the items of table expected
+    -- are contained in table actual. Warning, this function
+    -- is at least O(n^2)
+    if _is_table_items_equals(actual, expected ) then return end
+    expected, actual = prettystrPadded(expected, actual)
+    fail_fmt(2, 'Contents of the tables are not identical:\nExpected: %s\nActual: %s',
+             expected, actual)
+end
+
+----------------------------------------------------------------
+--                     Compatibility layer
+----------------------------------------------------------------
+
+-- for compatibility with LuaUnit v2.x
+function M.wrapFunctions(...)
+    -- Deprecated no-op kept for LuaUnit v2.x compatibility.
+    -- In LuaUnit <= 2.1 this registered test functions into the global test
+    -- suite; nowadays test discovery runs them directly, so this only prints
+    -- a deprecation notice. (Removed a large commented-out block of the old
+    -- implementation that was kept here as dead code.)
+    io.stderr:write( [[Use of WrapFunction() is no longer needed.
+Just prefix your test function names with "test" or "Test" and they
+will be picked up and run by LuaUnit.]] )
+end
+
+-- Data table consumed by the alias-creation loop below: each entry maps an
+-- official assertion name to one backward-compatible alias spelling.
+local list_of_funcs = {
+    -- { official function name , alias }
+
+    -- general assertions
+    { 'assertEquals'            , 'assert_equals' },
+    { 'assertItemsEquals'       , 'assert_items_equals' },
+    { 'assertNotEquals'         , 'assert_not_equals' },
+    { 'assertAlmostEquals'      , 'assert_almost_equals' },
+    { 'assertNotAlmostEquals'   , 'assert_not_almost_equals' },
+    { 'assertTrue'              , 'assert_true' },
+    { 'assertFalse'             , 'assert_false' },
+    { 'assertStrContains'       , 'assert_str_contains' },
+    { 'assertStrIContains'      , 'assert_str_icontains' },
+    { 'assertNotStrContains'    , 'assert_not_str_contains' },
+    { 'assertNotStrIContains'   , 'assert_not_str_icontains' },
+    { 'assertStrMatches'        , 'assert_str_matches' },
+    { 'assertError'             , 'assert_error' },
+    { 'assertErrorMsgEquals'    , 'assert_error_msg_equals' },
+    { 'assertErrorMsgContains'  , 'assert_error_msg_contains' },
+    { 'assertErrorMsgMatches'   , 'assert_error_msg_matches' },
+    { 'assertIs'                , 'assert_is' },
+    { 'assertNotIs'             , 'assert_not_is' },
+    { 'wrapFunctions'           , 'WrapFunctions' },
+    { 'wrapFunctions'           , 'wrap_functions' },
+
+    -- type assertions: assertIsXXX -> assert_is_xxx
+    { 'assertIsNumber'          , 'assert_is_number' },
+    { 'assertIsString'          , 'assert_is_string' },
+    { 'assertIsTable'           , 'assert_is_table' },
+    { 'assertIsBoolean'         , 'assert_is_boolean' },
+    { 'assertIsNil'             , 'assert_is_nil' },
+    { 'assertIsFunction'        , 'assert_is_function' },
+    { 'assertIsThread'          , 'assert_is_thread' },
+    { 'assertIsUserdata'        , 'assert_is_userdata' },
+
+    -- type assertions: assertIsXXX -> assertXxx
+    { 'assertIsNumber'          , 'assertNumber' },
+    { 'assertIsString'          , 'assertString' },
+    { 'assertIsTable'           , 'assertTable' },
+    { 'assertIsBoolean'         , 'assertBoolean' },
+    { 'assertIsNil'             , 'assertNil' },
+    { 'assertIsFunction'        , 'assertFunction' },
+    { 'assertIsThread'          , 'assertThread' },
+    { 'assertIsUserdata'        , 'assertUserdata' },
+
+    -- type assertions: assertIsXXX -> assert_xxx (luaunit v2 compat)
+    { 'assertIsNumber'          , 'assert_number' },
+    { 'assertIsString'          , 'assert_string' },
+    { 'assertIsTable'           , 'assert_table' },
+    { 'assertIsBoolean'         , 'assert_boolean' },
+    { 'assertIsNil'             , 'assert_nil' },
+    { 'assertIsFunction'        , 'assert_function' },
+    { 'assertIsThread'          , 'assert_thread' },
+    { 'assertIsUserdata'        , 'assert_userdata' },
+
+    -- type assertions: assertNotIsXXX -> assert_not_is_xxx
+    { 'assertNotIsNumber'       , 'assert_not_is_number' },
+    { 'assertNotIsString'       , 'assert_not_is_string' },
+    { 'assertNotIsTable'        , 'assert_not_is_table' },
+    { 'assertNotIsBoolean'      , 'assert_not_is_boolean' },
+    { 'assertNotIsNil'          , 'assert_not_is_nil' },
+    { 'assertNotIsFunction'     , 'assert_not_is_function' },
+    { 'assertNotIsThread'       , 'assert_not_is_thread' },
+    { 'assertNotIsUserdata'     , 'assert_not_is_userdata' },
+
+    -- type assertions: assertNotIsXXX -> assertNotXxx (luaunit v2 compat)
+    { 'assertNotIsNumber'       , 'assertNotNumber' },
+    { 'assertNotIsString'       , 'assertNotString' },
+    { 'assertNotIsTable'        , 'assertNotTable' },
+    { 'assertNotIsBoolean'      , 'assertNotBoolean' },
+    { 'assertNotIsNil'          , 'assertNotNil' },
+    { 'assertNotIsFunction'     , 'assertNotFunction' },
+    { 'assertNotIsThread'       , 'assertNotThread' },
+    { 'assertNotIsUserdata'     , 'assertNotUserdata' },
+
+    -- type assertions: assertNotIsXXX -> assert_not_xxx
+    { 'assertNotIsNumber'       , 'assert_not_number' },
+    { 'assertNotIsString'       , 'assert_not_string' },
+    { 'assertNotIsTable'        , 'assert_not_table' },
+    { 'assertNotIsBoolean'      , 'assert_not_boolean' },
+    { 'assertNotIsNil'          , 'assert_not_nil' },
+    { 'assertNotIsFunction'     , 'assert_not_function' },
+    { 'assertNotIsThread'       , 'assert_not_thread' },
+    { 'assertNotIsUserdata'     , 'assert_not_userdata' },
+
+    -- all assertions with Coroutine duplicate Thread assertions
+    { 'assertIsThread'          , 'assertIsCoroutine' },
+    { 'assertIsThread'          , 'assertCoroutine' },
+    { 'assertIsThread'          , 'assert_is_coroutine' },
+    { 'assertIsThread'          , 'assert_coroutine' },
+    { 'assertNotIsThread'       , 'assertNotIsCoroutine' },
+    { 'assertNotIsThread'       , 'assertNotCoroutine' },
+    { 'assertNotIsThread'       , 'assert_not_is_coroutine' },
+    { 'assertNotIsThread'       , 'assert_not_coroutine' },
+}
+
+-- Create all aliases in M
+for _, v in ipairs( list_of_funcs ) do
+    -- FIX: these were accidentally global, leaking 'funcname' and 'alias'
+    -- into _G on module load; declare them local.
+    local funcname, alias = v[1], v[2]
+    M[alias] = M[funcname]
+
+    if EXPORT_ASSERT_TO_GLOBALS then
+        _G[funcname] = M[funcname]
+        _G[alias] = M[funcname]
+    end
+end
+
+----------------------------------------------------------------
+--
+--                     Outputters
+--
+----------------------------------------------------------------
+
+----------------------------------------------------------------
+--                     class TapOutput
+----------------------------------------------------------------
+
+
+local TapOutput = { __class__ = 'TapOutput' } -- class
+local TapOutput_MT = { __index = TapOutput } -- metatable
+
+    -- For a good reference for TAP format, check: http://testanything.org/tap-specification.html
+
+    function TapOutput:new()
+        -- TAP reporting defaults to the lowest verbosity
+        return setmetatable( { verbosity = M.VERBOSITY_LOW }, TapOutput_MT)
+    end
+    function TapOutput:startSuite()
+        -- TAP plan line ("1..N") followed by a diagnostic with the start date
+        print("1.."..self.result.testCount)
+        print('# Started on '..self.result.startDate)
+    end
+    function TapOutput:startClass(className)
+        -- '[TestFunctions]' is the pseudo-class for free test functions;
+        -- no diagnostic line is printed for it
+        if className ~= '[TestFunctions]' then
+            print('# Starting class: '..className)
+        end
+    end
+    function TapOutput:startTest(testName) end
+
+    function TapOutput:addFailure( node )
+        -- "not ok <number>\t<name>", with message/stack as indented extra
+        -- lines depending on verbosity
+        io.stdout:write("not ok ", self.result.currentTestNumber, "\t", node.testName, "\n")
+        if self.verbosity > M.VERBOSITY_LOW then
+           print( prefixString( '    ', node.msg ) )
+        end
+        if self.verbosity > M.VERBOSITY_DEFAULT then
+           print( prefixString( '    ', node.stackTrace ) )
+        end
+    end
+    -- errors are reported exactly like failures in TAP
+    TapOutput.addError = TapOutput.addFailure
+
+    function TapOutput:endTest( node )
+        -- only passing tests are reported here; failures/errors were already
+        -- written by addFailure/addError
+        if node:isPassed() then
+            io.stdout:write("ok     ", self.result.currentTestNumber, "\t", node.testName, "\n")
+        end
+    end
+
+    function TapOutput:endClass() end
+
+    function TapOutput:endSuite()
+        -- final diagnostic summary; returns the number of non-passed tests
+        print( '# '..M.LuaUnit.statusLine( self.result ) )
+        return self.result.notPassedCount
+    end
+
+
+-- class TapOutput end
+
+----------------------------------------------------------------
+--                     class JUnitOutput
+----------------------------------------------------------------
+
+-- See directory junitxml for more information about the junit format
+-- See directory junitxml for more information about the junit format
+local JUnitOutput = { __class__ = 'JUnitOutput' } -- class
+local JUnitOutput_MT = { __index = JUnitOutput } -- metatable
+
+    function JUnitOutput:new()
+        return setmetatable(
+            { testList = {}, verbosity = M.VERBOSITY_LOW }, JUnitOutput_MT)
+    end
+    function JUnitOutput:startSuite()
+
+        -- open xml file early to deal with errors
+        if self.fname == nil then
+            error('With Junit, an output filename must be supplied with --name!')
+        end
+        -- force a .xml extension on the output file name
+        if string.sub(self.fname,-4) ~= '.xml' then
+            self.fname = self.fname..'.xml'
+        end
+        self.fd = io.open(self.fname, "w")
+        if self.fd == nil then
+            error("Could not open file for writing: "..self.fname)
+        end
+
+        print('# XML output to '..self.fname)
+        print('# Started on '..self.result.startDate)
+    end
+    function JUnitOutput:startClass(className)
+        -- '[TestFunctions]' is the pseudo-class for free test functions
+        if className ~= '[TestFunctions]' then
+            print('# Starting class: '..className)
+        end
+    end
+    function JUnitOutput:startTest(testName)
+        print('# Starting test: '..testName)
+    end
+
+    function JUnitOutput:addFailure( node )
+        print('# Failure: ' .. node.msg)
+        -- print('# ' .. node.stackTrace)
+    end
+
+    function JUnitOutput:addError( node )
+        print('# Error: ' .. node.msg)
+        -- print('# ' .. node.stackTrace)
+    end
+
+    function JUnitOutput:endTest( node )
+    end
+
+    function JUnitOutput:endClass()
+    end
+
+    function JUnitOutput:endSuite()
+        -- print the text summary, then write the whole XML report at once;
+        -- returns the number of non-passed tests
+        print( '# '..M.LuaUnit.statusLine(self.result))
+
+        -- XML file writing
+        self.fd:write('<?xml version="1.0" encoding="UTF-8" ?>\n')
+        self.fd:write('<testsuites>\n')
+        self.fd:write(string.format(
+            '    <testsuite name="LuaUnit" id="00001" package="" hostname="localhost" tests="%d" timestamp="%s" time="%0.3f" errors="%d" failures="%d">\n',
+            self.result.runCount, self.result.startIsodate, self.result.duration, self.result.errorCount, self.result.failureCount ))
+        self.fd:write("        <properties>\n")
+        self.fd:write(string.format('            <property name="Lua Version" value="%s"/>\n', _VERSION ) )
+        self.fd:write(string.format('            <property name="LuaUnit Version" value="%s"/>\n', M.VERSION) )
+        -- XXX please include system name and version if possible
+        self.fd:write("        </properties>\n")
+
+        -- one <testcase> per executed test; failed/errored tests embed their
+        -- status sub-element via node:statusXML()
+        for i,node in ipairs(self.result.tests) do
+            self.fd:write(string.format('        <testcase classname="%s" name="%s" time="%0.3f">\n',
+                node.className, node.testName, node.duration ) )
+            if node:isNotPassed() then
+                self.fd:write(node:statusXML())
+            end
+            self.fd:write('        </testcase>\n')
+        end
+
+        -- Next two lines are needed to validate junit ANT xsd, but really not useful in general:
+        self.fd:write('    <system-out/>\n')
+        self.fd:write('    <system-err/>\n')
+
+        self.fd:write('    </testsuite>\n')
+        self.fd:write('</testsuites>\n')
+        self.fd:close()
+        return self.result.notPassedCount
+    end
+
+
+-- class JUnitOutput end
+
+----------------------------------------------------------------
+--                     class TextOutput
+----------------------------------------------------------------
+
+--[[
+
+-- Python Non verbose:
+
+For each test: . or F or E
+
+If some failed tests:
+    ==============
+    ERROR / FAILURE: TestName (testfile.testclass)
+    ---------
+    Stack trace
+
+
+then --------------
+then "Ran x tests in 0.000s"
+then OK or FAILED (failures=1, error=1)
+
+-- Python Verbose:
+testname (filename.classname) ... ok
+testname (filename.classname) ... FAIL
+testname (filename.classname) ... ERROR
+
+then --------------
+then "Ran x tests in 0.000s"
+then OK or FAILED (failures=1, error=1)
+
+-- Ruby:
+Started
+ .
+ Finished in 0.002695 seconds.
+
+ 1 tests, 2 assertions, 0 failures, 0 errors
+
+-- Ruby:
+>> ruby tc_simple_number2.rb
+Loaded suite tc_simple_number2
+Started
+F..
+Finished in 0.038617 seconds.
+
+  1) Failure:
+test_failure(TestSimpleNumber) [tc_simple_number2.rb:16]:
+Adding doesn't work.
+<3> expected but was
+<4>.
+
+3 tests, 4 assertions, 1 failures, 0 errors
+
+-- Java Junit
+.......F.
+Time: 0,003
+There was 1 failure:
+1) testCapacity(junit.samples.VectorTest)junit.framework.AssertionFailedError
+    at junit.samples.VectorTest.testCapacity(VectorTest.java:87)
+    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
+    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
+    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
+
+FAILURES!!!
+Tests run: 8,  Failures: 1,  Errors: 0
+
+
+-- Maven
+
+# mvn test
+-------------------------------------------------------
+ T E S T S
+-------------------------------------------------------
+Running math.AdditionTest
+Tests run: 2, Failures: 1, Errors: 0, Skipped: 0, Time elapsed:
+0.03 sec <<< FAILURE!
+
+Results :
+
+Failed tests:
+  testLireSymbole(math.AdditionTest)
+
+Tests run: 2, Failures: 1, Errors: 0, Skipped: 0
+
+
+-- LuaUnit
+---- non verbose
+* display . or F or E when running tests
+---- verbose
+* display test name + ok/fail
+----
+* blank line
+* number) ERROR or FAILURE: TestName
+   Stack trace
+* blank line
+* number) ERROR or FAILURE: TestName
+   Stack trace
+
+then --------------
+then "Ran x tests in 0.000s (%d not selected, %d skipped)"
+then OK or FAILED (failures=1, error=1)
+
+
+]]
+
+local TextOutput = { __class__ = 'TextOutput' } -- class
+local TextOutput_MT = { __index = TextOutput } -- metatable
+
+    function TextOutput:new()
+        return setmetatable(
+            { errorList = {}, verbosity = M.VERBOSITY_DEFAULT }, TextOutput_MT )
+    end
+
+    function TextOutput:startSuite()
+        -- header line only in verbose mode
+        if self.verbosity > M.VERBOSITY_DEFAULT then
+            print( 'Started on '.. self.result.startDate )
+        end
+    end
+
+    function TextOutput:startClass(className)
+        -- display nothing when starting a new class
+    end
+
+    function TextOutput:startTest(testName)
+        -- verbose mode: print the test name now; the outcome is appended
+        -- on the same line by endTest()
+        if self.verbosity > M.VERBOSITY_DEFAULT then
+            io.stdout:write( "    ", self.result.currentNode.testName, " ... " )
+        end
+    end
+
+    function TextOutput:addFailure( node )
+        -- nothing; failures are reported by endTest()/endSuite()
+    end
+
+    function TextOutput:addError( node )
+        -- nothing; errors are reported by endTest()/endSuite()
+    end
+
+    function TextOutput:endTest( node )
+        -- verbose mode: full status word; otherwise one character per test
+        if node:isPassed() then
+            if self.verbosity > M.VERBOSITY_DEFAULT then
+                io.stdout:write("Ok\n")
+            else
+                io.stdout:write(".")
+            end
+        else
+            if self.verbosity > M.VERBOSITY_DEFAULT then
+                print( node.status )
+                print( node.msg )
+                -- TODO: decide at which verbosity level node.stackTrace
+                -- should also be printed here
+            else
+                -- write only the first character of status
+                io.stdout:write(string.sub(node.status, 1, 1))
+            end
+        end
+    end
+
+    function TextOutput:endClass()
+        -- nothing
+    end
+
+    function TextOutput:displayOneFailedTest( index, failure )
+        -- print one failed test: "<index>) <name>", its message and stack
+        -- trace (note: the "failure" parameter shadows the local failure()
+        -- helper, harmlessly, since it is not called here)
+        print(index..") "..failure.testName )
+        print( failure.msg )
+        print( failure.stackTrace )
+        print()
+    end
+
+    function TextOutput:displayFailedTests()
+        if self.result.notPassedCount == 0 then return end
+        print("Failed tests:")
+        print("-------------")
+        for i,v in ipairs(self.result.notPassed) do
+            self:displayOneFailedTest( i, v )
+        end
+    end
+
+    function TextOutput:endSuite()
+        -- separator, failed-test details, status line, and "OK" on success
+        if self.verbosity > M.VERBOSITY_DEFAULT then
+            print("=========================================================")
+        else
+            print()
+        end
+        self:displayFailedTests()
+        print( M.LuaUnit.statusLine( self.result ) )
+        -- (removed unused local "ignoredString")
+        if self.result.notPassedCount == 0 then
+            print('OK')
+        end
+    end
+
+-- class TextOutput end
+
+
+----------------------------------------------------------------
+--                     class NilOutput
+----------------------------------------------------------------
+
+local function nopCallable()
+    -- returns itself, so any chained method lookup/call on NilOutput
+    -- resolves to another harmless no-op
+    return nopCallable
+end
+
+-- FIX: class tag was misspelled 'NilOuptut'
+local NilOutput = { __class__ = 'NilOutput' } -- class
+local NilOutput_MT = { __index = nopCallable } -- metatable
+
+function NilOutput:new()
+    return setmetatable( { __class__ = 'NilOutput' }, NilOutput_MT )
+end
+
+----------------------------------------------------------------
+--
+--                     class LuaUnit
+--
+----------------------------------------------------------------
+
+M.LuaUnit = {
+    outputType = TextOutput,        -- default output class
+    verbosity = M.VERBOSITY_DEFAULT,
+    __class__ = 'LuaUnit'
+}
+local LuaUnit_MT = { __index = M.LuaUnit }
+
+-- optionally expose the class as global "LuaUnit" for compatibility
+if EXPORT_ASSERT_TO_GLOBALS then
+    LuaUnit = M.LuaUnit
+end
+
+    function M.LuaUnit:new()
+        return setmetatable( {}, LuaUnit_MT )
+    end
+
+    -----------------[[ Utility methods ]]---------------------
+
+    function M.LuaUnit.asFunction(aObject)
+        -- return "aObject" if it is a function, and nil otherwise
+        if 'function' == type(aObject) then return aObject end
+    end
+
+    function M.LuaUnit.isClassMethod(aName)
+        -- return true if aName contains a class + a method name in the form
+        -- class.method (plain search for a '.' separator)
+        return string.find(aName, '.', nil, true) ~= nil
+    end
+
+    function M.LuaUnit.splitClassMethod(someName)
+        -- return a pair className, methodName for a name in the form
+        -- class.method (split on the first '.', plain search)
+        -- return nil if not a class + method name
+        local hasMethod, methodName, className
+        hasMethod = string.find(someName, '.', nil, true )
+        if not hasMethod then return nil end
+        methodName = string.sub(someName, hasMethod+1)
+        className = string.sub(someName,1,hasMethod-1)
+        return className, methodName
+    end
+
+    function M.LuaUnit.isMethodTestName( s )
+        -- returns true if the name matches the name of a test method
+        -- default rule is that it starts with 'Test' or with 'test'
+        -- (case-insensitive prefix check on the first 4 characters)
+        return string.sub(s, 1, 4):lower() == 'test'
+    end
+
+    function M.LuaUnit.isTestName( s )
+        -- returns true if the name matches the name of a test
+        -- default rule is that it starts with 'Test' or with 'test'
+        -- (currently identical to isMethodTestName)
+        return string.sub(s, 1, 4):lower() == 'test'
+    end
+
+    function M.LuaUnit.collectTests()
+        -- return a sorted list of all names in the global namespace
+        -- that match LuaUnit.isTestName
+        local testNames = {}
+        for k in pairs(_G) do  -- value was unused; only the name matters
+            if M.LuaUnit.isTestName( k ) then
+                table.insert( testNames , k )
+            end
+        end
+        table.sort( testNames )
+        return testNames
+    end
+
+    function M.LuaUnit.parseCmdLine( cmdLine )
+        -- parse the command line
+        -- Supported command line parameters:
+        -- --verbose, -v: increase verbosity
+        -- --quiet, -q: silence output
+        -- --error, -e: treat errors as fatal (quit program)
+        -- --output, -o, + name: select output type
+        -- --pattern, -p, + pattern: run test matching pattern, may be repeated
+        -- --name, -n, + fname: name of output file for junit, default to stdout
+        -- [testnames, ...]: run selected test names
+        --
+        -- Returns a table with the following fields:
+        -- verbosity: nil, M.VERBOSITY_DEFAULT, M.VERBOSITY_QUIET, M.VERBOSITY_VERBOSE
+        -- output: nil, 'tap', 'junit', 'text', 'nil'
+        -- testNames: nil or a list of test names to run
+        -- pattern: nil or a list of patterns
+
+        local result = {}
+        -- non-nil "state" means the previous option still expects a value
+        local state = nil
+        local SET_OUTPUT = 1
+        local SET_PATTERN = 2
+        local SET_FNAME = 3
+
+        if cmdLine == nil then
+            return result
+        end
+
+        -- Handle one '-x'/'--xxx' token. Value-less options update result
+        -- directly and return nil; value-taking options return the state
+        -- telling the main loop how to consume the next token.
+        local function parseOption( option )
+            if option == '--help' or option == '-h' then
+                result['help'] = true
+                return
+            elseif option == '--version' then
+                result['version'] = true
+                return
+            elseif option == '--verbose' or option == '-v' then
+                result['verbosity'] = M.VERBOSITY_VERBOSE
+                return
+            elseif option == '--quiet' or option == '-q' then
+                result['verbosity'] = M.VERBOSITY_QUIET
+                return
+            elseif option == '--error' or option == '-e' then
+                result['quitOnError'] = true
+                return
+            elseif option == '--failure' or option == '-f' then
+                result['quitOnFailure'] = true
+                return
+            elseif option == '--output' or option == '-o' then
+                state = SET_OUTPUT
+                return state
+            elseif option == '--name' or option == '-n' then
+                state = SET_FNAME
+                return state
+            elseif option == '--pattern' or option == '-p' then
+                state = SET_PATTERN
+                return state
+            end
+            error('Unknown option: '..option,3)
+        end
+
+        -- Store a value argument according to the pending parse state.
+        local function setArg( cmdArg, state )
+            if state == SET_OUTPUT then
+                result['output'] = cmdArg
+                return
+            elseif state == SET_FNAME then
+                result['fname'] = cmdArg
+                return
+            elseif state == SET_PATTERN then
+                if result['pattern'] then
+                    table.insert( result['pattern'], cmdArg )
+                else
+                    result['pattern'] = { cmdArg }
+                end
+                return
+            end
+            error('Unknown parse state: '.. state)
+        end
+
+
+        for _, cmdArg in ipairs(cmdLine) do
+            if state ~= nil then
+                -- previous option expects this token as its value
+                -- (was called with a stray third argument before; setArg
+                -- only takes two parameters)
+                setArg( cmdArg, state )
+                state = nil
+            else
+                if cmdArg:sub(1,1) == '-' then
+                    state = parseOption( cmdArg )
+                else
+                    -- bare word: a test name
+                    if result['testNames'] then
+                        table.insert( result['testNames'], cmdArg )
+                    else
+                        result['testNames'] = { cmdArg }
+                    end
+                end
+            end
+        end
+
+        if result['help'] then
+            M.LuaUnit.help()
+        end
+
+        if result['version'] then
+            M.LuaUnit.version()
+        end
+
+        -- a value-taking option was the last token: its value is missing
+        if state ~= nil then
+            error('Missing argument after '..cmdLine[ #cmdLine ],2 )
+        end
+
+        return result
+    end
+
+    -- Print the usage string and exit successfully.
+    function M.LuaUnit.help()
+        print(M.USAGE)
+        os.exit(0)
+    end
+
+    -- Print the LuaUnit version banner and exit successfully.
+    function M.LuaUnit.version()
+        print('LuaUnit v'..M.VERSION..' by Philippe Fremy <phil@freehackers.org>')
+        os.exit(0)
+    end
+
+    function M.LuaUnit.patternInclude( patternFilter, expr )
+        -- Return true when expr matches at least one of the patterns in
+        -- patternFilter; false when none match.
+        -- A nil patternFilter means "no filtering": everything is included.
+        if patternFilter == nil then
+            return true
+        end
+
+        for _, pattern in ipairs( patternFilter ) do
+            if expr:find( pattern ) then
+                return true
+            end
+        end
+
+        return false
+    end
+
+----------------------------------------------------------------
+--                     class NodeStatus
+----------------------------------------------------------------
+
+    -- NodeStatus records the outcome of one executed test: its sequence
+    -- number, test/class names, a status constant and, when not passed,
+    -- the message and stack trace.
+    local NodeStatus = { __class__ = 'NodeStatus' } -- class
+    local NodeStatus_MT = { __index = NodeStatus } -- metatable
+    M.NodeStatus = NodeStatus
+
+    -- values of status
+    NodeStatus.PASS  = 'PASS'
+    NodeStatus.FAIL  = 'FAIL'
+    NodeStatus.ERROR = 'ERROR'
+
+    -- Create a node in the PASS state; number is the test's 1-based index
+    -- in the run.
+    function NodeStatus:new( number, testName, className )
+        local t = { number = number, testName = testName, className = className }
+        setmetatable( t, NodeStatus_MT )
+        t:pass()
+        return t
+    end
+
+    -- Reset the node to PASS, clearing any previous message/stack trace.
+    function NodeStatus:pass()
+        self.status = self.PASS
+        -- useless but we know it's the field we want to use
+        self.msg = nil
+        self.stackTrace = nil
+    end
+
+    -- Mark the node as a test failure (an assertion did not hold).
+    function NodeStatus:fail(msg, stackTrace)
+        self.status = self.FAIL
+        self.msg = msg
+        self.stackTrace = stackTrace
+    end
+
+    -- Mark the node as an execution error (unexpected runtime error).
+    function NodeStatus:error(msg, stackTrace)
+        self.status = self.ERROR
+        self.msg = msg
+        self.stackTrace = stackTrace
+    end
+
+    -- True when the test passed.
+    function NodeStatus:isPassed()
+        return self.status == NodeStatus.PASS
+    end
+
+    -- True when the test failed or errored.
+    function NodeStatus:isNotPassed()
+        -- print('hasFailure: '..prettystr(self))
+        return self.status ~= NodeStatus.PASS
+    end
+
+    -- True when the test failed an assertion.
+    function NodeStatus:isFailure()
+        return self.status == NodeStatus.FAIL
+    end
+
+    -- True when the test raised a runtime error.
+    function NodeStatus:isError()
+        return self.status == NodeStatus.ERROR
+    end
+
+    -- Render this node's status as a JUnit XML fragment (<error>,
+    -- <failure>, or <passed/>). Relies on the xmlEscape/xmlCDataEscape
+    -- helpers defined earlier in this module.
+    function NodeStatus:statusXML()
+        if self:isError() then
+            return table.concat(
+                {'            <error type="', xmlEscape(self.msg), '">\n',
+                 '                <![CDATA[', xmlCDataEscape(self.stackTrace),
+                 ']]></error>\n'})
+        elseif self:isFailure() then
+            return table.concat(
+                {'            <failure type="', xmlEscape(self.msg), '">\n',
+                 '                <![CDATA[', xmlCDataEscape(self.stackTrace),
+                 ']]></failure>\n'})
+        end
+        return '            <passed/>\n' -- (not XSD-compliant! normally shouldn't get here)
+    end
+
+    --------------[[ Output methods ]]-------------------------
+
+    function M.LuaUnit.statusLine(result)
+        -- Build the one-line run summary from the result counters, e.g.
+        -- "Ran 10 tests in 0.010 seconds, 8 successes, 2 failures".
+        local parts = {
+            string.format('Ran %d tests in %0.3f seconds, %d successes',
+                result.runCount, result.duration, result.passedCount )
+        }
+        if result.notPassedCount > 0 then
+            if result.failureCount > 0 then
+                parts[#parts + 1] = string.format(', %d failures', result.failureCount )
+            end
+            if result.errorCount > 0 then
+                parts[#parts + 1] = string.format(', %d errors', result.errorCount )
+            end
+        else
+            parts[#parts + 1] = ', 0 failures'
+        end
+        if result.nonSelectedCount > 0 then
+            parts[#parts + 1] = string.format(", %d non-selected", result.nonSelectedCount )
+        end
+        return table.concat(parts)
+    end
+
+    -- Initialize self.result for a new run and create the outputter.
+    -- testCount / nonSelectedCount are the numbers of selected and
+    -- pattern-filtered-out tests respectively.
+    function M.LuaUnit:startSuite(testCount, nonSelectedCount)
+        self.result = {}
+        self.result.testCount = testCount
+        self.result.nonSelectedCount = nonSelectedCount
+        self.result.passedCount = 0
+        self.result.runCount = 0
+        self.result.currentTestNumber = 0
+        self.result.currentClassName = ""
+        self.result.currentNode = nil
+        self.result.suiteStarted = true
+        self.result.startTime = os.clock()
+        -- LUAUNIT_DATEFMT lets the user override the start-date format
+        self.result.startDate = os.date(os.getenv('LUAUNIT_DATEFMT'))
+        self.result.startIsodate = os.date('%Y-%m-%dT%H:%M:%S')
+        self.result.patternFilter = self.patternFilter
+        self.result.tests = {}
+        self.result.failures = {}
+        self.result.errors = {}
+        self.result.notPassed = {}
+
+        -- default to the plain-text outputter when none was selected
+        self.outputType = self.outputType or TextOutput
+        self.output = self.outputType:new()
+        self.output.runner = self
+        self.output.result = self.result
+        self.output.verbosity = self.verbosity
+        self.output.fname = self.fname
+        self.output:startSuite()
+    end
+
+    -- Record the class currently being run and notify the outputter.
+    function M.LuaUnit:startClass( className )
+        self.result.currentClassName = className
+        self.output:startClass( className )
+    end
+
+    -- Create the NodeStatus for the test about to run (initially PASS),
+    -- record its start time and notify the outputter.
+    function M.LuaUnit:startTest( testName  )
+        self.result.currentTestNumber = self.result.currentTestNumber + 1
+        self.result.runCount = self.result.runCount + 1
+        self.result.currentNode = NodeStatus:new(
+            self.result.currentTestNumber,
+            testName,
+            self.result.currentClassName
+        )
+        self.result.currentNode.startTime = os.clock()
+        table.insert( self.result.tests, self.result.currentNode )
+        self.output:startTest( testName )
+    end
+
+    -- Merge the outcome of one protected call into the current node:
+    -- flag it as failed or errored and append it to the suite-level
+    -- failures/errors/notPassed lists. PASS outcomes are ignored.
+    function M.LuaUnit:addStatus( err )
+        -- "err" is expected to be a table / result from protectedCall()
+        if err.status == NodeStatus.PASS then return end
+
+        local node = self.result.currentNode
+
+        --[[ As a first approach, we will report only one error or one failure for one test.
+
+        However, we can have the case where the test is in failure, and the teardown is in error.
+        In such case, it's a good idea to report both a failure and an error in the test suite. This is
+        what Python unittest does for example. However, it mixes up counts so need to be handled carefully: for
+        example, there could be more (failures + errors) count that tests. What happens to the current node ?
+
+        We will do this more intelligent version later.
+        ]]
+
+        -- if the node is already in failure/error, just don't report the new error (see above)
+        if node.status ~= NodeStatus.PASS then return end
+
+        table.insert( self.result.notPassed, node )
+
+        if err.status == NodeStatus.FAIL then
+            node:fail( err.msg, err.trace )
+            table.insert( self.result.failures, node )
+            self.output:addFailure( node )
+        elseif err.status == NodeStatus.ERROR then
+            node:error( err.msg, err.trace )
+            table.insert( self.result.errors, node )
+            self.output:addError( node )
+        end
+    end
+
+    -- Close the current test: compute its duration, notify the outputter,
+    -- update the pass counter and honour the --error/--failure abort
+    -- options by setting result.aborted.
+    function M.LuaUnit:endTest()
+        local node = self.result.currentNode
+        -- print( 'endTest() '..prettystr(node))
+        -- print( 'endTest() '..prettystr(node:isNotPassed()))
+        node.duration = os.clock() - node.startTime
+        node.startTime = nil
+        self.output:endTest( node )
+
+        if node:isPassed() then
+            self.result.passedCount = self.result.passedCount + 1
+        elseif node:isError() then
+            if self.quitOnError or self.quitOnFailure then
+                -- Runtime error - abort test execution as requested by
+                -- "--error" option. This is done by setting a special
+                -- flag that gets handled in runSuiteByInstances().
+                print("\nERROR during LuaUnit test execution:\n" .. node.msg)
+                self.result.aborted = true
+            end
+        elseif node:isFailure() then
+            if self.quitOnFailure then
+                -- Failure - abort test execution as requested by
+                -- "--failure" option. This is done by setting a special
+                -- flag that gets handled in runSuiteByInstances().
+                print("\nFailure during LuaUnit test execution:\n" .. node.msg)
+                self.result.aborted = true
+            end
+        end
+        self.result.currentNode = nil
+    end
+
+    -- Notify the outputter that the current class is finished.
+    function M.LuaUnit:endClass()
+        self.output:endClass()
+    end
+
+    -- Close the suite: compute total duration and the final counters,
+    -- then let the outputter emit its summary. Errors if called twice.
+    function M.LuaUnit:endSuite()
+        if self.result.suiteStarted == false then
+            error('LuaUnit:endSuite() -- suite was already ended' )
+        end
+        self.result.duration = os.clock()-self.result.startTime
+        self.result.suiteStarted = false
+
+        -- Expose test counts for outputter's endSuite(). This could be managed
+        -- internally instead, but unit tests (and existing use cases) might
+        -- rely on these fields being present.
+        self.result.notPassedCount = #self.result.notPassed
+        self.result.failureCount = #self.result.failures
+        self.result.errorCount = #self.result.errors
+
+        self.output:endSuite()
+    end
+
+    function M.LuaUnit:setOutputType(outputType)
+        -- Select the outputter class from its (case-insensitive) name:
+        -- 'text' (default format), 'tap' (TAP protocol), 'junit' (XML
+        -- report) or 'nil' (no output). Unknown names raise an error.
+        local fmt = outputType:upper()
+        if fmt == "NIL" then
+            self.outputType = NilOutput
+        elseif fmt == "TAP" then
+            self.outputType = TapOutput
+        elseif fmt == "JUNIT" then
+            self.outputType = JUnitOutput
+        elseif fmt == "TEXT" then
+            self.outputType = TextOutput
+        else
+            error( 'No such format: '..outputType,2)
+        end
+    end
+
+    --------------[[ Runner ]]-----------------
+
+    -- Run methodInstance under xpcall, optionally as a method of
+    -- classInstance. Returns {status=PASS} on success, or a table with
+    -- status (FAIL for assertion failures, ERROR otherwise), msg and a
+    -- cleaned-up stack trace.
+    function M.LuaUnit:protectedCall(classInstance, methodInstance, prettyFuncName)
+        -- if classInstance is nil, this is just a function call
+        -- else, it's method of a class being called.
+
+        local function err_handler(e)
+            -- transform error into a table, adding the traceback information
+            return {
+                status = NodeStatus.ERROR,
+                msg = e,
+                trace = string.sub(debug.traceback("", 3), 2)
+            }
+        end
+
+        local ok, err
+        if classInstance then
+            -- stupid Lua < 5.2 does not allow xpcall with arguments so let's use a workaround
+            ok, err = xpcall( function () methodInstance(classInstance) end, err_handler )
+        else
+            ok, err = xpcall( function () methodInstance() end, err_handler )
+        end
+        if ok then
+            return {status = NodeStatus.PASS}
+        end
+
+        -- determine if the error was a failed test:
+        -- We do this by stripping the failure prefix from the error message,
+        -- while keeping track of the gsub() count. A non-zero value -> failure
+        local failed
+        err.msg, failed = err.msg:gsub(M.FAILURE_PREFIX, "", 1)
+        if failed > 0 then
+            err.status = NodeStatus.FAIL
+        end
+
+        -- reformat / improve the stack trace
+        if prettyFuncName then -- we do have the real method name
+            err.trace = err.trace:gsub("in (%a+) 'methodInstance'", "in %1 '"..prettyFuncName.."'")
+        end
+        if STRIP_LUAUNIT_FROM_STACKTRACE then
+            err.trace = stripLuaunitTrace(err.trace)
+        end
+
+        return err -- return the error "object" (table)
+    end
+
+
+    -- Execute one test: setUp (if any), the test itself, then tearDown
+    -- (if any), recording each outcome on the current node.
+    function M.LuaUnit:execOneFunction(className, methodName, classInstance, methodInstance)
+        -- When executing a test function, className and classInstance must be nil
+        -- When executing a class method, all parameters must be set
+
+        if type(methodInstance) ~= 'function' then
+            error( tostring(methodName)..' must be a function, not '..type(methodInstance))
+        end
+
+        local prettyFuncName
+        if className == nil then
+            -- plain functions are grouped under a synthetic class name
+            className = '[TestFunctions]'
+            prettyFuncName = methodName
+        else
+            prettyFuncName = className..'.'..methodName
+        end
+
+        -- open/close class sections as the class changes between tests
+        if self.lastClassName ~= className then
+            if self.lastClassName ~= nil then
+                self:endClass()
+            end
+            self:startClass( className )
+            self.lastClassName = className
+        end
+
+        self:startTest(prettyFuncName)
+
+        -- run setUp first (if any); all common spellings are accepted
+        if classInstance then
+            local func = self.asFunction( classInstance.setUp )
+                         or self.asFunction( classInstance.Setup )
+                         or self.asFunction( classInstance.setup )
+                         or self.asFunction( classInstance.SetUp )
+            if func then
+                self:addStatus(self:protectedCall(classInstance, func, className..'.setUp'))
+            end
+        end
+
+        -- run testMethod() only if setUp succeeded
+        if self.result.currentNode:isPassed() then
+            self:addStatus(self:protectedCall(classInstance, methodInstance, prettyFuncName))
+        end
+
+        -- lastly, run tearDown (if any), even when the test failed
+        if classInstance then
+            local func = self.asFunction( classInstance.tearDown )
+                         or self.asFunction( classInstance.TearDown )
+                         or self.asFunction( classInstance.teardown )
+                         or self.asFunction( classInstance.Teardown )
+            if func then
+                self:addStatus(self:protectedCall(classInstance, func, className..'.tearDown'))
+            end
+        end
+
+        self:endTest()
+    end
+
+    function M.LuaUnit.expandOneClass( result, className, classInstance )
+        -- Append one { 'className.methodName', classInstance } entry to
+        -- result for every test method found on classInstance, in sorted
+        -- method-name order.
+        for methodName, methodInstance in sortedPairs(classInstance) do
+            local isTest = M.LuaUnit.asFunction(methodInstance)
+                and M.LuaUnit.isMethodTestName( methodName )
+            if isTest then
+                result[#result + 1] = { className..'.'..methodName, classInstance }
+            end
+        end
+    end
+
+    function M.LuaUnit.expandClasses( listOfNameAndInst )
+        -- expand all classes (provided as {className, classInstance}) to a list of {className.methodName, classInstance}
+        -- functions and methods remain untouched
+        local result = {}
+
+        for i,v in ipairs( listOfNameAndInst ) do
+            local name, instance = v[1], v[2]
+            if M.LuaUnit.asFunction(instance) then
+                -- plain function: kept as-is
+                table.insert( result, { name, instance } )
+            else
+                if type(instance) ~= 'table' then
+                    error( 'Instance must be a table or a function, not a '..type(instance)..', value '..prettystr(instance))
+                end
+                if M.LuaUnit.isClassMethod( name ) then
+                    -- 'Class.method': validate the method exists, keep the entry
+                    local className, methodName = M.LuaUnit.splitClassMethod( name )
+                    local methodInstance = instance[methodName]
+                    if methodInstance == nil then
+                        error( "Could not find method in class "..tostring(className).." for method "..tostring(methodName) )
+                    end
+                    table.insert( result, { name, instance } )
+                else
+                    -- bare class name: expand to all of its test methods
+                    M.LuaUnit.expandOneClass( result, name, instance )
+                end
+            end
+        end
+
+        return result
+    end
+
+    function M.LuaUnit.applyPatternFilter( patternFilter, listOfNameAndInst )
+        -- Split the { name, instance } list into the entries whose name
+        -- matches the pattern filter (included) and the rest (excluded).
+        local included, excluded = {}, {}
+        for _, entry in ipairs( listOfNameAndInst ) do
+            local bucket = M.LuaUnit.patternInclude( patternFilter, entry[1] )
+                and included or excluded
+            table.insert( bucket, entry )
+        end
+        return included, excluded
+    end
+
+    function M.LuaUnit:runSuiteByInstances( listOfNameAndInst )
+        -- Run an explicit list of tests. All test instances and names must be supplied.
+        -- each test must be one of:
+        --   * { function name, function instance }
+        --   * { class name, class instance }
+        --   * { class.method name, class instance }
+
+        local expandedList, filteredList, filteredOutList, className, methodName, methodInstance
+        -- classes become one entry per test method
+        expandedList = self.expandClasses( listOfNameAndInst )
+
+        -- apply --pattern filtering; the excluded count is reported too
+        filteredList, filteredOutList = self.applyPatternFilter( self.patternFilter, expandedList )
+
+        self:startSuite( #filteredList, #filteredOutList )
+
+        for i,v in ipairs( filteredList ) do
+            local name, instance = v[1], v[2]
+            if M.LuaUnit.asFunction(instance) then
+                self:execOneFunction( nil, name, nil, instance )
+            else
+                if type(instance) ~= 'table' then
+                    error( 'Instance must be a table or a function, not a '..type(instance)..', value '..prettystr(instance))
+                else
+                    -- after expandClasses, every table entry is 'Class.method'
+                    assert( M.LuaUnit.isClassMethod( name ) )
+                    className, methodName = M.LuaUnit.splitClassMethod( name )
+                    methodInstance = instance[methodName]
+                    if methodInstance == nil then
+                        error( "Could not find method in class "..tostring(className).." for method "..tostring(methodName) )
+                    end
+                    self:execOneFunction( className, methodName, instance, methodInstance )
+                end
+            end
+            if self.result.aborted then break end -- "--error" or "--failure" option triggered
+        end
+
+        if self.lastClassName ~= nil then
+            self:endClass()
+        end
+
+        self:endSuite()
+
+        if self.result.aborted then
+            print("LuaUnit ABORTED (as requested by --error or --failure option)")
+            os.exit(-2)
+        end
+    end
+
+    function M.LuaUnit:runSuiteByNames( listOfName )
+        -- Run an explicit list of test names: each name is resolved in the
+        -- global namespace to a function, a class, or a 'Class.method'
+        -- pair, then delegated to runSuiteByInstances().
+
+        local  className, methodName, instanceName, instance, methodInstance
+        local listOfNameAndInst = {}
+
+        for i,name in ipairs( listOfName ) do
+            if M.LuaUnit.isClassMethod( name ) then
+                className, methodName = M.LuaUnit.splitClassMethod( name )
+                instanceName = className
+                instance = _G[instanceName]
+
+                if instance == nil then
+                    error( "No such name in global space: "..instanceName )
+                end
+
+                if type(instance) ~= 'table' then
+                    error( 'Instance of '..instanceName..' must be a table, not '..type(instance))
+                end
+
+                methodInstance = instance[methodName]
+                if methodInstance == nil then
+                    error( "Could not find method in class "..tostring(className).." for method "..tostring(methodName) )
+                end
+
+            else
+                -- for functions and classes
+                instanceName = name
+                instance = _G[instanceName]
+            end
+
+            -- NOTE(review): for the 'Class.method' branch these two checks
+            -- repeat validation already done above; they only add coverage
+            -- for the function/class branch.
+            if instance == nil then
+                error( "No such name in global space: "..instanceName )
+            end
+
+            if (type(instance) ~= 'table' and type(instance) ~= 'function') then
+                error( 'Name must match a function or a table: '..instanceName )
+            end
+
+            table.insert( listOfNameAndInst, { name, instance } )
+        end
+
+        self:runSuiteByInstances( listOfNameAndInst )
+    end
+
+    function M.LuaUnit.run(...)
+        -- Run some specific test classes.
+        -- If no arguments are passed, run the class names specified on the
+        -- command line. If no class name is specified on the command line
+        -- run all classes whose name starts with 'Test'
+        --
+        -- If arguments are passed, they must be strings of the class names
+        -- that you want to run or generic command line arguments (-o, -p, -v, ...)
+
+        -- convenience wrapper: create a fresh runner and delegate
+        local runner = M.LuaUnit.new()
+        return runner:runSuite(...)
+    end
+
+    -- Parse command-line style options (from the arguments or from the
+    -- captured process command line), configure this runner and execute
+    -- the selected tests. Returns the number of non-passing tests.
+    function M.LuaUnit:runSuite( ... )
+
+        local args = {...}
+        if type(args[1]) == 'table' and args[1].__class__ == 'LuaUnit' then
+            -- run was called with the syntax M.LuaUnit:runSuite()
+            -- we support both M.LuaUnit.run() and M.LuaUnit:run()
+            -- strip out the first argument
+            table.remove(args,1)
+        end
+
+        if #args == 0 then
+            args = cmdline_argv
+        end
+
+        -- bad options print the usage string and exit instead of raising
+        local no_error, val = pcall( M.LuaUnit.parseCmdLine, args )
+        if not no_error then
+            print(val) -- error message
+            print()
+            print(M.USAGE)
+            os.exit(-1)
+        end
+
+        local options = val
+
+        -- We expect these option fields to be either `nil` or contain
+        -- valid values, so it's safe to always copy them directly.
+        self.verbosity     = options.verbosity
+        self.quitOnError   = options.quitOnError
+        self.quitOnFailure = options.quitOnFailure
+        self.fname         = options.fname
+        self.patternFilter = options.pattern
+
+        if options.output and options.output:lower() == 'junit' and options.fname == nil then
+            print('With junit output, a filename must be supplied with -n or --name')
+            os.exit(-1)
+        end
+
+        if options.output then
+            no_error, val = pcall(self.setOutputType, self, options.output)
+            if not no_error then
+                print(val) -- error message
+                print()
+                print(M.USAGE)
+                os.exit(-1)
+            end
+        end
+
+        -- no explicit names: collect every global matching isTestName
+        self:runSuiteByNames( options.testNames or M.LuaUnit.collectTests() )
+
+        return self.result.notPassedCount
+    end
+-- class LuaUnit
+
+-- For compatibility with LuaUnit v2
+M.run = M.LuaUnit.run
+M.Run = M.LuaUnit.run
+
+-- Set the default verbosity used by subsequent runs (LuaUnit v2 API).
+function M:setVerbosity( verbosity )
+    M.LuaUnit.verbosity = verbosity
+end
+M.set_verbosity = M.setVerbosity
+M.SetVerbosity = M.setVerbosity
+
+
+return M
diff --git a/tests/lua/spec b/tests/lua/spec
new file mode 120000
index 0000000..e518184
--- /dev/null
+++ b/tests/lua/spec
@@ -0,0 +1 @@
+src/lua/bpf/spec
\ No newline at end of file
diff --git a/tests/lua/test_clang.lua b/tests/lua/test_clang.lua
new file mode 100644
index 0000000..f3d395e
--- /dev/null
+++ b/tests/lua/test_clang.lua
@@ -0,0 +1,331 @@
+-- Tests for the BCC clang front-end: each test compiles a small BPF C
+-- program via BPF:new and (usually) loads one of its functions.
+local suite = require("test_helper")
+local TestClang = {}
+
+-- Read a struct member through a probed pointer argument.
+function TestClang:test_probe_read1()
+  local text = [[
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+int count_sched(struct pt_regs *ctx, struct task_struct *prev) {
+    pid_t p = prev->pid;
+    return (p != -1);
+}
+]]
+  local b = BPF:new{text=text, debug=0}
+  local fn = b:load_func("count_sched", 'BPF_PROG_TYPE_KPROBE')
+end
+
+-- Scalar (non-pointer) probe arguments must compile and load too.
+function TestClang:test_probe_read2()
+  local text = [[
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+int count_foo(struct pt_regs *ctx, unsigned long a, unsigned long b) {
+    return (a != b);
+}
+]]
+  local b = BPF:new{text=text, debug=0}
+  local fn = b:load_func("count_foo", 'BPF_PROG_TYPE_KPROBE')
+end
+
+-- Use a probed pointer as a hash key across two programs; loads all
+-- functions at once with load_funcs.
+function TestClang:test_probe_read_keys()
+  local text = [[
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+BPF_HASH(start, struct request *);
+int do_request(struct pt_regs *ctx, struct request *req) {
+  u64 ts = bpf_ktime_get_ns();
+  start.update(&req, &ts);
+  return 0;
+}
+
+int do_completion(struct pt_regs *ctx, struct request *req) {
+  u64 *tsp = start.lookup(&req);
+  if (tsp != 0) {
+    start.delete(&req);
+  }
+  return 0;
+}
+  ]]
+  local b = BPF:new{text=text, debug=0}
+  local fns = b:load_funcs('BPF_PROG_TYPE_KPROBE')
+end
+
+-- Round-trip a nested struct leaf (with bitfields) through the table's
+-- sprintf/scanf helpers.
+function TestClang:test_sscanf()
+  local text = [[
+BPF_HASH(stats, int, struct { u64 a; u64 b; u32 c:18; u32 d:14; struct { u32 a; u32 b; } s; }, 10);
+
+int foo(void *ctx) {
+    return 0;
+}
+]]
+  local b = BPF:new{text=text, debug=0}
+  local fn = b:load_func("foo", 'BPF_PROG_TYPE_KPROBE')
+  local t = b:get_table("stats")
+  local s1 = t:key_sprintf(2)
+
+  assert_equals(s1, "0x2")
+
+  local s2 = t:leaf_sprintf({{2, 3, 4, 1, {5, 6}}})
+  local l = t:leaf_scanf(s2)
+
+  assert_equals(tonumber(l.a), 2)
+  assert_equals(tonumber(l.b), 3)
+  assert_equals(tonumber(l.c), 4)
+  assert_equals(tonumber(l.d), 1)
+  assert_equals(tonumber(l.s.a), 5)
+  assert_equals(tonumber(l.s.b), 6)
+end
+
+-- Same round-trip, but with an array member; also checks the exact
+-- printed representation.
+function TestClang:test_sscanf_array()
+  local text = [[ BPF_HASH(stats, int, struct { u32 a[3]; u32 b; }, 10); ]]
+
+  local b = BPF:new{text=text, debug=0}
+  local t = b:get_table("stats")
+
+  local s1 = t:key_sprintf(2)
+  assert_equals(s1, "0x2")
+
+  local s2 = t:leaf_sprintf({{{1, 2, 3}, 4}})
+  assert_equals(s2, "{ [ 0x1 0x2 0x3 ] 0x4 }")
+
+  local l = t:leaf_scanf(s2)
+  assert_equals(l.a[0], 1)
+  assert_equals(l.a[1], 2)
+  assert_equals(l.a[2], 3)
+  assert_equals(l.b, 4)
+end
+
+-- bpf_trace_printk with a struct-member argument must compile and load.
+function TestClang:test_iosnoop()
+  local text = [[
+#include <linux/blkdev.h>
+#include <uapi/linux/ptrace.h>
+
+struct key_t {
+    struct request *req;
+};
+
+BPF_HASH(start, struct key_t, u64, 1024);
+int do_request(struct pt_regs *ctx, struct request *req) {
+    struct key_t key = {};
+
+    bpf_trace_printk("traced start %d\\n", req->__data_len);
+
+    return 0;
+}
+]]
+
+  local b = BPF:new{text=text, debug=0}
+  local fn = b:load_func("do_request", 'BPF_PROG_TYPE_KPROBE')
+end
+
+-- Printing a plain int probe argument.
+function TestClang:test_blk_start_request()
+  local text = [[
+#include <linux/blkdev.h>
+#include <uapi/linux/ptrace.h>
+int do_request(struct pt_regs *ctx, int req) {
+    bpf_trace_printk("req ptr: 0x%x\n", req);
+    return 0;
+}
+]]
+  local b = BPF:new{text=text, debug=0}
+  local fn = b:load_func("do_request", 'BPF_PROG_TYPE_KPROBE')
+end
+
+-- BPF_HASH accepts 1, 2 or 3 arguments (defaulted key/leaf types).
+function TestClang:test_bpf_hash()
+  local text = [[
+BPF_HASH(table1);
+BPF_HASH(table2, u32);
+BPF_HASH(table3, u32, int);
+]]
+  local b = BPF:new{text=text, debug=0}
+end
+
+-- Two consecutive dereferences of the same probed pointer must both be
+-- rewritten correctly.
+function TestClang:test_consecutive_probe_read()
+  local text = [[
+#include <linux/fs.h>
+#include <linux/mount.h>
+BPF_HASH(table1, struct super_block *);
+int trace_entry(struct pt_regs *ctx, struct file *file) {
+    if (!file) return 0;
+    struct vfsmount *mnt = file->f_path.mnt;
+    if (mnt) {
+        struct super_block *k = mnt->mnt_sb;
+        u64 zero = 0;
+        table1.update(&k, &zero);
+        k = mnt->mnt_sb;
+        table1.update(&k, &zero);
+    }
+
+    return 0;
+}
+]]
+  local b = BPF:new{text=text, debug=0}
+  local fn = b:load_func("trace_entry", 'BPF_PROG_TYPE_KPROBE')
+end
+
+-- A chained dereference (a->b->c.d) through probed memory.
+function TestClang:test_nested_probe_read()
+  local text = [[
+#include <linux/fs.h>
+int trace_entry(struct pt_regs *ctx, struct file *file) {
+    if (!file) return 0;
+    const char *name = file->f_path.dentry->d_name.name;
+    bpf_trace_printk("%s\\n", name);
+    return 0;
+}
+]]
+  local b = BPF:new{text=text, debug=0}
+  local fn = b:load_func("trace_entry", 'BPF_PROG_TYPE_KPROBE')
+end
+
+-- Char-array member access inside an auto-attached kprobe__ function.
+function TestClang:test_char_array_probe()
+  local b = BPF:new{text=[[#include <linux/blkdev.h>
+int kprobe__blk_update_request(struct pt_regs *ctx, struct request *req) {
+    bpf_trace_printk("%s\\n", req->rq_disk->disk_name);
+    return 0;
+}]]}
+end
+
+-- Probe-read rewriting must follow pointers passed into static helpers.
+function TestClang:test_probe_read_helper()
+  local b = BPF:new{text=[[
+#include <linux/fs.h>
+static void print_file_name(struct file *file) {
+    if (!file) return;
+    const char *name = file->f_path.dentry->d_name.name;
+    bpf_trace_printk("%s\\n", name);
+}
+static void print_file_name2(int unused, struct file *file) {
+    print_file_name(file);
+}
+int trace_entry1(struct pt_regs *ctx, struct file *file) {
+    print_file_name(file);
+    return 0;
+}
+int trace_entry2(struct pt_regs *ctx, int unused, struct file *file) {
+    print_file_name2(unused, file);
+    return 0;
+}
+]]}
+  local fn1 = b:load_func("trace_entry1", 'BPF_PROG_TYPE_KPROBE')
+  local fn2 = b:load_func("trace_entry2", 'BPF_PROG_TYPE_KPROBE')
+end
+
+-- Assigning probe arguments into a local struct must compile.
+function TestClang:test_probe_struct_assign()
+  local b = BPF:new{text = [[
+#include <uapi/linux/ptrace.h>
+struct args_t {
+    const char *filename;
+    int flags;
+    int mode;
+};
+int kprobe__sys_open(struct pt_regs *ctx, const char *filename,
+        int flags, int mode) {
+    struct args_t args = {};
+    args.filename = filename;
+    args.flags = flags;
+    args.mode = mode;
+    bpf_trace_printk("%s\\n", args.filename);
+    return 0;
+};
+]]}
+end
+
+-- Struct key + lookup_or_init pattern, as used by the task_switch example.
+function TestClang:test_task_switch()
+  local b = BPF:new{text=[[
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+struct key_t {
+  u32 prev_pid;
+  u32 curr_pid;
+};
+BPF_HASH(stats, struct key_t, u64, 1024);
+int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev) {
+  struct key_t key = {};
+  u64 zero = 0, *val;
+  key.curr_pid = bpf_get_current_pid_tgid();
+  key.prev_pid = prev->pid;
+
+  val = stats.lookup_or_init(&key, &zero);
+  (*val)++;
+  return 0;
+}
+]]}
+end
+
+-- Writing through a looked-up leaf pointer after a null check.
+function TestClang:test_probe_simple_assign()
+  local b = BPF:new{text=[[
+#include <uapi/linux/ptrace.h>
+#include <linux/gfp.h>
+struct leaf { size_t size; };
+BPF_HASH(simple_map, u32, struct leaf);
+int kprobe____kmalloc(struct pt_regs *ctx, size_t size) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct leaf* leaf = simple_map.lookup(&pid);
+    if (leaf)
+        leaf->size += size;
+    return 0;
+}]]}
+end
+
+-- Probed reads under unary operators and redundant parentheses.
+function TestClang:test_unop_probe_read()
+  local text = [[
+#include <linux/blkdev.h>
+int trace_entry(struct pt_regs *ctx, struct request *req) {
+    if (!(req->bio->bi_flags & 1))
+        return 1;
+    if (((req->bio->bi_flags)))
+        return 1;
+    return 0;
+}
+]]
+  local b = BPF:new{text=text}
+  local fn = b:load_func("trace_entry", 'BPF_PROG_TYPE_KPROBE')
+end
+
+-- Self-referential structs, empty structs and unions as table leaves.
+function TestClang:test_complex_leaf_types()
+  local text = [[
+struct list;
+struct list {
+  struct list *selfp;
+  struct list *another_selfp;
+  struct list *selfp_array[2];
+};
+struct empty {
+};
+union emptyu {
+  struct empty *em1;
+  struct empty em2;
+  struct empty em3;
+  struct empty em4;
+};
+BPF_ARRAY(t1, struct list, 1);
+BPF_ARRAY(t2, struct list *, 1);
+BPF_ARRAY(t3, union emptyu, 1);
+]]
+  local b = BPF:new{text=text}
+  local ffi = require("ffi")
+
+  -- TODO: ptrs?
+  assert_equals(ffi.sizeof(b:get_table("t3").c_leaf), 8)
+end
+
+-- Extra cflags passed to BPF:new must reach the clang invocation.
+function TestClang:test_cflags()
+  local text = [[
+#ifndef MYFLAG
+#error "MYFLAG not set as expected"
+#endif
+]]
+  local b = BPF:new{text=text, cflags={"-DMYFLAG"}}
+end
+
+-- A table published by one module can be referenced as extern by another.
+function TestClang:test_exported_maps()
+  local b1 = BPF{text=[[BPF_TABLE_PUBLIC("hash", int, int, table1, 10);]]}
+  local b2 = BPF{text=[[BPF_TABLE("extern", int, int, table1, 10);]]}
+end
+
+-- Invalid C must surface as a compile failure with a recognizable message.
+function TestClang:test_syntax_error()
+  assert_error_msg_contains(
+    "failed to compile BPF module",
+    BPF.new,
+    BPF, {text=[[int failure(void *ctx) { if (); return 0; }]]})
+end
+
+suite("TestClang", TestClang)
diff --git a/tests/lua/test_dump.lua b/tests/lua/test_dump.lua
new file mode 100644
index 0000000..bd6a47b
--- /dev/null
+++ b/tests/lua/test_dump.lua
@@ -0,0 +1,10 @@
+local suite = require("test_helper")
+local TestDump = {}
+
+function TestDump:test_dump_func()
+  local raw = "\xb7\x00\x00\x00\x01\x00\x00\x00\x95\x00\x00\x00\x00\x00\x00\x00"
+  local b = BPF:new{text=[[int entry(void) { return 1; }]]}
+  assert_equals(b:dump_func("entry"), raw)
+end
+
+suite("TestDump", TestDump)
diff --git a/tests/lua/test_helper.lua b/tests/lua/test_helper.lua
new file mode 100644
index 0000000..f5ee529
--- /dev/null
+++ b/tests/lua/test_helper.lua
@@ -0,0 +1,23 @@
+function setup_path()
+  local str = require("debug").getinfo(2, "S").source:sub(2)
+  local cwd = str:match("(.*/)")
+  local bpf_path = cwd.."/../../src/lua/?.lua;"
+  local test_path = cwd.."/?.lua;"
+  package.path = bpf_path..test_path..package.path
+end
+
+setup_path()
+
+USE_EXPECTED_ACTUAL_IN_ASSERT_EQUALS = false
+EXPORT_ASSERT_TO_GLOBALS = true
+require("luaunit")
+
+rawset(_G, "BCC", require("bcc.init"))
+rawset(_G, "BPF", BCC.BPF)
+
+log.enabled = false
+
+return function (name, f)
+  rawset(_G, name, f)
+  os.exit(LuaUnit.run())
+end
diff --git a/tests/lua/test_standalone.sh b/tests/lua/test_standalone.sh
new file mode 100755
index 0000000..7786ac9
--- /dev/null
+++ b/tests/lua/test_standalone.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright (c) GitHub, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+set -xe
+cd "src/lua"
+
+function fail {
+    echo "test failed: $1" >&2
+    exit 1
+}
+
+if [[ ! -x bcc-lua ]]; then
+    echo "bcc-lua not built --- skipping"
+    exit 0
+fi
+
+if ldd bcc-lua | grep -q luajit; then
+    fail "bcc-lua depends on libluajit"
+fi
+
+rm -f probe.lua
+echo "return function(BPF) print(\"Hello world\") end" > probe.lua
+
+PROBE="../../../examples/lua/offcputime.lua"
+
+if ! sudo ./bcc-lua "$PROBE" -d 1 >/dev/null 2>/dev/null; then
+    fail "bcc-lua cannot run complex probes"
+fi
+
+rm -f libbcc.so probe.lua
diff --git a/tests/lua/test_uprobes.lua b/tests/lua/test_uprobes.lua
new file mode 100644
index 0000000..059486e
--- /dev/null
+++ b/tests/lua/test_uprobes.lua
@@ -0,0 +1,70 @@
+local suite = require("test_helper")
+local ffi = require("ffi")
+local TestUprobes = {}
+
+ffi.cdef[[
+  int getpid(void);
+  void malloc_stats(void);
+]]
+
+function TestUprobes:test_simple_library()
+  local text = [[
+#include <uapi/linux/ptrace.h>
+BPF_ARRAY(stats, u64, 1);
+static void incr(int idx) {
+    u64 *ptr = stats.lookup(&idx);
+    if (ptr)
+        ++(*ptr);
+}
+int count(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    if (pid == PID)
+        incr(0);
+    return 0;
+}]]
+
+  local pid = tonumber(ffi.C.getpid())
+  local text = text:gsub("PID", tostring(pid))
+
+  local b = BPF:new{text=text}
+  b:attach_uprobe{name="c", sym="malloc_stats", fn_name="count", pid=pid}
+  b:attach_uprobe{name="c", sym="malloc_stats", fn_name="count", pid=pid, retprobe=true}
+
+  assert_equals(BPF.num_open_uprobes(), 2)
+
+  ffi.C.malloc_stats()
+
+  local stats = b:get_table("stats")
+  assert_equals(tonumber(stats:get(0)), 2)
+end
+
+function TestUprobes:test_simple_binary()
+  local text = [[
+#include <uapi/linux/ptrace.h>
+BPF_ARRAY(stats, u64, 1);
+static void incr(int idx) {
+    u64 *ptr = stats.lookup(&idx);
+    if (ptr)
+        ++(*ptr);
+}
+int count(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    incr(0);
+    return 0;
+}]]
+
+  local b = BPF:new{text=text}
+  b:attach_uprobe{name="/usr/bin/python", sym="main", fn_name="count"}
+  b:attach_uprobe{name="/usr/bin/python", sym="main", fn_name="count", retprobe=true}
+
+  os.spawn("/usr/bin/python -V")
+
+  local stats = b:get_table("stats")
+  assert_true(tonumber(stats:get(0)) >= 2)
+end
+
+function TestUprobes:teardown()
+  BPF.cleanup()
+end
+
+suite("TestUprobes", TestUprobes)
diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt
new file mode 100644
index 0000000..468c700
--- /dev/null
+++ b/tests/python/CMakeLists.txt
@@ -0,0 +1,79 @@
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+find_program(ARPING arping)
+if(ARPING STREQUAL "ARPING-NOTFOUND")
+  message(WARNING "Recommended test program 'arping' not found")
+endif()
+find_program(NETPERF netperf)
+if(NETPERF STREQUAL "NETPERF-NOTFOUND")
+  message(WARNING "Recommended test program 'netperf' not found")
+endif()
+find_program(IPERF iperf)
+if(IPERF STREQUAL "IPERF-NOTFOUND")
+  find_program(IPERF3 iperf3)
+  if(IPERF3 STREQUAL "IPERF3-NOTFOUND")
+    message(WARNING "Recommended test program 'iperf' or 'iperf3' not found")
+  endif()
+endif()
+
+add_test(NAME py_test_stat1_b WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_stat1_b namespace ${CMAKE_CURRENT_SOURCE_DIR}/test_stat1.py test_stat1.b proto.b)
+add_test(NAME py_test_bpf_log WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_bpf_prog sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_bpf_log.py)
+add_test(NAME py_test_stat1_c WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_stat1_c namespace ${CMAKE_CURRENT_SOURCE_DIR}/test_stat1.py test_stat1.c)
+#add_test(NAME py_test_xlate1_b WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+#  COMMAND ${TEST_WRAPPER} py_xlate1_b namespace ${CMAKE_CURRENT_SOURCE_DIR}/test_xlate1.py test_xlate1.b proto.b)
+add_test(NAME py_test_xlate1_c WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_xlate1_c namespace ${CMAKE_CURRENT_SOURCE_DIR}/test_xlate1.py test_xlate1.c)
+add_test(NAME py_test_call1 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_call1_c namespace ${CMAKE_CURRENT_SOURCE_DIR}/test_call1.py test_call1.c)
+add_test(NAME py_test_trace1 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_trace1 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace1.py test_trace1.b kprobe.b)
+add_test(NAME py_test_trace2 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_trace2 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace2.py)
+add_test(NAME py_test_trace3_c WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_trace3_c sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace3.py test_trace3.c)
+add_test(NAME py_test_trace4 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_trace4 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace4.py)
+add_test(NAME py_test_probe_count WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_probe_count sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_probe_count.py)
+add_test(NAME py_test_debuginfo WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_debuginfo sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_debuginfo.py)
+add_test(NAME py_test_brb WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_brb_c sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_brb.py test_brb.c)
+add_test(NAME py_test_brb2 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_brb2_c sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_brb2.py test_brb2.c)
+add_test(NAME py_test_clang WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_clang sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_clang.py)
+add_test(NAME py_test_histogram WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_histogram sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_histogram.py)
+add_test(NAME py_array WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_array sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_array.py)
+add_test(NAME py_uprobes WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_uprobes sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_uprobes.py)
+add_test(NAME py_test_stackid WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_stackid sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_stackid.py)
+add_test(NAME py_test_tracepoint WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_tracepoint sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_tracepoint.py)
+add_test(NAME py_test_perf_event WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_perf_event sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_perf_event.py)
+add_test(NAME py_test_utils WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_utils sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.py)
+add_test(NAME py_test_percpu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_percpu sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_percpu.py)
+add_test(NAME py_test_dump_func WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_dump_func simple ${CMAKE_CURRENT_SOURCE_DIR}/test_dump_func.py)
+add_test(NAME py_test_tools_smoke WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_tools_smoke sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_tools_smoke.py)
+add_test(NAME py_test_tools_memleak WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_tools_memleak sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_tools_memleak.py)
+add_test(NAME py_test_usdt WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_usdt sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt.py)
+add_test(NAME py_test_usdt2 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_usdt2 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt2.py)
+add_test(NAME py_test_usdt3 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_usdt3 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt3.py)
+add_test(NAME py_test_license WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_license sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_license.py)
diff --git a/tests/python/dummy.cc b/tests/python/dummy.cc
new file mode 100644
index 0000000..bf39faa
--- /dev/null
+++ b/tests/python/dummy.cc
@@ -0,0 +1,17 @@
+#include <unistd.h>
+#include <cstdio>
+
+namespace some_namespace {
+  static __attribute__((noinline)) int some_function(int x, int y) {
+	  volatile int z = x + y;
+	  return z;
+  }
+}
+
+int main() {
+	printf("%p\n", &some_namespace::some_function);
+	fflush(stdout);
+	printf("result = %d\n", some_namespace::some_function(42, 11));
+	sleep(1000);
+	return 0;
+}
diff --git a/tests/python/include/folly/tracing/StaticTracepoint-ELF.h b/tests/python/include/folly/tracing/StaticTracepoint-ELF.h
new file mode 100644
index 0000000..a8a74c3
--- /dev/null
+++ b/tests/python/include/folly/tracing/StaticTracepoint-ELF.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// Default constraint for the probe arguments as operands.
+#ifndef FOLLY_SDT_ARG_CONSTRAINT
+#if defined(__powerpc64__) || defined(__powerpc__)
+#define FOLLY_SDT_ARG_CONSTRAINT      "nZr"
+#else
+#define FOLLY_SDT_ARG_CONSTRAINT      "nor"
+#endif
+#endif
+
+// Instruction to emit for the probe.
+#define FOLLY_SDT_NOP                 nop
+
+// Note section properties.
+#define FOLLY_SDT_NOTE_NAME           "stapsdt"
+#define FOLLY_SDT_NOTE_TYPE           3
+
+// Size of address depending on platform.
+#ifdef __LP64__
+#define FOLLY_SDT_ASM_ADDR            .8byte
+#else
+#define FOLLY_SDT_ASM_ADDR            .4byte
+#endif
+
+// Assembler helper Macros.
+#define FOLLY_SDT_S(x)                #x
+#define FOLLY_SDT_ASM_1(x)            FOLLY_SDT_S(x) "\n"
+#define FOLLY_SDT_ASM_2(a, b)         FOLLY_SDT_S(a) "," FOLLY_SDT_S(b) "\n"
+#define FOLLY_SDT_ASM_3(a, b, c)      FOLLY_SDT_S(a) "," FOLLY_SDT_S(b) ","    \
+                                      FOLLY_SDT_S(c) "\n"
+#define FOLLY_SDT_ASM_STRING(x)       FOLLY_SDT_ASM_1(.asciz FOLLY_SDT_S(x))
+
+// Helper to determine the size of an argument.
+#define FOLLY_SDT_ISARRAY(x)  (__builtin_classify_type(x) == 14)
+#define FOLLY_SDT_ARGSIZE(x)  (FOLLY_SDT_ISARRAY(x) ? sizeof(void*) : sizeof(x))
+
+// Format of each probe arguments as operand.
+// Size of the argument tagged with FOLLY_SDT_Sn, with "n" constraint.
+// Value of the argument tagged with FOLLY_SDT_An, with configured constraint.
+#define FOLLY_SDT_ARG(n, x)                                                    \
+  [FOLLY_SDT_S##n] "n"                ((size_t)FOLLY_SDT_ARGSIZE(x)),          \
+  [FOLLY_SDT_A##n] FOLLY_SDT_ARG_CONSTRAINT (x)
+
+// Templates to append arguments as operands.
+#define FOLLY_SDT_OPERANDS_0()        [__sdt_dummy] "g" (0)
+#define FOLLY_SDT_OPERANDS_1(_1)      FOLLY_SDT_ARG(1, _1)
+#define FOLLY_SDT_OPERANDS_2(_1, _2)                                           \
+  FOLLY_SDT_OPERANDS_1(_1), FOLLY_SDT_ARG(2, _2)
+#define FOLLY_SDT_OPERANDS_3(_1, _2, _3)                                       \
+  FOLLY_SDT_OPERANDS_2(_1, _2), FOLLY_SDT_ARG(3, _3)
+#define FOLLY_SDT_OPERANDS_4(_1, _2, _3, _4)                                   \
+  FOLLY_SDT_OPERANDS_3(_1, _2, _3), FOLLY_SDT_ARG(4, _4)
+#define FOLLY_SDT_OPERANDS_5(_1, _2, _3, _4, _5)                               \
+  FOLLY_SDT_OPERANDS_4(_1, _2, _3, _4), FOLLY_SDT_ARG(5, _5)
+#define FOLLY_SDT_OPERANDS_6(_1, _2, _3, _4, _5, _6)                           \
+  FOLLY_SDT_OPERANDS_5(_1, _2, _3, _4, _5), FOLLY_SDT_ARG(6, _6)
+#define FOLLY_SDT_OPERANDS_7(_1, _2, _3, _4, _5, _6, _7)                       \
+  FOLLY_SDT_OPERANDS_6(_1, _2, _3, _4, _5, _6), FOLLY_SDT_ARG(7, _7)
+#define FOLLY_SDT_OPERANDS_8(_1, _2, _3, _4, _5, _6, _7, _8)                   \
+  FOLLY_SDT_OPERANDS_7(_1, _2, _3, _4, _5, _6, _7), FOLLY_SDT_ARG(8, _8)
+
+// Templates to reference the arguments from operands in note section.
+#if defined(__powerpc64__ ) || defined(__powerpc__)
+#define FOLLY_SDT_ARGTMPL(id)       %I[id]%[id]
+#elif defined(__i386__)
+#define FOLLY_SDT_ARGTMPL(id)       %w[id]
+#else
+#define FOLLY_SDT_ARGTMPL(id)       %[id]
+#endif
+#define FOLLY_SDT_ARGFMT(no)        %n[FOLLY_SDT_S##no]@FOLLY_SDT_ARGTMPL(FOLLY_SDT_A##no)
+#define FOLLY_SDT_ARG_TEMPLATE_0    /*No arguments*/
+#define FOLLY_SDT_ARG_TEMPLATE_1    FOLLY_SDT_ARGFMT(1)
+#define FOLLY_SDT_ARG_TEMPLATE_2    FOLLY_SDT_ARG_TEMPLATE_1 FOLLY_SDT_ARGFMT(2)
+#define FOLLY_SDT_ARG_TEMPLATE_3    FOLLY_SDT_ARG_TEMPLATE_2 FOLLY_SDT_ARGFMT(3)
+#define FOLLY_SDT_ARG_TEMPLATE_4    FOLLY_SDT_ARG_TEMPLATE_3 FOLLY_SDT_ARGFMT(4)
+#define FOLLY_SDT_ARG_TEMPLATE_5    FOLLY_SDT_ARG_TEMPLATE_4 FOLLY_SDT_ARGFMT(5)
+#define FOLLY_SDT_ARG_TEMPLATE_6    FOLLY_SDT_ARG_TEMPLATE_5 FOLLY_SDT_ARGFMT(6)
+#define FOLLY_SDT_ARG_TEMPLATE_7    FOLLY_SDT_ARG_TEMPLATE_6 FOLLY_SDT_ARGFMT(7)
+#define FOLLY_SDT_ARG_TEMPLATE_8    FOLLY_SDT_ARG_TEMPLATE_7 FOLLY_SDT_ARGFMT(8)
+
+// Structure of note section for the probe.
+#define FOLLY_SDT_NOTE_CONTENT(provider, name, arg_template)                   \
+  FOLLY_SDT_ASM_1(990: FOLLY_SDT_NOP)                                          \
+  FOLLY_SDT_ASM_3(     .pushsection .note.stapsdt,"","note")                   \
+  FOLLY_SDT_ASM_1(     .balign 4)                                              \
+  FOLLY_SDT_ASM_3(     .4byte 992f-991f, 994f-993f, FOLLY_SDT_NOTE_TYPE)       \
+  FOLLY_SDT_ASM_1(991: .asciz FOLLY_SDT_NOTE_NAME)                             \
+  FOLLY_SDT_ASM_1(992: .balign 4)                                              \
+  FOLLY_SDT_ASM_1(993: FOLLY_SDT_ASM_ADDR 990b)                                \
+  FOLLY_SDT_ASM_1(     FOLLY_SDT_ASM_ADDR 0) /*Reserved for Semaphore address*/\
+  FOLLY_SDT_ASM_1(     FOLLY_SDT_ASM_ADDR 0) /*Reserved for Semaphore name*/   \
+  FOLLY_SDT_ASM_STRING(provider)                                               \
+  FOLLY_SDT_ASM_STRING(name)                                                   \
+  FOLLY_SDT_ASM_STRING(arg_template)                                           \
+  FOLLY_SDT_ASM_1(994: .balign 4)                                              \
+  FOLLY_SDT_ASM_1(     .popsection)
+
+// Main probe Macro.
+#define FOLLY_SDT_PROBE(provider, name, n, arglist)                            \
+    __asm__ __volatile__ (                                                     \
+      FOLLY_SDT_NOTE_CONTENT(provider, name, FOLLY_SDT_ARG_TEMPLATE_##n)       \
+      :: FOLLY_SDT_OPERANDS_##n arglist                                        \
+    )                                                                          \
+
+// Helper Macros to handle variadic arguments.
+#define FOLLY_SDT_NARG_(_0, _1, _2, _3, _4, _5, _6, _7, _8, N, ...) N
+#define FOLLY_SDT_NARG(...)                                                    \
+  FOLLY_SDT_NARG_(__VA_ARGS__, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define FOLLY_SDT_PROBE_N(provider, name, N, ...)                              \
+  FOLLY_SDT_PROBE(provider, name, N, (__VA_ARGS__))
diff --git a/tests/python/include/folly/tracing/StaticTracepoint.h b/tests/python/include/folly/tracing/StaticTracepoint.h
new file mode 100644
index 0000000..37e271c
--- /dev/null
+++ b/tests/python/include/folly/tracing/StaticTracepoint.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2017 Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if defined(__ELF__) &&                                                        \
+    (defined(__powerpc64__) || defined(__powerpc__) || defined(__aarch64__) || \
+     defined(__x86_64__) || defined(__i386__))
+#include <folly/tracing/StaticTracepoint-ELF.h>
+
+#define FOLLY_SDT(provider, name, ...)                                         \
+  FOLLY_SDT_PROBE_N(                                                           \
+    provider, name, FOLLY_SDT_NARG(0, ##__VA_ARGS__), ##__VA_ARGS__)
+#else
+#define FOLLY_SDT(provider, name, ...) do {} while(0)
+#endif
diff --git a/tests/python/kprobe.b b/tests/python/kprobe.b
new file mode 100644
index 0000000..74a996b
--- /dev/null
+++ b/tests/python/kprobe.b
@@ -0,0 +1,24 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+#packed "false"
+
+struct pt_regs {
+  u64 r15:64;
+  u64 r14:64;
+  u64 r13:64;
+  u64 r12:64;
+  u64 bp:64;
+  u64 bx:64;
+  u64 r11:64;
+  u64 r10:64;
+  u64 r9:64;
+  u64 r8:64;
+  u64 ax:64;
+  u64 cx:64;
+  u64 dx:64;
+  u64 si:64;
+  u64 di:64;
+};
+
+
diff --git a/tests/python/proto.b b/tests/python/proto.b
new file mode 100644
index 0000000..78cfa5f
--- /dev/null
+++ b/tests/python/proto.b
@@ -0,0 +1,157 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+#packed "true"
+
+struct ethernet {
+  u64 dst:48;
+  u64 src:48;
+  u32 type:16;
+};
+
+state ethernet {
+  switch $ethernet.type {
+    case 0x0800 {
+      next proto::ip;
+    };
+    case 0x8100 {
+      next proto::dot1q;
+    };
+    case * {
+      goto EOP;
+    };
+  }
+}
+
+
+struct dot1q {
+  u32 pri:3;
+  u32 cfi:1;
+  u32 vlanid:12;
+  u32 type:16;
+};
+
+state dot1q {
+  switch $dot1q.type {
+    case 0x0800 {
+      next proto::ip;
+    };
+    case * {
+      goto EOP;
+    };
+  }
+}
+
+
+struct ip {
+  u32 ver:4;
+  u32 hlen:4;
+  u32 tos:8;
+  u32 tlen:16;
+  u32 identification:16;
+  u32 ffo_unused:1;
+  u32 df:1;
+  u32 mf:1;
+  u32 foffset:13;
+  u32 ttl:8;
+  u32 nextp:8;
+  u32 hchecksum:16;
+  u32 src:32;
+  u32 dst:32;
+};
+
+state ip {
+  switch $ip.nextp {
+    case 6 {
+      next proto::tcp;
+    };
+    case 17 {
+      next proto::udp;
+    };
+    case 47 {
+      next proto::gre;
+    };
+    case * {
+      goto EOP;
+    };
+  }
+}
+
+
+struct udp {
+  u32 sport:16;
+  u32 dport:16;
+  u32 length:16;
+  u32 crc:16;
+};
+
+state udp {
+  switch $udp.dport {
+    case 8472 {
+      next proto::vxlan;
+    };
+    case * {
+      goto EOP;
+    };
+  }
+}
+
+struct tcp {
+  u16 src_port:16;
+  u16 dst_port:16;
+  u32 seq_num:32;
+  u32 ack_num:32;
+  u8 offset:4;
+  u8 reserved:4;
+  u8 flag_cwr:1;
+  u8 flag_ece:1;
+  u8 flag_urg:1;
+  u8 flag_ack:1;
+  u8 flag_psh:1;
+  u8 flag_rst:1;
+  u8 flag_syn:1;
+  u8 flag_fin:1;
+  u16 rcv_wnd:16;
+  u16 cksum:16;
+  u16 urg_ptr:16;
+};
+
+state tcp {
+  goto EOP;
+}
+
+struct vxlan {
+  u32 rsv1:4;
+  u32 iflag:1;
+  u32 rsv2:3;
+  u32 rsv3:24;
+  u32 key:24;
+  u32 rsv4:8;
+};
+
+state vxlan {
+  goto EOP;
+}
+
+
+struct gre {
+  u32 cflag:1;
+  u32 rflag:1;
+  u32 kflag:1;
+  u32 snflag:1;
+  u32 srflag:1;
+  u32 recurflag:3;
+  u32 reserved:5;
+  u32 vflag:3;
+  u32 protocol:16;
+  u32 key:32;
+};
+
+state gre {
+  switch $gre.protocol {
+    case * {
+      goto EOP;
+    };
+  }
+}
+
diff --git a/tests/python/simulation.py b/tests/python/simulation.py
new file mode 120000
index 0000000..c1aae4b
--- /dev/null
+++ b/tests/python/simulation.py
@@ -0,0 +1 @@
+../../examples/networking/simulation.py
\ No newline at end of file
diff --git a/tests/python/test_array.py b/tests/python/test_array.py
new file mode 100755
index 0000000..d5e1aee
--- /dev/null
+++ b/tests/python/test_array.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+import ctypes as ct
+import random
+import time
+import subprocess
+from bcc.utils import get_online_cpus
+from unittest import main, TestCase
+
+class TestArray(TestCase):
+    def test_simple(self):
+        b = BPF(text="""BPF_ARRAY(table1, u64, 128);""")
+        t1 = b["table1"]
+        t1[ct.c_int(0)] = ct.c_ulonglong(100)
+        t1[ct.c_int(127)] = ct.c_ulonglong(1000)
+        for i, v in t1.items():
+            if i.value == 0:
+                self.assertEqual(v.value, 100)
+            if i.value == 127:
+                self.assertEqual(v.value, 1000)
+        self.assertEqual(len(t1), 128)
+
+    def test_native_type(self):
+        b = BPF(text="""BPF_ARRAY(table1, u64, 128);""")
+        t1 = b["table1"]
+        t1[0] = ct.c_ulonglong(100)
+        t1[-2] = ct.c_ulonglong(37)
+        t1[127] = ct.c_ulonglong(1000)
+        for i, v in t1.items():
+            if i.value == 0:
+                self.assertEqual(v.value, 100)
+            if i.value == 127:
+                self.assertEqual(v.value, 1000)
+        self.assertEqual(len(t1), 128)
+        self.assertEqual(t1[-2].value, 37)
+        self.assertEqual(t1[-1].value, t1[127].value)
+
+    def test_perf_buffer(self):
+        self.counter = 0
+
+        class Data(ct.Structure):
+            _fields_ = [("ts", ct.c_ulonglong)]
+
+        def cb(cpu, data, size):
+            self.assertGreater(size, ct.sizeof(Data))
+            event = ct.cast(data, ct.POINTER(Data)).contents
+            self.counter += 1
+
+        def lost_cb(lost):
+            self.assertGreater(lost, 0)
+
+        text = """
+BPF_PERF_OUTPUT(events);
+int do_sys_nanosleep(void *ctx) {
+    struct {
+        u64 ts;
+    } data = {bpf_ktime_get_ns()};
+    events.perf_submit(ctx, &data, sizeof(data));
+    return 0;
+}
+"""
+        b = BPF(text=text)
+        b.attach_kprobe(event=b.get_syscall_fnname("nanosleep"),
+                        fn_name="do_sys_nanosleep")
+        b["events"].open_perf_buffer(cb, lost_cb=lost_cb)
+        subprocess.call(['sleep', '0.1'])
+        b.perf_buffer_poll()
+        self.assertGreater(self.counter, 0)
+        b.cleanup()
+
+    def test_perf_buffer_for_each_cpu(self):
+        self.events = []
+
+        class Data(ct.Structure):
+            _fields_ = [("cpu", ct.c_ulonglong)]
+
+        def cb(cpu, data, size):
+            self.assertGreater(size, ct.sizeof(Data))
+            event = ct.cast(data, ct.POINTER(Data)).contents
+            self.events.append(event)
+
+        def lost_cb(lost):
+            self.assertGreater(lost, 0)
+
+        text = """
+BPF_PERF_OUTPUT(events);
+int do_sys_nanosleep(void *ctx) {
+    struct {
+        u64 cpu;
+    } data = {bpf_get_smp_processor_id()};
+    events.perf_submit(ctx, &data, sizeof(data));
+    return 0;
+}
+"""
+        b = BPF(text=text)
+        b.attach_kprobe(event=b.get_syscall_fnname("nanosleep"),
+                        fn_name="do_sys_nanosleep")
+        b["events"].open_perf_buffer(cb, lost_cb=lost_cb)
+        online_cpus = get_online_cpus()
+        for cpu in online_cpus:
+            subprocess.call(['taskset', '-c', str(cpu), 'sleep', '0.1'])
+        b.perf_buffer_poll()
+        b.cleanup()
+        self.assertGreaterEqual(len(self.events), len(online_cpus), 'Received only {}/{} events'.format(len(self.events), len(online_cpus)))
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_bpf_log.py b/tests/python/test_bpf_log.py
new file mode 100755
index 0000000..cb3d003
--- /dev/null
+++ b/tests/python/test_bpf_log.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+from simulation import Simulation
+import sys
+import os
+import tempfile
+from unittest import main, TestCase
+
+
+error_msg = "R0 invalid mem access 'map_value_or_null'\n"
+
+text = """
+       #include <uapi/linux/ptrace.h>
+       #include <bcc/proto.h>
+       BPF_HASH(t1, int, int, 10);
+       int sim_port(struct __sk_buff *skb) {
+           int x = 0, *y;
+       """
+repeat = """
+           y = t1.lookup(&x);
+           if (!y) return 0;
+           x = *y;
+         """
+end = """
+           y = t1.lookup(&x);
+           x = *y;
+           return 0;
+        }
+      """
+for i in range(0,300):
+    text += repeat
+text += end
+
+class TestBPFProgLoad(TestCase):
+
+    def setUp(self):
+        self.fp = tempfile.TemporaryFile()
+        os.dup2(self.fp.fileno(), sys.stderr.fileno())
+
+    def tearDown(self):
+        self.fp.close()
+
+
+    def test_log_debug(self):
+        b = BPF(text=text, debug=2)
+        try:
+            ingress = b.load_func("sim_port",BPF.SCHED_CLS)
+        except Exception:
+            self.fp.flush()
+            self.fp.seek(0)
+            self.assertEqual(error_msg in self.fp.read().decode(), True)
+
+
+    def test_log_no_debug(self):
+        b = BPF(text=text, debug=0)
+        try:
+            ingress = b.load_func("sim_port",BPF.SCHED_CLS)
+        except Exception:
+            self.fp.flush()
+            self.fp.seek(0)
+            self.assertEqual(error_msg in self.fp.read().decode(), True)
+
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/tests/python/test_brb.c b/tests/python/test_brb.c
new file mode 100644
index 0000000..f999a5b
--- /dev/null
+++ b/tests/python/test_brb.c
@@ -0,0 +1,226 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <bcc/proto.h>
+
+#define _memcpy __builtin_memcpy
+
+// meta data passed between bpf programs
+typedef struct bpf_metadata {
+    u32 prog_id;
+    u32 rx_port_id;
+} bpf_metadata_t;
+
+typedef struct bpf_dest {
+    u32 prog_id;
+    u32 port_id;
+} bpf_dest_t;
+
+// use u64 to represent eth_addr.
+// maintain the structure though to indicate the semantics
+typedef struct eth_addr {
+    u64 addr;
+} eth_addr_t;
+
+// Program table definitions for tail calls
+BPF_PROG_ARRAY(jump, 16);
+
+// physical endpoint manager (pem) tables which connects to both bridge 1 and bridge 2
+// <port_id, bpf_dest>
+BPF_ARRAY(pem_dest, bpf_dest_t, 256);
+// <port_id, ifindex>
+BPF_ARRAY(pem_port, u32, 256);
+// <ifindex, port_id>
+BPF_HASH(pem_ifindex, u32, u32, 256);
+// <0, tx2vm_pkts>
+BPF_ARRAY(pem_stats, u32, 1);
+
+// bridge 1 (br1) tables
+// <port_id, bpf_dest>
+BPF_ARRAY(br1_dest, bpf_dest_t, 256);
+// <eth_addr, port_id>
+BPF_HASH(br1_mac, eth_addr_t, u32, 256);
+// <0, rtr_ifindex>
+BPF_ARRAY(br1_rtr, u32, 1);
+// <mac, ifindex>
+BPF_HASH(br1_mac_ifindex, eth_addr_t, u32, 1);
+
+// bridge 2 (br2) tables
+// <port_id, bpf_dest>
+BPF_ARRAY(br2_dest, bpf_dest_t, 256);
+// <eth_addr, port_id>
+BPF_HASH(br2_mac, eth_addr_t, u32, 256);
+// <0, rtr_ifindex>
+BPF_ARRAY(br2_rtr, u32, 1);
+// <mac, ifindex>
+BPF_HASH(br2_mac_ifindex, eth_addr_t, u32, 1);
+
+int pem(struct __sk_buff *skb) {
+    bpf_metadata_t meta = {};
+    u32 ifindex;
+    u32 *tx_port_id_p;
+    u32 tx_port_id;
+    u32 rx_port;
+    u32 *ifindex_p;
+    bpf_dest_t *dest_p;
+
+    // pem does not look at packet data
+    if (skb->tc_index == 0) {
+        skb->tc_index = 1;
+        skb->cb[0] = skb->cb[1] = 0;
+        meta.prog_id = meta.rx_port_id = 0;
+    } else {
+        meta.prog_id = skb->cb[0];
+        asm volatile("" ::: "memory");
+        meta.rx_port_id = skb->cb[1];
+    }
+    if (!meta.prog_id) {
+        /* from external */
+        ifindex = skb->ingress_ifindex;
+        tx_port_id_p = pem_ifindex.lookup(&ifindex);
+        if (tx_port_id_p) {
+            tx_port_id = *tx_port_id_p;
+            dest_p = pem_dest.lookup(&tx_port_id);
+            if (dest_p) {
+                skb->cb[0] = dest_p->prog_id;
+                skb->cb[1] = dest_p->port_id;
+                jump.call(skb, dest_p->prog_id);
+            }
+        }
+    } else {
+        /* from internal */
+        rx_port = meta.rx_port_id;
+        ifindex_p = pem_port.lookup(&rx_port);
+        if (ifindex_p) {
+#if 1
+            /* accumulate stats, may hurt performance slightly */
+            u32 index = 0;
+            u32 *value = pem_stats.lookup(&index);
+            if (value)
+                lock_xadd(value, 1);
+#endif
+            bpf_clone_redirect(skb, *ifindex_p, 0);
+        }
+    }
+
+    return 1;
+}
+
+static int br_common(struct __sk_buff *skb, int which_br) {
+    u8 *cursor = 0;
+    u16 proto;
+    u16 arpop;
+    eth_addr_t dmac;
+    u8 *mac_p;
+    u32 dip;
+    u32 *tx_port_id_p;
+    u32 tx_port_id;
+    bpf_dest_t *dest_p;
+    u32 index, *rtrif_p;
+
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+    /* handle ethernet packet header */
+    {
+        dmac.addr = ethernet->dst;
+        /* skb->tc_index may be preserved across router namespace if the router simply rewrites the packet
+         * and send it back.
+         */
+        if (skb->tc_index == 1) {
+            /* packet from pem, send to the router, set tc_index to 2 */
+            skb->tc_index = 2;
+            if (dmac.addr == 0xffffffffffffULL) {
+                 index = 0;
+                 if (which_br == 1)
+                     rtrif_p = br1_rtr.lookup(&index);
+                 else
+                     rtrif_p = br2_rtr.lookup(&index);
+                 if (rtrif_p)
+                     bpf_clone_redirect(skb, *rtrif_p, 0);
+             } else {
+                 /* the dmac address should match the router's */
+                 if (which_br == 1)
+                     rtrif_p = br1_mac_ifindex.lookup(&dmac);
+                 else
+                     rtrif_p = br2_mac_ifindex.lookup(&dmac);
+                 if (rtrif_p)
+                     bpf_clone_redirect(skb, *rtrif_p, 0);
+             }
+             return 1;
+        }
+
+        /* set the tc_index to 1 so pem knows it is from internal */
+        skb->tc_index = 1;
+        switch (ethernet->type) {
+            case ETH_P_IP: goto ip;
+            case ETH_P_ARP: goto arp;
+            case ETH_P_8021Q: goto dot1q;
+            default: goto EOP;
+        }
+    }
+
+    dot1q: {
+        struct dot1q_t *dot1q = cursor_advance(cursor, sizeof(*dot1q));
+        switch(dot1q->type) {
+            case ETH_P_IP: goto ip;
+            case ETH_P_ARP: goto arp;
+            default: goto EOP;
+        }
+    }
+
+    arp: {
+        struct arp_t *arp = cursor_advance(cursor, sizeof(*arp));
+        /* mac learning */
+        arpop = arp->oper;
+        if (arpop == 2) {
+            index = 0;
+            if (which_br == 1)
+                rtrif_p = br1_rtr.lookup(&index);
+            else
+                rtrif_p = br2_rtr.lookup(&index);
+            if (rtrif_p) {
+                __u32 ifindex = *rtrif_p;
+                eth_addr_t smac;
+
+                smac.addr = ethernet->src;
+                if (which_br == 1)
+                    br1_mac_ifindex.update(&smac, &ifindex);
+                else
+                    br2_mac_ifindex.update(&smac, &ifindex);
+            }
+        }
+        goto xmit;
+    }
+
+    ip: {
+        struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+        goto xmit;
+    }
+
+xmit:
+    if (which_br == 1)
+        tx_port_id_p = br1_mac.lookup(&dmac);
+    else
+        tx_port_id_p = br2_mac.lookup(&dmac);
+    if (tx_port_id_p) {
+        tx_port_id = *tx_port_id_p;
+        if (which_br == 1)
+            dest_p = br1_dest.lookup(&tx_port_id);
+        else
+            dest_p = br2_dest.lookup(&tx_port_id);
+        if (dest_p) {
+            skb->cb[0] = dest_p->prog_id;
+            skb->cb[1] = dest_p->port_id;
+            jump.call(skb, dest_p->prog_id);
+        }
+    }
+
+EOP:
+    return 1;
+}
+
+int br1(struct __sk_buff *skb) {
+    return br_common(skb, 1);
+}
+
+int br2(struct __sk_buff *skb) {
+    return br_common(skb, 2);
+}
diff --git a/tests/python/test_brb.py b/tests/python/test_brb.py
new file mode 100755
index 0000000..9a05a14
--- /dev/null
+++ b/tests/python/test_brb.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# This program implements a topology like the one below:
+#   pem: physical endpoint manager, implemented as a bpf program
+#
+#     vm1 <--------+  +----> bridge1 <----+
+#                  V  V                   V
+#                  pem                  router
+#                  ^  ^                   ^
+#     vm2 <--------+  +----> bridge2 <----+
+#
+# The vm1, vm2 and router are implemented as namespaces.
+# The bridge is implemented with limited functionality in bpf program.
+#
+# vm1 and vm2 are in different subnet. For vm1 to communicate to vm2,
+# the packet will have to travel from vm1 to pem, bridge1, router, bridge2, pem, and
+# then come to vm2.
+#
+# When this test is run with verbose mode (ctest -R <test_name> -V),
+# the following printout is observed on my local box:
+#
+# ......
+# 8: ARPING 100.1.1.254 from 100.1.1.1 eth0
+# 8: Unicast reply from 100.1.1.254 [76:62:B5:5C:8C:6F]  0.533ms
+# 8: Sent 1 probes (1 broadcast(s))
+# 8: Received 1 response(s)
+# 8: ARPING 200.1.1.254 from 200.1.1.1 eth0
+# 8: Unicast reply from 200.1.1.254 [F2:F0:B4:ED:7B:1B]  0.524ms
+# 8: Sent 1 probes (1 broadcast(s))
+# 8: Received 1 response(s)
+# 8: PING 200.1.1.1 (200.1.1.1) 56(84) bytes of data.
+# 8: 64 bytes from 200.1.1.1: icmp_req=1 ttl=63 time=0.074 ms
+# 8: 64 bytes from 200.1.1.1: icmp_req=2 ttl=63 time=0.061 ms
+# 8:
+# 8: --- 200.1.1.1 ping statistics ---
+# 8: 2 packets transmitted, 2 received, 0% packet loss, time 999ms
+# 8: rtt min/avg/max/mdev = 0.061/0.067/0.074/0.010 ms
+# 8: [ ID] Interval       Transfer     Bandwidth
+# 8: [  5]  0.0- 1.0 sec  4.00 GBytes  34.3 Gbits/sec
+# 8: Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
+# 8: MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 200.1.1.1 (200.1.1.1) port 0 AF_INET : demo
+# 8: Recv   Send    Send
+# 8: Socket Socket  Message  Elapsed
+# 8: Size   Size    Size     Time     Throughput
+# 8: bytes  bytes   bytes    secs.    10^6bits/sec
+# 8:
+# 8:  87380  16384  65160    1.00     41991.68
+# 8: MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 200.1.1.1 (200.1.1.1) port 0 AF_INET : demo : first burst 0
+# 8: Local /Remote
+# 8: Socket Size   Request  Resp.   Elapsed  Trans.
+# 8: Send   Recv   Size     Size    Time     Rate
+# 8: bytes  Bytes  bytes    bytes   secs.    per sec
+# 8:
+# 8: 16384  87380  1        1       1.00     48645.53
+# 8: 16384  87380
+# 8: .
+# 8: ----------------------------------------------------------------------
+# 8: Ran 1 test in 11.296s
+# 8:
+# 8: OK
+
+from ctypes import c_uint
+from netaddr import IPAddress, EUI
+from bcc import BPF
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+from utils import NSPopenWithCheck
+import sys
+from time import sleep
+from unittest import main, TestCase
+from simulation import Simulation
+
+arg1 = sys.argv.pop(1)
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+sim = Simulation(ipdb)
+
+class TestBPFSocket(TestCase):
+    def set_default_const(self):
+        self.ns1            = "ns1"
+        self.ns2            = "ns2"
+        self.ns_router      = "ns_router"
+        self.vm1_ip         = "100.1.1.1"
+        self.vm2_ip         = "200.1.1.1"
+        self.vm1_rtr_ip     = "100.1.1.254"
+        self.vm2_rtr_ip     = "200.1.1.254"
+        self.vm1_rtr_mask   = "100.1.1.0/24"
+        self.vm2_rtr_mask   = "200.1.1.0/24"
+
+    def get_table(self, b):
+        self.jump = b.get_table("jump")
+
+        self.pem_dest = b.get_table("pem_dest")
+        self.pem_port = b.get_table("pem_port")
+        self.pem_ifindex = b.get_table("pem_ifindex")
+        self.pem_stats = b.get_table("pem_stats")
+
+        self.br1_dest = b.get_table("br1_dest")
+        self.br1_mac = b.get_table("br1_mac")
+        self.br1_rtr = b.get_table("br1_rtr")
+
+        self.br2_dest = b.get_table("br2_dest")
+        self.br2_mac = b.get_table("br2_mac")
+        self.br2_rtr = b.get_table("br2_rtr")
+
+    def connect_ports(self, prog_id_pem, prog_id_br, curr_pem_pid, curr_br_pid,
+                      br_dest_map, br_mac_map, ifindex, vm_mac, vm_ip):
+        self.pem_dest[c_uint(curr_pem_pid)] = self.pem_dest.Leaf(prog_id_br, curr_br_pid)
+        br_dest_map[c_uint(curr_br_pid)] = br_dest_map.Leaf(prog_id_pem, curr_pem_pid)
+        self.pem_port[c_uint(curr_pem_pid)] = c_uint(ifindex)
+        self.pem_ifindex[c_uint(ifindex)] = c_uint(curr_pem_pid)
+        mac_addr = br_mac_map.Key(int(EUI(vm_mac)))
+        br_mac_map[mac_addr] = c_uint(curr_br_pid)
+
+    def config_maps(self):
+        # program id
+        prog_id_pem = 1
+        prog_id_br1 = 2
+        prog_id_br2 = 3
+
+        # initial port id and table pointers
+        curr_pem_pid = 0
+        curr_br1_pid = 0
+        curr_br2_pid = 0
+
+        # configure jump table
+        self.jump[c_uint(prog_id_pem)] = c_uint(self.pem_fn.fd)
+        self.jump[c_uint(prog_id_br1)] = c_uint(self.br1_fn.fd)
+        self.jump[c_uint(prog_id_br2)] = c_uint(self.br2_fn.fd)
+
+        # connect pem and br1
+        curr_pem_pid = curr_pem_pid + 1
+        curr_br1_pid = curr_br1_pid + 1
+        self.connect_ports(prog_id_pem, prog_id_br1, curr_pem_pid, curr_br1_pid,
+                      self.br1_dest, self.br1_mac,
+                      self.ns1_eth_out.index, self.vm1_mac, self.vm1_ip)
+
+        # connect pem and br2
+        curr_pem_pid = curr_pem_pid + 1
+        curr_br2_pid = curr_br2_pid + 1
+        self.connect_ports(prog_id_pem, prog_id_br2, curr_pem_pid, curr_br2_pid,
+                      self.br2_dest, self.br2_mac,
+                      self.ns2_eth_out.index, self.vm2_mac, self.vm2_ip)
+
+        # connect <br1, rtr> and <br2, rtr>
+        self.br1_rtr[c_uint(0)] = c_uint(self.nsrtr_eth0_out.index)
+        self.br2_rtr[c_uint(0)] = c_uint(self.nsrtr_eth1_out.index)
+
+    def test_brb(self):
+        try:
+            b = BPF(src_file=arg1, debug=0)
+            self.pem_fn = b.load_func("pem", BPF.SCHED_CLS)
+            self.br1_fn = b.load_func("br1", BPF.SCHED_CLS)
+            self.br2_fn = b.load_func("br2", BPF.SCHED_CLS)
+            self.get_table(b)
+
+            # set up the topology
+            self.set_default_const()
+            (ns1_ipdb, self.ns1_eth_out, _) = sim._create_ns(self.ns1, ipaddr=self.vm1_ip+'/24',
+                                                             fn=self.pem_fn, action='drop',
+                                                             disable_ipv6=True)
+            (ns2_ipdb, self.ns2_eth_out, _) = sim._create_ns(self.ns2, ipaddr=self.vm2_ip+'/24',
+                                                             fn=self.pem_fn, action='drop',
+                                                             disable_ipv6=True)
+            ns1_ipdb.routes.add({'dst': self.vm2_rtr_mask, 'gateway': self.vm1_rtr_ip}).commit()
+            ns2_ipdb.routes.add({'dst': self.vm1_rtr_mask, 'gateway': self.vm2_rtr_ip}).commit()
+            self.vm1_mac = ns1_ipdb.interfaces['eth0'].address
+            self.vm2_mac = ns2_ipdb.interfaces['eth0'].address
+
+            (_, self.nsrtr_eth0_out, _) = sim._create_ns(self.ns_router, ipaddr=self.vm1_rtr_ip+'/24',
+                                                         fn=self.br1_fn, action='drop',
+                                                         disable_ipv6=True)
+            (rt_ipdb, self.nsrtr_eth1_out, _) = sim._ns_add_ifc(self.ns_router, "eth1", "ns_router2",
+                                                                ipaddr=self.vm2_rtr_ip+'/24',
+                                                                fn=self.br2_fn, action='drop',
+                                                                disable_ipv6=True)
+            nsp = NSPopen(rt_ipdb.nl.netns, ["sysctl", "-w", "net.ipv4.ip_forward=1"])
+            nsp.wait(); nsp.release()
+
+            # configure maps
+            self.config_maps()
+
+            # our bridge is not smart enough, so send arping for router learning to prevent router
+            # from sending out arp request
+            nsp = NSPopen(ns1_ipdb.nl.netns,
+                          ["arping", "-w", "1", "-c", "1", "-I", "eth0", self.vm1_rtr_ip])
+            nsp.wait(); nsp.release()
+            nsp = NSPopen(ns2_ipdb.nl.netns,
+                          ["arping", "-w", "1", "-c", "1", "-I", "eth0", self.vm2_rtr_ip])
+            nsp.wait(); nsp.release()
+
+            # ping
+            nsp = NSPopen(ns1_ipdb.nl.netns, ["ping", self.vm2_ip, "-c", "2"])
+            nsp.wait(); nsp.release()
+            # pem_stats only counts pem->bridge traffic, each VM has 4: arping/arp request/2 icmp request
+            # total 8 packets should be counted
+            self.assertEqual(self.pem_stats[c_uint(0)].value, 8)
+
+            nsp_server = NSPopenWithCheck(ns2_ipdb.nl.netns, ["iperf", "-s", "-xSC"])
+            sleep(1)
+            nsp = NSPopen(ns1_ipdb.nl.netns, ["iperf", "-c", self.vm2_ip, "-t", "1", "-xSC"])
+            nsp.wait(); nsp.release()
+            nsp_server.kill(); nsp_server.wait(); nsp_server.release()
+
+            nsp_server = NSPopenWithCheck(ns2_ipdb.nl.netns, ["netserver", "-D"])
+            sleep(1)
+            nsp = NSPopenWithCheck(ns1_ipdb.nl.netns, ["netperf", "-l", "1", "-H", self.vm2_ip, "--", "-m", "65160"])
+            nsp.wait(); nsp.release()
+            nsp = NSPopen(ns1_ipdb.nl.netns, ["netperf", "-l", "1", "-H", self.vm2_ip, "-t", "TCP_RR"])
+            nsp.wait(); nsp.release()
+            nsp_server.kill(); nsp_server.wait(); nsp_server.release()
+
+        finally:
+            sim.release()
+            ipdb.release()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_brb2.c b/tests/python/test_brb2.c
new file mode 100644
index 0000000..5c164ec
--- /dev/null
+++ b/tests/python/test_brb2.c
@@ -0,0 +1,32 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <bcc/proto.h>
+
+// physical endpoint manager (pem) tables which connects VMs and bridges
+// <ifindex_in, ifindex_out>
+BPF_HASH(pem_dest, u32, u32, 256);
+// <0, tx_pkts>
+BPF_ARRAY(pem_stats, u32, 1);
+
+int pem(struct __sk_buff *skb) {
+    u32 ifindex_in, *ifindex_p;
+    u8 *cursor = 0;
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+
+    ifindex_in = skb->ingress_ifindex;
+    ifindex_p = pem_dest.lookup(&ifindex_in);
+    if (ifindex_p) {
+#if 1
+        if (ethernet->type == 0x0800 || ethernet->type == 0x0806) {
+            /* accumulate stats */
+            u32 index = 0;
+            u32 *value = pem_stats.lookup(&index);
+            if (value)
+                lock_xadd(value, 1);
+        }
+#endif
+        bpf_clone_redirect(skb, *ifindex_p, 0);
+    }
+
+    return 1;
+}
diff --git a/tests/python/test_brb2.py b/tests/python/test_brb2.py
new file mode 100755
index 0000000..a0a0ecc
--- /dev/null
+++ b/tests/python/test_brb2.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# This program implements a topology like the one below:
+#   pem: physical endpoint manager, implemented as a bpf program
+#
+#     vm1 <--------+  +----> bridge1 <----+
+#                  V  V                   V
+#                  pem                  router
+#                  ^  ^                   ^
+#     vm2 <--------+  +----> bridge2 <----+
+#
+# The vm1, vm2 and router are implemented as namespaces.
+# The linux bridge device is used to provide bridge functionality.
+# pem bpf will be attached to related network devices for vm1, vm2, bridge1 and bridge2.
+#
+# vm1 and vm2 are in different subnet. For vm1 to communicate to vm2,
+# the packet will have to travel from vm1 to pem, bridge1, router, bridge2, pem, and
+# then come to vm2.
+#
+# When this test is run with verbose mode (ctest -R <test_name> -V),
+# the following printout is observed on my local box:
+#
+# ......
+# 9: PING 200.1.1.1 (200.1.1.1) 56(84) bytes of data.
+# 9: 64 bytes from 200.1.1.1: icmp_req=1 ttl=63 time=0.090 ms
+# 9: 64 bytes from 200.1.1.1: icmp_req=2 ttl=63 time=0.032 ms
+# 9:
+# 9: --- 200.1.1.1 ping statistics ---
+# 9: 2 packets transmitted, 2 received, 0% packet loss, time 999ms
+# 9: rtt min/avg/max/mdev = 0.032/0.061/0.090/0.029 ms
+# 9: [ ID] Interval       Transfer     Bandwidth
+# 9: [  5]  0.0- 1.0 sec  3.80 GBytes  32.6 Gbits/sec
+# 9: Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
+# 9: MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 200.1.1.1 (200.1.1.1) port 0 AF_INET : demo
+# 9: Recv   Send    Send
+# 9: Socket Socket  Message  Elapsed
+# 9: Size   Size    Size     Time     Throughput
+# 9: bytes  bytes   bytes    secs.    10^6bits/sec
+# 9:
+# 9:  87380  16384  65160    1.00     39940.46
+# 9: MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 200.1.1.1 (200.1.1.1) port 0 AF_INET : demo : first burst 0
+# 9: Local /Remote
+# 9: Socket Size   Request  Resp.   Elapsed  Trans.
+# 9: Send   Recv   Size     Size    Time     Rate
+# 9: bytes  Bytes  bytes    bytes   secs.    per sec
+# 9:
+# 9: 16384  87380  1        1       1.00     46387.80
+# 9: 16384  87380
+# 9: .
+# 9: ----------------------------------------------------------------------
+# 9: Ran 1 test in 7.495s
+# 9:
+# 9: OK
+
+from ctypes import c_uint
+from bcc import BPF
+from pyroute2 import IPRoute, NetNS, IPDB, NSPopen
+from utils import NSPopenWithCheck
+import sys
+from time import sleep
+from unittest import main, TestCase
+import subprocess
+from simulation import Simulation
+
+arg1 = sys.argv.pop(1)
+ipr = IPRoute()
+ipdb = IPDB(nl=ipr)
+sim = Simulation(ipdb)
+
+allocated_interfaces = set(ipdb.interfaces.keys())
+
+def get_next_iface(prefix):
+    i = 0
+    while True:
+        iface = "{0}{1}".format(prefix, i)
+        if iface not in allocated_interfaces:
+            allocated_interfaces.add(iface)
+            return iface
+        i += 1
+
+class TestBPFSocket(TestCase):
+    def setup_br(self, br, veth_rt_2_br, veth_pem_2_br, veth_br_2_pem):
+        # create veth which connecting pem and br
+        with ipdb.create(ifname=veth_pem_2_br, kind="veth", peer=veth_br_2_pem) as v:
+            v.up()
+        ipdb.interfaces[veth_br_2_pem].up().commit()
+        subprocess.call(["sysctl", "-q", "-w", "net.ipv6.conf." + veth_pem_2_br + ".disable_ipv6=1"])
+        subprocess.call(["sysctl", "-q", "-w", "net.ipv6.conf." + veth_br_2_pem + ".disable_ipv6=1"])
+
+        # set up the bridge and add router interface as one of its slaves
+        with ipdb.create(ifname=br, kind="bridge") as br1:
+            br1.add_port(ipdb.interfaces[veth_pem_2_br])
+            br1.add_port(ipdb.interfaces[veth_rt_2_br])
+            br1.up()
+        subprocess.call(["sysctl", "-q", "-w", "net.ipv6.conf." + br + ".disable_ipv6=1"])
+
+    def set_default_const(self):
+        self.ns1            = "ns1"
+        self.ns2            = "ns2"
+        self.ns_router      = "ns_router"
+        self.br1            = get_next_iface("br")
+        self.veth_pem_2_br1 = "v20"
+        self.veth_br1_2_pem = "v21"
+        self.br2            = get_next_iface("br")
+        self.veth_pem_2_br2 = "v22"
+        self.veth_br2_2_pem = "v23"
+
+        self.vm1_ip         = "100.1.1.1"
+        self.vm2_ip         = "200.1.1.1"
+        self.vm1_rtr_ip     = "100.1.1.254"
+        self.vm2_rtr_ip     = "200.1.1.254"
+        self.vm1_rtr_mask   = "100.1.1.0/24"
+        self.vm2_rtr_mask   = "200.1.1.0/24"
+
+    def attach_filter(self, ifname, fd, name):
+        ifindex = ipdb.interfaces[ifname].index
+        ipr.tc("add", "ingress", ifindex, "ffff:")
+        ipr.tc("add-filter", "bpf", ifindex, ":1", fd=fd, name=name,
+              parent="ffff:", action="drop", classid=1)
+
+    def config_maps(self):
+        # pem just relays packets between VM and its corresponding
+        # slave link in the bridge interface
+        ns1_ifindex = self.ns1_eth_out.index
+        ns2_ifindex = self.ns2_eth_out.index
+        br1_ifindex = ipdb.interfaces[self.veth_br1_2_pem].index
+        br2_ifindex = ipdb.interfaces[self.veth_br2_2_pem].index
+        self.pem_dest[c_uint(ns1_ifindex)] = c_uint(br1_ifindex)
+        self.pem_dest[c_uint(br1_ifindex)] = c_uint(ns1_ifindex)
+        self.pem_dest[c_uint(ns2_ifindex)] = c_uint(br2_ifindex)
+        self.pem_dest[c_uint(br2_ifindex)] = c_uint(ns2_ifindex)
+
+        # tc filter setup with bpf programs attached
+        self.attach_filter(self.veth_br1_2_pem, self.pem_fn.fd, self.pem_fn.name)
+        self.attach_filter(self.veth_br2_2_pem, self.pem_fn.fd, self.pem_fn.name)
+
+    def test_brb2(self):
+        try:
+            b = BPF(src_file=arg1, debug=0)
+            self.pem_fn = b.load_func("pem", BPF.SCHED_CLS)
+            self.pem_dest= b.get_table("pem_dest")
+            self.pem_stats = b.get_table("pem_stats")
+
+            # set up the topology
+            self.set_default_const()
+            (ns1_ipdb, self.ns1_eth_out, _) = sim._create_ns(self.ns1, ipaddr=self.vm1_ip+'/24',
+                                                             fn=self.pem_fn, action='drop',
+                                                             disable_ipv6=True)
+            (ns2_ipdb, self.ns2_eth_out, _) = sim._create_ns(self.ns2, ipaddr=self.vm2_ip+'/24',
+                                                             fn=self.pem_fn, action='drop',
+                                                             disable_ipv6=True)
+            ns1_ipdb.routes.add({'dst': self.vm2_rtr_mask, 'gateway': self.vm1_rtr_ip}).commit()
+            ns2_ipdb.routes.add({'dst': self.vm1_rtr_mask, 'gateway': self.vm2_rtr_ip}).commit()
+
+            (_, self.nsrtr_eth0_out, _) = sim._create_ns(self.ns_router, ipaddr=self.vm1_rtr_ip+'/24',
+                                                         disable_ipv6=True)
+            (rt_ipdb, self.nsrtr_eth1_out, _) = sim._ns_add_ifc(self.ns_router, "eth1", "ns_router2",
+                                                                ipaddr=self.vm2_rtr_ip+'/24',
+                                                                disable_ipv6=True)
+            # enable ip forwarding in router ns
+            nsp = NSPopen(rt_ipdb.nl.netns, ["sysctl", "-w", "net.ipv4.ip_forward=1"])
+            nsp.wait(); nsp.release()
+
+            # for each VM connecting to pem, there will be a corresponding veth connecting to the bridge
+            self.setup_br(self.br1, self.nsrtr_eth0_out.ifname, self.veth_pem_2_br1, self.veth_br1_2_pem)
+            self.setup_br(self.br2, self.nsrtr_eth1_out.ifname, self.veth_pem_2_br2, self.veth_br2_2_pem)
+
+            # load the program and configure maps
+            self.config_maps()
+
+            # ping
+            nsp = NSPopen(ns1_ipdb.nl.netns, ["ping", self.vm2_ip, "-c", "2"]); nsp.wait(); nsp.release()
+            # one arp request/reply, 2 icmp request/reply per VM, total 6 packets per VM, 12 packets total
+            self.assertEqual(self.pem_stats[c_uint(0)].value, 12)
+
+            nsp_server = NSPopenWithCheck(ns2_ipdb.nl.netns, ["iperf", "-s", "-xSC"])
+            sleep(1)
+            nsp = NSPopen(ns1_ipdb.nl.netns, ["iperf", "-c", self.vm2_ip, "-t", "1", "-xSC"])
+            nsp.wait(); nsp.release()
+            nsp_server.kill(); nsp_server.wait(); nsp_server.release()
+
+            nsp_server = NSPopenWithCheck(ns2_ipdb.nl.netns, ["netserver", "-D"])
+            sleep(1)
+            nsp = NSPopenWithCheck(ns1_ipdb.nl.netns, ["netperf", "-l", "1", "-H", self.vm2_ip, "--", "-m", "65160"])
+            nsp.wait(); nsp.release()
+            nsp = NSPopen(ns1_ipdb.nl.netns, ["netperf", "-l", "1", "-H", self.vm2_ip, "-t", "TCP_RR"])
+            nsp.wait(); nsp.release()
+            nsp_server.kill(); nsp_server.wait(); nsp_server.release()
+
+        finally:
+            if self.br1 in ipdb.interfaces: ipdb.interfaces[self.br1].remove().commit()
+            if self.br2 in ipdb.interfaces: ipdb.interfaces[self.br2].remove().commit()
+            if self.veth_pem_2_br1 in ipdb.interfaces: ipdb.interfaces[self.veth_pem_2_br1].remove().commit()
+            if self.veth_pem_2_br2 in ipdb.interfaces: ipdb.interfaces[self.veth_pem_2_br2].remove().commit()
+            sim.release()
+            ipdb.release()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_call1.c b/tests/python/test_call1.c
new file mode 100644
index 0000000..787de21
--- /dev/null
+++ b/tests/python/test_call1.c
@@ -0,0 +1,59 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+BPF_PROG_ARRAY(jump, 64);
+BPF_ARRAY(stats, u64, 64);
+
+enum states {
+  S_EOP = 1,
+  S_ETHER,
+  S_ARP,
+  S_IP
+};
+
+int parse_ether(struct __sk_buff *skb) {
+  size_t cur = 0;
+  size_t next = cur + 14;
+
+  int key = S_ETHER;
+  u64 *leaf = stats.lookup(&key);
+  if (leaf) (*leaf)++;
+
+  switch (bpf_dext_pkt(skb, cur + 12, 0, 16)) {
+    case 0x0800: jump.call(skb, S_IP);
+    case 0x0806: jump.call(skb, S_ARP);
+  }
+  jump.call(skb, S_EOP);
+  return 1;
+}
+
+int parse_arp(struct __sk_buff *skb) {
+  size_t cur = 14;  // TODO: get from ctx
+  size_t next = cur + 28;
+
+  int key = S_ARP;
+  u64 *leaf = stats.lookup(&key);
+  if (leaf) (*leaf)++;
+
+  jump.call(skb, S_EOP);
+  return 1;
+}
+
+int parse_ip(struct __sk_buff *skb) {
+  size_t cur = 14;  // TODO: get from ctx
+  size_t next = cur + 20;
+
+  int key = S_IP;
+  u64 *leaf = stats.lookup(&key);
+  if (leaf) (*leaf)++;
+
+  jump.call(skb, S_EOP);
+  return 1;
+}
+
+int eop(struct __sk_buff *skb) {
+  int key = S_EOP;
+  u64 *leaf = stats.lookup(&key);
+  if (leaf) (*leaf)++;
+  return 1;
+}
diff --git a/tests/python/test_call1.py b/tests/python/test_call1.py
new file mode 100755
index 0000000..68d68de
--- /dev/null
+++ b/tests/python/test_call1.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from ctypes import c_ushort, c_int, c_ulonglong
+from netaddr import IPAddress
+from bcc import BPF
+from pyroute2 import IPRoute
+from socket import socket, AF_INET, SOCK_DGRAM
+import sys
+from time import sleep
+from unittest import main, TestCase
+
+arg1 = sys.argv.pop(1)
+
+S_EOP = 1
+S_ETHER = 2
+S_ARP = 3
+S_IP = 4
+
+class TestBPFSocket(TestCase):
+    def setUp(self):
+        b = BPF(src_file=arg1, debug=0)
+        ether_fn = b.load_func("parse_ether", BPF.SCHED_CLS)
+        arp_fn = b.load_func("parse_arp", BPF.SCHED_CLS)
+        ip_fn = b.load_func("parse_ip", BPF.SCHED_CLS)
+        eop_fn = b.load_func("eop", BPF.SCHED_CLS)
+        ip = IPRoute()
+        ifindex = ip.link_lookup(ifname="eth0")[0]
+        ip.tc("add", "sfq", ifindex, "1:")
+        ip.tc("add-filter", "bpf", ifindex, ":1", fd=ether_fn.fd,
+              name=ether_fn.name, parent="1:", action="ok", classid=1)
+        self.jump = b.get_table("jump", c_int, c_int)
+        self.jump[c_int(S_ARP)] = c_int(arp_fn.fd)
+        self.jump[c_int(S_IP)] = c_int(ip_fn.fd)
+        self.jump[c_int(S_EOP)] = c_int(eop_fn.fd)
+        self.stats = b.get_table("stats", c_int, c_ulonglong)
+
+    def test_jumps(self):
+        udp = socket(AF_INET, SOCK_DGRAM)
+        udp.sendto(b"a" * 10, ("172.16.1.1", 5000))
+        udp.close()
+        self.assertGreater(self.stats[c_int(S_IP)].value, 0)
+        self.assertGreater(self.stats[c_int(S_ARP)].value, 0)
+        self.assertGreater(self.stats[c_int(S_EOP)].value, 1)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_clang.py b/tests/python/test_clang.py
new file mode 100755
index 0000000..36f0a1b
--- /dev/null
+++ b/tests/python/test_clang.py
@@ -0,0 +1,1254 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+import ctypes as ct
+from unittest import main, skipUnless, TestCase
+import os
+import sys
+import socket
+import struct
+from contextlib import contextmanager
+import distutils.version
+
+@contextmanager
+def redirect_stderr(to):
+    stderr_fd = sys.stderr.fileno()
+    with os.fdopen(os.dup(stderr_fd), 'wb') as copied, os.fdopen(to, 'w') as to:
+        sys.stderr.flush()
+        os.dup2(to.fileno(), stderr_fd)
+        try:
+            yield sys.stderr
+        finally:
+            sys.stderr.flush()
+            os.dup2(copied.fileno(), stderr_fd)
+
+def kernel_version_ge(major, minor):
+    # True if running kernel is >= X.Y
+    version = distutils.version.LooseVersion(os.uname()[2]).version
+    if version[0] > major:
+        return True
+    if version[0] < major:
+        return False
+    if minor and version[1] < minor:
+        return False
+    return True
+
+class TestClang(TestCase):
+    def test_complex(self):
+        b = BPF(src_file="test_clang_complex.c", debug=0)
+        fn = b.load_func("handle_packet", BPF.SCHED_CLS)
+    def test_printk(self):
+        text = """
+#include <bcc/proto.h>
+int handle_packet(void *ctx) {
+  u8 *cursor = 0;
+  struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+  bpf_trace_printk("ethernet->dst = %llx, ethernet->src = %llx\\n",
+                   ethernet->dst, ethernet->src);
+  return 0;
+}
+"""
+        b = BPF(text=text, debug=0)
+        fn = b.load_func("handle_packet", BPF.SCHED_CLS)
+
+    def test_probe_read1(self):
+        text = """
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+int count_sched(struct pt_regs *ctx, struct task_struct *prev) {
+    pid_t p = prev->pid;
+    return (p != -1);
+}
+"""
+        b = BPF(text=text, debug=0)
+        fn = b.load_func("count_sched", BPF.KPROBE)
+
+    def test_probe_read2(self):
+        text = """
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+int count_foo(struct pt_regs *ctx, unsigned long a, unsigned long b) {
+    return (a != b);
+}
+"""
+        b = BPF(text=text, debug=0)
+        fn = b.load_func("count_foo", BPF.KPROBE)
+
+    def test_probe_read3(self):
+        text = """
+#define KBUILD_MODNAME "foo"
+#include <net/tcp.h>
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+int count_tcp(struct pt_regs *ctx, struct sk_buff *skb) {
+    return _(TCP_SKB_CB(skb)->tcp_gso_size);
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("count_tcp", BPF.KPROBE)
+
+    def test_probe_read4(self):
+        text = """
+#define KBUILD_MODNAME "foo"
+#include <net/tcp.h>
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+int test(struct pt_regs *ctx, struct sk_buff *skb) {
+    return _(TCP_SKB_CB(skb)->tcp_gso_size) + skb->protocol;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_whitelist1(self):
+        text = """
+#define KBUILD_MODNAME "foo"
+#include <net/tcp.h>
+int count_tcp(struct pt_regs *ctx, struct sk_buff *skb) {
+    // The below define is in net/tcp.h:
+    //    #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
+    // Note that it has AddrOf in the macro, which will cause current rewriter
+    // failing below statement
+    // return TCP_SKB_CB(skb)->tcp_gso_size;
+    u16 val = 0;
+    bpf_probe_read(&val, sizeof(val), &(TCP_SKB_CB(skb)->tcp_gso_size));
+    return val;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("count_tcp", BPF.KPROBE)
+
+    def test_probe_read_whitelist2(self):
+        text = """
+#define KBUILD_MODNAME "foo"
+#include <net/tcp.h>
+int count_tcp(struct pt_regs *ctx, struct sk_buff *skb) {
+    // The below define is in net/tcp.h:
+    //    #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
+    // Note that it has AddrOf in the macro, which will cause current rewriter
+    // failing below statement
+    // return TCP_SKB_CB(skb)->tcp_gso_size;
+    u16 val = 0;
+    bpf_probe_read(&val, sizeof(val), &(TCP_SKB_CB(skb)->tcp_gso_size));
+    return val + skb->protocol;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("count_tcp", BPF.KPROBE)
+
+    def test_probe_read_keys(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+BPF_HASH(start, struct request *);
+int do_request(struct pt_regs *ctx, struct request *req) {
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&req, &ts);
+    return 0;
+}
+
+int do_completion(struct pt_regs *ctx, struct request *req) {
+    u64 *tsp = start.lookup(&req);
+    if (tsp != 0) {
+        start.delete(&req);
+    }
+    return 0;
+}
+"""
+        b = BPF(text=text, debug=0)
+        fns = b.load_funcs(BPF.KPROBE)
+
+    def test_sscanf(self):
+        text = """
+BPF_HASH(stats, int, struct { u64 a; u64 b; u64 c:36; u64 d:28; struct { u32 a; u32 b; } s; }, 10);
+int foo(void *ctx) {
+    return 0;
+}
+"""
+        b = BPF(text=text, debug=0)
+        fn = b.load_func("foo", BPF.KPROBE)
+        t = b.get_table("stats")
+        s1 = t.key_sprintf(t.Key(2))
+        self.assertEqual(s1, b"0x2")
+        s2 = t.leaf_sprintf(t.Leaf(2, 3, 4, 1, (5, 6)))
+        l = t.leaf_scanf(s2)
+        self.assertEqual(l.a, 2)
+        self.assertEqual(l.b, 3)
+        self.assertEqual(l.c, 4)
+        self.assertEqual(l.d, 1)
+        self.assertEqual(l.s.a, 5)
+        self.assertEqual(l.s.b, 6)
+
+    def test_sscanf_array(self):
+        text = """
+BPF_HASH(stats, int, struct { u32 a[3]; u32 b; }, 10);
+"""
+        b = BPF(text=text, debug=0)
+        t = b.get_table("stats")
+        s1 = t.key_sprintf(t.Key(2))
+        self.assertEqual(s1, b"0x2")
+        s2 = t.leaf_sprintf(t.Leaf((ct.c_uint * 3)(1,2,3), 4))
+        self.assertEqual(s2, b"{ [ 0x1 0x2 0x3 ] 0x4 }")
+        l = t.leaf_scanf(s2)
+        self.assertEqual(l.a[0], 1)
+        self.assertEqual(l.a[1], 2)
+        self.assertEqual(l.a[2], 3)
+        self.assertEqual(l.b, 4)
+
+    def test_sscanf_string(self):
+        text = """
+struct Symbol {
+    char name[128];
+    char path[128];
+};
+struct Event {
+    uint32_t pid;
+    uint32_t tid;
+    struct Symbol stack[64];
+};
+BPF_TABLE("array", int, struct Event, comms, 1);
+"""
+        b = BPF(text=text)
+        t = b.get_table("comms")
+        s1 = t.leaf_sprintf(t[0])
+        fill = b' { "" "" }' * 63
+        self.assertEqual(s1, b'{ 0x0 0x0 [ { "" "" }%s ] }' % fill)
+        l = t.Leaf(1, 2)
+        name = b"libxyz"
+        path = b"/usr/lib/libxyz.so"
+        l.stack[0].name = name
+        l.stack[0].path = path
+        s2 = t.leaf_sprintf(l)
+        self.assertEqual(s2,
+                b'{ 0x1 0x2 [ { "%s" "%s" }%s ] }' % (name, path, fill))
+        l = t.leaf_scanf(s2)
+        self.assertEqual(l.pid, 1)
+        self.assertEqual(l.tid, 2)
+        self.assertEqual(l.stack[0].name, name)
+        self.assertEqual(l.stack[0].path, path)
+
+    def test_iosnoop(self):
+        text = """
+#include <linux/blkdev.h>
+#include <uapi/linux/ptrace.h>
+
+struct key_t {
+    struct request *req;
+};
+
+BPF_HASH(start, struct key_t, u64, 1024);
+int do_request(struct pt_regs *ctx, struct request *req) {
+    struct key_t key = {};
+
+    bpf_trace_printk("traced start %d\\n", req->__data_len);
+
+    return 0;
+}
+"""
+        b = BPF(text=text, debug=0)
+        fn = b.load_func("do_request", BPF.KPROBE)
+
+    def test_blk_start_request(self):
+        text = """
+#include <linux/blkdev.h>
+#include <uapi/linux/ptrace.h>
+int do_request(struct pt_regs *ctx, int req) {
+    bpf_trace_printk("req ptr: 0x%x\\n", req);
+    return 0;
+}
+"""
+        b = BPF(text=text, debug=0)
+        fn = b.load_func("do_request", BPF.KPROBE)
+
+    def test_bpf_hash(self):
+        text = """
+BPF_HASH(table1);
+BPF_HASH(table2, u32);
+BPF_HASH(table3, u32, int);
+"""
+        b = BPF(text=text, debug=0)
+
+    def test_consecutive_probe_read(self):
+        text = """
+#include <linux/fs.h>
+#include <linux/mount.h>
+BPF_HASH(table1, struct super_block *);
+int trace_entry(struct pt_regs *ctx, struct file *file) {
+    if (!file) return 0;
+    struct vfsmount *mnt = file->f_path.mnt;
+    if (mnt) {
+        struct super_block *k = mnt->mnt_sb;
+        u64 zero = 0;
+        table1.update(&k, &zero);
+        k = mnt->mnt_sb;
+        table1.update(&k, &zero);
+    }
+
+    return 0;
+}
+"""
+        b = BPF(text=text, debug=0)
+        fn = b.load_func("trace_entry", BPF.KPROBE)
+
+    def test_nested_probe_read(self):
+        text = """
+#include <linux/fs.h>
+int trace_entry(struct pt_regs *ctx, struct file *file) {
+    if (!file) return 0;
+    const char *name = file->f_path.dentry->d_name.name;
+    bpf_trace_printk("%s\\n", name);
+    return 0;
+}
+"""
+        b = BPF(text=text, debug=0)
+        fn = b.load_func("trace_entry", BPF.KPROBE)
+
+    def test_nested_probe_read_deref(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+struct sock {
+    u32 *sk_daddr;
+};
+int test(struct pt_regs *ctx, struct sock *skp) {
+    return *(skp->sk_daddr);
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_char_array_probe(self):
+        BPF(text="""#include <linux/blkdev.h>
+int kprobe__blk_update_request(struct pt_regs *ctx, struct request *req) {
+    bpf_trace_printk("%s\\n", req->rq_disk->disk_name);
+    return 0;
+}""")
+
+    def test_probe_read_helper(self):
+        b = BPF(text="""
+#include <linux/fs.h>
+static void print_file_name(struct file *file) {
+    if (!file) return;
+    const char *name = file->f_path.dentry->d_name.name;
+    bpf_trace_printk("%s\\n", name);
+}
+static void print_file_name2(int unused, struct file *file) {
+    print_file_name(file);
+}
+int trace_entry1(struct pt_regs *ctx, struct file *file) {
+    print_file_name(file);
+    return 0;
+}
+int trace_entry2(struct pt_regs *ctx, int unused, struct file *file) {
+    print_file_name2(unused, file);
+    return 0;
+}
+""")
+        fn = b.load_func("trace_entry1", BPF.KPROBE)
+        fn = b.load_func("trace_entry2", BPF.KPROBE)
+
+    def test_probe_unnamed_union_deref(self):
+        text = """
+#include <linux/mm_types.h>
+int trace(struct pt_regs *ctx, struct page *page) {
+    void *p = page->mapping;
+    return p != NULL;
+}
+"""
+        # depending on llvm, compile may pass/fail, but at least shouldn't crash
+        try:
+            b = BPF(text=text)
+        except:
+            pass
+
+    def test_probe_struct_assign(self):
+        b = BPF(text = """
+#include <uapi/linux/ptrace.h>
+struct args_t {
+    const char *filename;
+    int flags;
+    int mode;
+};
+int do_sys_open(struct pt_regs *ctx, const char *filename,
+        int flags, int mode) {
+    struct args_t args = {};
+    args.filename = filename;
+    args.flags = flags;
+    args.mode = mode;
+    bpf_trace_printk("%s\\n", args.filename);
+    return 0;
+};
+""")
+        b.attach_kprobe(event=b.get_syscall_fnname("open"),
+                        fn_name="do_sys_open")
+
+    def test_task_switch(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+struct key_t {
+  u32 prev_pid;
+  u32 curr_pid;
+};
+BPF_HASH(stats, struct key_t, u64, 1024);
+int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev) {
+  struct key_t key = {};
+  u64 zero = 0, *val;
+  key.curr_pid = bpf_get_current_pid_tgid();
+  key.prev_pid = prev->pid;
+
+  val = stats.lookup_or_init(&key, &zero);
+  (*val)++;
+  return 0;
+}
+""")
+
+    def test_probe_simple_assign(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/gfp.h>
+struct leaf { size_t size; };
+BPF_HASH(simple_map, u32, struct leaf);
+int kprobe____kmalloc(struct pt_regs *ctx, size_t size) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct leaf* leaf = simple_map.lookup(&pid);
+    if (leaf)
+        leaf->size += size;
+    return 0;
+}""")
+
+    def test_probe_simple_member_assign(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/netdevice.h>
+struct leaf { void *ptr; };
+int test(struct pt_regs *ctx, struct sk_buff *skb) {
+    struct leaf l = {};
+    struct leaf *lp = &l;
+    lp->ptr = skb;
+    return 0;
+}""")
+        b.load_func("test", BPF.KPROBE)
+
+    def test_probe_member_expr_deref(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/netdevice.h>
+struct leaf { struct sk_buff *ptr; };
+int test(struct pt_regs *ctx, struct sk_buff *skb) {
+    struct leaf l = {};
+    struct leaf *lp = &l;
+    lp->ptr = skb;
+    return lp->ptr->priority;
+}""")
+        b.load_func("test", BPF.KPROBE)
+
+    def test_probe_member_expr(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/netdevice.h>
+struct leaf { struct sk_buff *ptr; };
+int test(struct pt_regs *ctx, struct sk_buff *skb) {
+    struct leaf l = {};
+    struct leaf *lp = &l;
+    lp->ptr = skb;
+    return l.ptr->priority;
+}""")
+        b.load_func("test", BPF.KPROBE)
+
+    def test_unop_probe_read(self):
+        text = """
+#include <linux/blkdev.h>
+int trace_entry(struct pt_regs *ctx, struct request *req) {
+    if (!(req->bio->bi_flags & 1))
+        return 1;
+    if (((req->bio->bi_flags)))
+        return 1;
+    return 0;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("trace_entry", BPF.KPROBE)
+
+    def test_probe_read_nested_deref(self):
+        text = """
+#include <net/inet_sock.h>
+int test(struct pt_regs *ctx, struct sock *sk) {
+    struct sock *ptr1;
+    struct sock **ptr2 = &ptr1;
+    *ptr2 = sk;
+    return ((struct sock *)(*ptr2))->sk_daddr;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_nested_deref2(self):
+        text = """
+#include <net/inet_sock.h>
+int test(struct pt_regs *ctx, struct sock *sk) {
+    struct sock *ptr1;
+    struct sock **ptr2 = &ptr1;
+    struct sock ***ptr3 = &ptr2;
+    *ptr2 = sk;
+    *ptr3 = ptr2;
+    return ((struct sock *)(**ptr3))->sk_daddr;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_nested_deref_func(self):
+        text = """
+#include <net/inet_sock.h>
+static int subtest(struct sock ***skp) {
+    return ((struct sock *)(**skp))->sk_daddr;
+}
+int test(struct pt_regs *ctx, struct sock *sk) {
+    struct sock *ptr1;
+    struct sock **ptr2 = &ptr1;
+    struct sock ***ptr3 = &ptr2;
+    *ptr2 = sk;
+    *ptr3 = ptr2;
+    return subtest(ptr3);
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_nested_member1(self):
+        text = """
+#include <net/inet_sock.h>
+int test(struct pt_regs *ctx, struct sock *skp) {
+    u32 *daddr = &skp->sk_daddr;
+    return *daddr;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_nested_member2(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+struct sock {
+    u32 **sk_daddr;
+};
+int test(struct pt_regs *ctx, struct sock *skp) {
+    u32 *daddr = *(skp->sk_daddr);
+    return *daddr;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_nested_member3(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+struct sock {
+    u32 *sk_daddr;
+};
+int test(struct pt_regs *ctx, struct sock *skp) {
+    return *(&skp->sk_daddr);
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_paren_probe_read(self):
+        text = """
+#include <net/inet_sock.h>
+int trace_entry(struct pt_regs *ctx, struct sock *sk) {
+    u16 sport = ((struct inet_sock *)sk)->inet_sport;
+    return sport;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("trace_entry", BPF.KPROBE)
+
+    def test_complex_leaf_types(self):
+        text = """
+struct list;
+struct list {
+  struct list *selfp;
+  struct list *another_selfp;
+  struct list *selfp_array[2];
+};
+struct empty {
+};
+union emptyu {
+  struct empty *em1;
+  struct empty em2;
+  struct empty em3;
+  struct empty em4;
+};
+BPF_ARRAY(t1, struct list, 1);
+BPF_ARRAY(t2, struct list *, 1);
+BPF_ARRAY(t3, union emptyu, 1);
+"""
+        b = BPF(text=text)
+        self.assertEqual(ct.sizeof(b["t3"].Leaf), 8)
+
+    def test_cflags(self):
+        text = """
+#ifndef MYFLAG
+#error "MYFLAG not set as expected"
+#endif
+"""
+        b = BPF(text=text, cflags=["-DMYFLAG"])
+
+    def test_exported_maps(self):
+        b1 = BPF(text="""BPF_TABLE_PUBLIC("hash", int, int, table1, 10);""")
+        b2 = BPF(text="""BPF_TABLE("extern", int, int, table1, 10);""")
+        t = b2["table1"]
+
+    def test_syntax_error(self):
+        with self.assertRaises(Exception):
+            b = BPF(text="""int failure(void *ctx) { if (); return 0; }""")
+
+    def test_nested_union(self):
+        text = """
+BPF_HASH(t1, struct bpf_tunnel_key, int, 1);
+"""
+        b = BPF(text=text)
+        t1 = b["t1"]
+        print(t1.Key().remote_ipv4)
+
+    def test_too_many_args(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+int many(struct pt_regs *ctx, int a, int b, int c, int d, int e, int f, int g) {
+    return 0;
+}
+"""
+        with self.assertRaises(Exception):
+            b = BPF(text=text)
+
+    def test_call_macro_arg(self):
+        text = """
+BPF_PROG_ARRAY(jmp, 32);
+
+#define JMP_IDX_PIPE (1U << 1)
+
+enum action {
+    ACTION_PASS
+};
+
+int process(struct xdp_md *ctx) {
+    jmp.call((void *)ctx, ACTION_PASS);
+    jmp.call((void *)ctx, JMP_IDX_PIPE);
+    return XDP_PASS;
+}
+        """
+        b = BPF(text=text)
+        t = b["jmp"]
+        self.assertEqual(len(t), 32);
+
+    def test_update_macro_arg(self):
+        text = """
+BPF_ARRAY(act, u32, 32);
+
+#define JMP_IDX_PIPE (1U << 1)
+
+enum action {
+    ACTION_PASS
+};
+
+int process(struct xdp_md *ctx) {
+    act.increment(ACTION_PASS);
+    act.increment(JMP_IDX_PIPE);
+    return XDP_PASS;
+}
+        """
+        b = BPF(text=text)
+        t = b["act"]
+        self.assertEqual(len(t), 32);
+
+    def test_ext_ptr_maps1(self):
+        bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(currsock, u32, struct sock *);
+
+int trace_entry(struct pt_regs *ctx, struct sock *sk,
+    struct sockaddr *uaddr, int addr_len) {
+    u32 pid = bpf_get_current_pid_tgid();
+    currsock.update(&pid, &sk);
+    return 0;
+};
+
+int trace_exit(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct sock **skpp;
+    skpp = currsock.lookup(&pid);
+    if (skpp) {
+        struct sock *skp = *skpp;
+        return skp->__sk_common.skc_dport;
+    }
+    return 0;
+}
+        """
+        b = BPF(text=bpf_text)
+        b.load_func("trace_entry", BPF.KPROBE)
+        b.load_func("trace_exit", BPF.KPROBE)
+
+    def test_ext_ptr_maps2(self):
+        bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(currsock, u32, struct sock *);
+
+int trace_entry(struct pt_regs *ctx, struct sock *sk,
+    struct sockaddr *uaddr, int addr_len) {
+    u32 pid = bpf_get_current_pid_tgid();
+    currsock.update(&pid, &sk);
+    return 0;
+};
+
+int trace_exit(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct sock **skpp = currsock.lookup(&pid);
+    if (skpp) {
+        struct sock *skp = *skpp;
+        return skp->__sk_common.skc_dport;
+    }
+    return 0;
+}
+        """
+        b = BPF(text=bpf_text)
+        b.load_func("trace_entry", BPF.KPROBE)
+        b.load_func("trace_exit", BPF.KPROBE)
+
+    def test_ext_ptr_maps_reverse(self):
+        bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(currsock, u32, struct sock *);
+
+int trace_exit(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct sock **skpp;
+    skpp = currsock.lookup(&pid);
+    if (skpp) {
+        struct sock *skp = *skpp;
+        return skp->__sk_common.skc_dport;
+    }
+    return 0;
+}
+
+int trace_entry(struct pt_regs *ctx, struct sock *sk) {
+    u32 pid = bpf_get_current_pid_tgid();
+    currsock.update(&pid, &sk);
+    return 0;
+};
+        """
+        b = BPF(text=bpf_text)
+        b.load_func("trace_entry", BPF.KPROBE)
+        b.load_func("trace_exit", BPF.KPROBE)
+
+    def test_ext_ptr_maps_indirect(self):
+        bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(currsock, u32, struct sock *);
+
+int trace_entry(struct pt_regs *ctx, struct sock *sk) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct sock **skp = &sk;
+    currsock.update(&pid, skp);
+    return 0;
+};
+
+int trace_exit(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct sock **skpp;
+    skpp = currsock.lookup(&pid);
+    if (skpp) {
+        struct sock *skp = *skpp;
+        return skp->__sk_common.skc_dport;
+    }
+    return 0;
+}
+        """
+        b = BPF(text=bpf_text)
+        b.load_func("trace_entry", BPF.KPROBE)
+        b.load_func("trace_exit", BPF.KPROBE)
+
+    def test_bpf_dins_pkt_rewrite(self):
+        text = """
+#include <bcc/proto.h>
+int dns_test(struct __sk_buff *skb) {
+    u8 *cursor = 0;
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+    if(ethernet->type == ETH_P_IP) {
+        struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+        ip->src = ip->dst;
+        return 0;
+    }
+    return -1;
+}
+        """
+        b = BPF(text=text)
+
+    @skipUnless(kernel_version_ge(4,8), "requires kernel >= 4.8")
+    def test_ext_ptr_from_helper(self):
+        text = """
+#include <linux/sched.h>
+int test(struct pt_regs *ctx) {
+    struct task_struct *task = (struct task_struct *)bpf_get_current_task();
+    return task->prio;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_unary_operator(self):
+        text = """
+#include <linux/fs.h>
+#include <uapi/linux/ptrace.h>
+int trace_read_entry(struct pt_regs *ctx, struct file *file) {
+    return !file->f_op->read_iter;
+}
+        """
+        b = BPF(text=text)
+        b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry")
+
+    def test_printk_f(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+int trace_entry(struct pt_regs *ctx) {
+  bpf_trace_printk("%0.2f\\n", 1);
+  return 0;
+}
+"""
+        r, w = os.pipe()
+        with redirect_stderr(to=w):
+            BPF(text=text)
+        r = os.fdopen(r)
+        output = r.read()
+        expectedWarn = "warning: only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed"
+        self.assertIn(expectedWarn, output)
+        r.close()
+
+    def test_printk_lf(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+int trace_entry(struct pt_regs *ctx) {
+  bpf_trace_printk("%lf\\n", 1);
+  return 0;
+}
+"""
+        r, w = os.pipe()
+        with redirect_stderr(to=w):
+            BPF(text=text)
+        r = os.fdopen(r)
+        output = r.read()
+        expectedWarn = "warning: only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed"
+        self.assertIn(expectedWarn, output)
+        r.close()
+
+    def test_printk_2s(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+int trace_entry(struct pt_regs *ctx) {
+  char s1[] = "hello", s2[] = "world";
+  bpf_trace_printk("%s %s\\n", s1, s2);
+  return 0;
+}
+"""
+        r, w = os.pipe()
+        with redirect_stderr(to=w):
+            BPF(text=text)
+        r = os.fdopen(r)
+        output = r.read()
+        expectedWarn = "warning: cannot use several %s conversion specifiers"
+        self.assertIn(expectedWarn, output)
+        r.close()
+
+    def test_map_insert(self):
+        text = """
+BPF_HASH(dummy);
+void do_trace(struct pt_regs *ctx) {
+    u64 key = 0, val = 2;
+    dummy.insert(&key, &val);
+    key = 1;
+    dummy.update(&key, &val);
+}
+"""
+        b = BPF(text=text)
+        c_val = ct.c_ulong(1)
+        b["dummy"][ct.c_ulong(0)] = c_val
+        b["dummy"][ct.c_ulong(1)] = c_val
+        b.attach_kprobe(event=b.get_syscall_fnname("sync"), fn_name="do_trace")
+        libc = ct.CDLL("libc.so.6")
+        libc.sync()
+        self.assertEqual(1, b["dummy"][ct.c_ulong(0)].value)
+        self.assertEqual(2, b["dummy"][ct.c_ulong(1)].value)
+
+    def test_prog_array_delete(self):
+        text = """
+BPF_PROG_ARRAY(dummy, 256);
+"""
+        b1 = BPF(text=text)
+        text = """
+int do_next(struct pt_regs *ctx) {
+    return 0;
+}
+"""
+        b2 = BPF(text=text)
+        fn = b2.load_func("do_next", BPF.KPROBE)
+        c_key = ct.c_int(0)
+        b1["dummy"][c_key] = ct.c_int(fn.fd)
+        b1["dummy"].__delitem__(c_key);
+        with self.assertRaises(KeyError):
+            b1["dummy"][c_key]
+
+    def test_invalid_noninline_call(self):
+        text = """
+int bar(void) {
+    return 0;
+}
+int foo(struct pt_regs *ctx) {
+    return bar();
+}
+"""
+        with self.assertRaises(Exception):
+            b = BPF(text=text)
+
+    def test_incomplete_type(self):
+        text = """
+BPF_HASH(drops, struct key_t);
+struct key_t {
+    u64 location;
+};
+"""
+        with self.assertRaises(Exception):
+            b = BPF(text=text)
+
+    def test_enumerations(self):
+        text = """
+enum b {
+    CHOICE_A,
+};
+struct a {
+    enum b test;
+};
+BPF_HASH(drops, struct a);
+        """
+        b = BPF(text=text)
+
+    def test_int128_types(self):
+        text = """
+BPF_HASH(table1, unsigned __int128, __int128);
+"""
+        b = BPF(text=text)
+        table = b['table1']
+        self.assertEqual(ct.sizeof(table.Key), 16)
+        self.assertEqual(ct.sizeof(table.Leaf), 16)
+        table[
+            table.Key.from_buffer_copy(
+                socket.inet_pton(socket.AF_INET6, "2001:db8::"))
+        ] = table.Leaf.from_buffer_copy(struct.pack('LL', 42, 123456789))
+        for k, v in table.items():
+            self.assertEqual(v[0], 42)
+            self.assertEqual(v[1], 123456789)
+            self.assertEqual(socket.inet_ntop(socket.AF_INET6,
+                                              struct.pack('LL', k[0], k[1])),
+                             "2001:db8::")
+
+    def test_padding_types(self):
+        text = """
+struct key_t {
+  u32 f1_1;               /* offset 0 */
+  struct {
+    char f2_1;            /* offset 16 */
+    __int128 f2_2;        /* offset 32 */
+  };
+  u8 f1_3;                /* offset 48 */
+  unsigned __int128 f1_4; /* offset 64 */
+  char f1_5;              /* offset 80 */
+};
+struct value_t {
+  u8 src[4] __attribute__ ((aligned (8))); /* offset 0 */
+  u8 dst[4] __attribute__ ((aligned (8))); /* offset 8 */
+};
+BPF_HASH(table1, struct key_t, struct value_t);
+"""
+        b = BPF(text=text)
+        table = b['table1']
+        self.assertEqual(ct.sizeof(table.Key), 96)
+        self.assertEqual(ct.sizeof(table.Leaf), 16)
+
+    @skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
+    def test_probe_read_tracepoint_context(self):
+        text = """
+#include <linux/netdevice.h>
+TRACEPOINT_PROBE(skb, kfree_skb) {
+    struct sk_buff *skb = (struct sk_buff *)args->skbaddr;
+    return skb->protocol;
+}
+"""
+        b = BPF(text=text)
+
+    def test_probe_read_kprobe_ctx(self):
+        text = """
+#include <linux/sched.h>
+#include <net/inet_sock.h>
+int test(struct pt_regs *ctx) {
+    struct sock *sk;
+    sk = (struct sock *)PT_REGS_PARM1(ctx);
+    return sk->sk_dport;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_ctx_array(self):
+        text = """
+#include <linux/sched.h>
+#include <net/inet_sock.h>
+int test(struct pt_regs *ctx) {
+    struct sock *newsk = (struct sock *)PT_REGS_RC(ctx);
+    return newsk->__sk_common.skc_rcv_saddr;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    @skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
+    def test_probe_read_tc_ctx(self):
+        text = """
+#include <uapi/linux/pkt_cls.h>
+#include <linux/if_ether.h>
+int test(struct __sk_buff *ctx) {
+    void* data_end = (void*)(long)ctx->data_end;
+    void* data = (void*)(long)ctx->data;
+    if (data + sizeof(struct ethhdr) > data_end)
+        return TC_ACT_SHOT;
+    struct ethhdr *eh = (struct ethhdr *)data;
+    if (eh->h_proto == 0x1)
+        return TC_ACT_SHOT;
+    return TC_ACT_OK;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.SCHED_CLS)
+
+    def test_probe_read_return(self):
+        text = """
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/ptrace.h>
+#include <linux/tcp.h>
+static inline unsigned char *my_skb_transport_header(struct sk_buff *skb) {
+    return skb->head + skb->transport_header;
+}
+int test(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) {
+    struct tcphdr *th = (struct tcphdr *)my_skb_transport_header(skb);
+    return th->seq;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_multiple_return(self):
+        text = """
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/ptrace.h>
+#include <linux/tcp.h>
+static inline u64 error_function() {
+    return 0;
+}
+static inline unsigned char *my_skb_transport_header(struct sk_buff *skb) {
+    if (skb)
+        return skb->head + skb->transport_header;
+    return (unsigned char *)error_function();
+}
+int test(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) {
+    struct tcphdr *th = (struct tcphdr *)my_skb_transport_header(skb);
+    return th->seq;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_return_expr(self):
+        text = """
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/ptrace.h>
+#include <linux/tcp.h>
+static inline unsigned char *my_skb_transport_header(struct sk_buff *skb) {
+    return skb->head + skb->transport_header;
+}
+int test(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) {
+    u32 *seq = (u32 *)my_skb_transport_header(skb) + offsetof(struct tcphdr, seq);
+    return *seq;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_return_call(self):
+        text = """
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/ptrace.h>
+#include <linux/tcp.h>
+static inline struct tcphdr *my_skb_transport_header(struct sk_buff *skb) {
+    return (struct tcphdr *)skb->head + skb->transport_header;
+}
+int test(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) {
+    return my_skb_transport_header(skb)->seq;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_no_probe_read_addrof(self):
+        text = """
+#include <linux/sched.h>
+#include <net/inet_sock.h>
+static inline int test_help(__be16 *addr) {
+    __be16 val = 0;
+    bpf_probe_read(&val, sizeof(val), addr);
+    return val;
+}
+int test(struct pt_regs *ctx) {
+    struct sock *sk;
+    sk = (struct sock *)PT_REGS_PARM1(ctx);
+    return test_help(&sk->sk_dport);
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_array_accesses1(self):
+        text = """
+#include <linux/ptrace.h>
+#include <linux/dcache.h>
+int test(struct pt_regs *ctx, const struct qstr *name) {
+    return name->name[1];
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_array_accesses2(self):
+        text = """
+#include <linux/ptrace.h>
+#include <linux/dcache.h>
+int test(struct pt_regs *ctx, const struct qstr *name) {
+    return name->name  [ 1];
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_array_accesses3(self):
+        text = """
+#include <linux/ptrace.h>
+#include <linux/dcache.h>
+int test(struct pt_regs *ctx, const struct qstr *name) {
+    return (name->name)[1];
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_array_accesses4(self):
+        text = """
+#include <linux/ptrace.h>
+int test(struct pt_regs *ctx, char *name) {
+    return name[1];
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_array_accesses5(self):
+        text = """
+#include <linux/ptrace.h>
+int test(struct pt_regs *ctx, char **name) {
+    return (*name)[1];
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_array_accesses6(self):
+        text = """
+#include <linux/ptrace.h>
+struct test_t {
+    int tab[5];
+};
+int test(struct pt_regs *ctx, struct test_t *t) {
+    return *(&t->tab[1]);
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_array_accesses7(self):
+        text = """
+#include <net/inet_sock.h>
+int test(struct pt_regs *ctx, struct sock *sk) {
+    return sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[0];
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_probe_read_array_accesses8(self):
+        text = """
+#include <linux/mm_types.h>
+int test(struct pt_regs *ctx, struct mm_struct *mm) {
+    return mm->rss_stat.count[MM_ANONPAGES].counter;
+}
+"""
+        b = BPF(text=text)
+        fn = b.load_func("test", BPF.KPROBE)
+
+    def test_arbitrary_increment_simple(self):
+        b = BPF(text=b"""
+#include <uapi/linux/ptrace.h>
+struct bpf_map;
+BPF_HASH(map);
+int map_delete(struct pt_regs *ctx, struct bpf_map *bpfmap, u64 *k) {
+    map.increment(42, 10);
+    return 0;
+}
+""")
+        b.attach_kprobe(event=b"htab_map_delete_elem", fn_name=b"map_delete")
+        b.cleanup()
+
+    @skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
+    def test_packed_structure(self):
+        b = BPF(text=b"""
+struct test {
+    u16 a;
+    u32 b;
+} __packed;
+BPF_TABLE("hash", u32, struct test, testing, 2);
+TRACEPOINT_PROBE(kmem, kmalloc) {
+    u32 key = 0;
+    struct test info, *entry;
+    entry = testing.lookup(&key);
+    if (entry == NULL) {
+        info.a = 10;
+        info.b = 20;
+        testing.update(&key, &info);
+    }
+    return 0;
+}
+""")
+        if len(b["testing"].items()):
+            st = b["testing"][ct.c_uint(0)]
+            self.assertEqual(st.a, 10)
+            self.assertEqual(st.b, 20)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_clang_complex.c b/tests/python/test_clang_complex.c
new file mode 100644
index 0000000..8cb9d7c
--- /dev/null
+++ b/tests/python/test_clang_complex.c
@@ -0,0 +1,175 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+#include <bcc/proto.h>
+
+// hash
+struct FwdKey {
+  u32 dip:32;
+};
+struct FwdLeaf {
+  u32 fwd_idx:32;
+};
+BPF_HASH(fwd_map, struct FwdKey, struct FwdLeaf, 1);
+
+// array
+struct ConfigKey {
+  u32 index;
+};
+struct ConfigLeaf {
+  u32 bpfdev_ip;
+  u32 slave_ip;
+};
+BPF_TABLE("array", struct ConfigKey, struct ConfigLeaf, config_map, 1);
+
+// hash
+struct MacaddrKey {
+  u32 ip;
+};
+struct MacaddrLeaf {
+  u64 mac;
+};
+BPF_HASH(macaddr_map, struct MacaddrKey, struct MacaddrLeaf, 11);
+
+// hash
+struct SlaveKey {
+  u32 slave_ip;
+};
+struct SlaveLeaf {
+  u32 slave_ifindex;
+};
+BPF_HASH(slave_map, struct SlaveKey, struct SlaveLeaf, 10);
+
+int handle_packet(struct __sk_buff *skb) {
+  int ret = 0;
+  u8 *cursor = 0;
+
+  if (skb->pkt_type == 0) {
+    // tx
+    // make sure configured
+    u32 slave_ip;
+
+    struct ConfigKey cfg_key = {.index = 0};
+    struct ConfigLeaf *cfg_leaf = config_map.lookup(&cfg_key);
+    if (cfg_leaf) {
+      slave_ip = cfg_leaf->slave_ip;
+    } else {
+      return 0xffffffff;
+    }
+
+    // make sure slave configured
+    // tx, default to the single slave
+    struct SlaveKey slave_key = {.slave_ip = slave_ip};
+    struct SlaveLeaf *slave_leaf = slave_map.lookup(&slave_key);
+    if (slave_leaf) {
+      ret = slave_leaf->slave_ifindex;
+    } else {
+      return 0xffffffff;
+    }
+  } else {
+    // rx, default to stack
+    ret = 0;
+  }
+
+  struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+  switch (ethernet->type) {
+    case ETH_P_IP: goto ip;
+    case ETH_P_ARP: goto arp;
+    case ETH_P_8021Q: goto dot1q;
+    default: goto EOP;
+  }
+
+  dot1q: {
+    struct dot1q_t *dot1q = cursor_advance(cursor, sizeof(*dot1q));
+    switch (dot1q->type) {
+      case ETH_P_IP: goto ip;
+      case ETH_P_ARP: goto arp;
+      default: goto EOP;
+    }
+  }
+
+  arp: {
+    struct arp_t *arp = cursor_advance(cursor, sizeof(*arp));
+    if (skb->pkt_type) {
+      if (arp->oper == 1) {
+        struct MacaddrKey mac_key = {.ip=arp->spa};
+        struct MacaddrLeaf mac_leaf = {.mac=arp->sha};
+        macaddr_map.update(&mac_key, &mac_leaf);
+      }
+    }
+    goto EOP;
+  }
+
+  struct ip_t *ip;
+  ip: {
+    ip = cursor_advance(cursor, sizeof(*ip));
+    switch (ip->nextp) {
+      case 6: goto tcp;
+      case 17: goto udp;
+      default: goto EOP;
+    }
+  }
+  tcp: {
+    struct tcp_t *tcp = cursor_advance(cursor, sizeof(*tcp));
+    goto EOP;
+  }
+  udp: {
+    struct udp_t *udp = cursor_advance(cursor, sizeof(*udp));
+    if (udp->dport != 5000) {
+       goto EOP;
+    }
+    if (skb->pkt_type) {
+      // lookup and then forward
+      struct FwdKey fwd_key = {.dip=ip->dst};
+      struct FwdLeaf *fwd_val = fwd_map.lookup(&fwd_key);
+      if (fwd_val) {
+         return fwd_val->fwd_idx;
+      }
+    } else {
+      // rewrite the packet and send to a pre-configured index if needed
+      u32 new_ip;
+      u32 old_ip;
+      u64 src_mac;
+      u64 dst_mac;
+
+      struct ConfigKey cfg_key = {.index = 0};
+      struct ConfigLeaf *cfg_leaf = config_map.lookup(&cfg_key);
+      if (cfg_leaf) {
+        struct MacaddrKey mac_key = {.ip = cfg_leaf->bpfdev_ip};
+        struct MacaddrLeaf *mac_leaf;
+
+        mac_key.ip = cfg_leaf->bpfdev_ip;
+        mac_leaf = macaddr_map.lookup(&mac_key);
+        if (mac_leaf) {
+          src_mac = mac_leaf->mac;
+        } else {
+          goto EOP;
+        }
+
+        mac_key.ip = cfg_leaf->slave_ip;
+        mac_leaf = macaddr_map.lookup(&mac_key);
+        if (mac_leaf) {
+          dst_mac = mac_leaf->mac;
+        } else {
+          goto EOP;
+        }
+
+        // rewrite ethernet header
+        ethernet->dst = dst_mac;
+        ethernet->src = src_mac;
+
+        // ip & udp checksum
+        incr_cksum_l4(&udp->crc, ip->src, cfg_leaf->bpfdev_ip, 1);
+        incr_cksum_l4(&udp->crc, ip->dst, cfg_leaf->slave_ip, 1);
+
+        // rewrite ip src/dst fields
+        ip->src = cfg_leaf->bpfdev_ip;
+        ip->dst = cfg_leaf->slave_ip;
+      }
+    }
+    goto EOP;
+  }
+
+EOP:
+  return ret;
+}
diff --git a/tests/python/test_debuginfo.py b/tests/python/test_debuginfo.py
new file mode 100755
index 0000000..ba4bdd6
--- /dev/null
+++ b/tests/python/test_debuginfo.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python
+# Copyright (c) Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import os
+import subprocess
+from bcc import SymbolCache, BPF
+from unittest import main, TestCase
+
+class TestKSyms(TestCase):
+    def grab_sym(self):
+        address = ""
+        aliases = []
+
+        # Grab the first symbol in kallsyms that has type 't' or 'T'.
+        # Also, find all aliases of this symbol which are identifiable
+        # by the same address.
+        with open("/proc/kallsyms", "rb") as f:
+            for line in f:
+
+                # Extract the first 3 columns only. The 4th column
+                # containing the module name may not exist for all
+                # symbols.
+                (addr, t, name) = line.strip().split()[:3]
+                if t == b"t" or t == b"T":
+                    if not address:
+                        address = addr
+                    if addr == address:
+                        aliases.append(name)
+
+        # Return all aliases of the first symbol.
+        return (address, aliases)
+
+    def test_ksymname(self):
+        sym = BPF.ksymname(b"__kmalloc")
+        self.assertIsNotNone(sym)
+        self.assertNotEqual(sym, 0)
+
+    def test_ksym(self):
+        (addr, aliases) = self.grab_sym()
+        sym = BPF.ksym(int(addr, 16))
+        found = sym in aliases
+        self.assertTrue(found)
+
+class Harness(TestCase):
+    def setUp(self):
+        self.build_command()
+        subprocess.check_output('objcopy --only-keep-debug dummy dummy.debug'
+                                .split())
+        self.debug_command()
+        subprocess.check_output('strip dummy'.split())
+        self.process = subprocess.Popen('./dummy', stdout=subprocess.PIPE)
+        # The process prints out the address of some symbol, which we then
+        # try to resolve in the test.
+        self.addr = int(self.process.stdout.readline().strip(), 16)
+        self.syms = SymbolCache(self.process.pid)
+
+    def tearDown(self):
+        self.process.kill()
+        self.process.wait()
+        self.process.stdout.close()
+        self.process = None
+
+    def resolve_addr(self):
+        sym, offset, module = self.syms.resolve(self.addr, False)
+        self.assertEqual(sym, self.mangled_name)
+        self.assertEqual(offset, 0)
+        self.assertTrue(module[-5:] == b'dummy')
+        sym, offset, module = self.syms.resolve(self.addr, True)
+        self.assertEqual(sym, b'some_namespace::some_function(int, int)')
+        self.assertEqual(offset, 0)
+        self.assertTrue(module[-5:] == b'dummy')
+
+
+    def resolve_name(self):
+        script_dir = os.path.dirname(os.path.realpath(__file__).encode("utf8"))
+        addr = self.syms.resolve_name(os.path.join(script_dir, b'dummy'),
+                                      self.mangled_name)
+        self.assertEqual(addr, self.addr)
+        pass
+
+class TestDebuglink(Harness):
+    def build_command(self):
+        subprocess.check_output('g++ -o dummy dummy.cc'.split())
+        lines = subprocess.check_output('nm dummy'.split()).splitlines()
+        for line in lines:
+            if b"some_function" in line:
+                self.mangled_name = line.split(b' ')[2]
+                break
+        self.assertTrue(self.mangled_name)
+
+    def debug_command(self):
+        subprocess.check_output('objcopy --add-gnu-debuglink=dummy.debug dummy'
+                                .split())
+
+    def tearDown(self):
+        super(TestDebuglink, self).tearDown()
+        subprocess.check_output('rm dummy dummy.debug'.split())
+
+    def test_resolve_addr(self):
+        self.resolve_addr()
+
+    def test_resolve_name(self):
+        self.resolve_name()
+
+class TestBuildid(Harness):
+    def build_command(self):
+        subprocess.check_output(('g++ -o dummy -Xlinker ' + \
+               '--build-id=0x123456789abcdef0123456789abcdef012345678 dummy.cc')
+               .split())
+        lines = subprocess.check_output('nm dummy'.split()).splitlines()
+        for line in lines:
+            if b"some_function" in line:
+                self.mangled_name = line.split(b' ')[2]
+                break
+        self.assertTrue(self.mangled_name)
+
+
+    def debug_command(self):
+        subprocess.check_output('mkdir -p /usr/lib/debug/.build-id/12'.split())
+        subprocess.check_output(('mv dummy.debug /usr/lib/debug/.build-id' + \
+            '/12/3456789abcdef0123456789abcdef012345678.debug').split())
+
+    def tearDown(self):
+        super(TestBuildid, self).tearDown()
+        subprocess.check_output('rm dummy'.split())
+        subprocess.check_output(('rm /usr/lib/debug/.build-id/12' +
+            '/3456789abcdef0123456789abcdef012345678.debug').split())
+
+    def test_resolve_name(self):
+        self.resolve_name()
+
+    def test_resolve_addr(self):
+        self.resolve_addr()
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_dump_func.py b/tests/python/test_dump_func.py
new file mode 100755
index 0000000..6fd3b49
--- /dev/null
+++ b/tests/python/test_dump_func.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# test program for the 'dump_func' method
+
+from bcc import BPF
+from unittest import main, TestCase
+
+class TestDumpFunc(TestCase):
+    def test_return(self):
+        b = BPF(text="""
+            int entry(void)
+            {
+                return 1;
+            }""")
+
+        self.assertEqual(
+            b"\xb7\x00\x00\x00\x01\x00\x00\x00" +
+            b"\x95\x00\x00\x00\x00\x00\x00\x00",
+            b.dump_func("entry"))
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_flags.py b/tests/python/test_flags.py
new file mode 100644
index 0000000..a5d2b42
--- /dev/null
+++ b/tests/python/test_flags.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import unittest
+from bcc import BPF
+
+class TestLru(unittest.TestCase):
+    def test_lru_map_flags(self):
+        test_prog1 = """
+        BPF_F_TABLE("lru_hash", int, u64, lru, 1024, BPF_F_NO_COMMON_LRU);
+        """
+        b = BPF(text=test_prog1)
+        t = b["lru"]
+        self.assertEqual(t.flags, 2);
+
+    def test_hash_map_flags(self):
+        test_prog1 = """
+        BPF_F_TABLE("hash", int, u64, hash, 1024, BPF_F_NO_PREALLOC);
+        """
+        b = BPF(text=test_prog1)
+        t = b["hash"]
+        self.assertEqual(t.flags, 1);
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_histogram.py b/tests/python/test_histogram.py
new file mode 100755
index 0000000..2fb8c16
--- /dev/null
+++ b/tests/python/test_histogram.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+from ctypes import c_int, c_ulonglong
+import random
+import time
+from unittest import main, TestCase
+
+class TestHistogram(TestCase):
+    def test_simple(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+struct bpf_map;
+BPF_HISTOGRAM(hist1);
+BPF_HASH(stub);
+int kprobe__htab_map_delete_elem(struct pt_regs *ctx, struct bpf_map *map, u64 *k) {
+    hist1.increment(bpf_log2l(*k));
+    return 0;
+}
+""")
+        for i in range(0, 32):
+            for j in range(0, random.randint(1, 10)):
+                try: del b["stub"][c_ulonglong(1 << i)]
+                except: pass
+        b["hist1"].print_log2_hist()
+
+        for i in range(32, 64):
+            for j in range(0, random.randint(1, 10)):
+                try: del b["stub"][c_ulonglong(1 << i)]
+                except: pass
+        b["hist1"].print_log2_hist()
+        b.cleanup()
+
+    def test_struct(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+struct bpf_map;
+typedef struct { void *map; u64 slot; } Key;
+BPF_HISTOGRAM(hist1, Key, 1024);
+BPF_HASH(stub1);
+BPF_HASH(stub2);
+int kprobe__htab_map_delete_elem(struct pt_regs *ctx, struct bpf_map *map, u64 *k) {
+    hist1.increment((Key){map, bpf_log2l(*k)});
+    return 0;
+}
+""")
+        for i in range(0, 64):
+            for j in range(0, random.randint(1, 10)):
+                try: del b["stub1"][c_ulonglong(1 << i)]
+                except: pass
+                try: del b["stub2"][c_ulonglong(1 << i)]
+                except: pass
+        b["hist1"].print_log2_hist()
+        b.cleanup()
+
+    def test_chars(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+typedef struct { char name[TASK_COMM_LEN]; u64 slot; } Key;
+BPF_HISTOGRAM(hist1, Key, 1024);
+int kprobe__finish_task_switch(struct pt_regs *ctx, struct task_struct *prev) {
+    Key k = {.slot = bpf_log2l(prev->real_start_time)};
+    if (!bpf_get_current_comm(&k.name, sizeof(k.name)))
+        hist1.increment(k);
+    return 0;
+}
+""")
+        for i in range(0, 100): time.sleep(0.01)
+        b["hist1"].print_log2_hist()
+        b.cleanup()
+
+    def test_multiple_key(self):
+        b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/fs.h>
+struct hist_s_key {
+    u64 key_1;
+    u64 key_2;
+};
+struct hist_key {
+    struct hist_s_key s_key;
+    u64 slot;
+};
+BPF_HISTOGRAM(mk_hist, struct hist_key, 1024);
+int kprobe__vfs_read(struct pt_regs *ctx, struct file *file,
+        char __user *buf, size_t count) {
+    struct hist_key key = {.slot = bpf_log2l(count)};
+    key.s_key.key_1 = (unsigned long)buf & 0x70;
+    key.s_key.key_2 = (unsigned long)buf & 0x7;
+    mk_hist.increment(key);
+    return 0;
+}
+""")
+        def bucket_sort(buckets):
+            buckets.sort()
+            return buckets
+
+        for i in range(0, 100): time.sleep(0.01)
+        b["mk_hist"].print_log2_hist("size", "k_1 & k_2",
+                section_print_fn=lambda bucket: "%3d %d" % (bucket[0], bucket[1]),
+                bucket_fn=lambda bucket: (bucket.key_1, bucket.key_2),
+                strip_leading_zero=True,
+                bucket_sort_fn=bucket_sort)
+        b.cleanup()
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_license.py b/tests/python/test_license.py
new file mode 100755
index 0000000..f0c6b1d
--- /dev/null
+++ b/tests/python/test_license.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# Copyright (c) 2018 Clevernet, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import unittest
+from bcc import BPF
+
+class TestLicense(unittest.TestCase):
+    gpl_only_text = """
+#include <uapi/linux/ptrace.h>
+struct gpl_s {
+    u64 ts;
+};
+BPF_PERF_OUTPUT(events);
+int license_program(struct pt_regs *ctx) {
+    struct gpl_s data = {};
+    data.ts = bpf_ktime_get_ns();
+    events.perf_submit(ctx, &data, sizeof(data));
+    return 0;
+}
+"""
+
+    proprietary_text = """
+#include <uapi/linux/ptrace.h>
+struct key_t {
+    u64 ip;
+    u32 pid;
+    u32 uid;
+    char comm[16];
+};
+
+BPF_HASH(counts, struct key_t);
+
+int license_program(struct pt_regs *ctx) {
+    struct key_t key = {};
+    u64 zero = 0 , *val;
+    u64 pid = bpf_get_current_pid_tgid();
+    u32 uid = bpf_get_current_uid_gid();
+
+    key.ip = PT_REGS_IP(ctx);
+    key.pid = pid & 0xFFFFFFFF;
+    key.uid = uid & 0xFFFFFFFF;
+    bpf_get_current_comm(&(key.comm), 16);
+
+    val = counts.lookup_or_init(&key, &zero);  // update counter
+    (*val)++;
+    return 0;
+}
+"""
+
+    def license(self, lic):
+        return '''
+#define BPF_LICENSE %s
+''' % (lic)
+
+    def load_bpf_code(self, bpf_code):
+        event_name = bpf_code.get_syscall_fnname("read")
+        bpf_code.attach_kprobe(event=event_name, fn_name="license_program")
+        bpf_code.detach_kprobe(event=event_name)
+
+    def test_default(self):
+        b = BPF(text=self.gpl_only_text)
+        self.load_bpf_code(b)
+
+    def test_gpl_helper_macro(self):
+        b = BPF(text=self.gpl_only_text + self.license('GPL'))
+        self.load_bpf_code(b)
+
+    def test_proprietary_macro(self):
+        b = BPF(text=self.proprietary_text + self.license('Proprietary'))
+        self.load_bpf_code(b)
+
+    def test_gpl_compatible_macro(self):
+        b = BPF(text=self.gpl_only_text + self.license('Dual BSD/GPL'))
+        self.load_bpf_code(b)
+
+    def test_proprietary_words_macro(self):
+        b = BPF(text=self.proprietary_text + self.license('Proprietary license'))
+        self.load_bpf_code(b)
+
+    @unittest.expectedFailure
+    def test_cflags_fail(self):
+        b = BPF(text=self.gpl_only_text, cflags=["-DBPF_LICENSE=GPL"])
+        self.load_bpf_code(b)
+
+    @unittest.expectedFailure
+    def test_cflags_macro_fail(self):
+        b = BPF(text=self.gpl_only_text + self.license('GPL'), cflags=["-DBPF_LICENSE=GPL"])
+        self.load_bpf_code(b)
+
+    @unittest.expectedFailure
+    def test_empty_fail_macro(self):
+        b = BPF(text=self.gpl_only_text + self.license(''))
+        self.load_bpf_code(b)
+
+    @unittest.expectedFailure
+    def test_proprietary_fail_macro(self):
+        b = BPF(text=self.gpl_only_text + self.license('Proprietary license'))
+        self.load_bpf_code(b)
+
+    @unittest.expectedFailure
+    def test_proprietary_cflags_fail(self):
+        b = BPF(text=self.proprietary_text, cflags=["-DBPF_LICENSE=Proprietary"])
+        self.load_bpf_code(b)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_lpm_trie.py b/tests/python/test_lpm_trie.py
new file mode 100644
index 0000000..560cb4b
--- /dev/null
+++ b/tests/python/test_lpm_trie.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+# Copyright (c) 2017 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import ctypes as ct
+import unittest
+from bcc import BPF
+from netaddr import IPAddress
+
+class KeyV4(ct.Structure):
+    _fields_ = [("prefixlen", ct.c_uint),
+                ("data", ct.c_ubyte * 4)]
+
+class KeyV6(ct.Structure):
+    _fields_ = [("prefixlen", ct.c_uint),
+                ("data", ct.c_ushort * 8)]
+
+class TestLpmTrie(unittest.TestCase):
+    def test_lpm_trie_v4(self):
+        test_prog1 = """
+        BPF_LPM_TRIE(trie, u64, int, 16);
+        """
+        b = BPF(text=test_prog1)
+        t = b["trie"]
+
+        k1 = KeyV4(24, (192, 168, 0, 0))
+        v1 = ct.c_int(24)
+        t[k1] = v1
+
+        k2 = KeyV4(28, (192, 168, 0, 0))
+        v2 = ct.c_int(28)
+        t[k2] = v2
+
+        k = KeyV4(32, (192, 168, 0, 15))
+        self.assertEqual(t[k].value, 28)
+
+        k = KeyV4(32, (192, 168, 0, 127))
+        self.assertEqual(t[k].value, 24)
+
+        with self.assertRaises(KeyError):
+            k = KeyV4(32, (172, 16, 1, 127))
+            v = t[k]
+
+    def test_lpm_trie_v6(self):
+        test_prog1 = """
+        struct key_v6 {
+            u32 prefixlen;
+            u32 data[4];
+        };
+        BPF_LPM_TRIE(trie, struct key_v6, int, 16);
+        """
+        b = BPF(text=test_prog1)
+        t = b["trie"]
+
+        k1 = KeyV6(64, IPAddress('2a00:1450:4001:814:200e::').words)
+        v1 = ct.c_int(64)
+        t[k1] = v1
+
+        k2 = KeyV6(96, IPAddress('2a00:1450:4001:814::200e').words)
+        v2 = ct.c_int(96)
+        t[k2] = v2
+
+        k = KeyV6(128, IPAddress('2a00:1450:4001:814::1024').words)
+        self.assertEqual(t[k].value, 96)
+
+        k = KeyV6(128, IPAddress('2a00:1450:4001:814:2046::').words)
+        self.assertEqual(t[k].value, 64)
+
+        with self.assertRaises(KeyError):
+            k = KeyV6(128, IPAddress('2a00:ffff::').words)
+            v = t[k]
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_lru.py b/tests/python/test_lru.py
new file mode 100644
index 0000000..fd279c1
--- /dev/null
+++ b/tests/python/test_lru.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import ctypes as ct
+import os
+import unittest
+from bcc import BPF
+import multiprocessing
+
+class TestLru(unittest.TestCase):
+    def test_lru_hash(self):
+        b = BPF(text="""BPF_TABLE("lru_hash", int, u64, lru, 1024);""")
+        t = b["lru"]
+        for i in range(1, 1032):
+            t[ct.c_int(i)] = ct.c_ulonglong(i)
+        for i, v in t.items():
+            self.assertEqual(v.value, i.value)
+        # BPF_MAP_TYPE_LRU_HASH eviction happens in batch and we expect less
+        # items than specified size.
+        self.assertLess(len(t), 1024);
+
+    def test_lru_percpu_hash(self):
+        test_prog1 = """
+        BPF_TABLE("lru_percpu_hash", u32, u32, stats, 1);
+        int hello_world(void *ctx) {
+            u32 key=0;
+            u32 value = 0, *val;
+            val = stats.lookup_or_init(&key, &value);
+            *val += 1;
+            return 0;
+        }
+        """
+        b = BPF(text=test_prog1)
+        stats_map = b.get_table("stats")
+        event_name = b.get_syscall_fnname("clone")
+        b.attach_kprobe(event=event_name, fn_name="hello_world")
+        ini = stats_map.Leaf()
+        for i in range(0, multiprocessing.cpu_count()):
+            ini[i] = 0
+        # First initialize with key 1
+        stats_map[ stats_map.Key(1) ] = ini
+        # Then initialize with key 0
+        stats_map[ stats_map.Key(0) ] = ini
+        # Key 1 should have been evicted
+        with self.assertRaises(KeyError):
+            val = stats_map[ stats_map.Key(1) ]
+        f = os.popen("hostname")
+        f.close()
+        self.assertEqual(len(stats_map),1)
+        val = stats_map[ stats_map.Key(0) ]
+        sum = stats_map.sum(stats_map.Key(0))
+        avg = stats_map.average(stats_map.Key(0))
+        max = stats_map.max(stats_map.Key(0))
+        self.assertGreater(sum.value, int(0))
+        self.assertGreater(max.value, int(0))
+        b.detach_kprobe(event_name)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_percpu.py b/tests/python/test_percpu.py
new file mode 100755
index 0000000..39a3bbc
--- /dev/null
+++ b/tests/python/test_percpu.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import os
+import unittest
+from bcc import BPF
+import multiprocessing
+
+class TestPercpu(unittest.TestCase):
+
+    def setUp(self):
+        try:
+            b = BPF(text='BPF_TABLE("percpu_array", u32, u32, stub, 1);')
+        except:
+            raise unittest.SkipTest("PerCpu unsupported on this kernel")
+
+    def test_helper(self):
+        test_prog1 = """
+        BPF_PERCPU_ARRAY(stub_default);
+        BPF_PERCPU_ARRAY(stub_type, u64);
+        BPF_PERCPU_ARRAY(stub_full, u64, 1024);
+        """
+        BPF(text=test_prog1)
+
+    def test_u64(self):
+        test_prog1 = """
+        BPF_TABLE("percpu_hash", u32, u64, stats, 1);
+        int hello_world(void *ctx) {
+            u32 key=0;
+            u64 value = 0, *val;
+            val = stats.lookup_or_init(&key, &value);
+            *val += 1;
+            return 0;
+        }
+        """
+        bpf_code = BPF(text=test_prog1)
+        stats_map = bpf_code.get_table("stats")
+        event_name = bpf_code.get_syscall_fnname("clone")
+        bpf_code.attach_kprobe(event=event_name, fn_name="hello_world")
+        ini = stats_map.Leaf()
+        for i in range(0, multiprocessing.cpu_count()):
+            ini[i] = 0
+        stats_map[ stats_map.Key(0) ] = ini
+        f = os.popen("hostname")
+        f.close()
+        self.assertEqual(len(stats_map),1)
+        val = stats_map[ stats_map.Key(0) ]
+        sum = stats_map.sum(stats_map.Key(0))
+        avg = stats_map.average(stats_map.Key(0))
+        max = stats_map.max(stats_map.Key(0))
+        self.assertGreater(sum.value, int(0))
+        self.assertGreater(max.value, int(0))
+        bpf_code.detach_kprobe(event_name)
+
+    def test_u32(self):
+        test_prog1 = """
+        BPF_TABLE("percpu_array", u32, u32, stats, 1);
+        int hello_world(void *ctx) {
+            u32 key=0;
+            u32 value = 0, *val;
+            val = stats.lookup_or_init(&key, &value);
+            *val += 1;
+            return 0;
+        }
+        """
+        bpf_code = BPF(text=test_prog1)
+        stats_map = bpf_code.get_table("stats")
+        event_name = bpf_code.get_syscall_fnname("clone")
+        bpf_code.attach_kprobe(event=event_name, fn_name="hello_world")
+        ini = stats_map.Leaf()
+        for i in range(0, multiprocessing.cpu_count()):
+            ini[i] = 0
+        stats_map[ stats_map.Key(0) ] = ini
+        f = os.popen("hostname")
+        f.close()
+        self.assertEqual(len(stats_map),1)
+        val = stats_map[ stats_map.Key(0) ]
+        sum = stats_map.sum(stats_map.Key(0))
+        avg = stats_map.average(stats_map.Key(0))
+        max = stats_map.max(stats_map.Key(0))
+        self.assertGreater(sum.value, int(0))
+        self.assertGreater(max.value, int(0))
+        bpf_code.detach_kprobe(event_name)
+
+    def test_struct_custom_func(self):
+        test_prog2 = """
+        typedef struct counter {
+        u32 c1;
+        u32 c2;
+        } counter;
+        BPF_TABLE("percpu_hash", u32, counter, stats, 1);
+        int hello_world(void *ctx) {
+            u32 key=0;
+            counter value = {0,0}, *val;
+            val = stats.lookup_or_init(&key, &value);
+            val->c1 += 1;
+            val->c2 += 1;
+            return 0;
+        }
+        """
+        bpf_code = BPF(text=test_prog2)
+        stats_map = bpf_code.get_table("stats",
+                reducer=lambda x,y: stats_map.sLeaf(x.c1+y.c1))
+        event_name = bpf_code.get_syscall_fnname("clone")
+        bpf_code.attach_kprobe(event=event_name, fn_name="hello_world")
+        ini = stats_map.Leaf()
+        for i in ini:
+            i = stats_map.sLeaf(0,0)
+        stats_map[ stats_map.Key(0) ] = ini
+        f = os.popen("hostname")
+        f.close()
+        self.assertEqual(len(stats_map),1)
+        k = stats_map[ stats_map.Key(0) ]
+        self.assertGreater(k.c1, int(0))
+        bpf_code.detach_kprobe(event_name)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_perf_event.py b/tests/python/test_perf_event.py
new file mode 100755
index 0000000..3f78f5b
--- /dev/null
+++ b/tests/python/test_perf_event.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PLUMgrid
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import bcc
+import ctypes
+import multiprocessing
+import os
+import time
+import unittest
+
+class TestPerfCounter(unittest.TestCase):
+    def test_cycles(self):
+        text = """
+BPF_PERF_ARRAY(cnt1, NUM_CPUS);
+BPF_ARRAY(prev, u64, NUM_CPUS);
+BPF_HISTOGRAM(dist);
+int do_sys_getuid(void *ctx) {
+    u32 cpu = bpf_get_smp_processor_id();
+    u64 val = cnt1.perf_read(CUR_CPU_IDENTIFIER);
+
+    if (((s64)val < 0) && ((s64)val > -256))
+        return 0;
+
+    prev.update(&cpu, &val);
+    return 0;
+}
+int do_ret_sys_getuid(void *ctx) {
+    u32 cpu = bpf_get_smp_processor_id();
+    u64 val = cnt1.perf_read(CUR_CPU_IDENTIFIER);
+
+    if (((s64)val < 0) && ((s64)val > -256))
+        return 0;
+
+    u64 *prevp = prev.lookup(&cpu);
+    if (prevp)
+        dist.increment(bpf_log2l(val - *prevp));
+    return 0;
+}
+"""
+        b = bcc.BPF(text=text, debug=0,
+                cflags=["-DNUM_CPUS=%d" % multiprocessing.cpu_count()])
+        event_name = b.get_syscall_fnname("getuid")
+        b.attach_kprobe(event=event_name, fn_name="do_sys_getuid")
+        b.attach_kretprobe(event=event_name, fn_name="do_ret_sys_getuid")
+        cnt1 = b["cnt1"]
+        try:
+            cnt1.open_perf_event(bcc.PerfType.HARDWARE, bcc.PerfHWConfig.CPU_CYCLES)
+        except:
+            if ctypes.get_errno() == 2:
+                raise self.skipTest("hardware events unsupported")
+            raise
+        for i in range(0, 100):
+            os.getuid()
+        b["dist"].print_log2_hist()
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_probe_count.py b/tests/python/test_probe_count.py
new file mode 100755
index 0000000..df0baa2
--- /dev/null
+++ b/tests/python/test_probe_count.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+# Copyright (c) Suchakra Sharma <suchakrapani.sharma@polymtl.ca>
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF, _get_num_open_probes, TRACEFS
+import os
+import sys
+from unittest import main, TestCase
+
+class TestKprobeCnt(TestCase):
+    def setUp(self):
+        self.b = BPF(text="""
+        int wololo(void *ctx) {
+          return 0;
+        }
+        """)
+        self.b.attach_kprobe(event_re="^vfs_.*", fn_name="wololo")
+
+    def test_attach1(self):
+        actual_cnt = 0
+        with open("%s/available_filter_functions" % TRACEFS, "rb") as f:
+            for line in f:
+                if line.startswith(b"vfs_"):
+                    actual_cnt += 1
+        open_cnt = self.b.num_open_kprobes()
+        self.assertEqual(actual_cnt, open_cnt)
+
+    def tearDown(self):
+        self.b.cleanup()
+
+
+class TestProbeGlobalCnt(TestCase):
+    def setUp(self):
+        self.b1 = BPF(text="""int count(void *ctx) { return 0; }""")
+        self.b2 = BPF(text="""int count(void *ctx) { return 0; }""")
+
+    def test_probe_quota(self):
+        self.b1.attach_kprobe(event="schedule", fn_name="count")
+        self.b2.attach_kprobe(event="submit_bio", fn_name="count")
+        self.assertEqual(1, self.b1.num_open_kprobes())
+        self.assertEqual(1, self.b2.num_open_kprobes())
+        self.assertEqual(2, _get_num_open_probes())
+        self.b1.cleanup()
+        self.b2.cleanup()
+        self.assertEqual(0, _get_num_open_probes())
+
+
+class TestAutoKprobe(TestCase):
+    def setUp(self):
+        self.b = BPF(text="""
+        int kprobe__schedule(void *ctx) { return 0; }
+        int kretprobe__schedule(void *ctx) { return 0; }
+        """)
+
+    def test_count(self):
+        self.assertEqual(2, self.b.num_open_kprobes())
+
+    def tearDown(self):
+        self.b.cleanup()
+
+
+class TestProbeQuota(TestCase):
+    def setUp(self):
+        self.b = BPF(text="""int count(void *ctx) { return 0; }""")
+
+    def test_probe_quota(self):
+        with self.assertRaises(Exception):
+            self.b.attach_kprobe(event_re=".*", fn_name="count")
+
+    def test_uprobe_quota(self):
+        with self.assertRaises(Exception):
+            self.b.attach_uprobe(name="c", sym_re=".*", fn_name="count")
+
+    def tearDown(self):
+        self.b.cleanup()
+
+
+class TestProbeNotExist(TestCase):
+    def setUp(self):
+        self.b = BPF(text="""int count(void *ctx) { return 0; }""")
+
+    def test_not_exist(self):
+        with self.assertRaises(Exception):
+            self.b.attach_kprobe(event="___doesnotexist", fn_name="count")
+
+    def tearDown(self):
+        self.b.cleanup()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_shared_table.py b/tests/python/test_shared_table.py
new file mode 100644
index 0000000..10dd63f
--- /dev/null
+++ b/tests/python/test_shared_table.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import ctypes as ct
+import unittest
+from bcc import BPF
+
+class TestSharedTable(unittest.TestCase):
+    def test_close_extern(self):
+        b1 = BPF(text="""BPF_TABLE_PUBLIC("array", int, int, table1, 10);""")
+
+        with BPF(text="""BPF_TABLE("extern", int, int, table1, 10);""") as b2:
+            t2 = b2["table1"]
+            t2[ct.c_int(1)] = ct.c_int(10)
+            self.assertEqual(len(t2), 10)
+
+        t1 = b1["table1"]
+        self.assertEqual(t1[ct.c_int(1)].value, 10)
+        self.assertEqual(len(t1), 10)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_stackid.py b/tests/python/test_stackid.py
new file mode 100755
index 0000000..2587293
--- /dev/null
+++ b/tests/python/test_stackid.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import bcc
+import distutils.version
+import os
+import unittest
+
+def kernel_version_ge(major, minor):
+    # True if running kernel is >= X.Y
+    version = distutils.version.LooseVersion(os.uname()[2]).version
+    if version[0] > major:
+        return True
+    if version[0] < major:
+        return False
+    if minor and version[1] < minor:
+        return False
+    return True
+
+
+@unittest.skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6")
+class TestStackid(unittest.TestCase):
+    def test_simple(self):
+        b = bcc.BPF(text="""
+#include <uapi/linux/ptrace.h>
+struct bpf_map;
+BPF_STACK_TRACE(stack_traces, 10240);
+BPF_HASH(stack_entries, int, int);
+BPF_HASH(stub);
+int kprobe__htab_map_lookup_elem(struct pt_regs *ctx, struct bpf_map *map, u64 *k) {
+    int id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
+    if (id < 0)
+        return 0;
+    int key = 1;
+    stack_entries.update(&key, &id);
+    return 0;
+}
+""")
+        stub = b["stub"]
+        stack_traces = b["stack_traces"]
+        stack_entries = b["stack_entries"]
+        try: x = stub[stub.Key(1)]
+        except: pass
+        k = stack_entries.Key(1)
+        self.assertIn(k, stack_entries)
+        stackid = stack_entries[k]
+        self.assertIsNotNone(stackid)
+        stack = stack_traces[stackid].ip
+        self.assertEqual(b.ksym(stack[0]), b"htab_map_lookup_elem")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_stat1.b b/tests/python/test_stat1.b
new file mode 100644
index 0000000..fb505d6
--- /dev/null
+++ b/tests/python/test_stat1.b
@@ -0,0 +1,66 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+struct IPKey {
+  u32 dip:32;
+  u32 sip:32;
+};
+struct IPLeaf {
+  u32 rx_pkts:64;
+  u32 tx_pkts:64;
+};
+Table<IPKey, IPLeaf, FIXED_MATCH, AUTO> stats(1024);
+
+struct skbuff {
+  u32 type:32;
+};
+
+u32 on_packet(struct skbuff *skb) {
+  u32 ret:32 = 0;
+
+  goto proto::ethernet;
+
+  state proto::ethernet {
+  }
+
+  state proto::dot1q {
+  }
+
+  state proto::ip {
+    u32 rx:32 = 0;
+    u32 tx:32 = 0;
+    u32 IPKey key;
+    if $ip.dst > $ip.src {
+      key.dip = $ip.dst;
+      key.sip = $ip.src;
+      rx = 1;
+      // test arbitrary return stmt
+      if false {
+        return 3;
+      }
+    } else {
+      key.dip = $ip.src;
+      key.sip = $ip.dst;
+      tx = 1;
+      ret = 1;
+    }
+    struct IPLeaf *leaf;
+    leaf = stats[key];
+    on_valid(leaf) {
+      atomic_add(leaf.rx_pkts, rx);
+      atomic_add(leaf.tx_pkts, tx);
+    }
+  }
+
+  state proto::udp {
+  }
+
+  state proto::vxlan {
+  }
+
+  state proto::gre {
+  }
+
+  state EOP {
+    return ret;
+  }
+}
diff --git a/tests/python/test_stat1.c b/tests/python/test_stat1.c
new file mode 100644
index 0000000..f7ecb93
--- /dev/null
+++ b/tests/python/test_stat1.c
@@ -0,0 +1,57 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+
+#include <bcc/proto.h>
+
+struct IPKey {
+  u32 dip;
+  u32 sip;
+};
+struct IPLeaf {
+  u64 rx_pkts;
+  u64 tx_pkts;
+};
+
+BPF_HASH(stats, struct IPKey, struct IPLeaf, 256);
+
+int on_packet(struct __sk_buff *skb) {
+  u8 *cursor = 0;
+  ethernet: {
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+    switch (ethernet->type) {
+        case ETH_P_IP: goto ip;
+        case ETH_P_8021Q: goto dot1q;
+        default: goto EOP;
+    }
+  }
+
+  dot1q: {
+    struct dot1q_t *dot1q = cursor_advance(cursor, sizeof(*dot1q));
+    switch (dot1q->type) {
+      case ETH_P_8021Q: goto ip;
+      default: goto EOP;
+    }
+  }
+
+  ip: {
+    struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+    int rx = 0, tx = 0;
+    struct IPKey key;
+    if (ip->dst > ip->src) {
+      key.dip = ip->dst;
+      key.sip = ip->src;
+      rx = 1;
+    } else {
+      key.dip = ip->src;
+      key.sip = ip->dst;
+      tx = 1;
+    }
+    struct IPLeaf zleaf = {0};
+    struct IPLeaf *leaf = stats.lookup_or_init(&key, &zleaf);
+    lock_xadd(&leaf->rx_pkts, rx);
+    lock_xadd(&leaf->tx_pkts, tx);
+  }
+
+EOP:
+  return 0;
+}
diff --git a/tests/python/test_stat1.py b/tests/python/test_stat1.py
new file mode 100755
index 0000000..23b3a29
--- /dev/null
+++ b/tests/python/test_stat1.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+# test program to count the packets sent to a device in a 0.5
+# second period
+
+from ctypes import c_uint, c_ulong, Structure
+from netaddr import IPAddress
+from bcc import BPF
+from subprocess import check_call
+import sys
+from unittest import main, TestCase
+
+arg1 = sys.argv.pop(1)
+arg2 = ""
+if len(sys.argv) > 1:
+  arg2 = sys.argv.pop(1)
+
+Key = None
+Leaf = None
+if arg1.endswith(".b"):
+    class Key(Structure):
+        _fields_ = [("dip", c_uint),
+                    ("sip", c_uint)]
+    class Leaf(Structure):
+        _fields_ = [("rx_pkts", c_ulong),
+                    ("tx_pkts", c_ulong)]
+
+class TestBPFSocket(TestCase):
+    def setUp(self):
+        b = BPF(arg1, arg2, debug=0)
+        fn = b.load_func("on_packet", BPF.SOCKET_FILTER)
+        BPF.attach_raw_socket(fn, "eth0")
+        self.stats = b.get_table("stats", Key, Leaf)
+
+    def test_ping(self):
+        cmd = ["ping", "-f", "-c", "100", "172.16.1.1"]
+        check_call(cmd)
+        #for key, leaf in self.stats.items():
+        #    print(IPAddress(key.sip), "=>", IPAddress(key.dip),
+        #          "rx", leaf.rx_pkts, "tx", leaf.tx_pkts)
+        key = self.stats.Key(IPAddress("172.16.1.2").value, IPAddress("172.16.1.1").value)
+        leaf = self.stats[key]
+        self.assertEqual(leaf.rx_pkts, 100)
+        self.assertEqual(leaf.tx_pkts, 100)
+        del self.stats[key]
+        with self.assertRaises(KeyError):
+            x = self.stats[key]
+        with self.assertRaises(KeyError):
+            del self.stats[key]
+        self.stats.clear()
+        self.assertEqual(len(self.stats), 0)
+        self.stats[key] = leaf
+        self.assertEqual(len(self.stats), 1)
+        self.stats.clear()
+        self.assertEqual(len(self.stats), 0)
+
+    def test_empty_key(self):
+        # test with a 0 key
+        self.stats.clear()
+        self.stats[self.stats.Key()] = self.stats.Leaf(100, 200)
+        x = self.stats.popitem()
+        self.stats[self.stats.Key(10, 20)] = self.stats.Leaf(300, 400)
+        with self.assertRaises(KeyError):
+            x = self.stats[self.stats.Key()]
+        (_, x) = self.stats.popitem()
+        self.assertEqual(x.rx_pkts, 300)
+        self.assertEqual(x.tx_pkts, 400)
+        self.stats.clear()
+        self.assertEqual(len(self.stats), 0)
+        self.stats[self.stats.Key()] = x
+        self.stats[self.stats.Key(0, 1)] = x
+        self.stats[self.stats.Key(0, 2)] = x
+        self.stats[self.stats.Key(0, 3)] = x
+        self.assertEqual(len(self.stats), 4)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_tools_memleak.py b/tests/python/test_tools_memleak.py
new file mode 100755
index 0000000..bbc0a83
--- /dev/null
+++ b/tests/python/test_tools_memleak.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+
+from unittest import main, skipUnless, TestCase
+import distutils.version
+import os
+import subprocess
+import sys
+import tempfile
+
+TOOLS_DIR = "../../tools/"
+
+
+class cfg:
+    cmd_format = ""
+
+    # Amount of memory to leak. Note, that test application allocates memory
+    # for its own needs in libc, so this amount should be large enough to be
+    # the biggest allocation.
+    leaking_amount = 30000
+
+
+def kernel_version_ge(major, minor):
+    # True if running kernel is >= X.Y
+    version = distutils.version.LooseVersion(os.uname()[2]).version
+    if version[0] > major:
+        return True
+    if version[0] < major:
+        return False
+    if minor and version[1] < minor:
+        return False
+    return True
+
+
+def setUpModule():
+    # Build the memory leaking application.
+    c_src = 'test_tools_memleak_leaker_app.c'
+    tmp_dir = tempfile.mkdtemp(prefix='bcc-test-memleak-')
+    c_src_full = os.path.dirname(sys.argv[0]) + os.path.sep + c_src
+    exec_dst = tmp_dir + os.path.sep + 'leaker_app'
+
+    if subprocess.call(['gcc', '-g', '-O0', '-o', exec_dst, c_src_full]) != 0:
+        print("can't compile the leaking application")
+        raise Exception
+
+    # Taking two snapshots with a one second interval. Getting the largest
+    # allocation. Since attaching to a program happens with a delay, we wait
+    # for the first snapshot, then issue the command to the app. Finally,
+    # second snapshot is used to extract the information.
+    # Helper utilities "timeout" and "setbuf" are used to limit overall running
+    # time, and to disable buffering.
+    cfg.cmd_format = (
+        'stdbuf -o 0 -i 0 timeout -s KILL 10s ' + TOOLS_DIR +
+        'memleak.py -c "{} {{}} {}" -T 1 1 2'.format(exec_dst,
+                                                     cfg.leaking_amount))
+
+
+@skipUnless(kernel_version_ge(4, 6), "requires kernel >= 4.6")
+class MemleakToolTests(TestCase):
+    def tearDown(self):
+        if self.p:
+            del(self.p)
+    def run_leaker(self, leak_kind):
+        # Starting memleak.py, which in turn launches the leaking application.
+        self.p = subprocess.Popen(cfg.cmd_format.format(leak_kind),
+                                  stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                                  shell=True)
+
+        # Waiting for the first report.
+        while True:
+            self.p.poll()
+            if self.p.returncode is not None:
+                break
+            line = self.p.stdout.readline()
+            if b"with outstanding allocations" in line:
+                break
+
+        # At this point, memleak.py has already launched the application and
+        # set probes. Sending a command to the leaking application to make its
+        # allocations.
+        out = self.p.communicate(input=b"\n")[0]
+
+        # If there were memory leaks, they are in the output. Filter the lines
+        # containing "byte" substring. Every interesting line is expected to
+        # start with "N bytes from"
+        x = [x for x in out.split(b'\n') if b'byte' in x]
+
+        self.assertTrue(len(x) >= 1,
+                        msg="At least one line should have 'byte' substring.")
+
+        # Taking last report.
+        x = x[-1].split()
+        self.assertTrue(len(x) >= 1,
+                        msg="There should be at least one word in the line.")
+
+        # First word is the leak amount in bytes.
+        return int(x[0])
+
+    def test_malloc(self):
+        self.assertEqual(cfg.leaking_amount, self.run_leaker("malloc"))
+
+    def test_calloc(self):
+        self.assertEqual(cfg.leaking_amount, self.run_leaker("calloc"))
+
+    def test_realloc(self):
+        self.assertEqual(cfg.leaking_amount, self.run_leaker("realloc"))
+
+    def test_posix_memalign(self):
+        self.assertEqual(cfg.leaking_amount, self.run_leaker("posix_memalign"))
+
+    def test_valloc(self):
+        self.assertEqual(cfg.leaking_amount, self.run_leaker("valloc"))
+
+    def test_memalign(self):
+        self.assertEqual(cfg.leaking_amount, self.run_leaker("memalign"))
+
+    def test_pvalloc(self):
+        self.assertEqual(cfg.leaking_amount, self.run_leaker("pvalloc"))
+
+    def test_aligned_alloc(self):
+        self.assertEqual(cfg.leaking_amount, self.run_leaker("aligned_alloc"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_tools_memleak_leaker_app.c b/tests/python/test_tools_memleak_leaker_app.c
new file mode 100644
index 0000000..617dc5a
--- /dev/null
+++ b/tests/python/test_tools_memleak_leaker_app.c
@@ -0,0 +1,88 @@
+// This is a program that leaks memory, used for memory leak detector testing.
+
+#include <fcntl.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+static void generate_leak(const char *kind, int amount) {
+  void *ptr = NULL;
+
+  if (strcmp(kind, "malloc") == 0) {
+    printf("leaking via malloc, %p\n", malloc(amount));
+    return;
+  }
+
+  if (strcmp(kind, "calloc") == 0) {
+    printf("leaking via calloc, %p\n", calloc(amount, 1));
+    return;
+  }
+
+  if (strcmp(kind, "realloc") == 0) {
+    printf("leaking via realloc, %p\n", realloc(malloc(10), amount));
+    return;
+  }
+
+  if (strcmp(kind, "posix_memalign") == 0) {
+    posix_memalign(&ptr, 512, amount);
+    printf("leaking via posix_memalign, %p\n", ptr);
+    return;
+  }
+
+  if (strcmp(kind, "valloc") == 0) {
+    printf("leaking via valloc, %p\n", valloc(amount));
+    return;
+  }
+
+  if (strcmp(kind, "memalign") == 0) {
+    printf("leaking via memalign, %p\n", memalign(512, amount));
+    return;
+  }
+
+  if (strcmp(kind, "pvalloc") == 0) {
+    printf("leaking via pvalloc, %p\n", pvalloc(amount));
+    return;
+  }
+
+  if (strcmp(kind, "aligned_alloc") == 0) {
+    printf("leaking via aligned_alloc, %p\n", aligned_alloc(512, amount));
+    return;
+  }
+
+  if (strcmp(kind, "no_leak") == 0) {
+    void *ptr = malloc(amount);
+    printf("ptr = %p\n", ptr);
+    free(ptr);
+    return;
+  }
+
+  printf("unknown leak type '%s'\n", kind);
+}
+
+int main(int argc, char *argv[]) {
+  if (argc < 2) {
+    printf("usage: leak-userspace <kind-of-leak> [amount]\n");
+    return EXIT_SUCCESS;
+  }
+
+  const char *kind = argv[1];
+
+  int amount = 30;
+  if (argc > 2) {
+    amount = atoi(argv[2]);
+    if (amount < 1)
+      amount = 1;
+  }
+
+  // Wait for something in stdin to give external detector time to attach.
+  char c;
+  read(0, &c, sizeof(c));
+
+  // Do the work.
+  generate_leak(kind, amount);
+  return EXIT_SUCCESS;
+}
diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py
new file mode 100755
index 0000000..ab80ecf
--- /dev/null
+++ b/tests/python/test_tools_smoke.py
@@ -0,0 +1,400 @@
+#!/usr/bin/env python
+# Copyright (c) Sasha Goldshtein, 2017
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import distutils.version
+import subprocess
+import os
+import re
+from unittest import main, skipUnless, TestCase
+
+TOOLS_DIR = "../../tools/"
+
+def kernel_version_ge(major, minor):
+    # True if running kernel is >= X.Y
+    version = distutils.version.LooseVersion(os.uname()[2]).version
+    if version[0] > major:
+        return True
+    if version[0] < major:
+        return False
+    if minor and version[1] < minor:
+        return False
+    return True
+
+@skipUnless(kernel_version_ge(4,1), "requires kernel >= 4.1")
+class SmokeTests(TestCase):
+    # Use this for commands that have a built-in timeout, so they only need
+    # to be killed in case of a hard hang.
+    def run_with_duration(self, command, timeout=10):
+        full_command = TOOLS_DIR + command
+        self.assertEqual(0,     # clean exit
+                subprocess.call("timeout -s KILL %ds %s > /dev/null" %
+                                (timeout, full_command), shell=True))
+
+    # Use this for commands that don't have a built-in timeout, so we have
+    # to Ctrl-C out of them by sending SIGINT. If that still doesn't stop
+    # them, send a kill signal 5 seconds later.
+    def run_with_int(self, command, timeout=5, kill_timeout=5,
+                     allow_early=False, kill=False):
+        full_command = TOOLS_DIR + command
+        signal = "KILL" if kill else "INT"
+        rc = subprocess.call("timeout -s %s -k %ds %ds %s > /dev/null" %
+                (signal, kill_timeout, timeout, full_command), shell=True)
+        # timeout returns 124 if the program did not terminate prematurely,
+        # and returns 137 if we used KILL instead of INT. So there are three
+        # sensible scenarios:
+        #   1. The script is allowed to return early, and it did, with a
+        #      success return code.
+        #   2. The script timed out and was killed by the SIGINT signal.
+        #   3. The script timed out and was killed by the SIGKILL signal, and
+        #      this was what we asked for using kill=True.
+        self.assertTrue((rc == 0 and allow_early) or rc == 124
+                        or (rc == 137 and kill), "rc was %d" % rc)
+
+    def kmod_loaded(self, mod):
+        with open("/proc/modules", "r") as mods:
+            reg = re.compile("^%s\s" % mod)
+            for line in mods:
+                if reg.match(line):
+                    return 1
+                return 0
+
+    def setUp(self):
+        pass
+
+    def tearDown(self):
+        pass
+
+    def test_argdist(self):
+        self.run_with_duration("argdist.py -v -C 'p::do_sys_open()' -n 1 -i 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_bashreadline(self):
+        self.run_with_int("bashreadline.py")
+
+    def test_biolatency(self):
+        self.run_with_duration("biolatency.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_biosnoop(self):
+        self.run_with_int("biosnoop.py")
+
+    def test_biotop(self):
+        self.run_with_duration("biotop.py 1 1")
+
+    def test_bitesize(self):
+        self.run_with_int("biotop.py")
+
+    def test_bpflist(self):
+        self.run_with_duration("bpflist.py")
+
+    def test_btrfsdist(self):
+        # Will attempt to do anything meaningful only when btrfs is installed.
+        self.run_with_duration("btrfsdist.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_btrfsslower(self):
+        # Will attempt to do anything meaningful only when btrfs is installed.
+        self.run_with_int("btrfsslower.py", allow_early=True)
+
+    def test_cachestat(self):
+        self.run_with_duration("cachestat.py 1 1")
+
+    def test_cachetop(self):
+        # TODO cachetop doesn't like to run without a terminal, disabled
+        # for now.
+        # self.run_with_int("cachetop.py 1")
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_capable(self):
+        self.run_with_int("capable.py")
+
+    def test_cpudist(self):
+        self.run_with_duration("cpudist.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,9), "requires kernel >= 4.9")
+    def test_cpuunclaimed(self):
+        self.run_with_duration("cpuunclaimed.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_dbslower(self):
+        # Deliberately left empty -- dbslower requires an instance of either
+        # MySQL or PostgreSQL to be running, or it fails to attach.
+        pass
+
+    @skipUnless(kernel_version_ge(4,3), "requires kernel >= 4.3")
+    def test_dbstat(self):
+        # Deliberately left empty -- dbstat requires an instance of either
+        # MySQL or PostgreSQL to be running, or it fails to attach.
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_dcsnoop(self):
+        self.run_with_int("dcsnoop.py")
+
+    def test_dcstat(self):
+        self.run_with_duration("dcstat.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6")
+    def test_deadlock_detector(self):
+        # TODO This tool requires a massive BPF stack traces table allocation,
+        # which might fail the run or even trigger the oomkiller to kill some
+        # other processes. Disabling for now.
+        # self.run_with_int("deadlock_detector.py $(pgrep -n bash)", timeout=10)
+        pass
+
+    @skipUnless(kernel_version_ge(4,8), "requires kernel >= 4.8")
+    def test_execsnoop(self):
+        self.run_with_int("execsnoop.py")
+
+    def test_ext4dist(self):
+        self.run_with_duration("ext4dist.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_ext4slower(self):
+        self.run_with_int("ext4slower.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_filelife(self):
+        self.run_with_int("filelife.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_fileslower(self):
+        self.run_with_int("fileslower.py")
+
+    def test_filetop(self):
+        self.run_with_duration("filetop.py 1 1")
+
+    def test_funccount(self):
+        self.run_with_int("funccount.py __kmalloc -i 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_funclatency(self):
+        self.run_with_int("funclatency.py __kmalloc -i 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_funcslower(self):
+        self.run_with_int("funcslower.py __kmalloc")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_gethostlatency(self):
+        self.run_with_int("gethostlatency.py")
+
+    def test_hardirqs(self):
+        self.run_with_duration("hardirqs.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_killsnoop(self):
+        # Because killsnoop intercepts signals, if we send it a SIGINT we
+        # will likely catch it while it is handling the data packet from the
+        # BPF program, and the exception from the SIGINT will be swallowed by
+        # ctypes. Therefore, we use SIGKILL.
+        # To reproduce the above issue, run killsnoop and in another shell run
+        # `kill -s SIGINT $(pidof python)`. As a result, killsnoop will print
+        # a traceback but will not exit.
+        self.run_with_int("killsnoop.py", kill=True)
+
+    @skipUnless(kernel_version_ge(4,9), "requires kernel >= 4.9")
+    def test_llcstat(self):
+        # Requires PMU, which is not available in virtual machines.
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_mdflush(self):
+        self.run_with_int("mdflush.py")
+
+    @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6")
+    def test_memleak(self):
+        self.run_with_duration("memleak.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,8), "requires kernel >= 4.8")
+    def test_mountsnoop(self):
+        self.run_with_int("mountsnoop.py")
+
+    @skipUnless(kernel_version_ge(4,3), "requires kernel >= 4.3")
+    def test_mysqld_qslower(self):
+        # Deliberately left empty -- mysqld_qslower requires an instance of
+        # MySQL to be running, or it fails to attach.
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_nfsslower(self):
+        if(self.kmod_loaded("nfs")):
+            self.run_with_int("nfsslower.py")
+        else:
+            pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_nfsdist(self):
+        if(self.kmod_loaded("nfs")):
+            self.run_with_duration("nfsdist.py 1 1")
+        else:
+            pass
+
+    @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6")
+    def test_offcputime(self):
+        self.run_with_duration("offcputime.py 1")
+
+    @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6")
+    def test_offwaketime(self):
+        self.run_with_duration("offwaketime.py 1")
+
+    @skipUnless(kernel_version_ge(4,9), "requires kernel >= 4.9")
+    def test_oomkill(self):
+        self.run_with_int("oomkill.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_opensnoop(self):
+        self.run_with_int("opensnoop.py")
+
+    def test_pidpersec(self):
+        self.run_with_int("pidpersec.py")
+
+    @skipUnless(kernel_version_ge(4,9), "requires kernel >= 4.9")
+    def test_profile(self):
+        self.run_with_duration("profile.py 1")
+
+    def test_runqlat(self):
+        self.run_with_duration("runqlat.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,9), "requires kernel >= 4.9")
+    def test_runqlen(self):
+        self.run_with_duration("runqlen.py 1 1")
+
+    def test_slabratetop(self):
+        self.run_with_duration("slabratetop.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
+    def test_softirqs(self):
+        self.run_with_duration("softirqs.py 1 1")
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_solisten(self):
+        self.run_with_int("solisten.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_sslsniff(self):
+        self.run_with_int("sslsniff.py")
+
+    @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6")
+    def test_stackcount(self):
+        self.run_with_int("stackcount.py __kmalloc -i 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_statsnoop(self):
+        self.run_with_int("statsnoop.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_syncsnoop(self):
+        self.run_with_int("syncsnoop.py")
+
+    @skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
+    def test_syscount(self):
+        self.run_with_int("syscount.py -i 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_tcpaccept(self):
+        self.run_with_int("tcpaccept.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_tcpconnect(self):
+        self.run_with_int("tcpconnect.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_tcpconnlat(self):
+        self.run_with_int("tcpconnlat.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_tcplife(self):
+        self.run_with_int("tcplife.py")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_tcpretrans(self):
+        self.run_with_int("tcpretrans.py")
+
+    @skipUnless(kernel_version_ge(4, 7), "requires kernel >= 4.7")
+    def test_tcpdrop(self):
+        self.run_with_int("tcpdrop.py")
+
+    def test_tcptop(self):
+        self.run_with_duration("tcptop.py 1 1")
+
+    def test_tplist(self):
+        self.run_with_duration("tplist.py -p %d" % os.getpid())
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_trace(self):
+        self.run_with_int("trace.py do_sys_open")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_ttysnoop(self):
+        self.run_with_int("ttysnoop.py /dev/console")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_ucalls(self):
+        # This attaches a large number (300+) of kprobes, which can be slow,
+        # so use an increased timeout value.
+        self.run_with_int("lib/ucalls.py -l none -S %d" % os.getpid(),
+                          timeout=60, kill_timeout=60)
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_uflow(self):
+        # The Python installed on the Ubuntu buildbot doesn't have USDT
+        # probes, so we can't run uflow.
+        # self.run_with_int("pythonflow.py %d" % os.getpid())
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_ugc(self):
+        # This requires a runtime that has GC probes to be installed.
+        # Python has them, but only in very recent versions. Skip.
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_uobjnew(self):
+        self.run_with_int("cobjnew.sh %d" % os.getpid())
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_ustat(self):
+        self.run_with_duration("lib/ustat.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_uthreads(self):
+        self.run_with_int("lib/uthreads.py %d" % os.getpid())
+
+    def test_vfscount(self):
+        self.run_with_int("vfscount.py", timeout=15, kill_timeout=15)
+
+    def test_vfsstat(self):
+        self.run_with_duration("vfsstat.py 1 1")
+
+    @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6")
+    def test_wakeuptime(self):
+        self.run_with_duration("wakeuptime.py 1")
+
+    def test_xfsdist(self):
+        # Doesn't work on build bot because xfs functions not present in the
+        # kernel image.
+        # self.run_with_duration("xfsdist.py 1 1")
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_xfsslower(self):
+        # Doesn't work on build bot because xfs functions not present in the
+        # kernel image.
+        # self.run_with_int("xfsslower.py")
+        pass
+
+    def test_zfsdist(self):
+        # Fails to attach the probe if zfs is not installed.
+        pass
+
+    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    def test_zfsslower(self):
+        # Fails to attach the probe if zfs is not installed.
+        pass
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_trace1.b b/tests/python/test_trace1.b
new file mode 100644
index 0000000..05ddda6
--- /dev/null
+++ b/tests/python/test_trace1.b
@@ -0,0 +1,43 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+struct Ptr {
+  u64 ptr:64;
+};
+struct Counters {
+  u64 stat1:64;
+  u64 stat2:64;
+};
+Table<Ptr, Counters, FIXED_MATCH, AUTO> stats(1024);
+
+// example with on_valid syntax
+u32 sys_wr (struct proto::pt_regs *ctx) {
+  struct Ptr key = {.ptr=ctx->di};
+  struct Counters *leaf;
+  leaf = stats[key];
+  if leaf {
+    atomic_add(leaf->stat2, 1);
+  }
+  log("sys_wr: %p\n", ctx->di);
+  return 0;
+}
+
+// example with smallest available syntax
+// note: if stats[key] fails, program returns early
+u32 sys_rd (struct proto::pt_regs *ctx) {
+  struct Ptr key = {.ptr=ctx->di};
+  atomic_add(stats[key].stat1, 1);
+}
+
+// example with if/else case
+u32 sys_bpf (struct proto::pt_regs *ctx) {
+  struct Ptr key = {.ptr=ctx->di};
+  struct Counters *leaf;
+  leaf = stats[key];
+  if leaf {
+    atomic_add(leaf->stat1, 1);
+  } else {
+    log("update %llx failed\n", ctx->di);
+  }
+  return 0;
+}
+
diff --git a/tests/python/test_trace1.py b/tests/python/test_trace1.py
new file mode 100755
index 0000000..dc005c5
--- /dev/null
+++ b/tests/python/test_trace1.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from ctypes import c_uint, c_ulong, Structure
+from bcc import BPF
+import os
+from time import sleep
+import sys
+from unittest import main, TestCase
+
+arg1 = sys.argv.pop(1)
+arg2 = ""
+if len(sys.argv) > 1:
+  arg2 = sys.argv.pop(1)
+
+Key = None
+Leaf = None
+if arg1.endswith(".b"):
+    class Key(Structure):
+        _fields_ = [("fd", c_ulong)]
+    class Leaf(Structure):
+        _fields_ = [("stat1", c_ulong),
+                    ("stat2", c_ulong)]
+
+class TestKprobe(TestCase):
+    def setUp(self):
+        b = BPF(arg1, arg2, debug=0)
+        self.stats = b.get_table("stats", Key, Leaf)
+        b.attach_kprobe(event=b.get_syscall_fnname("write"), fn_name="sys_wr")
+        b.attach_kprobe(event=b.get_syscall_fnname("read"), fn_name="sys_rd")
+        b.attach_kprobe(event="htab_map_get_next_key", fn_name="sys_rd")
+
+    def test_trace1(self):
+        with open("/dev/null", "a") as f:
+            for i in range(0, 100):
+                os.write(f.fileno(), b"")
+        with open("/etc/services", "r") as f:
+            for i in range(0, 200):
+                os.read(f.fileno(), 1)
+        for key, leaf in self.stats.items():
+            print("fd %x:" % key.fd, "stat1 %d" % leaf.stat1, "stat2 %d" % leaf.stat2)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_trace2.b b/tests/python/test_trace2.b
new file mode 100644
index 0000000..1e4bcd1
--- /dev/null
+++ b/tests/python/test_trace2.b
@@ -0,0 +1,11 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include "kprobe.b"
+struct Ptr { u64 ptr:64; };
+struct Counters { u64 stat1:64; };
+Table<Ptr, Counters, FIXED_MATCH, AUTO> stats(1024);
+
+u32 count_sched (struct proto::pt_regs *ctx) {
+  struct Ptr key = {.ptr=ctx->bx};
+  atomic_add(stats[key].stat1, 1);
+}
diff --git a/tests/python/test_trace2.c b/tests/python/test_trace2.c
new file mode 100644
index 0000000..4c18a86
--- /dev/null
+++ b/tests/python/test_trace2.c
@@ -0,0 +1,13 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <linux/ptrace.h>
+struct Ptr { u64 ptr; };
+struct Counters { u64 stat1; };
+BPF_HASH(stats, struct Ptr, struct Counters, 1024);
+
+int count_sched(struct pt_regs *ctx) {
+  struct Ptr key = {.ptr = PT_REGS_PARM1(ctx)};
+  struct Counters zleaf = {0};
+  stats.lookup_or_init(&key, &zleaf)->stat1++;
+  return 0;
+}
diff --git a/tests/python/test_trace2.py b/tests/python/test_trace2.py
new file mode 100755
index 0000000..5e9805a
--- /dev/null
+++ b/tests/python/test_trace2.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from ctypes import c_uint, c_ulong, Structure
+from bcc import BPF
+from time import sleep
+import sys
+from unittest import main, TestCase
+
+text = """
+#include <linux/ptrace.h>
+struct Ptr { u64 ptr; };
+struct Counters { char unused; __int128 stat1; };
+BPF_HASH(stats, struct Ptr, struct Counters, 1024);
+
+int count_sched(struct pt_regs *ctx) {
+  struct Ptr key = {.ptr=PT_REGS_PARM1(ctx)};
+  struct Counters zleaf;
+
+  memset(&zleaf, 0, sizeof(zleaf));
+  stats.lookup_or_init(&key, &zleaf)->stat1++;
+  return 0;
+}
+"""
+
+class TestTracingEvent(TestCase):
+    def setUp(self):
+        b = BPF(text=text, debug=0)
+        self.stats = b.get_table("stats")
+        b.attach_kprobe(event="finish_task_switch", fn_name="count_sched")
+
+    def test_sched1(self):
+        for i in range(0, 100):
+            sleep(0.01)
+        for key, leaf in self.stats.items():
+            print("ptr %x:" % key.ptr, "stat1 (%d %d)" % (leaf.stat1[1], leaf.stat1[0]))
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_trace3.c b/tests/python/test_trace3.c
new file mode 100644
index 0000000..10d91d0
--- /dev/null
+++ b/tests/python/test_trace3.c
@@ -0,0 +1,53 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <linux/ptrace.h>
+#include <linux/blkdev.h>
+struct Request { u64 rq; };
+struct Time { u64 start; };
+BPF_HASH(requests, struct Request, struct Time, 1024);
+#define SLOTS 100
+BPF_ARRAY(latency, u64, SLOTS);
+
+static u32 log2(u32 v) {
+  u32 r, shift;
+
+  r = (v > 0xFFFF) << 4; v >>= r;
+  shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+  shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+  shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+  r |= (v >> 1);
+  return r;
+}
+
+static u32 log2l(u64 v) {
+  u32 hi = v >> 32;
+  if (hi)
+    return log2(hi) + 32;
+  else
+    return log2(v);
+}
+
+int probe_blk_start_request(struct pt_regs *ctx) {
+  struct Request rq = {.rq = PT_REGS_PARM1(ctx)};
+  struct Time tm = {.start = bpf_ktime_get_ns()};
+  requests.update(&rq, &tm);
+  return 0;
+}
+
+int probe_blk_update_request(struct pt_regs *ctx) {
+  struct Request rq = {.rq = PT_REGS_PARM1(ctx)};
+  struct Time *tm = requests.lookup(&rq);
+  if (!tm) return 0;
+  u64 delta = bpf_ktime_get_ns() - tm->start;
+  requests.delete(&rq);
+  u64 lg = log2l(delta);
+  u64 base = 1ull << lg;
+  u32 index = (lg * 64 + (delta - base) * 64 / base) * 3 / 64;
+  if (index >= SLOTS)
+    index = SLOTS - 1;
+
+  u64 zero = 0;
+  u64 *val = latency.lookup_or_init(&index, &zero);
+  lock_xadd(val, 1);
+  return 0;
+}
diff --git a/tests/python/test_trace3.py b/tests/python/test_trace3.py
new file mode 100755
index 0000000..94f5498
--- /dev/null
+++ b/tests/python/test_trace3.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from ctypes import c_uint, c_ulong, Structure
+from bcc import BPF
+from time import sleep
+import sys
+from unittest import main, TestCase
+
+arg1 = sys.argv.pop(1)
+arg2 = ""
+if len(sys.argv) > 1:
+  arg2 = sys.argv.pop(1)
+
+
+class TestBlkRequest(TestCase):
+    def setUp(self):
+        b = BPF(arg1, arg2, debug=0)
+        self.latency = b.get_table("latency", c_uint, c_ulong)
+        b.attach_kprobe(event="blk_start_request",
+                fn_name="probe_blk_start_request")
+        b.attach_kprobe(event="blk_update_request",
+                fn_name="probe_blk_update_request")
+
+    def test_blk1(self):
+        import subprocess
+        import os
+        # use /opt instead of /tmp so that it hits a real disk
+        for i in range(0, 2):
+            subprocess.call(["dd", "if=/dev/zero", "of=/opt/trace3.txt",
+                             "count=1024", "bs=4096"])
+            subprocess.call(["sync"])
+        os.unlink("/opt/trace3.txt")
+        for key, leaf in self.latency.items():
+            print("latency %u:" % key.value, "count %u" % leaf.value)
+        sys.stdout.flush()
+        self.assertEqual(len(list(self.latency.keys())), len(self.latency))
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_trace4.py b/tests/python/test_trace4.py
new file mode 100755
index 0000000..6836047
--- /dev/null
+++ b/tests/python/test_trace4.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc import BPF
+import os
+import sys
+from unittest import main, TestCase
+
+class TestKprobeRgx(TestCase):
+    def setUp(self):
+        self.b = BPF(text=b"""
+        typedef struct { int idx; } Key;
+        typedef struct { u64 val; } Val;
+        BPF_HASH(stats, Key, Val, 3);
+        int hello(void *ctx) {
+          stats.lookup_or_init(&(Key){1}, &(Val){0})->val++;
+          return 0;
+        }
+        int goodbye(void *ctx) {
+          stats.lookup_or_init(&(Key){2}, &(Val){0})->val++;
+          return 0;
+        }
+        """)
+        self.b.attach_kprobe(event_re=b"^" + self.b.get_syscall_prefix() + b"bp.*",
+                             fn_name=b"hello")
+        self.b.attach_kretprobe(event_re=b"^" + self.b.get_syscall_prefix() + b"bp.*",
+                                fn_name=b"goodbye")
+
+    def test_send1(self):
+        k1 = self.b[b"stats"].Key(1)
+        k2 = self.b[b"stats"].Key(2)
+        self.assertTrue(self.b[b"stats"][k1].val >= 2)
+        self.assertTrue(self.b[b"stats"][k2].val == 1)
+
+class TestKprobeReplace(TestCase):
+    def setUp(self):
+        self.b = BPF(text=b"int empty(void *ctx) { return 0; }")
+
+    def test_periods(self):
+        self.b.attach_kprobe(event_re=b"^tcp_enter_cwr.*", fn_name=b"empty")
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_tracepoint.py b/tests/python/test_tracepoint.py
new file mode 100755
index 0000000..3bc576a
--- /dev/null
+++ b/tests/python/test_tracepoint.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# Copyright (c) Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import bcc
+import unittest
+from time import sleep
+import distutils.version
+import os
+import subprocess
+
+def kernel_version_ge(major, minor):
+    # True if running kernel is >= X.Y
+    version = distutils.version.LooseVersion(os.uname()[2]).version
+    if version[0] > major:
+        return True
+    if version[0] < major:
+        return False
+    if minor and version[1] < minor:
+        return False
+    return True
+
+@unittest.skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
+class TestTracepoint(unittest.TestCase):
+    def test_tracepoint(self):
+        text = """
+        BPF_HASH(switches, u32, u64);
+        TRACEPOINT_PROBE(sched, sched_switch) {
+            u64 val = 0;
+            u32 pid = args->next_pid;
+            u64 *existing = switches.lookup_or_init(&pid, &val);
+            (*existing)++;
+            return 0;
+        }
+        """
+        b = bcc.BPF(text=text)
+        sleep(1)
+        total_switches = 0
+        for k, v in b["switches"].items():
+            total_switches += v.value
+        self.assertNotEqual(0, total_switches)
+
+@unittest.skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
+class TestTracepointDataLoc(unittest.TestCase):
+    def test_tracepoint_data_loc(self):
+        text = """
+        struct value_t {
+            char filename[64];
+        };
+        BPF_HASH(execs, u32, struct value_t);
+        TRACEPOINT_PROBE(sched, sched_process_exec) {
+            struct value_t val = {0};
+            char fn[64];
+            u32 pid = args->pid;
+            struct value_t *existing = execs.lookup_or_init(&pid, &val);
+            TP_DATA_LOC_READ_CONST(fn, filename, 64);
+            __builtin_memcpy(existing->filename, fn, 64);
+            return 0;
+        }
+        """
+        b = bcc.BPF(text=text)
+        subprocess.check_output(["/bin/ls"])
+        sleep(1)
+        self.assertTrue("/bin/ls" in [v.filename.decode()
+                                      for v in b["execs"].values()])
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_uprobes.py b/tests/python/test_uprobes.py
new file mode 100755
index 0000000..62a370f
--- /dev/null
+++ b/tests/python/test_uprobes.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import bcc
+import ctypes
+import errno
+import os
+import subprocess
+import shutil
+import time
+import unittest
+
+class TestUprobes(unittest.TestCase):
+    def test_simple_library(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+BPF_ARRAY(stats, u64, 1);
+static void incr(int idx) {
+    u64 *ptr = stats.lookup(&idx);
+    if (ptr)
+        ++(*ptr);
+}
+int count(struct pt_regs *ctx) {
+    bpf_trace_printk("count() uprobe fired");
+    u32 pid = bpf_get_current_pid_tgid();
+    if (pid == PID)
+        incr(0);
+    return 0;
+}"""
+        test_pid = os.getpid()
+        text = text.replace("PID", "%d" % test_pid)
+        b = bcc.BPF(text=text)
+        b.attach_uprobe(name="c", sym="malloc_stats", fn_name="count", pid=test_pid)
+        b.attach_uretprobe(name="c", sym="malloc_stats", fn_name="count", pid=test_pid)
+        libc = ctypes.CDLL("libc.so.6")
+        libc.malloc_stats.restype = None
+        libc.malloc_stats.argtypes = []
+        libc.malloc_stats()
+        self.assertEqual(b["stats"][ctypes.c_int(0)].value, 2)
+        b.detach_uretprobe(name="c", sym="malloc_stats", pid=test_pid)
+        b.detach_uprobe(name="c", sym="malloc_stats", pid=test_pid)
+
+    def test_simple_binary(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+BPF_ARRAY(stats, u64, 1);
+static void incr(int idx) {
+    u64 *ptr = stats.lookup(&idx);
+    if (ptr)
+        ++(*ptr);
+}
+int count(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    incr(0);
+    return 0;
+}"""
+        b = bcc.BPF(text=text)
+        b.attach_uprobe(name="/usr/bin/python", sym="main", fn_name="count")
+        b.attach_uretprobe(name="/usr/bin/python", sym="main", fn_name="count")
+        with os.popen("/usr/bin/python -V") as f:
+            pass
+        self.assertGreater(b["stats"][ctypes.c_int(0)].value, 0)
+        b.detach_uretprobe(name="/usr/bin/python", sym="main")
+        b.detach_uprobe(name="/usr/bin/python", sym="main")
+
+    def test_mount_namespace(self):
+        text = """
+#include <uapi/linux/ptrace.h>
+BPF_TABLE("array", int, u64, stats, 1);
+static void incr(int idx) {
+    u64 *ptr = stats.lookup(&idx);
+    if (ptr)
+        ++(*ptr);
+}
+int count(struct pt_regs *ctx) {
+    bpf_trace_printk("count() uprobe fired");
+    u32 pid = bpf_get_current_pid_tgid();
+    if (pid == PID)
+        incr(0);
+    return 0;
+}"""
+        # Need to import libc from ctypes to access unshare(2)
+        libc = ctypes.CDLL("libc.so.6", use_errno=True)
+
+        # Need to find path to libz.so.1
+        libz_path = None
+        p = subprocess.Popen(["ldconfig", "-p"], stdout=subprocess.PIPE)
+        for l in p.stdout:
+            n = l.split()
+            if n[0] == b"libz.so.1":
+                # if libz was already found, override only if new lib is more
+                # specific (e.g. libc6,x86-64 vs libc6)
+                if not libz_path or len(n[1].split(b",")) > 1:
+                    libz_path = n[-1]
+        p.wait()
+        p.stdout.close()
+        p = None
+
+        self.assertIsNotNone(libz_path)
+
+        # fork a child that we'll place in a separate mount namespace
+        child_pid = os.fork()
+        if child_pid == 0:
+            # Unshare CLONE_NEWNS
+            if libc.unshare(0x00020000) == -1:
+                e = ctypes.get_errno()
+                raise OSError(e, errno.errorcode[e])
+
+            # Remount root MS_REC|MS_PRIVATE
+            if libc.mount(None, b"/", None, (1<<14)|(1<<18) , None) == -1:
+                e = ctypes.get_errno()
+                raise OSError(e, errno.errorcode[e])
+
+            if libc.mount(b"tmpfs", b"/tmp", b"tmpfs", 0, None) == -1:
+                e = ctypes.get_errno()
+                raise OSError(e, errno.errorcode[e])
+
+            shutil.copy(libz_path, b"/tmp")
+
+            libz = ctypes.CDLL("/tmp/libz.so.1")
+            time.sleep(1)
+            libz.zlibVersion()
+            time.sleep(5)
+            os._exit(0)
+
+        libname = "/tmp/libz.so.1"
+        symname = "zlibVersion"
+        text = text.replace("PID", "%d" % child_pid)
+        b = bcc.BPF(text=text)
+        b.attach_uprobe(name=libname, sym=symname, fn_name="count", pid=child_pid)
+        b.attach_uretprobe(name=libname, sym=symname, fn_name="count", pid=child_pid)
+        time.sleep(1)
+        self.assertEqual(b["stats"][ctypes.c_int(0)].value, 2)
+        b.detach_uretprobe(name=libname, sym=symname, pid=child_pid)
+        b.detach_uprobe(name=libname, sym=symname, pid=child_pid)
+        os.wait()
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_usdt.py b/tests/python/test_usdt.py
new file mode 100755
index 0000000..27a0e47
--- /dev/null
+++ b/tests/python/test_usdt.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python
+#
+# USAGE: test_usdt.py
+#
+# Copyright 2017 Facebook, Inc
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF, USDT
+from unittest import main, TestCase
+from subprocess import Popen, PIPE
+from tempfile import NamedTemporaryFile
+import ctypes as ct
+import inspect
+import os
+import signal
+
+class TestUDST(TestCase):
+    def setUp(self):
+        # Application, minimum, to define three trace points
+        app_text = b"""
+#include <unistd.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include "folly/tracing/StaticTracepoint.h"
+
+int main() {
+  char s[100];
+  int i, a = 200, b = 40;
+  for (i = 0; i < 100; i++) s[i] = (i & 7) + (i & 6);
+  uint64_t j = 0;
+  char s1[64];
+  const char* str = "str";
+  size_t len = strlen(str);
+  while (1) {
+    FOLLY_SDT(test, probe_point_1, s[7], b);
+    FOLLY_SDT(test, probe_point_3, a, b);
+    FOLLY_SDT(test, probe_point_1, s[4], a);
+    FOLLY_SDT(test, probe_point_2, 5, s[10]);
+    FOLLY_SDT(test, probe_point_3, s[4], s[7]);
+
+    memset(&s1, '\0', sizeof(s1));
+    strncpy(s1, str, len);
+    snprintf(s1 + len, sizeof(s1) - len, "%d", j);
+    FOLLY_SDT(test, probe_point_4, j++, &s1);
+
+    memset(&s1, '\0', sizeof(s1));
+    strncpy(s1, str, len);
+    snprintf(s1 + len, sizeof(s1) - len, "%d", j);
+    FOLLY_SDT(test, probe_point_5, &s1, j++);
+
+    sleep(1);
+  }
+  return 1;
+}
+"""
+        # BPF program
+        self.bpf_text = """
+#include <linux/blkdev.h>
+#include <uapi/linux/ptrace.h>
+
+struct probe_result_t1 {
+  char v1;
+  int  v2;
+};
+
+struct probe_result_t2 {
+  int  v1;
+  char v2;
+};
+
+struct probe_result_t3 {
+  int v1;
+  int v2;
+};
+
+struct probe_result_t4 {
+  u64  v1;
+  char v2[8];
+};
+
+struct probe_result_t5 {
+  char v1[8];
+  u64  v2;
+};
+
+BPF_PERF_OUTPUT(event1);
+BPF_PERF_OUTPUT(event2);
+BPF_PERF_OUTPUT(event3);
+BPF_PERF_OUTPUT(event4);
+BPF_PERF_OUTPUT(event5);
+
+int do_trace1(struct pt_regs *ctx) {
+    struct probe_result_t1 result = {};
+    bpf_usdt_readarg(1, ctx, &result.v1);
+    bpf_usdt_readarg(2, ctx, &result.v2);
+    event1.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+};
+int do_trace2(struct pt_regs *ctx) {
+    struct probe_result_t2 result = {};
+    bpf_usdt_readarg(1, ctx, &result.v1);
+    bpf_usdt_readarg(2, ctx, &result.v2);
+    event2.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+}
+int do_trace3(struct pt_regs *ctx) {
+    struct probe_result_t3 result = {};
+    bpf_usdt_readarg(1, ctx, &result.v1);
+    bpf_usdt_readarg(2, ctx, &result.v2);
+    event3.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+}
+int do_trace4(struct pt_regs *ctx) {
+    struct probe_result_t4 result = {};
+    bpf_usdt_readarg(1, ctx, &result.v1);
+    bpf_usdt_readarg_p(2, ctx, &result.v2, sizeof(result.v2));
+    event4.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+}
+int do_trace5(struct pt_regs *ctx) {
+    struct probe_result_t5 result = {};
+    bpf_usdt_readarg_p(1, ctx, &result.v1, sizeof(result.v1));
+    bpf_usdt_readarg(2, ctx, &result.v2);
+    event5.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+}
+"""
+
+        # Compile and run the application
+        self.ftemp = NamedTemporaryFile(delete=False)
+        self.ftemp.close()
+        comp = Popen(["gcc", "-I", "%s/include" % os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))),
+                      "-x", "c", "-o", self.ftemp.name, "-"],
+                     stdin=PIPE)
+        comp.stdin.write(app_text)
+        comp.stdin.close()
+        self.assertEqual(comp.wait(), 0)
+        self.app = Popen([self.ftemp.name])
+
+    def test_attach1(self):
+        # enable USDT probe from given PID and verifier generated BPF programs
+        u = USDT(pid=int(self.app.pid))
+        u.enable_probe(probe="probe_point_1", fn_name="do_trace1")
+        u.enable_probe(probe="probe_point_2", fn_name="do_trace2")
+        u.enable_probe(probe="probe_point_3", fn_name="do_trace3")
+        u.enable_probe(probe="probe_point_4", fn_name="do_trace4")
+        u.enable_probe(probe="probe_point_5", fn_name="do_trace5")
+        b = BPF(text=self.bpf_text, usdt_contexts=[u], debug=4)
+
+        # Event states for each event:
+        # 0 - probe not caught, 1 - probe caught with correct value,
+        # 2 - probe caught with incorrect value
+        self.evt_st_1 = 0
+        self.evt_st_2 = 0
+        self.evt_st_3 = 0
+
+        # define output data structure in Python
+        class Data1(ct.Structure):
+            _fields_ = [("v1", ct.c_char),
+                        ("v2", ct.c_int)]
+
+        class Data2(ct.Structure):
+            _fields_ = [("v1", ct.c_int),
+                        ("v2", ct.c_char)]
+
+        class Data3(ct.Structure):
+            _fields_ = [("v1", ct.c_int),
+                        ("v2", ct.c_int)]
+
+        class Data4(ct.Structure):
+            _fields_ = [("v1", ct.c_ulonglong),
+                        ("v2", ct.c_char * 64)]
+
+        class Data5(ct.Structure):
+            _fields_ = [("v1", ct.c_char * 64),
+                        ("v2", ct.c_ulonglong)]
+
+        def check_event_val(event, event_state, v1, v2, v3, v4):
+            if ((event.v1 == v1 and event.v2 == v2) or (event.v1 == v3 and event.v2 == v4)):
+                if (event_state == 0 or event_state == 1):
+                    return 1
+            return 2
+
+        def print_event1(cpu, data, size):
+            event = ct.cast(data, ct.POINTER(Data1)).contents
+            self.evt_st_1 = check_event_val(event, self.evt_st_1, b'\x0d', 40, b'\x08', 200)
+
+        def print_event2(cpu, data, size):
+            event = ct.cast(data, ct.POINTER(Data2)).contents
+            # pretend we have two identical probe points to simplify the code
+            self.evt_st_2 = check_event_val(event, self.evt_st_2, 5, b'\x04', 5, b'\x04')
+
+        def print_event3(cpu, data, size):
+            event = ct.cast(data, ct.POINTER(Data3)).contents
+            self.evt_st_3 = check_event_val(event, self.evt_st_3, 200, 40, 8, 13)
+
+        def print_event4(cpu, data, size):
+            event = ct.cast(data, ct.POINTER(Data4)).contents
+            print("%s" % event.v2)
+
+        def print_event5(cpu, data, size):
+            event = ct.cast(data, ct.POINTER(Data5)).contents
+            print("%s" % event.v1)
+
+        # loop with callback to print_event
+        b["event1"].open_perf_buffer(print_event1)
+        b["event2"].open_perf_buffer(print_event2)
+        b["event3"].open_perf_buffer(print_event3)
+        b["event4"].open_perf_buffer(print_event4)
+        b["event5"].open_perf_buffer(print_event5)
+
+        # three iterations to make sure we get some probes and have time to process them
+        for i in range(3):
+            b.perf_buffer_poll()
+        self.assertTrue(self.evt_st_1 == 1 and self.evt_st_2 == 1 and self.evt_st_3 == 1)
+
+    def tearDown(self):
+        # kill the subprocess, clean the environment
+        self.app.kill()
+        self.app.wait()
+        os.unlink(self.ftemp.name)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_usdt2.py b/tests/python/test_usdt2.py
new file mode 100755
index 0000000..a2f4611
--- /dev/null
+++ b/tests/python/test_usdt2.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+#
+# USAGE: test_usdt2.py
+#
+# Copyright 2017 Facebook, Inc
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF, USDT
+from unittest import main, TestCase
+from subprocess import Popen, PIPE
+from tempfile import NamedTemporaryFile
+import ctypes as ct
+import inspect
+import os
+import signal
+
+class TestUDST(TestCase):
+    def setUp(self):
+        # Application, minimum, to define three trace points
+        app_text = b"""
+#include <stdlib.h>
+#include <unistd.h>
+#include "folly/tracing/StaticTracepoint.h"
+
+int main(int argc, char **argv) {
+  int t = atoi(argv[1]);
+  while (1) {
+    FOLLY_SDT(test, probe_point_1, t);
+    FOLLY_SDT(test, probe_point_2, t + 1);
+    FOLLY_SDT(test, probe_point_3, t + 2);
+    sleep(1);
+  }
+  return 1;
+}
+"""
+        # BPF program
+        self.bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+BPF_PERF_OUTPUT(event1);
+BPF_PERF_OUTPUT(event2);
+BPF_PERF_OUTPUT(event3);
+BPF_PERF_OUTPUT(event4);
+BPF_PERF_OUTPUT(event5);
+BPF_PERF_OUTPUT(event6);
+
+int do_trace1(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    int result = 0;
+    bpf_usdt_readarg(1, ctx, &result);
+    if (FILTER)
+      event1.perf_submit(ctx, &result, sizeof(result));
+    else
+      event4.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+};
+int do_trace2(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    int result = 0;
+    bpf_usdt_readarg(1, ctx, &result);
+    if (FILTER)
+      event2.perf_submit(ctx, &result, sizeof(result));
+    else
+      event5.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+}
+int do_trace3(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    int result = 0;
+    bpf_usdt_readarg(1, ctx, &result);
+    if (FILTER)
+      event3.perf_submit(ctx, &result, sizeof(result));
+    else
+      event6.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+}
+"""
+
+        # Compile and run the application
+        self.ftemp = NamedTemporaryFile(delete=False)
+        self.ftemp.close()
+        comp = Popen(["gcc", "-I", "%s/include" % os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))),
+                      "-x", "c", "-o", self.ftemp.name, "-"],
+                     stdin=PIPE)
+        comp.stdin.write(app_text)
+        comp.stdin.close()
+        self.assertEqual(comp.wait(), 0)
+
+        # create 3 applications, 2 applications will have usdt attached and
+        # the third one does not, and the third one should not call into
+        # bpf program.
+        self.app = Popen([self.ftemp.name, "1"])
+        self.app2 = Popen([self.ftemp.name, "11"])
+        self.app3 = Popen([self.ftemp.name, "21"])
+
+    def test_attach1(self):
+        # Enable USDT probe from given PID and verifier generated BPF programs.
+        u = USDT(pid=int(self.app.pid))
+        u.enable_probe(probe="probe_point_1", fn_name="do_trace1")
+        u.enable_probe(probe="probe_point_2", fn_name="do_trace2")
+        u2 = USDT(pid=int(self.app2.pid))
+        u2.enable_probe(probe="probe_point_2", fn_name="do_trace2")
+        u2.enable_probe(probe="probe_point_3", fn_name="do_trace3")
+        self.bpf_text = self.bpf_text.replace("FILTER", "pid == %d" % self.app.pid)
+        b = BPF(text=self.bpf_text, usdt_contexts=[u, u2])
+
+        # Event states for each event:
+        # 0 - probe not caught, 1 - probe caught with correct value,
+        # 2 - probe caught with incorrect value
+        self.evt_st_1 = 0
+        self.evt_st_2 = 0
+        self.evt_st_3 = 0
+        self.evt_st_4 = 0
+        self.evt_st_5 = 0
+        self.evt_st_6 = 0
+
+        def check_event_val(data, event_state, expected_val):
+            result = ct.cast(data, ct.POINTER(ct.c_int)).contents
+            if result.value == expected_val:
+                if (event_state == 0 or event_state == 1):
+                    return 1
+            return 2
+
+        def print_event1(cpu, data, size):
+            self.evt_st_1 = check_event_val(data, self.evt_st_1, 1)
+
+        def print_event2(cpu, data, size):
+            self.evt_st_2 = check_event_val(data, self.evt_st_2, 2)
+
+        def print_event3(cpu, data, size):
+            self.evt_st_3 = check_event_val(data, self.evt_st_3, 3)
+
+        def print_event4(cpu, data, size):
+            self.evt_st_4 = check_event_val(data, self.evt_st_4, 11)
+
+        def print_event5(cpu, data, size):
+            self.evt_st_5 = check_event_val(data, self.evt_st_5, 12)
+
+        def print_event6(cpu, data, size):
+            self.evt_st_6 = check_event_val(data, self.evt_st_6, 13)
+
+        # loop with callback to print_event
+        b["event1"].open_perf_buffer(print_event1)
+        b["event2"].open_perf_buffer(print_event2)
+        b["event3"].open_perf_buffer(print_event3)
+        b["event4"].open_perf_buffer(print_event4)
+        b["event5"].open_perf_buffer(print_event5)
+        b["event6"].open_perf_buffer(print_event6)
+
+        # three iterations to make sure we get some probes and have time to process them
+        for i in range(3):
+            b.perf_buffer_poll()
+
+        # note that event1 and event4 do not really fire, so their state should be 0
+        # use separate asserts so that if test fails we know which one is the culprit
+        self.assertTrue(self.evt_st_1 == 1)
+        self.assertTrue(self.evt_st_2 == 1)
+        self.assertTrue(self.evt_st_3 == 0)
+        self.assertTrue(self.evt_st_4 == 0)
+        self.assertTrue(self.evt_st_5 == 1)
+        self.assertTrue(self.evt_st_6 == 1)
+
+    def tearDown(self):
+        # kill the subprocess, clean the environment
+        self.app.kill()
+        self.app.wait()
+        self.app2.kill()
+        self.app2.wait()
+        self.app3.kill()
+        self.app3.wait()
+        os.unlink(self.ftemp.name)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_usdt3.py b/tests/python/test_usdt3.py
new file mode 100755
index 0000000..f788111
--- /dev/null
+++ b/tests/python/test_usdt3.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+#
+# USAGE: test_usdt3.py
+#
+# Copyright 2018 Facebook, Inc
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF, USDT
+from unittest import main, TestCase
+from subprocess import Popen, PIPE
+import ctypes as ct
+import inspect, os, tempfile
+
+class TestUDST(TestCase):
+    def setUp(self):
+        common_h = b"""
+#include "folly/tracing/StaticTracepoint.h"
+
+static inline void record_val(int val)
+{
+  FOLLY_SDT(test, probe, val);
+}
+
+extern void record_a(int val);
+extern void record_b(int val);
+"""
+
+        a_c = b"""
+#include <stdio.h>
+#include "common.h"
+
+void record_a(int val)
+{
+    record_val(val);
+}
+"""
+
+        b_c = b"""
+#include <stdio.h>
+#include "common.h"
+
+void record_b(int val)
+{
+    record_val(val);
+}
+"""
+
+        m_c = b"""
+#include <stdio.h>
+#include <unistd.h>
+#include "common.h"
+
+int main() {
+   while (1) {
+     record_a(1);
+     record_b(2);
+     record_val(3);
+     sleep(1);
+   }
+   return 0;
+}
+"""
+        # BPF program
+        self.bpf_text = """
+BPF_PERF_OUTPUT(event);
+int do_trace(struct pt_regs *ctx) {
+    int result = 0;
+    bpf_usdt_readarg(1, ctx, &result);
+    event.perf_submit(ctx, &result, sizeof(result));
+    return 0;
+};
+"""
+
+        def _create_file(name, text):
+            text_file = open(name, "wb")
+            text_file.write(text)
+            text_file.close()
+
+        # Create source files
+        self.tmp_dir = tempfile.mkdtemp()
+        print("temp directory: " + self.tmp_dir)
+        _create_file(self.tmp_dir + "/common.h", common_h)
+        _create_file(self.tmp_dir + "/a.c", a_c)
+        _create_file(self.tmp_dir + "/b.c", b_c)
+        _create_file(self.tmp_dir + "/m.c", m_c)
+
+        # Compilation
+        # the usdt test:probe exists in liba.so, libb.so and a.out
+        include_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + "/include"
+        a_src = self.tmp_dir + "/a.c"
+        a_obj = self.tmp_dir + "/a.o"
+        a_lib = self.tmp_dir + "/liba.so"
+        b_src = self.tmp_dir + "/b.c"
+        b_obj = self.tmp_dir + "/b.o"
+        b_lib = self.tmp_dir + "/libb.so"
+        m_src = self.tmp_dir + "/m.c"
+        m_bin = self.tmp_dir + "/a.out"
+        m_linker_opt = " -L" + self.tmp_dir + " -la -lb"
+        self.assertEqual(os.system("gcc -I" + include_path + " -fpic -c -o " + a_obj + " " + a_src), 0)
+        self.assertEqual(os.system("gcc -I" + include_path + " -fpic -c -o " + b_obj + " " + b_src), 0)
+        self.assertEqual(os.system("gcc -shared -o " + a_lib + " " + a_obj), 0)
+        self.assertEqual(os.system("gcc -shared -o " + b_lib + " " + b_obj), 0)
+        self.assertEqual(os.system("gcc -I" + include_path + " " + m_src + " -o " + m_bin + m_linker_opt), 0)
+
+        # Run the application
+        self.app = Popen([m_bin], env=dict(os.environ, LD_LIBRARY_PATH=self.tmp_dir))
+        # os.system("tplist.py -vvv -p " + str(self.app.pid))
+
+    def test_attach1(self):
+        # enable USDT probe from given PID and verifier generated BPF programs
+        u = USDT(pid=int(self.app.pid))
+        u.enable_probe(probe="probe", fn_name="do_trace")
+        b = BPF(text=self.bpf_text, usdt_contexts=[u])
+
+        # processing events
+        self.probe_value_1 = 0
+        self.probe_value_2 = 0
+        self.probe_value_3 = 0
+        self.probe_value_other = 0
+
+        def print_event(cpu, data, size):
+            result = ct.cast(data, ct.POINTER(ct.c_int)).contents
+            if result.value == 1:
+                self.probe_value_1 = 1
+            elif result.value == 2:
+                self.probe_value_2 = 1
+            elif result.value == 3:
+                self.probe_value_3 = 1
+            else:
+                self.probe_value_other = 1
+
+        b["event"].open_perf_buffer(print_event)
+        for i in range(100):
+            if (self.probe_value_1 == 0 or
+                self.probe_value_2 == 0 or
+                self.probe_value_3 == 0 or
+                self.probe_value_other != 0):
+                b.perf_buffer_poll()
+            else:
+                break;
+
+        self.assertTrue(self.probe_value_1 != 0)
+        self.assertTrue(self.probe_value_2 != 0)
+        self.assertTrue(self.probe_value_3 != 0)
+        self.assertTrue(self.probe_value_other == 0)
+
+    def tearDown(self):
+        # kill the subprocess, clean the environment
+        self.app.kill()
+        self.app.wait()
+        os.system("rm -rf " + self.tmp_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_utils.py b/tests/python/test_utils.py
new file mode 100755
index 0000000..54b97cf
--- /dev/null
+++ b/tests/python/test_utils.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# Copyright (c) Catalysts GmbH
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from bcc.utils import get_online_cpus, detect_language
+import multiprocessing
+import unittest
+import os
+
+class TestUtils(unittest.TestCase):
+    def test_get_online_cpus(self):
+        online_cpus = get_online_cpus()
+        num_cores = multiprocessing.cpu_count()
+
+        self.assertEqual(len(online_cpus), num_cores)
+
+    def test_detect_language(self):
+        candidates = ["c", "java", "perl", "php", "node", "ruby", "python"]
+        language = detect_language(candidates, os.getpid())
+        self.assertEqual(language, "python")
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/python/test_xlate1.b b/tests/python/test_xlate1.b
new file mode 100644
index 0000000..2db0046
--- /dev/null
+++ b/tests/python/test_xlate1.b
@@ -0,0 +1,75 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+// test for packet modification
+
+#packed "false"
+
+struct IPKey {
+  u32 dip:32;
+  u32 sip:32;
+};
+struct IPLeaf {
+  u32 xdip:32;
+  u32 xsip:32;
+  u64 xlated_pkts:64;
+};
+Table<IPKey, IPLeaf, FIXED_MATCH, NONE> xlate(1024);
+
+struct skbuff {
+  u32 type:32;
+};
+
+u32 on_packet (struct skbuff *skb) {
+  u32 ret:32 = 1;
+
+  u32 orig_dip:32 = 0;
+  u32 orig_sip:32 = 0;
+  struct IPLeaf *xleaf;
+
+  goto proto::ethernet;
+
+  state proto::ethernet {
+  }
+
+  state proto::dot1q {
+  }
+
+  state proto::ip {
+    orig_dip = $ip.dst;
+    orig_sip = $ip.src;
+    struct IPKey key = {.dip=orig_dip, .sip=orig_sip};
+    xlate.lookup(key, xleaf) {};
+    on_valid(xleaf) {
+      incr_cksum(@ip.hchecksum, orig_dip, xleaf.xdip);
+      incr_cksum(@ip.hchecksum, orig_sip, xleaf.xsip);
+      // the below are equivalent
+      pkt.rewrite_field($ip.dst, xleaf.xdip);
+      $ip.src = xleaf.xsip;
+      atomic_add(xleaf.xlated_pkts, 1);
+    }
+  }
+
+  state proto::udp {
+    on_valid(xleaf) {
+      incr_cksum(@udp.crc, orig_dip, xleaf.xdip, 1);
+      incr_cksum(@udp.crc, orig_sip, xleaf.xsip, 1);
+    }
+  }
+
+  state proto::tcp {
+    on_valid(xleaf) {
+      incr_cksum(@tcp.cksum, orig_dip, xleaf.xdip, 1);
+      incr_cksum(@tcp.cksum, orig_sip, xleaf.xsip, 1);
+    }
+  }
+
+  state proto::vxlan {
+  }
+
+  state proto::gre {
+  }
+
+  state EOP {
+    return ret;
+  }
+}
diff --git a/tests/python/test_xlate1.c b/tests/python/test_xlate1.c
new file mode 100644
index 0000000..5ca6717
--- /dev/null
+++ b/tests/python/test_xlate1.c
@@ -0,0 +1,98 @@
+// Copyright (c) PLUMgrid, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License")
+#include <bcc/proto.h>
+struct IPKey {
+  u32 dip;
+  u32 sip;
+};
+struct IPLeaf {
+  u32 xdip;
+  u32 xsip;
+  u64 ip_xlated_pkts;
+  u64 arp_xlated_pkts;
+};
+BPF_HASH(xlate, struct IPKey, struct IPLeaf, 1024);
+
+int on_packet(struct __sk_buff *skb) {
+  u8 *cursor = 0;
+
+  u32 orig_dip = 0;
+  u32 orig_sip = 0;
+  struct IPLeaf xleaf = {};
+
+  ethernet: {
+    struct ethernet_t *ethernet = cursor_advance(cursor, sizeof(*ethernet));
+    switch (ethernet->type) {
+      case ETH_P_IP: goto ip;
+      case ETH_P_ARP: goto arp;
+      case ETH_P_8021Q: goto dot1q;
+      default: goto EOP;
+    }
+  }
+
+  dot1q: {
+    struct dot1q_t *dot1q = cursor_advance(cursor, sizeof(*dot1q));
+    switch (dot1q->type) {
+      case ETH_P_IP: goto ip;
+      case ETH_P_ARP: goto arp;
+      default: goto EOP;
+    }
+  }
+
+  arp: {
+    struct arp_t *arp = cursor_advance(cursor, sizeof(*arp));
+    orig_dip = arp->tpa;
+    orig_sip = arp->spa;
+    struct IPKey key = {.dip=orig_dip, .sip=orig_sip};
+    struct IPLeaf *xleafp = xlate.lookup(&key);
+    if (xleafp) {
+      xleaf = *xleafp;
+      arp->tpa = xleaf.xdip;
+      arp->spa = xleaf.xsip;
+      lock_xadd(&xleafp->arp_xlated_pkts, 1);
+    }
+    goto EOP;
+  }
+
+  ip: {
+    struct ip_t *ip = cursor_advance(cursor, sizeof(*ip));
+    orig_dip = ip->dst;
+    orig_sip = ip->src;
+    struct IPKey key = {.dip=orig_dip, .sip=orig_sip};
+    struct IPLeaf *xleafp = xlate.lookup(&key);
+    if (xleafp) {
+      xleaf = *xleafp;
+      ip->dst = xleaf.xdip;
+      incr_cksum_l3(&ip->hchecksum, orig_dip, xleaf.xdip);
+      ip->src = xleaf.xsip;
+      incr_cksum_l3(&ip->hchecksum, orig_sip, xleaf.xsip);
+      lock_xadd(&xleafp->ip_xlated_pkts, 1);
+    }
+    switch (ip->nextp) {
+      case 6: goto tcp;
+      case 17: goto udp;
+      default: goto EOP;
+    }
+  }
+
+  udp: {
+    struct udp_t *udp = cursor_advance(cursor, sizeof(*udp));
+    if (xleaf.xdip) {
+      incr_cksum_l4(&udp->crc, orig_dip, xleaf.xdip, 1);
+      incr_cksum_l4(&udp->crc, orig_sip, xleaf.xsip, 1);
+    }
+    goto EOP;
+  }
+
+  tcp: {
+    struct tcp_t *tcp = cursor_advance(cursor, sizeof(*tcp));
+    if (xleaf.xdip) {
+      incr_cksum_l4(&tcp->cksum, orig_dip, xleaf.xdip, 1);
+      incr_cksum_l4(&tcp->cksum, orig_sip, xleaf.xsip, 1);
+    }
+    goto EOP;
+  }
+
+EOP:
+  return 0;
+}
diff --git a/tests/python/test_xlate1.py b/tests/python/test_xlate1.py
new file mode 100755
index 0000000..5183e2a
--- /dev/null
+++ b/tests/python/test_xlate1.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from netaddr import IPAddress
+from bcc import BPF
+from pyroute2 import IPRoute, protocols
+from socket import socket, AF_INET, SOCK_DGRAM
+from subprocess import call
+import sys
+from time import sleep
+from unittest import main, TestCase
+
+arg1 = sys.argv.pop(1)
+arg2 = ""
+if len(sys.argv) > 1:
+  arg2 = sys.argv.pop(1)
+
+class TestBPFFilter(TestCase):
+    def setUp(self):
+        b = BPF(arg1, arg2, debug=0)
+        fn = b.load_func("on_packet", BPF.SCHED_ACT)
+        ip = IPRoute()
+        ifindex = ip.link_lookup(ifname="eth0")[0]
+        # set up a network to change the flow:
+        #             outside      |       inside
+        # 172.16.1.1 - 172.16.1.2  |  192.168.1.1 - 192.168.1.2
+        ip.addr("del", index=ifindex, address="172.16.1.2", mask=24)
+        ip.addr("add", index=ifindex, address="192.168.1.2", mask=24)
+        # add an ingress and egress qdisc
+        ip.tc("add", "ingress", ifindex, "ffff:")
+        ip.tc("add", "sfq", ifindex, "1:")
+        # add same program to both ingress/egress, so pkt is translated in both directions
+        action = {"kind": "bpf", "fd": fn.fd, "name": fn.name, "action": "ok"}
+        ip.tc("add-filter", "u32", ifindex, ":1", parent="ffff:", action=[action],
+                protocol=protocols.ETH_P_ALL, classid=1, target=0x10002, keys=['0x0/0x0+0'])
+        ip.tc("add-filter", "u32", ifindex, ":2", parent="1:", action=[action],
+                protocol=protocols.ETH_P_ALL, classid=1, target=0x10002, keys=['0x0/0x0+0'])
+        self.xlate = b.get_table("xlate")
+
+    def test_xlate(self):
+        key1 = self.xlate.Key(IPAddress("172.16.1.2").value, IPAddress("172.16.1.1").value)
+        leaf1 = self.xlate.Leaf(IPAddress("192.168.1.2").value, IPAddress("192.168.1.1").value, 0, 0)
+        self.xlate[key1] = leaf1
+        key2 = self.xlate.Key(IPAddress("192.168.1.1").value, IPAddress("192.168.1.2").value)
+        leaf2 = self.xlate.Leaf(IPAddress("172.16.1.1").value, IPAddress("172.16.1.2").value, 0, 0)
+        self.xlate[key2] = leaf2
+        call(["ping", "-c1", "192.168.1.1"])
+        leaf = self.xlate[key1]
+        self.assertGreater(leaf.ip_xlated_pkts, 0)
+        self.assertGreater(leaf.arp_xlated_pkts, 0)
+        leaf = self.xlate[key2]
+        self.assertGreater(leaf.ip_xlated_pkts, 0)
+        self.assertGreater(leaf.arp_xlated_pkts, 0)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/utils.py b/tests/python/utils.py
new file mode 100644
index 0000000..c34370e
--- /dev/null
+++ b/tests/python/utils.py
@@ -0,0 +1,21 @@
+from pyroute2 import NSPopen
+from distutils.spawn import find_executable
+
+def has_executable(name):
+    path = find_executable(name)
+    if path is None:
+        raise Exception(name + ": command not found")
+    return path
+
+class NSPopenWithCheck(NSPopen):
+    """
+    A wrapper for NSPopen that additionally checks if the program
+    to be executed is available from the system path or not.
+    If found, it proceeds with the usual NSPopen() call.
+    Otherwise, it raises an exception.
+    """
+
+    def __init__(self, nsname, *argv, **kwarg):
+        name = list(argv)[0][0]
+        has_executable(name)
+        super(NSPopenWithCheck, self).__init__(nsname, *argv, **kwarg)
diff --git a/tests/wrapper.sh.in b/tests/wrapper.sh.in
new file mode 100755
index 0000000..90b63ec
--- /dev/null
+++ b/tests/wrapper.sh.in
@@ -0,0 +1,65 @@
+#!/bin/bash
+# Copyright (c) PLUMgrid, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+#set -x
+
+name=$1; shift
+kind=$1; shift
+cmd=$1; shift
+
+PYTHONPATH=@CMAKE_BINARY_DIR@/src/python
+LD_LIBRARY_PATH=@CMAKE_BINARY_DIR@:@CMAKE_BINARY_DIR@/src/cc
+
+ns=$name
+
+function cleanup() {
+  trap - EXIT
+  if [[ "$kind" = "namespace" ]]; then
+    sudo ip netns delete $ns
+  fi
+}
+
+trap cleanup EXIT
+
+function ns_run() {
+  sudo ip netns add $ns
+  sudo ip link add $ns.in type veth peer name $ns.out
+  sudo ip link set $ns.in netns $ns
+  sudo ip netns exec $ns ip link set $ns.in name eth0
+  sudo ip netns exec $ns ip addr add dev eth0 172.16.1.2/24
+  sudo ip netns exec $ns ip link set eth0 up
+  sudo ip netns exec $ns ethtool -K eth0 tx off
+  sudo ip addr add dev $ns.out 172.16.1.1/24
+  sudo ip link set $ns.out up
+  sudo bash -c "PYTHONPATH=$PYTHONPATH LD_LIBRARY_PATH=$LD_LIBRARY_PATH ip netns exec $ns $cmd $1 $2"
+  return $?
+}
+function sudo_run() {
+  sudo bash -c "PYTHONPATH=$PYTHONPATH LD_LIBRARY_PATH=$LD_LIBRARY_PATH $cmd $1 $2"
+  return $?
+}
+function simple_run() {
+  PYTHONPATH=$PYTHONPATH LD_LIBRARY_PATH=$LD_LIBRARY_PATH $cmd $1 $2
+  return $?
+}
+
+case $kind in
+  namespace)
+    ns_run $@
+    ;;
+  sudo)
+    sudo_run $@
+    ;;
+  simple)
+    simple_run $@
+    ;;
+  *)
+    echo "Invalid kind $kind"
+    exit 1
+    ;;
+esac
+
+[[ $? -ne 0 ]] && { echo "Failed"; exit 1; }
+
+exit 0
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
new file mode 100644
index 0000000..77f96f7
--- /dev/null
+++ b/tools/CMakeLists.txt
@@ -0,0 +1,26 @@
+file(GLOB C_FILES *.c)
+file(GLOB PY_FILES *.py)
+file(GLOB SH_FILES *.sh)
+file(GLOB TXT_FILES *.txt)
+list(REMOVE_ITEM TXT_FILES ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt)
+foreach(FIL ${PY_FILES})
+  get_filename_component(FIL_WE ${FIL} NAME_WE)
+  install(PROGRAMS ${FIL} DESTINATION share/bcc/tools RENAME ${FIL_WE})
+endforeach()
+foreach(FIL ${SH_FILES})
+  if(${FIL} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/reset-trace.sh)
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    install(PROGRAMS ${FIL} DESTINATION share/bcc/tools RENAME ${FIL_WE})
+  else()
+    file(READ ${FIL} CONTENT)
+    string(REPLACE ".py -l" " -l" CONTENT_WE ${CONTENT})
+    string(REPLACE "\"" "\\\"" CONTENT_WE ${CONTENT_WE})
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    install(PROGRAMS ${FIL} DESTINATION share/bcc/tools RENAME ${FIL_WE})
+    install(CODE "file(WRITE \"\$ENV{DESTDIR}/\${CMAKE_INSTALL_PREFIX}/share/bcc/tools/${FIL_WE}\" \"${CONTENT_WE}\")")
+  endif()
+endforeach()
+install(FILES ${C_FILES} DESTINATION share/bcc/tools)
+install(FILES ${TXT_FILES} DESTINATION share/bcc/tools/doc)
+add_subdirectory(lib)
+add_subdirectory(old)
diff --git a/tools/argdist.py b/tools/argdist.py
new file mode 100755
index 0000000..bbf6273
--- /dev/null
+++ b/tools/argdist.py
@@ -0,0 +1,723 @@
+#!/usr/bin/env python
+#
+# argdist   Trace a function and display a distribution of its
+#           parameter values as a histogram or frequency count.
+#
+# USAGE: argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-v]
+#                [-c] [-T TOP] [-C specifier] [-H specifier] [-I header]
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+# Copyright (C) 2016 Sasha Goldshtein.
+
+from bcc import BPF, USDT
+from time import sleep, strftime
+import argparse
+import re
+import traceback
+import os
+import sys
+
+class Probe(object):
+        next_probe_index = 0
+        streq_index = 0
+        aliases = {"$PID": "(bpf_get_current_pid_tgid() >> 32)"}
+
+        def _substitute_aliases(self, expr):
+                if expr is None:
+                        return expr
+                for alias, subst in Probe.aliases.items():
+                        expr = expr.replace(alias, subst)
+                return expr
+
+        def _parse_signature(self):
+                params = map(str.strip, self.signature.split(','))
+                self.param_types = {}
+                for param in params:
+                        # If the type is a pointer, the * can be next to the
+                        # param name. Other complex types like arrays are not
+                        # supported right now.
+                        index = param.rfind('*')
+                        index = index if index != -1 else param.rfind(' ')
+                        param_type = param[0:index + 1].strip()
+                        param_name = param[index + 1:].strip()
+                        self.param_types[param_name] = param_type
+
+        def _generate_entry(self):
+                self.entry_probe_func = self.probe_func_name + "_entry"
+                text = """
+int PROBENAME(struct pt_regs *ctx SIGNATURE)
+{
+        u64 __pid_tgid = bpf_get_current_pid_tgid();
+        u32 __pid      = __pid_tgid;        // lower 32 bits
+        u32 __tgid     = __pid_tgid >> 32;  // upper 32 bits
+        PID_FILTER
+        COLLECT
+        return 0;
+}
+"""
+                text = text.replace("PROBENAME", self.entry_probe_func)
+                text = text.replace("SIGNATURE",
+                     "" if len(self.signature) == 0 else ", " + self.signature)
+                text = text.replace("PID_FILTER", self._generate_pid_filter())
+                collect = ""
+                for pname in self.args_to_probe:
+                        param_hash = self.hashname_prefix + pname
+                        if pname == "__latency":
+                                collect += """
+u64 __time = bpf_ktime_get_ns();
+%s.update(&__pid, &__time);
+                        """ % param_hash
+                        else:
+                                collect += "%s.update(&__pid, &%s);\n" % \
+                                           (param_hash, pname)
+                text = text.replace("COLLECT", collect)
+                return text
+
+        def _generate_entry_probe(self):
+                # Any $entry(name) expressions result in saving that argument
+                # when entering the function.
+                self.args_to_probe = set()
+                regex = r"\$entry\((\w+)\)"
+                for expr in self.exprs:
+                        for arg in re.finditer(regex, expr):
+                                self.args_to_probe.add(arg.group(1))
+                for arg in re.finditer(regex, self.filter):
+                        self.args_to_probe.add(arg.group(1))
+                if any(map(lambda expr: "$latency" in expr, self.exprs)) or \
+                   "$latency" in self.filter:
+                        self.args_to_probe.add("__latency")
+                        self.param_types["__latency"] = "u64"    # nanoseconds
+                for pname in self.args_to_probe:
+                        if pname not in self.param_types:
+                                raise ValueError("$entry(%s): no such param" %
+                                                 arg)
+
+                self.hashname_prefix = "%s_param_" % self.probe_hash_name
+                text = ""
+                for pname in self.args_to_probe:
+                        # Each argument is stored in a separate hash that is
+                        # keyed by pid.
+                        text += "BPF_HASH(%s, u32, %s);\n" % \
+                             (self.hashname_prefix + pname,
+                              self.param_types[pname])
+                text += self._generate_entry()
+                return text
+
+        def _generate_retprobe_prefix(self):
+                # After we're done here, there are __%s_val variables for each
+                # argument we needed to probe using $entry(name), and they all
+                # have values (which isn't necessarily the case if we missed
+                # the method entry probe).
+                text = ""
+                self.param_val_names = {}
+                for pname in self.args_to_probe:
+                        val_name = "__%s_val" % pname
+                        text += "%s *%s = %s.lookup(&__pid);\n" % \
+                                (self.param_types[pname], val_name,
+                                 self.hashname_prefix + pname)
+                        text += "if (%s == 0) { return 0 ; }\n" % val_name
+                        self.param_val_names[pname] = val_name
+                return text
+
+        def _replace_entry_exprs(self):
+                for pname, vname in self.param_val_names.items():
+                        if pname == "__latency":
+                                entry_expr = "$latency"
+                                val_expr = "(bpf_ktime_get_ns() - *%s)" % vname
+                        else:
+                                entry_expr = "$entry(%s)" % pname
+                                val_expr = "(*%s)" % vname
+                        for i in range(0, len(self.exprs)):
+                                self.exprs[i] = self.exprs[i].replace(
+                                                entry_expr, val_expr)
+                        self.filter = self.filter.replace(entry_expr,
+                                                          val_expr)
+
+        def _attach_entry_probe(self):
+                if self.is_user:
+                        self.bpf.attach_uprobe(name=self.library,
+                                               sym=self.function,
+                                               fn_name=self.entry_probe_func,
+                                               pid=self.pid or -1)
+                else:
+                        self.bpf.attach_kprobe(event=self.function,
+                                               fn_name=self.entry_probe_func)
+
+        def _bail(self, error):
+                raise ValueError("error parsing probe '%s': %s" %
+                                 (self.raw_spec, error))
+
+        def _validate_specifier(self):
+                # Everything after '#' is the probe label, ignore it
+                spec = self.raw_spec.split('#')[0]
+                parts = spec.strip().split(':')
+                if len(parts) < 3:
+                        self._bail("at least the probe type, library, and " +
+                                   "function signature must be specified")
+                if len(parts) > 6:
+                        self._bail("extraneous ':'-separated parts detected")
+                if parts[0] not in ["r", "p", "t", "u"]:
+                        self._bail("probe type must be 'p', 'r', 't', or 'u'" +
+                                   " but got '%s'" % parts[0])
+                if re.match(r"\S+\(.*\)", parts[2]) is None:
+                        self._bail(("function signature '%s' has an invalid " +
+                                    "format") % parts[2])
+
+        def _parse_expr_types(self, expr_types):
+                if len(expr_types) == 0:
+                        self._bail("no expr types specified")
+                self.expr_types = expr_types.split(',')
+
+        def _parse_exprs(self, exprs):
+                if len(exprs) == 0:
+                        self._bail("no exprs specified")
+                self.exprs = exprs.split(',')
+
+        def _make_valid_identifier(self, ident):
+                return re.sub(r'[^A-Za-z0-9_]', '_', ident)
+
+        def __init__(self, tool, type, specifier):
+                self.usdt_ctx = None
+                self.streq_functions = ""
+                self.pid = tool.args.pid
+                self.cumulative = tool.args.cumulative or False
+                self.raw_spec = specifier
+                self._validate_specifier()
+
+                spec_and_label = specifier.split('#')
+                self.label = spec_and_label[1] \
+                             if len(spec_and_label) == 2 else None
+
+                parts = spec_and_label[0].strip().split(':')
+                self.type = type    # hist or freq
+                self.probe_type = parts[0]
+                fparts = parts[2].split('(')
+                self.function = fparts[0].strip()
+                if self.probe_type == "t":
+                        self.library = ""       # kernel
+                        self.tp_category = parts[1]
+                        self.tp_event = self.function
+                elif self.probe_type == "u":
+                        self.library = parts[1]
+                        self.probe_func_name = self._make_valid_identifier(
+                                "%s_probe%d" %
+                                (self.function, Probe.next_probe_index))
+                        self._enable_usdt_probe()
+                else:
+                        self.library = parts[1]
+                self.is_user = len(self.library) > 0
+                self.signature = fparts[1].strip()[:-1]
+                self._parse_signature()
+
+                # If the user didn't specify an expression to probe, we probe
+                # the retval in a ret probe, or simply the value "1" otherwise.
+                self.is_default_expr = len(parts) < 5
+                if not self.is_default_expr:
+                        self._parse_expr_types(parts[3])
+                        self._parse_exprs(parts[4])
+                        if len(self.exprs) != len(self.expr_types):
+                                self._bail("mismatched # of exprs and types")
+                        if self.type == "hist" and len(self.expr_types) > 1:
+                                self._bail("histograms can only have 1 expr")
+                else:
+                        if not self.probe_type == "r" and self.type == "hist":
+                                self._bail("histograms must have expr")
+                        self.expr_types = \
+                          ["u64" if not self.probe_type == "r" else "int"]
+                        self.exprs = \
+                          ["1" if not self.probe_type == "r" else "$retval"]
+                self.filter = "" if len(parts) != 6 else parts[5]
+                self._substitute_exprs()
+
+                # Do we need to attach an entry probe so that we can collect an
+                # argument that is required for an exit (return) probe?
+                def check(expr):
+                        keywords = ["$entry", "$latency"]
+                        return any(map(lambda kw: kw in expr, keywords))
+                self.entry_probe_required = self.probe_type == "r" and \
+                        (any(map(check, self.exprs)) or check(self.filter))
+
+                self.probe_func_name = self._make_valid_identifier(
+                        "%s_probe%d" %
+                        (self.function, Probe.next_probe_index))
+                self.probe_hash_name = self._make_valid_identifier(
+                        "%s_hash%d" %
+                        (self.function, Probe.next_probe_index))
+                Probe.next_probe_index += 1
+
+        def _enable_usdt_probe(self):
+                self.usdt_ctx = USDT(path=self.library, pid=self.pid)
+                self.usdt_ctx.enable_probe(
+                        self.function, self.probe_func_name)
+
+        def _generate_streq_function(self, string):
+                fname = "streq_%d" % Probe.streq_index
+                Probe.streq_index += 1
+                self.streq_functions += """
+static inline bool %s(char const *ignored, char const *str) {
+        char needle[] = %s;
+        char haystack[sizeof(needle)];
+        bpf_probe_read(&haystack, sizeof(haystack), (void *)str);
+        for (int i = 0; i < sizeof(needle) - 1; ++i) {
+                if (needle[i] != haystack[i]) {
+                        return false;
+                }
+        }
+        return true;
+}
+                """ % (fname, string)
+                return fname
+
+        def _substitute_exprs(self):
+                def repl(expr):
+                        expr = self._substitute_aliases(expr)
+                        matches = re.finditer('STRCMP\\(("[^"]+\\")', expr)
+                        for match in matches:
+                                string = match.group(1)
+                                fname = self._generate_streq_function(string)
+                                expr = expr.replace("STRCMP", fname, 1)
+                        return expr.replace("$retval", "PT_REGS_RC(ctx)")
+                for i in range(0, len(self.exprs)):
+                        self.exprs[i] = repl(self.exprs[i])
+                self.filter = repl(self.filter)
+
+        def _is_string(self, expr_type):
+                return expr_type == "char*" or expr_type == "char *"
+
+        def _generate_hash_field(self, i):
+                if self._is_string(self.expr_types[i]):
+                        return "struct __string_t v%d;\n" % i
+                else:
+                        return "%s v%d;\n" % (self.expr_types[i], i)
+
+        def _generate_usdt_arg_assignment(self, i):
+                expr = self.exprs[i]
+                if self.probe_type == "u" and expr[0:3] == "arg":
+                        arg_index = int(expr[3])
+                        arg_ctype = self.usdt_ctx.get_probe_arg_ctype(
+                                self.function, arg_index - 1)
+                        return ("        %s %s = 0;\n" +
+                                "        bpf_usdt_readarg(%s, ctx, &%s);\n") \
+                                % (arg_ctype, expr, expr[3], expr)
+                else:
+                        return ""
+
+        def _generate_field_assignment(self, i):
+                text = self._generate_usdt_arg_assignment(i)
+                if self._is_string(self.expr_types[i]):
+                        return (text + "        bpf_probe_read(&__key.v%d.s," +
+                                " sizeof(__key.v%d.s), (void *)%s);\n") % \
+                                (i, i, self.exprs[i])
+                else:
+                        return text + "        __key.v%d = %s;\n" % \
+                               (i, self.exprs[i])
+
+        def _generate_hash_decl(self):
+                if self.type == "hist":
+                        return "BPF_HISTOGRAM(%s, %s);" % \
+                               (self.probe_hash_name, self.expr_types[0])
+                else:
+                        text = "struct %s_key_t {\n" % self.probe_hash_name
+                        for i in range(0, len(self.expr_types)):
+                                text += self._generate_hash_field(i)
+                        text += "};\n"
+                        text += "BPF_HASH(%s, struct %s_key_t, u64);\n" % \
+                                (self.probe_hash_name, self.probe_hash_name)
+                        return text
+
+        def _generate_key_assignment(self):
+                if self.type == "hist":
+                        return self._generate_usdt_arg_assignment(0) + \
+                               ("%s __key = %s;\n" %
+                                (self.expr_types[0], self.exprs[0]))
+                else:
+                        text = "struct %s_key_t __key = {};\n" % \
+                                self.probe_hash_name
+                        for i in range(0, len(self.exprs)):
+                                text += self._generate_field_assignment(i)
+                        return text
+
+        def _generate_hash_update(self):
+                if self.type == "hist":
+                        return "%s.increment(bpf_log2l(__key));" % \
+                                self.probe_hash_name
+                else:
+                        return "%s.increment(__key);" % self.probe_hash_name
+
+        def _generate_pid_filter(self):
+                # Kernel probes need to explicitly filter pid, because the
+                # attach interface doesn't support pid filtering
+                if self.pid is not None and not self.is_user:
+                        return "if (__tgid != %d) { return 0; }" % self.pid
+                else:
+                        return ""
+
+        def generate_text(self):
+                program = ""
+                probe_text = """
+DATA_DECL
+                """ + (
+                    "TRACEPOINT_PROBE(%s, %s)" %
+                    (self.tp_category, self.tp_event)
+                    if self.probe_type == "t"
+                    else "int PROBENAME(struct pt_regs *ctx SIGNATURE)") + """
+{
+        u64 __pid_tgid = bpf_get_current_pid_tgid();
+        u32 __pid      = __pid_tgid;        // lower 32 bits
+        u32 __tgid     = __pid_tgid >> 32;  // upper 32 bits
+        PID_FILTER
+        PREFIX
+        if (!(FILTER)) return 0;
+        KEY_EXPR
+        COLLECT
+        return 0;
+}
+"""
+                prefix = ""
+                signature = ""
+
+                # If any entry arguments are probed in a ret probe, we need
+                # to generate an entry probe to collect them
+                if self.entry_probe_required:
+                        program += self._generate_entry_probe()
+                        prefix += self._generate_retprobe_prefix()
+                        # Replace $entry(paramname) with a reference to the
+                        # value we collected when entering the function:
+                        self._replace_entry_exprs()
+
+                if self.probe_type == "p" and len(self.signature) > 0:
+                        # Only entry uprobes/kprobes can have user-specified
+                        # signatures. Other probes force it to ().
+                        signature = ", " + self.signature
+
+                program += probe_text.replace("PROBENAME",
+                                              self.probe_func_name)
+                program = program.replace("SIGNATURE", signature)
+                program = program.replace("PID_FILTER",
+                                          self._generate_pid_filter())
+
+                decl = self._generate_hash_decl()
+                key_expr = self._generate_key_assignment()
+                collect = self._generate_hash_update()
+                program = program.replace("DATA_DECL", decl)
+                program = program.replace("KEY_EXPR", key_expr)
+                program = program.replace("FILTER",
+                        "1" if len(self.filter) == 0 else self.filter)
+                program = program.replace("COLLECT", collect)
+                program = program.replace("PREFIX", prefix)
+
+                return self.streq_functions + program
+
+        def _attach_u(self):
+                libpath = BPF.find_library(self.library)
+                if libpath is None:
+                        libpath = BPF.find_exe(self.library)
+                if libpath is None or len(libpath) == 0:
+                        self._bail("unable to find library %s" % self.library)
+
+                if self.probe_type == "r":
+                        self.bpf.attach_uretprobe(name=libpath,
+                                                  sym=self.function,
+                                                  fn_name=self.probe_func_name,
+                                                  pid=self.pid or -1)
+                else:
+                        self.bpf.attach_uprobe(name=libpath,
+                                               sym=self.function,
+                                               fn_name=self.probe_func_name,
+                                               pid=self.pid or -1)
+
+        def _attach_k(self):
+                if self.probe_type == "t":
+                        pass    # Nothing to do for tracepoints
+                elif self.probe_type == "r":
+                        self.bpf.attach_kretprobe(event=self.function,
+                                             fn_name=self.probe_func_name)
+                else:
+                        self.bpf.attach_kprobe(event=self.function,
+                                          fn_name=self.probe_func_name)
+
+        def attach(self, bpf):
+                self.bpf = bpf
+                if self.probe_type == "u":
+                        return
+                if self.is_user:
+                        self._attach_u()
+                else:
+                        self._attach_k()
+                if self.entry_probe_required:
+                        self._attach_entry_probe()
+
+        def _v2s(self, v):
+                # Most fields can be converted with plain str(), but strings
+                # are wrapped in a __string_t which has an .s field
+                if "__string_t" in type(v).__name__:
+                        return str(v.s)
+                return str(v)
+
+        def _display_expr(self, i):
+                # Replace ugly latency calculation with $latency
+                expr = self.exprs[i].replace(
+                        "(bpf_ktime_get_ns() - *____latency_val)", "$latency")
+                # Replace alias values back with the alias name
+                for alias, subst in Probe.aliases.items():
+                        expr = expr.replace(subst, alias)
+                # Replace retval expression with $retval
+                expr = expr.replace("PT_REGS_RC(ctx)", "$retval")
+                # Replace ugly (*__param_val) expressions with param name
+                return re.sub(r"\(\*__(\w+)_val\)", r"\1", expr)
+
+        def _display_key(self, key):
+                if self.is_default_expr:
+                        if not self.probe_type == "r":
+                                return "total calls"
+                        else:
+                                return "retval = %s" % str(key.v0)
+                else:
+                        # The key object has v0, ..., vk fields containing
+                        # the values of the expressions from self.exprs
+                        def str_i(i):
+                                key_i = self._v2s(getattr(key, "v%d" % i))
+                                return "%s = %s" % \
+                                        (self._display_expr(i), key_i)
+                        return ", ".join(map(str_i, range(0, len(self.exprs))))
+
+        def display(self, top):
+                data = self.bpf.get_table(self.probe_hash_name)
+                if self.type == "freq":
+                        print(self.label or self.raw_spec)
+                        print("\t%-10s %s" % ("COUNT", "EVENT"))
+                        sdata = sorted(data.items(), key=lambda p: p[1].value)
+                        if top is not None:
+                                sdata = sdata[-top:]
+                        for key, value in sdata:
+                                # Print some nice values if the user didn't
+                                # specify an expression to probe
+                                if self.is_default_expr:
+                                        if not self.probe_type == "r":
+                                                key_str = "total calls"
+                                        else:
+                                                key_str = "retval = %s" % \
+                                                          self._v2s(key.v0)
+                                else:
+                                        key_str = self._display_key(key)
+                                print("\t%-10s %s" %
+                                      (str(value.value), key_str))
+                elif self.type == "hist":
+                        label = self.label or (self._display_expr(0)
+                                if not self.is_default_expr else "retval")
+                        data.print_log2_hist(val_type=label)
+                if not self.cumulative:
+                        data.clear()
+
+        def __str__(self):
+                return self.label or self.raw_spec
+
+class Tool(object):
+        examples = """
+Probe specifier syntax:
+        {p,r,t,u}:{[library],category}:function(signature)[:type[,type...]:expr[,expr...][:filter]][#label]
+Where:
+        p,r,t,u    -- probe at function entry, function exit, kernel
+                      tracepoint, or USDT probe
+                      in exit probes: can use $retval, $entry(param), $latency
+        library    -- the library that contains the function
+                      (leave empty for kernel functions)
+        category   -- the category of the kernel tracepoint (e.g. net, sched)
+        function   -- the function name to trace (or tracepoint name)
+        signature  -- the function's parameters, as in the C header
+        type       -- the type of the expression to collect (supports multiple)
+        expr       -- the expression to collect (supports multiple)
+        filter     -- the filter that is applied to collected values
+        label      -- the label for this probe in the resulting output
+
+EXAMPLES:
+
+argdist -H 'p::__kmalloc(u64 size):u64:size'
+        Print a histogram of allocation sizes passed to kmalloc
+
+argdist -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16'
+        Print a frequency count of how many times process 1005 called malloc
+        with an allocation size of 16 bytes
+
+argdist -C 'r:c:gets():char*:(char*)$retval#snooped strings'
+        Snoop on all strings returned by gets()
+
+argdist -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size)#ns per byte'
+        Print a histogram of nanoseconds per byte from kmalloc allocations
+
+argdist -C 'p::__kmalloc(size_t sz, gfp_t flags):size_t:sz:flags&GFP_ATOMIC'
+        Print frequency count of kmalloc allocation sizes that have GFP_ATOMIC
+
+argdist -p 1005 -C 'p:c:write(int fd):int:fd' -T 5
+        Print frequency counts of how many times writes were issued to a
+        particular file descriptor number, in process 1005, but only show
+        the top 5 busiest fds
+
+argdist -p 1005 -H 'r:c:read()'
+        Print a histogram of results (sizes) returned by read() in process 1005
+
+argdist -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+        Print frequency of reads by process where the latency was >0.1ms
+
+argdist -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:
+            $entry(count):$latency > 1000000'
+        Print a histogram of read sizes that were longer than 1ms
+
+argdist -H \\
+        'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1'
+        Print a histogram of buffer sizes passed to write() across all
+        processes, where the file descriptor was 1 (STDOUT)
+
+argdist -C 'p:c:fork()#fork calls'
+        Count fork() calls in libc across all processes
+        Can also use funccount.py, which is easier and more flexible
+
+argdist -H 't:block:block_rq_complete():u32:args->nr_sector'
+        Print histogram of number of sectors in completing block I/O requests
+
+argdist -C 't:irq:irq_handler_entry():int:args->irq'
+        Aggregate interrupts by interrupt request (IRQ)
+
+argdist -C 'u:pthread:pthread_start():u64:arg2' -p 1337
+        Print frequency of function addresses used as a pthread start function,
+        relying on the USDT pthread_start probe in process 1337
+
+argdist -H 'p:c:sleep(u32 seconds):u32:seconds' \\
+        -H 'p:c:nanosleep(struct timespec *req):long:req->tv_nsec'
+        Print histograms of sleep() and nanosleep() parameter values
+
+argdist -p 2780 -z 120 \\
+        -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1'
+        Spy on writes to STDOUT performed by process 2780, up to a string size
+        of 120 characters
+
+argdist -I 'kernel/sched/sched.h' \\
+        -C 'p::__account_cfs_rq_runtime(struct cfs_rq *cfs_rq):s64:cfs_rq->runtime_remaining'
+        Trace on the cfs scheduling runqueue remaining runtime. The struct cfs_rq is defined
+        in kernel/sched/sched.h which is in kernel source tree and not in kernel-devel
+        package.  So this command needs to run at the kernel source tree root directory
+        so that the added header file can be found by the compiler.
+"""
+
+        def __init__(self):
+                parser = argparse.ArgumentParser(description="Trace a " +
+                  "function and display a summary of its parameter values.",
+                  formatter_class=argparse.RawDescriptionHelpFormatter,
+                  epilog=Tool.examples)
+                parser.add_argument("-p", "--pid", type=int,
+                  help="id of the process to trace (optional)")
+                parser.add_argument("-z", "--string-size", default=80,
+                  type=int,
+                  help="maximum string size to read from char* arguments")
+                parser.add_argument("-i", "--interval", default=1, type=int,
+                  help="output interval, in seconds (default 1 second)")
+                parser.add_argument("-d", "--duration", type=int,
+                  help="total duration of trace, in seconds")
+                parser.add_argument("-n", "--number", type=int, dest="count",
+                  help="number of outputs")
+                parser.add_argument("-v", "--verbose", action="store_true",
+                  help="print resulting BPF program code before executing")
+                parser.add_argument("-c", "--cumulative", action="store_true",
+                  help="do not clear histograms and freq counts at " +
+                       "each interval")
+                parser.add_argument("-T", "--top", type=int,
+                  help="number of top results to show (not applicable to " +
+                  "histograms)")
+                parser.add_argument("-H", "--histogram", action="append",
+                  dest="histspecifier", metavar="specifier",
+                  help="probe specifier to capture histogram of " +
+                  "(see examples below)")
+                parser.add_argument("-C", "--count", action="append",
+                  dest="countspecifier", metavar="specifier",
+                  help="probe specifier to capture count of " +
+                  "(see examples below)")
+                parser.add_argument("-I", "--include", action="append",
+                  metavar="header",
+                  help="additional header files to include in the BPF program "
+                       "as either full path, "
+                       "or relative to current working directory, "
+                       "or relative to default kernel header search path")
+                self.args = parser.parse_args()
+                self.usdt_ctx = None
+
+        def _create_probes(self):
+                self.probes = []
+                for specifier in (self.args.countspecifier or []):
+                        self.probes.append(Probe(self, "freq", specifier))
+                for histspecifier in (self.args.histspecifier or []):
+                        self.probes.append(Probe(self, "hist", histspecifier))
+                if len(self.probes) == 0:
+                        print("at least one specifier is required")
+                        exit(1)
+
+        def _generate_program(self):
+                bpf_source = """
+struct __string_t { char s[%d]; };
+
+#include <uapi/linux/ptrace.h>
+                """ % self.args.string_size
+                for include in (self.args.include or []):
+                        if include.startswith((".", "/")):
+                                include = os.path.abspath(include)
+                                bpf_source += "#include \"%s\"\n" % include
+                        else:
+                                bpf_source += "#include <%s>\n" % include
+
+                bpf_source += BPF.generate_auto_includes(
+                                map(lambda p: p.raw_spec, self.probes))
+                for probe in self.probes:
+                        bpf_source += probe.generate_text()
+                if self.args.verbose:
+                        for text in [probe.usdt_ctx.get_text()
+                                     for probe in self.probes
+                                     if probe.usdt_ctx]:
+                            print(text)
+                        print(bpf_source)
+                usdt_contexts = [probe.usdt_ctx
+                                 for probe in self.probes if probe.usdt_ctx]
+                self.bpf = BPF(text=bpf_source, usdt_contexts=usdt_contexts)
+
+        def _attach(self):
+                for probe in self.probes:
+                        probe.attach(self.bpf)
+                if self.args.verbose:
+                        print("open uprobes: %s" % list(self.bpf.uprobe_fds.keys()))
+                        print("open kprobes: %s" % list(self.bpf.kprobe_fds.keys()))
+
+        def _main_loop(self):
+                count_so_far = 0
+                seconds = 0
+                while True:
+                        try:
+                                sleep(self.args.interval)
+                                seconds += self.args.interval
+                        except KeyboardInterrupt:
+                                exit()
+                        print("[%s]" % strftime("%H:%M:%S"))
+                        for probe in self.probes:
+                                probe.display(self.args.top)
+                        count_so_far += 1
+                        if self.args.count is not None and \
+                           count_so_far >= self.args.count:
+                                exit()
+                        if self.args.duration and \
+                           seconds >= self.args.duration:
+                                exit()
+
+        def run(self):
+                try:
+                        self._create_probes()
+                        self._generate_program()
+                        self._attach()
+                        self._main_loop()
+                except:
+                        exc_info = sys.exc_info()
+                        sys_exit = exc_info[0] is SystemExit
+                        if self.args.verbose:
+                                traceback.print_exc()
+                        elif not sys_exit:
+                                print(exc_info[1])
+                        exit(0 if sys_exit else 1)
+
+if __name__ == "__main__":
+        Tool().run()
diff --git a/tools/argdist_example.txt b/tools/argdist_example.txt
new file mode 100644
index 0000000..7098e56
--- /dev/null
+++ b/tools/argdist_example.txt
@@ -0,0 +1,451 @@
+Demonstrations of argdist.
+
+
+argdist probes functions you specify and collects parameter values into a
+histogram or a frequency count. This can be used to understand the distribution
+of values a certain parameter takes, filter and print interesting parameters
+without attaching a debugger, and obtain general execution statistics on
+various functions.
+
+For example, suppose you want to find what allocation sizes are common in
+your application:
+
+# ./argdist -p 2420 -c -C 'p:c:malloc(size_t size):size_t:size'
+[01:42:29]
+p:c:malloc(size_t size):size_t:size
+        COUNT      EVENT
+[01:42:30]
+p:c:malloc(size_t size):size_t:size
+        COUNT      EVENT
+[01:42:31]
+p:c:malloc(size_t size):size_t:size
+        COUNT      EVENT
+        1          size = 16
+[01:42:32]
+p:c:malloc(size_t size):size_t:size
+        COUNT      EVENT
+        2          size = 16
+[01:42:33]
+p:c:malloc(size_t size):size_t:size
+        COUNT      EVENT
+        3          size = 16
+[01:42:34]
+p:c:malloc(size_t size):size_t:size
+        COUNT      EVENT
+        4          size = 16
+^C
+
+It seems that the application is allocating blocks of size 16. The COUNT
+column contains the number of occurrences of a particular event, and the
+EVENT column describes the event. In this case, the "size" parameter was 
+probed and its value was 16, repeatedly.
+
+Now, suppose you wanted a histogram of buffer sizes passed to the write()
+function across the system:
+
+# ./argdist -c -H 'p:c:write(int fd, void *buf, size_t len):size_t:len'
+[01:45:22]
+p:c:write(int fd, void *buf, size_t len):size_t:len
+     len                 : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 2        |*************                           |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 2        |*************                           |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 6        |****************************************|
+[01:45:23]
+p:c:write(int fd, void *buf, size_t len):size_t:len
+     len                 : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 11       |***************                         |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 4        |*****                                   |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 28       |****************************************|
+        64 -> 127        : 12       |*****************                       |
+[01:45:24]
+p:c:write(int fd, void *buf, size_t len):size_t:len
+     len                 : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 21       |****************                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 6        |****                                    |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 52       |****************************************|
+        64 -> 127        : 26       |********************                    |
+^C
+
+It seems that most writes fall into three buckets: very small writes of 2-3
+bytes, medium writes of 32-63 bytes, and larger writes of 64-127 bytes.
+
+But these are writes across the board -- what if you wanted to focus on writes
+to STDOUT?
+
+# ./argdist -c -H 'p:c:write(int fd, void *buf, size_t len):size_t:len:fd==1'
+[01:47:17]
+p:c:write(int fd, void *buf, size_t len):size_t:len:fd==1
+     len                 : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |****************************************|
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 1        |****************************************|
+[01:47:18]
+p:c:write(int fd, void *buf, size_t len):size_t:len:fd==1
+     len                 : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 2        |*************                           |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 3        |********************                    |
+        64 -> 127        : 6        |****************************************|
+[01:47:19]
+p:c:write(int fd, void *buf, size_t len):size_t:len:fd==1
+     len                 : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 3        |*********                               |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 5        |***************                         |
+        64 -> 127        : 13       |****************************************|
+^C
+
+The "fd==1" part is a filter that is applied to every invocation of write().
+Only if the filter condition is true, the value is recorded.
+
+You can also use argdist to trace kernel functions. For example, suppose you
+wanted a histogram of kernel allocation (kmalloc) sizes across the system,
+printed twice with 3 second intervals:
+
+# ./argdist -i 3 -n 2 -H 'p::__kmalloc(size_t size):size_t:size'
+[01:50:00]
+p::__kmalloc(size_t size):size_t:size
+     size                : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 6        |****************************************|
+[01:50:03]
+p::__kmalloc(size_t size):size_t:size
+     size                : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 22       |****************************************|
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 5        |*********                               |
+       128 -> 255        : 2        |***                                     |
+
+Occasionally, numeric information isn't enough and you want to capture strings.
+What are the strings printed by puts() across the system?
+
+# ./argdist -i 10 -n 1 -C 'p:c:puts(char *str):char*:str'
+[01:53:54]
+p:c:puts(char *str):char*:str
+        COUNT      EVENT
+        2          str = Press ENTER to start.
+
+It looks like the message "Press ENTER to start." was printed twice during the
+10 seconds we were tracing.
+
+What about reads? You could trace gets() across the system and print the 
+strings input by the user (note how "r" is used instead of "p" to attach a
+probe to the function's return):
+
+# ./argdist -i 10 -n 1 -C 'r:c:gets():char*:(char*)$retval:$retval!=0'
+[02:12:23]
+r:c:gets():char*:(char*)$retval:$retval!=0
+        COUNT      EVENT
+        1          (char*)$retval = hi there
+        3          (char*)$retval = sasha
+        8          (char*)$retval = hello
+
+Similarly, we could get a histogram of the error codes returned by read():
+
+# ./argdist -i 10 -n 1 -H 'r:c:read()'
+[02:15:36]
+r:c:read()
+     retval              : count     distribution
+         0 -> 1          : 29       |****************************************|
+         2 -> 3          : 11       |***************                         |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 3        |****                                    |
+        16 -> 31         : 2        |**                                      |
+        32 -> 63         : 22       |******************************          |
+        64 -> 127        : 5        |******                                  |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 1        |*                                       |
+       512 -> 1023       : 1        |*                                       |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 2        |**                                      |
+
+In return probes, you can also trace the latency of the function (unless it is
+recursive) and the parameters it had on entry. For example, we can identify
+which processes are performing slow synchronous filesystem reads -- say,
+longer than 0.1ms (100,000ns):
+
+# ./argdist -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+[01:08:48]
+r::__vfs_read():u32:$PID:$latency > 100000
+        COUNT      EVENT
+        1          $PID = 10457
+        21         $PID = 2780
+[01:08:49]
+r::__vfs_read():u32:$PID:$latency > 100000
+        COUNT      EVENT
+        1          $PID = 10457
+        21         $PID = 2780
+^C
+
+It looks like process 2780 performed 21 slow reads.
+
+Occasionally, entry parameter values are also interesting. For example, you
+might be curious how long it takes malloc() to allocate memory -- nanoseconds
+per byte allocated. Let's go:
+
+# ./argdist -H 'r:c:malloc(size_t size):u64:$latency/$entry(size);ns per byte' -n 1 -i 10
+[01:11:13]
+     ns per byte         : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 4        |*****************                       |
+         4 -> 7          : 3        |*************                           |
+         8 -> 15         : 2        |********                                |
+        16 -> 31         : 1        |****                                    |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 7        |*******************************         |
+       128 -> 255        : 1        |****                                    |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |****                                    |
+      1024 -> 2047       : 1        |****                                    |
+      2048 -> 4095       : 9        |****************************************|
+      4096 -> 8191       : 1        |****                                    | 
+
+It looks like a tri-modal distribution. Some allocations are extremely cheap,
+and take 2-15 nanoseconds per byte. Other allocations are slower, and take
+64-127 nanoseconds per byte. And some allocations are slower still, and take
+multiple microseconds per byte.
+
+You could also group results by more than one field. For example, __kmalloc
+takes an additional flags parameter that describes how to allocate memory:
+
+# ./argdist -c -C 'p::__kmalloc(size_t size, gfp_t flags):gfp_t,size_t:flags,size'
+[03:42:29]
+p::__kmalloc(size_t size, gfp_t flags):gfp_t,size_t:flags,size
+        COUNT      EVENT
+        1          flags = 16, size = 152
+        2          flags = 131280, size = 8
+        7          flags = 131280, size = 16
+[03:42:30]
+p::__kmalloc(size_t size, gfp_t flags):gfp_t,size_t:flags,size
+        COUNT      EVENT
+        1          flags = 16, size = 152
+        6          flags = 131280, size = 8
+        19         flags = 131280, size = 16
+[03:42:31]
+p::__kmalloc(size_t size, gfp_t flags):gfp_t,size_t:flags,size
+        COUNT      EVENT
+        2          flags = 16, size = 152
+        10         flags = 131280, size = 8
+        31         flags = 131280, size = 16
+[03:42:32]
+p::__kmalloc(size_t size, gfp_t flags):gfp_t,size_t:flags,size
+        COUNT      EVENT
+        2          flags = 16, size = 152
+        14         flags = 131280, size = 8
+        43         flags = 131280, size = 16
+^C
+
+The flags value must be expanded by hand, but it's still helpful to eliminate
+certain kinds of allocations or visually group them together.
+
+argdist also has basic support for kernel tracepoints. It is sometimes more
+convenient to use tracepoints because they are documented and don't vary a lot
+between kernel versions. For example, let's trace the net:net_dev_start_xmit
+tracepoint and print out the protocol field from the tracepoint structure:
+
+# argdist -C 't:net:net_dev_start_xmit():u16:args->protocol'
+[13:01:49]
+t:net:net_dev_start_xmit():u16:args->protocol
+        COUNT      EVENT
+        8          args->protocol = 2048
+^C
+
+Note that to discover the format of the net:net_dev_start_xmit tracepoint, you
+use the tplist tool (tplist -v net:net_dev_start_xmit).
+
+
+Occasionally, it is useful to filter certain expressions by string. This is not
+trivially supported by BPF, but argdist provides a STRCMP helper you can use in
+filter expressions. For example, to get a histogram of latencies opening a
+specific file, run this:
+
+# argdist -c -H 'r:c:open(char *file):u64:$latency/1000:STRCMP("test.txt",$entry(file))'
+[02:16:38]
+[02:16:39]
+[02:16:40]
+     $latency/1000       : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 2        |****************************************|
+[02:16:41]
+     $latency/1000       : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |**********                              |
+        16 -> 31         : 4        |****************************************|
+[02:16:42]
+     $latency/1000       : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |********                                |
+        16 -> 31         : 5        |****************************************|
+[02:16:43]
+     $latency/1000       : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |********                                |
+        16 -> 31         : 5        |****************************************|
+
+
+Here's a final example that finds how many write() system calls are performed
+by each process on the system:
+
+# argdist -c -C 'p:c:write():int:$PID;write per process' -n 2
+[06:47:18]
+write per process
+        COUNT      EVENT
+        3          $PID = 8889
+        7          $PID = 7615
+        7          $PID = 2480
+[06:47:19]
+write per process
+        COUNT      EVENT
+        9          $PID = 8889
+        23         $PID = 7615
+        23         $PID = 2480
+
+
+USAGE message:
+
+# argdist -h
+usage: argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-d DURATION]
+               [-n COUNT] [-v] [-c] [-T TOP] [-H specifier] [-C specifier] [-I header]
+
+Trace a function and display a summary of its parameter values.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     id of the process to trace (optional)
+  -z STRING_SIZE, --string-size STRING_SIZE
+                        maximum string size to read from char* arguments
+  -i INTERVAL, --interval INTERVAL
+                        output interval, in seconds (default 1 second)
+  -d DURATION, --duration DURATION
+                        total duration of trace, in seconds
+  -n COUNT, --number COUNT
+                        number of outputs
+  -v, --verbose         print resulting BPF program code before executing
+  -c, --cumulative      do not clear histograms and freq counts at each interval
+  -T TOP, --top TOP     number of top results to show (not applicable to
+                        histograms)
+  -H specifier, --histogram specifier
+                        probe specifier to capture histogram of (see examples
+                        below)
+  -C specifier, --count specifier
+                        probe specifier to capture count of (see examples
+                        below)
+  -I header, --include header
+                        additional header files to include in the BPF program
+                        as either full path, or relative to current working directory,
+                        or relative to default kernel header search path
+
+Probe specifier syntax:
+        {p,r,t,u}:{[library],category}:function(signature)[:type[,type...]:expr[,expr...][:filter]][#label]
+Where:
+        p,r,t,u    -- probe at function entry, function exit, kernel tracepoint,
+                      or USDT probe
+                      in exit probes: can use $retval, $entry(param), $latency
+        library    -- the library that contains the function
+                      (leave empty for kernel functions)
+        category   -- the category of the kernel tracepoint (e.g. net, sched)
+        signature  -- the function's parameters, as in the C header
+        type       -- the type of the expression to collect (supports multiple)
+        expr       -- the expression to collect (supports multiple)
+        filter     -- the filter that is applied to collected values
+        label      -- the label for this probe in the resulting output
+
+EXAMPLES:
+
+argdist -H 'p::__kmalloc(u64 size):u64:size'
+        Print a histogram of allocation sizes passed to kmalloc
+
+argdist -p 1005 -C 'p:c:malloc(size_t size):size_t:size:size==16'
+        Print a frequency count of how many times process 1005 called malloc
+        with an allocation size of 16 bytes
+
+argdist -C 'r:c:gets():char*:$retval#snooped strings'
+        Snoop on all strings returned by gets()
+
+argdist -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size)#ns per byte'
+        Print a histogram of nanoseconds per byte from kmalloc allocations
+
+argdist -C 'p::__kmalloc(size_t size, gfp_t flags):size_t:size:flags&GFP_ATOMIC'
+        Print frequency count of kmalloc allocation sizes that have GFP_ATOMIC
+
+argdist -p 1005 -C 'p:c:write(int fd):int:fd' -T 5
+        Print frequency counts of how many times writes were issued to a
+        particular file descriptor number, in process 1005, but only show
+        the top 5 busiest fds
+
+argdist -p 1005 -H 'r:c:read()'
+        Print a histogram of error codes returned by read() in process 1005
+
+argdist -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+        Print frequency of reads by process where the latency was >0.1ms
+
+argdist -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000'
+        Print a histogram of read sizes for reads that took longer than 1ms
+
+argdist -H \
+        'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1'
+        Print a histogram of buffer sizes passed to write() across all
+        processes, where the file descriptor was 1 (STDOUT)
+
+argdist -C 'p:c:fork()#fork calls'
+        Count fork() calls in libc across all processes
+        Can also use funccount.py, which is easier and more flexible 
+
+argdist -H 't:block:block_rq_complete():u32:args->nr_sector'
+        Print histogram of number of sectors in completing block I/O requests
+
+argdist -C 't:irq:irq_handler_entry():int:args->irq'
+        Aggregate interrupts by interrupt request (IRQ)
+
+argdist -C 'u:pthread:pthread_start():u64:arg2' -p 1337
+        Print frequency of function addresses used as a pthread start function,
+        relying on the USDT pthread_start probe in process 1337
+
+argdist -H 'p:c:sleep(u32 seconds):u32:seconds' \
+        -H 'p:c:nanosleep(struct timespec *req):long:req->tv_nsec'
+        Print histograms of sleep() and nanosleep() parameter values
+
+argdist -p 2780 -z 120 \
+        -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1'
+        Spy on writes to STDOUT performed by process 2780, up to a string size
+        of 120 characters 
+
+argdist -I 'kernel/sched/sched.h' \
+        -C 'p::__account_cfs_rq_runtime(struct cfs_rq *cfs_rq):s64:cfs_rq->runtime_remaining'
+        Trace on the cfs scheduling runqueue remaining runtime. The struct cfs_rq is defined
+        in kernel/sched/sched.h which is in kernel source tree and not in kernel-devel
+        package.  So this command needs to run at the kernel source tree root directory
+        so that the added header file can be found by the compiler.
diff --git a/tools/bashreadline.py b/tools/bashreadline.py
new file mode 100755
index 0000000..89c37c3
--- /dev/null
+++ b/tools/bashreadline.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+#
+# bashreadline  Print entered bash commands from all running shells.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# This works by tracing the readline() function using a uretprobe (uprobes).
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 28-Jan-2016    Brendan Gregg   Created this.
+# 12-Feb-2016    Allan McAleavy migrated to BPF_PERF_OUTPUT
+
+from __future__ import print_function
+from bcc import BPF
+from time import strftime
+import ctypes as ct
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+struct str_t {
+    u64 pid;
+    char str[80];
+};
+
+BPF_PERF_OUTPUT(events);
+
+int printret(struct pt_regs *ctx) {
+    struct str_t data  = {};
+    u32 pid;
+    if (!PT_REGS_RC(ctx))
+        return 0;
+    pid = bpf_get_current_pid_tgid();
+    data.pid = pid;
+    bpf_probe_read(&data.str, sizeof(data.str), (void *)PT_REGS_RC(ctx));
+    events.perf_submit(ctx,&data,sizeof(data));
+
+    return 0;
+};
+"""
+STR_DATA = 80
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_ulonglong),
+        ("str", ct.c_char * STR_DATA)
+    ]
+
+b = BPF(text=bpf_text)
+b.attach_uretprobe(name="/bin/bash", sym="readline", fn_name="printret")
+
+# header
+print("%-9s %-6s %s" % ("TIME", "PID", "COMMAND"))
+
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%-9s %-6d %s" % (strftime("%H:%M:%S"), event.pid,
+                            event.str.decode('utf-8', 'replace')))
+
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/bashreadline_example.txt b/tools/bashreadline_example.txt
new file mode 100644
index 0000000..861a89b
--- /dev/null
+++ b/tools/bashreadline_example.txt
@@ -0,0 +1,21 @@
+Demonstrations of bashreadline, the Linux eBPF/bcc version.
+
+
+This prints bash commands from all running bash shells on the system. For
+example:
+
+# ./bashreadline
+TIME      PID    COMMAND
+05:28:25  21176  ls -l
+05:28:28  21176  date
+05:28:35  21176  echo hello world
+05:28:43  21176  foo this command failed
+05:28:45  21176  df -h
+05:29:04  3059   echo another shell
+05:29:13  21176  echo first shell again
+
+The entered command may fail. This is just showing what command lines were
+entered interactively for bash to process.
+
+It works by tracing the return of the readline() function using uprobes
+(specifically a uretprobe).
diff --git a/tools/biolatency.py b/tools/biolatency.py
new file mode 100755
index 0000000..3879af1
--- /dev/null
+++ b/tools/biolatency.py
@@ -0,0 +1,144 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# biolatency    Summarize block device I/O latency as a histogram.
+#       For Linux, uses BCC, eBPF.
+#
+# USAGE: biolatency [-h] [-T] [-Q] [-m] [-D] [interval] [count]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Sep-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+    ./biolatency            # summarize block I/O latency as a histogram
+    ./biolatency 1 10       # print 1 second summaries, 10 times
+    ./biolatency -mT 1      # 1s summaries, milliseconds, and timestamps
+    ./biolatency -Q         # include OS queued time in I/O time
+    ./biolatency -D         # show each disk device separately
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize block device I/O latency as a histogram",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-Q", "--queued", action="store_true",
+    help="include OS queued time in I/O time")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="millisecond histogram")
+parser.add_argument("-D", "--disks", action="store_true",
+    help="print a histogram per disk device")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+countdown = int(args.count)
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+typedef struct disk_key {
+    char disk[DISK_NAME_LEN];
+    u64 slot;
+} disk_key_t;
+BPF_HASH(start, struct request *);
+STORAGE
+
+// time block I/O
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&req, &ts);
+    return 0;
+}
+
+// output
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+    u64 *tsp, delta;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&req);
+    if (tsp == 0) {
+        return 0;   // missed issue
+    }
+    delta = bpf_ktime_get_ns() - *tsp;
+    FACTOR
+
+    // store as histogram
+    STORE
+
+    start.delete(&req);
+    return 0;
+}
+"""
+
+# code substitutions
+if args.milliseconds:
+    bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;')
+    label = "msecs"
+else:
+    bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;')
+    label = "usecs"
+if args.disks:
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, disk_key_t);')
+    bpf_text = bpf_text.replace('STORE',
+        'disk_key_t key = {.slot = bpf_log2l(delta)}; ' +
+        'void *__tmp = (void *)req->rq_disk->disk_name; ' +
+        'bpf_probe_read(&key.disk, sizeof(key.disk), __tmp); ' +
+        'dist.increment(key);')
+else:
+    bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
+    bpf_text = bpf_text.replace('STORE',
+        'dist.increment(bpf_log2l(delta));')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+if args.queued:
+    b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
+else:
+    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+    b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_account_io_completion",
+    fn_name="trace_req_completion")
+
+print("Tracing block device I/O... Hit Ctrl-C to end.")
+
+# output
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    dist.print_log2_hist(label, "disk")
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/biolatency_example.txt b/tools/biolatency_example.txt
new file mode 100644
index 0000000..5d39b7e
--- /dev/null
+++ b/tools/biolatency_example.txt
@@ -0,0 +1,224 @@
+Demonstrations of biolatency, the Linux eBPF/bcc version.
+
+
+biolatency traces block device I/O (disk I/O), and records the distribution
+of I/O latency (time), printing this as a histogram when Ctrl-C is hit.
+For example:
+
+# ./biolatency
+Tracing block device I/O... Hit Ctrl-C to end.
+^C
+     usecs           : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 0        |                                      |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 12       |********                              |
+     256 -> 511      : 15       |**********                            |
+     512 -> 1023     : 43       |*******************************       |
+    1024 -> 2047     : 52       |**************************************|
+    2048 -> 4095     : 47       |**********************************    |
+    4096 -> 8191     : 52       |**************************************|
+    8192 -> 16383    : 36       |**************************            |
+   16384 -> 32767    : 15       |**********                            |
+   32768 -> 65535    : 2        |*                                     |
+   65536 -> 131071   : 2        |*                                     |
+
+The latency of the disk I/O is measured from the issue to the device to its
+completion. A -Q option can be used to include time queued in the kernel.
+
+This example output shows a large mode of latency from about 128 microseconds
+to about 32767 microseconds (33 milliseconds). The bulk of the I/O was
+between 1 and 8 ms, which is the expected block device latency for
+rotational storage devices.
+
+The highest latency seen while tracing was between 65 and 131 milliseconds:
+the last row printed, for which there were 2 I/O.
+
+For efficiency, biolatency uses an in-kernel eBPF map to store timestamps
+with requests, and another in-kernel map to store the histogram (the "count")
+column, which is copied to user-space only when output is printed. These
+methods lower the performance overhead when tracing is performed.
+
+
+In the following example, the -m option is used to print a histogram using
+milliseconds as the units (which eliminates the first several rows), -T to
+print timestamps with the output, and to print 1 second summaries 5 times:
+
+# ./biolatency -mT 1 5
+Tracing block device I/O... Hit Ctrl-C to end.
+
+06:20:16
+     msecs           : count     distribution
+       0 -> 1        : 36       |**************************************|
+       2 -> 3        : 1        |*                                     |
+       4 -> 7        : 3        |***                                   |
+       8 -> 15       : 17       |*****************                     |
+      16 -> 31       : 33       |**********************************    |
+      32 -> 63       : 7        |*******                               |
+      64 -> 127      : 6        |******                                |
+
+06:20:17
+     msecs           : count     distribution
+       0 -> 1        : 96       |************************************  |
+       2 -> 3        : 25       |*********                             |
+       4 -> 7        : 29       |***********                           |
+       8 -> 15       : 62       |***********************               |
+      16 -> 31       : 100      |**************************************|
+      32 -> 63       : 62       |***********************               |
+      64 -> 127      : 18       |******                                |
+
+06:20:18
+     msecs           : count     distribution
+       0 -> 1        : 68       |*************************             |
+       2 -> 3        : 76       |****************************          |
+       4 -> 7        : 20       |*******                               |
+       8 -> 15       : 48       |*****************                     |
+      16 -> 31       : 103      |**************************************|
+      32 -> 63       : 49       |******************                    |
+      64 -> 127      : 17       |******                                |
+
+06:20:19
+     msecs           : count     distribution
+       0 -> 1        : 522      |*************************************+|
+       2 -> 3        : 225      |****************                      |
+       4 -> 7        : 38       |**                                    |
+       8 -> 15       : 8        |                                      |
+      16 -> 31       : 1        |                                      |
+
+06:20:20
+     msecs           : count     distribution
+       0 -> 1        : 436      |**************************************|
+       2 -> 3        : 106      |*********                             |
+       4 -> 7        : 34       |**                                    |
+       8 -> 15       : 19       |*                                     |
+      16 -> 31       : 1        |                                      |
+
+How the I/O latency distribution changes over time can be seen.
+
+
+
+The -Q option begins measuring I/O latency from when the request was first
+queued in the kernel, and includes queuing latency:
+
+# ./biolatency -Q
+Tracing block device I/O... Hit Ctrl-C to end.
+^C
+     usecs           : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 0        |                                      |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 0        |                                      |
+     128 -> 255      : 3        |*                                     |
+     256 -> 511      : 37       |**************                        |
+     512 -> 1023     : 30       |***********                           |
+    1024 -> 2047     : 18       |*******                               |
+    2048 -> 4095     : 22       |********                              |
+    4096 -> 8191     : 14       |*****                                 |
+    8192 -> 16383    : 48       |*******************                   |
+   16384 -> 32767    : 96       |**************************************|
+   32768 -> 65535    : 31       |************                          |
+   65536 -> 131071   : 26       |**********                            |
+  131072 -> 262143   : 12       |****                                  |
+
+This better reflects the latency suffered by the application (if it is
+synchronous I/O), whereas the default mode without kernel queueing better
+reflects the performance of the device.
+
+Note that the storage device (and storage device controller) usually have
+queues of their own, which are always included in the latency, with or
+without -Q.
+
+
+The -D option will print a histogram per disk. Eg:
+
+# ./biolatency -D
+Tracing block device I/O... Hit Ctrl-C to end.
+^C
+
+Bucket disk = 'xvdb'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 1        |                                        |
+       256 -> 511        : 33       |**********************                  |
+       512 -> 1023       : 36       |************************                |
+      1024 -> 2047       : 58       |****************************************|
+      2048 -> 4095       : 51       |***********************************     |
+      4096 -> 8191       : 21       |**************                          |
+      8192 -> 16383      : 2        |*                                       |
+
+Bucket disk = 'xvdc'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 1        |                                        |
+       256 -> 511        : 38       |***********************                 |
+       512 -> 1023       : 42       |*************************               |
+      1024 -> 2047       : 66       |****************************************|
+      2048 -> 4095       : 40       |************************                |
+      4096 -> 8191       : 14       |********                                |
+
+Bucket disk = 'xvda1'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 18       |**********                              |
+       512 -> 1023       : 67       |*************************************   |
+      1024 -> 2047       : 35       |*******************                     |
+      2048 -> 4095       : 71       |****************************************|
+      4096 -> 8191       : 65       |************************************    |
+      8192 -> 16383      : 65       |************************************    |
+     16384 -> 32767      : 20       |***********                             |
+     32768 -> 65535      : 7        |***                                     |
+
+This output shows that xvda1 has much higher latency, usually between 0.5 ms
+and 32 ms, whereas xvdc is usually between 0.2 ms and 4 ms.
+
+
+USAGE message:
+
+# ./biolatency -h
+usage: biolatency [-h] [-T] [-Q] [-m] [-D] [interval] [count]
+
+Summarize block device I/O latency as a histogram
+
+positional arguments:
+  interval            output interval, in seconds
+  count               number of outputs
+
+optional arguments:
+  -h, --help          show this help message and exit
+  -T, --timestamp     include timestamp on output
+  -Q, --queued        include OS queued time in I/O time
+  -m, --milliseconds  millisecond histogram
+  -D, --disks         print a histogram per disk device
+
+examples:
+    ./biolatency            # summarize block I/O latency as a histogram
+    ./biolatency 1 10       # print 1 second summaries, 10 times
+    ./biolatency -mT 1      # 1s summaries, milliseconds, and timestamps
+    ./biolatency -Q         # include OS queued time in I/O time
+    ./biolatency -D         # show each disk device separately
diff --git a/tools/biosnoop.lua b/tools/biosnoop.lua
new file mode 100755
index 0000000..705212e
--- /dev/null
+++ b/tools/biosnoop.lua
@@ -0,0 +1,193 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+--]]
+
+local program = [[
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+struct val_t {
+    u32 pid;
+    char name[TASK_COMM_LEN];
+};
+
+struct data_t {
+    u32 pid;
+    u64 rwflag;
+    u64 delta;
+    u64 sector;
+    u64 len;
+    u64 ts;
+    char disk_name[DISK_NAME_LEN];
+    char name[TASK_COMM_LEN];
+};
+
+BPF_HASH(start, struct request *);
+BPF_HASH(infobyreq, struct request *, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+// cache PID and comm by-req
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+    struct val_t val = {};
+
+    if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
+        val.pid = bpf_get_current_pid_tgid();
+        infobyreq.update(&req, &val);
+    }
+    return 0;
+}
+
+// time block I/O
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+    u64 ts;
+
+    ts = bpf_ktime_get_ns();
+    start.update(&req, &ts);
+
+    return 0;
+}
+
+// output
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+    u64 *tsp, delta;
+    u32 *pidp = 0;
+    struct val_t *valp;
+    struct data_t data ={};
+    u64 ts;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&req);
+    if (tsp == 0) {
+        // missed tracing issue
+        return 0;
+    }
+    ts = bpf_ktime_get_ns();
+    data.delta = ts - *tsp;
+    data.ts = ts / 1000;
+
+    valp = infobyreq.lookup(&req);
+    if (valp == 0) {
+        data.len = req->__data_len;
+        strcpy(data.name,"?");
+    } else {
+        data.pid = valp->pid;
+        data.len = req->__data_len;
+        data.sector = req->__sector;
+        bpf_probe_read(&data.name, sizeof(data.name), valp->name);
+        bpf_probe_read(&data.disk_name, sizeof(data.disk_name),
+                       req->rq_disk->disk_name);
+    }
+
+/*
+ * The following deals with a kernel version change (in mainline 4.7, although
+ * it may be backported to earlier kernels) with how block request write flags
+ * are tested. We handle both pre- and post-change versions here. Please avoid
+ * kernel version tests like this as much as possible: they inflate the code,
+ * test, and maintenance burden.
+ */
+#ifdef REQ_WRITE
+    data.rwflag = !!(req->cmd_flags & REQ_WRITE);
+#elif defined(REQ_OP_SHIFT)
+    data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
+#else
+    data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
+#endif
+
+    events.perf_submit(ctx,&data,sizeof(data));
+    start.delete(&req);
+    infobyreq.delete(&req);
+
+    return 0;
+}
+]]
+
+local ffi = require("ffi")
+
+return function(BPF, utils)
+  local bpf = BPF:new{text=program}
+
+  bpf:attach_kprobe{event="blk_account_io_start", fn_name="trace_pid_start"}
+  bpf:attach_kprobe{event="blk_start_request", fn_name="trace_req_start"}
+  bpf:attach_kprobe{event="blk_mq_start_request", fn_name="trace_req_start"}
+  bpf:attach_kprobe{event="blk_account_io_completion",
+      fn_name="trace_req_completion"}
+
+  print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % {"TIME(s)", "COMM", "PID",
+    "DISK", "T", "SECTOR", "BYTES", "LAT(ms)"})
+
+  local rwflg = ""
+  local start_ts = 0
+  local prev_ts = 0
+  local delta = 0
+
+  local function print_event(cpu, event)
+    local val = -1
+    local event_pid = event.pid
+    local event_delta = tonumber(event.delta)
+    local event_sector = tonumber(event.sector)
+    local event_len = tonumber(event.len)
+    local event_ts = tonumber(event.ts)
+    local event_disk_name = ffi.string(event.disk_name)
+    local event_name = ffi.string(event.name)
+
+    if event.rwflag == 1 then
+      rwflg = "W"
+    end
+
+    if event.rwflag == 0 then
+      rwflg = "R"
+    end
+
+    if not event_name:match("%?") then
+      val = event_sector
+    end
+
+    if start_ts == 0 then
+      prev_ts = start_ts
+    end
+
+    if start_ts == 1 then
+      delta = delta + (event_ts - prev_ts)
+    end
+
+    print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % {
+      delta / 1000000, event_name, event_pid, event_disk_name, rwflg, val,
+      event_len, event_delta / 1000000})
+
+    prev_ts = event_ts
+    start_ts = 1
+  end
+
+  local TASK_COMM_LEN = 16 -- linux/sched.h
+  local DISK_NAME_LEN = 32 -- linux/genhd.h
+
+  bpf:get_table("events"):open_perf_buffer(print_event, [[
+    struct {
+      uint32_t pid;
+      uint64_t rwflag;
+      uint64_t delta;
+      uint64_t sector;
+      uint64_t len;
+      uint64_t ts;
+      char disk_name[$];
+      char name[$];
+    }
+  ]], {DISK_NAME_LEN, TASK_COMM_LEN}, 64)
+  bpf:perf_buffer_poll_loop()
+end
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
new file mode 100755
index 0000000..2b1e77d
--- /dev/null
+++ b/tools/biosnoop.py
@@ -0,0 +1,189 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# biosnoop  Trace block device I/O and print details including issuing PID.
+#       For Linux, uses BCC, eBPF.
+#
+# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
+# request, as well as a starting timestamp for calculating I/O latency.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 16-Sep-2015   Brendan Gregg   Created this.
+# 11-Feb-2016   Allan McAleavy  updated for BPF_PERF_OUTPUT
+
+from __future__ import print_function
+from bcc import BPF
+import ctypes as ct
+import re
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+struct val_t {
+    u32 pid;
+    char name[TASK_COMM_LEN];
+};
+
+struct data_t {
+    u32 pid;
+    u64 rwflag;
+    u64 delta;
+    u64 sector;
+    u64 len;
+    u64 ts;
+    char disk_name[DISK_NAME_LEN];
+    char name[TASK_COMM_LEN];
+};
+
+BPF_HASH(start, struct request *);
+BPF_HASH(infobyreq, struct request *, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+// cache PID and comm by-req
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+    struct val_t val = {};
+
+    if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
+        val.pid = bpf_get_current_pid_tgid();
+        infobyreq.update(&req, &val);
+    }
+    return 0;
+}
+
+// time block I/O
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+    u64 ts;
+
+    ts = bpf_ktime_get_ns();
+    start.update(&req, &ts);
+
+    return 0;
+}
+
+// output
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+    u64 *tsp, delta;
+    u32 *pidp = 0;
+    struct val_t *valp;
+    struct data_t data = {};
+    u64 ts;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&req);
+    if (tsp == 0) {
+        // missed tracing issue
+        return 0;
+    }
+    ts = bpf_ktime_get_ns();
+    data.delta = ts - *tsp;
+    data.ts = ts / 1000;
+
+    valp = infobyreq.lookup(&req);
+    if (valp == 0) {
+        data.len = req->__data_len;
+        strcpy(data.name, "?");
+    } else {
+        data.pid = valp->pid;
+        data.len = req->__data_len;
+        data.sector = req->__sector;
+        bpf_probe_read(&data.name, sizeof(data.name), valp->name);
+        struct gendisk *rq_disk = req->rq_disk;
+        bpf_probe_read(&data.disk_name, sizeof(data.disk_name),
+                       rq_disk->disk_name);
+    }
+
+/*
+ * The following deals with a kernel version change (in mainline 4.7, although
+ * it may be backported to earlier kernels) with how block request write flags
+ * are tested. We handle both pre- and post-change versions here. Please avoid
+ * kernel version tests like this as much as possible: they inflate the code,
+ * test, and maintenance burden.
+ */
+#ifdef REQ_WRITE
+    data.rwflag = !!(req->cmd_flags & REQ_WRITE);
+#elif defined(REQ_OP_SHIFT)
+    data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
+#else
+    data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
+#endif
+
+    events.perf_submit(ctx, &data, sizeof(data));
+    start.delete(&req);
+    infobyreq.delete(&req);
+
+    return 0;
+}
+""", debug=0)
+b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_account_io_completion",
+    fn_name="trace_req_completion")
+
+TASK_COMM_LEN = 16  # linux/sched.h
+DISK_NAME_LEN = 32  # linux/genhd.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_ulonglong),
+        ("rwflag", ct.c_ulonglong),
+        ("delta", ct.c_ulonglong),
+        ("sector", ct.c_ulonglong),
+        ("len", ct.c_ulonglong),
+        ("ts", ct.c_ulonglong),
+        ("disk_name", ct.c_char * DISK_NAME_LEN),
+        ("name", ct.c_char * TASK_COMM_LEN)
+    ]
+
+# header
+print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID",
+    "DISK", "T", "SECTOR", "BYTES", "LAT(ms)"))
+
+rwflg = ""
+start_ts = 0
+prev_ts = 0
+delta = 0
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    val = -1
+    global start_ts
+    global prev_ts
+    global delta
+
+    if event.rwflag == 1:
+        rwflg = "W"
+
+    if event.rwflag == 0:
+        rwflg = "R"
+
+    if not re.match(b'\?', event.name):
+        val = event.sector
+
+    if start_ts == 0:
+        prev_ts = start_ts
+
+    if start_ts == 1:
+        delta = float(delta) + (event.ts - prev_ts)
+
+    print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % (
+        delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
+        event.disk_name.decode('utf-8', 'replace'), rwflg, val,
+        event.len, float(event.delta) / 1000000))
+
+    prev_ts = event.ts
+    start_ts = 1
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/biosnoop_example.txt b/tools/biosnoop_example.txt
new file mode 100644
index 0000000..b5cee7f
--- /dev/null
+++ b/tools/biosnoop_example.txt
@@ -0,0 +1,49 @@
+Demonstrations of biosnoop, the Linux eBPF/bcc version.
+
+
+biosnoop traces block device I/O (disk I/O), and prints a line of output
+per I/O. Example:
+
+# ./biosnoop
+TIME(s)        COMM           PID    DISK    T  SECTOR    BYTES   LAT(ms)
+0.000004001    supervise      1950   xvda1   W  13092560  4096       0.74
+0.000178002    supervise      1950   xvda1   W  13092432  4096       0.61
+0.001469001    supervise      1956   xvda1   W  13092440  4096       1.24
+0.001588002    supervise      1956   xvda1   W  13115128  4096       1.09
+1.022346001    supervise      1950   xvda1   W  13115272  4096       0.98
+1.022568002    supervise      1950   xvda1   W  13188496  4096       0.93
+1.023534000    supervise      1956   xvda1   W  13188520  4096       0.79
+1.023585003    supervise      1956   xvda1   W  13189512  4096       0.60
+2.003920000    xfsaild/md0    456    xvdc    W  62901512  8192       0.23
+2.003931001    xfsaild/md0    456    xvdb    W  62901513  512        0.25
+2.004034001    xfsaild/md0    456    xvdb    W  62901520  8192       0.35
+2.004042000    xfsaild/md0    456    xvdb    W  63542016  4096       0.36
+2.004204001    kworker/0:3    26040  xvdb    W  41950344  65536      0.34
+2.044352002    supervise      1950   xvda1   W  13192672  4096       0.65
+2.044574000    supervise      1950   xvda1   W  13189072  4096       0.58
+
+This includes the PID and comm (process name) that were on-CPU at the time of
+issue (which usually means the process responsible).
+
+The latency of the disk I/O, measured from the issue to the device to its
+completion, is included as the last column.
+
+This example output is from what should be an idle system; however, the
+following is visible in iostat:
+
+$ iostat -x 1
+[...]
+avg-cpu:  %user   %nice %system %iowait  %steal   %idle
+           0.12    0.00    0.12    0.00    0.00   99.75
+
+Device: rrqm/s  wrqm/s    r/s    w/s  rkB/s  wkB/s  await  svctm  %util
+xvda      0.00    0.00   0.00   4.00   0.00  16.00   0.00   0.00   0.00
+xvdb      0.00    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
+xvdc      0.00    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
+md0       0.00    0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00
+
+There are 4 write IOPS.
+
+The output of biosnoop identifies the reason: multiple supervise processes are
+issuing writes to the xvda1 disk. I can now drill down on supervise using other
+tools to understand its file system workload.
diff --git a/tools/biotop.py b/tools/biotop.py
new file mode 100755
index 0000000..c6e1ca2
--- /dev/null
+++ b/tools/biotop.py
@@ -0,0 +1,235 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# biotop  block device (disk) I/O by process.
+#         For Linux, uses BCC, eBPF.
+#
+# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+#
+# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
+# request, as well as a starting timestamp for calculating I/O latency.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 06-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+from subprocess import call
+
+# arguments
+examples = """examples:
+    ./biotop            # block device I/O top, 1 second refresh
+    ./biotop -C         # don't clear the screen
+    ./biotop 5          # 5 second summaries
+    ./biotop 5 10       # 5 second summaries, 10 times only
+"""
+parser = argparse.ArgumentParser(
+    description="Block device (disk) I/O by process",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("-r", "--maxrows", default=20,
+    help="maximum rows to print, default 20")
+parser.add_argument("interval", nargs="?", default=1,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)  # hidden: dump generated BPF C text and exit
+args = parser.parse_args()
+interval = int(args.interval)
+countdown = int(args.count)
+maxrows = int(args.maxrows)
+clear = not int(args.noclear)  # int() is redundant on a store_true bool
+
+# linux stats sources read each interval / at startup
+loadavg = "/proc/loadavg"
+diskstats = "/proc/diskstats"
+
+# SIGINT handler that only emits a newline so final output is not cut short
+# NOTE(review): never registered anywhere in this file -- confirm intended
+# use; the parameter name also shadows the imported `signal` module
+def signal_ignore(signal, frame):
+    print()
+
+# define BPF program (C source; compiled and loaded by BPF() below)
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+// for saving process info by request
+struct who_t {
+    u32 pid;
+    char name[TASK_COMM_LEN];
+};
+
+// the key for the output summary
+struct info_t {
+    u32 pid;
+    int rwflag;
+    int major;
+    int minor;
+    char name[TASK_COMM_LEN];
+};
+
+// the value of the output summary
+struct val_t {
+    u64 bytes;
+    u64 us;
+    u32 io;
+};
+
+BPF_HASH(start, struct request *);
+BPF_HASH(whobyreq, struct request *, struct who_t);
+BPF_HASH(counts, struct info_t, struct val_t);
+
+// cache PID and comm by-req
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+    struct who_t who = {};
+
+    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
+        who.pid = bpf_get_current_pid_tgid();
+        whobyreq.update(&req, &who);
+    }
+
+    return 0;
+}
+
+// time block I/O
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+    u64 ts;
+
+    ts = bpf_ktime_get_ns();
+    start.update(&req, &ts);
+
+    return 0;
+}
+
+// output
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+    u64 *tsp;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&req);
+    if (tsp == 0) {
+        return 0;    // missed tracing issue
+    }
+
+    struct who_t *whop;
+    struct val_t *valp, zero = {};
+    u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+
+    // setup info_t key
+    struct info_t info = {};
+    info.major = req->rq_disk->major;
+    info.minor = req->rq_disk->first_minor;
+/*
+ * The following deals with a kernel version change (in mainline 4.7, although
+ * it may be backported to earlier kernels) with how block request write flags
+ * are tested. We handle both pre- and post-change versions here. Please avoid
+ * kernel version tests like this as much as possible: they inflate the code,
+ * test, and maintenance burden.
+ */
+#ifdef REQ_WRITE
+    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
+#elif defined(REQ_OP_SHIFT)
+    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
+#else
+    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
+#endif
+
+    whop = whobyreq.lookup(&req);
+    if (whop == 0) {
+        // missed pid who, save stats as pid 0
+        valp = counts.lookup_or_init(&info, &zero);
+    } else {
+        info.pid = whop->pid;
+        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
+        valp = counts.lookup_or_init(&info, &zero);
+    }
+
+    // save stats
+    valp->us += delta_us;
+    valp->bytes += req->__data_len;
+    valp->io++;
+
+    start.delete(&req);
+    whobyreq.delete(&req);
+
+    return 0;
+}
+"""
+
+# --ebpf: print the generated C program and exit (debugging aid)
+if args.ebpf:
+    print(bpf_text)
+    exit()
+
+# compile/load the BPF program and attach the block-layer kprobes:
+# account_io_start caches PID/comm per request, (mq_)start_request
+# timestamps the issue, account_io_completion computes latency and
+# accumulates the per-process counters
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_account_io_completion",
+    fn_name="trace_req_completion")
+
+print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
+
+# cache disk major,minor -> diskname
+# first three /proc/diskstats fields are: major minor device-name; the
+# "major,minor" key format matches how rows are looked up below
+disklookup = {}
+with open(diskstats) as stats:
+    for line in stats:
+        a = line.split()
+        disklookup[a[0] + "," + a[1]] = a[2]
+
+# output: one summary per interval until count is exhausted or Ctrl-C
+exiting = 0
+while 1:
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        exiting = 1    # Ctrl-C: print one final summary, then detach
+
+    # header
+    if clear:
+        call("clear")
+    else:
+        print()
+    with open(loadavg) as stats:
+        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
+        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))
+
+    # by-PID output, sorted by total bytes, largest first
+    counts = b.get_table("counts")
+    line = 0
+    for k, v in reversed(sorted(counts.items(),
+                                key=lambda counts: counts[1].bytes)):
+
+        # lookup disk name from the major,minor cache built at startup
+        disk = str(k.major) + "," + str(k.minor)
+        if disk in disklookup:
+            diskname = disklookup[disk]
+        else:
+            diskname = "?"
+
+        # print line; v.us holds total latency in microseconds (see the
+        # delta_us accumulation in the BPF text), so avg is us/1000/io
+        avg_ms = (float(v.us) / 1000) / v.io
+        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
+            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
+            k.major, k.minor, diskname, v.io, v.bytes / 1024, avg_ms))
+
+        line += 1
+        if line >= maxrows:    # -r cap on rows per interval
+            break
+    counts.clear()    # zero the map so each interval stands alone
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        print("Detaching...")
+        exit()
diff --git a/tools/biotop_example.txt b/tools/biotop_example.txt
new file mode 100644
index 0000000..62e8f1c
--- /dev/null
+++ b/tools/biotop_example.txt
@@ -0,0 +1,187 @@
+Demonstrations of biotop, the Linux eBPF/bcc version.
+
+
+Short for block device I/O top, biotop summarizes which processes are
+performing disk I/O. It's top for disks. Sample output:
+
+# ./biotop
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+08:04:11 loadavg: 1.48 0.87 0.45 1/287 14547
+
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+14501  cksum            R 202 1   xvda1      361   28832   3.39
+6961   dd               R 202 1   xvda1     1628   13024   0.59
+13855  dd               R 202 1   xvda1     1627   13016   0.59
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   3.00
+1880   supervise        W 202 1   xvda1        2       8   6.71
+1873   supervise        W 202 1   xvda1        2       8   2.51
+1871   supervise        W 202 1   xvda1        2       8   1.57
+1876   supervise        W 202 1   xvda1        2       8   1.22
+1892   supervise        W 202 1   xvda1        2       8   0.62
+1878   supervise        W 202 1   xvda1        2       8   0.78
+1886   supervise        W 202 1   xvda1        2       8   1.30
+1894   supervise        W 202 1   xvda1        2       8   3.46
+1869   supervise        W 202 1   xvda1        2       8   0.73
+1888   supervise        W 202 1   xvda1        2       8   1.48
+
+By default the screen refreshes every 1 second, and shows the top 20 disk
+consumers, sorted on total Kbytes. The first line printed is the header,
+which has the time and then the contents of /proc/loadavg.
+
+For the interval summarized by the output above, the "cksum" command performed
+361 disk reads to the "xvda1" device, for a total of 28832 Kbytes, with an
+average I/O time of 3.39 ms. Two "dd" processes were also reading from the
+same disk, with a higher I/O rate and lower latency. While the average I/O
+size is not printed, it can be determined by dividing the Kbytes column by
+the I/O column.
+
+The columns through to Kbytes show the workload applied. The final column,
+AVGms, shows resulting performance. Other bcc tools can be used to get more
+details when needed: biolatency and biosnoop.
+
+Many years ago I created the original "iotop", and later regretted not calling
+it diskiotop or blockiotop, as "io" alone is ambiguous. This time it is biotop.
+
+
+The -C option can be used to prevent the screen from clearing (my preference).
+Here's using it with a 5 second interval:
+
+# ./biotop -C 5
+Tracing... Output every 5 secs. Hit Ctrl-C to end
+
+08:09:44 loadavg: 0.42 0.44 0.39 2/282 22115
+
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1     5993   47976   0.33
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.67
+1866   svscan           R 202 1   xvda1       33     132   1.24
+1880   supervise        W 202 1   xvda1       10      40   0.56
+1873   supervise        W 202 1   xvda1       10      40   0.79
+1871   supervise        W 202 1   xvda1       10      40   0.78
+1876   supervise        W 202 1   xvda1       10      40   0.68
+1892   supervise        W 202 1   xvda1       10      40   0.71
+1878   supervise        W 202 1   xvda1       10      40   0.65
+1886   supervise        W 202 1   xvda1       10      40   0.78
+1894   supervise        W 202 1   xvda1       10      40   0.80
+1869   supervise        W 202 1   xvda1       10      40   0.91
+1888   supervise        W 202 1   xvda1       10      40   0.63
+22069  bash             R 202 1   xvda1        1      16  19.94
+9251   kworker/u16:2    W 202 16  xvdb         2       8   0.13
+
+08:09:49 loadavg: 0.47 0.44 0.39 1/282 22231
+
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1    13450  107600   0.35
+22199  cksum            R 202 1   xvda1      941   45548   4.63
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.93
+24467  kworker/0:2      W 202 16  xvdb         1      64   0.28
+1880   supervise        W 202 1   xvda1       10      40   0.81
+1873   supervise        W 202 1   xvda1       10      40   0.81
+1871   supervise        W 202 1   xvda1       10      40   1.03
+1876   supervise        W 202 1   xvda1       10      40   0.76
+1892   supervise        W 202 1   xvda1       10      40   0.74
+1878   supervise        W 202 1   xvda1       10      40   0.94
+1886   supervise        W 202 1   xvda1       10      40   0.76
+1894   supervise        W 202 1   xvda1       10      40   0.69
+1869   supervise        W 202 1   xvda1       10      40   0.72
+1888   supervise        W 202 1   xvda1       10      40   1.70
+22199  bash             R 202 1   xvda1        2      20   0.35
+482    xfsaild/md0      W 202 16  xvdb         5      13   0.27
+482    xfsaild/md0      W 202 32  xvdc         2       8   0.33
+31331  pickup           R 202 1   xvda1        1       4   0.31
+
+08:09:54 loadavg: 0.51 0.45 0.39 2/282 22346
+
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1    14689  117512   0.32
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.33
+1880   supervise        W 202 1   xvda1       10      40   0.65
+1873   supervise        W 202 1   xvda1       10      40   1.08
+1871   supervise        W 202 1   xvda1       10      40   0.66
+1876   supervise        W 202 1   xvda1       10      40   0.79
+1892   supervise        W 202 1   xvda1       10      40   0.67
+1878   supervise        W 202 1   xvda1       10      40   0.66
+1886   supervise        W 202 1   xvda1       10      40   1.02
+1894   supervise        W 202 1   xvda1       10      40   0.88
+1869   supervise        W 202 1   xvda1       10      40   0.89
+1888   supervise        W 202 1   xvda1       10      40   1.25
+
+08:09:59 loadavg: 0.55 0.46 0.40 2/282 22461
+
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1    14442  115536   0.33
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   3.46
+1880   supervise        W 202 1   xvda1       10      40   0.87
+1873   supervise        W 202 1   xvda1       10      40   0.87
+1871   supervise        W 202 1   xvda1       10      40   0.78
+1876   supervise        W 202 1   xvda1       10      40   0.86
+1892   supervise        W 202 1   xvda1       10      40   0.89
+1878   supervise        W 202 1   xvda1       10      40   0.87
+1886   supervise        W 202 1   xvda1       10      40   0.86
+1894   supervise        W 202 1   xvda1       10      40   1.06
+1869   supervise        W 202 1   xvda1       10      40   1.12
+1888   supervise        W 202 1   xvda1       10      40   0.98
+
+08:10:04 loadavg: 0.59 0.47 0.40 3/282 22576
+
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1    14179  113432   0.34
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.39
+1880   supervise        W 202 1   xvda1       10      40   0.81
+1873   supervise        W 202 1   xvda1       10      40   1.02
+1871   supervise        W 202 1   xvda1       10      40   1.15
+1876   supervise        W 202 1   xvda1       10      40   1.10
+1892   supervise        W 202 1   xvda1       10      40   0.77
+1878   supervise        W 202 1   xvda1       10      40   0.72
+1886   supervise        W 202 1   xvda1       10      40   0.81
+1894   supervise        W 202 1   xvda1       10      40   0.86
+1869   supervise        W 202 1   xvda1       10      40   0.83
+1888   supervise        W 202 1   xvda1       10      40   0.79
+24467  kworker/0:2      R 202 32  xvdc         3      12   0.26
+1056   cron             R 202 1   xvda1        2       8   0.30
+24467  kworker/0:2      R 202 16  xvdb         1       4   0.23
+
+08:10:09 loadavg: 0.54 0.46 0.40 2/281 22668
+
+PID    COMM             D MAJ MIN DISK       I/O  Kbytes  AVGms
+22069  dd               R 202 1   xvda1      250    2000   0.34
+326    jbd2/xvda1-8     W 202 1   xvda1        3     168   2.40
+1880   supervise        W 202 1   xvda1        8      32   0.93
+1873   supervise        W 202 1   xvda1        8      32   0.76
+1871   supervise        W 202 1   xvda1        8      32   0.60
+1876   supervise        W 202 1   xvda1        8      32   0.61
+1892   supervise        W 202 1   xvda1        8      32   0.68
+1878   supervise        W 202 1   xvda1        8      32   0.90
+1886   supervise        W 202 1   xvda1        8      32   0.57
+1894   supervise        W 202 1   xvda1        8      32   0.97
+1869   supervise        W 202 1   xvda1        8      32   0.69
+1888   supervise        W 202 1   xvda1        8      32   0.67
+
+This shows another "dd" command reading from xvda1. On this system, various
+"supervise" processes do 8 disk writes per second, every second (they are
+creating and updating "status" files).
+
+
+USAGE message:
+
+# ./biotop.py -h
+usage: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+
+Block device (disk) I/O by process
+
+positional arguments:
+  interval              output interval, in seconds
+  count                 number of outputs
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -C, --noclear         don't clear the screen
+  -r MAXROWS, --maxrows MAXROWS
+                        maximum rows to print, default 20
+
+examples:
+    ./biotop            # block device I/O top, 1 second refresh
+    ./biotop -C         # don't clear the screen
+    ./biotop 5          # 5 second summaries
+    ./biotop 5 10       # 5 second summaries, 10 times only
diff --git a/tools/bitesize.py b/tools/bitesize.py
new file mode 100755
index 0000000..f70f091
--- /dev/null
+++ b/tools/bitesize.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+#
+# bitesize.py   Block I/O size histogram.
+#               For Linux, uses BCC, eBPF. See .c file.
+#
+# USAGE: bitesize
+#
+# Ctrl-C will print the partially gathered histogram then exit.
+#
+# Copyright (c) 2016 Allan McAleavy
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 05-Feb-2016 Allan McAleavy ran pep8 against file
+
+from bcc import BPF
+from time import sleep
+
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+struct proc_key_t {
+    char name[TASK_COMM_LEN];
+    u64 slot;
+};
+
+struct val_t {
+    char name[TASK_COMM_LEN];
+};
+
+BPF_HISTOGRAM(dist, struct proc_key_t);
+BPF_HASH(commbyreq, struct request *, struct val_t);
+
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+    struct val_t val = {};
+
+    if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
+        commbyreq.update(&req, &val);
+    }
+    return 0;
+}
+
+int do_count(struct pt_regs *ctx, struct request *req)
+{
+    struct val_t *valp;
+
+    valp = commbyreq.lookup(&req);
+    if (valp == 0) {
+       return 0;
+    }
+
+    if (req->__data_len > 0) {
+        struct proc_key_t key = {.slot = bpf_log2l(req->__data_len / 1024)};
+        bpf_probe_read(&key.name, sizeof(key.name),valp->name);
+        dist.increment(key);
+    }
+    return 0;
+}
+"""
+
+# load BPF program
+b = BPF(text=bpf_text)
+# trace_pid_start caches the comm per request at I/O account start;
+# do_count buckets the request size into the per-comm log2 histogram
+b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+b.attach_kprobe(event="blk_account_io_completion", fn_name="do_count")
+
+print("Tracing... Hit Ctrl-C to end.")
+
+# trace until Ctrl-C
+dist = b.get_table("dist")
+
+try:
+    sleep(99999999)
+except KeyboardInterrupt:
+    # print the per-process log2 histogram of I/O sizes gathered so far
+    dist.print_log2_hist("Kbytes", "Process Name",
+            section_print_fn=bytes.decode)
diff --git a/tools/bitesize_example.txt b/tools/bitesize_example.txt
new file mode 100644
index 0000000..4ea62e1
--- /dev/null
+++ b/tools/bitesize_example.txt
@@ -0,0 +1,89 @@
+Examples of bitesize.py, the Linux bcc/eBPF version.
+
+
+The aim of this tool is to show I/O distribution for requested block sizes, by process name.
+
+# ./bitesize.py
+Tracing... Hit Ctrl-C to end.
+^C
+
+Process Name = 'kworker/u128:1'
+     Kbytes              : count     distribution
+         0 -> 1          : 1        |********************                    |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 2        |****************************************|
+
+Process Name = 'bitesize.py'
+     Kbytes              : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 1        |****************************************|
+
+Process Name = 'dd'
+     Kbytes              : count     distribution
+         0 -> 1          : 3        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 6        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 1        |                                        |
+        32 -> 63         : 1        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 1        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 488      |****************************************|
+
+Process Name = 'jbd2/dm-1-8'
+     Kbytes              : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |****************************************|
+
+Process Name = 'cat'
+     Kbytes              : count     distribution
+         0 -> 1          : 1        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 1        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 1924     |****************************************|
+
+Process Name = 'ntpd'
+     Kbytes              : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 104      |****************************************|
+
+Process Name = 'vmtoolsd'
+     Kbytes              : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |****************************************|
+
+Process Name = 'bash'
+     Kbytes              : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 2        |****************************************|
+
+Process Name = 'jbd2/sdb-8'
+     Kbytes              : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |****************************************|
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 1        |****************************************|
+
+We can see from above that there was a dd command being run which generated 488 IOPS between 1MB and 2MB; we can also see the
+cat command generating 1924 IOPS between 256Kb and 512Kb.
diff --git a/tools/bpflist.py b/tools/bpflist.py
new file mode 100755
index 0000000..f73e945
--- /dev/null
+++ b/tools/bpflist.py
@@ -0,0 +1,82 @@
+#!/usr/bin/python
+#
+# bpflist   Display processes currently using BPF programs and maps,
+#           pinned BPF programs and maps, and enabled probes.
+#
+# USAGE: bpflist [-v]
+#
+# Idea by Brendan Gregg.
+#
+# Copyright 2017, Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0
+#
+# 09-Mar-2017   Sasha Goldshtein   Created this.
+
+from bcc import BPF, USDT
+import argparse
+import re
+import os
+import subprocess
+
+examples = """examples:
+    bpflist     # display all processes currently using BPF
+    bpflist -v  # also count kprobes/uprobes
+    bpflist -vv # display kprobes/uprobes and count them
+"""
+parser = argparse.ArgumentParser(
+    description="Display processes currently using BPF programs and maps",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+# action="count": -v counts probes, -vv additionally echoes each definition
+parser.add_argument("-v", "--verbosity", action="count", default=0,
+    help="count and display kprobes/uprobes as well")
+args = parser.parse_args()
+
+def comm_for_pid(pid):
+    """Best-effort process-name lookup via /proc/<pid>/comm; returns
+    "[unknown]" if the process is gone or unreadable.
+    NOTE(review): the bare except also swallows KeyboardInterrupt and
+    SystemExit; catching OSError/IOError would be sufficient here."""
+    try:
+        return open("/proc/%d/comm" % pid).read().strip()
+    except:
+        return "[unknown]"
+
+# (pid, type) -> number of BPF objects of that type owned by the pid;
+# filled by parse_probes() and find_bpf_fds() below
+counts = {}
+
+def parse_probes(typ):
+    """Scan /sys/kernel/debug/tracing/<typ>_events ("kprobe"/"uprobe" per
+    the call sites below) for bcc-created probes and tally them per
+    requesting pid into counts."""
+    if args.verbosity > 1:
+        print("open %ss:" % typ)
+    for probe in open("/sys/kernel/debug/tracing/%s_events" % typ):
+        # Probes opened by bcc have a specific pattern that includes the pid
+        # of the requesting process.
+        match = re.search('_bcc_(\\d+)\\s', probe)
+        if match:
+            pid = int(match.group(1))
+            counts[(pid, typ)] = counts.get((pid, typ), 0) + 1
+        if args.verbosity > 1:
+            print(probe.strip())    # -vv: echo the raw probe definition
+    if args.verbosity > 1:
+        print("")
+
+# probe counting is opt-in via -v / -vv
+if args.verbosity > 0:
+    parse_probes("kprobe")
+    parse_probes("uprobe")
+
+def find_bpf_fds(pid):
+    """Tally BPF fds held by one process: read each symlink target under
+    /proc/<pid>/fd and match "bpf-<type>" (e.g. prog, map, per the report
+    output). May raise OSError if the process exits mid-scan; the caller
+    handles that."""
+    root = '/proc/%d/fd' % pid
+    for fd in os.listdir(root):
+        try:
+            link = os.readlink(os.path.join(root, fd))
+        except OSError:
+            continue    # fd closed between listdir and readlink
+        match = re.match('.*bpf-(\\w+)', link)
+        if match:
+            tup = (pid, match.group(1))
+            counts[tup] = counts.get(tup, 0) + 1
+
+# scan every numeric /proc entry (i.e. every live process) for BPF fds
+for pdir in os.listdir('/proc'):
+    if re.match('\\d+', pdir):
+        try:
+            find_bpf_fds(int(pdir))
+        except OSError:
+            continue    # process exited while we were scanning it
+# report: one row per (pid, object type), sorted by pid
+print("%-6s %-16s %-8s %s" % ("PID", "COMM", "TYPE", "COUNT"))
+for (pid, typ), count in sorted(counts.items(), key=lambda t: t[0][0]):
+    comm = comm_for_pid(pid)
+    print("%-6d %-16s %-8s %-4d" % (pid, comm, typ, count))
diff --git a/tools/bpflist_example.txt b/tools/bpflist_example.txt
new file mode 100644
index 0000000..bc44d1f
--- /dev/null
+++ b/tools/bpflist_example.txt
@@ -0,0 +1,66 @@
+Demonstrations of bpflist.
+
+
+bpflist displays information on running BPF programs and optionally also
+prints open kprobes and uprobes. It is used to understand which BPF programs
+are currently running on the system. For example:
+
+# bpflist
+PID    COMM             TYPE     COUNT
+4058   fileslower       prog     4   
+4058   fileslower       map      2   
+4106   bashreadline     map      1   
+4106   bashreadline     prog     1   
+
+From the output above, the fileslower and bashreadline tools are running.
+fileslower has installed 4 BPF programs (functions) and has opened 2 BPF maps
+(such as hashes, histograms, stack trace tables, and so on).
+
+In verbose mode, bpflist also counts the number of kprobes and uprobes opened
+by the process. This information is obtained heuristically: bcc-based tools
+include the process id in the name of the probe. For example:
+
+# bpflist -v
+PID    COMM             TYPE     COUNT
+4058   fileslower       prog     4   
+4058   fileslower       kprobe   4   
+4058   fileslower       map      2   
+4106   bashreadline     uprobe   1   
+4106   bashreadline     prog     1   
+4106   bashreadline     map      1   
+
+In double-verbose mode, the probe definitions are also displayed:
+
+# bpflist -vv
+open kprobes:
+p:kprobes/p___vfs_read_bcc_4058 __vfs_read
+r:kprobes/r___vfs_read_bcc_4058 __vfs_read
+p:kprobes/p___vfs_write_bcc_4058 __vfs_write
+r:kprobes/r___vfs_write_bcc_4058 __vfs_write
+
+open uprobes:
+r:uprobes/r__bin_bash_0xa4dd0_bcc_4106 /bin/bash:0x00000000000a4dd0
+
+PID    COMM             TYPE     COUNT
+4058   fileslower       prog     4   
+4058   fileslower       kprobe   4   
+4058   fileslower       map      2   
+4106   bashreadline     uprobe   1   
+4106   bashreadline     prog     1   
+4106   bashreadline     map      1   
+
+
+USAGE:
+# bpflist -h
+usage: bpflist.py [-h] [-v]
+
+Display processes currently using BPF programs and maps
+
+optional arguments:
+  -h, --help       show this help message and exit
+  -v, --verbosity  count and display kprobes/uprobes as well
+
+examples:
+    bpflist     # display all processes currently using BPF
+    bpflist -v  # also count kprobes/uprobes
+    bpflist -vv # display kprobes/uprobes and count them
diff --git a/tools/btrfsdist.py b/tools/btrfsdist.py
new file mode 100755
index 0000000..4659ab4
--- /dev/null
+++ b/tools/btrfsdist.py
@@ -0,0 +1,229 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# btrfsdist  Summarize btrfs operation latency.
+#            For Linux, uses BCC, eBPF.
+#
+# USAGE: btrfsdist [-h] [-T] [-m] [-p PID] [interval] [count]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# symbols
+# Kernel symbol table; scanned later to resolve the address of
+# btrfs_file_operations for filtering generic VFS entry points.
+kallsyms = "/proc/kallsyms"
+
+# arguments
+examples = """examples:
+    ./btrfsdist            # show operation latency as a histogram
+    ./btrfsdist -p 181     # trace PID 181 only
+    ./btrfsdist 1 10       # print 1 second summaries, 10 times
+    ./btrfsdist -m 5       # 5s summaries, milliseconds
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize btrfs operation latency",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--notimestamp", action="store_true",
+    help="don't include timestamp on interval output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="output in milliseconds")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?",
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+pid = args.pid
+countdown = int(args.count)
+# factor is the divisor applied to nanosecond deltas in the BPF program
+# (substituted for FACTOR below): 1000000 -> msecs, 1000 -> usecs
+if args.milliseconds:
+    factor = 1000000
+    label = "msecs"
+else:
+    factor = 1000
+    label = "usecs"
+# an interval of 0 would make the output loop spin; reject it up front
+if args.interval and int(args.interval) == 0:
+    print("ERROR: interval 0. Exiting.")
+    exit()
+debug = 0
+
+# define BPF program
+# C source compiled by BCC at load time. FILTER_PID, FACTOR, and
+# BTRFS_FILE_OPERATIONS are placeholder tokens textually replaced below
+# before the program is loaded (they are not valid C as written).
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+#define OP_NAME_LEN 8
+typedef struct dist_key {
+    char op[OP_NAME_LEN];
+    u64 slot;
+} dist_key_t;
+BPF_HASH(start, u32);
+BPF_HISTOGRAM(dist, dist_key_t);
+
+// time operation
+int trace_entry(struct pt_regs *ctx)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    if (FILTER_PID)
+        return 0;
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+// The current btrfs (Linux 4.5) uses generic_file_read_iter() instead of it's
+// own read function. So we need to trace that and then filter on btrfs, which
+// I do by checking file->f_op.
+int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    if (FILTER_PID)
+        return 0;
+
+    // btrfs filter on file->f_op == btrfs_file_operations
+    struct file *fp = iocb->ki_filp;
+    if ((u64)fp->f_op != BTRFS_FILE_OPERATIONS)
+        return 0;
+
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+// The current btrfs (Linux 4.5) uses generic_file_open(), instead of it's own
+// function. Same as with reads. Trace the generic path and filter:
+int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
+    struct file *file)
+{
+    u32 pid;
+    pid = bpf_get_current_pid_tgid();
+    if (FILTER_PID)
+        return 0;
+
+    // btrfs filter on file->f_op == btrfs_file_operations
+    if ((u64)file->f_op != BTRFS_FILE_OPERATIONS)
+        return 0;
+
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+static int trace_return(struct pt_regs *ctx, const char *op)
+{
+    u64 *tsp;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed start or filtered
+    }
+    u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
+
+    // store as histogram
+    dist_key_t key = {.slot = bpf_log2l(delta)};
+    __builtin_memcpy(&key.op, op, sizeof(key.op));
+    dist.increment(key);
+
+    start.delete(&pid);
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    char *op = "read";
+    return trace_return(ctx, op);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    char *op = "write";
+    return trace_return(ctx, op);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    char *op = "open";
+    return trace_return(ctx, op);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+    char *op = "fsync";
+    return trace_return(ctx, op);
+}
+"""
+
+# code replacements
+# Scan /proc/kallsyms for the address of the kernel's btrfs_file_operations
+# struct; the BPF program compares file->f_op against it so that the generic
+# VFS entry points are filtered down to btrfs-backed files only.
+with open(kallsyms) as syms:
+    ops = ''
+    for line in syms:
+        a = line.rstrip().split()
+        (addr, name) = (a[0], a[2])
+        # drop anything after a tab (e.g. a module annotation)
+        name = name.split("\t")[0]
+        if name == "btrfs_file_operations":
+            ops = "0x" + addr
+            break
+    if ops == '':
+        print("ERROR: no btrfs_file_operations in /proc/kallsyms. Exiting.")
+        print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.")
+        exit()
+    bpf_text = bpf_text.replace('BTRFS_FILE_OPERATIONS', ops)
+# FACTOR converts nanosecond deltas to the unit chosen by -m (see above)
+bpf_text = bpf_text.replace('FACTOR', str(factor))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    # no PID filter requested: the condition compiles to constant false
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+# Common file functions. See earlier comment about generic_file_read_iter().
+b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry")
+b.attach_kprobe(event="btrfs_file_write_iter", fn_name="trace_entry")
+b.attach_kprobe(event="generic_file_open", fn_name="trace_open_entry")
+b.attach_kprobe(event="btrfs_sync_file", fn_name="trace_entry")
+b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return")
+b.attach_kretprobe(event="btrfs_file_write_iter", fn_name="trace_write_return")
+b.attach_kretprobe(event="generic_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="btrfs_sync_file", fn_name="trace_fsync_return")
+
+print("Tracing btrfs operation latency... Hit Ctrl-C to end.")
+
+# output
+exiting = 0
+dist = b.get_table("dist")
+while (1):
+    try:
+        if args.interval:
+            sleep(int(args.interval))
+        else:
+            # no interval given: sleep until interrupted by Ctrl-C
+            sleep(99999999)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.interval and (not args.notimestamp):
+        print(strftime("%H:%M:%S:"))
+
+    # print the per-operation latency histograms, then reset for the
+    # next interval
+    dist.print_log2_hist(label, "operation")
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/btrfsdist_example.txt b/tools/btrfsdist_example.txt
new file mode 100644
index 0000000..4cadc76
--- /dev/null
+++ b/tools/btrfsdist_example.txt
@@ -0,0 +1,179 @@
+Demonstrations of btrfsdist, the Linux eBPF/bcc version.
+
+
+btrfsdist traces btrfs reads, writes, opens, and fsyncs, and summarizes their
+latency as a power-of-2 histogram. For example:
+
+# ./btrfsdist 
+Tracing btrfs operation latency... Hit Ctrl-C to end.
+^C
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 15       |                                        |
+         2 -> 3          : 1308     |*******                                 |
+         4 -> 7          : 198      |*                                       |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 11       |                                        |
+        32 -> 63         : 361      |*                                       |
+        64 -> 127        : 55       |                                        |
+       128 -> 255        : 104      |                                        |
+       256 -> 511        : 7312     |****************************************|
+       512 -> 1023       : 387      |**                                      |
+      1024 -> 2047       : 10       |                                        |
+      2048 -> 4095       : 4        |                                        |
+
+operation = 'write'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 4        |****************************************|
+
+operation = 'open'
+     usecs               : count     distribution
+         0 -> 1          : 1        |**********                              |
+         2 -> 3          : 4        |****************************************|
+
+This output shows a bi-modal distribution for read latency, with a faster
+mode of 1,308 reads that took between 2 and 3 microseconds, and a slower
+mode of 7,312 reads that took between 256 and 511 microseconds. It's
+likely that the faster mode was a hit from the in-memory file system cache,
+and the slower mode is a read from a storage device (disk).
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+block device I/O (disk I/O), file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from the file system than measuring this down at the
+block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+An optional interval and a count can be provided, as well as -m to show the
+distributions in milliseconds. For example, two second summaries, five times:
+
+# ./btrfsdist 2 5
+Tracing btrfs operation latency... Hit Ctrl-C to end.
+
+03:40:49:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 15       |                                        |
+         2 -> 3          : 833      |********                                |
+         4 -> 7          : 127      |*                                       |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 8        |                                        |
+        32 -> 63         : 907      |********                                |
+        64 -> 127        : 91       |                                        |
+       128 -> 255        : 246      |**                                      |
+       256 -> 511        : 4164     |****************************************|
+       512 -> 1023       : 193      |*                                       |
+      1024 -> 2047       : 4        |                                        |
+      2048 -> 4095       : 6        |                                        |
+      4096 -> 8191       : 2        |                                        |
+
+03:40:51:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 25       |                                        |
+         2 -> 3          : 1491     |***************                         |
+         4 -> 7          : 218      |**                                      |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 16       |                                        |
+        32 -> 63         : 1527     |***************                         |
+        64 -> 127        : 319      |***                                     |
+       128 -> 255        : 429      |****                                    |
+       256 -> 511        : 3841     |****************************************|
+       512 -> 1023       : 232      |**                                      |
+      1024 -> 2047       : 3        |                                        |
+      2048 -> 4095       : 6        |                                        |
+      4096 -> 8191       : 1        |                                        |
+      8192 -> 16383      : 1        |                                        |
+
+03:40:53:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 27       |                                        |
+         2 -> 3          : 2999     |*********************************       |
+         4 -> 7          : 407      |****                                    |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 46       |                                        |
+        32 -> 63         : 3538     |****************************************|
+        64 -> 127        : 595      |******                                  |
+       128 -> 255        : 621      |*******                                 |
+       256 -> 511        : 3532     |*************************************** |
+       512 -> 1023       : 212      |**                                      |
+      1024 -> 2047       : 1        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 0        |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 1        |                                        |
+
+03:40:55:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 221      |                                        |
+         2 -> 3          : 12580    |****************************************|
+         4 -> 7          : 1366     |****                                    |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 289      |                                        |
+        32 -> 63         : 10782    |**********************************      |
+        64 -> 127        : 1232     |***                                     |
+       128 -> 255        : 807      |**                                      |
+       256 -> 511        : 2299     |*******                                 |
+       512 -> 1023       : 135      |                                        |
+      1024 -> 2047       : 5        |                                        |
+      2048 -> 4095       : 2        |                                        |
+
+03:40:57:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 73951    |*************************               |
+         2 -> 3          : 117639   |****************************************|
+         4 -> 7          : 7943     |**                                      |
+         8 -> 15         : 1841     |                                        |
+        16 -> 31         : 1143     |                                        |
+        32 -> 63         : 5006     |*                                       |
+        64 -> 127        : 483      |                                        |
+       128 -> 255        : 242      |                                        |
+       256 -> 511        : 253      |                                        |
+       512 -> 1023       : 84       |                                        |
+      1024 -> 2047       : 23       |                                        |
+
+This shows a read workload that begins bimodal, and eventually the second
+mode disappears. The reason for this is that the workload became fully
+cached during tracing. Note that the rate also increased, with over 200k
+reads for the final two-second sample.
+
+
+USAGE message:
+
+# ./btrfsdist -h
+usage: btrfsdist [-h] [-T] [-m] [-p PID] [interval] [count]
+
+Summarize btrfs operation latency
+
+positional arguments:
+  interval            output interval, in seconds
+  count               number of outputs
+
+optional arguments:
+  -h, --help          show this help message and exit
+  -T, --notimestamp   don't include timestamp on interval output
+  -m, --milliseconds  output in milliseconds
+  -p PID, --pid PID   trace this PID only
+
+examples:
+    ./btrfsdist            # show operation latency as a histogram
+    ./btrfsdist -p 181     # trace PID 181 only
+    ./btrfsdist 1 10       # print 1 second summaries, 10 times
+    ./btrfsdist -m 5       # 5s summaries, milliseconds
diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py
new file mode 100755
index 0000000..d48e04c
--- /dev/null
+++ b/tools/btrfsslower.py
@@ -0,0 +1,355 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# btrfsslower  Trace slow btrfs operations.
+#              For Linux, uses BCC, eBPF.
+#
+# USAGE: btrfsslower [-h] [-j] [-p PID] [min_ms]
+#
+# This script traces common btrfs file operations: reads, writes, opens, and
+# syncs. It measures the time spent in these operations, and prints details
+# for each that exceeded a threshold.
+#
+# WARNING: This adds low-overhead instrumentation to these btrfs operations,
+# including reads and writes from the file system cache. Such reads and writes
+# can be very frequent (depending on the workload; eg, 1M/sec), at which
+# point the overhead of this tool (even if it prints no "slower" events) can
+# begin to become significant.
+#
+# By default, a minimum millisecond threshold of 10 is used.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Feb-2016   Brendan Gregg   Created this.
+# 16-Oct-2016   Dina Goldshtein -p to filter by process ID.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+import ctypes as ct
+
+# symbols
+# Kernel symbol table; scanned later to resolve btrfs_file_operations
+kallsyms = "/proc/kallsyms"
+
+# arguments
+examples = """examples:
+    ./btrfsslower             # trace operations slower than 10 ms (default)
+    ./btrfsslower 1           # trace operations slower than 1 ms
+    ./btrfsslower -j 1        # ... 1 ms, parsable output (csv)
+    ./btrfsslower 0           # trace all operations (warning: verbose)
+    ./btrfsslower -p 185      # trace PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Trace common btrfs file operations slower than a threshold",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-j", "--csv", action="store_true",
+    help="just print fields: comma-separated values")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("min_ms", nargs="?", default='10',
+    help="minimum I/O duration to trace, in ms (default 10)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+# latency threshold in ms; 0 means trace every operation
+min_ms = int(args.min_ms)
+pid = args.pid
+csv = args.csv
+debug = 0
+
+# define BPF program
+# C source compiled by BCC at load time. FILTER_PID, FILTER_US, and
+# BTRFS_FILE_OPERATIONS are placeholder tokens textually replaced below
+# before the program is loaded (they are not valid C as written).
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/dcache.h>
+
+// XXX: switch these to char's when supported
+#define TRACE_READ      0
+#define TRACE_WRITE     1
+#define TRACE_OPEN      2
+#define TRACE_FSYNC     3
+
+struct val_t {
+    u64 ts;
+    u64 offset;
+    struct file *fp;
+};
+
+struct data_t {
+    // XXX: switch some to u32's when supported
+    u64 ts_us;
+    u64 type;
+    u64 size;
+    u64 offset;
+    u64 delta_us;
+    u64 pid;
+    char task[TASK_COMM_LEN];
+    char file[DNAME_INLINE_LEN];
+};
+
+BPF_HASH(entryinfo, u64, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+//
+// Store timestamp and size on entry
+//
+
+// The current btrfs (Linux 4.5) uses generic_file_read_iter() instead of it's
+// own read function. So we need to trace that and then filter on btrfs, which
+// I do by checking file->f_op.
+int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
+{
+    u64 id =  bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // btrfs filter on file->f_op == btrfs_file_operations
+    struct file *fp = iocb->ki_filp;
+    if ((u64)fp->f_op != BTRFS_FILE_OPERATIONS)
+        return 0;
+
+    // store filep and timestamp by pid
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = fp;
+    val.offset = iocb->ki_pos;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// btrfs_file_write_iter():
+int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = iocb->ki_filp;
+    val.offset = iocb->ki_pos;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// The current btrfs (Linux 4.5) uses generic_file_open(), instead of it's own
+// function. Same as with reads. Trace the generic path and filter:
+int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
+    struct file *file)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // btrfs filter on file->f_op == btrfs_file_operations
+    if ((u64)file->f_op != BTRFS_FILE_OPERATIONS)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = file;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// btrfs_sync_file():
+int trace_fsync_entry(struct pt_regs *ctx, struct file *file)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = file;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+//
+// Output
+//
+
+static int trace_return(struct pt_regs *ctx, int type)
+{
+    struct val_t *valp;
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    valp = entryinfo.lookup(&id);
+    if (valp == 0) {
+        // missed tracing issue or filtered
+        return 0;
+    }
+
+    // calculate delta
+    u64 ts = bpf_ktime_get_ns();
+    u64 delta_us = (ts - valp->ts) / 1000;
+    entryinfo.delete(&id);
+    if (FILTER_US)
+        return 0;
+
+    // populate output struct
+    u32 size = PT_REGS_RC(ctx);
+    struct data_t data = {.type = type, .size = size, .delta_us = delta_us,
+        .pid = pid};
+    data.ts_us = ts / 1000;
+    data.offset = valp->offset;
+    bpf_get_current_comm(&data.task, sizeof(data.task));
+
+    // workaround (rewriter should handle file to d_name in one step):
+    struct dentry *de = NULL;
+    struct qstr qs = {};
+    de = valp->fp->f_path.dentry;
+    qs = de->d_name;
+    if (qs.len == 0)
+        return 0;
+    bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name);
+
+    // output
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_READ);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_WRITE);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_OPEN);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_FSYNC);
+}
+
+"""
+
+# code replacements
+# Resolve the kernel address of btrfs_file_operations from /proc/kallsyms;
+# the BPF program compares file->f_op against it so the generic VFS hooks
+# only record btrfs-backed files.
+with open(kallsyms) as syms:
+    ops = ''
+    for line in syms:
+        a = line.rstrip().split()
+        (addr, name) = (a[0], a[2])
+        # drop anything after a tab (e.g. a module annotation)
+        name = name.split("\t")[0]
+        if name == "btrfs_file_operations":
+            ops = "0x" + addr
+            break
+    if ops == '':
+        print("ERROR: no btrfs_file_operations in /proc/kallsyms. Exiting.")
+        print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.")
+        exit()
+    bpf_text = bpf_text.replace('BTRFS_FILE_OPERATIONS', ops)
+if min_ms == 0:
+    # threshold of 0: emit every traced operation
+    bpf_text = bpf_text.replace('FILTER_US', '0')
+else:
+    # discard events at or below the threshold (min_ms converted to usecs)
+    bpf_text = bpf_text.replace('FILTER_US',
+        'delta_us <= %s' % str(min_ms * 1000))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    # no PID filter requested: the condition compiles to constant false
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# kernel->user event data: struct data_t
+DNAME_INLINE_LEN = 32   # linux/dcache.h
+TASK_COMM_LEN = 16      # linux/sched.h
+# ctypes mirror of the BPF program's struct data_t; field order and widths
+# must match the kernel-side definition exactly for ct.cast to decode events.
+class Data(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("type", ct.c_ulonglong),
+        ("size", ct.c_ulonglong),
+        ("offset", ct.c_ulonglong),
+        ("delta_us", ct.c_ulonglong),
+        ("pid", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN),
+        ("file", ct.c_char * DNAME_INLINE_LEN)
+    ]
+
+# process event
+# Perf-buffer callback: decode one data_t record and print it in CSV (-j)
+# or fixed-width human-readable form.
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    # map the numeric TRACE_* type to a one-letter operation code
+    type = 'R'
+    if event.type == 1:
+        type = 'W'
+    elif event.type == 2:
+        type = 'O'
+    elif event.type == 3:
+        type = 'S'
+
+    if (csv):
+        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
+            event.ts_us, event.task.decode('utf-8', 'replace'), event.pid,
+            type, event.size, event.offset, event.delta_us,
+            event.file.decode('utf-8', 'replace')))
+        return
+    # human-readable: offset shown in KB, latency in ms
+    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"),
+        event.task.decode('utf-8', 'replace'), event.pid, type, event.size,
+        event.offset / 1024, float(event.delta_us) / 1000,
+        event.file.decode('utf-8', 'replace')))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# Common file functions. See earlier comment about generic_*().
+b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry")
+b.attach_kprobe(event="btrfs_file_write_iter", fn_name="trace_write_entry")
+b.attach_kprobe(event="generic_file_open", fn_name="trace_open_entry")
+b.attach_kprobe(event="btrfs_sync_file", fn_name="trace_fsync_entry")
+b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return")
+b.attach_kretprobe(event="btrfs_file_write_iter", fn_name="trace_write_return")
+b.attach_kretprobe(event="generic_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="btrfs_sync_file", fn_name="trace_fsync_return")
+
+# header
+if (csv):
+    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
+else:
+    if min_ms == 0:
+        print("Tracing btrfs operations")
+    else:
+        print("Tracing btrfs operations slower than %d ms" % min_ms)
+    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T",
+        "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
+
+# read events
+# Poll the perf buffer forever; 64 pages lowers the chance of lost events
+# under bursty workloads.
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/btrfsslower_example.txt b/tools/btrfsslower_example.txt
new file mode 100644
index 0000000..ccb9369
--- /dev/null
+++ b/tools/btrfsslower_example.txt
@@ -0,0 +1,146 @@
+Demonstrations of btrfsslower, the Linux eBPF/bcc version.
+
+
+btrfsslower shows btrfs reads, writes, opens, and fsyncs, slower than a
+threshold. For example:
+
+# ./btrfsslower
+Tracing btrfs operations slower than 10 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+01:22:03 randread.pl    13602  R 8192    391384     10.40 data1
+01:22:03 randread.pl    13602  R 8192    92632      10.41 data1
+01:22:06 randread.pl    13602  R 8192    199800     17.33 data1
+01:22:06 randread.pl    13602  R 8192    415160     17.21 data1
+01:22:07 randread.pl    13602  R 8192    729984     11.93 data1
+01:22:09 randread.pl    13602  R 8192    342784     11.90 data1
+[...]
+
+This shows several reads from a "randread.pl" program, each 8 Kbytes in size,
+and from a "data1" file. These all had over 10 ms latency.
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+block device I/O (disk I/O), file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from the file system than measuring this down at the
+block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+The threshold can be provided as an argument. Eg, I/O slower than 1 ms:
+
+# ./btrfsslower 1
+Tracing btrfs operations slower than 1 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+03:26:54 randread.pl    30578  R 8192    214864      1.87 data1
+03:26:54 randread.pl    30578  R 8192    267600      1.48 data1
+03:26:54 randread.pl    30578  R 8192    704200      1.30 data1
+03:26:54 randread.pl    30578  R 8192    492352      3.09 data1
+03:26:55 randread.pl    30578  R 8192    319448      1.34 data1
+03:26:55 randread.pl    30578  R 8192    676032      1.88 data1
+03:26:55 randread.pl    30578  R 8192    646712      2.24 data1
+03:26:55 randread.pl    30578  R 8192    124376      1.02 data1
+03:26:55 randread.pl    30578  R 8192    223064      2.64 data1
+03:26:55 randread.pl    30578  R 8192    521280      1.55 data1
+03:26:55 randread.pl    30578  R 8192    272992      2.48 data1
+03:26:55 randread.pl    30578  R 8192    450112      2.67 data1
+03:26:55 randread.pl    30578  R 8192    361808      1.78 data1
+03:26:55 randread.pl    30578  R 8192    41088       1.46 data1
+03:26:55 randread.pl    30578  R 8192    756576      1.67 data1
+03:26:55 randread.pl    30578  R 8192    711776      2.74 data1
+03:26:55 randread.pl    30578  R 8192    129472      1.34 data1
+03:26:55 randread.pl    30578  R 8192    526928      1.82 data1
+03:26:56 randread.pl    30578  R 8192    312768      1.44 data1
+03:26:56 randread.pl    30578  R 8192    34720       1.14 data1
+03:26:56 randread.pl    30578  R 8192    258376      1.13 data1
+03:26:56 randread.pl    30578  R 8192    308456      1.44 data1
+03:26:56 randread.pl    30578  R 8192    759656      1.27 data1
+03:26:56 randread.pl    30578  R 8192    387424      3.24 data1
+03:26:56 randread.pl    30578  R 8192    168864      3.38 data1
+03:26:56 randread.pl    30578  R 8192    699296      1.38 data1
+03:26:56 randread.pl    30578  R 8192    405688      2.37 data1
+03:26:56 randread.pl    30578  R 8192    559064      1.18 data1
+03:26:56 randread.pl    30578  R 8192    264808      1.13 data1
+03:26:56 randread.pl    30578  R 8192    369240      2.20 data1
+[...]
+
+There's now much more output (this spans less than 3 seconds, the previous output
+spanned 6 seconds), as the lower threshold is catching more I/O.
+
+
+A threshold of 0 will trace all operations. Warning: the output will be
+verbose, as it will include all file system cache hits.
+
+# ./btrfsslower 0
+Tracing btrfs operations
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+03:28:17 bash           32597  O 0       0           0.00 date.txt
+03:28:17 date           32597  W 29      0           0.02 date.txt
+03:28:23 cksum          32743  O 0       0           0.00 date.txt
+03:28:23 cksum          32743  R 29      0           0.01 date.txt
+03:28:23 cksum          32743  R 0       0           0.00 date.txt
+
+While tracing, the following commands were run in another window:
+
+# date > date.txt
+# cksum date.txt 
+
+The output of btrfsslower now includes open operations ("O"), and writes ("W").
+The first read from cksum(1) returned 29 bytes, and the second returned 0:
+causing cksum(1) to stop reading.
+
+
+A -j option will print just the fields (parsable output, csv):
+
+# ./btrfsslower -j 1
+ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE
+8930665366,randread.pl,2717,R,8192,230391808,4312,data1
+8930670746,randread.pl,2717,R,8192,347832320,1296,data1
+8930675995,randread.pl,2717,R,8192,409812992,4207,data1
+8930680213,randread.pl,2717,R,8192,498204672,3104,data1
+8930685970,randread.pl,2717,R,8192,553164800,1843,data1
+8930687568,randread.pl,2717,R,8192,339492864,1475,data1
+8930694108,randread.pl,2717,R,8192,500711424,6276,data1
+8930697139,randread.pl,2717,R,8192,485801984,2180,data1
+8930705755,randread.pl,2717,R,8192,376922112,7535,data1
+8930711340,randread.pl,2717,R,8192,380084224,3314,data1
+8930740964,randread.pl,2717,R,8192,226091008,24762,data1
+8930743169,randread.pl,2717,R,8192,361570304,1809,data1
+8930748789,randread.pl,2717,R,8192,346931200,1530,data1
+8930763514,randread.pl,2717,R,8192,59719680,13938,data1
+8930764870,randread.pl,2717,R,8192,406511616,1313,data1
+8930774327,randread.pl,2717,R,8192,661430272,7361,data1
+8930780360,randread.pl,2717,R,8192,406904832,2220,data1
+8930785736,randread.pl,2717,R,8192,523419648,2005,data1
+8930794560,randread.pl,2717,R,8192,342974464,8388,data1
+[...]
+
+This may be useful for visualizing with another tool, for example, for
+producing a scatter plot of ENDTIME vs LATENCY, to look for time-based
+patterns.
+
+
+USAGE message:
+
+# ./btrfsslower -h
+usage: btrfsslower [-h] [-j] [-p PID] [min_ms]
+
+Trace common btrfs file operations slower than a threshold
+
+positional arguments:
+  min_ms             minimum I/O duration to trace, in ms (default 10)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -j, --csv          just print fields: comma-separated values
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./btrfsslower             # trace operations slower than 10 ms (default)
+    ./btrfsslower 1           # trace operations slower than 1 ms
+    ./btrfsslower -j 1        # ... 1 ms, parsable output (csv)
+    ./btrfsslower 0           # trace all operations (warning: verbose)
+    ./btrfsslower -p 185      # trace PID 185 only
diff --git a/tools/cachestat.py b/tools/cachestat.py
new file mode 100755
index 0000000..b00c804
--- /dev/null
+++ b/tools/cachestat.py
@@ -0,0 +1,185 @@
+#!/usr/bin/python
+#
+# cachestat     Count cache kernel function calls.
+#               For Linux, uses BCC, eBPF. See .c file.
+#
+# USAGE: cachestat
+# Taken from funccount by Brendan Gregg
+# This is a rewrite of cachestat from perf to bcc
+# https://github.com/brendangregg/perf-tools/blob/master/fs/cachestat
+#
+# Copyright (c) 2016 Allan McAleavy.
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 09-Sep-2015   Brendan Gregg   Created this.
+# 06-Nov-2015   Allan McAleavy
+# 13-Jan-2016   Allan McAleavy  run pep8 against program
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+import re
+from sys import argv
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# Function to gather data from /proc/meminfo
+# return dictionary for quicker lookup of both values
+def get_meminfo():
+    result = dict()
+
+    for line in open('/proc/meminfo'):
+        k = line.split(':', 3)
+        v = k[1].split()
+        result[k[0]] = int(v[0])
+    return result
+
+# set global variables
+mpa = 0
+mbd = 0
+apcl = 0
+apd = 0
+total = 0
+misses = 0
+hits = 0
+debug = 0
+
+# arguments
+parser = argparse.ArgumentParser(
+    description="Count cache kernel function calls",
+    formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("interval", nargs="?", default=5,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=-1,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+count = int(args.count)
+tstamp = args.timestamp
+interval = int(args.interval)
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+struct key_t {
+    u64 ip;
+};
+
+BPF_HASH(counts, struct key_t);
+
+int do_count(struct pt_regs *ctx) {
+    struct key_t key = {};
+    u64 ip;
+
+    key.ip = PT_REGS_IP(ctx);
+    counts.increment(key); // update counter
+    return 0;
+}
+
+"""
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="add_to_page_cache_lru", fn_name="do_count")
+b.attach_kprobe(event="mark_page_accessed", fn_name="do_count")
+b.attach_kprobe(event="account_page_dirtied", fn_name="do_count")
+b.attach_kprobe(event="mark_buffer_dirty", fn_name="do_count")
+
+# header
+if tstamp:
+    print("%-8s " % "TIME", end="")
+print("%8s %8s %8s %8s %12s %10s" %
+     ("TOTAL", "MISSES", "HITS", "DIRTIES", "BUFFERS_MB", "CACHED_MB"))
+
+loop = 0
+exiting = 0
+while 1:
+    if count > 0:
+        loop += 1
+        if loop > count:
+            exit()
+
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        exiting = 1
+        # as cleanup can take many seconds, trap Ctrl-C:
+        signal.signal(signal.SIGINT, signal_ignore)
+
+    counts = b["counts"]
+    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+
+        if re.match(b'mark_page_accessed', b.ksym(k.ip)) is not None:
+            mpa = max(0, v.value)
+
+        if re.match(b'mark_buffer_dirty', b.ksym(k.ip)) is not None:
+            mbd = max(0, v.value)
+
+        if re.match(b'add_to_page_cache_lru', b.ksym(k.ip)) is not None:
+            apcl = max(0, v.value)
+
+        if re.match(b'account_page_dirtied', b.ksym(k.ip)) is not None:
+            apd = max(0, v.value)
+
+        # total = total cache accesses without counting dirties
+        # misses = total of add to lru because of read misses
+        total = (mpa - mbd)
+        misses = (apcl - apd)
+
+        if total < 0:
+            total = 0
+
+        if misses < 0:
+            misses = 0
+
+        hits = total - misses
+
+        # If hits are < 0, then its possible misses are overestimated
+        # due to possibly page cache read ahead adding more pages than
+        # needed. In this case just assume misses as total and reset hits.
+        if hits < 0:
+            misses = total
+            hits = 0
+
+    if debug:
+        print("%d %d %d %d %d %d %d\n" %
+        (mpa, mbd, apcl, apd, total, misses, hits))
+
+    counts.clear()
+
+    # Get memory info
+    mem = get_meminfo()
+    cached = int(mem["Cached"]) / 1024
+    buff = int(mem["Buffers"]) / 1024
+
+    if tstamp:
+        print("%-8s " % strftime("%H:%M:%S"), end="")
+    print("%8d %8d %8d %8d %12.0f %10.0f" %
+    (total, misses, hits, mbd, buff, cached))
+
+    mpa = 0
+    mbd = 0
+    apcl = 0
+    apd = 0
+    total = 0
+    misses = 0
+    hits = 0
+    cached = 0
+    buff = 0
+
+    if exiting:
+        print("Detaching...")
+        exit()
diff --git a/tools/cachestat_example.txt b/tools/cachestat_example.txt
new file mode 100644
index 0000000..7ecfec6
--- /dev/null
+++ b/tools/cachestat_example.txt
@@ -0,0 +1,56 @@
+# ./cachestat -h
+USAGE: ./cachestat [-T] [ interval [count] ]
+
+show Linux page cache hit/miss statistics including read and write hit %
+
+optional arguments:
+  -T              include timestamp on output
+
+examples:
+    ./cachestat             # run with default option of 5 seconds delay
+    ./cachestat -T          # run with default option of 5 seconds delay with timestamps
+    ./cachestat 1           # print every second hit/miss stats
+    ./cachestat -T 1        # include timestamps with one second samples
+    ./cachestat 1 5         # run with interval of one second for five iterations
+    ./cachestat -T 1 5      # include timestamps with interval of one second for five iterations
+    
+
+Following commands show a 2GB file being read into the page cache.
+
+Command used to generate activity:
+# dd if=/root/tmpfile of=/dev/null bs=8192
+
+Output from cachestat running simultaneously:
+# ./tools/cachestat.py 1
+   TOTAL   MISSES     HITS  DIRTIES   BUFFERS_MB  CACHED_MB
+       1        0        1        0            8        283
+       0        0        0        0            8        283
+       0        0        0        2            8        283
+       0        0        0        0            8        283
+   10009     9173      836        2            9        369
+  152032   152032        0        0            9       1028
+  157408   157405        3        0            9       1707
+  150432   150432        0        0            9       2331
+       0        0        0        0            9       2331
+       1        1        0        1            9       2331
+       0        0        0        0            9       2331
+       0        0        0        0            9       2331
+       0        0        0        0            9       2331
+
+The misses counter reflects a 2GB file being read and almost everything being
+a page cache miss.
+
+Below shows an example of a new 100MB file added to page cache, by using
+the command: dd if=/dev/zero of=/root/tmpfile2 bs=4k count=$((256*100))
+
+# ./tools/cachestat.py 1
+   TOTAL   MISSES     HITS  DIRTIES   BUFFERS_MB  CACHED_MB
+       0        0        0        0           15       2440
+       0        0        0        0           15       2440
+       0        0        0        0           15       2440
+    1758        0     1758    25603           15       2540
+       0        0        0        0           15       2540
+       0        0        0        0           15       2541
+
+~25600 pages are being dirtied (writes) which corresponds to the 100MB file
+added to the page cache.
diff --git a/tools/cachetop.py b/tools/cachetop.py
new file mode 100755
index 0000000..1013675
--- /dev/null
+++ b/tools/cachetop.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python
+# @lint-avoid-python-3-compatibility-imports
+#
+# cachetop      Count cache kernel function calls per process
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: cachetop
+# Taken from cachestat by Brendan Gregg
+#
+# Copyright (c) 2016-present, Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Jul-2016   Emmanuel Bretelle first version
+
+from __future__ import absolute_import
+from __future__ import division
+# Do not import unicode_literals until #623 is fixed
+# from __future__ import unicode_literals
+from __future__ import print_function
+
+from bcc import BPF
+from collections import defaultdict
+from time import strftime
+
+import argparse
+import curses
+import pwd
+import re
+import signal
+from time import sleep
+
+FIELDS = (
+    "PID",
+    "UID",
+    "CMD",
+    "HITS",
+    "MISSES",
+    "DIRTIES",
+    "READ_HIT%",
+    "WRITE_HIT%"
+)
+DEFAULT_FIELD = "HITS"
+
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+
+# Function to gather data from /proc/meminfo
+# return dictionary for quicker lookup of both values
+def get_meminfo():
+    result = {}
+
+    for line in open('/proc/meminfo'):
+        k = line.split(':', 3)
+        v = k[1].split()
+        result[k[0]] = int(v[0])
+    return result
+
+
+def get_processes_stats(
+        bpf,
+        sort_field=FIELDS.index(DEFAULT_FIELD),
+        sort_reverse=False):
+    '''
+    Return a tuple containing:
+    buffer
+    cached
+    list of tuple with per process cache stats
+    '''
+    counts = bpf.get_table("counts")
+    stats = defaultdict(lambda: defaultdict(int))
+    for k, v in counts.items():
+        stats["%d-%d-%s" % (k.pid, k.uid, k.comm.decode('utf-8', 'replace'))][k.ip] = v.value
+    stats_list = []
+
+    for pid, count in sorted(stats.items(), key=lambda stat: stat[0]):
+        rtaccess = 0
+        wtaccess = 0
+        mpa = 0
+        mbd = 0
+        apcl = 0
+        apd = 0
+        access = 0
+        misses = 0
+        rhits = 0
+        whits = 0
+
+        for k, v in count.items():
+            if re.match(b'mark_page_accessed', bpf.ksym(k)) is not None:
+                mpa = max(0, v)
+
+            if re.match(b'mark_buffer_dirty', bpf.ksym(k)) is not None:
+                mbd = max(0, v)
+
+            if re.match(b'add_to_page_cache_lru', bpf.ksym(k)) is not None:
+                apcl = max(0, v)
+
+            if re.match(b'account_page_dirtied', bpf.ksym(k)) is not None:
+                apd = max(0, v)
+
+            # access = total cache access incl. reads(mpa) and writes(mbd)
+            # misses = total of add to lru which we do when we write(mbd)
+            # and also the mark the page dirty(same as mbd)
+            access = (mpa + mbd)
+            misses = (apcl + apd)
+
+            # rtaccess is the read hit % during the sample period.
+            # wtaccess is the write hit % during the sample period.
+            if mpa > 0:
+                rtaccess = float(mpa) / (access + misses)
+            if apcl > 0:
+                wtaccess = float(apcl) / (access + misses)
+
+            if wtaccess != 0:
+                whits = 100 * wtaccess
+            if rtaccess != 0:
+                rhits = 100 * rtaccess
+
+        _pid, uid, comm = pid.split('-', 2)
+        stats_list.append(
+            (int(_pid), uid, comm,
+             access, misses, mbd,
+             rhits, whits))
+
+    stats_list = sorted(
+        stats_list, key=lambda stat: stat[sort_field], reverse=sort_reverse
+    )
+    counts.clear()
+    return stats_list
+
+
+def handle_loop(stdscr, args):
+    # don't wait on key press
+    stdscr.nodelay(1)
+    # set default sorting field
+    sort_field = FIELDS.index(DEFAULT_FIELD)
+    sort_reverse = False
+
+    # load BPF program
+    bpf_text = """
+
+    #include <uapi/linux/ptrace.h>
+    struct key_t {
+        u64 ip;
+        u32 pid;
+        u32 uid;
+        char comm[16];
+    };
+
+    BPF_HASH(counts, struct key_t);
+
+    int do_count(struct pt_regs *ctx) {
+        struct key_t key = {};
+        u64 pid = bpf_get_current_pid_tgid();
+        u32 uid = bpf_get_current_uid_gid();
+
+        key.ip = PT_REGS_IP(ctx);
+        key.pid = pid & 0xFFFFFFFF;
+        key.uid = uid & 0xFFFFFFFF;
+        bpf_get_current_comm(&(key.comm), 16);
+
+        counts.increment(key);
+        return 0;
+    }
+
+    """
+    b = BPF(text=bpf_text)
+    b.attach_kprobe(event="add_to_page_cache_lru", fn_name="do_count")
+    b.attach_kprobe(event="mark_page_accessed", fn_name="do_count")
+    b.attach_kprobe(event="account_page_dirtied", fn_name="do_count")
+    b.attach_kprobe(event="mark_buffer_dirty", fn_name="do_count")
+
+    exiting = 0
+
+    while 1:
+        s = stdscr.getch()
+        if s == ord('q'):
+            exiting = 1
+        elif s == ord('r'):
+            sort_reverse = not sort_reverse
+        elif s == ord('<'):
+            sort_field = max(0, sort_field - 1)
+        elif s == ord('>'):
+            sort_field = min(len(FIELDS) - 1, sort_field + 1)
+        try:
+            sleep(args.interval)
+        except KeyboardInterrupt:
+            exiting = 1
+            # as cleanup can take many seconds, trap Ctrl-C:
+            signal.signal(signal.SIGINT, signal_ignore)
+
+        # Get memory info
+        mem = get_meminfo()
+        cached = int(mem["Cached"]) / 1024
+        buff = int(mem["Buffers"]) / 1024
+
+        process_stats = get_processes_stats(
+            b,
+            sort_field=sort_field,
+            sort_reverse=sort_reverse)
+        stdscr.clear()
+        stdscr.addstr(
+            0, 0,
+            "%-8s Buffers MB: %.0f / Cached MB: %.0f "
+            "/ Sort: %s / Order: %s" % (
+                strftime("%H:%M:%S"), buff, cached, FIELDS[sort_field],
+                sort_reverse and "descending" or "ascending"
+            )
+        )
+
+        # header
+        stdscr.addstr(
+            1, 0,
+            "{0:8} {1:8} {2:16} {3:8} {4:8} {5:8} {6:10} {7:10}".format(
+                *FIELDS
+            ),
+            curses.A_REVERSE
+        )
+        (height, width) = stdscr.getmaxyx()
+        for i, stat in enumerate(process_stats):
+            uid = int(stat[1])
+            try:
+                username = pwd.getpwuid(uid)[0]
+            except KeyError as ex:
+                # `pwd` throws a KeyError if the user cannot be found. This can
+                # happen e.g. when the process is running in a cgroup that has
+                # different users from the host.
+                username = 'UNKNOWN({})'.format(uid)
+
+            stdscr.addstr(
+                i + 2, 0,
+                "{0:8} {username:8.8} {2:16} {3:8} {4:8} "
+                "{5:8} {6:9.1f}% {7:9.1f}%".format(
+                    *stat, username=username
+                )
+            )
+            if i > height - 4:
+                break
+        stdscr.refresh()
+        if exiting:
+            print("Detaching...")
+            return
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description='show Linux page cache hit/miss statistics including read '
+                    'and write hit % per processes in a UI like top.'
+    )
+    parser.add_argument(
+        'interval', type=int, default=5, nargs='?',
+        help='Interval between probes.'
+    )
+
+    args = parser.parse_args()
+    return args
+
+args = parse_arguments()
+curses.wrapper(handle_loop, args)
diff --git a/tools/cachetop_example.txt b/tools/cachetop_example.txt
new file mode 100644
index 0000000..ef608b5
--- /dev/null
+++ b/tools/cachetop_example.txt
@@ -0,0 +1,70 @@
+# ./cachetop -h
+usage: cachetop.py [-h] [interval]
+
+show Linux page cache hit/miss statistics including read and write hit % per
+processes in a UI like top.
+
+positional arguments:
+  interval    Interval between probes.
+
+optional arguments:
+  -h, --help  show this help message and exit
+
+examples:
+    ./cachetop             # run with default option of 5 seconds delay
+    ./cachetop 1           # print every second hit/miss stats
+
+# ./cachetop 5
+13:01:01 Buffers MB: 76 / Cached MB: 114 / Sort: HITS / Order: ascending
+PID      UID      CMD              HITS     MISSES   DIRTIES  READ_HIT%  WRITE_HIT%
+       1 root     systemd                 2        0        0     100.0%       0.0%
+     680 root     vminfo                  3        4        2      14.3%      42.9%
+     567 syslog   rs:main Q:Reg          10        4        2      57.1%      21.4%
+     986 root     kworker/u2:2           10     2457        4       0.2%      99.5%
+     988 root     kworker/u2:2           10        9        4      31.6%      36.8%
+     877 vagrant  systemd                18        4        2      72.7%      13.6%
+     983 root     python                148        3      143       3.3%       1.3%
+     981 root     strace                419        3      143      65.4%       0.5%
+     544 messageb dbus-daemon           455      371      454       0.1%       0.4%
+     243 root     jbd2/dm-0-8           457      371      454       0.4%       0.4%
+     985 root     (mount)               560     2457        4      18.4%      81.4%
+     987 root     systemd-udevd         566        9        4      97.7%       1.2%
+     988 root     systemd-cgroups       569        9        4      97.8%       1.2%
+     986 root     modprobe              578        9        4      97.8%       1.2%
+     287 root     systemd-journal       598      371      454      14.9%       0.3%
+     985 root     mount                 692     2457        4      21.8%      78.0%
+     984 vagrant  find                 9529     2457        4      79.5%      20.5%
+
+Above shows the run of `find /` on a newly booted system.
+
+Command used to generate the activity
+# find /
+
+Below shows the hit rate increases as we run find a second time and it gets
+its pages from the cache.
+# ./cachetop.py
+13:01:01 Buffers MB: 76 / Cached MB: 115 / Sort: HITS / Order: ascending
+PID      UID      CMD              HITS     MISSES   DIRTIES  READ_HIT%  WRITE_HIT%
+     544 messageb dbus-daemon             2        2        1      25.0%      50.0%
+     680 root     vminfo                  2        2        1      25.0%      50.0%
+     243 root     jbd2/dm-0-8             3        2        1      40.0%      40.0%
+    1068 root     python                  5        0        0     100.0%       0.0%
+    1071 vagrant  bash                  350        0        0     100.0%       0.0%
+    1071 vagrant  find                12959        0        0     100.0%       0.0%
+
+
+Below shows that the dirty pages increases as a file of 80M is created running
+# dd if=/dev/urandom of=/tmp/c bs=8192 count=10000
+
+# ./cachetop.py 10
+13:01:01 Buffers MB: 77 / Cached MB: 193 / Sort: HITS / Order: ascending
+PID      UID      CMD              HITS     MISSES   DIRTIES  READ_HIT%  WRITE_HIT%
+     544 messageb dbus-daemon             9       10        7      10.5%      15.8%
+     680 root     vminfo                  9       10        7      10.5%      15.8%
+    1109 root     python                 22        0        0     100.0%       0.0%
+     243 root     jbd2/dm-0-8            25       10        7      51.4%       8.6%
+    1070 root     kworker/u2:2           85        0        0     100.0%       0.0%
+    1110 vagrant  bash                  366        0        0     100.0%       0.0%
+    1110 vagrant  dd                  42183    40000    20000      27.0%      24.3%
+
+The file copied into page cache was named /tmp/c with a size of 81920000 (81920000/4096) = 20000
diff --git a/tools/capable.py b/tools/capable.py
new file mode 100755
index 0000000..efcff4d
--- /dev/null
+++ b/tools/capable.py
@@ -0,0 +1,157 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# capable   Trace security capability checks (cap_capable()).
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: capable [-h] [-v] [-p PID]
+#
+# ToDo: add -s for kernel stacks.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Sep-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./capable             # trace capability checks
+    ./capable -v          # verbose: include non-audit checks
+    ./capable -p 181      # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace security capability checks",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="include non-audit checks")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+args = parser.parse_args()
+debug = 0
+
+# capabilities to names, generated from (and will need updating):
+# awk '/^#define.CAP_.*[0-9]$/ { print "    " $3 ": \"" $2 "\"," }' \
+#     include/uapi/linux/capability.h
+capabilities = {
+    0: "CAP_CHOWN",
+    1: "CAP_DAC_OVERRIDE",
+    2: "CAP_DAC_READ_SEARCH",
+    3: "CAP_FOWNER",
+    4: "CAP_FSETID",
+    5: "CAP_KILL",
+    6: "CAP_SETGID",
+    7: "CAP_SETUID",
+    8: "CAP_SETPCAP",
+    9: "CAP_LINUX_IMMUTABLE",
+    10: "CAP_NET_BIND_SERVICE",
+    11: "CAP_NET_BROADCAST",
+    12: "CAP_NET_ADMIN",
+    13: "CAP_NET_RAW",
+    14: "CAP_IPC_LOCK",
+    15: "CAP_IPC_OWNER",
+    16: "CAP_SYS_MODULE",
+    17: "CAP_SYS_RAWIO",
+    18: "CAP_SYS_CHROOT",
+    19: "CAP_SYS_PTRACE",
+    20: "CAP_SYS_PACCT",
+    21: "CAP_SYS_ADMIN",
+    22: "CAP_SYS_BOOT",
+    23: "CAP_SYS_NICE",
+    24: "CAP_SYS_RESOURCE",
+    25: "CAP_SYS_TIME",
+    26: "CAP_SYS_TTY_CONFIG",
+    27: "CAP_MKNOD",
+    28: "CAP_LEASE",
+    29: "CAP_AUDIT_WRITE",
+    30: "CAP_AUDIT_CONTROL",
+    31: "CAP_SETFCAP",
+    32: "CAP_MAC_OVERRIDE",
+    33: "CAP_MAC_ADMIN",
+    34: "CAP_SYSLOG",
+    35: "CAP_WAKE_ALARM",
+    36: "CAP_BLOCK_SUSPEND",
+    37: "CAP_AUDIT_READ",
+}
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct data_t {
+   // switch to u32s when supported
+   u64 pid;
+   u64 uid;
+   int cap;
+   int audit;
+   char comm[TASK_COMM_LEN];
+};
+
+BPF_PERF_OUTPUT(events);
+
+int kprobe__cap_capable(struct pt_regs *ctx, const struct cred *cred,
+    struct user_namespace *targ_ns, int cap, int audit)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER1
+    FILTER2
+
+    u32 uid = bpf_get_current_uid_gid();
+    struct data_t data = {.pid = pid, .uid = uid, .cap = cap, .audit = audit};
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+};
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER1',
+        'if (pid != %s) { return 0; }' % args.pid)
+if not args.verbose:
+    bpf_text = bpf_text.replace('FILTER2', 'if (audit == 0) { return 0; }')
+bpf_text = bpf_text.replace('FILTER1', '')
+bpf_text = bpf_text.replace('FILTER2', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+TASK_COMM_LEN = 16    # linux/sched.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_ulonglong),
+        ("uid", ct.c_ulonglong),
+        ("cap", ct.c_int),
+        ("audit", ct.c_int),
+        ("comm", ct.c_char * TASK_COMM_LEN)
+    ]
+
+# header
+print("%-9s %-6s %-6s %-16s %-4s %-20s %s" % (
+    "TIME", "UID", "PID", "COMM", "CAP", "NAME", "AUDIT"))
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    if event.cap in capabilities:
+        name = capabilities[event.cap]
+    else:
+        name = "?"
+    print("%-9s %-6d %-6d %-16s %-4d %-20s %d" % (strftime("%H:%M:%S"),
+        event.uid, event.pid, event.comm.decode('utf-8', 'replace'),
+        event.cap, name, event.audit))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/capable_example.txt b/tools/capable_example.txt
new file mode 100644
index 0000000..0a63765
--- /dev/null
+++ b/tools/capable_example.txt
@@ -0,0 +1,79 @@
+Demonstrations of capable, the Linux eBPF/bcc version.
+
+
+capable traces calls to the kernel cap_capable() function, which does security
+capability checks, and prints details for each call. For example:
+
+# ./capable.py 
+TIME      UID    PID    COMM             CAP  NAME                 AUDIT
+22:11:23  114    2676   snmpd            12   CAP_NET_ADMIN        1
+22:11:23  0      6990   run              24   CAP_SYS_RESOURCE     1
+22:11:23  0      7003   chmod            3    CAP_FOWNER           1
+22:11:23  0      7003   chmod            4    CAP_FSETID           1
+22:11:23  0      7005   chmod            4    CAP_FSETID           1
+22:11:23  0      7005   chmod            4    CAP_FSETID           1
+22:11:23  0      7006   chown            4    CAP_FSETID           1
+22:11:23  0      7006   chown            4    CAP_FSETID           1
+22:11:23  0      6990   setuidgid        6    CAP_SETGID           1
+22:11:23  0      6990   setuidgid        6    CAP_SETGID           1
+22:11:23  0      6990   setuidgid        7    CAP_SETUID           1
+22:11:24  0      7013   run              24   CAP_SYS_RESOURCE     1
+22:11:24  0      7026   chmod            3    CAP_FOWNER           1
+22:11:24  0      7026   chmod            4    CAP_FSETID           1
+22:11:24  0      7028   chmod            4    CAP_FSETID           1
+22:11:24  0      7028   chmod            4    CAP_FSETID           1
+22:11:24  0      7029   chown            4    CAP_FSETID           1
+22:11:24  0      7029   chown            4    CAP_FSETID           1
+22:11:24  0      7013   setuidgid        6    CAP_SETGID           1
+22:11:24  0      7013   setuidgid        6    CAP_SETGID           1
+22:11:24  0      7013   setuidgid        7    CAP_SETUID           1
+22:11:25  0      7036   run              24   CAP_SYS_RESOURCE     1
+22:11:25  0      7049   chmod            3    CAP_FOWNER           1
+22:11:25  0      7049   chmod            4    CAP_FSETID           1
+22:11:25  0      7051   chmod            4    CAP_FSETID           1
+22:11:25  0      7051   chmod            4    CAP_FSETID           1
+[...]
+
+This can be useful for general debugging, and also security enforcement:
+determining a whitelist of capabilities an application needs.
+
+The output above includes various capability checks: snmpd checking
+CAP_NET_ADMIN, run checking CAP_SYS_RESOURCE, then some short-lived processes
+checking CAP_FOWNER, CAP_FSETID, etc.
+
+To see what each of these capabilities does, check the capabilities(7) man
+page and the kernel source.
+
+
+Sometimes capable catches itself starting up:
+
+# ./capable.py 
+TIME      UID    PID    COMM             CAP  NAME                 AUDIT
+22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
+22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
+22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
+22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
+22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
+22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
+22:22:19  0      21952  run              24   CAP_SYS_RESOURCE     1
+[...]
+
+These are capability checks from BPF and perf_events syscalls.
+
+
+USAGE:
+
+# ./capable.py -h
+usage: capable.py [-h] [-v] [-p PID]
+
+Trace security capability checks
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -v, --verbose      include non-audit checks
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./capable             # trace capability checks
+    ./capable -v          # verbose: include non-audit checks
+    ./capable -p 181      # only trace PID 181
diff --git a/tools/cobjnew.sh b/tools/cobjnew.sh
new file mode 100755
index 0000000..8bcdad3
--- /dev/null
+++ b/tools/cobjnew.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/uobjnew.py -l c "$@"
diff --git a/tools/cobjnew_example.txt b/tools/cobjnew_example.txt
new file mode 120000
index 0000000..a8a83c3
--- /dev/null
+++ b/tools/cobjnew_example.txt
@@ -0,0 +1 @@
+lib/uobjnew_example.txt
\ No newline at end of file
diff --git a/tools/cpudist.py b/tools/cpudist.py
new file mode 100755
index 0000000..4d7c9eb
--- /dev/null
+++ b/tools/cpudist.py
@@ -0,0 +1,189 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# cpudist   Summarize on- and off-CPU time per task as a histogram.
+#
+# USAGE: cpudist [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [interval] [count]
+#
+# This measures the time a task spends on or off the CPU, and shows this time
+# as a histogram, optionally per-process.
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+examples = """examples:
+    cpudist              # summarize on-CPU time as a histogram
+    cpudist -O           # summarize off-CPU time as a histogram
+    cpudist 1 10         # print 1 second summaries, 10 times
+    cpudist -mT 1        # 1s summaries, milliseconds, and timestamps
+    cpudist -P           # show each PID separately
+    cpudist -p 185       # trace PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize on-CPU time per task as a histogram.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-O", "--offcpu", action="store_true",
+    help="measure off-CPU time")
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="millisecond histogram")
+parser.add_argument("-P", "--pids", action="store_true",
+    help="print a histogram per process ID")
+parser.add_argument("-L", "--tids", action="store_true",
+    help="print a histogram per thread ID")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+countdown = int(args.count)
+debug = 0
+
+bpf_text = """#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+"""
+
+if not args.offcpu:
+    bpf_text += "#define ONCPU\n"
+
+bpf_text += """
+typedef struct pid_key {
+    u64 id;
+    u64 slot;
+} pid_key_t;
+
+
+BPF_HASH(start, u32, u64);
+STORAGE
+
+static inline void store_start(u32 tgid, u32 pid, u64 ts)
+{
+    if (FILTER)
+        return;
+
+    start.update(&pid, &ts);
+}
+
+static inline void update_hist(u32 tgid, u32 pid, u64 ts)
+{
+    if (FILTER)
+        return;
+
+    u64 *tsp = start.lookup(&pid);
+    if (tsp == 0)
+        return;
+
+    if (ts < *tsp) {
+        // Probably a clock issue where the recorded on-CPU event had a
+        // timestamp later than the recorded off-CPU event, or vice versa.
+        return;
+    }
+    u64 delta = ts - *tsp;
+    FACTOR
+    STORE
+}
+
+int sched_switch(struct pt_regs *ctx, struct task_struct *prev)
+{
+    u64 ts = bpf_ktime_get_ns();
+    u64 pid_tgid = bpf_get_current_pid_tgid();
+    u32 tgid = pid_tgid >> 32, pid = pid_tgid;
+
+#ifdef ONCPU
+    if (prev->state == TASK_RUNNING) {
+#else
+    if (1) {
+#endif
+        u32 prev_pid = prev->pid;
+        u32 prev_tgid = prev->tgid;
+#ifdef ONCPU
+        update_hist(prev_tgid, prev_pid, ts);
+#else
+        store_start(prev_tgid, prev_pid, ts);
+#endif
+    }
+
+BAIL:
+#ifdef ONCPU
+    store_start(tgid, pid, ts);
+#else
+    update_hist(tgid, pid, ts);
+#endif
+
+    return 0;
+}
+"""
+
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER', 'tgid != %s' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '0')
+if args.milliseconds:
+    bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;')
+    label = "msecs"
+else:
+    bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;')
+    label = "usecs"
+if args.pids or args.tids:
+    section = "pid"
+    pid = "tgid"
+    if args.tids:
+        pid = "pid"
+        section = "tid"
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, pid_key_t);')
+    bpf_text = bpf_text.replace('STORE',
+        'pid_key_t key = {.id = ' + pid + ', .slot = bpf_log2l(delta)}; ' +
+        'dist.increment(key);')
+else:
+    section = ""
+    bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
+    bpf_text = bpf_text.replace('STORE',
+        'dist.increment(bpf_log2l(delta));')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="finish_task_switch", fn_name="sched_switch")
+
+print("Tracing %s-CPU time... Hit Ctrl-C to end." %
+      ("off" if args.offcpu else "on"))
+
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    def pid_to_comm(pid):
+        try:
+            comm = open("/proc/%d/comm" % pid, "r").read()
+            return "%d %s" % (pid, comm)
+        except IOError:
+            return str(pid)
+
+    dist.print_log2_hist(label, section, section_print_fn=pid_to_comm)
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/cpudist_example.txt b/tools/cpudist_example.txt
new file mode 100644
index 0000000..7da4354
--- /dev/null
+++ b/tools/cpudist_example.txt
@@ -0,0 +1,306 @@
+Demonstrations of cpudist.
+
+This program summarizes task on-CPU time as a histogram, showing how long tasks
+spent on the CPU before being descheduled. This provides valuable information
+that can indicate oversubscription (too many tasks for too few processors),
+overhead due to excessive context switching (e.g. a common shared lock for
+multiple threads), uneven workload distribution, too-granular tasks, and more.
+
+Alternatively, the same options are available for summarizing task off-CPU
+time, which helps understand how often threads are being descheduled and how
+long they spend waiting for I/O, locks, timers, and other causes of suspension.
+
+# ./cpudist.py
+Tracing on-CPU time... Hit Ctrl-C to end.
+^C
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 1        |                                        |
+         4 -> 7          : 1        |                                        |
+         8 -> 15         : 13       |**                                      |
+        16 -> 31         : 187      |****************************************|
+        32 -> 63         : 89       |*******************                     |
+        64 -> 127        : 26       |*****                                   |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 1        |                                        |
+
+This is from a mostly idle system. Tasks wake up occasionally and run for only
+a few dozen microseconds, and then get descheduled.
+
+Here's some output from a system that is heavily loaded by threads that perform
+computation but also compete for a lock:
+
+# ./cpudist.py
+Tracing on-CPU time... Hit Ctrl-C to end.
+^C
+     usecs               : count     distribution
+         0 -> 1          : 51       |*                                       |
+         2 -> 3          : 395      |***********                             |
+         4 -> 7          : 259      |*******                                 |
+         8 -> 15         : 61       |*                                       |
+        16 -> 31         : 75       |**                                      |
+        32 -> 63         : 31       |                                        |
+        64 -> 127        : 7        |                                        |
+       128 -> 255        : 5        |                                        |
+       256 -> 511        : 3        |                                        |
+       512 -> 1023       : 5        |                                        |
+      1024 -> 2047       : 6        |                                        |
+      2048 -> 4095       : 4        |                                        |
+      4096 -> 8191       : 1361     |****************************************|
+      8192 -> 16383      : 523      |***************                         |
+     16384 -> 32767      : 3        |                                        |
+
+A bimodal distribution is now clearly visible. Most of the time, tasks were
+able to run for 4-16ms before being descheduled (this is likely the quantum
+length). Occasionally, tasks had to be descheduled a lot earlier -- possibly
+because they competed for a shared lock.
+
+If necessary, you can restrict the output to include only threads from a 
+particular process -- this helps reduce noise:
+
+# ./cpudist.py -p $(pidof parprimes)
+Tracing on-CPU time... Hit Ctrl-C to end.
+^C
+     usecs               : count     distribution
+         0 -> 1          : 3        |                                        |
+         2 -> 3          : 17       |                                        |
+         4 -> 7          : 39       |                                        |
+         8 -> 15         : 52       |*                                       |
+        16 -> 31         : 43       |                                        |
+        32 -> 63         : 12       |                                        |
+        64 -> 127        : 13       |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 1        |                                        |
+       512 -> 1023       : 11       |                                        |
+      1024 -> 2047       : 15       |                                        |
+      2048 -> 4095       : 41       |                                        |
+      4096 -> 8191       : 1134     |************************                |
+      8192 -> 16383      : 1883     |****************************************|
+     16384 -> 32767      : 65       |*                                       |
+
+You can also ask for output at predefined intervals, and include timestamps for
+easier interpretation. While we're at it, the -P switch will print a histogram
+separately for each process:
+
+# ./cpudist.py -TP 5 3
+Tracing on-CPU time... Hit Ctrl-C to end.
+
+03:46:51
+
+pid = 0
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 1        |**                                      |
+         4 -> 7          : 17       |**********************************      |
+         8 -> 15         : 11       |**********************                  |
+        16 -> 31         : 20       |****************************************|
+        32 -> 63         : 15       |******************************          |
+        64 -> 127        : 9        |******************                      |
+       128 -> 255        : 6        |************                            |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 1        |**                                      |
+
+pid = 5068
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 1        |*************                           |
+         4 -> 7          : 3        |****************************************|
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 1        |*************                           |
+
+03:46:56
+
+pid = 0
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 1        |**                                      |
+         4 -> 7          : 19       |****************************************|
+         8 -> 15         : 11       |***********************                 |
+        16 -> 31         : 9        |******************                      |
+        32 -> 63         : 3        |******                                  |
+        64 -> 127        : 1        |**                                      |
+       128 -> 255        : 3        |******                                  |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |**                                      |
+
+pid = 5068
+     usecs               : count     distribution
+         0 -> 1          : 1        |********************                    |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 2        |****************************************|
+
+03:47:01
+
+pid = 0
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 12       |********************************        |
+         8 -> 15         : 15       |****************************************|
+        16 -> 31         : 15       |****************************************|
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 3        |********                                |
+       128 -> 255        : 1        |**                                      |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |**                                      |
+
+pid = 5068
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 1        |******                                  |
+         4 -> 7          : 6        |****************************************|
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 2        |*************                           |
+
+This histogram was obtained while executing `dd if=/dev/zero of=/dev/null` with
+fairly large block sizes.
+
+You could also ask for an off-CPU report using the -O switch. Here's a
+histogram of task block times while the system is heavily loaded:
+
+# ./cpudist.py -O -p $(pidof parprimes)
+Tracing off-CPU time... Hit Ctrl-C to end.
+^C
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 1        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 3        |                                        |
+        64 -> 127        : 1        |                                        |
+       128 -> 255        : 1        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 2        |                                        |
+      1024 -> 2047       : 4        |                                        |
+      2048 -> 4095       : 3        |                                        |
+      4096 -> 8191       : 70       |***                                     |
+      8192 -> 16383      : 867      |****************************************|
+     16384 -> 32767      : 141      |******                                  |
+     32768 -> 65535      : 8        |                                        |
+     65536 -> 131071     : 0        |                                        |
+    131072 -> 262143     : 1        |                                        |
+    262144 -> 524287     : 2        |                                        |
+    524288 -> 1048575    : 3        |                                        |
+
+As you can see, threads are switching out for relatively long intervals, even
+though we know the workload doesn't have any significant blocking. This can be
+a result of over-subscription -- too many threads contending over too few CPUs.
+Indeed, there are four available CPUs and more than four runnable threads:
+
+# nproc
+4
+# cat /proc/loadavg
+0.04 0.11 0.06 9/147 7494
+
+(This shows we have 9 threads runnable out of 147 total. This is more than 4,
+the number of available CPUs.)
+
+Finally, let's ask for a per-thread report and values in milliseconds instead
+of microseconds:
+
+# ./cpudist.py -p $(pidof parprimes) -mL
+Tracing on-CPU time... Hit Ctrl-C to end.
+
+
+tid = 5092
+     msecs               : count     distribution
+         0 -> 1          : 3        |                                        |
+         2 -> 3          : 4        |                                        |
+         4 -> 7          : 4        |                                        |
+         8 -> 15         : 535      |****************************************|
+        16 -> 31         : 14       |*                                       |
+
+tid = 5093
+     msecs               : count     distribution
+         0 -> 1          : 8        |                                        |
+         2 -> 3          : 6        |                                        |
+         4 -> 7          : 4        |                                        |
+         8 -> 15         : 534      |****************************************|
+        16 -> 31         : 12       |                                        |
+
+tid = 5094
+     msecs               : count     distribution
+         0 -> 1          : 38       |***                                     |
+         2 -> 3          : 5        |                                        |
+         4 -> 7          : 5        |                                        |
+         8 -> 15         : 476      |****************************************|
+        16 -> 31         : 25       |**                                      |
+
+tid = 5095
+     msecs               : count     distribution
+         0 -> 1          : 31       |**                                      |
+         2 -> 3          : 6        |                                        |
+         4 -> 7          : 10       |                                        |
+         8 -> 15         : 478      |****************************************|
+        16 -> 31         : 20       |*                                       |
+
+tid = 5096
+     msecs               : count     distribution
+         0 -> 1          : 21       |*                                       |
+         2 -> 3          : 5        |                                        |
+         4 -> 7          : 4        |                                        |
+         8 -> 15         : 523      |****************************************|
+        16 -> 31         : 16       |*                                       |
+
+tid = 5097
+     msecs               : count     distribution
+         0 -> 1          : 11       |                                        |
+         2 -> 3          : 7        |                                        |
+         4 -> 7          : 7        |                                        |
+         8 -> 15         : 502      |****************************************|
+        16 -> 31         : 23       |*                                       |
+
+tid = 5098
+     msecs               : count     distribution
+         0 -> 1          : 21       |*                                       |
+         2 -> 3          : 5        |                                        |
+         4 -> 7          : 3        |                                        |
+         8 -> 15         : 494      |****************************************|
+        16 -> 31         : 28       |**                                      |
+
+tid = 5099
+     msecs               : count     distribution
+         0 -> 1          : 15       |*                                       |
+         2 -> 3          : 4        |                                        |
+         4 -> 7          : 6        |                                        |
+         8 -> 15         : 521      |****************************************|
+        16 -> 31         : 12       |                                        |
+
+It looks like all threads are more-or-less equally busy, and are typically
+switched out after running for 8-15 milliseconds (again, this is the typical
+quantum length).
+
+
+USAGE message:
+
+# ./cpudist.py -h
+
+usage: cpudist.py [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [interval] [count]
+
+Summarize on-CPU time per task as a histogram.
+
+positional arguments:
+  interval            output interval, in seconds
+  count               number of outputs
+
+optional arguments:
+  -h, --help          show this help message and exit
+  -O, --offcpu        measure off-CPU time
+  -T, --timestamp     include timestamp on output
+  -m, --milliseconds  millisecond histogram
+  -P, --pids          print a histogram per process ID
+  -L, --tids          print a histogram per thread ID
+  -p PID, --pid PID   trace this PID only
+
+examples:
+    cpudist              # summarize on-CPU time as a histogram
+    cpudist -O           # summarize off-CPU time as a histogram
+    cpudist 1 10         # print 1 second summaries, 10 times
+    cpudist -mT 1        # 1s summaries, milliseconds, and timestamps
+    cpudist -P           # show each PID separately
+    cpudist -p 185       # trace PID 185 only
diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py
new file mode 100755
index 0000000..b862bad
--- /dev/null
+++ b/tools/cpuunclaimed.py
@@ -0,0 +1,344 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# cpuunclaimed   Sample CPU run queues and calculate unclaimed idle CPU.
+#                For Linux, uses BCC, eBPF.
+#
+# This samples the length of the run queues and determines when there are idle
+# CPUs, yet queued threads waiting their turn. It reports the amount of idle
+# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
+#
+# This situation can happen for a number of reasons:
+#
+# - An application has been bound to some, but not all, CPUs, and has runnable
+#   threads that cannot migrate to other CPUs due to this configuration.
+# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
+#   caches are warm, even if this means short periods of waiting while other
+#   CPUs are idle. The wait period is tunale (see sysctl, kernel.sched*).
+# - Scheduler bugs.
+#
+# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
+# cause for concern. By leaving the CPU idle, overall throughput of the system
+# may be improved. This tool is best for identifying larger issues, > 2%, due
+# to the coarseness of its 99 Hertz samples.
+#
+# This is an experimental tool that currently works by use of sampling to
+# keep overheads low. Tool assumptions:
+#
+# - CPU samples consistently fire around the same offset. There will sometimes
+#   be a lag as a sample is delayed by higher-priority interrupts, but it is
+#   assumed the subsequent samples will catch up to the expected offsets (as
+#   is seen in practice). You can use -J to inspect sample offsets. Some
+#   systems can power down CPUs when idle, and when they wake up again they
+#   may begin firing at a skewed offset: this tool will detect the skew, print
+#   an error, and exit.
+# - All CPUs are online (see ncpu).
+#
+# If this identifies unclaimed CPU, you can double check it by dumping raw
+# samples (-j), as well as using other tracing tools to instrument scheduler
+# events (although this latter approach has much higher overhead).
+#
+# This tool passes all sampled events to user space for post processing.
+# I originally wrote this to do the calculations entirely in kernel context,
+# and only pass a summary. That involves a number of challenges, and the
+# overhead savings may not outweigh the caveats. You can see my WIP here:
+# https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
+#
+# USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
+#
+# If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
+#
+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
+# a version of this tool that may work on Linux 4.6 - 4.8.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Dec-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF, PerfType, PerfSWConfig
+from time import sleep, strftime
+from ctypes import c_int
+import argparse
+import multiprocessing
+from os import getpid, system
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
+                              # output every 1 second (default)
+    ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
+    ./cpuunclaimed -T 1       # 1s summaries and timestamps
+    ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
+"""
+parser = argparse.ArgumentParser(
+    description="Sample CPU run queues and calculate unclaimed idle CPU",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-j", "--csv", action="store_true",
+    help="print sample summaries (verbose) as comma-separated values")
+parser.add_argument("-J", "--fullcsv", action="store_true",
+    help="print sample summaries with extra fields: CPU sample offsets")
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("interval", nargs="?", default=-1,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+countdown = int(args.count)
+frequency = 99                      # sampling rate, Hertz
+dobind = 1
+wakeup_hz = 10                      # frequency to read buffers
+wakeup_s = float(1) / wakeup_hz
+ncpu = multiprocessing.cpu_count()  # assume all are online
+debug = 0
+
+# process arguments
+if args.fullcsv:
+    args.csv = True
+if args.csv:
+    # NOTE(review): this 0.2 appears to be dead -- 'interval' is
+    # unconditionally reassigned from args.interval below (forced to "1"
+    # when unset), so CSV mode never actually gets the faster interval.
+    # Confirm intent before changing.
+    interval = 0.2
+if args.interval != -1 and (args.fullcsv or args.csv):
+    print("ERROR: cannot use interval with either -j or -J. Exiting.")
+    exit()
+if args.interval == -1:
+    args.interval = "1"
+interval = float(args.interval)
+# define BPF program: a perf_event handler that, on each timer sample, reads
+# the current CPU's run-queue length via task->se.cfs_rq->nr_running and
+# submits (timestamp, cpu, length) to user space through a perf buffer.
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <linux/sched.h>
+
+struct data_t {
+    u64 ts;
+    u64 cpu;
+    u64 len;
+};
+
+BPF_PERF_OUTPUT(events);
+
+// Declare enough of cfs_rq to find nr_running, since we can't #import the
+// header. This will need maintenance. It is from kernel/sched/sched.h:
+struct cfs_rq_partial {
+    struct load_weight load;
+    unsigned int nr_running, h_nr_running;
+};
+
+int do_perf_event(struct bpf_perf_event_data *ctx)
+{
+    int cpu = bpf_get_smp_processor_id();
+    u64 now = bpf_ktime_get_ns();
+
+    /*
+     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
+     * unstable interface and may need maintenance. Perhaps a future version
+     * of BPF will support task_rq(p) or something similar as a more reliable
+     * interface.
+     */
+    unsigned int len = 0;
+    struct task_struct *task = NULL;
+    struct cfs_rq_partial *my_q = NULL;
+    task = (struct task_struct *)bpf_get_current_task();
+    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
+    len = my_q->nr_running;
+
+    struct data_t data = {.ts = now, .cpu = cpu, .len = len};
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+"""
+
+# code substitutions
+if debug or args.ebpf:
+    # Dump the generated C program; --ebpf exits after printing it.
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF & perf_events: sample every CPU at 'frequency' Hz using the
+# software TASK_CLOCK event
+b = BPF(text=bpf_text)
+# TODO: check for HW counters first and use if more accurate
+b.attach_perf_event(ev_type=PerfType.SOFTWARE,
+    ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
+    sample_period=0, sample_freq=frequency)
+
+# print the CSV header row (-j/-J), or the interactive banner
+if args.csv:
+    if args.timestamp:
+        print("TIME", end=",")
+    print("TIMESTAMP_ns", end=",")
+    print(",".join("CPU" + str(c) for c in range(ncpu)), end="")
+    if args.fullcsv:
+        print(",", end="")
+        print(",".join("OFFSET_ns_CPU" + str(c) for c in range(ncpu)), end="")
+    print()
+else:
+    print(("Sampling run queues... Output every %s seconds. " +
+          "Hit Ctrl-C to end.") % args.interval)
+# ctypes mirror of 'struct data_t' in the BPF program above, used to decode
+# events read from the perf buffer. Field order and widths must match.
+class Data(ct.Structure):
+    _fields_ = [
+        ("ts", ct.c_ulonglong),
+        ("cpu", ct.c_ulonglong),
+        ("len", ct.c_ulonglong)
+    ]
+
+samples = {}  # ts -> {'cpu': cpu id, 'len': run-queue length at that sample}
+group = {}    # timestamps belonging to the current sample group
+last = 0      # previous sample timestamp, for group-boundary detection
+
+# process event: stash each raw sample keyed by its nanosecond timestamp, for
+# post-processing in the main loop below
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    samples[event.ts] = {}
+    samples[event.ts]['cpu'] = event.cpu
+    samples[event.ts]['len'] = event.len
+
+exiting = 0 if args.interval else 1
+slept = float(0)
+
+# Choose the elapsed time from one sample group to the next that identifies a
+# new sample group (a group being a set of samples from all CPUs). The
+# earliest timestamp is compared in each group. This trigger is also used
+# for sanity testing, if a group's samples exceed half this value.
+trigger = int(0.8 * (1000000000 / frequency))
+
+# read events
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    # allow some buffering by calling sleep(), to reduce the context switch
+    # rate and lower overhead.
+    try:
+        if not exiting:
+            sleep(wakeup_s)
+    except KeyboardInterrupt:
+        exiting = 1
+    b.perf_buffer_poll()
+    slept += wakeup_s
+
+    # only summarize once a full interval of wakeups has accumulated
+    if slept < 0.999 * interval:   # floating point workaround
+        continue
+    slept = 0
+
+    positive = 0  # number of samples where an idle CPU could have run work
+    running = 0
+    idle = 0
+    if debug >= 2:
+        print("DEBUG: begin samples loop, count %d" % len(samples))
+    for e in sorted(samples):
+        if debug >= 2:
+            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
+                  samples[e]['cpu'], samples[e]['len'], e - last,
+                  e - last > trigger))
+
+        # look for time jumps to identify a new sample group
+        if e - last > trigger:
+
+            # find the first and last timestamps of the previous group, for
+            # CSV offsets and the skew sanity test below
+            g_time = 0
+            g_max = 0
+            for ge in sorted(group):
+                if g_time == 0:
+                    g_time = ge
+                g_max = ge
+
+            # process previous sample group
+            if args.csv:
+                # one row per group: run-queue length (and, with -J, sample
+                # offset) per CPU
+                lens = [0] * ncpu
+                offs = [0] * ncpu
+                for ge in sorted(group):
+                    lens[samples[ge]['cpu']] = samples[ge]['len']
+                    if args.fullcsv:
+                        offs[samples[ge]['cpu']] = ge - g_time
+                if g_time > 0:      # else first sample
+                    if args.timestamp:
+                        print("%-8s" % strftime("%H:%M:%S"), end=",")
+                    print("%d" % g_time, end=",")
+                    print(",".join(str(lens[c]) for c in range(ncpu)), end="")
+                    if args.fullcsv:
+                        print(",", end="")
+                        print(",".join(str(offs[c]) for c in range(ncpu)))
+                    else:
+                        print()
+            else:
+                # calculate stats: CPUs running work, and threads queued
+                # beyond the one already running on each CPU
+                g_running = 0
+                g_queued = 0
+                for ge in group:
+                    if samples[ge]['len'] > 0:
+                        g_running += 1
+                    if samples[ge]['len'] > 1:
+                        g_queued += samples[ge]['len'] - 1
+                g_idle = ncpu - g_running
+
+                # calculate the number of threads that could have run as the
+                # minimum of idle and queued
+                if g_idle > 0 and g_queued > 0:
+                    if g_queued > g_idle:
+                        i = g_idle
+                    else:
+                        i = g_queued
+                    positive += i
+                running += g_running
+                idle += g_idle
+
+            # now sanity test, after -J output
+            g_range = g_max - g_time
+            if g_range > trigger / 2:
+                # if a sample group exceeds half the interval, we can no
+                # longer draw conclusions about some CPUs idle while others
+                # have queued work. Error and exit. This can happen when
+                # CPUs power down, then start again on different offsets.
+                # TODO: Since this is a sampling tool, an error margin should
+                # be anticipated, so an improvement may be to bump a counter
+                # instead of exiting, and only exit if this counter shows
+                # a skewed sample rate of over, say, 1%. Such an approach
+                # would allow a small rate of outliers (sampling error),
+                # and, we could tighten the trigger to be, say, trigger / 5.
+                # In the case of a power down, if it's detectable, perhaps
+                # the tool could reinitialize the timers (although exiting
+                # is simple and works).
+                print(("ERROR: CPU samples arrived at skewed offsets " +
+                      "(CPUs may have powered down when idle), " +
+                      "spanning %d ns (expected < %d ns). Debug with -J, " +
+                      "and see the man page. As output may begin to be " +
+                      "unreliable, exiting.") % (g_range, trigger / 2))
+                exit()
+
+            # these are done, remove
+            for ge in sorted(group):
+                del samples[ge]
+
+            # begin next group
+            group = {}
+            last = e
+
+        # stash this timestamp in a sample group dict
+        group[e] = 1
+
+    if not args.csv:
+        total = running + idle
+        unclaimed = util = 0
+
+        if debug:
+            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
+                  positive, running, idle, total, len(samples)))
+
+        if args.timestamp:
+            print("%-8s " % strftime("%H:%M:%S"), end="")
+
+        # output: overall utilization, and the fraction of samples where an
+        # idle CPU coexisted with a queued thread ("unclaimed idle")
+        if total:
+            unclaimed = float(positive) / total
+            util = float(running) / total
+        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
+              100 * unclaimed))
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/cpuunclaimed_example.txt b/tools/cpuunclaimed_example.txt
new file mode 100644
index 0000000..64158a9
--- /dev/null
+++ b/tools/cpuunclaimed_example.txt
@@ -0,0 +1,325 @@
+Demonstrations of cpuunclaimed, the Linux eBPF/bcc version.
+
+
+This tool samples the length of the CPU run queues and determines when there are
+idle CPUs, yet queued threads waiting their turn. It reports the amount of idle
+(yet unclaimed by waiting threads) CPU as a system-wide percentage. For
+example:
+
+# ./cpuunclaimed.py
+Sampling run queues... Output every 1 seconds. Hit Ctrl-C to end.
+%CPU  83.00%, unclaimed idle 0.12%
+%CPU  87.25%, unclaimed idle 0.38%
+%CPU  85.00%, unclaimed idle 0.25%
+%CPU  85.00%, unclaimed idle 0.25%
+%CPU  80.88%, unclaimed idle 0.00%
+%CPU  82.25%, unclaimed idle 0.00%
+%CPU  83.50%, unclaimed idle 0.12%
+%CPU  81.50%, unclaimed idle 0.00%
+%CPU  81.38%, unclaimed idle 0.00%
+[...]
+
+This shows a system running at over 80% CPU utilization, and with less than
+0.5% unclaimed idle CPUs.
+
+Unclaimed idle CPUs can happen for a number of reasons:
+
+- An application has been bound to some, but not all, CPUs, and has runnable
+  threads that cannot migrate to other CPUs due to this configuration.
+- CPU affinity: an optimization that leaves threads on CPUs where the CPU
+  caches are warm, even if this means short periods of waiting while other
+  CPUs are idle. The wait period is tunale (see sysctl, kernel.sched*).
+- Scheduler bugs.
+
+An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
+cause for concern. By leaving the CPU idle, overall throughput of the system
+may be improved. This tool is best for identifying larger issues, > 2%, due
+to the coarseness of its 99 Hertz samples.
+
+
+This is an 8 CPU system, with an 8 CPU-bound threaded application running that
+has been bound to one CPU (via taskset):
+
+# ./cpuunclaimed.py 
+Sampling run queues... Output every 1 seconds. Hit Ctrl-C to end.
+%CPU  12.63%, unclaimed idle 86.36%
+%CPU  12.50%, unclaimed idle 87.50%
+%CPU  12.63%, unclaimed idle 87.37%
+%CPU  12.75%, unclaimed idle 87.25%
+%CPU  12.50%, unclaimed idle 87.50%
+%CPU  12.63%, unclaimed idle 87.37%
+%CPU  12.50%, unclaimed idle 87.50%
+%CPU  12.50%, unclaimed idle 87.50%
+[...]
+
+It shows that 7 of the 8 CPUs (87.5%) are idle at the same time there are
+queued threads waiting to run on CPU. This is an artificial situation caused
+by binding threads to the same CPU, to demonstrate how the tool works.
+
+
+This is an 8 CPU system running a Linux kernel build with "make -j8", and -T
+to print timestamps:
+
+# ./cpuunclaimed.py -T
+Sampling run queues... Output every 1 seconds. Hit Ctrl-C to end.
+22:25:55 %CPU  98.88%, unclaimed idle 0.12%
+22:25:56 %CPU  99.75%, unclaimed idle 0.25%
+22:25:57 %CPU  99.50%, unclaimed idle 0.50%
+22:25:58 %CPU  99.25%, unclaimed idle 0.75%
+22:25:59 %CPU  99.75%, unclaimed idle 0.25%
+22:26:00 %CPU  99.50%, unclaimed idle 0.50%
+22:26:01 %CPU  99.25%, unclaimed idle 0.75%
+22:26:02 %CPU  99.25%, unclaimed idle 0.75%
+22:26:03 %CPU  99.01%, unclaimed idle 0.87%
+22:26:04 %CPU  99.88%, unclaimed idle 0.12%
+22:26:05 %CPU  99.38%, unclaimed idle 0.62%
+
+There's now a consistent, yet small, amount of unclaimed idle CPU. This is
+expected to be deliberate: CPU affinity, as mentioned earlier.
+
+
+The -j option will print raw samples: around one hundred lines of output
+every second. For the same system with a Linux kernel build of "make -j8":
+
+# ./cpuunclaimed.py -j
+TIMESTAMP_ns,CPU0,CPU1,CPU2,CPU3,CPU4,CPU5,CPU6,CPU7
+514606928954752,1,1,1,1,1,1,1,1
+514606939054312,1,1,1,1,1,1,1,2
+514606949156518,1,1,1,1,1,1,1,1
+514606959256596,2,2,1,1,1,1,1,1
+514606969357989,1,1,1,1,1,2,1,1
+514606979459700,1,2,1,1,1,2,1,1
+514606989560481,1,1,1,1,1,1,1,1
+514606999661396,1,1,1,1,1,1,2,1
+514607009795601,1,1,1,1,1,1,1,2
+514607019862711,1,1,1,1,1,1,1,1
+514607029963734,1,1,1,1,1,1,1,1
+514607040062372,1,1,1,1,1,1,1,1
+514607050197735,1,1,1,2,1,1,1,1
+514607060266464,1,1,1,1,1,1,1,2
+514607070368025,1,1,1,1,1,2,1,1
+514607080468375,1,1,1,1,1,1,1,2
+514607090570292,3,2,1,1,1,1,1,1
+514607100670725,1,1,1,1,1,2,1,1
+514607110771946,1,2,1,1,1,1,1,1
+514607120873489,1,1,1,1,2,1,2,1
+514607130973857,2,1,1,1,3,1,1,1
+514607141080056,0,1,1,1,1,2,1,3
+514607151176312,1,1,1,2,1,1,1,1
+514607161277753,1,1,1,1,1,1,2,1
+514607171379095,1,1,1,1,1,1,1,1
+514607181479262,1,1,1,1,1,1,1,1
+514607191580794,3,1,1,1,1,1,1,1
+514607201680952,1,1,1,1,1,1,2,1
+514607211783683,1,1,1,1,1,1,1,1
+514607221883274,1,1,1,1,1,1,0,1
+514607231984244,1,1,1,1,1,1,1,1
+514607242085698,1,1,1,1,1,1,1,1
+514607252216898,1,2,1,1,1,1,1,1
+514607262289420,1,1,1,1,1,2,1,1
+514607272389922,1,1,1,1,1,1,1,1
+514607282489413,1,1,1,1,1,1,1,1
+514607292589950,1,3,1,1,1,1,1,1
+514607302693367,1,1,1,1,2,1,1,1
+514607312793792,1,1,1,1,1,1,1,1
+514607322895249,1,1,1,3,1,1,3,1
+514607332994278,1,0,1,1,1,2,1,2
+514607343095836,1,1,1,1,1,2,1,1
+514607353196533,1,1,1,1,2,1,1,1
+514607363297749,1,1,1,1,1,1,1,2
+514607373399011,1,1,1,1,1,1,1,2
+514607383499730,1,1,1,1,1,1,1,2
+514607393601510,1,1,1,1,1,1,1,2
+514607403704117,2,1,1,1,1,1,1,2
+514607413802700,1,1,1,1,2,1,0,1
+514607423904559,1,1,1,1,1,1,1,1
+[...]
+
+The output is verbose: printing out a timestamp, and then the length of each
+CPU's run queue. The second last line, of timestamp 514607413802700, is an
+example of what this tool detects: CPU 4 has a run queue length of 2, which
+means one thread running and one thread queued, while CPU 6 has a run queue
+length of 0: idle. The very next sample shows all CPUs busy.
+
+
+The -J option prints raw samples with time offsets showing when the samples
+were collected on each CPU. It's mostly useful for debugging the tool itself.
+For example, during a Linux kernel build:
+
+# ./cpuunclaimed.py -J
+TIMESTAMP_ns,CPU0,CPU1,CPU2,CPU3,CPU4,CPU5,CPU6,CPU7,OFFSET_ns_CPU0,OFFSET_ns_CPU1,OFFSET_ns_CPU2,OFFSET_ns_CPU3,OFFSET_ns_CPU4,OFFSET_ns_CPU5,OFFSET_ns_CPU6,OFFSET_ns_CPU7
+514722625198188,1,1,1,1,1,1,1,2,0,28321,51655,73396,89654,111172,132803,159792
+514722635299034,1,1,1,1,1,2,1,1,0,28809,51999,74183,89552,110011,131995,153519
+514722645400274,1,1,1,1,1,1,1,2,0,28024,51333,73652,88964,110075,131973,153568
+514722655501816,1,2,1,1,1,1,1,1,0,28893,51671,75233,89496,109430,131945,153694
+514722665602594,1,1,2,1,1,2,1,1,0,28623,50988,73866,89383,109186,131786,154555
+514722675703498,1,1,1,1,1,1,1,1,0,27379,51031,73175,89625,110380,131482,104811
+514722685804942,1,1,1,1,1,2,1,1,0,27213,50501,72183,88797,108780,130659,152153
+514722695906294,1,1,1,1,1,1,1,1,0,27036,51182,73420,87861,109585,130364,155089
+514722706005778,1,1,1,1,1,1,1,1,0,28492,51851,74138,89744,110208,132462,154060
+514722716060705,1,1,1,1,1,1,1,1,0,154499,152528,155232,155046,154502,178746,200001
+514722726209615,1,1,1,1,1,1,1,1,0,28170,49580,72605,87741,108144,130723,152138
+514722736309475,1,2,1,1,1,1,1,1,0,27421,51386,73061,89358,109457,131273,153005
+514722746410845,1,2,1,1,1,2,1,1,0,27788,50840,72720,88920,109111,131143,152979
+514722756511363,1,1,1,1,1,1,2,1,0,28280,50977,73559,89848,109659,131579,152693
+514722766613044,1,1,1,1,1,1,1,1,0,28046,50812,72754,89160,110108,130735,152948
+514722776712932,1,1,1,2,1,1,1,1,0,28586,51177,73974,89588,109947,132376,154162
+514722786815477,1,1,1,1,1,1,1,1,0,27973,71104,72539,88302,108896,130414,152236
+514722796914955,1,1,1,1,1,1,1,1,0,29054,52354,74214,89592,110615,132586,153925
+514722807044060,1,1,1,1,1,1,1,1,1587130,0,24079,46633,61787,82325,104706,125278
+514722817117432,2,1,2,1,1,1,1,1,0,27628,51038,75138,89724,109340,132426,155348
+514722827218254,1,1,1,1,1,1,2,1,0,29111,51868,74347,88904,109911,132764,153851
+514722837340158,1,1,1,1,1,1,1,1,0,7366,30760,53528,68622,89317,111095,132319
+514722847421305,1,1,1,1,1,1,1,1,0,28257,51105,73841,89037,110820,131605,153368
+514722857521112,1,1,1,1,1,1,1,1,0,28544,51441,73857,89530,110497,131915,153513
+514722867626129,0,2,1,1,1,1,1,1,0,24621,47917,70568,85391,106670,128081,150329
+514722877727183,2,1,1,1,1,1,1,1,0,24869,47630,71547,84761,106048,128444,149285
+514722887824589,1,1,1,1,1,1,2,1,0,28793,51212,73863,89584,109773,132348,153194
+514722897925481,1,1,1,1,1,1,2,1,0,29278,51163,73961,89774,109592,132029,153715
+514722908026097,1,1,1,1,1,1,1,1,0,30630,35595,36210,188001,190815,190072,190732
+514722918127439,1,1,1,1,1,1,1,1,0,28544,51885,73948,89987,109763,132632,154083
+514722928227399,1,1,1,1,1,1,1,1,0,31882,51574,74769,89939,110578,132951,154356
+514722938329471,1,1,1,1,1,1,1,1,0,28498,51304,74101,89670,110278,132653,153176
+514722948430589,1,1,1,1,1,1,1,1,0,27868,50925,73477,89676,109583,132360,153014
+514722958531802,1,1,1,1,1,1,1,1,0,28505,50886,73729,89919,109618,131988,152896
+514722968632181,1,1,1,1,1,1,1,1,0,28492,51749,73977,90334,109816,132897,152890
+514722978733584,1,1,1,1,1,1,1,1,0,28847,50957,74121,90014,110019,132377,152978
+514722988834321,1,1,1,1,1,1,1,1,0,28601,51437,74021,89968,110252,132233,153623
+514722998937170,1,1,2,1,1,1,1,1,0,27007,50044,73259,87725,108663,132194,152459
+514723009036821,1,2,1,2,1,1,1,1,0,28226,50937,73983,89110,110476,131740,153663
+514723019137577,1,1,1,1,1,1,1,1,0,30261,52357,75657,87803,61823,131850,153585
+514723029238745,1,1,1,1,1,1,1,1,0,28030,50752,74452,89240,110791,132187,153327
+514723039339069,1,1,1,1,1,1,1,1,0,29791,52636,75996,90475,110414,132232,154714
+514723049439822,1,1,1,1,2,1,1,1,0,29133,56662,74153,89520,110683,132740,154708
+514723059541617,1,1,1,1,1,1,1,1,0,27932,51480,74644,89656,109176,131499,153732
+514723069642500,1,1,2,1,1,1,2,1,0,27678,51509,73984,90136,110124,131554,153459
+514723079743525,2,1,1,1,1,1,1,1,0,28029,51424,74394,90056,110087,132383,152963
+514723089844091,2,1,1,2,1,1,1,1,0,28944,51692,74440,90339,110402,132722,154083
+514723099945957,1,1,2,1,1,1,1,1,0,28425,51267,73164,89322,115048,114630,115187
+514723110047020,1,1,2,0,1,1,1,2,0,28192,50811,76814,89835,109370,131265,153511
+514723120216662,1,1,2,1,1,2,1,1,29,34,0,4514,19268,40293,62674,84009
+[...]
+
+This is a Xen guest system, and it shows that CPU 0 usually completes first (an
+offset of 0), followed by CPU 1 around 28000 nanoseconds later, and so on.
+The spread of offsets is triggered by the bcc Python library that initializes
+the timers, which steps through the CPUs in sequence, with a small delay
+between them merely from executing its own loop code.
+
+Here's more output during a Linux kernel build:
+
+# ./cpuunclaimed.py -J
+TIMESTAMP_ns,CPU0,CPU1,CPU2,CPU3,CPU4,CPU5,CPU6,CPU7,OFFSET_ns_CPU0,OFFSET_ns_CPU1,OFFSET_ns_CPU2,OFFSET_ns_CPU3,OFFSET_ns_CPU4,OFFSET_ns_CPU5,OFFSET_ns_CPU6,OFFSET_ns_CPU7
+514722625198188,1,1,1,1,1,1,1,2,0,28321,51655,73396,89654,111172,132803,159792
+515700745758947,2,1,1,1,1,1,1,1,0,19835,34891,49397,59364,71988,87571,102769
+515700755860451,2,1,1,1,1,1,1,2,0,19946,34323,49855,59844,72741,87925,102891
+515700765960560,1,1,1,1,1,1,1,1,0,20805,35339,50436,59677,73557,88661,104796
+515700776061744,1,1,1,1,1,1,1,1,1626,77,0,190,153452,154665,178218,154116
+515700786162017,1,1,1,1,1,1,1,1,0,20497,35361,51552,59787,74451,147789,104545
+515700796262811,1,1,1,1,1,1,1,2,0,20910,35657,50805,60175,73953,88492,103527
+515700806364951,1,1,1,1,1,1,1,1,0,20140,35023,50074,59726,72757,88040,102421
+515700816465253,1,1,1,1,1,1,2,1,0,20952,34899,50262,60048,72890,88067,103545
+515700826566573,1,1,1,1,1,1,1,1,0,20898,35490,50609,59805,74060,88550,103354
+515700836667480,1,1,1,1,1,1,2,1,0,20548,34760,50959,59490,73059,87820,103006
+515700846768182,1,1,1,1,1,1,2,1,0,20571,35113,50777,59962,74139,88543,103192
+515700856869468,1,1,2,1,1,2,2,1,0,20932,35382,50510,60106,73739,91818,103684
+515700866971905,1,1,1,2,1,1,1,1,0,19780,33018,49075,58375,71949,86537,102136
+515700877073459,2,1,1,1,1,1,1,1,0,20065,73966,48989,58832,71408,85714,101067
+515700887172772,1,1,1,1,1,1,1,1,0,20909,34608,51493,59890,73564,88668,103454
+515700897273292,1,2,1,1,1,1,1,1,0,20353,35292,50114,59773,73948,88615,103383
+515700907374341,1,1,2,1,1,1,1,1,0,20816,35206,50915,60062,73878,88857,103794
+515700917475331,1,1,6,1,1,2,1,1,0,20752,34931,50280,59764,73781,88329,103234
+515700927576958,1,1,1,1,1,1,1,1,0,19929,34703,50181,59364,73004,88053,103127
+515700937677298,1,1,2,2,1,1,1,1,0,21178,34724,50740,61193,73452,89030,103390
+515700947778409,2,1,1,1,1,1,1,1,0,21059,35604,50853,60098,73919,88675,103506
+515700957879196,2,1,1,1,1,1,1,1,0,21326,35939,51492,60083,74249,89474,103761
+[...]
+
+Notice the tighter range of offsets? I began executing cpuunclaimed when the
+system was idle, and it initialized the CPU timers more quickly, and then I
+began the Linux kernel build.
+
+Here's some different output, this time from a physical system with 4 CPUs,
+also doing a kernel build:
+
+# ./cpuunclaimed.py -J
+TIMESTAMP_ns,CPU0,CPU1,CPU2,CPU3,OFFSET_ns_CPU0,OFFSET_ns_CPU1,OFFSET_ns_CPU2,OFFSET_ns_CPU3
+4429382557480,1,1,1,1,0,6011,10895,16018
+4429392655042,2,1,1,1,0,8217,13661,19378
+4429402757604,1,1,1,1,0,6879,12433,18000
+4429412857809,1,1,1,1,0,8303,13190,18719
+4429422960709,2,1,1,1,0,6095,11234,17079
+4429433060391,1,1,1,2,0,6747,12480,18070
+4429443161699,1,1,1,1,0,6560,12264,17945
+4429453262002,1,2,1,1,0,6992,12644,18341
+4429463363706,1,2,1,1,0,6211,12071,17853
+4429473465571,1,1,1,1,0,5766,11495,17638
+4429483566920,1,1,1,1,0,5223,11736,16358
+4429493666279,1,1,1,1,0,6964,12653,18410
+4429503769113,1,1,1,1,0,5161,11399,16612
+4429513870744,1,1,1,1,0,5943,10583,15768
+4429523969826,1,1,1,1,0,6533,12336,18189
+4429534070311,1,1,1,1,0,6834,12816,18488
+4429544170456,1,1,1,1,0,7284,13401,19129
+4429554274467,1,2,1,1,0,5941,11160,16594
+4429564372365,1,2,1,1,0,7514,13618,19190
+4429574474406,1,2,1,1,0,6687,12650,18248
+4429584574220,1,2,1,1,0,7912,13705,19136
+[...]
+
+If the offset range becomes too great, we can no longer conclude about when
+some CPUs were idle and others had queued work. The tool will detect this,
+and print an error message and exit.
+
+
+Some systems can power down CPUs when idle, and when they wake up again the
+timed samples may resume from different offsets. If this happens, this tool
+can no longer draw conclusions about when some CPUs were idle and others
+had queued work, so it prints an error, and exits. Eg:
+
+# ./cpuunclaimed.py 1
+Sampling run queues... Output every 1 seconds. Hit Ctrl-C to end.
+%CPU   0.25%, unclaimed idle 0.00%
+%CPU   0.75%, unclaimed idle 0.00%
+%CPU   0.25%, unclaimed idle 0.00%
+%CPU   0.00%, unclaimed idle 0.00%
+%CPU   0.00%, unclaimed idle 0.00%
+%CPU   0.12%, unclaimed idle 0.00%
+%CPU   0.00%, unclaimed idle 0.00%
+%CPU   0.25%, unclaimed idle 0.00%
+%CPU   0.00%, unclaimed idle 0.00%
+%CPU   0.12%, unclaimed idle 0.00%
+%CPU   0.13%, unclaimed idle 0.00%
+%CPU   0.12%, unclaimed idle 0.00%
+%CPU   0.00%, unclaimed idle 0.00%
+%CPU   0.00%, unclaimed idle 0.00%
+%CPU   0.00%, unclaimed idle 0.00%
+%CPU   0.00%, unclaimed idle 0.00%
+ERROR: CPU samples arrived at skewed offsets (CPUs may have powered down when idle), spanning 4328176 ns (expected < 4040404 ns). Debug with -J, and see the man page. As output may begin to be unreliable, exiting.
+
+It's expected that this will only really occur on idle systems.
+
+USAGE:
+
+# ./cpuunclaimed.py -h
+usage: cpuunclaimed.py [-h] [-j] [-J] [-T] [interval] [count]
+
+Sample CPU run queues and calculate unclaimed idle CPU
+
+positional arguments:
+  interval         output interval, in seconds
+  count            number of outputs
+
+optional arguments:
+  -h, --help       show this help message and exit
+  -j, --csv        print sample summaries (verbose) as comma-separated values
+  -J, --fullcsv    print sample summaries with extra fields: CPU sample
+                   offsets
+  -T, --timestamp  include timestamp on output
+
+examples:
+    ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
+                              # output every 1 second (default)
+    ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
+    ./cpuunclaimed -T 1       # 1s summaries and timestamps
+    ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
diff --git a/tools/criticalstat.py b/tools/criticalstat.py
new file mode 100755
index 0000000..e45731c
--- /dev/null
+++ b/tools/criticalstat.py
@@ -0,0 +1,331 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# criticalstat  Trace long critical sections (IRQs or preemption disabled)
+#               For Linux, uses BCC, eBPF. Requires kernel built with
+#               CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS
+#
+# USAGE: criticalstat [-h] [-p] [-i] [-d DURATION]
+#
+# Copyright (c) 2018, Google, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# By Joel Fernandes <joel@joelfernandes.org>
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+import ctypes as ct
+import sys
+import subprocess
+import os.path
+
+examples=""
+
+parser = argparse.ArgumentParser(
+    description="Trace long critical sections",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+
+parser.add_argument("-p", "--preemptoff", action="store_true",
+                    help="Find long sections where preemption was off")
+
+parser.add_argument("-i", "--irqoff", action="store_true",
+                    help="Find long sections where IRQ was off")
+
+parser.add_argument("-d", "--duration", default=100,
+                    help="Duration in uS (microseconds) below which we filter")
+
+args = parser.parse_args()
+
+preemptoff = False
+irqoff = False
+
+if args.irqoff:
+    preemptoff = False
+    irqoff = True
+elif args.preemptoff:
+    preemptoff = True
+    irqoff = False
+
+debugfs_path = subprocess.Popen ("cat /proc/mounts | grep -w debugfs" +
+    " | awk '{print $2}'",
+    shell=True,
+    stdout=subprocess.PIPE).stdout.read().split(b"\n")[0]
+
+if debugfs_path == "":
+    print("ERROR: Unable to find debugfs mount point");
+    sys.exit(0);
+
+trace_path = debugfs_path + b"/tracing/events/preemptirq/";
+
+if (not os.path.exists(trace_path + b"irq_disable") or
+   not os.path.exists(trace_path + b"irq_enable") or
+   not os.path.exists(trace_path + b"preempt_disable") or
+   not os.path.exists(trace_path + b"preempt_enable")):
+    print("ERROR: required tracing events are not available\n" +
+        "Make sure the kernel is built with CONFIG_DEBUG_PREEMPT " +
+        "and CONFIG_PREEMPTIRQ_EVENTS enabled. Also please disable " +
+        "CONFIG_PROVE_LOCKING and CONFIG_LOCKDEP on older kernels.")
+    sys.exit(0)
+
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+enum addr_offs {
+    START_CALLER_OFF,
+    START_PARENT_OFF,
+    END_CALLER_OFF,
+    END_PARENT_OFF
+};
+
+struct start_data {
+    u32 addr_offs[2];
+    u64 ts;
+    int idle_skip;
+    int active;
+};
+
+struct data_t {
+    u64 time;
+    s64 stack_id;
+    u32 cpu;
+    u64 id;
+    u32 addrs[4];   /* indexed by addr_offs */
+    char comm[TASK_COMM_LEN];
+};
+
+BPF_STACK_TRACE(stack_traces, 16384);
+BPF_PERCPU_ARRAY(sts, struct start_data, 1);
+BPF_PERCPU_ARRAY(isidle, u64, 1);
+BPF_PERF_OUTPUT(events);
+
+/*
+ * In the below code we install tracepoint probes on preempt or
+ * IRQ disable/enable critical sections and idle events, the cases
+ * are combinations of 4 different states.
+ * The states are defined as:
+ * CSenter: A critical section has been entered. Either due to
+ *          preempt disable or irq disable.
+ * CSexit: A critical section has been exited. Either due to
+ *         preempt enable or irq enable.
+ * Ienter: The CPU has entered an idle state.
+ * Iexit: The CPU has exited an idle state.
+ *
+ * The scenario we are trying to detect is if there is an overlap
+ * between Critical sections and idle entry/exit. If there are any
+ * such cases, we avoid recording those critical sections since they
+ * are not worth while to record and just add noise.
+ */
+TRACEPOINT_PROBE(power, cpu_idle)
+{
+    int idx = 0;
+    u64 val;
+    struct start_data *stdp, std;
+
+    // Mark active sections as that they should be skipped
+
+    // Handle the case CSenter, Ienter, CSexit, Iexit
+    // Handle the case CSenter, Ienter, Iexit, CSexit
+    stdp = sts.lookup(&idx);
+    if (stdp && stdp->active) {
+        /*
+         * Due to verifier issues, we have to copy contents
+         * of stdp onto the stack before the update.
+         * Fix it to directly update once kernel patch d71962f
+         * becomes more widespread.
+         */
+        std = *stdp;
+        std.idle_skip = 1;
+        sts.update(&idx, &std);
+    }
+
+    // Mark CPU as actively within idle or not.
+    if (args->state < 100) {
+        val = 1;
+        isidle.update(&idx, &val);
+    } else {
+        val = 0;
+        isidle.update(&idx, &val);
+    }
+    return 0;
+}
+
+static int in_idle(void)
+{
+     u64 *idlep;
+     int idx = 0;
+
+    // Skip event if we're in idle loop
+    idlep = isidle.lookup(&idx);
+    if (idlep && *idlep)
+        return 1;
+    return 0;
+}
+
+static void reset_state(void)
+{
+    int idx = 0;
+    struct start_data s = {};
+
+    sts.update(&idx, &s);
+}
+
+TRACEPOINT_PROBE(preemptirq, TYPE_disable)
+{
+    int idx = 0;
+    struct start_data s;
+
+    // Handle the case Ienter, CSenter, CSexit, Iexit
+    // Handle the case Ienter, CSenter, Iexit, CSexit
+    if (in_idle()) {
+        reset_state();
+        return 0;
+    }
+
+    u64 ts = bpf_ktime_get_ns();
+
+    s.idle_skip = 0;
+    s.addr_offs[START_CALLER_OFF] = args->caller_offs;
+    s.addr_offs[START_PARENT_OFF] = args->parent_offs;
+    s.ts = ts;
+    s.active = 1;
+
+    sts.update(&idx, &s);
+    return 0;
+}
+
+TRACEPOINT_PROBE(preemptirq, TYPE_enable)
+{
+    int idx = 0;
+    u64 start_ts, end_ts, diff;
+    struct start_data *stdp;
+
+    // Handle the case CSenter, Ienter, CSexit, Iexit
+    // Handle the case Ienter, CSenter, CSexit, Iexit
+    if (in_idle()) {
+        reset_state();
+        return 0;
+    }
+
+    stdp = sts.lookup(&idx);
+    if (!stdp) {
+        reset_state();
+        return 0;
+    }
+
+    // Handle the case Ienter, Csenter, Iexit, Csexit
+    if (!stdp->active) {
+        reset_state();
+        return 0;
+    }
+
+    // Handle the case CSenter, Ienter, Iexit, CSexit
+    if (stdp->idle_skip) {
+        reset_state();
+        return 0;
+    }
+
+    end_ts = bpf_ktime_get_ns();
+    start_ts = stdp->ts;
+
+    if (start_ts > end_ts) {
+        reset_state();
+        return 0;
+    }
+
+    diff = end_ts - start_ts;
+
+    if (diff < DURATION) {
+        reset_state();
+        return 0;
+    }
+
+    u64 id = bpf_get_current_pid_tgid();
+    struct data_t data = {};
+
+    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
+        data.addrs[START_CALLER_OFF] = stdp->addr_offs[START_CALLER_OFF];
+        data.addrs[START_PARENT_OFF] = stdp->addr_offs[START_PARENT_OFF];
+        data.addrs[END_CALLER_OFF] = args->caller_offs;
+        data.addrs[END_PARENT_OFF] = args->parent_offs;
+
+        data.id = id;
+        data.stack_id = stack_traces.get_stackid(args, 0);
+        data.time = diff;
+        data.cpu = bpf_get_smp_processor_id();
+        events.perf_submit(args, &data, sizeof(data));
+    }
+
+    reset_state();
+    return 0;
+}
+"""
+bpf_text = bpf_text.replace('DURATION', '{}'.format(int(args.duration) * 1000))
+
+if preemptoff:
+    bpf_text = bpf_text.replace('TYPE', 'preempt')
+else:
+    bpf_text = bpf_text.replace('TYPE', 'irq')
+
+b = BPF(text=bpf_text)
+
+TASK_COMM_LEN = 16    # linux/sched.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("time", ct.c_ulonglong),
+        ("stack_id", ct.c_longlong),
+        ("cpu", ct.c_int),
+        ("id", ct.c_ulonglong),
+        ("addrs", ct.c_int * 4),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+    ]
+
+def get_syms(kstack):
+    syms = []
+
+    for addr in kstack:
+        s = b.ksym(addr, show_offset=True)
+        syms.append(s)
+
+    return syms
+
+# process event
+def print_event(cpu, data, size):
+    try:
+        global b
+        event = ct.cast(data, ct.POINTER(Data)).contents
+        stack_traces = b['stack_traces']
+        stext = b.ksymname('_stext')
+
+        print("===================================")
+        print("TASK: %s (pid %5d tid %5d) Total Time: %-9.3fus\n\n" % (event.comm, \
+            (event.id >> 32), (event.id & 0xffffffff), float(event.time) / 1000), end="")
+        print("Section start: {} -> {}".format(b.ksym(stext + event.addrs[0]), b.ksym(stext + event.addrs[1])))
+        print("Section end:   {} -> {}".format(b.ksym(stext + event.addrs[2]), b.ksym(stext + event.addrs[3])))
+
+        if event.stack_id >= 0:
+            kstack = stack_traces.walk(event.stack_id)
+            syms = get_syms(kstack)
+            if not syms:
+                return
+
+            for s in syms:
+                print("  ", end="")
+                print("%s" % s)
+        else:
+            print("NO STACK FOUND DUE TO COLLISION")
+        print("===================================")
+        print("")
+    except:
+        sys.exit(0)
+
+b["events"].open_perf_buffer(print_event, page_cnt=256)
+
+print("Finding critical section with {} disabled for > {}us".format(
+    ('preempt' if preemptoff else 'IRQ'), args.duration))
+
+while 1:
+    b.perf_buffer_poll();
diff --git a/tools/criticalstat_example.txt b/tools/criticalstat_example.txt
new file mode 100644
index 0000000..1f53769
--- /dev/null
+++ b/tools/criticalstat_example.txt
@@ -0,0 +1,139 @@
+Demonstrations of criticalstat: Find long atomic critical sections in the kernel.
+
+criticalstat traces and reports occurrences of atomic critical sections in the
+kernel with useful stacktraces showing the origin of them. Such critical
+sections frequently occur due to use of spinlocks, or if interrupts or
+preemption were explicitly disabled by a driver. IRQ routines in Linux are also
+executed with interrupts disabled. There are many reasons. Such critical
+sections are a source of long latency/responsive issues for real-time systems.
+
+This works by probing the preempt/irq and cpuidle tracepoints in the kernel.
+Since this uses BPF, only the root user can use this tool. Further, the kernel
+has to be built with certain CONFIG options enabled in order for it to work:
+CONFIG_PREEMPTIRQ_EVENTS
+CONFIG_DEBUG_PREEMPT
+Additionally, the following options should be turned off on older kernels:
+CONFIG_PROVE_LOCKING
+CONFIG_LOCKDEP
+
+USAGE:
+# ./criticalstat -h
+usage: criticalstat [-h] [-p] [-i] [-d DURATION]
+
+Trace long critical sections
+
+optional arguments:
+  -h, --help            Show this help message and exit
+  -p, --preemptoff      Find long sections where preemption was off
+  -i, --irqoff          Find long sections where IRQ was off
+  -d DURATION, --duration DURATION
+                        Duration in uS (microseconds) below which we filter
+
+examples:
+    ./criticalstat          	# run with default options: irq off for more than 100 uS
+    ./criticalstat -p       	# find sections with preemption disabled for more than 100 uS
+    ./criticalstat -d 500   	# find sections with IRQs disabled for more than 500 uS
+    ./criticalstat -p -d 500	# find sections with preemption disabled for more than 500 uS
+
+The tool runs continuously until interrupted by Ctrl-C
+
+
+By default, criticalstat finds IRQ disable sections for > 100us.
+
+# ./criticalstat
+Finding critical section with IRQ disabled for > 100us
+===================================
+TASK: kworker/u16:5 (pid  5903 tid  5903) Total Time: 194.427  us
+
+Section start: __schedule -> schedule
+Section end:   _raw_spin_unlock_irq -> finish_task_switch
+  trace_hardirqs_on+0xdc
+  trace_hardirqs_on+0xdc
+  _raw_spin_unlock_irq+0x18
+  finish_task_switch+0xf0
+  __schedule+0x8c8
+  preempt_schedule_irq+0x38
+  el1_preempt+0x8
+===================================
+
+
+If too many sections are showing up, the user can raise the threshold to only
+show critical sections that are > 500us by passing "-d" option:
+
+# ./criticalstat -d 500
+Finding critical section with IRQ disabled for > 500us
+===================================
+TASK: crtc_commit:111 (pid   246 tid   246) Total Time: 580.730  us
+
+Section start: clk_enable_lock -> clk_enable
+Section end:   _raw_spin_unlock_irqrestore -> clk_enable
+  trace_hardirqs_on+0xdc
+  trace_hardirqs_on+0xdc
+  _raw_spin_unlock_irqrestore+0x24
+  clk_enable+0x80
+  msm_dss_enable_clk+0x7c
+  sde_power_resource_enable+0x578
+  _sde_crtc_vblank_enable_no_lock+0x68
+  sde_crtc_vblank+0x8c
+  sde_kms_enable_vblank+0x18
+  vblank_ctrl_worker+0xd0
+  kthread_worker_fn+0xf8
+  kthread+0x114
+  ret_from_fork+0x10
+===================================
+
+
+If instead of irq disabled sections, we want to see preempt disabled sections,
+then pass the "-p" option. Below we try to find preempt-disabled critical
+sections that are > 500us.
+
+# ./criticalstat -p -d 500
+Finding critical section with preempt disabled for > 500us
+===================================
+TASK: swapper/1 (pid     0 tid     0) Total Time: 618.437  us
+
+Section start: preempt_count_add -> preempt_count_add
+Section end:   preempt_count_sub -> preempt_count_sub
+  trace_preempt_on+0x98
+  trace_preempt_on+0x98
+  preempt_latency_stop+0x164
+  preempt_count_sub+0x50
+  schedule+0x74
+  schedule_preempt_disabled+0x14
+  cpu_startup_entry+0x84
+  secondary_start_kernel+0x1c8
+  [unknown]
+===================================
+
+
+criticalstat -p can also reflect kernel scheduler issues sometimes. These may
+show up as long preempt-off sections if the functions in the scheduler take a
+long time to run (such as pick_next_task_fair, which selects the next task to
+run).
+
+Following is a report showing a preempt-off latency of 700us during the schedule
+loop's execution:
+
+===================================
+TASK: irq/296-cs35l36 (pid   666 tid   666) Total Time: 732.657  us
+
+Section start: schedule -> schedule
+Section end:   schedule -> schedule
+  trace_preempt_on+0x98
+  trace_preempt_on+0x98
+  preempt_count_sub+0xa4
+  schedule+0x78
+  schedule_timeout+0x80
+  wait_for_common+0xb4
+  wait_for_completion_timeout+0x28
+  geni_i2c_xfer+0x298
+  __i2c_transfer+0x4e0
+  i2c_transfer+0x8
+  irq_thread_fn+0x2c
+  irq_thread+0x160
+  kthread+0x118
+  ret_from_fork+0x10
+===================================
+
+
+See Also: Linux kernel's preemptoff and irqoff tracers which provide similar
+tracing but with some limitations.
diff --git a/tools/cthreads_example.txt b/tools/cthreads_example.txt
new file mode 120000
index 0000000..4e678a8
--- /dev/null
+++ b/tools/cthreads_example.txt
@@ -0,0 +1 @@
+lib/uthreads_example.txt
\ No newline at end of file
diff --git a/tools/dbslower.py b/tools/dbslower.py
new file mode 100755
index 0000000..c523d7a
--- /dev/null
+++ b/tools/dbslower.py
@@ -0,0 +1,233 @@
+#!/usr/bin/python
+#
+# dbslower      Trace MySQL and PostgreSQL queries slower than a threshold.
+#
+# USAGE: dbslower [-v] [-p PID [PID ...]] [-b PATH_TO_BINARY] [-m THRESHOLD]
+#                 {mysql,postgres}
+#
+# By default, a threshold of 1ms is used. Set the threshold to 0 to trace all
+# queries (verbose).
+#
+# Script works in two different modes:
+# 1) USDT probes, which means it needs MySQL and PostgreSQL built with
+# USDT (DTrace) support.
+# 2) uprobe and uretprobe on exported function of binary specified by
+# PATH_TO_BINARY parameter. (At the moment, only MySQL is supported.)
+#
+# If no PID or PATH_TO_BINARY is provided, the script attempts to discover
+# all MySQL or PostgreSQL database processes and uses USDT probes.
+#
+# Strongly inspired by Brendan Gregg's work on the mysqld_qslower script.
+#
+# Copyright 2017, Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0
+#
+# 15-Feb-2017   Sasha Goldshtein   Created this.
+
+from bcc import BPF, USDT
+import argparse
+import re
+import ctypes as ct
+import subprocess
+
+examples = """examples:
+    dbslower postgres            # trace PostgreSQL queries slower than 1ms
+    dbslower postgres -p 188 322 # trace specific PostgreSQL processes
+    dbslower mysql -p 480 -m 30  # trace MySQL queries slower than 30ms
+    dbslower mysql -p 480 -v     # trace MySQL queries & print the BPF program
+    dbslower mysql -x $(which mysqld)  # trace MySQL queries with uprobes
+"""
+parser = argparse.ArgumentParser(
+    description="",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="print the BPF program")
+parser.add_argument("db", choices=["mysql", "postgres"],
+    help="the database engine to use")
+parser.add_argument("-p", "--pid", type=int, nargs='*',
+    dest="pids", metavar="PID", help="the pid(s) to trace")
+parser.add_argument("-x", "--exe", type=str,
+    dest="path", metavar="PATH", help="path to binary")
+parser.add_argument("-m", "--threshold", type=int, default=1,
+    help="trace queries slower than this threshold (ms)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+threshold_ns = args.threshold * 1000000
+
+mode = "USDT"
+if args.path and not args.pids:
+    if args.db == "mysql":
+        regex = "\\w+dispatch_command\\w+"
+        symbols = BPF.get_user_functions_and_addresses(args.path, regex)
+
+        if len(symbols) == 0:
+            print("Can't find function 'dispatch_command' in %s" % (args.path))
+            exit(1)
+
+        (mysql_func_name, addr) = symbols[0]
+
+        if mysql_func_name.find("COM_DATA") >= 0:
+            mode = "MYSQL57"
+        else:
+            mode = "MYSQL56"
+    else:
+        # Placeholder for PostgreSQL
+        # Look on functions initStringInfo, pgstat_report_activity, EndCommand,
+        # NullCommand
+        print("Sorry at the moment PostgreSQL supports only USDT")
+        exit(1)
+
+program = """
+#include <uapi/linux/ptrace.h>
+
+DEFINE_THRESHOLD
+DEFINE_USDT
+DEFINE_MYSQL56
+DEFINE_MYSQL57
+
+struct temp_t {
+    u64 timestamp;
+#ifdef USDT
+    char *query;
+#else
+    /*
+    MySQL clears query packet before uretprobe call - so copy query in advance
+    */
+    char query[256];
+#endif //USDT
+};
+
+struct data_t {
+    u64 pid;
+    u64 timestamp;
+    u64 duration;
+    char query[256];
+};
+
+BPF_HASH(temp, u64, struct temp_t);
+BPF_PERF_OUTPUT(events);
+
+int query_start(struct pt_regs *ctx) {
+
+#if defined(MYSQL56) || defined(MYSQL57)
+/*
+Trace only packets with enum_server_command == COM_QUERY
+*/
+    #ifdef MYSQL56
+    u64 command  = (u64) PT_REGS_PARM1(ctx);
+    #else //MYSQL57
+    u64 command  = (u64) PT_REGS_PARM3(ctx);
+    #endif
+    if (command != 3) return 0;
+#endif
+
+    struct temp_t tmp = {};
+    tmp.timestamp = bpf_ktime_get_ns();
+
+#if defined(MYSQL56)
+    bpf_probe_read(&tmp.query, sizeof(tmp.query), (void*) PT_REGS_PARM3(ctx));
+#elif defined(MYSQL57)
+    void* st = (void*) PT_REGS_PARM2(ctx);
+    char* query;
+    bpf_probe_read(&query, sizeof(query), st);
+    bpf_probe_read(&tmp.query, sizeof(tmp.query), query);
+#else //USDT
+    bpf_usdt_readarg(1, ctx, &tmp.query);
+#endif
+
+    u64 pid = bpf_get_current_pid_tgid();
+    temp.update(&pid, &tmp);
+    return 0;
+}
+
+int query_end(struct pt_regs *ctx) {
+    struct temp_t *tempp;
+    u64 pid = bpf_get_current_pid_tgid();
+    tempp = temp.lookup(&pid);
+    if (!tempp)
+        return 0;
+
+    u64 delta = bpf_ktime_get_ns() - tempp->timestamp;
+#ifdef THRESHOLD
+    if (delta >= THRESHOLD) {
+#endif //THRESHOLD
+        struct data_t data = {};
+        data.pid = pid >> 32;   // only process id
+        data.timestamp = tempp->timestamp;
+        data.duration = delta;
+        bpf_probe_read(&data.query, sizeof(data.query), tempp->query);
+        events.perf_submit(ctx, &data, sizeof(data));
+#ifdef THRESHOLD
+    }
+#endif //THRESHOLD
+    temp.delete(&pid);
+    return 0;
+};
+""".replace("DEFINE_USDT", "#define USDT" if mode == "USDT" else "") \
+   .replace("DEFINE_MYSQL56", "#define MYSQL56" if mode == "MYSQL56" else "") \
+   .replace("DEFINE_MYSQL57", "#define MYSQL57" if mode == "MYSQL57" else "") \
+   .replace("DEFINE_THRESHOLD",
+            "#define THRESHOLD %d" % threshold_ns if threshold_ns > 0 else "")
+
+if mode.startswith("MYSQL"):
+    # Uprobes mode
+    bpf = BPF(text=program)
+    bpf.attach_uprobe(name=args.path, sym=mysql_func_name,
+                      fn_name="query_start")
+    bpf.attach_uretprobe(name=args.path, sym=mysql_func_name,
+                         fn_name="query_end")
+else:
+    # USDT mode
+    if not args.pids or len(args.pids) == 0:
+        if args.db == "mysql":
+            args.pids = map(int, subprocess.check_output(
+                                            "pidof mysqld".split()).split())
+        elif args.db == "postgres":
+            args.pids = map(int, subprocess.check_output(
+                                            "pidof postgres".split()).split())
+
+    usdts = map(lambda pid: USDT(pid=pid), args.pids)
+    for usdt in usdts:
+        usdt.enable_probe("query__start", "query_start")
+        usdt.enable_probe("query__done", "query_end")
+    if args.verbose:
+        print('\n'.join(map(lambda u: u.get_text(), usdts)))
+
+    bpf = BPF(text=program, usdt_contexts=usdts)
+
+if args.verbose or args.ebpf:
+    print(program)
+    if args.ebpf:
+        exit()
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_ulonglong),
+        ("timestamp", ct.c_ulonglong),
+        ("delta", ct.c_ulonglong),
+        ("query", ct.c_char * 256)
+    ]
+
+start = BPF.monotonic_time()
+
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%-14.6f %-6d %8.3f %s" % (
+        float(event.timestamp - start) / 1000000000,
+        event.pid, float(event.delta) / 1000000, event.query))
+
+if mode.startswith("MYSQL"):
+    print("Tracing database queries for application %s slower than %d ms..." %
+        (args.path, args.threshold))
+else:
+    print("Tracing database queries for pids %s slower than %d ms..." %
+        (', '.join(map(str, args.pids)), args.threshold))
+
+print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
+
+bpf["events"].open_perf_buffer(print_event, page_cnt=64)
+while True:
+    bpf.perf_buffer_poll()
diff --git a/tools/dbslower_example.txt b/tools/dbslower_example.txt
new file mode 100644
index 0000000..88cbab0
--- /dev/null
+++ b/tools/dbslower_example.txt
@@ -0,0 +1,88 @@
+Demonstrations of dbslower, the Linux eBPF/bcc version.
+
+
+dbslower traces queries served by a MySQL or PostgreSQL server, and prints
+those that exceed a latency (query time) threshold. By default a threshold of
+1 ms is used. For example:
+
+# dbslower mysql
+Tracing database queries for pids 25776 slower than 1 ms...
+TIME(s)        PID          MS QUERY
+1.315800       25776  2000.999 call getproduct(97)
+3.360380       25776     3.226 call getproduct(6)
+^C
+
+This traced two queries slower than 1ms, one of which is very slow: over 2
+seconds. We can filter out the shorter ones and keep only the really slow ones:
+
+# dbslower mysql -m 1000
+Tracing database queries for pids 25776 slower than 1000 ms...
+TIME(s)        PID          MS QUERY
+1.421264       25776  2002.183 call getproduct(97)
+3.572617       25776  2001.381 call getproduct(97)
+5.661411       25776  2001.867 call getproduct(97)
+7.748296       25776  2001.329 call getproduct(97)
+^C
+
+This looks like a pattern -- we keep making this slow query every 2 seconds
+or so, and it takes approximately 2 seconds to run.
+
+By default, dbslower will try to detect mysqld and postgres processes, but if
+necessary, you can specify the process ids with the -p switch:
+
+# dbslower mysql -p $(pidof mysql)
+Tracing database queries for pids 25776 slower than 1 ms...
+TIME(s)        PID          MS QUERY
+2.002125       25776     3.340 call getproduct(7)
+2.045006       25776  2001.558 call getproduct(97)
+4.131863       25776  2002.275 call getproduct(97)
+6.190513       25776     3.248 call getproduct(33)
+^C
+
+Specifying 0 as the threshold will print all the queries:
+
+# dbslower mysql -m 0
+Tracing database queries for pids 25776 slower than 0 ms...
+TIME(s)        PID          MS QUERY
+6.003720       25776     2.363 /* mysql-connector-java-5.1.40 ( Revision: 402933ef52cad9aa82624e80acbea46e3a701ce6 ) */SELECT  @@session.auto_increment_increment AS auto_increment_increment, @@character_set_client AS character_set_client, @@character_set_connection AS character_set_conn
+6.599219       25776     0.068 SET NAMES latin1
+6.613944       25776     0.057 SET character_set_results = NULL
+6.645228       25776     0.059 SET autocommit=1
+6.653798       25776     0.059 SET sql_mode='NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION,STRICT_TRANS_TABLES'
+6.682184       25776     2.526 select * from users where id = 0
+6.767888       25776     0.288 select id from products where userid = 0
+6.790642       25776     2.255 call getproduct(0)
+6.809865       25776     0.218 call getproduct(1)
+6.846878       25776     0.248 select * from users where id = 1
+6.847623       25776     0.166 select id from products where userid = 1
+6.867363       25776     0.244 call getproduct(2)
+6.868162       25776     0.107 call getproduct(3)
+6.874726       25776     0.208 select * from users where id = 2
+6.881722       25776     0.260 select id from products where userid = 2
+^C
+
+Here we can see the MySQL connector initialization and connection establishment,
+before the actual queries start coming in.
+
+
+USAGE:
+# dbslower -h
+usage: dbslower.py [-h] [-v] [-p [PIDS [PIDS ...]]] [-m THRESHOLD]
+                   {mysql,postgres}
+
+positional arguments:
+  {mysql,postgres}      the database engine to use
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -v, --verbose         print the BPF program
+  -p [PID [PID ...]], --pid [PID [PID ...]]
+                        the pid(s) to trace
+  -m THRESHOLD, --threshold THRESHOLD
+                        trace queries slower than this threshold (ms)
+
+examples:
+    dbslower postgres            # trace PostgreSQL queries slower than 1ms
+    dbslower postgres -p 188 322 # trace specific PostgreSQL processes
+    dbslower mysql -p 480 -m 30  # trace MySQL queries slower than 30ms
+    dbslower mysql -p 480 -v     # trace MySQL queries and print the BPF program
diff --git a/tools/dbstat.py b/tools/dbstat.py
new file mode 100755
index 0000000..a89b097
--- /dev/null
+++ b/tools/dbstat.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+#
+# dbstat        Display a histogram of MySQL and PostgreSQL query latencies.
+#
+# USAGE: dbstat [-v] [-p PID [PID ...]] [-m THRESHOLD] [-u]
+#               [-i INTERVAL] {mysql,postgres}
+#
+# This tool uses USDT probes, which means it needs MySQL and PostgreSQL built
+# with USDT (DTrace) support.
+#
+# Copyright 2017, Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0
+#
+# 15-Feb-2017   Sasha Goldshtein   Created this.
+
+from bcc import BPF, USDT
+import argparse
+import subprocess
+from time import sleep, strftime
+
+examples = """
+    dbstat postgres     # display a histogram of PostgreSQL query latencies
+    dbstat mysql -v     # display MySQL latencies and print the BPF program
+    dbstat mysql -u     # display query latencies in microseconds (default: ms)
+    dbstat mysql -m 5   # trace only queries slower than 5ms
+    dbstat mysql -p 408 # trace queries in a specific process
+"""
+parser = argparse.ArgumentParser(
+    description="",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="print the BPF program")
+parser.add_argument("db", choices=["mysql", "postgres"],
+    help="the database engine to use")
+parser.add_argument("-p", "--pid", type=int, nargs='*',
+    dest="pids", metavar="PID", help="the pid(s) to trace")
+parser.add_argument("-m", "--threshold", type=int, default=0,
+    help="trace queries slower than this threshold (ms)")
+parser.add_argument("-u", "--microseconds", action="store_true",
+    help="display query latencies in microseconds (default: milliseconds)")
+parser.add_argument("-i", "--interval", type=int, default=99999999999,
+    help="print summary at this interval (seconds)")
+args = parser.parse_args()
+
+if not args.pids or len(args.pids) == 0:
+    if args.db == "mysql":
+        args.pids = map(int, subprocess.check_output(
+                                        "pidof mysqld".split()).split())
+    elif args.db == "postgres":
+        args.pids = map(int, subprocess.check_output(
+                                        "pidof postgres".split()).split())
+
+program = """
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(temp, u64, u64);
+BPF_HISTOGRAM(latency);
+
+int probe_start(struct pt_regs *ctx) {
+    u64 timestamp = bpf_ktime_get_ns();
+    u64 pid = bpf_get_current_pid_tgid();
+    temp.update(&pid, &timestamp);
+    return 0;
+}
+
+int probe_end(struct pt_regs *ctx) {
+    u64 *timestampp;
+    u64 pid = bpf_get_current_pid_tgid();
+    timestampp = temp.lookup(&pid);
+    if (!timestampp)
+        return 0;
+
+    u64 delta = bpf_ktime_get_ns() - *timestampp;
+    FILTER
+    delta /= SCALE;
+    latency.increment(bpf_log2l(delta));
+    temp.delete(&pid);
+    return 0;
+}
+"""
+program = program.replace("SCALE", str(1000 if args.microseconds else 1000000))
+program = program.replace("FILTER", "" if args.threshold == 0 else
+        "if (delta / 1000000 < %d) { return 0; }" % args.threshold)
+
+usdts = map(lambda pid: USDT(pid=pid), args.pids)
+for usdt in usdts:
+    usdt.enable_probe("query__start", "probe_start")
+    usdt.enable_probe("query__done", "probe_end")
+
+if args.verbose:
+    print('\n'.join(map(lambda u: u.get_text(), usdts)))
+    print(program)
+
+bpf = BPF(text=program, usdt_contexts=usdts)
+
+print("Tracing database queries for pids %s slower than %d ms..." %
+      (', '.join(map(str, args.pids)), args.threshold))
+
+latencies = bpf["latency"]
+
+def print_hist():
+    print("[%s]" % strftime("%H:%M:%S"))
+    latencies.print_log2_hist("query latency (%s)" %
+                              ("us" if args.microseconds else "ms"))
+    print("")
+    latencies.clear()
+
+while True:
+    try:
+        sleep(args.interval)
+        print_hist()
+    except KeyboardInterrupt:
+        print_hist()
+        break
diff --git a/tools/dbstat_example.txt b/tools/dbstat_example.txt
new file mode 100644
index 0000000..79f17f0
--- /dev/null
+++ b/tools/dbstat_example.txt
@@ -0,0 +1,120 @@
+Demonstrations of dbstat, the Linux eBPF/bcc version.
+
+
+dbstat traces queries performed by a MySQL or PostgreSQL database process, and
+displays a histogram of query latencies. For example:
+
+# dbstat mysql
+Tracing database queries for pids 25776 slower than 0 ms...
+     query latency (ms)  : count     distribution
+         0 -> 1          : 990      |****************************************|
+         2 -> 3          : 7        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 2        |                                        |
+^C
+
+It's immediately evident that the vast majority of queries finish very quickly,
+in under 1ms, but there are some super-slow queries occasionally, in the 1-2
+seconds bucket.
+
+We can filter out the shorter queries with the -m switch:
+
+# dbstat mysql -m 1000
+Tracing database queries for pids 25776 slower than 1000 ms...
+     query latency (ms)  : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 8        |****************************************|
+^C
+
+By default, dbstat will try to detect mysqld and postgres processes, but if
+necessary, you can specify the process ids with the -p switch. Here, the -i
+switch is also used to request histograms at 3 second intervals:
+
+# dbstat mysql -p $(pidof mysql) -i 3
+Tracing database queries for pids 25776 slower than 0 ms...
+[06:14:36]
+     query latency (ms)  : count     distribution
+         0 -> 1          : 758      |****************************************|
+         2 -> 3          : 1        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 1        |                                        |
+
+[06:14:39]
+     query latency (ms)  : count     distribution
+         0 -> 1          : 436      |****************************************|
+         2 -> 3          : 2        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 1        |                                        |
+
+[06:14:42]
+     query latency (ms)  : count     distribution
+         0 -> 1          : 399      |****************************************|
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 1        |                                        |
+^C
+
+
+USAGE:
+# dbstat -h
+usage: dbstat.py [-h] [-v] [-p [PID [PID ...]]] [-m THRESHOLD] [-u]
+                 [-i INTERVAL]
+                 {mysql,postgres}
+
+positional arguments:
+  {mysql,postgres}      the database engine to use
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -v, --verbose         print the BPF program
+  -p [PID [PID ...]], --pid [PID [PID ...]]
+                        the pid(s) to trace
+  -m THRESHOLD, --threshold THRESHOLD
+                        trace queries slower than this threshold (ms)
+  -u, --microseconds    display query latencies in microseconds (default:
+                        milliseconds)
+  -i INTERVAL, --interval INTERVAL
+                        print summary at this interval (seconds)
+
+    dbstat postgres     # display a histogram of PostgreSQL query latencies
+    dbstat mysql -v     # display MySQL latencies and print the BPF program
+    dbstat mysql -u     # display query latencies in microseconds (default: ms)
+    dbstat mysql -m 5   # trace only queries slower than 5ms
+    dbstat mysql -p 408 # trace queries in a specific process
diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py
new file mode 100755
index 0000000..13152c2
--- /dev/null
+++ b/tools/dcsnoop.py
@@ -0,0 +1,165 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# dcsnoop   Trace directory entry cache (dcache) lookups.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: dcsnoop [-h] [-a]
+#
+# By default, this traces every failed dcache lookup, and shows the process
+# performing the lookup and the filename requested. A -a option can be used
+# to show all lookups, not just failed ones.
+#
+# This uses kernel dynamic tracing of the d_lookup() function, and will need
+# to be modified to match kernel changes.
+#
+# Also see dcstat(8), for per-second summaries.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 09-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+import ctypes as ct
+import re
+import time
+
+# arguments
+examples = """examples:
+    ./dcsnoop           # trace failed dcache lookups
+    ./dcsnoop -a        # trace all dcache lookups
+"""
+parser = argparse.ArgumentParser(
+    description="Trace directory entry cache (dcache) lookups",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-a", "--all", action="store_true",
+    help="trace all lookups (default is fails only)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+#define MAX_FILE_LEN  64
+
+enum lookup_type {
+    LOOKUP_MISS,
+    LOOKUP_REFERENCE,
+};
+
+struct entry_t {
+    char name[MAX_FILE_LEN];
+};
+
+BPF_HASH(entrybypid, u32, struct entry_t);
+
+struct data_t {
+    u32 pid;
+    enum lookup_type type;
+    char comm[TASK_COMM_LEN];
+    char filename[MAX_FILE_LEN];
+};
+
+BPF_PERF_OUTPUT(events);
+
+/* from fs/namei.c: */
+struct nameidata {
+        struct path     path;
+        struct qstr     last;
+        // [...]
+};
+
+static inline
+void submit_event(struct pt_regs *ctx, void *name, int type, u32 pid)
+{
+    struct data_t data = {
+        .pid = pid,
+        .type = type,
+    };
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    bpf_probe_read(&data.filename, sizeof(data.filename), name);
+    events.perf_submit(ctx, &data, sizeof(data));
+}
+
+int trace_fast(struct pt_regs *ctx, struct nameidata *nd, struct path *path)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    submit_event(ctx, (void *)nd->last.name, LOOKUP_REFERENCE, pid);
+    return 1;
+}
+
+int kprobe__d_lookup(struct pt_regs *ctx, const struct dentry *parent,
+    const struct qstr *name)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    struct entry_t entry = {};
+    const char *fname = name->name;
+    if (fname) {
+        bpf_probe_read(&entry.name, sizeof(entry.name), (void *)fname);
+    }
+    entrybypid.update(&pid, &entry);
+    return 0;
+}
+
+int kretprobe__d_lookup(struct pt_regs *ctx)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    struct entry_t *ep;
+    ep = entrybypid.lookup(&pid);
+    if (ep == 0 || PT_REGS_RC(ctx) != 0) {
+        return 0;   // missed entry or lookup didn't fail
+    }
+    submit_event(ctx, (void *)ep->name, LOOKUP_MISS, pid);
+    entrybypid.delete(&pid);
+    return 0;
+}
+"""
+
+TASK_COMM_LEN = 16  # linux/sched.h
+MAX_FILE_LEN = 64  # see inline C
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_uint),
+        ("type", ct.c_int),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+        ("filename", ct.c_char * MAX_FILE_LEN),
+    ]
+
+if args.ebpf:
+    print(bpf_text)
+    exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+if args.all:
+    b.attach_kprobe(event="lookup_fast", fn_name="trace_fast")
+
+mode_s = {
+    0: 'M',
+    1: 'R',
+}
+
+start_ts = time.time()
+
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%-11.6f %-6d %-16s %1s %s" % (
+            time.time() - start_ts, event.pid,
+            event.comm.decode('utf-8', 'replace'), mode_s[event.type],
+            event.filename.decode('utf-8', 'replace')))
+
+# header
+print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE"))
+
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/dcsnoop_example.txt b/tools/dcsnoop_example.txt
new file mode 100644
index 0000000..2184db0
--- /dev/null
+++ b/tools/dcsnoop_example.txt
@@ -0,0 +1,96 @@
+Demonstrations of dcsnoop, the Linux eBPF/bcc version.
+
+
+dcsnoop traces directory entry cache (dcache) lookups, and can be used for
+further investigation beyond dcstat(8). The output is likely verbose, as
+dcache lookups are likely frequent. By default, only failed lookups are shown.
+For example:
+
+# ./dcsnoop.py 
+TIME(s)     PID    COMM             T FILE
+0.002837    1643   snmpd            M net/dev
+0.002852    1643   snmpd            M 1643
+0.002856    1643   snmpd            M net
+0.002863    1643   snmpd            M dev
+0.002952    1643   snmpd            M net/if_inet6
+0.002964    1643   snmpd            M if_inet6
+0.003180    1643   snmpd            M net/ipv4/neigh/eth0/retrans_time_ms
+0.003192    1643   snmpd            M ipv4/neigh/eth0/retrans_time_ms
+0.003197    1643   snmpd            M neigh/eth0/retrans_time_ms
+0.003203    1643   snmpd            M eth0/retrans_time_ms
+0.003206    1643   snmpd            M retrans_time_ms
+0.003245    1643   snmpd            M ipv6/neigh/eth0/retrans_time_ms
+0.003249    1643   snmpd            M neigh/eth0/retrans_time_ms
+0.003252    1643   snmpd            M eth0/retrans_time_ms
+0.003255    1643   snmpd            M retrans_time_ms
+0.003287    1643   snmpd            M conf/eth0/forwarding
+0.003292    1643   snmpd            M eth0/forwarding
+0.003295    1643   snmpd            M forwarding
+0.003326    1643   snmpd            M base_reachable_time_ms
+[...]
+
+I ran a drop caches at the same time as executing this tool. The output shows
+the processes, the type of event ("T" column: M == miss, R == reference),
+and the filename for the dcache lookup.
+
+The way the dcache is currently implemented, each component of a path is
+checked in turn. The first line, showing "net/dev" from snmp, will be a lookup
+for "net" in a directory (that isn't shown here). If it finds "net", it will
+then lookup "dev" inside net. You can see this sequence a little later,
+starting at time 0.003180, where a pathname is being searched
+directory by directory.
+
+
+The -a option will show all lookups, although be warned, the output will be
+very verbose. For example:
+
+# ./dcsnoop -a
+TIME(s)     PID    COMM             T FILE
+0.000000    20279  dcsnoop.py       M p_lookup_fast
+0.000010    20279  dcsnoop.py       M enable
+0.000013    20279  dcsnoop.py       M id
+0.000015    20279  dcsnoop.py       M filter
+0.000017    20279  dcsnoop.py       M trigger
+0.000019    20279  dcsnoop.py       M format
+0.006148    20279  dcsnoop.py       R sys/kernel/debug/tracing/trace_pipe
+0.006158    20279  dcsnoop.py       R kernel/debug/tracing/trace_pipe
+0.006161    20279  dcsnoop.py       R debug/tracing/trace_pipe
+0.006164    20279  dcsnoop.py       R tracing/trace_pipe
+0.006166    20279  dcsnoop.py       R trace_pipe
+0.015900    1643   snmpd            R proc/sys/net/ipv6/conf/lo/forwarding
+0.015901    1643   snmpd            R sys/net/ipv6/conf/lo/forwarding
+0.015901    1643   snmpd            R net/ipv6/conf/lo/forwarding
+0.015902    1643   snmpd            R ipv6/conf/lo/forwarding
+0.015903    1643   snmpd            R conf/lo/forwarding
+0.015904    1643   snmpd            R lo/forwarding
+0.015905    1643   snmpd            M lo/forwarding
+0.015908    1643   snmpd            R forwarding
+0.015909    1643   snmpd            M forwarding
+0.015937    1643   snmpd            R proc/sys/net/ipv6/neigh/lo/base_reachable_time_ms
+0.015937    1643   snmpd            R sys/net/ipv6/neigh/lo/base_reachable_time_ms
+0.015938    1643   snmpd            R net/ipv6/neigh/lo/base_reachable_time_ms
+0.015939    1643   snmpd            R ipv6/neigh/lo/base_reachable_time_ms
+0.015940    1643   snmpd            R neigh/lo/base_reachable_time_ms
+0.015941    1643   snmpd            R lo/base_reachable_time_ms
+0.015941    1643   snmpd            R base_reachable_time_ms
+0.015943    1643   snmpd            M base_reachable_time_ms
+0.043569    1876   supervise        M 20281
+0.043573    1886   supervise        M 20280
+0.043582    1886   supervise        R supervise/status.new
+[...]
+
+
+USAGE message:
+
+# ./dcsnoop.py -h
+usage: dcsnoop.py [-h] [-a]
+
+Trace directory entry cache (dcache) lookups
+
+optional arguments:
+  -h, --help  show this help message and exit
+  -a, --all   trace all lookups (default is fails only)
+
+examples:
+    ./dcsnoop           # trace failed dcache lookups
+    ./dcsnoop -a        # trace all dcache lookups
diff --git a/tools/dcstat.py b/tools/dcstat.py
new file mode 100755
index 0000000..5ecddd1
--- /dev/null
+++ b/tools/dcstat.py
@@ -0,0 +1,143 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# dcstat   Directory entry cache (dcache) stats.
+#          For Linux, uses BCC, eBPF.
+#
+# USAGE: dcstat [interval [count]]
+#
+# This uses kernel dynamic tracing of kernel functions, lookup_fast() and
+# d_lookup(), which will need to be modified to match kernel changes. See
+# code comments.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 09-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from ctypes import c_int
+from time import sleep, strftime
+from sys import argv
+
+def usage():
+    print("USAGE: %s [interval [count]]" % argv[0])
+    exit()
+
+# arguments
+interval = 1
+count = -1
+if len(argv) > 1:
+    try:
+        interval = int(argv[1])
+        if interval == 0:
+            raise
+        if len(argv) > 2:
+            count = int(argv[2])
+    except:  # also catches -h, --help
+        usage()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+enum stats {
+    S_REFS = 1,
+    S_SLOW,
+    S_MISS,
+    S_MAXSTAT
+};
+
+BPF_ARRAY(stats, u64, S_MAXSTAT);
+
+/*
+ * How this is instrumented, and how to interpret the statistics, is very much
+ * tied to the current kernel implementation (this was written on Linux 4.4).
+ * This will need maintenance to keep working as the implementation changes. To
+ * aid future adventurers, this is what the current code does, and why.
+ *
+ * First problem: the current implementation takes a path and then does a
+ * lookup of each component. So how do we count a reference? Once for the path
+ * lookup, or once for every component lookup? I've chosen the latter
+ * since it seems to map more closely to actual dcache lookups (via
+ * __d_lookup_rcu()). It's counted via calls to lookup_fast().
+ *
+ * The implementation tries different, progressively slower, approaches to
+ * lookup a file. At what point do we call it a dcache miss? I've chosen when
+ * a d_lookup() (which is called during lookup_slow()) returns zero.
+ *
+ * I've also included a "SLOW" statistic to show how often the fast lookup
+ * failed. Whether this exists or is interesting is an implementation detail,
+ * and the "SLOW" statistic may be removed in future versions.
+ */
+void count_fast(struct pt_regs *ctx) {
+    int key = S_REFS;
+    u64 *leaf = stats.lookup(&key);
+    if (leaf) (*leaf)++;
+}
+
+void count_lookup(struct pt_regs *ctx) {
+    int key = S_SLOW;
+    u64 *leaf = stats.lookup(&key);
+    if (leaf) (*leaf)++;
+    if (PT_REGS_RC(ctx) == 0) {
+        key = S_MISS;
+        leaf = stats.lookup(&key);
+        if (leaf) (*leaf)++;
+    }
+}
+"""
+
+# load BPF program
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="lookup_fast", fn_name="count_fast")
+b.attach_kretprobe(event="d_lookup", fn_name="count_lookup")
+
+# stat column labels and indexes
+stats = {
+    "REFS": 1,
+    "SLOW": 2,
+    "MISS": 3
+}
+
+# header
+print("%-8s  " % "TIME", end="")
+for stype, idx in sorted(stats.items(), key=lambda k_v: (k_v[1], k_v[0])):
+    print(" %8s" % (stype + "/s"), end="")
+print(" %8s" % "HIT%")
+
+# output
+i = 0
+while (1):
+    if count > 0:
+        i += 1
+        if i > count:
+            exit()
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        pass
+        exit()
+
+    print("%-8s: " % strftime("%H:%M:%S"), end="")
+
+    # print each statistic as a column
+    for stype, idx in sorted(stats.items(), key=lambda k_v: (k_v[1], k_v[0])):
+        try:
+            val = b["stats"][c_int(idx)].value / interval
+            print(" %8d" % val, end="")
+        except:
+            print(" %8d" % 0, end="")
+
+    # print hit ratio percentage
+    try:
+        ref = b["stats"][c_int(stats["REFS"])].value
+        miss = b["stats"][c_int(stats["MISS"])].value
+        hit = ref - miss
+        pct = float(100) * hit / ref
+        print(" %8.2f" % pct)
+    except:
+        print(" %7s%%" % "-")
+
+    b["stats"].clear()
diff --git a/tools/dcstat_example.txt b/tools/dcstat_example.txt
new file mode 100644
index 0000000..574473f
--- /dev/null
+++ b/tools/dcstat_example.txt
@@ -0,0 +1,108 @@
+Demonstrations of dcstat, the Linux eBPF/bcc version.
+
+
+dcstat shows directory entry cache (dcache) statistics. For example:
+
+# ./dcstat 
+TIME         REFS/s   SLOW/s   MISS/s     HIT%
+08:11:47:      2059      141       97    95.29
+08:11:48:     79974      151      106    99.87
+08:11:49:    192874      146      102    99.95
+08:11:50:      2051      144      100    95.12
+08:11:51:     73373    17239    17194    76.57
+08:11:52:     54685    25431    25387    53.58
+08:11:53:     18127     8182     8137    55.12
+08:11:54:     22517    10345    10301    54.25
+08:11:55:      7524     2881     2836    62.31
+08:11:56:      2067      141       97    95.31
+08:11:57:      2115      145      101    95.22
+
+The output shows the total references per second ("REFS/s"), the number that
+took a slower code path to be processed ("SLOW/s"), the number of dcache misses
+("MISS/s"), and the hit ratio as a percentage. By default, an interval of 1
+second is used.
+
+At 08:11:49, there were 192 thousand references, which almost entirely hit
+from the dcache, with a hit ratio of 99.95%. A little later, starting at
+08:11:51, a workload began that walked many uncached files, reducing the hit
+ratio to 53%, and more importantly, a miss rate of over 10 thousand per second.
+
+
+Here's an interesting workload:
+
+# ./dcstat 
+TIME         REFS/s   SLOW/s   MISS/s     HIT%
+08:15:53:    250683      141       97    99.96
+08:15:54:    266115      145      101    99.96
+08:15:55:    268428      141       97    99.96
+08:15:56:    260389      143       99    99.96
+
+It's a 99.96% hit ratio, and these are all negative hits: accessing a file that
+does not exist. Here's the C program that generated the workload:
+
+# cat -n badopen.c
+     1	#include <sys/types.h>
+     2	#include <sys/stat.h>
+     3	#include <fcntl.h>
+     4	
+     5	int
+     6	main(int argc, char *argv[])
+     7	{
+     8	    int fd;
+     9	    while (1) {
+    10	        fd = open("bad", O_RDONLY);
+    11	    }
+    12	    return 0;
+    13	}
+
+This is a simple workload generator that tries to open a missing file ("bad")
+as quickly as possible.
+
+
+Let's see what happens if the workload attempts to open a different filename
+each time (which is also a missing file), using the following C code:
+
+# cat -n badopen2.c
+     1	#include <sys/types.h>
+     2	#include <sys/stat.h>
+     3	#include <fcntl.h>
+     4	#include <stdio.h>
+     5	
+     6	int
+     7	main(int argc, char *argv[])
+     8	{
+     9	    int fd, i = 0;
+    10	    char buf[128] = {};
+    11	
+    12	    while (1) {
+    13	        sprintf(buf, "bad%d", i++);
+    14	        fd = open(buf, O_RDONLY);
+    15	    }
+    16	    return 0;
+    17	}
+
+Here's dcstat:
+
+# ./dcstat 
+TIME         REFS/s   SLOW/s   MISS/s     HIT%
+08:18:52:    241131   237544   237505     1.51
+08:18:53:    238210   236323   236278     0.82
+08:18:54:    235259   233307   233261     0.85
+08:18:55:    233144   231256   231214     0.83
+08:18:56:    231981   230097   230053     0.83
+
+
+dcstat also supports an optional interval and optional count. For example,
+printing 5 second summaries 3 times:
+
+# ./dcstat 5 3
+TIME         REFS/s   SLOW/s   MISS/s     HIT%
+08:20:03:      2085      143       99    95.23
+08:20:08:      2077      143       98    95.24
+08:20:14:      2071      144      100    95.15
+
+
+USAGE message:
+
+# ./dcstat -h
+USAGE: ./dcstat [interval [count]]
diff --git a/tools/deadlock_detector.c b/tools/deadlock_detector.c
new file mode 100644
index 0000000..09899b0
--- /dev/null
+++ b/tools/deadlock_detector.c
@@ -0,0 +1,206 @@
+/*
+ * deadlock_detector.c  Detects potential deadlocks in a running process.
+ *                      For Linux, uses BCC, eBPF. See .py file.
+ *
+ * Copyright 2017 Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * 1-Feb-2016   Kenny Yu   Created this.
+ */
+
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+
+// Maximum number of mutexes a single thread can hold at once.
+// If the number is too big, the unrolled loops will cause the stack
+// to be too big, and the bpf verifier will fail.
+#define MAX_HELD_MUTEXES 16
+
+// Info about held mutexes. `mutex` will be 0 if not held.
+struct held_mutex_t {
+  u64 mutex;
+  u64 stack_id;
+};
+
+// List of mutexes that a thread is holding. Whenever we loop over this array,
+// we need to force the compiler to unroll the loop, otherwise the bcc verifier
+// will fail because the loop will create a backwards edge.
+struct thread_to_held_mutex_leaf_t {
+  struct held_mutex_t held_mutexes[MAX_HELD_MUTEXES];
+};
+
+// Map of thread ID -> array of (mutex addresses, stack id)
+BPF_HASH(thread_to_held_mutexes, u32, struct thread_to_held_mutex_leaf_t, 2097152);
+
+// Key type for edges. Represents an edge from mutex1 => mutex2.
+struct edges_key_t {
+  u64 mutex1;
+  u64 mutex2;
+};
+
+// Leaf type for edges. Holds information about where each mutex was acquired.
+struct edges_leaf_t {
+  u64 mutex1_stack_id;
+  u64 mutex2_stack_id;
+  u32 thread_pid;
+  char comm[TASK_COMM_LEN];
+};
+
+// Represents all edges currently in the mutex wait graph.
+BPF_HASH(edges, struct edges_key_t, struct edges_leaf_t, 2097152);
+
+// Info about parent thread when a child thread is created.
+struct thread_created_leaf_t {
+  u64 stack_id;
+  u32 parent_pid;
+  char comm[TASK_COMM_LEN];
+};
+
+// Map of child thread pid -> info about parent thread.
+BPF_HASH(thread_to_parent, u32, struct thread_created_leaf_t);
+
+// Stack traces when threads are created and when mutexes are locked/unlocked.
+BPF_STACK_TRACE(stack_traces, 655360);
+
+// The first argument to the user space function we are tracing
+// is a pointer to the mutex M held by thread T.
+//
+// For all mutexes N held by mutexes_held[T]
+//   add edge N => M (held by T)
+// mutexes_held[T].add(M)
+int trace_mutex_acquire(struct pt_regs *ctx, void *mutex_addr) {
+  // Higher 32 bits is process ID, Lower 32 bits is thread ID
+  u32 pid = bpf_get_current_pid_tgid();
+  u64 mutex = (u64)mutex_addr;
+
+  struct thread_to_held_mutex_leaf_t empty_leaf = {};
+  struct thread_to_held_mutex_leaf_t *leaf =
+      thread_to_held_mutexes.lookup_or_init(&pid, &empty_leaf);
+  if (!leaf) {
+    bpf_trace_printk(
+        "could not add thread_to_held_mutex key, thread: %d, mutex: %p\n", pid,
+        mutex);
+    return 1; // Could not insert, no more memory
+  }
+
+  // Recursive mutexes lock the same mutex multiple times. We cannot tell if
+  // the mutex is recursive after the mutex is already created. To avoid noisy
+  // reports, disallow self edges. Do one pass to check if we are already
+  // holding the mutex, and if we are, do nothing.
+  #pragma unroll
+  for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
+    if (leaf->held_mutexes[i].mutex == mutex) {
+      return 1; // Disallow self edges
+    }
+  }
+
+  u64 stack_id =
+      stack_traces.get_stackid(ctx, BPF_F_USER_STACK | BPF_F_REUSE_STACKID);
+
+  int added_mutex = 0;
+  #pragma unroll
+  for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
+    // If this is a free slot, see if we can insert.
+    if (!leaf->held_mutexes[i].mutex) {
+      if (!added_mutex) {
+        leaf->held_mutexes[i].mutex = mutex;
+        leaf->held_mutexes[i].stack_id = stack_id;
+        added_mutex = 1;
+      }
+      continue; // Nothing to do for a free slot
+    }
+
+    // Add edges from held mutex => current mutex
+    struct edges_key_t edge_key = {};
+    edge_key.mutex1 = leaf->held_mutexes[i].mutex;
+    edge_key.mutex2 = mutex;
+
+    struct edges_leaf_t edge_leaf = {};
+    edge_leaf.mutex1_stack_id = leaf->held_mutexes[i].stack_id;
+    edge_leaf.mutex2_stack_id = stack_id;
+    edge_leaf.thread_pid = pid;
+    bpf_get_current_comm(&edge_leaf.comm, sizeof(edge_leaf.comm));
+
+    // Returns non-zero on error
+    int result = edges.update(&edge_key, &edge_leaf);
+    if (result) {
+      bpf_trace_printk("could not add edge key %p, %p, error: %d\n",
+                       edge_key.mutex1, edge_key.mutex2, result);
+      continue; // Could not insert, no more memory
+    }
+  }
+
+  // There were no free slots for this mutex.
+  if (!added_mutex) {
+    bpf_trace_printk("could not add mutex %p, added_mutex: %d\n", mutex,
+                     added_mutex);
+    return 1;
+  }
+  return 0;
+}
+
+// The first argument to the user space function we are tracing
+// is a pointer to the mutex M held by thread T.
+//
+// mutexes_held[T].remove(M)
+int trace_mutex_release(struct pt_regs *ctx, void *mutex_addr) {
+  // Higher 32 bits is process ID, Lower 32 bits is thread ID
+  u32 pid = bpf_get_current_pid_tgid();
+  u64 mutex = (u64)mutex_addr;
+
+  struct thread_to_held_mutex_leaf_t *leaf =
+      thread_to_held_mutexes.lookup(&pid);
+  if (!leaf) {
+    // If the leaf does not exist for the pid, then it means we either missed
+    // the acquire event, or we had no more memory and could not add it.
+    bpf_trace_printk(
+        "could not find thread_to_held_mutex, thread: %d, mutex: %p\n", pid,
+        mutex);
+    return 1;
+  }
+
+  // For older kernels without "Bpf: allow access into map value arrays"
+  // (https://lkml.org/lkml/2016/8/30/287) the bpf verifier will fail with an
+  // invalid memory access on `leaf->held_mutexes[i]` below. On newer kernels,
+  // we can avoid making this extra copy in `value` and use `leaf` directly.
+  struct thread_to_held_mutex_leaf_t value = {};
+  bpf_probe_read(&value, sizeof(struct thread_to_held_mutex_leaf_t), leaf);
+
+  #pragma unroll
+  for (int i = 0; i < MAX_HELD_MUTEXES; ++i) {
+    // Find the current mutex (if it exists), and clear it.
+    // Note: Can't use `leaf->` in this if condition, see comment above.
+    if (value.held_mutexes[i].mutex == mutex) {
+      leaf->held_mutexes[i].mutex = 0;
+      leaf->held_mutexes[i].stack_id = 0;
+    }
+  }
+
+  return 0;
+}
+
+// Trace return from clone() syscall in the child thread (return value > 0).
+int trace_clone(struct pt_regs *ctx, unsigned long flags, void *child_stack,
+                void *ptid, void *ctid, struct pt_regs *regs) {
+  u32 child_pid = PT_REGS_RC(ctx);
+  if (child_pid <= 0) {
+    return 1;
+  }
+
+  struct thread_created_leaf_t thread_created_leaf = {};
+  thread_created_leaf.parent_pid = bpf_get_current_pid_tgid();
+  thread_created_leaf.stack_id =
+      stack_traces.get_stackid(ctx, BPF_F_USER_STACK | BPF_F_REUSE_STACKID);
+  bpf_get_current_comm(&thread_created_leaf.comm,
+                       sizeof(thread_created_leaf.comm));
+
+  struct thread_created_leaf_t *insert_result =
+      thread_to_parent.lookup_or_init(&child_pid, &thread_created_leaf);
+  if (!insert_result) {
+    bpf_trace_printk(
+        "could not add thread_created_key, child: %d, parent: %d\n", child_pid,
+        thread_created_leaf.parent_pid);
+    return 1; // Could not insert, no more memory
+  }
+  return 0;
+}
diff --git a/tools/deadlock_detector.py b/tools/deadlock_detector.py
new file mode 100755
index 0000000..cbc0691
--- /dev/null
+++ b/tools/deadlock_detector.py
@@ -0,0 +1,556 @@
+#!/usr/bin/env python
+#
+# deadlock_detector  Detects potential deadlocks (lock order inversions)
+#                    on a running process. For Linux, uses BCC, eBPF.
+#
+# USAGE: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH]
+#                             [--verbose] [--lock-symbols LOCK_SYMBOLS]
+#                             [--unlock-symbols UNLOCK_SYMBOLS]
+#                             pid
+#
+# This traces pthread mutex lock and unlock calls to build a directed graph
+# representing the mutex wait graph:
+#
+# - Nodes in the graph represent mutexes.
+# - Edge (A, B) exists if there exists some thread T where lock(A) was called
+#   and lock(B) was called before unlock(A) was called.
+#
+# If the program finds a potential lock order inversion, the program will dump
+# the cycle of mutexes and the stack traces where each mutex was acquired, and
+# then exit.
+#
+# This program can only find potential deadlocks that occur while the program
+# is tracing the process. It cannot find deadlocks that may have occurred
+# before the program was attached to the process.
+#
+# Since this traces all mutex lock and unlock events and all thread creation
+# events on the traced process, the overhead of this bpf program can be very
+# high if the process has many threads and mutexes. You should only run this on
+# a process where the slowdown is acceptable.
+#
+# Note: This tool does not work for shared mutexes or recursive mutexes.
+#
+# For shared (read-write) mutexes, a deadlock requires a cycle in the wait
+# graph where at least one of the mutexes in the cycle is acquiring exclusive
+# (write) ownership.
+#
+# For recursive mutexes, lock() is called multiple times on the same mutex.
+# However, there is no way to determine if a mutex is a recursive mutex
+# after the mutex has been created. As a result, this tool will not find
+# potential deadlocks that involve only one mutex.
+#
+# Copyright 2017 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 01-Feb-2017   Kenny Yu   Created this.
+
+from __future__ import (
+    absolute_import, division, unicode_literals, print_function
+)
+from bcc import BPF
+from collections import defaultdict
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+
+
+class DiGraph(object):
+    '''
+    Adapted from networkx: http://networkx.github.io/
+    Represents a directed graph. Edges can store (key, value) attributes.
+    '''
+
+    def __init__(self):
+        # Map of node -> set of nodes
+        self.adjacency_map = {}
+        # Map of (node1, node2) -> map string -> arbitrary attribute
+        # This will not be copied in subgraph()
+        self.attributes_map = {}
+
+    def neighbors(self, node):
+        return self.adjacency_map.get(node, set())
+
+    def edges(self):
+        edges = []
+        for node, neighbors in self.adjacency_map.items():
+            for neighbor in neighbors:
+                edges.append((node, neighbor))
+        return edges
+
+    def nodes(self):
+        return self.adjacency_map.keys()
+
+    def attributes(self, node1, node2):
+        return self.attributes_map[(node1, node2)]
+
+    def add_edge(self, node1, node2, **kwargs):
+        if node1 not in self.adjacency_map:
+            self.adjacency_map[node1] = set()
+        if node2 not in self.adjacency_map:
+            self.adjacency_map[node2] = set()
+        self.adjacency_map[node1].add(node2)
+        self.attributes_map[(node1, node2)] = kwargs
+
+    def remove_node(self, node):
+        self.adjacency_map.pop(node, None)
+        for _, neighbors in self.adjacency_map.items():
+            neighbors.discard(node)
+
+    def subgraph(self, nodes):
+        graph = DiGraph()
+        for node in nodes:
+            for neighbor in self.neighbors(node):
+                if neighbor in nodes:
+                    graph.add_edge(node, neighbor)
+        return graph
+
+    def node_link_data(self):
+        '''
+        Returns the graph as a dictionary in a format that can be
+        serialized.
+        '''
+        data = {
+            'directed': True,
+            'multigraph': False,
+            'graph': {},
+            'links': [],
+            'nodes': [],
+        }
+
+        # Do one pass to build a map of node -> position in nodes
+        node_to_number = {}
+        for node in self.adjacency_map.keys():
+            node_to_number[node] = len(data['nodes'])
+            data['nodes'].append({'id': node})
+
+        # Do another pass to build the link information
+        for node, neighbors in self.adjacency_map.items():
+            for neighbor in neighbors:
+                link = self.attributes_map[(node, neighbor)].copy()
+                link['source'] = node_to_number[node]
+                link['target'] = node_to_number[neighbor]
+                data['links'].append(link)
+        return data
+
+
+def strongly_connected_components(G):
+    '''
+    Adapted from networkx: http://networkx.github.io/
+    Parameters
+    ----------
+    G : DiGraph
+    Returns
+    -------
+    comp : generator of sets
+        A generator of sets of nodes, one for each strongly connected
+        component of G.
+    '''
+    preorder = {}
+    lowlink = {}
+    scc_found = {}
+    scc_queue = []
+    i = 0  # Preorder counter
+    for source in G.nodes():
+        if source not in scc_found:
+            queue = [source]
+            while queue:
+                v = queue[-1]
+                if v not in preorder:
+                    i = i + 1
+                    preorder[v] = i
+                done = 1
+                v_nbrs = G.neighbors(v)
+                for w in v_nbrs:
+                    if w not in preorder:
+                        queue.append(w)
+                        done = 0
+                        break
+                if done == 1:
+                    lowlink[v] = preorder[v]
+                    for w in v_nbrs:
+                        if w not in scc_found:
+                            if preorder[w] > preorder[v]:
+                                lowlink[v] = min([lowlink[v], lowlink[w]])
+                            else:
+                                lowlink[v] = min([lowlink[v], preorder[w]])
+                    queue.pop()
+                    if lowlink[v] == preorder[v]:
+                        scc_found[v] = True
+                        scc = {v}
+                        while (
+                            scc_queue and preorder[scc_queue[-1]] > preorder[v]
+                        ):
+                            k = scc_queue.pop()
+                            scc_found[k] = True
+                            scc.add(k)
+                        yield scc
+                    else:
+                        scc_queue.append(v)
+
+
+def simple_cycles(G):
+    '''
+    Adapted from networkx: http://networkx.github.io/
+    Parameters
+    ----------
+    G : DiGraph
+    Returns
+    -------
+    cycle_generator: generator
+       A generator that produces elementary cycles of the graph.
+       Each cycle is represented by a list of nodes along the cycle.
+    '''
+
+    def _unblock(thisnode, blocked, B):
+        stack = set([thisnode])
+        while stack:
+            node = stack.pop()
+            if node in blocked:
+                blocked.remove(node)
+                stack.update(B[node])
+                B[node].clear()
+
+    # Johnson's algorithm requires some ordering of the nodes.
+    # We assign the arbitrary ordering given by the strongly connected comps
+    # There is no need to track the ordering as each node removed as processed.
+    # save the actual graph so we can mutate it here
+    # We only take the edges because we do not want to
+    # copy edge and node attributes here.
+    subG = G.subgraph(G.nodes())
+    sccs = list(strongly_connected_components(subG))
+    while sccs:
+        scc = sccs.pop()
+        # order of scc determines ordering of nodes
+        startnode = scc.pop()
+        # Processing node runs 'circuit' routine from recursive version
+        path = [startnode]
+        blocked = set()  # vertex: blocked from search?
+        closed = set()  # nodes involved in a cycle
+        blocked.add(startnode)
+        B = defaultdict(set)  # graph portions that yield no elementary circuit
+        stack = [(startnode, list(subG.neighbors(startnode)))]
+        while stack:
+            thisnode, nbrs = stack[-1]
+            if nbrs:
+                nextnode = nbrs.pop()
+                if nextnode == startnode:
+                    yield path[:]
+                    closed.update(path)
+                elif nextnode not in blocked:
+                    path.append(nextnode)
+                    stack.append((nextnode, list(subG.neighbors(nextnode))))
+                    closed.discard(nextnode)
+                    blocked.add(nextnode)
+                    continue
+            # done with nextnode... look for more neighbors
+            if not nbrs:  # no more nbrs
+                if thisnode in closed:
+                    _unblock(thisnode, blocked, B)
+                else:
+                    for nbr in subG.neighbors(thisnode):
+                        if thisnode not in B[nbr]:
+                            B[nbr].add(thisnode)
+                stack.pop()
+                path.pop()
+        # done processing this node
+        subG.remove_node(startnode)
+        H = subG.subgraph(scc)  # make smaller to avoid work in SCC routine
+        sccs.extend(list(strongly_connected_components(H)))
+
+
+def find_cycle(graph):
+    '''
+    Looks for a cycle in the graph. If found, returns the first cycle.
+    If nodes a1, a2, ..., an are in a cycle, then this returns:
+        [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)]
+    Otherwise returns an empty list.
+    '''
+    cycles = list(simple_cycles(graph))
+    if cycles:
+        nodes = cycles[0]
+        nodes.append(nodes[0])
+        edges = []
+        prev = nodes[0]
+        for node in nodes[1:]:
+            edges.append((prev, node))
+            prev = node
+        return edges
+    else:
+        return []
+
+
+def print_cycle(binary, graph, edges, thread_info, print_stack_trace_fn):
+    '''
+    Prints the cycle in the mutex graph in the following format:
+
+    Potential Deadlock Detected!
+
+    Cycle in lock order graph: M0 => M1 => M2 => M0
+
+    for (m, n) in cycle:
+        Mutex n acquired here while holding Mutex m in thread T:
+            [ stack trace ]
+
+        Mutex m previously acquired by thread T here:
+            [ stack trace ]
+
+    for T in all threads:
+        Thread T was created here:
+            [ stack trace ]
+    '''
+
+    # List of mutexes in the cycle, first and last repeated
+    nodes_in_order = []
+    # Map mutex address -> readable alias
+    node_addr_to_name = {}
+    for counter, (m, n) in enumerate(edges):
+        nodes_in_order.append(m)
+        # For global or static variables, try to symbolize the mutex address.
+        symbol = symbolize_with_objdump(binary, m)
+        if symbol:
+            symbol += ' '
+        node_addr_to_name[m] = 'Mutex M%d (%s0x%016x)' % (counter, symbol, m)
+    nodes_in_order.append(nodes_in_order[0])
+
+    print('----------------\nPotential Deadlock Detected!\n')
+    print(
+        'Cycle in lock order graph: %s\n' %
+        (' => '.join([node_addr_to_name[n] for n in nodes_in_order]))
+    )
+
+    # Set of threads involved in the lock inversion
+    thread_pids = set()
+
+    # For each edge in the cycle, print where the two mutexes were held
+    for (m, n) in edges:
+        thread_pid = graph.attributes(m, n)['thread_pid']
+        thread_comm = graph.attributes(m, n)['thread_comm']
+        first_mutex_stack_id = graph.attributes(m, n)['first_mutex_stack_id']
+        second_mutex_stack_id = graph.attributes(m, n)['second_mutex_stack_id']
+        thread_pids.add(thread_pid)
+        print(
+            '%s acquired here while holding %s in Thread %d (%s):' % (
+                node_addr_to_name[n], node_addr_to_name[m], thread_pid,
+                thread_comm
+            )
+        )
+        print_stack_trace_fn(second_mutex_stack_id)
+        print('')
+        print(
+            '%s previously acquired by the same Thread %d (%s) here:' %
+            (node_addr_to_name[m], thread_pid, thread_comm)
+        )
+        print_stack_trace_fn(first_mutex_stack_id)
+        print('')
+
+    # Print where the threads were created, if available
+    for thread_pid in thread_pids:
+        parent_pid, stack_id, parent_comm = thread_info.get(
+            thread_pid, (None, None, None)
+        )
+        if parent_pid:
+            print(
+                'Thread %d created by Thread %d (%s) here: ' %
+                (thread_pid, parent_pid, parent_comm)
+            )
+            print_stack_trace_fn(stack_id)
+        else:
+            print(
+                'Could not find stack trace where Thread %d was created' %
+                thread_pid
+            )
+        print('')
+
+
+def symbolize_with_objdump(binary, addr):
+    '''
+    Searches the binary for the address using objdump. Returns the symbol if
+    it is found, otherwise returns empty string.
+    '''
+    try:
+        command = (
+            'objdump -tT %s | grep %x | awk {\'print $NF\'} | c++filt' %
+            (binary, addr)
+        )
+        output = subprocess.check_output(command, shell=True)
+        return output.decode('utf-8').strip()
+    except subprocess.CalledProcessError:
+        return ''
+
+
+def strlist(s):
+    '''Given a comma-separated string, returns a list of substrings'''
+    return s.strip().split(',')
+
+
+def main():
+    examples = '''Examples:
+    deadlock_detector 181        # Analyze PID 181
+
+    deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0
+                                 # Analyze PID 181 and locks from this binary.
+                                 # If tracing a process that is running from
+                                 # a dynamically-linked binary, this argument
+                                 # is required and should be the path to the
+                                 # pthread library.
+
+    deadlock_detector 181 --verbose
+                                 # Analyze PID 181 and print statistics about
+                                 # the mutex wait graph.
+
+    deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\
+        --unlock-symbols my_mutex_unlock1,my_mutex_unlock2
+                                 # Analyze PID 181 and trace custom mutex
+                                 # symbols instead of pthread mutexes.
+
+    deadlock_detector 181 --dump-graph graph.json
+                                 # Analyze PID 181 and dump the mutex wait
+                                 # graph to graph.json.
+    '''
+    parser = argparse.ArgumentParser(
+        description=(
+            'Detect potential deadlocks (lock inversions) in a running binary.'
+            '\nMust be run as root.'
+        ),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=examples,
+    )
+    parser.add_argument('pid', type=int, help='Pid to trace')
+    # Binaries with `:` in the path will fail to attach uprobes on kernels
+    # running without this patch: https://lkml.org/lkml/2017/1/13/585.
+    # Symlinks to the binary without `:` in the path can get around this issue.
+    parser.add_argument(
+        '--binary',
+        type=str,
+        default='',
+        help='If set, trace the mutexes from the binary at this path. '
+        'For statically-linked binaries, this argument is not required. '
+        'For dynamically-linked binaries, this argument is required and '
+        'should be the path of the pthread library the binary is using. '
+        'Example: /lib/x86_64-linux-gnu/libpthread.so.0',
+    )
+    parser.add_argument(
+        '--dump-graph',
+        type=str,
+        default='',
+        help='If set, this will dump the mutex graph to the specified file.',
+    )
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        help='Print statistics about the mutex wait graph.',
+    )
+    parser.add_argument(
+        '--lock-symbols',
+        type=strlist,
+        default=['pthread_mutex_lock'],
+        help='Comma-separated list of lock symbols to trace. Default is '
+        'pthread_mutex_lock. These symbols cannot be inlined in the binary.',
+    )
+    parser.add_argument(
+        '--unlock-symbols',
+        type=strlist,
+        default=['pthread_mutex_unlock'],
+        help='Comma-separated list of unlock symbols to trace. Default is '
+        'pthread_mutex_unlock. These symbols cannot be inlined in the binary.',
+    )
+    args = parser.parse_args()
+    if not args.binary:
+        try:
+            args.binary = os.readlink('/proc/%d/exe' % args.pid)
+        except OSError as e:
+            print('%s. Is the process (pid=%d) running?' % (str(e), args.pid))
+            sys.exit(1)
+
+    bpf = BPF(src_file=b'deadlock_detector.c')
+
+    # Trace where threads are created
+    bpf.attach_kretprobe(event=bpf.get_syscall_fnname('clone'), fn_name='trace_clone')
+
+    # We must trace unlock first, otherwise in the time we attached the probe
+    # on lock() and have not yet attached the probe on unlock(), a thread can
+    # acquire mutexes and release them, but the release events will not be
+    # traced, resulting in noisy reports.
+    for symbol in args.unlock_symbols:
+        try:
+            bpf.attach_uprobe(
+                name=args.binary,
+                sym=symbol,
+                fn_name='trace_mutex_release',
+                pid=args.pid,
+            )
+        except Exception as e:
+            print('%s. Failed to attach to symbol: %s' % (str(e), symbol))
+            sys.exit(1)
+    for symbol in args.lock_symbols:
+        try:
+            bpf.attach_uprobe(
+                name=args.binary,
+                sym=symbol,
+                fn_name='trace_mutex_acquire',
+                pid=args.pid,
+            )
+        except Exception as e:
+            print('%s. Failed to attach to symbol: %s' % (str(e), symbol))
+            sys.exit(1)
+
+    def print_stack_trace(stack_id):
+        '''Closure that prints the symbolized stack trace.'''
+        for addr in bpf.get_table('stack_traces').walk(stack_id):
+            line = bpf.sym(addr, args.pid)
+            # Try to symbolize with objdump if we cannot with bpf.
+            if line == '[unknown]':
+                symbol = symbolize_with_objdump(args.binary, addr)
+                if symbol:
+                    line = symbol
+            print('@ %016x %s' % (addr, line))
+
+    print('Tracing... Hit Ctrl-C to end.')
+    while True:
+        try:
+            # Map of child thread pid -> parent info
+            thread_info = {
+                child.value: (parent.parent_pid, parent.stack_id, parent.comm)
+                for child, parent in bpf.get_table('thread_to_parent').items()
+            }
+
+            # Mutex wait directed graph. Nodes are mutexes. Edge (A,B) exists
+            # if there exists some thread T where lock(A) was called and
+            # lock(B) was called before unlock(A) was called.
+            graph = DiGraph()
+            for key, leaf in bpf.get_table('edges').items():
+                graph.add_edge(
+                    key.mutex1,
+                    key.mutex2,
+                    thread_pid=leaf.thread_pid,
+                    thread_comm=leaf.comm.decode('utf-8'),
+                    first_mutex_stack_id=leaf.mutex1_stack_id,
+                    second_mutex_stack_id=leaf.mutex2_stack_id,
+                )
+            if args.verbose:
+                print(
+                    'Mutexes: %d, Edges: %d' %
+                    (len(graph.nodes()), len(graph.edges()))
+                )
+            if args.dump_graph:
+                with open(args.dump_graph, 'w') as f:
+                    data = graph.node_link_data()
+                    f.write(json.dumps(data, indent=2))
+
+            cycle = find_cycle(graph)
+            if cycle:
+                print_cycle(
+                    args.binary, graph, cycle, thread_info, print_stack_trace
+                )
+                sys.exit(1)
+
+            time.sleep(1)
+        except KeyboardInterrupt:
+            break
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/deadlock_detector_example.txt b/tools/deadlock_detector_example.txt
new file mode 100644
index 0000000..6cd2395
--- /dev/null
+++ b/tools/deadlock_detector_example.txt
@@ -0,0 +1,365 @@
+Demonstrations of deadlock_detector.
+
+This program detects potential deadlocks on a running process. The program
+attaches uprobes on `pthread_mutex_lock` and `pthread_mutex_unlock` to build
+a mutex wait directed graph, and then looks for a cycle in this graph. This
+graph has the following properties:
+
+- Nodes in the graph represent mutexes.
+- Edge (A, B) exists if there exists some thread T where lock(A) was called
+  and lock(B) was called before unlock(A) was called.
+
+If there is a cycle in this graph, this indicates that there is a lock order
+inversion (potential deadlock). If the program finds a lock order inversion, the
+program will dump the cycle of mutexes, dump the stack traces where each mutex
+was acquired, and then exit.
+
+This program can only find potential deadlocks that occur while the program
+is tracing the process. It cannot find deadlocks that may have occurred
+before the program was attached to the process.
+
+Since this traces all mutex lock and unlock events and all thread creation
+events on the traced process, the overhead of this bpf program can be very
+high if the process has many threads and mutexes. You should only run this on
+a process where the slowdown is acceptable.
+
+Note: This tool does not work for shared mutexes or recursive mutexes.
+
+For shared (read-write) mutexes, a deadlock requires a cycle in the wait
+graph where at least one of the mutexes in the cycle is acquiring exclusive
+(write) ownership.
+
+For recursive mutexes, lock() is called multiple times on the same mutex.
+However, there is no way to determine if a mutex is a recursive mutex
+after the mutex has been created. As a result, this tool will not find
+potential deadlocks that involve only one mutex.
+
+
+# ./deadlock_detector.py 181
+Tracing... Hit Ctrl-C to end.
+----------------
+Potential Deadlock Detected!
+
+Cycle in lock order graph: Mutex M0 (main::static_mutex3 0x0000000000473c60) => Mutex M1 (0x00007fff6d738400) => Mutex M2 (global_mutex1 0x0000000000473be0) => Mutex M3 (global_mutex2 0x0000000000473c20) => Mutex M0 (main::static_mutex3 0x0000000000473c60)
+
+Mutex M1 (0x00007fff6d738400) acquired here while holding Mutex M0 (main::static_mutex3 0x0000000000473c60) in Thread 357250 (lockinversion):
+@ 00000000004024d0 pthread_mutex_lock
+@ 0000000000406dd0 std::mutex::lock()
+@ 00000000004070d2 std::lock_guard<std::mutex>::lock_guard(std::mutex&)
+@ 0000000000402e38 main::{lambda()#3}::operator()() const
+@ 0000000000406ba8 void std::_Bind_simple<main::{lambda()#3} ()>::_M_invoke<>(std::_Index_tuple<>)
+@ 0000000000406951 std::_Bind_simple<main::{lambda()#3} ()>::operator()()
+@ 000000000040673a std::thread::_Impl<std::_Bind_simple<main::{lambda()#3} ()> >::_M_run()
+@ 00007fd4496564e1 execute_native_thread_routine
+@ 00007fd449dd57f1 start_thread
+@ 00007fd44909746d __clone
+
+Mutex M0 (main::static_mutex3 0x0000000000473c60) previously acquired by the same Thread 357250 (lockinversion) here:
+@ 00000000004024d0 pthread_mutex_lock
+@ 0000000000406dd0 std::mutex::lock()
+@ 00000000004070d2 std::lock_guard<std::mutex>::lock_guard(std::mutex&)
+@ 0000000000402e22 main::{lambda()#3}::operator()() const
+@ 0000000000406ba8 void std::_Bind_simple<main::{lambda()#3} ()>::_M_invoke<>(std::_Index_tuple<>)
+@ 0000000000406951 std::_Bind_simple<main::{lambda()#3} ()>::operator()()
+@ 000000000040673a std::thread::_Impl<std::_Bind_simple<main::{lambda()#3} ()> >::_M_run()
+@ 00007fd4496564e1 execute_native_thread_routine
+@ 00007fd449dd57f1 start_thread
+@ 00007fd44909746d __clone
+
+Mutex M2 (global_mutex1 0x0000000000473be0) acquired here while holding Mutex M1 (0x00007fff6d738400) in Thread 357251 (lockinversion):
+@ 00000000004024d0 pthread_mutex_lock
+@ 0000000000406dd0 std::mutex::lock()
+@ 00000000004070d2 std::lock_guard<std::mutex>::lock_guard(std::mutex&)
+@ 0000000000402ea8 main::{lambda()#4}::operator()() const
+@ 0000000000406b46 void std::_Bind_simple<main::{lambda()#4} ()>::_M_invoke<>(std::_Index_tuple<>)
+@ 000000000040692d std::_Bind_simple<main::{lambda()#4} ()>::operator()()
+@ 000000000040671c std::thread::_Impl<std::_Bind_simple<main::{lambda()#4} ()> >::_M_run()
+@ 00007fd4496564e1 execute_native_thread_routine
+@ 00007fd449dd57f1 start_thread
+@ 00007fd44909746d __clone
+
+Mutex M1 (0x00007fff6d738400) previously acquired by the same Thread 357251 (lockinversion) here:
+@ 00000000004024d0 pthread_mutex_lock
+@ 0000000000406dd0 std::mutex::lock()
+@ 00000000004070d2 std::lock_guard<std::mutex>::lock_guard(std::mutex&)
+@ 0000000000402e97 main::{lambda()#4}::operator()() const
+@ 0000000000406b46 void std::_Bind_simple<main::{lambda()#4} ()>::_M_invoke<>(std::_Index_tuple<>)
+@ 000000000040692d std::_Bind_simple<main::{lambda()#4} ()>::operator()()
+@ 000000000040671c std::thread::_Impl<std::_Bind_simple<main::{lambda()#4} ()> >::_M_run()
+@ 00007fd4496564e1 execute_native_thread_routine
+@ 00007fd449dd57f1 start_thread
+@ 00007fd44909746d __clone
+
+Mutex M3 (global_mutex2 0x0000000000473c20) acquired here while holding Mutex M2 (global_mutex1 0x0000000000473be0) in Thread 357247 (lockinversion):
+@ 00000000004024d0 pthread_mutex_lock
+@ 0000000000406dd0 std::mutex::lock()
+@ 00000000004070d2 std::lock_guard<std::mutex>::lock_guard(std::mutex&)
+@ 0000000000402d5f main::{lambda()#1}::operator()() const
+@ 0000000000406c6c void std::_Bind_simple<main::{lambda()#1} ()>::_M_invoke<>(std::_Index_tuple<>)
+@ 0000000000406999 std::_Bind_simple<main::{lambda()#1} ()>::operator()()
+@ 0000000000406776 std::thread::_Impl<std::_Bind_simple<main::{lambda()#1} ()> >::_M_run()
+@ 00007fd4496564e1 execute_native_thread_routine
+@ 00007fd449dd57f1 start_thread
+@ 00007fd44909746d __clone
+
+Mutex M2 (global_mutex1 0x0000000000473be0) previously acquired by the same Thread 357247 (lockinversion) here:
+@ 00000000004024d0 pthread_mutex_lock
+@ 0000000000406dd0 std::mutex::lock()
+@ 00000000004070d2 std::lock_guard<std::mutex>::lock_guard(std::mutex&)
+@ 0000000000402d4e main::{lambda()#1}::operator()() const
+@ 0000000000406c6c void std::_Bind_simple<main::{lambda()#1} ()>::_M_invoke<>(std::_Index_tuple<>)
+@ 0000000000406999 std::_Bind_simple<main::{lambda()#1} ()>::operator()()
+@ 0000000000406776 std::thread::_Impl<std::_Bind_simple<main::{lambda()#1} ()> >::_M_run()
+@ 00007fd4496564e1 execute_native_thread_routine
+@ 00007fd449dd57f1 start_thread
+@ 00007fd44909746d __clone
+
+Mutex M0 (main::static_mutex3 0x0000000000473c60) acquired here while holding Mutex M3 (global_mutex2 0x0000000000473c20) in Thread 357248 (lockinversion):
+@ 00000000004024d0 pthread_mutex_lock
+@ 0000000000406dd0 std::mutex::lock()
+@ 00000000004070d2 std::lock_guard<std::mutex>::lock_guard(std::mutex&)
+@ 0000000000402dc9 main::{lambda()#2}::operator()() const
+@ 0000000000406c0a void std::_Bind_simple<main::{lambda()#2} ()>::_M_invoke<>(std::_Index_tuple<>)
+@ 0000000000406975 std::_Bind_simple<main::{lambda()#2} ()>::operator()()
+@ 0000000000406758 std::thread::_Impl<std::_Bind_simple<main::{lambda()#2} ()> >::_M_run()
+@ 00007fd4496564e1 execute_native_thread_routine
+@ 00007fd449dd57f1 start_thread
+@ 00007fd44909746d __clone
+
+Mutex M3 (global_mutex2 0x0000000000473c20) previously acquired by the same Thread 357248 (lockinversion) here:
+@ 00000000004024d0 pthread_mutex_lock
+@ 0000000000406dd0 std::mutex::lock()
+@ 00000000004070d2 std::lock_guard<std::mutex>::lock_guard(std::mutex&)
+@ 0000000000402db8 main::{lambda()#2}::operator()() const
+@ 0000000000406c0a void std::_Bind_simple<main::{lambda()#2} ()>::_M_invoke<>(std::_Index_tuple<>)
+@ 0000000000406975 std::_Bind_simple<main::{lambda()#2} ()>::operator()()
+@ 0000000000406758 std::thread::_Impl<std::_Bind_simple<main::{lambda()#2} ()> >::_M_run()
+@ 00007fd4496564e1 execute_native_thread_routine
+@ 00007fd449dd57f1 start_thread
+@ 00007fd44909746d __clone
+
+Thread 357248 created by Thread 350692 (lockinversion) here:
+@ 00007fd449097431 __clone
+@ 00007fd449dd5ef5 pthread_create
+@ 00007fd449658440 std::thread::_M_start_thread(std::shared_ptr<std::thread::_Impl_base>)
+@ 00000000004033ac std::thread::thread<main::{lambda()#2}>(main::{lambda()#2}&&)
+@ 000000000040308f main
+@ 00007fd448faa0f6 __libc_start_main
+@ 0000000000402ad8 [unknown]
+
+Thread 357250 created by Thread 350692 (lockinversion) here:
+@ 00007fd449097431 __clone
+@ 00007fd449dd5ef5 pthread_create
+@ 00007fd449658440 std::thread::_M_start_thread(std::shared_ptr<std::thread::_Impl_base>)
+@ 00000000004034b2 std::thread::thread<main::{lambda()#3}>(main::{lambda()#3}&&)
+@ 00000000004030b9 main
+@ 00007fd448faa0f6 __libc_start_main
+@ 0000000000402ad8 [unknown]
+
+Thread 357251 created by Thread 350692 (lockinversion) here:
+@ 00007fd449097431 __clone
+@ 00007fd449dd5ef5 pthread_create
+@ 00007fd449658440 std::thread::_M_start_thread(std::shared_ptr<std::thread::_Impl_base>)
+@ 00000000004035b8 std::thread::thread<main::{lambda()#4}>(main::{lambda()#4}&&)
+@ 00000000004030e6 main
+@ 00007fd448faa0f6 __libc_start_main
+@ 0000000000402ad8 [unknown]
+
+Thread 357247 created by Thread 350692 (lockinversion) here:
+@ 00007fd449097431 __clone
+@ 00007fd449dd5ef5 pthread_create
+@ 00007fd449658440 std::thread::_M_start_thread(std::shared_ptr<std::thread::_Impl_base>)
+@ 00000000004032a6 std::thread::thread<main::{lambda()#1}>(main::{lambda()#1}&&)
+@ 0000000000403070 main
+@ 00007fd448faa0f6 __libc_start_main
+@ 0000000000402ad8 [unknown]
+
+This is output from a process that has a potential deadlock involving 4 mutexes
+and 4 threads:
+
+- Thread 357250 acquired M1 while holding M0 (edge M0 -> M1)
+- Thread 357251 acquired M2 while holding M1 (edge M1 -> M2)
+- Thread 357247 acquired M3 while holding M2 (edge M2 -> M3)
+- Thread 357248 acquired M0 while holding M3 (edge M3 -> M0)
+
+This is the C++ program that generated the output above:
+
+```c++
+#include <chrono>
+#include <iostream>
+#include <mutex>
+#include <thread>
+
+std::mutex global_mutex1;
+std::mutex global_mutex2;
+
+int main(void) {
+  static std::mutex static_mutex3;
+  std::mutex local_mutex4;
+
+  std::cout << "sleeping for a bit to allow trace to attach..." << std::endl;
+  std::this_thread::sleep_for(std::chrono::seconds(10));
+  std::cout << "starting program..." << std::endl;
+
+  auto t1 = std::thread([] {
+    std::lock_guard<std::mutex> g1(global_mutex1);
+    std::lock_guard<std::mutex> g2(global_mutex2);
+  });
+  t1.join();
+
+  auto t2 = std::thread([] {
+    std::lock_guard<std::mutex> g2(global_mutex2);
+    std::lock_guard<std::mutex> g3(static_mutex3);
+  });
+  t2.join();
+
+  auto t3 = std::thread([&local_mutex4] {
+    std::lock_guard<std::mutex> g3(static_mutex3);
+    std::lock_guard<std::mutex> g4(local_mutex4);
+  });
+  t3.join();
+
+  auto t4 = std::thread([&local_mutex4] {
+    std::lock_guard<std::mutex> g4(local_mutex4);
+    std::lock_guard<std::mutex> g1(global_mutex1);
+  });
+  t4.join();
+
+  std::cout << "sleeping to allow trace to collect data..." << std::endl;
+  std::this_thread::sleep_for(std::chrono::seconds(5));
+  std::cout << "done!" << std::endl;
+}
+```
+
+Note that an actual deadlock did not occur, although this mutex lock ordering
+creates the possibility of a deadlock, and this is a hint to the programmer to
+reconsider the lock ordering. If the mutexes are global or static and debug
+symbols are enabled, the output will contain the mutex symbol name. The output
+uses a similar format as ThreadSanitizer
+(https://github.com/google/sanitizers/wiki/ThreadSanitizerDeadlockDetector).
+
+
+# ./deadlock_detector.py 181 --binary /usr/local/bin/lockinversion
+
+Tracing... Hit Ctrl-C to end.
+^C
+
+If the traced process is instantiated from a statically-linked executable,
+this argument is optional, and the program will determine the path of the
+executable from the pid. However, on older kernels without this patch
+("uprobe: Find last occurrence of ':' when parsing uprobe PATH:OFFSET",
+https://lkml.org/lkml/2017/1/13/585), binaries that contain `:` in the path
+cannot be attached with uprobes. As a workaround, we can create a symlink
+to the binary, and provide the symlink name instead to the `--binary` option.
+
+
+# ./deadlock_detector.py 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0
+
+Tracing... Hit Ctrl-C to end.
+^C
+
+If the traced process is instantiated from a dynamically-linked executable,
+this argument is required and needs to be the path to the pthread shared
+library used by the executable.
+
+
+# ./deadlock_detector.py 181 --dump-graph graph.json --verbose
+
+Tracing... Hit Ctrl-C to end.
+Mutexes: 0, Edges: 0
+Mutexes: 532, Edges: 411
+Mutexes: 735, Edges: 675
+Mutexes: 1118, Edges: 1278
+Mutexes: 1666, Edges: 2185
+Mutexes: 2056, Edges: 2694
+Mutexes: 2245, Edges: 2906
+Mutexes: 2656, Edges: 3479
+Mutexes: 2813, Edges: 3785
+^C
+
+If the program does not find a deadlock, it will keep running until you hit
+Ctrl-C. If you pass the `--verbose` flag, the program will also dump statistics
+about the number of mutexes and edges in the mutex wait graph. If you want to
+serialize the graph to analyze it later, you can pass the `--dump-graph FILE`
+flag, and the program will serialize the graph in json.
+
+
+# ./deadlock_detector.py 181 --lock-symbols custom_mutex1_lock,custom_mutex2_lock --unlock-symbols custom_mutex1_unlock,custom_mutex2_unlock --verbose
+
+Tracing... Hit Ctrl-C to end.
+Mutexes: 0, Edges: 0
+Mutexes: 532, Edges: 411
+Mutexes: 735, Edges: 675
+Mutexes: 1118, Edges: 1278
+Mutexes: 1666, Edges: 2185
+Mutexes: 2056, Edges: 2694
+Mutexes: 2245, Edges: 2906
+Mutexes: 2656, Edges: 3479
+Mutexes: 2813, Edges: 3785
+^C
+
+If your program is using custom mutexes and not pthread mutexes, you can use
+the `--lock-symbols` and `--unlock-symbols` flags to specify different mutex
+symbols to trace. The flags take a comma-separated string of symbol names.
+Note that if the symbols are inlined in the binary, then this program can result
+in false positives.
+
+
+USAGE message:
+
+# ./deadlock_detector.py -h
+
+usage: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH]
+                            [--verbose] [--lock-symbols LOCK_SYMBOLS]
+                            [--unlock-symbols UNLOCK_SYMBOLS]
+                            pid
+
+Detect potential deadlocks (lock inversions) in a running binary.
+Must be run as root.
+
+positional arguments:
+  pid                   Pid to trace
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --binary BINARY       If set, trace the mutexes from the binary at this
+                        path. For statically-linked binaries, this argument is
+                        not required. For dynamically-linked binaries, this
+                        argument is required and should be the path of the
+                        pthread library the binary is using. Example:
+                        /lib/x86_64-linux-gnu/libpthread.so.0
+  --dump-graph DUMP_GRAPH
+                        If set, this will dump the mutex graph to the
+                        specified file.
+  --verbose             Print statistics about the mutex wait graph.
+  --lock-symbols LOCK_SYMBOLS
+                        Comma-separated list of lock symbols to trace. Default
+                        is pthread_mutex_lock. These symbols cannot be inlined
+                        in the binary.
+  --unlock-symbols UNLOCK_SYMBOLS
+                        Comma-separated list of unlock symbols to trace.
+                        Default is pthread_mutex_unlock. These symbols cannot
+                        be inlined in the binary.
+
+Examples:
+    deadlock_detector 181        # Analyze PID 181
+
+    deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0
+                                 # Analyze PID 181 and locks from this binary.
+                                 # If tracing a process that is running from
+                                 # a dynamically-linked binary, this argument
+                                 # is required and should be the path to the
+                                 # pthread library.
+
+    deadlock_detector 181 --verbose
+                                 # Analyze PID 181 and print statistics about
+                                 # the mutex wait graph.
+
+    deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \
+        --unlock-symbols my_mutex_unlock1,my_mutex_unlock2
+                                 # Analyze PID 181 and trace custom mutex
+                                 # symbols instead of pthread mutexes.
+
+    deadlock_detector 181 --dump-graph graph.json
+                                 # Analyze PID 181 and dump the mutex wait
+                                 # graph to graph.json.
diff --git a/tools/execsnoop.py b/tools/execsnoop.py
new file mode 100755
index 0000000..6fdde97
--- /dev/null
+++ b/tools/execsnoop.py
@@ -0,0 +1,249 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# execsnoop Trace new processes via exec() syscalls.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: execsnoop [-h] [-t] [-x] [-n NAME]
+#
+# This currently will print up to a maximum of 19 arguments, plus the process
+# name, so 20 fields in total (MAXARG).
+#
+# This won't catch all new processes: an application may fork() but not exec().
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 07-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from bcc.utils import ArgString, printb
+import bcc.utils as utils
+import argparse
+import ctypes as ct
+import re
+import time
+from collections import defaultdict
+
+# arguments
+examples = """examples:
+    ./execsnoop           # trace all exec() syscalls
+    ./execsnoop -x        # include failed exec()s
+    ./execsnoop -t        # include timestamps
+    ./execsnoop -q        # add "quotemarks" around arguments
+    ./execsnoop -n main   # only print command lines containing "main"
+    ./execsnoop -l tpkg   # only print command where arguments contains "tpkg"
+"""
+parser = argparse.ArgumentParser(
+    description="Trace exec() syscalls",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-x", "--fails", action="store_true",
+    help="include failed exec()s")
+parser.add_argument("-q", "--quote", action="store_true",
+    help="Add quotemarks (\") around arguments."
+    )
+parser.add_argument("-n", "--name",
+    type=ArgString,
+    help="only print commands matching this name (regex), any arg")
+parser.add_argument("-l", "--line",
+    type=ArgString,
+    help="only print commands where arg contains this line (regex)")
+parser.add_argument("--max-args", default="20",
+    help="maximum number of arguments parsed and displayed, defaults to 20")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+
+#define ARGSIZE  128
+
+enum event_type {
+    EVENT_ARG,
+    EVENT_RET,
+};
+
+struct data_t {
+    u32 pid;  // PID as in the userspace term (i.e. task->tgid in kernel)
+    u32 ppid; // Parent PID as in the userspace term (i.e task->real_parent->tgid in kernel)
+    char comm[TASK_COMM_LEN];
+    enum event_type type;
+    char argv[ARGSIZE];
+    int retval;
+};
+
+BPF_PERF_OUTPUT(events);
+
+static int __submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data)
+{
+    bpf_probe_read(data->argv, sizeof(data->argv), ptr);
+    events.perf_submit(ctx, data, sizeof(struct data_t));
+    return 1;
+}
+
+static int submit_arg(struct pt_regs *ctx, void *ptr, struct data_t *data)
+{
+    const char *argp = NULL;
+    bpf_probe_read(&argp, sizeof(argp), ptr);
+    if (argp) {
+        return __submit_arg(ctx, (void *)(argp), data);
+    }
+    return 0;
+}
+
+int syscall__execve(struct pt_regs *ctx,
+    const char __user *filename,
+    const char __user *const __user *__argv,
+    const char __user *const __user *__envp)
+{
+    // create data here and pass to submit_arg to save stack space (#555)
+    struct data_t data = {};
+    struct task_struct *task;
+
+    data.pid = bpf_get_current_pid_tgid() >> 32;
+
+    task = (struct task_struct *)bpf_get_current_task();
+    // Some kernels, like Ubuntu 4.13.0-generic, return 0
+    // as the real_parent->tgid.
+    // We use the get_ppid function as a fallback in those cases. (#1883)
+    data.ppid = task->real_parent->tgid;
+
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    data.type = EVENT_ARG;
+
+    __submit_arg(ctx, (void *)filename, &data);
+
+    // skip first arg, as we submitted filename
+    #pragma unroll
+    for (int i = 1; i < MAXARG; i++) {
+        if (submit_arg(ctx, (void *)&__argv[i], &data) == 0)
+             goto out;
+    }
+
+    // handle truncated argument list
+    char ellipsis[] = "...";
+    __submit_arg(ctx, (void *)ellipsis, &data);
+out:
+    return 0;
+}
+
+int do_ret_sys_execve(struct pt_regs *ctx)
+{
+    struct data_t data = {};
+    struct task_struct *task;
+
+    data.pid = bpf_get_current_pid_tgid() >> 32;
+
+    task = (struct task_struct *)bpf_get_current_task();
+    // Some kernels, like Ubuntu 4.13.0-generic, return 0
+    // as the real_parent->tgid.
+    // We use the get_ppid function as a fallback in those cases. (#1883)
+    data.ppid = task->real_parent->tgid;
+
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    data.type = EVENT_RET;
+    data.retval = PT_REGS_RC(ctx);
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+"""
+
+bpf_text = bpf_text.replace("MAXARG", args.max_args)
+if args.ebpf:
+    print(bpf_text)
+    exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+execve_fnname = b.get_syscall_fnname("execve")
+b.attach_kprobe(event=execve_fnname, fn_name="syscall__execve")
+b.attach_kretprobe(event=execve_fnname, fn_name="do_ret_sys_execve")
+
+# header
+if args.timestamp:
+    print("%-8s" % ("TIME(s)"), end="")
+print("%-16s %-6s %-6s %3s %s" % ("PCOMM", "PID", "PPID", "RET", "ARGS"))
+
+TASK_COMM_LEN = 16      # linux/sched.h
+ARGSIZE = 128           # should match #define in C above
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_uint),
+        ("ppid", ct.c_uint),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+        ("type", ct.c_int),
+        ("argv", ct.c_char * ARGSIZE),
+        ("retval", ct.c_int),
+    ]
+
+class EventType(object):
+    EVENT_ARG = 0
+    EVENT_RET = 1
+
+start_ts = time.time()
+argv = defaultdict(list)
+
+# This is best-effort PPID matching. Short-lived processes may exit
+# before we get a chance to read the PPID.
+# This is a fallback for when fetching the PPID from task->real_parent->tgid
+# returns 0, which happens in some kernel versions.
+def get_ppid(pid):
+    try:
+        with open("/proc/%d/status" % pid) as status:
+            for line in status:
+                if line.startswith("PPid:"):
+                    return int(line.split()[1])
+    except IOError:
+        pass
+    return 0
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    skip = False
+
+    if event.type == EventType.EVENT_ARG:
+        argv[event.pid].append(event.argv)
+    elif event.type == EventType.EVENT_RET:
+        if event.retval != 0 and not args.fails:
+            skip = True
+        if args.name and not re.search(bytes(args.name), event.comm):
+            skip = True
+        if args.line and not re.search(bytes(args.line),
+                                       b' '.join(argv[event.pid])):
+            skip = True
+        if args.quote:
+            argv[event.pid] = [
+                "\"" + arg.replace("\"", "\\\"") + "\""
+                for arg in argv[event.pid]
+            ]
+
+        if not skip:
+            if args.timestamp:
+                print("%-8.3f" % (time.time() - start_ts), end="")
+            ppid = event.ppid if event.ppid > 0 else get_ppid(event.pid)
+            ppid = b"%d" % ppid if ppid > 0 else b"?"
+            argv_text = b' '.join(argv[event.pid]).replace(b'\n', b'\\n')
+            printb(b"%-16s %-6d %-6s %3d %s" % (event.comm, event.pid,
+                   ppid, event.retval, argv_text))
+        try:
+            del(argv[event.pid])
+        except Exception:
+            pass
+
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/execsnoop_example.txt b/tools/execsnoop_example.txt
new file mode 100644
index 0000000..ad5f65b
--- /dev/null
+++ b/tools/execsnoop_example.txt
@@ -0,0 +1,102 @@
+Demonstrations of execsnoop, the Linux eBPF/bcc version.
+
+
+execsnoop traces new processes. For example, tracing the commands invoked when
+running "man ls":
+
+# ./execsnoop
+PCOMM            PID    RET ARGS
+bash             15887    0 /usr/bin/man ls
+preconv          15894    0 /usr/bin/preconv -e UTF-8
+man              15896    0 /usr/bin/tbl
+man              15897    0 /usr/bin/nroff -mandoc -rLL=169n -rLT=169n -Tutf8
+man              15898    0 /usr/bin/pager -s
+nroff            15900    0 /usr/bin/locale charmap
+nroff            15901    0 /usr/bin/groff -mtty-char -Tutf8 -mandoc -rLL=169n -rLT=169n
+groff            15902    0 /usr/bin/troff -mtty-char -mandoc -rLL=169n -rLT=169n -Tutf8
+groff            15903    0 /usr/bin/grotty
+
+The output shows the parent process/command name (PCOMM), the PID, the return
+value of the exec() (RET), and the filename with arguments (ARGS).
+
+This works by tracing the execve() system call (the commonly used exec()
+variant), and shows details of the arguments and return value. This catches new
+processes that follow the fork->exec sequence, as well as processes that
+re-exec() themselves. Some applications fork() but do not exec(), eg, for
+worker processes, which won't be included in the execsnoop output.
+
+
+The -x option can be used to include failed exec()s. For example:
+
+# ./execsnoop -x
+PCOMM            PID    RET ARGS
+supervise        9660     0 ./run
+supervise        9661     0 ./run
+mkdir            9662     0 /bin/mkdir -p ./main
+run              9663     0 ./run
+chown            9664     0 /bin/chown nobody:nobody ./main
+run              9665     0 /bin/mkdir -p ./main
+supervise        9667     0 ./run
+run              9660    -2 /usr/local/bin/setuidgid nobody /command/multilog t ./main
+chown            9668     0 /bin/chown nobody:nobody ./main
+run              9666     0 /bin/chmod 0777 main
+run              9663    -2 /usr/local/bin/setuidgid nobody /command/multilog t ./main
+run              9669     0 /bin/mkdir -p ./main
+run              9661    -2 /usr/local/bin/setuidgid nobody /command/multilog t ./main
+supervise        9670     0 ./run
+[...]
+
+This example shows various regular system daemon activity, including some
+failures (trying to execute a /usr/local/bin/setuidgid, which I just noticed
+doesn't exist).
+
+
+A -t option can be used to include a timestamp column, and a -n option to match
+on a name. Regular expressions are allowed.
+For example, matching commands containing "mount":
+
+# ./execsnoop -tn mount
+TIME(s) PCOMM            PID    RET ARGS
+2.849   mount            18049    0 /bin/mount -p
+
+The -l option can be used to only show commands where one of the arguments
+matches the specified line. The limitation is that we only look at the first 20
+arguments of the command. For example, matching all commands where one of the
+arguments is "testpkg":
+
+# ./execsnoop.py -l testpkg
+PCOMM            PID    PPID   RET ARGS
+service          3344535 4146419   0 /usr/sbin/service testpkg status
+systemctl        3344535 4146419   0 /bin/systemctl status testpkg.service
+yum              3344856 4146419   0 /usr/local/bin/yum remove testpkg
+python           3344856 4146419   0 /usr/local/bin/python /usr/local/bin/yum remove testpkg
+yum              3344856 4146419   0 /usr/bin/yum remove testpkg
+yum              3345086 4146419   0 /usr/local/bin/yum install testpkg
+python           3345086 4146419   0 /usr/local/bin/python /usr/local/bin/yum install testpkg
+yum              3345086 4146419   0 /usr/bin/yum install testpkg
+rpm              3345452 4146419   0 /bin/rpm -qa testpkg
+
+USAGE message:
+
+# ./execsnoop -h
+usage: execsnoop [-h] [-t] [-x] [-n NAME] [-l LINE] [--max-args MAX_ARGS]
+
+Trace exec() syscalls
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -t, --timestamp       include timestamp on output
+  -x, --fails           include failed exec()s
+  -n NAME, --name NAME  only print commands matching this name (regex), any
+                        arg
+  -l LINE, --line LINE  only print commands where arg contains this line
+                        (regex)
+  --max-args MAX_ARGS   maximum number of arguments parsed and displayed,
+                        defaults to 20
+
+examples:
+    ./execsnoop           # trace all exec() syscalls
+    ./execsnoop -x        # include failed exec()s
+    ./execsnoop -t        # include timestamps
+    ./execsnoop -n main   # only print command lines containing "main"
+    ./execsnoop -l tpkg   # only print command where arguments contains "tpkg"
diff --git a/tools/ext4dist.py b/tools/ext4dist.py
new file mode 100755
index 0000000..bc797fb
--- /dev/null
+++ b/tools/ext4dist.py
@@ -0,0 +1,224 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ext4dist  Summarize ext4 operation latency.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# symbols
+kallsyms = "/proc/kallsyms"
+
+# arguments
+examples = """examples:
+    ./ext4dist            # show operation latency as a histogram
+    ./ext4dist -p 181     # trace PID 181 only
+    ./ext4dist 1 10       # print 1 second summaries, 10 times
+    ./ext4dist -m 5       # 5s summaries, milliseconds
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize ext4 operation latency",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--notimestamp", action="store_true",
+    help="don't include timestamp on interval output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="output in milliseconds")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?",
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+pid = args.pid
+countdown = int(args.count)
+if args.milliseconds:
+    factor = 1000000
+    label = "msecs"
+else:
+    factor = 1000
+    label = "usecs"
+if args.interval and int(args.interval) == 0:
+    print("ERROR: interval 0. Exiting.")
+    exit()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+#define OP_NAME_LEN 8
+typedef struct dist_key {
+    char op[OP_NAME_LEN];
+    u64 slot;
+} dist_key_t;
+BPF_HASH(start, u32);
+BPF_HISTOGRAM(dist, dist_key_t);
+
+// time operation
+int trace_entry(struct pt_regs *ctx)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    if (FILTER_PID)
+        return 0;
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of its
+// own function, for reads. So we need to trace that and then filter on ext4,
+// which I do by checking file->f_op.
+int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    if (FILTER_PID)
+        return 0;
+
+    // ext4 filter on file->f_op == ext4_file_operations
+    struct file *fp = iocb->ki_filp;
+    if ((u64)fp->f_op != EXT4_FILE_OPERATIONS)
+        return 0;
+
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+static int trace_return(struct pt_regs *ctx, const char *op)
+{
+    u64 *tsp;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed start or filtered
+    }
+    u64 delta = bpf_ktime_get_ns() - *tsp;
+    start.delete(&pid);
+
+    // Skip entries with backwards time: temp workaround for #728
+    if ((s64) delta < 0)
+        return 0;
+
+    delta /= FACTOR;
+
+    // store as histogram
+    dist_key_t key = {.slot = bpf_log2l(delta)};
+    __builtin_memcpy(&key.op, op, sizeof(key.op));
+    dist.increment(key);
+
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    char *op = "read";
+    return trace_return(ctx, op);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    char *op = "write";
+    return trace_return(ctx, op);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    char *op = "open";
+    return trace_return(ctx, op);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+    char *op = "fsync";
+    return trace_return(ctx, op);
+}
+"""
+
+# code replacements
+with open(kallsyms) as syms:
+    ops = ''
+    for line in syms:
+        (addr, size, name) = line.rstrip().split(" ", 2)
+        name = name.split("\t")[0]
+        if name == "ext4_file_operations":
+            ops = "0x" + addr
+            break
+    if ops == '':
+        print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.")
+        print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.")
+        exit()
+    bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops)
+bpf_text = bpf_text.replace('FACTOR', str(factor))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+# Common file functions. See earlier comment about generic_file_read_iter().
+# Comment by Joe Yin
+# Since Linux 4.10, the .read_iter member of ext4_file_operations has
+# changed to ext4_file_read_iter.
+# So we use get_kprobe_functions(b'ext4_file_read_iter') to first try to attach to
+# ext4_file_read_iter; if that fails, attach to generic_file_read_iter, which was used pre-4.10.
+
+if BPF.get_kprobe_functions(b'ext4_file_read_iter'):
+	b.attach_kprobe(event="ext4_file_read_iter", fn_name="trace_entry")
+else:
+	b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry")
+b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_entry")
+b.attach_kprobe(event="ext4_file_open", fn_name="trace_entry")
+b.attach_kprobe(event="ext4_sync_file", fn_name="trace_entry")
+b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return")
+b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return")
+b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return")
+
+print("Tracing ext4 operation latency... Hit Ctrl-C to end.")
+
+# output
+exiting = 0
+dist = b.get_table("dist")
+while (1):
+    try:
+        if args.interval:
+            sleep(int(args.interval))
+        else:
+            sleep(99999999)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.interval and (not args.notimestamp):
+        print(strftime("%H:%M:%S:"))
+
+    dist.print_log2_hist(label, "operation", section_print_fn=bytes.decode)
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/ext4dist_example.txt b/tools/ext4dist_example.txt
new file mode 100644
index 0000000..def8e8b
--- /dev/null
+++ b/tools/ext4dist_example.txt
@@ -0,0 +1,193 @@
+Demonstrations of ext4dist, the Linux eBPF/bcc version.
+
+
+ext4dist traces ext4 reads, writes, opens, and fsyncs, and summarizes their
+latency as a power-of-2 histogram. For example:
+
+# ./ext4dist 
+Tracing ext4 operation latency... Hit Ctrl-C to end.
+^C
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 1210     |****************************************|
+         2 -> 3          : 126      |****                                    |
+         4 -> 7          : 376      |************                            |
+         8 -> 15         : 86       |**                                      |
+        16 -> 31         : 9        |                                        |
+        32 -> 63         : 47       |*                                       |
+        64 -> 127        : 6        |                                        |
+       128 -> 255        : 24       |                                        |
+       256 -> 511        : 137      |****                                    |
+       512 -> 1023       : 66       |**                                      |
+      1024 -> 2047       : 13       |                                        |
+      2048 -> 4095       : 7        |                                        |
+      4096 -> 8191       : 13       |                                        |
+      8192 -> 16383      : 3        |                                        |
+
+operation = 'write'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 75       |****************************************|
+        16 -> 31         : 5        |**                                      |
+
+operation = 'open'
+     usecs               : count     distribution
+         0 -> 1          : 1278     |****************************************|
+         2 -> 3          : 40       |*                                       |
+         4 -> 7          : 4        |                                        |
+         8 -> 15         : 1        |                                        |
+        16 -> 31         : 1        |                                        |
+
+This output shows a bi-modal distribution for read latency, with a faster
+mode of less than 7 microseconds, and a slower mode of between 256 and 1023
+microseconds. The count column shows how many events fell into that latency
+range. It's likely that the faster mode was a hit from the in-memory file
+system cache, and the slower mode is a read from a storage device (disk).
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+block device I/O (disk I/O), file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from the file system than measuring this down at the
+block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+An optional interval and a count can be provided, as well as -m to show the
+distributions in milliseconds. For example:
+
+# ./ext4dist -m 1 5
+Tracing ext4 operation latency... Hit Ctrl-C to end.
+
+10:19:00:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 576      |****************************************|
+         2 -> 3          : 5        |                                        |
+         4 -> 7          : 6        |                                        |
+         8 -> 15         : 13       |                                        |
+        16 -> 31         : 17       |*                                       |
+        32 -> 63         : 5        |                                        |
+        64 -> 127        : 1        |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 20       |****************************************|
+
+operation = 'open'
+     msecs               : count     distribution
+         0 -> 1          : 346      |****************************************|
+
+10:19:01:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 584      |****************************************|
+         2 -> 3          : 10       |                                        |
+         4 -> 7          : 11       |                                        |
+         8 -> 15         : 16       |*                                       |
+        16 -> 31         : 6        |                                        |
+        32 -> 63         : 4        |                                        |
+        64 -> 127        : 2        |                                        |
+       128 -> 255        : 1        |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 20       |****************************************|
+
+operation = 'open'
+     msecs               : count     distribution
+         0 -> 1          : 336      |****************************************|
+
+10:19:02:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 678      |****************************************|
+         2 -> 3          : 7        |                                        |
+         4 -> 7          : 9        |                                        |
+         8 -> 15         : 25       |*                                       |
+        16 -> 31         : 10       |                                        |
+        32 -> 63         : 3        |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 19       |****************************************|
+         2 -> 3          : 1        |**                                      |
+
+operation = 'open'
+     msecs               : count     distribution
+         0 -> 1          : 390      |****************************************|
+
+10:19:03:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 567      |****************************************|
+         2 -> 3          : 7        |                                        |
+         4 -> 7          : 9        |                                        |
+         8 -> 15         : 20       |*                                       |
+        16 -> 31         : 15       |*                                       |
+        32 -> 63         : 5        |                                        |
+        64 -> 127        : 2        |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 20       |****************************************|
+
+operation = 'open'
+     msecs               : count     distribution
+         0 -> 1          : 417      |****************************************|
+
+10:19:04:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 762      |****************************************|
+         2 -> 3          : 9        |                                        |
+         4 -> 7          : 9        |                                        |
+         8 -> 15         : 11       |                                        |
+        16 -> 31         : 20       |*                                       |
+        32 -> 63         : 4        |                                        |
+        64 -> 127        : 1        |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 20       |****************************************|
+
+operation = 'open'
+     msecs               : count     distribution
+         0 -> 1          : 427      |****************************************|
+
+This shows a mixed read/write workload.
+
+
+USAGE message:
+
+# ./ext4dist -h
+usage: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count]
+
+Summarize ext4 operation latency
+
+positional arguments:
+  interval            output interval, in seconds
+  count               number of outputs
+
+optional arguments:
+  -h, --help          show this help message and exit
+  -T, --notimestamp   don't include timestamp on interval output
+  -m, --milliseconds  output in milliseconds
+  -p PID, --pid PID   trace this PID only
+
+examples:
+    ./ext4dist            # show operation latency as a histogram
+    ./ext4dist -p 181     # trace PID 181 only
+    ./ext4dist 1 10       # print 1 second summaries, 10 times
+    ./ext4dist -m 5       # 5s summaries, milliseconds
diff --git a/tools/ext4slower.py b/tools/ext4slower.py
new file mode 100755
index 0000000..88db831
--- /dev/null
+++ b/tools/ext4slower.py
@@ -0,0 +1,359 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ext4slower  Trace slow ext4 operations.
+#             For Linux, uses BCC, eBPF.
+#
+# USAGE: ext4slower [-h] [-j] [-p PID] [min_ms]
+#
+# This script traces common ext4 file operations: reads, writes, opens, and
+# syncs. It measures the time spent in these operations, and prints details
+# for each that exceeded a threshold.
+#
+# WARNING: This adds low-overhead instrumentation to these ext4 operations,
+# including reads and writes from the file system cache. Such reads and writes
+# can be very frequent (depending on the workload; eg, 1M/sec), at which
+# point the overhead of this tool (even if it prints no "slower" events) can
+# begin to become significant.
+#
+# By default, a minimum millisecond threshold of 10 is used.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 11-Feb-2016   Brendan Gregg   Created this.
+# 15-Oct-2016   Dina Goldshtein -p to filter by process ID.
+# 13-Jun-2018   Joe Yin modify generic_file_read_iter to ext4_file_read_iter.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+import ctypes as ct
+
+# symbols
+kallsyms = "/proc/kallsyms"
+
+# arguments
+examples = """examples:
+    ./ext4slower             # trace operations slower than 10 ms (default)
+    ./ext4slower 1           # trace operations slower than 1 ms
+    ./ext4slower -j 1        # ... 1 ms, parsable output (csv)
+    ./ext4slower 0           # trace all operations (warning: verbose)
+    ./ext4slower -p 185      # trace PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Trace common ext4 file operations slower than a threshold",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-j", "--csv", action="store_true",
+    help="just print fields: comma-separated values")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("min_ms", nargs="?", default='10',
+    help="minimum I/O duration to trace, in ms (default 10)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+min_ms = int(args.min_ms)
+pid = args.pid
+csv = args.csv
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/dcache.h>
+
+// XXX: switch these to char's when supported
+#define TRACE_READ      0
+#define TRACE_WRITE     1
+#define TRACE_OPEN      2
+#define TRACE_FSYNC     3
+
+struct val_t {
+    u64 ts;
+    u64 offset;
+    struct file *fp;
+};
+
+struct data_t {
+    // XXX: switch some to u32's when supported
+    u64 ts_us;
+    u64 type;
+    u64 size;
+    u64 offset;
+    u64 delta_us;
+    u64 pid;
+    char task[TASK_COMM_LEN];
+    char file[DNAME_INLINE_LEN];
+};
+
+BPF_HASH(entryinfo, u64, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+//
+// Store timestamp and size on entry
+//
+
+// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of its
+// own function, for reads. So we need to trace that and then filter on ext4,
+// which I do by checking file->f_op.
+// Newer Linux versions (since 4.10) use ext4_file_read_iter(). If 'CONFIG_FS_DAX'
+// is not set, ext4_file_read_iter() calls generic_file_read_iter(); otherwise it calls
+// ext4_dax_read_iter(), and tracing generic_file_read_iter() would fail.
+int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
+{
+    u64 id =  bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // ext4 filter on file->f_op == ext4_file_operations
+    struct file *fp = iocb->ki_filp;
+    if ((u64)fp->f_op != EXT4_FILE_OPERATIONS)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = fp;
+    val.offset = iocb->ki_pos;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// ext4_file_write_iter():
+int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = iocb->ki_filp;
+    val.offset = iocb->ki_pos;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// ext4_file_open():
+int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
+    struct file *file)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = file;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// ext4_sync_file():
+int trace_fsync_entry(struct pt_regs *ctx, struct file *file)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = file;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+//
+// Output
+//
+
+static int trace_return(struct pt_regs *ctx, int type)
+{
+    struct val_t *valp;
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    valp = entryinfo.lookup(&id);
+    if (valp == 0) {
+        // missed tracing issue or filtered
+        return 0;
+    }
+
+    // calculate delta
+    u64 ts = bpf_ktime_get_ns();
+    u64 delta_us = (ts - valp->ts) / 1000;
+    entryinfo.delete(&id);
+    if (FILTER_US)
+        return 0;
+
+    // populate output struct
+    u32 size = PT_REGS_RC(ctx);
+    struct data_t data = {.type = type, .size = size, .delta_us = delta_us,
+        .pid = pid};
+    data.ts_us = ts / 1000;
+    data.offset = valp->offset;
+    bpf_get_current_comm(&data.task, sizeof(data.task));
+
+    // workaround (rewriter should handle file to d_name in one step):
+    struct dentry *de = NULL;
+    struct qstr qs = {};
+    de = valp->fp->f_path.dentry;
+    qs = de->d_name;
+    if (qs.len == 0)
+        return 0;
+    bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name);
+
+    // output
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_READ);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_WRITE);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_OPEN);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_FSYNC);
+}
+
+"""
+
+# code replacements
+with open(kallsyms) as syms:
+    ops = ''
+    for line in syms:
+        (addr, size, name) = line.rstrip().split(" ", 2)
+        name = name.split("\t")[0]
+        if name == "ext4_file_operations":
+            ops = "0x" + addr
+            break
+    if ops == '':
+        print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.")
+        print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.")
+        exit()
+    bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops)
+if min_ms == 0:
+    bpf_text = bpf_text.replace('FILTER_US', '0')
+else:
+    bpf_text = bpf_text.replace('FILTER_US',
+        'delta_us <= %s' % str(min_ms * 1000))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# kernel->user event data: struct data_t
+DNAME_INLINE_LEN = 32   # linux/dcache.h
+TASK_COMM_LEN = 16      # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("type", ct.c_ulonglong),
+        ("size", ct.c_ulonglong),
+        ("offset", ct.c_ulonglong),
+        ("delta_us", ct.c_ulonglong),
+        ("pid", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN),
+        ("file", ct.c_char * DNAME_INLINE_LEN)
+    ]
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    type = 'R'
+    if event.type == 1:
+        type = 'W'
+    elif event.type == 2:
+        type = 'O'
+    elif event.type == 3:
+        type = 'S'
+
+    if (csv):
+        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
+            event.ts_us, event.task.decode('utf-8', 'replace'), event.pid,
+            type, event.size, event.offset, event.delta_us,
+            event.file.decode('utf-8', 'replace')))
+        return
+    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"),
+        event.task.decode('utf-8', 'replace'), event.pid, type, event.size,
+        event.offset / 1024, float(event.delta_us) / 1000,
+        event.file.decode('utf-8', 'replace')))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# Common file functions. See earlier comment about generic_file_read_iter().
+if BPF.get_kprobe_functions(b'ext4_file_read_iter'):
+    b.attach_kprobe(event="ext4_file_read_iter", fn_name="trace_read_entry")
+else:
+    b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry")
+b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_write_entry")
+b.attach_kprobe(event="ext4_file_open", fn_name="trace_open_entry")
+b.attach_kprobe(event="ext4_sync_file", fn_name="trace_fsync_entry")
+if BPF.get_kprobe_functions(b'ext4_file_read_iter'):
+    b.attach_kretprobe(event="ext4_file_read_iter", fn_name="trace_read_return")
+else:
+    b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return")
+b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return")
+b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return")
+
+# header
+if (csv):
+    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
+else:
+    if min_ms == 0:
+        print("Tracing ext4 operations")
+    else:
+        print("Tracing ext4 operations slower than %d ms" % min_ms)
+    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T",
+        "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
+
+# read events
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/ext4slower_example.txt b/tools/ext4slower_example.txt
new file mode 100644
index 0000000..9348e42
--- /dev/null
+++ b/tools/ext4slower_example.txt
@@ -0,0 +1,209 @@
+Demonstrations of ext4slower, the Linux eBPF/bcc version.
+
+
+ext4slower shows ext4 reads, writes, opens, and fsyncs, slower than a threshold.
+For example:
+
+# ./ext4slower
+Tracing ext4 operations slower than 10 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:35:01 cron           16464  R 1249    0          16.05 common-auth
+06:35:01 cron           16463  R 1249    0          16.04 common-auth
+06:35:01 cron           16465  R 1249    0          16.03 common-auth
+06:35:01 cron           16465  R 4096    0          10.62 login.defs
+06:35:01 cron           16464  R 4096    0          10.61 login.defs
+06:35:01 cron           16463  R 4096    0          10.63 login.defs
+06:35:01 cron           16465  R 2972    0          18.52 pam_env.conf
+06:35:01 cron           16464  R 2972    0          18.51 pam_env.conf
+06:35:01 cron           16463  R 2972    0          18.49 pam_env.conf
+06:35:01 dumpsystemstat 16473  R 128     0          12.58 date
+06:35:01 debian-sa1     16474  R 283     0          12.66 sysstat
+06:35:01 debian-sa1     16474  R 128     0          10.39 sa1
+06:35:01 dumpsystemstat 16491  R 128     0          13.22 ifconfig
+06:35:01 DumpThreads    16534  R 128     0          12.78 cut
+06:35:01 cron           16545  R 128     0          14.76 sendmail
+06:35:01 sendmail       16545  R 274     0          10.88 dynamicmaps.cf
+06:35:02 postdrop       16546  R 118     0          32.94 Universal
+06:35:02 pickup         9574   R 118     0          21.02 localtime
+[...]
+
+This shows various system tasks reading from ext4. The high latency here is
+due to disk I/O, as I had just evicted the file system cache for this example.
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+block device I/O (disk I/O), file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from the file system than measuring this down at the
+block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+The threshold can be provided as an argument. Eg, I/O slower than 1 ms:
+
+# ./ext4slower 1
+Tracing ext4 operations slower than 1 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:49:17 bash           3616   R 128     0           7.75 cksum
+06:49:17 cksum          3616   R 39552   0           1.34 [
+06:49:17 cksum          3616   R 96      0           5.36 2to3-2.7
+06:49:17 cksum          3616   R 96      0          14.94 2to3-3.4
+06:49:17 cksum          3616   R 10320   0           6.82 411toppm
+06:49:17 cksum          3616   R 65536   0           4.01 a2p
+06:49:17 cksum          3616   R 55400   0           8.77 ab
+06:49:17 cksum          3616   R 36792   0          16.34 aclocal-1.14
+06:49:17 cksum          3616   R 15008   0          19.31 acpi_listen
+06:49:17 cksum          3616   R 6123    0          17.23 add-apt-repository
+06:49:17 cksum          3616   R 6280    0          18.40 addpart
+06:49:17 cksum          3616   R 27696   0           2.16 addr2line
+06:49:17 cksum          3616   R 58080   0          10.11 ag
+06:49:17 cksum          3616   R 906     0           6.30 ec2-meta-data
+06:49:17 cksum          3616   R 6320    0          10.00 animate.im6
+06:49:17 cksum          3616   R 5680    0          18.69 anytopnm
+06:49:17 cksum          3616   R 2671    0          20.27 apport-bug
+06:49:17 cksum          3616   R 12566   0          16.72 apport-cli
+06:49:17 cksum          3616   R 1622    0           7.95 apport-unpack
+06:49:17 cksum          3616   R 10440   0           2.37 appres
+06:49:17 cksum          3616   R 48112   0           5.42 whatis
+06:49:17 cksum          3616   R 14832   0           6.24 apt
+06:49:17 cksum          3616   R 65536   0          24.74 apt-cache
+06:49:17 cksum          3616   R 27264   0           1.68 apt-cdrom
+06:49:17 cksum          3616   R 23224   0           5.31 apt-extracttemplates
+06:49:17 cksum          3616   R 65536   0           8.08 apt-ftparchive
+06:49:17 cksum          3616   R 65536   128         2.92 apt-ftparchive
+06:49:17 cksum          3616   R 65536   0           9.58 aptitude-curses
+06:49:17 cksum          3616   R 65536   128        44.25 aptitude-curses
+06:49:17 cksum          3616   R 65536   384         1.69 aptitude-curses
+[...]
+
+This time a cksum(1) command can be seen reading various files (from /usr/bin).
+
+
+A threshold of 0 will trace all operations. Warning: the output will be
+verbose, as it will include all file system cache hits.
+
+# ./ext4slower 0
+Tracing ext4 operations
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:58:05 supervise      1884   O 0       0           0.00 status.new
+06:58:05 supervise      1884   W 18      0           0.02 status.new
+06:58:05 supervise      1884   O 0       0           0.00 status.new
+06:58:05 supervise      1884   W 18      0           0.01 status.new
+06:58:05 supervise      15817  O 0       0           0.00 run
+06:58:05 supervise      15817  R 92      0           0.00 run
+06:58:05 supervise      15817  O 0       0           0.00 bash
+06:58:05 supervise      15817  R 128     0           0.00 bash
+06:58:05 supervise      15817  R 504     0           0.00 bash
+06:58:05 supervise      15817  R 28      0           0.00 bash
+06:58:05 supervise      15817  O 0       0           0.00 ld-2.19.so
+06:58:05 supervise      15817  R 64      0           0.00 ld-2.19.so
+06:58:05 supervise      15817  R 392     0           0.00 ld-2.19.so
+06:58:05 run            15817  O 0       0           0.00 ld.so.cache
+06:58:05 run            15817  O 0       0           0.00 libtinfo.so.5.9
+06:58:05 run            15817  R 832     0           0.00 libtinfo.so.5.9
+06:58:05 run            15817  O 0       0           0.00 libdl-2.19.so
+06:58:05 run            15817  R 832     0           0.00 libdl-2.19.so
+06:58:05 run            15817  O 0       0           0.00 libc-2.19.so
+06:58:05 run            15817  R 832     0           0.00 libc-2.19.so
+06:58:05 supervise      1876   O 0       0           0.00 status.new
+06:58:05 supervise      1876   W 18      0           0.01 status.new
+06:58:05 supervise      1895   O 0       0           0.00 status.new
+06:58:05 supervise      1895   W 18      0           0.02 status.new
+06:58:05 supervise      1876   O 0       0           0.00 status.new
+06:58:05 supervise      1876   W 18      0           0.01 status.new
+06:58:05 supervise      1872   O 0       0           0.00 status.new
+06:58:05 supervise      1872   W 18      0           0.02 status.new
+06:58:05 supervise      1895   O 0       0           0.00 status.new
+06:58:05 supervise      1895   W 18      0           0.01 status.new
+06:58:05 supervise      15818  R 92      0           0.00 run
+06:58:05 supervise      15818  O 0       0           0.00 bash
+06:58:05 supervise      15818  R 128     0           0.00 bash
+06:58:05 supervise      15818  R 504     0           0.00 bash
+06:58:05 supervise      15818  R 28      0           0.00 bash
+06:58:05 supervise      15818  O 0       0           0.00 ld-2.19.so
+06:58:05 supervise      15818  R 64      0           0.00 ld-2.19.so
+06:58:05 supervise      15818  R 392     0           0.00 ld-2.19.so
+06:58:05 supervise      15818  O 0       0           0.00 run
+06:58:05 supervise      1888   O 0       0           0.00 status.new
+06:58:05 supervise      1888   W 18      0           0.01 status.new
+06:58:05 supervise      1888   O 0       0           0.00 status.new
+06:58:05 supervise      1888   W 18      0           0.02 status.new
+06:58:05 supervise      15822  R 119     0           0.00 run
+06:58:05 supervise      15822  O 0       0           0.00 bash
+06:58:05 supervise      15822  R 128     0           0.00 bash
+06:58:05 supervise      15822  R 504     0           0.00 bash
+06:58:05 supervise      15822  R 28      0           0.00 bash
+06:58:05 supervise      15822  O 0       0           0.00 ld-2.19.so
+06:58:05 supervise      15822  R 64      0           0.00 ld-2.19.so
+06:58:05 supervise      15822  R 392     0           0.00 ld-2.19.so
+06:58:05 supervise      1892   O 0       0           0.00 status.new
+06:58:05 supervise      1892   W 18      0           0.02 status.new
+06:58:05 supervise      1892   O 0       0           0.00 status.new
+06:58:05 supervise      1892   W 18      0           0.02 status.new
+06:58:05 supervise      15820  O 0       0           0.00 run
+[...]
+
+The output now includes open operations ("O"), and writes ("W").
+
+
+A -j option will print just the fields (parsable output, csv):
+
+# ./ext4slower -j 1
+ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE
+127200712278,bash,17225,R,128,0,14329,cksum
+127200722986,cksum,17225,R,3274,0,8368,command-not-found
+127200735581,cksum,17225,R,65536,0,10903,libbfd-2.24-system.so
+127200738482,cksum,17225,R,65536,131072,2419,libbfd-2.24-system.so
+127200749226,cksum,17225,R,65536,655360,8995,libbfd-2.24-system.so
+127200776273,cksum,17225,R,55080,0,25297,libbind9.so.90.0.9
+127200784688,cksum,17225,R,65536,0,7873,libblas.so.3.0
+127200787551,cksum,17225,R,65536,131072,2386,libblas.so.3.0
+127200795524,cksum,17225,R,18624,0,4947,libcpupower.so.3.13.0-49
+127200802073,cksum,17225,R,65536,0,6410,libcwidget.so.3.0.0
+127200808718,cksum,17225,R,65536,131072,6181,libcwidget.so.3.0.0
+127200829518,cksum,17225,R,65536,0,14213,libdns.so.100.2.2
+127200832916,cksum,17225,R,65536,131072,2911,libdns.so.100.2.2
+127200841044,cksum,17225,R,65536,655360,6376,libdns.so.100.2.2
+127200853646,cksum,17225,R,956,0,1022,libdumbnet.la
+127200857814,cksum,17225,R,61096,0,4111,libdumbnet.so.1.0.1
+127200869655,cksum,17225,R,65536,0,11252,libgettextlib-0.18.3.so
+127200872985,cksum,17225,R,65536,131072,2882,libgettextlib-0.18.3.so
+127200883063,cksum,17225,R,65536,0,9661,libgettextsrc-0.18.3.so
+127200884767,cksum,17225,R,65536,131072,1251,libgettextsrc-0.18.3.so
+127200904830,cksum,17225,R,65536,0,19571,libgirepository-1.0.so.1.0.0
+127200906354,cksum,17225,R,65536,131072,1080,libgirepository-1.0.so.1.0.0
+127200936047,cksum,17225,R,65536,0,28674,libGraphicsMagick.a
+127200939091,cksum,17225,R,65536,131072,2576,libGraphicsMagick.a
+127200947295,cksum,17225,R,65536,655360,6463,libGraphicsMagick.a
+127200958793,cksum,17225,R,65536,1966080,7034,libGraphicsMagick.a
+[...]
+
+This may be useful for visualizing with another tool, for example, for
+producing a scatter plot of ENDTIME vs LATENCY, to look for time-based
+patterns.
+
+
+USAGE message:
+
+# ./ext4slower -h
+usage: ext4slower [-h] [-j] [-p PID] [min_ms]
+
+Trace common ext4 file operations slower than a threshold
+
+positional arguments:
+  min_ms             minimum I/O duration to trace, in ms (default 10)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -j, --csv          just print fields: comma-separated values
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./ext4slower             # trace operations slower than 10 ms (default)
+    ./ext4slower 1           # trace operations slower than 1 ms
+    ./ext4slower -j 1        # ... 1 ms, parsable output (csv)
+    ./ext4slower 0           # trace all operations (warning: verbose)
+    ./ext4slower -p 185      # trace PID 185 only
diff --git a/tools/filelife.py b/tools/filelife.py
new file mode 100755
index 0000000..410659d
--- /dev/null
+++ b/tools/filelife.py
@@ -0,0 +1,144 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# filelife    Trace the lifespan of short-lived files.
+#             For Linux, uses BCC, eBPF. Embedded C.
+#
+# This traces the creation and deletion of files, providing information
+# on who deleted the file, the file age, and the file name. The intent is to
+# provide information on short-lived files, for debugging or performance
+# analysis.
+#
+# USAGE: filelife [-h] [-p PID]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 08-Feb-2015   Brendan Gregg   Created this.
+# 17-Feb-2016   Allan McAleavy updated for BPF_PERF_OUTPUT
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./filelife           # trace all short-lived files
+    ./filelife -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace the lifespan of short-lived files",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+struct data_t {
+    u32 pid;
+    u64 delta;
+    char comm[TASK_COMM_LEN];
+    char fname[DNAME_INLINE_LEN];
+};
+
+BPF_HASH(birth, struct dentry *);
+BPF_PERF_OUTPUT(events);
+
+// trace file creation time
+int trace_create(struct pt_regs *ctx, struct inode *dir, struct dentry *dentry)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+
+    u64 ts = bpf_ktime_get_ns();
+    birth.update(&dentry, &ts);
+
+    return 0;
+};
+
+// trace file deletion and output details
+int trace_unlink(struct pt_regs *ctx, struct inode *dir, struct dentry *dentry)
+{
+    struct data_t data = {};
+    u32 pid = bpf_get_current_pid_tgid();
+
+    FILTER
+
+    u64 *tsp, delta;
+    tsp = birth.lookup(&dentry);
+    if (tsp == 0) {
+        return 0;   // missed create
+    }
+
+    delta = (bpf_ktime_get_ns() - *tsp) / 1000000;
+    birth.delete(&dentry);
+
+    struct qstr d_name = dentry->d_name;
+    if (d_name.len == 0)
+        return 0;
+
+    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
+        data.pid = pid;
+        data.delta = delta;
+        bpf_probe_read(&data.fname, sizeof(data.fname), d_name.name);
+    }
+
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+"""
+
+TASK_COMM_LEN = 16            # linux/sched.h
+DNAME_INLINE_LEN = 255        # linux/dcache.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_uint),
+        ("delta", ct.c_ulonglong),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+        ("fname", ct.c_char * DNAME_INLINE_LEN)
+    ]
+
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="vfs_create", fn_name="trace_create")
+# newer kernels (say, 4.8) may not fire vfs_create, so record (or overwrite)
+# the timestamp in security_inode_create():
+b.attach_kprobe(event="security_inode_create", fn_name="trace_create")
+b.attach_kprobe(event="vfs_unlink", fn_name="trace_unlink")
+
+# header
+print("%-8s %-6s %-16s %-7s %s" % ("TIME", "PID", "COMM", "AGE(s)", "FILE"))
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%-8s %-6d %-16s %-7.2f %s" % (strftime("%H:%M:%S"), event.pid,
+        event.comm.decode('utf-8', 'replace'), float(event.delta) / 1000,
+        event.fname.decode('utf-8', 'replace')))
+
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/filelife_example.txt b/tools/filelife_example.txt
new file mode 100644
index 0000000..c3d6795
--- /dev/null
+++ b/tools/filelife_example.txt
@@ -0,0 +1,52 @@
+Demonstrations of filelife, the Linux eBPF/bcc version.
+
+
+filelife traces short-lived files: those that have been created and then
+deleted while tracing. For example:
+
+# ./filelife 
+TIME     PID    COMM             AGE(s)  FILE
+05:57:59 8556   gcc              0.04    ccCB5EDe.s
+05:57:59 8560   rm               0.02    .entry_64.o.d
+05:57:59 8563   gcc              0.02    cc5UFHXf.s
+05:57:59 8567   rm               0.01    .thunk_64.o.d
+05:57:59 8578   rm               0.02    .syscall_64.o.d
+05:58:00 8589   rm               0.03    .common.o.d
+05:58:00 8596   rm               0.01    .8592.tmp
+05:58:00 8601   rm               0.01    .8597.tmp
+05:58:00 8606   rm               0.01    .8602.tmp
+05:58:00 8639   rm               0.02    .vma.o.d
+05:58:00 8650   rm               0.02    .vdso32-setup.o.d
+05:58:00 8656   rm               0.00    .vdso.lds.d
+05:58:00 8659   gcc              0.01    ccveeJAz.s
+05:58:00 8663   rm               0.01    .vdso-note.o.d
+05:58:00 8674   rm               0.02    .vclock_gettime.o.d
+05:58:01 8684   rm               0.01    .vgetcpu.o.d
+05:58:01 8690   collect2         0.00    ccvKMxdm.ld
+
+This has caught short-lived files that were created during a Linux kernel
+build. The PID shows the process ID that finally deleted the file, and COMM
+is its process name. The AGE(s) column shows the age of the file, in seconds,
+when it was deleted. These are all short-lived, and existed for less than
+one tenth of a second.
+
+Creating, populating, and then deleting files as part of another process can
+be an inefficient method of inter-process communication. It can cause disk I/O
+as files are closed and their file descriptors flushed, only later to be
+deleted. As such, short-lived files can be a target of performance
+optimizations.
+
+USAGE message:
+
+# ./filelife -h
+usage: filelife [-h] [-p PID]
+
+Trace the lifespan of short-lived files
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./filelife           # trace all short-lived files
+    ./filelife -p 181    # only trace PID 181
diff --git a/tools/fileslower.py b/tools/fileslower.py
new file mode 100755
index 0000000..25443a2
--- /dev/null
+++ b/tools/fileslower.py
@@ -0,0 +1,253 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# fileslower  Trace slow synchronous file reads and writes.
+#             For Linux, uses BCC, eBPF.
+#
+# USAGE: fileslower [-h] [-p PID] [-a] [min_ms]
+#
+# This script uses kernel dynamic tracing of synchronous reads and writes
+# at the VFS interface, to identify slow file reads and writes for any file
+# system.
+#
+# This works by tracing __vfs_read() and __vfs_write(), and filtering for
+# synchronous I/O (the path to new_sync_read() and new_sync_write()), and
+# for I/O with filenames. This approach provides a view of just two file
+# system request types. There are typically many others: asynchronous I/O,
+# directory operations, file handle operations, etc, that this tool does not
+# instrument.
+#
+# WARNING: This traces VFS reads and writes, which can be extremely frequent,
+# and so the overhead of this tool can become severe depending on the
+# workload.
+#
+# By default, a minimum millisecond threshold of 10 is used.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 06-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+import ctypes as ct
+import time
+
+# arguments
+examples = """examples:
+    ./fileslower             # trace sync file I/O slower than 10 ms (default)
+    ./fileslower 1           # trace sync file I/O slower than 1 ms
+    ./fileslower -p 185      # trace PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Trace slow synchronous file reads and writes",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
+    help="trace this PID only")
+parser.add_argument("-a", "--all-files", action="store_true",
+    help="include non-regular file types (sockets, FIFOs, etc)")
+parser.add_argument("min_ms", nargs="?", default='10',
+    help="minimum I/O duration to trace, in ms (default 10)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+min_ms = int(args.min_ms)
+tgid = args.tgid
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+enum trace_mode {
+    MODE_READ,
+    MODE_WRITE
+};
+
+struct val_t {
+    u32 sz;
+    u64 ts;
+    u32 name_len;
+    // de->d_name.name may point to de->d_iname so limit len accordingly
+    char name[DNAME_INLINE_LEN];
+    char comm[TASK_COMM_LEN];
+};
+
+struct data_t {
+    enum trace_mode mode;
+    u32 pid;
+    u32 sz;
+    u64 delta_us;
+    u32 name_len;
+    char name[DNAME_INLINE_LEN];
+    char comm[TASK_COMM_LEN];
+};
+
+BPF_HASH(entryinfo, pid_t, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+// store timestamp and size on entry
+static int trace_rw_entry(struct pt_regs *ctx, struct file *file,
+    char __user *buf, size_t count)
+{
+    u32 tgid = bpf_get_current_pid_tgid() >> 32;
+    if (TGID_FILTER)
+        return 0;
+
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // skip I/O lacking a filename
+    struct dentry *de = file->f_path.dentry;
+    int mode = file->f_inode->i_mode;
+    if (de->d_name.len == 0 || TYPE_FILTER)
+        return 0;
+
+    // store size and timestamp by pid
+    struct val_t val = {};
+    val.sz = count;
+    val.ts = bpf_ktime_get_ns();
+
+    struct qstr d_name = de->d_name;
+    val.name_len = d_name.len;
+    bpf_probe_read(&val.name, sizeof(val.name), d_name.name);
+    bpf_get_current_comm(&val.comm, sizeof(val.comm));
+    entryinfo.update(&pid, &val);
+
+    return 0;
+}
+
+int trace_read_entry(struct pt_regs *ctx, struct file *file,
+    char __user *buf, size_t count)
+{
+    // skip non-sync I/O; see kernel code for __vfs_read()
+    if (!(file->f_op->read_iter))
+        return 0;
+    return trace_rw_entry(ctx, file, buf, count);
+}
+
+int trace_write_entry(struct pt_regs *ctx, struct file *file,
+    char __user *buf, size_t count)
+{
+    // skip non-sync I/O; see kernel code for __vfs_write()
+    if (!(file->f_op->write_iter))
+        return 0;
+    return trace_rw_entry(ctx, file, buf, count);
+}
+
+// output
+static int trace_rw_return(struct pt_regs *ctx, int type)
+{
+    struct val_t *valp;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    valp = entryinfo.lookup(&pid);
+    if (valp == 0) {
+        // missed tracing issue or filtered
+        return 0;
+    }
+    u64 delta_us = (bpf_ktime_get_ns() - valp->ts) / 1000;
+    entryinfo.delete(&pid);
+    if (delta_us < MIN_US)
+        return 0;
+
+    struct data_t data = {};
+    data.mode = type;
+    data.pid = pid;
+    data.sz = valp->sz;
+    data.delta_us = delta_us;
+    data.name_len = valp->name_len;
+    bpf_probe_read(&data.name, sizeof(data.name), valp->name);
+    bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    return trace_rw_return(ctx, MODE_READ);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    return trace_rw_return(ctx, MODE_WRITE);
+}
+
+"""
+bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000))
+if args.tgid:
+    bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid)
+else:
+    bpf_text = bpf_text.replace('TGID_FILTER', '0')
+if args.all_files:
+    bpf_text = bpf_text.replace('TYPE_FILTER', '0')
+else:
+    bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)')
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# I'd rather trace these via new_sync_read/new_sync_write (which used to be
+# do_sync_read/do_sync_write), but those became static. So trace these from
+# the parent functions, at the cost of more overhead, instead.
+# Ultimately, we should be using [V]FS tracepoints.
+b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry")
+b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return")
+try:
+    b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
+    b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return")
+except:
+    # older kernels don't have __vfs_write so try vfs_write instead
+    b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry")
+    b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return")
+
+TASK_COMM_LEN = 16  # linux/sched.h
+DNAME_INLINE_LEN = 32  # linux/dcache.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("mode", ct.c_int),
+        ("pid", ct.c_uint),
+        ("sz", ct.c_uint),
+        ("delta_us", ct.c_ulonglong),
+        ("name_len", ct.c_uint),
+        ("name", ct.c_char * DNAME_INLINE_LEN),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+    ]
+
+mode_s = {
+    0: 'R',
+    1: 'W',
+}
+
+# header
+print("Tracing sync read/writes slower than %d ms" % min_ms)
+print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D",
+    "BYTES", "LAT(ms)", "FILENAME"))
+
+start_ts = time.time()
+
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    ms = float(event.delta_us) / 1000
+    name = event.name.decode('utf-8', 'replace')
+    if event.name_len > DNAME_INLINE_LEN:
+        name = name[:-3] + "..."
+
+    print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % (
+        time.time() - start_ts, event.comm.decode('utf-8', 'replace'),
+        event.pid, mode_s[event.mode], event.sz, ms, name))
+
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/fileslower_example.txt b/tools/fileslower_example.txt
new file mode 100644
index 0000000..0e0c7ca
--- /dev/null
+++ b/tools/fileslower_example.txt
@@ -0,0 +1,123 @@
+Demonstrations of fileslower, the Linux eBPF/bcc version.
+
+
+fileslower shows file-based synchronous reads and writes slower than a
+threshold. For example:
+
+# ./fileslower 
+Tracing sync read/writes slower than 10 ms
+TIME(s)  COMM           PID    D BYTES   LAT(ms) FILENAME
+0.000    randread.pl    4762   R 8192      12.70 data1
+8.850    randread.pl    4762   R 8192      11.26 data1
+12.852   randread.pl    4762   R 8192      10.43 data1
+
+This showed a few reads from a "randread.pl" program, each 8 Kbytes in size,
+and from a "data1" file. These all had over 10 ms latency.
+
+This "latency" is measured from when the read or write was issued at the VFS
+interface, to when it completed. This spans everything: block device I/O (disk
+I/O), file system CPU cycles, file system locks, run queue latency, etc. This
+is a better measure of the latency suffered by applications reading from the
+file system than measuring this down at the block device interface.
+
+Note that this only traces file reads and writes: other file system operations
+(eg, directory operations, open(), fflush()) are not traced.
+
+
+The threshold can be provided as an argument. Eg, I/O slower than 1 ms:
+
+# ./fileslower 1
+Tracing sync read/writes slower than 1 ms
+TIME(s)  COMM           PID    D BYTES   LAT(ms) FILENAME
+0.000    randread.pl    6925   R 8192       1.06 data1
+0.082    randread.pl    6925   R 8192       2.42 data1
+0.116    randread.pl    6925   R 8192       1.78 data1
+0.153    randread.pl    6925   R 8192       2.31 data1
+0.330    randread.pl    6925   R 8192       1.14 data1
+0.345    randread.pl    6925   R 8192       1.52 data1
+0.359    randread.pl    6925   R 8192       1.04 data1
+0.532    randread.pl    6925   R 8192       2.56 data1
+0.609    supervise      1892   W 18         3.65 status.new
+0.610    randread.pl    6925   R 8192       1.37 data1
+0.614    randread.pl    6925   R 8192       3.04 data1
+0.729    randread.pl    6925   R 8192       2.90 data1
+0.755    randread.pl    6925   R 8192       1.12 data1
+0.762    randread.pl    6925   R 8192       2.62 data1
+0.771    randread.pl    6925   R 8192       1.07 data1
+0.816    randread.pl    6925   R 8192      10.50 data1
+0.983    randread.pl    6925   R 8192       1.73 data1
+0.989    randread.pl    6925   R 8192       2.12 data1
+0.992    randread.pl    6925   R 8192       2.17 data1
+1.001    randread.pl    6925   R 8192       1.93 data1
+1.007    randread.pl    6925   R 8192       2.03 data1
+1.210    randread.pl    6925   R 8192       1.82 data1
+1.213    randread.pl    6925   R 8192       2.58 data1
+1.219    randread.pl    6925   R 8192       2.20 data1
+1.430    randread.pl    6925   R 8192       1.01 data1
+1.448    randread.pl    6925   R 8192       2.22 data1
+[...]
+
+There's now much more output (this spans only 1.4 seconds, the previous output
+spanned 12 seconds), and the lower threshold is catching more I/O.
+
+
+In the following example, the file system caches were dropped before running
+fileslower, and then in another session a "man ls" was executed. The command
+and files read from disk can be seen:
+
+# echo 3 > /proc/sys/vm/drop_caches; ./fileslower 1
+Tracing sync read/writes slower than 1 ms
+TIME(s)  COMM           PID    D BYTES   LAT(ms) FILENAME
+0.000    bash           9647   R 128        5.83 man
+0.050    man            9647   R 832       19.52 libmandb-2.6.7.1.so
+0.066    man            9647   R 832       15.79 libman-2.6.7.1.so
+0.123    man            9647   R 832       56.36 libpipeline.so.1.3.0
+0.135    man            9647   R 832        9.79 libgdbm.so.3.0.0
+0.323    man            9647   R 4096      59.52 locale.alias
+0.540    man            9648   R 8192      11.11 ls.1.gz
+0.558    man            9647   R 72         6.97 index.db
+0.563    man            9647   R 4096       5.12 index.db
+0.723    man            9658   R 128       12.06 less
+0.725    man            9656   R 128       14.52 nroff
+0.779    man            9655   R 128       68.86 tbl
+0.814    nroff          9660   R 128       14.55 locale
+0.830    pager          9658   R 4096      28.27 .lesshst
+0.866    man            9654   R 128      163.12 preconv
+0.980    nroff          9684   R 128       13.80 groff
+0.999    groff          9684   R 4096      14.29 DESC
+1.036    groff          9685   R 128        5.94 troff
+1.038    groff          9686   R 128        7.76 grotty
+1.065    troff          9685   R 4096       6.33 R
+1.082    troff          9685   R 4096      10.52 BI
+1.096    troff          9685   R 4096       8.70 troffrc
+1.176    troff          9685   R 4096      80.12 composite.tmac
+1.195    troff          9685   R 4096      19.20 fallbacks.tmac
+1.202    troff          9685   R 4096       6.79 tty.tmac
+1.221    troff          9685   R 4096       7.87 man.local
+2.977    supervise      1876   W 18         4.23 status.new
+
+This caught an individual I/O reaching 163.12 ms, for the "preconv" file. While
+the file system cache was flushed, causing these to need to be read from disk,
+the duration here may not be entirely disk I/O: it can include file system
+locks, run queue latency, etc. These can be explored using other commands.
+
+
+USAGE message:
+
+# ./fileslower -h
+usage: fileslower.py [-h] [-p PID] [-a] [min_ms]
+
+Trace slow synchronous file reads and writes
+
+positional arguments:
+  min_ms             minimum I/O duration to trace, in ms (default 10)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -p PID, --pid PID  trace this PID only
+  -a, --all-files    include non-regular file types
+
+examples:
+    ./fileslower             # trace sync file I/O slower than 10 ms (default)
+    ./fileslower 1           # trace sync file I/O slower than 1 ms
+    ./fileslower -p 185      # trace PID 185 only
diff --git a/tools/filetop.py b/tools/filetop.py
new file mode 100755
index 0000000..4c7a28a
--- /dev/null
+++ b/tools/filetop.py
@@ -0,0 +1,211 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# filetop  file reads and writes by process.
+#          For Linux, uses BCC, eBPF.
+#
+# USAGE: filetop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+#
+# This uses in-kernel eBPF maps to store per process summaries for efficiency.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 06-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+from subprocess import call
+
+# arguments
+examples = """examples:
+    ./filetop            # file I/O top, 1 second refresh
+    ./filetop -C         # don't clear the screen
+    ./filetop -p 181     # PID 181 only
+    ./filetop 5          # 5 second summaries
+    ./filetop 5 10       # 5 second summaries, 10 times only
+"""
+parser = argparse.ArgumentParser(
+    description="File reads and writes by process",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-a", "--all-files", action="store_true",
+    help="include non-regular file types (sockets, FIFOs, etc)")
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("-r", "--maxrows", default=20,
+    help="maximum rows to print, default 20")
+parser.add_argument("-s", "--sort", default="rbytes",
+    choices=["reads", "writes", "rbytes", "wbytes"],
+    help="sort column, default rbytes")
+parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?", default=1,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+interval = int(args.interval)
+countdown = int(args.count)
+maxrows = int(args.maxrows)
+clear = not int(args.noclear)
+debug = 0
+
+# linux stats
+loadavg = "/proc/loadavg"
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+// the key for the output summary
+struct info_t {
+    u32 pid;
+    u32 name_len;
+    char comm[TASK_COMM_LEN];
+    // de->d_name.name may point to de->d_iname so limit len accordingly
+    char name[DNAME_INLINE_LEN];
+    char type;
+};
+
+// the value of the output summary
+struct val_t {
+    u64 reads;
+    u64 writes;
+    u64 rbytes;
+    u64 wbytes;
+};
+
+BPF_HASH(counts, struct info_t, struct val_t);
+
+static int do_entry(struct pt_regs *ctx, struct file *file,
+    char __user *buf, size_t count, int is_read)
+{
+    u32 tgid = bpf_get_current_pid_tgid() >> 32;
+    if (TGID_FILTER)
+        return 0;
+
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // skip I/O lacking a filename
+    struct dentry *de = file->f_path.dentry;
+    int mode = file->f_inode->i_mode;
+    struct qstr d_name = de->d_name;
+    if (d_name.len == 0 || TYPE_FILTER)
+        return 0;
+
+    // store counts and sizes by pid & file
+    struct info_t info = {.pid = pid};
+    bpf_get_current_comm(&info.comm, sizeof(info.comm));
+    info.name_len = d_name.len;
+    bpf_probe_read(&info.name, sizeof(info.name), d_name.name);
+    if (S_ISREG(mode)) {
+        info.type = 'R';
+    } else if (S_ISSOCK(mode)) {
+        info.type = 'S';
+    } else {
+        info.type = 'O';
+    }
+
+    struct val_t *valp, zero = {};
+    valp = counts.lookup_or_init(&info, &zero);
+    if (is_read) {
+        valp->reads++;
+        valp->rbytes += count;
+    } else {
+        valp->writes++;
+        valp->wbytes += count;
+    }
+
+    return 0;
+}
+
+int trace_read_entry(struct pt_regs *ctx, struct file *file,
+    char __user *buf, size_t count)
+{
+    return do_entry(ctx, file, buf, count, 1);
+}
+
+int trace_write_entry(struct pt_regs *ctx, struct file *file,
+    char __user *buf, size_t count)
+{
+    return do_entry(ctx, file, buf, count, 0);
+}
+
+"""
+if args.tgid:
+    bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
+else:
+    bpf_text = bpf_text.replace('TGID_FILTER', '0')
+if args.all_files:
+    bpf_text = bpf_text.replace('TYPE_FILTER', '0')
+else:
+    bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)')
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="vfs_read", fn_name="trace_read_entry")
+b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry")
+
+DNAME_INLINE_LEN = 32  # linux/dcache.h
+
+print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
+
+# output
+exiting = 0
+while 1:
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    # header
+    if clear:
+        call("clear")
+    else:
+        print()
+    with open(loadavg) as stats:
+        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+    print("%-6s %-16s %-6s %-6s %-7s %-7s %1s %s" % ("TID", "COMM",
+        "READS", "WRITES", "R_Kb", "W_Kb", "T", "FILE"))
+
+    # by-TID output
+    counts = b.get_table("counts")
+    line = 0
+    for k, v in reversed(sorted(counts.items(),
+                                key=lambda counts:
+                                  getattr(counts[1], args.sort))):
+        name = k.name.decode('utf-8', 'replace')
+        if k.name_len > DNAME_INLINE_LEN:
+            name = name[:-3] + "..."
+
+        # print line
+        print("%-6d %-16s %-6d %-6d %-7d %-7d %1s %s" % (k.pid,
+            k.comm.decode('utf-8', 'replace'), v.reads, v.writes,
+            v.rbytes / 1024, v.wbytes / 1024,
+            k.type.decode('utf-8', 'replace'), name))
+
+        line += 1
+        if line >= maxrows:
+            break
+    counts.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        print("Detaching...")
+        exit()
diff --git a/tools/filetop_example.txt b/tools/filetop_example.txt
new file mode 100644
index 0000000..66595ad
--- /dev/null
+++ b/tools/filetop_example.txt
@@ -0,0 +1,159 @@
+Demonstrations of filetop, the Linux eBPF/bcc version.
+
+
+filetop shows reads and writes by file, with process details. For example:
+
+# ./filetop -C
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+08:00:23 loadavg: 0.91 0.33 0.23 3/286 26635
+
+PID    COMM             READS  WRITES R_Kb    W_Kb    T FILE
+26628  ld               161    186    643     152     R built-in.o
+26634  cc1              1      0      200     0       R autoconf.h
+26618  cc1              1      0      200     0       R autoconf.h
+26634  cc1              12     0      192     0       R tracepoint.h
+26584  cc1              2      0      143     0       R mm.h
+26634  cc1              2      0      143     0       R mm.h
+26631  make             34     0      136     0       R auto.conf
+26634  cc1              1      0      98      0       R fs.h
+26584  cc1              1      0      98      0       R fs.h
+26634  cc1              1      0      91      0       R sched.h
+26634  cc1              1      0      78      0       R printk.c
+26634  cc1              3      0      73      0       R mmzone.h
+26628  ld               18     0      72      0       R hibernate.o
+26628  ld               16     0      64      0       R suspend.o
+26628  ld               16     0      64      0       R snapshot.o
+26628  ld               16     0      64      0       R qos.o
+26628  ld               13     0      52      0       R main.o
+26628  ld               12     0      52      0       R swap.o
+[...]
+
+This shows various files read and written during a Linux kernel build. By
+default the output is sorted by the total read size in Kbytes (R_Kb). Sorting
+order can be changed via -s option. This is instrumenting at the VFS interface,
+so this is reads and writes that may return entirely from the file system cache
+(page cache).
+
+While not printed, the average read and write size can be calculated by
+dividing R_Kb by READS, and the same for writes.
+
+The "T" column indicates the type of the file: "R" for regular files, "S" for
+sockets, and "O" for other (including pipes). By default only regular files are
+shown; use the -a option to show all file types.
+
+This script works by tracing the vfs_read() and vfs_write() functions using
+kernel dynamic tracing, which instruments explicit read and write calls. If
+files are read or written using another means (eg, via mmap()), then they
+will not be visible using this tool.
+
+This should be useful for file system workload characterization when analyzing
+the performance of applications.
+
+Note that tracing VFS level reads and writes can be a frequent activity, and
+this tool can begin to cost measurable overhead at high I/O rates.
+
+
+A -C option will stop clearing the screen, and -r with a number will restrict
+the output to that many rows (20 by default). For example, not clearing
+the screen and showing the top 5 only:
+
+# ./filetop -Cr 5
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+08:05:11 loadavg: 0.75 0.35 0.25 3/285 822
+
+PID    COMM             READS  WRITES R_Kb    W_Kb    T FILE
+32672  cksum            5006   0      320384  0       R data1
+809    run              2      0      8       0       R nsswitch.conf
+811    run              2      0      8       0       R nsswitch.conf
+804    chown            2      0      8       0       R nsswitch.conf
+
+08:05:12 loadavg: 0.75 0.35 0.25 3/285 845
+
+PID    COMM             READS  WRITES R_Kb    W_Kb    T FILE
+32672  cksum            4986   0      319104  0       R data1
+845    chown            2      0      8       0       R nsswitch.conf
+828    run              2      0      8       0       R nsswitch.conf
+835    run              2      0      8       0       R nsswitch.conf
+830    run              2      0      8       0       R nsswitch.conf
+
+08:05:13 loadavg: 0.75 0.35 0.25 3/285 868
+
+PID    COMM             READS  WRITES R_Kb    W_Kb    T FILE
+32672  cksum            4985   0      319040  0       R data1
+857    run              2      0      8       0       R nsswitch.conf
+858    run              2      0      8       0       R nsswitch.conf
+859    run              2      0      8       0       R nsswitch.conf
+848    run              2      0      8       0       R nsswitch.conf
+[...]
+
+This output shows a cksum command reading data1.
+
+
+An optional interval and optional count can also be added to the end of the
+command line. For example, for a 1 second interval and 3 summaries in total:
+
+# ./filetop -Cr 5 -a 1 3
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+08:08:20 loadavg: 0.30 0.42 0.31 3/282 5187
+
+PID    COMM             READS  WRITES R_Kb    W_Kb    T FILE
+12421  sshd             14101  0      225616  0       O ptmx
+12296  sshd             4      0      64      0       O ptmx
+12421  sshd             3      14104  48      778     S TCP
+5178   run              2      0      8       0       R nsswitch.conf
+5165   run              2      0      8       0       R nsswitch.conf
+
+08:08:21 loadavg: 0.30 0.42 0.31 5/282 5210
+
+PID    COMM             READS  WRITES R_Kb    W_Kb    T FILE
+12421  sshd             9159   0      146544  0       O ptmx
+12421  sshd             3      9161   48      534     S TCP
+12296  sshd             1      0      16      0       S TCP
+5188   run              2      0      8       0       R nsswitch.conf
+5203   run              2      0      8       0       R nsswitch.conf
+
+08:08:22 loadavg: 0.30 0.42 0.31 2/282 5233
+
+PID    COMM             READS  WRITES R_Kb    W_Kb    T FILE
+12421  sshd             26166  0      418656  0       O ptmx
+12421  sshd             4      26171  64      1385    S TCP
+12296  sshd             1      0      16      0       O ptmx
+5214   run              2      0      8       0       R nsswitch.conf
+5227   run              2      0      8       0       R nsswitch.conf
+Detaching...
+
+This example shows the -a option to include all file types. It caught heavy
+socket I/O from an sshd process, showing up as non-regular file types (the "O"
+for other, and "S" for socket, in the type column: "T").
+
+
+USAGE message:
+
+# ./filetop -h
+usage: filetop.py [-h] [-a] [-C] [-r MAXROWS] [-p PID] [interval] [count]
+
+File reads and writes by process
+
+positional arguments:
+  interval              output interval, in seconds
+  count                 number of outputs
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -a, --all-files       include non-regular file types (sockets, FIFOs, etc)
+  -C, --noclear         don't clear the screen
+  -r MAXROWS, --maxrows MAXROWS
+                        maximum rows to print, default 20
+  -s {reads,writes,rbytes,wbytes}, --sort {reads,writes,rbytes,wbytes}
+                        sort column, default rbytes
+  -p PID, --pid PID     trace this PID only
+
+examples:
+    ./filetop            # file I/O top, 1 second refresh
+    ./filetop -C         # don't clear the screen
+    ./filetop -p 181     # PID 181 only
+    ./filetop 5          # 5 second summaries
+    ./filetop 5 10       # 5 second summaries, 10 times only
diff --git a/tools/funccount.py b/tools/funccount.py
new file mode 100755
index 0000000..69dd01c
--- /dev/null
+++ b/tools/funccount.py
@@ -0,0 +1,304 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# funccount Count functions, tracepoints, and USDT probes.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: funccount [-h] [-p PID] [-i INTERVAL] [-d DURATION] [-T] [-r] pattern
+#
+# The pattern is a string with optional '*' wildcards, similar to file
+# globbing. If you'd prefer to use regular expressions, use the -r option.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 09-Sep-2015   Brendan Gregg       Created this.
+# 18-Oct-2016   Sasha Goldshtein    Generalized for uprobes, tracepoints, USDT.
+
+from __future__ import print_function
+from bcc import ArgString, BPF, USDT
+from time import sleep, strftime
+import argparse
+import os
+import re
+import signal
+import sys
+import traceback
+
+debug = False
+
+def verify_limit(num):
+    probe_limit = 1000
+    if num > probe_limit:
+        raise Exception("maximum of %d probes allowed, attempted %d" %
+                        (probe_limit, num))
+
+class Probe(object):
+    def __init__(self, pattern, use_regex=False, pid=None):
+        """Init a new probe.
+
+        Init the probe from the pattern provided by the user. The supported
+        patterns mimic the 'trace' and 'argdist' tools, but are simpler because
+        we don't have to distinguish between probes and retprobes.
+
+            func            -- probe a kernel function
+            lib:func        -- probe a user-space function in the library 'lib'
+            /path:func      -- probe a user-space function in binary '/path'
+            p::func         -- same thing as 'func'
+            p:lib:func      -- same thing as 'lib:func'
+            t:cat:event     -- probe a kernel tracepoint
+            u:lib:probe     -- probe a USDT tracepoint
+        """
+        parts = bytes(pattern).split(b':')
+        if len(parts) == 1:
+            parts = [b"p", b"", parts[0]]
+        elif len(parts) == 2:
+            parts = [b"p", parts[0], parts[1]]
+        elif len(parts) == 3:
+            if parts[0] == b"t":
+                parts = [b"t", b"", b"%s:%s" % tuple(parts[1:])]
+            if parts[0] not in [b"p", b"t", b"u"]:
+                raise Exception("Type must be 'p', 't', or 'u', but got %s" %
+                                parts[0])
+        else:
+            raise Exception("Too many ':'-separated components in pattern %s" %
+                            pattern)
+
+        (self.type, self.library, self.pattern) = parts
+        if not use_regex:
+            self.pattern = self.pattern.replace(b'*', b'.*')
+            self.pattern = b'^' + self.pattern + b'$'
+
+        if (self.type == b"p" and self.library) or self.type == b"u":
+            libpath = BPF.find_library(self.library)
+            if libpath is None:
+                # This might be an executable (e.g. 'bash')
+                libpath = BPF.find_exe(self.library)
+            if libpath is None or len(libpath) == 0:
+                raise Exception("unable to find library %s" % self.library)
+            self.library = libpath
+
+        self.pid = pid
+        self.matched = 0
+        self.trace_functions = {}   # map location number to function name
+
+    def is_kernel_probe(self):
+        return self.type == b"t" or (self.type == b"p" and self.library == b"")
+
+    def attach(self):
+        if self.type == b"p" and not self.library:
+            for index, function in self.trace_functions.items():
+                self.bpf.attach_kprobe(
+                        event=function,
+                        fn_name="trace_count_%d" % index)
+        elif self.type == b"p" and self.library:
+            for index, function in self.trace_functions.items():
+                self.bpf.attach_uprobe(
+                        name=self.library,
+                        sym=function,
+                        fn_name="trace_count_%d" % index,
+                        pid=self.pid or -1)
+        elif self.type == b"t":
+            for index, function in self.trace_functions.items():
+                self.bpf.attach_tracepoint(
+                        tp=function,
+                        fn_name="trace_count_%d" % index)
+        elif self.type == b"u":
+            pass    # Nothing to do -- attach already happened in `load`
+
+    def _add_function(self, template, probe_name):
+        new_func = b"trace_count_%d" % self.matched
+        text = template.replace(b"PROBE_FUNCTION", new_func)
+        text = text.replace(b"LOCATION", b"%d" % self.matched)
+        self.trace_functions[self.matched] = probe_name
+        self.matched += 1
+        return text
+
+    def _generate_functions(self, template):
+        self.usdt = None
+        text = b""
+        if self.type == b"p" and not self.library:
+            functions = BPF.get_kprobe_functions(self.pattern)
+            verify_limit(len(functions))
+            for function in functions:
+                text += self._add_function(template, function)
+        elif self.type == b"p" and self.library:
+            # uprobes are tricky because the same function may have multiple
+            # addresses, and the same address may be mapped to multiple
+            # functions. We aren't allowed to create more than one uprobe
+            # per address, so track unique addresses and ignore functions that
+            # map to an address that we've already seen. Also ignore functions
+            # that may repeat multiple times with different addresses.
+            addresses, functions = (set(), set())
+            functions_and_addresses = BPF.get_user_functions_and_addresses(
+                                        self.library, self.pattern)
+            verify_limit(len(functions_and_addresses))
+            for function, address in functions_and_addresses:
+                if address in addresses or function in functions:
+                    continue
+                addresses.add(address)
+                functions.add(function)
+                text += self._add_function(template, function)
+        elif self.type == b"t":
+            tracepoints = BPF.get_tracepoints(self.pattern)
+            verify_limit(len(tracepoints))
+            for tracepoint in tracepoints:
+                text += self._add_function(template, tracepoint)
+        elif self.type == b"u":
+            self.usdt = USDT(path=self.library, pid=self.pid)
+            matches = []
+            for probe in self.usdt.enumerate_probes():
+                if not self.pid and (probe.bin_path != self.library):
+                    continue
+                if re.match(self.pattern, probe.name):
+                    matches.append(probe.name)
+            verify_limit(len(matches))
+            for match in matches:
+                new_func = b"trace_count_%d" % self.matched
+                text += self._add_function(template, match)
+                self.usdt.enable_probe(match, new_func)
+            if debug:
+                print(self.usdt.get_text())
+        return text
+
+    def load(self):
+        trace_count_text = b"""
+int PROBE_FUNCTION(void *ctx) {
+    FILTER
+    int loc = LOCATION;
+    u64 *val = counts.lookup(&loc);
+    if (!val) {
+        return 0;   // Should never happen, # of locations is known
+    }
+    (*val)++;
+    return 0;
+}
+        """
+        bpf_text = b"""#include <uapi/linux/ptrace.h>
+
+BPF_ARRAY(counts, u64, NUMLOCATIONS);
+        """
+
+        # We really mean the tgid from the kernel's perspective, which is in
+        # the top 32 bits of bpf_get_current_pid_tgid().
+        if self.pid:
+            trace_count_text = trace_count_text.replace(b'FILTER',
+                b"""u32 pid = bpf_get_current_pid_tgid() >> 32;
+                   if (pid != %d) { return 0; }""" % self.pid)
+        else:
+            trace_count_text = trace_count_text.replace(b'FILTER', b'')
+
+        bpf_text += self._generate_functions(trace_count_text)
+        bpf_text = bpf_text.replace(b"NUMLOCATIONS",
+                                    b"%d" % len(self.trace_functions))
+        if debug:
+            print(bpf_text)
+
+        if self.matched == 0:
+            raise Exception("No functions matched by pattern %s" %
+                            self.pattern)
+
+        self.bpf = BPF(text=bpf_text,
+                       usdt_contexts=[self.usdt] if self.usdt else [])
+        self.clear()    # Initialize all array items to zero
+
+    def counts(self):
+        return self.bpf["counts"]
+
+    def clear(self):
+        counts = self.bpf["counts"]
+        for location, _ in list(self.trace_functions.items()):
+            counts[counts.Key(location)] = counts.Leaf()
+
+class Tool(object):
+    def __init__(self):
+        examples = """examples:
+    ./funccount 'vfs_*'             # count kernel fns starting with "vfs"
+    ./funccount -r '^vfs.*'         # same as above, using regular expressions
+    ./funccount -Ti 5 'vfs_*'       # output every 5 seconds, with timestamps
+    ./funccount -d 10 'vfs_*'       # trace for 10 seconds only
+    ./funccount -p 185 'vfs_*'      # count vfs calls for PID 181 only
+    ./funccount t:sched:sched_fork  # count calls to the sched_fork tracepoint
+    ./funccount -p 185 u:node:gc*   # count all GC USDT probes in node, PID 185
+    ./funccount c:malloc            # count all malloc() calls in libc
+    ./funccount go:os.*             # count all "os.*" calls in libgo
+    ./funccount -p 185 go:os.*      # count all "os.*" calls in libgo, PID 185
+    ./funccount ./test:read*        # count "read*" calls in the ./test binary
+    """
+        parser = argparse.ArgumentParser(
+            description="Count functions, tracepoints, and USDT probes",
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            epilog=examples)
+        parser.add_argument("-p", "--pid", type=int,
+            help="trace this PID only")
+        parser.add_argument("-i", "--interval",
+            help="summary interval, seconds")
+        parser.add_argument("-d", "--duration",
+            help="total duration of trace, seconds")
+        parser.add_argument("-T", "--timestamp", action="store_true",
+            help="include timestamp on output")
+        parser.add_argument("-r", "--regexp", action="store_true",
+            help="use regular expressions. Default is \"*\" wildcards only.")
+        parser.add_argument("-D", "--debug", action="store_true",
+            help="print BPF program before starting (for debugging purposes)")
+        parser.add_argument("pattern",
+            type=ArgString,
+            help="search expression for events")
+        self.args = parser.parse_args()
+        global debug
+        debug = self.args.debug
+        self.probe = Probe(self.args.pattern, self.args.regexp, self.args.pid)
+        if self.args.duration and not self.args.interval:
+            self.args.interval = self.args.duration
+        if not self.args.interval:
+            self.args.interval = 99999999
+
+    @staticmethod
+    def _signal_ignore(signal, frame):
+        print()
+
+    def run(self):
+        self.probe.load()
+        self.probe.attach()
+        print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
+              (self.probe.matched, bytes(self.args.pattern)))
+        exiting = 0 if self.args.interval else 1
+        seconds = 0
+        while True:
+            try:
+                sleep(int(self.args.interval))
+                seconds += int(self.args.interval)
+            except KeyboardInterrupt:
+                exiting = 1
+                # as cleanup can take many seconds, trap Ctrl-C:
+                signal.signal(signal.SIGINT, Tool._signal_ignore)
+            if self.args.duration and seconds >= int(self.args.duration):
+                exiting = 1
+
+            print()
+            if self.args.timestamp:
+                print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+            print("%-36s %8s" % ("FUNC", "COUNT"))
+            counts = self.probe.counts()
+            for k, v in sorted(counts.items(),
+                               key=lambda counts: counts[1].value):
+                if v.value == 0:
+                    continue
+                print("%-36s %8d" %
+                      (self.probe.trace_functions[k.value], v.value))
+
+            if exiting:
+                print("Detaching...")
+                exit()
+            else:
+                self.probe.clear()
+
+if __name__ == "__main__":
+    try:
+        Tool().run()
+    except Exception:
+        if debug:
+            traceback.print_exc()
+        elif sys.exc_info()[0] is not SystemExit:
+            print(sys.exc_info()[1])
diff --git a/tools/funccount_example.txt b/tools/funccount_example.txt
new file mode 100644
index 0000000..d06cfd9
--- /dev/null
+++ b/tools/funccount_example.txt
@@ -0,0 +1,364 @@
+Demonstrations of funccount, the Linux eBPF/bcc version.
+
+
+This program traces functions, tracepoints, or USDT probes that match a
+specified pattern, and when Ctrl-C is hit prints a summary of their count
+while tracing. Eg, tracing all kernel functions that begin with "vfs_":
+
+# ./funccount 'vfs_*'
+Tracing... Ctrl-C to end.
+^C
+FUNC                          COUNT
+vfs_create                        1
+vfs_rename                        1
+vfs_fsync_range                   2
+vfs_lock_file                    30
+vfs_fstatat                     152
+vfs_fstat                       154
+vfs_write                       166
+vfs_getattr_nosec               262
+vfs_getattr                     262
+vfs_open                        264
+vfs_read                        470
+Detaching...
+
+The above output shows that, while tracing, the vfs_read() function was called
+470 times, and vfs_open() 264 times, etc.
+
+This is useful for exploring kernel code, to figure out which functions are in
+use and which are not. This can narrow down an investigation to just a few
+functions, whose counts are similar to the workload investigated.
+
+
+Tracing all tcp functions:
+
+# ./funccount 'tcp_*'
+Tracing... Ctrl-C to end.
+^C
+FUNC                          COUNT
+tcp_try_undo_recovery             1
+tcp_twsk_destructor               1
+tcp_enter_recovery                1
+tcp_xmit_retransmit_queue         1
+tcp_update_scoreboard             1
+tcp_verify_retransmit_hint        1
+tcp_tsq_handler.part.31           1
+tcp_sacktag_write_queue           1
+tcp_match_skb_to_sack             1
+tcp_time_wait                     1
+tcp_mark_head_lost                1
+tcp_init_cwnd_reduction           1
+tcp_sacktag_one                   1
+tcp_sacktag_walk                  1
+tcp_retransmit_skb                1
+tcp_tasklet_func                  1
+tcp_resume_early_retransmit       1
+tcp_dsack_set                     1
+tcp_v4_syn_recv_sock              2
+tcp_ca_openreq_child              2
+tcp_try_fastopen                  2
+tcp_openreq_init_rwin             2
+tcp_v4_init_req                   2
+tcp_create_openreq_child          2
+tcp_v4_send_synack                2
+tcp_v4_init_sequence              2
+tcp_fragment                      2
+tcp_v4_conn_request               2
+tcp_conn_request                  2
+tcp_v4_route_req                  2
+tcp_fragment_tstamp               2
+tcp_try_keep_open                 2
+tcp_v4_reqsk_destructor           2
+tcp_may_send_now                  2
+tcp_make_synack                   2
+tcp_child_process                 2
+tcp_check_req                     2
+tcp_fastretrans_alert             2
+tcp_set_keepalive                 2
+tcp_finish_connect                3
+tcp_connect_queue_skb             3
+tcp_v4_connect                    3
+tcp_init_sock                     3
+tcp_v4_init_sock                  3
+tcp_connect                       3
+tcp_any_retrans_done.part.35        3
+tcp_clear_retrans                 3
+tcp_setsockopt                    4
+tcp_update_metrics                5
+tcp_done                          5
+tcp_initialize_rcv_mss            5
+tcp_sndbuf_expand                 5
+tcp_fin                           5
+tcp_init_xmit_timers              5
+tcp_close                         5
+tcp_init_congestion_control        5
+tcp_init_metrics                  5
+tcp_gro_complete                  5
+tcp_free_fastopen_req             5
+tcp_v4_destroy_sock               5
+tcp_cleanup_congestion_control        5
+tcp_send_fin                      5
+tcp_init_buffer_space             5
+tcp_init_cwnd                     5
+tcp_select_initial_window         5
+tcp_check_oom                     5
+tcp_default_init_rwnd             5
+tcp_assign_congestion_control        5
+tcp_getsockopt                    6
+tcp_ioctl                         6
+tcp_mtup_init                     8
+tcp_parse_options                 8
+tcp_mss_to_mtu                    8
+tcp_try_rmem_schedule             8
+tcp_get_metrics                  10
+tcp_try_coalesce                 10
+tcp_rcv_state_process            14
+tcp_sync_mss                     14
+tcp_write_timer_handler          15
+tcp_write_timer                  16
+tcp_grow_window.isra.27          22
+tcp_set_state                    23
+tcp_send_ack                     37
+tcp_delack_timer                 42
+tcp_delack_timer_handler         42
+tcp_validate_incoming            91
+tcp_prequeue_process            112
+tcp_v4_early_demux              117
+tcp_gro_receive                 146
+tcp_queue_rcv                   167
+tcp_data_queue                  215
+tcp_urg                         219
+tcp_send_delayed_ack            257
+tcp_send_mss                    275
+tcp_push                        275
+tcp_sendmsg                     275
+tcp_event_data_recv             275
+tcp_nagle_check                 279
+tcp_write_xmit                  282
+tcp_event_new_data_sent         282
+tcp_current_mss                 284
+tcp_init_tso_segs               284
+tcp_wfree                       286
+tcp_schedule_loss_probe         305
+tcp_v4_send_check               323
+tcp_transmit_skb                323
+tcp_recvmsg                     323
+tcp_options_write               325
+tcp_rcv_space_adjust            328
+tcp_check_space                 332
+tcp_rcv_established             337
+tcp_ack                         337
+tcp_parse_aligned_timestamp.part.43      345
+tcp_prequeue                    346
+tcp_v4_do_rcv                   351
+tcp_v4_rcv                      351
+tcp_parse_md5sig_option         351
+tcp_cleanup_rbuf                436
+tcp_poll                        468
+tcp_established_options         604
+tcp_v4_md5_lookup               615
+tcp_release_cb                  736
+tcp_rearm_rto                   843
+tcp_md5_do_lookup               968
+Detaching...
+
+The current implementation can take many seconds to detach from tracing, after
+Ctrl-C has been hit.
+
+
+User functions can be traced in executables or libraries, and per-process
+filtering is allowed:
+
+# ./funccount -p 1442 /home/ubuntu/contentions:*
+Tracing 15 functions for "/home/ubuntu/contentions:*"... Hit Ctrl-C to end.
+^C
+FUNC                                           COUNT
+main                                               1
+_start                                             1
+primes_thread                                      2
+insert_result                                  87186
+is_prime                                     1252772
+Detaching...
+
+If /home/ubuntu is in the $PATH, then the following command will also work:
+
+# ./funccount -p 1442 contentions:*
+
+
+Counting libc write and read calls using regular expression syntax (-r):
+
+# ./funccount -r 'c:(write|read)$'
+Tracing 2 functions for "c:(write|read)$"... Hit Ctrl-C to end.
+^C
+FUNC                                    COUNT
+read                                        2
+write                                       4
+Detaching...
+
+
+Kernel tracepoints are also available as targets. For example, trace common
+block I/O tracepoints and see how often they are invoked:
+
+# ./funccount t:block:*
+Tracing 19 functions for "t:block:*"... Hit Ctrl-C to end.
+^C
+FUNC                                    COUNT
+block:block_rq_complete                     7
+block:block_rq_issue                        7
+block:block_getrq                           7
+block:block_rq_insert                       7
+Detaching...
+
+
+Likewise, user-mode statically defined traces (USDT) can also be probed. For
+example, count mutex-related events in pthreads:
+
+# ./funccount u:pthread:*mutex* -p 1442
+Tracing 7 functions for "u:pthread:*mutex*"... Hit Ctrl-C to end.
+^C
+FUNC                                    COUNT
+mutex_init                                  1
+mutex_entry                            547122
+mutex_acquired                         547175
+mutex_release                          547185
+Detaching...
+
+
+An interval can be provided. Eg, printing output every 1 second for vfs calls:
+
+# ./funccount -i 1 'vfs_*'
+Tracing... Ctrl-C to end.
+
+FUNC                          COUNT
+vfs_fstatat                       1
+vfs_fstat                        16
+vfs_getattr_nosec                17
+vfs_getattr                      17
+vfs_write                        52
+vfs_read                         79
+vfs_open                         98
+
+FUNC                          COUNT
+vfs_fstatat                      10
+vfs_fstat                        10
+vfs_open                         13
+vfs_getattr_nosec                20
+vfs_getattr                      20
+vfs_write                        28
+vfs_read                         39
+
+FUNC                          COUNT
+vfs_fsync_range                   2
+vfs_lock_file                    30
+vfs_write                       107
+vfs_fstatat                     129
+vfs_fstat                       130
+vfs_open                        154
+vfs_getattr_nosec               222
+vfs_getattr                     222
+vfs_read                        384
+^C
+Detaching...
+
+This can be useful for making some ad hoc tools, exposing new counts of
+kernel activity that aren't visible in other metrics.
+
+Include -T to print timestamps on output.
+
+
+A maximum duration can be set. For example, to print 5 x 1 second summaries
+of vfs_read() calls:
+
+# ./funccount -i 1 -d 5 vfs_read
+Tracing 1 functions for "vfs_read"... Hit Ctrl-C to end.
+
+FUNC                                    COUNT
+vfs_read                                   30
+
+FUNC                                    COUNT
+vfs_read                                   26
+
+FUNC                                    COUNT
+vfs_read                                   54
+
+FUNC                                    COUNT
+vfs_read                                   25
+
+FUNC                                    COUNT
+vfs_read                                   31
+Detaching...
+
+By leaving off the "-i 1", this will print a single 5 second summary:
+
+# funccount.py -d 5 vfs_read
+Tracing 1 functions for "vfs_read"... Hit Ctrl-C to end.
+
+FUNC                                    COUNT
+vfs_read                                  167
+Detaching...
+
+This can be useful for finding out rates: trace all functions for ten seconds
+and then divide by ten for the per-second rate.
+
+
+The "*" wildcard can be used multiple times. Eg, matching functions that contain
+the word "readdir":
+
+# ./funccount '*readdir*'
+Tracing... Ctrl-C to end.
+^C
+FUNC                          COUNT
+ext4_readdir                      4
+Detaching...
+
+Matching "tcp" then "send":
+
+# ./funccount '*tcp*send*'
+Tracing... Ctrl-C to end.
+^C
+FUNC                          COUNT
+tcp_send_ack                      4
+tcp_send_delayed_ack             19
+tcp_send_mss                     26
+tcp_sendmsg                      26
+tcp_v4_send_check                30
+__tcp_v4_send_check              30
+Detaching...
+
+
+Full USAGE:
+
+# ./funccount -h
+usage: funccount [-h] [-p PID] [-i INTERVAL] [-d DURATION] [-T] [-r] [-D]
+                    pattern
+
+Count functions, tracepoints, and USDT probes
+
+positional arguments:
+  pattern               search expression for events
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     trace this PID only
+  -i INTERVAL, --interval INTERVAL
+                        summary interval, seconds
+  -d DURATION, --duration DURATION
+                        total duration of trace, seconds
+  -T, --timestamp       include timestamp on output
+  -r, --regexp          use regular expressions. Default is "*" wildcards
+                        only.
+  -D, --debug           print BPF program before starting (for debugging
+                        purposes)
+
+examples:
+    ./funccount 'vfs_*'             # count kernel fns starting with "vfs"
+    ./funccount -r '^vfs.*'         # same as above, using regular expressions
+    ./funccount -Ti 5 'vfs_*'       # output every 5 seconds, with timestamps
+    ./funccount -d 10 'vfs_*'       # trace for 10 seconds only
+    ./funccount -p 185 'vfs_*'      # count vfs calls for PID 185 only
+    ./funccount t:sched:sched_fork  # count calls to the sched_fork tracepoint
+    ./funccount -p 185 u:node:gc*   # count all GC USDT probes in node, PID 185
+    ./funccount c:malloc            # count all malloc() calls in libc
+    ./funccount go:os.*             # count all "os.*" calls in libgo
+    ./funccount -p 185 go:os.*      # count all "os.*" calls in libgo, PID 185
+    ./funccount ./test:read*        # count "read*" calls in the ./test binary
diff --git a/tools/funclatency.py b/tools/funclatency.py
new file mode 100755
index 0000000..3f08a7e
--- /dev/null
+++ b/tools/funclatency.py
@@ -0,0 +1,262 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# funclatency   Time functions and print latency as a histogram.
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: funclatency [-h] [-p PID] [-i INTERVAL] [-d DURATION] [-T] [-u] [-m]
+#                    [-F] [-r] [-v] pattern
+#
+# Run "funclatency -h" for full usage.
+#
+# The pattern is a string with optional '*' wildcards, similar to file
+# globbing. If you'd prefer to use regular expressions, use the -r option.
+#
+# Currently nested or recursive functions are not supported properly, and
+# timestamps will be overwritten, creating dubious output. Try to match single
+# functions, or groups of functions that run at the same stack layer, and
+# don't ultimately call each other.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Sep-2015   Brendan Gregg       Created this.
+# 06-Oct-2016   Sasha Goldshtein    Added user function support.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+
+# command-line arguments; the examples string is shown in --help's epilog
+examples = """examples:
+    ./funclatency do_sys_open       # time the do_sys_open() kernel function
+    ./funclatency c:read            # time the read() C library function
+    ./funclatency -u vfs_read       # time vfs_read(), in microseconds
+    ./funclatency -m do_nanosleep   # time do_nanosleep(), in milliseconds
+    ./funclatency -i 2 -d 10 c:open # output every 2 seconds, for duration 10s
+    ./funclatency -mTi 5 vfs_read   # output every 5 seconds, with timestamps
+    ./funclatency -p 181 vfs_read   # time process 181 only
+    ./funclatency 'vfs_fstat*'      # time both vfs_fstat() and vfs_fstatat()
+    ./funclatency 'c:*printf'       # time the *printf family of functions
+    ./funclatency -F 'vfs_r*'       # show one histogram per matched function
+"""
+parser = argparse.ArgumentParser(
+    description="Time functions and print latency as a histogram",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid", type=int,
+    help="trace this PID only")
+parser.add_argument("-i", "--interval", type=int,
+    help="summary interval, in seconds")
+parser.add_argument("-d", "--duration", type=int,
+    help="total duration of trace, in seconds")
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-u", "--microseconds", action="store_true",
+    help="microsecond histogram")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="millisecond histogram")
+parser.add_argument("-F", "--function", action="store_true",
+    help="show a separate histogram per function")
+parser.add_argument("-r", "--regexp", action="store_true",
+    help="use regular expressions. Default is \"*\" wildcards only.")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="print the BPF program (for debugging purposes)")
+parser.add_argument("pattern",
+    help="search expression for functions")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+# default the summary interval: use the full duration if one was given,
+# otherwise an effectively infinite interval
+if not args.interval:
+    args.interval = args.duration or 99999999
+
+def bail(error):
+    print("Error: %s" % error)
+    exit(1)
+
+parts = args.pattern.split(':')
+if len(parts) == 1:
+    library = None
+    pattern = args.pattern
+elif len(parts) == 2:
+    library = parts[0]
+    libpath = BPF.find_library(library) or BPF.find_exe(library)
+    if not libpath:
+        bail("can't resolve library %s" % library)
+    library = libpath
+    pattern = parts[1]
+else:
+    bail("unrecognized pattern format '%s'" % args.pattern)
+
+if not args.regexp:
+    pattern = pattern.replace('*', '.*')
+    pattern = '^' + pattern + '$'
+
+# BPF program: timestamp each entry per pid, compute the delta on return,
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+typedef struct ip_pid {
+    u64 ip;
+    u64 pid;
+} ip_pid_t;
+
+typedef struct hist_key {
+    ip_pid_t key;
+    u64 slot;
+} hist_key_t;
+
+BPF_HASH(start, u32);
+STORAGE
+
+int trace_func_entry(struct pt_regs *ctx)
+{
+    u64 pid_tgid = bpf_get_current_pid_tgid();
+    u32 pid = pid_tgid;
+    u32 tgid = pid_tgid >> 32;
+    u64 ts = bpf_ktime_get_ns();
+
+    FILTER
+    ENTRYSTORE
+    start.update(&pid, &ts);
+
+    return 0;
+}
+
+int trace_func_return(struct pt_regs *ctx)
+{
+    u64 *tsp, delta;
+    u64 pid_tgid = bpf_get_current_pid_tgid();
+    u32 pid = pid_tgid;
+    u32 tgid = pid_tgid >> 32;
+
+    // calculate delta time
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed start
+    }
+    delta = bpf_ktime_get_ns() - *tsp;
+    start.delete(&pid);
+    FACTOR
+
+    // store as histogram
+    STORE
+
+    return 0;
+}
+"""
+
+# need per-invocation (ip, pid) keys for -F, or user probes without a pid filter?
+need_key = args.function or (library and not args.pid)
+
+# fill the FILTER/FACTOR/STORAGE/ENTRYSTORE/STORE placeholders in bpf_text
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (tgid != %d) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if args.milliseconds:
+    bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;')
+    label = "msecs"
+elif args.microseconds:
+    bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;')
+    label = "usecs"
+else:
+    bpf_text = bpf_text.replace('FACTOR', '')
+    label = "nsecs"
+if need_key:
+    bpf_text = bpf_text.replace('STORAGE', 'BPF_HASH(ipaddr, u32);\n' +
+        'BPF_HISTOGRAM(dist, hist_key_t);')
+    # stash the IP on entry, as on return it's kretprobe_trampoline:
+    bpf_text = bpf_text.replace('ENTRYSTORE',
+        'u64 ip = PT_REGS_IP(ctx); ipaddr.update(&pid, &ip);')
+    pid = '-1' if not library else 'tgid'
+    bpf_text = bpf_text.replace('STORE',
+        """
+    u64 ip, *ipp = ipaddr.lookup(&pid);
+    if (ipp) {
+        ip = *ipp;
+        hist_key_t key;
+        key.key.ip = ip;
+        key.key.pid = %s;
+        key.slot = bpf_log2l(delta);
+        dist.increment(key);
+        ipaddr.delete(&pid);
+    }
+        """ % pid)
+else:
+    bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
+    bpf_text = bpf_text.replace('ENTRYSTORE', '')
+    bpf_text = bpf_text.replace('STORE',
+        'dist.increment(bpf_log2l(delta));')
+if args.verbose or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# SIGINT handler installed while detaching: swallow Ctrl-C, just print a newline
+def signal_ignore(sig, frame):
+    print()
+
+# compile and load the BPF program
+b = BPF(text=bpf_text)
+
+# attach entry and return probes to every function matching the pattern
+if library:
+    b.attach_uprobe(name=library, sym_re=pattern, fn_name="trace_func_entry",
+                    pid=args.pid or -1)
+    b.attach_uretprobe(name=library, sym_re=pattern,
+                       fn_name="trace_func_return", pid=args.pid or -1)
+    matched = b.num_open_uprobes()
+else:
+    b.attach_kprobe(event_re=pattern, fn_name="trace_func_entry")
+    b.attach_kretprobe(event_re=pattern, fn_name="trace_func_return")
+    matched = b.num_open_kprobes()
+
+if matched == 0:
+    print("0 functions matched by \"%s\". Exiting." % args.pattern)
+    exit()
+
+# header: each match attaches both an entry and a return probe, so halve count
+print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
+    (matched // 2, args.pattern))
+
+# output: render a histogram section header from an (ip, pid) bucket key
+def print_section(key):
+    if library:
+        return "%s [%d]" % (BPF.sym(key[0], key[1]), key[1])
+    # kernel symbols: the pid is irrelevant
+    return BPF.sym(key[0], -1)
+
+done = 1 if not args.interval else 0
+elapsed = 0
+dist = b.get_table("dist")
+while True:
+    try:
+        sleep(args.interval)
+        elapsed += args.interval
+    except KeyboardInterrupt:
+        done = 1
+        # detaching can take many seconds; swallow further Ctrl-C:
+        signal.signal(signal.SIGINT, signal_ignore)
+    if args.duration and elapsed >= args.duration:
+        done = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    if need_key:
+        dist.print_log2_hist(label, "Function", section_print_fn=print_section,
+            bucket_fn=lambda hk: (hk.ip, hk.pid))
+    else:
+        dist.print_log2_hist(label)
+    dist.clear()
+
+    if done:
+        print("Detaching...")
+        exit()
diff --git a/tools/funclatency_example.txt b/tools/funclatency_example.txt
new file mode 100644
index 0000000..d8217a2
--- /dev/null
+++ b/tools/funclatency_example.txt
@@ -0,0 +1,367 @@
+Demonstrations of funclatency, the Linux eBPF/bcc version.
+
+
+Timing the do_sys_open() kernel function until Ctrl-C:
+
+# ./funclatency do_sys_open
+Tracing do_sys_open... Hit Ctrl-C to end.
+^C
+     nsecs           : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 0        |                                      |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 0        |                                      |
+     128 -> 255      : 0        |                                      |
+     256 -> 511      : 0        |                                      |
+     512 -> 1023     : 0        |                                      |
+    1024 -> 2047     : 0        |                                      |
+    2048 -> 4095     : 124      |****************                      |
+    4096 -> 8191     : 291      |**************************************|
+    8192 -> 16383    : 36       |****                                  |
+   16384 -> 32767    : 16       |**                                    |
+   32768 -> 65535    : 8        |*                                     |
+   65536 -> 131071   : 0        |                                      |
+  131072 -> 262143   : 0        |                                      |
+  262144 -> 524287   : 0        |                                      |
+  524288 -> 1048575  : 0        |                                      |
+ 1048576 -> 2097151  : 0        |                                      |
+ 2097152 -> 4194303  : 1        |                                      |
+Detaching...
+
+The output shows a histogram of function latency (call time), measured from when
+the function began executing (was called) to when it finished (returned).
+
+This example output shows that most of the time, do_sys_open() took between
+2048 and 65536 nanoseconds (2 to 65 microseconds). The peak of this distribution
+shows 291 calls of between 4096 and 8191 nanoseconds. There was also one
+occurrence, an outlier, in the 2 to 4 millisecond range.
+
+How this works: the function entry and return are traced using the kernel kprobe
+and kretprobe tracer. Timestamps are collected, the delta time calculated, which
+is then bucketized and stored as an in-kernel histogram for efficiency. The
+histogram is visible in the output: it's the "count" column; everything else is
+decoration. Only the count column is copied to user-level on output. This is an
+efficient way to time kernel functions and examine their latency distribution.
+
+
+Now trace a user function, pthread_mutex_lock in libpthread, to determine if
+there is considerable lock contention:
+
+# ./funclatency pthread:pthread_mutex_lock -p $(pidof contentions)
+Tracing 1 function for "pthread:pthread_mutex_lock"... Hit Ctrl-C to end.
+     nsecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 508967   |****************************************|
+      4096 -> 8191       : 70072    |*****                                   |
+      8192 -> 16383      : 27686    |**                                      |
+     16384 -> 32767      : 5075     |                                        |
+     32768 -> 65535      : 2318     |                                        |
+     65536 -> 131071     : 581      |                                        |
+    131072 -> 262143     : 38       |                                        |
+    262144 -> 524287     : 5        |                                        |
+    524288 -> 1048575    : 1        |                                        |
+   1048576 -> 2097151    : 9        |                                        |
+Detaching...
+
+It seems that most calls to pthread_mutex_lock completed rather quickly (in
+under 4us), but there were some cases of considerable contention, sometimes
+over a full millisecond.
+
+
+Run a quick-and-dirty profiler over all the functions in an executable:
+# ./funclatency /home/user/primes:* -p $(pidof primes) -F
+Tracing 15 functions for "/home/user/primes:*"... Hit Ctrl-C to end.
+^C
+
+Function = is_prime [6556]
+     nsecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 1495322  |****************************************|
+      4096 -> 8191       : 95744    |**                                      |
+      8192 -> 16383      : 9926     |                                        |
+     16384 -> 32767      : 3070     |                                        |
+     32768 -> 65535      : 1415     |                                        |
+     65536 -> 131071     : 112      |                                        |
+    131072 -> 262143     : 9        |                                        |
+    262144 -> 524287     : 3        |                                        |
+    524288 -> 1048575    : 0        |                                        |
+   1048576 -> 2097151    : 8        |                                        |
+
+Function = insert_result [6556]
+     nsecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 111047   |****************************************|
+      4096 -> 8191       : 3998     |*                                       |
+      8192 -> 16383      : 720      |                                        |
+     16384 -> 32767      : 238      |                                        |
+     32768 -> 65535      : 106      |                                        |
+     65536 -> 131071     : 5        |                                        |
+    131072 -> 262143     : 4        |                                        |
+Detaching...
+
+From the results, we can see that the is_prime function has something resembling
+an exponential distribution -- very few primes take a very long time to test,
+while most numbers are verified as prime or composite in less than 4us. The
+insert_result function exhibits a similar phenomenon, likely due to contention
+over a shared results container.
+
+
+Now vfs_read() is traced, and a microseconds histogram printed:
+
+# ./funclatency -u vfs_read
+Tracing vfs_read... Hit Ctrl-C to end.
+^C
+     usecs           : count     distribution
+       0 -> 1        : 1143     |**************************************|
+       2 -> 3        : 420      |*************                         |
+       4 -> 7        : 159      |*****                                 |
+       8 -> 15       : 295      |*********                             |
+      16 -> 31       : 25       |                                      |
+      32 -> 63       : 5        |                                      |
+      64 -> 127      : 1        |                                      |
+     128 -> 255      : 0        |                                      |
+     256 -> 511      : 0        |                                      |
+     512 -> 1023     : 0        |                                      |
+    1024 -> 2047     : 1        |                                      |
+    2048 -> 4095     : 0        |                                      |
+    4096 -> 8191     : 5        |                                      |
+    8192 -> 16383    : 0        |                                      |
+   16384 -> 32767    : 0        |                                      |
+   32768 -> 65535    : 0        |                                      |
+   65536 -> 131071   : 7        |                                      |
+  131072 -> 262143   : 7        |                                      |
+  262144 -> 524287   : 3        |                                      |
+  524288 -> 1048575  : 7        |                                      |
+Detaching...
+
+This shows a bimodal distribution. Many vfs_read() calls were faster than 15
+microseconds, however, there was also a small handful between 65 milliseconds
+and 1 second, seen at the bottom of the table. These are likely network reads
+from SSH, waiting on interactive keystrokes.
+
+
+Tracing do_nanosleep() in milliseconds:
+
+# ./funclatency -m do_nanosleep
+Tracing do_nanosleep... Hit Ctrl-C to end.
+^C
+     msecs           : count     distribution
+       0 -> 1        : 0        |                                      |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 0        |                                      |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 0        |                                      |
+     128 -> 255      : 0        |                                      |
+     256 -> 511      : 0        |                                      |
+     512 -> 1023     : 328      |**************************************|
+    1024 -> 2047     : 0        |                                      |
+    2048 -> 4095     : 0        |                                      |
+    4096 -> 8191     : 32       |***                                   |
+    8192 -> 16383    : 0        |                                      |
+   16384 -> 32767    : 0        |                                      |
+   32768 -> 65535    : 2        |                                      |
+Detaching...
+
+This looks like it has found threads that are sleeping every 1, 5, and 60
+seconds.
+
+
+An interval can be provided using -i, and timestamps added using -T. For
+example, tracing vfs_read() latency in milliseconds and printing output
+every 5 seconds:
+
+# ./funclatency -mTi 5 vfs_read
+Tracing vfs_read... Hit Ctrl-C to end.
+
+20:10:08
+     msecs           : count     distribution
+       0 -> 1        : 1500     |*************************************+|
+       2 -> 3        : 3        |                                      |
+       4 -> 7        : 1        |                                      |
+       8 -> 15       : 2        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 4        |                                      |
+     128 -> 255      : 3        |                                      |
+     256 -> 511      : 1        |                                      |
+     512 -> 1023     : 7        |                                      |
+
+20:10:13
+     msecs           : count     distribution
+       0 -> 1        : 1251     |*************************************+|
+       2 -> 3        : 3        |                                      |
+       4 -> 7        : 2        |                                      |
+       8 -> 15       : 0        |                                      |
+      16 -> 31       : 2        |                                      |
+      32 -> 63       : 3        |                                      |
+      64 -> 127      : 5        |                                      |
+     128 -> 255      : 5        |                                      |
+     256 -> 511      : 3        |                                      |
+     512 -> 1023     : 6        |                                      |
+    1024 -> 2047     : 2        |                                      |
+
+20:10:18
+     msecs           : count     distribution
+       0 -> 1        : 1265     |*************************************+|
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 5        |                                      |
+       8 -> 15       : 9        |                                      |
+      16 -> 31       : 7        |                                      |
+      32 -> 63       : 1        |                                      |
+      64 -> 127      : 2        |                                      |
+     128 -> 255      : 3        |                                      |
+     256 -> 511      : 5        |                                      |
+     512 -> 1023     : 5        |                                      |
+    1024 -> 2047     : 0        |                                      |
+    2048 -> 4095     : 1        |                                      |
+^C
+20:10:20
+     msecs           : count     distribution
+       0 -> 1        : 249      |*************************************+|
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 0        |                                      |
+       8 -> 15       : 1        |                                      |
+      16 -> 31       : 0        |                                      |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 0        |                                      |
+     128 -> 255      : 0        |                                      |
+     256 -> 511      : 0        |                                      |
+     512 -> 1023     : 1        |                                      |
+Detaching...
+
+
+A single process can be traced, which filters in-kernel for efficiency. Here,
+the vfs_read() function is timed as milliseconds for PID 17064, which is a
+bash shell:
+
+# ./funclatency -mp 17064 vfs_read
+Tracing vfs_read... Hit Ctrl-C to end.
+^C
+     msecs           : count     distribution
+       0 -> 1        : 1        |**                                    |
+       2 -> 3        : 0        |                                      |
+       4 -> 7        : 0        |                                      |
+       8 -> 15       : 1        |**                                    |
+      16 -> 31       : 2        |*****                                 |
+      32 -> 63       : 0        |                                      |
+      64 -> 127      : 13       |**************************************|
+     128 -> 255      : 10       |*****************************         |
+     256 -> 511      : 4        |***********                           |
+Detaching...
+
+The distribution between 64 and 511 milliseconds shows keystroke latency.
+
+
+The -F option can be used to print a histogram per function. Eg:
+
+# ./funclatency -uF 'vfs_r*'
+Tracing 5 functions for "vfs_r*"... Hit Ctrl-C to end.
+^C
+
+Function = vfs_read
+     usecs               : count     distribution
+         0 -> 1          : 1044     |****************************************|
+         2 -> 3          : 383      |**************                          |
+         4 -> 7          : 76       |**                                      |
+         8 -> 15         : 41       |*                                       |
+        16 -> 31         : 26       |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 1        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 4        |                                        |
+      4096 -> 8191       : 2        |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 2        |                                        |
+     65536 -> 131071     : 5        |                                        |
+    131072 -> 262143     : 5        |                                        |
+    262144 -> 524287     : 3        |                                        |
+    524288 -> 1048575    : 7        |                                        |
+
+Function = vfs_rename
+     usecs               : count     distribution
+         0 -> 1          : 2        |****                                    |
+         2 -> 3          : 2        |****                                    |
+         4 -> 7          : 2        |****                                    |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 6        |*************                           |
+        32 -> 63         : 18       |****************************************|
+Detaching...
+
+
+
+USAGE message:
+
+# ./funclatency -h
+usage: funclatency [-h] [-p PID] [-i INTERVAL] [-d DURATION] [-T] [-u] [-m]
+                   [-F] [-r] [-v] pattern
+
+Time functions and print latency as a histogram
+
+positional arguments:
+  pattern               search expression for functions
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     trace this PID only
+  -i INTERVAL, --interval INTERVAL
+                        summary interval, in seconds
+  -d DURATION, --duration DURATION
+                        total duration of trace, in seconds
+  -T, --timestamp       include timestamp on output
+  -u, --microseconds    microsecond histogram
+  -m, --milliseconds    millisecond histogram
+  -F, --function        show a separate histogram per function
+  -r, --regexp          use regular expressions. Default is "*" wildcards
+                        only.
+  -v, --verbose         print the BPF program (for debugging purposes)
+
+examples:
+    ./funclatency do_sys_open       # time the do_sys_open() kernel function
+    ./funclatency c:read            # time the read() C library function
+    ./funclatency -u vfs_read       # time vfs_read(), in microseconds
+    ./funclatency -m do_nanosleep   # time do_nanosleep(), in milliseconds
+    ./funclatency -i 2 -d 10 c:open # output every 2 seconds, for duration 10s
+    ./funclatency -mTi 5 vfs_read   # output every 5 seconds, with timestamps
+    ./funclatency -p 181 vfs_read   # time process 181 only
+    ./funclatency 'vfs_fstat*'      # time both vfs_fstat() and vfs_fstatat()
+    ./funclatency 'c:*printf'       # time the *printf family of functions
+    ./funclatency -F 'vfs_r*'       # show one histogram per matched function
diff --git a/tools/funcslower.py b/tools/funcslower.py
new file mode 100755
index 0000000..261869e
--- /dev/null
+++ b/tools/funcslower.py
@@ -0,0 +1,333 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# funcslower  Trace slow kernel or user function calls.
+#             For Linux, uses BCC, eBPF.
+#
+# USAGE: funcslower [-h] [-p PID] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS]
+#                   [-T] [-t] [-f] [-U] [-K] [-v] function [function ...]
+#
+# WARNING: This tool traces function calls by instrumenting the entry and
+# return from each function. For commonly-invoked functions like memory allocs
+# or file writes, this can be extremely expensive. Mind the overhead.
+#
+# NOTE: This tool cannot trace nested functions in the same invocation
+# due to instrumentation specifics, only innermost calls will be visible.
+#
+# By default, a minimum millisecond threshold of 1 is used.
+#
+# Copyright 2017, Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 30-Mar-2017   Sasha Goldshtein    Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+import ctypes as ct
+import time
+
+examples = """examples:
+  ./funcslower vfs_write        # trace vfs_write calls slower than 1ms
+  ./funcslower -m 10 vfs_write  # same, but slower than 10ms
+  ./funcslower -u 10 c:open     # trace open calls slower than 10us
+  ./funcslower -p 135 c:open    # trace pid 135 only
+  ./funcslower c:malloc c:free  # trace both malloc and free slower than 1ms
+  ./funcslower -a 2 c:open      # show first two arguments to open
+  ./funcslower -UK -m 10 c:open # Show user and kernel stack frame of open calls slower than 10ms
+  ./funcslower -f -UK c:open    # Output in folded format for flame graphs
+"""
+parser = argparse.ArgumentParser(
+    description="Trace slow kernel or user function calls.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid",
+    help="trace this PID only")
+parser.add_argument("-m", "--min-ms", type=float, dest="min_ms",
+    help="minimum duration to trace (ms)")
+parser.add_argument("-u", "--min-us", type=float, dest="min_us",
+    help="minimum duration to trace (us)")
+parser.add_argument("-a", "--arguments", type=int,
+    help="print this many entry arguments, as hex")
+# -T prints wall-clock HH:MM:SS; -t prints seconds relative to first event
+parser.add_argument("-T", "--time", action="store_true",
+    help="show HH:MM:SS timestamp")
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="show timestamp in seconds at us resolution")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="print the BPF program for debugging purposes")
+parser.add_argument(metavar="function", nargs="+", dest="functions",
+    help="function(s) to trace")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format, one line per stack (for flame graphs)")
+parser.add_argument("-U", "--user-stack",
+  action="store_true", help="output user stack trace")
+parser.add_argument("-K", "--kernel-stack",
+  action="store_true", help="output kernel stack trace")
+
+args = parser.parse_args()
+# fractions are allowed, but rounded to an integer nanosecond
+# -m takes precedence if both -m and -u are supplied
+if args.min_ms:
+    duration_ns = int(args.min_ms * 1000000)
+elif args.min_us:
+    duration_ns = int(args.min_us * 1000)
+else:
+    duration_ns = 1000000   # default to 1ms
+
+# BPF program: on function entry, stash a timestamp (and, with -a, the first
+# six argument registers) keyed by thread id; on return, compute the latency,
+# drop events below DURATION_NS, and emit a perf event.  The TGID_FILTER and
+# DURATION_NS placeholders are substituted after this string.
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>    // for TASK_COMM_LEN
+
+struct entry_t {
+    u64 id;
+    u64 start_ns;
+#ifdef GRAB_ARGS
+    u64 args[6];
+#endif
+};
+
+struct data_t {
+    u64 id;
+    u64 tgid_pid;
+    u64 start_ns;
+    u64 duration_ns;
+    u64 retval;
+    char comm[TASK_COMM_LEN];
+#ifdef USER_STACKS
+    int user_stack_id;
+#endif
+#ifdef KERNEL_STACKS
+    int kernel_stack_id;
+    u64 kernel_ip;
+#endif
+#ifdef GRAB_ARGS
+    u64 args[6];
+#endif
+};
+
+BPF_HASH(entryinfo, u64, struct entry_t);
+BPF_PERF_OUTPUT(events);
+
+#if defined(USER_STACKS) || defined(KERNEL_STACKS)
+BPF_STACK_TRACE(stacks, 2048);
+#endif
+
+static int trace_entry(struct pt_regs *ctx, int id)
+{
+    u64 tgid_pid = bpf_get_current_pid_tgid();
+    u32 tgid = tgid_pid >> 32;
+    if (TGID_FILTER)
+        return 0;
+
+    u32 pid = tgid_pid;
+
+    struct entry_t entry = {};
+    entry.start_ns = bpf_ktime_get_ns();
+    entry.id = id;
+#ifdef GRAB_ARGS
+    entry.args[0] = PT_REGS_PARM1(ctx);
+    entry.args[1] = PT_REGS_PARM2(ctx);
+    entry.args[2] = PT_REGS_PARM3(ctx);
+    entry.args[3] = PT_REGS_PARM4(ctx);
+    entry.args[4] = PT_REGS_PARM5(ctx);
+    entry.args[5] = PT_REGS_PARM6(ctx);
+#endif
+
+    entryinfo.update(&tgid_pid, &entry);
+
+    return 0;
+}
+
+int trace_return(struct pt_regs *ctx)
+{
+    struct entry_t *entryp;
+    u64 tgid_pid = bpf_get_current_pid_tgid();
+
+    entryp = entryinfo.lookup(&tgid_pid);
+    if (entryp == 0) {
+        return 0;
+    }
+
+    u64 delta_ns = bpf_ktime_get_ns() - entryp->start_ns;
+    entryinfo.delete(&tgid_pid);
+
+    if (delta_ns < DURATION_NS)
+        return 0;
+
+    struct data_t data = {};
+    data.id = entryp->id;
+    data.tgid_pid = tgid_pid;
+    data.start_ns = entryp->start_ns;
+    data.duration_ns = delta_ns;
+    data.retval = PT_REGS_RC(ctx);
+
+#ifdef USER_STACKS
+    data.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
+#endif
+
+#ifdef KERNEL_STACKS
+    data.kernel_stack_id = stacks.get_stackid(ctx, 0);
+
+    if (data.kernel_stack_id >= 0) {
+        u64 ip = PT_REGS_IP(ctx);
+        u64 page_offset;
+
+        // if ip isn't sane, leave key ips as zero for later checking
+#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
+        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
+        page_offset = __PAGE_OFFSET_BASE;
+#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
+        // x64, 4.17, and later
+#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
+        page_offset = __PAGE_OFFSET_BASE_L5;
+#else
+        page_offset = __PAGE_OFFSET_BASE_L4;
+#endif
+#else
+        // earlier x86_64 kernels, e.g., 4.6, comes here
+        // arm64, s390, powerpc, x86_32
+        page_offset = PAGE_OFFSET;
+#endif
+
+        if (ip > page_offset) {
+            data.kernel_ip = ip;
+        }
+    }
+#endif
+
+#ifdef GRAB_ARGS
+    bpf_probe_read(&data.args[0], sizeof(data.args), entryp->args);
+#endif
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+"""
+
+# Substitute the configure-time placeholders and feature #defines.
+bpf_text = bpf_text.replace('DURATION_NS', str(duration_ns))
+if args.arguments:
+    bpf_text = "#define GRAB_ARGS\n" + bpf_text
+if args.user_stack:
+    bpf_text = "#define USER_STACKS\n" + bpf_text
+if args.kernel_stack:
+    bpf_text = "#define KERNEL_STACKS\n" + bpf_text
+if args.tgid:
+    bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
+else:
+    bpf_text = bpf_text.replace('TGID_FILTER', '0')
+
+# Generate one entry stub per traced function so the shared trace_entry()
+# can record which function (by index) was hit.
+for i in range(len(args.functions)):
+    bpf_text += """
+int trace_%d(struct pt_regs *ctx) {
+    return trace_entry(ctx, %d);
+}
+""" % (i, i)
+
+# Print the final generated program for debugging; --ebpf exits before
+# any probes are attached.
+if args.verbose or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+b = BPF(text=bpf_text)
+
+# Attach probes: "lib:func" means a user-space uprobe/uretprobe pair, a bare
+# name means a kernel kprobe/kretprobe pair.  Entry probes are the
+# per-function trace_<i> stubs; all returns share trace_return.
+for i, function in enumerate(args.functions):
+    if ":" in function:
+        library, func = function.split(":")
+        b.attach_uprobe(name=library, sym=func, fn_name="trace_%d" % i)
+        b.attach_uretprobe(name=library, sym=func, fn_name="trace_return")
+    else:
+        b.attach_kprobe(event=function, fn_name="trace_%d" % i)
+        b.attach_kretprobe(event=function, fn_name="trace_return")
+
+TASK_COMM_LEN = 16  # linux/sched.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("id", ct.c_ulonglong),
+        ("tgid_pid", ct.c_ulonglong),
+        ("start_ns", ct.c_ulonglong),
+        ("duration_ns", ct.c_ulonglong),
+        ("retval", ct.c_ulonglong),
+        ("comm", ct.c_char * TASK_COMM_LEN)
+    ] + ([("args", ct.c_ulonglong * 6)] if args.arguments else []) + \
+            ([("user_stack_id", ct.c_int)] if args.user_stack else []) + \
+            ([("kernel_stack_id", ct.c_int),("kernel_ip", ct.c_ulonglong)] if args.kernel_stack else [])
+
+# Unit label and divisor for the latency column: -u selects microseconds,
+# otherwise milliseconds (the default threshold is 1 ms).
+time_designator = "us" if args.min_us else "ms"
+time_value = args.min_us or args.min_ms or 1
+time_multiplier = 1000 if args.min_us else 1000000
+time_col = args.time or args.timestamp
+
+# Do not print header when folded
+if not args.folded:
+    print("Tracing function calls slower than %g %s... Ctrl+C to quit." %
+          (time_value, time_designator))
+    print((("%-10s " % "TIME" if time_col else "") + "%-14s %-6s %7s %16s %s") %
+        ("COMM", "PID", "LAT(%s)" % time_designator, "RVAL",
+        "FUNC" + (" ARGS" if args.arguments else "")))
+
+# baseline for -t relative timestamps; set from the first event seen
+earliest_ts = 0
+
+def time_str(event):
+    # Leading time column: -T gives wall-clock HH:MM:SS; -t gives seconds
+    # (us resolution) relative to the first event seen; otherwise empty.
+    if args.time:
+        return "%-10s " % time.strftime("%H:%M:%S")
+    if args.timestamp:
+        global earliest_ts
+        if earliest_ts == 0:
+            earliest_ts = event.start_ns
+        return "%-10.6f " % ((event.start_ns - earliest_ts) / 1000000000.0)
+    return ""
+
+def args_str(event):
+    # Hex-format the first -a captured entry arguments, space separated;
+    # empty when -a was not given.
+    if not args.arguments:
+        return ""
+    return str.join(" ", ["0x%x" % arg for arg in event.args[:args.arguments]])
+
+def print_stack(event):
+    # Resolve and print the captured stack(s): folded single-line form for
+    # flame graphs (-f), or indented multi-line form otherwise.
+    user_stack = []
+    stack_traces = b.get_table("stacks")
+
+    if args.user_stack and event.user_stack_id > 0:
+        user_stack = stack_traces.walk(event.user_stack_id)
+
+    kernel_stack = []
+    if args.kernel_stack and event.kernel_stack_id > 0:
+        kernel_tmp = stack_traces.walk(event.kernel_stack_id)
+
+        # fix kernel stack
+        for addr in kernel_tmp:
+            kernel_stack.append(addr)
+
+    # NOTE(review): user_stack may still be the (always-truthy) walk object
+    # here, so the "-" delimiter can be emitted even if the walked user
+    # stack turns out to be empty - confirm whether that is intended.
+    do_delimiter = user_stack and kernel_stack
+
+    if args.folded:
+        # print folded stack output
+        user_stack = list(user_stack)
+        kernel_stack = list(kernel_stack)
+        line = [event.comm.decode('utf-8', 'replace')] + \
+            [b.sym(addr, event.tgid_pid) for addr in reversed(user_stack)] + \
+            (do_delimiter and ["-"] or []) + \
+            [b.ksym(addr) for addr in reversed(kernel_stack)]
+        print("%s %d" % (";".join(line), 1))
+    else:
+        # print default multi-line stack output.
+        for addr in kernel_stack:
+            print("    %s" % b.ksym(addr))
+        for addr in user_stack:
+            print("    %s" % b.sym(addr, event.tgid_pid))
+
+def print_event(cpu, data, size):
+    # Perf-buffer callback: decode one data_t event and print the table row
+    # (suppressed when -f folded output was requested) plus any stacks.
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    ts = float(event.duration_ns) / time_multiplier
+    if not args.folded:
+        print((time_str(event) + "%-14.14s %-6s %7.2f %16x %s %s") %
+            (event.comm.decode('utf-8', 'replace'), event.tgid_pid >> 32,
+             ts, event.retval, args.functions[event.id], args_str(event)))
+    if args.user_stack or args.kernel_stack:
+        print_stack(event)
+
+# Poll forever; the tool is terminated by Ctrl-C (KeyboardInterrupt).
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while True:
+    b.perf_buffer_poll()
diff --git a/tools/funcslower_example.txt b/tools/funcslower_example.txt
new file mode 100644
index 0000000..86524c2
--- /dev/null
+++ b/tools/funcslower_example.txt
@@ -0,0 +1,150 @@
+Demonstrations of funcslower, the Linux eBPF/bcc version.
+
+
+funcslower shows kernel or user function invocations slower than a threshold.
+This can be used for last-resort diagnostics when aggregation-based tools have
+failed. For example, trace the open() function in libc when it is slower than
+1 microsecond (us):
+
+# ./funcslower c:open -u 1
+Tracing function calls slower than 1 us... Ctrl+C to quit.
+COMM           PID    LAT(us)             RVAL FUNC
+less           27074    33.77                3 c:open 
+less           27074     9.96 ffffffffffffffff c:open 
+less           27074     5.92 ffffffffffffffff c:open 
+less           27074    15.88 ffffffffffffffff c:open 
+less           27074     8.89                3 c:open 
+less           27074    15.89                3 c:open 
+sh             27075    20.97                4 c:open 
+bash           27075    20.14                4 c:open 
+lesspipe.sh    27075    18.77                4 c:open 
+lesspipe.sh    27075    11.21                4 c:open 
+lesspipe.sh    27075    13.68                4 c:open 
+file           27076    14.83 ffffffffffffffff c:open 
+file           27076     8.02                4 c:open 
+file           27076    10.26                4 c:open 
+file           27076     6.55                4 c:open 
+less           27074    11.67                4 c:open 
+^C
+
+This shows several open operations performed by less and some helpers it invoked
+in the process. The latency (in microseconds) is shown, as well as the return
+value from the open() function, which helps indicate if there is a correlation
+between failures and slow invocations. Most open() calls seemed to have 
+completed successfully (returning a valid file descriptor), but some have failed
+and returned -1.
+
+You can also trace kernel functions:
+
+# ./funcslower -m 10 vfs_read
+Tracing function calls slower than 10 ms... Ctrl+C to quit.
+COMM           PID    LAT(ms)             RVAL FUNC
+bash           11527    78.97                1 vfs_read 
+bash           11527   101.26                1 vfs_read 
+bash           11527  1053.60                1 vfs_read 
+bash           11527    44.21                1 vfs_read 
+bash           11527    79.50                1 vfs_read 
+bash           11527    33.37                1 vfs_read 
+bash           11527   112.17                1 vfs_read 
+bash           11527   101.49                1 vfs_read 
+^C
+
+Occasionally, it is also useful to see the arguments passed to the functions.
+The raw hex values of the arguments are available when using the -a switch:
+
+# ./funcslower __kmalloc -a 2 -u 1
+Tracing function calls slower than 1 us... Ctrl+C to quit.
+COMM           PID    LAT(us)             RVAL FUNC ARGS
+kworker/0:2    27077     7.46 ffff90054f9f8e40 __kmalloc 0x98 0x1400000
+kworker/0:2    27077     6.84 ffff90054f9f8e40 __kmalloc 0x98 0x1400000
+bash           11527     6.87 ffff90054f9f8e40 __kmalloc 0x90 0x1408240
+bash           11527     1.15 ffff90054f9f8e40 __kmalloc 0x90 0x1408240
+bash           11527     1.15 ffff90055a1b8c00 __kmalloc 0x2c 0x1400240
+bash           11527     1.18 ffff90054b87d240 __kmalloc 0x1c 0x1400040
+bash           11527    10.59 ffff900546d60000 __kmalloc 0x10000 0x14082c0
+bash           11527     1.49 ffff90054fbd4c00 __kmalloc 0x280 0x15080c0
+bash           11527     1.00 ffff90054789b000 __kmalloc 0x800 0x15012c0
+bash           27128     3.47 ffff90057ca1a200 __kmalloc 0x150 0x1400240
+bash           27128     1.82 ffff90054fbd4c00 __kmalloc 0x230 0x14000c0
+bash           27128     1.17 ffff90054b87d5a0 __kmalloc 0x1c 0x14000c0
+perf           27128     4.81 ffff90054f9f8e40 __kmalloc 0x90 0x1408240
+perf           27128    24.71 ffff900566990000 __kmalloc 0x10000 0x14082c0
+^C
+
+This shows the first two arguments to __kmalloc -- the first one is the size
+of the requested allocation. The return value is also shown (null return values
+would indicate a failure).
+
+# ./funcslower -U -m 30 '/usr/sbin/nginx:database_write'
+Tracing function calls slower than 30 ms... Ctrl+C to quit.
+COMM           PID    LAT(ms)             RVAL FUNC
+nginx          1617     30.15                9 /usr/sbin/nginx:database_write
+    DataBaseProvider::setData(std::string const&, record_s&)
+    UserDataProvider::saveRecordData(RecordData const&)
+    RequestProcessor::writeResponse(int)
+    RequestProcessor::processRequest()
+    RequestRouter::processRequest(RequestWrapper*, ResponseWrapper*)
+    ngx_http_core_content_phase
+    ngx_http_core_run_phases
+    ngx_http_process_request
+    ngx_process_events_and_timers
+    ngx_spawn_process
+    ngx_master_process_cycle
+    main
+    __libc_start_main
+    [unknown]
+nginx          1629     30.14                9 /usr/sbin/nginx:database_write
+    DataBaseProvider::setData(std::string const&, record_s&)
+    UserDataProvider::saveRecordData(RecordData const&)
+    RequestProcessor::writeResponse(int)
+    RequestProcessor::processRequest()
+    RequestRouter::processRequest(RequestWrapper*, ResponseWrapper*)
+    ngx_http_core_content_phase
+    ngx_http_core_run_phases
+    ngx_http_process_request
+    ngx_process_events_and_timers
+    ngx_spawn_process
+    ngx_master_process_cycle
+    main
+    __libc_start_main
+    [unknown]
+^C
+
+Shows the user-space stack trace of calls to the user-space function
+database_write taking longer than 30 ms.
+
+USAGE message:
+
+usage: funcslower.py [-hf] [-p PID] [-U | -K] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS] [-T]
+                     [-t] [-v]
+                     function [function ...]
+
+Trace slow kernel or user function calls.
+
+positional arguments:
+  function              function(s) to trace
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     trace this PID only
+  -m MIN_MS, --min-ms MIN_MS
+                        minimum duration to trace (ms)
+  -u MIN_US, --min-us MIN_US
+                        minimum duration to trace (us)
+  -U, --user-stack
+                        show stacks from user space
+  -K, --kernel-stack
+                        show stacks from kernel space
+  -f                    print output in folded stack format.
+  -a ARGUMENTS, --arguments ARGUMENTS
+                        print this many entry arguments, as hex
+  -T, --time            show HH:MM:SS timestamp
+  -t, --timestamp       show timestamp in seconds at us resolution
+  -v, --verbose         print the BPF program for debugging purposes
+
+examples:
+  ./funcslower vfs_write       # trace vfs_write calls slower than 1ms
+  ./funcslower -m 10 vfs_write # same, but slower than 10ms
+  ./funcslower -u 10 c:open    # trace open calls slower than 10us
+  ./funcslower -p 135 c:open   # trace pid 135 only
+  ./funcslower c:malloc c:free # trace both malloc and free slower than 1ms
+  ./funcslower -a 2 c:open     # show first two arguments to open
diff --git a/tools/gethostlatency.py b/tools/gethostlatency.py
new file mode 100755
index 0000000..3a967ae
--- /dev/null
+++ b/tools/gethostlatency.py
@@ -0,0 +1,138 @@
+#!/usr/bin/python
+#
+# gethostlatency  Show latency for getaddrinfo/gethostbyname[2] calls.
+#                 For Linux, uses BCC, eBPF. Embedded C.
+#
+# This can be useful for identifying DNS latency, by identifying which
+# remote host name lookups were slow, and by how much.
+#
+# This uses dynamic tracing of user-level functions and registers, and may
+# need modifications to match your software and processor architecture.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 28-Jan-2016    Brendan Gregg   Created this.
+# 30-Mar-2016   Allan McAleavy updated for BPF_PERF_OUTPUT
+
+from __future__ import print_function
+from bcc import BPF
+from time import strftime
+import argparse
+import ctypes as ct
+
+examples = """examples:
+    ./gethostlatency           # trace all getaddrinfo/gethostbyname[2] calls
+    ./gethostlatency -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Show latency for getaddrinfo/gethostbyname[2] calls",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid", help="trace this PID only", type=int,
+    default=-1)
+# --ebpf is hidden from --help; it dumps the generated BPF C program.
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+# load BPF program
+# do_entry records a timestamped val_t (pid, comm, and the first
+# function argument, read as the hostname string) keyed by thread id;
+# do_return looks the record up, computes the latency delta, submits a
+# data_t event through the perf buffer, and deletes the start entry.
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct val_t {
+    u32 pid;
+    char comm[TASK_COMM_LEN];
+    char host[80];
+    u64 ts;
+};
+
+struct data_t {
+    u32 pid;
+    u64 delta;
+    char comm[TASK_COMM_LEN];
+    char host[80];
+};
+
+BPF_HASH(start, u32, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+int do_entry(struct pt_regs *ctx) {
+    if (!PT_REGS_PARM1(ctx))
+        return 0;
+
+    struct val_t val = {};
+    u32 pid = bpf_get_current_pid_tgid();
+
+    if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) == 0) {
+        bpf_probe_read(&val.host, sizeof(val.host),
+                       (void *)PT_REGS_PARM1(ctx));
+        val.pid = bpf_get_current_pid_tgid();
+        val.ts = bpf_ktime_get_ns();
+        start.update(&pid, &val);
+    }
+
+    return 0;
+}
+
+int do_return(struct pt_regs *ctx) {
+    struct val_t *valp;
+    struct data_t data = {};
+    u64 delta;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    u64 tsp = bpf_ktime_get_ns();
+
+    valp = start.lookup(&pid);
+    if (valp == 0)
+        return 0;       // missed start
+
+    bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
+    bpf_probe_read(&data.host, sizeof(data.host), (void *)valp->host);
+    data.pid = valp->pid;
+    data.delta = tsp - valp->ts;
+    events.perf_submit(ctx, &data, sizeof(data));
+    start.delete(&pid);
+    return 0;
+}
+"""
+# With --ebpf, just dump the generated C program and exit (debug aid).
+if args.ebpf:
+    print(bpf_text)
+    exit()
+
+b = BPF(text=bpf_text)
+# Attach entry and return probes to all three libc resolver functions,
+# so lookups through any of them are timed.
+b.attach_uprobe(name="c", sym="getaddrinfo", fn_name="do_entry", pid=args.pid)
+b.attach_uprobe(name="c", sym="gethostbyname", fn_name="do_entry",
+                pid=args.pid)
+b.attach_uprobe(name="c", sym="gethostbyname2", fn_name="do_entry",
+                pid=args.pid)
+b.attach_uretprobe(name="c", sym="getaddrinfo", fn_name="do_return",
+                   pid=args.pid)
+b.attach_uretprobe(name="c", sym="gethostbyname", fn_name="do_return",
+                   pid=args.pid)
+b.attach_uretprobe(name="c", sym="gethostbyname2", fn_name="do_return",
+                   pid=args.pid)
+
+TASK_COMM_LEN = 16    # linux/sched.h
+
+class Data(ct.Structure):
+    # Python-side mirror of the BPF struct data_t above; field order and
+    # sizes must match the C layout.
+    # NOTE(review): the C side declares pid as u32, but c_ulonglong is
+    # used here — this relies on the u32->u64 alignment padding before
+    # `delta`; confirm on the target ABI.
+    _fields_ = [
+        ("pid", ct.c_ulonglong),
+        ("delta", ct.c_ulonglong),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+        ("host", ct.c_char * 80)
+    ]
+
+# header
+print("%-9s %-6s %-16s %10s %s" % ("TIME", "PID", "COMM", "LATms", "HOST"))
+
+def print_event(cpu, data, size):
+    # Perf-buffer callback: decode one event and print wall-clock time,
+    # pid, comm, latency (delta ns -> ms), and the looked-up host name.
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%-9s %-6d %-16s %10.2f %s" % (strftime("%H:%M:%S"), event.pid,
+        event.comm.decode('utf-8', 'replace'), (float(event.delta) / 1000000),
+        event.host.decode('utf-8', 'replace')))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/gethostlatency_example.txt b/tools/gethostlatency_example.txt
new file mode 100644
index 0000000..debb2df
--- /dev/null
+++ b/tools/gethostlatency_example.txt
@@ -0,0 +1,37 @@
+Demonstrations of gethostlatency, the Linux eBPF/bcc version.
+
+
+This traces host name lookup calls (getaddrinfo(), gethostbyname(), and
+gethostbyname2()), and shows the PID and command performing the lookup, the
+latency (duration) of the call in milliseconds, and the host string:
+
+# ./gethostlatency
+TIME      PID    COMM          LATms HOST
+06:10:24  28011  wget          90.00 www.iovisor.org
+06:10:28  28127  wget           0.00 www.iovisor.org
+06:10:41  28404  wget           9.00 www.netflix.com
+06:10:48  28544  curl          35.00 www.netflix.com.au
+06:11:10  29054  curl          31.00 www.plumgrid.com
+06:11:16  29195  curl           3.00 www.facebook.com
+06:11:25  29404  curl          72.00 foo
+06:11:28  29475  curl           1.00 foo
+
+In this example, the first call to lookup "www.iovisor.org" took 90 ms, and
+the second took 0 ms (cached). The slowest call in this example was to "foo",
+which was an unsuccessful lookup.
+
+
+USAGE message:
+
+# ./gethostlatency -h
+usage: gethostlatency [-h] [-p PID]
+
+Show latency for getaddrinfo/gethostbyname[2] calls
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./gethostlatency           # trace all getaddrinfo/gethostbyname[2] calls
+    ./gethostlatency -p 181    # only trace PID 181
diff --git a/tools/hardirqs.py b/tools/hardirqs.py
new file mode 100755
index 0000000..589a890
--- /dev/null
+++ b/tools/hardirqs.py
@@ -0,0 +1,180 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# hardirqs  Summarize hard IRQ (interrupt) event time.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: hardirqs [-h] [-T] [-N] [-C] [-d] [interval] [outputs]
+#
+# Thanks Amer Ather for help understanding irq behavior.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 19-Oct-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+    ./hardirqs            # sum hard irq event time
+    ./hardirqs -d         # show hard irq event time as histograms
+    ./hardirqs 1 10       # print 1 second summaries, 10 times
+    ./hardirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize hard irq event time as histograms",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-N", "--nanoseconds", action="store_true",
+    help="output in nanoseconds")
+parser.add_argument("-C", "--count", action="store_true",
+    help="show event counts instead of timing")
+parser.add_argument("-d", "--dist", action="store_true",
+    help="show distributions as histograms")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("outputs", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+countdown = int(args.outputs)
+# -C counts events rather than timing them, so it is incompatible with
+# the time-based -d/-N options.
+if args.count and (args.dist or args.nanoseconds):
+    print("The --count option can't be used with time-based options")
+    exit()
+# factor divides the nanosecond deltas into the displayed unit; label
+# names that unit in the output header.
+if args.count:
+    factor = 1
+    label = "count"
+elif args.nanoseconds:
+    factor = 1
+    label = "nsecs"
+else:
+    factor = 1000
+    label = "usecs"
+debug = 0
+
+# define BPF program
+# Two modes: count_only just increments a per-IRQ-name entry in `dist`;
+# the trace_start/trace_completion pair timestamps entry to the handler
+# (keyed by thread id, with the irq_desc pointer saved alongside) and on
+# return folds the elapsed time into `dist` via the STORE placeholder,
+# which is substituted below with either a sum or a log2 histogram.
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/interrupt.h>
+
+typedef struct irq_key {
+    char name[32];
+    u64 slot;
+} irq_key_t;
+BPF_HASH(start, u32);
+BPF_HASH(irqdesc, u32, struct irq_desc *);
+BPF_HISTOGRAM(dist, irq_key_t);
+
+// count IRQ
+int count_only(struct pt_regs *ctx, struct irq_desc *desc)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+
+    struct irqaction *action = desc->action;
+    char *name = (char *)action->name;
+
+    irq_key_t key = {.slot = 0 /* ignore */};
+    bpf_probe_read(&key.name, sizeof(key.name), name);
+    dist.increment(key);
+
+    return 0;
+}
+
+// time IRQ
+int trace_start(struct pt_regs *ctx, struct irq_desc *desc)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    irqdesc.update(&pid, &desc);
+    return 0;
+}
+
+int trace_completion(struct pt_regs *ctx)
+{
+    u64 *tsp, delta;
+    struct irq_desc **descp;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    descp = irqdesc.lookup(&pid);
+    if (tsp == 0 || descp == 0) {
+        return 0;   // missed start
+    }
+    struct irq_desc *desc = *descp;
+    struct irqaction *action = desc->action;
+    char *name = (char *)action->name;
+    delta = bpf_ktime_get_ns() - *tsp;
+
+    // store as sum or histogram
+    STORE
+
+    start.delete(&pid);
+    irqdesc.delete(&pid);
+    return 0;
+}
+"""
+
+# code substitutions
+# -d: store a log2(latency/factor) histogram slot per IRQ name;
+# otherwise accumulate the latency sum per IRQ name in slot 0.
+if args.dist:
+    bpf_text = bpf_text.replace('STORE',
+        'irq_key_t key = {.slot = bpf_log2l(delta / %d)};' % factor +
+        'bpf_probe_read(&key.name, sizeof(key.name), name);' +
+        'dist.increment(key);')
+else:
+    bpf_text = bpf_text.replace('STORE',
+        'irq_key_t key = {.slot = 0 /* ignore */};' +
+        'bpf_probe_read(&key.name, sizeof(key.name), name);' +
+        'dist.increment(key, delta);')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+# these should really use irq:irq_handler_entry/exit tracepoints:
+if args.count:
+    b.attach_kprobe(event="handle_irq_event_percpu", fn_name="count_only")
+    print("Tracing hard irq events... Hit Ctrl-C to end.")
+else:
+    b.attach_kprobe(event="handle_irq_event_percpu", fn_name="trace_start")
+    b.attach_kretprobe(event="handle_irq_event_percpu",
+        fn_name="trace_completion")
+    print("Tracing hard irq event time... Hit Ctrl-C to end.")
+
+# output
+# Print a summary every `interval` seconds until `countdown` outputs
+# have been produced or the user interrupts with Ctrl-C.
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    if args.dist:
+        dist.print_log2_hist(label, "hardirq")
+    else:
+        print("%-26s %11s" % ("HARDIRQ", "TOTAL_" + label))
+        # ascending by accumulated value; note the lambda parameter
+        # shadows the outer `dist` table name (harmless here).
+        for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
+            print("%-26s %11d" % (k.name.decode('utf-8', 'replace'), v.value / factor))
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/hardirqs_example.txt b/tools/hardirqs_example.txt
new file mode 100644
index 0000000..d7dcc9c
--- /dev/null
+++ b/tools/hardirqs_example.txt
@@ -0,0 +1,672 @@
+Demonstrations of hardirqs, the Linux eBPF/bcc version.
+
+
+This program traces hard interrupts (irqs), and stores timing statistics
+in-kernel for efficiency. For example:
+
+# ./hardirqs
+Tracing hard irq event time... Hit Ctrl-C to end.
+^C
+HARDIRQ                    TOTAL_usecs
+callfuncsingle0                      2
+callfuncsingle5                      5
+callfuncsingle6                      5
+callfuncsingle7                     21
+blkif                               66
+timer7                              84
+resched5                            94
+resched0                            97
+resched3                           102
+resched7                           111
+resched6                           255
+timer3                             362
+resched4                           367
+timer5                             474
+timer1                             529
+timer6                             679
+timer2                             746
+timer4                             943
+resched1                          1048
+timer0                            1558
+resched2                          1750
+eth0                             11441
+
+The HARDIRQ column prints the interrupt action name. While tracing, the eth0
+hard irq action ran for 11441 microseconds (11 milliseconds) in total.
+
+Many other interrupts are visible in the output: this is an 8 CPU system, and
+some of these interrupts have a separate action per-CPU (eg, "timer",
+"resched").
+
+
+An interval can be provided, and also optionally a count. Eg, printing output
+every 1 second, and including timestamps (-T):
+
+# ./hardirqs -T 1 3
+Tracing hard irq event time... Hit Ctrl-C to end.
+
+22:16:14
+HARDIRQ                    TOTAL_usecs
+callfuncsingle0                      2
+callfuncsingle7                      5
+callfuncsingle3                      5
+callfuncsingle2                      5
+callfuncsingle6                      6
+callfuncsingle1                     11
+resched0                            32
+blkif                               51
+resched5                            71
+resched7                            71
+resched4                            72
+resched6                            82
+timer7                             172
+resched1                           187
+resched2                           236
+timer3                             252
+resched3                           282
+timer1                             320
+timer2                             374
+timer6                             396
+timer5                             427
+timer4                             470
+timer0                            1430
+eth0                              7498
+
+22:16:15
+HARDIRQ                    TOTAL_usecs
+callfuncsingle7                      6
+callfuncsingle5                     11
+callfuncsingle4                     13
+timer2                              17
+callfuncsingle6                     18
+resched0                            21
+blkif                               33
+resched3                            40
+resched5                            60
+resched4                            69
+resched6                            70
+resched7                            74
+timer7                              86
+resched2                            91
+timer3                             134
+resched1                           293
+timer5                             354
+timer1                             433
+timer6                             497
+timer4                            1112
+timer0                            1768
+eth0                              6972
+
+22:16:16
+HARDIRQ                    TOTAL_usecs
+callfuncsingle7                      5
+callfuncsingle3                      5
+callfuncsingle2                      6
+timer3                              10
+resched0                            18
+callfuncsingle4                     22
+resched5                            27
+resched6                            44
+blkif                               45
+resched7                            65
+resched4                            69
+timer4                              77
+resched2                            97
+timer7                              98
+resched3                           103
+timer2                             169
+resched1                           226
+timer5                             525
+timer1                             691
+timer6                             697
+timer0                            1415
+eth0                              7152
+
+This can be useful for quantifying where CPU cycles are spent among the hard
+interrupts (summarized as the %irq column from mpstat(1)). The output above
+shows that most time was spent processing for eth0 (network interface), which
+was around 7 milliseconds per second (total across all CPUs).
+
+Note that the time spent among the "timer" interrupts was low, and usually less
+than one microsecond per second. Here's the hardirq per-second output when the
+perf tool is performing a 999 Hertz CPU profile ("perf record -F 999 -a ..."):
+
+22:13:59
+HARDIRQ                    TOTAL_usecs
+callfuncsingle7                      5
+callfuncsingle5                      5
+callfuncsingle3                      6
+callfuncsingle4                      7
+callfuncsingle6                     19
+blkif                               66
+resched0                            66
+resched2                            82
+resched7                            87
+resched3                            96
+resched4                           118
+resched5                           120
+resched6                           130
+resched1                           230
+timer3                             946
+timer1                            1981
+timer7                            2618
+timer5                            3063
+timer6                            3141
+timer4                            3511
+timer2                            3554
+timer0                            5044
+eth0                             16015
+
+This sheds some light into the CPU overhead of the perf profiler, which cost
+around 3 milliseconds per second. Note that I'm usually profiling at a much
+lower rate, 99 Hertz, which looks like this:
+
+22:22:12
+HARDIRQ                    TOTAL_usecs
+callfuncsingle3                      5
+callfuncsingle6                      5
+callfuncsingle5                     22
+blkif                               46
+resched6                            47
+resched5                            57
+resched4                            66
+resched7                            78
+resched2                            97
+resched0                           214
+timer2                             326
+timer0                             498
+timer5                             536
+timer6                             576
+timer1                             600
+timer4                             982
+resched1                          1315
+timer7                            1364
+timer3                            1825
+resched3                          5708
+eth0                              9743
+
+Much lower (and remember to compare this to the baseline). Note that perf has
+other overheads (non-irq CPU cycles, file system storage).
+
+
+The distribution of interrupt run time can be printed as a histogram with the -d
+option. Eg:
+
+# ./hardirqs -d
+Tracing hard irq event time... Hit Ctrl-C to end.
+^C
+
+hardirq = 'callfuncsingle1'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 0        |                                        |
+      8192 -> 16383      : 1        |****************************************|
+
+hardirq = 'callfuncsingle0'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 1        |****************************************|
+
+hardirq = 'callfuncsingle3'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 3        |****************************************|
+
+hardirq = 'callfuncsingle2'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 2        |****************************************|
+
+hardirq = 'callfuncsingle5'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 5        |****************************************|
+
+hardirq = 'callfuncsingle4'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 6        |****************************************|
+
+hardirq = 'callfuncsingle7'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 4        |****************************************|
+
+hardirq = 'callfuncsingle6'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 4        |****************************************|
+
+hardirq = 'eth0'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 5102     |*********                               |
+      1024 -> 2047       : 20617    |****************************************|
+      2048 -> 4095       : 4832     |*********                               |
+      4096 -> 8191       : 12       |                                        |
+
+hardirq = 'timer7'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 9        |***                                     |
+      2048 -> 4095       : 70       |*****************************           |
+      4096 -> 8191       : 94       |****************************************|
+
+hardirq = 'timer6'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 1        |                                        |
+      2048 -> 4095       : 86       |***********                             |
+      4096 -> 8191       : 295      |****************************************|
+      8192 -> 16383      : 28       |***                                     |
+
+hardirq = 'timer5'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 137      |****************************************|
+      4096 -> 8191       : 123      |***********************************     |
+      8192 -> 16383      : 8        |**                                      |
+
+hardirq = 'timer4'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 46       |*********                               |
+      4096 -> 8191       : 198      |****************************************|
+      8192 -> 16383      : 49       |*********                               |
+
+hardirq = 'timer3'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 4        |                                        |
+      2048 -> 4095       : 210      |****************************************|
+      4096 -> 8191       : 186      |***********************************     |
+
+hardirq = 'timer2'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 245      |****************************************|
+      4096 -> 8191       : 227      |*************************************   |
+      8192 -> 16383      : 6        |                                        |
+
+hardirq = 'timer1'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 6        |*                                       |
+      2048 -> 4095       : 112      |************************                |
+      4096 -> 8191       : 181      |****************************************|
+      8192 -> 16383      : 7        |*                                       |
+
+hardirq = 'timer0'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 887      |****************************************|
+      8192 -> 16383      : 92       |****                                    |
+
+hardirq = 'blkif'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 9        |****************************************|
+      8192 -> 16383      : 7        |*******************************         |
+     16384 -> 32767      : 2        |********                                |
+
+hardirq = 'resched4'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 104      |****************************************|
+      2048 -> 4095       : 80       |******************************          |
+
+hardirq = 'resched5'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 27       |*****                                   |
+      1024 -> 2047       : 216      |****************************************|
+      2048 -> 4095       : 27       |*****                                   |
+      4096 -> 8191       : 1        |                                        |
+
+hardirq = 'resched6'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 480      |*******************                     |
+      1024 -> 2047       : 1003     |****************************************|
+      2048 -> 4095       : 64       |**                                      |
+
+hardirq = 'resched7'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 46       |*********                               |
+      1024 -> 2047       : 190      |****************************************|
+      2048 -> 4095       : 42       |********                                |
+
+hardirq = 'resched0'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 11       |****                                    |
+      1024 -> 2047       : 100      |****************************************|
+      2048 -> 4095       : 23       |*********                               |
+
+hardirq = 'resched1'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 96       |********                                |
+      1024 -> 2047       : 462      |****************************************|
+      2048 -> 4095       : 36       |***                                     |
+
+hardirq = 'resched2'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 120      |**************************              |
+      1024 -> 2047       : 183      |****************************************|
+      2048 -> 4095       : 41       |********                                |
+
+hardirq = 'resched3'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 789      |****************************************|
+      2048 -> 4095       : 39       |*                                       |
+
+
+Sometimes you just want counts of events, and don't need the distribution
+of times. You can use the -C or --count option:
+
+# ./hardirqs.py -C
+Tracing hard irq events... Hit Ctrl-C to end.
+^C
+HARDIRQ                    TOTAL_count
+blkif                                2
+callfuncsingle3                      8
+callfuncsingle2                     10
+callfuncsingle1                     18
+resched7                            25
+callfuncsingle6                     25
+callfuncsingle5                     27
+callfuncsingle0                     27
+eth0                                34
+resched2                            40
+resched1                            66
+timer7                              70
+resched6                            71
+resched0                            73
+resched5                            79
+resched4                            90
+timer6                              95
+timer4                             100
+timer1                             109
+timer2                             115
+timer0                             117
+timer3                             123
+resched3                           140
+timer5                             288
+
+
+USAGE message:
+
+# ./hardirqs -h
+usage: hardirqs [-h] [-T] [-N] [-C] [-d] [interval] [outputs]
+
+Summarize hard irq event time as histograms
+
+positional arguments:
+  interval           output interval, in seconds
+  outputs            number of outputs
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -T, --timestamp    include timestamp on output
+  -N, --nanoseconds  output in nanoseconds
+  -C, --count        show event counts instead of timing
+  -d, --dist         show distributions as histograms
+
+examples:
+    ./hardirqs            # sum hard irq event time
+    ./hardirqs -d         # show hard irq event time as histograms
+    ./hardirqs 1 10       # print 1 second summaries, 10 times
+    ./hardirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
diff --git a/tools/inject.py b/tools/inject.py
new file mode 100755
index 0000000..031679b
--- /dev/null
+++ b/tools/inject.py
@@ -0,0 +1,503 @@
+#!/usr/bin/env python
+#
+# This script generates a BPF program with structure inspired by trace.py. The
+# generated program operates on PID-indexed stacks. Generally speaking,
+# bookkeeping is done at every intermediate function kprobe/kretprobe to enforce
+# the goal of "fail iff this call chain and these predicates".
+#
+# Top level functions (the ones at the end of the call chain) are responsible for
+# creating the pid_struct and deleting it from the map in kprobe and kretprobe
+# respectively.
+#
+# Intermediate functions (between should_fail_whatever and the top level
+# functions) are responsible for updating the stack to indicate "I have been
+# called and one of my predicate(s) passed" in their entry probes. In their exit
+# probes, they do the opposite, popping their stack to maintain correctness.
+# This implementation aims to ensure correctness in edge cases like recursive
+# calls, so there's some additional information stored in pid_struct for that.
+#
+# At the bottom level function (should_fail_whatever), we do a simple check to
+# ensure all necessary calls/predicates have passed before error injection.
+#
+# Note: presently there are a few hacks to get around various rewriter/verifier
+# issues.
+#
+# Note: this tool requires:
+# - CONFIG_BPF_KPROBE_OVERRIDE
+#
+# USAGE: inject [-h] [-I header] [-P probability] [-v] mode spec
+#
+# Copyright (c) 2018 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 16-Mar-2018   Howard McLauchlan   Created this.
+
+import argparse
+import re
+from bcc import BPF
+
+
+class Probe:
+    errno_mapping = {
+        "kmalloc": "-ENOMEM",
+        "bio": "-EIO",
+    }
+
+    @classmethod
+    def configure(cls, mode, probability):
+        cls.mode = mode
+        cls.probability = probability
+
+    def __init__(self, func, preds, length, entry):
+        # length of call chain
+        self.length = length
+        self.func = func
+        self.preds = preds
+        self.is_entry = entry
+
+    def _bail(self, err):
+        raise ValueError("error in probe '%s': %s" %
+                (self.spec, err))
+
+    def _get_err(self):
+        return Probe.errno_mapping[Probe.mode]
+
+    def _get_if_top(self):
+        # ordering guarantees that if this function is top, the last tup is top
+        chk = self.preds[0][1] == 0
+        if not chk:
+            return ""
+
+        if Probe.probability == 1:
+            early_pred = "false"
+        else:
+            early_pred = "bpf_get_prandom_u32() > %s" % str(int((1<<32)*Probe.probability))
+        # init the map
+        # don't do an early exit here so the singular case works automatically
+        # have an early exit for probability option
+        enter = """
+        /*
+         * Early exit for probability case
+         */
+        if (%s)
+               return 0;
+        /*
+         * Top level function init map
+         */
+        struct pid_struct p_struct = {0, 0};
+        m.insert(&pid, &p_struct);
+        """ % early_pred
+
+        # kill the entry
+        exit = """
+        /*
+         * Top level function clean up map
+         */
+        m.delete(&pid);
+        """
+
+        return enter if self.is_entry else exit
+
+    def _get_heading(self):
+
+        # we need to insert identifier and ctx into self.func
+        # going to make a lot of formatting assumptions to make this work
+        left = self.func.find("(")
+        right = self.func.rfind(")")
+
+        # self.event and self.func_name need to be accessible
+        self.event = self.func[0:left]
+        self.func_name = self.event + ("_entry" if self.is_entry else "_exit")
+        func_sig = "struct pt_regs *ctx"
+
+        # assume there's something in there, no guarantee it's well formed
+        if right > left + 1 and self.is_entry:
+            func_sig += ", " + self.func[left + 1:right]
+
+        return "int %s(%s)" % (self.func_name, func_sig)
+
+    def _get_entry_logic(self):
+        # there is at least one tup(pred, place) for this function
+        text = """
+
+        if (p->conds_met >= %s)
+                return 0;
+        if (p->conds_met == %s && %s) {
+                p->stack[%s] = p->curr_call;
+                p->conds_met++;
+        }"""
+        text = text % (self.length, self.preds[0][1], self.preds[0][0],
+                self.preds[0][1])
+
+        # for each additional pred
+        for tup in self.preds[1:]:
+            text += """
+        else if (p->conds_met == %s && %s) {
+                p->stack[%s] = p->curr_call;
+                p->conds_met++;
+        }
+            """ % (tup[1], tup[0], tup[1])
+        return text
+
+    def _generate_entry(self):
+        prog = self._get_heading() + """
+{
+        u32 pid = bpf_get_current_pid_tgid();
+        %s
+
+        struct pid_struct *p = m.lookup(&pid);
+
+        if (!p)
+                return 0;
+
+        /*
+         * preparation for predicate, if necessary
+         */
+         %s
+        /*
+         * Generate entry logic
+         */
+        %s
+
+        p->curr_call++;
+
+        return 0;
+}"""
+
+        prog = prog % (self._get_if_top(), self.prep, self._get_entry_logic())
+        return prog
+
+    # only need to check top of stack
+    def _get_exit_logic(self):
+        text = """
+        if (p->conds_met < 1 || p->conds_met >= %s)
+                return 0;
+
+        if (p->stack[p->conds_met - 1] == p->curr_call)
+                p->conds_met--;
+        """
+        return text % str(self.length + 1)
+
+    def _generate_exit(self):
+        prog = self._get_heading() + """
+{
+        u32 pid = bpf_get_current_pid_tgid();
+
+        struct pid_struct *p = m.lookup(&pid);
+
+        if (!p)
+                return 0;
+
+        p->curr_call--;
+
+        /*
+         * Generate exit logic
+         */
+        %s
+        %s
+        return 0;
+}"""
+
+        prog = prog % (self._get_exit_logic(), self._get_if_top())
+
+        return prog
+
    # Special case for should_fail_whatever
    def _generate_bottom(self):
        """Render the probe placed on the error-injection function itself.

        Two override paths are emitted: one for a chain of length 1 (no
        per-pid state is needed) and one that checks the per-pid conds_met
        counter built up by the other probes in the chain.
        """
        # predicate paired with this function's first recorded frame
        pred = self.preds[0][0]
        text = self._get_heading() + """
{
        /*
         * preparation for predicate, if necessary
         */
         %s
        /*
         * If this is the only call in the chain and predicate passes
         */
        if (%s == 1 && %s) {
                bpf_override_return(ctx, %s);
                return 0;
        }
        u32 pid = bpf_get_current_pid_tgid();

        struct pid_struct *p = m.lookup(&pid);

        if (!p)
                return 0;

        /*
         * If all conds have been met and predicate passes
         */
        if (p->conds_met == %s && %s)
                bpf_override_return(ctx, %s);
        return 0;
}"""
        # fills, in order: prep, length, pred, err, length - 1, pred, err
        return text % (self.prep, self.length, pred, self._get_err(),
                    self.length - 1, pred, self._get_err())
+
+    # presently parses and replaces STRCMP
+    # STRCMP exists because string comparison is inconvenient and somewhat buggy
+    # https://github.com/iovisor/bcc/issues/1617
+    def _prepare_pred(self):
+        self.prep = ""
+        for i in range(len(self.preds)):
+            new_pred = ""
+            pred = self.preds[i][0]
+            place = self.preds[i][1]
+            start, ind = 0, 0
+            while start < len(pred):
+                ind = pred.find("STRCMP(", start)
+                if ind == -1:
+                    break
+                new_pred += pred[start:ind]
+                # 7 is len("STRCMP(")
+                start = pred.find(")", start + 7) + 1
+
+                # then ind ... start is STRCMP(...)
+                ptr, literal = pred[ind + 7:start - 1].split(",")
+                literal = literal.strip()
+
+                # x->y->z, some string literal
+                # we make unique id with place_ind
+                uuid = "%s_%s" % (place, ind)
+                unique_bool = "is_true_%s" % uuid
+                self.prep += """
+        char *str_%s = %s;
+        bool %s = true;\n""" % (uuid, ptr.strip(), unique_bool)
+
+                check = "\t%s &= *(str_%s++) == '%%s';\n" % (unique_bool, uuid)
+
+                for ch in literal:
+                    self.prep += check % ch
+                self.prep += check % r'\0'
+                new_pred += unique_bool
+
+            new_pred += pred[start:]
+            self.preds[i] = (new_pred, place)
+
+    def generate_program(self):
+        # generate code to work around various rewriter issues
+        self._prepare_pred()
+
+        # special case for bottom
+        if self.preds[-1][1] == self.length - 1:
+            return self._generate_bottom()
+
+        return self._generate_entry() if self.is_entry else self._generate_exit()
+
+    def attach(self, bpf):
+        if self.is_entry:
+            bpf.attach_kprobe(event=self.event,
+                    fn_name=self.func_name)
+        else:
+            bpf.attach_kretprobe(event=self.event,
+                    fn_name=self.func_name)
+
+
class Tool:
    """Top-level driver: parses the CLI and spec, builds Probe objects,
    renders the combined BPF program, and attaches/polls it."""

    examples = """
EXAMPLES:
# ./inject.py kmalloc -v 'SyS_mount()'
    Fails all calls to syscall mount
# ./inject.py kmalloc -v '(true) => SyS_mount()(true)'
    Explicit rewriting of above
# ./inject.py kmalloc -v 'mount_subtree() => btrfs_mount()'
    Fails btrfs mounts only
# ./inject.py kmalloc -v 'd_alloc_parallel(struct dentry *parent, const struct \\
    qstr *name)(STRCMP(name->name, 'bananas'))'
    Fails dentry allocations of files named 'bananas'
# ./inject.py kmalloc -v -P 0.01 'SyS_mount()'
    Fails calls to syscall mount with 1% probability
    """
    # add cases as necessary
    error_injection_mapping = {
        "kmalloc": "should_failslab(struct kmem_cache *s, gfp_t gfpflags)",
        "bio": "should_fail_bio(struct bio *bio)",
    }

    def __init__(self):
        parser = argparse.ArgumentParser(description="Fail specified kernel" +
                " functionality when call chain and predicates are met",
                formatter_class=argparse.RawDescriptionHelpFormatter,
                epilog=Tool.examples)
        parser.add_argument(dest="mode", choices=['kmalloc','bio'],
                help="indicate which base kernel function to fail")
        parser.add_argument(metavar="spec", dest="spec",
                help="specify call chain")
        parser.add_argument("-I", "--include", action="append",
                metavar="header",
                help="additional header files to include in the BPF program")
        parser.add_argument("-P", "--probability", default=1,
                metavar="probability", type=float,
                help="probability that this call chain will fail")
        parser.add_argument("-v", "--verbose", action="store_true",
                help="print BPF program")
        self.args = parser.parse_args()

        self.program = ""    # accumulated BPF C source
        self.spec = self.args.spec
        self.map = {}        # function signature -> [(predicate, order), ...]
        self.probes = []     # Probe objects, entry and exit
        # signature of the kernel error-injection hook for the chosen mode
        self.key = Tool.error_injection_mapping[self.args.mode]

    # create_probes and associated stuff
    def _create_probes(self):
        """Turn the parsed spec into entry/exit Probe pairs."""
        self._parse_spec()
        Probe.configure(self.args.mode, self.args.probability)
        # self, func, preds, total, entry

        # create all the pair probes
        for fx, preds in self.map.items():

            # do the enter
            self.probes.append(Probe(fx, preds, self.length, True))

            # the injection hook itself only gets an entry probe
            if self.key == fx:
                continue

            # do the exit
            self.probes.append(Probe(fx, preds, self.length, False))

    def _parse_frames(self):
        """Split the spec into (function, predicate) frames.

        Frames are '=>'-separated chunks; each chunk is 'func(args)(pred)',
        a bare 'func(args)' (implicit '(true)' predicate), or a bare
        '(pred)' which applies to the injection hook itself. Raises on
        unbalanced parentheses or trailing garbage.
        """
        # sentinel
        data = self.spec + '\0'
        start, count = 0, 0

        frames = []
        cur_frame = []
        i = 0
        last_frame_added = 0

        while i < len(data):
            # improper input
            if count < 0:
                raise Exception("Check your parentheses")
            c = data[i]
            count += c == '('
            count -= c == ')'
            if not count:
                if c == '\0' or (c == '=' and data[i + 1] == '>'):
                    # This block is closing a chunk. This means cur_frame must
                    # have something in it.
                    if not cur_frame:
                        raise Exception("Cannot parse spec, missing parens")
                    if len(cur_frame) == 2:
                        frame = tuple(cur_frame)
                    elif cur_frame[0][0] == '(':
                        frame = self.key, cur_frame[0]
                    else:
                        frame = cur_frame[0], '(true)'
                    frames.append(frame)
                    del cur_frame[:]
                    i += 1
                    start = i + 1
                elif c == ')':
                    cur_frame.append(data[start:i + 1].strip())
                    start = i + 1
                    last_frame_added = start
            i += 1

        # We only permit spaces after the last frame
        if self.spec[last_frame_added:].strip():
            raise Exception("Invalid characters found after last frame")
        # improper input
        if count:
            raise Exception("Check your parentheses")
        return frames

    def _parse_spec(self):
        """Validate each frame and build self.map; sets self.length."""
        frames = self._parse_frames()
        frames.reverse()

        absolute_order = 0
        for f in frames:
            # default case
            func, pred = f[0], f[1]

            if not self._validate_predicate(pred):
                raise Exception("Invalid predicate")
            if not self._validate_identifier(func):
                raise Exception("Invalid function identifier")
            tup = (pred, absolute_order)

            if func not in self.map:
                self.map[func] = [tup]
            else:
                self.map[func].append(tup)

            absolute_order += 1

        # the injection hook itself always closes the chain
        if self.key not in self.map:
            self.map[self.key] = [('(true)', absolute_order)]
            absolute_order += 1

        self.length = absolute_order

    def _validate_identifier(self, func):
        """Check that the text before '(' is a valid C identifier."""
        # We've already established paren balancing. We will only look for
        # identifier validity here.
        paren_index = func.find("(")
        potential_id = func[:paren_index]
        # fixed: the former '[_a-zA-z]' class (lowercase z) also accepted
        # the punctuation characters between 'Z' and 'a' ('[', ']', '^',
        # '`', backslash) as a leading identifier character
        pattern = '[_a-zA-Z][_a-zA-Z0-9]*$'
        if re.match(pattern, potential_id):
            return True
        return False

    def _validate_predicate(self, pred):
        """A predicate must either not start with '(' or be a balanced
        parenthesized expression."""
        if len(pred) > 0 and pred[0] == "(":
            # nesting depth; renamed from 'open' to avoid shadowing builtin
            depth = 1
            for i in range(1, len(pred)):
                if pred[i] == "(":
                    depth += 1
                elif pred[i] == ")":
                    depth -= 1
            if depth != 0:
                # not well formed, break
                return False

        return True

    def _def_pid_struct(self):
        """Emit the per-pid bookkeeping struct shared by all probes."""
        text = """
struct pid_struct {
    u64 curr_call; /* book keeping to handle recursion */
    u64 conds_met; /* stack pointer */
    u64 stack[%s];
};
""" % self.length
        return text

    def _attach_probes(self):
        """Compile the generated program and attach every probe."""
        self.bpf = BPF(text=self.program)
        for p in self.probes:
            p.attach(self.bpf)

    def _generate_program(self):
        """Concatenate includes, shared state, and per-probe functions."""
        # leave out auto includes for now
        self.program += '#include <linux/mm.h>\n'
        for include in (self.args.include or []):
            self.program += "#include <%s>\n" % include

        self.program += self._def_pid_struct()
        self.program += "BPF_HASH(m, u32, struct pid_struct);\n"
        for p in self.probes:
            self.program += p.generate_program() + "\n"

        if self.args.verbose:
            print(self.program)

    def _main_loop(self):
        """Block forever; the injection work happens in kernel context."""
        while True:
            self.bpf.perf_buffer_poll()

    def run(self):
        """Parse the spec, generate, attach, and poll."""
        self._create_probes()
        self._generate_program()
        self._attach_probes()
        self._main_loop()
+
+
# entry point: construct the tool from CLI arguments and run until interrupted
if __name__ == "__main__":
    Tool().run()
diff --git a/tools/inject_example.txt b/tools/inject_example.txt
new file mode 100644
index 0000000..101a39d
--- /dev/null
+++ b/tools/inject_example.txt
@@ -0,0 +1,145 @@
+Some examples for inject
+
+inject guarantees the appropriate erroneous return of the specified injection
+mode (kmalloc,bio,etc) given a call chain and an optional set of predicates. You
+can also optionally print out the generated BPF program for
+modification/debugging purposes.
+
+As a simple example, let's say you wanted to fail all mounts. As of 4.17 we can
+fail syscalls directly, so let's do that:
+
+# ./inject.py kmalloc -v 'SyS_mount()'
+
+The first argument indicates the mode (or what to fail). Appropriate headers are
+specified, if necessary. The verbosity flag prints the generated program. Note
+that some syscalls will be available as 'SyS_xyz' and some will be available as
+'sys_xyz'. This is largely dependent on the number of arguments each syscall
+takes.
+
+Trying to mount various filesystems will fail and report an inability to
+allocate memory, as expected.
+
+Whenever a predicate is missing, an implicit "(true)" is inserted. The example
+above can be explicitly written as:
+
+# ./inject.py kmalloc -v '(true) => SyS_mount()(true)'
+
+The "(true)" without an associated function is a predicate for the error
+injection mechanism of the current mode. In the case of kmalloc, the predicate
+would have access to the arguments of:
+
+	int should_failslab(struct kmem_cache *s, gfp_t gfpflags);
+
+The bio mode works similarly, with access to the arguments of:
+	
+	static noinline int should_fail_bio(struct bio *bio)
+
+We also note that it's unnecessary to state the arguments of the function if you
+have no intention to reference them in the associated predicate.
+
+Now let's say we want to be a bit more specific; suppose you want to fail
+kmalloc() from mount_subtree() when called from btrfs_mount(). This will fail
+only btrfs mounts:
+
+# ./inject.py kmalloc -v 'mount_subtree() => btrfs_mount()'
+
+Attempting to mount btrfs filesystem during the execution of this command will
+yield an error, but other filesystems will be fine.
+
+Next, lets say we want to hit one of the BUG_ONs in fs/btrfs. As of 4.16-rc3,
+there is a BUG_ON in btrfs_prepare_close_one_device() at fs/btrfs/volumes.c:1002
+
+To hit this, we can use the following:
+
+# ./inject.py kmalloc -v 'btrfs_alloc_device() => btrfs_close_devices()'
+
+While the script was executing, I mounted and unmounted btrfs, causing a
+segfault on umount (since that satisfied the indicated call path). A look at
+dmesg will confirm that the erroneous return value injected by the script
+tripped the BUG_ON, causing a segfault down the line.
+
+In general, it's worth noting that the required specificity of the call chain is
+dependent on how much granularity you need. The example above might have
+performed as expected without the intermediate btrfs_alloc_device, but might
+have also done something unexpected (an earlier kmalloc could have failed
+before the one we were targeting).
+
+For hot paths, the approach outlined above isn't enough. If a path is traversed
+very often, we can distinguish distinct calls with function arguments. Let's say
+we want to fail the dentry allocation of a file creatively named 'bananas'. We
+can do the following:
+
+# ./inject.py kmalloc -v 'd_alloc_parallel(struct dentry *parent, const struct
+qstr *name)(STRCMP(name->name, 'bananas'))' 
+
+While this script is executing, any operation that would cause a dentry
+allocation where the name is 'bananas' fails, as expected.
+
+Here, since we're referencing a function argument in our predicate, we need to
+provide the function signature up to the argument we're using.
+
+To note, STRCMP is a workaround for some rewriter issues. It will take input of
+the form (x->...->z, 'literal'), and generate some equivalent code that the
+verifier is more friendly about. It's not horribly robust, but works for the
+purposes of making string comparisons a bit easier.
+
+Finally, we briefly demonstrate how to inject bio failures. The mechanism is
+identical, so any information from above will apply.
+
+Let's say we want to fail bio requests when the request is to some specific
+sector. An example use case would be to fail superblock writes in btrfs. For
+btrfs, we know that there must be a superblock at 65536 bytes, or sector 128.
+This allows us to run the following:
+
+# ./inject.py bio -v -I 'linux/blkdev.h'  '(({struct gendisk *d = bio->bi_disk;
+struct disk_part_tbl *tbl = d->part_tbl; struct hd_struct **parts = (void *)tbl +
+sizeof(struct disk_part_tbl); struct hd_struct **partp = parts + bio->bi_partno;
+struct hd_struct *p = *partp; dev_t disk = p->__dev.devt; disk ==
+MKDEV(254,16);}) && bio->bi_iter.bi_sector == 128)'
+
+The predicate in the command above has two parts. The first is a compound
+statement which shortens to "only if the system is btrfs", but is long due
+to rewriter/verifier shenanigans. The major/minor device numbers can be found
+in a number of ways; I used Python. The second part simply checks the starting
+address of bi_iter. While executing, this script effectively fails superblock
+writes to the superblock at sector 128 without affecting other filesystems.
+
+As an extension to the above, one could easily fail all btrfs superblock writes
+(we only fail the primary) by calculating the sector number of the mirrors and
+amending the predicate accordingly.
+
+Inject also provides a probability option; this allows you to fail the
+path+predicates some percentage of the time. For example, let's say we want to
+fail our mounts 1% of the time:
+
+# ./inject.py kmalloc -v -P 0.01 'SyS_mount()'
+
+USAGE message:
+usage: inject.py [-h] [-I header] [-P probability] [-v] {kmalloc,bio} spec
+
+Fail specified kernel functionality when call chain and predicates are met
+
+positional arguments:
+  {kmalloc,bio}         indicate which base kernel function to fail
+  spec                  specify call chain
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -I header, --include header
+                        additional header files to include in the BPF program
+  -P probability, --probability probability
+                        probability that this call chain will fail
+  -v, --verbose         print BPF program
+
+EXAMPLES:
+# ./inject.py kmalloc -v 'SyS_mount()'
+    Fails all calls to syscall mount
+# ./inject.py kmalloc -v '(true) => SyS_mount()(true)'
+    Explicit rewriting of above
+# ./inject.py kmalloc -v 'mount_subtree() => btrfs_mount()'
+    Fails btrfs mounts only
+# ./inject.py kmalloc -v 'd_alloc_parallel(struct dentry *parent, const struct \
+    qstr *name)(STRCMP(name->name, 'bananas'))'
+    Fails dentry allocations of files named 'bananas'
+# ./inject.py kmalloc -v -P 0.01 'SyS_mount()'
+    Fails calls to syscall mount with 1% probability
diff --git a/tools/javacalls.sh b/tools/javacalls.sh
new file mode 100755
index 0000000..701510b
--- /dev/null
+++ b/tools/javacalls.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Trace Java method calls: thin wrapper over the generic ucalls tool.
# Quote $0 and $lib so paths containing spaces do not word-split.
lib=$(dirname "$0")/lib
"$lib"/ucalls.py -l java "$@"
diff --git a/tools/javacalls_example.txt b/tools/javacalls_example.txt
new file mode 120000
index 0000000..22b0fb3
--- /dev/null
+++ b/tools/javacalls_example.txt
@@ -0,0 +1 @@
+lib/ucalls_example.txt
\ No newline at end of file
diff --git a/tools/javaflow.sh b/tools/javaflow.sh
new file mode 100755
index 0000000..e39d153
--- /dev/null
+++ b/tools/javaflow.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Trace Java method flow: thin wrapper over the generic uflow tool.
# Quote $0 and $lib so paths containing spaces do not word-split.
lib=$(dirname "$0")/lib
"$lib"/uflow.py -l java "$@"
diff --git a/tools/javaflow_example.txt b/tools/javaflow_example.txt
new file mode 120000
index 0000000..bc71efc
--- /dev/null
+++ b/tools/javaflow_example.txt
@@ -0,0 +1 @@
+lib/uflow_example.txt
\ No newline at end of file
diff --git a/tools/javagc.sh b/tools/javagc.sh
new file mode 100755
index 0000000..ad59188
--- /dev/null
+++ b/tools/javagc.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Trace Java garbage collection: thin wrapper over the generic ugc tool.
# Quote $0 and $lib so paths containing spaces do not word-split.
lib=$(dirname "$0")/lib
"$lib"/ugc.py -l java "$@"
diff --git a/tools/javagc_example.txt b/tools/javagc_example.txt
new file mode 120000
index 0000000..303ccbd
--- /dev/null
+++ b/tools/javagc_example.txt
@@ -0,0 +1 @@
+lib/ugc_example.txt
\ No newline at end of file
diff --git a/tools/javaobjnew.sh b/tools/javaobjnew.sh
new file mode 100755
index 0000000..e3f069b
--- /dev/null
+++ b/tools/javaobjnew.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Trace Java object allocations: thin wrapper over the generic uobjnew tool.
# Quote $0 and $lib so paths containing spaces do not word-split.
lib=$(dirname "$0")/lib
"$lib"/uobjnew.py -l java "$@"
diff --git a/tools/javaobjnew_example.txt b/tools/javaobjnew_example.txt
new file mode 120000
index 0000000..a8a83c3
--- /dev/null
+++ b/tools/javaobjnew_example.txt
@@ -0,0 +1 @@
+lib/uobjnew_example.txt
\ No newline at end of file
diff --git a/tools/javastat.sh b/tools/javastat.sh
new file mode 100755
index 0000000..f758dca
--- /dev/null
+++ b/tools/javastat.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Java runtime statistics: thin wrapper over the generic ustat tool.
# Quote $0 and $lib so paths containing spaces do not word-split.
lib=$(dirname "$0")/lib
"$lib"/ustat.py -l java "$@"
diff --git a/tools/javastat_example.txt b/tools/javastat_example.txt
new file mode 120000
index 0000000..544e5ad
--- /dev/null
+++ b/tools/javastat_example.txt
@@ -0,0 +1 @@
+lib/ustat_example.txt
\ No newline at end of file
diff --git a/tools/javathreads.sh b/tools/javathreads.sh
new file mode 100755
index 0000000..442d2c9
--- /dev/null
+++ b/tools/javathreads.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Trace Java thread events: thin wrapper over the generic uthreads tool.
# Quote $0 and $lib so paths containing spaces do not word-split.
lib=$(dirname "$0")/lib
"$lib"/uthreads.py -l java "$@"
diff --git a/tools/javathreads_example.txt b/tools/javathreads_example.txt
new file mode 120000
index 0000000..4e678a8
--- /dev/null
+++ b/tools/javathreads_example.txt
@@ -0,0 +1 @@
+lib/uthreads_example.txt
\ No newline at end of file
diff --git a/tools/killsnoop.py b/tools/killsnoop.py
new file mode 100755
index 0000000..d60c72e
--- /dev/null
+++ b/tools/killsnoop.py
@@ -0,0 +1,148 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# killsnoop Trace signals issued by the kill() syscall.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: killsnoop [-h] [-x] [-p PID]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Sep-2015   Brendan Gregg   Created this.
+# 19-Feb-2016   Allan McAleavy migrated to BPF_PERF_OUTPUT
+
+from __future__ import print_function
+from bcc import BPF
+from bcc.utils import ArgString, printb
+import argparse
+from time import strftime
+import ctypes as ct
+
# arguments
examples = """examples:
    ./killsnoop           # trace all kill() signals
    ./killsnoop -x        # only show failed kills
    ./killsnoop -p 181    # only trace PID 181
"""
parser = argparse.ArgumentParser(
    description="Trace signals issued by the kill() syscall",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-x", "--failed", action="store_true",
    help="only show failed kill syscalls")
parser.add_argument("-p", "--pid",
    help="trace this PID only")
# hidden flag: dump the generated BPF program and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0  # set nonzero to always print the generated program
+
# define BPF program
#
# syscall__kill records (tpid, sig, comm) at kill() entry, keyed by the
# calling pid; do_ret_sys_kill pairs that with the syscall's return value
# and emits one record to the perf buffer.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct val_t {
   u64 pid;
   int sig;
   int tpid;
   char comm[TASK_COMM_LEN];
};

struct data_t {
   u64 pid;
   int tpid;
   int sig;
   int ret;
   char comm[TASK_COMM_LEN];
};

BPF_HASH(infotmp, u32, struct val_t);
BPF_PERF_OUTPUT(events);

int syscall__kill(struct pt_regs *ctx, int tpid, int sig)
{
    u32 pid = bpf_get_current_pid_tgid();
    FILTER

    struct val_t val = {.pid = pid};
    if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) == 0) {
        val.tpid = tpid;
        val.sig = sig;
        infotmp.update(&pid, &val);
    }

    return 0;
};

int do_ret_sys_kill(struct pt_regs *ctx)
{
    struct data_t data = {};
    struct val_t *valp;
    u32 pid = bpf_get_current_pid_tgid();

    valp = infotmp.lookup(&pid);
    if (valp == 0) {
        // missed entry
        return 0;
    }

    bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
    data.pid = pid;
    data.tpid = valp->tpid;
    data.ret = PT_REGS_RC(ctx);
    data.sig = valp->sig;

    events.perf_submit(ctx, &data, sizeof(data));
    infotmp.delete(&pid);

    return 0;
}
"""
# apply the optional pid filter by textual substitution into the C source
if args.pid:
    bpf_text = bpf_text.replace('FILTER',
        'if (pid != %s) { return 0; }' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER', '')
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()
+
# initialize BPF
b = BPF(text=bpf_text)
# resolve the platform-specific symbol name for the kill syscall
kill_fnname = b.get_syscall_fnname("kill")
b.attach_kprobe(event=kill_fnname, fn_name="syscall__kill")
b.attach_kretprobe(event=kill_fnname, fn_name="do_ret_sys_kill")


TASK_COMM_LEN = 16    # linux/sched.h

# Python-side mirror of struct data_t in the BPF program; field order and
# types must match the C struct for the perf-buffer cast to be valid.
class Data(ct.Structure):
    _fields_ = [
        ("pid", ct.c_ulonglong),
        ("tpid", ct.c_int),
        ("sig", ct.c_int),
        ("ret", ct.c_int),
        ("comm", ct.c_char * TASK_COMM_LEN)
    ]

# header
print("%-9s %-6s %-16s %-4s %-6s %s" % (
    "TIME", "PID", "COMM", "SIG", "TPID", "RESULT"))
+
# process event
def print_event(cpu, data, size):
    """Perf-buffer callback: decode one Data record and print a row."""
    event = ct.cast(data, ct.POINTER(Data)).contents

    # with -x, suppress kills that did not fail (ret >= 0)
    if args.failed and event.ret >= 0:
        return

    timestamp = strftime("%H:%M:%S").encode('ascii')
    printb(b"%-9s %-6d %-16s %-4d %-6d %d" % (timestamp,
        event.pid, event.comm, event.sig, event.tpid, event.ret))
+
# loop with callback to print_event
b["events"].open_perf_buffer(print_event)
# poll forever; exits via KeyboardInterrupt (Ctrl-C)
while 1:
    b.perf_buffer_poll()
diff --git a/tools/killsnoop_example.txt b/tools/killsnoop_example.txt
new file mode 100644
index 0000000..29d56b0
--- /dev/null
+++ b/tools/killsnoop_example.txt
@@ -0,0 +1,34 @@
+Demonstrations of killsnoop, the Linux eBPF/bcc version.
+
+
+This traces signals sent via the kill() syscall. For example:
+
+# ./killsnoop
+TIME      PID    COMM             SIG  TPID   RESULT
+12:10:51  13967  bash             9    13885  0
+12:11:34  13967  bash             9    1024   -3
+12:11:41  815    systemd-udevd    15   14076  0
+
+The first line showed a SIGKILL (9) sent from PID 13967 (a bash shell) to
+PID 13885. The result, 0, means success.
+
+The second line showed the same signal sent, this time resulting in a -3
+(ESRCH: no such process).
+
+
+USAGE message:
+
+# ./killsnoop -h
+usage: killsnoop [-h] [-x] [-p PID]
+
+Trace signals issued by the kill() syscall
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -x, --failed       only show failed kill syscalls
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./killsnoop           # trace all kill() signals
+    ./killsnoop -x        # only show failed kills
+    ./killsnoop -p 181    # only trace PID 181
diff --git a/tools/lib/CMakeLists.txt b/tools/lib/CMakeLists.txt
new file mode 100644
index 0000000..3ed2730
--- /dev/null
+++ b/tools/lib/CMakeLists.txt
@@ -0,0 +1,8 @@
# Install the shared tool helpers: every *.py becomes an extension-less
# program under share/bcc/tools/lib, and the accompanying *.txt example
# documentation is installed under share/bcc/tools/doc/lib.
file(GLOB PY_FILES *.py)
file(GLOB TXT_FILES *.txt)
# CMakeLists.txt itself matches the *.txt glob; keep it out of the docs.
list(REMOVE_ITEM TXT_FILES ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt)
foreach(FIL ${PY_FILES})
  # strip the .py suffix so tools are invoked as plain commands
  get_filename_component(FIL_WE ${FIL} NAME_WE)
  install(PROGRAMS ${FIL} DESTINATION share/bcc/tools/lib RENAME ${FIL_WE})
endforeach()
install(FILES ${TXT_FILES} DESTINATION share/bcc/tools/doc/lib)
diff --git a/tools/lib/ucalls.py b/tools/lib/ucalls.py
new file mode 100755
index 0000000..18ca22c
--- /dev/null
+++ b/tools/lib/ucalls.py
@@ -0,0 +1,342 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ucalls  Summarize method calls in high-level languages and/or system calls.
+#         For Linux, uses BCC, eBPF.
+#
+# USAGE: ucalls [-l {java,perl,php,python,ruby,tcl}] [-h] [-T TOP] [-L] [-S] [-v] [-m]
+#        pid [interval]
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 19-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT, utils
+from time import sleep
+import os
+
# runtimes with USDT method-entry/return probes this tool understands
languages = ["java", "perl", "php", "python", "ruby", "tcl"]

examples = """examples:
    ./ucalls -l java 185        # trace Java calls and print statistics on ^C
    ./ucalls -l python 2020 1   # trace Python calls and print every second
    ./ucalls -l java 185 -S     # trace Java calls and syscalls
    ./ucalls 6712 -S            # trace only syscall counts
    ./ucalls -l ruby 1344 -T 10 # trace top 10 Ruby method calls
    ./ucalls -l ruby 1344 -L    # trace Ruby calls including latency
    ./ucalls -l php 443 -LS     # trace PHP calls and syscalls with latency
    ./ucalls -l python 2020 -mL # trace Python calls including latency in ms
"""
parser = argparse.ArgumentParser(
    description="Summarize method calls in high-level languages.",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("pid", type=int, help="process id to attach to")
parser.add_argument("interval", type=int, nargs='?',
    help="print every specified number of seconds")
parser.add_argument("-l", "--language", choices=languages + ["none"],
    help="language to trace (if none, trace syscalls only)")
parser.add_argument("-T", "--top", type=int,
    help="number of most frequent/slow calls to print")
parser.add_argument("-L", "--latency", action="store_true",
    help="record method latency from enter to exit (except recursive calls)")
parser.add_argument("-S", "--syscalls", action="store_true",
    help="record syscall latency (adds overhead)")
parser.add_argument("-v", "--verbose", action="store_true",
    help="verbose mode: print the BPF program (for debugging purposes)")
parser.add_argument("-m", "--milliseconds", action="store_true",
    help="report times in milliseconds (default is microseconds)")
# hidden flag: dump the generated BPF program and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
+
# if no language was given, try to detect it from the target process
language = args.language
if not language:
    language = utils.detect_language(languages, args.pid)

# We assume that the entry and return probes have the same arguments. This is
# the case for Java, Python, Ruby, and PHP. If there's a language where it's
# not the case, we will need to build a custom correlator from entry to exit.
#
# Each branch selects the USDT probe names plus the C snippets that read the
# class/file and method identifiers out of the probe arguments.
extra_message = ""
if language == "java":
    # TODO for JVM entries, we actually have the real length of the class
    #      and method strings in arg3 and arg5 respectively, so we can insert
    #      the null terminator in its proper position.
    entry_probe = "method__entry"
    return_probe = "method__return"
    read_class = "bpf_usdt_readarg(2, ctx, &clazz);"
    read_method = "bpf_usdt_readarg(4, ctx, &method);"
    extra_message = ("If you do not see any results, make sure you ran java"
                     " with option -XX:+ExtendedDTraceProbes")
elif language == "perl":
    entry_probe = "sub__entry"
    return_probe = "sub__return"
    read_class = "bpf_usdt_readarg(2, ctx, &clazz);"    # filename really
    read_method = "bpf_usdt_readarg(1, ctx, &method);"
elif language == "php":
    entry_probe = "function__entry"
    return_probe = "function__return"
    read_class = "bpf_usdt_readarg(4, ctx, &clazz);"
    read_method = "bpf_usdt_readarg(1, ctx, &method);"
    extra_message = ("If you do not see any results, make sure the environment"
                     " variable USE_ZEND_DTRACE is set to 1")
elif language == "python":
    entry_probe = "function__entry"
    return_probe = "function__return"
    read_class = "bpf_usdt_readarg(1, ctx, &clazz);"    # filename really
    read_method = "bpf_usdt_readarg(2, ctx, &method);"
elif language == "ruby":
    # TODO Also probe cmethod__entry and cmethod__return with same arguments
    entry_probe = "method__entry"
    return_probe = "method__return"
    read_class = "bpf_usdt_readarg(1, ctx, &clazz);"
    read_method = "bpf_usdt_readarg(2, ctx, &method);"
elif language == "tcl":
    # TODO Also consider probe cmd__entry and cmd__return with same arguments
    entry_probe = "proc__entry"
    return_probe = "proc__return"
    read_class = ""  # no class/file info available
    read_method = "bpf_usdt_readarg(1, ctx, &method);"
elif not language or language == "none":
    # syscall-only mode: no USDT probes at all
    if not args.syscalls:
        print("Nothing to do; use -S to trace syscalls.")
        exit(1)
    entry_probe, return_probe, read_class, read_method = ("", "", "", "")
    if language:
        language = None
+
+program = """
+#include <linux/ptrace.h>
+
+#define MAX_STRING_LENGTH 80
+DEFINE_NOLANG
+DEFINE_LATENCY
+DEFINE_SYSCALLS
+
+struct method_t {
+    char clazz[MAX_STRING_LENGTH];
+    char method[MAX_STRING_LENGTH];
+};
+struct entry_t {
+    u64 pid;
+    struct method_t method;
+};
+struct info_t {
+    u64 num_calls;
+    u64 total_ns;
+};
+struct syscall_entry_t {
+    u64 timestamp;
+    u64 ip;
+};
+
+#ifndef LATENCY
+  BPF_HASH(counts, struct method_t, u64);            // number of calls
+  #ifdef SYSCALLS
+    BPF_HASH(syscounts, u64, u64);                   // number of calls per IP
+  #endif  // SYSCALLS
+#else
+  BPF_HASH(times, struct method_t, struct info_t);
+  BPF_HASH(entry, struct entry_t, u64);              // timestamp at entry
+  #ifdef SYSCALLS
+    BPF_HASH(systimes, u64, struct info_t);          // latency per IP
+    BPF_HASH(sysentry, u64, struct syscall_entry_t); // ts + IP at entry
+  #endif  // SYSCALLS
+#endif
+
+#ifndef NOLANG
+int trace_entry(struct pt_regs *ctx) {
+    u64 clazz = 0, method = 0, val = 0;
+    u64 *valp;
+    struct entry_t data = {0};
+#ifdef LATENCY
+    u64 timestamp = bpf_ktime_get_ns();
+    data.pid = bpf_get_current_pid_tgid();
+#endif
+    READ_CLASS
+    READ_METHOD
+    bpf_probe_read(&data.method.clazz, sizeof(data.method.clazz),
+                   (void *)clazz);
+    bpf_probe_read(&data.method.method, sizeof(data.method.method),
+                   (void *)method);
+#ifndef LATENCY
+    valp = counts.lookup_or_init(&data.method, &val);
+    ++(*valp);
+#endif
+#ifdef LATENCY
+    entry.update(&data, &timestamp);
+#endif
+    return 0;
+}
+
+#ifdef LATENCY
+int trace_return(struct pt_regs *ctx) {
+    u64 *entry_timestamp, clazz = 0, method = 0;
+    struct info_t *info, zero = {};
+    struct entry_t data = {};
+    data.pid = bpf_get_current_pid_tgid();
+    READ_CLASS
+    READ_METHOD
+    bpf_probe_read(&data.method.clazz, sizeof(data.method.clazz),
+                   (void *)clazz);
+    bpf_probe_read(&data.method.method, sizeof(data.method.method),
+                   (void *)method);
+    entry_timestamp = entry.lookup(&data);
+    if (!entry_timestamp) {
+        return 0;   // missed the entry event
+    }
+    info = times.lookup_or_init(&data.method, &zero);
+    info->num_calls += 1;
+    info->total_ns += bpf_ktime_get_ns() - *entry_timestamp;
+    entry.delete(&data);
+    return 0;
+}
+#endif  // LATENCY
+#endif  // NOLANG
+
+#ifdef SYSCALLS
+int syscall_entry(struct pt_regs *ctx) {
+    u64 pid = bpf_get_current_pid_tgid();
+    u64 *valp, ip = PT_REGS_IP(ctx), val = 0;
+    PID_FILTER
+#ifdef LATENCY
+    struct syscall_entry_t data = {};
+    data.timestamp = bpf_ktime_get_ns();
+    data.ip = ip;
+#endif
+#ifndef LATENCY
+    valp = syscounts.lookup_or_init(&ip, &val);
+    ++(*valp);
+#endif
+#ifdef LATENCY
+    sysentry.update(&pid, &data);
+#endif
+    return 0;
+}
+
+#ifdef LATENCY
+int syscall_return(struct pt_regs *ctx) {
+    struct syscall_entry_t *e;
+    struct info_t *info, zero = {};
+    u64 pid = bpf_get_current_pid_tgid(), ip;
+    PID_FILTER
+    e = sysentry.lookup(&pid);
+    if (!e) {
+        return 0;   // missed the entry event
+    }
+    ip = e->ip;
+    info = systimes.lookup_or_init(&ip, &zero);
+    info->num_calls += 1;
+    info->total_ns += bpf_ktime_get_ns() - e->timestamp;
+    sysentry.delete(&pid);
+    return 0;
+}
+#endif  // LATENCY
+#endif  // SYSCALLS
+""".replace("READ_CLASS", read_class) \
+   .replace("READ_METHOD", read_method) \
+   .replace("PID_FILTER", "if ((pid >> 32) != %d) { return 0; }" % args.pid) \
+   .replace("DEFINE_NOLANG", "#define NOLANG" if not language else "") \
+   .replace("DEFINE_LATENCY", "#define LATENCY" if args.latency else "") \
+   .replace("DEFINE_SYSCALLS", "#define SYSCALLS" if args.syscalls else "")
+
+# Attach USDT probes only when a language runtime was selected/detected;
+# with no language, usdt stays None and only syscalls can be traced.
+if language:
+    usdt = USDT(pid=args.pid)
+    usdt.enable_probe_or_bail(entry_probe, "trace_entry")
+    if args.latency:
+        usdt.enable_probe_or_bail(return_probe, "trace_return")
+else:
+    usdt = None
+
+# --ebpf/-v: dump the generated program (and USDT text) for debugging;
+# --ebpf exits without attaching anything.
+if args.ebpf or args.verbose:
+    if args.verbose and usdt:
+        print(usdt.get_text())
+    print(program)
+    if args.ebpf:
+        exit()
+
+bpf = BPF(text=program, usdt_contexts=[usdt] if usdt else [])
+if args.syscalls:
+    # Matches both sys_* and SyS_* spellings used across kernel versions.
+    syscall_regex = "^[Ss]y[Ss]_.*"
+    bpf.attach_kprobe(event_re=syscall_regex, fn_name="syscall_entry")
+    if args.latency:
+        bpf.attach_kretprobe(event_re=syscall_regex, fn_name="syscall_return")
+    print("Attached %d kernel probes for syscall tracing." %
+          bpf.num_open_kprobes())
+
+def get_data():
+    """Collect statistics from the BPF maps.
+
+    Returns a list of (name, (num_calls, total_ns)) tuples, sorted
+    ascending by total_ns when -L/--latency was given, otherwise by call
+    count (total_ns is reported as 0 in that case).
+    """
+    # Will be empty when no language was specified for tracing
+    if args.latency:
+        data = list(map(lambda kv: (kv[0].clazz.decode('utf-8', 'replace') \
+                                    + "." + \
+                                    kv[0].method.decode('utf-8', 'replace'),
+                                   (kv[1].num_calls, kv[1].total_ns)),
+                   bpf["times"].items()))
+    else:
+        data = list(map(lambda kv: (kv[0].clazz.decode('utf-8', 'replace') \
+                                    + "." + \
+                                    kv[0].method.decode('utf-8', 'replace'),
+                                   (kv[1].value, 0)),
+                   bpf["counts"].items()))
+
+    # Syscall entries are keyed by instruction pointer; resolve to a
+    # kernel symbol name so they sort/print alongside method names.
+    if args.syscalls:
+        if args.latency:
+            syscalls = map(lambda kv: (bpf.ksym(kv[0].value),
+                                           (kv[1].num_calls, kv[1].total_ns)),
+                           bpf["systimes"].items())
+            data.extend(syscalls)
+        else:
+            syscalls = map(lambda kv: (bpf.ksym(kv[0].value),
+                                       (kv[1].value, 0)),
+                           bpf["syscounts"].items())
+            data.extend(syscalls)
+
+    return sorted(data, key=lambda kv: kv[1][1 if args.latency else 0])
+
+def clear_data():
+    """Reset the BPF maps between interval reports.
+
+    Clears whichever maps are in use for the current mode (times/counts,
+    plus systimes/syscounts when -S was given), mirroring get_data().
+    """
+    if args.latency:
+        bpf["times"].clear()
+    else:
+        bpf["counts"].clear()
+
+    if args.syscalls:
+        if args.latency:
+            bpf["systimes"].clear()
+        else:
+            bpf["syscounts"].clear()
+
+exit_signaled = False
+print("Tracing calls in process %d (language: %s)... Ctrl-C to quit." %
+      (args.pid, language or "none"))
+if extra_message:
+    print(extra_message)
+# Report every `interval` seconds; with no interval, sleep "forever" and
+# emit a single report when Ctrl-C interrupts the sleep.
+while True:
+    try:
+        sleep(args.interval or 99999999)
+    except KeyboardInterrupt:
+        # Fall through to print one final report before exiting.
+        exit_signaled = True
+    print()
+    data = get_data()   # [(function, (num calls, latency in ns))]
+    if args.latency:
+        time_col = "TIME (ms)" if args.milliseconds else "TIME (us)"
+        print("%-50s %8s %8s" % ("METHOD", "# CALLS", time_col))
+    else:
+        print("%-50s %8s" % ("METHOD", "# CALLS"))
+    # data is sorted ascending, so the slowest/most frequent entries are
+    # at the end; -T keeps only the last `top` of them.
+    if args.top:
+        data = data[-args.top:]
+    for key, value in data:
+        if args.latency:
+            time = value[1] / 1000000.0 if args.milliseconds else \
+                   value[1] / 1000.0
+            print("%-50s %8d %6.2f" % (key, value[0], time))
+        else:
+            print("%-50s %8d" % (key, value[0]))
+    if args.interval and not exit_signaled:
+        clear_data()
+    else:
+        if args.syscalls:
+            print("Detaching kernel probes, please wait...")
+        exit()
diff --git a/tools/lib/ucalls_example.txt b/tools/lib/ucalls_example.txt
new file mode 100644
index 0000000..7191fb8
--- /dev/null
+++ b/tools/lib/ucalls_example.txt
@@ -0,0 +1,94 @@
+Demonstrations of ucalls.
+
+
+ucalls summarizes method calls in various high-level languages, including Java,
+Perl, PHP, Python, Ruby, Tcl, and Linux system calls. It displays statistics on
+the most frequently called methods, as well as the latency (duration) of these
+methods.
+
+Through the syscalls support, ucalls can provide basic information on a 
+process' interaction with the system including syscall counts and latencies. 
+This can then be used for further exploration with other BCC tools like trace,
+argdist, biotop, fileslower, and others.
+
+For example, to trace method call latency in a Java application:
+
+# ucalls -L $(pidof java)
+Tracing calls in process 26877 (language: java)... Ctrl-C to quit.
+
+METHOD                                              # CALLS TIME (us)
+java/io/BufferedInputStream.getBufIfOpen                  1 7.00
+slowy/App.isSimplePrime                                8970 8858.35
+slowy/App.isDivisible                               3228196 3076985.12
+slowy/App.isPrime                                      8969 4841017.64
+^C
+
+
+To trace only syscalls in a particular process and print the top 10 most 
+frequently-invoked ones:
+
+# ucalls -ST 10 3018
+Attached 375 kernel probes for syscall tracing.
+Tracing calls in process 3018 (language: none)... Ctrl-C to quit.
+
+METHOD                                              # CALLS
+sys_rt_sigaction                                          4
+SyS_rt_sigprocmask                                        4
+sys_mprotect                                              5
+sys_read                                                 22
+SyS_write                                                39
+SyS_epoll_wait                                           42
+sys_futex                                               177
+SyS_mmap                                                180
+sys_mmap_pgoff                                          181
+sys_munmap                                              817
+^C
+Detaching kernel probes, please wait...
+
+
+To print only the top 5 methods and report times in milliseconds (the default
+is microseconds):
+
+# ucalls -mT 5 $(pidof python)
+Tracing calls in process 26914 (language: python)... Ctrl-C to quit.
+
+METHOD                                              # CALLS
+<stdin>.<module>                                          1
+<stdin>.fibo                                       14190928
+^C
+
+
+USAGE message:
+
+# ./ucalls.py -h
+usage: ucalls.py [-h] [-l {java,perl,php,python,ruby,tcl,none}] [-T TOP] [-L] [-S] [-v]
+                 [-m]
+                 pid [interval]
+
+Summarize method calls in high-level languages.
+
+positional arguments:
+  pid                   process id to attach to
+  interval              print every specified number of seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {java,perl,php,python,ruby,tcl,none}, --language {java,perl,php,python,ruby,tcl,none}
+                        language to trace (if none, trace syscalls only)
+  -T TOP, --top TOP     number of most frequent/slow calls to print
+  -L, --latency         record method latency from enter to exit (except
+                        recursive calls)
+  -S, --syscalls        record syscall latency (adds overhead)
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+  -m, --milliseconds    report times in milliseconds (default is microseconds)
+
+examples:
+    ./ucalls -l java 185        # trace Java calls and print statistics on ^C
+    ./ucalls -l python 2020 1   # trace Python calls and print every second
+    ./ucalls -l java 185 -S     # trace Java calls and syscalls
+    ./ucalls 6712 -S            # trace only syscall counts
+    ./ucalls -l ruby 1344 -T 10 # trace top 10 Ruby method calls
+    ./ucalls -l ruby 1344 -L    # trace Ruby calls including latency
+    ./ucalls -l php 443 -LS     # trace PHP calls and syscalls with latency
+    ./ucalls -l python 2020 -mL # trace Python calls including latency in ms
diff --git a/tools/lib/uflow.py b/tools/lib/uflow.py
new file mode 100755
index 0000000..02cad55
--- /dev/null
+++ b/tools/lib/uflow.py
@@ -0,0 +1,209 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# uflow  Trace method execution flow in high-level languages.
+#        For Linux, uses BCC, eBPF.
+#
+# USAGE: uflow [-C CLASS] [-M METHOD] [-v] {java,perl,php,python,ruby,tcl} pid
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 27-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT, utils
+import ctypes as ct
+import time
+import os
+
+# Runtimes with USDT method-entry/return probes supported by this tool.
+languages = ["java", "perl", "php", "python", "ruby", "tcl"]
+
+examples = """examples:
+    ./uflow -l java 185                # trace Java method calls in process 185
+    ./uflow -l ruby 134                # trace Ruby method calls in process 134
+    ./uflow -M indexOf -l java 185     # trace only 'indexOf'-prefixed methods
+    ./uflow -C '<stdin>' -l python 180 # trace only REPL-defined methods
+"""
+parser = argparse.ArgumentParser(
+    description="Trace method execution flow in high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-l", "--language", choices=languages,
+    help="language to trace")
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("-M", "--method",
+    help="trace only calls to methods starting with this prefix")
+# "class" is a Python keyword, so the value is stored as args.clazz.
+parser.add_argument("-C", "--class", dest="clazz",
+    help="trace only calls to classes starting with this prefix")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+# NOTE(review): usdt is assigned again further below (after enable_probe is
+# defined) before any probes are enabled, so this instance appears to be
+# redundant — confirm and consider removing one of the two assignments.
+usdt = USDT(pid=args.pid)
+
+program = """
+struct call_t {
+    u64 depth;                  // first bit is direction (0 entry, 1 return)
+    u64 pid;                    // (tgid << 32) + pid from bpf_get_current...
+    char clazz[80];
+    char method[80];
+};
+
+BPF_PERF_OUTPUT(calls);
+BPF_HASH(entry, u64, u64);
+"""
+
+prefix_template = """
+static inline bool prefix_%s(char *actual) {
+    char expected[] = "%s";
+    for (int i = 0; i < sizeof(expected) - 1; ++i) {
+        if (expected[i] != actual[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+"""
+
+if args.clazz:
+    program += prefix_template % ("class", args.clazz)
+if args.method:
+    program += prefix_template % ("method", args.method)
+
+trace_template = """
+int NAME(struct pt_regs *ctx) {
+    u64 *depth, zero = 0, clazz = 0, method = 0 ;
+    struct call_t data = {};
+
+    READ_CLASS
+    READ_METHOD
+    bpf_probe_read(&data.clazz, sizeof(data.clazz), (void *)clazz);
+    bpf_probe_read(&data.method, sizeof(data.method), (void *)method);
+
+    FILTER_CLASS
+    FILTER_METHOD
+
+    data.pid = bpf_get_current_pid_tgid();
+    depth = entry.lookup_or_init(&data.pid, &zero);
+    data.depth = DEPTH;
+    UPDATE
+
+    calls.perf_submit(ctx, &data, sizeof(data));
+    return 0;
+}
+"""
+
+def enable_probe(probe_name, func_name, read_class, read_method, is_return):
+    """Generate one BPF handler from trace_template and enable its probe.
+
+    probe_name  -- USDT probe to bind (e.g. "method__entry")
+    func_name   -- name given to the generated C function
+    read_class  -- C snippet that loads the class/file pointer into `clazz`
+    read_method -- C snippet that loads the method pointer into `method`
+    is_return   -- True for exit probes (depth decremented, direction bit set)
+
+    Appends the generated C to the global `program` and registers the
+    handler with the global `usdt` context (bails out on failure).
+    """
+    global program, trace_template, usdt
+    # Entry events report depth+1; return events tag bit 63 as the
+    # direction flag consumed by print_event.
+    depth = "*depth + 1" if not is_return else "*depth | (1ULL << 63)"
+    # Returns guard against underflow (an entry may have been missed).
+    update = "++(*depth);" if not is_return else "if (*depth) --(*depth);"
+    filter_class = "if (!prefix_class(data.clazz)) { return 0; }" \
+                   if args.clazz else ""
+    filter_method = "if (!prefix_method(data.method)) { return 0; }" \
+                   if args.method else ""
+    program += trace_template.replace("NAME", func_name)                \
+                             .replace("READ_CLASS", read_class)         \
+                             .replace("READ_METHOD", read_method)       \
+                             .replace("FILTER_CLASS", filter_class)     \
+                             .replace("FILTER_METHOD", filter_method)   \
+                             .replace("DEPTH", depth)                   \
+                             .replace("UPDATE", update)
+    usdt.enable_probe_or_bail(probe_name, func_name)
+
+# NOTE(review): this re-creates the USDT context built right after argument
+# parsing above; only this instance has probes enabled on it — the earlier
+# one looks redundant. Confirm and deduplicate.
+usdt = USDT(pid=args.pid)
+
+# Fall back to auto-detection when -l was not given.
+language = args.language
+if not language:
+    language = utils.detect_language(languages, args.pid)
+
+# Per-runtime USDT probe names and readarg indices for the class/method
+# pointers. (For Python the "class" argument actually carries the filename;
+# Tcl exposes no class/file argument at all.)
+if language == "java":
+    enable_probe("method__entry", "java_entry",
+                 "bpf_usdt_readarg(2, ctx, &clazz);",
+                 "bpf_usdt_readarg(4, ctx, &method);", is_return=False)
+    enable_probe("method__return", "java_return",
+                 "bpf_usdt_readarg(2, ctx, &clazz);",
+                 "bpf_usdt_readarg(4, ctx, &method);", is_return=True)
+elif language == "perl":
+    enable_probe("sub__entry", "perl_entry",
+                 "bpf_usdt_readarg(2, ctx, &clazz);",
+                 "bpf_usdt_readarg(1, ctx, &method);", is_return=False)
+    enable_probe("sub__return", "perl_return",
+                 "bpf_usdt_readarg(2, ctx, &clazz);",
+                 "bpf_usdt_readarg(1, ctx, &method);", is_return=True)
+elif language == "php":
+    enable_probe("function__entry", "php_entry",
+                 "bpf_usdt_readarg(4, ctx, &clazz);",
+                 "bpf_usdt_readarg(1, ctx, &method);", is_return=False)
+    enable_probe("function__return", "php_return",
+                 "bpf_usdt_readarg(4, ctx, &clazz);",
+                 "bpf_usdt_readarg(1, ctx, &method);", is_return=True)
+elif language == "python":
+    enable_probe("function__entry", "python_entry",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",   # filename really
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=False)
+    enable_probe("function__return", "python_return",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",   # filename really
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=True)
+elif language == "ruby":
+    # Ruby distinguishes Ruby-level methods from C-implemented ones
+    # (cmethod); trace both pairs.
+    enable_probe("method__entry", "ruby_entry",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=False)
+    enable_probe("method__return", "ruby_return",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=True)
+    enable_probe("cmethod__entry", "ruby_centry",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=False)
+    enable_probe("cmethod__return", "ruby_creturn",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=True)
+elif language == "tcl":
+    enable_probe("proc__args", "tcl_entry",
+                 "",  # no class/file info available
+                 "bpf_usdt_readarg(1, ctx, &method);", is_return=False)
+    enable_probe("proc__return", "tcl_return",
+                 "",  # no class/file info available
+                 "bpf_usdt_readarg(1, ctx, &method);", is_return=True)
+else:
+    print("No language detected; use -l to trace a language.")
+    exit(1)
+
+# Debug output: -v prints the USDT text and program; --ebpf prints the
+# program and exits without attaching.
+if args.ebpf or args.verbose:
+    if args.verbose:
+        print(usdt.get_text())
+    print(program)
+    if args.ebpf:
+        exit()
+
+bpf = BPF(text=program, usdt_contexts=[usdt])
+print("Tracing method calls in %s process %d... Ctrl-C to quit." %
+      (language, args.pid))
+print("%-3s %-6s %-6s %-8s %s" % ("CPU", "PID", "TID", "TIME(us)", "METHOD"))
+
+class CallEvent(ct.Structure):
+    """ctypes mirror of `struct call_t` emitted by the BPF program."""
+    _fields_ = [
+        ("depth", ct.c_ulonglong),   # bit 63 = direction (1 = return)
+        ("pid", ct.c_ulonglong),     # (tgid << 32) + pid
+        ("clazz", ct.c_char * 80),
+        ("method", ct.c_char * 80)
+        ]
+
+start_ts = time.time()
+
+def print_event(cpu, data, size):
+    """Perf-buffer callback: print one call event as an indented flow line."""
+    event = ct.cast(data, ct.POINTER(CallEvent)).contents
+    # Bit 63 of depth is the direction flag; mask it off to get the
+    # actual nesting level, then render "<- " for returns, "-> " for entries.
+    depth = event.depth & (~(1 << 63))
+    direction = "<- " if event.depth & (1 << 63) else "-> "
+    print("%-3d %-6d %-6d %-8.3f %-40s" % (cpu, event.pid >> 32,
+        event.pid & 0xFFFFFFFF, time.time() - start_ts,
+        ("  " * (depth - 1)) + direction + \
+            event.clazz.decode('utf-8', 'replace') + "." + \
+            event.method.decode('utf-8', 'replace')))
+
+bpf["calls"].open_perf_buffer(print_event)
+while 1:
+    bpf.perf_buffer_poll()
diff --git a/tools/lib/uflow_example.txt b/tools/lib/uflow_example.txt
new file mode 100644
index 0000000..5dccb8f
--- /dev/null
+++ b/tools/lib/uflow_example.txt
@@ -0,0 +1,114 @@
+Demonstrations of uflow.
+
+
+uflow traces method entry and exit events and prints a visual flow graph that
+shows how methods are entered and exited, similar to a tracing debugger with
+breakpoints. This can be useful for understanding program flow in high-level
+languages such as Java, Perl, PHP, Python, Ruby, and Tcl which provide USDT
+probes for method invocations.
+
+
+For example, trace all Ruby method calls in a specific process:
+
+# ./uflow -l ruby 27245
+Tracing method calls in ruby process 27245... Ctrl-C to quit.
+CPU PID    TID    TIME(us) METHOD
+3   27245  27245  4.536    <- IO.gets                              
+3   27245  27245  4.536    <- IRB::StdioInputMethod.gets           
+3   27245  27245  4.536    -> IRB::Context.verbose?                
+3   27245  27245  4.536      -> NilClass.nil?                      
+3   27245  27245  4.536      <- NilClass.nil?                      
+3   27245  27245  4.536      -> IO.tty?                            
+3   27245  27245  4.536      <- IO.tty?                            
+3   27245  27245  4.536      -> Kernel.kind_of?                    
+3   27245  27245  4.536      <- Kernel.kind_of?                    
+3   27245  27245  4.536    <- IRB::Context.verbose?                
+3   27245  27245  4.536    <- IRB::Irb.signal_status               
+3   27245  27245  4.536    -> String.chars                         
+3   27245  27245  4.536    <- String.chars                         
+^C
+
+In the preceding output, indentation indicates the depth of the flow graph,
+and the <- and -> arrows indicate the direction of the event (exit or entry).
+
+Often, the amount of output can be overwhelming. You can filter specific 
+classes or methods. For example, trace only methods from the Thread class:
+
+# ./uflow -C java/lang/Thread $(pidof java)
+Tracing method calls in java process 27722... Ctrl-C to quit.
+CPU PID    TID    TIME(us) METHOD
+3   27722  27731  3.144    -> java/lang/Thread.<init>              
+3   27722  27731  3.144      -> java/lang/Thread.init              
+3   27722  27731  3.144        -> java/lang/Thread.init            
+3   27722  27731  3.144          -> java/lang/Thread.currentThread 
+3   27722  27731  3.144          <- java/lang/Thread.currentThread 
+3   27722  27731  3.144          -> java/lang/Thread.getThreadGroup
+3   27722  27731  3.144          <- java/lang/Thread.getThreadGroup
+3   27722  27731  3.144          -> java/lang/ThreadGroup.checkAccess
+3   27722  27731  3.144          <- java/lang/ThreadGroup.checkAccess
+3   27722  27731  3.144          -> java/lang/ThreadGroup.addUnstarted
+3   27722  27731  3.144          <- java/lang/ThreadGroup.addUnstarted
+3   27722  27731  3.145          -> java/lang/Thread.isDaemon     
+3   27722  27731  3.145          <- java/lang/Thread.isDaemon     
+3   27722  27731  3.145          -> java/lang/Thread.getPriority   
+3   27722  27731  3.145          <- java/lang/Thread.getPriority   
+3   27722  27731  3.145          -> java/lang/Thread.getContextClassLoader
+3   27722  27731  3.145          <- java/lang/Thread.getContextClassLoader
+3   27722  27731  3.145          -> java/lang/Thread.setPriority   
+3   27722  27731  3.145            -> java/lang/Thread.checkAccess 
+3   27722  27731  3.145            <- java/lang/Thread.checkAccess 
+3   27722  27731  3.145            -> java/lang/Thread.getThreadGroup
+3   27722  27731  3.145            <- java/lang/Thread.getThreadGroup
+3   27722  27731  3.145            -> java/lang/ThreadGroup.getMaxPriority
+3   27722  27731  3.145            <- java/lang/ThreadGroup.getMaxPriority
+3   27722  27731  3.145            -> java/lang/Thread.setPriority0
+3   27722  27731  3.145            <- java/lang/Thread.setPriority0
+3   27722  27731  3.145          <- java/lang/Thread.setPriority   
+3   27722  27731  3.145          -> java/lang/Thread.nextThreadID  
+3   27722  27731  3.145          <- java/lang/Thread.nextThreadID  
+3   27722  27731  3.145        <- java/lang/Thread.init            
+3   27722  27731  3.145      <- java/lang/Thread.init              
+3   27722  27731  3.145    <- java/lang/Thread.<init>              
+3   27722  27731  3.145    -> java/lang/Thread.start               
+3   27722  27731  3.145      -> java/lang/ThreadGroup.add          
+3   27722  27731  3.145      <- java/lang/ThreadGroup.add          
+3   27722  27731  3.145      -> java/lang/Thread.start0            
+3   27722  27731  3.145      <- java/lang/Thread.start0            
+3   27722  27731  3.146    <- java/lang/Thread.start               
+2   27722  27742  3.146    -> java/lang/Thread.run                 
+^C
+
+The reason that the CPU number is printed in the first column is that events
+from different threads can be reordered when running on different CPUs, and
+produce nonsensical output. By looking for changes in the CPU column, you can
+easily see if the events you're following make sense and belong to the same
+thread running on the same CPU.
+
+
+USAGE message:
+
+# ./uflow -h
+usage: uflow.py [-h] [-l {java,perl,php,python,ruby,tcl}] [-M METHOD] [-C CLAZZ] [-v]
+                pid
+
+Trace method execution flow in high-level languages.
+
+positional arguments:
+  pid                   process id to attach to
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {java,perl,php,python,ruby,tcl}, --language {java,perl,php,python,ruby,tcl}
+                        language to trace
+  -M METHOD, --method METHOD
+                        trace only calls to methods starting with this prefix
+  -C CLAZZ, --class CLAZZ
+                        trace only calls to classes starting with this prefix
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+
+examples:
+    ./uflow -l java 185                # trace Java method calls in process 185
+    ./uflow -l ruby 134                # trace Ruby method calls in process 134
+    ./uflow -M indexOf -l java 185     # trace only 'indexOf'-prefixed methods
+    ./uflow -C '<stdin>' -l python 180 # trace only REPL-defined methods
diff --git a/tools/lib/ugc.py b/tools/lib/ugc.py
new file mode 100755
index 0000000..8288910
--- /dev/null
+++ b/tools/lib/ugc.py
@@ -0,0 +1,247 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ugc  Summarize garbage collection events in high-level languages.
+#      For Linux, uses BCC, eBPF.
+#
+# USAGE: ugc [-v] [-m] [-M MSEC] [-F FILTER] {java,node,python,ruby} pid
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 19-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT, utils
+import ctypes as ct
+import time
+import os
+
+# Runtimes with USDT GC begin/end probes supported by this tool.
+languages = ["java", "node", "python", "ruby"]
+
+examples = """examples:
+    ./ugc -l java 185        # trace Java GCs in process 185
+    ./ugc -l ruby 1344 -m    # trace Ruby GCs reporting in ms
+    ./ugc -M 10 -l java 185  # trace only Java GCs longer than 10ms
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize garbage collection events in high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-l", "--language", choices=languages,
+    help="language to trace")
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="report times in milliseconds (default is microseconds)")
+# -M filters in-kernel (see Probe.generate), -F filters in userspace.
+parser.add_argument("-M", "--minimum", type=int, default=0,
+    help="display only GCs longer than this many milliseconds")
+parser.add_argument("-F", "--filter", type=str,
+    help="display only GCs whose description contains this text")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+usdt = USDT(pid=args.pid)
+
+# Skeleton BPF program. gc_event_t is language-agnostic: field1..field4 and
+# string1/string2 are filled by per-runtime snippets and interpreted by the
+# matching Python formatter below.
+program = """
+struct gc_event_t {
+    u64 probe_index;
+    u64 elapsed_ns;
+    u64 field1;
+    u64 field2;
+    u64 field3;
+    u64 field4;
+    char string1[32];
+    char string2[32];
+};
+struct entry_t {
+    u64 start_ns;
+    u64 field1;
+    u64 field2;
+};
+
+BPF_PERF_OUTPUT(gcs);
+BPF_HASH(entry, u64, struct entry_t);
+"""
+
+class Probe(object):
+    """A begin/end USDT probe pair with C snippets to capture GC details.
+
+    begin/end        -- USDT probe names (e.g. "gc__start"/"gc__done")
+    begin_save       -- C saved into struct entry_t `e` at GC start
+    end_save         -- C copied into struct gc_event_t `event` at GC end
+    formatter        -- callable turning a GCEvent into a description string
+    """
+    def __init__(self, begin, end, begin_save, end_save, formatter):
+        self.begin = begin
+        self.end = end
+        self.begin_save = begin_save
+        self.end_save = end_save
+        self.formatter = formatter
+
+    def generate(self):
+        """Return the C code for this pair's entry and exit handlers.
+
+        The exit handler drops events shorter than -M/--minimum ms
+        (the %d below is args.minimum converted to nanoseconds).
+        NOTE(review): gc_event_t.probe_index is never assigned in this
+        generated code, so it is always 0 in userspace; when several Probe
+        pairs are registered, print_event will always pick probes[0]'s
+        formatter — confirm and fix if distinct descriptions are intended.
+        """
+        text = """
+int trace_%s(struct pt_regs *ctx) {
+    u64 pid = bpf_get_current_pid_tgid();
+    struct entry_t e = {};
+    e.start_ns = bpf_ktime_get_ns();
+    %s
+    entry.update(&pid, &e);
+    return 0;
+}
+int trace_%s(struct pt_regs *ctx) {
+    u64 elapsed;
+    struct entry_t *e;
+    struct gc_event_t event = {};
+    u64 pid = bpf_get_current_pid_tgid();
+    e = entry.lookup(&pid);
+    if (!e) {
+        return 0;   // missed the entry event on this thread
+    }
+    elapsed = bpf_ktime_get_ns() - e->start_ns;
+    if (elapsed < %d) {
+        return 0;
+    }
+    event.elapsed_ns = elapsed;
+    %s
+    gcs.perf_submit(ctx, &event, sizeof(event));
+    return 0;
+}
+        """ % (self.begin, self.begin_save, self.end,
+               args.minimum * 1000000, self.end_save)
+        return text
+
+    def attach(self):
+        """Enable both USDT probes, bailing out if either is unavailable."""
+        usdt.enable_probe_or_bail(self.begin, "trace_%s" % self.begin)
+        usdt.enable_probe_or_bail(self.end, "trace_%s" % self.end)
+
+    def format(self, data):
+        """Render an event's description via this probe's formatter."""
+        return self.formatter(data)
+
+probes = []
+
+# Fall back to auto-detection when -l was not given.
+language = args.language
+if not language:
+    language = utils.detect_language(languages, args.pid)
+
+#
+# Java
+#
+if language == "java":
+    # Oddly, the gc__begin/gc__end probes don't really have any useful
+    # information, while the mem__pool* ones do. There's also a bunch of
+    # probes described in the hotspot_gc*.stp file which aren't there
+    # when looking at a live Java process.
+    begin_save = """
+    bpf_usdt_readarg(6, ctx, &e.field1);    // used bytes
+    bpf_usdt_readarg(8, ctx, &e.field2);    // max bytes
+    """
+    end_save = """
+    event.field1 = e->field1;                  // used bytes at start
+    event.field2 = e->field2;                  // max bytes at start
+    bpf_usdt_readarg(6, ctx, &event.field3);   // used bytes at end
+    bpf_usdt_readarg(8, ctx, &event.field4);   // max bytes at end
+    u64 manager = 0, pool = 0;
+    bpf_usdt_readarg(1, ctx, &manager);        // ptr to manager name
+    bpf_usdt_readarg(3, ctx, &pool);           // ptr to pool name
+    bpf_probe_read(&event.string1, sizeof(event.string1), (void *)manager);
+    bpf_probe_read(&event.string2, sizeof(event.string2), (void *)pool);
+    """
+
+    def formatter(e):
+        "%s %s used=%d->%d max=%d->%d" % \
+            (e.string1, e.string2, e.field1, e.field3, e.field2, e.field4)
+    probes.append(Probe("mem__pool__gc__begin", "mem__pool__gc__end",
+                        begin_save, end_save, formatter))
+    probes.append(Probe("gc__begin", "gc__end",
+                        "", "", lambda _: "no additional info available"))
+#
+# Node
+#
+elif language == "node":
+    end_save = """
+    u32 gc_type = 0;
+    bpf_usdt_readarg(1, ctx, &gc_type);
+    event.field1 = gc_type;
+    """
+    # gc_type is a bitmask; the formatter joins the matching descriptions.
+    descs = {"GC scavenge": 1, "GC mark-sweep-compact": 2,
+             "GC incremental mark": 4, "GC weak callbacks": 8}
+    probes.append(Probe("gc__start", "gc__done", "", end_save,
+                  lambda e: str.join(", ",
+                                     [desc for desc, val in descs.items()
+                                      if e.field1 & val != 0])))
+#
+# Python
+#
+elif language == "python":
+    # NOTE(review): bpf_usdt_readarg writes a u64 through the pointer; the
+    # `int gen` / `u32 gc_type` locals in these snippets are narrower than
+    # that on 64-bit ABIs — confirm this doesn't clobber adjacent stack.
+    begin_save = """
+    int gen = 0;
+    bpf_usdt_readarg(1, ctx, &gen);
+    e.field1 = gen;
+    """
+    end_save = """
+    long objs = 0;
+    bpf_usdt_readarg(1, ctx, &objs);
+    event.field1 = e->field1;
+    event.field2 = objs;
+    """
+
+    def formatter(event):
+        "gen %d GC collected %d objects" % \
+            (event.field1, event.field2)
+    probes.append(Probe("gc__start", "gc__done",
+                        begin_save, end_save, formatter))
+#
+# Ruby
+#
+elif language == "ruby":
+    # Ruby GC probes do not have any additional information available.
+    probes.append(Probe("gc__mark__begin", "gc__mark__end",
+                        "", "", lambda _: "GC mark stage"))
+    probes.append(Probe("gc__sweep__begin", "gc__sweep__end",
+                        "", "", lambda _: "GC sweep stage"))
+
+else:
+    print("No language detected; use -l to trace a language.")
+    exit(1)
+
+
+# Generate the C for each probe pair and enable its USDT probes.
+for probe in probes:
+    program += probe.generate()
+    probe.attach()
+
+# Debug output: -v prints the USDT text and program; --ebpf prints the
+# program and exits without attaching.
+if args.ebpf or args.verbose:
+    if args.verbose:
+        print(usdt.get_text())
+    print(program)
+    if args.ebpf:
+        exit()
+
+bpf = BPF(text=program, usdt_contexts=[usdt])
+print("Tracing garbage collections in %s process %d... Ctrl-C to quit." %
+      (language, args.pid))
+time_col = "TIME (ms)" if args.milliseconds else "TIME (us)"
+print("%-8s %-8s %-40s" % ("START", time_col, "DESCRIPTION"))
+
+class GCEvent(ct.Structure):
+    """ctypes mirror of `struct gc_event_t` emitted by the BPF program."""
+    _fields_ = [
+        ("probe_index", ct.c_ulonglong),
+        ("elapsed_ns", ct.c_ulonglong),
+        ("field1", ct.c_ulonglong),   # meaning depends on the runtime
+        ("field2", ct.c_ulonglong),
+        ("field3", ct.c_ulonglong),
+        ("field4", ct.c_ulonglong),
+        ("string1", ct.c_char * 32),
+        ("string2", ct.c_char * 32)
+        ]
+
+start_ts = time.time()
+
+def print_event(cpu, data, size):
+    """Perf-buffer callback: print one GC event, honoring -m and -F."""
+    event = ct.cast(data, ct.POINTER(GCEvent)).contents
+    elapsed = event.elapsed_ns / 1000000 if args.milliseconds else \
+              event.elapsed_ns / 1000
+    # NOTE(review): probe_index is never set by the generated C (always 0),
+    # so this always selects probes[0]'s formatter — confirm.
+    description = probes[event.probe_index].format(event)
+    if args.filter and args.filter not in description:
+        return
+    print("%-8.3f %-8.2f %s" % (time.time() - start_ts, elapsed, description))
+
+bpf["gcs"].open_perf_buffer(print_event)
+while 1:
+    bpf.perf_buffer_poll()
diff --git a/tools/lib/ugc_example.txt b/tools/lib/ugc_example.txt
new file mode 100644
index 0000000..083cdb6
--- /dev/null
+++ b/tools/lib/ugc_example.txt
@@ -0,0 +1,95 @@
+Demonstrations of ugc.
+
+
+ugc traces garbage collection events in high-level languages, including Java,
+Python, Ruby, and Node. Each GC event is printed with some additional 
+information provided by that language's runtime, if available. The duration of
+the GC event is also provided.
+
+For example, to trace all garbage collection events in a specific Node process:
+
+# ugc $(pidof node)
+Tracing garbage collections in node process 30012... Ctrl-C to quit.
+START    TIME (us) DESCRIPTION                             
+1.500    1181.00  GC scavenge
+1.505    1704.00  GC scavenge
+1.509    1534.00  GC scavenge
+1.515    1953.00  GC scavenge
+1.519    2155.00  GC scavenge
+1.525    2055.00  GC scavenge
+1.530    2164.00  GC scavenge
+1.536    2170.00  GC scavenge
+1.541    2237.00  GC scavenge
+1.547    1982.00  GC scavenge
+1.551    2333.00  GC scavenge
+1.557    2043.00  GC scavenge
+1.561    2028.00  GC scavenge
+1.573    3650.00  GC scavenge
+1.580    4443.00  GC scavenge
+1.604    6236.00  GC scavenge
+1.615    8324.00  GC scavenge
+1.659    11249.00 GC scavenge
+1.678    16084.00 GC scavenge
+1.747    15250.00 GC scavenge
+1.937    191.00   GC incremental mark
+2.001    63120.00 GC mark-sweep-compact
+3.185    153.00   GC incremental mark
+3.207    20847.00 GC mark-sweep-compact
+^C
+
+The above output shows some fairly long GCs; notably, around 2 seconds in
+there is a collection that takes over 60ms (mark-sweep-compact).
+
+Occasionally, it might be useful to filter out collections that are very short,
+or display only collections that have a specific description. The -M and -F
+switches can be useful for this:
+
+# ugc -F Tenured $(pidof java)
+Tracing garbage collections in java process 29907... Ctrl-C to quit.
+START    TIME (us) DESCRIPTION                             
+0.360    4309.00  MarkSweepCompact Tenured Gen used=287528->287528 max=173408256->173408256
+2.459    4232.00  MarkSweepCompact Tenured Gen used=287528->287528 max=173408256->173408256
+4.648    4139.00  MarkSweepCompact Tenured Gen used=287528->287528 max=173408256->173408256
+^C
+
+# ugc -M 1 $(pidof java)
+Tracing garbage collections in java process 29907... Ctrl-C to quit.
+START    TIME (us) DESCRIPTION                             
+0.160    3715.00  MarkSweepCompact Code Cache used=287528->3209472 max=173408256->251658240
+0.160    3975.00  MarkSweepCompact Metaspace used=287528->3092104 max=173408256->18446744073709551615
+0.160    4058.00  MarkSweepCompact Compressed Class Space used=287528->266840 max=173408256->1073741824
+0.160    4110.00  MarkSweepCompact Eden Space used=287528->0 max=173408256->69337088
+0.160    4159.00  MarkSweepCompact Survivor Space used=287528->0 max=173408256->8650752
+0.160    4207.00  MarkSweepCompact Tenured Gen used=287528->287528 max=173408256->173408256
+0.160    4289.00    used=0->0 max=0->0
+^C
+
+
+USAGE message:
+
+# ugc -h
+usage: ugc.py [-h] [-l {java,python,ruby,node}] [-v] [-m] [-M MINIMUM]
+              [-F FILTER]
+              pid
+
+Summarize garbage collection events in high-level languages.
+
+positional arguments:
+  pid                   process id to attach to
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {java,python,ruby,node}, --language {java,python,ruby,node}
+                        language to trace
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+  -m, --milliseconds    report times in milliseconds (default is microseconds)
+  -M MINIMUM, --minimum MINIMUM
+                        display only GCs longer than this many milliseconds
+  -F FILTER, --filter FILTER
+                        display only GCs whose description contains this text
+
+examples:
+    ./ugc -l java 185        # trace Java GCs in process 185
+    ./ugc -l ruby 1344 -m    # trace Ruby GCs reporting in ms
+    ./ugc -M 10 -l java 185  # trace only Java GCs longer than 10ms
diff --git a/tools/lib/uobjnew.py b/tools/lib/uobjnew.py
new file mode 100755
index 0000000..85f5768
--- /dev/null
+++ b/tools/lib/uobjnew.py
@@ -0,0 +1,200 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# uobjnew  Summarize object allocations in high-level languages.
+#          For Linux, uses BCC, eBPF.
+#
+# USAGE: uobjnew [-h] [-T TOP] [-v] {c,java,ruby,tcl} pid [interval]
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 25-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT, utils
+from time import sleep
+import os
+
+# C needs to be the last language.
+languages = ["c", "java", "ruby", "tcl"]
+
+examples = """examples:
+    ./uobjnew -l java 145         # summarize Java allocations in process 145
+    ./uobjnew -l c 2020 1         # grab malloc() sizes and print every second
+    ./uobjnew -l ruby 6712 -C 10  # top 10 Ruby types by number of allocations
+    ./uobjnew -l ruby 6712 -S 10  # top 10 Ruby types by total size
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize object allocations in high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-l", "--language", choices=languages,
+    help="language to trace")
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("interval", type=int, nargs='?',
+    help="print every specified number of seconds")
+parser.add_argument("-C", "--top-count", type=int,
+    help="number of most frequently allocated types to print")
+parser.add_argument("-S", "--top-size", type=int,
+    help="number of largest types by allocated bytes to print")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+language = args.language
+if not language:
+    language = utils.detect_language(languages, args.pid)
+
+program = """
+#include <linux/ptrace.h>
+
+struct key_t {
+#if MALLOC_TRACING
+    u64 size;
+#else
+    char name[50];
+#endif
+};
+
+struct val_t {
+    u64 total_size;
+    u64 num_allocs;
+};
+
+BPF_HASH(allocs, struct key_t, struct val_t);
+""".replace("MALLOC_TRACING", "1" if language == "c" else "0")
+
+usdt = USDT(pid=args.pid)
+
+#
+# C
+#
+if language == "c":
+    program += """
+int alloc_entry(struct pt_regs *ctx, size_t size) {
+    struct key_t key = {};
+    struct val_t *valp, zero = {};
+    key.size = size;
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->total_size += size;
+    valp->num_allocs += 1;
+    return 0;
+}
+    """
+#
+# Java
+#
+elif language == "java":
+    program += """
+int alloc_entry(struct pt_regs *ctx) {
+    struct key_t key = {};
+    struct val_t *valp, zero = {};
+    u64 classptr = 0, size = 0;
+    bpf_usdt_readarg(2, ctx, &classptr);
+    bpf_usdt_readarg(4, ctx, &size);
+    bpf_probe_read(&key.name, sizeof(key.name), (void *)classptr);
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->total_size += size;
+    valp->num_allocs += 1;
+    return 0;
+}
+    """
+    usdt.enable_probe_or_bail("object__alloc", "alloc_entry")
+#
+# Ruby
+#
+elif language == "ruby":
+    create_template = """
+int THETHING_alloc_entry(struct pt_regs *ctx) {
+    struct key_t key = { .name = "THETHING" };
+    struct val_t *valp, zero = {};
+    u64 size = 0;
+    bpf_usdt_readarg(1, ctx, &size);
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->total_size += size;
+    valp->num_allocs += 1;
+    return 0;
+}
+    """
+    program += """
+int object_alloc_entry(struct pt_regs *ctx) {
+    struct key_t key = {};
+    struct val_t *valp, zero = {};
+    u64 classptr = 0;
+    bpf_usdt_readarg(1, ctx, &classptr);
+    bpf_probe_read(&key.name, sizeof(key.name), (void *)classptr);
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->num_allocs += 1;  // We don't know the size, unfortunately
+    return 0;
+}
+    """
+    usdt.enable_probe_or_bail("object__create", "object_alloc_entry")
+    for thing in ["string", "hash", "array"]:
+        program += create_template.replace("THETHING", thing)
+        usdt.enable_probe_or_bail("%s__create" % thing,
+                                  "%s_alloc_entry" % thing)
+#
+# Tcl
+#
+elif language == "tcl":
+    program += """
+int alloc_entry(struct pt_regs *ctx) {
+    struct key_t key = { .name = "<ALL>" };
+    struct val_t *valp, zero = {};
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->num_allocs += 1;
+    return 0;
+}
+    """
+    usdt.enable_probe_or_bail("obj__create", "alloc_entry")
+else:
+    print("No language detected; use -l to trace a language.")
+    exit(1)
+
+
+if args.ebpf or args.verbose:
+    if args.verbose:
+        print(usdt.get_text())
+    print(program)
+    if args.ebpf:
+        exit()
+
+bpf = BPF(text=program, usdt_contexts=[usdt])
+if language == "c":
+    bpf.attach_uprobe(name="c", sym="malloc", fn_name="alloc_entry",
+                      pid=args.pid)
+
+exit_signaled = False
+print("Tracing allocations in process %d (language: %s)... Ctrl-C to quit." %
+      (args.pid, language or "none"))
+while True:
+    try:
+        sleep(args.interval or 99999999)
+    except KeyboardInterrupt:
+        exit_signaled = True
+    print()
+    data = bpf["allocs"]
+    if args.top_count:
+        data = sorted(data.items(), key=lambda kv: kv[1].num_allocs)
+        data = data[-args.top_count:]
+    elif args.top_size:
+        data = sorted(data.items(), key=lambda kv: kv[1].total_size)
+        data = data[-args.top_size:]
+    else:
+        data = sorted(data.items(), key=lambda kv: kv[1].total_size)
+    print("%-30s %8s %12s" % ("NAME/TYPE", "# ALLOCS", "# BYTES"))
+    for key, value in data:
+        if language == "c":
+            obj_type = "block size %d" % key.size
+        else:
+            obj_type = key.name
+        print("%-30s %8d %12d" %
+              (obj_type, value.num_allocs, value.total_size))
+    if args.interval and not exit_signaled:
+        bpf["allocs"].clear()
+    else:
+        exit()
diff --git a/tools/lib/uobjnew_example.txt b/tools/lib/uobjnew_example.txt
new file mode 100644
index 0000000..fcb2d21
--- /dev/null
+++ b/tools/lib/uobjnew_example.txt
@@ -0,0 +1,75 @@
+Demonstrations of uobjnew.
+
+
+uobjnew summarizes new object allocation events and prints out statistics on
+which object type has been allocated frequently, and how many bytes of that
+type have been allocated. This helps diagnose common allocation paths, which
+can in turn cause heavy garbage collection.
+
+For example, trace Ruby object allocations when running some simple commands
+in irb (the Ruby REPL):
+
+# ./uobjnew -l ruby 27245
+Tracing allocations in process 27245 (language: ruby)... Ctrl-C to quit.
+
+NAME/TYPE                      # ALLOCS      # BYTES
+NameError                             1            0
+RubyToken::TkSPACE                    1            0
+RubyToken::TkSTRING                   1            0
+String                                7            0
+RubyToken::TkNL                       2            0
+RubyToken::TkIDENTIFIER               2            0
+array                                55          129
+string                              344         1348
+^C
+
+
+Plain C/C++ allocations (through "malloc") are also supported. We can't report
+the type being allocated, but we can report the object sizes at least. Also,
+print only the top 10 rows by number of bytes allocated:
+
+# ./uobjnew -S 10 -l c 27245
+Tracing allocations in process 27245 (language: c)... Ctrl-C to quit.
+
+NAME/TYPE                      # ALLOCS      # BYTES
+block size 64                        22         1408
+block size 992                        2         1984
+block size 32                        68         2176
+block size 48                        48         2304
+block size 944                        4         3776
+block size 1104                       4         4416
+block size 160                       32         5120
+block size 535                       15         8025
+block size 128                      112        14336
+block size 80                       569        45520
+^C
+
+
+USAGE message:
+
+# ./uobjnew -h
+usage: uobjnew.py [-h] [-l {c,java,ruby,tcl}] [-C TOP_COUNT] [-S TOP_SIZE] [-v]
+                  pid [interval]
+
+Summarize object allocations in high-level languages.
+
+positional arguments:
+  pid                   process id to attach to
+  interval              print every specified number of seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {c,java,ruby,tcl}, --language {c,java,ruby,tcl}
+                        language to trace
+  -C TOP_COUNT, --top-count TOP_COUNT
+                        number of most frequently allocated types to print
+  -S TOP_SIZE, --top-size TOP_SIZE
+                        number of largest types by allocated bytes to print
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+
+examples:
+    ./uobjnew -l java 145         # summarize Java allocations in process 145
+    ./uobjnew -l c 2020 1         # grab malloc() sizes and print every second
+    ./uobjnew -l ruby 6712 -C 10  # top 10 Ruby types by number of allocations
+    ./uobjnew -l ruby 6712 -S 10  # top 10 Ruby types by total size
diff --git a/tools/lib/ustat.py b/tools/lib/ustat.py
new file mode 100755
index 0000000..3661a14
--- /dev/null
+++ b/tools/lib/ustat.py
@@ -0,0 +1,296 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ustat  Activity stats from high-level languages, including exceptions,
+#        method calls, class loads, garbage collections, and more.
+#        For Linux, uses BCC, eBPF.
+#
+# USAGE: ustat [-l {java,node,perl,php,python,ruby,tcl}] [-C]
+#        [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d]
+#        [interval [count]]
+#
+# This uses in-kernel eBPF maps to store per process summaries for efficiency.
+# Newly-created processes might only be traced at the next interval, if the
+# relevant USDT probe requires enabling through a semaphore.
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 26-Oct-2016   Sasha Goldshtein    Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT
+import os
+from subprocess import call
+from time import sleep, strftime
+
+class Category(object):
+    THREAD = "THREAD"
+    METHOD = "METHOD"
+    OBJNEW = "OBJNEW"
+    CLOAD = "CLOAD"
+    EXCP = "EXCP"
+    GC = "GC"
+
+class Probe(object):
+    def __init__(self, language, procnames, events):
+        """
+        Initialize a new probe object with a specific language, set of process
+        names to monitor for that language, and a dictionary of events and
+        categories. The dictionary is a mapping of USDT probe names (such as
+        'gc__start') to event categories supported by this tool -- from the
+        Category class.
+        """
+        self.language = language
+        self.procnames = procnames
+        self.events = events
+
+    def _find_targets(self):
+        """Find pids where the comm is one of the specified list"""
+        self.targets = {}
+        all_pids = [int(pid) for pid in os.listdir('/proc') if pid.isdigit()]
+        for pid in all_pids:
+            try:
+                comm = open('/proc/%d/comm' % pid).read().strip()
+                if comm in self.procnames:
+                    cmdline = open('/proc/%d/cmdline' % pid).read()
+                    self.targets[pid] = cmdline.replace('\0', ' ')
+            except IOError:
+                continue    # process may already have terminated
+
+    def _enable_probes(self):
+        self.usdts = []
+        for pid in self.targets:
+            usdt = USDT(pid=pid)
+            for event in self.events:
+                try:
+                    usdt.enable_probe(event, "%s_%s" % (self.language, event))
+                except Exception:
+                    # This process might not have a recent version of the USDT
+                    # probes enabled, or might have been compiled without USDT
+                    # probes at all. The process could even have been shut down
+                    # and the pid been recycled. We have to gracefully handle
+                    # the possibility that we can't attach probes to it at all.
+                    pass
+            self.usdts.append(usdt)
+
+    def _generate_tables(self):
+        text = """
+BPF_HASH(%s_%s_counts, u32, u64);   // pid to event count
+        """
+        return str.join('', [text % (self.language, event)
+                             for event in self.events])
+
+    def _generate_functions(self):
+        text = """
+int %s_%s(void *ctx) {
+    u64 *valp, zero = 0;
+    u32 tgid = bpf_get_current_pid_tgid() >> 32;
+    valp = %s_%s_counts.lookup_or_init(&tgid, &zero);
+    ++(*valp);
+    return 0;
+}
+        """
+        lang = self.language
+        return str.join('', [text % (lang, event, lang, event)
+                             for event in self.events])
+
+    def get_program(self):
+        self._find_targets()
+        self._enable_probes()
+        return self._generate_tables() + self._generate_functions()
+
+    def get_usdts(self):
+        return self.usdts
+
+    def get_counts(self, bpf):
+        """Return a map of event counts per process"""
+        event_dict = dict([(category, 0) for category in self.events.values()])
+        result = dict([(pid, event_dict.copy()) for pid in self.targets])
+        for event, category in self.events.items():
+            counts = bpf["%s_%s_counts" % (self.language, event)]
+            for pid, count in counts.items():
+                result[pid.value][category] = count.value
+            counts.clear()
+        return result
+
+    def cleanup(self):
+        self.usdts = None
+
+class Tool(object):
+    def _parse_args(self):
+        examples = """examples:
+  ./ustat              # stats for all languages, 1 second refresh
+  ./ustat -C           # don't clear the screen
+  ./ustat -l java      # Java processes only
+  ./ustat 5            # 5 second summaries
+  ./ustat 5 10         # 5 second summaries, 10 times only
+        """
+        parser = argparse.ArgumentParser(
+            description="Activity stats from high-level languages.",
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            epilog=examples)
+        parser.add_argument("-l", "--language",
+            choices=["java", "node", "perl", "php", "python", "ruby", "tcl"],
+            help="language to trace (default: all languages)")
+        parser.add_argument("-C", "--noclear", action="store_true",
+            help="don't clear the screen")
+        parser.add_argument("-S", "--sort",
+            choices=[cat.lower() for cat in dir(Category) if cat.isupper()],
+            help="sort by this field (descending order)")
+        parser.add_argument("-r", "--maxrows", default=20, type=int,
+            help="maximum rows to print, default 20")
+        parser.add_argument("-d", "--debug", action="store_true",
+            help="Print the resulting BPF program (for debugging purposes)")
+        parser.add_argument("interval", nargs="?", default=1, type=int,
+            help="output interval, in seconds")
+        parser.add_argument("count", nargs="?", default=99999999, type=int,
+            help="number of outputs")
+        parser.add_argument("--ebpf", action="store_true",
+            help=argparse.SUPPRESS)
+        self.args = parser.parse_args()
+
+    def _create_probes(self):
+        probes_by_lang = {
+                "java": Probe("java", ["java"], {
+                    "gc__begin": Category.GC,
+                    "mem__pool__gc__begin": Category.GC,
+                    "thread__start": Category.THREAD,
+                    "class__loaded": Category.CLOAD,
+                    "object__alloc": Category.OBJNEW,
+                    "method__entry": Category.METHOD,
+                    "ExceptionOccurred__entry": Category.EXCP
+                    }),
+                "node": Probe("node", ["node"], {
+                    "gc__start": Category.GC
+                    }),
+                "perl": Probe("perl", ["perl"], {
+                    "sub__entry": Category.METHOD
+                    }),
+                "php": Probe("php", ["php"], {
+                    "function__entry": Category.METHOD,
+                    "compile__file__entry": Category.CLOAD,
+                    "exception__thrown": Category.EXCP
+                    }),
+                "python": Probe("python", ["python"], {
+                    "function__entry": Category.METHOD,
+                    "gc__start": Category.GC
+                    }),
+                "ruby": Probe("ruby", ["ruby", "irb"], {
+                    "method__entry": Category.METHOD,
+                    "cmethod__entry": Category.METHOD,
+                    "gc__mark__begin": Category.GC,
+                    "gc__sweep__begin": Category.GC,
+                    "object__create": Category.OBJNEW,
+                    "hash__create": Category.OBJNEW,
+                    "string__create": Category.OBJNEW,
+                    "array__create": Category.OBJNEW,
+                    "require__entry": Category.CLOAD,
+                    "load__entry": Category.CLOAD,
+                    "raise": Category.EXCP
+                    }),
+                "tcl": Probe("tcl", ["tclsh", "wish"], {
+                    "proc__entry": Category.METHOD,
+                    "obj__create": Category.OBJNEW
+                    }),
+                }
+
+        if self.args.language:
+            self.probes = [probes_by_lang[self.args.language]]
+        else:
+            self.probes = probes_by_lang.values()
+
+    def _attach_probes(self):
+        program = str.join('\n', [p.get_program() for p in self.probes])
+        if self.args.debug or self.args.ebpf:
+            print(program)
+            if self.args.ebpf:
+                exit()
+            for probe in self.probes:
+                print("Attached to %s processes:" % probe.language,
+                        str.join(', ', map(str, probe.targets)))
+        self.bpf = BPF(text=program)
+        usdts = [usdt for probe in self.probes for usdt in probe.get_usdts()]
+        # Filter out duplicates when we have multiple processes with the same
+        # uprobe. We are attaching to these probes manually instead of using
+        # the USDT support from the bcc module, because the USDT class attaches
+        # to each uprobe with a specific pid. When there is more than one
+        # process from some language, we end up attaching more than once to the
+        # same uprobe (albeit with different pids), which is not allowed.
+        # Instead, we use a global attach (with pid=-1).
+        uprobes = set([(path, func, addr) for usdt in usdts
+                       for (path, func, addr, _)
+                       in usdt.enumerate_active_probes()])
+        for (path, func, addr) in uprobes:
+            self.bpf.attach_uprobe(name=path, fn_name=func, addr=addr, pid=-1)
+
+    def _detach_probes(self):
+        for probe in self.probes:
+            probe.cleanup()     # Cleans up USDT contexts
+        self.bpf.cleanup()      # Cleans up all attached probes
+        self.bpf = None
+
+    def _loop_iter(self):
+        self._attach_probes()
+        try:
+            sleep(self.args.interval)
+        except KeyboardInterrupt:
+            self.exiting = True
+
+        if not self.args.noclear:
+            call("clear")
+        else:
+            print()
+        with open("/proc/loadavg") as stats:
+            print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+        print("%-6s %-20s %-10s %-6s %-10s %-8s %-6s %-6s" % (
+            "PID", "CMDLINE", "METHOD/s", "GC/s", "OBJNEW/s",
+            "CLOAD/s", "EXC/s", "THR/s"))
+
+        line = 0
+        counts = {}
+        targets = {}
+        for probe in self.probes:
+            counts.update(probe.get_counts(self.bpf))
+            targets.update(probe.targets)
+        if self.args.sort:
+            sort_field = self.args.sort.upper()
+            counts = sorted(counts.items(),
+                            key=lambda kv: -kv[1].get(sort_field, 0))
+        else:
+            counts = sorted(counts.items(), key=lambda kv: kv[0])
+        for pid, stats in counts:
+            print("%-6d %-20s %-10d %-6d %-10d %-8d %-6d %-6d" % (
+                  pid, targets[pid][:20],
+                  stats.get(Category.METHOD, 0) / self.args.interval,
+                  stats.get(Category.GC, 0) / self.args.interval,
+                  stats.get(Category.OBJNEW, 0) / self.args.interval,
+                  stats.get(Category.CLOAD, 0) / self.args.interval,
+                  stats.get(Category.EXCP, 0) / self.args.interval,
+                  stats.get(Category.THREAD, 0) / self.args.interval
+                  ))
+            line += 1
+            if line >= self.args.maxrows:
+                break
+        self._detach_probes()
+
+    def run(self):
+        self._parse_args()
+        self._create_probes()
+        print('Tracing... Output every %d secs. Hit Ctrl-C to end' %
+              self.args.interval)
+        countdown = self.args.count
+        self.exiting = False
+        while True:
+            self._loop_iter()
+            countdown -= 1
+            if self.exiting or countdown == 0:
+                print("Detaching...")
+                exit()
+
+if __name__ == "__main__":
+    try:
+        Tool().run()
+    except KeyboardInterrupt:
+        pass
diff --git a/tools/lib/ustat_example.txt b/tools/lib/ustat_example.txt
new file mode 100644
index 0000000..11ee2de
--- /dev/null
+++ b/tools/lib/ustat_example.txt
@@ -0,0 +1,78 @@
+Demonstrations of ustat.
+
+
+ustat is a "top"-like tool for monitoring events in high-level languages. It 
+prints statistics about garbage collections, method calls, object allocations,
+and various other events for every process that it recognizes with a Java,
+Node, Perl, PHP, Python, Ruby, and Tcl runtime.
+
+For example:
+
+# ./ustat.py
+Tracing... Output every 10 secs. Hit Ctrl-C to end
+12:17:17 loadavg: 0.33 0.08 0.02 5/211 26284
+
+PID    CMDLINE              METHOD/s   GC/s   OBJNEW/s   CLOAD/s  EXC/s  THR/s 
+3018   node/node            0          3      0          0        0      0     
+^C
+Detaching...
+
+
+If desired, you can instruct ustat to print a certain number of entries and 
+exit, which can be useful to get a quick picture on what's happening on the 
+system over a short time interval. Here, we ask ustat to print 5-second 
+summaries 12 times (for a total time of 1 minute):
+
+# ./ustat.py -C 5 12
+Tracing... Output every 5 secs. Hit Ctrl-C to end
+12:18:26 loadavg: 0.27 0.11 0.04 2/336 26455
+
+PID    CMDLINE              METHOD/s   GC/s   OBJNEW/s   CLOAD/s  EXC/s  THR/s 
+3018   node/node            0          1      0          0        0      0     
+
+12:18:31 loadavg: 0.33 0.12 0.04 2/336 26456
+
+PID    CMDLINE              METHOD/s   GC/s   OBJNEW/s   CLOAD/s  EXC/s  THR/s 
+3018   node/node            0          0      0          0        0      0     
+26439  java -XX:+ExtendedDT 2776045    0      0          0        0      0     
+
+12:18:37 loadavg: 0.38 0.14 0.05 2/336 26457
+
+PID    CMDLINE              METHOD/s   GC/s   OBJNEW/s   CLOAD/s  EXC/s  THR/s 
+3018   node/node            0          0      0          0        0      0     
+26439  java -XX:+ExtendedDT 2804378    0      0          0        0      0     
+
+(...more output omitted for brevity)
+
+
+USAGE message:
+
+# ./ustat.py -h
+usage: ustat.py [-h] [-l {java,node,perl,php,python,ruby,tcl}] [-C]
+                [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d]
+                [interval] [count]
+
+Activity stats from high-level languages.
+
+positional arguments:
+  interval              output interval, in seconds
+  count                 number of outputs
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {java,node,perl,php,python,ruby,tcl}, --language {java,node,perl,php,python,ruby,tcl}
+                        language to trace (default: all languages)
+  -C, --noclear         don't clear the screen
+  -S {cload,excp,gc,method,objnew,thread}, --sort {cload,excp,gc,method,objnew,thread}
+                        sort by this field (descending order)
+  -r MAXROWS, --maxrows MAXROWS
+                        maximum rows to print, default 20
+  -d, --debug           Print the resulting BPF program (for debugging
+                        purposes)
+
+examples:
+  ./ustat              # stats for all languages, 1 second refresh
+  ./ustat -C           # don't clear the screen
+  ./ustat -l java      # Java processes only
+  ./ustat 5            # 5 second summaries
+  ./ustat 5 10         # 5 second summaries, 10 times only 
diff --git a/tools/lib/uthreads.py b/tools/lib/uthreads.py
new file mode 100755
index 0000000..71e9c6a
--- /dev/null
+++ b/tools/lib/uthreads.py
@@ -0,0 +1,131 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# uthreads  Trace thread creation/destruction events in high-level languages.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: uthreads [-l {c,java,none}] [-v] pid
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 25-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT, utils
+import ctypes as ct
+import time
+import os
+
+languages = ["c", "java"]
+
+examples = """examples:
+    ./uthreads -l java 185    # trace Java threads in process 185
+    ./uthreads -l none 12245  # trace only pthreads in process 12245
+"""
+parser = argparse.ArgumentParser(
+    description="Trace thread creation/destruction events in " +
+                "high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-l", "--language", choices=languages + ["none"],
+    help="language to trace (none for pthreads only)")
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+usdt = USDT(pid=args.pid)
+
+program = """
+struct thread_event_t {
+    u64 runtime_id;
+    u64 native_id;
+    char type[8];
+    char name[80];
+};
+
+BPF_PERF_OUTPUT(threads);
+
+int trace_pthread(struct pt_regs *ctx) {
+    struct thread_event_t te = {};
+    u64 start_routine = 0;
+    char type[] = "pthread";
+    te.native_id = bpf_get_current_pid_tgid() & 0xFFFFFFFF;
+    bpf_usdt_readarg(2, ctx, &start_routine);
+    te.runtime_id = start_routine;  // This is really a function pointer
+    __builtin_memcpy(&te.type, type, sizeof(te.type));
+    threads.perf_submit(ctx, &te, sizeof(te));
+    return 0;
+}
+"""
+usdt.enable_probe_or_bail("pthread_start", "trace_pthread")
+
+language = args.language
+if not language:
+    language = utils.detect_language(languages, args.pid)
+
+if language == "c":
+    # Nothing to add
+    pass
+elif language == "java":
+    template = """
+int %s(struct pt_regs *ctx) {
+    char type[] = "%s";
+    struct thread_event_t te = {};
+    u64 nameptr = 0, id = 0, native_id = 0;
+    bpf_usdt_readarg(1, ctx, &nameptr);
+    bpf_usdt_readarg(3, ctx, &id);
+    bpf_usdt_readarg(4, ctx, &native_id);
+    bpf_probe_read(&te.name, sizeof(te.name), (void *)nameptr);
+    te.runtime_id = id;
+    te.native_id = native_id;
+    __builtin_memcpy(&te.type, type, sizeof(te.type));
+    threads.perf_submit(ctx, &te, sizeof(te));
+    return 0;
+}
+    """
+    program += template % ("trace_start", "start")
+    program += template % ("trace_stop", "stop")
+    usdt.enable_probe_or_bail("thread__start", "trace_start")
+    usdt.enable_probe_or_bail("thread__stop", "trace_stop")
+
+if args.ebpf or args.verbose:
+    if args.verbose:
+        print(usdt.get_text())
+    print(program)
+    if args.ebpf:
+        exit()
+
+bpf = BPF(text=program, usdt_contexts=[usdt])
+print("Tracing thread events in process %d (language: %s)... Ctrl-C to quit." %
+      (args.pid, language or "none"))
+print("%-8s %-16s %-8s %-30s" % ("TIME", "ID", "TYPE", "DESCRIPTION"))
+
+class ThreadEvent(ct.Structure):
+    _fields_ = [
+        ("runtime_id", ct.c_ulonglong),
+        ("native_id", ct.c_ulonglong),
+        ("type", ct.c_char * 8),
+        ("name", ct.c_char * 80),
+        ]
+
+start_ts = time.time()
+
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(ThreadEvent)).contents
+    name = event.name
+    if event.type == "pthread":
+        name = bpf.sym(event.runtime_id, args.pid, show_module=True)
+        tid = event.native_id
+    else:
+        tid = "R=%s/N=%s" % (event.runtime_id, event.native_id)
+    print("%-8.3f %-16s %-8s %-30s" % (
+        time.time() - start_ts, tid, event.type, name))
+
+bpf["threads"].open_perf_buffer(print_event)
+while 1:
+    bpf.perf_buffer_poll()
diff --git a/tools/lib/uthreads_example.txt b/tools/lib/uthreads_example.txt
new file mode 100644
index 0000000..9880926
--- /dev/null
+++ b/tools/lib/uthreads_example.txt
@@ -0,0 +1,58 @@
+Demonstrations of uthreads.
+
+
+uthreads traces thread creation events in Java or raw (C) pthreads, and prints
+details about the newly created thread. For Java threads, the thread name is
+printed; for pthreads, the thread's start function is printed, if there is
+symbol information to resolve it.
+
+For example, trace all Java thread creation events:
+
+# ./uthreads -l java 27420
+Tracing thread events in process 27420 (language: java)... Ctrl-C to quit.
+TIME     ID               TYPE     DESCRIPTION                   
+18.596   R=9/N=0          start    SIGINT handler                
+18.596   R=4/N=0          stop     Signal Dispatcher             
+^C
+
+The ID column in the preceding output shows the thread's runtime ID and native
+ID, when available. The accuracy of this information depends on the Java 
+runtime.
+
+
+Next, trace only pthread creation events in some native application:
+
+# ./uthreads 27450
+Tracing thread events in process 27450 (language: c)... Ctrl-C to quit.
+TIME     ID               TYPE     DESCRIPTION                   
+0.924    27462            pthread  primes_thread [primes]
+0.927    27463            pthread  primes_thread [primes]     
+0.928    27464            pthread  primes_thread [primes]        
+0.928    27465            pthread  primes_thread [primes]        
+^C
+
+The thread name ("primes_thread" in this example) is resolved from debuginfo.
+If symbol information is not present, the thread's start address is printed
+instead.
+
+
+USAGE message:
+
+# ./uthreads -h
+usage: uthreads.py [-h] [-l {c,java,none}] [-v] pid
+
+Trace thread creation/destruction events in high-level languages.
+
+positional arguments:
+  pid                   process id to attach to
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {c,java,none}, --language {c,java,none}
+                        language to trace (none for pthreads only)
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+
+examples:
+    ./uthreads -l java 185    # trace Java threads in process 185
+    ./uthreads -l none 12245  # trace only pthreads in process 12245
diff --git a/tools/llcstat.py b/tools/llcstat.py
new file mode 100755
index 0000000..ec2c1f8
--- /dev/null
+++ b/tools/llcstat.py
@@ -0,0 +1,119 @@
+#!/usr/bin/python
+#
+# llcstat.py Summarize cache references and cache misses by PID.
+#            Cache reference and cache miss are corresponding events defined in
+#            uapi/linux/perf_event.h; they vary across architectures.
+#            On x86-64, they mean LLC references and LLC misses.
+#
+#            For Linux, uses BCC, eBPF. Embedded C.
+#
+# SEE ALSO: perf top -e cache-misses -e cache-references -a -ns pid,cpu,comm
+#
+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
+#
+# Copyright (c) 2016 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 19-Oct-2016   Teng Qin   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, PerfType, PerfHWConfig
+import signal
+from time import sleep
+
+parser = argparse.ArgumentParser(
+    description="Summarize cache references and misses by PID",
+    formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument(
+    "-c", "--sample_period", type=int, default=100,
+    help="Sample one in this many number of cache reference / miss events")
+parser.add_argument(
+    "duration", nargs="?", default=10, help="Duration, in seconds, to run")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+# load BPF program
+bpf_text="""
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf_perf_event.h>
+
+struct key_t {
+    int cpu;
+    int pid;
+    char name[TASK_COMM_LEN];
+};
+
+BPF_HASH(ref_count, struct key_t);
+BPF_HASH(miss_count, struct key_t);
+
+static inline __attribute__((always_inline)) void get_key(struct key_t* key) {
+    key->cpu = bpf_get_smp_processor_id();
+    key->pid = bpf_get_current_pid_tgid();
+    bpf_get_current_comm(&(key->name), sizeof(key->name));
+}
+
+int on_cache_miss(struct bpf_perf_event_data *ctx) {
+    struct key_t key = {};
+    get_key(&key);
+
+    miss_count.increment(key, ctx->sample_period);
+
+    return 0;
+}
+
+int on_cache_ref(struct bpf_perf_event_data *ctx) {
+    struct key_t key = {};
+    get_key(&key);
+
+    ref_count.increment(key, ctx->sample_period);
+
+    return 0;
+}
+"""
+
+if args.ebpf:
+    print(bpf_text)
+    exit()
+
+b = BPF(text=bpf_text)
+try:
+    b.attach_perf_event(
+        ev_type=PerfType.HARDWARE, ev_config=PerfHWConfig.CACHE_MISSES,
+        fn_name="on_cache_miss", sample_period=args.sample_period)
+    b.attach_perf_event(
+        ev_type=PerfType.HARDWARE, ev_config=PerfHWConfig.CACHE_REFERENCES,
+        fn_name="on_cache_ref", sample_period=args.sample_period)
+except:
+    print("Failed to attach to a hardware event. Is this a virtual machine?")
+    exit()
+
+print("Running for {} seconds or hit Ctrl-C to end.".format(args.duration))
+
+try:
+    sleep(float(args.duration))
+except KeyboardInterrupt:
+    signal.signal(signal.SIGINT, lambda signal, frame: print())
+
+miss_count = {}
+for (k, v) in b.get_table('miss_count').items():
+    miss_count[(k.pid, k.cpu, k.name)] = v.value
+
+print('PID      NAME             CPU     REFERENCE         MISS    HIT%')
+tot_ref = 0
+tot_miss = 0
+for (k, v) in b.get_table('ref_count').items():
+    try:
+        miss = miss_count[(k.pid, k.cpu, k.name)]
+    except KeyError:
+        miss = 0
+    tot_ref += v.value
+    tot_miss += miss
+    # This happens on some PIDs due to missed counts caused by sampling
+    hit = (v.value - miss) if (v.value >= miss) else 0
+    print('{:<8d} {:<16s} {:<4d} {:>12d} {:>12d} {:>6.2f}%'.format(
+        k.pid, k.name.decode('utf-8', 'replace'), k.cpu, v.value, miss,
+        (float(hit) / float(v.value)) * 100.0))
+print('Total References: {} Total Misses: {} Hit Rate: {:.2f}%'.format(
+    tot_ref, tot_miss, (float(tot_ref - tot_miss) / float(tot_ref)) * 100.0))
diff --git a/tools/llcstat_example.txt b/tools/llcstat_example.txt
new file mode 100644
index 0000000..ef2aec1
--- /dev/null
+++ b/tools/llcstat_example.txt
@@ -0,0 +1,56 @@
+Demonstrations of llcstat.
+
+
+llcstat traces cache reference and cache miss events system-wide, and summarizes
+them by PID and CPU.
+
+These events, defined in uapi/linux/perf_event.h, have different meanings on
+different architectures. For x86-64, they mean misses and references to LLC.
+
+Example output:
+
+# ./llcstat.py 20 -c 5000
+Running for 20 seconds or hit Ctrl-C to end.
+PID      NAME             CPU     REFERENCE         MISS   HIT%
+0        swapper/15       15        3515000       640000  81.79%
+238      migration/38     38           5000            0 100.00%
+4512     ntpd             11           5000            0 100.00%
+150867   ipmitool         3           25000         5000  80.00%
+150895   lscpu            17         280000        25000  91.07%
+151807   ipmitool         15          15000         5000  66.67%
+150757   awk              2           15000         5000  66.67%
+151213   chef-client      5         1770000       240000  86.44%
+151822   scribe-dispatch  12          15000            0 100.00%
+123386   mysqld           5            5000            0 100.00%
+[...]
+Total References: 518920000 Total Misses: 90265000 Hit Rate: 82.61%
+
+This shows each PID's cache hit rate during the 20 seconds run period.
+
+A count of 5000 was used in this example, which means that one in every 5,000
+events will trigger an in-kernel counter to be incremented. This is reflected
+in the output, which is why it is always in multiples of 5,000.
+
+We don't instrument every single event since the overhead would be prohibitive,
+nor do we need to: this is a type of sampling profiler. Because of this, the
+processes that trigger the 5,000'th cache reference or miss can happen to
+some degree by chance. Overall it should make sense. But for low counts,
+you might find a case where -- by chance -- a process has been tallied with
+more misses than references, which would seem impossible.
+
+
+USAGE message:
+
+# ./llcstat.py --help
+usage: llcstat.py [-h] [-c SAMPLE_PERIOD] [duration]
+
+Summarize cache references and misses by PID
+
+positional arguments:
+  duration                Duration, in seconds, to run
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -c SAMPLE_PERIOD, --sample_period SAMPLE_PERIOD
+                        Sample one in this many number of cache reference
+                        and miss events
diff --git a/tools/mdflush.py b/tools/mdflush.py
new file mode 100755
index 0000000..70afc4d
--- /dev/null
+++ b/tools/mdflush.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# mdflush  Trace md flush events.
+#          For Linux, uses BCC, eBPF.
+#
+# Todo: add more details of the flush (latency, I/O count).
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Feb-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import strftime
+import ctypes as ct
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/genhd.h>
+#include <linux/bio.h>
+
+struct data_t {
+    u64 pid;
+    char comm[TASK_COMM_LEN];
+    char disk[DISK_NAME_LEN];
+};
+BPF_PERF_OUTPUT(events);
+
+int kprobe__md_flush_request(struct pt_regs *ctx, void *mddev, struct bio *bio)
+{
+    struct data_t data = {};
+    u32 pid = bpf_get_current_pid_tgid();
+    data.pid = pid;
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+/*
+ * The following deals with a kernel version change (in mainline 4.14, although
+ * it may be backported to earlier kernels) with how the disk name is accessed.
+ * We handle both pre- and post-change versions here. Please avoid kernel
+ * version tests like this as much as possible: they inflate the code, test,
+ * and maintenance burden.
+ */
+#ifdef bio_dev
+    struct gendisk *bi_disk = bio->bi_disk;
+#else
+    struct gendisk *bi_disk = bio->bi_bdev->bd_disk;
+#endif
+    bpf_probe_read(&data.disk, sizeof(data.disk), bi_disk->disk_name);
+    events.perf_submit(ctx, &data, sizeof(data));
+    return 0;
+}
+""")
+
+# event data
+TASK_COMM_LEN = 16  # linux/sched.h
+DISK_NAME_LEN = 32  # linux/genhd.h
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_ulonglong),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+        ("disk", ct.c_char * DISK_NAME_LEN)
+    ]
+
+# header
+print("Tracing md flush requests... Hit Ctrl-C to end.")
+print("%-8s %-6s %-16s %s" % ("TIME", "PID", "COMM", "DEVICE"))
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%-8s %-6d %-16s %s" % (strftime("%H:%M:%S"), event.pid,
+        event.comm.decode('utf-8', 'replace'),
+        event.disk.decode('utf-8', 'replace')))
+
+# read events
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/mdflush_example.txt b/tools/mdflush_example.txt
new file mode 100644
index 0000000..36d6d7e
--- /dev/null
+++ b/tools/mdflush_example.txt
@@ -0,0 +1,47 @@
+Demonstrations of mdflush, the Linux eBPF/bcc version.
+
+
+The mdflush tool traces flushes at the md driver level, and prints details
+including the time of the flush:
+
+# ./mdflush
+Tracing md flush requests... Hit Ctrl-C to end.
+TIME     PID    COMM             DEVICE
+03:13:49 16770  sync             md0
+03:14:08 16864  sync             md0
+03:14:49 496    kworker/1:0H     md0
+03:14:49 488    xfsaild/md0      md0
+03:14:54 488    xfsaild/md0      md0
+03:15:00 488    xfsaild/md0      md0
+03:15:02 85     kswapd0          md0
+03:15:02 488    xfsaild/md0      md0
+03:15:05 488    xfsaild/md0      md0
+03:15:08 488    xfsaild/md0      md0
+03:15:10 488    xfsaild/md0      md0
+03:15:11 488    xfsaild/md0      md0
+03:15:11 488    xfsaild/md0      md0
+03:15:11 488    xfsaild/md0      md0
+03:15:11 488    xfsaild/md0      md0
+03:15:11 488    xfsaild/md0      md0
+03:15:12 488    xfsaild/md0      md0
+03:15:13 488    xfsaild/md0      md0
+03:15:15 488    xfsaild/md0      md0
+03:15:19 496    kworker/1:0H     md0
+03:15:49 496    kworker/1:0H     md0
+03:15:55 18840  sync             md0
+03:16:49 496    kworker/1:0H     md0
+03:17:19 496    kworker/1:0H     md0
+03:20:19 496    kworker/1:0H     md0
+03:21:19 496    kworker/1:0H     md0
+03:21:49 496    kworker/1:0H     md0
+03:25:19 496    kworker/1:0H     md0
+[...]
+
+This can be useful for correlation with latency outliers or spikes in disk
+latency, as measured using another tool (eg, system monitoring). If spikes in
+disk latency often coincide with md flush events, then it would make flushing
+a target for tuning.
+
+Note that the flush events are likely to originate from higher in the I/O
+stack, such as from file systems. This traces md processing them, and the
+timestamp corresponds with when md began to issue the flush to disks.
diff --git a/tools/memleak.py b/tools/memleak.py
new file mode 100755
index 0000000..4021bf8
--- /dev/null
+++ b/tools/memleak.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python
+#
+# memleak   Trace and display outstanding allocations to detect
+#           memory leaks in user-mode processes and the kernel.
+#
+# USAGE: memleak [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND]
+#                [--combined-only] [-s SAMPLE_RATE] [-T TOP] [-z MIN_SIZE]
+#                [-Z MAX_SIZE] [-O OBJ]
+#                [interval] [count]
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+# Copyright (C) 2016 Sasha Goldshtein.
+
+from bcc import BPF
+from time import sleep
+from datetime import datetime
+import resource
+import argparse
+import subprocess
+import os
+import sys
+
+class Allocation(object):
+    def __init__(self, stack, size):
+        self.stack = stack
+        self.count = 1
+        self.size = size
+
+    def update(self, size):
+        self.count += 1
+        self.size += size
+
+def run_command_get_output(command):
+        p = subprocess.Popen(command.split(),
+                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        return iter(p.stdout.readline, b'')
+
+def run_command_get_pid(command):
+        p = subprocess.Popen(command.split())
+        return p.pid
+
+examples = """
+EXAMPLES:
+
+./memleak -p $(pidof allocs)
+        Trace allocations and display a summary of "leaked" (outstanding)
+        allocations every 5 seconds
+./memleak -p $(pidof allocs) -t
+        Trace allocations and display each individual allocator function call
+./memleak -ap $(pidof allocs) 10
+        Trace allocations and display allocated addresses, sizes, and stacks
+        every 10 seconds for outstanding allocations
+./memleak -c "./allocs"
+        Run the specified command and trace its allocations
+./memleak
+        Trace allocations in kernel mode and display a summary of outstanding
+        allocations every 5 seconds
+./memleak -o 60000
+        Trace allocations in kernel mode and display a summary of outstanding
+        allocations that are at least one minute (60 seconds) old
+./memleak -s 5
+        Trace roughly every 5th allocation, to reduce overhead
+"""
+
+description = """
+Trace outstanding memory allocations that weren't freed.
+Supports both user-mode allocations made with libc functions and kernel-mode
+allocations made with kmalloc/kmem_cache_alloc/get_free_pages and corresponding
+memory release functions.
+"""
+
+parser = argparse.ArgumentParser(description=description,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=examples)
+parser.add_argument("-p", "--pid", type=int, default=-1,
+        help="the PID to trace; if not specified, trace kernel allocs")
+parser.add_argument("-t", "--trace", action="store_true",
+        help="print trace messages for each alloc/free call")
+parser.add_argument("interval", nargs="?", default=5, type=int,
+        help="interval in seconds to print outstanding allocations")
+parser.add_argument("count", nargs="?", type=int,
+        help="number of times to print the report before exiting")
+parser.add_argument("-a", "--show-allocs", default=False, action="store_true",
+        help="show allocation addresses and sizes as well as call stacks")
+parser.add_argument("-o", "--older", default=500, type=int,
+        help="prune allocations younger than this age in milliseconds")
+parser.add_argument("-c", "--command",
+        help="execute and trace the specified command")
+parser.add_argument("--combined-only", default=False, action="store_true",
+        help="show combined allocation statistics only")
+parser.add_argument("-s", "--sample-rate", default=1, type=int,
+        help="sample every N-th allocation to decrease the overhead")
+parser.add_argument("-T", "--top", type=int, default=10,
+        help="display only this many top allocating stacks (by size)")
+parser.add_argument("-z", "--min-size", type=int,
+        help="capture only allocations larger than this size")
+parser.add_argument("-Z", "--max-size", type=int,
+        help="capture only allocations smaller than this size")
+parser.add_argument("-O", "--obj", type=str, default="c",
+        help="attach to allocator functions in the specified object")
+parser.add_argument("--ebpf", action="store_true",
+        help=argparse.SUPPRESS)
+
+args = parser.parse_args()
+
+pid = args.pid
+command = args.command
+kernel_trace = (pid == -1 and command is None)
+trace_all = args.trace
+interval = args.interval
+min_age_ns = 1e6 * args.older
+sample_every_n = args.sample_rate
+num_prints = args.count
+top_stacks = args.top
+min_size = args.min_size
+max_size = args.max_size
+obj = args.obj
+
+if min_size is not None and max_size is not None and min_size > max_size:
+        print("min_size (-z) can't be greater than max_size (-Z)")
+        exit(1)
+
+if command is not None:
+        print("Executing '%s' and tracing the resulting process." % command)
+        pid = run_command_get_pid(command)
+
+bpf_source = """
+#include <uapi/linux/ptrace.h>
+
+struct alloc_info_t {
+        u64 size;
+        u64 timestamp_ns;
+        int stack_id;
+};
+
+struct combined_alloc_info_t {
+        u64 total_size;
+        u64 number_of_allocs;
+};
+
+BPF_HASH(sizes, u64);
+BPF_TABLE("hash", u64, struct alloc_info_t, allocs, 1000000);
+BPF_HASH(memptrs, u64, u64);
+BPF_STACK_TRACE(stack_traces, 10240);
+BPF_TABLE("hash", u64, struct combined_alloc_info_t, combined_allocs, 10240);
+
+static inline void update_statistics_add(u64 stack_id, u64 sz) {
+        struct combined_alloc_info_t *existing_cinfo;
+        struct combined_alloc_info_t cinfo = {0};
+
+        existing_cinfo = combined_allocs.lookup(&stack_id);
+        if (existing_cinfo != 0)
+                cinfo = *existing_cinfo;
+
+        cinfo.total_size += sz;
+        cinfo.number_of_allocs += 1;
+
+        combined_allocs.update(&stack_id, &cinfo);
+}
+
+static inline void update_statistics_del(u64 stack_id, u64 sz) {
+        struct combined_alloc_info_t *existing_cinfo;
+        struct combined_alloc_info_t cinfo = {0};
+
+        existing_cinfo = combined_allocs.lookup(&stack_id);
+        if (existing_cinfo != 0)
+                cinfo = *existing_cinfo;
+
+        if (sz >= cinfo.total_size)
+                cinfo.total_size = 0;
+        else
+                cinfo.total_size -= sz;
+
+        if (cinfo.number_of_allocs > 0)
+                cinfo.number_of_allocs -= 1;
+
+        combined_allocs.update(&stack_id, &cinfo);
+}
+
+static inline int gen_alloc_enter(struct pt_regs *ctx, size_t size) {
+        SIZE_FILTER
+        if (SAMPLE_EVERY_N > 1) {
+                u64 ts = bpf_ktime_get_ns();
+                if (ts % SAMPLE_EVERY_N != 0)
+                        return 0;
+        }
+
+        u64 pid = bpf_get_current_pid_tgid();
+        u64 size64 = size;
+        sizes.update(&pid, &size64);
+
+        if (SHOULD_PRINT)
+                bpf_trace_printk("alloc entered, size = %u\\n", size);
+        return 0;
+}
+
+static inline int gen_alloc_exit2(struct pt_regs *ctx, u64 address) {
+        u64 pid = bpf_get_current_pid_tgid();
+        u64* size64 = sizes.lookup(&pid);
+        struct alloc_info_t info = {0};
+
+        if (size64 == 0)
+                return 0; // missed alloc entry
+
+        info.size = *size64;
+        sizes.delete(&pid);
+
+        info.timestamp_ns = bpf_ktime_get_ns();
+        info.stack_id = stack_traces.get_stackid(ctx, STACK_FLAGS);
+        allocs.update(&address, &info);
+        update_statistics_add(info.stack_id, info.size);
+
+        if (SHOULD_PRINT) {
+                bpf_trace_printk("alloc exited, size = %lu, result = %lx\\n",
+                                 info.size, address);
+        }
+        return 0;
+}
+
+static inline int gen_alloc_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
+}
+
+static inline int gen_free_enter(struct pt_regs *ctx, void *address) {
+        u64 addr = (u64)address;
+        struct alloc_info_t *info = allocs.lookup(&addr);
+        if (info == 0)
+                return 0;
+
+        allocs.delete(&addr);
+        update_statistics_del(info->stack_id, info->size);
+
+        if (SHOULD_PRINT) {
+                bpf_trace_printk("free entered, address = %lx, size = %lu\\n",
+                                 address, info->size);
+        }
+        return 0;
+}
+
+int malloc_enter(struct pt_regs *ctx, size_t size) {
+        return gen_alloc_enter(ctx, size);
+}
+
+int malloc_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit(ctx);
+}
+
+int free_enter(struct pt_regs *ctx, void *address) {
+        return gen_free_enter(ctx, address);
+}
+
+int calloc_enter(struct pt_regs *ctx, size_t nmemb, size_t size) {
+        return gen_alloc_enter(ctx, nmemb * size);
+}
+
+int calloc_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit(ctx);
+}
+
+int realloc_enter(struct pt_regs *ctx, void *ptr, size_t size) {
+        gen_free_enter(ctx, ptr);
+        return gen_alloc_enter(ctx, size);
+}
+
+int realloc_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit(ctx);
+}
+
+int posix_memalign_enter(struct pt_regs *ctx, void **memptr, size_t alignment,
+                         size_t size) {
+        u64 memptr64 = (u64)(size_t)memptr;
+        u64 pid = bpf_get_current_pid_tgid();
+
+        memptrs.update(&pid, &memptr64);
+        return gen_alloc_enter(ctx, size);
+}
+
+int posix_memalign_exit(struct pt_regs *ctx) {
+        u64 pid = bpf_get_current_pid_tgid();
+        u64 *memptr64 = memptrs.lookup(&pid);
+        void *addr;
+
+        if (memptr64 == 0)
+                return 0;
+
+        memptrs.delete(&pid);
+
+        if (bpf_probe_read(&addr, sizeof(void*), (void*)(size_t)*memptr64))
+                return 0;
+
+        u64 addr64 = (u64)(size_t)addr;
+        return gen_alloc_exit2(ctx, addr64);
+}
+
+int aligned_alloc_enter(struct pt_regs *ctx, size_t alignment, size_t size) {
+        return gen_alloc_enter(ctx, size);
+}
+
+int aligned_alloc_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit(ctx);
+}
+
+int valloc_enter(struct pt_regs *ctx, size_t size) {
+        return gen_alloc_enter(ctx, size);
+}
+
+int valloc_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit(ctx);
+}
+
+int memalign_enter(struct pt_regs *ctx, size_t alignment, size_t size) {
+        return gen_alloc_enter(ctx, size);
+}
+
+int memalign_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit(ctx);
+}
+
+int pvalloc_enter(struct pt_regs *ctx, size_t size) {
+        return gen_alloc_enter(ctx, size);
+}
+
+int pvalloc_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit(ctx);
+}
+"""
+
+bpf_source_kernel = """
+
+TRACEPOINT_PROBE(kmem, kmalloc) {
+        gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
+        return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kmalloc_node) {
+        gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
+        return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kfree) {
+        return gen_free_enter((struct pt_regs *)args, (void *)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kmem_cache_alloc) {
+        gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
+        return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kmem_cache_alloc_node) {
+        gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
+        return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kmem_cache_free) {
+        return gen_free_enter((struct pt_regs *)args, (void *)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, mm_page_alloc) {
+        gen_alloc_enter((struct pt_regs *)args, PAGE_SIZE << args->order);
+        return gen_alloc_exit2((struct pt_regs *)args, args->pfn);
+}
+
+TRACEPOINT_PROBE(kmem, mm_page_free) {
+        return gen_free_enter((struct pt_regs *)args, (void *)args->pfn);
+}
+"""
+
+if kernel_trace:
+        bpf_source += bpf_source_kernel
+
+bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0")
+bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n))
+bpf_source = bpf_source.replace("PAGE_SIZE", str(resource.getpagesize()))
+
+size_filter = ""
+if min_size is not None and max_size is not None:
+        size_filter = "if (size < %d || size > %d) return 0;" % \
+                      (min_size, max_size)
+elif min_size is not None:
+        size_filter = "if (size < %d) return 0;" % min_size
+elif max_size is not None:
+        size_filter = "if (size > %d) return 0;" % max_size
+bpf_source = bpf_source.replace("SIZE_FILTER", size_filter)
+
+stack_flags = "BPF_F_REUSE_STACKID"
+if not kernel_trace:
+        stack_flags += "|BPF_F_USER_STACK"
+bpf_source = bpf_source.replace("STACK_FLAGS", stack_flags)
+
+if args.ebpf:
+    print(bpf_source)
+    exit()
+
+bpf = BPF(text=bpf_source)
+
+if not kernel_trace:
+        print("Attaching to pid %d, Ctrl+C to quit." % pid)
+
+        def attach_probes(sym, fn_prefix=None, can_fail=False):
+                if fn_prefix is None:
+                        fn_prefix = sym
+
+                try:
+                        bpf.attach_uprobe(name=obj, sym=sym,
+                                          fn_name=fn_prefix + "_enter",
+                                          pid=pid)
+                        bpf.attach_uretprobe(name=obj, sym=sym,
+                                             fn_name=fn_prefix + "_exit",
+                                             pid=pid)
+                except Exception:
+                        if can_fail:
+                                return
+                        else:
+                                raise
+
+        attach_probes("malloc")
+        attach_probes("calloc")
+        attach_probes("realloc")
+        attach_probes("posix_memalign")
+        attach_probes("valloc")
+        attach_probes("memalign")
+        attach_probes("pvalloc")
+        attach_probes("aligned_alloc", can_fail=True)  # added in C11
+        bpf.attach_uprobe(name=obj, sym="free", fn_name="free_enter",
+                                  pid=pid)
+
+else:
+        print("Attaching to kernel allocators, Ctrl+C to quit.")
+
+        # No probe attaching here. Allocations are counted by attaching to
+        # tracepoints.
+        #
+        # Memory allocations in Linux kernel are not limited to malloc/free
+        # equivalents. It's also common to allocate a memory page or multiple
+        # pages. The page allocator has two interfaces: one works with page
+        # frame numbers (PFN), the other with page addresses. It's
+        # possible to allocate pages with one kind of functions, and free them
+        # with another. Kernel code can easily convert PFNs to addresses and
+        # back, but it's hard to do the same in eBPF kprobe without fragile
+        # hacks.
+        #
+        # Fortunately, Linux exposes tracepoints for memory allocations, which
+        # can be instrumented by eBPF programs. Tracepoint for page allocations
+        # gives access to PFNs for both allocator interfaces. So there is no
+        # need to guess which allocation corresponds to which free.
+
+def print_outstanding():
+        print("[%s] Top %d stacks with outstanding allocations:" %
+              (datetime.now().strftime("%H:%M:%S"), top_stacks))
+        alloc_info = {}
+        allocs = bpf["allocs"]
+        stack_traces = bpf["stack_traces"]
+        for address, info in sorted(allocs.items(), key=lambda a: a[1].size):
+                if BPF.monotonic_time() - min_age_ns < info.timestamp_ns:
+                        continue
+                if info.stack_id < 0:
+                        continue
+                if info.stack_id in alloc_info:
+                        alloc_info[info.stack_id].update(info.size)
+                else:
+                        stack = list(stack_traces.walk(info.stack_id))
+                        combined = []
+                        for addr in stack:
+                                combined.append(bpf.sym(addr, pid,
+                                        show_module=True, show_offset=True))
+                        alloc_info[info.stack_id] = Allocation(combined,
+                                                               info.size)
+                if args.show_allocs:
+                        print("\taddr = %x size = %s" %
+                              (address.value, info.size))
+        to_show = sorted(alloc_info.values(),
+                         key=lambda a: a.size)[-top_stacks:]
+        for alloc in to_show:
+                print("\t%d bytes in %d allocations from stack\n\t\t%s" %
+                      (alloc.size, alloc.count, b"\n\t\t".join(alloc.stack)))
+
+def print_outstanding_combined():
+        stack_traces = bpf["stack_traces"]
+        stacks = sorted(bpf["combined_allocs"].items(),
+                        key=lambda a: -a[1].total_size)
+        cnt = 1
+        entries = []
+        for stack_id, info in stacks:
+                try:
+                        trace = []
+                        for addr in stack_traces.walk(stack_id.value):
+                                sym = bpf.sym(addr, pid,
+                                                      show_module=True,
+                                                      show_offset=True)
+                                trace.append(sym)
+                        trace = "\n\t\t".join(trace)
+                except KeyError:
+                        trace = "stack information lost"
+
+                entry = ("\t%d bytes in %d allocations from stack\n\t\t%s" %
+                         (info.total_size, info.number_of_allocs, trace))
+                entries.append(entry)
+
+                cnt += 1
+                if cnt > top_stacks:
+                        break
+
+        print("[%s] Top %d stacks with outstanding allocations:" %
+              (datetime.now().strftime("%H:%M:%S"), top_stacks))
+
+        print('\n'.join(reversed(entries)))
+
+count_so_far = 0
+while True:
+        if trace_all:
+                print(bpf.trace_fields())
+        else:
+                try:
+                        sleep(interval)
+                except KeyboardInterrupt:
+                        exit()
+                if args.combined_only:
+                        print_outstanding_combined()
+                else:
+                        print_outstanding()
+                sys.stdout.flush()
+                count_so_far += 1
+                if num_prints is not None and count_so_far >= num_prints:
+                        exit()
diff --git a/tools/memleak_example.txt b/tools/memleak_example.txt
new file mode 100644
index 0000000..307a9fa
--- /dev/null
+++ b/tools/memleak_example.txt
@@ -0,0 +1,208 @@
+Demonstrations of memleak.
+
+
+memleak traces and matches memory allocation and deallocation requests, and
+collects call stacks for each allocation. memleak can then print a summary
+of which call stacks performed allocations that weren't subsequently freed.
+For example:
+
+# ./memleak -p $(pidof allocs)
+Attaching to pid 5193, Ctrl+C to quit.
+[11:16:33] Top 2 stacks with outstanding allocations:
+        80 bytes in 5 allocations from stack
+                 main+0x6d [allocs]
+                 __libc_start_main+0xf0 [libc-2.21.so]
+
+[11:16:34] Top 2 stacks with outstanding allocations:
+        160 bytes in 10 allocations from stack
+                 main+0x6d [allocs]
+                 __libc_start_main+0xf0 [libc-2.21.so]
+
+
+Each entry printed is a set of allocations that originate from the same call
+stack, and that weren't freed yet. The number of bytes and number of allocs
+are followed by the call stack, top to bottom, of the allocation site.
+
+As time goes on, it becomes apparent that the main function in the allocs
+process is leaking memory, 16 bytes at a time. Fortunately, you don't have to
+inspect each allocation individually -- you get a nice summary of which stack
+is responsible for a large leak.
+
+Occasionally, you do want the individual allocation details. Perhaps the same
+stack is allocating various sizes and you want to confirm which sizes are 
+prevalent. Use the -a switch:
+
+# ./memleak -p $(pidof allocs) -a
+Attaching to pid 5193, Ctrl+C to quit.
+[11:16:33] Top 2 stacks with outstanding allocations:
+        addr = 948cd0 size = 16
+        addr = 948d10 size = 16
+        addr = 948d30 size = 16
+        addr = 948cf0 size = 16
+        64 bytes in 4 allocations from stack
+                 main+0x6d [allocs]
+                 __libc_start_main+0xf0 [libc-2.21.so]
+
+[11:16:34] Top 2 stacks with outstanding allocations:
+        addr = 948d50 size = 16
+        addr = 948cd0 size = 16
+        addr = 948d10 size = 16
+        addr = 948d30 size = 16
+        addr = 948cf0 size = 16
+        addr = 948dd0 size = 16
+        addr = 948d90 size = 16
+        addr = 948db0 size = 16
+        addr = 948d70 size = 16
+        addr = 948df0 size = 16
+        160 bytes in 10 allocations from stack
+                 main+0x6d [allocs]
+                 __libc_start_main+0xf0 [libc-2.21.so]
+
+
+When using the -p switch, memleak traces the libc allocations of a particular
+process. Without this switch, kernel allocations are traced instead.
+For example:
+
+# ./memleak
+Attaching to kernel allocators, Ctrl+C to quit.
+...
+        248 bytes in 4 allocations from stack
+                 bpf_prog_load [kernel]
+                 sys_bpf [kernel]
+
+        328 bytes in 1 allocations from stack
+                 perf_mmap [kernel]
+                 mmap_region [kernel]
+                 do_mmap [kernel]
+                 vm_mmap_pgoff [kernel]
+                 sys_mmap_pgoff [kernel]
+                 sys_mmap [kernel]
+
+        464 bytes in 1 allocations from stack
+                 traceprobe_command [kernel]
+                 traceprobe_probes_write [kernel]
+                 probes_write [kernel]
+                 __vfs_write [kernel]
+                 vfs_write [kernel]
+                 sys_write [kernel]
+                 entry_SYSCALL_64_fastpath [kernel]
+
+        8192 bytes in 1 allocations from stack
+                 alloc_and_copy_ftrace_hash.constprop.59 [kernel]
+                 ftrace_set_hash [kernel]
+                 ftrace_set_filter_ip [kernel]
+                 arm_kprobe [kernel]
+                 enable_kprobe [kernel]
+                 kprobe_register [kernel]
+                 perf_trace_init [kernel]
+                 perf_tp_event_init [kernel]
+
+
+Here you can see that arming the kprobe to which our eBPF program is attached
+consumed 8KB of memory. Loading the BPF program also consumed a couple hundred
+bytes (in bpf_prog_load).
+
+memleak stores each allocated block along with its size, timestamp, and the
+stack that allocated it. When the block is deleted, this information is freed
+to reduce the memory overhead.
+
+To avoid false positives, allocations younger than a certain age (500ms by
+default) are not printed. To change this threshold, use the -o switch.
+
+By default, memleak prints its output every 5 seconds. To change this 
+interval, pass the interval as a positional parameter to memleak. You can 
+also control the number of times the output will be printed before exiting.
+For example:
+
+# ./memleak 1 10
+
+... will print the outstanding allocation statistics every second, for ten
+times, and then exit. 
+
+memleak may introduce considerable overhead if your application or kernel is
+allocating and freeing memory at a very high rate. In that case, you can 
+control the overhead by sampling every N-th allocation. For example, to sample
+roughly 10% of the allocations and print the outstanding allocations every 5
+seconds, 3 times before quitting:
+
+# ./memleak -p $(pidof allocs) -s 10 5 3
+Attaching to pid 2614, Ctrl+C to quit.
+[11:16:33] Top 2 stacks with outstanding allocations:
+        16 bytes in 1 allocations from stack
+                 main+0x6d [allocs]
+                 __libc_start_main+0xf0 [libc-2.21.so]
+
+[11:16:38] Top 2 stacks with outstanding allocations:
+        16 bytes in 1 allocations from stack
+                 main+0x6d [allocs]
+                 __libc_start_main+0xf0 [libc-2.21.so]
+
+[11:16:43] Top 2 stacks with outstanding allocations:
+        32 bytes in 2 allocations from stack
+                 main+0x6d [allocs]
+                 __libc_start_main+0xf0 [libc-2.21.so]
+
+Note that even though the application leaks 16 bytes of memory every second, 
+the report (printed every 5 seconds) doesn't "see" all the allocations because
+of the sampling rate applied. 
+
+
+USAGE message:
+
+# ./memleak -h
+usage: memleak.py [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND]
+                  [--combined-only] [-s SAMPLE_RATE] [-T TOP] [-z MIN_SIZE]
+                  [-Z MAX_SIZE] [-O OBJ]
+                  [interval] [count]
+
+Trace outstanding memory allocations that weren't freed.
+Supports both user-mode allocations made with libc functions and kernel-mode
+allocations made with kmalloc/kmem_cache_alloc/get_free_pages and corresponding
+memory release functions.
+
+positional arguments:
+  interval              interval in seconds to print outstanding allocations
+  count                 number of times to print the report before exiting
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     the PID to trace; if not specified, trace kernel
+                        allocs
+  -t, --trace           print trace messages for each alloc/free call
+  -a, --show-allocs     show allocation addresses and sizes as well as call
+                        stacks
+  -o OLDER, --older OLDER
+                        prune allocations younger than this age in
+                        milliseconds
+  -c COMMAND, --command COMMAND
+                        execute and trace the specified command
+  --combined-only       show combined allocation statistics only
+  -s SAMPLE_RATE, --sample-rate SAMPLE_RATE
+                        sample every N-th allocation to decrease the overhead
+  -T TOP, --top TOP     display only this many top allocating stacks (by size)
+  -z MIN_SIZE, --min-size MIN_SIZE
+                        capture only allocations larger than this size
+  -Z MAX_SIZE, --max-size MAX_SIZE
+                        capture only allocations smaller than this size
+  -O OBJ, --obj OBJ     attach to allocator functions in the specified object
+
+EXAMPLES:
+
+./memleak -p $(pidof allocs)
+        Trace allocations and display a summary of "leaked" (outstanding)
+        allocations every 5 seconds
+./memleak -p $(pidof allocs) -t
+        Trace allocations and display each individual allocator function call
+./memleak -ap $(pidof allocs) 10
+        Trace allocations and display allocated addresses, sizes, and stacks
+        every 10 seconds for outstanding allocations
+./memleak -c "./allocs"
+        Run the specified command and trace its allocations
+./memleak
+        Trace allocations in kernel mode and display a summary of outstanding
+        allocations every 5 seconds
+./memleak -o 60000
+        Trace allocations in kernel mode and display a summary of outstanding
+        allocations that are at least one minute (60 seconds) old
+./memleak -s 5
+        Trace roughly every 5th allocation, to reduce overhead
diff --git a/tools/mountsnoop.py b/tools/mountsnoop.py
new file mode 100755
index 0000000..e9b5865
--- /dev/null
+++ b/tools/mountsnoop.py
@@ -0,0 +1,421 @@
+#!/usr/bin/env python
+#
+# mountsnoop Trace mount() and umount syscalls.
+#            For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: mountsnoop [-h]
+#
+# Copyright (c) 2016 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Oct-2016   Omar Sandoval   Created this.
+
+from __future__ import print_function
+import argparse
+import bcc
+import ctypes
+import errno
+import functools
+import sys
+
+
+bpf_text = r"""
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#include <linux/nsproxy.h>
+#include <linux/ns_common.h>
+
+/*
+ * XXX: struct mnt_namespace is defined in fs/mount.h, which is private to the
+ * VFS and not installed in any kernel-devel packages. So, let's duplicate the
+ * important part of the definition. There are actually more members in the
+ * real struct, but we don't need them, and they're more likely to change.
+ */
+struct mnt_namespace {
+    atomic_t count;
+    struct ns_common ns;
+};
+
+/*
+ * XXX: this could really use first-class string support in BPF. target is a
+ * NUL-terminated path up to PATH_MAX in length. source and type are
+ * NUL-terminated strings up to PAGE_SIZE in length. data is a weird case: it's
+ * almost always a NUL-terminated string, but for some filesystems (e.g., older
+ * NFS variants), it's a binary structure with plenty of NUL bytes, so the
+ * kernel always copies up to PAGE_SIZE bytes, stopping when it hits a fault.
+ *
+ * The best we can do with the existing BPF helpers is to copy as much of each
+ * argument as we can. Our stack space is limited, and we need to leave some
+ * headroom for the rest of the function, so this should be a decent value.
+ */
+#define MAX_STR_LEN 412
+
+enum event_type {
+    EVENT_MOUNT,
+    EVENT_MOUNT_SOURCE,
+    EVENT_MOUNT_TARGET,
+    EVENT_MOUNT_TYPE,
+    EVENT_MOUNT_DATA,
+    EVENT_MOUNT_RET,
+    EVENT_UMOUNT,
+    EVENT_UMOUNT_TARGET,
+    EVENT_UMOUNT_RET,
+};
+
+struct data_t {
+    enum event_type type;
+    pid_t pid, tgid;
+    union {
+        /* EVENT_MOUNT, EVENT_UMOUNT */
+        struct {
+            /* current->nsproxy->mnt_ns->ns.inum */
+            unsigned int mnt_ns;
+            char comm[TASK_COMM_LEN];
+            unsigned long flags;
+        } enter;
+        /*
+         * EVENT_MOUNT_SOURCE, EVENT_MOUNT_TARGET, EVENT_MOUNT_TYPE,
+         * EVENT_MOUNT_DATA, EVENT_UMOUNT_TARGET
+         */
+        char str[MAX_STR_LEN];
+        /* EVENT_MOUNT_RET, EVENT_UMOUNT_RET */
+        int retval;
+    };
+};
+
+BPF_PERF_OUTPUT(events);
+
+int syscall__mount(struct pt_regs *ctx, char __user *source,
+                      char __user *target, char __user *type,
+                      unsigned long flags)
+{
+    /* sys_mount takes too many arguments */
+    char __user *data = (char __user *)PT_REGS_PARM5(ctx);
+    struct data_t event = {};
+    struct task_struct *task;
+    struct nsproxy *nsproxy;
+    struct mnt_namespace *mnt_ns;
+
+    event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    event.tgid = bpf_get_current_pid_tgid() >> 32;
+
+    event.type = EVENT_MOUNT;
+    bpf_get_current_comm(event.enter.comm, sizeof(event.enter.comm));
+    event.enter.flags = flags;
+    task = (struct task_struct *)bpf_get_current_task();
+    nsproxy = task->nsproxy;
+    mnt_ns = nsproxy->mnt_ns;
+    event.enter.mnt_ns = mnt_ns->ns.inum;
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_MOUNT_SOURCE;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), source);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_MOUNT_TARGET;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), target);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_MOUNT_TYPE;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), type);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_MOUNT_DATA;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), data);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    return 0;
+}
+
+int do_ret_sys_mount(struct pt_regs *ctx)
+{
+    struct data_t event = {};
+
+    event.type = EVENT_MOUNT_RET;
+    event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    event.tgid = bpf_get_current_pid_tgid() >> 32;
+    event.retval = PT_REGS_RC(ctx);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    return 0;
+}
+
+int syscall__umount(struct pt_regs *ctx, char __user *target, int flags)
+{
+    struct data_t event = {};
+    struct task_struct *task;
+    struct nsproxy *nsproxy;
+    struct mnt_namespace *mnt_ns;
+
+    event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    event.tgid = bpf_get_current_pid_tgid() >> 32;
+
+    event.type = EVENT_UMOUNT;
+    bpf_get_current_comm(event.enter.comm, sizeof(event.enter.comm));
+    event.enter.flags = flags;
+    task = (struct task_struct *)bpf_get_current_task();
+    nsproxy = task->nsproxy;
+    mnt_ns = nsproxy->mnt_ns;
+    event.enter.mnt_ns = mnt_ns->ns.inum;
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_UMOUNT_TARGET;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), target);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    return 0;
+}
+
+int do_ret_sys_umount(struct pt_regs *ctx)
+{
+    struct data_t event = {};
+
+    event.type = EVENT_UMOUNT_RET;
+    event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    event.tgid = bpf_get_current_pid_tgid() >> 32;
+    event.retval = PT_REGS_RC(ctx);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    return 0;
+}
+"""
+
+# sys/mount.h
+MS_MGC_VAL = 0xc0ed0000
+MS_MGC_MSK = 0xffff0000
+MOUNT_FLAGS = [
+    ('MS_RDONLY', 1),
+    ('MS_NOSUID', 2),
+    ('MS_NODEV', 4),
+    ('MS_NOEXEC', 8),
+    ('MS_SYNCHRONOUS', 16),
+    ('MS_REMOUNT', 32),
+    ('MS_MANDLOCK', 64),
+    ('MS_DIRSYNC', 128),
+    ('MS_NOATIME', 1024),
+    ('MS_NODIRATIME', 2048),
+    ('MS_BIND', 4096),
+    ('MS_MOVE', 8192),
+    ('MS_REC', 16384),
+    ('MS_SILENT', 32768),
+    ('MS_POSIXACL', 1 << 16),
+    ('MS_UNBINDABLE', 1 << 17),
+    ('MS_PRIVATE', 1 << 18),
+    ('MS_SLAVE', 1 << 19),
+    ('MS_SHARED', 1 << 20),
+    ('MS_RELATIME', 1 << 21),
+    ('MS_KERNMOUNT', 1 << 22),
+    ('MS_I_VERSION', 1 << 23),
+    ('MS_STRICTATIME', 1 << 24),
+    ('MS_LAZYTIME', 1 << 25),
+    ('MS_ACTIVE', 1 << 30),
+    ('MS_NOUSER', 1 << 31),
+]
+UMOUNT_FLAGS = [
+    ('MNT_FORCE', 1),
+    ('MNT_DETACH', 2),
+    ('MNT_EXPIRE', 4),
+    ('UMOUNT_NOFOLLOW', 8),
+]
+
+
+TASK_COMM_LEN = 16  # linux/sched.h
+MAX_STR_LEN = 412
+
+
+class EventType(object):
+    EVENT_MOUNT = 0
+    EVENT_MOUNT_SOURCE = 1
+    EVENT_MOUNT_TARGET = 2
+    EVENT_MOUNT_TYPE = 3
+    EVENT_MOUNT_DATA = 4
+    EVENT_MOUNT_RET = 5
+    EVENT_UMOUNT = 6
+    EVENT_UMOUNT_TARGET = 7
+    EVENT_UMOUNT_RET = 8
+
+
+class EnterData(ctypes.Structure):
+    _fields_ = [
+        ('mnt_ns', ctypes.c_uint),
+        ('comm', ctypes.c_char * TASK_COMM_LEN),
+        ('flags', ctypes.c_ulong),
+    ]
+
+
+class DataUnion(ctypes.Union):
+    _fields_ = [
+        ('enter', EnterData),
+        ('str', ctypes.c_char * MAX_STR_LEN),
+        ('retval', ctypes.c_int),
+    ]
+
+
+class Event(ctypes.Structure):
+    _fields_ = [
+        ('type', ctypes.c_uint),
+        ('pid', ctypes.c_uint),
+        ('tgid', ctypes.c_uint),
+        ('union', DataUnion),
+    ]
+
+
+def _decode_flags(flags, flag_list):
+    str_flags = []
+    for flag, bit in flag_list:
+        if flags & bit:
+            str_flags.append(flag)
+        flags &= ~bit
+    if flags or not str_flags:
+        str_flags.append('0x{:x}'.format(flags))
+    return str_flags
+
+
+def decode_flags(flags, flag_list):
+    return '|'.join(_decode_flags(flags, flag_list))
+
+
+def decode_mount_flags(flags):
+    str_flags = []
+    if flags & MS_MGC_MSK == MS_MGC_VAL:
+        flags &= ~MS_MGC_MSK
+        str_flags.append('MS_MGC_VAL')
+    str_flags.extend(_decode_flags(flags, MOUNT_FLAGS))
+    return '|'.join(str_flags)
+
+
+def decode_umount_flags(flags):
+    return decode_flags(flags, UMOUNT_FLAGS)
+
+
+def decode_errno(retval):
+    try:
+        return '-' + errno.errorcode[-retval]
+    except KeyError:
+        return str(retval)
+
+
+_escape_chars = {
+    ord('\a'): '\\a',
+    ord('\b'): '\\b',
+    ord('\t'): '\\t',
+    ord('\n'): '\\n',
+    ord('\v'): '\\v',
+    ord('\f'): '\\f',
+    ord('\r'): '\\r',
+    ord('"'): '\\"',
+    ord('\\'): '\\\\',
+}
+
+
+def escape_character(c):
+    try:
+        return _escape_chars[c]
+    except KeyError:
+        if 0x20 <= c <= 0x7e:
+            return chr(c)
+        else:
+            return '\\x{:02x}'.format(c)
+
+
+if sys.version_info.major < 3:
+    def decode_mount_string(s):
+        return '"{}"'.format(''.join(escape_character(ord(c)) for c in s))
+else:
+    def decode_mount_string(s):
+        return '"{}"'.format(''.join(escape_character(c) for c in s))
+
+
+def print_event(mounts, umounts, cpu, data, size):
+    event = ctypes.cast(data, ctypes.POINTER(Event)).contents
+
+    try:
+        if event.type == EventType.EVENT_MOUNT:
+            mounts[event.pid] = {
+                'pid': event.pid,
+                'tgid': event.tgid,
+                'mnt_ns': event.union.enter.mnt_ns,
+                'comm': event.union.enter.comm,
+                'flags': event.union.enter.flags,
+            }
+        elif event.type == EventType.EVENT_MOUNT_SOURCE:
+            mounts[event.pid]['source'] = event.union.str
+        elif event.type == EventType.EVENT_MOUNT_TARGET:
+            mounts[event.pid]['target'] = event.union.str
+        elif event.type == EventType.EVENT_MOUNT_TYPE:
+            mounts[event.pid]['type'] = event.union.str
+        elif event.type == EventType.EVENT_MOUNT_DATA:
+            # XXX: data is not always a NUL-terminated string
+            mounts[event.pid]['data'] = event.union.str
+        elif event.type == EventType.EVENT_UMOUNT:
+            umounts[event.pid] = {
+                'pid': event.pid,
+                'tgid': event.tgid,
+                'mnt_ns': event.union.enter.mnt_ns,
+                'comm': event.union.enter.comm,
+                'flags': event.union.enter.flags,
+            }
+        elif event.type == EventType.EVENT_UMOUNT_TARGET:
+            umounts[event.pid]['target'] = event.union.str
+        elif (event.type == EventType.EVENT_MOUNT_RET or
+              event.type == EventType.EVENT_UMOUNT_RET):
+            if event.type == EventType.EVENT_MOUNT_RET:
+                syscall = mounts.pop(event.pid)
+                call = ('mount({source}, {target}, {type}, {flags}, {data}) ' +
+                        '= {retval}').format(
+                    source=decode_mount_string(syscall['source']),
+                    target=decode_mount_string(syscall['target']),
+                    type=decode_mount_string(syscall['type']),
+                    flags=decode_mount_flags(syscall['flags']),
+                    data=decode_mount_string(syscall['data']),
+                    retval=decode_errno(event.union.retval))
+            else:
+                syscall = umounts.pop(event.pid)
+                call = 'umount({target}, {flags}) = {retval}'.format(
+                    target=decode_mount_string(syscall['target']),
+                    flags=decode_umount_flags(syscall['flags']),
+                    retval=decode_errno(event.union.retval))
+            print('{:16} {:<7} {:<7} {:<11} {}'.format(
+                syscall['comm'].decode('utf-8', 'replace'), syscall['tgid'],
+                syscall['pid'], syscall['mnt_ns'], call))
+    except KeyError:
+        # This might happen if we lost an event.
+        pass
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='trace mount() and umount() syscalls'
+    )
+    parser.add_argument("--ebpf", action="store_true",
+        help=argparse.SUPPRESS)
+    args = parser.parse_args()
+
+    mounts = {}
+    umounts = {}
+    if args.ebpf:
+        print(bpf_text)
+        exit()
+    b = bcc.BPF(text=bpf_text)
+    mount_fnname = b.get_syscall_fnname("mount")
+    b.attach_kprobe(event=mount_fnname, fn_name="syscall__mount")
+    b.attach_kretprobe(event=mount_fnname, fn_name="do_ret_sys_mount")
+    umount_fnname = b.get_syscall_fnname("umount")
+    b.attach_kprobe(event=umount_fnname, fn_name="syscall__umount")
+    b.attach_kretprobe(event=umount_fnname, fn_name="do_ret_sys_umount")
+    b['events'].open_perf_buffer(
+        functools.partial(print_event, mounts, umounts))
+    print('{:16} {:<7} {:<7} {:<11} {}'.format(
+        'COMM', 'PID', 'TID', 'MNT_NS', 'CALL'))
+    while True:
+        b.perf_buffer_poll()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/mountsnoop_example.txt b/tools/mountsnoop_example.txt
new file mode 100644
index 0000000..1c5144e
--- /dev/null
+++ b/tools/mountsnoop_example.txt
@@ -0,0 +1,28 @@
+Demonstrations of mountsnoop.
+
+mountsnoop traces the mount() and umount() syscalls system-wide. For example,
+running the following series of commands produces this output:
+
+# mount --bind /mnt /mnt
+# umount /mnt
+# unshare -m
+# mount --bind /mnt /mnt
+# umount /mnt
+
+# ./mountsnoop.py
+COMM             PID     TID     MNT_NS      CALL
+mount            710     710     4026531840  mount("/mnt", "/mnt", "", MS_MGC_VAL|MS_BIND, "") = 0
+umount           714     714     4026531840  umount("/mnt", 0x0) = 0
+unshare          717     717     4026532160  mount("none", "/", "", MS_REC|MS_PRIVATE, "") = 0
+mount            725     725     4026532160  mount("/mnt", "/mnt", "", MS_MGC_VAL|MS_BIND, "") = 0
+umount           728     728     4026532160  umount("/mnt", 0x0) = 0
+
+The output shows the calling command, its process ID and thread ID, the mount
+namespace the call was made in, and the call itself.
+
+The mount namespace number is an inode number that uniquely identifies the
+namespace in the running system. This can also be obtained from readlink
+/proc/$PID/ns/mnt.
+
+Note that because of restrictions in BPF, the string arguments to either
+syscall may be truncated.
diff --git a/tools/mysqld_qslower.py b/tools/mysqld_qslower.py
new file mode 100755
index 0000000..d760773
--- /dev/null
+++ b/tools/mysqld_qslower.py
@@ -0,0 +1,133 @@
+#!/usr/bin/python
+#
+# mysqld_qslower    MySQL server queries slower than a threshold.
+#                   For Linux, uses BCC, BPF. Embedded C.
+#
+# USAGE: mysqld_qslower PID [min_ms]
+#
+# By default, a threshold of 1.0 ms is used. Set this to 0 ms to trace all
+# queries (verbose).
+#
+# This uses USDT probes, and needs a MySQL server with -DENABLE_DTRACE=1.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 30-Jul-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF, USDT
+import sys
+import ctypes as ct
+
+# arguments
+def usage():
+    print("USAGE: mysqld_latency PID [min_ms]")
+    exit()
+if len(sys.argv) < 2:
+    usage()
+if sys.argv[1][0:1] == "-":
+    usage()
+pid = int(sys.argv[1])
+min_ns = 1 * 1000000
+min_ms_text = 1
+if len(sys.argv) == 3:
+    min_ns = float(sys.argv[2]) * 1000000
+    min_ms_text = sys.argv[2]
+debug = 0
+QUERY_MAX = 128
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+#define QUERY_MAX	""" + str(QUERY_MAX) + """
+
+struct start_t {
+    u64 ts;
+    char *query;
+};
+
+struct data_t {
+    u64 pid;
+    u64 ts;
+    u64 delta;
+    char query[QUERY_MAX];
+};
+
+BPF_HASH(start_tmp, u32, struct start_t);
+BPF_PERF_OUTPUT(events);
+
+int do_start(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct start_t start = {};
+    start.ts = bpf_ktime_get_ns();
+    bpf_usdt_readarg(1, ctx, &start.query);
+    start_tmp.update(&pid, &start);
+    return 0;
+};
+
+int do_done(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct start_t *sp;
+
+    sp = start_tmp.lookup(&pid);
+    if (sp == 0) {
+        // missed tracing start
+        return 0;
+    }
+
+    // check if query exceeded our threshold
+    u64 delta = bpf_ktime_get_ns() - sp->ts;
+    if (delta >= """ + str(min_ns) + """) {
+        // populate and emit data struct
+        struct data_t data = {.pid = pid, .ts = sp->ts, .delta = delta};
+        bpf_probe_read(&data.query, sizeof(data.query), (void *)sp->query);
+        events.perf_submit(ctx, &data, sizeof(data));
+    }
+
+    start_tmp.delete(&pid);
+
+    return 0;
+};
+
+"""
+
+# enable USDT probe from given PID
+u = USDT(pid=pid)
+u.enable_probe(probe="query__start", fn_name="do_start")
+u.enable_probe(probe="query__done", fn_name="do_done")
+if debug:
+    print(u.get_text())
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text, usdt_contexts=[u])
+
+# header
+print("Tracing MySQL server queries for PID %d slower than %s ms..." % (pid,
+    min_ms_text))
+print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_ulonglong),
+        ("ts", ct.c_ulonglong),
+        ("delta", ct.c_ulonglong),
+        ("query", ct.c_char * QUERY_MAX)
+    ]
+
+# process event
+start = 0
+def print_event(cpu, data, size):
+    global start
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    if start == 0:
+        start = event.ts
+    print("%-14.6f %-6d %8.3f %s" % (float(event.ts - start) / 1000000000,
+        event.pid, float(event.delta) / 1000000, event.query))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/mysqld_qslower_example.txt b/tools/mysqld_qslower_example.txt
new file mode 100644
index 0000000..73d52fa
--- /dev/null
+++ b/tools/mysqld_qslower_example.txt
@@ -0,0 +1,58 @@
+Demonstrations of mysqld_qslower, the Linux eBPF/bcc version.
+
+
+mysqld_qslower traces queries served by a MySQL server, and prints those that
+exceed a latency (query time) threshold. By default a threshold of 1 ms is
+used. For example:
+
+# ./mysqld_qslower.py `pgrep -n mysqld`
+Tracing MySQL server queries for PID 14371 slower than 1 ms...
+TIME(s)        PID          MS QUERY
+0.000000       18608   130.751 SELECT * FROM words WHERE word REGEXP '^bre.*n$'
+2.921535       18608   130.590 SELECT * FROM words WHERE word REGEXP '^alex.*$'
+4.603549       18608    24.164 SELECT COUNT(*) FROM words
+9.733847       18608   130.936 SELECT count(*) AS count FROM words WHERE word REGEXP '^bre.*n$'
+17.864776      18608   130.298 SELECT * FROM words WHERE word REGEXP '^bre.*n$' ORDER BY word
+
+This traced 5 queries, 4 of which took about 130 milliseconds.
+
+A pgrep command was used to specify the PID of mysqld.
+
+
+In this example, a lower threshold is used of 0.1 ms:
+
+# ./mysqld_qslower.py `pgrep -n mysqld` 0.1
+Tracing MySQL server queries for PID 14371 slower than 0.1 ms...
+TIME(s)        PID          MS QUERY
+0.000000       18608    24.201 SELECT COUNT(*) FROM words
+13.242390      18608   130.378 SELECT * FROM words WHERE word REGEXP '^bre.*n$'
+23.601751      18608   119.198 SELECT * FROM words WHERE word REGEXP '^zzzzzzzz$'
+
+It worked, but I'm not catching any faster queries in this example. Notice I
+added a query that searched for "zzzzzzzz": it returned an empty set, and ran
+11 ms faster.
+
+
+A 0 ms threshold can be specified to trace all queries:
+
+# ./mysqld_qslower.py `pgrep -n mysqld` 0
+Tracing MySQL server queries for PID 14371 slower than 0 ms...
+TIME(s)        PID          MS QUERY
+0.000000       18608     0.105 select @@version_comment limit 1
+2.049312       18608     0.099 SELECT DATABASE()
+2.050666       18608     0.274 show databases
+2.051040       18608     0.176 show tables
+5.730044       18608   130.365 SELECT count(*) AS count FROM words WHERE word REGEXP '^bre.*n$'
+9.273837       18608     0.096 select 1
+9.553742       18608     0.059 select 1
+9.986087       18608     0.080 select 1
+
+This includes an initialization of a mysql client command, and selecting the
+database. I also added some "select 1;" queries, which do no work and return
+quickly.
+
+
+USAGE:
+
+# ./mysqld_qslower.py -h
+USAGE: mysqld_latency PID [min_ms]
diff --git a/tools/nfsdist.py b/tools/nfsdist.py
new file mode 100755
index 0000000..ff78506
--- /dev/null
+++ b/tools/nfsdist.py
@@ -0,0 +1,173 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# nfsdist   Summarize NFS operation latency
+#           for Linux, uses BCC and eBPF
+#
+# USAGE: nfsdist [-h] [-T] [-m] [-p PID] [interval] [count]
+#
+# 4-Sep-2017    Samuel Nair     created this
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+    ./nfsdist            # show operation latency as a histogram
+    ./nfsdist -p 181     # trace PID 181 only
+    ./nfsdist 1 10       # print 1 second summaries, 10 times
+    ./nfsdist -m 5       # 5s summaries, milliseconds
+"""
+parser = argparse.ArgumentParser(
+        description="Summarize NFS operation latency",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=examples)
+parser.add_argument("-T", "--notimestamp", action="store_true",
+                    help="don't include timestamp on interval output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+                    help="output in milliseconds")
+parser.add_argument("-p", "--pid",
+                    help="trace this PID only")
+parser.add_argument("interval", nargs="?",
+                    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+                    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+                    help=argparse.SUPPRESS)
+args = parser.parse_args()
+pid = args.pid
+countdown = int(args.count)
+if args.milliseconds:
+    factor = 1000000
+    label = "msecs"
+else:
+    factor = 1000
+    label = "usecs"
+    if args.interval and int(args.interval) == 0:
+        print("ERROR: interval 0. Exiting.")
+        exit()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+#define OP_NAME_LEN 8
+typedef struct dist_key {
+    char op[OP_NAME_LEN];
+    u64 slot;
+} dist_key_t;
+
+BPF_HASH(start, u32);
+BPF_HISTOGRAM(dist, dist_key_t);
+
+// time operation
+int trace_entry(struct pt_regs *ctx)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    if (FILTER_PID)
+        return 0;
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+static int trace_return(struct pt_regs *ctx, const char *op)
+{
+    u64 *tsp;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed start or filtered
+    }
+    u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
+
+    // store as histogram
+    dist_key_t key = {.slot = bpf_log2l(delta)};
+    __builtin_memcpy(&key.op, op, sizeof(key.op));
+    dist.increment(key);
+
+    start.delete(&pid);
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    char *op = "read";
+    return trace_return(ctx, op);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    char *op = "write";
+    return trace_return(ctx, op);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    char *op = "open";
+    return trace_return(ctx, op);
+}
+
+int trace_getattr_return(struct pt_regs *ctx)
+{
+    char *op = "getattr";
+    return trace_return(ctx, op);
+}
+"""
+bpf_text = bpf_text.replace('FACTOR', str(factor))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+# common file functions
+b.attach_kprobe(event="nfs_file_read", fn_name="trace_entry")
+b.attach_kprobe(event="nfs_file_write", fn_name="trace_entry")
+b.attach_kprobe(event="nfs4_file_open", fn_name="trace_entry")
+b.attach_kprobe(event="nfs_file_open", fn_name="trace_entry")
+b.attach_kprobe(event="nfs_getattr", fn_name="trace_entry")
+
+b.attach_kretprobe(event="nfs_file_read", fn_name="trace_read_return")
+b.attach_kretprobe(event="nfs_file_write", fn_name="trace_write_return")
+b.attach_kretprobe(event="nfs4_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="nfs_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")
+
+print("Tracing NFS operation latency... Hit Ctrl-C to end.")
+
+# output
+exiting = 0
+dist = b.get_table("dist")
+while (1):
+    try:
+        if args.interval:
+            sleep(int(args.interval))
+        else:
+            sleep(99999999)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.interval and (not args.notimestamp):
+        print(strftime("%H:%M:%S:"))
+
+    dist.print_log2_hist(label, "operation", section_print_fn=bytes.decode)
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/nfsdist_example.txt b/tools/nfsdist_example.txt
new file mode 100644
index 0000000..b057569
--- /dev/null
+++ b/tools/nfsdist_example.txt
@@ -0,0 +1,160 @@
+Demonstrations of nfsdist, the Linux eBPF/bcc version.
+
+nfsdist traces NFS reads, writes, opens, and getattr, and summarizes their
+latency as a power-of-2 histogram. For example:
+
+
+./nfsdist.py
+
+Tracing NFS operation latency... Hit Ctrl-C to end.
+
+operation = read
+     usecs               : count     distribution
+         0 -> 1          : 4        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 7107     |**************                          |
+        16 -> 31         : 19864    |****************************************|
+        32 -> 63         : 1494     |***                                     |
+        64 -> 127        : 491      |                                        |
+       128 -> 255        : 1810     |***                                     |
+       256 -> 511        : 6356     |************                            |
+       512 -> 1023       : 4860     |*********                               |
+      1024 -> 2047       : 3070     |******                                  |
+      2048 -> 4095       : 1853     |***                                     |
+      4096 -> 8191       : 921      |*                                       |
+      8192 -> 16383      : 122      |                                        |
+     16384 -> 32767      : 15       |                                        |
+     32768 -> 65535      : 5        |                                        |
+     65536 -> 131071     : 2        |                                        |
+    131072 -> 262143     : 1        |                                        |
+
+operation = write
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 9        |                                        |
+        64 -> 127        : 19491    |****************************************|
+       128 -> 255        : 3064     |******                                  |
+       256 -> 511        : 940      |*                                       |
+       512 -> 1023       : 365      |                                        |
+      1024 -> 2047       : 312      |                                        |
+      2048 -> 4095       : 119      |                                        |
+      4096 -> 8191       : 31       |                                        |
+      8192 -> 16383      : 84       |                                        |
+     16384 -> 32767      : 31       |                                        |
+     32768 -> 65535      : 5        |                                        |
+     65536 -> 131071     : 3        |                                        |
+    131072 -> 262143     : 0        |                                        |
+    262144 -> 524287     : 1        |                                        |
+
+operation = getattr
+     usecs               : count     distribution
+         0 -> 1          : 27       |****************************************|
+         2 -> 3          : 2        |**                                      |
+         4 -> 7          : 3        |****                                    |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 2        |**                                      |
+       512 -> 1023       : 2        |**                                      |
+
+operation = open
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 2        |****************************************|
+
+
+In this example you can see that the read traffic is rather bi-modal, with about
+26K reads falling within 8 - 30 usecs and about 18K reads spread between 128 -
+8191 usecs. Write traffic is largely clustered in the 64 - 127 usecs bracket.
+The faster read traffic is probably coming from a filesystem cache and the slower
+traffic from disk. The reason why the writes are so consistently fast is because
+this example test was run on a couple of VMs and I believe the hypervisor was
+caching all the write traffic to memory.
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+RPC latency, network latency, file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from a NFS share and can better expose problems
+experienced by NFS clients.
+
+Note that this only traces the common NFS operations (read, write, open and
+getattr). I chose to include getattr as a significant percentage of NFS
+traffic end up being getattr calls and are a good indicator of problems
+with an NFS server.
+
+An optional interval and a count can be provided, as well as -m to show the
+distributions in milliseconds. For example:
+
+./nfsdist -m 1 5
+Tracing NFS operation latency... Hit Ctrl-C to end.
+
+11:02:39:
+
+operation = write
+     msecs               : count     distribution
+         0 -> 1          : 1        |                                        |
+         2 -> 3          : 24       |********                                |
+         4 -> 7          : 114      |****************************************|
+         8 -> 15         : 9        |***                                     |
+        16 -> 31         : 1        |                                        |
+        32 -> 63         : 1        |                                        |
+
+11:02:40:
+
+operation = write
+     msecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 11       |***                                     |
+         4 -> 7          : 111      |****************************************|
+         8 -> 15         : 13       |****                                    |
+        16 -> 31         : 1        |                                        |
+
+11:02:41:
+
+operation = write
+     msecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 21       |******                                  |
+         4 -> 7          : 137      |****************************************|
+         8 -> 15         : 3        |                                        |
+
+This shows a write workload, with writes hovering primarily in the 4-7ms range.
+
+USAGE message:
+
+
+./nfsdist -h
+usage: nfsdist.py [-h] [-T] [-m] [-p PID] [interval] [count]
+
+Summarize NFS operation latency
+
+positional arguments:
+  interval            output interval, in seconds
+  count               number of outputs
+
+optional arguments:
+  -h, --help          show this help message and exit
+  -T, --notimestamp   don't include timestamp on interval output
+  -m, --milliseconds  output in milliseconds
+  -p PID, --pid PID   trace this PID only
+
+examples:
+    ./nfsdist            # show operation latency as a histogram
+    ./nfsdist -p 181     # trace PID 181 only
+    ./nfsdist 1 10       # print 1 second summaries, 10 times
+    ./nfsdist -m 5       # 5s summaries, milliseconds
diff --git a/tools/nfsslower.py b/tools/nfsslower.py
new file mode 100755
index 0000000..2f92c90
--- /dev/null
+++ b/tools/nfsslower.py
@@ -0,0 +1,328 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# nfsslower     Trace slow NFS operations
+#               for Linux using BCC & eBPF
+#
+# Usage: nfsslower [-h] [-p PID] [min_ms]
+#
+# This script traces some common NFS operations: read, write, opens and
+# getattr. It measures the time spent in these operations, and prints details
+# for each that exceeded a threshold.
+#
+# WARNING: This adds low-overhead instrumentation to these NFS operations,
+# including reads and writes from the file system cache. Such reads and writes
+# can be very frequent (depending on the workload; eg, 1M/sec), at which
+# point the overhead of this tool (even if it prints no "slower" events) can
+# begin to become significant.
+#
+# Most of this code is copied from similar tools (ext4slower, zfsslower etc)
+#
+# By default, a minimum millisecond threshold of 10 is used.
+#
+# This tool uses kprobes to instrument the kernel for entry and exit
+# information; in the future, the preferred way would be to use tracepoints.
+# Currently there aren't any tracepoints available for nfs_read_file,
+# nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit
+# tracepoints but we chose to use kprobes for consistency
+#
+# 31-Aug-2017   Samuel Nair created this. Should work with NFSv{3,4}
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+import ctypes as ct
+
+examples = """
+    ./nfsslower         # trace operations slower than 10ms
+    ./nfsslower 1       # trace operations slower than 1ms
+    ./nfsslower -j 1    # ... 1 ms, parsable output (csv)
+    ./nfsslower 0       # trace all nfs operations
+    ./nfsslower -p 121  # trace pid 121 only
+"""
+parser = argparse.ArgumentParser(
+    description="""Trace READ, WRITE, OPEN \
+and GETATTR NFS calls slower than a threshold,\
+supports NFSv{3,4}""",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+
+parser.add_argument("-j", "--csv", action="store_true",
+                    help="just print fields: comma-separated values")
+parser.add_argument("-p", "--pid", help="Trace this pid only")
+parser.add_argument("min_ms", nargs="?", default='10',
+                    help="Minimum IO duration to trace in ms (default=10ms)")
+parser.add_argument("--ebpf", action="store_true",
+                    help=argparse.SUPPRESS)
+args = parser.parse_args()
+min_ms = int(args.min_ms)
+pid = args.pid
+csv = args.csv
+debug = 0
+
+bpf_text = """
+
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/dcache.h>
+
+#define TRACE_READ 0
+#define TRACE_WRITE 1
+#define TRACE_OPEN 2
+#define TRACE_GETATTR 3
+
+struct val_t {
+    u64 ts;
+    u64 offset;
+    struct file *fp;
+    struct dentry *d;
+};
+
+struct data_t {
+    // XXX: switch some to u32's when supported
+    u64 ts_us;
+    u64 type;
+    u64 size;
+    u64 offset;
+    u64 delta_us;
+    u64 pid;
+    char task[TASK_COMM_LEN];
+    char file[DNAME_INLINE_LEN];
+};
+
+BPF_HASH(entryinfo, u64, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb,
+                                struct iov_iter *data)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if(FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = iocb->ki_filp;
+    val.d = NULL;
+    val.offset = iocb->ki_pos;
+
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+int trace_file_open_entry (struct pt_regs *ctx, struct inode *inode,
+                                struct file *filp)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if(FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = filp;
+    val.d = NULL;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+int trace_getattr_entry(struct pt_regs *ctx, struct vfsmount *mnt,
+                        struct dentry *dentry, struct kstat *stat)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if(FILTER_PID)
+        return 0;
+
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = NULL;
+    val.d = dentry;
+    val.offset = 0;
+    if (val.d)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+static int trace_exit(struct pt_regs *ctx, int type)
+{
+    struct val_t *valp;
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    valp = entryinfo.lookup(&id);
+    if (valp == 0) {
+        // missed tracing issue or filtered
+        return 0;
+    }
+
+    // calculate delta
+    u64 ts = bpf_ktime_get_ns();
+    u64 delta_us = (ts - valp->ts) / 1000;
+    entryinfo.delete(&id);
+
+    if (FILTER_US)
+        return 0;
+
+    // populate output struct
+    u32 size = PT_REGS_RC(ctx);
+    struct data_t data = {.type = type, .size = size, .delta_us = delta_us,
+        .pid = pid};
+    data.ts_us = ts / 1000;
+    data.offset = valp->offset;
+    bpf_get_current_comm(&data.task, sizeof(data.task));
+
+    // workaround (rewriter should handle file to d_name in one step):
+    struct dentry *de = NULL;
+    struct qstr qs = {};
+    if(type == TRACE_GETATTR)
+    {
+        bpf_probe_read(&de,sizeof(de), &valp->d);
+    }
+    else
+    {
+        bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry);
+    }
+
+    bpf_probe_read(&qs, sizeof(qs), (void *)&de->d_name);
+    if (qs.len == 0)
+        return 0;
+
+    bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name);
+    // output
+    events.perf_submit(ctx, &data, sizeof(data));
+    return 0;
+}
+
+int trace_file_open_return(struct pt_regs *ctx)
+{
+    return trace_exit(ctx, TRACE_OPEN);
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    return trace_exit(ctx, TRACE_READ);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    return trace_exit(ctx, TRACE_WRITE);
+}
+
+int trace_getattr_return(struct pt_regs *ctx)
+{
+    return trace_exit(ctx, TRACE_GETATTR);
+}
+
+"""
+if min_ms == 0:
+    bpf_text = bpf_text.replace('FILTER_US', '0')
+else:
+    bpf_text = bpf_text.replace('FILTER_US',
+                                'delta_us <= %s' % str(min_ms * 1000))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# kernel->user event data: struct data_t
+DNAME_INLINE_LEN = 32   # linux/dcache.h
+TASK_COMM_LEN = 16      # linux/sched.h
+
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("type", ct.c_ulonglong),
+        ("size", ct.c_ulonglong),
+        ("offset", ct.c_ulonglong),
+        ("delta_us", ct.c_ulonglong),
+        ("pid", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN),
+        ("file", ct.c_char * DNAME_INLINE_LEN)
+    ]
+
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    type = 'R'
+    if event.type == 1:
+        type = 'W'
+    elif event.type == 2:
+        type = 'O'
+    elif event.type == 3:
+        type = 'G'
+
+    if(csv):
+        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
+            event.ts_us, event.task, event.pid, type, event.size,
+            event.offset, event.delta_us, event.file))
+        return
+    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" %
+          (strftime("%H:%M:%S"),
+           event.task.decode('utf-8', 'replace'),
+           event.pid,
+           type,
+           event.size,
+           event.offset / 1024,
+           float(event.delta_us) / 1000,
+           event.file.decode('utf-8', 'replace')))
+
+
+# Currently specifically works for NFSv4, the other kprobes are generic
+# so it should work with earlier NFS versions
+
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry")
+b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry")
+b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry")
+b.attach_kprobe(event="nfs_file_open", fn_name="trace_file_open_entry")
+b.attach_kprobe(event="nfs_getattr", fn_name="trace_getattr_entry")
+
+b.attach_kretprobe(event="nfs_file_read", fn_name="trace_read_return")
+b.attach_kretprobe(event="nfs_file_write", fn_name="trace_write_return")
+b.attach_kretprobe(event="nfs4_file_open", fn_name="trace_file_open_return")
+b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return")
+b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")
+
+if(csv):
+    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
+else:
+    if min_ms == 0:
+        print("Tracing NFS operations... Ctrl-C to quit")
+    else:
+        print("""Tracing NFS operations that are slower than \
+%d ms... Ctrl-C to quit"""
+              % min_ms)
+    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME",
+                                                    "COMM",
+                                                    "PID",
+                                                    "T",
+                                                    "BYTES",
+                                                    "OFF_KB",
+                                                    "LAT(ms)",
+                                                    "FILENAME"))
+
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+        b.perf_buffer_poll()
diff --git a/tools/nfsslower_example.txt b/tools/nfsslower_example.txt
new file mode 100644
index 0000000..823b64a
--- /dev/null
+++ b/tools/nfsslower_example.txt
@@ -0,0 +1,158 @@
+Demonstrations of nfsslower, the Linux eBPF/bcc version.
+
+nfsslower shows NFS reads, writes, opens and getattrs, slower than a
+threshold. For example:
+
+./nfsslower.py
+Tracing NFS operations that are slower than 10 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+11:25:16 dd             21295  W 1048576 15360      14.84 1.test
+11:25:16 dd             21295  W 1048576 16384      12.73 1.test
+11:25:16 dd             21295  W 1048576 17408      24.27 1.test
+11:25:16 dd             21295  W 1048576 18432      22.93 1.test
+11:25:16 dd             21295  W 1048576 19456      14.65 1.test
+11:25:16 dd             21295  W 1048576 20480      12.58 1.test
+11:25:16 dd             21297  W 1048576 6144       10.50 1.test.w
+11:25:16 dd             21297  W 1048576 7168       16.65 1.test.w
+11:25:16 dd             21297  W 1048576 8192       13.01 1.test.w
+11:25:16 dd             21297  W 1048576 9216       14.06 1.test.w
+
+This shows NFS writes from dd each 1MB in size to 2 different files. The
+writes all had latency higher than 10ms.
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+RPC latency, network latency, file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from a NFS share and can better expose problems
+experienced by NFS clients.
+
+Note that this only traces the common NFS operations (read, write, open and
+getattr). I chose to include getattr as a significant percentage of NFS
+traffic end up being getattr calls and are a good indicator of problems
+with an NFS server.
+
+The threshold can be provided as an argument. E.g. I/O slower than 1 ms:
+
+./nfsslower.py 1
+Tracing NFS operations that are slower than 1 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+11:40:16 cp             21583  R 131072  0           4.35 1.test
+11:40:16 cp             21583  R 131072  256         1.87 1.test
+11:40:16 cp             21583  R 131072  384         2.99 1.test
+11:40:16 cp             21583  R 131072  512         4.19 1.test
+11:40:16 cp             21583  R 131072  640         4.25 1.test
+11:40:16 cp             21583  R 131072  768         4.65 1.test
+11:40:16 cp             21583  R 131072  1280        1.08 1.test
+11:40:16 cp             21583  R 131072  1408        3.29 1.test
+11:40:16 cp             21583  R 131072  1792        3.12 1.test
+11:40:16 cp             21583  R 131072  3712        3.55 1.test
+11:40:16 cp             21583  R 131072  3840        1.12 1.test
+11:40:16 cp             21583  R 131072  4096        3.23 1.test
+11:40:16 cp             21583  R 131072  4224        2.73 1.test
+11:40:16 cp             21583  R 131072  4352        2.73 1.test
+11:40:16 cp             21583  R 131072  4480        6.09 1.test
+11:40:16 cp             21583  R 131072  5120        4.40 1.test
+[...]
+
+This shows all NFS_READS that were more than 1ms. Depending on your
+latency to your fileserver, you might need to tweak this value to
+remove the noise from faster, cached operations.
+
+A threshold of 0 will trace all operations. Warning: the output will be
+verbose, as it will include all file system cache hits.
+
+./nfsslower.py 0
+Tracing NFS operations
+11:56:50 dd             21852  W 1048576 0           0.42 1.test
+11:56:50 dd             21852  W 1048576 1024        0.46 1.test
+11:56:50 dd             21852  W 1048576 2048        0.36 1.test
+11:56:50 cp             21854  G 0       0           0.35 1.test
+11:56:50 cp             21854  O 0       0           0.33 1.test
+11:56:50 cp             21854  G 0       0           0.00 1.test
+11:56:50 cp             21854  R 131072  0           0.07 1.test
+11:56:50 cp             21854  R 131072  128         0.02 1.test
+11:56:50 cp             21854  R 131072  256         0.02 1.test
+11:56:50 cp             21854  R 131072  384         0.02 1.test
+11:56:50 cp             21854  R 131072  512         0.02 1.test
+11:56:50 cp             21854  R 131072  640         0.02 1.test
+11:56:50 cp             21854  R 131072  768         0.02 1.test
+11:56:50 cp             21854  R 131072  896         0.02 1.test
+11:56:50 cp             21854  R 131072  1024        0.02 1.test
+11:56:50 cp             21854  R 131072  1152        0.02 1.test
+11:56:50 cp             21854  R 131072  1280        0.02 1.test
+11:56:50 cp             21854  R 131072  1408        0.02 1.test
+11:56:50 cp             21854  R 131072  1536        0.02 1.test
+11:56:50 cp             21854  R 131072  1664        0.02 1.test
+11:56:50 cp             21854  R 131072  1792        0.02 1.test
+11:56:50 cp             21854  R 131072  1920        0.02 1.test
+11:56:50 cp             21854  R 131072  2048        0.02 1.test
+11:56:50 cp             21854  R 131072  2176        0.04 1.test
+11:56:50 cp             21854  R 131072  2304        0.02 1.test
+11:56:50 cp             21854  R 131072  2432        0.03 1.test
+11:56:50 cp             21854  R 131072  2560        0.03 1.test
+11:56:50 cp             21854  R 131072  2688        0.02 1.test
+11:56:50 cp             21854  R 131072  2816        0.03 1.test
+11:56:50 cp             21854  R 131072  2944        0.02 1.test
+11:56:50 cp             21854  R 0       3072        0.00 1.test
+11:56:50 ls             21855  G 0       0           0.00 1.test
+11:56:50 ls             21856  G 0       0           0.36 music
+11:56:50 ls             21856  G 0       0           0.00 music
+11:56:50 ls             21856  G 0       0           0.00 test
+11:56:50 ls             21856  G 0       0           0.00 ff
+11:56:50 ls             21856  G 0       0           0.00 34.log
+11:56:50 ls             21856  G 0       0           0.00 vmlinuz-linux
+11:56:50 ls             21856  G 0       0           0.00 2.test
+11:56:50 ls             21856  G 0       0           0.00 rt.log
+11:56:50 ls             21856  G 0       0           0.00 1.lod
+11:56:50 ls             21856  G 0       0           0.00 COPYRIGHT.txt
+11:56:50 ls             21856  G 0       0           0.00 gg
+11:56:50 ls             21856  G 0       0           0.00 qw.log
+11:56:50 ls             21856  G 0       0           0.00 README.md
+11:56:50 ls             21856  G 0       0           0.00 1.log
+
+The output now includes open operations ("O"), reads ("R") and getattrs ("G").
+A cp operation can be seen issuing getattr, open, and read calls.
+
+
+A -j option will print just the fields (parsable output, csv):
+
+./nfsslower.py -j 0
+ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE
+87054476520,dd,22754,W,1048576,0,425,1.test
+87054482916,dd,22754,W,1048576,1048576,320,1.test
+87054488179,dd,22754,W,1048576,2097152,389,1.test
+87054511340,cp,22756,G,0,0,371,1.test
+87054511685,cp,22756,O,0,0,306,1.test
+87054511700,cp,22756,G,0,0,2,1.test
+87054512325,cp,22756,R,131072,0,56,1.test
+87054512432,cp,22756,R,131072,131072,22,1.test
+87054512520,cp,22756,R,131072,262144,32,1.test
+87054512600,cp,22756,R,131072,393216,21,1.test
+87054512678,cp,22756,R,131072,524288,21,1.test
+87054512803,cp,22756,R,131072,655360,56,1.test
+
+This may be useful for visualizing with another tool, for example, for
+producing a scatter plot of ENDTIME vs LATENCY, to look for time-based
+patterns.
+
+USAGE message:
+
+usage: nfsslower.py [-h] [-j] [-p PID] [min_ms]
+
+Trace READ, WRITE, OPEN and GETATTR NFS calls slower than a threshold,supports NFSv{3,4}
+
+positional arguments:
+  min_ms             Minimum IO duration to trace in ms (default=10ms)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -j, --csv          just print fields: comma-separated values
+  -p PID, --pid PID  Trace this pid only
+
+ ./nfsslower         # trace operations slower than 10ms
+ ./nfsslower 1       # trace operations slower than 1ms
+ ./nfsslower -j 1    # ... 1 ms, parsable output (csv)
+ ./nfsslower 0       # trace all nfs operations
+ ./nfsslower -p 121  # trace pid 121 only
+
diff --git a/tools/nodegc.sh b/tools/nodegc.sh
new file mode 100755
index 0000000..5453c2a
--- /dev/null
+++ b/tools/nodegc.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ugc.py -l node "$@"
diff --git a/tools/nodegc_example.txt b/tools/nodegc_example.txt
new file mode 120000
index 0000000..303ccbd
--- /dev/null
+++ b/tools/nodegc_example.txt
@@ -0,0 +1 @@
+lib/ugc_example.txt
\ No newline at end of file
diff --git a/tools/nodestat.sh b/tools/nodestat.sh
new file mode 100755
index 0000000..8a468f5
--- /dev/null
+++ b/tools/nodestat.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ustat.py -l node "$@"
diff --git a/tools/nodestat_example.txt b/tools/nodestat_example.txt
new file mode 120000
index 0000000..544e5ad
--- /dev/null
+++ b/tools/nodestat_example.txt
@@ -0,0 +1 @@
+lib/ustat_example.txt
\ No newline at end of file
diff --git a/tools/offcputime.py b/tools/offcputime.py
new file mode 100755
index 0000000..d84ae52
--- /dev/null
+++ b/tools/offcputime.py
@@ -0,0 +1,323 @@
+#!/usr/bin/python
+#
+# offcputime    Summarize off-CPU time by stack trace
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: offcputime [-h] [-p PID | -u | -k] [-U | -K] [-f] [duration]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Jan-2016	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from sys import stderr
+from time import sleep, strftime
+import argparse
+import errno
+import signal
+
+# arg validation
+def positive_int(val):
+    try:
+        ival = int(val)
+    except ValueError:
+        raise argparse.ArgumentTypeError("must be an integer")
+
+    if ival < 0:
+        raise argparse.ArgumentTypeError("must be positive")
+    return ival
+
+def positive_nonzero_int(val):
+    ival = positive_int(val)
+    if ival == 0:
+        raise argparse.ArgumentTypeError("must be nonzero")
+    return ival
+
+def stack_id_err(stack_id):
+    # -EFAULT in get_stackid normally means the stack-trace is not available,
+    # such as getting a kernel stack trace in userspace code
+    return (stack_id < 0) and (stack_id != -errno.EFAULT)
+
+# arguments
+examples = """examples:
+    ./offcputime             # trace off-CPU stack time until Ctrl-C
+    ./offcputime 5           # trace for 5 seconds only
+    ./offcputime -f 5        # 5 seconds, and output in folded format
+    ./offcputime -m 1000     # trace only events that last more than 1000 usec
+    ./offcputime -M 10000    # trace only events that last less than 10000 usec
+    ./offcputime -p 185      # only trace threads for PID 185
+    ./offcputime -t 188      # only trace thread 188
+    ./offcputime -u          # only trace user threads (no kernel)
+    ./offcputime -k          # only trace kernel threads (no user)
+    ./offcputime -U          # only show user space stacks (no kernel)
+    ./offcputime -K          # only show kernel space stacks (no user)
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize off-CPU time by stack trace",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+thread_group = parser.add_mutually_exclusive_group()
+# Note: this script provides --pid and --tid flags but their arguments are
+# referred to internally using kernel nomenclature: TGID and PID.
+thread_group.add_argument("-p", "--pid", metavar="PID", dest="tgid",
+    help="trace this PID only", type=positive_int)
+thread_group.add_argument("-t", "--tid", metavar="TID", dest="pid",
+    help="trace this TID only", type=positive_int)
+thread_group.add_argument("-u", "--user-threads-only", action="store_true",
+    help="user threads only (no kernel threads)")
+thread_group.add_argument("-k", "--kernel-threads-only", action="store_true",
+    help="kernel threads only (no user threads)")
+stack_group = parser.add_mutually_exclusive_group()
+stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
+    help="show stacks from user space only (no kernel space stacks)")
+stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
+    help="show stacks from kernel space only (no user space stacks)")
+parser.add_argument("-d", "--delimited", action="store_true",
+    help="insert delimiter between kernel/user stacks")
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format")
+parser.add_argument("--stack-storage-size", default=1024,
+    type=positive_nonzero_int,
+    help="the number of unique stack traces that can be stored and "
+         "displayed (default 1024)")
+parser.add_argument("duration", nargs="?", default=99999999,
+    type=positive_nonzero_int,
+    help="duration of trace, in seconds")
+parser.add_argument("-m", "--min-block-time", default=1,
+    type=positive_nonzero_int,
+    help="the amount of time in microseconds over which we " +
+         "store traces (default 1)")
+parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1,
+    type=positive_nonzero_int,
+    help="the amount of time in microseconds under which we " +
+         "store traces (default U64_MAX)")
+parser.add_argument("--state", type=positive_int,
+    help="filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE" +
+         ") see include/linux/sched.h")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+if args.pid and args.tgid:
+    parser.error("specify only one of -p and -t")
+folded = args.folded
+duration = int(args.duration)
+debug = 0
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#define MINBLOCK_US    MINBLOCK_US_VALUEULL
+#define MAXBLOCK_US    MAXBLOCK_US_VALUEULL
+
+struct key_t {
+    u32 pid;
+    u32 tgid;
+    int user_stack_id;
+    int kernel_stack_id;
+    char name[TASK_COMM_LEN];
+};
+BPF_HASH(counts, struct key_t);
+BPF_HASH(start, u32);
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
+
+int oncpu(struct pt_regs *ctx, struct task_struct *prev) {
+    u32 pid = prev->pid;
+    u32 tgid = prev->tgid;
+    u64 ts, *tsp;
+
+    // record previous thread sleep time
+    if ((THREAD_FILTER) && (STATE_FILTER)) {
+        ts = bpf_ktime_get_ns();
+        start.update(&pid, &ts);
+    }
+
+    // get the current thread's start time
+    pid = bpf_get_current_pid_tgid();
+    tgid = bpf_get_current_pid_tgid() >> 32;
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;        // missed start or filtered
+    }
+
+    // calculate current thread's delta time
+    u64 delta = bpf_ktime_get_ns() - *tsp;
+    start.delete(&pid);
+    delta = delta / 1000;
+    if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) {
+        return 0;
+    }
+
+    // create map key
+    struct key_t key = {};
+
+    key.pid = pid;
+    key.tgid = tgid;
+    key.user_stack_id = USER_STACK_GET;
+    key.kernel_stack_id = KERNEL_STACK_GET;
+    bpf_get_current_comm(&key.name, sizeof(key.name));
+
+    counts.increment(key, delta);
+    return 0;
+}
+"""
+
+# set thread filter
+thread_context = ""
+if args.tgid is not None:
+    thread_context = "PID %d" % args.tgid
+    thread_filter = 'tgid == %d' % args.tgid
+elif args.pid is not None:
+    thread_context = "TID %d" % args.pid
+    thread_filter = 'pid == %d' % args.pid
+elif args.user_threads_only:
+    thread_context = "user threads"
+    thread_filter = '!(prev->flags & PF_KTHREAD)'
+elif args.kernel_threads_only:
+    thread_context = "kernel threads"
+    thread_filter = 'prev->flags & PF_KTHREAD'
+else:
+    thread_context = "all threads"
+    thread_filter = '1'
+if args.state == 0:
+    state_filter = 'prev->state == 0'
+elif args.state:
+    # these states are sometimes bitmask checked
+    state_filter = 'prev->state & %d' % args.state
+else:
+    state_filter = '1'
+bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
+bpf_text = bpf_text.replace('STATE_FILTER', state_filter)
+
+# set stack storage size
+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
+bpf_text = bpf_text.replace('MINBLOCK_US_VALUE', str(args.min_block_time))
+bpf_text = bpf_text.replace('MAXBLOCK_US_VALUE', str(args.max_block_time))
+
+# handle stack args
+kernel_stack_get = "stack_traces.get_stackid(ctx, 0)"
+user_stack_get = "stack_traces.get_stackid(ctx, BPF_F_USER_STACK)"
+stack_context = ""
+if args.user_stacks_only:
+    stack_context = "user"
+    kernel_stack_get = "-1"
+elif args.kernel_stacks_only:
+    stack_context = "kernel"
+    user_stack_get = "-1"
+else:
+    stack_context = "user + kernel"
+bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
+bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
+
+need_delimiter = args.delimited and not (args.kernel_stacks_only or
+                                         args.user_stacks_only)
+
+# check for an edge case; the code below will handle this case correctly
+# but ultimately nothing will be displayed
+if args.kernel_threads_only and args.user_stacks_only:
+    print("ERROR: Displaying user stacks for kernel threads " +
+          "doesn't make sense.", file=stderr)
+    exit(1)
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("error: 0 functions traced. Exiting.", file=stderr)
+    exit(1)
+
+# header
+if not folded:
+    print("Tracing off-CPU time (us) of %s by %s stack" %
+        (thread_context, stack_context), end="")
+    if duration < 99999999:
+        print(" for %d secs." % duration)
+    else:
+        print("... Hit Ctrl-C to end.")
+
+try:
+    sleep(duration)
+except KeyboardInterrupt:
+    # as cleanup can take many seconds, trap Ctrl-C:
+    signal.signal(signal.SIGINT, signal_ignore)
+
+if not folded:
+    print()
+
+missing_stacks = 0
+has_enomem = False
+counts = b.get_table("counts")
+stack_traces = b.get_table("stack_traces")
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+    # handle get_stackid errors
+    if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
+        missing_stacks += 1
+        has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM
+    if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
+        missing_stacks += 1
+        has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM
+
+    # user stacks will be symbolized by tgid, not pid, to avoid the overhead
+    # of one symbol resolver per thread
+    user_stack = [] if k.user_stack_id < 0 else \
+        stack_traces.walk(k.user_stack_id)
+    kernel_stack = [] if k.kernel_stack_id < 0 else \
+        stack_traces.walk(k.kernel_stack_id)
+
+    if folded:
+        # print folded stack output
+        user_stack = list(user_stack)
+        kernel_stack = list(kernel_stack)
+        line = [k.name.decode('utf-8', 'replace')]
+        # if we failed to get the stack id, such as due to no space (-ENOMEM) or
+        # hash collision (-EEXIST), we still print a placeholder for consistency
+        if not args.kernel_stacks_only:
+            if stack_id_err(k.user_stack_id):
+                line.append("[Missed User Stack]")
+            else:
+                line.extend([b.sym(addr, k.tgid) for addr in reversed(user_stack)])
+        if not args.user_stacks_only:
+            line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else [])
+            if stack_id_err(k.kernel_stack_id):
+                line.append("[Missed Kernel Stack]")
+            else:
+                line.extend([b.ksym(addr) for addr in reversed(kernel_stack)])
+        print("%s %d" % (";".join(line), v.value))
+    else:
+        # print default multi-line stack output
+        if not args.user_stacks_only:
+            if stack_id_err(k.kernel_stack_id):
+                print("    [Missed Kernel Stack]")
+            else:
+                for addr in kernel_stack:
+                    print("    %s" % b.ksym(addr))
+        if not args.kernel_stacks_only:
+            if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
+                print("    --")
+            if stack_id_err(k.user_stack_id):
+                print("    [Missed User Stack]")
+            else:
+                for addr in user_stack:
+                    print("    %s" % b.sym(addr, k.tgid))
+        print("    %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
+        print("        %d\n" % v.value)
+
+if missing_stacks > 0:
+    enomem_str = "" if not has_enomem else \
+        " Consider increasing --stack-storage-size."
+    print("WARNING: %d stack traces lost and could not be displayed.%s" %
+        (missing_stacks, enomem_str),
+        file=stderr)
diff --git a/tools/offcputime_example.txt b/tools/offcputime_example.txt
new file mode 100644
index 0000000..1f6066d
--- /dev/null
+++ b/tools/offcputime_example.txt
@@ -0,0 +1,771 @@
+Demonstrations of offcputime, the Linux eBPF/bcc version.
+
+
+This program shows stack traces that were blocked, and the total duration they
+were blocked. It works by tracing when threads block and when they return to
+CPU, measuring both the time they were blocked (aka the "off-CPU time") and the
+blocked stack trace and the task name. This data is summarized in kernel by
+summing the blocked time by unique stack trace and task name.
+
+Here is some example output. The -K option was used to only match kernel stacks.
+To explain what we are seeing: the very first stack trace looks like a page
+fault (do_page_fault() etc) from the "chmod" command, and in total was off-CPU
+for 13 microseconds.
+
+# ./offcputime -K
+Tracing off-CPU time (us) of all threads by kernel stack... Hit Ctrl-C to end.
+^C
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit
+    wait_on_page_bit_killable
+    __lock_page_or_retry
+    filemap_fault
+    __do_fault
+    handle_mm_fault
+    __do_page_fault
+    do_page_fault
+    page_fault
+    chmod
+        13
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    ddebug_tables
+    rcuos/0
+        22
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit_lock
+    __lock_page
+    lock_page
+    __do_fault
+    handle_mm_fault
+    __do_page_fault
+    do_page_fault
+    page_fault
+    run
+        27
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit
+    wait_on_page_bit_killable
+    __lock_page_or_retry
+    filemap_fault
+    __do_fault
+    handle_mm_fault
+    __do_page_fault
+    do_page_fault
+    page_fault
+    clear_user
+    padzero
+    load_elf_binary
+    search_binary_handler
+    load_script
+    search_binary_handler
+    do_execveat_common.isra.27
+    run
+        28
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit
+    wait_on_page_bit_killable
+    __lock_page_or_retry
+    filemap_fault
+    __do_fault
+    handle_mm_fault
+    __do_page_fault
+    do_page_fault
+    page_fault
+    run
+        82
+
+    schedule
+    pipe_wait
+    pipe_read
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    bash
+        94
+
+    schedule
+    rcu_gp_kthread
+    kthread
+    ret_from_fork
+    ddebug_tables
+    rcu_sched
+        104
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit
+    out_of_line_wait_on_bit
+    __wait_on_buffer
+    jbd2_journal_commit_transaction
+    kjournald2
+    kthread
+    ret_from_fork
+    mb_cache_list
+    jbd2/xvda1-8
+        986
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit
+    out_of_line_wait_on_bit
+    __wait_on_buffer
+    jbd2_journal_commit_transaction
+    kjournald2
+    kthread
+    ret_from_fork
+    mb_cache_list
+    jbd2/xvda1-8
+        6630
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit
+    out_of_line_wait_on_bit
+    do_get_write_access
+    jbd2_journal_get_write_access
+    __ext4_journal_get_write_access
+    ext4_mb_mark_diskspace_used
+    ext4_mb_new_blocks
+    ext4_ext_map_blocks
+    ext4_map_blocks
+    ext4_writepages
+    do_writepages
+    __filemap_fdatawrite_range
+    filemap_flush
+    ext4_alloc_da_blocks
+    ext4_rename
+    ext4_rename2
+    supervise
+        6645
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit
+    out_of_line_wait_on_bit
+    do_get_write_access
+    jbd2_journal_get_write_access
+    __ext4_journal_get_write_access
+    __ext4_new_inode
+    ext4_create
+    vfs_create
+    path_openat
+    do_filp_open
+    do_sys_open
+    sys_open
+    entry_SYSCALL_64_fastpath
+    supervise
+        12702
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/2
+        16036
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/4
+        24085
+
+    schedule
+    do_wait
+    sys_wait4
+    entry_SYSCALL_64_fastpath
+    run
+        233055
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    bit_wait_io
+    __wait_on_bit
+    wait_on_page_bit
+    truncate_inode_pages_range
+    truncate_inode_pages_final
+    ext4_evict_inode
+    evict
+    iput
+    __dentry_kill
+    dput
+    sys_rename
+    entry_SYSCALL_64_fastpath
+    supervise
+        297113
+
+    schedule
+    schedule_timeout
+    wait_woken
+    n_tty_read
+    tty_read
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    bash
+        1789866
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    do_blockdev_direct_IO
+    __blockdev_direct_IO
+    blkdev_direct_IO
+    generic_file_read_iter
+    blkdev_read_iter
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    dd
+        3310763
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    watchdog/1
+        3999989
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    watchdog/5
+        3999995
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    watchdog/4
+        3999996
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    watchdog/0
+        3999996
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    watchdog/3
+        3999998
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    watchdog/7
+        3999999
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    watchdog/2
+        4000001
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    watchdog/6
+        4000001
+
+    schedule
+    do_wait
+    sys_wait4
+    entry_SYSCALL_64_fastpath
+    bash
+        4039675
+
+    schedule
+    do_nanosleep
+    hrtimer_nanosleep
+    sys_nanosleep
+    entry_SYSCALL_64_fastpath
+    svscan
+        5000112
+
+    schedule
+    schedule_hrtimeout_range_clock
+    schedule_hrtimeout_range
+    poll_schedule_timeout
+    do_select
+    core_sys_select
+    sys_select
+    entry_SYSCALL_64_fastpath
+    snmpd
+        5998761
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    migration/3
+        6149779
+
+    schedule
+    schedule_hrtimeout_range_clock
+    schedule_hrtimeout_range
+    poll_schedule_timeout
+    do_select
+    core_sys_select
+    sys_select
+    entry_SYSCALL_64_fastpath
+    ntpd
+        6999832
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/u16:2
+        7131941
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/3:0
+        7999844
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/1:1
+        7999872
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/2:1
+        7999889
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/5:1
+        7999936
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/7:1
+        7999938
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/6:1
+        7999940
+
+    schedule
+    do_nanosleep
+    hrtimer_nanosleep
+    sys_nanosleep
+    entry_SYSCALL_64_fastpath
+    tail
+        8000905
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    migration/7
+        8197046
+
+    schedule
+    pipe_wait
+    pipe_read
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    readproctitle
+        8197835
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    migration/4
+        8201851
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    migration/2
+        8203375
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    migration/6
+        8208664
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    migration/5
+        8209819
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    ddebug_tables
+    migration/0
+        8211292
+
+    schedule
+    smpboot_thread_fn
+    kthread
+    ret_from_fork
+    migration/1
+        8212100
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/0:2
+        8270305
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/3
+        8349697
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/2
+        8363357
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/1
+        8365338
+
+    schedule
+    schedule_timeout
+    xfs_buf_terminate
+    kthread
+    ret_from_fork
+    xfsaild/md0
+        8371514
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/4
+        8384013
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/5
+        8390016
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    ddebug_tables
+    rcuos/0
+        8405428
+
+    schedule
+    schedule_timeout
+    rcu_gp_kthread
+    kthread
+    ret_from_fork
+    ddebug_tables
+    rcu_sched
+        8406930
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/7
+        8409575
+
+    schedule
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+    rcuos/6
+        8415062
+
+    schedule
+    schedule_hrtimeout_range_clock
+    schedule_hrtimeout_range
+    poll_schedule_timeout
+    do_select
+    core_sys_select
+    sys_select
+    entry_SYSCALL_64_fastpath
+    offcputime
+        8421478
+
+    schedule
+    worker_thread
+    kthread
+    ret_from_fork
+    kworker/4:0
+        8421492
+
+    schedule
+    schedule_hrtimeout_range_clock
+    schedule_hrtimeout_range
+    poll_schedule_timeout
+    do_select
+    core_sys_select
+    sys_select
+    entry_SYSCALL_64_fastpath
+    sshd
+        14249005
+
+    schedule
+    schedule_hrtimeout_range_clock
+    schedule_hrtimeout_range
+    poll_schedule_timeout
+    do_sys_poll
+    sys_poll
+    entry_SYSCALL_64_fastpath
+    supervise
+        81670888
+
+The last few stack traces aren't very interesting, since they are threads that
+are often blocked off-CPU waiting for work.
+
+Do be somewhat careful with overhead: this is tracing scheduler functions, which
+can be called very frequently. While this uses in-kernel summaries for
+efficiency, the rate of scheduler functions can be very high (> 1,000,000/sec),
+and this is performing stack walks when threads return to CPU. At some point
+the overhead will be measurable.
+
+
+A -p option can be used to filter (in-kernel) on a single process ID. For
+example, only matching PID 26651, which is a running "dd" command:
+
+# ./offcputime -K -p 26651
+Tracing off-CPU time (us) of all threads by kernel stack... Hit Ctrl-C to end.
+^C
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    do_blockdev_direct_IO
+    __blockdev_direct_IO
+    blkdev_direct_IO
+    generic_file_read_iter
+    blkdev_read_iter
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    dd
+        2405710
+
+The stack trace shows "dd" is blocked waiting on disk I/O, as expected, for a
+total of 2.4 seconds during tracing.
+
+
+A duration can be added, for example, tracing for 5 seconds only:
+
+# ./offcputime -K -p 26651 5
+Tracing off-CPU time (us) of all threads by kernel stack for 5 secs.
+
+    schedule
+    schedule_timeout
+    io_schedule_timeout
+    do_blockdev_direct_IO
+    __blockdev_direct_IO
+    blkdev_direct_IO
+    generic_file_read_iter
+    blkdev_read_iter
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    dd
+        4413909
+
+Here, dd was blocked for 4.4 seconds out of 5. Or put differently, likely
+on-CPU for about 12% of the time. Which matches the ratio seen by time(1):
+
+# time dd if=/dev/md0 iflag=direct of=/dev/null bs=1k
+^C108115+0 records in
+108114+0 records out
+110708736 bytes (111 MB) copied, 13.7565 s, 8.0 MB/s
+
+real	0m13.760s
+user	0m0.000s
+sys	0m1.739s
+
+
+A -f option will emit output using the "folded stacks" format, which can be
+read directly by flamegraph.pl from the FlameGraph open source software
+(https://github.com/brendangregg/FlameGraph). Eg:
+
+# ./offcputime -K -f 5
+bash;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;tty_read;n_tty_read;call_rwsem_down_read_failed;rwsem_down_read_failed;schedule 8
+yes;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;call_rwsem_down_read_failed;rwsem_down_read_failed;schedule 14
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__do_fault;filemap_fault;__lock_page_or_retry;wait_on_page_bit_killable;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 33
+rcuos/4;ret_from_fork;kthread;rcu_nocb_kthread;schedule 45
+bash;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;pipe_read;pipe_wait;schedule 88
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__do_fault;filemap_fault;__lock_page_or_retry;wait_on_page_bit_killable;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 108
+jbd2/xvda1-8;mb_cache_list;ret_from_fork;kthread;kjournald2;jbd2_journal_commit_transaction;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 828
+jbd2/xvda1-8;mb_cache_list;ret_from_fork;kthread;kjournald2;jbd2_journal_commit_transaction;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 6201
+supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;truncate_inode_pages_final;truncate_inode_pages_range;wait_on_page_bit;__wait_on_bit;bit_wait_io;io_schedule_timeout;schedule_timeout;schedule 41049
+run;entry_SYSCALL_64_fastpath;sys_wait4;do_wait;schedule 120709
+bash;entry_SYSCALL_64_fastpath;sys_wait4;do_wait;schedule 699320
+ksoftirqd/0;ret_from_fork;kthread;smpboot_thread_fn;schedule 1077529
+bash;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;tty_read;n_tty_read;wait_woken;schedule_timeout;schedule 1362045
+sshd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 1377627
+migration/0;ddebug_tables;ret_from_fork;kthread;smpboot_thread_fn;schedule 2040753
+snmpd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 2197568
+migration/5;ret_from_fork;kthread;smpboot_thread_fn;schedule 3079426
+migration/7;ret_from_fork;kthread;smpboot_thread_fn;schedule 3084746
+kworker/6:2;ret_from_fork;kthread;worker_thread;schedule 3940583
+kworker/5:1;ret_from_fork;kthread;worker_thread;schedule 3944892
+kworker/1:2;ret_from_fork;kthread;worker_thread;schedule 3999646
+ntpd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 3999904
+kworker/u16:0;ret_from_fork;kthread;worker_thread;schedule 3999967
+kworker/7:0;ret_from_fork;kthread;worker_thread;schedule 3999987
+tail;entry_SYSCALL_64_fastpath;sys_nanosleep;hrtimer_nanosleep;do_nanosleep;schedule 4000473
+migration/1;ret_from_fork;kthread;smpboot_thread_fn;schedule 4091150
+migration/4;ret_from_fork;kthread;smpboot_thread_fn;schedule 4095217
+readproctitle;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;pipe_read;pipe_wait;schedule 4108470
+migration/3;ret_from_fork;kthread;smpboot_thread_fn;schedule 4109264
+migration/2;ret_from_fork;kthread;smpboot_thread_fn;schedule 4109280
+migration/6;ret_from_fork;kthread;smpboot_thread_fn;schedule 4111143
+kworker/4:0;ret_from_fork;kthread;worker_thread;schedule 4402350
+kworker/3:0;ret_from_fork;kthread;worker_thread;schedule 4433988
+kworker/2:1;ret_from_fork;kthread;worker_thread;schedule 4636142
+kworker/0:2;ret_from_fork;kthread;worker_thread;schedule 4832023
+rcuos/1;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4974186
+rcuos/5;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4977137
+rcuos/6;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4987769
+rcuos/3;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4992282
+rcuos/4;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4992364
+rcuos/2;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4992714
+rcuos/0;ddebug_tables;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4996504
+rcuos/7;ret_from_fork;kthread;rcu_nocb_kthread;schedule 4998497
+rcu_sched;ddebug_tables;ret_from_fork;kthread;rcu_gp_kthread;schedule_timeout;schedule 5000686
+offcputime;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 5005063
+dd;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;blkdev_read_iter;generic_file_read_iter;blkdev_direct_IO;__blockdev_direct_IO;do_blockdev_direct_IO;io_schedule_timeout;schedule_timeout;schedule 8025599
+supervise;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule 40835611
+
+The stack traces are shown as single lines, with functions separated by
+semicolons. The first entry is the task name. The 2nd column is the total
+off-CPU time.
+
+I'd save this output to a file, then move it to the system where you'll be
+creating your "off-CPU time flame graphs".
+
+
+USAGE message:
+
+# ./offcputime.py -h
+usage: offcputime.py [-h] [-p PID | -t TID | -u | -k] [-U | -K] [-d] [-f]
+                     [--stack-storage-size STACK_STORAGE_SIZE]
+                     [-m MIN_BLOCK_TIME] [-M MAX_BLOCK_TIME] [--state STATE]
+                     [duration]
+
+Summarize off-CPU time by stack trace
+
+positional arguments:
+  duration              duration of trace, in seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     trace this PID only
+  -t TID, --tid TID     trace this TID only
+  -u, --user-threads-only
+                        user threads only (no kernel threads)
+  -k, --kernel-threads-only
+                        kernel threads only (no user threads)
+  -U, --user-stacks-only
+                        show stacks from user space only (no kernel space
+                        stacks)
+  -K, --kernel-stacks-only
+                        show stacks from kernel space only (no user space
+                        stacks)
+  -d, --delimited       insert delimiter between kernel/user stacks
+  -f, --folded          output folded format
+  --stack-storage-size STACK_STORAGE_SIZE
+                        the number of unique stack traces that can be stored
+                        and displayed (default 1024)
+  -m MIN_BLOCK_TIME, --min-block-time MIN_BLOCK_TIME
+                        the amount of time in microseconds over which we store
+                        traces (default 1)
+  -M MAX_BLOCK_TIME, --max-block-time MAX_BLOCK_TIME
+                        the amount of time in microseconds under which we
+                        store traces (default U64_MAX)
+  --state STATE         filter on this thread state bitmask (eg, 2 ==
+                        TASK_UNINTERRUPTIBLE) see include/linux/sched.h
+
+examples:
+    ./offcputime             # trace off-CPU stack time until Ctrl-C
+    ./offcputime 5           # trace for 5 seconds only
+    ./offcputime -f 5        # 5 seconds, and output in folded format
+    ./offcputime -m 1000     # trace only events that last more than 1000 usec
+    ./offcputime -M 10000    # trace only events that last less than 10000 usec
+    ./offcputime -p 185      # only trace threads for PID 185
+    ./offcputime -t 188      # only trace thread 188
+    ./offcputime -u          # only trace user threads (no kernel)
+    ./offcputime -k          # only trace kernel threads (no user)
+    ./offcputime -U          # only show user space stacks (no kernel)
+    ./offcputime -K          # only show kernel space stacks (no user)
diff --git a/tools/offwaketime.py b/tools/offwaketime.py
new file mode 100755
index 0000000..0e4f35e
--- /dev/null
+++ b/tools/offwaketime.py
@@ -0,0 +1,393 @@
+#!/usr/bin/python
+#
+# offwaketime   Summarize blocked time by kernel off-CPU stack + waker stack
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: offwaketime [-h] [-p PID | -u | -k] [-U | -K] [-f] [duration]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Jan-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep
+import argparse
+import signal
+import errno
+from sys import stderr
+
+# arg validation
+def positive_int(val):
+    try:
+        ival = int(val)
+    except ValueError:
+        raise argparse.ArgumentTypeError("must be an integer")
+
+    if ival < 0:
+        raise argparse.ArgumentTypeError("must be positive")
+    return ival
+
+def positive_nonzero_int(val):
+    ival = positive_int(val)
+    if ival == 0:
+        raise argparse.ArgumentTypeError("must be nonzero")
+    return ival
+
+def stack_id_err(stack_id):
+    # -EFAULT in get_stackid normally means the stack trace is not available,
+    # such as when requesting a kernel stack trace from userspace code
+    return (stack_id < 0) and (stack_id != -errno.EFAULT)
+
+# arguments
+examples = """examples:
+    ./offwaketime             # trace off-CPU + waker stack time until Ctrl-C
+    ./offwaketime 5           # trace for 5 seconds only
+    ./offwaketime -f 5        # 5 seconds, and output in folded format
+    ./offwaketime -m 1000     # trace only events that last more than 1000 usec
+    ./offwaketime -M 9000     # trace only events that last less than 9000 usec
+    ./offwaketime -p 185      # only trace threads for PID 185
+    ./offwaketime -t 188      # only trace thread 188
+    ./offwaketime -u          # only trace user threads (no kernel)
+    ./offwaketime -k          # only trace kernel threads (no user)
+    ./offwaketime -U          # only show user space stacks (no kernel)
+    ./offwaketime -K          # only show kernel space stacks (no user)
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize blocked time by kernel stack trace + waker stack",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+thread_group = parser.add_mutually_exclusive_group()
+# Note: this script provides --pid and --tid flags but their arguments are
+# referred to internally using kernel nomenclature: TGID and PID.
+thread_group.add_argument("-p", "--pid", metavar="PID", dest="tgid",
+    help="trace this PID only", type=positive_int)
+thread_group.add_argument("-t", "--tid", metavar="TID", dest="pid",
+    help="trace this TID only", type=positive_int)
+thread_group.add_argument("-u", "--user-threads-only", action="store_true",
+    help="user threads only (no kernel threads)")
+thread_group.add_argument("-k", "--kernel-threads-only", action="store_true",
+    help="kernel threads only (no user threads)")
+stack_group = parser.add_mutually_exclusive_group()
+stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
+    help="show stacks from user space only (no kernel space stacks)")
+stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
+    help="show stacks from kernel space only (no user space stacks)")
+parser.add_argument("-d", "--delimited", action="store_true",
+    help="insert delimiter between kernel/user stacks")
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format")
+parser.add_argument("--stack-storage-size", default=1024,
+    type=positive_nonzero_int,
+    help="the number of unique stack traces that can be stored and "
+         "displayed (default 1024)")
+parser.add_argument("duration", nargs="?", default=99999999,
+    type=positive_nonzero_int,
+    help="duration of trace, in seconds")
+parser.add_argument("-m", "--min-block-time", default=1,
+    type=positive_nonzero_int,
+    help="the amount of time in microseconds over which we " +
+         "store traces (default 1)")
+parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1,
+    type=positive_nonzero_int,
+    help="the amount of time in microseconds under which we " +
+         "store traces (default U64_MAX)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+folded = args.folded
+duration = int(args.duration)
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#define MINBLOCK_US    MINBLOCK_US_VALUEULL
+#define MAXBLOCK_US    MAXBLOCK_US_VALUEULL
+
+struct key_t {
+    char waker[TASK_COMM_LEN];
+    char target[TASK_COMM_LEN];
+    int w_k_stack_id;
+    int w_u_stack_id;
+    int t_k_stack_id;
+    int t_u_stack_id;
+    u32 t_pid;
+    u32 t_tgid;
+    u32 w_pid;
+    u32 w_tgid;
+};
+BPF_HASH(counts, struct key_t);
+
+// Key of this hash is PID of waiting Process,
+// value is timestamp when it went into waiting
+BPF_HASH(start, u32);
+
+struct wokeby_t {
+    char name[TASK_COMM_LEN];
+    int k_stack_id;
+    int u_stack_id;
+    int w_pid;
+    int w_tgid;
+};
+// Key of the hash is PID of the Process to be woken, value is information
+// of the Process that wakes it
+BPF_HASH(wokeby, u32, struct wokeby_t);
+
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
+
+int waker(struct pt_regs *ctx, struct task_struct *p) {
+    // PID and TGID of the target Process to be woken
+    u32 pid = p->pid;
+    u32 tgid = p->tgid;
+
+    if (!(THREAD_FILTER)) {
+        return 0;
+    }
+
+    // Construct information about current (the waker) Process
+    struct wokeby_t woke = {};
+    bpf_get_current_comm(&woke.name, sizeof(woke.name));
+    woke.k_stack_id = KERNEL_STACK_GET;
+    woke.u_stack_id = USER_STACK_GET;
+    woke.w_pid = bpf_get_current_pid_tgid();
+    woke.w_tgid = bpf_get_current_pid_tgid() >> 32;
+
+    wokeby.update(&pid, &woke);
+    return 0;
+}
+
+int oncpu(struct pt_regs *ctx, struct task_struct *p) {
+    // PID and TGID of the previous Process (Process going into waiting)
+    u32 pid = p->pid;
+    u32 tgid = p->tgid;
+    u64 *tsp;
+    u64 ts = bpf_ktime_get_ns();
+
+    // Record timestamp for the previous Process (Process going into waiting)
+    if (THREAD_FILTER) {
+        start.update(&pid, &ts);
+    }
+
+    // Calculate current Process's wait time by finding the timestamp of when
+    // it went into waiting.
+    // pid and tgid are now the PID and TGID of the current (waking) Process.
+    pid = bpf_get_current_pid_tgid();
+    tgid = bpf_get_current_pid_tgid() >> 32;
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        // Missed or filtered when the Process went into waiting
+        return 0;
+    }
+    u64 delta = ts - *tsp;
+    start.delete(&pid);
+    delta = delta / 1000;
+    if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) {
+        return 0;
+    }
+
+    // create map key
+    struct key_t key = {};
+    struct wokeby_t *woke;
+
+    bpf_get_current_comm(&key.target, sizeof(key.target));
+    key.t_pid = pid;
+    key.t_tgid = tgid;
+    key.t_k_stack_id = KERNEL_STACK_GET;
+    key.t_u_stack_id = USER_STACK_GET;
+
+    woke = wokeby.lookup(&pid);
+    if (woke) {
+        key.w_k_stack_id = woke->k_stack_id;
+        key.w_u_stack_id = woke->u_stack_id;
+        key.w_pid = woke->w_pid;
+        key.w_tgid = woke->w_tgid;
+        __builtin_memcpy(&key.waker, woke->name, TASK_COMM_LEN);
+        wokeby.delete(&pid);
+    }
+
+    counts.increment(key, delta);
+    return 0;
+}
+"""
+
+# set thread filter
+thread_context = ""
+if args.tgid is not None:
+    thread_context = "PID %d" % args.tgid
+    thread_filter = 'tgid == %d' % args.tgid
+elif args.pid is not None:
+    thread_context = "TID %d" % args.pid
+    thread_filter = 'pid == %d' % args.pid
+elif args.user_threads_only:
+    thread_context = "user threads"
+    thread_filter = '!(p->flags & PF_KTHREAD)'
+elif args.kernel_threads_only:
+    thread_context = "kernel threads"
+    thread_filter = 'p->flags & PF_KTHREAD'
+else:
+    thread_context = "all threads"
+    thread_filter = '1'
+bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
+
+# set stack storage size
+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
+bpf_text = bpf_text.replace('MINBLOCK_US_VALUE', str(args.min_block_time))
+bpf_text = bpf_text.replace('MAXBLOCK_US_VALUE', str(args.max_block_time))
+
+# handle stack args
+kernel_stack_get = "stack_traces.get_stackid(ctx, 0)"
+user_stack_get = "stack_traces.get_stackid(ctx, BPF_F_USER_STACK)"
+stack_context = ""
+if args.user_stacks_only:
+    stack_context = "user"
+    kernel_stack_get = "-1"
+elif args.kernel_stacks_only:
+    stack_context = "kernel"
+    user_stack_get = "-1"
+else:
+    stack_context = "user + kernel"
+bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
+bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
+if args.ebpf:
+    print(bpf_text)
+    exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
+b.attach_kprobe(event="try_to_wake_up", fn_name="waker")
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("0 functions traced. Exiting.")
+    exit()
+
+# header
+if not folded:
+    print("Tracing blocked time (us) by %s off-CPU and waker stack" %
+        stack_context, end="")
+    if duration < 99999999:
+        print(" for %d secs." % duration)
+    else:
+        print("... Hit Ctrl-C to end.")
+
+try:
+    sleep(duration)
+except KeyboardInterrupt:
+    # as cleanup can take many seconds, trap Ctrl-C:
+    # print a newline for folded output on Ctrl-C
+    signal.signal(signal.SIGINT, signal_ignore)
+
+
+if not folded:
+    print()
+
+missing_stacks = 0
+has_enomem = False
+counts = b.get_table("counts")
+stack_traces = b.get_table("stack_traces")
+need_delimiter = args.delimited and not (args.kernel_stacks_only or
+                                         args.user_stacks_only)
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+    # handle get_stackid errors
+    if not args.user_stacks_only:
+        missing_stacks += int(stack_id_err(k.w_k_stack_id))
+        missing_stacks += int(stack_id_err(k.t_k_stack_id))
+        has_enomem = has_enomem or (k.w_k_stack_id == -errno.ENOMEM) or \
+                     (k.t_k_stack_id == -errno.ENOMEM)
+    if not args.kernel_stacks_only:
+        missing_stacks += int(stack_id_err(k.w_u_stack_id))
+        missing_stacks += int(stack_id_err(k.t_u_stack_id))
+        has_enomem = has_enomem or (k.w_u_stack_id == -errno.ENOMEM) or \
+                     (k.t_u_stack_id == -errno.ENOMEM)
+
+    waker_user_stack = [] if k.w_u_stack_id < 1 else \
+        reversed(list(stack_traces.walk(k.w_u_stack_id))[1:])
+    waker_kernel_stack = [] if k.w_k_stack_id < 1 else \
+        reversed(list(stack_traces.walk(k.w_k_stack_id))[1:])
+    target_user_stack = [] if k.t_u_stack_id < 1 else \
+        stack_traces.walk(k.t_u_stack_id)
+    target_kernel_stack = [] if k.t_k_stack_id < 1 else \
+        stack_traces.walk(k.t_k_stack_id)
+
+    if folded:
+        # print folded stack output
+        line = [k.target.decode('utf-8', 'replace')]
+        if not args.kernel_stacks_only:
+            if stack_id_err(k.t_u_stack_id):
+                line.append("[Missed User Stack]")
+            else:
+                line.extend([b.sym(addr, k.t_tgid)
+                    for addr in reversed(list(target_user_stack)[1:])])
+        if not args.user_stacks_only:
+            line.extend(["-"] if (need_delimiter and k.t_k_stack_id > 0 and k.t_u_stack_id > 0) else [])
+            if stack_id_err(k.t_k_stack_id):
+                line.append("[Missed Kernel Stack]")
+            else:
+                line.extend([b.ksym(addr)
+                    for addr in reversed(list(target_kernel_stack)[1:])])
+        line.append("--")
+        if not args.user_stacks_only:
+            if stack_id_err(k.w_k_stack_id):
+                line.append("[Missed Kernel Stack]")
+            else:
+                line.extend([b.ksym(addr)
+                    for addr in reversed(list(waker_kernel_stack))])
+        if not args.kernel_stacks_only:
+            line.extend(["-"] if (need_delimiter and k.w_u_stack_id > 0 and k.w_k_stack_id > 0) else [])
+            if stack_id_err(k.w_u_stack_id):
+                line.extend("[Missed User Stack]")
+            else:
+                line.extend([b.sym(addr, k.w_tgid)
+                    for addr in reversed(list(waker_user_stack))])
+        line.append(k.waker.decode('utf-8', 'replace'))
+        print("%s %d" % (";".join(line), v.value))
+    else:
+        # print wakeup name then stack in reverse order
+        print("    %-16s %s %s" % ("waker:", k.waker.decode('utf-8', 'replace'), k.t_pid))
+        if not args.kernel_stacks_only:
+            if stack_id_err(k.w_u_stack_id):
+                print("    [Missed User Stack]")
+            else:
+                for addr in waker_user_stack:
+                    print("    %s" % b.sym(addr, k.w_tgid))
+        if not args.user_stacks_only:
+            if need_delimiter and k.w_u_stack_id > 0 and k.w_k_stack_id > 0:
+                print("    -")
+            if stack_id_err(k.w_k_stack_id):
+                print("    [Missed Kernel Stack]")
+            else:
+                for addr in waker_kernel_stack:
+                    print("    %s" % b.ksym(addr))
+
+        # print waker/wakee delimiter
+        print("    %-16s %s" % ("--", "--"))
+
+        if not args.user_stacks_only:
+            if stack_id_err(k.t_k_stack_id):
+                print("    [Missed Kernel Stack]")
+            else:
+                for addr in target_kernel_stack:
+                    print("    %s" % b.ksym(addr))
+        if not args.kernel_stacks_only:
+            if need_delimiter and k.t_u_stack_id > 0 and k.t_k_stack_id > 0:
+                print("    -")
+            if stack_id_err(k.t_u_stack_id):
+                print("    [Missed User Stack]")
+            else:
+                for addr in target_user_stack:
+                    print("    %s" % b.sym(addr, k.t_tgid))
+        print("    %-16s %s %s" % ("target:", k.target.decode('utf-8', 'replace'), k.w_pid))
+        print("        %d\n" % v.value)
+
+if missing_stacks > 0:
+    enomem_str = " Consider increasing --stack-storage-size."
+    print("WARNING: %d stack traces lost and could not be displayed.%s" %
+        (missing_stacks, (enomem_str if has_enomem else "")),
+        file=stderr)
diff --git a/tools/offwaketime_example.txt b/tools/offwaketime_example.txt
new file mode 100644
index 0000000..8291e2f
--- /dev/null
+++ b/tools/offwaketime_example.txt
@@ -0,0 +1,355 @@
+Demonstrations of offwaketime, the Linux eBPF/bcc version.
+
+
+This program shows kernel stack traces and task names that were blocked and
+"off-CPU", along with the stack traces and task names for the threads that woke
+them, and the total elapsed time from when they blocked to when they were woken
+up.  This combines the summaries from both the offcputime and wakeuptime tools.
+The time measurement will be very similar to off-CPU time; however, off-CPU time
+may include a little extra time spent waiting on a run queue to be scheduled.
+The combined stacks, task names, and total time is summarized in kernel context
+for efficiency, using an eBPF map.
+
+The output summary will further help you identify reasons why threads
+were blocking, and quantify the time from when they were blocked to woken up.
+This spans all types of blocking activity: disk I/O, network I/O, locks, page
+faults, swapping, sleeping, involuntary context switches, etc.
+
+Here is some sample output from a 5 second trace, truncated to highlight several
+stack pairs:
+
+# ./offwaketime 5
+Tracing blocked time (us) by kernel off-CPU and waker stack for 5 secs.
+
+[...]
+
+    waker:           swapper/0
+    ffffffff8137897c blk_mq_complete_request
+    ffffffff81378930 __blk_mq_complete_request
+    ffffffff81378793 blk_mq_end_request
+    ffffffff813778b9 blk_mq_free_request
+    ffffffff8137782d __blk_mq_free_request
+    ffffffff8137bc57 blk_mq_put_tag
+    ffffffff8137b2c7 bt_clear_tag
+    ffffffff810b54d9 __wake_up
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b5b12 autoremove_wake_function
+    -                -
+    ffffffff81785085 schedule
+    ffffffff81787e16 schedule_timeout
+    ffffffff81784634 __sched_text_start
+    ffffffff8137b839 bt_get
+    ffffffff8137bbf7 blk_mq_get_tag
+    ffffffff8137761b __blk_mq_alloc_request
+    ffffffff81379442 blk_mq_map_request
+    ffffffff8137a445 blk_sq_make_request
+    ffffffff8136ebc3 generic_make_request
+    ffffffff8136ed07 submit_bio
+    ffffffff81225adf submit_bh_wbc
+    ffffffff81225b42 submit_bh
+    ffffffff812721e0 __ext4_get_inode_loc
+    ffffffff812751dd ext4_iget
+    ffffffff81275c90 ext4_iget_normal
+    ffffffff8127f45b ext4_lookup
+    ffffffff811f94ed lookup_real
+    ffffffff811fad43 __lookup_hash
+    ffffffff811fc3fb walk_component
+    ffffffff811fd050 link_path_walk
+    target:          cksum
+        56529
+
+[...]
+
+    waker:           swapper/1
+    ffffffff81475cf0 xen_evtchn_do_upcall
+    ffffffff81473e83 __xen_evtchn_do_upcall
+    ffffffff814766f7 evtchn_2l_handle_events
+    ffffffff810cb0c2 generic_handle_irq
+    ffffffff810cf1ca handle_percpu_irq
+    ffffffff810cb9c8 handle_irq_event_percpu
+    ffffffff8100b9e1 xen_timer_interrupt
+    ffffffff810dfba8 hrtimer_interrupt
+    ffffffff810df494 __hrtimer_run_queues
+    ffffffff810df082 hrtimer_wakeup
+    -                -
+    ffffffff81785085 schedule
+    ffffffff817880bf do_nanosleep
+    ffffffff810e003d hrtimer_nanosleep
+    ffffffff810e018c sys_nanosleep
+    ffffffff81789076 entry_SYSCALL_64_fastpath
+    target:          vmstat
+        3000331
+
+[...]
+
+    waker:           swapper/0
+    ffffffff81378930 __blk_mq_complete_request
+    ffffffff8137875a blk_mq_end_request
+    ffffffff8136f157 blk_update_request
+    ffffffff8136836f bio_endio
+    ffffffff812ba709 mpage_end_io
+    ffffffff81176af9 unlock_page
+    ffffffff810b5781 __wake_up_bit
+    ffffffff810b54d9 __wake_up
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b5b7e wake_bit_function
+    -                -
+    ffffffff81785085 schedule
+    ffffffff81787e16 schedule_timeout
+    ffffffff81784634 __sched_text_start
+    ffffffff8178586b bit_wait_io
+    ffffffff8178563e __wait_on_bit_lock
+    ffffffff8117616e __lock_page_killable
+    ffffffff81177fce generic_file_read_iter
+    ffffffff811ef9c7 __vfs_read
+    ffffffff811f0206 vfs_read
+    ffffffff811f0eb6 sys_read
+    ffffffff81789076 entry_SYSCALL_64_fastpath
+    target:          cksum
+        4334521
+
+[...]
+
+    waker:           kworker/u16:2
+    ffffffff8178940f ret_from_fork
+    ffffffff81092979 kthread
+    ffffffff8108caeb worker_thread
+    ffffffff8108c80a process_one_work
+    ffffffff81496df5 flush_to_ldisc
+    ffffffff81494424 n_tty_receive_buf2
+    ffffffff814939fd n_tty_receive_buf_common
+    ffffffff810b54d9 __wake_up
+    ffffffff810b5462 __wake_up_common
+    ffffffff812037b6 pollwake
+    -                -
+    ffffffff81785085 schedule
+    ffffffff81788234 schedule_hrtimeout_range_clock
+    ffffffff81788253 schedule_hrtimeout_range
+    ffffffff812035d4 poll_schedule_timeout
+    ffffffff8120402a do_select
+    ffffffff812042f0 core_sys_select
+    ffffffff8120449b sys_select
+    ffffffff81789076 entry_SYSCALL_64_fastpath
+    target:          sshd
+        6530897
+
+[...]
+
+    waker:           swapper/0
+    ffffffff81475cf0 xen_evtchn_do_upcall
+    ffffffff81473e83 __xen_evtchn_do_upcall
+    ffffffff814766f7 evtchn_2l_handle_events
+    ffffffff810cb0c2 generic_handle_irq
+    ffffffff810cf1ca handle_percpu_irq
+    ffffffff810cb9c8 handle_irq_event_percpu
+    ffffffff8100b9e1 xen_timer_interrupt
+    ffffffff810dfba8 hrtimer_interrupt
+    ffffffff810df494 __hrtimer_run_queues
+    ffffffff810df082 hrtimer_wakeup
+    -                -
+    ffffffff81785085 schedule
+    ffffffff81787fc3 schedule_hrtimeout_range_clock.part.23
+    ffffffff81788219 schedule_hrtimeout_range_clock
+    ffffffff81788253 schedule_hrtimeout_range
+    ffffffff812035d4 poll_schedule_timeout
+    ffffffff81204b6d do_sys_poll
+    ffffffff81204cf2 sys_poll
+    ffffffff81789076 entry_SYSCALL_64_fastpath
+    target:          supervise
+        16332240
+
+Detaching...
+
+The output includes two paths from the cksum(1) command, one for reading files
+via vfs_read() and the other doing a link_path_walk(). There is also a vmstat(8)
+stack showing it sleeping between intervals, and an sshd(8) stack showing it
+waiting on a file descriptor for input.
+
+The stack shown at the bottom is the off-CPU stack belonging to the task name
+shown after "target:". Then there is a separator, "-", and above it the waker
+stack and the waker task name after "waker:". The wakeup stack is printed
+in reverse order.
+
+The number beneath the stacks is the total time spent from the blocking event
+to the wakeup event. This is summed for all occurrences with the same stack
+pairs.
+
+
+The -u option will print user-mode target threads only, and the -f option will
+show the stacks in "folded stacks" format. Eg:
+
+# ./offwaketime -fu 5
+supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;ext4_truncate;ext4_ext_truncate;ext4_ext_remove_space;ext4_free_blocks;__ext4_handle_dirty_metadata;_cond_resched;preempt_schedule_common;-; 2
+sshd;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;tty_read;n_tty_read;down_read;_cond_resched;preempt_schedule_common;-; 2
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_iget_normal;ext4_iget;iget_locked;alloc_inode;ext4_alloc_inode;kmem_cache_alloc;_cond_resched;preempt_schedule_common;-; 3
+mkdir;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;anon_vma_prepare;_cond_resched;preempt_schedule_common;-; 3
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;link_path_walk;walk_component;__lookup_hash;lookup_real;ext4_lookup;ext4_iget_normal;ext4_iget;__ext4_get_inode_loc;__breadahead;ll_rw_block;submit_bh_wbc;bio_alloc_bioset;mempool_alloc;_cond_resched;preempt_schedule_common;-; 3
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;page_cache_sync_readahead;ondemand_readahead;__do_page_cache_readahead;ext4_readpages;ext4_mpage_readpages;ext4_map_blocks;down_read;_cond_resched;preempt_schedule_common;-; 3
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;fd_install;__fd_install;_cond_resched;preempt_schedule_common;-; 3
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;elf_map;vm_munmap;down_write;_cond_resched;preempt_schedule_common;-; 3
+svscan;entry_SYSCALL_64_fastpath;sys_getdents;iterate_dir;ext4_readdir;ext4_htree_fill_tree;htree_dirblock_to_tree;ext4_htree_store_dirent;__kmalloc;_cond_resched;preempt_schedule_common;-; 4
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;mutex_lock;_cond_resched;preempt_schedule_common;-; 4
+run;entry_SYSCALL_64_fastpath;sys_mprotect;down_write;_cond_resched;preempt_schedule_common;-; 5
+sshd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;_cond_resched;preempt_schedule_common;-; 5
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;set_brk;vm_brk;down_write;_cond_resched;preempt_schedule_common;-; 5
+supervise;entry_SYSCALL_64_fastpath;sys_clone;_do_fork;copy_process;anon_vma_fork;anon_vma_clone;down_write;_cond_resched;preempt_schedule_common;-; 6
+svscan;retint_user;prepare_exit_to_usermode;exit_to_usermode_loop;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;swapper/0 11
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;flush_old_exec;mmput;exit_mmap;free_pgtables;unlink_anon_vmas;down_write;_cond_resched;preempt_schedule_common;-; 12
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;flush_old_exec;mmput;_cond_resched;preempt_schedule_common;-; 13
+sshd;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;sock_write_iter;sock_sendmsg;inet_sendmsg;tcp_sendmsg;lock_sock_nested;_cond_resched;preempt_schedule_common;-; 14
+cksum;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;mutex_lock;_cond_resched;preempt_schedule_common;-; 19
+sshd;retint_user;prepare_exit_to_usermode;exit_to_usermode_loop;schedule;-; 24
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;vm_brk;down_write;_cond_resched;preempt_schedule_common;-; 31
+sshd;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;sock_write_iter;sock_sendmsg;inet_sendmsg;tcp_sendmsg;sk_stream_alloc_skb;__alloc_skb;kmem_cache_alloc_node;_cond_resched;preempt_schedule_common;-; 32
+run;page_fault;do_page_fault;__do_page_fault;_cond_resched;preempt_schedule_common;-; 33
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;anon_vma_prepare;_cond_resched;preempt_schedule_common;-; 33
+run;entry_SYSCALL_64_fastpath;sys_clone;_do_fork;copy_process;down_write;_cond_resched;preempt_schedule_common;-; 35
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__do_fault;filemap_fault;_cond_resched;preempt_schedule_common;-; 36
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;elf_map;vm_mmap;vm_mmap_pgoff;down_write;_cond_resched;preempt_schedule_common;-; 38
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_iget_normal;ext4_iget;__ext4_get_inode_loc;__getblk_gfp;_cond_resched;preempt_schedule_common;-; 38
+chmod;int_ret_from_sys_call;syscall_return_slowpath;exit_to_usermode_loop;schedule;-; 39
+run;entry_SYSCALL_64_fastpath;sys_munmap;vm_munmap;do_munmap;unmap_region;unmap_vmas;unmap_single_vma;_cond_resched;preempt_schedule_common;-; 41
+readproctitle;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;pipe_read;mutex_lock;_cond_resched;preempt_schedule_common;-; 44
+run;entry_SYSCALL_64_fastpath;sys_clone;_do_fork;copy_process;kmem_cache_alloc;_cond_resched;preempt_schedule_common;-; 48
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;flush_old_exec;mmput;exit_mmap;unmap_vmas;unmap_single_vma;_cond_resched;preempt_schedule_common;-; 49
+sshd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;tty_poll;tty_ldisc_ref_wait;ldsem_down_read;_cond_resched;preempt_schedule_common;-; 50
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;remove_arg_zero;get_user_pages;__get_user_pages;_cond_resched;preempt_schedule_common;-; 50
+readproctitle;int_ret_from_sys_call;syscall_return_slowpath;exit_to_usermode_loop;schedule;-; 51
+mkdir;int_ret_from_sys_call;syscall_return_slowpath;exit_to_usermode_loop;schedule;-; 53
+supervise;entry_SYSCALL_64_fastpath;sys_clone;_do_fork;copy_process;copy_creds;prepare_creds;kmem_cache_alloc;_cond_resched;preempt_schedule_common;-; 66
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_find_entry;__ext4_read_dirblock;ext4_bread;ext4_getblk;__getblk_gfp;_cond_resched;preempt_schedule_common;-; 76
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;kernel_read;vfs_read;__vfs_read;generic_file_read_iter;_cond_resched;preempt_schedule_common;-; 96
+chmod;entry_SYSCALL_64_fastpath;sys_exit_group;do_group_exit;do_exit;mmput;exit_mmap;unmap_vmas;unmap_single_vma;_cond_resched;preempt_schedule_common;-; 100
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__do_fault;filemap_fault;__lock_page_or_retry;wait_on_page_bit_killable;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;handle_mm_fault;__do_page_fault;do_page_fault;page_fault;;run 117
+run;entry_SYSCALL_64_fastpath;sys_clone;_do_fork;copy_process;copy_page_range;_cond_resched;preempt_schedule_common;-; 117
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;_cond_resched;preempt_schedule_common;-; 121
+chown;entry_SYSCALL_64_fastpath;sys_mmap;sys_mmap_pgoff;vm_mmap_pgoff;down_write;_cond_resched;preempt_schedule_common;-; 137
+chown;entry_SYSCALL_64_fastpath;sys_mmap;sys_mmap_pgoff;vm_mmap_pgoff;do_mmap;mmap_region;kmem_cache_alloc;_cond_resched;preempt_schedule_common;-; 138
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;count.isra.21.constprop.38;_cond_resched;preempt_schedule_common;-; 145
+supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;truncate_inode_pages_final;truncate_inode_pages_range;wait_on_page_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;end_page_writeback;ext4_finish_bio;ext4_end_bio;bio_endio;blk_update_request;blk_mq_end_request;mkdir 147
+chmod;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;clear_user;page_fault;do_page_fault;__do_page_fault;_cond_resched;preempt_schedule_common;-; 159
+chown;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;setup_arg_pages;shift_arg_pages;vma_adjust;down_write;_cond_resched;preempt_schedule_common;-; 173
+chown;retint_user;prepare_exit_to_usermode;exit_to_usermode_loop;schedule;-; 176
+chmod;retint_user;prepare_exit_to_usermode;exit_to_usermode_loop;schedule;-; 191
+chmod;entry_SYSCALL_64_fastpath;sys_fchmodat;chmod_common;notify_change;ext4_setattr;__mark_inode_dirty;ext4_dirty_inode;ext4_mark_inode_dirty;ext4_reserve_inode_write;__ext4_get_inode_loc;__getblk_gfp;_cond_resched;preempt_schedule_common;-; 221
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;trailing_symlink;page_follow_link_light;page_getlink.isra.34.constprop.38;read_cache_page;do_read_cache_page;wait_on_page_read;wait_on_page_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;swapper/0 230
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_find_entry;__ext4_read_dirblock;ext4_bread;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;unlock_buffer;__end_buffer_read_notouch;end_buffer_read_sync;end_bio_bh_io_sync;bio_endio;rcu_sched 231
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;flush_old_exec;mmput;_cond_resched;preempt_schedule_common;-; 234
+chown;int_ret_from_sys_call;syscall_return_slowpath;exit_to_usermode_loop;schedule;-; 249
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;svscan 273
+mkdir;entry_SYSCALL_64_fastpath;sys_exit_group;do_group_exit;do_exit;mmput;exit_mmap;unmap_vmas;unmap_single_vma;_cond_resched;preempt_schedule_common;-; 382
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;flush_old_exec;mmput;exit_mmap;unmap_vmas;unmap_single_vma;_cond_resched;preempt_schedule_common;-; 389
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;do_wp_page;wp_page_copy.isra.57;anon_vma_prepare;_cond_resched;preempt_schedule_common;-; 390
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;wait_for_completion;_cond_resched;preempt_schedule_common;-; 409
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;wait_for_completion;_cond_resched;preempt_schedule_common;-; 419
+mkdir;retint_user;prepare_exit_to_usermode;exit_to_usermode_loop;schedule;-; 457
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;rcuos/0 460
+run;entry_SYSCALL_64_fastpath;sys_exit_group;do_group_exit;do_exit;mmput;exit_mmap;unmap_vmas;unmap_single_vma;_cond_resched;preempt_schedule_common;-; 481
+sshd;int_ret_from_sys_call;syscall_return_slowpath;exit_to_usermode_loop;schedule;-; 495
+cksum;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_find_entry;dx_probe;__ext4_read_dirblock;ext4_bread;ext4_getblk;ext4_map_blocks;ext4_ext_map_blocks;ext4_find_extent;__read_extent_tree_block;bh_submit_read;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;unlock_buffer;__end_buffer_read_notouch;end_buffer_read_sync;end_bio_bh_io_sync;bio_endio;swapper/0 495
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_find_entry;dx_probe;__ext4_read_dirblock;ext4_bread;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;unlock_buffer;__end_buffer_read_notouch;end_buffer_read_sync;end_bio_bh_io_sync;bio_endio;swapper/0 514
+run;entry_SYSCALL_64_fastpath;sys_clone;_do_fork;copy_process;alloc_pid;kmem_cache_alloc;_cond_resched;preempt_schedule_common;-; 572
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;clear_user;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;__pte_alloc;pte_alloc_one;alloc_pages_current;__alloc_pages_nodemask;_cond_resched;preempt_schedule_common;-; 579
+supervise;entry_SYSCALL_64_fastpath;sys_clone;_do_fork;copy_process;copy_page_range;_cond_resched;preempt_schedule_common;-; 590
+cksum;int_ret_from_sys_call;syscall_return_slowpath;exit_to_usermode_loop;schedule;-; 592
+chmod;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;setup_arg_pages;shift_arg_pages;vma_adjust;down_write;_cond_resched;preempt_schedule_common;-; 697
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_find_entry;__ext4_read_dirblock;ext4_bread;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;unlock_buffer;__end_buffer_read_notouch;end_buffer_read_sync;end_bio_bh_io_sync;bio_endio;swapper/0 706
+cksum;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;mutex_lock;_cond_resched;preempt_schedule_common;-;woken_wake_function;__wake_up_common;__wake_up;n_tty_read;tty_read;__vfs_read;vfs_read;sys_read;entry_SYSCALL_64_fastpath;;sshd 804
+supervise;retint_user;prepare_exit_to_usermode;exit_to_usermode_loop;schedule;-; 1101
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;flush_old_exec;mmput;exit_mmap;free_pgtables;unlink_anon_vmas;__put_anon_vma;_cond_resched;preempt_schedule_common;-; 1122
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;readproctitle 1319
+run;int_ret_from_sys_call;syscall_return_slowpath;exit_to_usermode_loop;schedule;-; 1902
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;chown 1925
+supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;truncate_inode_pages_final;truncate_inode_pages_range;wait_on_page_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;end_page_writeback;ext4_finish_bio;ext4_end_bio;bio_endio;blk_update_request;blk_mq_end_request;cksum 2181
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;link_path_walk;walk_component;__lookup_hash;lookup_real;ext4_lookup;ext4_find_entry;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;unlock_buffer;__end_buffer_read_notouch;end_buffer_read_sync;end_bio_bh_io_sync;bio_endio;swapper/0 2599
+cksum;retint_user;prepare_exit_to_usermode;exit_to_usermode_loop;schedule;-; 2816
+supervise;ext4_rename2;ext4_rename;ext4_alloc_da_blocks;filemap_flush;__filemap_fdatawrite_range;do_writepages;ext4_writepages;ext4_map_blocks;ext4_ext_map_blocks;ext4_mb_new_blocks;ext4_mb_mark_diskspace_used;__ext4_journal_get_write_access;jbd2_journal_get_write_access;do_get_write_access;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;journal_end_buffer_io_sync;end_bio_bh_io_sync;bio_endio;blk_update_request;blk_mq_end_request;swapper/0 3393
+supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;truncate_inode_pages_final;truncate_inode_pages_range;wait_on_page_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;end_page_writeback;ext4_finish_bio;ext4_end_bio;bio_endio;blk_update_request;blk_mq_end_request;supervise 5398
+sshd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;-;pollwake;__wake_up_common;__wake_up_sync_key;sock_def_readable;tcp_data_queue;tcp_rcv_established;tcp_v4_do_rcv;tcp_v4_rcv;ip_local_deliver_finish;ip_local_deliver;mkdir 6582
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;chmod 8310
+run;retint_user;prepare_exit_to_usermode;exit_to_usermode_loop;schedule;-; 8444
+supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;truncate_inode_pages_final;truncate_inode_pages_range;wait_on_page_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;end_page_writeback;ext4_finish_bio;ext4_end_bio;bio_endio;blk_update_request;blk_mq_end_request;readproctitle 9768
+supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;truncate_inode_pages_final;truncate_inode_pages_range;wait_on_page_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;end_page_writeback;ext4_finish_bio;ext4_end_bio;bio_endio;blk_update_request;blk_mq_end_request;run 9945
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;mkdir 11978
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;link_path_walk;walk_component;__lookup_hash;lookup_real;ext4_lookup;ext4_iget_normal;ext4_iget;__ext4_get_inode_loc;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;unlock_buffer;__end_buffer_read_notouch;end_buffer_read_sync;end_bio_bh_io_sync;bio_endio;swapper/0 12120
+cksum;link_path_walk;walk_component;__lookup_hash;lookup_real;ext4_lookup;ext4_iget_normal;ext4_iget;__ext4_get_inode_loc;submit_bh;submit_bh_wbc;submit_bio;generic_make_request;blk_sq_make_request;blk_mq_map_request;__blk_mq_alloc_request;blk_mq_get_tag;bt_get;__sched_text_start;schedule_timeout;schedule;-;autoremove_wake_function;__wake_up_common;__wake_up;bt_clear_tag;blk_mq_put_tag;__blk_mq_free_request;blk_mq_free_request;blk_mq_end_request;__blk_mq_complete_request;blk_mq_complete_request;swapper/0 23243
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_find_entry;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;unlock_buffer;__end_buffer_read_notouch;end_buffer_read_sync;end_bio_bh_io_sync;bio_endio;swapper/0 24767
+run;entry_SYSCALL_64_fastpath;sys_wait4;do_wait;schedule;-;child_wait_callback;__wake_up_common;__wake_up_sync_key;__wake_up_parent;do_notify_parent;do_exit;do_group_exit;sys_exit_group;entry_SYSCALL_64_fastpath;;chmod 33289
+run;entry_SYSCALL_64_fastpath;sys_wait4;do_wait;schedule;-;child_wait_callback;__wake_up_common;__wake_up_sync_key;__wake_up_parent;do_notify_parent;do_exit;do_group_exit;sys_exit_group;entry_SYSCALL_64_fastpath;;mkdir 34991
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;supervise 35746
+run;entry_SYSCALL_64_fastpath;sys_wait4;do_wait;schedule;-;child_wait_callback;__wake_up_common;__wake_up_sync_key;__wake_up_parent;do_notify_parent;do_exit;do_group_exit;sys_exit_group;entry_SYSCALL_64_fastpath;;chown 36942
+supervise;entry_SYSCALL_64_fastpath;sys_rename;dput;__dentry_kill;iput;evict;ext4_evict_inode;truncate_inode_pages_final;truncate_inode_pages_range;wait_on_page_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;end_page_writeback;ext4_finish_bio;ext4_end_bio;bio_endio;blk_update_request;blk_mq_end_request;swapper/0 42993
+cksum;entry_SYSCALL_64_fastpath;sys_open;do_sys_open;do_filp_open;path_openat;lookup_real;ext4_lookup;ext4_iget_normal;ext4_iget;__ext4_get_inode_loc;__wait_on_buffer;out_of_line_wait_on_bit;__wait_on_bit;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;wake_up_bit;unlock_buffer;__end_buffer_read_notouch;end_buffer_read_sync;end_bio_bh_io_sync;bio_endio;swapper/0 53348
+sshd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;-;pollwake;__wake_up_common;__wake_up;n_tty_receive_buf_common;n_tty_receive_buf2;flush_to_ldisc;process_one_work;worker_thread;kthread;ret_from_fork;kworker/u16:2 86256
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;run 109480
+ntpd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;-;signal_wake_up_state;complete_signal;__send_signal;send_signal;do_send_sig_info;group_send_sig_info;kill_pid_info;it_real_fn;__hrtimer_run_queues;hrtimer_interrupt;cksum 999975
+ntpd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;-;signal_wake_up_state;complete_signal;__send_signal;send_signal;do_send_sig_info;group_send_sig_info;kill_pid_info;it_real_fn;__hrtimer_run_queues;hrtimer_interrupt;swapper/0 999976
+supervise;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule_hrtimeout_range_clock.part.23;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;chmod 1021082
+snmpd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule_hrtimeout_range_clock.part.23;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;swapper/0 1726275
+ntpd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;-;signal_wake_up_state;complete_signal;__send_signal;send_signal;do_send_sig_info;group_send_sig_info;kill_pid_info;it_real_fn;__hrtimer_run_queues;hrtimer_interrupt;swapper/1 1999944
+supervise;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule_hrtimeout_range_clock.part.23;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;cksum 2041945
+cksum;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;generic_file_read_iter;__lock_page_killable;__wait_on_bit_lock;bit_wait_io;__sched_text_start;schedule_timeout;schedule;-;wake_bit_function;__wake_up_common;__wake_up;__wake_up_bit;unlock_page;mpage_end_io;bio_endio;blk_update_request;blk_mq_end_request;__blk_mq_complete_request;swapper/0 3720413
+vmstat;entry_SYSCALL_64_fastpath;sys_nanosleep;hrtimer_nanosleep;do_nanosleep;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;swapper/0 4000402
+tail;entry_SYSCALL_64_fastpath;sys_nanosleep;hrtimer_nanosleep;do_nanosleep;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;swapper/0 4000447
+readproctitle;entry_SYSCALL_64_fastpath;sys_read;vfs_read;__vfs_read;pipe_read;pipe_wait;schedule;-;autoremove_wake_function;__wake_up_common;__wake_up_sync_key;pipe_write;__vfs_write;vfs_write;sys_write;entry_SYSCALL_64_fastpath;;run 4149862
+offwaketime.py;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule_hrtimeout_range_clock.part.23;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;swapper/1 5005058
+supervise;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule_hrtimeout_range_clock.part.23;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;swapper/1 8168600
+sshd;entry_SYSCALL_64_fastpath;sys_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;-;pollwake;__wake_up_common;__wake_up;n_tty_receive_buf_common;n_tty_receive_buf2;flush_to_ldisc;process_one_work;worker_thread;kthread;ret_from_fork;kworker/u16:1 8821767
+supervise;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule_hrtimeout_range_clock.part.23;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;run 9186846
+supervise;entry_SYSCALL_64_fastpath;sys_poll;do_sys_poll;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule_hrtimeout_range_clock.part.23;schedule;-;hrtimer_wakeup;__hrtimer_run_queues;hrtimer_interrupt;xen_timer_interrupt;handle_irq_event_percpu;handle_percpu_irq;generic_handle_irq;evtchn_2l_handle_events;__xen_evtchn_do_upcall;xen_evtchn_do_upcall;swapper/0 20415299
+
+This output format is suitable for feeding into the open source FlameGraph
+software, which visualizes these stacks as an off-wake time flame graph.
+
+
+USAGE message:
+
+# ./offwaketime -h
+usage: offwaketime [-h] [-p PID | -t TID | -u | -k] [-U | -K] [-d] [-f]
+                   [--stack-storage-size STACK_STORAGE_SIZE]
+                   [-m MIN_BLOCK_TIME] [-M MAX_BLOCK_TIME]
+                   [duration]
+
+Summarize blocked time by kernel stack trace + waker stack
+
+positional arguments:
+  duration              duration of trace, in seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     trace this PID only
+  -t TID, --tid TID     trace this TID only
+  -u, --user-threads-only
+                        user threads only (no kernel threads)
+  -k, --kernel-threads-only
+                        kernel threads only (no user threads)
+  -U, --user-stacks-only
+                        show stacks from user space only (no kernel space
+                        stacks)
+  -K, --kernel-stacks-only
+                        show stacks from kernel space only (no user space
+                        stacks)
+  -d, --delimited       insert delimiter between kernel/user stacks
+  -f, --folded          output folded format
+  --stack-storage-size STACK_STORAGE_SIZE
+                        the number of unique stack traces that can be stored
+                        and displayed (default 1024)
+  -m MIN_BLOCK_TIME, --min-block-time MIN_BLOCK_TIME
+                        the amount of time in microseconds over which we store
+                        traces (default 1)
+  -M MAX_BLOCK_TIME, --max-block-time MAX_BLOCK_TIME
+                        the amount of time in microseconds under which we
+                        store traces (default U64_MAX)
+
+examples:
+    ./offwaketime             # trace off-CPU + waker stack time until Ctrl-C
+    ./offwaketime 5           # trace for 5 seconds only
+    ./offwaketime -f 5        # 5 seconds, and output in folded format
+    ./offwaketime -m 1000     # trace only events that last more than 1000 usec
+    ./offwaketime -M 10000    # trace only events that last less than 10000 usec
+    ./offwaketime -p 185      # only trace threads for PID 185
+    ./offwaketime -t 188      # only trace thread 188
+    ./offwaketime -u          # only trace user threads (no kernel)
+    ./offwaketime -k          # only trace kernel threads (no user)
+    ./offwaketime -U          # only show user space stacks (no kernel)
+    ./offwaketime -K          # only show kernel space stacks (no user)
diff --git a/tools/old/CMakeLists.txt b/tools/old/CMakeLists.txt
new file mode 100644
index 0000000..89b76a0
--- /dev/null
+++ b/tools/old/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Install each Python tool into share/bcc/tools/old, dropping the .py suffix.
+file(GLOB py_files *.py)
+foreach(py_file ${py_files})
+  get_filename_component(tool_name ${py_file} NAME_WE)
+  install(PROGRAMS ${py_file} DESTINATION share/bcc/tools/old RENAME ${tool_name})
+endforeach()
diff --git a/tools/old/bashreadline.py b/tools/old/bashreadline.py
new file mode 100755
index 0000000..571b662
--- /dev/null
+++ b/tools/old/bashreadline.py
@@ -0,0 +1,46 @@
+#!/usr/bin/python
+#
+# bashreadline  Print entered bash commands from all running shells.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# This works by tracing the readline() function using a uretprobe (uprobes).
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 28-Jan-2016    Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import strftime
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+int printret(struct pt_regs *ctx) {
+    // nothing to print if readline() returned no string (NULL)
+    if (!PT_REGS_RC(ctx))
+        return 0;
+
+    char str[80] = {};
+    bpf_probe_read(&str, sizeof(str), (void *)PT_REGS_RC(ctx));
+    bpf_trace_printk("%s\\n", &str);
+
+    return 0;
+};
+"""
+b = BPF(text=bpf_text)
+b.attach_uretprobe(name="/bin/bash", sym="readline", fn_name="printret")
+
+# header
+print("%-9s %-6s %s" % ("TIME", "PID", "COMMAND"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    except KeyboardInterrupt:
+        exit()
+    print("%-9s %-6d %s" % (strftime("%H:%M:%S"), pid, msg))
diff --git a/tools/old/biosnoop.py b/tools/old/biosnoop.py
new file mode 100755
index 0000000..37ee3f9
--- /dev/null
+++ b/tools/old/biosnoop.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# biosnoop  Trace block device I/O and print details including issuing PID.
+#       For Linux, uses BCC, eBPF.
+#
+# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
+# request, as well as a starting timestamp for calculating I/O latency.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 16-Sep-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+#include <linux/blkdev.h>
+
+struct val_t {
+    u32 pid;
+    char name[TASK_COMM_LEN];
+};
+
+BPF_HASH(start, struct request *);
+BPF_HASH(infobyreq, struct request *, struct val_t);
+
+// cache PID and comm by-req
+int trace_pid_start(struct pt_regs *ctx, struct request *req)
+{
+    struct val_t val = {};
+
+    if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
+        val.pid = bpf_get_current_pid_tgid();
+        infobyreq.update(&req, &val);
+    }
+
+    return 0;
+}
+
+// time block I/O
+int trace_req_start(struct pt_regs *ctx, struct request *req)
+{
+    u64 ts;
+
+    ts = bpf_ktime_get_ns();
+    start.update(&req, &ts);
+
+    return 0;
+}
+
+// output
+int trace_req_completion(struct pt_regs *ctx, struct request *req)
+{
+    u64 *tsp, delta;
+    struct val_t *valp;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&req);
+    if (tsp == 0) {
+        // missed tracing issue
+        return 0;
+    }
+    delta = bpf_ktime_get_ns() - *tsp;
+
+    //
+    // Fetch and output issuing pid and comm.
+    // As bpf_trace_printk() is limited to a maximum of 1 string and 2
+    // integers, we'll use more than one to output the data.
+    //
+    valp = infobyreq.lookup(&req);
+    if (valp == 0) {
+        bpf_trace_printk("0 0 ? %d\\n", req->__data_len);
+    } else {
+        bpf_trace_printk("0 %d %s %d\\n", valp->pid, valp->name,
+            req->__data_len);
+    }
+
+    // output remaining details
+    if (req->cmd_flags & REQ_WRITE) {
+        bpf_trace_printk("1 W %s %d %d ?\\n", req->rq_disk->disk_name,
+            req->__sector, delta / 1000);
+    } else {
+        bpf_trace_printk("1 R %s %d %d ?\\n", req->rq_disk->disk_name,
+            req->__sector, delta / 1000);
+    }
+
+    start.delete(&req);
+    infobyreq.delete(&req);
+
+    return 0;
+}
+""", debug=0)
+b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
+b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
+b.attach_kprobe(event="blk_account_io_completion",
+    fn_name="trace_req_completion")
+
+# header
+print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID",
+    "DISK", "T", "SECTOR", "BYTES", "LAT(ms)"))
+
+start_ts = 0
+
+# most recent "0" event (pid/comm/bytes); "?" defaults guard against a
+# completion event whose matching "0" event was missed
+(real_pid, real_comm, bytes_s) = ("?", "?", "?")
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    args = msg.split(" ")
+
+    if start_ts == 0:
+        start_ts = ts
+
+    if args[0] == "0":
+        (real_pid, real_comm, bytes_s) = (args[1], args[2], args[3])
+        continue
+    else:
+        (type_s, disk_s, sector_s, us_s) = (args[1], args[2], args[3],
+            args[4])
+
+    ms = float(int(us_s, 10)) / 1000
+
+    print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % (
+        ts - start_ts, real_comm, real_pid, disk_s, type_s, sector_s,
+        bytes_s, ms))
diff --git a/tools/old/filelife.py b/tools/old/filelife.py
new file mode 100755
index 0000000..075be08
--- /dev/null
+++ b/tools/old/filelife.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# filelife    Trace the lifespan of short-lived files.
+#             For Linux, uses BCC, eBPF. Embedded C.
+#
+# This traces the creation and deletion of files, providing information
+# on who deleted the file, the file age, and the file name. The intent is to
+# provide information on short-lived files, for debugging or performance
+# analysis.
+#
+# USAGE: filelife [-h] [-p PID]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 08-Feb-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+
+# arguments
+examples = """examples:
+    ./filelife           # trace all short-lived files
+    ./filelife -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace the lifespan of short-lived files",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+
+BPF_HASH(birth, struct dentry *);
+
+// trace file creation time
+int trace_create(struct pt_regs *ctx, struct inode *dir, struct dentry *dentry)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+
+    u64 ts = bpf_ktime_get_ns();
+    birth.update(&dentry, &ts);
+
+    return 0;
+};
+
+// trace file deletion and output details
+int trace_unlink(struct pt_regs *ctx, struct inode *dir, struct dentry *dentry)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+
+    u64 *tsp, delta;
+    tsp = birth.lookup(&dentry);
+    if (tsp == 0) {
+        return 0;   // missed create
+    }
+    delta = (bpf_ktime_get_ns() - *tsp) / 1000000;
+    birth.delete(&dentry);
+
+    if (dentry->d_iname[0] == 0)
+        return 0;
+
+    bpf_trace_printk("%d %s\\n", delta, dentry->d_iname);
+
+    return 0;
+}
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="vfs_create", fn_name="trace_create")
+b.attach_kprobe(event="vfs_unlink", fn_name="trace_unlink")
+
+# header
+print("%-8s %-6s %-16s %-7s %s" % ("TIME", "PID", "COMM", "AGE(s)", "FILE"))
+
+start_ts = 0
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    (delta, filename) = msg.split(" ", 1)
+
+    # print columns
+    print("%-8s %-6d %-16s %-7.2f %s" % (strftime("%H:%M:%S"), pid, task,
+        float(delta) / 1000, filename))
diff --git a/tools/old/gethostlatency.py b/tools/old/gethostlatency.py
new file mode 100755
index 0000000..7d32cb8
--- /dev/null
+++ b/tools/old/gethostlatency.py
@@ -0,0 +1,77 @@
+#!/usr/bin/python
+#
+# gethostlatency  Show latency for getaddrinfo/gethostbyname[2] calls.
+#                 For Linux, uses BCC, eBPF. Embedded C.
+#
+# This can be useful for identifying DNS latency, by identifying which
+# remote host name lookups were slow, and by how much.
+#
+# This uses dynamic tracing of user-level functions and registers, and may
+# need modifications to match your software and processor architecture.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 28-Jan-2016    Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import strftime
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+struct val_t {
+    char host[80];
+    u64 ts;
+};
+BPF_HASH(start, u32, struct val_t);
+
+int do_entry(struct pt_regs *ctx) {
+    if (!PT_REGS_PARM1(ctx))
+        return 0;
+    struct val_t val = {};
+    u32 pid = bpf_get_current_pid_tgid();
+    bpf_probe_read(&val.host, sizeof(val.host), (void *)PT_REGS_PARM1(ctx));
+    val.ts = bpf_ktime_get_ns();
+    start.update(&pid, &val);
+    return 0;
+}
+
+int do_return(struct pt_regs *ctx) {
+    struct val_t *valp;
+    u64 delta;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    valp = start.lookup(&pid);
+    if (valp == 0)
+        return 0;       // missed start
+
+    delta = (bpf_ktime_get_ns() - valp->ts) / 1000;
+    bpf_trace_printk("%d %s\\n", delta, valp->host);
+    start.delete(&pid);
+    return 0;
+}
+"""
+b = BPF(text=bpf_text)
+b.attach_uprobe(name="c", sym="getaddrinfo", fn_name="do_entry")
+b.attach_uprobe(name="c", sym="gethostbyname", fn_name="do_entry")
+b.attach_uprobe(name="c", sym="gethostbyname2", fn_name="do_entry")
+b.attach_uretprobe(name="c", sym="getaddrinfo", fn_name="do_return")
+b.attach_uretprobe(name="c", sym="gethostbyname", fn_name="do_return")
+b.attach_uretprobe(name="c", sym="gethostbyname2", fn_name="do_return")
+
+# header
+print("%-9s %-6s %-12s %6s %s" % ("TIME", "PID", "COMM", "LATms", "HOST"))
+
+# format output
+while 1:
+    try:
+        (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    except ValueError:
+        continue
+    (delta, host) = msg.split(" ")
+    deltams = float(delta) / 1000
+    print("%-9s %-6d %-12.12s %6.2f %s" % (strftime("%H:%M:%S"), pid, task,
+        deltams, host))
diff --git a/tools/old/killsnoop.py b/tools/old/killsnoop.py
new file mode 100755
index 0000000..ddf9d5a
--- /dev/null
+++ b/tools/old/killsnoop.py
@@ -0,0 +1,106 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# killsnoop Trace signals issued by the kill() syscall.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: killsnoop [-h] [-t] [-x] [-p PID]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Sep-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+
+# arguments
+examples = """examples:
+    ./killsnoop           # trace all kill() signals
+    ./killsnoop -t        # include timestamps
+    ./killsnoop -x        # only show failed kills
+    ./killsnoop -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace signals issued by the kill() syscall",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-x", "--failed", action="store_true",
+    help="only show failed kills")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(args_pid, u32, int);
+BPF_HASH(args_sig, u32, int);
+
+int kprobe__sys_kill(struct pt_regs *ctx, int tpid, int sig)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+
+    FILTER
+    args_pid.update(&pid, &tpid);
+    args_sig.update(&pid, &sig);
+
+    return 0;
+};
+
+int kretprobe__sys_kill(struct pt_regs *ctx)
+{
+    int *tpidp, *sigp, ret = ctx->ax;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    tpidp = args_pid.lookup(&pid);
+    sigp = args_sig.lookup(&pid);
+    if (tpidp == 0 || sigp == 0) {
+        return 0;   // missed entry
+    }
+
+    bpf_trace_printk("%d %d %d\\n", *tpidp, *sigp, ret);
+    args_pid.delete(&pid);
+    args_sig.delete(&pid);
+
+    return 0;
+}
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# header
+if args.timestamp:
+    print("%-14s" % ("TIME(s)"), end="")
+print("%-6s %-16s %-4s %-6s %s" % ("PID", "COMM", "SIG", "TPID", "RESULT"))
+
+start_ts = 0
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    (tpid_s, sig_s, ret_s) = msg.split(" ")
+
+    ret = int(ret_s)
+    if (args.failed and (ret >= 0)):
+        continue
+
+    # print columns
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = ts
+        print("%-14.9f" % (ts - start_ts), end="")
+    print("%-6d %-16s %-4s %-6s %s" % (pid, task, sig_s, tpid_s, ret_s))
diff --git a/tools/old/memleak.py b/tools/old/memleak.py
new file mode 100755
index 0000000..b962c99
--- /dev/null
+++ b/tools/old/memleak.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+#
+# memleak   Trace and display outstanding allocations to detect
+#           memory leaks in user-mode processes and the kernel.
+#
+# USAGE: memleak [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND]
+#                [-s SAMPLE_RATE] [-d STACK_DEPTH] [-T TOP] [-z MIN_SIZE]
+#                [-Z MAX_SIZE]
+#                [interval] [count]
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+# Copyright (C) 2016 Sasha Goldshtein.
+
+from bcc import BPF
+from time import sleep
+from datetime import datetime
+import argparse
+import subprocess
+import os
+
+def decode_stack(bpf, pid, info):
+        stack = ""
+        if info.num_frames <= 0:
+                return "???"
+        for i in range(0, info.num_frames):
+                addr = info.callstack[i]
+                stack += " %s ;" % bpf.sym(addr, pid, show_offset=True)
+        return stack
+
+def run_command_get_output(command):
+        p = subprocess.Popen(command.split(),
+                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        return iter(p.stdout.readline, b'')
+
+def run_command_get_pid(command):
+        p = subprocess.Popen(command.split())
+        return p.pid
+
+examples = """
+EXAMPLES:
+
+./memleak -p $(pidof allocs)
+        Trace allocations and display a summary of "leaked" (outstanding)
+        allocations every 5 seconds
+./memleak -p $(pidof allocs) -t
+        Trace allocations and display each individual call to malloc/free
+./memleak -ap $(pidof allocs) 10
+        Trace allocations and display allocated addresses, sizes, and stacks
+        every 10 seconds for outstanding allocations
+./memleak -c "./allocs"
+        Run the specified command and trace its allocations
+./memleak
+        Trace allocations in kernel mode and display a summary of outstanding
+        allocations every 5 seconds
+./memleak -o 60000
+        Trace allocations in kernel mode and display a summary of outstanding
+        allocations that are at least one minute (60 seconds) old
+./memleak -s 5
+        Trace roughly every 5th allocation, to reduce overhead
+"""
+
+description = """
+Trace outstanding memory allocations that weren't freed.
+Supports both user-mode allocations made with malloc/free and kernel-mode
+allocations made with kmalloc/kfree.
+"""
+
+parser = argparse.ArgumentParser(description=description,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=examples)
+parser.add_argument("-p", "--pid", type=int, default=-1,
+        help="the PID to trace; if not specified, trace kernel allocs")
+parser.add_argument("-t", "--trace", action="store_true",
+        help="print trace messages for each alloc/free call")
+parser.add_argument("interval", nargs="?", default=5, type=int,
+        help="interval in seconds to print outstanding allocations")
+parser.add_argument("count", nargs="?", type=int,
+        help="number of times to print the report before exiting")
+parser.add_argument("-a", "--show-allocs", default=False, action="store_true",
+        help="show allocation addresses and sizes as well as call stacks")
+parser.add_argument("-o", "--older", default=500, type=int,
+        help="prune allocations younger than this age in milliseconds")
+parser.add_argument("-c", "--command",
+        help="execute and trace the specified command")
+parser.add_argument("-s", "--sample-rate", default=1, type=int,
+        help="sample every N-th allocation to decrease the overhead")
+parser.add_argument("-d", "--stack-depth", default=10, type=int,
+        help="maximum stack depth to capture")
+parser.add_argument("-T", "--top", type=int, default=10,
+        help="display only this many top allocating stacks (by size)")
+parser.add_argument("-z", "--min-size", type=int,
+        help="capture only allocations larger than this size")
+parser.add_argument("-Z", "--max-size", type=int,
+        help="capture only allocations smaller than this size")
+
+args = parser.parse_args()
+
+pid = args.pid
+command = args.command
+kernel_trace = (pid == -1 and command is None)
+trace_all = args.trace
+interval = args.interval
+min_age_ns = 1e6 * args.older
+sample_every_n = args.sample_rate
+num_prints = args.count
+max_stack_size = args.stack_depth + 2
+top_stacks = args.top
+min_size = args.min_size
+max_size = args.max_size
+
+if min_size is not None and max_size is not None and min_size > max_size:
+        print("min_size (-z) can't be greater than max_size (-Z)")
+        exit(1)
+
+if command is not None:
+        print("Executing '%s' and tracing the resulting process." % command)
+        pid = run_command_get_pid(command)
+
+bpf_source = """
+#include <uapi/linux/ptrace.h>
+
+struct alloc_info_t {
+        u64 size;
+        u64 timestamp_ns;
+        int num_frames;
+        u64 callstack[MAX_STACK_SIZE];
+};
+
+BPF_HASH(sizes, u64);
+BPF_HASH(allocs, u64, struct alloc_info_t);
+
+// Adapted from https://github.com/iovisor/bcc/tools/offcputime.py
+static u64 get_frame(u64 *bp) {
+        if (*bp) {
+                // The following stack walker is x86_64 specific
+                u64 ret = 0;
+                if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
+                        return 0;
+                if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
+                        *bp = 0;
+                return ret;
+        }
+        return 0;
+}
+static int grab_stack(struct pt_regs *ctx, struct alloc_info_t *info)
+{
+        int depth = 0;
+        u64 bp = ctx->bp;
+        GRAB_ONE_FRAME
+        return depth;
+}
+
+int alloc_enter(struct pt_regs *ctx, size_t size)
+{
+        SIZE_FILTER
+        if (SAMPLE_EVERY_N > 1) {
+                u64 ts = bpf_ktime_get_ns();
+                if (ts % SAMPLE_EVERY_N != 0)
+                        return 0;
+        }
+
+        u64 pid = bpf_get_current_pid_tgid();
+        u64 size64 = size;
+        sizes.update(&pid, &size64);
+
+        if (SHOULD_PRINT)
+                bpf_trace_printk("alloc entered, size = %u\\n", size);
+        return 0;
+}
+
+int alloc_exit(struct pt_regs *ctx)
+{
+        u64 address = ctx->ax;
+        u64 pid = bpf_get_current_pid_tgid();
+        u64* size64 = sizes.lookup(&pid);
+        struct alloc_info_t info = {0};
+
+        if (size64 == 0)
+                return 0; // missed alloc entry
+
+        info.size = *size64;
+        sizes.delete(&pid);
+
+        info.timestamp_ns = bpf_ktime_get_ns();
+        info.num_frames = grab_stack(ctx, &info) - 2;
+        allocs.update(&address, &info);
+
+        if (SHOULD_PRINT) {
+                bpf_trace_printk("alloc exited, size = %lu, result = %lx,"
+                                 "frames = %d\\n", info.size, address,
+                                 info.num_frames);
+        }
+        return 0;
+}
+
+int free_enter(struct pt_regs *ctx, void *address)
+{
+        u64 addr = (u64)address;
+        struct alloc_info_t *info = allocs.lookup(&addr);
+        if (info == 0)
+                return 0;
+
+        allocs.delete(&addr);
+
+        if (SHOULD_PRINT) {
+                bpf_trace_printk("free entered, address = %lx, size = %lu\\n",
+                                 address, info->size);
+        }
+        return 0;
+}
+"""
+bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0")
+bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n))
+bpf_source = bpf_source.replace("GRAB_ONE_FRAME", max_stack_size *
+        "\tif (!(info->callstack[depth++] = get_frame(&bp))) return depth;\n")
+bpf_source = bpf_source.replace("MAX_STACK_SIZE", str(max_stack_size))
+
+size_filter = ""
+if min_size is not None and max_size is not None:
+        size_filter = "if (size < %d || size > %d) return 0;" % \
+                      (min_size, max_size)
+elif min_size is not None:
+        size_filter = "if (size < %d) return 0;" % min_size
+elif max_size is not None:
+        size_filter = "if (size > %d) return 0;" % max_size
+bpf_source = bpf_source.replace("SIZE_FILTER", size_filter)
+
+bpf_program = BPF(text=bpf_source)
+
+if not kernel_trace:
+        print("Attaching to malloc and free in pid %d, Ctrl+C to quit." % pid)
+        bpf_program.attach_uprobe(name="c", sym="malloc",
+                                  fn_name="alloc_enter", pid=pid)
+        bpf_program.attach_uretprobe(name="c", sym="malloc",
+                                     fn_name="alloc_exit", pid=pid)
+        bpf_program.attach_uprobe(name="c", sym="free",
+                                  fn_name="free_enter", pid=pid)
+else:
+        print("Attaching to kmalloc and kfree, Ctrl+C to quit.")
+        bpf_program.attach_kprobe(event="__kmalloc", fn_name="alloc_enter")
+        bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit")
+        bpf_program.attach_kprobe(event="kfree", fn_name="free_enter")
+
+def print_outstanding():
+        stacks = {}
+        print("[%s] Top %d stacks with outstanding allocations:" %
+              (datetime.now().strftime("%H:%M:%S"), top_stacks))
+        allocs = bpf_program.get_table("allocs")
+        for address, info in sorted(allocs.items(), key=lambda a: a[1].size):
+                if BPF.monotonic_time() - min_age_ns < info.timestamp_ns:
+                        continue
+                stack = decode_stack(bpf_program, pid, info)
+                if stack in stacks:
+                        stacks[stack] = (stacks[stack][0] + 1,
+                                         stacks[stack][1] + info.size)
+                else:
+                        stacks[stack] = (1, info.size)
+                if args.show_allocs:
+                        print("\taddr = %x size = %s" %
+                              (address.value, info.size))
+        to_show = sorted(stacks.items(), key=lambda s: s[1][1])[-top_stacks:]
+        for stack, (count, size) in to_show:
+                print("\t%d bytes in %d allocations from stack\n\t\t%s" %
+                      (size, count, stack.replace(";", "\n\t\t")))
+
+count_so_far = 0
+while True:
+        if trace_all:
+                print(bpf_program.trace_fields())
+        else:
+                try:
+                        sleep(interval)
+                except KeyboardInterrupt:
+                        exit()
+                print_outstanding()
+                count_so_far += 1
+                if num_prints is not None and count_so_far >= num_prints:
+                        exit()
diff --git a/tools/old/offcputime.py b/tools/old/offcputime.py
new file mode 100755
index 0000000..38d12a2
--- /dev/null
+++ b/tools/old/offcputime.py
@@ -0,0 +1,209 @@
+#!/usr/bin/python
+#
+# offcputime    Summarize off-CPU time by kernel stack trace
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: offcputime [-h] [-u] [-p PID] [-v] [-f] [duration]
+#
+# The current implementation uses an unrolled loop for x86_64, and was written
+# as a proof of concept. This implementation should be replaced in the future
+# with an appropriate bpf_ call, when available.
+#
+# Currently limited to a stack trace depth of 21 (maxdepth + 1).
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Jan-2016	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+
+# arguments
+examples = """examples:
+    ./offcputime             # trace off-CPU stack time until Ctrl-C
+    ./offcputime 5           # trace for 5 seconds only
+    ./offcputime -f 5        # 5 seconds, and output in folded format
+    ./offcputime -u          # don't include kernel threads (user only)
+    ./offcputime -p 185      # trace for PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize off-CPU time by kernel stack trace",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-u", "--useronly", action="store_true",
+    help="user threads only (no kernel threads)")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="show raw addresses")
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format")
+parser.add_argument("duration", nargs="?", default=99999999,
+    help="duration of trace, in seconds")
+args = parser.parse_args()
+folded = args.folded
+duration = int(args.duration)
+debug = 0
+maxdepth = 20    # and MAXDEPTH
+if args.pid and args.useronly:
+    print("ERROR: use either -p or -u.")
+    exit()
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#define MAXDEPTH	20
+#define MINBLOCK_US	1
+
+struct key_t {
+    char name[TASK_COMM_LEN];
+    // Skip saving the ip
+    u64 ret[MAXDEPTH];
+};
+BPF_HASH(counts, struct key_t);
+BPF_HASH(start, u32);
+
+static u64 get_frame(u64 *bp) {
+    if (*bp) {
+        // The following stack walker is x86_64 specific
+        u64 ret = 0;
+        if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
+            return 0;
+        if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
+            *bp = 0;
+        if (ret < __START_KERNEL_map)
+            return 0;
+        return ret;
+    }
+    return 0;
+}
+
+int oncpu(struct pt_regs *ctx, struct task_struct *prev) {
+    u32 pid = prev->pid;
+    u64 ts, *tsp;
+
+    // record previous thread sleep time
+    if (FILTER) {
+        ts = bpf_ktime_get_ns();
+        start.update(&pid, &ts);
+    }
+
+    // calculate current thread's delta time
+    pid = bpf_get_current_pid_tgid();
+    tsp = start.lookup(&pid);
+    if (tsp == 0)
+        return 0;        // missed start or filtered
+    u64 delta = bpf_ktime_get_ns() - *tsp;
+    start.delete(&pid);
+    delta = delta / 1000;
+    if (delta < MINBLOCK_US)
+        return 0;
+
+    // create map key
+    u64 zero = 0, *val, bp = 0;
+    int depth = 0;
+    struct key_t key = {};
+    bpf_get_current_comm(&key.name, sizeof(key.name));
+    bp = ctx->bp;
+
+    // unrolled loop (MAXDEPTH):
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+
+out:
+    val = counts.lookup_or_init(&key, &zero);
+    (*val) += delta;
+    return 0;
+}
+"""
+if args.pid:
+    filter = 'pid == %s' % args.pid
+elif args.useronly:
+    filter = '!(prev->flags & PF_KTHREAD)'
+else:
+    filter = '1'
+bpf_text = bpf_text.replace('FILTER', filter)
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("0 functions traced. Exiting.")
+    exit()
+
+# header
+if not folded:
+    print("Tracing off-CPU time (us) by kernel stack", end="")
+    if duration < 99999999:
+        print(" for %d secs." % duration)
+    else:
+        print("... Hit Ctrl-C to end.")
+
+# output
+while (1):
+    try:
+        sleep(duration)
+    except KeyboardInterrupt:
+        # as cleanup can take many seconds, trap Ctrl-C:
+        signal.signal(signal.SIGINT, signal_ignore)
+
+    if not folded:
+        print()
+    counts = b.get_table("counts")
+    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+        if folded:
+            # print folded stack output
+            line = k.name.decode('utf-8', 'replace') + ";"
+            for i in reversed(range(0, maxdepth)):
+                if k.ret[i] == 0:
+                    continue
+                line = line + b.ksym(k.ret[i])
+                if i != 0:
+                    line = line + ";"
+            print("%s %d" % (line, v.value))
+        else:
+            # print default multi-line stack output
+            for i in range(0, maxdepth):
+                if k.ret[i] == 0:
+                    break
+                print("    %-16x %s" % (k.ret[i],
+                    b.ksym(k.ret[i])))
+            print("    %-16s %s" % ("-", k.name))
+            print("        %d\n" % v.value)
+    counts.clear()
+
+    if not folded:
+        print("Detaching...")
+    exit()
diff --git a/tools/old/offwaketime.py b/tools/old/offwaketime.py
new file mode 100755
index 0000000..3b5bb36
--- /dev/null
+++ b/tools/old/offwaketime.py
@@ -0,0 +1,284 @@
+#!/usr/bin/python
+#
+# offwaketime   Summarize blocked time by kernel off-CPU stack + waker stack
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: offwaketime [-h] [-u] [-p PID] [-T] [duration]
+#
+# The current implementation uses an unrolled loop for x86_64, and was written
+# as a proof of concept. This implementation should be replaced in the future
+# with an appropriate bpf_ call, when available.
+#
+# The Off-CPU stack is currently limited to a stack trace depth of 20
+# (maxtdepth), and the waker stack limited to 10 (maxwdepth). This is also
+# limited to kernel stacks, and x86_64 only. Check for future versions, where
+# these limitations should be removed.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Jan-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep
+import argparse
+import signal
+
+# arguments
+examples = """examples:
+    ./offwaketime             # trace off-CPU + waker stack time until Ctrl-C
+    ./offwaketime 5           # trace for 5 seconds only
+    ./offwaketime -f 5        # 5 seconds, and output in folded format
+    ./offwaketime -u          # don't include kernel threads (user only)
+    ./offwaketime -p 185      # trace for PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize blocked time by kernel stack trace + waker stack",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-u", "--useronly", action="store_true",
+    help="user threads only (no kernel threads)")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="show raw addresses")
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format")
+parser.add_argument("duration", nargs="?", default=99999999,
+    help="duration of trace, in seconds")
+args = parser.parse_args()
+folded = args.folded
+duration = int(args.duration)
+debug = 0
+maxwdepth = 10    # and MAXWDEPTH
+maxtdepth = 20    # and MAXTDEPTH
+if args.pid and args.useronly:
+    print("ERROR: use either -p or -u.")
+    exit()
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#define MAXWDEPTH	10
+#define MAXTDEPTH	20
+#define MINBLOCK_US	1
+
+struct key_t {
+    char waker[TASK_COMM_LEN];
+    char target[TASK_COMM_LEN];
+    u64 wret[MAXWDEPTH];
+    u64 tret[MAXTDEPTH];
+};
+BPF_HASH(counts, struct key_t);
+BPF_HASH(start, u32);
+struct wokeby_t {
+    char name[TASK_COMM_LEN];
+    u64 ret[MAXWDEPTH];
+};
+BPF_HASH(wokeby, u32, struct wokeby_t);
+
+static u64 get_frame(u64 *bp) {
+    if (*bp) {
+        // The following stack walker is x86_64 specific
+        u64 ret = 0;
+        if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
+            return 0;
+        if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
+            *bp = 0;
+        if (ret < __START_KERNEL_map)
+            return 0;
+        return ret;
+    }
+    return 0;
+}
+
+int waker(struct pt_regs *ctx, struct task_struct *p) {
+    u32 pid = p->pid;
+
+    if (!(FILTER))
+        return 0;
+
+    u64 bp = 0;
+    struct wokeby_t woke = {};
+    int depth = 0;
+    bpf_get_current_comm(&woke.name, sizeof(woke.name));
+    bp = ctx->bp;
+
+    // unrolled loop (MAXWDEPTH):
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
+    woke.ret[depth] = get_frame(&bp);
+
+out:
+    wokeby.update(&pid, &woke);
+    return 0;
+}
+
+int oncpu(struct pt_regs *ctx, struct task_struct *p) {
+    u32 pid = p->pid;
+    u64 ts, *tsp;
+
+    // record previous thread sleep time
+    if (FILTER) {
+        ts = bpf_ktime_get_ns();
+        start.update(&pid, &ts);
+    }
+
+    // calculate current thread's delta time
+    pid = bpf_get_current_pid_tgid();
+    tsp = start.lookup(&pid);
+    if (tsp == 0)
+        return 0;        // missed start or filtered
+    u64 delta = bpf_ktime_get_ns() - *tsp;
+    start.delete(&pid);
+    delta = delta / 1000;
+    if (delta < MINBLOCK_US)
+        return 0;
+
+    // create map key
+    u64 zero = 0, *val, bp = 0;
+    int depth = 0;
+    struct key_t key = {};
+    struct wokeby_t *woke;
+    bpf_get_current_comm(&key.target, sizeof(key.target));
+    bp = ctx->bp;
+
+    // unrolled loop (MAXTDEPTH):
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.tret[depth++] = get_frame(&bp))) goto out;
+    key.tret[depth] = get_frame(&bp);
+
+out:
+    woke = wokeby.lookup(&pid);
+    if (woke) {
+        __builtin_memcpy(&key.wret, woke->ret, sizeof(key.wret));
+        __builtin_memcpy(&key.waker, woke->name, TASK_COMM_LEN);
+        wokeby.delete(&pid);
+    }
+
+    val = counts.lookup_or_init(&key, &zero);
+    (*val) += delta;
+    return 0;
+}
+"""
+if args.pid:
+    filter = 'pid == %s' % args.pid
+elif args.useronly:
+    filter = '!(p->flags & PF_KTHREAD)'
+else:
+    filter = '1'
+bpf_text = bpf_text.replace('FILTER', filter)
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
+b.attach_kprobe(event="try_to_wake_up", fn_name="waker")
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("0 functions traced. Exiting.")
+    exit()
+
+# header
+if not folded:
+    print("Tracing blocked time (us) by kernel off-CPU and waker stack",
+        end="")
+    if duration < 99999999:
+        print(" for %d secs." % duration)
+    else:
+        print("... Hit Ctrl-C to end.")
+
+# output
+while (1):
+    try:
+        sleep(duration)
+    except KeyboardInterrupt:
+        # as cleanup can take many seconds, trap Ctrl-C:
+        signal.signal(signal.SIGINT, signal_ignore)
+
+    if not folded:
+        print()
+    counts = b.get_table("counts")
+    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+        if folded:
+            # fold target stack
+            line = k.target + ";"
+            for i in reversed(range(0, maxtdepth)):
+                if k.tret[i] == 0:
+                    continue
+                line = line + b.ksym(k.tret[i])
+                if i != 0:
+                    line = line + ";"
+
+            # add delimiter
+            line = line + ";-"
+
+            # fold waker stack
+            for i in range(0, maxwdepth):
+                line = line + ";"
+                if k.wret[i] == 0:
+                    break
+                line = line + b.ksym(k.wret[i])
+            if i != 0:
+                line = line + ";" + k.waker
+
+            # print as a line
+            print("%s %d" % (line, v.value))
+        else:
+            # print wakeup name then stack in reverse order
+            print("    %-16s %s" % ("waker:", k.waker))
+            for i in reversed(range(0, maxwdepth)):
+                if k.wret[i] == 0:
+                    continue
+                print("    %-16x %s" % (k.wret[i],
+                    b.ksym(k.wret[i])))
+
+            # print delimiter
+            print("    %-16s %s" % ("-", "-"))
+
+            # print default multi-line stack output
+            for i in range(0, maxtdepth):
+                if k.tret[i] == 0:
+                    break
+                print("    %-16x %s" % (k.tret[i],
+                    b.ksym(k.tret[i])))
+            print("    %-16s %s" % ("target:", k.target))
+            print("        %d\n" % v.value)
+    counts.clear()
+
+    if not folded:
+        print("Detaching...")
+    exit()
diff --git a/tools/old/oomkill.py b/tools/old/oomkill.py
new file mode 100755
index 0000000..b99f852
--- /dev/null
+++ b/tools/old/oomkill.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+#
+# oomkill   Trace oom_kill_process(). For Linux, uses BCC, eBPF.
+#
+# This traces the kernel out-of-memory killer, and prints basic details,
+# including the system load averages. This can provide more context on the
+# system state at the time of OOM: was it getting busier or steady, based
+# on the load averages? This tool may also be useful to customize for
+# investigations; for example, by adding other task_struct details at the time
+# of OOM.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 09-Feb-2016   Brendan Gregg   Created this.
+
+from bcc import BPF
+from time import strftime
+import ctypes as ct
+
+# linux stats
+loadavg = "/proc/loadavg"
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/oom.h>
+
+struct data_t {
+    u64 fpid;
+    u64 tpid;
+    u64 pages;
+    char fcomm[TASK_COMM_LEN];
+    char tcomm[TASK_COMM_LEN];
+};
+
+BPF_PERF_OUTPUT(events);
+
+void kprobe__oom_kill_process(struct pt_regs *ctx, struct oom_control *oc,
+    struct task_struct *p, unsigned int points, unsigned long totalpages)
+{
+    struct data_t data = {};
+    u32 pid = bpf_get_current_pid_tgid();
+    data.fpid = pid;
+    data.tpid = p->pid;
+    data.pages = totalpages;
+    bpf_get_current_comm(&data.fcomm, sizeof(data.fcomm));
+    bpf_probe_read(&data.tcomm, sizeof(data.tcomm), p->comm);
+    events.perf_submit(ctx, &data, sizeof(data));
+}
+"""
+
+# kernel->user event data: struct data_t
+TASK_COMM_LEN = 16  # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [
+        ("fpid", ct.c_ulonglong),
+        ("tpid", ct.c_ulonglong),
+        ("pages", ct.c_ulonglong),
+        ("fcomm", ct.c_char * TASK_COMM_LEN),
+        ("tcomm", ct.c_char * TASK_COMM_LEN)
+    ]
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    with open(loadavg) as stats:
+        avgline = stats.read().rstrip()
+    print(("%s Triggered by PID %d (\"%s\"), OOM kill of PID %d (\"%s\")"
+        ", %d pages, loadavg: %s") % (strftime("%H:%M:%S"), event.fpid,
+        event.fcomm.decode('utf-8', 'replace'), event.tpid,
+        event.tcomm.decode('utf-8', 'replace'), event.pages, avgline))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+print("Tracing OOM kills... Ctrl-C to stop.")
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/old/opensnoop.py b/tools/old/opensnoop.py
new file mode 100755
index 0000000..5df3b41
--- /dev/null
+++ b/tools/old/opensnoop.py
@@ -0,0 +1,112 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# opensnoop Trace open() syscalls.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: opensnoop [-h] [-t] [-x] [-p PID]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 17-Sep-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+
+# arguments
+examples = """examples:
+    ./opensnoop           # trace all open() syscalls
+    ./opensnoop -t        # include timestamps
+    ./opensnoop -x        # only show failed opens
+    ./opensnoop -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace open() syscalls",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-x", "--failed", action="store_true",
+    help="only show failed opens")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(args_filename, u32, const char *);
+
+int kprobe__sys_open(struct pt_regs *ctx, const char __user *filename)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+
+    FILTER
+    args_filename.update(&pid, &filename);
+
+    return 0;
+};
+
+int kretprobe__sys_open(struct pt_regs *ctx)
+{
+    const char **filenamep;
+    int ret = ctx->ax;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    filenamep = args_filename.lookup(&pid);
+    if (filenamep == 0) {
+        // missed entry
+        return 0;
+    }
+
+    bpf_trace_printk("%d %s\\n", ret, *filenamep);
+    args_filename.delete(&pid);
+
+    return 0;
+}
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# header
+if args.timestamp:
+    print("%-14s" % ("TIME(s)"), end="")
+print("%-6s %-16s %4s %3s %s" % ("PID", "COMM", "FD", "ERR", "PATH"))
+
+start_ts = 0
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    (ret_s, filename) = msg.split(" ", 1)
+
+    ret = int(ret_s)
+    if (args.failed and (ret >= 0)):
+        continue
+
+    # split return value into FD and errno columns
+    if ret >= 0:
+        fd_s = ret
+        err = 0
+    else:
+        fd_s = "-1"
+        err = - ret
+
+    # print columns
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = ts
+        print("%-14.9f" % (ts - start_ts), end="")
+    print("%-6d %-16s %4s %3s %s" % (pid, task, fd_s, err, filename))
diff --git a/tools/old/profile.py b/tools/old/profile.py
new file mode 100755
index 0000000..e308208
--- /dev/null
+++ b/tools/old/profile.py
@@ -0,0 +1,364 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# profile  Profile CPU usage by sampling stack traces at a timed interval.
+#          For Linux, uses BCC, BPF, perf_events. Embedded C.
+#
+# This is an efficient profiler, as stack traces are frequency counted in
+# kernel context, rather than passing every stack to user space for frequency
+# counting there. Only the unique stacks and counts are passed to user space
+# at the end of the profile, greatly reducing the kernel<->user transfer.
+#
+# This uses perf_event_open to setup a timer which is instrumented by BPF,
+# and for efficiency it does not initialize the perf ring buffer, so the
+# redundant perf samples are not collected.
+#
+# Kernel stacks are post-processed in user-land to skip the interrupt framework
+# frames. You can improve efficiency a little by specifying the exact number
+# of frames to skip with -s, provided you know what that is. If you get -s
+# wrong, note that the first line is the IP, and then the (skipped) stack.
+#
+# Note: if another perf-based sampling session is active, the output may become
+# polluted with their events. On older kernels, the output may also become
+# polluted with tracing sessions (when the kprobe is used instead of the
+# tracepoint). If this becomes a problem, logic can be added to filter events.
+#
+# REQUIRES: Linux 4.6+ (BPF_MAP_TYPE_STACK_TRACE support), and the
+# perf_misc_flags() function symbol to exist. The latter may or may not
+# exist depending on your kernel build. Linux 4.9 provides a proper solution
+# to this (this tool will be updated).
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# THANKS: Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote
+# much of the code here, borrowed from tracepoint.py and offcputime.py.
+#
+# 15-Jul-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF, Perf
+from sys import stderr
+from time import sleep
+import argparse
+import signal
+import os
+import errno
+import multiprocessing
+import ctypes as ct
+
+#
+# Process Arguments
+#
+
+# arg validation
+def positive_int(val):
+    try:
+        ival = int(val)
+    except ValueError:
+        raise argparse.ArgumentTypeError("must be an integer")
+
+    if ival < 0:
+        raise argparse.ArgumentTypeError("must be positive")
+    return ival
+
+def positive_nonzero_int(val):
+    ival = positive_int(val)
+    if ival == 0:
+        raise argparse.ArgumentTypeError("must be nonzero")
+    return ival
+
+# arguments
+examples = """examples:
+    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
+    ./profile -F 99       # profile stack traces at 99 Hertz
+    ./profile 5           # profile at 49 Hertz for 5 seconds only
+    ./profile -f 5        # output in folded format for flame graphs
+    ./profile -p 185      # only profile threads for PID 185
+    ./profile -U          # only show user space stacks (no kernel)
+    ./profile -K          # only show kernel space stacks (no user)
+    ./profile -S 11       # always skip 11 frames of kernel stack
+"""
+parser = argparse.ArgumentParser(
+    description="Profile CPU stack traces at a timed interval",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+thread_group = parser.add_mutually_exclusive_group()
+thread_group.add_argument("-p", "--pid", type=positive_int,
+    help="profile this PID only")
+# TODO: add options for user/kernel threads only
+stack_group = parser.add_mutually_exclusive_group()
+stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
+    help="show stacks from user space only (no kernel space stacks)")
+stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
+    help="show stacks from kernel space only (no user space stacks)")
+parser.add_argument("-F", "--frequency", type=positive_int, default=49,
+    help="sample frequency, Hertz (default 49)")
+parser.add_argument("-d", "--delimited", action="store_true",
+    help="insert delimiter between kernel/user stacks")
+parser.add_argument("-a", "--annotations", action="store_true",
+    help="add _[k] annotations to kernel frames")
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format, one line per stack (for flame graphs)")
+parser.add_argument("--stack-storage-size", default=2048,
+    type=positive_nonzero_int,
+    help="the number of unique stack traces that can be stored and "
+        "displayed (default 2048)")
+parser.add_argument("-S", "--kernel-skip", type=positive_int, default=0,
+    help="skip this many kernel frames (default 3)")
+parser.add_argument("duration", nargs="?", default=99999999,
+    type=positive_nonzero_int,
+    help="duration of trace, in seconds")
+
+# option logic
+args = parser.parse_args()
+skip = args.kernel_skip
+pid = int(args.pid) if args.pid is not None else -1
+duration = int(args.duration)
+debug = 0
+need_delimiter = args.delimited and not (args.kernel_stacks_only or
+    args.user_stacks_only)
+# TODO: add stack depth, and interval
+
+#
+# Setup BPF
+#
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct key_t {
+    u32 pid;
+    u64 kernel_ip;
+    u64 kernel_ret_ip;
+    int user_stack_id;
+    int kernel_stack_id;
+    char name[TASK_COMM_LEN];
+};
+BPF_HASH(counts, struct key_t);
+BPF_HASH(start, u32);
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
+
+// This code gets a bit complex. Probably not suitable for casual hacking.
+
+PERF_TRACE_EVENT {
+    u32 pid = bpf_get_current_pid_tgid();
+    if (!(THREAD_FILTER))
+        return 0;
+
+    // create map key
+    u64 zero = 0, *val;
+    struct key_t key = {.pid = pid};
+    bpf_get_current_comm(&key.name, sizeof(key.name));
+
+    // get stacks
+    key.user_stack_id = USER_STACK_GET;
+    key.kernel_stack_id = KERNEL_STACK_GET;
+
+    if (key.kernel_stack_id >= 0) {
+        // populate extras to fix the kernel stack
+        struct pt_regs regs = {};
+        bpf_probe_read(&regs, sizeof(regs), (void *)REGS_LOCATION);
+        u64 ip = PT_REGS_IP(&regs);
+
+        // if ip isn't sane, leave key ips as zero for later checking
+#ifdef CONFIG_RANDOMIZE_MEMORY
+        if (ip > __PAGE_OFFSET_BASE) {
+#else
+        if (ip > PAGE_OFFSET) {
+#endif
+            key.kernel_ip = ip;
+            if (DO_KERNEL_RIP) {
+                /*
+                 * User didn't specify a skip value (-s), so we will figure
+                 * out how many interrupt framework frames to skip by recording
+                 * the kernel rip, then later scanning for it on the stack.
+                 * This is likely x86_64 specific; can use -s as a workaround
+                 * until this supports your architecture.
+                 */
+                bpf_probe_read(&key.kernel_ret_ip, sizeof(key.kernel_ret_ip),
+                (void *)(regs.bp + 8));
+            }
+        }
+    }
+
+    val = counts.lookup_or_init(&key, &zero);
+    (*val)++;
+    return 0;
+}
+"""
+
+# set thread filter
+thread_context = ""
+perf_filter = "-a"
+if args.pid is not None:
+    thread_context = "PID %s" % args.pid
+    thread_filter = 'pid == %s' % args.pid
+    perf_filter = '-p %s' % args.pid
+else:
+    thread_context = "all threads"
+    thread_filter = '1'
+bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
+
+# set stack storage size
+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
+
+# handle stack args
+kernel_stack_get = "stack_traces.get_stackid(args, " \
+    "%d | BPF_F_REUSE_STACKID)" % skip
+user_stack_get = \
+    "stack_traces.get_stackid(args, BPF_F_REUSE_STACKID | BPF_F_USER_STACK)"
+stack_context = ""
+if args.user_stacks_only:
+    stack_context = "user"
+    kernel_stack_get = "-1"
+elif args.kernel_stacks_only:
+    stack_context = "kernel"
+    user_stack_get = "-1"
+else:
+    stack_context = "user + kernel"
+bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
+bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
+if skip:
+    # don't record the rip, as we won't use it
+    bpf_text = bpf_text.replace('DO_KERNEL_RIP', '0')
+else:
+    # rip is used to skip interrupt infrastructure frames
+    bpf_text = bpf_text.replace('DO_KERNEL_RIP', '1')
+
+# header
+if not args.folded:
+    print("Sampling at %d Hertz of %s by %s stack" %
+        (args.frequency, thread_context, stack_context), end="")
+    if duration < 99999999:
+        print(" for %d secs." % duration)
+    else:
+        print("... Hit Ctrl-C to end.")
+
+# kprobe perf_misc_flags()
+bpf_text = bpf_text.replace('PERF_TRACE_EVENT',
+    'int kprobe__perf_misc_flags(struct pt_regs *args)')
+bpf_text = bpf_text.replace('REGS_LOCATION', 'PT_REGS_PARM1(args)')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+try:
+    b = BPF(text=bpf_text)
+except:
+    print("BPF initialization failed. perf_misc_flags() may be inlined in " +
+        "your kernel build.\nThis tool will be updated in the future to " +
+        "support Linux 4.9, which has reliable profiling support. Exiting.")
+    exit()
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+#
+# Setup perf_events
+#
+
+# use perf_events to sample
+try:
+    Perf.perf_event_open(0, pid=-1, ptype=Perf.PERF_TYPE_SOFTWARE,
+        freq=args.frequency)
+except:
+    print("ERROR: initializing perf_events for sampling.\n"
+        "To debug this, try running the following command:\n"
+        "    perf record -F 49 -e cpu-clock %s -- sleep 1\n"
+        "If that also doesn't work, fix it first." % perf_filter, file=stderr)
+    exit(0)
+
+#
+# Output Report
+#
+
+# collect samples
+try:
+    sleep(duration)
+except KeyboardInterrupt:
+    # as cleanup can take some time, trap Ctrl-C:
+    signal.signal(signal.SIGINT, signal_ignore)
+
+if not args.folded:
+    print()
+
+def aksym(addr):
+    if args.annotations:
+        return b.ksym(addr) + "_[k]"
+    else:
+        return b.ksym(addr)
+
+# output stacks
+missing_stacks = 0
+has_enomem = False
+counts = b.get_table("counts")
+stack_traces = b.get_table("stack_traces")
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+    # handle get_stackid errors
+    if (not args.user_stacks_only and k.kernel_stack_id < 0 and
+            k.kernel_stack_id != -errno.EFAULT) or \
+            (not args.kernel_stacks_only and k.user_stack_id < 0 and
+            k.user_stack_id != -errno.EFAULT):
+        missing_stacks += 1
+        # check for an ENOMEM error
+        if k.kernel_stack_id == -errno.ENOMEM or \
+                k.user_stack_id == -errno.ENOMEM:
+            has_enomem = True
+
+    user_stack = [] if k.user_stack_id < 0 else \
+        stack_traces.walk(k.user_stack_id)
+    kernel_tmp = [] if k.kernel_stack_id < 0 else \
+        stack_traces.walk(k.kernel_stack_id)
+
+    # fix kernel stack
+    kernel_stack = []
+    if k.kernel_stack_id >= 0:
+        if skip:
+            # fixed skip
+            for addr in kernel_tmp:
+                kernel_stack.append(addr)
+            kernel_stack = kernel_stack[skip:]
+        else:
+            # skip the interrupt framework stack by searching for our RIP
+            skipping = 1
+            for addr in kernel_tmp:
+                if k.kernel_ret_ip == addr:
+                    skipping = 0
+                if not skipping:
+                    kernel_stack.append(addr)
+        if k.kernel_ip:
+            kernel_stack.insert(0, k.kernel_ip)
+
+    do_delimiter = need_delimiter and kernel_stack
+
+    if args.folded:
+        # print folded stack output
+        user_stack = list(user_stack)
+        kernel_stack = list(kernel_stack)
+        line = [k.name.decode('utf-8', 'replace')] + \
+            [b.sym(addr, k.pid) for addr in reversed(user_stack)] + \
+            (do_delimiter and ["-"] or []) + \
+            [aksym(addr) for addr in reversed(kernel_stack)]
+        print("%s %d" % (";".join(line), v.value))
+    else:
+        # print default multi-line stack output.
+        for addr in kernel_stack:
+            print("    %s" % aksym(addr))
+        if do_delimiter:
+            print("    --")
+        for addr in user_stack:
+            print("    %s" % b.sym(addr, k.pid))
+        print("    %-16s %s (%d)" % ("-", k.name, k.pid))
+        print("        %d\n" % v.value)
+
+# check missing
+if missing_stacks > 0:
+    enomem_str = "" if not has_enomem else \
+        " Consider increasing --stack-storage-size."
+    print("WARNING: %d stack traces could not be displayed.%s" %
+        (missing_stacks, enomem_str),
+        file=stderr)
diff --git a/tools/old/profile_example.txt b/tools/old/profile_example.txt
new file mode 100644
index 0000000..cd0c5ef
--- /dev/null
+++ b/tools/old/profile_example.txt
@@ -0,0 +1,788 @@
+Demonstrations of profile, the Linux eBPF/bcc version.
+
+
+This is a CPU profiler. It works by taking samples of stack traces at timed
+intervals, and frequency counting them in kernel context for efficiency.
+
+Example output:
+
+# ./profile
+Sampling at 49 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end.
+^C
+    ffffffff81189249 filemap_map_pages
+    ffffffff811bd3f5 handle_mm_fault
+    ffffffff81065990 __do_page_fault
+    ffffffff81065caf do_page_fault
+    ffffffff817ce228 page_fault
+    00007fed989afcc0 [unknown]
+    -                cp (9036)
+        1
+
+    00007f31d76c3251 [unknown]
+    47a2c1e752bf47f7 [unknown]
+    -                sign-file (8877)
+        1
+
+    ffffffff813d0af8 __clear_user
+    ffffffff813d5277 iov_iter_zero
+    ffffffff814ec5f2 read_iter_zero
+    ffffffff8120be9d __vfs_read
+    ffffffff8120c385 vfs_read
+    ffffffff8120d786 sys_read
+    ffffffff817cc076 entry_SYSCALL_64_fastpath
+    00007fc5652ad9b0 read
+    -                dd (25036)
+        4
+
+    0000000000400542 func_a
+    0000000000400598 main
+    00007f12a133e830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (13549)
+        5
+
+[...]
+
+    ffffffff8105eb66 native_safe_halt
+    ffffffff8103659e default_idle
+    ffffffff81036d1f arch_cpu_idle
+    ffffffff810bba5a default_idle_call
+    ffffffff810bbd07 cpu_startup_entry
+    ffffffff817bf4a7 rest_init
+    ffffffff81d65f58 start_kernel
+    ffffffff81d652db x86_64_start_reservations
+    ffffffff81d65418 x86_64_start_kernel
+    -                swapper/0 (0)
+        72
+
+    ffffffff8105eb66 native_safe_halt
+    ffffffff8103659e default_idle
+    ffffffff81036d1f arch_cpu_idle
+    ffffffff810bba5a default_idle_call
+    ffffffff810bbd07 cpu_startup_entry
+    ffffffff8104df55 start_secondary
+    -                swapper/1 (0)
+        75
+
+The output was long; I truncated some lines ("[...]").
+
+This default output prints stack traces as two columns (raw addresses, and
+then translated symbol names), followed by a line to describe the process (a
+dash, the process name, and a PID in parentheses), and then an integer count
+of how many times this stack trace was sampled.
+
+The output above shows the most frequent stack was from the "swapper/1"
+process (PID 0), running the native_safe_halt() function, which was called
+by default_idle(), which was called by arch_cpu_idle(), and so on. This is
+the idle thread. Stacks can be read top-down, to follow ancestry: child,
+parent, grandparent, etc.
+
+The func_ab process is running the func_a() function, called by main(),
+called by __libc_start_main(), and called by "[unknown]" with what looks
+like a bogus address (1st column). That's evidence of a broken stack trace.
+It's common for user-level software that hasn't been compiled with frame
+pointers (in this case, libc).
+
+The dd process has called read(), and then enters the kernel via
+entry_SYSCALL_64_fastpath(), calling sys_read(), and so on. Yes, I'm now
+reading it bottom up. That way follows the code flow.
+
+
+The dd process is actually "dd if=/dev/zero of=/dev/null": it's a simple
+workload to analyze that just moves bytes from /dev/zero to /dev/null.
+Profiling just that process:
+
+# ./profile -p 25036
+Sampling at 49 Hertz of PID 25036 by user + kernel stack... Hit Ctrl-C to end.
+^C
+    0000000000402748 [unknown]
+    00007fc56561422c [unknown]
+    -                dd (25036)
+        1
+
+    00007fc5652ada0e __write
+    -                dd (25036)
+        1
+
+    00007fc5652ad9b0 read
+    -                dd (25036)
+        1
+
+[...]
+
+    00000000004047b2 [unknown]
+    00007fc56561422c [unknown]
+    -                dd (25036)
+        2
+
+    ffffffff817cc060 entry_SYSCALL_64_fastpath
+    00007fc5652ada10 __write
+    00007fc56561422c [unknown]
+    -                dd (25036)
+        3
+
+    ffffffff817cc060 entry_SYSCALL_64_fastpath
+    00007fc5652ad9b0 read
+    -                dd (25036)
+        3
+
+    ffffffff813d0af8 __clear_user
+    ffffffff813d5277 iov_iter_zero
+    ffffffff814ec5f2 read_iter_zero
+    ffffffff8120be9d __vfs_read
+    ffffffff8120c385 vfs_read
+    ffffffff8120d786 sys_read
+    ffffffff817cc076 entry_SYSCALL_64_fastpath
+    00007fc5652ad9b0 read
+    00007fc56561422c [unknown]
+    -                dd (25036)
+        3
+
+    ffffffff813d0af8 __clear_user
+    ffffffff813d5277 iov_iter_zero
+    ffffffff814ec5f2 read_iter_zero
+    ffffffff8120be9d __vfs_read
+    ffffffff8120c385 vfs_read
+    ffffffff8120d786 sys_read
+    ffffffff817cc076 entry_SYSCALL_64_fastpath
+    00007fc5652ad9b0 read
+    -                dd (25036)
+        7
+
+Again, I've truncated some lines. Now we're just analyzing the dd process.
+The filtering is performed in kernel context, for efficiency.
+
+This output has some "[unknown]" frames that probably have valid addresses,
+but we're lacking the symbol translation. This is common for all profilers
+on Linux, and is usually fixable. See the DEBUGGING section of the profile(8)
+man page.
+
+
+Let's add delimiters between the user and kernel stacks, using -d:
+
+# ./profile -p 25036 -d
+^C
+    ffffffff8120b385 __vfs_write
+    ffffffff8120d826 sys_write
+    ffffffff817cc076 entry_SYSCALL_64_fastpath
+    --
+    00007fc5652ada10 __write
+    -                dd (25036)
+        1
+
+    --
+    00007fc565255ef3 [unknown]
+    00007fc56561422c [unknown]
+    -                dd (25036)
+        1
+
+    ffffffff813d4569 iov_iter_init
+    ffffffff8120be8e __vfs_read
+    ffffffff8120c385 vfs_read
+    ffffffff8120d786 sys_read
+    ffffffff817cc076 entry_SYSCALL_64_fastpath
+    --
+    00007fc5652ad9b0 read
+    -                dd (25036)
+        1
+
+[...]
+
+    ffffffff813d0af8 __clear_user
+    ffffffff813d5277 iov_iter_zero
+    ffffffff814ec5f2 read_iter_zero
+    ffffffff8120be9d __vfs_read
+    ffffffff8120c385 vfs_read
+    ffffffff8120d786 sys_read
+    ffffffff817cc076 entry_SYSCALL_64_fastpath
+    --
+    00007fc5652ad9b0 read
+    -                dd (25036)
+        9
+
+In this mode, the delimiters are "--".
+
+
+
+Here's another example, a func_ab program that runs two functions, func_a() and
+func_b(). Profiling it for 5 seconds:
+
+# ./profile -p `pgrep -n func_ab` 5
+Sampling at 49 Hertz of PID 2930 by user + kernel stack for 5 secs.
+
+    000000000040053e func_a
+    0000000000400598 main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        2
+
+    0000000000400566 func_b
+    00000000004005ac main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        3
+
+    000000000040053a func_a
+    0000000000400598 main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        5
+
+    0000000000400562 func_b
+    00000000004005ac main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        12
+
+    000000000040056a func_b
+    00000000004005ac main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        19
+
+    0000000000400542 func_a
+    0000000000400598 main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        22
+
+    0000000000400571 func_b
+    00000000004005ac main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        64
+
+    0000000000400549 func_a
+    0000000000400598 main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        72
+
+Note that the same stack (2nd column) seems to be repeated. Weren't we doing
+frequency counting and only printing unique stacks? We are, but in terms of
+the raw addresses, not the symbols. See the 1st column: those stacks are
+all unique.
+
+
+We can output in "folded format", which puts the stack trace on one line,
+separating frames with semi-colons. Eg:
+
+# ./profile -f -p `pgrep -n func_ab` 5
+func_ab;[unknown];__libc_start_main;main;func_a 2
+func_ab;[unknown];__libc_start_main;main;func_b 2
+func_ab;[unknown];__libc_start_main;main;func_a 11
+func_ab;[unknown];__libc_start_main;main;func_b 12
+func_ab;[unknown];__libc_start_main;main;func_a 23
+func_ab;[unknown];__libc_start_main;main;func_b 28
+func_ab;[unknown];__libc_start_main;main;func_b 57
+func_ab;[unknown];__libc_start_main;main;func_a 64
+
+I find this pretty useful for writing to files and later grepping.
+
+
+Folded format can also be used by flame graph stack visualizers, including
+the original implementation:
+
+	https://github.com/brendangregg/FlameGraph
+
+I'd include delimiters, -d. For example:
+
+# ./profile -df -p `pgrep -n func_ab` 5 > out.profile
+# git clone https://github.com/brendangregg/FlameGraph
+# ./FlameGraph/flamegraph.pl < out.profile > out.svg
+
+(Yes, I could pipe profile directly into flamegraph.pl, however, I like to
+keep the raw folded profiles around: can be useful for regenerating flamegraphs
+with different options, and, for differential flame graphs.)
+
+
+Some flamegraph.pl palettes recognize kernel annotations, which can be added
+with -a. It simply adds a "_[k]" at the end of kernel function names.
+For example:
+
+# ./profile -adf -p `pgrep -n dd` 10
+dd;[unknown] 1
+dd;[unknown];[unknown] 1
+dd;[unknown];[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];__fsnotify_parent_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__fsnotify_parent_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fdget_pos_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;[unknown] 1
+dd;[unknown];[unknown] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];__fsnotify_parent_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__fsnotify_parent_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fsnotify_parent_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];fsnotify_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fdget_pos_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;[unknown] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];__fsnotify_parent_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 1
+dd;[unknown];[unknown] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;read 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];security_file_permission_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];fsnotify_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];fsnotify_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__fsnotify_parent_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];iov_iter_init_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];__fsnotify_parent_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__vfs_write_[k];write_null_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];__clear_user_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];security_file_permission_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__vfs_read_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__vfs_write_[k] 1
+dd;[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fsnotify_parent_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];__write;-;sys_write_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fsnotify_parent_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];common_file_perm_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];vfs_read_[k] 1
+dd;__write 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 1
+dd;[unknown];[unknown] 1
+dd;[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown] 1
+dd;[unknown] 1
+dd;[unknown];[unknown] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;__write 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k] 1
+dd;[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];[unknown] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fdget_pos_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];_cond_resched_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];iov_iter_init_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];__fsnotify_parent_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];rw_verify_area_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 1
+dd;[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];fsnotify_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fdget_pos_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__vfs_write_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;[unknown] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];fsnotify_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];fsnotify_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];vfs_write_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];fsnotify_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];apparmor_file_permission_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];__fdget_pos_[k] 2
+dd;[unknown];[unknown] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fdget_pos_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 2
+dd;[unknown];[unknown] 2
+dd;[unknown];[unknown] 2
+dd;[unknown];[unknown] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 2
+dd;[unknown];[unknown] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];__clear_user_[k] 2
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];__fdget_pos_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 2
+dd;[unknown];[unknown] 2
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];fsnotify_[k] 2
+dd;__write;-;sys_write_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];fsnotify_[k] 2
+dd;[unknown];[unknown] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 2
+dd;read;-;SyS_read_[k] 2
+dd;[unknown] 2
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k] 2
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 2
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];__clear_user_[k] 2
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];rw_verify_area_[k] 2
+dd;[unknown];[unknown] 3
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];rw_verify_area_[k] 3
+dd;[unknown];[unknown] 3
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 3
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 3
+dd;[unknown];[unknown] 3
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 3
+dd;[unknown];[unknown] 3
+dd;[unknown];[unknown] 3
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 3
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 3
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 3
+dd;[unknown] 4
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 4
+dd;[unknown];[unknown] 4
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 4
+dd;[unknown] 4
+dd;[unknown];[unknown] 4
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k] 4
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 5
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 5
+dd;[unknown];[unknown] 5
+dd;[unknown];[unknown] 5
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 6
+dd;read 15
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 19
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k] 20
+dd;read;-;entry_SYSCALL_64_fastpath_[k] 23
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 24
+dd;__write;-;entry_SYSCALL_64_fastpath_[k] 25
+dd;__write 29
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k] 31
+
+This can be made into a flamegraph. Eg:
+
+# ./profile -adf -p `pgrep -n func_ab` 10 > out.profile
+# git clone https://github.com/brendangregg/FlameGraph
+# ./FlameGraph/flamegraph.pl --color=java < out.profile > out.svg
+
+It will highlight the kernel frames in orange, and user-level in red (and Java
+in green, and C++ in yellow). If you copy-n-paste the above output into an
+out.profile file, you can try it out.
+
+
+You can increase or decrease the sample frequency. Eg, sampling at 9 Hertz:
+
+# ./profile -F 9
+Sampling at 9 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end.
+^C
+    000000000040056a func_b
+    00000000004005ac main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        1
+
+[...]
+
+    ffffffff8105eb66 native_safe_halt
+    ffffffff8103659e default_idle
+    ffffffff81036d1f arch_cpu_idle
+    ffffffff810bba5a default_idle_call
+    ffffffff810bbd07 cpu_startup_entry
+    ffffffff8104df55 start_secondary
+    -                swapper/3 (0)
+        8
+
+    ffffffff8105eb66 native_safe_halt
+    ffffffff8103659e default_idle
+    ffffffff81036d1f arch_cpu_idle
+    ffffffff810bba5a default_idle_call
+    ffffffff810bbd07 cpu_startup_entry
+    ffffffff817bf497 rest_init
+    ffffffff81d65f58 start_kernel
+    ffffffff81d652db x86_64_start_reservations
+    ffffffff81d65418 x86_64_start_kernel
+    -                swapper/0 (0)
+        8
+
+
+You can also restrict profiling to just kernel stacks (-K) or user stacks (-U).
+For example, just user stacks:
+
+# ./profile -U
+Sampling at 49 Hertz of all threads by user stack... Hit Ctrl-C to end.
+^C
+    0000000000402ccc [unknown]
+    00007f45a624422c [unknown]
+    -                dd (2931)
+        1
+
+    0000000000404b80 [unknown]
+    00007f45a624422c [unknown]
+    -                dd (2931)
+        1
+
+    0000000000404d77 [unknown]
+    00007f45a624422c [unknown]
+    -                dd (2931)
+        1
+
+    00007f45a5e85e5e [unknown]
+    00007f45a624422c [unknown]
+    -                dd (2931)
+        1
+
+    0000000000402d12 [unknown]
+    00007f45a624422c [unknown]
+    -                dd (2931)
+        1
+
+    0000000000400562 func_b
+    00000000004005ac main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        1
+
+    0000000000404805 [unknown]
+    -                dd (2931)
+        1
+
+    00000000004047de [unknown]
+    -                dd (2931)
+        1
+
+    0000000000400542 func_a
+    0000000000400598 main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        3
+
+    00007f45a5edda10 __write
+    00007f45a624422c [unknown]
+    -                dd (2931)
+        3
+
+    000000000040053a func_a
+    0000000000400598 main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        4
+
+    000000000040056a func_b
+    00000000004005ac main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        7
+
+    -                swapper/6 (0)
+        10
+
+    0000000000400571 func_b
+    00000000004005ac main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        10
+
+    00007f45a5edda10 __write
+    -                dd (2931)
+        10
+
+    0000000000400549 func_a
+    0000000000400598 main
+    00007f0458819830 __libc_start_main
+    083e258d4c544155 [unknown]
+    -                func_ab (2930)
+        11
+
+    00007f45a5edd9b0 read
+    -                dd (2931)
+        12
+
+    00007f45a5edd9b0 read
+    00007f45a624422c [unknown]
+    -                dd (2931)
+        14
+
+    -                swapper/7 (0)
+        46
+
+    -                swapper/0 (0)
+        46
+
+    -                swapper/2 (0)
+        46
+
+    -                swapper/1 (0)
+        46
+
+    -                swapper/3 (0)
+        46
+
+    -                swapper/4 (0)
+        46
+
+
+If there are too many unique stack traces for the kernel to save, a warning
+will be printed. Eg:
+
+# ./profile
+[...]
+WARNING: 8 stack traces could not be displayed. Consider increasing --stack-storage-size.
+
+Run ./profile -h to see the default.
+
+
+There is a -S option to skip kernel frames. You probably don't need to mess
+with this. Here's why it exists: consider the following kernel stack trace,
+and IP:
+
+    ffffffff81174e78 perf_swevent_hrtimer
+    ffffffff810e6984 __hrtimer_run_queues
+    ffffffff810e70f8 hrtimer_interrupt
+    ffffffff81022c69 xen_timer_interrupt
+    ffffffff810d2942 handle_irq_event_percpu
+    ffffffff810d62da handle_percpu_irq
+    ffffffff810d1f52 generic_handle_irq
+    ffffffff814a5137 evtchn_2l_handle_events
+    ffffffff814a2853 __xen_evtchn_do_upcall
+    ffffffff814a4740 xen_evtchn_do_upcall
+    ffffffff817cd50c xen_hvm_callback_vector
+    ffffffff8103663e default_idle
+    ffffffff81036dbf arch_cpu_idle
+    ffffffff810bb8ea default_idle_call
+    ffffffff810bbb97 cpu_startup_entry
+    ffffffff8104df85 start_secondary
+
+IP: ffffffff8105eb66 native_safe_halt
+
+This is the idle thread. The first function is native_safe_halt(), and its
+parent is default_idle(). But what you see there is really what we are
+profiling. All that stuff above default_idle()? Interrupt framework stack.
+
+So we have to exclude those interrupt frames. I do this by fetching the ret IP
+from the kernel stack, and then scanning for it in user-level: in this case
+it would be default_idle(). Ok.
+
+If this doesn't work on your architecture (and your kernel stacks are a
+single line, the IP), then you might consider setting a fixed skip count,
+which avoids this ret IP logic. For the above stack, I'd set "-S 11", and
+it would slice off those 11 interrupt frames nicely. It also does this in
+kernel context for efficiency.
+
+So how do you figure out what number to use? 11? 14? 5? Well... Try "-S 1",
+and then see how much higher you need to set it. Remember on the real
+profile output that the IP line is printed on top of the sliced stack.
+
+
+USAGE message:
+
+# ./profile -h
+usage: profile [-h] [-p PID] [-U | -K] [-F FREQUENCY] [-d] [-a] [-f]
+                  [--stack-storage-size STACK_STORAGE_SIZE] [-S KERNEL_SKIP]
+                  [duration]
+
+Profile CPU stack traces at a timed interval
+
+positional arguments:
+  duration              duration of trace, in seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     profile this PID only
+  -U, --user-stacks-only
+                        show stacks from user space only (no kernel space
+                        stacks)
+  -K, --kernel-stacks-only
+                        show stacks from kernel space only (no user space
+                        stacks)
+  -F FREQUENCY, --frequency FREQUENCY
+                        sample frequency, Hertz (default 49)
+  -d, --delimited       insert delimiter between kernel/user stacks
+  -a, --annotations     add _[k] annotations to kernel frames
+  -f, --folded          output folded format, one line per stack (for flame
+                        graphs)
+  --stack-storage-size STACK_STORAGE_SIZE
+                        the number of unique stack traces that can be stored
+                        and displayed (default 2048)
+  -S KERNEL_SKIP, --kernel-skip KERNEL_SKIP
+                        skip this many kernel frames (default 3)
+
+examples:
+    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
+    ./profile -F 99       # profile stack traces at 99 Hertz
+    ./profile 5           # profile at 49 Hertz for 5 seconds only
+    ./profile -f 5        # output in folded format for flame graphs
+    ./profile -p 185      # only profile threads for PID 185
+    ./profile -U          # only show user space stacks (no kernel)
+    ./profile -K          # only show kernel space stacks (no user)
+    ./profile -S 11       # always skip 11 frames of kernel stack
diff --git a/tools/old/softirqs.py b/tools/old/softirqs.py
new file mode 100755
index 0000000..3b40b1a
--- /dev/null
+++ b/tools/old/softirqs.py
@@ -0,0 +1,204 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# softirqs  Summarize soft IRQ (interrupt) event time.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: softirqs [-h] [-T] [-N] [-d] [interval] [count]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Oct-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+    ./softirqs            # sum soft irq event time
+    ./softirqs -d         # show soft irq event time as histograms
+    ./softirqs 1 10       # print 1 second summaries, 10 times
+    ./softirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize soft irq event time as histograms",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-N", "--nanoseconds", action="store_true",
+    help="output in nanoseconds")
+parser.add_argument("-d", "--dist", action="store_true",
+    help="show distributions as histograms")
+parser.add_argument("-C", "--bycpu", action="store_true",
+    help="break down softirqs to individual cpus")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+args = parser.parse_args()
+countdown = int(args.count)
+if args.nanoseconds:
+    factor = 1
+    label = "nsecs"
+else:
+    factor = 1000
+    label = "usecs"
+debug = 0
+
+# define BPF program
+bpf_text = ""
+if args.bycpu:
+    bpf_text = """
+    #include <uapi/linux/ptrace.h>
+
+    typedef struct irq_cpu_key {
+        s64 cpu;
+        u64 slot;
+    } irq_key_t;
+
+    BPF_HASH(start, u32);
+    BPF_HISTOGRAM(dist, irq_key_t);
+
+    // time IRQ
+    int trace_start_cpu(struct pt_regs *ctx)
+    {
+        int curr_cpu = bpf_get_smp_processor_id();
+        u64 ts = bpf_ktime_get_ns();
+        start.update(&curr_cpu, &ts);
+        return 0;
+    }
+
+    int trace_completion_cpu(struct pt_regs *ctx)
+    {
+        u64 *tsp, delta;
+        int curr_cpu = bpf_get_smp_processor_id();
+
+        // fetch timestamp and calculate delta
+        tsp = start.lookup(&curr_cpu);
+        COMMON
+
+        // store as sum or histogram
+        irq_key_t key = {.cpu = curr_cpu,
+        STORE
+
+        start.delete(&curr_cpu);
+        return 0;
+    }
+    """
+else:
+    bpf_text = """
+    #include <uapi/linux/ptrace.h>
+
+    typedef struct irq_key {
+        u64 ip;
+        u64 slot;
+    } irq_key_t;
+
+    BPF_HASH(start, u32);
+    BPF_HASH(iptr, u32);
+    BPF_HISTOGRAM(dist, irq_key_t);
+
+    // time IRQ
+    int trace_start(struct pt_regs *ctx)
+    {
+        u32 pid = bpf_get_current_pid_tgid();
+        u64 ip = PT_REGS_IP(ctx), ts = bpf_ktime_get_ns();
+        start.update(&pid, &ts);
+        iptr.update(&pid, &ip);
+        return 0;
+    }
+
+    int trace_completion(struct pt_regs *ctx)
+    {
+        u64 *tsp, delta, ip, *ipp;
+        u32 pid = bpf_get_current_pid_tgid();
+        // fetch timestamp and calculate delta
+        tsp = start.lookup(&pid);
+        ipp = iptr.lookup(&pid);
+        COMMON
+
+        // store as sum or histogram
+        irq_key_t key = {
+        STORE
+
+        start.delete(&pid);
+        iptr.delete(&pid);
+        return 0;
+    }
+    """
+
+# code substitutions
+bpf_text = bpf_text.replace('COMMON',
+        """if (tsp == 0) {
+            return 0;   // missed start
+        }
+        delta = bpf_ktime_get_ns() - *tsp;
+        """)
+
+if args.dist:
+    bpf_text = bpf_text.replace('STORE',
+        '.slot = bpf_log2l(delta)};' +
+        'dist.increment(key);')
+else:
+    bpf_text = bpf_text.replace('STORE',
+        ' .ip = ip, .slot = 0 /* ignore */};' +
+        'u64 zero = 0, *vp = dist.lookup_or_init(&key, &zero);' +
+        '(*vp) += delta;')
+if debug:
+    print(bpf_text)
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+# this should really use irq:softirq_entry/exit tracepoints; for now the
+# soft irq functions are individually traced (search your kernel for
+# open_softirq() calls, and adjust the following list as needed).
+for softirqfunc in ("blk_iopoll_softirq", "blk_done_softirq",
+        "rcu_process_callbacks", "run_rebalance_domains", "tasklet_action",
+        "tasklet_hi_action", "run_timer_softirq", "net_tx_action",
+        "net_rx_action"):
+    if args.bycpu:
+        b.attach_kprobe(event=softirqfunc, fn_name="trace_start_cpu")
+        b.attach_kretprobe(event=softirqfunc, fn_name="trace_completion_cpu")
+    else:
+        b.attach_kprobe(event=softirqfunc, fn_name="trace_start")
+        b.attach_kretprobe(event=softirqfunc, fn_name="trace_completion")
+
+print("Tracing soft irq event time... Hit Ctrl-C to end.")
+
+# output
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    if args.dist:
+        if args.bycpu:
+            dist.print_log2_hist(label, "CPU")
+        else:
+            dist.print_log2_hist(label, "softirq", section_print_fn=b.ksym)
+    else:
+        if args.bycpu:
+            print("%-26s %11s %11s" % ("SOFTIRQ", "CPU", "TOTAL_" + label))
+            for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
+                print("%-26s %11d %11d" % (b.ksym(k.ip), k.cpu, v.value / factor))
+        else:
+            print("%-26s %11s" % ("SOFTIRQ", "TOTAL_" + label))
+            for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
+                print("%-26s %11d" % (b.ksym(k.ip), v.value / factor))
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/old/stackcount.py b/tools/old/stackcount.py
new file mode 100755
index 0000000..108c800
--- /dev/null
+++ b/tools/old/stackcount.py
@@ -0,0 +1,176 @@
+#!/usr/bin/python
+#
+# stackcount    Count kernel function calls and their stack traces.
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] pattern
+#
+# The pattern is a string with optional '*' wildcards, similar to file
+# globbing. If you'd prefer to use regular expressions, use the -r option.
+#
+# The current implementation uses an unrolled loop for x86_64, and was written
+# as a proof of concept. This implementation should be replaced in the future
+# with an appropriate bpf_ call, when available.
+#
+# Currently limited to a stack trace depth of 11 (maxdepth + 1).
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Jan-2016	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+
+# arguments
+examples = """examples:
+    ./stackcount submit_bio       # count kernel stack traces for submit_bio
+    ./stackcount ip_output        # count kernel stack traces for ip_output
+    ./stackcount -s ip_output     # show symbol offsets
+    ./stackcount -sv ip_output    # show offsets and raw addresses (verbose)
+    ./stackcount 'tcp_send*'      # count stacks for funcs matching tcp_send*
+    ./stackcount -r '^tcp_send.*' # same as above, using regular expressions
+    ./stackcount -Ti 5 ip_output  # output every 5 seconds, with timestamps
+    ./stackcount -p 185 ip_output # count ip_output stacks for PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Count kernel function calls and their stack traces",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-i", "--interval", default=99999999,
+    help="summary interval, seconds")
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-r", "--regexp", action="store_true",
+    help="use regular expressions. Default is \"*\" wildcards only.")
+parser.add_argument("-s", "--offset", action="store_true",
+    help="show address offsets")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="show raw addresses")
+parser.add_argument("pattern",
+    help="search expression for kernel functions")
+args = parser.parse_args()
+pattern = args.pattern
+if not args.regexp:
+    pattern = pattern.replace('*', '.*')
+    pattern = '^' + pattern + '$'
+offset = args.offset
+verbose = args.verbose
+debug = 0
+maxdepth = 10    # and MAXDEPTH
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# load BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+#define MAXDEPTH	10
+
+struct key_t {
+    u64 ip;
+    u64 ret[MAXDEPTH];
+};
+BPF_HASH(counts, struct key_t);
+
+static u64 get_frame(u64 *bp) {
+    if (*bp) {
+        // The following stack walker is x86_64 specific
+        u64 ret = 0;
+        if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
+            return 0;
+        if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
+            *bp = 0;
+        if (ret < __START_KERNEL_map)
+            return 0;
+        return ret;
+    }
+    return 0;
+}
+
+int trace_count(struct pt_regs *ctx) {
+    FILTER
+    struct key_t key = {};
+    u64 zero = 0, *val, bp = 0;
+    int depth = 0;
+
+    key.ip = ctx->ip;
+    bp = ctx->bp;
+
+    // unrolled loop, 10 (MAXDEPTH) frames deep:
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+
+out:
+    val = counts.lookup_or_init(&key, &zero);
+    (*val)++;
+    return 0;
+}
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        ('u32 pid; pid = bpf_get_current_pid_tgid(); ' +
+        'if (pid != %s) { return 0; }') % (args.pid))
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+b = BPF(text=bpf_text)
+b.attach_kprobe(event_re=pattern, fn_name="trace_count")
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("0 functions matched by \"%s\". Exiting." % args.pattern)
+    exit()
+
+# header
+print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
+    (matched, args.pattern))
+
+def print_frame(addr):
+    print("  ", end="")
+    if verbose:
+        print("%-16x " % addr, end="")
+    print(b.ksym(addr, show_offset=offset))
+
+# output
+exiting = 0 if args.interval else 1
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+        # as cleanup can take many seconds, trap Ctrl-C:
+        signal.signal(signal.SIGINT, signal_ignore)
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    counts = b.get_table("counts")
+    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+        print_frame(k.ip)
+        for i in range(0, maxdepth):
+            if k.ret[i] == 0:
+                break
+            print_frame(k.ret[i])
+        print("    %d\n" % v.value)
+    counts.clear()
+
+    if exiting:
+        print("Detaching...")
+        exit()
diff --git a/tools/old/stacksnoop.py b/tools/old/stacksnoop.py
new file mode 100755
index 0000000..9fcc12b
--- /dev/null
+++ b/tools/old/stacksnoop.py
@@ -0,0 +1,127 @@
+#!/usr/bin/python
+#
+# stacksnoop    Trace a kernel function and print all kernel stack traces.
+#               For Linux, uses BCC, eBPF, and currently x86_64 only. Inline C.
+#
+# USAGE: stacksnoop [-h] [-p PID] [-s] [-v] function
+#
+# The current implementation uses an unrolled loop for x86_64, and was written
+# as a proof of concept. This implementation should be replaced in the future
+# with an appropriate bpf_ call, when available.
+#
+# The stack depth is limited to 10 (+1 for the current instruction pointer).
+# This could be tunable in a future version.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Jan-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+
+# arguments
+examples = """examples:
+    ./stacksnoop ext4_sync_fs    # print kernel stack traces for ext4_sync_fs
+    ./stacksnoop -s ext4_sync_fs    # ... also show symbol offsets
+    ./stacksnoop -v ext4_sync_fs    # ... show extra columns
+    ./stacksnoop -p 185 ext4_sync_fs    # ... only when PID 185 is on-CPU
+"""
+parser = argparse.ArgumentParser(
+    description="Trace and print kernel stack traces for a kernel function",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-s", "--offset", action="store_true",
+    help="show address offsets")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="print more fields")
+parser.add_argument("function",
+    help="kernel function name")
+args = parser.parse_args()
+function = args.function
+offset = args.offset
+verbose = args.verbose
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+static int print_frame(u64 *bp, int *depth) {
+    if (*bp) {
+        // The following stack walker is x86_64 specific
+        u64 ret = 0;
+        if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
+            return 0;
+        if (ret < __START_KERNEL_map)
+            return 0;
+        bpf_trace_printk("r%d: %llx\\n", *depth, ret);
+        if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
+            return 0;
+        *depth += 1;
+        return 1;
+    }
+    return 0;
+}
+
+void trace_stack(struct pt_regs *ctx) {
+    FILTER
+    u64 bp = 0;
+    int depth = 0;
+
+    bpf_trace_printk("\\n");
+    if (ctx->ip)
+        bpf_trace_printk("ip: %llx\\n", ctx->ip);
+    bp = ctx->bp;
+
+    // unrolled loop, 10 frames deep:
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+    if (!print_frame(&bp, &depth)) return;
+};
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        ('u32 pid; pid = bpf_get_current_pid_tgid(); ' +
+        'if (pid != %s) { return; }') % (args.pid))
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event=function, fn_name="trace_stack")
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("Function \"%s\" not found. Exiting." % function)
+    exit()
+
+# header
+if verbose:
+    print("%-18s %-12s %-6s %-3s %s" % ("TIME(s)", "COMM", "PID", "CPU",
+        "STACK"))
+else:
+    print("%-18s %s" % ("TIME(s)", "STACK"))
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    if msg != "":
+        (reg, addr) = msg.split(" ")
+        ip = b.ksym(int(addr, 16), show_offset=offset)
+        msg = msg + " " + ip
+    if verbose:
+        print("%-18.9f %-12.12s %-6d %-3d %s" % (ts, task, pid, cpu, msg))
+    else:
+        print("%-18.9f %s" % (ts, msg))
diff --git a/tools/old/statsnoop.py b/tools/old/statsnoop.py
new file mode 100755
index 0000000..ad54ac7
--- /dev/null
+++ b/tools/old/statsnoop.py
@@ -0,0 +1,118 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# statsnoop Trace stat() syscalls.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: statsnoop [-h] [-t] [-x] [-p PID]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 08-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+
+# arguments
+examples = """examples:
+    ./statsnoop           # trace all stat() syscalls
+    ./statsnoop -t        # include timestamps
+    ./statsnoop -x        # only show failed stats
+    ./statsnoop -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace stat() syscalls",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-x", "--failed", action="store_true",
+    help="only show failed stats")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+BPF_HASH(args_filename, u32, const char *);
+
+int trace_entry(struct pt_regs *ctx, const char __user *filename)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+
+    FILTER
+    args_filename.update(&pid, &filename);
+
+    return 0;
+};
+
+int trace_return(struct pt_regs *ctx)
+{
+    const char **filenamep;
+    int ret = ctx->ax;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    filenamep = args_filename.lookup(&pid);
+    if (filenamep == 0) {
+        // missed entry
+        return 0;
+    }
+
+    bpf_trace_printk("%d %s\\n", ret, *filenamep);
+    args_filename.delete(&pid);
+
+    return 0;
+}
+"""
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="sys_stat", fn_name="trace_entry")
+b.attach_kprobe(event="sys_statfs", fn_name="trace_entry")
+b.attach_kprobe(event="sys_newstat", fn_name="trace_entry")
+b.attach_kretprobe(event="sys_stat", fn_name="trace_return")
+b.attach_kretprobe(event="sys_statfs", fn_name="trace_return")
+b.attach_kretprobe(event="sys_newstat", fn_name="trace_return")
+
+# header
+if args.timestamp:
+    print("%-14s" % ("TIME(s)"), end="")
+print("%-6s %-16s %4s %3s %s" % ("PID", "COMM", "FD", "ERR", "PATH"))
+
+start_ts = 0
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    (ret_s, filename) = msg.split(" ", 1)
+
+    ret = int(ret_s)
+    if (args.failed and (ret >= 0)):
+        continue
+
+    # split return value into FD and errno columns
+    if ret >= 0:
+        fd_s = ret
+        err = 0
+    else:
+        fd_s = "-1"
+        err = - ret
+
+    # print columns
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = ts
+        print("%-14.9f" % (ts - start_ts), end="")
+    print("%-6d %-16s %4s %3s %s" % (pid, task, fd_s, err, filename))
diff --git a/tools/old/syncsnoop.py b/tools/old/syncsnoop.py
new file mode 100755
index 0000000..cae57ea
--- /dev/null
+++ b/tools/old/syncsnoop.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# syncsnoop Trace sync() syscall.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# Written as a basic example of BCC trace & reformat. See
+# examples/hello_world.py for a BCC trace with default output example.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Aug-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+
+# load BPF program
+b = BPF(text="""
+void kprobe__sys_sync(void *ctx) {
+    bpf_trace_printk("sync()\\n");
+};
+""")
+
+# header
+print("%-18s %s" % ("TIME(s)", "CALL"))
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    print("%-18.9f %s" % (ts, msg))
diff --git a/tools/old/tcpaccept.py b/tools/old/tcpaccept.py
new file mode 100755
index 0000000..8125eaa
--- /dev/null
+++ b/tools/old/tcpaccept.py
@@ -0,0 +1,134 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpaccept Trace TCP accept()s.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpaccept [-h] [-t] [-p PID]
+#
+# This uses dynamic tracing of the kernel inet_csk_accept() socket function
+# (from tcp_prot.accept), and will need to be modified to match kernel changes.
+#
+# IPv4 addresses are printed as dotted quads. For IPv6 addresses, the last four
+# bytes are printed after "..."; check for future versions with better IPv6
+# support.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Oct-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+
+# arguments
+examples = """examples:
+    ./tcpaccept           # trace all TCP accept()s
+    ./tcpaccept -t        # include timestamps
+    ./tcpaccept -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace TCP accepts",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+int kretprobe__inet_csk_accept(struct pt_regs *ctx)
+{
+    struct sock *newsk = (struct sock *)PT_REGS_RC(ctx);
+    u32 pid = bpf_get_current_pid_tgid();
+
+    if (newsk == NULL)
+        return 0;
+
+    // check this is TCP
+    u8 protocol = 0;
+    // workaround for reading the sk_protocol bitfield:
+    bpf_probe_read(&protocol, 1, (void *)((long)&newsk->sk_wmem_queued) - 3);
+    if (protocol != IPPROTO_TCP)
+        return 0;
+
+    // pull in details
+    u16 family = 0, lport = 0;
+    u32 saddr = 0, daddr = 0;
+    bpf_probe_read(&family, sizeof(family), &newsk->__sk_common.skc_family);
+    bpf_probe_read(&lport, sizeof(lport), &newsk->__sk_common.skc_num);
+    if (family == AF_INET) {
+        bpf_probe_read(&saddr, sizeof(saddr),
+            &newsk->__sk_common.skc_rcv_saddr);
+        bpf_probe_read(&daddr, sizeof(daddr),
+            &newsk->__sk_common.skc_daddr);
+
+        // output
+        bpf_trace_printk("4 %x %x %d\\n", daddr, saddr, lport);
+    } else if (family == AF_INET6) {
+        // just grab the last 4 bytes for now
+        bpf_probe_read(&saddr, sizeof(saddr),
+            &newsk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[3]);
+        bpf_probe_read(&daddr, sizeof(daddr),
+            &newsk->__sk_common.skc_v6_daddr.in6_u.u6_addr32[3]);
+
+        // output and flip byte order of addresses
+        bpf_trace_printk("6 %x %x %d\\n", bpf_ntohl(daddr),
+            bpf_ntohl(saddr), lport);
+    }
+    // else drop
+
+    return 0;
+}
+"""
+
+# code substitutions
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# header
+if args.timestamp:
+    print("%-9s" % ("TIME(s)"), end="")
+print("%-6s %-12s %-2s %-16s %-16s %-4s" % ("PID", "COMM", "IP", "RADDR",
+    "LADDR", "LPORT"))
+
+start_ts = 0
+
+def inet_ntoa(addr):
+    dq = ''
+    for i in range(0, 4):
+        dq = dq + str(addr & 0xff)
+        if (i != 3):
+            dq = dq + '.'
+        addr = addr >> 8
+    return dq
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    (ip_s, raddr_hs, laddr_hs, lport_s) = msg.split(" ")
+
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = ts
+        print("%-9.3f" % (ts - start_ts), end="")
+    print("%-6d %-12.12s %-2s %-16s %-16s %-4s" % (pid, task, ip_s,
+        inet_ntoa(int(raddr_hs, 16)) if ip_s == "4" else "..." + raddr_hs,
+        inet_ntoa(int(laddr_hs, 16)) if ip_s == "4" else "..." + laddr_hs,
+        lport_s))
diff --git a/tools/old/tcpconnect.py b/tools/old/tcpconnect.py
new file mode 100755
index 0000000..579a85f
--- /dev/null
+++ b/tools/old/tcpconnect.py
@@ -0,0 +1,158 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpconnect    Trace TCP connect()s.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpconnect [-h] [-t] [-p PID]
+#
+# All connection attempts are traced, even if they ultimately fail.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 25-Sep-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+
+# arguments
+examples = """examples:
+    ./tcpconnect           # trace all TCP connect()s
+    ./tcpconnect -t        # include timestamps
+    ./tcpconnect -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace TCP connects",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(currsock, u32, struct sock *);
+
+int trace_connect_entry(struct pt_regs *ctx, struct sock *sk)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+
+    // stash the sock ptr for lookup on return
+    currsock.update(&pid, &sk);
+
+    return 0;
+};
+
+static int trace_connect_return(struct pt_regs *ctx, short ipver)
+{
+    int ret = PT_REGS_RC(ctx);
+    u32 pid = bpf_get_current_pid_tgid();
+
+    struct sock **skpp;
+    skpp = currsock.lookup(&pid);
+    if (skpp == 0) {
+        return 0;   // missed entry
+    }
+
+    if (ret != 0) {
+        // failed to send SYN packet, may not have populated
+        // socket __sk_common.{skc_rcv_saddr, ...}
+        currsock.delete(&pid);
+        return 0;
+    }
+
+    // pull in details
+    struct sock *skp = *skpp;
+    u32 saddr = 0, daddr = 0;
+    u16 dport = 0;
+    dport = skp->__sk_common.skc_dport;
+    if (ipver == 4) {
+        saddr = skp->__sk_common.skc_rcv_saddr;
+        daddr = skp->__sk_common.skc_daddr;
+
+        // output
+        bpf_trace_printk("4 %x %x %d\\n", saddr, daddr, ntohs(dport));
+    } else /* 6 */ {
+        // just grab the last 4 bytes for now
+        bpf_probe_read(&saddr, sizeof(saddr),
+            &skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[3]);
+        bpf_probe_read(&daddr, sizeof(daddr),
+            &skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32[3]);
+
+        // output and flip byte order of addresses
+        bpf_trace_printk("6 %x %x %d\\n", bpf_ntohl(saddr),
+            bpf_ntohl(daddr), ntohs(dport));
+    }
+
+    currsock.delete(&pid);
+
+    return 0;
+}
+
+int trace_connect_v4_return(struct pt_regs *ctx)
+{
+    return trace_connect_return(ctx, 4);
+}
+
+int trace_connect_v6_return(struct pt_regs *ctx)
+{
+    return trace_connect_return(ctx, 6);
+}
+"""
+
+# code substitutions
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_entry")
+b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_entry")
+b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return")
+b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return")
+
+# header
+if args.timestamp:
+    print("%-9s" % ("TIME(s)"), end="")
+print("%-6s %-12s %-2s %-16s %-16s %-4s" % ("PID", "COMM", "IP", "SADDR",
+    "DADDR", "DPORT"))
+
+start_ts = 0
+
+def inet_ntoa(addr):
+    dq = ''
+    for i in range(0, 4):
+        dq = dq + str(addr & 0xff)
+        if (i != 3):
+            dq = dq + '.'
+        addr = addr >> 8
+    return dq
+
+# format output
+while 1:
+    (task, pid, cpu, flags, ts, msg) = b.trace_fields()
+    (ip_s, saddr_hs, daddr_hs, dport_s) = msg.split(" ")
+
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = ts
+        print("%-9.3f" % (ts - start_ts), end="")
+    print("%-6d %-12.12s %-2s %-16s %-16s %-4s" % (pid, task, ip_s,
+        inet_ntoa(int(saddr_hs, 16)) if ip_s == "4" else "..." + saddr_hs,
+        inet_ntoa(int(daddr_hs, 16)) if ip_s == "4" else "..." + daddr_hs,
+        dport_s))
diff --git a/tools/old/wakeuptime.py b/tools/old/wakeuptime.py
new file mode 100644
index 0000000..783c7ff
--- /dev/null
+++ b/tools/old/wakeuptime.py
@@ -0,0 +1,224 @@
+#!/usr/bin/python
+#
+# wakeuptime    Summarize sleep to wakeup time by waker kernel stack
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: wakeuptime [-h] [-u] [-p PID] [-v] [-f] [duration]
+#
+# The current implementation uses an unrolled loop for x86_64, and was written
+# as a proof of concept. This implementation should be replaced in the future
+# with an appropriate bpf_ call, when available.
+#
+# Currently limited to a stack trace depth of 21 (maxdepth + 1).
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Jan-2016	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import signal
+
+# arguments
+examples = """examples:
+    ./wakeuptime             # trace blocked time with waker stacks
+    ./wakeuptime 5           # trace for 5 seconds only
+    ./wakeuptime -f 5        # 5 seconds, and output in folded format
+    ./wakeuptime -u          # don't include kernel threads (user only)
+    ./wakeuptime -p 185      # trace for PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize sleep to wakeup time by waker kernel stack",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-u", "--useronly", action="store_true",
+    help="user threads only (no kernel threads)")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="show raw addresses")
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format")
+parser.add_argument("duration", nargs="?", default=99999999,
+    help="duration of trace, in seconds")
+args = parser.parse_args()
+folded = args.folded
+duration = int(args.duration)
+debug = 0
+maxdepth = 20    # and MAXDEPTH
+if args.pid and args.useronly:
+    print("ERROR: use either -p or -u.")
+    exit()
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#define MAXDEPTH	20
+#define MINBLOCK_US	1
+
+struct key_t {
+    char waker[TASK_COMM_LEN];
+    char target[TASK_COMM_LEN];
+    // Skip saving the ip
+    u64 ret[MAXDEPTH];
+};
+BPF_HASH(counts, struct key_t);
+BPF_HASH(start, u32);
+
+static u64 get_frame(u64 *bp) {
+    if (*bp) {
+        // The following stack walker is x86_64/arm64 specific
+        u64 ret = 0;
+        if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
+            return 0;
+        if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
+            return 0;
+#ifdef __x86_64__
+        if (ret < __START_KERNEL_map)
+#elif __aarch64__
+        if (ret < VA_START)
+#else
+#error "Unsupported architecture for stack walker"
+#endif
+            return 0;
+        return ret;
+    }
+    return 0;
+}
+
+int offcpu(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    u64 ts = bpf_ktime_get_ns();
+    // XXX: should filter here too, but need task_struct
+    start.update(&pid, &ts);
+    return 0;
+}
+
+int waker(struct pt_regs *ctx, struct task_struct *p) {
+    u32 pid = p->pid;
+    u64 delta, *tsp, ts;
+
+    tsp = start.lookup(&pid);
+    if (tsp == 0)
+        return 0;        // missed start
+    start.delete(&pid);
+
+    if (FILTER)
+        return 0;
+
+    // calculate delta time
+    delta = bpf_ktime_get_ns() - *tsp;
+    delta = delta / 1000;
+    if (delta < MINBLOCK_US)
+        return 0;
+
+    struct key_t key = {};
+    u64 zero = 0, *val, bp = 0;
+    int depth = 0;
+
+    bpf_probe_read(&key.target, sizeof(key.target), p->comm);
+    bpf_get_current_comm(&key.waker, sizeof(key.waker));
+    bp = PT_REGS_FP(ctx);
+
+    // unrolled loop (MAXDEPTH):
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+    if (!(key.ret[depth++] = get_frame(&bp))) goto out;
+
+out:
+    val = counts.lookup_or_init(&key, &zero);
+    (*val) += delta;
+    return 0;
+}
+"""
+if args.pid:
+    filter = 'pid != %s' % args.pid
+elif args.useronly:
+    filter = 'p->flags & PF_KTHREAD'
+else:
+    filter = '0'
+bpf_text = bpf_text.replace('FILTER', filter)
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="schedule", fn_name="offcpu")
+b.attach_kprobe(event="try_to_wake_up", fn_name="waker")
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("0 functions traced. Exiting.")
+    exit()
+
+# header
+if not folded:
+    print("Tracing blocked time (us) by kernel stack", end="")
+    if duration < 99999999:
+        print(" for %d secs." % duration)
+    else:
+        print("... Hit Ctrl-C to end.")
+
+# output
+while (1):
+    try:
+        sleep(duration)
+    except KeyboardInterrupt:
+        # as cleanup can take many seconds, trap Ctrl-C:
+        signal.signal(signal.SIGINT, signal_ignore)
+
+    if not folded:
+        print()
+    counts = b.get_table("counts")
+    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+        if folded:
+            # print folded stack output
+            line = k.waker.decode('utf-8', 'replace') + ";"
+            for i in reversed(range(0, maxdepth)):
+                if k.ret[i] == 0:
+                    continue
+                line = line + b.ksym(k.ret[i])
+                if i != 0:
+                    line = line + ";"
+            print("%s;%s %d" % (line, k.target.decode('utf-8', 'replace'), v.value))
+        else:
+            # print default multi-line stack output
+            print("    %-16s %s" % ("target:", k.target.decode('utf-8', 'replace')))
+            for i in range(0, maxdepth):
+                if k.ret[i] == 0:
+                    break
+                print("    %-16x %s" % (k.ret[i],
+                    b.ksym(k.ret[i])))
+            print("    %-16s %s" % ("waker:", k.waker.decode('utf-8', 'replace')))
+            print("        %d\n" % v.value)
+    counts.clear()
+
+    if not folded:
+        print("Detaching...")
+    exit()
diff --git a/tools/oomkill.py b/tools/oomkill.py
new file mode 100755
index 0000000..0677e49
--- /dev/null
+++ b/tools/oomkill.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+#
+# oomkill   Trace oom_kill_process(). For Linux, uses BCC, eBPF.
+#
+# This traces the kernel out-of-memory killer, and prints basic details,
+# including the system load averages. This can provide more context on the
+# system state at the time of OOM: was it getting busier or steady, based
+# on the load averages? This tool may also be useful to customize for
+# investigations; for example, by adding other task_struct details at the time
+# of OOM.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 09-Feb-2016   Brendan Gregg   Created this.
+
+from bcc import BPF
+from time import strftime
+import ctypes as ct
+
+# linux stats
+loadavg = "/proc/loadavg"
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/oom.h>
+
+struct data_t {
+    u64 fpid;
+    u64 tpid;
+    u64 pages;
+    char fcomm[TASK_COMM_LEN];
+    char tcomm[TASK_COMM_LEN];
+};
+
+BPF_PERF_OUTPUT(events);
+
+void kprobe__oom_kill_process(struct pt_regs *ctx, struct oom_control *oc, const char *message)
+{
+    unsigned long totalpages;
+    struct task_struct *p = oc->chosen;
+    struct data_t data = {};
+    u32 pid = bpf_get_current_pid_tgid();
+    data.fpid = pid;
+    data.tpid = p->pid;
+    data.pages = oc->totalpages;
+    bpf_get_current_comm(&data.fcomm, sizeof(data.fcomm));
+    bpf_probe_read(&data.tcomm, sizeof(data.tcomm), p->comm);
+    events.perf_submit(ctx, &data, sizeof(data));
+}
+"""
+
+# kernel->user event data: struct data_t
+TASK_COMM_LEN = 16  # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [
+        ("fpid", ct.c_ulonglong),
+        ("tpid", ct.c_ulonglong),
+        ("pages", ct.c_ulonglong),
+        ("fcomm", ct.c_char * TASK_COMM_LEN),
+        ("tcomm", ct.c_char * TASK_COMM_LEN)
+    ]
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    with open(loadavg) as stats:
+        avgline = stats.read().rstrip()
+    print(("%s Triggered by PID %d (\"%s\"), OOM kill of PID %d (\"%s\")"
+        ", %d pages, loadavg: %s") % (strftime("%H:%M:%S"), event.fpid,
+        event.fcomm.decode('utf-8', 'replace'), event.tpid,
+        event.tcomm.decode('utf-8', 'replace'), event.pages, avgline))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+print("Tracing OOM kills... Ctrl-C to stop.")
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/oomkill_example.txt b/tools/oomkill_example.txt
new file mode 100644
index 0000000..ceeb1b7
--- /dev/null
+++ b/tools/oomkill_example.txt
@@ -0,0 +1,39 @@
+Demonstrations of oomkill, the Linux eBPF/bcc version.
+
+
+oomkill is a simple program that traces the Linux out-of-memory (OOM) killer,
+and shows basic details on one line per OOM kill:
+
+# ./oomkill
+Tracing OOM kills... Ctrl-C to stop.
+21:03:39 Triggered by PID 3297 ("ntpd"), OOM kill of PID 22516 ("perl"), 3850642 pages, loadavg: 0.99 0.39 0.30 3/282 22724
+21:03:48 Triggered by PID 22517 ("perl"), OOM kill of PID 22517 ("perl"), 3850642 pages, loadavg: 0.99 0.41 0.30 2/282 22932
+
+The first line shows that PID 22516, with process name "perl", was OOM killed
+when it reached 3850642 pages (usually 4 Kbytes per page). This OOM kill
+happened to be triggered by PID 3297, process name "ntpd", doing some memory
+allocation.
+
+The system log (dmesg) shows pages of details and system context about an OOM
+kill. What it currently lacks, however, is context on how the system had been
+changing over time. I've seen OOM kills where I wanted to know if the system
+was at steady state at the time, or if there had been a recent increase in
+workload that triggered the OOM event. oomkill provides some context: at the
+end of the line is the load average information from /proc/loadavg. For both
+of the oomkills here, we can see that the system was getting busier at the
+time (a higher 1 minute "average" of 0.99, compared to the 15 minute "average"
+of 0.30).
+
+oomkill can also be the basis of other tools and customizations. For example,
+you can edit it to include other task_struct details from the target PID at
+the time of the OOM kill.
+
+
+The following commands can be used to test this program, and invoke a memory
+consuming process that exhausts system memory and is OOM killed:
+
+sysctl -w vm.overcommit_memory=1              # always overcommit
+perl -e 'while (1) { $a .= "A" x 1024; }'     # eat all memory
+
+WARNING: This exhausts system memory after disabling some overcommit checks.
+Only test in a lab environment.
diff --git a/tools/opensnoop.py b/tools/opensnoop.py
new file mode 100755
index 0000000..418d47b
--- /dev/null
+++ b/tools/opensnoop.py
@@ -0,0 +1,194 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# opensnoop Trace open() syscalls.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: opensnoop [-h] [-T] [-x] [-p PID] [-d DURATION] [-t TID] [-n NAME]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 17-Sep-2015   Brendan Gregg   Created this.
+# 29-Apr-2016   Allan McAleavy  Updated for BPF_PERF_OUTPUT.
+# 08-Oct-2016   Dina Goldshtein Support filtering by PID and TID.
+
+from __future__ import print_function
+from bcc import ArgString, BPF
+import argparse
+import ctypes as ct
+from datetime import datetime, timedelta
+
+# arguments
+examples = """examples:
+    ./opensnoop           # trace all open() syscalls
+    ./opensnoop -T        # include timestamps
+    ./opensnoop -x        # only show failed opens
+    ./opensnoop -p 181    # only trace PID 181
+    ./opensnoop -t 123    # only trace TID 123
+    ./opensnoop -d 10     # trace for 10 seconds only
+    ./opensnoop -n main   # only print process names containing "main"
+"""
+parser = argparse.ArgumentParser(
+    description="Trace open() syscalls",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-x", "--failed", action="store_true",
+    help="only show failed opens")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-t", "--tid",
+    help="trace this TID only")
+parser.add_argument("-d", "--duration",
+    help="total duration of trace in seconds")
+parser.add_argument("-n", "--name",
+    type=ArgString,
+    help="only print process names containing this name")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+if args.duration:
+    args.duration = timedelta(seconds=int(args.duration))
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/limits.h>
+#include <linux/sched.h>
+
+struct val_t {
+    u64 id;
+    char comm[TASK_COMM_LEN];
+    const char *fname;
+};
+
+struct data_t {
+    u64 id;
+    u64 ts;
+    int ret;
+    char comm[TASK_COMM_LEN];
+    char fname[NAME_MAX];
+};
+
+BPF_HASH(infotmp, u64, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+int trace_entry(struct pt_regs *ctx, int dfd, const char __user *filename)
+{
+    struct val_t val = {};
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+    u32 tid = id;       // Cast and get the lower part
+
+    FILTER
+    if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) == 0) {
+        val.id = id;
+        val.fname = filename;
+        infotmp.update(&id, &val);
+    }
+
+    return 0;
+};
+
+int trace_return(struct pt_regs *ctx)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    struct val_t *valp;
+    struct data_t data = {};
+
+    u64 tsp = bpf_ktime_get_ns();
+
+    valp = infotmp.lookup(&id);
+    if (valp == 0) {
+        // missed entry
+        return 0;
+    }
+    bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
+    bpf_probe_read(&data.fname, sizeof(data.fname), (void *)valp->fname);
+    data.id = valp->id;
+    data.ts = tsp / 1000;
+    data.ret = PT_REGS_RC(ctx);
+
+    events.perf_submit(ctx, &data, sizeof(data));
+    infotmp.delete(&id);
+
+    return 0;
+}
+"""
+if args.tid:  # TID trumps PID
+    bpf_text = bpf_text.replace('FILTER',
+        'if (tid != %s) { return 0; }' % args.tid)
+elif args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="do_sys_open", fn_name="trace_entry")
+b.attach_kretprobe(event="do_sys_open", fn_name="trace_return")
+
+TASK_COMM_LEN = 16    # linux/sched.h
+NAME_MAX = 255        # linux/limits.h
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("id", ct.c_ulonglong),
+        ("ts", ct.c_ulonglong),
+        ("ret", ct.c_int),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+        ("fname", ct.c_char * NAME_MAX)
+    ]
+
+initial_ts = 0
+
+# header
+if args.timestamp:
+    print("%-14s" % ("TIME(s)"), end="")
+print("%-6s %-16s %4s %3s %s" %
+      ("TID" if args.tid else "PID", "COMM", "FD", "ERR", "PATH"))
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    global initial_ts
+
+    # split return value into FD and errno columns
+    if event.ret >= 0:
+        fd_s = event.ret
+        err = 0
+    else:
+        fd_s = -1
+        err = - event.ret
+
+    if not initial_ts:
+        initial_ts = event.ts
+
+    if args.failed and (event.ret >= 0):
+        return
+
+    if args.name and bytes(args.name) not in event.comm:
+        return
+
+    if args.timestamp:
+        delta = event.ts - initial_ts
+        print("%-14.9f" % (float(delta) / 1000000), end="")
+
+    print("%-6d %-16s %4d %3d %s" %
+          (event.id & 0xffffffff if args.tid else event.id >> 32,
+           event.comm.decode('utf-8', 'replace'), fd_s, err,
+           event.fname.decode('utf-8', 'replace')))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+start_time = datetime.now()
+while not args.duration or datetime.now() - start_time < args.duration:
+    b.perf_buffer_poll()
diff --git a/tools/opensnoop_example.txt b/tools/opensnoop_example.txt
new file mode 100644
index 0000000..1d00f12
--- /dev/null
+++ b/tools/opensnoop_example.txt
@@ -0,0 +1,159 @@
+Demonstrations of opensnoop, the Linux eBPF/bcc version.
+
+
+opensnoop traces the open() syscall system-wide, and prints various details.
+Example output:
+
+# ./opensnoop
+PID    COMM      FD ERR PATH
+17326  <...>      7   0 /sys/kernel/debug/tracing/trace_pipe
+1576   snmpd      9   0 /proc/net/dev
+1576   snmpd     11   0 /proc/net/if_inet6
+1576   snmpd     11   0 /proc/sys/net/ipv4/neigh/eth0/retrans_time_ms
+1576   snmpd     11   0 /proc/sys/net/ipv6/neigh/eth0/retrans_time_ms
+1576   snmpd     11   0 /proc/sys/net/ipv6/conf/eth0/forwarding
+1576   snmpd     11   0 /proc/sys/net/ipv6/neigh/eth0/base_reachable_time_ms
+1576   snmpd     11   0 /proc/sys/net/ipv4/neigh/lo/retrans_time_ms
+1576   snmpd     11   0 /proc/sys/net/ipv6/neigh/lo/retrans_time_ms
+1576   snmpd     11   0 /proc/sys/net/ipv6/conf/lo/forwarding
+1576   snmpd     11   0 /proc/sys/net/ipv6/neigh/lo/base_reachable_time_ms
+1576   snmpd      9   0 /proc/diskstats
+1576   snmpd      9   0 /proc/stat
+1576   snmpd      9   0 /proc/vmstat
+1956   supervise  9   0 supervise/status.new
+1956   supervise  9   0 supervise/status.new
+17358  run        3   0 /etc/ld.so.cache
+17358  run        3   0 /lib/x86_64-linux-gnu/libtinfo.so.5
+17358  run        3   0 /lib/x86_64-linux-gnu/libdl.so.2
+17358  run        3   0 /lib/x86_64-linux-gnu/libc.so.6
+17358  run       -1   6 /dev/tty
+17358  run        3   0 /proc/meminfo
+17358  run        3   0 /etc/nsswitch.conf
+17358  run        3   0 /etc/ld.so.cache
+17358  run        3   0 /lib/x86_64-linux-gnu/libnss_compat.so.2
+17358  run        3   0 /lib/x86_64-linux-gnu/libnsl.so.1
+17358  run        3   0 /etc/ld.so.cache
+17358  run        3   0 /lib/x86_64-linux-gnu/libnss_nis.so.2
+17358  run        3   0 /lib/x86_64-linux-gnu/libnss_files.so.2
+17358  run        3   0 /etc/passwd
+17358  run        3   0 ./run
+^C
+
+While tracing, the snmpd process opened various /proc files (reading metrics),
+and a "run" process read various libraries and config files (looks like it
+was starting up: a new process).
+
+opensnoop can be useful for discovering configuration and log files, if used
+during application startup.
+
+
+The -p option can be used to filter on a PID, which is filtered in-kernel. Here
+I've used it with -T to print timestamps:
+
+# ./opensnoop -Tp 1956
+TIME(s)       PID    COMM               FD ERR PATH
+0.000000000   1956   supervise           9   0 supervise/status.new
+0.000289999   1956   supervise           9   0 supervise/status.new
+1.023068000   1956   supervise           9   0 supervise/status.new
+1.023381997   1956   supervise           9   0 supervise/status.new
+2.046030000   1956   supervise           9   0 supervise/status.new
+2.046363000   1956   supervise           9   0 supervise/status.new
+3.068203997   1956   supervise           9   0 supervise/status.new
+3.068544999   1956   supervise           9   0 supervise/status.new
+
+This shows the supervise process is opening the status.new file twice every
+second.
+
+
+The -x option only prints failed opens:
+
+# ./opensnoop -x
+PID    COMM      FD ERR PATH
+18372  run       -1   6 /dev/tty
+18373  run       -1   6 /dev/tty
+18373  multilog  -1  13 lock
+18372  multilog  -1  13 lock
+18384  df        -1   2 /usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo
+18384  df        -1   2 /usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo
+18384  df        -1   2 /usr/share/locale/en_US/LC_MESSAGES/coreutils.mo
+18384  df        -1   2 /usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo
+18384  df        -1   2 /usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo
+18384  df        -1   2 /usr/share/locale/en/LC_MESSAGES/coreutils.mo
+18385  run       -1   6 /dev/tty
+18386  run       -1   6 /dev/tty
+
+This caught a df command failing to open a coreutils.mo file, and trying from
+different directories.
+
+The ERR column is the system error number. Error number 2 is ENOENT: no such
+file or directory.
+
+
+A maximum tracing duration can be set with the -d option. For example, to trace
+for 2 seconds:
+
+# ./opensnoop -d 2
+PID    COMM               FD ERR PATH
+2191   indicator-multi    11   0 /sys/block
+2191   indicator-multi    11   0 /sys/block
+2191   indicator-multi    11   0 /sys/block
+2191   indicator-multi    11   0 /sys/block
+2191   indicator-multi    11   0 /sys/block
+
+
+The -n option can be used to filter on process name using partial matches:
+
+# ./opensnoop -n ed
+
+PID    COMM               FD ERR PATH
+2679   sed                 3   0 /etc/ld.so.cache
+2679   sed                 3   0 /lib/x86_64-linux-gnu/libselinux.so.1
+2679   sed                 3   0 /lib/x86_64-linux-gnu/libc.so.6
+2679   sed                 3   0 /lib/x86_64-linux-gnu/libpcre.so.3
+2679   sed                 3   0 /lib/x86_64-linux-gnu/libdl.so.2
+2679   sed                 3   0 /lib/x86_64-linux-gnu/libpthread.so.0
+2679   sed                 3   0 /proc/filesystems
+2679   sed                 3   0 /usr/lib/locale/locale-archive
+2679   sed                -1   2
+2679   sed                 3   0 /usr/lib/x86_64-linux-gnu/gconv/gconv-modules.cache
+2679   sed                 3   0 /dev/null
+2680   sed                 3   0 /etc/ld.so.cache
+2680   sed                 3   0 /lib/x86_64-linux-gnu/libselinux.so.1
+2680   sed                 3   0 /lib/x86_64-linux-gnu/libc.so.6
+2680   sed                 3   0 /lib/x86_64-linux-gnu/libpcre.so.3
+2680   sed                 3   0 /lib/x86_64-linux-gnu/libdl.so.2
+2680   sed                 3   0 /lib/x86_64-linux-gnu/libpthread.so.0
+2680   sed                 3   0 /proc/filesystems
+2680   sed                 3   0 /usr/lib/locale/locale-archive
+2680   sed                -1   2
+^C
+
+This caught the 'sed' command because it partially matches 'ed' that's passed
+to the '-n' option.
+
+
+USAGE message:
+
+# ./opensnoop -h
+usage: opensnoop [-h] [-T] [-x] [-p PID] [-t TID] [-d DURATION] [-n NAME]
+
+Trace open() syscalls
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -T, --timestamp       include timestamp on output
+  -x, --failed          only show failed opens
+  -p PID, --pid PID     trace this PID only
+  -t TID, --tid TID     trace this TID only
+  -d DURATION, --duration DURATION
+                        total duration of trace in seconds
+  -n NAME, --name NAME  only print process names containing this name
+
+examples:
+    ./opensnoop           # trace all open() syscalls
+    ./opensnoop -T        # include timestamps
+    ./opensnoop -x        # only show failed opens
+    ./opensnoop -p 181    # only trace PID 181
+    ./opensnoop -t 123    # only trace TID 123
+    ./opensnoop -d 10     # trace for 10 seconds only
+    ./opensnoop -n main   # only print process names containing "main"
diff --git a/tools/perlcalls.sh b/tools/perlcalls.sh
new file mode 100755
index 0000000..74c6b03
--- /dev/null
+++ b/tools/perlcalls.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ucalls.py -l perl "$@"
diff --git a/tools/perlcalls_example.txt b/tools/perlcalls_example.txt
new file mode 120000
index 0000000..22b0fb3
--- /dev/null
+++ b/tools/perlcalls_example.txt
@@ -0,0 +1 @@
+lib/ucalls_example.txt
\ No newline at end of file
diff --git a/tools/perlflow.sh b/tools/perlflow.sh
new file mode 100755
index 0000000..4fd2397
--- /dev/null
+++ b/tools/perlflow.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/uflow.py -l perl "$@"
diff --git a/tools/perlflow_example.txt b/tools/perlflow_example.txt
new file mode 120000
index 0000000..bc71efc
--- /dev/null
+++ b/tools/perlflow_example.txt
@@ -0,0 +1 @@
+lib/uflow_example.txt
\ No newline at end of file
diff --git a/tools/perlstat.sh b/tools/perlstat.sh
new file mode 100755
index 0000000..4bb417f
--- /dev/null
+++ b/tools/perlstat.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ustat.py -l perl "$@"
diff --git a/tools/perlstat_example.txt b/tools/perlstat_example.txt
new file mode 120000
index 0000000..544e5ad
--- /dev/null
+++ b/tools/perlstat_example.txt
@@ -0,0 +1 @@
+lib/ustat_example.txt
\ No newline at end of file
diff --git a/tools/phpcalls.sh b/tools/phpcalls.sh
new file mode 100755
index 0000000..726ffcf
--- /dev/null
+++ b/tools/phpcalls.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ucalls.py -l php "$@"
diff --git a/tools/phpcalls_example.txt b/tools/phpcalls_example.txt
new file mode 120000
index 0000000..22b0fb3
--- /dev/null
+++ b/tools/phpcalls_example.txt
@@ -0,0 +1 @@
+lib/ucalls_example.txt
\ No newline at end of file
diff --git a/tools/phpflow.sh b/tools/phpflow.sh
new file mode 100755
index 0000000..5eb83f3
--- /dev/null
+++ b/tools/phpflow.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/uflow.py -l php "$@"
diff --git a/tools/phpflow_example.txt b/tools/phpflow_example.txt
new file mode 120000
index 0000000..bc71efc
--- /dev/null
+++ b/tools/phpflow_example.txt
@@ -0,0 +1 @@
+lib/uflow_example.txt
\ No newline at end of file
diff --git a/tools/phpstat.sh b/tools/phpstat.sh
new file mode 100755
index 0000000..455b308
--- /dev/null
+++ b/tools/phpstat.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ustat.py -l php "$@"
diff --git a/tools/phpstat_example.txt b/tools/phpstat_example.txt
new file mode 120000
index 0000000..544e5ad
--- /dev/null
+++ b/tools/phpstat_example.txt
@@ -0,0 +1 @@
+lib/ustat_example.txt
\ No newline at end of file
diff --git a/tools/pidpersec.py b/tools/pidpersec.py
new file mode 100755
index 0000000..c449004
--- /dev/null
+++ b/tools/pidpersec.py
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# pidpersec Count new processes (via fork).
+#           For Linux, uses BCC, eBPF. See .c file.
+#
+# USAGE: pidpersec
+#
+# Written as a basic example of counting an event.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 11-Aug-2015   Brendan Gregg   Created this.
+
+from bcc import BPF
+from ctypes import c_int
+from time import sleep, strftime
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+
+enum stat_types {
+    S_COUNT = 1,
+    S_MAXSTAT
+};
+
+BPF_ARRAY(stats, u64, S_MAXSTAT);
+
+static void stats_increment(int key) {
+    u64 *leaf = stats.lookup(&key);
+    if (leaf) (*leaf)++;
+}
+
+void do_count(struct pt_regs *ctx) { stats_increment(S_COUNT); }
+""")
+b.attach_kprobe(event="sched_fork", fn_name="do_count")
+
+# stat indexes
+S_COUNT = c_int(1)
+
+# header
+print("Tracing... Ctrl-C to end.")
+
+# output
+while (1):
+    try:
+        sleep(1)
+    except KeyboardInterrupt:
+        exit()
+
+    print("%s: PIDs/sec: %d" % (strftime("%H:%M:%S"),
+        b["stats"][S_COUNT].value))
+    b["stats"].clear()
diff --git a/tools/pidpersec_example.txt b/tools/pidpersec_example.txt
new file mode 100644
index 0000000..b274dd4
--- /dev/null
+++ b/tools/pidpersec_example.txt
@@ -0,0 +1,22 @@
+Demonstrations of pidpersec, the Linux eBPF/bcc version.
+
+
+This shows the number of new processes created per second, measured by tracing
+the kernel fork() routine:
+
+# ./pidpersec
+Tracing... Ctrl-C to end.
+18:33:06: PIDs/sec: 4
+18:33:07: PIDs/sec: 5
+18:33:08: PIDs/sec: 4
+18:33:09: PIDs/sec: 4
+18:33:10: PIDs/sec: 21
+18:33:11: PIDs/sec: 5
+18:33:12: PIDs/sec: 4
+18:33:13: PIDs/sec: 4
+
+Each second there are four new processes (this happens to be caused by a
+launcher script that is retrying in a loop, and encountering errors).
+
+At 18:33:10, I typed "man ls" in another server session, which caused an
+increase in the number of new processes as the necessary commands were run.
diff --git a/tools/profile.py b/tools/profile.py
new file mode 100755
index 0000000..d1d3d26
--- /dev/null
+++ b/tools/profile.py
@@ -0,0 +1,349 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# profile  Profile CPU usage by sampling stack traces at a timed interval.
+#          For Linux, uses BCC, BPF, perf_events. Embedded C.
+#
+# This is an efficient profiler, as stack traces are frequency counted in
+# kernel context, rather than passing every stack to user space for frequency
+# counting there. Only the unique stacks and counts are passed to user space
+# at the end of the profile, greatly reducing the kernel<->user transfer.
+#
+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
+# a version of this tool that may work on Linux 4.6 - 4.8.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# THANKS: Alexei Starovoitov, who added proper BPF profiling support to Linux;
+# Sasha Goldshtein, Andrew Birchall, and Evgeny Vereshchagin, who wrote much
+# of the code here, borrowed from tracepoint.py and offcputime.py; and
+# Teng Qin, who added perf support in bcc.
+#
+# 15-Jul-2016   Brendan Gregg   Created this.
+# 20-Oct-2016      "      "     Switched to use the new 4.9 support.
+
+from __future__ import print_function
+from bcc import BPF, PerfType, PerfSWConfig
+from sys import stderr
+from time import sleep
+import argparse
+import signal
+import os
+import errno
+import multiprocessing
+import ctypes as ct
+
+#
+# Process Arguments
+#
+
+# arg validation
+def positive_int(val):
+    try:
+        ival = int(val)
+    except ValueError:
+        raise argparse.ArgumentTypeError("must be an integer")
+
+    if ival < 0:
+        raise argparse.ArgumentTypeError("must be positive")
+    return ival
+
+def positive_nonzero_int(val):
+    ival = positive_int(val)
+    if ival == 0:
+        raise argparse.ArgumentTypeError("must be nonzero")
+    return ival
+
+def stack_id_err(stack_id):
+    # -EFAULT in get_stackid normally means the stack trace is not available,
+    # such as when requesting a kernel stack trace for userspace code
+    return (stack_id < 0) and (stack_id != -errno.EFAULT)
+
+# arguments
+examples = """examples:
+    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
+    ./profile -F 99       # profile stack traces at 99 Hertz
+    ./profile -c 1000000  # profile stack traces every 1 in a million events
+    ./profile 5           # profile at 49 Hertz for 5 seconds only
+    ./profile -f 5        # output in folded format for flame graphs
+    ./profile -p 185      # only profile threads for PID 185
+    ./profile -U          # only show user space stacks (no kernel)
+    ./profile -K          # only show kernel space stacks (no user)
+"""
+parser = argparse.ArgumentParser(
+    description="Profile CPU stack traces at a timed interval",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+thread_group = parser.add_mutually_exclusive_group()
+thread_group.add_argument("-p", "--pid", type=positive_int,
+    help="profile this PID only")
+# TODO: add options for user/kernel threads only
+stack_group = parser.add_mutually_exclusive_group()
+stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
+    help="show stacks from user space only (no kernel space stacks)")
+stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
+    help="show stacks from kernel space only (no user space stacks)")
+sample_group = parser.add_mutually_exclusive_group()
+sample_group.add_argument("-F", "--frequency", type=positive_int,
+    help="sample frequency, Hertz")
+sample_group.add_argument("-c", "--count", type=positive_int,
+    help="sample period, number of events")
+parser.add_argument("-d", "--delimited", action="store_true",
+    help="insert delimiter between kernel/user stacks")
+parser.add_argument("-a", "--annotations", action="store_true",
+    help="add _[k] annotations to kernel frames")
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format, one line per stack (for flame graphs)")
+parser.add_argument("--stack-storage-size", default=16384,
+    type=positive_nonzero_int,
+    help="the number of unique stack traces that can be stored and "
+        "displayed (default %(default)s)")
+parser.add_argument("duration", nargs="?", default=99999999,
+    type=positive_nonzero_int,
+    help="duration of trace, in seconds")
+parser.add_argument("-C", "--cpu", type=int, default=-1,
+    help="cpu number to run profile on")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+
+# option logic
+args = parser.parse_args()
+pid = int(args.pid) if args.pid is not None else -1
+duration = int(args.duration)
+debug = 0
+need_delimiter = args.delimited and not (args.kernel_stacks_only or
+    args.user_stacks_only)
+# TODO: add stack depth, and interval
+
+#
+# Setup BPF
+#
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <linux/sched.h>
+
+struct key_t {
+    u32 pid;
+    u64 kernel_ip;
+    u64 kernel_ret_ip;
+    int user_stack_id;
+    int kernel_stack_id;
+    char name[TASK_COMM_LEN];
+};
+BPF_HASH(counts, struct key_t);
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
+
+// This code gets a bit complex. Probably not suitable for casual hacking.
+
+int do_perf_event(struct bpf_perf_event_data *ctx) {
+    u32 pid = bpf_get_current_pid_tgid() >> 32;
+    if (!(THREAD_FILTER))
+        return 0;
+
+    // create map key
+    struct key_t key = {.pid = pid};
+    bpf_get_current_comm(&key.name, sizeof(key.name));
+
+    // get stacks
+    key.user_stack_id = USER_STACK_GET;
+    key.kernel_stack_id = KERNEL_STACK_GET;
+
+    if (key.kernel_stack_id >= 0) {
+        // populate extras to fix the kernel stack
+        u64 ip = PT_REGS_IP(&ctx->regs);
+        u64 page_offset;
+
+        // if ip isn't sane, leave key ips as zero for later checking
+#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
+        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
+        page_offset = __PAGE_OFFSET_BASE;
+#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
+        // x64, 4.17, and later
+#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
+        page_offset = __PAGE_OFFSET_BASE_L5;
+#else
+        page_offset = __PAGE_OFFSET_BASE_L4;
+#endif
+#else
+        // earlier x86_64 kernels, e.g., 4.6, comes here
+        // arm64, s390, powerpc, x86_32
+        page_offset = PAGE_OFFSET;
+#endif
+
+        if (ip > page_offset) {
+            key.kernel_ip = ip;
+        }
+    }
+
+    counts.increment(key);
+    return 0;
+}
+"""
+
+# set thread filter
+thread_context = ""
+perf_filter = "-a"
+if args.pid is not None:
+    thread_context = "PID %s" % args.pid
+    thread_filter = 'pid == %s' % args.pid
+    perf_filter = '-p %s' % args.pid
+else:
+    thread_context = "all threads"
+    thread_filter = '1'
+bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
+
+# set stack storage size
+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
+
+# handle stack args
+kernel_stack_get = "stack_traces.get_stackid(&ctx->regs, 0)"
+user_stack_get = "stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK)"
+stack_context = ""
+if args.user_stacks_only:
+    stack_context = "user"
+    kernel_stack_get = "-1"
+elif args.kernel_stacks_only:
+    stack_context = "kernel"
+    user_stack_get = "-1"
+else:
+    stack_context = "user + kernel"
+bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
+bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)
+
+sample_freq = 0
+sample_period = 0
+if args.frequency:
+    sample_freq = args.frequency
+elif args.count:
+    sample_period = args.count
+else:
+    # If user didn't specify anything, use default 49Hz sampling
+    sample_freq = 49
+sample_context = "%s%d %s" % (("", sample_freq, "Hertz") if sample_freq
+                         else ("every ", sample_period, "events"))
+
+# header
+if not args.folded:
+    print("Sampling at %s of %s by %s stack" %
+        (sample_context, thread_context, stack_context), end="")
+    if args.cpu >= 0:
+        print(" on CPU#{}".format(args.cpu), end="")
+    if duration < 99999999:
+        print(" for %d secs." % duration)
+    else:
+        print("... Hit Ctrl-C to end.")
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF & perf_events
+b = BPF(text=bpf_text)
+b.attach_perf_event(ev_type=PerfType.SOFTWARE,
+    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
+    sample_period=sample_period, sample_freq=sample_freq, cpu=args.cpu)
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+#
+# Output Report
+#
+
+# collect samples
+try:
+    sleep(duration)
+except KeyboardInterrupt:
+    # as cleanup can take some time, trap Ctrl-C:
+    signal.signal(signal.SIGINT, signal_ignore)
+
+if not args.folded:
+    print()
+
+def aksym(addr):
+    if args.annotations:
+        return b.ksym(addr) + "_[k]".encode()
+    else:
+        return b.ksym(addr)
+
+# output stacks
+missing_stacks = 0
+has_enomem = False
+counts = b.get_table("counts")
+stack_traces = b.get_table("stack_traces")
+need_delimiter = args.delimited and not (args.kernel_stacks_only or
+                                         args.user_stacks_only)
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+    # handle get_stackid errors
+    if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
+        missing_stacks += 1
+        has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM
+    if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
+        missing_stacks += 1
+        has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM
+
+    user_stack = [] if k.user_stack_id < 0 else \
+        stack_traces.walk(k.user_stack_id)
+    kernel_tmp = [] if k.kernel_stack_id < 0 else \
+        stack_traces.walk(k.kernel_stack_id)
+
+    # fix kernel stack
+    kernel_stack = []
+    if k.kernel_stack_id >= 0:
+        for addr in kernel_tmp:
+            kernel_stack.append(addr)
+        # prepend the kernel IP captured in the BPF program, if it was sane
+        if k.kernel_ip:
+            kernel_stack.insert(0, k.kernel_ip)
+
+    if args.folded:
+        # print folded stack output
+        user_stack = list(user_stack)
+        kernel_stack = list(kernel_stack)
+        line = [k.name]
+        # if we failed to get the stack id, such as due to no space (-ENOMEM) or
+        # hash collision (-EEXIST), we still print a placeholder for consistency
+        if not args.kernel_stacks_only:
+            if stack_id_err(k.user_stack_id):
+                line.append("[Missed User Stack]")
+            else:
+                line.extend([b.sym(addr, k.pid) for addr in reversed(user_stack)])
+        if not args.user_stacks_only:
+            line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else [])
+            if stack_id_err(k.kernel_stack_id):
+                line.append("[Missed Kernel Stack]")
+            else:
+                line.extend([b.ksym(addr) for addr in reversed(kernel_stack)])
+        print("%s %d" % (b";".join(line).decode('utf-8', 'replace'), v.value))
+    else:
+        # print default multi-line stack output
+        if not args.user_stacks_only:
+            if stack_id_err(k.kernel_stack_id):
+                print("    [Missed Kernel Stack]")
+            else:
+                for addr in kernel_stack:
+                    print("    %s" % aksym(addr))
+        if not args.kernel_stacks_only:
+            if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
+                print("    --")
+            if stack_id_err(k.user_stack_id):
+                print("    [Missed User Stack]")
+            else:
+                for addr in user_stack:
+                    print("    %s" % b.sym(addr, k.pid).decode('utf-8', 'replace'))
+        print("    %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
+        print("        %d\n" % v.value)
+
+# check missing
+if missing_stacks > 0:
+    enomem_str = "" if not has_enomem else \
+        " Consider increasing --stack-storage-size."
+    print("WARNING: %d stack traces could not be displayed.%s" %
+        (missing_stacks, enomem_str),
+        file=stderr)
diff --git a/tools/profile_example.txt b/tools/profile_example.txt
new file mode 100644
index 0000000..6fe6f74
--- /dev/null
+++ b/tools/profile_example.txt
@@ -0,0 +1,781 @@
+Demonstrations of profile, the Linux eBPF/bcc version.
+
+
+This is a CPU profiler. It works by taking samples of stack traces at timed
+intervals, and frequency counting them in kernel context for efficiency.
+
+Example output:
+
+# ./profile
+Sampling at 49 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end.
+^C
+    filemap_map_pages
+    handle_mm_fault
+    __do_page_fault
+    do_page_fault
+    page_fault
+    [unknown]
+    -                cp (9036)
+        1
+
+    [unknown]
+    [unknown]
+    -                sign-file (8877)
+        1
+
+    __clear_user
+    iov_iter_zero
+    read_iter_zero
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    read
+    -                dd (25036)
+        4
+
+    func_a
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (13549)
+        5
+
+[...]
+
+    native_safe_halt
+    default_idle
+    arch_cpu_idle
+    default_idle_call
+    cpu_startup_entry
+    rest_init
+    start_kernel
+    x86_64_start_reservations
+    x86_64_start_kernel
+    -                swapper/0 (0)
+        72
+
+    native_safe_halt
+    default_idle
+    arch_cpu_idle
+    default_idle_call
+    cpu_startup_entry
+    start_secondary
+    -                swapper/1 (0)
+        75
+
+The output was long; I truncated some lines ("[...]").
+
+This default output prints stack traces, followed by a line to describe the
+process (a dash, the process name, and a PID in parentheses), and then an
+integer count of how many times this stack trace was sampled.
+
+The output above shows the most frequent stack was from the "swapper/1"
+process (PID 0), running the native_safe_halt() function, which was called
+by default_idle(), which was called by arch_cpu_idle(), and so on. This is
+the idle thread. Stacks can be read top-down, to follow ancestry: child,
+parent, grandparent, etc.
+
+The func_ab process is running the func_a() function, called by main(),
+called by __libc_start_main(), and called by "[unknown]" with what looks
+like a bogus address (1st column). That's evidence of a broken stack trace.
+It's common for user-level software that hasn't been compiled with frame
+pointers (in this case, libc).
+
+The dd process has called read(), and then enters the kernel via
+entry_SYSCALL_64_fastpath(), calling sys_read(), and so on. Yes, I'm now
+reading it bottom up. That way follows the code flow.
+
+
+The dd process is actually "dd if=/dev/zero of=/dev/null": it's a simple
+workload to analyze that just moves bytes from /dev/zero to /dev/null.
+Profiling just that process:
+
+# ./profile -p 25036
+Sampling at 49 Hertz of PID 25036 by user + kernel stack... Hit Ctrl-C to end.
+^C
+    [unknown]
+    [unknown]
+    -                dd (25036)
+        1
+
+    __write
+    -                dd (25036)
+        1
+
+    read
+    -                dd (25036)
+        1
+
+[...]
+
+    [unknown]
+    [unknown]
+    -                dd (25036)
+        2
+
+    entry_SYSCALL_64_fastpath
+    __write
+    [unknown]
+    -                dd (25036)
+        3
+
+    entry_SYSCALL_64_fastpath
+    read
+    -                dd (25036)
+        3
+
+    __clear_user
+    iov_iter_zero
+    read_iter_zero
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    read
+    [unknown]
+    -                dd (25036)
+        3
+
+    __clear_user
+    iov_iter_zero
+    read_iter_zero
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    read
+    -                dd (25036)
+        7
+
+Again, I've truncated some lines. Now we're just analyzing the dd process.
+The filtering is performed in kernel context, for efficiency.
+
+This output has some "[unknown]" frames that probably have valid addresses,
+but we're lacking the symbol translation. This is common for all profilers
+on Linux, and is usually fixable. See the DEBUGGING section of the profile(8)
+man page.
+
+
+Let's add delimiters between the user and kernel stacks, using -d:
+
+# ./profile -p 25036 -d
+^C
+    __vfs_write
+    sys_write
+    entry_SYSCALL_64_fastpath
+    --
+    __write
+    -                dd (25036)
+        1
+
+    --
+    [unknown]
+    [unknown]
+    -                dd (25036)
+        1
+
+    iov_iter_init
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    --
+    read
+    -                dd (25036)
+        1
+
+[...]
+
+    __clear_user
+    iov_iter_zero
+    read_iter_zero
+    __vfs_read
+    vfs_read
+    sys_read
+    entry_SYSCALL_64_fastpath
+    --
+    read
+    -                dd (25036)
+        9
+
+In this mode, the delimiters are "--".
+
+
+
+Here's another example, a func_ab program that runs two functions, func_a() and
+func_b(). Profiling it for 5 seconds:
+
+# ./profile -p `pgrep -n func_ab` 5
+Sampling at 49 Hertz of PID 2930 by user + kernel stack for 5 secs.
+
+    func_a
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        2
+
+    func_b
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        3
+
+    func_a
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        5
+
+    func_b
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        12
+
+    func_b
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        19
+
+    func_a
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        22
+
+    func_b
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        64
+
+    func_a
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        72
+
+Note that the same stack (2nd column) seems to be repeated. Weren't we doing
+frequency counting and only printing unique stacks? We are, but in terms of
+the raw addresses, not the symbols. See the 1st column: those stacks are
+all unique.
+
+
+We can output in "folded format", which puts the stack trace on one line,
+separating frames with semi-colons. Eg:
+
+# ./profile -f -p `pgrep -n func_ab` 5
+func_ab;[unknown];__libc_start_main;main;func_a 2
+func_ab;[unknown];__libc_start_main;main;func_b 2
+func_ab;[unknown];__libc_start_main;main;func_a 11
+func_ab;[unknown];__libc_start_main;main;func_b 12
+func_ab;[unknown];__libc_start_main;main;func_a 23
+func_ab;[unknown];__libc_start_main;main;func_b 28
+func_ab;[unknown];__libc_start_main;main;func_b 57
+func_ab;[unknown];__libc_start_main;main;func_a 64
+
+I find this pretty useful for writing to files and later grepping.
+
+
+Folded format can also be used by flame graph stack visualizers, including
+the original implementation:
+
+	https://github.com/brendangregg/FlameGraph
+
+I'd include delimiters, -d. For example:
+
+# ./profile -df -p `pgrep -n func_ab` 5 > out.profile
+# git clone https://github.com/brendangregg/FlameGraph
+# ./FlameGraph/flamegraph.pl < out.profile > out.svg
+
+(Yes, I could pipe profile directly into flamegraph.pl, however, I like to
+keep the raw folded profiles around: can be useful for regenerating flamegraphs
+with different options, and, for differential flame graphs.)
+
+
+Some flamegraph.pl palettes recognize kernel annotations, which can be added
+with -a. It simply adds a "_[k]" at the end of kernel function names.
+For example:
+
+# ./profile -adf -p `pgrep -n dd` 10
+dd;[unknown] 1
+dd;[unknown];[unknown] 1
+dd;[unknown];[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];__fsnotify_parent_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__fsnotify_parent_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fdget_pos_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;[unknown] 1
+dd;[unknown];[unknown] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];__fsnotify_parent_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__fsnotify_parent_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fsnotify_parent_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];fsnotify_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fdget_pos_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;[unknown] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];__fsnotify_parent_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 1
+dd;[unknown];[unknown] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;read 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];security_file_permission_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];fsnotify_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];fsnotify_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__fsnotify_parent_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];iov_iter_init_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];__fsnotify_parent_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__vfs_write_[k];write_null_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];__clear_user_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];security_file_permission_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__vfs_read_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__vfs_write_[k] 1
+dd;[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fsnotify_parent_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];__write;-;sys_write_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fsnotify_parent_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];common_file_perm_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];[unknown] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];vfs_read_[k] 1
+dd;__write 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 1
+dd;[unknown];[unknown] 1
+dd;[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;[unknown] 1
+dd;[unknown] 1
+dd;[unknown];[unknown] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;__write 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k] 1
+dd;[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];[unknown] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fdget_pos_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];_cond_resched_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];iov_iter_init_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];__fsnotify_parent_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];rw_verify_area_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 1
+dd;[unknown] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];fsnotify_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fdget_pos_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];__vfs_write_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];apparmor_file_permission_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fget_light_[k] 1
+dd;[unknown] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];fsnotify_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];fsnotify_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 1
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];vfs_write_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 1
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k] 1
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];fsnotify_[k] 1
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];apparmor_file_permission_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];__fdget_pos_[k] 2
+dd;[unknown];[unknown] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];__fdget_pos_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k];common_file_perm_[k] 2
+dd;[unknown];[unknown] 2
+dd;[unknown];[unknown] 2
+dd;[unknown];[unknown] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 2
+dd;[unknown];[unknown] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];__clear_user_[k] 2
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];__fdget_pos_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 2
+dd;[unknown];[unknown] 2
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];rw_verify_area_[k];security_file_permission_[k];fsnotify_[k] 2
+dd;__write;-;sys_write_[k] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];fsnotify_[k] 2
+dd;[unknown];[unknown] 2
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 2
+dd;read;-;SyS_read_[k] 2
+dd;[unknown] 2
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k] 2
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];__fget_light_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k] 2
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k];rw_verify_area_[k];security_file_permission_[k];apparmor_file_permission_[k] 2
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];__clear_user_[k] 2
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];rw_verify_area_[k] 2
+dd;[unknown];[unknown] 3
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];rw_verify_area_[k] 3
+dd;[unknown];[unknown] 3
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 3
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 3
+dd;[unknown];[unknown] 3
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 3
+dd;[unknown];[unknown] 3
+dd;[unknown];[unknown] 3
+dd;__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 3
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 3
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 3
+dd;[unknown] 4
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 4
+dd;[unknown];[unknown] 4
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k] 4
+dd;[unknown] 4
+dd;[unknown];[unknown] 4
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k] 4
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 5
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k];sys_write_[k];vfs_write_[k] 5
+dd;[unknown];[unknown] 5
+dd;[unknown];[unknown] 5
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k] 6
+dd;read 15
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 19
+dd;[unknown];__write;-;entry_SYSCALL_64_fastpath_[k] 20
+dd;read;-;entry_SYSCALL_64_fastpath_[k] 23
+dd;read;-;entry_SYSCALL_64_fastpath_[k];SyS_read_[k];vfs_read_[k];__vfs_read_[k];read_iter_zero_[k];iov_iter_zero_[k];__clear_user_[k] 24
+dd;__write;-;entry_SYSCALL_64_fastpath_[k] 25
+dd;__write 29
+dd;[unknown];read;-;entry_SYSCALL_64_fastpath_[k] 31
+
+This can be made into a flamegraph. Eg:
+
+# ./profile -adf -p `pgrep -n func_ab` 10 > out.profile
+# git clone https://github.com/brendangregg/FlameGraph
+# ./FlameGraph/flamegraph.pl --color=java < out.profile > out.svg
+
+It will highlight the kernel frames in orange, and user-level in red (and Java
+in green, and C++ in yellow). If you copy-n-paste the above output into a
+out.profile file, you can try it out.
+
+
+You can increase or decrease the sample frequency. Eg, sampling at 9 Hertz:
+
+# ./profile -F 9
+Sampling at 9 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end.
+^C
+    func_b
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        1
+
+[...]
+
+    native_safe_halt
+    default_idle
+    arch_cpu_idle
+    default_idle_call
+    cpu_startup_entry
+    start_secondary
+    -                swapper/3 (0)
+        8
+
+    native_safe_halt
+    default_idle
+    arch_cpu_idle
+    default_idle_call
+    cpu_startup_entry
+    rest_init
+    start_kernel
+    x86_64_start_reservations
+    x86_64_start_kernel
+    -                swapper/0 (0)
+        8
+
+
+You can also restrict profiling to a single CPU (-C). For example, sampling
+only CPU#7, for 2 seconds:
+
+# ./profile -C 7 2
+Sampling at 49 Hertz of all threads by user + kernel stack on CPU#7 for 2 secs.
+
+    PyEval_EvalFrameEx
+    [unknown]
+    [unknown]
+    -                python (2827439)
+        1
+
+    PyDict_GetItem
+    [unknown]
+    -                python (2827439)
+        1
+
+    [unknown]
+    -                python (2827439)
+        1
+
+    PyEval_EvalFrameEx
+    [unknown]
+    [unknown]
+    -                python (2827439)
+        1
+
+    PyEval_EvalFrameEx
+    -                python (2827439)
+        1
+
+    [unknown]
+    [unknown]
+    -                python (2827439)
+
+In this example, a Python application was busylooping on a single CPU (#7), so
+we collected stack traces from that CPU only. To show user stacks only (-U):
+
+# ./profile -U
+Sampling at 49 Hertz of all threads by user stack... Hit Ctrl-C to end.
+^C
+    [unknown]
+    [unknown]
+    -                dd (2931)
+        1
+
+    [unknown]
+    [unknown]
+    -                dd (2931)
+        1
+
+    [unknown]
+    [unknown]
+    -                dd (2931)
+        1
+
+    [unknown]
+    [unknown]
+    -                dd (2931)
+        1
+
+    [unknown]
+    [unknown]
+    -                dd (2931)
+        1
+
+    func_b
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        1
+
+    [unknown]
+    -                dd (2931)
+        1
+
+    [unknown]
+    -                dd (2931)
+        1
+
+    func_a
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        3
+
+    __write
+    [unknown]
+    -                dd (2931)
+        3
+
+    func_a
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        4
+
+    func_b
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        7
+
+    -                swapper/6 (0)
+        10
+
+    func_b
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        10
+
+    __write
+    -                dd (2931)
+        10
+
+    func_a
+    main
+    __libc_start_main
+    [unknown]
+    -                func_ab (2930)
+        11
+
+    read
+    -                dd (2931)
+        12
+
+    read
+    [unknown]
+    -                dd (2931)
+        14
+
+    -                swapper/7 (0)
+        46
+
+    -                swapper/0 (0)
+        46
+
+    -                swapper/2 (0)
+        46
+
+    -                swapper/1 (0)
+        46
+
+    -                swapper/3 (0)
+        46
+
+    -                swapper/4 (0)
+        46
+
+
+If there are too many unique stack traces for the kernel to save, a warning
+will be printed. Eg:
+
+# ./profile
+[...]
+WARNING: 8 stack traces could not be displayed. Consider increasing --stack-storage-size.
+
+Run ./profile -h to see the default.
+
+
+USAGE message:
+
+# ./profile -h
+usage: profile [-h] [-p PID] [-U | -K] [-F FREQUENCY | -c COUNT] [-d] [-a]
+                  [-f] [--stack-storage-size STACK_STORAGE_SIZE]
+                  [duration]
+
+Profile CPU stack traces at a timed interval
+
+positional arguments:
+  duration              duration of trace, in seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     profile this PID only
+  -U, --user-stacks-only
+                        show stacks from user space only (no kernel space
+                        stacks)
+  -K, --kernel-stacks-only
+                        show stacks from kernel space only (no user space
+                        stacks)
+  -F FREQUENCY, --frequency FREQUENCY
+                        sample frequency, Hertz
+  -c COUNT, --count COUNT
+                        sample period, number of events
+  -d, --delimited       insert delimiter between kernel/user stacks
+  -a, --annotations     add _[k] annotations to kernel frames
+  -f, --folded          output folded format, one line per stack (for flame
+                        graphs)
+  --stack-storage-size STACK_STORAGE_SIZE
+                        the number of unique stack traces that can be stored
+                        and displayed (default 2048)
+  -C CPU, --cpu CPU     cpu number to run profile on
+
+examples:
+    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
+    ./profile -F 99       # profile stack traces at 99 Hertz
+    ./profile -c 1000000  # profile stack traces every 1 in a million events
+    ./profile 5           # profile at 49 Hertz for 5 seconds only
+    ./profile -f 5        # output in folded format for flame graphs
+    ./profile -p 185      # only profile threads for PID 185
+    ./profile -U          # only show user space stacks (no kernel)
+    ./profile -K          # only show kernel space stacks (no user)
diff --git a/tools/pythoncalls.sh b/tools/pythoncalls.sh
new file mode 100755
index 0000000..e5fd65d
--- /dev/null
+++ b/tools/pythoncalls.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ucalls.py -l python "$@"
diff --git a/tools/pythoncalls_example.txt b/tools/pythoncalls_example.txt
new file mode 120000
index 0000000..22b0fb3
--- /dev/null
+++ b/tools/pythoncalls_example.txt
@@ -0,0 +1 @@
+lib/ucalls_example.txt
\ No newline at end of file
diff --git a/tools/pythonflow.sh b/tools/pythonflow.sh
new file mode 100755
index 0000000..a346772
--- /dev/null
+++ b/tools/pythonflow.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/uflow.py -l python "$@"
diff --git a/tools/pythonflow_example.txt b/tools/pythonflow_example.txt
new file mode 120000
index 0000000..bc71efc
--- /dev/null
+++ b/tools/pythonflow_example.txt
@@ -0,0 +1 @@
+lib/uflow_example.txt
\ No newline at end of file
diff --git a/tools/pythongc.sh b/tools/pythongc.sh
new file mode 100755
index 0000000..ca5baf3
--- /dev/null
+++ b/tools/pythongc.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname "$0")/lib
+"$lib/ugc.py" -l python "$@"
diff --git a/tools/pythongc_example.txt b/tools/pythongc_example.txt
new file mode 120000
index 0000000..303ccbd
--- /dev/null
+++ b/tools/pythongc_example.txt
@@ -0,0 +1 @@
+lib/ugc_example.txt
\ No newline at end of file
diff --git a/tools/pythonstat.sh b/tools/pythonstat.sh
new file mode 100755
index 0000000..2133207
--- /dev/null
+++ b/tools/pythonstat.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname "$0")/lib
+"$lib/ustat.py" -l python "$@"
diff --git a/tools/pythonstat_example.txt b/tools/pythonstat_example.txt
new file mode 120000
index 0000000..544e5ad
--- /dev/null
+++ b/tools/pythonstat_example.txt
@@ -0,0 +1 @@
+lib/ustat_example.txt
\ No newline at end of file
diff --git a/tools/reset-trace.sh b/tools/reset-trace.sh
new file mode 100755
index 0000000..fb891a7
--- /dev/null
+++ b/tools/reset-trace.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+#
+# reset-trace - reset state of tracing, disabling all tracing.
+#               Written for Linux.
+#
+# If a bcc tool crashed and you suspect tracing is partially enabled, you
+# can use this tool to reset the state of tracing, disabling anything still
+# enabled. Only use this tool in the case of error, and, consider filing a
+# bcc ticket so we can fix the error.
+#
+# bcc-used tracing facilities are reset. Other tracing facilities (ftrace) are
+# checked, and if not in an expected state, a note is printed. All tracing
+# files can be reset with -F for force, but this will interfere with any other
+# running tracing sessions (eg, ftrace).
+#
+# USAGE: ./reset-trace [-Fhqv]
+#
+# REQUIREMENTS: debugfs mounted on /sys/kernel/debug
+#
+# COPYRIGHT: Copyright (c) 2016 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Jul-2014	Brendan Gregg	Created this.
+# 18-Oct-2016      "      "     Updated for bcc use.
+
+tracing=/sys/kernel/debug/tracing
+opt_force=0; opt_verbose=0; opt_quiet=0
+
+function usage {	# print usage to stderr; called for -h and unknown options
+	cat <<-END >&2
+	USAGE: reset-trace [-Fhqv]
+	                 -F             # force: reset all tracing files
+	                 -v             # verbose: print details while working
+	                 -h             # this usage message
+	                 -q             # quiet: no output
+	  eg,
+	       reset-trace              # disable semi-enabled tracing
+	END
+	exit
+}
+
+function die {	# print an error message to stderr and terminate the script
+	echo "$@" >&2
+	exit 1
+}
+
+function vecho {	# echo only when -v (verbose) was given; always returns 0
+	(( opt_verbose )) || return 0
+	echo "$@"
+}
+
+function writefile {	# reset file $1 by writing $2, if it differs from global $expected
+	file=$1
+	write=$2	# NOTE: $expected is set by the caller (checkfile); "" when called directly
+	if [[ ! -w $file ]]; then
+		echo >&2 "WARNING: file $file not writable/exists. Skipping."
+		return
+	fi
+
+	vecho "Checking $PWD/$file"
+	contents=$(grep -v '^#' "$file")	# ignore comment lines when comparing
+	if [[ "$contents" != "$expected" ]]; then
+		(( ! opt_quiet )) && echo "Needed to reset $PWD/$file"
+		vecho "$file, before (line enumerated):"
+		(( opt_verbose )) && cat -nv "$file"
+		cmd="echo $write > $file"
+		if ! eval "$cmd"; then
+			echo >&2 "WARNING: command failed \"$cmd\"." \
+			    "bcc still running? Continuing."
+		fi
+		vecho "$file, after (line enumerated):"
+		(( opt_verbose )) && cat -nv "$file"
+		vecho
+	fi
+}
+
+# only write when force is used
+function checkfile {
+	file=$1
+	write=$2
+	expected=$3
+	if [[ ! -e $file ]]; then
+		echo >&2 "WARNING: file $file doesn't exist. Skipping."
+		return
+	fi
+	if (( opt_force )); then
+		writefile $file $write
+		return
+	fi
+	(( opt_quiet )) && return
+
+	vecho "Checking $PWD/$file"
+        contents=$(grep -v '^#' $file)
+	if [[ "$contents" != "$expected" ]]; then
+		echo "Noticed unrelated tracing file $PWD/$file isn't set as" \
+		    "expected. Not reseting (-F to force, -v for verbose)."
+		vecho "Contents of $file is (line enumerated):"
+		(( opt_verbose )) && cat -nv $file
+		vecho "Expected \"$expected\"."
+	fi
+}
+
+### process options
+while getopts Fhqv opt
+do
+	case $opt in
+	F)	opt_force=1 ;;
+	q)	opt_quiet=1 ;;
+	v)	opt_verbose=1 ;;
+	h|?)	usage ;;
+	esac
+done
+shift $(( OPTIND - 1 ))
+
+### reset tracing state
+vecho "Reseting tracing state..."
+vecho
+cd "$tracing" || die "ERROR: accessing tracing. Root user? /sys/kernel/debug?"
+
+# files bcc uses
+writefile kprobe_events "" ""
+writefile uprobe_events "" ""
+writefile trace "" ""         # clears trace_pipe
+
+# non-bcc files: check only; reset with -F (force)
+checkfile current_tracer nop nop
+checkfile set_ftrace_filter "" ""
+checkfile set_graph_function "" ""
+checkfile set_ftrace_pid "" "no pid"
+checkfile events/enable 0 0
+checkfile tracing_thresh 0 0
+checkfile tracing_on 1 1
+
+vecho
+vecho "Done."
diff --git a/tools/reset-trace_example.txt b/tools/reset-trace_example.txt
new file mode 100644
index 0000000..d0f6777
--- /dev/null
+++ b/tools/reset-trace_example.txt
@@ -0,0 +1,241 @@
+Demonstrations of reset-trace, for Linux bcc/BPF.
+
+
+You will probably never need this tool. If a bcc tool is killed by SIGKILL
+(kill -9) or another signal such as SIGTERM, or if it crashes, kernel tracing can be
+left in a semi-enabled state. It's not as bad as it sounds: there may just be
+overhead for writing to ring buffers that are never read. This tool can be
+used to clean up the tracing state, and reset and disable active tracing.
+
+WARNING: Make sure no other tracing sessions are active, as it will likely
+stop them from functioning (perhaps ungracefully).
+
+This specifically clears the state in at least the following files in
+/sys/kernel/debug/tracing: kprobe_events, uprobe_events, trace_pipe.
+Other tracing facilities (ftrace) are checked, and if not in an expected state,
+a note is printed. All tracing files can be reset with -F for force, but this
+will interfere with any other running tracing sessions (eg, ftrace).
+
+Here's an example:
+
+# ./reset-trace.sh
+#
+
+That's it.
+
+
+You can use -v to see what it does:
+
+# ./reset-trace.sh -v
+Reseting tracing state...
+
+Checking /sys/kernel/debug/tracing/kprobe_events
+Checking /sys/kernel/debug/tracing/uprobe_events
+Checking /sys/kernel/debug/tracing/trace
+Checking /sys/kernel/debug/tracing/current_tracer
+Checking /sys/kernel/debug/tracing/set_ftrace_filter
+Checking /sys/kernel/debug/tracing/set_graph_function
+Checking /sys/kernel/debug/tracing/set_ftrace_pid
+Checking /sys/kernel/debug/tracing/events/enable
+Checking /sys/kernel/debug/tracing/tracing_thresh
+Checking /sys/kernel/debug/tracing/tracing_on
+
+Done.
+
+In this example, no resetting was necessary.
+
+
+Here's an example of actually needing it:
+
+# ./funccount 'bash:r*'
+Tracing 317 functions for "bash:r*"... Hit Ctrl-C to end.
+^C
+FUNC                                    COUNT
+rl_free_undo_list                           1
+rl_deprep_terminal                          1
+readline_internal_teardown                  1
+rl_on_new_line                              1
+rl_crlf                                     1
+rl_clear_signals                            1
+rl_prep_terminal                            1
+rl_reset_line_state                         1
+rl_initialize                               1
+rl_newline                                  1
+readline_internal_setup                     1
+rl_set_screen_size                          1
+readline                                    1
+rl_set_signals                              1
+rl_expand_prompt                            1
+replace_history_data                        1
+rl_set_prompt                               1
+rl_add_undo                                 1
+rl_insert_text                              2
+rl_insert                                   2
+rl_redisplay                                3
+rl_read_key                                 3
+rl_getc                                     3
+readline_internal_char                      3
+restore_parser_state                        6
+reap_dead_jobs                              6
+reset_parser                                6
+restore_input_line_state                    6
+realloc                                     7
+read_octal                                 10
+read_tty_modified                          13
+run_exit_trap                              13
+redirection_expand                         13
+restore_pipestatus_array                   18
+reader_loop                                20
+run_return_trap                            21
+remember_args                              25
+reset_signal_handlers                      30
+remove_quoted_escapes                      60
+run_unwind_frame                          102
+reset_terminating_signals                 125
+restore_original_signals                  139
+reset_internal_getopt                     405
+run_debug_trap                            719
+read_command                              940
+remove_quoted_nulls                      1830
+run_pending_traps                        3207
+^C
+^C
+^C
+
+I've traced 317 functions using funccount, and when I hit Ctrl-C, funccount is
+not exiting (it can normally take many seconds, but this really looks stuck):
+
+# pidstat 1
+Linux 4.9.0-rc1-virtual (bgregg-xenial-bpf-i-xxx) 	10/18/2016 	_x86_64_	(8 CPU)
+
+10:00:33 PM   UID       PID    %usr %system  %guest    %CPU   CPU  Command
+10:00:34 PM 60004      3277    0.00    0.98    0.00    0.98     0  redis-server
+10:00:34 PM     0     27980   87.25   10.78    0.00   98.04     3  funccount.py
+10:00:34 PM     0     29965    0.00    0.98    0.00    0.98     6  pidstat
+
+10:00:34 PM   UID       PID    %usr %system  %guest    %CPU   CPU  Command
+10:00:35 PM 65534      3276    0.00    1.00    0.00    1.00     2  multilog
+10:00:35 PM     0     27980   77.00   23.00    0.00  100.00     3  funccount.py
+10:00:35 PM     0     29965    0.00    1.00    0.00    1.00     6  pidstat
+10:00:35 PM 60004     29990    0.00    1.00    0.00    1.00     6  catalina.sh
+
+funccount looks a lot like it's in an infinite loop (I can use a stack-sampling
+profiler to confirm). This is a known bug (#665) and may be fixed by the time
+you read this. But right now it's a good example of needing reset-trace.
+
+I'll send a SIGTERM, before resorting to a SIGKILL:
+
+# kill 27980
+Terminated
+
+Ok, so the process is now gone, but it did leave tracing in a semi-enabled
+state. Using reset-trace:
+
+# ./reset-trace.sh -v
+Reseting tracing state...
+
+Checking /sys/kernel/debug/tracing/kprobe_events
+Checking /sys/kernel/debug/tracing/uprobe_events
+Needed to reset /sys/kernel/debug/tracing/uprobe_events
+uprobe_events, before (line enumerated):
+     1	p:uprobes/p__bin_bash_0xa2540 /bin/bash:0x00000000000a2540
+     2	p:uprobes/p__bin_bash_0x21220 /bin/bash:0x0000000000021220
+     3	p:uprobes/p__bin_bash_0x78530 /bin/bash:0x0000000000078530
+     4	p:uprobes/p__bin_bash_0xa3840 /bin/bash:0x00000000000a3840
+     5	p:uprobes/p__bin_bash_0x9c550 /bin/bash:0x000000000009c550
+     6	p:uprobes/p__bin_bash_0x5e360 /bin/bash:0x000000000005e360
+     7	p:uprobes/p__bin_bash_0xb2630 /bin/bash:0x00000000000b2630
+     8	p:uprobes/p__bin_bash_0xb1e70 /bin/bash:0x00000000000b1e70
+     9	p:uprobes/p__bin_bash_0xb2540 /bin/bash:0x00000000000b2540
+    10	p:uprobes/p__bin_bash_0xb16e0 /bin/bash:0x00000000000b16e0
+[...]
+   312	p:uprobes/p__bin_bash_0xa80b0 /bin/bash:0x00000000000a80b0
+   313	p:uprobes/p__bin_bash_0x9e280 /bin/bash:0x000000000009e280
+   314	p:uprobes/p__bin_bash_0x9e100 /bin/bash:0x000000000009e100
+   315	p:uprobes/p__bin_bash_0xb2bd0 /bin/bash:0x00000000000b2bd0
+   316	p:uprobes/p__bin_bash_0x9d9c0 /bin/bash:0x000000000009d9c0
+   317	p:uprobes/p__bin_bash_0x4a930 /bin/bash:0x000000000004a930
+uprobe_events, after (line enumerated):
+
+Checking /sys/kernel/debug/tracing/trace
+Checking /sys/kernel/debug/tracing/current_tracer
+Checking /sys/kernel/debug/tracing/set_ftrace_filter
+Checking /sys/kernel/debug/tracing/set_graph_function
+Checking /sys/kernel/debug/tracing/set_ftrace_pid
+Checking /sys/kernel/debug/tracing/events/enable
+Checking /sys/kernel/debug/tracing/tracing_thresh
+Checking /sys/kernel/debug/tracing/tracing_on
+
+Done.
+
+Now looks clean. I did truncate the output here: there were a few hundred lines
+from uprobe_events.
+
+Here's the same situation, but without the verbose option:
+
+# ./reset-trace.sh
+Needed to reset /sys/kernel/debug/tracing/uprobe_events
+#
+
+And again with quiet:
+
+# ./reset-trace.sh -q
+#
+
+
+Here is an example of reset-trace detecting an unrelated tracing session:
+
+# ./reset-trace.sh 
+Noticed unrelated tracing file /sys/kernel/debug/tracing/set_ftrace_filter isn't set as expected. Not reseting (-F to force, -v for verbose).
+
+And verbose:
+
+# ./reset-trace.sh -v
+Reseting tracing state...
+
+Checking /sys/kernel/debug/tracing/kprobe_events
+Checking /sys/kernel/debug/tracing/uprobe_events
+Checking /sys/kernel/debug/tracing/trace
+Checking /sys/kernel/debug/tracing/current_tracer
+Checking /sys/kernel/debug/tracing/set_ftrace_filter
+Noticed unrelated tracing file /sys/kernel/debug/tracing/set_ftrace_filter isn't set as expected. Not reseting (-F to force, -v for verbose).
+Contents of set_ftrace_filter is (line enumerated):
+     1	tcp_send_mss
+     2	tcp_sendpage
+     3	tcp_sendmsg
+     4	tcp_send_dupack
+     5	tcp_send_challenge_ack.isra.53
+     6	tcp_send_rcvq
+     7	tcp_send_ack
+     8	tcp_send_loss_probe
+     9	tcp_send_fin
+    10	tcp_send_active_reset
+    11	tcp_send_synack
+    12	tcp_send_delayed_ack
+    13	tcp_send_window_probe
+    14	tcp_send_probe0
+Expected "".
+Checking /sys/kernel/debug/tracing/set_graph_function
+Checking /sys/kernel/debug/tracing/set_ftrace_pid
+Checking /sys/kernel/debug/tracing/events/enable
+Checking /sys/kernel/debug/tracing/tracing_thresh
+Checking /sys/kernel/debug/tracing/tracing_on
+
+Done.
+
+So this file is not currently used by bcc, but it may be useful to know that
+it's not in the default state -- something is either using it or has left it
+enabled. These files can be reset with -F, but that may break other tools that
+are currently using them.
+
+
+Use -h to print the USAGE message:
+
+# ./reset-trace.sh -h
+USAGE: reset-trace [-Fhqv]
+                 -F             # force: reset all tracing files
+                 -v             # verbose: print details while working
+                 -h             # this usage message
+                 -q             # quiet: no output
+  eg,
+       reset-trace              # disable semi-enabled tracing
diff --git a/tools/rubycalls.sh b/tools/rubycalls.sh
new file mode 100755
index 0000000..bbea144
--- /dev/null
+++ b/tools/rubycalls.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname "$0")/lib
+"$lib/ucalls.py" -l ruby "$@"
diff --git a/tools/rubycalls_example.txt b/tools/rubycalls_example.txt
new file mode 120000
index 0000000..22b0fb3
--- /dev/null
+++ b/tools/rubycalls_example.txt
@@ -0,0 +1 @@
+lib/ucalls_example.txt
\ No newline at end of file
diff --git a/tools/rubyflow.sh b/tools/rubyflow.sh
new file mode 100755
index 0000000..fe1946a
--- /dev/null
+++ b/tools/rubyflow.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname "$0")/lib
+"$lib/uflow.py" -l ruby "$@"
diff --git a/tools/rubyflow_example.txt b/tools/rubyflow_example.txt
new file mode 120000
index 0000000..bc71efc
--- /dev/null
+++ b/tools/rubyflow_example.txt
@@ -0,0 +1 @@
+lib/uflow_example.txt
\ No newline at end of file
diff --git a/tools/rubygc.sh b/tools/rubygc.sh
new file mode 100755
index 0000000..81a29f6
--- /dev/null
+++ b/tools/rubygc.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname "$0")/lib
+"$lib/ugc.py" -l ruby "$@"
diff --git a/tools/rubygc_example.txt b/tools/rubygc_example.txt
new file mode 120000
index 0000000..303ccbd
--- /dev/null
+++ b/tools/rubygc_example.txt
@@ -0,0 +1 @@
+lib/ugc_example.txt
\ No newline at end of file
diff --git a/tools/rubyobjnew.sh b/tools/rubyobjnew.sh
new file mode 100755
index 0000000..afeaa3a
--- /dev/null
+++ b/tools/rubyobjnew.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname "$0")/lib
+"$lib/uobjnew.py" -l ruby "$@"
diff --git a/tools/rubyobjnew_example.txt b/tools/rubyobjnew_example.txt
new file mode 120000
index 0000000..a8a83c3
--- /dev/null
+++ b/tools/rubyobjnew_example.txt
@@ -0,0 +1 @@
+lib/uobjnew_example.txt
\ No newline at end of file
diff --git a/tools/rubystat.sh b/tools/rubystat.sh
new file mode 100755
index 0000000..7c777cd
--- /dev/null
+++ b/tools/rubystat.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname "$0")/lib
+"$lib/ustat.py" -l ruby "$@"
diff --git a/tools/rubystat_example.txt b/tools/rubystat_example.txt
new file mode 120000
index 0000000..544e5ad
--- /dev/null
+++ b/tools/rubystat_example.txt
@@ -0,0 +1 @@
+lib/ustat_example.txt
\ No newline at end of file
diff --git a/tools/runqlat.py b/tools/runqlat.py
new file mode 100755
index 0000000..9fd4064
--- /dev/null
+++ b/tools/runqlat.py
@@ -0,0 +1,277 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# runqlat   Run queue (scheduler) latency as a histogram.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: runqlat [-h] [-T] [-m] [-P] [-L] [-p PID] [interval] [count]
+#
+# This measures the time a task spends waiting on a run queue for a turn
+# on-CPU, and shows this time as a histogram. This time should be small, but a
+# task may need to wait its turn due to CPU load.
+#
+# This measures two types of run queue latency:
+# 1. The time from a task being enqueued on a run queue to its context switch
+#    and execution. This traces ttwu_do_wakeup(), wake_up_new_task() ->
+#    finish_task_switch() with either raw tracepoints (if supported) or kprobes
+#    and instruments the run queue latency after a voluntary context switch.
+# 2. The time from when a task was involuntary context switched and still
+#    in the runnable state, to when it next executed. This is instrumented
+#    from finish_task_switch() alone.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 07-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+    ./runqlat            # summarize run queue latency as a histogram
+    ./runqlat 1 10       # print 1 second summaries, 10 times
+    ./runqlat -mT 1      # 1s summaries, milliseconds, and timestamps
+    ./runqlat -P         # show each PID separately
+    ./runqlat -p 185     # trace PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize run queue (scheduler) latency as a histogram",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="millisecond histogram")
+parser.add_argument("-P", "--pids", action="store_true",
+    help="print a histogram per process ID")
+# PID options are --pid and --pids, so namespaces should be --pidns (not done
+# yet) and --pidnss:
+parser.add_argument("--pidnss", action="store_true",
+    help="print a histogram per PID namespace")
+parser.add_argument("-L", "--tids", action="store_true",
+    help="print a histogram per thread ID")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+countdown = int(args.count)
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+
+typedef struct pid_key {
+    u64 id;    // work around
+    u64 slot;
+} pid_key_t;
+
+typedef struct pidns_key {
+    u64 id;    // work around
+    u64 slot;
+} pidns_key_t;
+
+BPF_HASH(start, u32);
+STORAGE
+
+struct rq;
+
+// record enqueue timestamp
+static int trace_enqueue(u32 tgid, u32 pid)
+{
+    if (FILTER || pid == 0)
+        return 0;
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+"""
+
+bpf_text_kprobe = """
+int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
+{
+    return trace_enqueue(p->tgid, p->pid);
+}
+
+int trace_ttwu_do_wakeup(struct pt_regs *ctx, struct rq *rq, struct task_struct *p,
+    int wake_flags)
+{
+    return trace_enqueue(p->tgid, p->pid);
+}
+
+// calculate latency
+int trace_run(struct pt_regs *ctx, struct task_struct *prev)
+{
+    u32 pid, tgid;
+
+    // ivcsw: treat like an enqueue event and store timestamp
+    if (prev->state == TASK_RUNNING) {
+        tgid = prev->tgid;
+        pid = prev->pid;
+        if (!(FILTER || pid == 0)) {
+            u64 ts = bpf_ktime_get_ns();
+            start.update(&pid, &ts);
+        }
+    }
+
+    tgid = bpf_get_current_pid_tgid() >> 32;
+    pid = bpf_get_current_pid_tgid();
+    if (FILTER || pid == 0)
+        return 0;
+    u64 *tsp, delta;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed enqueue
+    }
+    delta = bpf_ktime_get_ns() - *tsp;
+    FACTOR
+
+    // store as histogram
+    STORE
+
+    start.delete(&pid);
+    return 0;
+}
+"""
+
+bpf_text_raw_tp = """
+RAW_TRACEPOINT_PROBE(sched_wakeup)
+{
+    // TP_PROTO(struct task_struct *p)
+    struct task_struct *p = (struct task_struct *)ctx->args[0];
+    return trace_enqueue(p->tgid, p->pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_wakeup_new)
+{
+    // TP_PROTO(struct task_struct *p)
+    struct task_struct *p = (struct task_struct *)ctx->args[0];
+    return trace_enqueue(p->tgid, p->pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+    // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
+    struct task_struct *prev = (struct task_struct *)ctx->args[1];
+    struct task_struct *next = (struct task_struct *)ctx->args[2];
+    u32 pid, tgid;
+
+    // ivcsw: treat like an enqueue event and store timestamp
+    if (prev->state == TASK_RUNNING) {
+        tgid = prev->tgid;
+        pid = prev->pid;
+        if (!(FILTER || pid == 0)) {
+            u64 ts = bpf_ktime_get_ns();
+            start.update(&pid, &ts);
+        }
+    }
+
+    tgid = next->tgid;
+    pid = next->pid;
+    if (FILTER || pid == 0)
+        return 0;
+    u64 *tsp, delta;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed enqueue
+    }
+    delta = bpf_ktime_get_ns() - *tsp;
+    FACTOR
+
+    // store as histogram
+    STORE
+
+    start.delete(&pid);
+    return 0;
+}
+"""
+
+is_support_raw_tp = BPF.support_raw_tracepoint()
+if is_support_raw_tp:
+    bpf_text += bpf_text_raw_tp
+else:
+    bpf_text += bpf_text_kprobe
+
+# code substitutions
+if args.pid:
+    # pid from userspace point of view is thread group from kernel pov
+    bpf_text = bpf_text.replace('FILTER', 'tgid != %s' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '0')
+if args.milliseconds:
+    bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;')
+    label = "msecs"
+else:
+    bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;')
+    label = "usecs"
+if args.pids or args.tids:
+    section = "pid"
+    pid = "tgid"
+    if args.tids:
+        pid = "pid"
+        section = "tid"
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, pid_key_t);')
+    bpf_text = bpf_text.replace('STORE',
+        'pid_key_t key = {.id = ' + pid + ', .slot = bpf_log2l(delta)}; ' +
+        'dist.increment(key);')
+elif args.pidnss:
+    section = "pidns"
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, pidns_key_t);')
+    bpf_text = bpf_text.replace('STORE', 'pidns_key_t key = ' +
+        '{.id = prev->nsproxy->pid_ns_for_children->ns.inum, ' +
+        '.slot = bpf_log2l(delta)}; dist.increment(key);')
+else:
+    section = ""
+    bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
+    bpf_text = bpf_text.replace('STORE',
+        'dist.increment(bpf_log2l(delta));')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+if not is_support_raw_tp:
+    b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
+    b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
+    b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
+
+print("Tracing run queue latency... Hit Ctrl-C to end.")
+
+# output
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    dist.print_log2_hist(label, section, section_print_fn=int)
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/runqlat_example.txt b/tools/runqlat_example.txt
new file mode 100644
index 0000000..857e516
--- /dev/null
+++ b/tools/runqlat_example.txt
@@ -0,0 +1,577 @@
+Demonstrations of runqlat, the Linux eBPF/bcc version.
+
+
+This program summarizes scheduler run queue latency as a histogram, showing
+how long tasks spent waiting their turn to run on-CPU.
+
+Here is a heavily loaded system:
+
+# ./runqlat 
+Tracing run queue latency... Hit Ctrl-C to end.
+^C
+     usecs               : count     distribution
+         0 -> 1          : 233      |***********                             |
+         2 -> 3          : 742      |************************************    |
+         4 -> 7          : 203      |**********                              |
+         8 -> 15         : 173      |********                                |
+        16 -> 31         : 24       |*                                       |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 30       |*                                       |
+       128 -> 255        : 6        |                                        |
+       256 -> 511        : 3        |                                        |
+       512 -> 1023       : 5        |                                        |
+      1024 -> 2047       : 27       |*                                       |
+      2048 -> 4095       : 30       |*                                       |
+      4096 -> 8191       : 20       |                                        |
+      8192 -> 16383      : 29       |*                                       |
+     16384 -> 32767      : 809      |****************************************|
+     32768 -> 65535      : 64       |***                                     |
+
+The distribution is bimodal, with one mode between 0 and 15 microseconds,
+and another between 16 and 65 milliseconds. These modes are visible as the
+spikes in the ASCII distribution (which is merely a visual representation
+of the "count" column). As an example of reading one line: 809 events fell
+into the 16384 to 32767 microsecond range (16 to 32 ms) while tracing.
+
+I would expect the two modes to be due to the workload: 16 hot CPU-bound threads,
+and many other mostly idle threads doing occasional work. I suspect the mostly
+idle threads will run with a higher priority when they wake up, and are
+the reason for the low latency mode. The high latency mode will be the
+CPU-bound threads. More analysis with this and other tools can confirm.
+
+
+A -m option can be used to show milliseconds instead, as well as an interval
+and a count. For example, showing three x five second summaries in milliseconds:
+
+# ./runqlat -m 5 3
+Tracing run queue latency... Hit Ctrl-C to end.
+
+     msecs               : count     distribution
+         0 -> 1          : 3818     |****************************************|
+         2 -> 3          : 39       |                                        |
+         4 -> 7          : 39       |                                        |
+         8 -> 15         : 62       |                                        |
+        16 -> 31         : 2214     |***********************                 |
+        32 -> 63         : 226      |**                                      |
+
+     msecs               : count     distribution
+         0 -> 1          : 3775     |****************************************|
+         2 -> 3          : 52       |                                        |
+         4 -> 7          : 37       |                                        |
+         8 -> 15         : 65       |                                        |
+        16 -> 31         : 2230     |***********************                 |
+        32 -> 63         : 212      |**                                      |
+
+     msecs               : count     distribution
+         0 -> 1          : 3816     |****************************************|
+         2 -> 3          : 49       |                                        |
+         4 -> 7          : 40       |                                        |
+         8 -> 15         : 53       |                                        |
+        16 -> 31         : 2228     |***********************                 |
+        32 -> 63         : 221      |**                                      |
+
+This shows a similar distribution across the three summaries.
+
+
+A -p option can be used to show one PID only, which is filtered in kernel for
+efficiency. For example, PID 4505, and one second summaries:
+
+# ./runqlat -mp 4505 1
+Tracing run queue latency... Hit Ctrl-C to end.
+
+     msecs               : count     distribution
+         0 -> 1          : 1        |*                                       |
+         2 -> 3          : 2        |***                                     |
+         4 -> 7          : 1        |*                                       |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 25       |****************************************|
+        32 -> 63         : 3        |****                                    |
+
+     msecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 2        |**                                      |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |*                                       |
+        16 -> 31         : 30       |****************************************|
+        32 -> 63         : 1        |*                                       |
+
+     msecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 28       |****************************************|
+        32 -> 63         : 2        |**                                      |
+
+     msecs               : count     distribution
+         0 -> 1          : 1        |*                                       |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 27       |****************************************|
+        32 -> 63         : 4        |*****                                   |
+[...]
+
+For comparison, here is pidstat(1) for that process:
+
+# pidstat -p 4505 1
+Linux 4.4.0-virtual (bgregg-xxxxxxxx) 	02/08/2016 	_x86_64_	(8 CPU)
+
+08:56:11 AM   UID       PID    %usr %system  %guest    %CPU   CPU  Command
+08:56:12 AM     0      4505    9.00    3.00    0.00   12.00     0  bash
+08:56:13 AM     0      4505    7.00    5.00    0.00   12.00     0  bash
+08:56:14 AM     0      4505   10.00    2.00    0.00   12.00     0  bash
+08:56:15 AM     0      4505   11.00    2.00    0.00   13.00     0  bash
+08:56:16 AM     0      4505    9.00    3.00    0.00   12.00     0  bash
+[...]
+
+This is a synthetic workload that is CPU bound. It's only spending 12% on-CPU
+each second because of high CPU demand on this server: the remaining time
+is spent waiting on a run queue, as visualized by runqlat.
+
+
+Here is the same system, but when it is CPU idle:
+
+# ./runqlat 5 1
+Tracing run queue latency... Hit Ctrl-C to end.
+
+     usecs               : count     distribution
+         0 -> 1          : 2250     |********************************        |
+         2 -> 3          : 2340     |**********************************      |
+         4 -> 7          : 2746     |****************************************|
+         8 -> 15         : 418      |******                                  |
+        16 -> 31         : 93       |*                                       |
+        32 -> 63         : 28       |                                        |
+        64 -> 127        : 119      |*                                       |
+       128 -> 255        : 9        |                                        |
+       256 -> 511        : 4        |                                        |
+       512 -> 1023       : 20       |                                        |
+      1024 -> 2047       : 22       |                                        |
+      2048 -> 4095       : 5        |                                        |
+      4096 -> 8191       : 2        |                                        |
+
+Back to a microsecond scale, this time there is little run queue latency past 1
+millisecond, as would be expected.
+
+
+Now 16 threads are performing heavy disk I/O:
+
+# ./runqlat 5 1
+Tracing run queue latency... Hit Ctrl-C to end.
+
+     usecs               : count     distribution
+         0 -> 1          : 204      |                                        |
+         2 -> 3          : 944      |*                                       |
+         4 -> 7          : 16315    |*********************                   |
+         8 -> 15         : 29897    |****************************************|
+        16 -> 31         : 1044     |*                                       |
+        32 -> 63         : 23       |                                        |
+        64 -> 127        : 128      |                                        |
+       128 -> 255        : 24       |                                        |
+       256 -> 511        : 5        |                                        |
+       512 -> 1023       : 13       |                                        |
+      1024 -> 2047       : 15       |                                        |
+      2048 -> 4095       : 13       |                                        |
+      4096 -> 8191       : 10       |                                        |
+
+The distribution hasn't changed too much. While the disks are 100% busy, there
+is still plenty of CPU headroom, and threads still don't spend much time
+waiting their turn.
+
+
+A -P option will print a distribution for each PID:
+
+# ./runqlat -P
+Tracing run queue latency... Hit Ctrl-C to end.
+^C
+
+pid = 0
+     usecs               : count     distribution
+         0 -> 1          : 351      |********************************        |
+         2 -> 3          : 96       |********                                |
+         4 -> 7          : 437      |****************************************|
+         8 -> 15         : 12       |*                                       |
+        16 -> 31         : 10       |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 16       |*                                       |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 0        |                                        |
+      8192 -> 16383      : 1        |                                        |
+
+pid = 12929
+     usecs               : count     distribution
+         0 -> 1          : 1        |****************************************|
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |****************************************|
+
+pid = 12930
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 1        |****************************************|
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 1        |****************************************|
+
+pid = 12931
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |********************                    |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 2        |****************************************|
+
+pid = 12932
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 1        |****************************************|
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |****************************************|
+
+pid = 7
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 426      |*************************************   |
+         4 -> 7          : 457      |****************************************|
+         8 -> 15         : 16       |*                                       |
+
+pid = 9
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 425      |****************************************|
+         8 -> 15         : 16       |*                                       |
+
+pid = 11
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 10       |****************************************|
+
+pid = 14
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 8        |****************************************|
+         4 -> 7          : 2        |**********                              |
+
+pid = 18
+     usecs               : count     distribution
+         0 -> 1          : 414      |****************************************|
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 20       |*                                       |
+         8 -> 15         : 8        |                                        |
+
+pid = 12928
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |****************************************|
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 1        |****************************************|
+
+pid = 1867
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 15       |****************************************|
+        16 -> 31         : 1        |**                                      |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 4        |**********                              |
+
+pid = 1871
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 2        |****************************************|
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |********************                    |
+
+pid = 1876
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |****************************************|
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 1        |****************************************|
+
+pid = 1878
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 3        |****************************************|
+
+pid = 1880
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 3        |****************************************|
+
+pid = 9307
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |****************************************|
+
+pid = 1886
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |********************                    |
+         8 -> 15         : 2        |****************************************|
+
+pid = 1888
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 3        |****************************************|
+
+pid = 3297
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |****************************************|
+
+pid = 1892
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 1        |********************                    |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 2        |****************************************|
+
+pid = 7024
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 4        |****************************************|
+
+pid = 16468
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 3        |****************************************|
+
+pid = 12922
+     usecs               : count     distribution
+         0 -> 1          : 1        |****************************************|
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |****************************************|
+        16 -> 31         : 1        |****************************************|
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 1        |****************************************|
+
+pid = 12923
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |********************                    |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 2        |****************************************|
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |********************                    |
+      1024 -> 2047       : 1        |********************                    |
+
+pid = 12924
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 2        |********************                    |
+         8 -> 15         : 4        |****************************************|
+        16 -> 31         : 1        |**********                              |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 1        |**********                              |
+
+pid = 12925
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 1        |****************************************|
+
+pid = 12926
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 1        |****************************************|
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 1        |****************************************|
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |****************************************|
+
+pid = 12927
+     usecs               : count     distribution
+         0 -> 1          : 1        |****************************************|
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |****************************************|
+
+
+A -L option will print a distribution for each TID:
+
+# ./runqlat -L
+Tracing run queue latency... Hit Ctrl-C to end.
+^C
+
+tid = 0
+     usecs               : count     distribution
+         0 -> 1          : 593      |****************************            |
+         2 -> 3          : 829      |****************************************|
+         4 -> 7          : 300      |**************                          |
+         8 -> 15         : 321      |***************                         |
+        16 -> 31         : 132      |******                                  |
+        32 -> 63         : 58       |**                                      |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 13       |                                        |
+
+tid = 7
+     usecs               : count     distribution
+         0 -> 1          : 8        |********                                |
+         2 -> 3          : 19       |********************                    |
+         4 -> 7          : 37       |****************************************|
+[...]
+
+
+And a --pidnss option (short for PID namespaces) will print for each PID
+namespace, for analyzing container performance:
+
+# ./runqlat --pidnss -m
+Tracing run queue latency... Hit Ctrl-C to end.
+^C
+
+pidns = 4026532870
+     msecs               : count     distribution
+         0 -> 1          : 40       |****************************************|
+         2 -> 3          : 1        |*                                       |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 2        |**                                      |
+        64 -> 127        : 5        |*****                                   |
+
+pidns = 4026532809
+     msecs               : count     distribution
+         0 -> 1          : 67       |****************************************|
+
+pidns = 4026532748
+     msecs               : count     distribution
+         0 -> 1          : 63       |****************************************|
+
+pidns = 4026532687
+     msecs               : count     distribution
+         0 -> 1          : 7        |****************************************|
+
+pidns = 4026532626
+     msecs               : count     distribution
+         0 -> 1          : 45       |****************************************|
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 3        |**                                      |
+
+pidns = 4026531836
+     msecs               : count     distribution
+         0 -> 1          : 314      |****************************************|
+         2 -> 3          : 1        |                                        |
+         4 -> 7          : 11       |*                                       |
+         8 -> 15         : 28       |***                                     |
+        16 -> 31         : 137      |*****************                       |
+        32 -> 63         : 86       |**********                              |
+        64 -> 127        : 1        |                                        |
+
+pidns = 4026532382
+     msecs               : count     distribution
+         0 -> 1          : 285      |****************************************|
+         2 -> 3          : 5        |                                        |
+         4 -> 7          : 16       |**                                      |
+         8 -> 15         : 9        |*                                       |
+        16 -> 31         : 69       |*********                               |
+        32 -> 63         : 25       |***                                     |
+
+Many of these distributions have two modes: the second, in this case, is
+caused by capping CPU usage via CPU shares.
+
+
+USAGE message:
+
+# ./runqlat -h
+usage: runqlat.py [-h] [-T] [-m] [-P] [--pidnss] [-L] [-p PID]
+                  [interval] [count]
+
+Summarize run queue (scheduler) latency as a histogram
+
+positional arguments:
+  interval            output interval, in seconds
+  count               number of outputs
+
+optional arguments:
+  -h, --help          show this help message and exit
+  -T, --timestamp     include timestamp on output
+  -m, --milliseconds  millisecond histogram
+  -P, --pids          print a histogram per process ID
+  --pidnss            print a histogram per PID namespace
+  -L, --tids          print a histogram per thread ID
+  -p PID, --pid PID   trace this PID only
+
+examples:
+    ./runqlat            # summarize run queue latency as a histogram
+    ./runqlat 1 10       # print 1 second summaries, 10 times
+    ./runqlat -mT 1      # 1s summaries, milliseconds, and timestamps
+    ./runqlat -P         # show each PID separately
+    ./runqlat -p 185     # trace PID 185 only
diff --git a/tools/runqlen.py b/tools/runqlen.py
new file mode 100755
index 0000000..b56a591
--- /dev/null
+++ b/tools/runqlen.py
@@ -0,0 +1,255 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# runqlen    Summarize scheduler run queue length as a histogram.
+#            For Linux, uses BCC, eBPF.
+#
+# This counts the length of the run queue, excluding the currently running
+# thread, and shows it as a histogram.
+#
+# Also answers run queue occupancy.
+#
+# USAGE: runqlen [-h] [-T] [-O] [-C] [interval] [count]
+#
+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
+# a version of this tool that may work on Linux 4.6 - 4.8.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Dec-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF, PerfType, PerfSWConfig
+from time import sleep, strftime
+from tempfile import NamedTemporaryFile
+from os import open, close, dup, unlink, O_WRONLY
+import argparse
+
+# arguments
+examples = """examples:
+    ./runqlen            # summarize run queue length as a histogram
+    ./runqlen 1 10       # print 1 second summaries, 10 times
+    ./runqlen -T 1       # 1s summaries and timestamps
+    ./runqlen -O         # report run queue occupancy
+    ./runqlen -C         # show each CPU separately
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize scheduler run queue length as a histogram",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-O", "--runqocc", action="store_true",
+    help="report run queue occupancy")
+parser.add_argument("-C", "--cpus", action="store_true",
+    help="print output for each CPU separately")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+countdown = int(args.count)
+debug = 0
+frequency = 99
+
+# Linux 4.15 introduced a new field runnable_weight
+# in linux_src:kernel/sched/sched.h as
+#     struct cfs_rq {
+#         struct load_weight load;
+#         unsigned long runnable_weight;
+#         unsigned int nr_running, h_nr_running;
+#         ......
+#     }
+# and this tool requires to access nr_running to get
+# runqueue len information.
+#
+# The commit which introduces cfs_rq->runnable_weight
+# field also introduces the field sched_entity->runnable_weight
+# where sched_entity is defined in linux_src:include/linux/sched.h.
+#
+# To cope with pre-4.15 and 4.15/post-4.15 releases,
+# we run a simple BPF program to detect whether
+# field sched_entity->runnable_weight exists. The existence of
+# this field should infer the existence of cfs_rq->runnable_weight.
+#
+# This will need maintenance as the relationship between these
+# two fields may change in the future.
+#
+def check_runnable_weight_field():
+    # Define the bpf program for checking purpose
+    bpf_check_text = """
+#include <linux/sched.h>
+unsigned long dummy(struct sched_entity *entity)
+{
+    return entity->runnable_weight;
+}
+"""
+
+    # Get a temporary file name
+    tmp_file = NamedTemporaryFile(delete=False)
+    tmp_file.close();
+
+    # Duplicate and close stderr (fd = 2)
+    old_stderr = dup(2)
+    close(2)
+
+    # Open a new file, should get fd number 2
+    # This will avoid printing llvm errors on the screen
+    fd = open(tmp_file.name, O_WRONLY)
+    try:
+        t = BPF(text=bpf_check_text)
+        success_compile = True
+    except:
+        success_compile = False
+
+    # Release the fd 2, and next dup should restore old stderr
+    close(fd)
+    dup(old_stderr)
+    close(old_stderr)
+
+    # remove the temporary file and return
+    unlink(tmp_file.name)
+    return success_compile
+
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+// Declare enough of cfs_rq to find nr_running, since we can't #import the
+// header. This will need maintenance. It is from kernel/sched/sched.h:
+struct cfs_rq_partial {
+    struct load_weight load;
+    RUNNABLE_WEIGHT_FIELD
+    unsigned int nr_running, h_nr_running;
+};
+
+typedef struct cpu_key {
+    int cpu;
+    unsigned int slot;
+} cpu_key_t;
+STORAGE
+
+int do_perf_event()
+{
+    unsigned int len = 0;
+    pid_t pid = 0;
+    struct task_struct *task = NULL;
+    struct cfs_rq_partial *my_q = NULL;
+
+    // Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
+    // unstable interface and may need maintenance. Perhaps a future version
+    // of BPF will support task_rq(p) or something similar as a more reliable
+    // interface.
+    task = (struct task_struct *)bpf_get_current_task();
+    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
+    len = my_q->nr_running;
+
+    // Calculate run queue length by subtracting the currently running task,
+    // if present. len 0 == idle, len 1 == one running task.
+    if (len > 0)
+        len--;
+
+    STORE
+
+    return 0;
+}
+"""
+
+# code substitutions
+if args.cpus:
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, cpu_key_t);')
+    bpf_text = bpf_text.replace('STORE', 'cpu_key_t key = {.slot = len}; ' +
+        'key.cpu = bpf_get_smp_processor_id(); ' +
+        'dist.increment(key);')
+else:
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, unsigned int);')
+    bpf_text = bpf_text.replace('STORE', 'dist.increment(len);')
+
+if check_runnable_weight_field():
+    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', 'unsigned long runnable_weight;')
+else:
+    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '')
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF & perf_events
+b = BPF(text=bpf_text)
+b.attach_perf_event(ev_type=PerfType.SOFTWARE,
+    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event",
+    sample_period=0, sample_freq=frequency)
+
+print("Sampling run queue length... Hit Ctrl-C to end.")
+
+# output
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    if args.runqocc:
+        if args.cpus:
+            # run queue occupancy, per-CPU summary
+            idle = {}
+            queued = {}
+            cpumax = 0
+            for k, v in dist.items():
+                if k.cpu > cpumax:
+                    cpumax = k.cpu
+            for c in range(0, cpumax + 1):
+                idle[c] = 0
+                queued[c] = 0
+            for k, v in dist.items():
+                if k.slot == 0:
+                    idle[k.cpu] += v.value
+                else:
+                    queued[k.cpu] += v.value
+            for c in range(0, cpumax + 1):
+                samples = idle[c] + queued[c]
+                if samples:
+                    runqocc = float(queued[c]) / samples
+                else:
+                    runqocc = 0
+                print("runqocc, CPU %-3d %6.2f%%" % (c, 100 * runqocc))
+
+        else:
+            # run queue occupancy, system-wide summary
+            idle = 0
+            queued = 0
+            for k, v in dist.items():
+                if k.value == 0:
+                    idle += v.value
+                else:
+                    queued += v.value
+            samples = idle + queued
+            if samples:
+                runqocc = float(queued) / samples
+            else:
+                runqocc = 0
+            print("runqocc: %0.2f%%" % (100 * runqocc))
+
+    else:
+        # run queue length histograms
+        dist.print_linear_hist("runqlen", "cpu")
+
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/runqlen_example.txt b/tools/runqlen_example.txt
new file mode 100644
index 0000000..4c10ed4
--- /dev/null
+++ b/tools/runqlen_example.txt
@@ -0,0 +1,295 @@
+Demonstrations of runqlen, the Linux eBPF/bcc version.
+
+
+This program summarizes scheduler queue length as a histogram, and can also
+show run queue occupancy. It works by sampling the run queue length on all
+CPUs at 99 Hertz.
+
+As an example, here is an idle system:
+
+# ./runqlen.py
+Sampling run queue length... Hit Ctrl-C to end.
+^C
+     runqlen       : count     distribution
+        0          : 1776     |****************************************|
+
+This shows a zero run queue length each time it was sampled.
+
+And now a heavily loaded system:
+
+# ./runqlen.py
+Sampling run queue length... Hit Ctrl-C to end.
+^C
+     runqlen       : count     distribution
+        0          : 1068     |****************************************|
+        1          : 642      |************************                |
+        2          : 369      |*************                           |
+        3          : 183      |******                                  |
+        4          : 104      |***                                     |
+        5          : 42       |*                                       |
+        6          : 13       |                                        |
+        7          : 2        |                                        |
+        8          : 1        |                                        |
+
+Now there are often threads queued, with one sample reaching a queue length
+of 8. This will cause run queue latency, which can be measured by the bcc
+runqlat tool.
+
+
+Here's an example of an issue that runqlen can identify. Starting with the
+system-wide summary:
+
+# ./runqlen.py 
+Sampling run queue length... Hit Ctrl-C to end.
+^C
+     runqlen       : count     distribution
+        0          : 1209     |****************************************|
+        1          : 372      |************                            |
+        2          : 73       |**                                      |
+        3          : 3        |                                        |
+        4          : 1        |                                        |
+        5          : 0        |                                        |
+        6          : 0        |                                        |
+        7          : 237      |*******                                 |
+
+This shows there is often a run queue length of 7. Now using the -C option to
+see per-CPU histograms:
+
+# ./runqlen.py -C
+Sampling run queue length... Hit Ctrl-C to end.
+^C
+
+cpu = 0
+     runqlen       : count     distribution
+        0          : 257      |****************************************|
+        1          : 64       |*********                               |
+        2          : 5        |                                        |
+        3          : 0        |                                        |
+        4          : 0        |                                        |
+        5          : 0        |                                        |
+        6          : 1        |                                        |
+
+cpu = 1
+     runqlen       : count     distribution
+        0          : 226      |****************************************|
+        1          : 90       |***************                         |
+        2          : 11       |*                                       |
+
+cpu = 2
+     runqlen       : count     distribution
+        0          : 264      |****************************************|
+        1          : 52       |*******                                 |
+        2          : 8        |*                                       |
+        3          : 1        |                                        |
+        4          : 0        |                                        |
+        5          : 0        |                                        |
+        6          : 1        |                                        |
+        7          : 0        |                                        |
+        8          : 1        |                                        |
+
+cpu = 3
+     runqlen       : count     distribution
+        0          : 0        |                                        |
+        1          : 0        |                                        |
+        2          : 0        |                                        |
+        3          : 0        |                                        |
+        4          : 0        |                                        |
+        5          : 0        |                                        |
+        6          : 0        |                                        |
+        7          : 327      |****************************************|
+
+cpu = 4
+     runqlen       : count     distribution
+        0          : 255      |****************************************|
+        1          : 63       |*********                               |
+        2          : 9        |*                                       |
+
+cpu = 5
+     runqlen       : count     distribution
+        0          : 244      |****************************************|
+        1          : 78       |************                            |
+        2          : 3        |                                        |
+        3          : 2        |                                        |
+
+cpu = 6
+     runqlen       : count     distribution
+        0          : 253      |****************************************|
+        1          : 66       |**********                              |
+        2          : 6        |                                        |
+        3          : 1        |                                        |
+        4          : 1        |                                        |
+
+cpu = 7
+     runqlen       : count     distribution
+        0          : 243      |****************************************|
+        1          : 74       |************                            |
+        2          : 6        |                                        |
+        3          : 1        |                                        |
+        4          : 0        |                                        |
+        5          : 1        |                                        |
+        6          : 2        |                                        |
+
+The run queue length of 7 is isolated to CPU 3. It was caused by CPU binding
+(taskset). This can sometimes happen by applications that try to auto-bind
+to CPUs, leaving other CPUs idle while work is queued.
+
+
+runqlen accepts an interval and a count. For example, with -T for timestamps:
+
+# ./runqlen.py -T 1 5
+Sampling run queue length... Hit Ctrl-C to end.
+
+19:51:34
+     runqlen       : count     distribution
+        0          : 635      |****************************************|
+        1          : 142      |********                                |
+        2          : 13       |                                        |
+        3          : 0        |                                        |
+        4          : 1        |                                        |
+
+19:51:35
+     runqlen       : count     distribution
+        0          : 640      |****************************************|
+        1          : 136      |********                                |
+        2          : 13       |                                        |
+        3          : 1        |                                        |
+        4          : 0        |                                        |
+        5          : 0        |                                        |
+        6          : 0        |                                        |
+        7          : 0        |                                        |
+        8          : 0        |                                        |
+        9          : 0        |                                        |
+        10         : 1        |                                        |
+
+19:51:36
+     runqlen       : count     distribution
+        0          : 603      |****************************************|
+        1          : 170      |***********                             |
+        2          : 16       |*                                       |
+        3          : 1        |                                        |
+        4          : 0        |                                        |
+        5          : 0        |                                        |
+        6          : 0        |                                        |
+        7          : 0        |                                        |
+        8          : 0        |                                        |
+        9          : 1        |                                        |
+
+19:51:37
+     runqlen       : count     distribution
+        0          : 617      |****************************************|
+        1          : 154      |*********                               |
+        2          : 20       |*                                       |
+        3          : 0        |                                        |
+        4          : 0        |                                        |
+        5          : 0        |                                        |
+        6          : 0        |                                        |
+        7          : 0        |                                        |
+        8          : 0        |                                        |
+        9          : 0        |                                        |
+        10         : 0        |                                        |
+        11         : 1        |                                        |
+
+19:51:38
+     runqlen       : count     distribution
+        0          : 603      |****************************************|
+        1          : 161      |**********                              |
+        2          : 24       |*                                       |
+        3          : 4        |                                        |
+
+The spikes in run queue length of 11 are likely threads waking up at the same
+time (a thundering herd), and then are scheduled and complete their execution
+quickly.
+
+
+The -O option prints run queue occupancy: the percentage of time that there
+was work queued waiting its turn. Eg:
+
+# ./runqlen.py -OT 1 
+Sampling run queue length... Hit Ctrl-C to end.
+
+19:54:53
+runqocc: 41.09%
+
+19:54:54
+runqocc: 41.85%
+
+19:54:55
+runqocc: 41.47%
+
+19:54:56
+runqocc: 42.35%
+
+19:54:57
+runqocc: 40.83%
+[...]
+
+This can also be examined per-CPU:
+
+# ./runqlen.py -COT 1 
+Sampling run queue length... Hit Ctrl-C to end.
+
+19:55:03
+runqocc, CPU 0    32.32%
+runqocc, CPU 1    26.26%
+runqocc, CPU 2    38.38%
+runqocc, CPU 3   100.00%
+runqocc, CPU 4    26.26%
+runqocc, CPU 5    32.32%
+runqocc, CPU 6    39.39%
+runqocc, CPU 7    46.46%
+
+19:55:04
+runqocc, CPU 0    35.00%
+runqocc, CPU 1    32.32%
+runqocc, CPU 2    37.00%
+runqocc, CPU 3   100.00%
+runqocc, CPU 4    43.43%
+runqocc, CPU 5    31.31%
+runqocc, CPU 6    28.00%
+runqocc, CPU 7    31.31%
+
+19:55:05
+runqocc, CPU 0    43.43%
+runqocc, CPU 1    32.32%
+runqocc, CPU 2    45.45%
+runqocc, CPU 3   100.00%
+runqocc, CPU 4    29.29%
+runqocc, CPU 5    36.36%
+runqocc, CPU 6    36.36%
+runqocc, CPU 7    30.30%
+
+19:55:06
+runqocc, CPU 0    40.00%
+runqocc, CPU 1    38.00%
+runqocc, CPU 2    31.31%
+runqocc, CPU 3   100.00%
+runqocc, CPU 4    31.31%
+runqocc, CPU 5    28.28%
+runqocc, CPU 6    31.00%
+runqocc, CPU 7    29.29%
+[...]
+
+
+USAGE message:
+
+# ./runqlen -h
+usage: runqlen [-h] [-T] [-O] [-C] [interval] [count]
+
+Summarize scheduler run queue length as a histogram
+
+positional arguments:
+  interval         output interval, in seconds
+  count            number of outputs
+
+optional arguments:
+  -h, --help       show this help message and exit
+  -T, --timestamp  include timestamp on output
+  -O, --runqocc    report run queue occupancy
+  -C, --cpus       print output for each CPU separately
+
+examples:
+    ./runqlen            # summarize run queue length as a histogram
+    ./runqlen 1 10       # print 1 second summaries, 10 times
+    ./runqlen -T 1       # 1s summaries and timestamps
+    ./runqlen -O         # report run queue occupancy
+    ./runqlen -C         # show each CPU separately
diff --git a/tools/runqslower.py b/tools/runqslower.py
new file mode 100755
index 0000000..7a1869c
--- /dev/null
+++ b/tools/runqslower.py
@@ -0,0 +1,257 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# runqslower    Trace long process scheduling delays.
+#               For Linux, uses BCC, eBPF.
+#
+# This script traces high scheduling delays between tasks being
+# ready to run and them running on CPU after that.
+#
+# USAGE: runqslower [-p PID] [min_us]
+#
+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
+#
+# This measures the time a task spends waiting on a run queue for a turn
+# on-CPU, and shows this time as individual events. This time should be small,
+# but a task may need to wait its turn due to CPU load.
+#
+# This measures two types of run queue latency:
+# 1. The time from a task being enqueued on a run queue to its context switch
+#    and execution. This traces ttwu_do_wakeup(), wake_up_new_task() ->
+#    finish_task_switch() with either raw tracepoints (if supported) or kprobes
+#    and instruments the run queue latency after a voluntary context switch.
+# 2. The time from when a task was involuntary context switched and still
+#    in the runnable state, to when it next executed. This is instrumented
+#    from finish_task_switch() alone.
+#
+# Copyright 2016 Cloudflare, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 02-May-2018   Ivan Babrou   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./runqslower         # trace run queue latency higher than 10000 us (default)
+    ./runqslower 1000    # trace run queue latency higher than 1000 us
+    ./runqslower -p 123  # trace pid 123 only
+"""
+parser = argparse.ArgumentParser(
+    description="Trace high run queue latency",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="pid",
+    help="trace this PID only")
+parser.add_argument("min_us", nargs="?", default='10000',
+    help="minimum run queue latency to trace, in us (default 10000)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+min_us = int(args.min_us)
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+
+BPF_HASH(start, u32);
+
+struct rq;
+
+struct data_t {
+    u32 pid;
+    char task[TASK_COMM_LEN];
+    u64 delta_us;
+};
+
+BPF_PERF_OUTPUT(events);
+
+// record enqueue timestamp
+static int trace_enqueue(u32 tgid, u32 pid)
+{
+    if (FILTER_PID || pid == 0)
+        return 0;
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+"""
+
+bpf_text_kprobe = """
+int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
+{
+    return trace_enqueue(p->tgid, p->pid);
+}
+
+int trace_ttwu_do_wakeup(struct pt_regs *ctx, struct rq *rq, struct task_struct *p,
+    int wake_flags)
+{
+    return trace_enqueue(p->tgid, p->pid);
+}
+
+// calculate latency
+int trace_run(struct pt_regs *ctx, struct task_struct *prev)
+{
+    u32 pid, tgid;
+
+    // ivcsw: treat like an enqueue event and store timestamp
+    if (prev->state == TASK_RUNNING) {
+        tgid = prev->tgid;
+        pid = prev->pid;
+        if (!(FILTER_PID || pid == 0)) {
+            u64 ts = bpf_ktime_get_ns();
+            start.update(&pid, &ts);
+        }
+    }
+
+    tgid = bpf_get_current_pid_tgid() >> 32;
+    pid = bpf_get_current_pid_tgid();
+
+    u64 *tsp, delta_us;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed enqueue
+    }
+    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+
+    if (FILTER_US)
+        return 0;
+
+    struct data_t data = {};
+    data.pid = pid;
+    data.delta_us = delta_us;
+    bpf_get_current_comm(&data.task, sizeof(data.task));
+
+    // output
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    start.delete(&pid);
+    return 0;
+}
+"""
+
+bpf_text_raw_tp = """
+RAW_TRACEPOINT_PROBE(sched_wakeup)
+{
+    // TP_PROTO(struct task_struct *p)
+    struct task_struct *p = (struct task_struct *)ctx->args[0];
+    return trace_enqueue(p->tgid, p->pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_wakeup_new)
+{
+    // TP_PROTO(struct task_struct *p)
+    struct task_struct *p = (struct task_struct *)ctx->args[0];
+    u32 tgid, pid;
+
+    bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
+    bpf_probe_read(&pid, sizeof(pid), &p->pid);
+    return trace_enqueue(tgid, pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+    // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
+    struct task_struct *prev = (struct task_struct *)ctx->args[1];
+    struct task_struct *next= (struct task_struct *)ctx->args[2];
+    u32 pid, tgid;
+    long state;
+
+    // ivcsw: treat like an enqueue event and store timestamp
+    bpf_probe_read(&state, sizeof(long), &prev->state);
+    if (state == TASK_RUNNING) {
+        bpf_probe_read(&tgid, sizeof(prev->tgid), &prev->tgid);
+        bpf_probe_read(&pid, sizeof(prev->pid), &prev->pid);
+        if (!(FILTER_PID || pid == 0)) {
+            u64 ts = bpf_ktime_get_ns();
+            start.update(&pid, &ts);
+        }
+    }
+
+    bpf_probe_read(&tgid, sizeof(next->tgid), &next->tgid);
+    bpf_probe_read(&pid, sizeof(next->pid), &next->pid);
+
+    u64 *tsp, delta_us;
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed enqueue
+    }
+    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+
+    if (FILTER_US)
+        return 0;
+
+    struct data_t data = {};
+    data.pid = pid;
+    data.delta_us = delta_us;
+    bpf_get_current_comm(&data.task, sizeof(data.task));
+
+    // output
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    start.delete(&pid);
+    return 0;
+}
+"""
+
+is_support_raw_tp = BPF.support_raw_tracepoint()
+if is_support_raw_tp:
+    bpf_text += bpf_text_raw_tp
+else:
+    bpf_text += bpf_text_kprobe
+
+# code substitutions
+if min_us == 0:
+    bpf_text = bpf_text.replace('FILTER_US', '0')
+else:
+    bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_us))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# kernel->user event data: struct data_t
+DNAME_INLINE_LEN = 32   # linux/dcache.h
+TASK_COMM_LEN = 16      # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_uint),
+        ("task", ct.c_char * TASK_COMM_LEN),
+        ("delta_us", ct.c_ulonglong),
+    ]
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%-8s %-16s %-6s %14s" % (strftime("%H:%M:%S"), event.task, event.pid, event.delta_us))
+
+# load BPF program
+b = BPF(text=bpf_text)
+if not is_support_raw_tp:
+    b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
+    b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
+    b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
+
+print("Tracing run queue latency higher than %d us" % min_us)
+print("%-8s %-16s %-6s %14s" % ("TIME", "COMM", "PID", "LAT(us)"))
+
+# read events
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/runqslower_example.txt b/tools/runqslower_example.txt
new file mode 100644
index 0000000..64b604e
--- /dev/null
+++ b/tools/runqslower_example.txt
@@ -0,0 +1,49 @@
+Demonstrations of runqslower, the Linux eBPF/bcc version.
+
+
+runqslower shows high latency scheduling times between tasks being
+ready to run and them running on CPU after that. For example:
+
+# runqslower
+Tracing run queue latency higher than 10000 us
+TIME     COMM             PID           LAT(us)
+04:16:32 cc1              12924           12739
+04:16:32 sh               13640           12118
+04:16:32 make             13639           12730
+04:16:32 bash             13655           12047
+04:16:32 bash             13657           12744
+04:16:32 bash             13656           12880
+04:16:32 sh               13660           10846
+04:16:32 gcc              13663           12681
+04:16:32 make             13668           10814
+04:16:32 make             13670           12988
+04:16:32 gcc              13677           11770
+04:16:32 gcc              13678           23519
+04:16:32 as               12999           20541
+[...]
+
+This shows various processes waiting for available CPU during a Linux kernel
+build. By default the output contains delays for more than 10ms.
+
+These delays can be analyzed in depth with "perf sched" tool, see:
+
+* http://www.brendangregg.com/blog/2017-03-16/perf-sched.html
+
+USAGE message:
+
+# ./runqslower -h
+usage: runqslower.py [-h] [-p PID] [min_us]
+
+Trace high run queue latency
+
+positional arguments:
+  min_us             minimum run queue latency to trace, in us (default 10000)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./runqslower         # trace run queue latency higher than 10000 us (default)
+    ./runqslower 1000    # trace run queue latency higher than 1000 us
+    ./runqslower -p 123  # trace pid 123 only
diff --git a/tools/slabratetop.py b/tools/slabratetop.py
new file mode 100755
index 0000000..101c585
--- /dev/null
+++ b/tools/slabratetop.py
@@ -0,0 +1,143 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# slabratetop  Summarize kmem_cache_alloc() calls.
+#              For Linux, uses BCC, eBPF.
+#
+# USAGE: slabratetop [-h] [-C] [-r MAXROWS] [interval] [count]
+#
+# This uses in-kernel BPF maps to store cache summaries for efficiency.
+#
+# SEE ALSO: slabtop(1), which shows the cache volumes.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Oct-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from bcc.utils import printb
+from time import sleep, strftime
+import argparse
+import signal
+from subprocess import call
+
+# arguments
+examples = """examples:
+    ./slabratetop            # kmem_cache_alloc() top, 1 second refresh
+    ./slabratetop -C         # don't clear the screen
+    ./slabratetop 5          # 5 second summaries
+    ./slabratetop 5 10       # 5 second summaries, 10 times only
+"""
+parser = argparse.ArgumentParser(
+    description="Kernel SLAB/SLUB memory cache allocation rate top",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("-r", "--maxrows", default=20,
+    help="maximum rows to print, default 20")
+parser.add_argument("interval", nargs="?", default=1,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+interval = int(args.interval)
+countdown = int(args.count)
+maxrows = int(args.maxrows)
+clear = not int(args.noclear)
+debug = 0
+
+# linux stats
+loadavg = "/proc/loadavg"
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SLUB
+#include <linux/slub_def.h>
+#else
+#include <linux/slab_def.h>
+#endif
+
+#define CACHE_NAME_SIZE 32
+
+// the key for the output summary
+struct info_t {
+    char name[CACHE_NAME_SIZE];
+};
+
+// the value of the output summary
+struct val_t {
+    u64 count;
+    u64 size;
+};
+
+BPF_HASH(counts, struct info_t, struct val_t);
+
+int kprobe__kmem_cache_alloc(struct pt_regs *ctx, struct kmem_cache *cachep)
+{
+    struct info_t info = {};
+    const char *name = cachep->name;
+    bpf_probe_read(&info.name, sizeof(info.name), name);
+
+    struct val_t *valp, zero = {};
+    valp = counts.lookup_or_init(&info, &zero);
+    valp->count++;
+    valp->size += cachep->size;
+
+    return 0;
+}
+"""
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
+
+# output
+exiting = 0
+while 1:
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    # header
+    if clear:
+        call("clear")
+    else:
+        print()
+    with open(loadavg) as stats:
+        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+    print("%-32s %6s %10s" % ("CACHE", "ALLOCS", "BYTES"))
+
+    # by-TID output
+    counts = b.get_table("counts")
+    line = 0
+    for k, v in reversed(sorted(counts.items(),
+                                key=lambda counts: counts[1].size)):
+        printb(b"%-32s %6d %10d" % (k.name, v.count, v.size))
+
+        line += 1
+        if line >= maxrows:
+            break
+    counts.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        print("Detaching...")
+        exit()
diff --git a/tools/slabratetop_example.txt b/tools/slabratetop_example.txt
new file mode 100644
index 0000000..7aa8c05
--- /dev/null
+++ b/tools/slabratetop_example.txt
@@ -0,0 +1,134 @@
+Demonstrations of slabratetop, the Linux eBPF/bcc version.
+
+
+slabratetop shows the rate of allocations and total bytes from the kernel
+memory allocation caches (SLAB or SLUB), in a top-like display that refreshes.
+For example:
+
+# ./slabratetop
+<screen clears>
+07:01:35 loadavg: 0.38 0.21 0.12 1/342 13297
+
+CACHE                            ALLOCS      BYTES
+kmalloc-4096                       3554   14557184
+kmalloc-256                        2382     609536
+cred_jar                           2568     493056
+anon_vma_chain                     2007     128448
+anon_vma                            972      77760
+sighand_cache                        24      50688
+mm_struct                            49      50176
+RAW                                  52      49920
+proc_inode_cache                     59      38232
+signal_cache                         24      26112
+dentry                              135      25920
+sock_inode_cache                     29      18560
+files_cache                          24      16896
+inode_cache                          13       7696
+TCP                                   2       3840
+pid                                  24       3072
+sigqueue                             17       2720
+ext4_inode_cache                      2       2160
+buffer_head                          16       1664
+xfs_trans                             5       1160
+
+By default the screen refreshes every one second, and only the top 20 caches
+are shown. These can be tuned with options: see USAGE (-h).
+
+The output above showed that the kmalloc-4096 cache allocated the most, about
+14 Mbytes during this interval. This is a generic cache; other caches have
+more meaningful names ("dentry", "TCP", "pid", etc).
+
+slabtop(1) is a similar tool that shows the current static volume and usage
+of the caches. slabratetop shows the active call rates and total size of the
+allocations.
+
+
+Since "kmalloc-4096" isn't very descriptive, I'm interested in seeing the
+kernel stacks that led to this allocation. In the future (maybe by now) the
+bcc trace tool could do this. As I'm writing this, it can't, so I'll use my
+older ftrace-based kprobe tool as a workaround. This is from my perf-tools
+collection: https://github.com/brendangregg/perf-tools.
+
+# ./perf-tools/bin/kprobe -s 'p:kmem_cache_alloc name=+0(+96(%di)):string' 'name == "kmalloc-4096"' | head -100
+Tracing kprobe kmem_cache_alloc. Ctrl-C to end.
+          kprobe-3892  [002] d... 7888274.478331: kmem_cache_alloc: (kmem_cache_alloc+0x0/0x1b0) name="kmalloc-4096"
+          kprobe-3892  [002] d... 7888274.478333: <stack trace>
+ => kmem_cache_alloc
+ => user_path_at_empty
+ => vfs_fstatat
+ => SYSC_newstat
+ => SyS_newstat
+ => entry_SYSCALL_64_fastpath
+          kprobe-3892  [002] d... 7888274.478340: kmem_cache_alloc: (kmem_cache_alloc+0x0/0x1b0) name="kmalloc-4096"
+          kprobe-3892  [002] d... 7888274.478341: <stack trace>
+ => kmem_cache_alloc
+ => user_path_at_empty
+ => vfs_fstatat
+ => SYSC_newstat
+ => SyS_newstat
+ => entry_SYSCALL_64_fastpath
+          kprobe-3892  [002] d... 7888274.478345: kmem_cache_alloc: (kmem_cache_alloc+0x0/0x1b0) name="kmalloc-4096"
+          kprobe-3892  [002] d... 7888274.478346: <stack trace>
+ => kmem_cache_alloc
+ => user_path_at_empty
+ => vfs_fstatat
+ => SYSC_newstat
+ => SyS_newstat
+ => entry_SYSCALL_64_fastpath
+          kprobe-3892  [002] d... 7888274.478350: kmem_cache_alloc: (kmem_cache_alloc+0x0/0x1b0) name="kmalloc-4096"
+          kprobe-3892  [002] d... 7888274.478351: <stack trace>
+ => kmem_cache_alloc
+ => user_path_at_empty
+ => vfs_fstatat
+ => SYSC_newstat
+ => SyS_newstat
+ => entry_SYSCALL_64_fastpath
+          kprobe-3892  [002] d... 7888274.478355: kmem_cache_alloc: (kmem_cache_alloc+0x0/0x1b0) name="kmalloc-4096"
+          kprobe-3892  [002] d... 7888274.478355: <stack trace>
+ => kmem_cache_alloc
+ => user_path_at_empty
+ => vfs_fstatat
+ => SYSC_newstat
+ => SyS_newstat
+ => entry_SYSCALL_64_fastpath
+          kprobe-3892  [002] d... 7888274.478359: kmem_cache_alloc: (kmem_cache_alloc+0x0/0x1b0) name="kmalloc-4096"
+          kprobe-3892  [002] d... 7888274.478359: <stack trace>
+ => kmem_cache_alloc
+ => user_path_at_empty
+ => vfs_fstatat
+ => SYSC_newstat
+ => SyS_newstat
+ => entry_SYSCALL_64_fastpath
+[...]
+
+This is just an example so that you can see it's possible to dig further.
+Please don't copy-n-paste that kprobe command, as it's unlikely to work (the
+"+0(+96(%di))" text is specific to a kernel version and architecture).
+
+So these allocations are coming from user_path_at_empty(), which calls other
+functions (not seen in the stack: I suspect it's a tail-call compiler
+optimization).
+
+
+USAGE:
+
+# ./slabratetop -h
+usage: slabratetop [-h] [-C] [-r MAXROWS] [interval] [count]
+
+Kernel SLAB/SLUB memory cache allocation rate top
+
+positional arguments:
+  interval              output interval, in seconds
+  count                 number of outputs
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -C, --noclear         don't clear the screen
+  -r MAXROWS, --maxrows MAXROWS
+                        maximum rows to print, default 20
+
+examples:
+    ./slabratetop            # kmem_cache_alloc() top, 1 second refresh
+    ./slabratetop -C         # don't clear the screen
+    ./slabratetop 5          # 5 second summaries
+    ./slabratetop 5 10       # 5 second summaries, 10 times only
diff --git a/tools/softirqs.py b/tools/softirqs.py
new file mode 100755
index 0000000..1e2daf5
--- /dev/null
+++ b/tools/softirqs.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# softirqs  Summarize soft IRQ (interrupt) event time.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: softirqs [-h] [-T] [-N] [-d] [interval] [count]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Oct-2015   Brendan Gregg     Created this.
+# 03-Apr-2017   Sasha Goldshtein  Migrated to kernel tracepoints.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+    ./softirqs            # sum soft irq event time
+    ./softirqs -d         # show soft irq event time as histograms
+    ./softirqs 1 10       # print 1 second summaries, 10 times
+    ./softirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize soft irq event time as histograms.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-N", "--nanoseconds", action="store_true",
+    help="output in nanoseconds")
+parser.add_argument("-d", "--dist", action="store_true",
+    help="show distributions as histograms")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+countdown = int(args.count)
+if args.nanoseconds:
+    factor = 1
+    label = "nsecs"
+else:
+    factor = 1000
+    label = "usecs"
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+
+typedef struct irq_key {
+    u32 vec;
+    u64 slot;
+} irq_key_t;
+
+typedef struct account_val {
+    u64 ts;
+    u32 vec;
+} account_val_t;
+
+BPF_HASH(start, u32, account_val_t);
+BPF_HASH(iptr, u32);
+BPF_HISTOGRAM(dist, irq_key_t);
+
+TRACEPOINT_PROBE(irq, softirq_entry)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    account_val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.vec = args->vec;
+    start.update(&pid, &val);
+    return 0;
+}
+
+TRACEPOINT_PROBE(irq, softirq_exit)
+{
+    u64 delta;
+    u32 vec;
+    u32 pid = bpf_get_current_pid_tgid();
+    account_val_t *valp;
+    irq_key_t key = {0};
+
+    // fetch timestamp and calculate delta
+    valp = start.lookup(&pid);
+    if (valp == 0) {
+        return 0;   // missed start
+    }
+    delta = bpf_ktime_get_ns() - valp->ts;
+    vec = valp->vec;
+
+    // store as sum or histogram
+    STORE
+
+    start.delete(&pid);
+    return 0;
+}
+"""
+
+# code substitutions
+if args.dist:
+    bpf_text = bpf_text.replace('STORE',
+        'key.vec = vec; key.slot = bpf_log2l(delta / %d); ' % factor +
+        'dist.increment(key);')
+else:
+    bpf_text = bpf_text.replace('STORE',
+        'key.vec = valp->vec; ' +
+        'dist.increment(key, delta);')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+def vec_to_name(vec):
+    # copied from softirq_to_name() in kernel/softirq.c
+    # may need updates if new softirq handlers are added
+    return ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
+            "tasklet", "sched", "hrtimer", "rcu"][vec]
+
+print("Tracing soft irq event time... Hit Ctrl-C to end.")
+
+# output
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+    if args.dist:
+        dist.print_log2_hist(label, "softirq", section_print_fn=vec_to_name)
+    else:
+        print("%-16s %11s" % ("SOFTIRQ", "TOTAL_" + label))
+        for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
+            print("%-16s %11d" % (vec_to_name(k.vec), v.value / factor))
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/softirqs_example.txt b/tools/softirqs_example.txt
new file mode 100644
index 0000000..ef3174a
--- /dev/null
+++ b/tools/softirqs_example.txt
@@ -0,0 +1,203 @@
+Demonstrations of softirqs, the Linux eBPF/bcc version.
+
+
+This program traces soft interrupts (irqs), and stores timing statistics
+in-kernel for efficiency. For example:
+
+# ./softirqs
+Tracing soft irq event time... Hit Ctrl-C to end.
+^C
+SOFTIRQ                    TOTAL_usecs
+rcu_process_callbacks              974
+run_rebalance_domains             1809
+run_timer_softirq                 2615
+net_tx_action                    14605
+tasklet_action                   38692
+net_rx_action                    88188
+
+The SOFTIRQ column prints the interrupt action function name. While tracing,
+the net_rx_action() soft interrupt ran for 20199 microseconds (20 milliseconds)
+in total.
+
+This tool originally worked by dynamically tracing individual softirq functions, and
+need to be adjusted to match kernel/module changes. Future versions should
+use the softirq tracepoints instead.
+
+
+An interval can be provided, and also optionally a count. Eg, printing output
+every 1 second, and including timestamps (-T):
+
+# ./softirqs -T 1 3
+Tracing soft irq event time... Hit Ctrl-C to end.
+
+22:29:16
+SOFTIRQ                    TOTAL_usecs
+rcu_process_callbacks              456
+run_rebalance_domains             1005
+run_timer_softirq                 1196
+net_tx_action                     2796
+tasklet_action                    5534
+net_rx_action                    15075
+
+22:29:17
+SOFTIRQ                    TOTAL_usecs
+rcu_process_callbacks              456
+run_rebalance_domains              839
+run_timer_softirq                 1142
+net_tx_action                     1912
+tasklet_action                    4428
+net_rx_action                    14652
+
+22:29:18
+SOFTIRQ                    TOTAL_usecs
+rcu_process_callbacks              502
+run_rebalance_domains              840
+run_timer_softirq                 1192
+net_tx_action                     2341
+tasklet_action                    5496
+net_rx_action                    15656
+
+This can be useful for quantifying where CPU cycles are spent among the soft
+interrupts (summarized as the %softirq column from mpstat(1), and shown as
+event counts in /proc/softirqs). The output above shows that most time was spent
+processing net_rx_action(), which was around 15 milliseconds per second (total
+time across all CPUs).
+
+
+The distribution of interrupt run time can be printed as a histogram with the -d
+option. Eg:
+
+# ./softirqs -d
+Tracing soft irq event time... Hit Ctrl-C to end.
+^C
+
+softirq = net_tx_action
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 440      |                                        |
+       512 -> 1023       : 27613    |****************************************|
+      1024 -> 2047       : 5728     |********                                |
+      2048 -> 4095       : 439      |                                        |
+      4096 -> 8191       : 53       |                                        |
+      8192 -> 16383      : 2        |                                        |
+
+softirq = net_rx_action
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 6        |                                        |
+      1024 -> 2047       : 35       |                                        |
+      2048 -> 4095       : 3562     |****************                        |
+      4096 -> 8191       : 7023     |********************************        |
+      8192 -> 16383      : 8770     |****************************************|
+     16384 -> 32767      : 1780     |********                                |
+     32768 -> 65535      : 216      |                                        |
+     65536 -> 131071     : 4        |                                        |
+
+softirq = tasklet_action
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 95       |                                        |
+       512 -> 1023       : 12521    |****************************************|
+      1024 -> 2047       : 1068     |***                                     |
+      2048 -> 4095       : 1077     |***                                     |
+      4096 -> 8191       : 12349    |*************************************** |
+      8192 -> 16383      : 464      |*                                       |
+     16384 -> 32767      : 1        |                                        |
+
+softirq = rcu_process_callbacks
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 708      |****************************************|
+      1024 -> 2047       : 495      |***************************             |
+      2048 -> 4095       : 98       |*****                                   |
+      4096 -> 8191       : 62       |***                                     |
+      8192 -> 16383      : 4        |                                        |
+
+softirq = run_timer_softirq
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 2        |                                        |
+       512 -> 1023       : 366      |*********                               |
+      1024 -> 2047       : 1525     |****************************************|
+      2048 -> 4095       : 629      |****************                        |
+      4096 -> 8191       : 87       |**                                      |
+      8192 -> 16383      : 1        |                                        |
+
+softirq = run_rebalance_domains
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 3        |                                        |
+       512 -> 1023       : 18       |*                                       |
+      1024 -> 2047       : 80       |********                                |
+      2048 -> 4095       : 374      |****************************************|
+      4096 -> 8191       : 257      |***************************             |
+      8192 -> 16383      : 50       |*****                                   |
+     16384 -> 32767      : 24       |**                                      |
+
+
+USAGE message:
+
+# ./softirqs -h
+usage: softirqs [-h] [-T] [-N] [-d] [interval] [count]
+
+Summarize soft irq event time as histograms
+
+positional arguments:
+  interval           output interval, in seconds
+  count              number of outputs
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -T, --timestamp    include timestamp on output
+  -N, --nanoseconds  output in nanoseconds
+  -d, --dist         show distributions as histograms
+
+examples:
+    ./softirqs            # sum soft irq event time
+    ./softirqs -d         # show soft irq event time as histograms
+    ./softirqs 1 10       # print 1 second summaries, 10 times
+    ./softirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
diff --git a/tools/solisten.py b/tools/solisten.py
new file mode 100755
index 0000000..6a35f82
--- /dev/null
+++ b/tools/solisten.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+#
+# solisten      Trace TCP listen events
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: solisten.py [-h] [-p PID] [--show-netns]
+#
+# This is provided as a basic example of TCP connection & socket tracing.
+# It could be useful in scenarios where load balancers needs to be updated
+# dynamically as application is fully initialized.
+#
+# All IPv4 listen attempts are traced, even if they ultimately fail or the
+# the listening program is not willing to accept().
+#
+# Copyright (c) 2016 Jean-Tiare Le Bigot.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 04-Mar-2016	Jean-Tiare Le Bigot	Created this.
+
+import os
+from socket import inet_ntop, AF_INET, AF_INET6, SOCK_STREAM, SOCK_DGRAM
+from struct import pack
+import argparse
+from bcc import BPF
+import ctypes as ct
+
+# Arguments
+examples = """Examples:
+    ./solisten.py              # Stream socket listen
+    ./solisten.py -p 1234      # Stream socket listen for specified PID only
+    ./solisten.py --netns 4242 # " for the specified network namespace ID only
+    ./solisten.py --show-netns # Show network ns ID (useful for containers)
+"""
+
+parser = argparse.ArgumentParser(
+    description="Stream sockets listen",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("--show-netns", action="store_true",
+    help="show network namespace")
+parser.add_argument("-p", "--pid", default=0, type=int,
+    help="trace this PID only")
+parser.add_argument("-n", "--netns", default=0, type=int,
+    help="trace this Network Namespace only")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+
+
+# BPF Program
+bpf_text = """
+#include <net/net_namespace.h>
+#include <bcc/proto.h>
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wenum-conversion"
+#include <net/inet_sock.h>
+#pragma clang diagnostic pop
+
+// Common structure for UDP/TCP IPv4/IPv6
+struct listen_evt_t {
+    u64 ts_us;
+    u64 pid_tgid;
+    u64 backlog;
+    u64 netns;
+    u64 proto;    // family << 16 | type
+    u64 lport;    // use only 16 bits
+    u64 laddr[2]; // IPv4: store in laddr[0]
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(listen_evt);
+
+// Send an event for each IPv4 listen with PID, bound address and port
+int kprobe__inet_listen(struct pt_regs *ctx, struct socket *sock, int backlog)
+{
+        // cast types. Intermediate cast not needed, kept for readability
+        struct sock *sk = sock->sk;
+        struct inet_sock *inet = (struct inet_sock *)sk;
+
+        // Built event for userland
+        struct listen_evt_t evt = {
+            .ts_us = bpf_ktime_get_ns() / 1000,
+            .backlog = backlog,
+        };
+
+        // Get process comm. Needs LLVM >= 3.7.1
+        // see https://github.com/iovisor/bcc/issues/393
+        bpf_get_current_comm(evt.task, TASK_COMM_LEN);
+
+        // Get socket IP family
+        u16 family = sk->__sk_common.skc_family;
+        evt.proto = family << 16 | SOCK_STREAM;
+
+        // Get PID
+        evt.pid_tgid = bpf_get_current_pid_tgid();
+
+        ##FILTER_PID##
+
+        // Get port
+        evt.lport = inet->inet_sport;
+        evt.lport = ntohs(evt.lport);
+
+        // Get network namespace id, if kernel supports it
+#ifdef CONFIG_NET_NS
+        evt.netns = sk->__sk_common.skc_net.net->ns.inum;
+#else
+        evt.netns = 0;
+#endif
+
+        ##FILTER_NETNS##
+
+        // Get IP
+        if (family == AF_INET) {
+            evt.laddr[0] = inet->inet_rcv_saddr;
+        } else if (family == AF_INET6) {
+            bpf_probe_read(evt.laddr, sizeof(evt.laddr),
+                           sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+        }
+
+        // Send event to userland
+        listen_evt.perf_submit(ctx, &evt, sizeof(evt));
+
+        return 0;
+};
+"""
+
+# event data
+TASK_COMM_LEN = 16      # linux/sched.h
+class ListenEvt(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid_tgid", ct.c_ulonglong),
+        ("backlog", ct.c_ulonglong),
+        ("netns", ct.c_ulonglong),
+        ("proto", ct.c_ulonglong),
+        ("lport", ct.c_ulonglong),
+        ("laddr", ct.c_ulonglong * 2),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+    # TODO: properties to unpack protocol / ip / pid / tgid ...
+
+# Format output
+def event_printer(show_netns):
+    def print_event(cpu, data, size):
+        # Decode event
+        event = ct.cast(data, ct.POINTER(ListenEvt)).contents
+
+        pid = event.pid_tgid & 0xffffffff
+        proto_family = event.proto & 0xff
+        proto_type = event.proto >> 16 & 0xff
+
+        if proto_family == SOCK_STREAM:
+            protocol = "TCP"
+        elif proto_family == SOCK_DGRAM:
+            protocol = "UDP"
+        else:
+            protocol = "UNK"
+
+        address = ""
+        if proto_type == AF_INET:
+            protocol += "v4"
+            address = inet_ntop(AF_INET, pack("I", event.laddr[0]))
+        elif proto_type == AF_INET6:
+            address = inet_ntop(AF_INET6, event.laddr)
+            protocol += "v6"
+
+        # Display
+        if show_netns:
+            print("%-6d %-12.12s %-12s %-6s %-8s %-5s %-39s" % (
+                pid, event.task, event.netns, protocol, event.backlog,
+                event.lport, address,
+            ))
+        else:
+            print("%-6d %-12.12s %-6s %-8s %-5s %-39s" % (
+                pid, event.task, protocol, event.backlog,
+                event.lport, address,
+            ))
+
+    return print_event
+
+if __name__ == "__main__":
+    # Parse arguments
+    args = parser.parse_args()
+
+    pid_filter = ""
+    netns_filter = ""
+
+    if args.pid:
+        pid_filter = "if (evt.pid_tgid != %d) return 0;" % args.pid
+    if args.netns:
+        netns_filter = "if (evt.netns != %d) return 0;" % args.netns
+
+    bpf_text = bpf_text.replace("##FILTER_PID##", pid_filter)
+    bpf_text = bpf_text.replace("##FILTER_NETNS##", netns_filter)
+
+    if args.ebpf:
+        print(bpf_text)
+        exit()
+
+    # Initialize BPF
+    b = BPF(text=bpf_text)
+    b["listen_evt"].open_perf_buffer(event_printer(args.show_netns))
+
+    # Print headers
+    if args.show_netns:
+        print("%-6s %-12s %-12s %-6s %-8s %-5s %-39s" %
+              ("PID", "COMM", "NETNS", "PROTO", "BACKLOG", "PORT", "ADDR"))
+    else:
+        print("%-6s %-12s %-6s %-8s %-5s %-39s" %
+              ("PID", "COMM", "PROTO", "BACKLOG", "PORT", "ADDR"))
+
+    # Read events
+    while 1:
+        b.perf_buffer_poll()
diff --git a/tools/solisten_example.txt b/tools/solisten_example.txt
new file mode 100644
index 0000000..f7ace8c
--- /dev/null
+++ b/tools/solisten_example.txt
@@ -0,0 +1,40 @@
+Demonstrations of solisten.py, the Linux eBPF/bcc version.
+
+
+This tool traces the kernel function called when a program wants to listen
+for TCP connections. It will see neither UDP nor UNIX domain sockets.
+
+It can be used to dynamically update a load balancer as a program is actually
+ready to accept connections, hence avoiding the "downtime" while it is initializing.
+
+# ./solisten.py --show-netns
+PID    COMM         NETNS        PROTO  BACKLOG  ADDR                                    PORT
+3643   nc           4026531957   TCPv4  1        0.0.0.0                                 4242
+3659   nc           4026531957   TCPv6  1        2001:f0d0:1002:51::4                    4242
+4221   redis-server 4026532165   TCPv6  128      ::                                      6379
+4221   redis-server 4026532165   TCPv4  128      0.0.0.0                                 6379
+6067   nginx        4026531957   TCPv4  128      0.0.0.0                                 80
+6067   nginx        4026531957   TCPv6  128      ::                                      80
+6069   nginx        4026531957   TCPv4  128      0.0.0.0                                 80
+6069   nginx        4026531957   TCPv6  128      ::                                      80
+6069   nginx        4026531957   TCPv4  128      0.0.0.0                                 80
+6069   nginx        4026531957   TCPv6  128      ::                                      80
+
+This output show the listen event from 3 programs. Netcat was started twice as
+shown by the 2 different PIDs. The first time on the wildcard IPv4, the second
+time on an IPv6. Netcat being a "one shot" program. It can accept a single
+connection, hence the backlog of "1".
+
+The next program is redis-server. As the netns column shows, it is in a
+different network namespace than netcat and nginx. In this specific case
+it was launched in a docker container. It listens both on IPv4 and IPv6
+with up to 128 pending connections.
+
+Determining the actual container is out of the scope of this tool. It could
+be derived by scraping /proc/<PID>/cgroup. Note that this is racy.
+
+The overhead of this tool is negligible as it traces listen() calls which are
+invoked in the initialization path of a program. The operation part will remain
+unaffected. In particular, accept() calls will not be affected, and neither
+will individual read() and write() calls.
+
diff --git a/tools/sslsniff.py b/tools/sslsniff.py
new file mode 100755
index 0000000..0c9f976
--- /dev/null
+++ b/tools/sslsniff.py
@@ -0,0 +1,231 @@
+#!/usr/bin/python
+#
+# sslsniff  Captures data on read/recv or write/send functions of OpenSSL,
+#           GnuTLS and NSS
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: sslsniff.py [-h] [-p PID] [-c COMM] [-o] [-g] [-d]
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Aug-2016    Adrian Lopez   Created this.
+# 13-Aug-2016    Mark Drayton   Fix SSL_Read
+# 17-Aug-2016    Adrian Lopez   Capture GnuTLS and add options
+#
+
+from __future__ import print_function
+import ctypes as ct
+from bcc import BPF
+import argparse
+
+# arguments
+examples = """examples:
+    ./sslsniff              # sniff OpenSSL and GnuTLS functions
+    ./sslsniff -p 181       # sniff PID 181 only
+    ./sslsniff -c curl      # sniff curl command only
+    ./sslsniff --no-openssl # don't show OpenSSL calls
+    ./sslsniff --no-gnutls  # don't show GnuTLS calls
+    ./sslsniff --no-nss     # don't show NSS calls
+"""
+parser = argparse.ArgumentParser(
+    description="Sniff SSL data",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-p", "--pid", type=int, help="sniff this PID only.")
+parser.add_argument("-c", "--comm",
+                    help="sniff only commands matching string.")
+parser.add_argument("-o", "--no-openssl", action="store_false", dest="openssl",
+                    help="do not show OpenSSL calls.")
+parser.add_argument("-g", "--no-gnutls", action="store_false", dest="gnutls",
+                    help="do not show GnuTLS calls.")
+parser.add_argument("-n", "--no-nss", action="store_false", dest="nss",
+                    help="do not show NSS calls.")
+parser.add_argument('-d', '--debug', dest='debug', action='count', default=0,
+                    help='debug mode.')
+parser.add_argument("--ebpf", action="store_true",
+                    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+
+prog = """
+#include <linux/ptrace.h>
+#include <linux/sched.h>        /* For TASK_COMM_LEN */
+
+struct probe_SSL_data_t {
+        u64 timestamp_ns;
+        u32 pid;
+        char comm[TASK_COMM_LEN];
+        char v0[464];
+        u32 len;
+};
+
+BPF_PERF_OUTPUT(perf_SSL_write);
+
+int probe_SSL_write(struct pt_regs *ctx, void *ssl, void *buf, int num) {
+        u32 pid = bpf_get_current_pid_tgid();
+        FILTER
+
+        struct probe_SSL_data_t __data = {0};
+        __data.timestamp_ns = bpf_ktime_get_ns();
+        __data.pid = pid;
+        __data.len = num;
+
+        bpf_get_current_comm(&__data.comm, sizeof(__data.comm));
+
+        if ( buf != 0) {
+                bpf_probe_read(&__data.v0, sizeof(__data.v0), buf);
+        }
+
+        perf_SSL_write.perf_submit(ctx, &__data, sizeof(__data));
+        return 0;
+}
+
+BPF_PERF_OUTPUT(perf_SSL_read);
+
+BPF_HASH(bufs, u32, u64);
+
+int probe_SSL_read_enter(struct pt_regs *ctx, void *ssl, void *buf, int num) {
+        u32 pid = bpf_get_current_pid_tgid();
+        FILTER
+
+        bufs.update(&pid, (u64*)&buf);
+        return 0;
+}
+
+int probe_SSL_read_exit(struct pt_regs *ctx, void *ssl, void *buf, int num) {
+        u32 pid = bpf_get_current_pid_tgid();
+        FILTER
+
+        u64 *bufp = bufs.lookup(&pid);
+        if (bufp == 0) {
+                return 0;
+        }
+
+        struct probe_SSL_data_t __data = {0};
+        __data.timestamp_ns = bpf_ktime_get_ns();
+        __data.pid = pid;
+        __data.len = PT_REGS_RC(ctx);
+
+        bpf_get_current_comm(&__data.comm, sizeof(__data.comm));
+
+        if (bufp != 0) {
+                bpf_probe_read(&__data.v0, sizeof(__data.v0), (char *)*bufp);
+        }
+
+        bufs.delete(&pid);
+
+        perf_SSL_read.perf_submit(ctx, &__data, sizeof(__data));
+        return 0;
+}
+"""
+
+if args.pid:
+    prog = prog.replace('FILTER', 'if (pid != %d) { return 0; }' % args.pid)
+else:
+    prog = prog.replace('FILTER', '')
+
+if args.debug or args.ebpf:
+    print(prog)
+    if args.ebpf:
+        exit()
+
+
+b = BPF(text=prog)
+
+# It looks like SSL_read's arguments aren't available in a return probe so you
+# need to stash the buffer address in a map on the function entry and read it
+# on its exit (Mark Drayton)
+#
+if args.openssl:
+    b.attach_uprobe(name="ssl", sym="SSL_write", fn_name="probe_SSL_write",
+                    pid=args.pid or -1)
+    b.attach_uprobe(name="ssl", sym="SSL_read", fn_name="probe_SSL_read_enter",
+                    pid=args.pid or -1)
+    b.attach_uretprobe(name="ssl", sym="SSL_read",
+                       fn_name="probe_SSL_read_exit", pid=args.pid or -1)
+
+if args.gnutls:
+    b.attach_uprobe(name="gnutls", sym="gnutls_record_send",
+                    fn_name="probe_SSL_write", pid=args.pid or -1)
+    b.attach_uprobe(name="gnutls", sym="gnutls_record_recv",
+                    fn_name="probe_SSL_read_enter", pid=args.pid or -1)
+    b.attach_uretprobe(name="gnutls", sym="gnutls_record_recv",
+                       fn_name="probe_SSL_read_exit", pid=args.pid or -1)
+
+if args.nss:
+    b.attach_uprobe(name="nspr4", sym="PR_Write", fn_name="probe_SSL_write",
+                    pid=args.pid or -1)
+    b.attach_uprobe(name="nspr4", sym="PR_Send", fn_name="probe_SSL_write",
+                    pid=args.pid or -1)
+    b.attach_uprobe(name="nspr4", sym="PR_Read", fn_name="probe_SSL_read_enter",
+                    pid=args.pid or -1)
+    b.attach_uretprobe(name="nspr4", sym="PR_Read",
+                       fn_name="probe_SSL_read_exit", pid=args.pid or -1)
+    b.attach_uprobe(name="nspr4", sym="PR_Recv", fn_name="probe_SSL_read_enter",
+                    pid=args.pid or -1)
+    b.attach_uretprobe(name="nspr4", sym="PR_Recv",
+                       fn_name="probe_SSL_read_exit", pid=args.pid or -1)
+
+# define output data structure in Python
+TASK_COMM_LEN = 16  # linux/sched.h
+MAX_BUF_SIZE = 464  # Limited by the BPF stack
+
+
+# Max size of the whole struct: 512 bytes
+class Data(ct.Structure):
+    _fields_ = [
+            ("timestamp_ns", ct.c_ulonglong),
+            ("pid", ct.c_uint),
+            ("comm", ct.c_char * TASK_COMM_LEN),
+            ("v0", ct.c_char * MAX_BUF_SIZE),
+            ("len", ct.c_uint)
+    ]
+
+
+# header
+print("%-12s %-18s %-16s %-6s %-6s" % ("FUNC", "TIME(s)", "COMM", "PID",
+                                       "LEN"))
+
+# process event
+start = 0
+
+
+def print_event_write(cpu, data, size):
+    print_event(cpu, data, size, "WRITE/SEND")
+
+
+def print_event_read(cpu, data, size):
+    print_event(cpu, data, size, "READ/RECV")
+
+
+def print_event(cpu, data, size, rw):
+    global start
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    # Filter events by command
+    if args.comm:
+        if not args.comm == event.comm:
+            return
+
+    if start == 0:
+        start = event.timestamp_ns
+    time_s = (float(event.timestamp_ns - start)) / 1000000000
+
+    s_mark = "-" * 5 + " DATA " + "-" * 5
+
+    e_mark = "-" * 5 + " END DATA " + "-" * 5
+
+    truncated_bytes = event.len - MAX_BUF_SIZE
+    if truncated_bytes > 0:
+        e_mark = "-" * 5 + " END DATA (TRUNCATED, " + str(truncated_bytes) + \
+                " bytes lost) " + "-" * 5
+
+    fmt = "%-12s %-18.9f %-16s %-6d %-6d\n%s\n%s\n%s\n\n"
+    print(fmt % (rw, time_s, event.comm.decode('utf-8', 'replace'),
+                 event.pid, event.len, s_mark,
+                 event.v0.decode('utf-8', 'replace'), e_mark))
+
+b["perf_SSL_write"].open_perf_buffer(print_event_write)
+b["perf_SSL_read"].open_perf_buffer(print_event_read)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/sslsniff_example.txt b/tools/sslsniff_example.txt
new file mode 100644
index 0000000..8c51722
--- /dev/null
+++ b/tools/sslsniff_example.txt
@@ -0,0 +1,87 @@
+Demonstrations of sslsniff.py
+
+
+This tool traces the write/send and read/recv functions of OpenSSL,
+GnuTLS and NSS.  Data passed to these functions is printed as plain
+text.  Useful, for example, to sniff HTTP before it is encrypted with SSL.
+
+
+Output of tool executing in other shell "curl https://example.com"
+
+% sudo python sslsniff.py
+FUNC         TIME(s)            COMM             PID    LEN   
+WRITE/SEND   0.000000000        curl             12915  75    
+----- DATA -----
+GET / HTTP/1.1
+Host: example.com
+User-Agent: curl/7.50.1
+Accept: */*
+
+
+----- END DATA -----
+
+READ/RECV    0.127144585        curl             12915  333   
+----- DATA -----
+HTTP/1.1 200 OK
+Cache-Control: max-age=604800
+Content-Type: text/html
+Date: Tue, 16 Aug 2016 15:42:12 GMT
+Etag: "359670651+gzip+ident"
+Expires: Tue, 23 Aug 2016 15:42:12 GMT
+Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
+Server: ECS (iad/18CB)
+Vary: Accept-Encoding
+X-Cache: HIT
+x-ec-custom-error: 1
+Content-Length: 1270
+
+
+----- END DATA -----
+
+READ/RECV    0.129967972        curl             12915  1270  
+----- DATA -----
+<!doctype html>
+<html>
+<head>
+    <title>Example Domain</title>
+
+    <meta charset="utf-8" />
+    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <style type="text/css">
+    body {
+        background-color: #f0f0f2;
+        margin: 0;
+        padding: 0;
+        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+        
+    }
+    div {
+        w
+----- END DATA (TRUNCATED, 798 bytes lost) -----
+
+
+
+
+USAGE message:
+
+usage: sslsniff.py [-h] [-p PID] [-c COMM] [-o] [-g] [-n] [-d]
+
+Sniff SSL data
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     sniff this PID only.
+  -c COMM, --comm COMM  sniff only commands matching string.
+  -o, --no-openssl      do not show OpenSSL calls.
+  -g, --no-gnutls       do not show GnuTLS calls.
+  -n, --no-nss          do not show NSS calls.
+  -d, --debug           debug mode.
+
+examples:
+    ./sslsniff              # sniff OpenSSL and GnuTLS functions
+    ./sslsniff -p 181       # sniff PID 181 only
+    ./sslsniff -c curl      # sniff curl command only
+    ./sslsniff --no-openssl # don't show OpenSSL calls
+    ./sslsniff --no-gnutls  # don't show GnuTLS calls
+    ./sslsniff --no-nss     # don't show NSS calls
diff --git a/tools/stackcount.py b/tools/stackcount.py
new file mode 100755
index 0000000..5554014
--- /dev/null
+++ b/tools/stackcount.py
@@ -0,0 +1,373 @@
+#!/usr/bin/env python
+#
+# stackcount    Count events and their stack traces.
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: stackcount.py [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s]
+#                      [-P] [-K] [-U] [-v] [-d] [-f] [--debug]
+#
+# The pattern is a string with optional '*' wildcards, similar to file
+# globbing. If you'd prefer to use regular expressions, use the -r option.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Jan-2016	Brendan Gregg	    Created this.
+# 09-Jul-2016   Sasha Goldshtein    Generalized for uprobes and tracepoints.
+
+from __future__ import print_function
+from bcc import BPF, USDT
+from time import sleep, strftime
+import argparse
+import re
+import signal
+import sys
+import traceback
+
+debug = False
+
+class Probe(object):
+    def __init__(self, pattern, kernel_stack, user_stack, use_regex=False,
+                 pid=None, per_pid=False):
+        """Init a new probe.
+
+        Init the probe from the pattern provided by the user. The supported
+        patterns mimic the 'trace' and 'argdist' tools, but are simpler because
+        we don't have to distinguish between probes and retprobes.
+
+            func            -- probe a kernel function
+            lib:func        -- probe a user-space function in the library 'lib'
+            p::func         -- same thing as 'func'
+            p:lib:func      -- same thing as 'lib:func'
+            t:cat:event     -- probe a kernel tracepoint
+            u:lib:probe     -- probe a USDT tracepoint
+        """
+        self.kernel_stack = kernel_stack
+        self.user_stack = user_stack
+        parts = pattern.split(':')
+        if len(parts) == 1:
+            parts = ["p", "", parts[0]]
+        elif len(parts) == 2:
+            parts = ["p", parts[0], parts[1]]
+        elif len(parts) == 3:
+            if parts[0] == "t":
+                parts = ["t", "", "%s:%s" % tuple(parts[1:])]
+            if parts[0] not in ["p", "t", "u"]:
+                raise Exception("Type must be 'p', 't', or 'u', but got %s" %
+                                parts[0])
+        else:
+            raise Exception("Too many ':'-separated components in pattern %s" %
+                            pattern)
+
+        (self.type, self.library, self.pattern) = parts
+        if not use_regex:
+            self.pattern = self.pattern.replace('*', '.*')
+            self.pattern = '^' + self.pattern + '$'
+
+        if (self.type == "p" and self.library) or self.type == "u":
+            libpath = BPF.find_library(self.library)
+            if libpath is None:
+                # This might be an executable (e.g. 'bash')
+                libpath = BPF.find_exe(self.library)
+            if libpath is None or len(libpath) == 0:
+                raise Exception("unable to find library %s" % self.library)
+            self.library = libpath
+
+        self.pid = pid
+        self.per_pid = per_pid
+        self.matched = 0
+
+    def is_kernel_probe(self):
+        return self.type == "t" or (self.type == "p" and self.library == "")
+
+    def attach(self):
+        if self.type == "p":
+            if self.library:
+                self.bpf.attach_uprobe(name=self.library,
+                                       sym_re=self.pattern,
+                                       fn_name="trace_count",
+                                       pid=self.pid or -1)
+                self.matched = self.bpf.num_open_uprobes()
+            else:
+                self.bpf.attach_kprobe(event_re=self.pattern,
+                                       fn_name="trace_count")
+                self.matched = self.bpf.num_open_kprobes()
+        elif self.type == "t":
+            self.bpf.attach_tracepoint(tp_re=self.pattern,
+                                       fn_name="trace_count")
+            self.matched = self.bpf.num_open_tracepoints()
+        elif self.type == "u":
+            pass    # Nothing to do -- attach already happened in `load`
+
+        if self.matched == 0:
+            raise Exception("No functions matched by pattern %s" %
+                            self.pattern)
+
+    def load(self):
+        ctx_name = "ctx"
+        stack_trace = ""
+        if self.user_stack:
+                stack_trace += """
+                    key.user_stack_id = stack_traces.get_stackid(
+                      %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK
+                    );""" % (ctx_name)
+        else:
+                stack_trace += "key.user_stack_id = -1;"
+        if self.kernel_stack:
+                stack_trace += """
+                    key.kernel_stack_id = stack_traces.get_stackid(
+                      %s, BPF_F_REUSE_STACKID
+                    );""" % (ctx_name)
+        else:
+                stack_trace += "key.kernel_stack_id = -1;"
+
+        trace_count_text = """
+int trace_count(void *ctx) {
+    FILTER
+    struct key_t key = {};
+    key.tgid = GET_TGID;
+    STORE_COMM
+    %s
+    counts.increment(key);
+    return 0;
+}
+        """
+        trace_count_text = trace_count_text % (stack_trace)
+
+        bpf_text = """#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct key_t {
+    // no pid (thread ID) so that we do not needlessly split this key
+    u32 tgid;
+    int kernel_stack_id;
+    int user_stack_id;
+    char name[TASK_COMM_LEN];
+};
+
+BPF_HASH(counts, struct key_t);
+BPF_STACK_TRACE(stack_traces, 1024);
+        """
+
+        # We really mean the tgid from the kernel's perspective, which is in
+        # the top 32 bits of bpf_get_current_pid_tgid().
+        if self.is_kernel_probe() and self.pid:
+            trace_count_text = trace_count_text.replace('FILTER',
+                ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' +
+                'if (pid != %d) { return 0; }') % (self.pid))
+        else:
+            trace_count_text = trace_count_text.replace('FILTER', '')
+
+        # We need per-pid statistics when tracing a user-space process, because
+        # the meaning of the symbols depends on the pid. We also need them if
+        # per-pid statistics were requested with -P, or for user stacks.
+        if self.per_pid or not self.is_kernel_probe() or self.user_stack:
+            trace_count_text = trace_count_text.replace('GET_TGID',
+                                        'bpf_get_current_pid_tgid() >> 32')
+            trace_count_text = trace_count_text.replace('STORE_COMM',
+                        'bpf_get_current_comm(&key.name, sizeof(key.name));')
+        else:
+            # kernel stacks only. skip splitting on PID so these aggregate
+            # together, and don't store the process name.
+            trace_count_text = trace_count_text.replace(
+                                    'GET_TGID', '0xffffffff')
+            trace_count_text = trace_count_text.replace('STORE_COMM', '')
+
+        self.usdt = None
+        if self.type == "u":
+            self.usdt = USDT(path=self.library, pid=self.pid)
+            for probe in self.usdt.enumerate_probes():
+                if not self.pid and (probe.bin_path != self.library):
+                    continue
+                if re.match(self.pattern, probe.name):
+                    # This hack is required because the bpf_usdt_readarg
+                    # functions generated need different function names for
+                    # each attached probe. If we just stick to trace_count,
+                    # we'd get multiple bpf_usdt_readarg helpers with the same
+                    # name when enabling more than one USDT probe.
+                    new_func = "trace_count_%d" % self.matched
+                    bpf_text += trace_count_text.replace(
+                                            "trace_count", new_func)
+                    self.usdt.enable_probe(probe.name, new_func)
+                    self.matched += 1
+            if debug:
+                print(self.usdt.get_text())
+        else:
+            bpf_text += trace_count_text
+
+        if debug:
+            print(bpf_text)
+        self.bpf = BPF(text=bpf_text,
+                       usdt_contexts=[self.usdt] if self.usdt else [])
+
+class Tool(object):
+    def __init__(self):
+        examples = """examples:
+    ./stackcount submit_bio         # count kernel stack traces for submit_bio
+    ./stackcount -d ip_output       # include a user/kernel stack delimiter
+    ./stackcount -s ip_output       # show symbol offsets
+    ./stackcount -sv ip_output      # show offsets and raw addresses (verbose)
+    ./stackcount 'tcp_send*'        # count stacks for funcs matching tcp_send*
+    ./stackcount -r '^tcp_send.*'   # same as above, using regular expressions
+    ./stackcount -Ti 5 ip_output    # output every 5 seconds, with timestamps
+    ./stackcount -p 185 ip_output   # count ip_output stacks for PID 185 only
+    ./stackcount -p 185 c:malloc    # count stacks for malloc in PID 185
+    ./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint
+    ./stackcount -p 185 u:node:*    # count stacks for all USDT probes in node
+    ./stackcount -K t:sched:sched_switch   # kernel stacks only
+    ./stackcount -U t:sched:sched_switch   # user stacks only
+        """
+        parser = argparse.ArgumentParser(
+            description="Count events and their stack traces",
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            epilog=examples)
+        parser.add_argument("-p", "--pid", type=int,
+            help="trace this PID only")
+        parser.add_argument("-i", "--interval",
+            help="summary interval, seconds")
+        parser.add_argument("-D", "--duration",
+            help="total duration of trace, seconds")
+        parser.add_argument("-T", "--timestamp", action="store_true",
+            help="include timestamp on output")
+        parser.add_argument("-r", "--regexp", action="store_true",
+            help="use regular expressions. Default is \"*\" wildcards only.")
+        parser.add_argument("-s", "--offset", action="store_true",
+            help="show address offsets")
+        parser.add_argument("-P", "--perpid", action="store_true",
+            help="display stacks separately for each process")
+        parser.add_argument("-K", "--kernel-stacks-only",
+            action="store_true", help="kernel stack only", default=False)
+        parser.add_argument("-U", "--user-stacks-only",
+            action="store_true", help="user stack only", default=False)
+        parser.add_argument("-v", "--verbose", action="store_true",
+            help="show raw addresses")
+        parser.add_argument("-d", "--delimited", action="store_true",
+            help="insert delimiter between kernel/user stacks")
+        parser.add_argument("-f", "--folded", action="store_true",
+            help="output folded format")
+        parser.add_argument("--debug", action="store_true",
+            help="print BPF program before starting (for debugging purposes)")
+        parser.add_argument("pattern",
+            help="search expression for events")
+        self.args = parser.parse_args()
+        global debug
+        debug = self.args.debug
+
+        if self.args.duration and not self.args.interval:
+            self.args.interval = self.args.duration
+        if not self.args.interval:
+            self.args.interval = 99999999
+
+        if self.args.kernel_stacks_only and self.args.user_stacks_only:
+            print("ERROR: -K and -U are mutually exclusive. If you want " +
+                "both stacks, that is the default.")
+            exit()
+        if not self.args.kernel_stacks_only and not self.args.user_stacks_only:
+            self.kernel_stack = True
+            self.user_stack = True
+        else:
+            self.kernel_stack = self.args.kernel_stacks_only
+            self.user_stack = self.args.user_stacks_only
+
+        self.probe = Probe(self.args.pattern,
+                           self.kernel_stack, self.user_stack,
+                           self.args.regexp, self.args.pid, self.args.perpid)
+        self.need_delimiter = self.args.delimited and not (
+                    self.args.kernel_stacks_only or self.args.user_stacks_only)
+
+    def _print_kframe(self, addr):
+        print("  ", end="")
+        if self.args.verbose:
+            print("%-16x " % addr, end="")
+        if self.args.offset:
+            print("%s" % self.probe.bpf.ksym(addr, show_offset=True))
+        else:
+            print("%s" % self.probe.bpf.ksym(addr))
+
+    def _print_uframe(self, addr, pid):
+        print("  ", end="")
+        if self.args.verbose:
+            print("%-16x " % addr, end="")
+        if self.args.offset:
+            print("%s" % self.probe.bpf.sym(addr, pid, show_offset=True))
+        else:
+            print("%s" % self.probe.bpf.sym(addr, pid))
+
+    @staticmethod
+    def _signal_ignore(signal, frame):
+        print()
+
+    def _print_comm(self, comm, pid):
+        print("    %s [%d]" % (comm, pid))
+
+    def run(self):
+        self.probe.load()
+        self.probe.attach()
+        if not self.args.folded:
+            print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
+                  (self.probe.matched, self.args.pattern))
+        b = self.probe.bpf
+        exiting = 0 if self.args.interval else 1
+        seconds = 0
+        while True:
+            try:
+                sleep(int(self.args.interval))
+                seconds += int(self.args.interval)
+            except KeyboardInterrupt:
+                exiting = 1
+                # as cleanup can take many seconds, trap Ctrl-C:
+                signal.signal(signal.SIGINT, Tool._signal_ignore)
+            if self.args.duration and seconds >= int(self.args.duration):
+                exiting = 1
+
+            if not self.args.folded:
+                print()
+            if self.args.timestamp:
+                print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+            counts = self.probe.bpf["counts"]
+            stack_traces = self.probe.bpf["stack_traces"]
+            self.comm_cache = {}
+            for k, v in sorted(counts.items(),
+                               key=lambda counts: counts[1].value):
+                user_stack = [] if k.user_stack_id < 0 else \
+                    stack_traces.walk(k.user_stack_id)
+                kernel_stack = [] if k.kernel_stack_id < 0 else \
+                    stack_traces.walk(k.kernel_stack_id)
+
+                if self.args.folded:
+                    # print folded stack output
+                    user_stack = list(user_stack)
+                    kernel_stack = list(kernel_stack)
+                    line = [k.name.decode('utf-8', 'replace')] + \
+                        [b.sym(addr, k.tgid) for addr in
+                        reversed(user_stack)] + \
+                        (self.need_delimiter and ["-"] or []) + \
+                        [b.ksym(addr) for addr in reversed(kernel_stack)]
+                    print("%s %d" % (";".join(line), v.value))
+                else:
+                    # print multi-line stack output
+                    for addr in kernel_stack:
+                        self._print_kframe(addr)
+                    if self.need_delimiter:
+                        print("    --")
+                    for addr in user_stack:
+                        self._print_uframe(addr, k.tgid)
+                    if not self.args.pid and k.tgid != 0xffffffff:
+                        self._print_comm(k.name, k.tgid)
+                    print("    %d\n" % v.value)
+            counts.clear()
+
+            if exiting:
+                if not self.args.folded:
+                    print("Detaching...")
+                exit()
+
+if __name__ == "__main__":
+    try:
+        Tool().run()
+    except Exception:
+        if debug:
+            traceback.print_exc()
+        elif sys.exc_info()[0] is not SystemExit:
+            print(sys.exc_info()[1])
diff --git a/tools/stackcount_example.txt b/tools/stackcount_example.txt
new file mode 100644
index 0000000..92a77a8
--- /dev/null
+++ b/tools/stackcount_example.txt
@@ -0,0 +1,890 @@
+Demonstrations of stackcount, the Linux eBPF/bcc version.
+
+
+This program traces functions and frequency counts them with their entire
+stack trace, summarized in-kernel for efficiency. For example, counting
+stack traces that led to the submit_bio() kernel function, which creates
+block device I/O:
+
+# ./stackcount submit_bio
+Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
+^C
+  submit_bio
+  ext4_writepages
+  do_writepages
+  __filemap_fdatawrite_range
+  filemap_flush
+  ext4_alloc_da_blocks
+  ext4_release_file
+  __fput
+  ____fput
+  task_work_run
+  exit_to_usermode_loop
+  syscall_return_slowpath
+  entry_SYSCALL_64_fastpath
+  [unknown]
+  [unknown]
+    tar [15069]
+    5
+
+  submit_bio
+  ext4_bio_write_page
+  mpage_submit_page
+  mpage_map_and_submit_buffers
+  ext4_writepages
+  do_writepages
+  __filemap_fdatawrite_range
+  filemap_flush
+  ext4_alloc_da_blocks
+  ext4_release_file
+  __fput
+  ____fput
+  task_work_run
+  exit_to_usermode_loop
+  syscall_return_slowpath
+  entry_SYSCALL_64_fastpath
+  [unknown]
+  [unknown]
+    tar [15069]
+    15
+
+  submit_bio
+  ext4_readpages
+  __do_page_cache_readahead
+  ondemand_readahead
+  page_cache_async_readahead
+  generic_file_read_iter
+  __vfs_read
+  vfs_read
+  sys_read
+  entry_SYSCALL_64_fastpath
+  [unknown]
+    tar [15069]
+    113
+
+Detaching...
+
+The output shows unique stack traces, in order from leaf (on-CPU) to root,
+followed by their occurrence count. The last stack trace in the above output
+shows syscall handling, sys_read(), vfs_read(), and then "readahead" functions:
+looks like an application issued file read has triggered read ahead. The
+application can be seen after the stack trace, in this case, "tar [15069]"
+for the "tar" command, PID 15069.
+
+The order of printed stack traces is from least to most frequent. The most
+frequent in this case, the ext4_readpages() stack, was taken 113 times during
+tracing.
+
+The "[unknown]" frames are from user-level, since this simple workload is
+the tar command, which apparently has been compiled without frame pointers.
+It's a common compiler optimization, but it breaks frame pointer-based stack
+walkers. Similar broken stacks will be seen by other profilers and debuggers
+that use frame pointers. Hopefully your application preserves them so that
+the user-level stack trace is visible. So how does one get frame pointers, if
+your application doesn't have them to start with? For the current bcc (until
+it supports other stack walkers), you need to be running application binaries
+that preserve frame pointers, eg, using gcc's -fno-omit-frame-pointer. That's
+about all I'll say here: this is a big topic that is not bcc/BPF specific.
+
+It can be useful to trace the path to submit_bio to explain unusual rates of
+disk IOPS. These could have in-kernel origins (eg, background scrub).
+
+
+Now adding the -d option to delimit kernel and user stacks:
+
+# ./stackcount -d submit_bio
+Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
+^C
+  submit_bio
+  submit_bh
+  journal_submit_commit_record
+  jbd2_journal_commit_transaction
+  kjournald2
+  kthread
+  ret_from_fork
+    --
+    jbd2/xvda1-8 [405]
+    1
+
+  submit_bio
+  submit_bh
+  jbd2_journal_commit_transaction
+  kjournald2
+  kthread
+  ret_from_fork
+    --
+    jbd2/xvda1-8 [405]
+    2
+
+  submit_bio
+  ext4_writepages
+  do_writepages
+  __filemap_fdatawrite_range
+  filemap_flush
+  ext4_alloc_da_blocks
+  ext4_release_file
+  __fput
+  ____fput
+  task_work_run
+  exit_to_usermode_loop
+  syscall_return_slowpath
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+    tar [15187]
+    5
+
+  submit_bio
+  ext4_bio_write_page
+  mpage_submit_page
+  mpage_map_and_submit_buffers
+  ext4_writepages
+  do_writepages
+  __filemap_fdatawrite_range
+  filemap_flush
+  ext4_alloc_da_blocks
+  ext4_release_file
+  __fput
+  ____fput
+  task_work_run
+  exit_to_usermode_loop
+  syscall_return_slowpath
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+    tar [15187]
+    15
+
+  submit_bio
+  ext4_readpages
+  __do_page_cache_readahead
+  ondemand_readahead
+  page_cache_async_readahead
+  generic_file_read_iter
+  __vfs_read
+  vfs_read
+  sys_read
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+  [unknown]
+    tar [15187]
+    171
+
+Detaching...
+
+A "--" is printed between the kernel and user stacks.
+
+
+As a different example, here is the kernel function hrtimer_init_sleeper():
+
+# ./stackcount.py -d hrtimer_init_sleeper
+Tracing 1 functions for "hrtimer_init_sleeper"... Hit Ctrl-C to end.
+^C
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+    containerd [16020]
+    1
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  __pthread_cond_timedwait
+  Monitor::IWait(Thread*, long)
+  Monitor::wait(bool, long, bool)
+  CompileQueue::get()
+  CompileBroker::compiler_thread_loop()
+  JavaThread::thread_main_inner()
+  JavaThread::run()
+  java_start(Thread*)
+  start_thread
+    java [4996]
+    1
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+    containerd [16020]
+    1
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  __pthread_cond_timedwait
+  VMThread::loop()
+  VMThread::run()
+  java_start(Thread*)
+  start_thread
+    java [4996]
+    3
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+    dockerd [16008]
+    4
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+  [unknown]
+    dockerd [16008]
+    4
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  __pthread_cond_timedwait
+  Lio/netty/util/ThreadDeathWatcher$Watcher;::run
+  Interpreter
+  Interpreter
+  call_stub
+  JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)
+  JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*)
+  JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*)
+  thread_entry(JavaThread*, Thread*)
+  JavaThread::thread_main_inner()
+  JavaThread::run()
+  java_start(Thread*)
+  start_thread
+    java [4996]
+    4
+
+  hrtimer_init_sleeper
+  do_futex
+  SyS_futex
+  entry_SYSCALL_64_fastpath
+    --
+  __pthread_cond_timedwait
+  clock_gettime
+  [unknown]
+    java [4996]
+    79
+
+Detaching...
+
+I was just trying to find a more interesting example. This output includes
+some Java stacks where user-level has been walked correctly (even includes a
+JIT symbol translation). dockerd and containerd don't have frame pointers
+(grumble), but Java does (which is running with -XX:+PreserveFramePointer).
+
+
+Here's another kernel function, ip_output():
+
+# ./stackcount.py -d ip_output
+Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
+^C
+  ip_output
+  ip_queue_xmit
+  tcp_transmit_skb
+  tcp_write_xmit
+  __tcp_push_pending_frames
+  tcp_push
+  tcp_sendmsg
+  inet_sendmsg
+  sock_sendmsg
+  sock_write_iter
+  __vfs_write
+  vfs_write
+  SyS_write
+  entry_SYSCALL_64_fastpath
+    --
+  __write_nocancel
+  [unknown]
+    sshd [15015]
+    5
+
+  ip_output
+  ip_queue_xmit
+  tcp_transmit_skb
+  tcp_write_xmit
+  __tcp_push_pending_frames
+  tcp_push
+  tcp_sendmsg
+  inet_sendmsg
+  sock_sendmsg
+  sock_write_iter
+  __vfs_write
+  vfs_write
+  SyS_write
+  entry_SYSCALL_64_fastpath
+    --
+  __write_nocancel
+  [unknown]
+  [unknown]
+    sshd [8234]
+    5
+
+  ip_output
+  ip_queue_xmit
+  tcp_transmit_skb
+  tcp_write_xmit
+  __tcp_push_pending_frames
+  tcp_push
+  tcp_sendmsg
+  inet_sendmsg
+  sock_sendmsg
+  sock_write_iter
+  __vfs_write
+  vfs_write
+  SyS_write
+  entry_SYSCALL_64_fastpath
+    --
+  __write_nocancel
+    sshd [15015]
+    7
+
+Detaching...
+
+This time just sshd is triggering ip_output() calls.
+
+
+Watch what happens if I filter on kernel stacks only (-K) for ip_output():
+
+# ./stackcount.py -K ip_output
+Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
+^C
+  ip_output
+  ip_queue_xmit
+  tcp_transmit_skb
+  tcp_write_xmit
+  __tcp_push_pending_frames
+  tcp_push
+  tcp_sendmsg
+  inet_sendmsg
+  sock_sendmsg
+  sock_write_iter
+  __vfs_write
+  vfs_write
+  SyS_write
+  entry_SYSCALL_64_fastpath
+    13
+
+Detaching...
+
+They have grouped together as a single unique stack, since the kernel part
+was the same.
+
+
+Here is just the user stacks, fetched during the kernel function ip_output():
+
+# ./stackcount.py -U ip_output
+Tracing 1 functions for "ip_output"... Hit Ctrl-C to end.
+^C
+  [unknown]
+    snmpd [1645]
+    1
+
+  __write_nocancel
+  [unknown]
+  [unknown]
+    sshd [8234]
+    3
+
+  __write_nocancel
+    sshd [15015]
+    4
+
+I should really run a custom sshd with frame pointers so we can see its
+stack trace...
+
+
+User-space functions can also be traced if a library name is provided. For
+example, to quickly identify code locations that allocate heap memory for
+PID 4902 (using -p), by tracing malloc from libc ("c:malloc"):
+
+# ./stackcount -p 4902 c:malloc
+Tracing 1 functions for "malloc"... Hit Ctrl-C to end.
+^C
+  malloc
+  rbtree_new
+  main
+  [unknown]
+    12
+
+  malloc
+  _rbtree_node_new_internal
+  _rbtree_node_insert
+  rbtree_insert
+  main
+  [unknown]
+    1189
+
+Detaching...
+
+Kernel stacks are absent as this didn't enter kernel code.
+
+Note that user-space uses of stackcount can be somewhat more limited because
+a lot of user-space libraries and binaries are compiled without frame-pointers
+as discussed earlier (the -fomit-frame-pointer compiler default) or are used
+without debuginfo.
+
+
+In addition to kernel and user-space functions, kernel tracepoints and USDT
+tracepoints are also supported. 
+
+For example, to determine where threads are being created in a particular 
+process, use the pthread_create USDT tracepoint:
+
+# ./stackcount -p $(pidof parprimes) u:pthread:pthread_create
+Tracing 1 functions for "u:pthread:pthread_create"... Hit Ctrl-C to end.
+^C
+
+    parprimes [11923]
+  pthread_create@@GLIBC_2.2.5
+  main
+  __libc_start_main
+  [unknown]
+    7
+
+You can use "readelf -n file" to see if it has USDT tracepoints.
+
+
+Similarly, to determine where context switching is happening in the kernel, 
+use the sched:sched_switch kernel tracepoint:
+
+# ./stackcount t:sched:sched_switch
+  __schedule
+  schedule
+  worker_thread
+  kthread
+  ret_from_fork
+    kworker/0:2 [25482]
+    1
+
+  __schedule
+  schedule
+  schedule_hrtimeout_range_clock
+  schedule_hrtimeout_range
+  ep_poll
+  SyS_epoll_wait
+  entry_SYSCALL_64_fastpath
+  epoll_wait
+  Lsun/nio/ch/SelectorImpl;::lockAndDoSelect
+  Lsun/nio/ch/SelectorImpl;::select
+  Lio/netty/channel/nio/NioEventLoop;::select
+  Lio/netty/channel/nio/NioEventLoop;::run
+  Interpreter
+  Interpreter
+  call_stub
+  JavaCalls::call_helper(JavaValue*, methodHandle*, JavaCallArguments*, Thread*)
+  JavaCalls::call_virtual(JavaValue*, KlassHandle, Symbol*, Symbol*, JavaCallArguments*, Thread*)
+  JavaCalls::call_virtual(JavaValue*, Handle, KlassHandle, Symbol*, Symbol*, Thread*)
+  thread_entry(JavaThread*, Thread*)
+  JavaThread::thread_main_inner()
+  JavaThread::run()
+  java_start(Thread*)
+  start_thread
+    java [4996]
+    1
+
+... (omitted for brevity)
+
+  __schedule
+  schedule
+  schedule_preempt_disabled
+  cpu_startup_entry
+  xen_play_dead
+  arch_cpu_idle_dead
+  cpu_startup_entry
+  cpu_bringup_and_idle
+    swapper/1 [0]
+    289
+
+
+A -i option can be used to set an output interval, and -T to include a
+timestamp. For example:
+
+# ./stackcount.py -Tdi 1 submit_bio
+Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end.
+
+06:05:13
+
+06:05:14
+  submit_bio
+  xfs_do_writepage
+  write_cache_pages
+  xfs_vm_writepages
+  do_writepages
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:1 [15686]
+    1
+
+  submit_bio
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:0 [16007]
+    1
+
+  submit_bio
+  xfs_buf_submit
+  xlog_bdstrat
+  xlog_sync
+  xlog_state_release_iclog
+  _xfs_log_force
+  xfs_log_force
+  xfs_fs_sync_fs
+  sync_fs_one_sb
+  iterate_supers
+  sys_sync
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+    sync [16039]
+    1
+
+  submit_bio
+  submit_bh
+  journal_submit_commit_record
+  jbd2_journal_commit_transaction
+  kjournald2
+  kthread
+  ret_from_fork
+    --
+    jbd2/xvda1-8 [405]
+    1
+
+  submit_bio
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/0:2 [25482]
+    2
+
+  submit_bio
+  ext4_writepages
+  do_writepages
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:0 [16007]
+    4
+
+  submit_bio
+  xfs_vm_writepages
+  do_writepages
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:1 [15686]
+    5
+
+  submit_bio
+  __block_write_full_page
+  block_write_full_page
+  blkdev_writepage
+  __writepage
+  write_cache_pages
+  generic_writepages
+  blkdev_writepages
+  do_writepages
+  __filemap_fdatawrite_range
+  filemap_fdatawrite
+  fdatawrite_one_bdev
+  iterate_bdevs
+  sys_sync
+  entry_SYSCALL_64_fastpath
+    --
+  [unknown]
+    sync [16039]
+    7
+
+  submit_bio
+  submit_bh
+  jbd2_journal_commit_transaction
+  kjournald2
+  kthread
+  ret_from_fork
+    --
+    jbd2/xvda1-8 [405]
+    8
+
+  submit_bio
+  ext4_bio_write_page
+  mpage_submit_page
+  mpage_map_and_submit_buffers
+  ext4_writepages
+  do_writepages
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:0 [16007]
+    8
+
+  submit_bio
+  __block_write_full_page
+  block_write_full_page
+  blkdev_writepage
+  __writepage
+  write_cache_pages
+  generic_writepages
+  blkdev_writepages
+  do_writepages
+  __writeback_single_inode
+  writeback_sb_inodes
+  __writeback_inodes_wb
+  wb_writeback
+  wb_workfn
+  process_one_work
+  worker_thread
+  kthread
+  ret_from_fork
+    --
+    kworker/u16:0 [16007]
+    60
+
+
+06:05:15
+
+06:05:16
+
+Detaching...
+
+This only included output for the 06:05:14 interval. The other intervals
+did not span block device I/O.
+
+
+The -s output prints the return instruction offset for each function (aka
+symbol offset). Eg:
+
+# ./stackcount.py -s tcp_sendmsg
+Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end.
+^C
+  tcp_sendmsg+0x1
+  sock_sendmsg+0x38
+  sock_write_iter+0x85
+  __vfs_write+0xe3
+  vfs_write+0xb8
+  SyS_write+0x55
+  entry_SYSCALL_64_fastpath+0x1e
+  __write_nocancel+0x7
+    sshd [15015]
+    3
+
+  tcp_sendmsg+0x1
+  sock_sendmsg+0x38
+  sock_write_iter+0x85
+  __vfs_write+0xe3
+  vfs_write+0xb8
+  SyS_write+0x55
+  entry_SYSCALL_64_fastpath+0x1e
+  __write_nocancel+0x7
+    sshd [8234]
+    3
+
+Detaching...
+
+If it wasn't clear how one function called another, knowing the instruction
+offset can help you locate the lines of code from a disassembly dump.
+
+
+The -v output is verbose, and shows raw addresses:
+
+./stackcount.py -v tcp_sendmsg
+Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end.
+^C
+  ffffffff817b05c1 tcp_sendmsg
+  ffffffff8173ea48 sock_sendmsg
+  ffffffff8173eae5 sock_write_iter
+  ffffffff81232b33 __vfs_write
+  ffffffff812331b8 vfs_write
+  ffffffff81234625 SyS_write
+  ffffffff818739bb entry_SYSCALL_64_fastpath
+  7f120511e6e0     __write_nocancel
+    sshd [8234]
+    3
+
+  ffffffff817b05c1 tcp_sendmsg
+  ffffffff8173ea48 sock_sendmsg
+  ffffffff8173eae5 sock_write_iter
+  ffffffff81232b33 __vfs_write
+  ffffffff812331b8 vfs_write
+  ffffffff81234625 SyS_write
+  ffffffff818739bb entry_SYSCALL_64_fastpath
+  7f919c5a26e0     __write_nocancel
+    sshd [15015]
+    11
+
+Detaching...
+
+
+A wildcard can also be used. Eg, all functions beginning with "tcp_send",
+kernel stacks only (-K) with offsets (-s):
+
+# ./stackcount -Ks 'tcp_send*'
+Tracing 14 functions for "tcp_send*"... Hit Ctrl-C to end.
+^C
+  tcp_send_delayed_ack0x1
+  tcp_rcv_established0x3b1
+  tcp_v4_do_rcv0x130
+  tcp_v4_rcv0x8e0
+  ip_local_deliver_finish0x9f
+  ip_local_deliver0x51
+  ip_rcv_finish0x8a
+  ip_rcv0x29d
+  __netif_receive_skb_core0x637
+  __netif_receive_skb0x18
+  netif_receive_skb_internal0x23
+    1
+
+  tcp_send_delayed_ack0x1
+  tcp_rcv_established0x222
+  tcp_v4_do_rcv0x130
+  tcp_v4_rcv0x8e0
+  ip_local_deliver_finish0x9f
+  ip_local_deliver0x51
+  ip_rcv_finish0x8a
+  ip_rcv0x29d
+  __netif_receive_skb_core0x637
+  __netif_receive_skb0x18
+  netif_receive_skb_internal0x23
+    4
+
+  tcp_send_mss0x1
+  inet_sendmsg0x67
+  sock_sendmsg0x38
+  sock_write_iter0x78
+  __vfs_write0xaa
+  vfs_write0xa9
+  sys_write0x46
+  entry_SYSCALL_64_fastpath0x16
+    7
+
+  tcp_sendmsg0x1
+  sock_sendmsg0x38
+  sock_write_iter0x78
+  __vfs_write0xaa
+  vfs_write0xa9
+  sys_write0x46
+  entry_SYSCALL_64_fastpath0x16
+    7
+
+Detaching...
+
+Use -r to allow regular expressions.
+
+
+The -f option will emit folded output, which can be used as input to other
+tools including flame graphs. For example, with delimiters as well:
+
+# ./stackcount.py -df t:sched:sched_switch
+^Csnmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
+kworker/7:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1
+watchdog/0;-;ret_from_fork;kthread;smpboot_thread_fn;schedule;__schedule 1
+snmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
+svscan;[unknown];-;entry_SYSCALL_64_fastpath;SyS_nanosleep;hrtimer_nanosleep;do_nanosleep;schedule;__schedule 1
+python;[unknown];__select_nocancel;-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1
+kworker/2:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1
+[...]
+
+Flame graphs visualize stack traces. For information about them and links
+to open source software, see http://www.brendangregg.com/flamegraphs.html .
+This folded output can be piped directly into flamegraph.pl (the Perl version).
+
+
+USAGE message:
+
+# ./stackcount -h
+usage: stackcount [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s]
+                  [-P] [-K] [-U] [-v] [-d] [-f] [--debug]
+                  pattern
+
+Count events and their stack traces
+
+positional arguments:
+  pattern               search expression for events
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     trace this PID only
+  -i INTERVAL, --interval INTERVAL
+                        summary interval, seconds
+  -D DURATION, --duration DURATION
+                        total duration of trace, seconds
+  -T, --timestamp       include timestamp on output
+  -r, --regexp          use regular expressions. Default is "*" wildcards
+                        only.
+  -s, --offset          show address offsets
+  -P, --perpid          display stacks separately for each process
+  -K, --kernel-stacks-only
+                        kernel stack only
+  -U, --user-stacks-only
+                        user stack only
+  -v, --verbose         show raw addresses
+  -d, --delimited       insert delimiter between kernel/user stacks
+  -f, --folded          output folded format
+  --debug               print BPF program before starting (for debugging
+                        purposes)
+
+examples:
+    ./stackcount submit_bio         # count kernel stack traces for submit_bio
+    ./stackcount -d ip_output       # include a user/kernel stack delimiter
+    ./stackcount -s ip_output       # show symbol offsets
+    ./stackcount -sv ip_output      # show offsets and raw addresses (verbose)
+    ./stackcount 'tcp_send*'        # count stacks for funcs matching tcp_send*
+    ./stackcount -r '^tcp_send.*'   # same as above, using regular expressions
+    ./stackcount -Ti 5 ip_output    # output every 5 seconds, with timestamps
+    ./stackcount -p 185 ip_output   # count ip_output stacks for PID 185 only
+    ./stackcount -p 185 c:malloc    # count stacks for malloc in PID 185
+    ./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint
+    ./stackcount -p 185 u:node:*    # count stacks for all USDT probes in node
+    ./stackcount -K t:sched:sched_switch   # kernel stacks only
+    ./stackcount -U t:sched:sched_switch   # user stacks only
diff --git a/tools/stacksnoop.lua b/tools/stacksnoop.lua
new file mode 100755
index 0000000..5bcef8c
--- /dev/null
+++ b/tools/stacksnoop.lua
@@ -0,0 +1,107 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 GitHub, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+--]]
+
+local program = [[
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+struct data_t {
+    u64 stack_id;
+    u32 pid;
+    char comm[TASK_COMM_LEN];
+};
+
+BPF_STACK_TRACE(stack_traces, 128);
+BPF_PERF_OUTPUT(events);
+
+void trace_stack(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+    struct data_t data = {};
+    data.stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID),
+    data.pid = pid;
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    events.perf_submit(ctx, &data, sizeof(data));
+}
+]]
+
+local ffi = require("ffi")
+
+-- Entry point invoked by the bcc-lua runtime with the BPF framework
+-- table and shared CLI/POSIX utilities.
+return function(BPF, utils)
+  local parser = utils.argparse("stacksnoop",
+      "Trace and print kernel stack traces for a kernel function")
+  parser:flag("-s --offset")
+  parser:flag("-v --verbose")
+  parser:option("-p --pid"):convert(tonumber)
+  parser:argument("function", "kernel function name"):target("fn")
+
+  local args = parser:parse()
+  -- kernel symbol cache used to resolve stack-trace addresses to names
+  local ksym = BPF.SymbolCache()
+  local filter = ""
+
+  if args.pid then
+    -- NOTE(review): "%" on strings here is bcc-lua's printf-style
+    -- formatting extension, not standard Lua -- confirm against the
+    -- runtime's string metatable setup.
+    filter = "if (pid != %d) { return; }" % args.pid
+  end
+
+  -- splice the optional PID filter into the embedded C source and load it
+  local text = program:gsub("FILTER", filter)
+  local bpf = BPF:new{text=text}
+  bpf:attach_kprobe{event=args.fn, fn_name="trace_stack"}
+
+  -- attach_kprobe on a nonexistent function leaves zero open kprobes
+  if BPF.num_open_kprobes() == 0 then
+    print("Function \"%s\" not found. Exiting." % {args.fn})
+    return
+  end
+
+  -- column header; verbose mode adds COMM/PID/CPU columns
+  if args.verbose then
+    print("%-18s %-12s %-6s %-3s %s" %
+        {"TIME(s)", "COMM", "PID", "CPU", "FUNCTION"})
+  else
+    print("%-18s %s" % {"TIME(s)", "FUNCTION"})
+  end
+
+  local stack_traces = bpf:get_table("stack_traces")
+  local start_ts = utils.posix.time_ns()
+
+  -- perf-buffer callback: prints one event line plus the resolved
+  -- kernel stack frames recorded for it
+  local function print_event(cpu, event)
+    -- seconds since the tool started
+    local ts = (utils.posix.time_ns() - start_ts) / 1e9
+
+    if args.verbose then
+      print("%-18.9f %-12.12s %-6d %-3d %s" %
+          {ts, ffi.string(event.comm), event.pid, cpu, args.fn})
+    else
+      print("%-18.9f %s" % {ts, args.fn})
+    end
+
+    -- walk the stack id stored by the BPF program, resolving each
+    -- address through the kernel symbol cache
+    for addr in stack_traces:walk(tonumber(event.stack_id)) do
+      local sym, offset = ksym:resolve(addr)
+      if args.offset then
+        print("\t%-16p %s+0x%x" % {addr, sym, tonumber(offset)})
+      else
+        print("\t%-16p %s" % {addr, sym})
+      end
+    end
+
+    print()
+  end
+
+  local TASK_COMM_LEN = 16 -- linux/sched.h
+
+  -- the struct layout string must match struct data_t in the C program
+  bpf:get_table("events"):open_perf_buffer(print_event,
+    "struct { uint64_t stack_id; uint32_t pid; char comm[$]; }",
+    {TASK_COMM_LEN})
+  bpf:perf_buffer_poll_loop()
+end
diff --git a/tools/statsnoop.py b/tools/statsnoop.py
new file mode 100755
index 0000000..4e62ebd
--- /dev/null
+++ b/tools/statsnoop.py
@@ -0,0 +1,182 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# statsnoop Trace stat() syscalls.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: statsnoop [-h] [-t] [-x] [-p PID]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 08-Feb-2016   Brendan Gregg   Created this.
+# 17-Feb-2016   Allan McAleavy updated for BPF_PERF_OUTPUT
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./statsnoop           # trace all stat() syscalls
+    ./statsnoop -t        # include timestamps
+    ./statsnoop -x        # only show failed stats
+    ./statsnoop -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace stat() syscalls",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-x", "--failed", action="store_true",
+    help="only show failed stats")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/limits.h>
+#include <linux/sched.h>
+
+struct val_t {
+    const char *fname;
+};
+
+struct data_t {
+    u32 pid;
+    u64 ts_ns;
+    int ret;
+    char comm[TASK_COMM_LEN];
+    char fname[NAME_MAX];
+};
+
+BPF_HASH(args_filename, u32, const char *);
+BPF_HASH(infotmp, u32, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+int syscall__entry(struct pt_regs *ctx, const char __user *filename)
+{
+    struct val_t val = {};
+    u32 pid = bpf_get_current_pid_tgid();
+
+    FILTER
+    val.fname = filename;
+    infotmp.update(&pid, &val);
+
+    return 0;
+};
+
+int trace_return(struct pt_regs *ctx)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    struct val_t *valp;
+
+    valp = infotmp.lookup(&pid);
+    if (valp == 0) {
+        // missed entry
+        return 0;
+    }
+
+    struct data_t data = {.pid = pid};
+    bpf_probe_read(&data.fname, sizeof(data.fname), (void *)valp->fname);
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    data.ts_ns = bpf_ktime_get_ns();
+    data.ret = PT_REGS_RC(ctx);
+
+    events.perf_submit(ctx, &data, sizeof(data));
+    infotmp.delete(&pid);
+    args_filename.delete(&pid);
+
+    return 0;
+}
+"""
+# Substitute the optional PID filter into the embedded C source; with no
+# -p option the FILTER placeholder is simply removed.
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+# --ebpf (hidden flag) or debug: dump the generated C program;
+# --ebpf exits without attaching any probes.
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF: compile and load the generated C program
+b = BPF(text=bpf_text)
+
+# for POSIX compliance, all architectures implement these
+# system calls but the name of the actual entry point may
+# be different for which we must check if the entry points
+# actually exist before attaching the probes
+syscall_fnname = b.get_syscall_fnname("stat")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__entry")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")
+
+syscall_fnname = b.get_syscall_fnname("statfs")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__entry")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")
+
+syscall_fnname = b.get_syscall_fnname("newstat")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__entry")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")
+
+TASK_COMM_LEN = 16    # linux/sched.h
+NAME_MAX = 255        # linux/limits.h
+
+# ctypes mirror of `struct data_t` in the BPF C program above; events are
+# read via ct.cast from the perf buffer, so field order and sizes must
+# track the C layout.
+class Data(ct.Structure):
+    _fields_ = [
+        # NOTE(review): the C struct declares `u32 pid` followed by
+        # `u64 ts_ns`; c_ulonglong here reads pid plus 4 bytes of
+        # alignment padding -- works on little-endian, confirm intent.
+        ("pid", ct.c_ulonglong),
+        ("ts_ns", ct.c_ulonglong),
+        ("ret", ct.c_int),
+        ("comm", ct.c_char * TASK_COMM_LEN),
+        ("fname", ct.c_char * NAME_MAX)
+    ]
+
+start_ts = 0
+prev_ts = 0
+delta = 0
+
+# header
+if args.timestamp:
+    print("%-14s" % ("TIME(s)"), end="")
+print("%-6s %-16s %4s %3s %s" % ("PID", "COMM", "FD", "ERR", "PATH"))
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    global start_ts
+    global prev_ts
+    global delta
+    global cont
+
+    # split return value into FD and errno columns
+    if event.ret >= 0:
+        fd_s = event.ret
+        err = 0
+    else:
+        fd_s = -1
+        err = - event.ret
+
+    if start_ts == 0:
+        start_ts = event.ts_ns
+
+    if args.timestamp:
+        print("%-14.9f" % (float(event.ts_ns - start_ts) / 1000000000), end="")
+
+    print("%-6d %-16s %4d %3d %s" % (event.pid,
+        event.comm.decode('utf-8', 'replace'), fd_s, err,
+        event.fname.decode('utf-8', 'replace')))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/statsnoop_example.txt b/tools/statsnoop_example.txt
new file mode 100644
index 0000000..45f0e7e
--- /dev/null
+++ b/tools/statsnoop_example.txt
@@ -0,0 +1,73 @@
+Demonstrations of statsnoop, the Linux eBPF/bcc version.
+
+
+statsnoop traces the different stat() syscalls system-wide, and prints various
+details. Example output:
+
+# ./statsnoop 
+PID    COMM               FD ERR PATH
+31126  bash                0   0 .
+31126  bash               -1   2 /usr/local/sbin/iconfig
+31126  bash               -1   2 /usr/local/bin/iconfig
+31126  bash               -1   2 /usr/sbin/iconfig
+31126  bash               -1   2 /usr/bin/iconfig
+31126  bash               -1   2 /sbin/iconfig
+31126  bash               -1   2 /bin/iconfig
+31126  bash               -1   2 /usr/games/iconfig
+31126  bash               -1   2 /usr/local/games/iconfig
+31126  bash               -1   2 /apps/python/bin/iconfig
+31126  bash               -1   2 /mnt/src/llvm/build/bin/iconfig
+8902   command-not-fou    -1   2 /usr/bin/Modules/Setup
+8902   command-not-fou    -1   2 /usr/bin/lib/python3.4/os.py
+8902   command-not-fou    -1   2 /usr/bin/lib/python3.4/os.pyc
+8902   command-not-fou     0   0 /usr/lib/python3.4/os.py
+8902   command-not-fou    -1   2 /usr/bin/pybuilddir.txt
+8902   command-not-fou    -1   2 /usr/bin/lib/python3.4/lib-dynload
+8902   command-not-fou     0   0 /usr/lib/python3.4/lib-dynload
+8902   command-not-fou     0   0 /apps/python/lib/python2.7/site-packages
+8902   command-not-fou     0   0 /apps/python/lib/python2.7/site-packages
+8902   command-not-fou     0   0 /apps/python/lib/python2.7/site-packages
+8902   command-not-fou     0   0 /usr/lib/python3.4/
+8902   command-not-fou     0   0 /usr/lib/python3.4/
+[...]
+
+This output has caught me mistyping a command in another shell, "iconfig"
+instead of "ifconfig". The first several lines show the bash shell searching
+the $PATH, and failing to find it (ERR == 2 is file not found). Then, a
+"command-not-found" program executes (the name is truncated to 16 characters
+in the COMM field), which begins the process of searching for and suggesting
+a package. ie, this:
+
+# iconfig
+No command 'iconfig' found, did you mean:
+ Command 'vconfig' from package 'vlan' (main)
+ Command 'fconfig' from package 'redboot-tools' (universe)
+ Command 'mconfig' from package 'mono-devel' (main)
+ Command 'iwconfig' from package 'wireless-tools' (main)
+ Command 'zconfig' from package 'python-zconfig' (universe)
+ Command 'ifconfig' from package 'net-tools' (main)
+iconfig: command not found
+
+statsnoop can be used for general debugging, to see what file information has
+been requested, and whether those files exist. It can be used as a companion
+to opensnoop, which shows what files were actually opened.
+
+
+USAGE message:
+
+# ./statsnoop -h
+usage: statsnoop [-h] [-t] [-x] [-p PID]
+
+Trace stat() syscalls
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -t, --timestamp    include timestamp on output
+  -x, --failed       only show failed stats
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./statsnoop           # trace all stat() syscalls
+    ./statsnoop -t        # include timestamps
+    ./statsnoop -x        # only show failed stats
+    ./statsnoop -p 181    # only trace PID 181
diff --git a/tools/syncsnoop.py b/tools/syncsnoop.py
new file mode 100755
index 0000000..ba3f1d3
--- /dev/null
+++ b/tools/syncsnoop.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# syncsnoop Trace sync() syscall.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# Written as a basic example of BCC trace & reformat. See
+# examples/hello_world.py for a BCC trace with default output example.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Aug-2015   Brendan Gregg   Created this.
+# 19-Feb-2016   Allan McAleavy migrated to BPF_PERF_OUTPUT
+
+from __future__ import print_function
+from bcc import BPF
+import ctypes as ct
+
+# load BPF program
+b = BPF(text="""
+struct data_t {
+    u64 ts;
+};
+
+BPF_PERF_OUTPUT(events);
+
+void syscall__sync(void *ctx) {
+    struct data_t data = {};
+    data.ts = bpf_ktime_get_ns() / 1000;
+    events.perf_submit(ctx, &data, sizeof(data));
+};
+""")
+b.attach_kprobe(event=b.get_syscall_fnname("sync"),
+                fn_name="syscall__sync")
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("ts", ct.c_ulonglong)
+    ]
+
+# header
+print("%-18s %s" % ("TIME(s)", "CALL"))
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%-18.9f sync()" % (float(event.ts) / 1000000))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/syncsnoop_example.txt b/tools/syncsnoop_example.txt
new file mode 100644
index 0000000..4425c5b
--- /dev/null
+++ b/tools/syncsnoop_example.txt
@@ -0,0 +1,14 @@
+Demonstrations of syncsnoop, the Linux eBPF/bcc version.
+
+
+This program traces calls to the kernel sync() routine, with basic timestamps:
+
+# ./syncsnoop
+TIME(s)            CALL
+16458148.611952    sync()
+16458151.533709    sync()
+^C
+
+While tracing, the "sync" command was executed in another server session.
+
+This can be useful to identify that sync() is being called, and its frequency.
diff --git a/tools/syscount.py b/tools/syscount.py
new file mode 100755
index 0000000..191511c
--- /dev/null
+++ b/tools/syscount.py
@@ -0,0 +1,574 @@
+#!/usr/bin/env python
+#
+# syscount   Summarize syscall counts and latencies.
+#
+# USAGE: syscount [-p PID] [-i INTERVAL] [-T TOP] [-x] [-L] [-m] [-P] [-l]
+#
+# Copyright 2017, Sasha Goldshtein.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 15-Feb-2017   Sasha Goldshtein    Created this.
+
+from bcc import BPF
+from bcc.utils import printb
+from time import sleep, strftime
+import argparse
+import errno
+import itertools
+import subprocess
+import sys
+import signal
+import platform
+
+if sys.version_info.major < 3:
+    izip_longest = itertools.izip_longest
+else:
+    izip_longest = itertools.zip_longest
+
+#
+# Syscall table for Linux x86_64, not very recent.
+# Automatically generated from strace/linux/x86_64/syscallent.h using the
+# following command:
+#
+#  cat syscallent.h | awk -F, '{ gsub(/[ \t"}]/, "", $4);
+#                                gsub(/[ \t/*]/, "", $5);
+#                                print "    "$5": \""$4"\","; }
+#                              BEGIN { print "syscalls = {" }
+#                              END { print "}" }'
+#
+syscalls = {
+    0: b"read",
+    1: b"write",
+    2: b"open",
+    3: b"close",
+    4: b"stat",
+    5: b"fstat",
+    6: b"lstat",
+    7: b"poll",
+    8: b"lseek",
+    9: b"mmap",
+    10: b"mprotect",
+    11: b"munmap",
+    12: b"brk",
+    13: b"rt_sigaction",
+    14: b"rt_sigprocmask",
+    15: b"rt_sigreturn",
+    16: b"ioctl",
+    17: b"pread",
+    18: b"pwrite",
+    19: b"readv",
+    20: b"writev",
+    21: b"access",
+    22: b"pipe",
+    23: b"select",
+    24: b"sched_yield",
+    25: b"mremap",
+    26: b"msync",
+    27: b"mincore",
+    28: b"madvise",
+    29: b"shmget",
+    30: b"shmat",
+    31: b"shmctl",
+    32: b"dup",
+    33: b"dup2",
+    34: b"pause",
+    35: b"nanosleep",
+    36: b"getitimer",
+    37: b"alarm",
+    38: b"setitimer",
+    39: b"getpid",
+    40: b"sendfile",
+    41: b"socket",
+    42: b"connect",
+    43: b"accept",
+    44: b"sendto",
+    45: b"recvfrom",
+    46: b"sendmsg",
+    47: b"recvmsg",
+    48: b"shutdown",
+    49: b"bind",
+    50: b"listen",
+    51: b"getsockname",
+    52: b"getpeername",
+    53: b"socketpair",
+    54: b"setsockopt",
+    55: b"getsockopt",
+    56: b"clone",
+    57: b"fork",
+    58: b"vfork",
+    59: b"execve",
+    60: b"_exit",
+    61: b"wait4",
+    62: b"kill",
+    63: b"uname",
+    64: b"semget",
+    65: b"semop",
+    66: b"semctl",
+    67: b"shmdt",
+    68: b"msgget",
+    69: b"msgsnd",
+    70: b"msgrcv",
+    71: b"msgctl",
+    72: b"fcntl",
+    73: b"flock",
+    74: b"fsync",
+    75: b"fdatasync",
+    76: b"truncate",
+    77: b"ftruncate",
+    78: b"getdents",
+    79: b"getcwd",
+    80: b"chdir",
+    81: b"fchdir",
+    82: b"rename",
+    83: b"mkdir",
+    84: b"rmdir",
+    85: b"creat",
+    86: b"link",
+    87: b"unlink",
+    88: b"symlink",
+    89: b"readlink",
+    90: b"chmod",
+    91: b"fchmod",
+    92: b"chown",
+    93: b"fchown",
+    94: b"lchown",
+    95: b"umask",
+    96: b"gettimeofday",
+    97: b"getrlimit",
+    98: b"getrusage",
+    99: b"sysinfo",
+    100: b"times",
+    101: b"ptrace",
+    102: b"getuid",
+    103: b"syslog",
+    104: b"getgid",
+    105: b"setuid",
+    106: b"setgid",
+    107: b"geteuid",
+    108: b"getegid",
+    109: b"setpgid",
+    110: b"getppid",
+    111: b"getpgrp",
+    112: b"setsid",
+    113: b"setreuid",
+    114: b"setregid",
+    115: b"getgroups",
+    116: b"setgroups",
+    117: b"setresuid",
+    118: b"getresuid",
+    119: b"setresgid",
+    120: b"getresgid",
+    121: b"getpgid",
+    122: b"setfsuid",
+    123: b"setfsgid",
+    124: b"getsid",
+    125: b"capget",
+    126: b"capset",
+    127: b"rt_sigpending",
+    128: b"rt_sigtimedwait",
+    129: b"rt_sigqueueinfo",
+    130: b"rt_sigsuspend",
+    131: b"sigaltstack",
+    132: b"utime",
+    133: b"mknod",
+    134: b"uselib",
+    135: b"personality",
+    136: b"ustat",
+    137: b"statfs",
+    138: b"fstatfs",
+    139: b"sysfs",
+    140: b"getpriority",
+    141: b"setpriority",
+    142: b"sched_setparam",
+    143: b"sched_getparam",
+    144: b"sched_setscheduler",
+    145: b"sched_getscheduler",
+    146: b"sched_get_priority_max",
+    147: b"sched_get_priority_min",
+    148: b"sched_rr_get_interval",
+    149: b"mlock",
+    150: b"munlock",
+    151: b"mlockall",
+    152: b"munlockall",
+    153: b"vhangup",
+    154: b"modify_ldt",
+    155: b"pivot_root",
+    156: b"_sysctl",
+    157: b"prctl",
+    158: b"arch_prctl",
+    159: b"adjtimex",
+    160: b"setrlimit",
+    161: b"chroot",
+    162: b"sync",
+    163: b"acct",
+    164: b"settimeofday",
+    165: b"mount",
+    166: b"umount",
+    167: b"swapon",
+    168: b"swapoff",
+    169: b"reboot",
+    170: b"sethostname",
+    171: b"setdomainname",
+    172: b"iopl",
+    173: b"ioperm",
+    174: b"create_module",
+    175: b"init_module",
+    176: b"delete_module",
+    177: b"get_kernel_syms",
+    178: b"query_module",
+    179: b"quotactl",
+    180: b"nfsservctl",
+    181: b"getpmsg",
+    182: b"putpmsg",
+    183: b"afs_syscall",
+    184: b"tuxcall",
+    185: b"security",
+    186: b"gettid",
+    187: b"readahead",
+    188: b"setxattr",
+    189: b"lsetxattr",
+    190: b"fsetxattr",
+    191: b"getxattr",
+    192: b"lgetxattr",
+    193: b"fgetxattr",
+    194: b"listxattr",
+    195: b"llistxattr",
+    196: b"flistxattr",
+    197: b"removexattr",
+    198: b"lremovexattr",
+    199: b"fremovexattr",
+    200: b"tkill",
+    201: b"time",
+    202: b"futex",
+    203: b"sched_setaffinity",
+    204: b"sched_getaffinity",
+    205: b"set_thread_area",
+    206: b"io_setup",
+    207: b"io_destroy",
+    208: b"io_getevents",
+    209: b"io_submit",
+    210: b"io_cancel",
+    211: b"get_thread_area",
+    212: b"lookup_dcookie",
+    213: b"epoll_create",
+    214: b"epoll_ctl_old",
+    215: b"epoll_wait_old",
+    216: b"remap_file_pages",
+    217: b"getdents64",
+    218: b"set_tid_address",
+    219: b"restart_syscall",
+    220: b"semtimedop",
+    221: b"fadvise64",
+    222: b"timer_create",
+    223: b"timer_settime",
+    224: b"timer_gettime",
+    225: b"timer_getoverrun",
+    226: b"timer_delete",
+    227: b"clock_settime",
+    228: b"clock_gettime",
+    229: b"clock_getres",
+    230: b"clock_nanosleep",
+    231: b"exit_group",
+    232: b"epoll_wait",
+    233: b"epoll_ctl",
+    234: b"tgkill",
+    235: b"utimes",
+    236: b"vserver",
+    237: b"mbind",
+    238: b"set_mempolicy",
+    239: b"get_mempolicy",
+    240: b"mq_open",
+    241: b"mq_unlink",
+    242: b"mq_timedsend",
+    243: b"mq_timedreceive",
+    244: b"mq_notify",
+    245: b"mq_getsetattr",
+    246: b"kexec_load",
+    247: b"waitid",
+    248: b"add_key",
+    249: b"request_key",
+    250: b"keyctl",
+    251: b"ioprio_set",
+    252: b"ioprio_get",
+    253: b"inotify_init",
+    254: b"inotify_add_watch",
+    255: b"inotify_rm_watch",
+    256: b"migrate_pages",
+    257: b"openat",
+    258: b"mkdirat",
+    259: b"mknodat",
+    260: b"fchownat",
+    261: b"futimesat",
+    262: b"newfstatat",
+    263: b"unlinkat",
+    264: b"renameat",
+    265: b"linkat",
+    266: b"symlinkat",
+    267: b"readlinkat",
+    268: b"fchmodat",
+    269: b"faccessat",
+    270: b"pselect6",
+    271: b"ppoll",
+    272: b"unshare",
+    273: b"set_robust_list",
+    274: b"get_robust_list",
+    275: b"splice",
+    276: b"tee",
+    277: b"sync_file_range",
+    278: b"vmsplice",
+    279: b"move_pages",
+    280: b"utimensat",
+    281: b"epoll_pwait",
+    282: b"signalfd",
+    283: b"timerfd_create",
+    284: b"eventfd",
+    285: b"fallocate",
+    286: b"timerfd_settime",
+    287: b"timerfd_gettime",
+    288: b"accept4",
+    289: b"signalfd4",
+    290: b"eventfd2",
+    291: b"epoll_create1",
+    292: b"dup3",
+    293: b"pipe2",
+    294: b"inotify_init1",
+    295: b"preadv",
+    296: b"pwritev",
+    297: b"rt_tgsigqueueinfo",
+    298: b"perf_event_open",
+    299: b"recvmmsg",
+    300: b"fanotify_init",
+    301: b"fanotify_mark",
+    302: b"prlimit64",
+    303: b"name_to_handle_at",
+    304: b"open_by_handle_at",
+    305: b"clock_adjtime",
+    306: b"syncfs",
+    307: b"sendmmsg",
+    308: b"setns",
+    309: b"getcpu",
+    310: b"process_vm_readv",
+    311: b"process_vm_writev",
+    312: b"kcmp",
+    313: b"finit_module",
+}
+
+# Try to use ausyscall if it is available, because it can give us an up-to-date
+# list of syscalls for various architectures, rather than the x86-64 hardcoded
+# list above.
+def parse_syscall(line):
+    parts = line.split()
+    return (int(parts[0]), parts[1].strip())
+
+try:
+    # Skip the first line, which is a header. The rest of the lines are simply
+    # SYSCALL_NUM\tSYSCALL_NAME pairs.
+    out = subprocess.check_output('ausyscall --dump | tail -n +2', shell=True)
+    syscalls = dict(map(parse_syscall, out.strip().split(b'\n')))
+except Exception as e:
+    if platform.machine() == "x86_64":
+        pass
+    else:
+        raise Exception("ausyscall: command not found")
+
+# signal handler
+def signal_ignore(signal, frame):
+    print()
+
+def handle_errno(errstr):
+    try:
+        return abs(int(errstr))
+    except ValueError:
+        pass
+
+    try:
+        return getattr(errno, errstr)
+    except AttributeError:
+        raise argparse.ArgumentTypeError("couldn't map %s to an errno" % errstr)
+
+
+parser = argparse.ArgumentParser(
+    description="Summarize syscall counts and latencies.")
+parser.add_argument("-p", "--pid", type=int, help="trace only this pid")
+parser.add_argument("-i", "--interval", type=int,
+    help="print summary at this interval (seconds)")
+parser.add_argument("-d", "--duration", type=int,
+    help="total duration of trace, in seconds")
+parser.add_argument("-T", "--top", type=int, default=10,
+    help="print only the top syscalls by count or latency")
+parser.add_argument("-x", "--failures", action="store_true",
+    help="trace only failed syscalls (return < 0)")
+parser.add_argument("-e", "--errno", type=handle_errno,
+    help="trace only syscalls that return this error (numeric or EPERM, etc.)")
+parser.add_argument("-L", "--latency", action="store_true",
+    help="collect syscall latency")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="display latency in milliseconds (default: microseconds)")
+parser.add_argument("-P", "--process", action="store_true",
+    help="count by process and not by syscall")
+parser.add_argument("-l", "--list", action="store_true",
+    help="print list of recognized syscalls and exit")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+if args.duration and not args.interval:
+    args.interval = args.duration
+if not args.interval:
+    args.interval = 99999999
+
+if args.list:
+    for grp in izip_longest(*(iter(sorted(syscalls.values())),) * 4):
+        print("   ".join(["%-20s" % s for s in grp if s is not None]))
+    sys.exit(0)
+
+text = """
+#ifdef LATENCY
+struct data_t {
+    u64 count;
+    u64 total_ns;
+};
+
+BPF_HASH(start, u64, u64);
+BPF_HASH(data, u32, struct data_t);
+#else
+BPF_HASH(data, u32, u64);
+#endif
+
+#ifdef LATENCY
+TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
+    u64 pid_tgid = bpf_get_current_pid_tgid();
+
+#ifdef FILTER_PID
+    if (pid_tgid >> 32 != FILTER_PID)
+        return 0;
+#endif
+
+    u64 t = bpf_ktime_get_ns();
+    start.update(&pid_tgid, &t);
+    return 0;
+}
+#endif
+
+TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
+    u64 pid_tgid = bpf_get_current_pid_tgid();
+
+#ifdef FILTER_PID
+    if (pid_tgid >> 32 != FILTER_PID)
+        return 0;
+#endif
+
+#ifdef FILTER_FAILED
+    if (args->ret >= 0)
+        return 0;
+#endif
+
+#ifdef FILTER_ERRNO
+    if (args->ret != -FILTER_ERRNO)
+        return 0;
+#endif
+
+#ifdef BY_PROCESS
+    u32 key = pid_tgid >> 32;
+#else
+    u32 key = args->id;
+#endif
+
+#ifdef LATENCY
+    struct data_t *val, zero = {};
+    u64 *start_ns = start.lookup(&pid_tgid);
+    if (!start_ns)
+        return 0;
+
+    val = data.lookup_or_init(&key, &zero);
+    val->count++;
+    val->total_ns += bpf_ktime_get_ns() - *start_ns;
+#else
+    u64 *val, zero = 0;
+    val = data.lookup_or_init(&key, &zero);
+    ++(*val);
+#endif
+    return 0;
+}
+"""
+
+if args.pid:
+    text = ("#define FILTER_PID %d\n" % args.pid) + text
+if args.failures:
+    text = "#define FILTER_FAILED\n" + text
+if args.errno:
+    text = "#define FILTER_ERRNO %d\n" % abs(args.errno) + text
+if args.latency:
+    text = "#define LATENCY\n" + text
+if args.process:
+    text = "#define BY_PROCESS\n" + text
+if args.ebpf:
+    print(text)
+    exit()
+
+bpf = BPF(text=text)
+
+def print_stats():
+    if args.latency:
+        print_latency_stats()
+    else:
+        print_count_stats()
+
+agg_colname = "PID    COMM" if args.process else "SYSCALL"
+time_colname = "TIME (ms)" if args.milliseconds else "TIME (us)"
+
+def comm_for_pid(pid):
+    try:
+        return open("/proc/%d/comm" % pid, "rb").read().strip()
+    except Exception:
+        return b"[unknown]"
+
+def agg_colval(key):
+    if args.process:
+        return b"%-6d %-15s" % (key.value, comm_for_pid(key.value))
+    else:
+        return syscalls.get(key.value, b"[unknown: %d]" % key.value)
+
+def print_count_stats():
+    data = bpf["data"]
+    print("[%s]" % strftime("%H:%M:%S"))
+    print("%-22s %8s" % (agg_colname, "COUNT"))
+    for k, v in sorted(data.items(), key=lambda kv: -kv[1].value)[:args.top]:
+        if k.value == 0xFFFFFFFF:
+            continue    # happens occasionally, we don't need it
+        printb(b"%-22s %8d" % (agg_colval(k), v.value))
+    print("")
+    data.clear()
+
+def print_latency_stats():
+    data = bpf["data"]
+    print("[%s]" % strftime("%H:%M:%S"))
+    print("%-22s %8s %16s" % (agg_colname, "COUNT", time_colname))
+    for k, v in sorted(data.items(),
+                       key=lambda kv: -kv[1].total_ns)[:args.top]:
+        if k.value == 0xFFFFFFFF:
+            continue    # happens occasionally, we don't need it
+        printb((b"%-22s %8d " + (b"%16.6f" if args.milliseconds else b"%16.3f")) %
+               (agg_colval(k), v.count,
+                v.total_ns / (1e6 if args.milliseconds else 1e3)))
+    print("")
+    data.clear()
+
+print("Tracing %ssyscalls, printing top %d... Ctrl+C to quit." %
+      ("failed " if args.failures else "", args.top))
+exiting = 0 if args.interval else 1
+seconds = 0
+while True:
+    try:
+        sleep(args.interval)
+        seconds += args.interval
+    except KeyboardInterrupt:
+        exiting = 1
+        signal.signal(signal.SIGINT, signal_ignore)
+    if args.duration and seconds >= args.duration:
+        exiting = 1
+
+    print_stats()
+
+    if exiting:
+        print("Detaching...")
+        exit()
diff --git a/tools/syscount_example.txt b/tools/syscount_example.txt
new file mode 100644
index 0000000..aad51c4
--- /dev/null
+++ b/tools/syscount_example.txt
@@ -0,0 +1,165 @@
+Demonstrations of syscount, the Linux/eBPF version.
+
+
+syscount summarizes syscall counts across the system or a specific process,
+with optional latency information. It is very useful for general workload
+characterization, for example:
+
+# syscount
+Tracing syscalls, printing top 10... Ctrl+C to quit.
+[09:39:04]
+SYSCALL             COUNT
+write               10739
+read                10584
+wait4                1460
+nanosleep            1457
+select                795
+rt_sigprocmask        689
+clock_gettime         653
+rt_sigaction          128
+futex                  86
+ioctl                  83
+^C
+
+These are the top 10 entries; you can get more by using the -T switch. Here,
+the output indicates that the write and read syscalls were very common, followed
+immediately by wait4, nanosleep, and so on. By default, syscount counts across
+the entire system, but we can point it to a specific process of interest:
+
+# syscount -p $(pidof dd)
+Tracing syscalls, printing top 10... Ctrl+C to quit.
+[09:40:21]
+SYSCALL             COUNT
+read              7878397
+write             7878397
+^C
+
+Indeed, dd's workload is a bit easier to characterize. Occasionally, the count
+of syscalls is not enough, and you'd also want an aggregate latency:
+
+# syscount -L
+Tracing syscalls, printing top 10... Ctrl+C to quit.
+[09:41:32]
+SYSCALL                   COUNT        TIME (us)
+select                       16      3415860.022
+nanosleep                   291        12038.707
+ftruncate                     1          122.939
+write                         4           63.389
+stat                          1           23.431
+fstat                         1            5.088
+[unknown: 321]               32            4.965
+timerfd_settime               1            4.830
+ioctl                         3            4.802
+kill                          1            4.342
+^C
+
+The select and nanosleep calls are responsible for a lot of time, but remember
+these are blocking calls. This output was taken from a mostly idle system. Note
+the "unknown" entry -- syscall 321 is the bpf() syscall, which is not in the
+table used by this tool (borrowed from strace sources).
+
+Another direction would be to understand which processes are making a lot of
+syscalls, thus responsible for a lot of activity. This is what the -P switch
+does:
+
+# syscount -P
+Tracing syscalls, printing top 10... Ctrl+C to quit.
+[09:58:13]
+PID    COMM               COUNT
+13820  vim                  548
+30216  sshd                 149
+29633  bash                  72
+25188  screen                70
+25776  mysqld                30
+31285  python                10
+529    systemd-udevd          9
+1      systemd                8
+494    systemd-journal        5
+^C
+
+This is again from a mostly idle system over an interval of a few seconds.
+
+Sometimes, you'd only care about failed syscalls -- these are the ones that
+might be worth investigating with follow-up tools like opensnoop, execsnoop,
+or trace. Use the -x switch for this; the following example also demonstrates
+the -i switch, for printing at predefined intervals:
+
+# syscount -x -i 5
+Tracing failed syscalls, printing top 10... Ctrl+C to quit.
+[09:44:16]
+SYSCALL             COUNT
+futex                  13
+getxattr               10
+stat                    8
+open                    6
+wait4                   3
+access                  2
+[unknown: 321]          1
+
+[09:44:21]
+SYSCALL             COUNT
+futex                  12
+getxattr               10
+[unknown: 321]          2
+wait4                   1
+access                  1
+pause                   1
+^C
+
+Similar to -x/--failures, sometimes you only care about certain syscall
+errors like EPERM or ENOENT -- these are the ones that might be worth
+investigating with follow-up tools like opensnoop, execsnoop, or
+trace. Use the -e/--errno switch for this; the following example also
+demonstrates the -i switch, for printing ENOENT failures at predefined intervals:
+
+# syscount -e ENOENT -i 5
+Tracing syscalls, printing top 10... Ctrl+C to quit.
+[13:15:57]
+SYSCALL                   COUNT
+stat                       4669
+open                       1951
+access                      561
+lstat                        62
+openat                       42
+readlink                      8
+execve                        4
+newfstatat                    1
+
+[13:16:02]
+SYSCALL                   COUNT
+lstat                     18506
+stat                      13087
+open                       2907
+access                      412
+openat                       19
+readlink                     12
+execve                        7
+connect                       6
+unlink                        1
+rmdir                         1
+^C
+
+USAGE:
+# syscount -h
+usage: syscount.py [-h] [-p PID] [-i INTERVAL] [-T TOP] [-x] [-e ERRNO] [-L]
+                   [-m] [-P] [-l]
+
+Summarize syscall counts and latencies.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -p PID, --pid PID     trace only this pid
+  -i INTERVAL, --interval INTERVAL
+                        print summary at this interval (seconds)
+  -d DURATION, --duration DURATION
+                        total duration of trace, in seconds
+  -T TOP, --top TOP     print only the top syscalls by count or latency
+  -x, --failures        trace only failed syscalls (return < 0)
+  -e ERRNO, --errno ERRNO
+                        trace only syscalls that return this error (numeric or
+                        EPERM, etc.)
+  -L, --latency         collect syscall latency
+  -m, --milliseconds    display latency in milliseconds (default:
+                        microseconds)
+  -P, --process         count by process and not by syscall
+  -l, --list            print list of recognized syscalls and exit
diff --git a/tools/tclcalls.sh b/tools/tclcalls.sh
new file mode 100755
index 0000000..fafd550
--- /dev/null
+++ b/tools/tclcalls.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ucalls.py -l tcl "$@"
diff --git a/tools/tclcalls_example.txt b/tools/tclcalls_example.txt
new file mode 120000
index 0000000..22b0fb3
--- /dev/null
+++ b/tools/tclcalls_example.txt
@@ -0,0 +1 @@
+lib/ucalls_example.txt
\ No newline at end of file
diff --git a/tools/tclflow.sh b/tools/tclflow.sh
new file mode 100755
index 0000000..8930466
--- /dev/null
+++ b/tools/tclflow.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/uflow.py -l tcl "$@"
diff --git a/tools/tclflow_example.txt b/tools/tclflow_example.txt
new file mode 120000
index 0000000..bc71efc
--- /dev/null
+++ b/tools/tclflow_example.txt
@@ -0,0 +1 @@
+lib/uflow_example.txt
\ No newline at end of file
diff --git a/tools/tclobjnew.sh b/tools/tclobjnew.sh
new file mode 100755
index 0000000..6aed1ac
--- /dev/null
+++ b/tools/tclobjnew.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/uobjnew.py -l tcl "$@"
diff --git a/tools/tclobjnew_example.txt b/tools/tclobjnew_example.txt
new file mode 120000
index 0000000..a8a83c3
--- /dev/null
+++ b/tools/tclobjnew_example.txt
@@ -0,0 +1 @@
+lib/uobjnew_example.txt
\ No newline at end of file
diff --git a/tools/tclstat.sh b/tools/tclstat.sh
new file mode 100755
index 0000000..f4291ab
--- /dev/null
+++ b/tools/tclstat.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+lib=$(dirname $0)/lib
+$lib/ustat.py -l tcl "$@"
diff --git a/tools/tclstat_example.txt b/tools/tclstat_example.txt
new file mode 120000
index 0000000..544e5ad
--- /dev/null
+++ b/tools/tclstat_example.txt
@@ -0,0 +1 @@
+lib/ustat_example.txt
\ No newline at end of file
diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py
new file mode 100755
index 0000000..884b0c5
--- /dev/null
+++ b/tools/tcpaccept.py
@@ -0,0 +1,273 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpaccept Trace TCP accept()s.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpaccept [-h] [-t] [-p PID]
+#
+# This uses dynamic tracing of the kernel inet_csk_accept() socket function
+# (from tcp_prot.accept), and will need to be modified to match kernel changes.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 13-Oct-2015   Brendan Gregg   Created this.
+# 14-Feb-2016      "      "     Switch to bpf_perf_output.
+
+from __future__ import print_function
+from bcc import BPF
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+import argparse
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./tcpaccept           # trace all TCP accept()s
+    ./tcpaccept -t        # include timestamps
+    ./tcpaccept -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace TCP accepts",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+// separate data structs for ipv4 and ipv6
+struct ipv4_data_t {
+    u64 ts_us;
+    u32 pid;
+    u32 saddr;
+    u32 daddr;
+    u64 ip;
+    u16 lport;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv4_events);
+
+struct ipv6_data_t {
+    u64 ts_us;
+    u32 pid;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u64 ip;
+    u16 lport;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv6_events);
+"""
+
+#
+# The following is the code for older kernels (Linux pre-4.16).
+# It uses kprobes to instrument inet_csk_accept(). On Linux 4.16 and
+# later, the sock:inet_sock_set_state tracepoint should be used instead, as
+# is done by the code that follows this. 
+#
+bpf_text_kprobe = """
+int kretprobe__inet_csk_accept(struct pt_regs *ctx)
+{
+    struct sock *newsk = (struct sock *)PT_REGS_RC(ctx);
+    u32 pid = bpf_get_current_pid_tgid();
+
+    if (newsk == NULL)
+        return 0;
+
+    // check this is TCP
+    u8 protocol = 0;
+    // workaround for reading the sk_protocol bitfield:
+    
+    // The following comments were added by Joe Yin:
+    // Unfortunately, this cannot work since Linux 4.10,
+    // because the sk_wmem_queued is not following the bitfield of sk_protocol.
+    // And the following member is sk_gso_max_segs.
+    // So, we can use this:
+    // bpf_probe_read(&protocol, 1, (void *)((u64)&newsk->sk_gso_max_segs) - 3);
+    // In order to distinguish pre-4.10 from 4.10+, introduce the variables gso_max_segs_offset and sk_lingertime;
+    // sk_lingertime is close to gso_max_segs_offset, and
+    // the offset between the two members is 4
+
+    int gso_max_segs_offset = offsetof(struct sock, sk_gso_max_segs);
+    int sk_lingertime_offset = offsetof(struct sock, sk_lingertime);
+
+    if (sk_lingertime_offset - gso_max_segs_offset == 4) 
+        // 4.10+ with little endian
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+        protocol = *(u8 *)((u64)&newsk->sk_gso_max_segs - 3);
+    else
+        // pre-4.10 with little endian
+        protocol = *(u8 *)((u64)&newsk->sk_wmem_queued - 3);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        // 4.10+ with big endian
+        protocol = *(u8 *)((u64)&newsk->sk_gso_max_segs - 1);
+    else
+        // pre-4.10 with big endian
+        protocol = *(u8 *)((u64)&newsk->sk_wmem_queued - 1);
+#else
+# error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+
+    if (protocol != IPPROTO_TCP)
+        return 0;
+
+    // pull in details
+    u16 family = 0, lport = 0;
+    family = newsk->__sk_common.skc_family;
+    lport = newsk->__sk_common.skc_num;
+
+    if (family == AF_INET) {
+        struct ipv4_data_t data4 = {.pid = pid, .ip = 4};
+        data4.ts_us = bpf_ktime_get_ns() / 1000;
+        data4.saddr = newsk->__sk_common.skc_rcv_saddr;
+        data4.daddr = newsk->__sk_common.skc_daddr;
+        data4.lport = lport;
+        bpf_get_current_comm(&data4.task, sizeof(data4.task));
+        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
+
+    } else if (family == AF_INET6) {
+        struct ipv6_data_t data6 = {.pid = pid, .ip = 6};
+        data6.ts_us = bpf_ktime_get_ns() / 1000;
+        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
+            &newsk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
+            &newsk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+        data6.lport = lport;
+        bpf_get_current_comm(&data6.task, sizeof(data6.task));
+        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
+    }
+    // else drop
+
+    return 0;
+}
+"""
+
+bpf_text_tracepoint = """
+TRACEPOINT_PROBE(sock, inet_sock_set_state)
+{
+    if (args->protocol != IPPROTO_TCP)
+        return 0;
+    u32 pid = bpf_get_current_pid_tgid();
+    // pull in details
+    u16 family = 0, lport = 0;
+    family = args->family;
+    lport = args->sport;
+
+    if (family == AF_INET) {
+        struct ipv4_data_t data4 = {.pid = pid, .ip = 4};
+        data4.ts_us = bpf_ktime_get_ns() / 1000;
+        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
+        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
+        data4.lport = lport;
+        bpf_get_current_comm(&data4.task, sizeof(data4.task));
+        ipv4_events.perf_submit(args, &data4, sizeof(data4));
+    } else if (family == AF_INET6) {
+        struct ipv6_data_t data6 = {.pid = pid, .ip = 6};
+        data6.ts_us = bpf_ktime_get_ns() / 1000;
+        __builtin_memcpy(&data6.saddr, args->saddr, sizeof(data6.saddr));
+        __builtin_memcpy(&data6.daddr, args->daddr, sizeof(data6.daddr));
+        data6.lport = lport;
+        bpf_get_current_comm(&data6.task, sizeof(data6.task));
+        ipv6_events.perf_submit(args, &data6, sizeof(data6));
+    }
+    // else drop
+
+    return 0;
+}
+"""
+
+if (BPF.tracepoint_exists("sock", "inet_sock_set_state")):
+    bpf_text += bpf_text_tracepoint
+else:
+    bpf_text += bpf_text_kprobe
+
+
+# code substitutions
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# event data
+TASK_COMM_LEN = 16      # linux/sched.h
+
+class Data_ipv4(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("saddr", ct.c_uint),
+        ("daddr", ct.c_uint),
+        ("ip", ct.c_ulonglong),
+        ("lport", ct.c_ushort),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+class Data_ipv6(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("saddr", (ct.c_ulonglong * 2)),
+        ("daddr", (ct.c_ulonglong * 2)),
+        ("ip", ct.c_ulonglong),
+        ("lport", ct.c_ushort),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+# process event
+def print_ipv4_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+    global start_ts
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
+    print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
+        event.task.decode('utf-8', 'replace'), event.ip,
+        inet_ntop(AF_INET, pack("I", event.daddr)),
+        inet_ntop(AF_INET, pack("I", event.saddr)), event.lport))
+
+def print_ipv6_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+    global start_ts
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
+    print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
+        event.task.decode('utf-8', 'replace'), event.ip,
+        inet_ntop(AF_INET6, event.daddr),inet_ntop(AF_INET6, event.saddr),
+        event.lport))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# header
+if args.timestamp:
+    print("%-9s" % ("TIME(s)"), end="")
+print("%-6s %-12s %-2s %-16s %-16s %-4s" % ("PID", "COMM", "IP", "RADDR",
+    "LADDR", "LPORT"))
+
+start_ts = 0
+
+# read events
+b["ipv4_events"].open_perf_buffer(print_ipv4_event)
+b["ipv6_events"].open_perf_buffer(print_ipv6_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/tcpaccept_example.txt b/tools/tcpaccept_example.txt
new file mode 100644
index 0000000..f86c439
--- /dev/null
+++ b/tools/tcpaccept_example.txt
@@ -0,0 +1,51 @@
+Demonstrations of tcpaccept, the Linux eBPF/bcc version.
+
+
+This tool traces the kernel function accepting TCP socket connections (eg, a
+passive connection via accept(); not connect()). Some example output (IP
+addresses changed to protect the innocent):
+
+# ./tcpaccept
+PID    COMM         IP RADDR            LADDR            LPORT
+907    sshd         4  192.168.56.1     192.168.56.102   22
+907    sshd         4  127.0.0.1        127.0.0.1        22
+5389   perl         6  1234:ab12:2040:5020:2299:0:5:0 1234:ab12:2040:5020:2299:0:5:0 7001
+
+This output shows three connections, two IPv4 connections to PID 907, an "sshd"
+process listening on port 22, and one IPv6 connection to a "perl" process
+listening on port 7001.
+
+The overhead of this tool should be negligible, since it is only tracing the
+kernel function performing accept. It is not tracing every packet and then
+filtering.
+
+This tool only traces successful TCP accept()s. Connection attempts to closed
+ports will not be shown (those can be traced via other functions).
+
+
+The -t option prints a timestamp column:
+
+# ./tcpaccept -t
+TIME(s)  PID    COMM         IP RADDR            LADDR            LPORT
+0.000    907    sshd         4  127.0.0.1        127.0.0.1        22
+0.010    5389   perl         6  1234:ab12:2040:5020:2299:0:5:0 1234:ab12:2040:5020:2299:0:5:0 7001
+0.992    907    sshd         4  127.0.0.1        127.0.0.1        22
+1.984    907    sshd         4  127.0.0.1        127.0.0.1        22
+
+
+USAGE message:
+
+# ./tcpaccept -h
+usage: tcpaccept [-h] [-t] [-p PID]
+
+Trace TCP accepts
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -t, --timestamp    include timestamp on output
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./tcpaccept           # trace all TCP accept()s
+    ./tcpaccept -t        # include timestamps
+    ./tcpaccept -p 181    # only trace PID 181
diff --git a/tools/tcpconnect.py b/tools/tcpconnect.py
new file mode 100755
index 0000000..ac84326
--- /dev/null
+++ b/tools/tcpconnect.py
@@ -0,0 +1,240 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpconnect    Trace TCP connect()s.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpconnect [-h] [-t] [-p PID] [-P PORT [PORT ...]]
+#
+# All connection attempts are traced, even if they ultimately fail.
+#
+# This uses dynamic tracing of kernel functions, and will need to be updated
+# to match kernel changes.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 25-Sep-2015   Brendan Gregg   Created this.
+# 14-Feb-2016      "      "     Switch to bpf_perf_output.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from socket import inet_ntop, ntohs, AF_INET, AF_INET6
+from struct import pack
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./tcpconnect           # trace all TCP connect()s
+    ./tcpconnect -t        # include timestamps
+    ./tcpconnect -p 181    # only trace PID 181
+    ./tcpconnect -P 80     # only trace port 80
+    ./tcpconnect -P 80,81  # only trace port 80 and 81
+"""
+parser = argparse.ArgumentParser(
+    description="Trace TCP connects",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-P", "--port",
+    help="comma-separated list of destination ports to trace.")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(currsock, u32, struct sock *);
+
+// separate data structs for ipv4 and ipv6
+struct ipv4_data_t {
+    u64 ts_us;
+    u32 pid;
+    u32 saddr;
+    u32 daddr;
+    u64 ip;
+    u16 dport;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv4_events);
+
+struct ipv6_data_t {
+    u64 ts_us;
+    u32 pid;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u64 ip;
+    u16 dport;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv6_events);
+
+int trace_connect_entry(struct pt_regs *ctx, struct sock *sk)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER_PID
+
+    // stash the sock ptr for lookup on return
+    currsock.update(&pid, &sk);
+
+    return 0;
+};
+
+static int trace_connect_return(struct pt_regs *ctx, short ipver)
+{
+    int ret = PT_REGS_RC(ctx);
+    u32 pid = bpf_get_current_pid_tgid();
+
+    struct sock **skpp;
+    skpp = currsock.lookup(&pid);
+    if (skpp == 0) {
+        return 0;   // missed entry
+    }
+
+    if (ret != 0) {
+        // failed to send SYN packet, may not have populated
+        // socket __sk_common.{skc_rcv_saddr, ...}
+        currsock.delete(&pid);
+        return 0;
+    }
+
+    // pull in details
+    struct sock *skp = *skpp;
+    u16 dport = skp->__sk_common.skc_dport;
+
+    FILTER_PORT
+
+    if (ipver == 4) {
+        struct ipv4_data_t data4 = {.pid = pid, .ip = ipver};
+        data4.ts_us = bpf_ktime_get_ns() / 1000;
+        data4.saddr = skp->__sk_common.skc_rcv_saddr;
+        data4.daddr = skp->__sk_common.skc_daddr;
+        data4.dport = ntohs(dport);
+        bpf_get_current_comm(&data4.task, sizeof(data4.task));
+        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
+
+    } else /* 6 */ {
+        struct ipv6_data_t data6 = {.pid = pid, .ip = ipver};
+        data6.ts_us = bpf_ktime_get_ns() / 1000;
+        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
+            skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
+            skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+        data6.dport = ntohs(dport);
+        bpf_get_current_comm(&data6.task, sizeof(data6.task));
+        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
+    }
+
+    currsock.delete(&pid);
+
+    return 0;
+}
+
+int trace_connect_v4_return(struct pt_regs *ctx)
+{
+    return trace_connect_return(ctx, 4);
+}
+
+int trace_connect_v6_return(struct pt_regs *ctx)
+{
+    return trace_connect_return(ctx, 6);
+}
+"""
+
+# code substitutions
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID',
+        'if (pid != %s) { return 0; }' % args.pid)
+if args.port:
+    dports = [int(dport) for dport in args.port.split(',')]
+    dports_if = ' && '.join(['dport != %d' % ntohs(dport) for dport in dports])
+    bpf_text = bpf_text.replace('FILTER_PORT',
+        'if (%s) { currsock.delete(&pid); return 0; }' % dports_if)
+
+bpf_text = bpf_text.replace('FILTER_PID', '')
+bpf_text = bpf_text.replace('FILTER_PORT', '')
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# event data
+TASK_COMM_LEN = 16      # linux/sched.h
+
+class Data_ipv4(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("saddr", ct.c_uint),
+        ("daddr", ct.c_uint),
+        ("ip", ct.c_ulonglong),
+        ("dport", ct.c_ushort),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+class Data_ipv6(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("saddr", (ct.c_ulonglong * 2)),
+        ("daddr", (ct.c_ulonglong * 2)),
+        ("ip", ct.c_ulonglong),
+        ("dport", ct.c_ushort),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+# process event
+def print_ipv4_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+    global start_ts
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
+    print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
+        event.task.decode('utf-8', 'replace'), event.ip,
+        inet_ntop(AF_INET, pack("I", event.saddr)),
+        inet_ntop(AF_INET, pack("I", event.daddr)), event.dport))
+
+def print_ipv6_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+    global start_ts
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
+    print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
+        event.task.decode('utf-8', 'replace'), event.ip,
+        inet_ntop(AF_INET6, event.saddr), inet_ntop(AF_INET6, event.daddr),
+        event.dport))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_entry")
+b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_entry")
+b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return")
+b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return")
+
+# header
+if args.timestamp:
+    print("%-9s" % ("TIME(s)"), end="")
+print("%-6s %-12s %-2s %-16s %-16s %-4s" % ("PID", "COMM", "IP", "SADDR",
+    "DADDR", "DPORT"))
+
+start_ts = 0
+
+# read events
+b["ipv4_events"].open_perf_buffer(print_ipv4_event)
+b["ipv6_events"].open_perf_buffer(print_ipv6_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/tcpconnect_example.txt b/tools/tcpconnect_example.txt
new file mode 100644
index 0000000..6d2f8f8
--- /dev/null
+++ b/tools/tcpconnect_example.txt
@@ -0,0 +1,60 @@
+Demonstrations of tcpconnect, the Linux eBPF/bcc version.
+
+
+This tool traces the kernel function performing active TCP connections
+(eg, via a connect() syscall; accept() calls are passive connections). Some example
+output (IP addresses changed to protect the innocent):
+
+# ./tcpconnect
+PID    COMM         IP SADDR            DADDR            DPORT
+1479   telnet       4  127.0.0.1        127.0.0.1        23
+1469   curl         4  10.201.219.236   54.245.105.25    80
+1469   curl         4  10.201.219.236   54.67.101.145    80
+1991   telnet       6  ::1              ::1              23
+2015   ssh          6  fe80::2000:bff:fe82:3ac fe80::2000:bff:fe82:3ac 22
+
+This output shows five connections, two from "telnet" processes, two from
+"curl", and one from "ssh". The output details show the IP version, source
+address, destination address, and destination port. This traces attempted
+connections: these may have failed.
+
+The overhead of this tool should be negligible, since it is only tracing the
+kernel functions performing connect. It is not tracing every packet and then
+filtering.
+
+
+The -t option prints a timestamp column:
+
+# ./tcpconnect -t
+TIME(s)  PID    COMM         IP SADDR            DADDR            DPORT
+31.871   2482   local_agent  4  10.103.219.236   10.251.148.38    7001
+31.874   2482   local_agent  4  10.103.219.236   10.101.3.132     7001
+31.878   2482   local_agent  4  10.103.219.236   10.171.133.98    7101
+90.917   2482   local_agent  4  10.103.219.236   10.251.148.38    7001
+90.928   2482   local_agent  4  10.103.219.236   10.102.64.230    7001
+90.938   2482   local_agent  4  10.103.219.236   10.115.167.169   7101
+
+The output shows some periodic connections (or attempts) from a "local_agent"
+process to various other addresses. A few connections occur every minute.
+
+
+USAGE message:
+
+# ./tcpconnect -h
+usage: tcpconnect [-h] [-t] [-p PID] [-P PORT]
+
+Trace TCP connects
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -t, --timestamp    include timestamp on output
+  -p PID, --pid PID  trace this PID only
+  -P PORT, --port PORT
+                     comma-separated list of destination ports to trace.
+
+examples:
+    ./tcpconnect           # trace all TCP connect()s
+    ./tcpconnect -t        # include timestamps
+    ./tcpconnect -p 181    # only trace PID 181
+    ./tcpconnect -P 80     # only trace port 80
+    ./tcpconnect -P 80,81  # only trace port 80 and 81
diff --git a/tools/tcpconnlat.py b/tools/tcpconnlat.py
new file mode 100755
index 0000000..0d21b83
--- /dev/null
+++ b/tools/tcpconnlat.py
@@ -0,0 +1,267 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpconnlat    Trace TCP active connection latency (connect).
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpconnlat [-h] [-t] [-p PID]
+#
+# This uses dynamic tracing of kernel functions, and will need to be updated
+# to match kernel changes.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 19-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+import argparse
+import ctypes as ct
+
+# arg validation
+def positive_float(val):
+    try:
+        ival = float(val)
+    except ValueError:
+        raise argparse.ArgumentTypeError("must be a float")
+
+    if ival < 0:
+        raise argparse.ArgumentTypeError("must be positive")
+    return ival
+
+# arguments
+examples = """examples:
+    ./tcpconnlat           # trace all TCP connect()s
+    ./tcpconnlat 1         # trace connection latency slower than 1 ms
+    ./tcpconnlat 0.1       # trace connection latency slower than 100 us
+    ./tcpconnlat -t        # include timestamps
+    ./tcpconnlat -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Trace TCP connects and show connection latency",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("duration_ms", nargs="?", default=0,
+    type=positive_float,
+    help="minimum duration to trace (ms)")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="print the BPF program for debugging purposes")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+
+if args.duration_ms:
+    # support fractions but round to nearest microsecond
+    duration_us = int(args.duration_ms * 1000)
+else:
+    duration_us = 0   # default is show all
+
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <bcc/proto.h>
+
+struct info_t {
+    u64 ts;
+    u32 pid;
+    char task[TASK_COMM_LEN];
+};
+BPF_HASH(start, struct sock *, struct info_t);
+
+// separate data structs for ipv4 and ipv6
+struct ipv4_data_t {
+    u64 ts_us;
+    u32 pid;
+    u32 saddr;
+    u32 daddr;
+    u64 ip;
+    u16 dport;
+    u64 delta_us;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv4_events);
+
+struct ipv6_data_t {
+    u64 ts_us;
+    u32 pid;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u64 ip;
+    u16 dport;
+    u64 delta_us;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv6_events);
+
+int trace_connect(struct pt_regs *ctx, struct sock *sk)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+    struct info_t info = {.pid = pid};
+    info.ts = bpf_ktime_get_ns();
+    bpf_get_current_comm(&info.task, sizeof(info.task));
+    start.update(&sk, &info);
+    return 0;
+};
+
+// See tcp_v4_do_rcv() and tcp_v6_do_rcv(). So TCP_ESTABLISHED and TCP_LISTEN
+// are fast path and processed elsewhere, and leftovers are processed by
+// tcp_rcv_state_process(). We can trace this for handshake completion.
+// This should all be switched to static tracepoints when available.
+int trace_tcp_rcv_state_process(struct pt_regs *ctx, struct sock *skp)
+{
+    // will be in TCP_SYN_SENT for handshake
+    if (skp->__sk_common.skc_state != TCP_SYN_SENT)
+        return 0;
+
+    // check start and calculate delta
+    struct info_t *infop = start.lookup(&skp);
+    if (infop == 0) {
+        return 0;   // missed entry or filtered
+    }
+
+    u64 ts = infop->ts;
+    u64 now = bpf_ktime_get_ns();
+
+    u64 delta_us = (now - ts) / 1000ul;
+
+#ifdef MIN_LATENCY
+    if ( delta_us < DURATION_US ) {
+        return 0; // connect latency is below latency filter minimum
+    }
+#endif
+
+    // pull in details
+    u16 family = 0, dport = 0;
+    family = skp->__sk_common.skc_family;
+    dport = skp->__sk_common.skc_dport;
+
+    // emit to appropriate data path
+    if (family == AF_INET) {
+        struct ipv4_data_t data4 = {.pid = infop->pid, .ip = 4};
+        data4.ts_us = now / 1000;
+        data4.saddr = skp->__sk_common.skc_rcv_saddr;
+        data4.daddr = skp->__sk_common.skc_daddr;
+        data4.dport = ntohs(dport);
+        data4.delta_us = delta_us;
+        __builtin_memcpy(&data4.task, infop->task, sizeof(data4.task));
+        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
+
+    } else /* AF_INET6 */ {
+        struct ipv6_data_t data6 = {.pid = infop->pid, .ip = 6};
+        data6.ts_us = now / 1000;
+        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
+            skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
+            skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+        data6.dport = ntohs(dport);
+        data6.delta_us = delta_us;
+        __builtin_memcpy(&data6.task, infop->task, sizeof(data6.task));
+        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
+    }
+
+    start.delete(&skp);
+
+    return 0;
+}
+"""
+
+if duration_us > 0:
+    bpf_text = "#define MIN_LATENCY\n" + bpf_text
+    bpf_text = bpf_text.replace('DURATION_US', str(duration_us))
+
+# code substitutions
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug or args.verbose or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect")
+b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect")
+b.attach_kprobe(event="tcp_rcv_state_process",
+    fn_name="trace_tcp_rcv_state_process")
+
+# event data
+TASK_COMM_LEN = 16      # linux/sched.h
+
+class Data_ipv4(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("saddr", ct.c_uint),
+        ("daddr", ct.c_uint),
+        ("ip", ct.c_ulonglong),
+        ("dport", ct.c_ushort),
+        ("delta_us", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+class Data_ipv6(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("saddr", (ct.c_ulonglong * 2)),
+        ("daddr", (ct.c_ulonglong * 2)),
+        ("ip", ct.c_ulonglong),
+        ("dport", ct.c_ushort),
+        ("delta_us", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+# process event
+start_ts = 0
+
+def print_ipv4_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+    global start_ts
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
+    print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid,
+        event.task.decode('utf-8', 'replace'), event.ip,
+        inet_ntop(AF_INET, pack("I", event.saddr)),
+        inet_ntop(AF_INET, pack("I", event.daddr)), event.dport,
+        float(event.delta_us) / 1000))
+
+def print_ipv6_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+    global start_ts
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
+    print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid,
+        event.task.decode('utf-8', 'replace'), event.ip,
+        inet_ntop(AF_INET6, event.saddr), inet_ntop(AF_INET6, event.daddr),
+        event.dport, float(event.delta_us) / 1000))
+
+# header
+if args.timestamp:
+    print("%-9s" % ("TIME(s)"), end="")
+print("%-6s %-12s %-2s %-16s %-16s %-5s %s" % ("PID", "COMM", "IP", "SADDR",
+    "DADDR", "DPORT", "LAT(ms)"))
+
+# read events
+b["ipv4_events"].open_perf_buffer(print_ipv4_event)
+b["ipv6_events"].open_perf_buffer(print_ipv6_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/tcpconnlat_example.txt b/tools/tcpconnlat_example.txt
new file mode 100644
index 0000000..569d346
--- /dev/null
+++ b/tools/tcpconnlat_example.txt
@@ -0,0 +1,55 @@
+Demonstrations of tcpconnlat, the Linux eBPF/bcc version.
+
+
+This tool traces the kernel function performing active TCP connections
+(eg, via a connect() syscall), and shows the latency (time) for the connection
+as measured locally: the time from SYN sent to the response packet.
+For example:
+
+# ./tcpconnlat
+PID    COMM         IP SADDR            DADDR            DPORT LAT(ms)
+1201   wget         4  10.153.223.157   23.23.100.231    80    1.65
+1201   wget         4  10.153.223.157   23.23.100.231    443   1.60
+1433   curl         4  10.153.223.157   104.20.25.153    80    0.75
+1690   wget         4  10.153.223.157   66.220.156.68    80    1.10
+1690   wget         4  10.153.223.157   66.220.156.68    443   0.95
+1690   wget         4  10.153.223.157   66.220.156.68    443   0.99
+2852   curl         4  10.153.223.157   23.101.17.61     80    250.86
+20337  python2.7    6  1234:ab12:2040:5020:2299:0:5:0 1234:ab12:20:9f1d:2299:dde9:0:f5 7001  62.20
+21588  nc           6  ::1              ::1              80    0.05
+[...]
+
+The first line shows a connection from the "wget" process to the IPv4
+destination address 23.23.100.231, port 80. This took 1.65 milliseconds: the
+time from the SYN to the response.
+
+TCP connection latency is a useful performance measure showing the time taken
+to establish a connection. This typically involves kernel TCP/IP processing
+and the network round trip time, and not application runtime.
+
+tcpconnlat measures the time from any connection to the response packet, even
+if the response is a RST (port closed).
+
+
+USAGE message:
+
+# ./tcpconnlat -h
+usage: tcpconnlat [-h] [-t] [-p PID] [min_ms]
+
+Trace TCP connects and show connection latency
+
+positional arguments:
+  min_ms             minimum duration to trace, in ms (default 0)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -t, --timestamp    include timestamp on output
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./tcpconnlat           # trace all TCP connect()s
+    ./tcpconnlat -t        # include timestamps
+    ./tcpconnlat -p 181    # only trace PID 181
+    ./tcpconnlat 1         # only show connects longer than 1 ms
+    ./tcpconnlat 0.1       # only show connects longer than 100 us
+    ./tcpconnlat -v        # Show the BPF program
diff --git a/tools/tcpdrop.py b/tools/tcpdrop.py
new file mode 100755
index 0000000..d9fbdf5
--- /dev/null
+++ b/tools/tcpdrop.py
@@ -0,0 +1,224 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpdrop   Trace TCP kernel-dropped packets/segments.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# This provides information such as packet details, socket state, and kernel
+# stack trace for packets/segments that were dropped via tcp_drop().
+#
+# USAGE: tcpdrop [-c] [-h] [-l]
+#
+# This uses dynamic tracing of kernel functions, and will need to be updated
+# to match kernel changes.
+#
+# Copyright 2018 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 30-May-2018   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+import ctypes as ct
+from time import sleep
+from bcc import tcp
+
+# arguments
+examples = """examples:
+    ./tcpdrop           # trace kernel TCP drops
+"""
+parser = argparse.ArgumentParser(
+    description="Trace TCP drops by the kernel",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/ip.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_STACK_TRACE(stack_traces, 1024);
+
+// separate data structs for ipv4 and ipv6
+struct ipv4_data_t {
+    u32 pid;
+    u64 ip;
+    u32 saddr;
+    u32 daddr;
+    u16 sport;
+    u16 dport;
+    u8 state;
+    u8 tcpflags;
+    u32 stack_id;
+};
+BPF_PERF_OUTPUT(ipv4_events);
+
+struct ipv6_data_t {
+    u32 pid;
+    u64 ip;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u16 sport;
+    u16 dport;
+    u8 state;
+    u8 tcpflags;
+    u32 stack_id;
+};
+BPF_PERF_OUTPUT(ipv6_events);
+
+static struct tcphdr *skb_to_tcphdr(const struct sk_buff *skb)
+{
+    // unstable API. verify logic in tcp_hdr() -> skb_transport_header().
+    return (struct tcphdr *)(skb->head + skb->transport_header);
+}
+
+static inline struct iphdr *skb_to_iphdr(const struct sk_buff *skb)
+{
+    // unstable API. verify logic in ip_hdr() -> skb_network_header().
+    return (struct iphdr *)(skb->head + skb->network_header);
+}
+
+// from include/net/tcp.h:
+#ifndef tcp_flag_byte
+#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
+#endif
+
+int trace_tcp_drop(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb)
+{
+    if (sk == NULL)
+        return 0;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // pull in details from the packet headers and the sock struct
+    u16 family = sk->__sk_common.skc_family;
+    char state = sk->__sk_common.skc_state;
+    u16 sport = 0, dport = 0;
+    struct tcphdr *tcp = skb_to_tcphdr(skb);
+    struct iphdr *ip = skb_to_iphdr(skb);
+    u8 tcpflags = ((u_int8_t *)tcp)[13];
+    sport = tcp->source;
+    dport = tcp->dest;
+    sport = ntohs(sport);
+    dport = ntohs(dport);
+
+    if (family == AF_INET) {
+        struct ipv4_data_t data4 = {};
+        data4.pid = pid;
+        data4.ip = 4;
+        data4.saddr = ip->saddr;
+        data4.daddr = ip->daddr;
+        data4.dport = dport;
+        data4.sport = sport;
+        data4.state = state;
+        data4.tcpflags = tcpflags;
+        data4.stack_id = stack_traces.get_stackid(ctx, 0);
+        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
+
+    } else if (family == AF_INET6) {
+        struct ipv6_data_t data6 = {};
+        data6.pid = pid;
+        data6.ip = 6;
+        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
+            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
+            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+        data6.dport = dport;
+        data6.sport = sport;
+        data6.state = state;
+        data6.tcpflags = tcpflags;
+        data6.stack_id = stack_traces.get_stackid(ctx, 0);
+        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
+    }
+    // else drop
+
+    return 0;
+}
+"""
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# event data
+class Data_ipv4(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_uint),
+        ("ip", ct.c_ulonglong),
+        ("saddr", ct.c_uint),
+        ("daddr", ct.c_uint),
+        ("sport", ct.c_ushort),
+        ("dport", ct.c_ushort),
+        ("state", ct.c_ubyte),
+        ("tcpflags", ct.c_ubyte),
+        ("stack_id", ct.c_ulong)
+    ]
+
+class Data_ipv6(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_uint),
+        ("ip", ct.c_ulonglong),
+        ("saddr", (ct.c_ulonglong * 2)),
+        ("daddr", (ct.c_ulonglong * 2)),
+        ("sport", ct.c_ushort),
+        ("dport", ct.c_ushort),
+        ("state", ct.c_ubyte),
+        ("tcpflags", ct.c_ubyte),
+        ("stack_id", ct.c_ulong)
+    ]
+
+# process event
+def print_ipv4_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+    print("%-8s %-6d %-2d %-20s > %-20s %s (%s)" % (
+        strftime("%H:%M:%S"), event.pid, event.ip,
+        "%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.sport),
+        "%s:%s" % (inet_ntop(AF_INET, pack('I', event.daddr)), event.dport),
+        tcp.tcpstate[event.state], tcp.flags2str(event.tcpflags)))
+    for addr in stack_traces.walk(event.stack_id):
+        sym = b.ksym(addr, show_offset=True)
+        print("\t%s" % sym)
+    print("")
+
+def print_ipv6_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+    print("%-8s %-6d %-2d %-20s > %-20s %s (%s)" % (
+        strftime("%H:%M:%S"), event.pid, event.ip,
+        "%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.sport),
+        "%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport),
+        tcp.tcpstate[event.state], tcp.flags2str(event.tcpflags)))
+    for addr in stack_traces.walk(event.stack_id):
+        sym = b.ksym(addr, show_offset=True)
+        print("\t%s" % sym)
+    print("")
+
+# initialize BPF
+b = BPF(text=bpf_text)
+if b.get_kprobe_functions(b"tcp_drop"):
+    b.attach_kprobe(event="tcp_drop", fn_name="trace_tcp_drop")
+else:
+    print("ERROR: tcp_drop() kernel function not found or traceable. "
+        "Older kernel versions not supported.")
+    exit()
+stack_traces = b.get_table("stack_traces")
+
+# header
+print("%-8s %-6s %-2s %-20s > %-20s %s (%s)" % ("TIME", "PID", "IP",
+    "SADDR:SPORT", "DADDR:DPORT", "STATE", "FLAGS"))
+
+# read events
+b["ipv4_events"].open_perf_buffer(print_ipv4_event)
+b["ipv6_events"].open_perf_buffer(print_ipv6_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/tcpdrop_example.txt b/tools/tcpdrop_example.txt
new file mode 100644
index 0000000..752ec4b
--- /dev/null
+++ b/tools/tcpdrop_example.txt
@@ -0,0 +1,72 @@
+Demonstrations of tcpdrop, the Linux BPF/bcc version.
+
+
+tcpdrop prints details of TCP packets or segments that were dropped by the
+kernel, including the kernel stack trace that led to the drop:
+
+# ./tcpdrop.py
+TIME     PID    IP SADDR:SPORT          > DADDR:DPORT          STATE (FLAGS)
+20:49:06 0      4  10.32.119.56:443     > 10.66.65.252:22912   CLOSE (ACK)
+	tcp_drop+0x1
+	tcp_v4_do_rcv+0x135
+	tcp_v4_rcv+0x9c7
+	ip_local_deliver_finish+0x62
+	ip_local_deliver+0x6f
+	ip_rcv_finish+0x129
+	ip_rcv+0x28f
+	__netif_receive_skb_core+0x432
+	__netif_receive_skb+0x18
+	netif_receive_skb_internal+0x37
+	napi_gro_receive+0xc5
+	ena_clean_rx_irq+0x3c3
+	ena_io_poll+0x33f
+	net_rx_action+0x140
+	__softirqentry_text_start+0xdf
+	irq_exit+0xb6
+	do_IRQ+0x82
+	ret_from_intr+0x0
+	native_safe_halt+0x6
+	default_idle+0x20
+	arch_cpu_idle+0x15
+	default_idle_call+0x23
+	do_idle+0x17f
+	cpu_startup_entry+0x73
+	rest_init+0xae
+	start_kernel+0x4dc
+	x86_64_start_reservations+0x24
+	x86_64_start_kernel+0x74
+	secondary_startup_64+0xa5
+
+20:49:50 12431  4  127.0.0.1:8198       > 127.0.0.1:48280      CLOSE (RST|ACK)
+	tcp_drop+0x1
+	tcp_v4_do_rcv+0x135
+	__release_sock+0x88
+	release_sock+0x30
+	inet_stream_connect+0x47
+	SYSC_connect+0x9e
+	sys_connect+0xe
+	do_syscall_64+0x73
+	entry_SYSCALL_64_after_hwframe+0x3d
+
+[...]
+
+The last two columns show the state of the TCP session, and the TCP flags.
+These two examples show packets arriving for a session in the closed state,
+that were dropped by the kernel.
+
+This tool is useful for debugging high rates of drops, which can cause the
+remote end to do timer-based retransmits, hurting performance.
+
+
+USAGE:
+
+# ./tcpdrop.py -h
+usage: tcpdrop.py [-h]
+
+Trace TCP drops by the kernel
+
+optional arguments:
+  -h, --help  show this help message and exit
+
+examples:
+    ./tcpdrop           # trace kernel TCP drops
diff --git a/tools/tcplife.lua b/tools/tcplife.lua
new file mode 100755
index 0000000..60fb51f
--- /dev/null
+++ b/tools/tcplife.lua
@@ -0,0 +1,439 @@
+#!/usr/bin/env bcc-lua
+--[[
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+18-Mar-2017  Simon Liu Created this.
+--]]
+
+local ffi = require("ffi")
+local bit = require("bit")
+
+ffi.cdef[[
+const char *inet_ntop(int af, const void *src, char *dst, int size);
+uint16_t ntohs(uint16_t netshort);
+]]
+
+local program = [[
+#include <uapi/linux/ptrace.h>
+#define KBUILD_MODNAME "foo"
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(birth, struct sock *, u64);
+
+// separate data structs for ipv4 and ipv6
+struct ipv4_data_t {
+    // XXX: switch some to u32's when supported
+    u64 ts_us;
+    u64 pid;
+    u64 saddr;
+    u64 daddr;
+    u64 ports;
+    u64 rx_b;
+    u64 tx_b;
+    u64 span_us;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv4_events);
+
+struct ipv6_data_t {
+    u64 ts_us;
+    u64 pid;
+    u64 saddr[2];
+    u64 daddr[2];
+    u64 ports;
+    u64 rx_b;
+    u64 tx_b;
+    u64 span_us;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv6_events);
+
+struct id_t {
+    u32 pid;
+    char task[TASK_COMM_LEN];
+};
+BPF_HASH(whoami, struct sock *, struct id_t);
+
+int trace_tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
+{
+    // (leftover debug bpf_trace_printk removed)
+    u32 pid = bpf_get_current_pid_tgid() >> 32;
+
+    // lport is either used in a filter here, or later
+    u16 lport = sk->__sk_common.skc_num;
+    FILTER_LPORT
+
+    // dport is either used in a filter here, or later
+    u16 dport = sk->__sk_common.skc_dport;
+    FILTER_DPORT
+
+    /*
+     * This tool includes PID and comm context. It's best effort, and may
+     * be wrong in some situations. It currently works like this:
+     * - record timestamp on any state < TCP_FIN_WAIT1
+     * - cache task context on:
+     *       TCP_SYN_SENT: tracing from client
+     *       TCP_LAST_ACK: client-closed from server
+     * - do output on TCP_CLOSE:
+     *       fetch task context if cached, or use current task
+     */
+
+    // capture birth time
+    if (state < TCP_FIN_WAIT1) {
+        /*
+         * Matching just ESTABLISHED may be sufficient, provided no code-path
+         * sets ESTABLISHED without a tcp_set_state() call. Until we know
+         * that for sure, match all early states to increase chances a
+         * timestamp is set.
+         * Note that this needs to be set before the PID filter later on,
+         * since the PID isn't reliable for these early stages, so we must
+         * save all timestamps and do the PID filter later when we can.
+         */
+        u64 ts = bpf_ktime_get_ns();
+        birth.update(&sk, &ts);
+    }
+
+    // record PID & comm on SYN_SENT
+    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
+        // now we can PID filter, both here and a little later on for CLOSE
+        FILTER_PID
+        struct id_t me = {.pid = pid};
+        bpf_get_current_comm(&me.task, sizeof(me.task));
+        whoami.update(&sk, &me);
+    }
+
+    if (state != TCP_CLOSE)
+        return 0;
+
+    // calculate lifespan
+    u64 *tsp, delta_us;
+    tsp = birth.lookup(&sk);
+    if (tsp == 0) {
+        whoami.delete(&sk);     // may not exist
+        return 0;               // missed create
+    }
+    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+    birth.delete(&sk);
+
+    // fetch possible cached data, and filter
+    struct id_t *mep;
+    mep = whoami.lookup(&sk);
+    if (mep != 0)
+        pid = mep->pid;
+    FILTER_PID
+
+    // get throughput stats. see tcp_get_info().
+    u64 rx_b = 0, tx_b = 0, sport = 0;
+    struct tcp_sock *tp = (struct tcp_sock *)sk;
+    rx_b = tp->bytes_received;
+    tx_b = tp->bytes_acked;
+
+    u16 family = sk->__sk_common.skc_family;
+
+    if (family == AF_INET) {
+        struct ipv4_data_t data4 = {.span_us = delta_us,
+            .rx_b = rx_b, .tx_b = tx_b};
+        data4.ts_us = bpf_ktime_get_ns() / 1000;
+        data4.saddr = sk->__sk_common.skc_rcv_saddr;
+        data4.daddr = sk->__sk_common.skc_daddr;
+        // a workaround until data4 compiles with separate lport/dport
+        data4.pid = pid;
+        data4.ports = ntohs(dport) + ((0ULL + lport) << 32);
+        if (mep == 0) {
+            bpf_get_current_comm(&data4.task, sizeof(data4.task));
+        } else {
+            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
+        }
+        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
+
+    } else /* 6 */ {
+        struct ipv6_data_t data6 = {.span_us = delta_us,
+            .rx_b = rx_b, .tx_b = tx_b};
+        data6.ts_us = bpf_ktime_get_ns() / 1000;
+        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
+            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
+            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+        // a workaround until data6 compiles with separate lport/dport
+        data6.ports = ntohs(dport) + ((0ULL + lport) << 32);
+        data6.pid = pid;
+        if (mep == 0) {
+            bpf_get_current_comm(&data6.task, sizeof(data6.task));
+        } else {
+            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
+        }
+        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
+    }
+
+    if (mep != 0)
+        whoami.delete(&sk);
+
+    return 0;
+}
+]]
+
+local debug = false
+local start_ts = 0
+
+local inet_addresslen = #"255.255.255.255"
+local inet6_addresslen = #"ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"
+local AF_INET = 2
+local AF_INET6 = 10
+
+local header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
+local format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
+local ip_string = ""
+local ip_version = false
+local arg_timestamp = false
+local arg_csv = false
+local arg_time = false
+
+local examples = [[examples:
+    ./tcplife           # trace all TCP connect()s
+    ./tcplife -t        # include time column (HH:MM:SS)
+    ./tcplife -w        # wider columns (fit IPv6)
+    ./tcplife -stT      # csv output, with times & timestamps
+    ./tcplife -p 181    # only trace PID 181
+    ./tcplife -L 80     # only trace local port 80
+    ./tcplife -L 80,81  # only trace local ports 80 and 81
+    ./tcplife -D 80     # only trace remote port 80
+]]
+
+local function split(str,sep)
+   local t = {}
+   for w in string.gmatch(str, '([^' .. sep .. ']+)') do
+      table.insert(t, w)
+   end
+   return t
+end
+
+local function inet_ntop(af, addr, len)
+   local addr_dst = ffi.new("char[?]", len)
+   local addr_src
+   if af == AF_INET then
+      addr_src = ffi.new("uint64_t[1]", addr)
+   else
+      addr_src = ffi.new("uint64_t[2]", addr)
+   end
+   ffi.C.inet_ntop(af, addr_src, addr_dst, len)
+   return ffi.string(addr_dst, len)
+end
+
+local function inet_ntohs(port)
+   local p = tonumber(port)
+   return ffi.C.ntohs(p)
+end
+
+local function print_ipv4_event(cpu, event)
+
+   local event_pid = tonumber(event.pid)
+   local event_task = ffi.string(event.task)
+   local event_ports = tonumber(event.ports)
+   local event_tx_b = tonumber(event.tx_b)
+   local event_rx_b = tonumber(event.rx_b)
+   local event_span_us = tonumber(event.span_us)
+   local event_ts_us = tonumber(event.ts_us)
+   local event_saddr = inet_ntop(AF_INET, tonumber(event.saddr), inet_addresslen)
+   local event_daddr = inet_ntop(AF_INET, tonumber(event.daddr), inet_addresslen)
+   if arg_time then
+      if arg_csv then
+         io.write("%s," % os.date("%H:%M:%S"))
+      else
+         io.write("%-8s " % os.date("%H:%M:%S"))
+      end
+   end
+   if arg_timestamp then
+      if start_ts == 0 then
+         start_ts = event_ts_us
+      end
+      local delta_s = (event_ts_us - start_ts) / 1000000
+      if arg_csv then
+         io.write("%.6f," % delta_s)
+      else
+         io.write("%-9.6f " % delta_s)
+      end
+   end
+   local iv = ""
+   if ip_version then
+      iv = "4"
+   end
+   print(string.format(format_string, event_pid, event_task, iv,
+                       event_saddr, bit.rshift(event_ports,32),
+                       event_daddr, bit.band(event_ports,0xffffffff),
+                       (event_tx_b / 1024), (event_rx_b / 1024), event_span_us/ 1000))
+end
+
+
+local function print_ipv6_event(cpu, event)
+   local event_pid = tonumber(event.pid)
+   local event_task = ffi.string(event.task)
+   local event_ports = tonumber(event.ports)
+   local event_tx_b = tonumber(event.tx_b)
+   local event_rx_b = tonumber(event.rx_b)
+   local event_span_us = tonumber(event.span_us)
+   local event_ts_us = tonumber(event.ts_us)
+   local event_saddr = inet_ntop(AF_INET6, {tonumber(event.saddr[0]), tonumber(event.saddr[1])}, inet6_addresslen)
+   local event_daddr = inet_ntop(AF_INET6, {tonumber(event.daddr[0]), tonumber(event.daddr[1])}, inet6_addresslen)
+   if arg_time then
+      if arg_csv then
+         io.write("%s," % os.date("%H:%M:%S"))
+      else
+         io.write("%-8s " % os.date("%H:%M:%S"))
+      end
+   end
+   if arg_timestamp then
+      if start_ts == 0 then
+         start_ts = event_ts_us
+      end
+      local delta_s = (event_ts_us - start_ts) / 1000000
+      if arg_csv then
+         io.write("%.6f," % delta_s)
+      else
+         io.write("%-9.6f " % delta_s)
+      end
+   end
+   local iv = ""
+   if ip_version then
+      iv = "6"
+   end
+   print(string.format(format_string, event_pid, event_task, iv,
+                       event_saddr, bit.rshift(event_ports,32),
+                       event_daddr, bit.band(event_ports,0xffffffff),
+                       (event_tx_b / 1024), (event_rx_b / 1024), event_span_us/ 1000))
+end
+
+local function parse_arg(utils)
+   local parser = utils.argparse("tcplife",
+                                 "Trace the lifespan of TCP sessions and summarize", examples)
+
+   parser:flag("-T --time", "include time column on output (HH:MM:SS)")
+   parser:flag("-t --timestamp", "include timestamp on output (seconds)")
+   parser:flag("-w --wide", "wide column output (fits IPv6 addresses)")
+   parser:flag("-s --csv", "comma separated values output")
+   parser:option("-p --pid", "trace this PID only"):convert(tonumber)
+   parser:option("-L --localport", "comma-separated list of local ports to trace.")
+   parser:option("-D --remoteport", "comma-separated list of remote ports to trace.")
+
+   local args = parser:parse()
+   if args.pid then
+      local filter = 'if (pid != %d) { return 0; }' % args.pid
+      program = program:gsub('FILTER_PID', filter)
+   end
+
+   if args.remoteport then
+      local dports = split(args.remoteport, ",")
+      local dports_if = ""
+      for i,d in ipairs(dports) do
+         if dports_if == "" then
+            dports_if = 'dport != %d' % inet_ntohs(d)
+         else
+            dports_if = dports_if .. ' && ' .. ('dport != %d' % inet_ntohs(d))
+         end
+      end
+      local filter = "if (%s) { birth.delete(&sk); return 0; }" % dports_if
+      program = program:gsub('FILTER_DPORT', filter)
+   end
+   if args.localport then
+      local lports = split(args.localport,",")
+      local lports_if = ""
+      for i,l in ipairs(lports) do
+         if lports_if == "" then
+            lports_if = 'lport != %d' % inet_ntohs(l)
+         else
+            lports_if = lports_if .. ' && ' .. ('lport != %d' % inet_ntohs(l))
+         end
+      end
+      local filter = "if (%s) { birth.delete(&sk); return 0; }" % lports_if
+      program = program:gsub('FILTER_LPORT', filter)
+   end
+   program = program:gsub('FILTER_PID', '')
+   program = program:gsub('FILTER_DPORT', '')
+   program = program:gsub('FILTER_LPORT', '')
+
+   if args.wide then
+      header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
+      format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
+      ip_string = "IP"
+      ip_version = true
+   end
+   if args.csv then
+      header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
+      format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
+      ip_string = "IP"
+      ip_version = true
+      arg_csv = true
+   end
+
+   if args.time then
+      arg_time = true
+      if args.csv then
+         io.write("%s," % ("TIME"))
+      else
+         io.write("%-8s " % ("TIME"))
+      end
+   end
+
+   if args.timestamp then
+      arg_timestamp = true
+      if args.csv then
+         io.write("%s," % ("TIME(s)"))
+      else
+         io.write("%-9s " % ("TIME(s)"))
+      end
+   end
+
+end
+
+return function(BPF, utils)
+   parse_arg(utils)
+   if debug then
+      print(program)
+   end
+
+   local bpf = BPF:new{text=program}
+   bpf:attach_kprobe{event="tcp_set_state", fn_name="trace_tcp_set_state"}
+   print(header_string % {"PID", "COMM",
+                          ip_string, "LADDR",
+                          "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"})
+   local TASK_COMM_LEN = 16 -- linux/sched.h
+   bpf:get_table("ipv4_events"):open_perf_buffer(print_ipv4_event, [[
+    struct {
+      uint64_t ts_us;
+      uint64_t pid;
+      uint64_t saddr;
+      uint64_t daddr;
+      uint64_t ports;
+      uint64_t rx_b;
+      uint64_t tx_b;
+      uint64_t span_us;
+      char task[$];
+    }
+   ]], {TASK_COMM_LEN}, 64)
+   bpf:get_table("ipv6_events"):open_perf_buffer(print_ipv6_event, [[
+    struct {
+      uint64_t ts_us;
+      uint64_t pid;
+      uint64_t saddr[2];
+      uint64_t daddr[2];
+      uint64_t ports;
+      uint64_t rx_b;
+      uint64_t tx_b;
+      uint64_t span_us;
+      char task[$];
+    }
+   ]], {TASK_COMM_LEN}, 64)
+
+   bpf:perf_buffer_poll_loop()
+end
diff --git a/tools/tcplife.py b/tools/tcplife.py
new file mode 100755
index 0000000..51ed7ae
--- /dev/null
+++ b/tools/tcplife.py
@@ -0,0 +1,509 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcplife   Trace the lifespan of TCP sessions and summarize.
+#           For Linux, uses BCC, BPF. Embedded C.
+#
+# USAGE: tcplife [-h] [-C] [-S] [-p PID] [interval [count]]
+#
+# This uses the sock:inet_sock_set_state tracepoint if it exists (added to
+# Linux 4.16, and replacing the earlier tcp:tcp_set_state), else it uses
+# kernel dynamic tracing of tcp_set_state().
+#
+# While throughput counters are emitted, they are fetched in a low-overhead
+# manner: reading members of the tcp_info struct on TCP close. ie, we do not
+# trace send/receive.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# IDEA: Julia Evans
+#
+# 18-Oct-2016   Brendan Gregg   Created this.
+# 29-Dec-2017      "      "     Added tracepoint support.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from socket import inet_ntop, ntohs, AF_INET, AF_INET6
+from struct import pack
+import ctypes as ct
+from time import strftime
+
+# arguments
+examples = """examples:
+    ./tcplife           # trace all TCP connect()s
+    ./tcplife -t        # include time column (HH:MM:SS)
+    ./tcplife -w        # wider columns (fit IPv6)
+    ./tcplife -stT      # csv output, with times & timestamps
+    ./tcplife -p 181    # only trace PID 181
+    ./tcplife -L 80     # only trace local port 80
+    ./tcplife -L 80,81  # only trace local ports 80 and 81
+    ./tcplife -D 80     # only trace remote port 80
+"""
+parser = argparse.ArgumentParser(
+    description="Trace the lifespan of TCP sessions and summarize",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--time", action="store_true",
+    help="include time column on output (HH:MM:SS)")
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output (seconds)")
+parser.add_argument("-w", "--wide", action="store_true",
+    help="wide column output (fits IPv6 addresses)")
+parser.add_argument("-s", "--csv", action="store_true",
+    help="comma separated values output")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-L", "--localport",
+    help="comma-separated list of local ports to trace.")
+parser.add_argument("-D", "--remoteport",
+    help="comma-separated list of remote ports to trace.")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#define KBUILD_MODNAME "foo"
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(birth, struct sock *, u64);
+
+// separate data structs for ipv4 and ipv6
+struct ipv4_data_t {
+    u64 ts_us;
+    u32 pid;
+    u32 saddr;
+    u32 daddr;
+    u64 ports;
+    u64 rx_b;
+    u64 tx_b;
+    u64 span_us;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv4_events);
+
+struct ipv6_data_t {
+    u64 ts_us;
+    u32 pid;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u64 ports;
+    u64 rx_b;
+    u64 tx_b;
+    u64 span_us;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv6_events);
+
+struct id_t {
+    u32 pid;
+    char task[TASK_COMM_LEN];
+};
+BPF_HASH(whoami, struct sock *, struct id_t);
+"""
+
+#
+# XXX: The following is temporary code for older kernels, Linux 4.14 and
+# older. It uses kprobes to instrument tcp_set_state(). On Linux 4.16 and
+# later, the sock:inet_sock_set_state tracepoint should be used instead, as
+# is done by the code that follows this. In the distant future (2021?), this
+# kprobe code can be removed. This is why there is so much code
+# duplication: to make removal easier.
+#
+bpf_text_kprobe = """
+int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
+{
+    u32 pid = bpf_get_current_pid_tgid() >> 32;
+
+    // lport is either used in a filter here, or later
+    u16 lport = sk->__sk_common.skc_num;
+    FILTER_LPORT
+
+    // dport is either used in a filter here, or later
+    u16 dport = sk->__sk_common.skc_dport;
+    dport = ntohs(dport);
+    FILTER_DPORT
+
+    /*
+     * This tool includes PID and comm context. It's best effort, and may
+     * be wrong in some situations. It currently works like this:
+     * - record timestamp on any state < TCP_FIN_WAIT1
+     * - cache task context on:
+     *       TCP_SYN_SENT: tracing from client
+     *       TCP_LAST_ACK: client-closed from server
+     * - do output on TCP_CLOSE:
+     *       fetch task context if cached, or use current task
+     */
+
+    // capture birth time
+    if (state < TCP_FIN_WAIT1) {
+        /*
+         * Matching just ESTABLISHED may be sufficient, provided no code-path
+         * sets ESTABLISHED without a tcp_set_state() call. Until we know
+         * that for sure, match all early states to increase chances a
+         * timestamp is set.
+         * Note that this needs to be set before the PID filter later on,
+         * since the PID isn't reliable for these early stages, so we must
+         * save all timestamps and do the PID filter later when we can.
+         */
+        u64 ts = bpf_ktime_get_ns();
+        birth.update(&sk, &ts);
+    }
+
+    // record PID & comm on SYN_SENT
+    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
+        // now we can PID filter, both here and a little later on for CLOSE
+        FILTER_PID
+        struct id_t me = {.pid = pid};
+        bpf_get_current_comm(&me.task, sizeof(me.task));
+        whoami.update(&sk, &me);
+    }
+
+    if (state != TCP_CLOSE)
+        return 0;
+
+    // calculate lifespan
+    u64 *tsp, delta_us;
+    tsp = birth.lookup(&sk);
+    if (tsp == 0) {
+        whoami.delete(&sk);     // may not exist
+        return 0;               // missed create
+    }
+    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+    birth.delete(&sk);
+
+    // fetch possible cached data, and filter
+    struct id_t *mep;
+    mep = whoami.lookup(&sk);
+    if (mep != 0)
+        pid = mep->pid;
+    FILTER_PID
+
+    // get throughput stats. see tcp_get_info().
+    u64 rx_b = 0, tx_b = 0, sport = 0;
+    struct tcp_sock *tp = (struct tcp_sock *)sk;
+    rx_b = tp->bytes_received;
+    tx_b = tp->bytes_acked;
+
+    u16 family = sk->__sk_common.skc_family;
+
+    if (family == AF_INET) {
+        struct ipv4_data_t data4 = {};
+        data4.span_us = delta_us;
+        data4.rx_b = rx_b;
+        data4.tx_b = tx_b;
+        data4.ts_us = bpf_ktime_get_ns() / 1000;
+        data4.saddr = sk->__sk_common.skc_rcv_saddr;
+        data4.daddr = sk->__sk_common.skc_daddr;
+        // a workaround until data4 compiles with separate lport/dport
+        data4.pid = pid;
+        data4.ports = dport + ((0ULL + lport) << 32);
+        if (mep == 0) {
+            bpf_get_current_comm(&data4.task, sizeof(data4.task));
+        } else {
+            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
+        }
+        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
+
+    } else /* 6 */ {
+        struct ipv6_data_t data6 = {};
+        data6.span_us = delta_us;
+        data6.rx_b = rx_b;
+        data6.tx_b = tx_b;
+        data6.ts_us = bpf_ktime_get_ns() / 1000;
+        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
+            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
+            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+        // a workaround until data6 compiles with separate lport/dport
+        data6.ports = dport + ((0ULL + lport) << 32);
+        data6.pid = pid;
+        if (mep == 0) {
+            bpf_get_current_comm(&data6.task, sizeof(data6.task));
+        } else {
+            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
+        }
+        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
+    }
+
+    if (mep != 0)
+        whoami.delete(&sk);
+
+    return 0;
+}
+"""
+
+bpf_text_tracepoint = """
+TRACEPOINT_PROBE(sock, inet_sock_set_state)
+{
+    if (args->protocol != IPPROTO_TCP)
+        return 0;
+
+    u32 pid = bpf_get_current_pid_tgid() >> 32;
+    // sk is mostly used as a UUID, and for two tcp stats:
+    struct sock *sk = (struct sock *)args->skaddr;
+
+    // lport is either used in a filter here, or later
+    u16 lport = args->sport;
+    FILTER_LPORT
+
+    // dport is either used in a filter here, or later
+    u16 dport = args->dport;
+    FILTER_DPORT
+
+    /*
+     * This tool includes PID and comm context. It's best effort, and may
+     * be wrong in some situations. It currently works like this:
+     * - record timestamp on any state < TCP_FIN_WAIT1
+     * - cache task context on:
+     *       TCP_SYN_SENT: tracing from client
+     *       TCP_LAST_ACK: client-closed from server
+     * - do output on TCP_CLOSE:
+     *       fetch task context if cached, or use current task
+     */
+
+    // capture birth time
+    if (args->newstate < TCP_FIN_WAIT1) {
+        /*
+         * Matching just ESTABLISHED may be sufficient, provided no code-path
+         * sets ESTABLISHED without a tcp_set_state() call. Until we know
+         * that for sure, match all early states to increase chances a
+         * timestamp is set.
+         * Note that this needs to be set before the PID filter later on,
+         * since the PID isn't reliable for these early stages, so we must
+         * save all timestamps and do the PID filter later when we can.
+         */
+        u64 ts = bpf_ktime_get_ns();
+        birth.update(&sk, &ts);
+    }
+
+    // record PID & comm on SYN_SENT
+    if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
+        // now we can PID filter, both here and a little later on for CLOSE
+        FILTER_PID
+        struct id_t me = {.pid = pid};
+        bpf_get_current_comm(&me.task, sizeof(me.task));
+        whoami.update(&sk, &me);
+    }
+
+    if (args->newstate != TCP_CLOSE)
+        return 0;
+
+    // calculate lifespan
+    u64 *tsp, delta_us;
+    tsp = birth.lookup(&sk);
+    if (tsp == 0) {
+        whoami.delete(&sk);     // may not exist
+        return 0;               // missed create
+    }
+    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+    birth.delete(&sk);
+
+    // fetch possible cached data, and filter
+    struct id_t *mep;
+    mep = whoami.lookup(&sk);
+    if (mep != 0)
+        pid = mep->pid;
+    FILTER_PID
+
+    // get throughput stats. see tcp_get_info().
+    u64 rx_b = 0, tx_b = 0, sport = 0;
+    struct tcp_sock *tp = (struct tcp_sock *)sk;
+    rx_b = tp->bytes_received;
+    tx_b = tp->bytes_acked;
+
+    if (args->family == AF_INET) {
+        struct ipv4_data_t data4 = {};
+        data4.span_us = delta_us;
+        data4.rx_b = rx_b;
+        data4.tx_b = tx_b;
+        data4.ts_us = bpf_ktime_get_ns() / 1000;
+        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
+        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
+        // a workaround until data4 compiles with separate lport/dport
+        data4.ports = dport + ((0ULL + lport) << 32);
+        data4.pid = pid;
+
+        if (mep == 0) {
+            bpf_get_current_comm(&data4.task, sizeof(data4.task));
+        } else {
+            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
+        }
+        ipv4_events.perf_submit(args, &data4, sizeof(data4));
+
+    } else /* 6 */ {
+        struct ipv6_data_t data6 = {};
+        data6.span_us = delta_us;
+        data6.rx_b = rx_b;
+        data6.tx_b = tx_b;
+        data6.ts_us = bpf_ktime_get_ns() / 1000;
+        __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
+        __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
+        // a workaround until data6 compiles with separate lport/dport
+        data6.ports = dport + ((0ULL + lport) << 32);
+        data6.pid = pid;
+        if (mep == 0) {
+            bpf_get_current_comm(&data6.task, sizeof(data6.task));
+        } else {
+            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
+        }
+        ipv6_events.perf_submit(args, &data6, sizeof(data6));
+    }
+
+    if (mep != 0)
+        whoami.delete(&sk);
+
+    return 0;
+}
+"""
+
+if (BPF.tracepoint_exists("sock", "inet_sock_set_state")):
+    bpf_text += bpf_text_tracepoint
+else:
+    bpf_text += bpf_text_kprobe
+
+# code substitutions
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID',
+        'if (pid != %s) { return 0; }' % args.pid)
+if args.remoteport:
+    dports = [int(dport) for dport in args.remoteport.split(',')]
+    dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
+    bpf_text = bpf_text.replace('FILTER_DPORT',
+        'if (%s) { birth.delete(&sk); return 0; }' % dports_if)
+if args.localport:
+    lports = [int(lport) for lport in args.localport.split(',')]
+    lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
+    bpf_text = bpf_text.replace('FILTER_LPORT',
+        'if (%s) { birth.delete(&sk); return 0; }' % lports_if)
+bpf_text = bpf_text.replace('FILTER_PID', '')
+bpf_text = bpf_text.replace('FILTER_DPORT', '')
+bpf_text = bpf_text.replace('FILTER_LPORT', '')
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# event data
+TASK_COMM_LEN = 16      # linux/sched.h
+
+class Data_ipv4(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("saddr", ct.c_uint),
+        ("daddr", ct.c_uint),
+        ("ports", ct.c_ulonglong),
+        ("rx_b", ct.c_ulonglong),
+        ("tx_b", ct.c_ulonglong),
+        ("span_us", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+class Data_ipv6(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("saddr", (ct.c_ulonglong * 2)),
+        ("daddr", (ct.c_ulonglong * 2)),
+        ("ports", ct.c_ulonglong),
+        ("rx_b", ct.c_ulonglong),
+        ("tx_b", ct.c_ulonglong),
+        ("span_us", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+#
+# Setup output formats
+#
+# Don't change the default output (next 2 lines): this fits in 80 chars. I
+# know it doesn't have NS or UIDs etc. I know. If you really, really, really
+# need to add columns, columns that solve real actual problems, I'd start by
+# adding an extended mode (-x) to included those columns.
+#
+header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
+format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
+if args.wide:
+    header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
+    format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
+if args.csv:
+    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
+    format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
+
+# process event
+def print_ipv4_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+    global start_ts
+    if args.time:
+        if args.csv:
+            print("%s," % strftime("%H:%M:%S"), end="")
+        else:
+            print("%-8s " % strftime("%H:%M:%S"), end="")
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        delta_s = (float(event.ts_us) - start_ts) / 1000000
+        if args.csv:
+            print("%.6f," % delta_s, end="")
+        else:
+            print("%-9.6f " % delta_s, end="")
+    print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
+        "4" if args.wide or args.csv else "",
+        inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32,
+        inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
+        event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))
+
+def print_ipv6_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+    global start_ts
+    if args.time:
+        if args.csv:
+            print("%s," % strftime("%H:%M:%S"), end="")
+        else:
+            print("%-8s " % strftime("%H:%M:%S"), end="")
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        delta_s = (float(event.ts_us) - start_ts) / 1000000
+        if args.csv:
+            print("%.6f," % delta_s, end="")
+        else:
+            print("%-9.6f " % delta_s, end="")
+    print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
+        "6" if args.wide or args.csv else "",
+        inet_ntop(AF_INET6, event.saddr), event.ports >> 32,
+        inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff,
+        event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# header
+if args.time:
+    if args.csv:
+        print("%s," % ("TIME"), end="")
+    else:
+        print("%-8s " % ("TIME"), end="")
+if args.timestamp:
+    if args.csv:
+        print("%s," % ("TIME(s)"), end="")
+    else:
+        print("%-9s " % ("TIME(s)"), end="")
+print(header_string % ("PID", "COMM",
+    "IP" if args.wide or args.csv else "", "LADDR",
+    "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))
+
+start_ts = 0
+
+# read events
+b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
+b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/tcplife_example.txt b/tools/tcplife_example.txt
new file mode 100644
index 0000000..fe4e52b
--- /dev/null
+++ b/tools/tcplife_example.txt
@@ -0,0 +1,135 @@
+Demonstrations of tcplife, the Linux BPF/bcc version.
+
+
+tcplife summarizes TCP sessions that open and close while tracing. For example:
+
+# ./tcplife
+PID   COMM       LADDR           LPORT RADDR           RPORT TX_KB RX_KB MS
+22597 recordProg 127.0.0.1       46644 127.0.0.1       28527     0     0 0.23
+3277  redis-serv 127.0.0.1       28527 127.0.0.1       46644     0     0 0.28
+22598 curl       100.66.3.172    61620 52.205.89.26    80        0     1 91.79
+22604 curl       100.66.3.172    44400 52.204.43.121   80        0     1 121.38
+22624 recordProg 127.0.0.1       46648 127.0.0.1       28527     0     0 0.22
+3277  redis-serv 127.0.0.1       28527 127.0.0.1       46648     0     0 0.27
+22647 recordProg 127.0.0.1       46650 127.0.0.1       28527     0     0 0.21
+3277  redis-serv 127.0.0.1       28527 127.0.0.1       46650     0     0 0.26
+[...]
+
+This caught a program, "recordProg" making a few short-lived TCP connections
+to "redis-serv", lasting about 0.25 milliseconds each connection. A couple of
+"curl" sessions were also traced, connecting to port 80, and lasting 91 and 121
+milliseconds.
+
+This tool is useful for workload characterisation and flow accounting:
+identifying what connections are happening, with the bytes transferred.
+
+
+Process names are truncated to 10 characters. By using the wide option, -w,
+the column width becomes 16 characters. The IP address columns are also wider
+to fit IPv6 addresses:
+
+# ./tcplife -w
+PID   COMM             IP LADDR                      LPORT RADDR                      RPORT  TX_KB  RX_KB MS
+26315 recordProgramSt  4  127.0.0.1                  44188 127.0.0.1                  28527      0      0 0.21
+3277  redis-server     4  127.0.0.1                  28527 127.0.0.1                  44188      0      0 0.26
+26320 ssh              6  fe80::8a3:9dff:fed5:6b19   22440 fe80::8a3:9dff:fed5:6b19   22         1      1 457.52
+26321 sshd             6  fe80::8a3:9dff:fed5:6b19   22    fe80::8a3:9dff:fed5:6b19   22440      1      1 458.69
+26341 recordProgramSt  4  127.0.0.1                  44192 127.0.0.1                  28527      0      0 0.27
+3277  redis-server     4  127.0.0.1                  28527 127.0.0.1                  44192      0      0 0.32
+
+
+In this example, I uploaded a 10 Mbyte file to the server, and then downloaded
+it again, using scp:
+
+# ./tcplife
+PID   COMM       LADDR           LPORT RADDR           RPORT TX_KB RX_KB MS
+7715  recordProg 127.0.0.1       50894 127.0.0.1       28527     0     0 0.25
+3277  redis-serv 127.0.0.1       28527 127.0.0.1       50894     0     0 0.30
+7619  sshd       100.66.3.172    22    100.127.64.230  63033     5 10255 3066.79
+7770  recordProg 127.0.0.1       50896 127.0.0.1       28527     0     0 0.20
+3277  redis-serv 127.0.0.1       28527 127.0.0.1       50896     0     0 0.24
+7793  recordProg 127.0.0.1       50898 127.0.0.1       28527     0     0 0.23
+3277  redis-serv 127.0.0.1       28527 127.0.0.1       50898     0     0 0.27
+7847  recordProg 127.0.0.1       50900 127.0.0.1       28527     0     0 0.24
+3277  redis-serv 127.0.0.1       28527 127.0.0.1       50900     0     0 0.29
+7870  recordProg 127.0.0.1       50902 127.0.0.1       28527     0     0 0.29
+3277  redis-serv 127.0.0.1       28527 127.0.0.1       50902     0     0 0.30
+7798  sshd       100.66.3.172    22    100.127.64.230  64925 10265     6 2176.15
+[...]
+
+You can see the 10 Mbytes received by sshd, and then later transmitted. Looks
+like receive was slower (3.07 seconds) than transmit (2.18 seconds).
+
+
+Timestamps can be added with -t:
+
+# ./tcplife -t
+TIME(s)   PID   COMM       LADDR           LPORT RADDR           RPORT TX_KB RX_KB MS
+0.000000  5973  recordProg 127.0.0.1       47986 127.0.0.1       28527     0     0 0.25
+0.000059  3277  redis-serv 127.0.0.1       28527 127.0.0.1       47986     0     0 0.29
+1.022454  5996  recordProg 127.0.0.1       47988 127.0.0.1       28527     0     0 0.23
+1.022513  3277  redis-serv 127.0.0.1       28527 127.0.0.1       47988     0     0 0.27
+2.044868  6019  recordProg 127.0.0.1       47990 127.0.0.1       28527     0     0 0.24
+2.044924  3277  redis-serv 127.0.0.1       28527 127.0.0.1       47990     0     0 0.28
+3.069136  6042  recordProg 127.0.0.1       47992 127.0.0.1       28527     0     0 0.22
+3.069204  3277  redis-serv 127.0.0.1       28527 127.0.0.1       47992     0     0 0.28
+
+This shows that the recordProg process was connecting once per second.
+
+There's also a -T for HH:MM:SS formatted times.
+
+
+There's a comma separated values mode, -s. Here it is with both -t and -T
+timestamps:
+
+# ./tcplife -stT
+TIME,TIME(s),PID,COMM,IP,LADDR,LPORT,RADDR,RPORT,TX_KB,RX_KB,MS
+23:39:38,0.000000,7335,recordProgramSt,4,127.0.0.1,48098,127.0.0.1,28527,0,0,0.26
+23:39:38,0.000064,3277,redis-server,4,127.0.0.1,28527,127.0.0.1,48098,0,0,0.32
+23:39:39,1.025078,7358,recordProgramSt,4,127.0.0.1,48100,127.0.0.1,28527,0,0,0.25
+23:39:39,1.025141,3277,redis-server,4,127.0.0.1,28527,127.0.0.1,48100,0,0,0.30
+23:39:41,2.040949,7381,recordProgramSt,4,127.0.0.1,48102,127.0.0.1,28527,0,0,0.24
+23:39:41,2.041011,3277,redis-server,4,127.0.0.1,28527,127.0.0.1,48102,0,0,0.29
+23:39:42,3.067848,7404,recordProgramSt,4,127.0.0.1,48104,127.0.0.1,28527,0,0,0.30
+23:39:42,3.067914,3277,redis-server,4,127.0.0.1,28527,127.0.0.1,48104,0,0,0.35
+[...]
+
+
+There are options for filtering on local and remote ports. Here is filtering
+on local ports 22 and 80:
+
+# ./tcplife.py -L 22,80
+PID   COMM       LADDR           LPORT RADDR           RPORT TX_KB RX_KB MS
+8301  sshd       100.66.3.172    22    100.127.64.230  58671     3     3 1448.52
+[...]
+
+
+USAGE:
+
+# ./tcplife.py -h
+usage: tcplife.py [-h] [-T] [-t] [-w] [-s] [-p PID] [-L LOCALPORT]
+                  [-D REMOTEPORT]
+
+Trace the lifespan of TCP sessions and summarize
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -T, --time            include time column on output (HH:MM:SS)
+  -t, --timestamp       include timestamp on output (seconds)
+  -w, --wide            wide column output (fits IPv6 addresses)
+  -s, --csv             comma separated values output
+  -p PID, --pid PID     trace this PID only
+  -L LOCALPORT, --localport LOCALPORT
+                        comma-separated list of local ports to trace.
+  -D REMOTEPORT, --remoteport REMOTEPORT
+                        comma-separated list of remote ports to trace.
+
+examples:
+    ./tcplife           # trace all TCP connect()s
+    ./tcplife -t        # include time column (HH:MM:SS)
+    ./tcplife -w        # wider columns (fit IPv6)
+    ./tcplife -stT      # csv output, with times & timestamps
+    ./tcplife -p 181    # only trace PID 181
+    ./tcplife -L 80     # only trace local port 80
+    ./tcplife -L 80,81  # only trace local ports 80 and 81
+    ./tcplife -D 80     # only trace remote port 80
diff --git a/tools/tcpretrans.py b/tools/tcpretrans.py
new file mode 100755
index 0000000..4400483
--- /dev/null
+++ b/tools/tcpretrans.py
@@ -0,0 +1,306 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpretrans    Trace or count TCP retransmits and TLPs.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpretrans [-c] [-h] [-l]
+#
+# This uses dynamic tracing of kernel functions, and will need to be updated
+# to match kernel changes.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Feb-2016   Brendan Gregg   Created this.
+# 03-Nov-2017   Matthias Tafelmeier Extended this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+import ctypes as ct
+from time import sleep
+
+# arguments
+examples = """examples:
+    ./tcpretrans           # trace TCP retransmits
+    ./tcpretrans -l        # include TLP attempts
+"""
+parser = argparse.ArgumentParser(
+    description="Trace TCP retransmits",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-l", "--lossprobe", action="store_true",
+    help="include tail loss probe attempts")
+parser.add_argument("-c", "--count", action="store_true",
+    help="count occurred retransmits per flow")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+#define RETRANSMIT  1
+#define TLP         2
+
+// separate data structs for ipv4 and ipv6
+struct ipv4_data_t {
+    u32 pid;
+    u64 ip;
+    u32 saddr;
+    u32 daddr;
+    u16 lport;
+    u16 dport;
+    u64 state;
+    u64 type;
+};
+BPF_PERF_OUTPUT(ipv4_events);
+
+struct ipv6_data_t {
+    u32 pid;
+    u64 ip;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u16 lport;
+    u16 dport;
+    u64 state;
+    u64 type;
+};
+BPF_PERF_OUTPUT(ipv6_events);
+
+// separate flow keys per address family
+struct ipv4_flow_key_t {
+    u32 saddr;
+    u32 daddr;
+    u16 lport;
+    u16 dport;
+};
+BPF_HASH(ipv4_count, struct ipv4_flow_key_t);
+
+struct ipv6_flow_key_t {
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u16 lport;
+    u16 dport;
+};
+BPF_HASH(ipv6_count, struct ipv6_flow_key_t);
+
+static int trace_event(struct pt_regs *ctx, struct sock *skp, int type)
+{
+    if (skp == NULL)
+        return 0;
+    u32 pid = bpf_get_current_pid_tgid() >> 32;
+
+    // pull in details
+    u16 family = skp->__sk_common.skc_family;
+    u16 lport = skp->__sk_common.skc_num;
+    u16 dport = skp->__sk_common.skc_dport;
+    char state = skp->__sk_common.skc_state;
+
+    if (family == AF_INET) {
+        IPV4_INIT
+        IPV4_CORE
+    } else if (family == AF_INET6) {
+        IPV6_INIT
+        IPV6_CORE
+    }
+    // else drop
+
+    return 0;
+}
+
+int trace_retransmit(struct pt_regs *ctx, struct sock *sk)
+{
+    trace_event(ctx, sk, RETRANSMIT);
+    return 0;
+}
+
+int trace_tlp(struct pt_regs *ctx, struct sock *sk)
+{
+    trace_event(ctx, sk, TLP);
+    return 0;
+}
+"""
+
+struct_init = { 'ipv4':
+        { 'count' :
+            """
+               struct ipv4_flow_key_t flow_key = {};
+               flow_key.saddr = skp->__sk_common.skc_rcv_saddr;
+               flow_key.daddr = skp->__sk_common.skc_daddr;
+               // lport is host order
+               flow_key.lport = lport;
+               flow_key.dport = ntohs(dport);""",
+               'trace' :
+               """
+               struct ipv4_data_t data4 = {};
+               data4.pid = pid;
+               data4.ip = 4;
+               data4.type = type;
+               data4.saddr = skp->__sk_common.skc_rcv_saddr;
+               data4.daddr = skp->__sk_common.skc_daddr;
+               // lport is host order
+               data4.lport = lport;
+               data4.dport = ntohs(dport);
+               data4.state = state; """
+               },
+        'ipv6':
+        { 'count' :
+            """
+                    struct ipv6_flow_key_t flow_key = {};
+                    bpf_probe_read(&flow_key.saddr, sizeof(flow_key.saddr),
+                        skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+                    bpf_probe_read(&flow_key.daddr, sizeof(flow_key.daddr),
+                        skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+                    // lport is host order
+                    flow_key.lport = lport;
+                    flow_key.dport = ntohs(dport);""",
+          'trace' : """
+                    struct ipv6_data_t data6 = {};
+                    data6.pid = pid;
+                    data6.ip = 6;
+                    data6.type = type;
+                    bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
+                        skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+                    bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
+                        skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+                    // lport is host order
+                    data6.lport = lport;
+                    data6.dport = ntohs(dport);
+                    data6.state = state;"""
+                }
+        }
+
+count_core_base = """
+        COUNT_STRUCT.increment(flow_key);
+"""
+
+if args.count:
+    bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4']['count'])
+    bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6']['count'])
+    bpf_text = bpf_text.replace("IPV4_CORE", count_core_base.replace("COUNT_STRUCT", 'ipv4_count'))
+    bpf_text = bpf_text.replace("IPV6_CORE", count_core_base.replace("COUNT_STRUCT", 'ipv6_count'))
+else:
+    bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4']['trace'])
+    bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6']['trace'])
+    bpf_text = bpf_text.replace("IPV4_CORE", "ipv4_events.perf_submit(ctx, &data4, sizeof(data4));")
+    bpf_text = bpf_text.replace("IPV6_CORE", "ipv6_events.perf_submit(ctx, &data6, sizeof(data6));")
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# event data
+class Data_ipv4(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_uint),
+        ("ip", ct.c_ulonglong),
+        ("saddr", ct.c_uint),
+        ("daddr", ct.c_uint),
+        ("lport", ct.c_ushort),
+        ("dport", ct.c_ushort),
+        ("state", ct.c_ulonglong),
+        ("type", ct.c_ulonglong)
+    ]
+
+class Data_ipv6(ct.Structure):
+    _fields_ = [
+        ("pid", ct.c_uint),
+        ("ip", ct.c_ulonglong),
+        ("saddr", (ct.c_ulonglong * 2)),
+        ("daddr", (ct.c_ulonglong * 2)),
+        ("lport", ct.c_ushort),
+        ("dport", ct.c_ushort),
+        ("state", ct.c_ulonglong),
+        ("type", ct.c_ulonglong)
+    ]
+
+# from bpf_text:
+type = {}
+type[1] = 'R'
+type[2] = 'L'
+
+# from include/net/tcp_states.h:
+tcpstate = {}
+tcpstate[1] = 'ESTABLISHED'
+tcpstate[2] = 'SYN_SENT'
+tcpstate[3] = 'SYN_RECV'
+tcpstate[4] = 'FIN_WAIT1'
+tcpstate[5] = 'FIN_WAIT2'
+tcpstate[6] = 'TIME_WAIT'
+tcpstate[7] = 'CLOSE'
+tcpstate[8] = 'CLOSE_WAIT'
+tcpstate[9] = 'LAST_ACK'
+tcpstate[10] = 'LISTEN'
+tcpstate[11] = 'CLOSING'
+tcpstate[12] = 'NEW_SYN_RECV'
+
+# process event
+def print_ipv4_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+    print("%-8s %-6d %-2d %-20s %1s> %-20s %s" % (
+        strftime("%H:%M:%S"), event.pid, event.ip,
+        "%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.lport),
+        type[event.type],
+        "%s:%s" % (inet_ntop(AF_INET, pack('I', event.daddr)), event.dport),
+        tcpstate[event.state]))
+
+def print_ipv6_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+    print("%-8s %-6d %-2d %-20s %1s> %-20s %s" % (
+        strftime("%H:%M:%S"), event.pid, event.ip,
+        "%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.lport),
+        type[event.type],
+        "%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport),
+        tcpstate[event.state]))
+
+def depict_cnt(counts_tab, l3prot='ipv4'):
+    for k, v in sorted(counts_tab.items(), key=lambda counts: counts[1].value):
+        depict_key = ""
+        ep_fmt = "[%s]#%d"
+        if l3prot == 'ipv4':
+            depict_key = "%-20s <-> %-20s" % (ep_fmt % (inet_ntop(AF_INET, pack('I', k.saddr)), k.lport),
+                                              ep_fmt % (inet_ntop(AF_INET, pack('I', k.daddr)), k.dport))
+        else:
+            depict_key = "%-20s <-> %-20s" % (ep_fmt % (inet_ntop(AF_INET6, k.saddr), k.lport),
+                                              ep_fmt % (inet_ntop(AF_INET6, k.daddr), k.dport))
+
+        print ("%s %10d" % (depict_key, v.value))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="tcp_retransmit_skb", fn_name="trace_retransmit")
+if args.lossprobe:
+    b.attach_kprobe(event="tcp_send_loss_probe", fn_name="trace_tlp")
+
+print("Tracing retransmits ... Hit Ctrl-C to end")
+if args.count:
+    try:
+        while 1:
+            sleep(99999999)
+    except BaseException:
+        pass
+
+    # header
+    print("\n%-25s %-25s %-10s" % (
+        "LADDR:LPORT", "RADDR:RPORT", "RETRANSMITS"))
+    depict_cnt(b.get_table("ipv4_count"))
+    depict_cnt(b.get_table("ipv6_count"), l3prot='ipv6')
+# read events
+else:
+    # header
+    print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP",
+        "LADDR:LPORT", "T", "RADDR:RPORT", "STATE"))
+    b["ipv4_events"].open_perf_buffer(print_ipv4_event)
+    b["ipv6_events"].open_perf_buffer(print_ipv6_event)
+    while 1:
+        b.perf_buffer_poll()
diff --git a/tools/tcpretrans_example.txt b/tools/tcpretrans_example.txt
new file mode 100644
index 0000000..db63477
--- /dev/null
+++ b/tools/tcpretrans_example.txt
@@ -0,0 +1,76 @@
+Demonstrations of tcpretrans, the Linux eBPF/bcc version.
+
+
+This tool traces the kernel TCP retransmit function to show details of these
+retransmits. For example:
+
+# ./tcpretrans 
+TIME     PID    IP LADDR:LPORT          T> RADDR:RPORT          STATE
+01:55:05 0      4  10.153.223.157:22    R> 69.53.245.40:34619   ESTABLISHED
+01:55:05 0      4  10.153.223.157:22    R> 69.53.245.40:34619   ESTABLISHED
+01:55:17 0      4  10.153.223.157:22    R> 69.53.245.40:22957   ESTABLISHED
+[...]
+
+This output shows three TCP retransmits, the first two were for an IPv4
+connection from 10.153.223.157 port 22 to 69.53.245.40 port 34619. The TCP
+state was "ESTABLISHED" at the time of the retransmit. The on-CPU PID at the
+time of the retransmit is printed, in this case 0 (the kernel, which will
+be the case most of the time).
+
+Retransmits are usually a sign of poor network health, and this tool is
+useful for their investigation. Unlike using tcpdump, this tool has very
+low overhead, as it only traces the retransmit function. It also prints
+additional kernel details: the state of the TCP session at the time of the
+retransmit.
+
+
+A -l option will include TCP tail loss probe attempts:
+
+# ./tcpretrans -l
+TIME     PID    IP LADDR:LPORT          T> RADDR:RPORT          STATE
+01:55:45 0      4  10.153.223.157:22    R> 69.53.245.40:51601   ESTABLISHED
+01:55:46 0      4  10.153.223.157:22    R> 69.53.245.40:51601   ESTABLISHED
+01:55:46 0      4  10.153.223.157:22    R> 69.53.245.40:51601   ESTABLISHED
+01:55:53 0      4  10.153.223.157:22    L> 69.53.245.40:46444   ESTABLISHED
+01:56:06 0      4  10.153.223.157:22    R> 69.53.245.40:46444   ESTABLISHED
+01:56:06 0      4  10.153.223.157:22    R> 69.53.245.40:46444   ESTABLISHED
+01:56:08 0      4  10.153.223.157:22    R> 69.53.245.40:46444   ESTABLISHED
+01:56:08 0      4  10.153.223.157:22    R> 69.53.245.40:46444   ESTABLISHED
+01:56:08 1938   4  10.153.223.157:22    R> 69.53.245.40:46444   ESTABLISHED
+01:56:08 0      4  10.153.223.157:22    R> 69.53.245.40:46444   ESTABLISHED
+01:56:08 0      4  10.153.223.157:22    R> 69.53.245.40:46444   ESTABLISHED
+[...]
+
+See the "L>" in the "T>" column. These are attempts: the kernel probably
+sent a TLP, but in some cases it might not have been ultimately sent.
+
+To spot heavily retransmitting flows quickly one can use the -c flag. It will
+count occurring retransmits per flow. 
+
+# ./tcpretrans.py -c
+Tracing retransmits ... Hit Ctrl-C to end
+^C
+LADDR:LPORT              RADDR:RPORT             RETRANSMITS
+192.168.10.50:60366  <-> 172.217.21.194:443         700
+192.168.10.50:666    <-> 172.213.11.195:443         345    
+192.168.10.50:366    <-> 172.212.22.194:443         211
+[...]
+
+This can help to quickly isolate congested or otherwise misbehaving network
+paths that are responsible for limiting TCP performance.
+
+USAGE message:
+
+# ./tcpretrans -h
+usage: tcpretrans [-h] [-l] [-c]
+
+Trace TCP retransmits
+
+optional arguments:
+  -h, --help       show this help message and exit
+  -l, --lossprobe  include tail loss probe attempts
+  -c, --count      count occurred retransmits per flow
+
+examples:
+    ./tcpretrans           # trace TCP retransmits
+    ./tcpretrans -l        # include TLP attempts
diff --git a/tools/tcpstates.py b/tools/tcpstates.py
new file mode 100755
index 0000000..381a6d5
--- /dev/null
+++ b/tools/tcpstates.py
@@ -0,0 +1,334 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpstates   Trace the TCP session state changes with durations.
+#             For Linux, uses BCC, BPF. Embedded C.
+#
+# USAGE: tcpstates [-h] [-C] [-S] [interval [count]]
+#
+# This uses the sock:inet_sock_set_state tracepoint, added to Linux 4.16.
+# Linux 4.16 also adds more state transitions so that they can be traced.
+#
+# Copyright 2018 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Mar-2018   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+import ctypes as ct
+from time import strftime
+
+# arguments
+examples = """examples:
+    ./tcpstates           # trace all TCP state changes
+    ./tcpstates -t        # include timestamp column
+    ./tcpstates -T        # include time column (HH:MM:SS)
+    ./tcpstates -w        # wider colums (fit IPv6)
+    ./tcpstates -stT      # csv output, with times & timestamps
+    ./tcpstates -L 80     # only trace local port 80
+    ./tcpstates -L 80,81  # only trace local ports 80 and 81
+    ./tcpstates -D 80     # only trace remote port 80
+"""
+parser = argparse.ArgumentParser(
+    description="Trace TCP session state changes and durations",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--time", action="store_true",
+    help="include time column on output (HH:MM:SS)")
+parser.add_argument("-t", "--timestamp", action="store_true",
+    help="include timestamp on output (seconds)")
+parser.add_argument("-w", "--wide", action="store_true",
+    help="wide column output (fits IPv6 addresses)")
+parser.add_argument("-s", "--csv", action="store_true",
+    help="comma separated values output")
+parser.add_argument("-L", "--localport",
+    help="comma-separated list of local ports to trace.")
+parser.add_argument("-D", "--remoteport",
+    help="comma-separated list of remote ports to trace.")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#define KBUILD_MODNAME "foo"
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+BPF_HASH(last, struct sock *, u64);
+
+// separate data structs for ipv4 and ipv6
+struct ipv4_data_t {
+    u64 ts_us;
+    u64 skaddr;
+    u32 saddr;
+    u32 daddr;
+    u64 span_us;
+    u32 pid;
+    u32 ports;
+    u32 oldstate;
+    u32 newstate;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv4_events);
+
+struct ipv6_data_t {
+    u64 ts_us;
+    u64 skaddr;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u64 span_us;
+    u32 pid;
+    u32 ports;
+    u32 oldstate;
+    u32 newstate;
+    char task[TASK_COMM_LEN];
+};
+BPF_PERF_OUTPUT(ipv6_events);
+
+struct id_t {
+    u32 pid;
+    char task[TASK_COMM_LEN];
+};
+
+TRACEPOINT_PROBE(sock, inet_sock_set_state)
+{
+    if (args->protocol != IPPROTO_TCP)
+        return 0;
+
+    u32 pid = bpf_get_current_pid_tgid() >> 32;
+    // sk is used as a UUID
+    struct sock *sk = (struct sock *)args->skaddr;
+
+    // lport is either used in a filter here, or later
+    u16 lport = args->sport;
+    FILTER_LPORT
+
+    // dport is either used in a filter here, or later
+    u16 dport = args->dport;
+    FILTER_DPORT
+
+    // calculate delta
+    u64 *tsp, delta_us;
+    tsp = last.lookup(&sk);
+    if (tsp == 0)
+        delta_us = 0;
+    else
+        delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
+
+    if (args->family == AF_INET) {
+        struct ipv4_data_t data4 = {
+            .span_us = delta_us,
+            .oldstate = args->oldstate,
+            .newstate = args->newstate };
+        data4.skaddr = (u64)args->skaddr;
+        data4.ts_us = bpf_ktime_get_ns() / 1000;
+        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
+        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
+        // a workaround until data4 compiles with separate lport/dport
+        data4.ports = dport + ((0ULL + lport) << 32);
+        data4.pid = pid;
+
+        bpf_get_current_comm(&data4.task, sizeof(data4.task));
+        ipv4_events.perf_submit(args, &data4, sizeof(data4));
+
+    } else /* 6 */ {
+        struct ipv6_data_t data6 = {
+            .span_us = delta_us,
+            .oldstate = args->oldstate,
+            .newstate = args->newstate };
+        data6.skaddr = (u64)args->skaddr;
+        data6.ts_us = bpf_ktime_get_ns() / 1000;
+        __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
+        __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
+        // a workaround until data6 compiles with separate lport/dport
+        data6.ports = dport + ((0ULL + lport) << 32);
+        data6.pid = pid;
+        bpf_get_current_comm(&data6.task, sizeof(data6.task));
+        ipv6_events.perf_submit(args, &data6, sizeof(data6));
+    }
+
+    u64 ts = bpf_ktime_get_ns();
+    last.update(&sk, &ts);
+
+    return 0;
+}
+"""
+
+if (not BPF.tracepoint_exists("sock", "inet_sock_set_state")):
+    print("ERROR: tracepoint sock:inet_sock_set_state missing "
+        "(added in Linux 4.16). Exiting")
+    exit()
+
+# code substitutions
+if args.remoteport:
+    dports = [int(dport) for dport in args.remoteport.split(',')]
+    dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
+    bpf_text = bpf_text.replace('FILTER_DPORT',
+        'if (%s) { last.delete(&sk); return 0; }' % dports_if)
+if args.localport:
+    lports = [int(lport) for lport in args.localport.split(',')]
+    lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
+    bpf_text = bpf_text.replace('FILTER_LPORT',
+        'if (%s) { last.delete(&sk); return 0; }' % lports_if)
+bpf_text = bpf_text.replace('FILTER_DPORT', '')
+bpf_text = bpf_text.replace('FILTER_LPORT', '')
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# event data
+TASK_COMM_LEN = 16      # linux/sched.h
+
+class Data_ipv4(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("skaddr", ct.c_ulonglong),
+        ("saddr", ct.c_uint),
+        ("daddr", ct.c_uint),
+        ("span_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("ports", ct.c_uint),
+        ("oldstate", ct.c_uint),
+        ("newstate", ct.c_uint),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+class Data_ipv6(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("skaddr", ct.c_ulonglong),
+        ("saddr", (ct.c_ulonglong * 2)),
+        ("daddr", (ct.c_ulonglong * 2)),
+        ("span_us", ct.c_ulonglong),
+        ("pid", ct.c_uint),
+        ("ports", ct.c_uint),
+        ("oldstate", ct.c_uint),
+        ("newstate", ct.c_uint),
+        ("task", ct.c_char * TASK_COMM_LEN)
+    ]
+
+#
+# Setup output formats
+#
+# Don't change the default output (next 2 lines): this fits in 80 chars. I
+# know it doesn't have NS or UIDs etc. I know. If you really, really, really
+# need to add columns, columns that solve real actual problems, I'd start by
+# adding an extended mode (-x) to include those columns.
+#
+header_string = "%-16s %-5s %-10.10s %s%-15s %-5s %-15s %-5s %-11s -> %-11s %s"
+format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
+    "-> %-11s %.3f")
+if args.wide:
+    header_string = ("%-16s %-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %-11s " +
+        "-> %-11s %s")
+    format_string = ("%-16x %-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %-11s " +
+        "-> %-11s %.3f")
+if args.csv:
+    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
+    format_string = "%x,%d,%s,%s,%s,%s,%s,%d,%s,%s,%.3f"
+
+def tcpstate2str(state):
+    # from include/net/tcp_states.h:
+    tcpstate = {
+        1: "ESTABLISHED",
+        2: "SYN_SENT",
+        3: "SYN_RECV",
+        4: "FIN_WAIT1",
+        5: "FIN_WAIT2",
+        6: "TIME_WAIT",
+        7: "CLOSE",
+        8: "CLOSE_WAIT",
+        9: "LAST_ACK",
+        10: "LISTEN",
+        11: "CLOSING",
+        12: "NEW_SYN_RECV",
+    }
+
+    if state in tcpstate:
+        return tcpstate[state]
+    else:
+        return str(state)
+
+# process event
+def print_ipv4_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
+    global start_ts
+    if args.time:
+        if args.csv:
+            print("%s," % strftime("%H:%M:%S"), end="")
+        else:
+            print("%-8s " % strftime("%H:%M:%S"), end="")
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        delta_s = (float(event.ts_us) - start_ts) / 1000000
+        if args.csv:
+            print("%.6f," % delta_s, end="")
+        else:
+            print("%-9.6f " % delta_s, end="")
+    print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
+        "4" if args.wide or args.csv else "",
+        inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32,
+        inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
+        tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
+        float(event.span_us) / 1000))
+
+def print_ipv6_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
+    global start_ts
+    if args.time:
+        if args.csv:
+            print("%s," % strftime("%H:%M:%S"), end="")
+        else:
+            print("%-8s " % strftime("%H:%M:%S"), end="")
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_us
+        delta_s = (float(event.ts_us) - start_ts) / 1000000
+        if args.csv:
+            print("%.6f," % delta_s, end="")
+        else:
+            print("%-9.6f " % delta_s, end="")
+    print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
+        "6" if args.wide or args.csv else "",
+        inet_ntop(AF_INET6, event.saddr), event.ports >> 32,
+        inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff,
+        tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
+        float(event.span_us) / 1000))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# header
+if args.time:
+    if args.csv:
+        print("%s," % ("TIME"), end="")
+    else:
+        print("%-8s " % ("TIME"), end="")
+if args.timestamp:
+    if args.csv:
+        print("%s," % ("TIME(s)"), end="")
+    else:
+        print("%-9s " % ("TIME(s)"), end="")
+print(header_string % ("SKADDR", "C-PID", "C-COMM",
+    "IP" if args.wide or args.csv else "",
+    "LADDR", "LPORT", "RADDR", "RPORT",
+    "OLDSTATE", "NEWSTATE", "MS"))
+
+start_ts = 0
+
+# read events
+b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
+b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/tcpstates_example.txt b/tools/tcpstates_example.txt
new file mode 100644
index 0000000..aca857a
--- /dev/null
+++ b/tools/tcpstates_example.txt
@@ -0,0 +1,52 @@
+Demonstrations of tcpstates, the Linux eBPF/bcc version.
+
+
+tcpstates prints TCP state change information, including the duration in each
+state as milliseconds. For example, a single TCP session:
+
+# tcpstates
+SKADDR           C-PID C-COMM     LADDR           LPORT RADDR           RPORT OLDSTATE    -> NEWSTATE    MS
+ffff9fd7e8192000 22384 curl       100.66.100.185  0     52.33.159.26    80    CLOSE       -> SYN_SENT    0.000
+ffff9fd7e8192000 0     swapper/5  100.66.100.185  63446 52.33.159.26    80    SYN_SENT    -> ESTABLISHED 1.373
+ffff9fd7e8192000 22384 curl       100.66.100.185  63446 52.33.159.26    80    ESTABLISHED -> FIN_WAIT1   176.042
+ffff9fd7e8192000 0     swapper/5  100.66.100.185  63446 52.33.159.26    80    FIN_WAIT1   -> FIN_WAIT2   0.536
+ffff9fd7e8192000 0     swapper/5  100.66.100.185  63446 52.33.159.26    80    FIN_WAIT2   -> CLOSE       0.006
+^C
+
+This showed that the most time was spent in the ESTABLISHED state (which then
+transitioned to FIN_WAIT1), which was 176.042 milliseconds.
+
+The first column is the socket address, as the output may include lines from
+different sessions interleaved. The next two columns show the current on-CPU
+process ID and command name: these may show the process that owns the TCP
+session, depending on whether the state change executes synchronously in
+process context. If that's not the case, they may show kernel details.
+
+
+USAGE:
+
+# tcpstates -h
+usage: tcpstates [-h] [-T] [-t] [-w] [-s] [-L LOCALPORT] [-D REMOTEPORT]
+
+Trace TCP session state changes and durations
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -T, --time            include time column on output (HH:MM:SS)
+  -t, --timestamp       include timestamp on output (seconds)
+  -w, --wide            wide column output (fits IPv6 addresses)
+  -s, --csv             comma separated values output
+  -L LOCALPORT, --localport LOCALPORT
+                        comma-separated list of local ports to trace.
+  -D REMOTEPORT, --remoteport REMOTEPORT
+                        comma-separated list of remote ports to trace.
+
+examples:
+    ./tcpstates           # trace all TCP state changes
+    ./tcpstates -t        # include timestamp column
+    ./tcpstates -T        # include time column (HH:MM:SS)
+    ./tcpstates -w        # wider columns (fit IPv6)
+    ./tcpstates -stT      # csv output, with times & timestamps
+    ./tcpstates -L 80     # only trace local port 80
+    ./tcpstates -L 80,81  # only trace local ports 80 and 81
+    ./tcpstates -D 80     # only trace remote port 80
diff --git a/tools/tcpsubnet.py b/tools/tcpsubnet.py
new file mode 100755
index 0000000..5f2a806
--- /dev/null
+++ b/tools/tcpsubnet.py
@@ -0,0 +1,262 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpsubnet   Summarize TCP bytes sent to different subnets.
+#             For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpsubnet [-h] [-v] [-J] [-f FORMAT] [-i INTERVAL] [subnets]
+#
+# This uses dynamic tracing of kernel functions, and will need to be updated
+# to match kernel changes.
+#
+# This is an adaptation of tcptop, written by Brendan Gregg.
+#
+# WARNING: This traces all send at the TCP level, and while it
+# summarizes data in-kernel to reduce overhead, there may still be some
+# overhead at high TCP send/receive rates (eg, ~13% of one CPU at 100k TCP
+# events/sec. This is not the same as packet rate: funccount can be used to
+# count the kprobes below to find out the TCP rate). Test in a lab environment
+# first. If your send rate is low (eg, <1k/sec) then the overhead is
+# expected to be negligible.
+#
+# Copyright 2017 Rodrigo Manyari
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 03-Oct-2017   Rodrigo Manyari   Created this based on tcptop.
+# 13-Feb-2018   Rodrigo Manyari   Fix pep8 errors, some refactoring.
+# 05-Mar-2018   Rodrigo Manyari   Add date time to output.
+
+import argparse
+import json
+import logging
+import struct
+import socket
+from bcc import BPF
+from datetime import datetime as dt
+from time import sleep
+
+# arguments
+examples = """examples:
+    ./tcpsubnet                 # Trace TCP sent to the default subnets:
+                                # 127.0.0.1/32,10.0.0.0/8,172.16.0.0/12,
+                                # 192.168.0.0/16,0.0.0.0/0
+    ./tcpsubnet -f K            # Trace TCP sent to the default subnets
+                                # aggregated in KBytes.
+    ./tcpsubnet 10.80.0.0/24    # Trace TCP sent to 10.80.0.0/24 only
+    ./tcpsubnet -J              # Format the output in JSON.
+"""
+
+default_subnets = "127.0.0.1/32,10.0.0.0/8," \
+    "172.16.0.0/12,192.168.0.0/16,0.0.0.0/0"
+
+parser = argparse.ArgumentParser(
+    description="Summarize TCP send and aggregate by subnet",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("subnets", help="comma separated list of subnets",
+    type=str, nargs="?", default=default_subnets)
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="output debug statements")
+parser.add_argument("-J", "--json", action="store_true",
+    help="format output in JSON")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+parser.add_argument("-f", "--format", default="B",
+    help="[bkmBKM] format to report: bits, Kbits, Mbits, bytes, " +
+    "KBytes, MBytes (default B)", choices=["b", "k", "m", "B", "K", "M"])
+parser.add_argument("-i", "--interval", default=1, type=int,
+    help="output interval, in seconds (default 1)")
+args = parser.parse_args()
+
+level = logging.INFO
+if args.verbose:
+    level = logging.DEBUG
+
+logging.basicConfig(level=level)
+
+logging.debug("Starting with the following args:")
+logging.debug(args)
+
+# args checking
+if int(args.interval) <= 0:
+    logging.error("Invalid interval, must be > 0. Exiting.")
+    exit(1)
+else:
+    args.interval = int(args.interval)
+
+# map of supported formats
+formats = {
+    "b": lambda x: (x * 8),
+    "k": lambda x: ((x * 8) / 1024),
+    "m": lambda x: ((x * 8) / pow(1024, 2)),
+    "B": lambda x: x,
+    "K": lambda x: x / 1024,
+    "M": lambda x: x / pow(1024, 2)
+}
+
+# Let's swap the string with the actual numeric value
+# once here so we don't have to do it on every interval
+formatFn = formats[args.format]
+
+# define the basic structure of the BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+struct index_key_t {
+  u32 index;
+};
+
+BPF_HASH(ipv4_send_bytes, struct index_key_t);
+
+int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
+    struct msghdr *msg, size_t size)
+{
+    u16 family = sk->__sk_common.skc_family;
+
+    if (family == AF_INET) {
+        u32 dst = sk->__sk_common.skc_daddr;
+        unsigned categorized = 0;
+        __SUBNETS__
+    }
+    return 0;
+}
+"""
+
+
+# Takes in a mask and returns the integer equivalent
+# e.g.
+# mask_to_int(8) returns 4278190080
+def mask_to_int(n):
+    return ((1 << n) - 1) << (32 - n)
+
+# Takes in a list of subnets and returns a list
+# of tuple-3 containing:
+# - The subnet info at index 0
+# - The addr portion as an int at index 1
+# - The mask portion as an int at index 2
+#
+# e.g.
+# parse_subnets([10.10.0.0/24]) returns
+# [
+#   ['10.10.0.0/24', 168427520, 4294967040],
+# ]
+def parse_subnets(subnets):
+    m = []
+    for s in subnets:
+        parts = s.split("/")
+        if len(parts) != 2:
+            msg = "Subnet [%s] is invalid, please refer to the examples." % s
+            raise ValueError(msg)
+        netaddr_int = 0
+        mask_int = 0
+        try:
+            netaddr_int = struct.unpack("!I", socket.inet_aton(parts[0]))[0]
+        except:
+            msg = ("Invalid net address in subnet [%s], " +
+                "please refer to the examples.") % s
+            raise ValueError(msg)
+        try:
+            mask_int = int(parts[1])
+        except:
+            msg = "Invalid mask in subnet [%s]. Mask must be an int" % s
+            raise ValueError(msg)
+        if mask_int < 0 or mask_int > 32:
+            msg = ("Invalid mask in subnet [%s]. Must be an " +
+                "int between 0 and 32.") % s
+            raise ValueError(msg)
+        mask_int = mask_to_int(int(parts[1]))
+        m.append([s, netaddr_int, mask_int])
+    return m
+
+def generate_bpf_subnets(subnets):
+    template = """
+        if (!categorized && (__NET_ADDR__ & __NET_MASK__) ==
+             (dst & __NET_MASK__)) {
+          struct index_key_t key = {.index = __POS__};
+          ipv4_send_bytes.increment(key, size);
+          categorized = 1;
+        }
+    """
+    bpf = ''
+    for i, s in enumerate(subnets):
+        branch = template
+        branch = branch.replace("__NET_ADDR__", str(socket.htonl(s[1])))
+        branch = branch.replace("__NET_MASK__", str(socket.htonl(s[2])))
+        branch = branch.replace("__POS__", str(i))
+        bpf += branch
+    return bpf
+
+subnets = []
+if args.subnets:
+    subnets = args.subnets.split(",")
+
+subnets = parse_subnets(subnets)
+
+logging.debug("Packets are going to be categorized in the following subnets:")
+logging.debug(subnets)
+
+bpf_subnets = generate_bpf_subnets(subnets)
+
+# initialize BPF
+bpf_text = bpf_text.replace("__SUBNETS__", bpf_subnets)
+
+logging.debug("Done preprocessing the BPF program, " +
+        "this is what will actually get executed:")
+logging.debug(bpf_text)
+
+if args.ebpf:
+    print(bpf_text)
+    exit()
+
+b = BPF(text=bpf_text)
+
+ipv4_send_bytes = b["ipv4_send_bytes"]
+
+if not args.json:
+    print("Tracing... Output every %d secs. Hit Ctrl-C to end" % args.interval)
+
+# output
+exiting = 0
+while (1):
+
+    try:
+        sleep(args.interval)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    # IPv4:  build dict of all seen keys
+    keys = ipv4_send_bytes
+    for k, v in ipv4_send_bytes.items():
+        if k not in keys:
+            keys[k] = v
+
+    # to hold json data
+    data = {}
+
+    # output
+    now = dt.now()
+    data['date'] = now.strftime('%x')
+    data['time'] = now.strftime('%X')
+    data['entries'] = {}
+    if not args.json:
+        print(now.strftime('[%x %X]'))
+    for k, v in reversed(sorted(keys.items(), key=lambda keys: keys[1].value)):
+        send_bytes = 0
+        if k in ipv4_send_bytes:
+            send_bytes = int(ipv4_send_bytes[k].value)
+        subnet = subnets[k.index][0]
+        send = formatFn(send_bytes)
+        if args.json:
+            data['entries'][subnet] = send
+        else:
+            print("%-21s %6d" % (subnet, send))
+
+    if args.json:
+        print(json.dumps(data))
+
+    ipv4_send_bytes.clear()
+
+    if exiting:
+        exit(0)
diff --git a/tools/tcpsubnet_example.txt b/tools/tcpsubnet_example.txt
new file mode 100644
index 0000000..72a6172
--- /dev/null
+++ b/tools/tcpsubnet_example.txt
@@ -0,0 +1,161 @@
+Demonstrations of tcpsubnet, the Linux eBPF/bcc version.
+
+
+tcpsubnet summarizes throughput by destination subnet.
+It works only for IPv4. Eg:
+
+# tcpsubnet
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+[03/05/18 22:32:47]
+127.0.0.1/32               8
+[03/05/18 22:32:48]
+[03/05/18 22:32:49]
+[03/05/18 22:32:50]
+[03/05/18 22:32:51]
+[03/05/18 22:32:52]
+127.0.0.1/32              10
+[03/05/18 22:32:53]
+
+This example output shows the number of bytes sent to 127.0.0.1/32 (the
+loopback interface). For demo purposes, I set netcat listening on port
+8080, connected to it and sent the following payloads.
+
+# nc 127.0.0.1 8080
+1111111
+111111111
+
+The first line sends 7 digits plus the null character (8 bytes)
+The second line sends 9 digits plus the null character (10 bytes)
+
+Notice also, how tcpsubnet prints a header line with the current date
+and time formatted in the current locale.
+
+Try it yourself to get a feeling of how tcpsubnet works.
+
+By default, tcpsubnet will categorize traffic in the following subnets:
+
+- 127.0.0.1/32
+- 10.0.0.0/8
+- 172.16.0.0/12
+- 192.168.0.0/16
+- 0.0.0.0/0
+
+The last subnet is a catch-all. In other words, anything that doesn't
+match the first 4 defaults will be categorized under 0.0.0.0/0
+You can change this default behavior by passing a comma-separated list
+of subnets. Let's say we would like to know how much traffic we
+are sending to github.com. We first find out what IPs github.com resolves
+to, Eg:
+
+# dig +short github.com
+192.30.253.112
+192.30.253.113
+
+With this information, we can come up with a reasonable range of IPs
+to monitor, Eg:
+ 
+# tcpsubnet.py 192.30.253.110/27,0.0.0.0/0
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+[03/05/18 22:38:58]
+0.0.0.0/0               5780
+192.30.253.110/27       2205
+[03/05/18 22:38:59]
+0.0.0.0/0               2036
+192.30.253.110/27       1183
+[03/05/18 22:39:00]
+[03/05/18 22:39:01]
+192.30.253.110/27      12537
+
+If we would like to be more accurate, we can use the two IPs returned
+by dig, Eg:
+
+# tcpsubnet 192.30.253.113/32,192.30.253.112/32,0.0.0.0/0
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+[03/05/18 22:42:56]
+0.0.0.0/0               1177
+192.30.253.113/32        910
+[03/05/18 22:42:57]
+0.0.0.0/0              48704
+192.30.253.113/32        892
+[03/05/18 22:42:58]
+192.30.253.113/32        891
+0.0.0.0/0                858
+[03/05/18 22:42:59]
+0.0.0.0/0              11159
+192.30.253.113/32        894
+[03/05/18 22:43:00]
+0.0.0.0/0              60601
+
+NOTE: When used in production, it is expected that you will have full
+information about your network topology. In which case you won't need
+to approximate subnets nor need to put individual IP addresses like
+we just did.
+
+Notice that the order of the subnets matters. If we put 0.0.0.0/0 as
+the first element of the list and 192.30.253.112/32 as the second, all the
+traffic going to 192.30.253.112/32 will be categorized under
+0.0.0.0/0, as 192.30.253.112/32 is contained in 0.0.0.0/0.
+
+The default output unit is bytes. You can change it by using the
+-f [--format] flag. tcpsubnet uses the same flags as iperf for the unit
+format and adds mM. When using kmKM, the output will be rounded to floor.
+Eg:
+
+# tcpsubnet -fK 0.0.0.0/0
+[03/05/18 22:44:04]
+0.0.0.0/0                  1
+[03/05/18 22:44:05]
+0.0.0.0/0                  5
+[03/05/18 22:44:06]
+0.0.0.0/0                 31
+
+Just like the majority of the bcc tools, tcpsubnet supports -i and --ebpf
+
+It also supports -v [--verbose] which gives useful debugging information
+on how the subnets are evaluated and the BPF program is constructed.
+
+Last but not least, it supports -J [--json] to print the output in
+JSON format. This is handy if you're calling tcpsubnet from another
+program (say a nodejs server) and would like to have a structured stdout.
+The output in JSON format will also include the date and time.
+Eg:
+
+# tcpsubnet -J -fK 192.30.253.110/27,0.0.0.0/0
+{"date": "03/05/18", "entries": {"0.0.0.0/0": 2}, "time": "22:46:27"}
+{"date": "03/05/18", "entries": {}, "time": "22:46:28"}
+{"date": "03/05/18", "entries": {}, "time": "22:46:29"}
+{"date": "03/05/18", "entries": {}, "time": "22:46:30"}
+{"date": "03/05/18", "entries": {"192.30.253.110/27": 0}, "time": "22:46:31"}
+{"date": "03/05/18", "entries": {"192.30.253.110/27": 1}, "time": "22:46:32"}
+{"date": "03/05/18", "entries": {"192.30.253.110/27": 18}, "time": "22:46:32"}
+
+
+USAGE:
+
+# ./tcpsubnet -h
+usage: tcpsubnet.py [-h] [-v] [-J] [-f {b,k,m,B,K,M}] [-i INTERVAL] [subnets]
+
+Summarize TCP send and aggregate by subnet
+
+positional arguments:
+  subnets               comma separated list of subnets
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -v, --verbose         output debug statements
+  -J, --json            format output in JSON
+  -f {b,k,m,B,K,M}, --format {b,k,m,B,K,M}
+                        [bkmBKM] format to report: bits, Kbits, Mbits, bytes,
+                        KBytes, MBytes (default B)
+  -i INTERVAL, --interval INTERVAL
+                        output interval, in seconds (default 1)
+
+examples:
+    ./tcpsubnet                 # Trace TCP sent to the default subnets:
+                                # 127.0.0.1/32,10.0.0.0/8,172.16.0.0/12,
+                                # 192.168.0.0/16,0.0.0.0/0
+    ./tcpsubnet -f K            # Trace TCP sent to the default subnets
+                                # aggregated in KBytes.
+    ./tcpsubnet 10.80.0.0/24    # Trace TCP sent to 10.80.0.0/24 only
+    ./tcpsubnet -J              # Format the output in JSON.
+
diff --git a/tools/tcptop.py b/tools/tcptop.py
new file mode 100755
index 0000000..e1eb241
--- /dev/null
+++ b/tools/tcptop.py
@@ -0,0 +1,287 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcptop    Summarize TCP send/recv throughput by host.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcptop [-h] [-C] [-S] [-p PID] [interval [count]]
+#
+# This uses dynamic tracing of kernel functions, and will need to be updated
+# to match kernel changes.
+#
+# WARNING: This traces all send/receives at the TCP level, and while it
+# summarizes data in-kernel to reduce overhead, there may still be some
+# overhead at high TCP send/receive rates (eg, ~13% of one CPU at 100k TCP
+# events/sec. This is not the same as packet rate: funccount can be used to
+# count the kprobes below to find out the TCP rate). Test in a lab environment
+# first. If your send/receive rate is low (eg, <1k/sec) then the overhead is
+# expected to be negligible.
+#
+# ToDo: Fit output to screen size (top X only) in default (not -C) mode.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 02-Sep-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+from time import sleep, strftime
+from subprocess import call
+import ctypes as ct
+from collections import namedtuple, defaultdict
+
+# arguments
+def range_check(string):
+    value = int(string)
+    if value < 1:
+        msg = "value must be strictly positive, got %d" % (value,)
+        raise argparse.ArgumentTypeError(msg)
+    return value
+
+examples = """examples:
+    ./tcptop           # trace TCP send/recv by host
+    ./tcptop -C        # don't clear the screen
+    ./tcptop -p 181    # only trace PID 181
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize TCP send/recv throughput by host",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("-S", "--nosummary", action="store_true",
+    help="skip system summary line")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?", default=1, type=range_check,
+    help="output interval, in seconds (default 1)")
+parser.add_argument("count", nargs="?", default=-1, type=range_check,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+# linux stats
+loadavg = "/proc/loadavg"
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+
+struct ipv4_key_t {
+    u32 pid;
+    u32 saddr;
+    u32 daddr;
+    u16 lport;
+    u16 dport;
+};
+BPF_HASH(ipv4_send_bytes, struct ipv4_key_t);
+BPF_HASH(ipv4_recv_bytes, struct ipv4_key_t);
+
+struct ipv6_key_t {
+    u32 pid;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u16 lport;
+    u16 dport;
+};
+BPF_HASH(ipv6_send_bytes, struct ipv6_key_t);
+BPF_HASH(ipv6_recv_bytes, struct ipv6_key_t);
+
+int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
+    struct msghdr *msg, size_t size)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+    u16 dport = 0, family = sk->__sk_common.skc_family;
+
+    if (family == AF_INET) {
+        struct ipv4_key_t ipv4_key = {.pid = pid};
+        ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
+        ipv4_key.daddr = sk->__sk_common.skc_daddr;
+        ipv4_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        ipv4_key.dport = ntohs(dport);
+        ipv4_send_bytes.increment(ipv4_key, size);
+
+    } else if (family == AF_INET6) {
+        struct ipv6_key_t ipv6_key = {.pid = pid};
+        __builtin_memcpy(&ipv6_key.saddr,
+            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32, sizeof(ipv6_key.saddr));
+        __builtin_memcpy(&ipv6_key.daddr,
+            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32, sizeof(ipv6_key.daddr));
+        ipv6_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        ipv6_key.dport = ntohs(dport);
+        ipv6_send_bytes.increment(ipv6_key, size);
+    }
+    // else drop
+
+    return 0;
+}
+
+/*
+ * tcp_recvmsg() would be obvious to trace, but is less suitable because:
+ * - we'd need to trace both entry and return, to have both sock and size
+ * - misses tcp_read_sock() traffic
+ * we'd much prefer tracepoints once they are available.
+ */
+int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    FILTER
+    u16 dport = 0, family = sk->__sk_common.skc_family;
+    u64 *val, zero = 0;
+
+    if (copied <= 0)
+        return 0;
+
+    if (family == AF_INET) {
+        struct ipv4_key_t ipv4_key = {.pid = pid};
+        ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
+        ipv4_key.daddr = sk->__sk_common.skc_daddr;
+        ipv4_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        ipv4_key.dport = ntohs(dport);
+        ipv4_recv_bytes.increment(ipv4_key, copied);
+
+    } else if (family == AF_INET6) {
+        struct ipv6_key_t ipv6_key = {.pid = pid};
+        __builtin_memcpy(&ipv6_key.saddr,
+            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32, sizeof(ipv6_key.saddr));
+        __builtin_memcpy(&ipv6_key.daddr,
+            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32, sizeof(ipv6_key.daddr));
+        ipv6_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        ipv6_key.dport = ntohs(dport);
+        ipv6_recv_bytes.increment(ipv6_key, copied);
+    }
+    // else drop
+
+    return 0;
+}
+"""
+
+# code substitutions
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+TCPSessionKey = namedtuple('TCPSession', ['pid', 'laddr', 'lport', 'daddr', 'dport'])
+
+def pid_to_comm(pid):
+    try:
+        comm = open("/proc/%d/comm" % pid, "r").read().rstrip()
+        return comm
+    except IOError:
+        return str(pid)
+
+def get_ipv4_session_key(k):
+    return TCPSessionKey(pid=k.pid,
+                         laddr=inet_ntop(AF_INET, pack("I", k.saddr)),
+                         lport=k.lport,
+                         daddr=inet_ntop(AF_INET, pack("I", k.daddr)),
+                         dport=k.dport)
+
+def get_ipv6_session_key(k):
+    return TCPSessionKey(pid=k.pid,
+                         laddr=inet_ntop(AF_INET6, k.saddr),
+                         lport=k.lport,
+                         daddr=inet_ntop(AF_INET6, k.daddr),
+                         dport=k.dport)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+ipv4_send_bytes = b["ipv4_send_bytes"]
+ipv4_recv_bytes = b["ipv4_recv_bytes"]
+ipv6_send_bytes = b["ipv6_send_bytes"]
+ipv6_recv_bytes = b["ipv6_recv_bytes"]
+
+print('Tracing... Output every %s secs. Hit Ctrl-C to end' % args.interval)
+
+# output
+i = 0
+exiting = False
+while i != args.count and not exiting:
+    try:
+        sleep(args.interval)
+    except KeyboardInterrupt:
+        exiting = True
+
+    # header
+    if args.noclear:
+        print()
+    else:
+        call("clear")
+    if not args.nosummary:
+        with open(loadavg) as stats:
+            print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+
+    # IPv4: build dict of all seen keys
+    ipv4_throughput = defaultdict(lambda: [0, 0])
+    for k, v in ipv4_send_bytes.items():
+        key = get_ipv4_session_key(k)
+        ipv4_throughput[key][0] = v.value
+    ipv4_send_bytes.clear()
+
+    for k, v in ipv4_recv_bytes.items():
+        key = get_ipv4_session_key(k)
+        ipv4_throughput[key][1] = v.value
+    ipv4_recv_bytes.clear()
+
+    if ipv4_throughput:
+        print("%-6s %-12s %-21s %-21s %6s %6s" % ("PID", "COMM",
+            "LADDR", "RADDR", "RX_KB", "TX_KB"))
+
+    # output
+    for k, (send_bytes, recv_bytes) in sorted(ipv4_throughput.items(),
+                                              key=lambda kv: sum(kv[1]),
+                                              reverse=True):
+        print("%-6d %-12.12s %-21s %-21s %6d %6d" % (k.pid,
+            pid_to_comm(k.pid),
+            k.laddr + ":" + str(k.lport),
+            k.daddr + ":" + str(k.dport),
+            int(recv_bytes / 1024), int(send_bytes / 1024)))
+
+    # IPv6: build dict of all seen keys
+    ipv6_throughput = defaultdict(lambda: [0, 0])
+    for k, v in ipv6_send_bytes.items():
+        key = get_ipv6_session_key(k)
+        ipv6_throughput[key][0] = v.value
+    ipv6_send_bytes.clear()
+
+    for k, v in ipv6_recv_bytes.items():
+        key = get_ipv6_session_key(k)
+        ipv6_throughput[key][1] = v.value
+    ipv6_recv_bytes.clear()
+
+    if ipv6_throughput:
+        # more than 80 chars, sadly.
+        print("\n%-6s %-12s %-32s %-32s %6s %6s" % ("PID", "COMM",
+            "LADDR6", "RADDR6", "RX_KB", "TX_KB"))
+
+    # output
+    for k, (send_bytes, recv_bytes) in sorted(ipv6_throughput.items(),
+                                              key=lambda kv: sum(kv[1]),
+                                              reverse=True):
+        print("%-6d %-12.12s %-32s %-32s %6d %6d" % (k.pid,
+            pid_to_comm(k.pid),
+            k.laddr + ":" + str(k.lport),
+            k.daddr + ":" + str(k.dport),
+            int(recv_bytes / 1024), int(send_bytes / 1024)))
+
+    i += 1
diff --git a/tools/tcptop_example.txt b/tools/tcptop_example.txt
new file mode 100644
index 0000000..63ba2ef
--- /dev/null
+++ b/tools/tcptop_example.txt
@@ -0,0 +1,116 @@
+Demonstrations of tcptop, the Linux eBPF/bcc version.
+
+
+tcptop summarizes throughput by host and port. Eg:
+
+# tcptop
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+<screen clears>
+19:46:24 loadavg: 1.86 2.67 2.91 3/362 16681
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+16648  16648        100.66.3.172:22       100.127.69.165:6684        1      0
+16647  sshd         100.66.3.172:22       100.127.69.165:6684        0   2149
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+14458  sshd         100.66.3.172:22       100.127.69.165:7165        0      0
+
+PID    COMM         LADDR6                           RADDR6                            RX_KB  TX_KB
+16681  sshd         fe80::8a3:9dff:fed5:6b19:22      fe80::8a3:9dff:fed5:6b19:16606        1      1
+16679  ssh          fe80::8a3:9dff:fed5:6b19:16606   fe80::8a3:9dff:fed5:6b19:22           1      1
+16680  sshd         fe80::8a3:9dff:fed5:6b19:22      fe80::8a3:9dff:fed5:6b19:16606        0      0
+
+This example output shows two listings of TCP connections, for IPv4 and IPv6.
+If there is only traffic for one of these, then only one group is shown.
+
+The output in each listing is sorted by total throughput (send then receive),
+and when printed it is rounded (floor) to the nearest Kbyte. The example output
+shows PID 16647, sshd, transmitted 2149 Kbytes during the tracing interval.
+The other IPv4 sessions had such low throughput they rounded to zero.
+
+All TCP sessions, including over loopback, are included.
+
+The session with the process name (COMM) of 16648 is really a short-lived
+process with PID 16648 where we didn't catch the process name when printing
+the output. If this behavior is a serious issue for you, you can modify the
+tool's code to include bpf_get_current_comm() in the key structs, so that it's
+fetched during the event and will always be seen. I did it this way to start
+with, but it was measurably increasing the overhead of this tool, so I switched
+to the asynchronous model.
+
+The overhead is relative to TCP event rate (the rate of tcp_sendmsg() and
+tcp_recvmsg() or tcp_cleanup_rbuf()). Due to buffering, this should be lower
+than the packet rate. You can measure the rate of these using funccount.
+Some sample production servers tested found total rates of 4k to 15k per
+second. The CPU overhead at these rates ranged from 0.5% to 2.0% of one CPU.
+Maybe your workloads have higher rates and therefore higher overhead, or,
+lower rates.
+
+
+I much prefer not clearing the screen, so that historic output is in the
+scroll-back buffer, and patterns or intermittent issues can be better seen.
+You can do this with -C:
+
+# tcptop -C
+Tracing... Output every 1 secs. Hit Ctrl-C to end
+
+20:27:12 loadavg: 0.08 0.02 0.17 2/367 17342
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17287  17287        100.66.3.172:22       100.127.69.165:57585       3      1
+17286  sshd         100.66.3.172:22       100.127.69.165:57585       0      1
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:13 loadavg: 0.08 0.02 0.17 1/367 17342
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17286  sshd         100.66.3.172:22       100.127.69.165:57585       1   7761
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:14 loadavg: 0.08 0.02 0.17 2/365 17347
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17286  17286        100.66.3.172:22       100.127.69.165:57585       1   2501
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:15 loadavg: 0.07 0.02 0.17 2/367 17403
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17349  17349        100.66.3.172:22       100.127.69.165:10161       3      1
+17348  sshd         100.66.3.172:22       100.127.69.165:10161       0      1
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:16 loadavg: 0.07 0.02 0.17 1/367 17403
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17348  sshd         100.66.3.172:22       100.127.69.165:10161    3333      0
+14374  sshd         100.66.3.172:22       100.127.69.165:25219       0      0
+
+20:27:17 loadavg: 0.07 0.02 0.17 2/366 17409
+
+PID    COMM         LADDR                 RADDR                  RX_KB  TX_KB
+17348  17348        100.66.3.172:22       100.127.69.165:10161    6909      2
+
+You can disable the loadavg summary line with -S if needed.
+
+
+USAGE:
+
+# tcptop -h
+usage: tcptop.py [-h] [-C] [-S] [-p PID] [interval] [count]
+
+Summarize TCP send/recv throughput by host
+
+positional arguments:
+  interval           output interval, in seconds (default 1)
+  count              number of outputs
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -C, --noclear      don't clear the screen
+  -S, --nosummary    skip system summary line
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./tcptop           # trace TCP send/recv by host
+    ./tcptop -C        # don't clear the screen
+    ./tcptop -p 181    # only trace PID 181
diff --git a/tools/tcptracer.py b/tools/tcptracer.py
new file mode 100755
index 0000000..16bb4b1
--- /dev/null
+++ b/tools/tcptracer.py
@@ -0,0 +1,665 @@
+#!/usr/bin/python
+#
+# tcpv4tracer   Trace TCP connections.
+#               For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: tcpv4tracer [-h] [-v] [-p PID] [-N NETNS]
+#
+# You should generally try to avoid writing long scripts that measure multiple
+# functions and walk multiple kernel structures, as they will be a burden to
+# maintain as the kernel changes.
+# The following code should be replaced, and simplified, when static TCP probes
+# exist.
+#
+# Copyright 2017 Kinvolk GmbH
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+from __future__ import print_function
+from bcc import BPF
+
+import argparse as ap
+import ctypes
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+
+parser = ap.ArgumentParser(description="Trace TCP connections",
+                           formatter_class=ap.RawDescriptionHelpFormatter)
+parser.add_argument("-t", "--timestamp", action="store_true",
+                    help="include timestamp on output")
+parser.add_argument("-p", "--pid", default=0, type=int,
+                    help="trace this PID only")
+parser.add_argument("-N", "--netns", default=0, type=int,
+                    help="trace this Network Namespace only")
+parser.add_argument("-v", "--verbose", action="store_true",
+                    help="include Network Namespace in the output")
+parser.add_argument("--ebpf", action="store_true",
+                    help=ap.SUPPRESS)
+args = parser.parse_args()
+
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wtautological-compare"
+#include <net/sock.h>
+#pragma clang diagnostic pop
+#include <net/inet_sock.h>
+#include <net/net_namespace.h>
+#include <bcc/proto.h>
+
+#define TCP_EVENT_TYPE_CONNECT 1
+#define TCP_EVENT_TYPE_ACCEPT  2
+#define TCP_EVENT_TYPE_CLOSE   3
+
+struct tcp_ipv4_event_t {
+    u64 ts_ns;
+    u32 type;
+    u32 pid;
+    char comm[TASK_COMM_LEN];
+    u8 ip;
+    u32 saddr;
+    u32 daddr;
+    u16 sport;
+    u16 dport;
+    u32 netns;
+};
+BPF_PERF_OUTPUT(tcp_ipv4_event);
+
+struct tcp_ipv6_event_t {
+    u64 ts_ns;
+    u32 type;
+    u32 pid;
+    char comm[TASK_COMM_LEN];
+    u8 ip;
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u16 sport;
+    u16 dport;
+    u32 netns;
+};
+BPF_PERF_OUTPUT(tcp_ipv6_event);
+
+// tcp_set_state doesn't run in the context of the process that initiated the
+// connection so we need to store a map TUPLE -> PID to send the right PID on
+// the event
+struct ipv4_tuple_t {
+    u32 saddr;
+    u32 daddr;
+    u16 sport;
+    u16 dport;
+    u32 netns;
+};
+
+struct ipv6_tuple_t {
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u16 sport;
+    u16 dport;
+    u32 netns;
+};
+
+struct pid_comm_t {
+    u64 pid;
+    char comm[TASK_COMM_LEN];
+};
+
+BPF_HASH(tuplepid_ipv4, struct ipv4_tuple_t, struct pid_comm_t);
+BPF_HASH(tuplepid_ipv6, struct ipv6_tuple_t, struct pid_comm_t);
+
+BPF_HASH(connectsock, u64, struct sock *);
+
+static int read_ipv4_tuple(struct ipv4_tuple_t *tuple, struct sock *skp)
+{
+  u32 net_ns_inum = 0;
+  u32 saddr = skp->__sk_common.skc_rcv_saddr;
+  u32 daddr = skp->__sk_common.skc_daddr;
+  struct inet_sock *sockp = (struct inet_sock *)skp;
+  u16 sport = sockp->inet_sport;
+  u16 dport = skp->__sk_common.skc_dport;
+#ifdef CONFIG_NET_NS
+  net_ns_inum = skp->__sk_common.skc_net.net->ns.inum;
+#endif
+
+  ##FILTER_NETNS##
+
+  tuple->saddr = saddr;
+  tuple->daddr = daddr;
+  tuple->sport = sport;
+  tuple->dport = dport;
+  tuple->netns = net_ns_inum;
+
+  // if addresses or ports are 0, ignore
+  if (saddr == 0 || daddr == 0 || sport == 0 || dport == 0) {
+      return 0;
+  }
+
+  return 1;
+}
+
+static int read_ipv6_tuple(struct ipv6_tuple_t *tuple, struct sock *skp)
+{
+  u32 net_ns_inum = 0;
+  unsigned __int128 saddr = 0, daddr = 0;
+  struct inet_sock *sockp = (struct inet_sock *)skp;
+  u16 sport = sockp->inet_sport;
+  u16 dport = skp->__sk_common.skc_dport;
+#ifdef CONFIG_NET_NS
+  net_ns_inum = skp->__sk_common.skc_net.net->ns.inum;
+#endif
+  bpf_probe_read(&saddr, sizeof(saddr),
+                 skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+  bpf_probe_read(&daddr, sizeof(daddr),
+                 skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+
+  ##FILTER_NETNS##
+
+  tuple->saddr = saddr;
+  tuple->daddr = daddr;
+  tuple->sport = sport;
+  tuple->dport = dport;
+  tuple->netns = net_ns_inum;
+
+  // if addresses or ports are 0, ignore
+  if (saddr == 0 || daddr == 0 || sport == 0 || dport == 0) {
+      return 0;
+  }
+
+  return 1;
+}
+
+static bool check_family(struct sock *sk, u16 expected_family) {
+  u64 zero = 0;
+  u16 family = sk->__sk_common.skc_family;
+  return family == expected_family;
+}
+
+int trace_connect_v4_entry(struct pt_regs *ctx, struct sock *sk)
+{
+  u64 pid = bpf_get_current_pid_tgid();
+
+  ##FILTER_PID##
+
+  // stash the sock ptr for lookup on return
+  connectsock.update(&pid, &sk);
+
+  return 0;
+}
+
+int trace_connect_v4_return(struct pt_regs *ctx)
+{
+  int ret = PT_REGS_RC(ctx);
+  u64 pid = bpf_get_current_pid_tgid();
+
+  struct sock **skpp;
+  skpp = connectsock.lookup(&pid);
+  if (skpp == 0) {
+      return 0;       // missed entry
+  }
+
+  connectsock.delete(&pid);
+
+  if (ret != 0) {
+      // failed to send SYN packet, may not have populated
+      // socket __sk_common.{skc_rcv_saddr, ...}
+      return 0;
+  }
+
+  // pull in details
+  struct sock *skp = *skpp;
+  struct ipv4_tuple_t t = { };
+  if (!read_ipv4_tuple(&t, skp)) {
+      return 0;
+  }
+
+  struct pid_comm_t p = { };
+  p.pid = pid;
+  bpf_get_current_comm(&p.comm, sizeof(p.comm));
+
+  tuplepid_ipv4.update(&t, &p);
+
+  return 0;
+}
+
+int trace_connect_v6_entry(struct pt_regs *ctx, struct sock *sk)
+{
+  u64 pid = bpf_get_current_pid_tgid();
+
+  ##FILTER_PID##
+
+  // stash the sock ptr for lookup on return
+  connectsock.update(&pid, &sk);
+
+  return 0;
+}
+
+int trace_connect_v6_return(struct pt_regs *ctx)
+{
+  int ret = PT_REGS_RC(ctx);
+  u64 pid = bpf_get_current_pid_tgid();
+
+  struct sock **skpp;
+  skpp = connectsock.lookup(&pid);
+  if (skpp == 0) {
+      return 0;       // missed entry
+  }
+
+  connectsock.delete(&pid);
+
+  if (ret != 0) {
+      // failed to send SYN packet, may not have populated
+      // socket __sk_common.{skc_rcv_saddr, ...}
+      return 0;
+  }
+
+  // pull in details
+  struct sock *skp = *skpp;
+  struct ipv6_tuple_t t = { };
+  if (!read_ipv6_tuple(&t, skp)) {
+      return 0;
+  }
+
+  struct pid_comm_t p = { };
+  p.pid = pid;
+  bpf_get_current_comm(&p.comm, sizeof(p.comm));
+
+  tuplepid_ipv6.update(&t, &p);
+
+  return 0;
+}
+
+int trace_tcp_set_state_entry(struct pt_regs *ctx, struct sock *skp, int state)
+{
+  if (state != TCP_ESTABLISHED && state != TCP_CLOSE) {
+      return 0;
+  }
+
+  u8 ipver = 0;
+  if (check_family(skp, AF_INET)) {
+      ipver = 4;
+      struct ipv4_tuple_t t = { };
+      if (!read_ipv4_tuple(&t, skp)) {
+          return 0;
+      }
+
+      if (state == TCP_CLOSE) {
+          tuplepid_ipv4.delete(&t);
+          return 0;
+      }
+
+      struct pid_comm_t *p;
+      p = tuplepid_ipv4.lookup(&t);
+      if (p == 0) {
+          return 0;       // missed entry
+      }
+
+      struct tcp_ipv4_event_t evt4 = { };
+      evt4.ts_ns = bpf_ktime_get_ns();
+      evt4.type = TCP_EVENT_TYPE_CONNECT;
+      evt4.pid = p->pid >> 32;
+      evt4.ip = ipver;
+      evt4.saddr = t.saddr;
+      evt4.daddr = t.daddr;
+      evt4.sport = ntohs(t.sport);
+      evt4.dport = ntohs(t.dport);
+      evt4.netns = t.netns;
+
+      int i;
+      for (i = 0; i < TASK_COMM_LEN; i++) {
+          evt4.comm[i] = p->comm[i];
+      }
+
+      tcp_ipv4_event.perf_submit(ctx, &evt4, sizeof(evt4));
+      tuplepid_ipv4.delete(&t);
+  } else if (check_family(skp, AF_INET6)) {
+      ipver = 6;
+      struct ipv6_tuple_t t = { };
+      if (!read_ipv6_tuple(&t, skp)) {
+          return 0;
+      }
+
+      if (state == TCP_CLOSE) {
+          tuplepid_ipv6.delete(&t);
+          return 0;
+      }
+
+      struct pid_comm_t *p;
+      p = tuplepid_ipv6.lookup(&t);
+      if (p == 0) {
+          return 0;       // missed entry
+      }
+
+      struct tcp_ipv6_event_t evt6 = { };
+      evt6.ts_ns = bpf_ktime_get_ns();
+      evt6.type = TCP_EVENT_TYPE_CONNECT;
+      evt6.pid = p->pid >> 32;
+      evt6.ip = ipver;
+      evt6.saddr = t.saddr;
+      evt6.daddr = t.daddr;
+      evt6.sport = ntohs(t.sport);
+      evt6.dport = ntohs(t.dport);
+      evt6.netns = t.netns;
+
+      int i;
+      for (i = 0; i < TASK_COMM_LEN; i++) {
+          evt6.comm[i] = p->comm[i];
+      }
+
+      tcp_ipv6_event.perf_submit(ctx, &evt6, sizeof(evt6));
+      tuplepid_ipv6.delete(&t);
+  }
+  // else drop
+
+  return 0;
+}
+
+int trace_close_entry(struct pt_regs *ctx, struct sock *skp)
+{
+  u64 pid = bpf_get_current_pid_tgid();
+
+  ##FILTER_PID##
+
+  u8 oldstate = skp->sk_state;
+  // Don't generate close events for connections that were never
+  // established in the first place.
+  if (oldstate == TCP_SYN_SENT ||
+      oldstate == TCP_SYN_RECV ||
+      oldstate == TCP_NEW_SYN_RECV)
+      return 0;
+
+  u8 ipver = 0;
+  if (check_family(skp, AF_INET)) {
+      ipver = 4;
+      struct ipv4_tuple_t t = { };
+      if (!read_ipv4_tuple(&t, skp)) {
+          return 0;
+      }
+
+      struct tcp_ipv4_event_t evt4 = { };
+      evt4.ts_ns = bpf_ktime_get_ns();
+      evt4.type = TCP_EVENT_TYPE_CLOSE;
+      evt4.pid = pid >> 32;
+      evt4.ip = ipver;
+      evt4.saddr = t.saddr;
+      evt4.daddr = t.daddr;
+      evt4.sport = ntohs(t.sport);
+      evt4.dport = ntohs(t.dport);
+      evt4.netns = t.netns;
+      bpf_get_current_comm(&evt4.comm, sizeof(evt4.comm));
+
+      tcp_ipv4_event.perf_submit(ctx, &evt4, sizeof(evt4));
+  } else if (check_family(skp, AF_INET6)) {
+      ipver = 6;
+      struct ipv6_tuple_t t = { };
+      if (!read_ipv6_tuple(&t, skp)) {
+          return 0;
+      }
+
+      struct tcp_ipv6_event_t evt6 = { };
+      evt6.ts_ns = bpf_ktime_get_ns();
+      evt6.type = TCP_EVENT_TYPE_CLOSE;
+      evt6.pid = pid >> 32;
+      evt6.ip = ipver;
+      evt6.saddr = t.saddr;
+      evt6.daddr = t.daddr;
+      evt6.sport = ntohs(t.sport);
+      evt6.dport = ntohs(t.dport);
+      evt6.netns = t.netns;
+      bpf_get_current_comm(&evt6.comm, sizeof(evt6.comm));
+
+      tcp_ipv6_event.perf_submit(ctx, &evt6, sizeof(evt6));
+  }
+  // else drop
+
+  return 0;
+};
+
+int trace_accept_return(struct pt_regs *ctx)
+{
+  struct sock *newsk = (struct sock *)PT_REGS_RC(ctx);
+  u64 pid = bpf_get_current_pid_tgid();
+
+  ##FILTER_PID##
+
+  if (newsk == NULL) {
+      return 0;
+  }
+
+  // pull in details
+  u16 lport = 0, dport = 0;
+  u32 net_ns_inum = 0;
+  u8 ipver = 0;
+
+  dport = newsk->__sk_common.skc_dport;
+  lport = newsk->__sk_common.skc_num;
+
+  // Get network namespace id, if kernel supports it
+#ifdef CONFIG_NET_NS
+  net_ns_inum = newsk->__sk_common.skc_net.net->ns.inum;
+#endif
+
+  ##FILTER_NETNS##
+
+  if (check_family(newsk, AF_INET)) {
+      ipver = 4;
+
+      struct tcp_ipv4_event_t evt4 = { 0 };
+
+      evt4.ts_ns = bpf_ktime_get_ns();
+      evt4.type = TCP_EVENT_TYPE_ACCEPT;
+      evt4.netns = net_ns_inum;
+      evt4.pid = pid >> 32;
+      evt4.ip = ipver;
+
+      evt4.saddr = newsk->__sk_common.skc_rcv_saddr;
+      evt4.daddr = newsk->__sk_common.skc_daddr;
+
+      evt4.sport = lport;
+      evt4.dport = ntohs(dport);
+      bpf_get_current_comm(&evt4.comm, sizeof(evt4.comm));
+
+      // do not send event if IP address is 0.0.0.0 or port is 0
+      if (evt4.saddr != 0 && evt4.daddr != 0 &&
+          evt4.sport != 0 && evt4.dport != 0) {
+          tcp_ipv4_event.perf_submit(ctx, &evt4, sizeof(evt4));
+      }
+  } else if (check_family(newsk, AF_INET6)) {
+      ipver = 6;
+
+      struct tcp_ipv6_event_t evt6 = { 0 };
+
+      evt6.ts_ns = bpf_ktime_get_ns();
+      evt6.type = TCP_EVENT_TYPE_ACCEPT;
+      evt6.netns = net_ns_inum;
+      evt6.pid = pid >> 32;
+      evt6.ip = ipver;
+
+      bpf_probe_read(&evt6.saddr, sizeof(evt6.saddr),
+                     newsk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+      bpf_probe_read(&evt6.daddr, sizeof(evt6.daddr),
+                     newsk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+
+      evt6.sport = lport;
+      evt6.dport = ntohs(dport);
+      bpf_get_current_comm(&evt6.comm, sizeof(evt6.comm));
+
+      // do not send event if IP address is 0.0.0.0 or port is 0
+      if (evt6.saddr != 0 && evt6.daddr != 0 &&
+          evt6.sport != 0 && evt6.dport != 0) {
+          tcp_ipv6_event.perf_submit(ctx, &evt6, sizeof(evt6));
+      }
+  }
+  // else drop
+
+  return 0;
+}
+"""
+
+TASK_COMM_LEN = 16   # linux/sched.h
+
+
+class TCPIPV4Evt(ctypes.Structure):
+    _fields_ = [
+            ("ts_ns", ctypes.c_ulonglong),
+            ("type", ctypes.c_uint),
+            ("pid", ctypes.c_uint),
+            ("comm", ctypes.c_char * TASK_COMM_LEN),
+            ("ip", ctypes.c_ubyte),
+            ("saddr", ctypes.c_uint),
+            ("daddr", ctypes.c_uint),
+            ("sport", ctypes.c_ushort),
+            ("dport", ctypes.c_ushort),
+            ("netns", ctypes.c_uint)
+    ]
+
+
+class TCPIPV6Evt(ctypes.Structure):
+    _fields_ = [
+            ("ts_ns", ctypes.c_ulonglong),
+            ("type", ctypes.c_uint),
+            ("pid", ctypes.c_uint),
+            ("comm", ctypes.c_char * TASK_COMM_LEN),
+            ("ip", ctypes.c_ubyte),
+            ("saddr", (ctypes.c_ulong * 2)),
+            ("daddr", (ctypes.c_ulong * 2)),
+            ("sport", ctypes.c_ushort),
+            ("dport", ctypes.c_ushort),
+            ("netns", ctypes.c_uint)
+    ]
+
+
+verbose_types = {"C": "connect", "A": "accept",
+                 "X": "close", "U": "unknown"}
+
+
+def print_ipv4_event(cpu, data, size):
+    event = ctypes.cast(data, ctypes.POINTER(TCPIPV4Evt)).contents
+    global start_ts
+
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_ns
+        if args.verbose:
+            print("%-14d" % (event.ts_ns - start_ts), end="")
+        else:
+            print("%-9.3f" % ((event.ts_ns - start_ts) / 1000000000.0), end="")
+    if event.type == 1:
+        type_str = "C"
+    elif event.type == 2:
+        type_str = "A"
+    elif event.type == 3:
+        type_str = "X"
+    else:
+        type_str = "U"
+
+    if args.verbose:
+        print("%-12s " % (verbose_types[type_str]), end="")
+    else:
+        print("%-2s " % (type_str), end="")
+
+    print("%-6d %-16s %-2d %-16s %-16s %-6d %-6d" %
+          (event.pid, event.comm.decode('utf-8', 'replace'),
+           event.ip,
+           inet_ntop(AF_INET, pack("I", event.saddr)),
+           inet_ntop(AF_INET, pack("I", event.daddr)),
+           event.sport,
+           event.dport), end="")
+    if args.verbose and not args.netns:
+        print(" %-8d" % event.netns)
+    else:
+        print()
+
+
+def print_ipv6_event(cpu, data, size):
+    event = ctypes.cast(data, ctypes.POINTER(TCPIPV6Evt)).contents
+    global start_ts
+    if args.timestamp:
+        if start_ts == 0:
+            start_ts = event.ts_ns
+        if args.verbose:
+            print("%-14d" % (event.ts_ns - start_ts), end="")
+        else:
+            print("%-9.3f" % ((event.ts_ns - start_ts) / 1000000000.0), end="")
+    if event.type == 1:
+        type_str = "C"
+    elif event.type == 2:
+        type_str = "A"
+    elif event.type == 3:
+        type_str = "X"
+    else:
+        type_str = "U"
+
+    if args.verbose:
+        print("%-12s " % (verbose_types[type_str]), end="")
+    else:
+        print("%-2s " % (type_str), end="")
+
+    print("%-6d %-16s %-2d %-16s %-16s %-6d %-6d" %
+          (event.pid, event.comm.decode('utf-8', 'replace'),
+           event.ip,
+           "[" + inet_ntop(AF_INET6, event.saddr) + "]",
+           "[" + inet_ntop(AF_INET6, event.daddr) + "]",
+           event.sport,
+           event.dport), end="")
+    if args.verbose and not args.netns:
+        print(" %-8d" % event.netns)
+    else:
+        print()
+
+
+pid_filter = ""
+netns_filter = ""
+
+if args.pid:
+    pid_filter = 'if (pid >> 32 != %d) { return 0; }' % args.pid
+if args.netns:
+    netns_filter = 'if (net_ns_inum != %d) { return 0; }' % args.netns
+
+bpf_text = bpf_text.replace('##FILTER_PID##', pid_filter)
+bpf_text = bpf_text.replace('##FILTER_NETNS##', netns_filter)
+
+if args.ebpf:
+    print(bpf_text)
+    exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_entry")
+b.attach_kretprobe(event="tcp_v4_connect", fn_name="trace_connect_v4_return")
+b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_entry")
+b.attach_kretprobe(event="tcp_v6_connect", fn_name="trace_connect_v6_return")
+b.attach_kprobe(event="tcp_set_state", fn_name="trace_tcp_set_state_entry")
+b.attach_kprobe(event="tcp_close", fn_name="trace_close_entry")
+b.attach_kretprobe(event="inet_csk_accept", fn_name="trace_accept_return")
+
+print("Tracing TCP established connections. Ctrl-C to end.")
+
+# header
+if args.verbose:
+    if args.timestamp:
+        print("%-14s" % ("TIME(ns)"), end="")
+    print("%-12s %-6s %-16s %-2s %-16s %-16s %-6s %-7s" % ("TYPE",
+          "PID", "COMM", "IP", "SADDR", "DADDR", "SPORT", "DPORT"), end="")
+    if not args.netns:
+        print("%-8s" % "NETNS", end="")
+    print()
+else:
+    if args.timestamp:
+        print("%-9s" % ("TIME(s)"), end="")
+    print("%-2s %-6s %-16s %-2s %-16s %-16s %-6s %-6s" %
+          ("T", "PID", "COMM", "IP", "SADDR", "DADDR", "SPORT", "DPORT"))
+
+start_ts = 0
+
+def inet_ntoa(addr):
+    dq = ''
+    for i in range(0, 4):
+        dq = dq + str(addr & 0xff)
+        if (i != 3):
+            dq = dq + '.'
+        addr = addr >> 8
+    return dq
+
+
+b["tcp_ipv4_event"].open_perf_buffer(print_ipv4_event)
+b["tcp_ipv6_event"].open_perf_buffer(print_ipv6_event)
+while True:
+    b.perf_buffer_poll()
diff --git a/tools/tcptracer_example.txt b/tools/tcptracer_example.txt
new file mode 100644
index 0000000..f782d91
--- /dev/null
+++ b/tools/tcptracer_example.txt
@@ -0,0 +1,37 @@
+Demonstrations of tcptracer, the Linux eBPF/bcc version.
+
+
+This tool traces the kernel functions performing TCP connections (e.g., via
+connect() or accept() syscalls) and closing them (explicitly or if the process
+dies). Some example output (IP addresses are fake):
+
+```
+# ./tcptracer
+Tracing TCP established connections. Ctrl-C to end.
+T  PID    COMM             IP SADDR            DADDR            SPORT  DPORT
+C  28943  telnet           4  192.168.1.2      192.168.1.1      59306  23
+C  28818  curl             6  [::1]            [::1]            55758  80
+X  28943  telnet           4  192.168.1.2      192.168.1.1      59306  23
+A  28817  nc               6  [::1]            [::1]            80     55758
+X  28818  curl             6  [::1]            [::1]            55758  80
+X  28817  nc               6  [::1]            [::1]            80     55758
+A  28978  nc               4  10.202.210.1     10.202.109.12    8080   59160
+X  28978  nc               4  10.202.210.1     10.202.109.12    8080   59160
+```
+
+This output shows three connections, one outgoing from a "telnet" process, one
+outgoing from "curl" to a local netcat, and one incoming received by the "nc"
+process. The output details show the kind of event (C for connection, X for
+close and A for accept), PID, IP version, source address, destination address,
+source port and destination port.
+
+The -t option prints a timestamp column:
+
+```
+# ./tcptracer -t
+Tracing TCP established connections. Ctrl-C to end.
+TIME(s)  T  PID    COMM             IP SADDR            DADDR            SPORT  DPORT
+0.000    C  31002  telnet           4  192.168.1.2      192.168.1.1      42590  23
+3.546    C    748  curl             6  [::1]            [::1]            42592  80
+4.294    X  31002  telnet           4  192.168.1.2      192.168.1.1      42590  23
+```
diff --git a/tools/tplist.py b/tools/tplist.py
new file mode 100755
index 0000000..db4b68b
--- /dev/null
+++ b/tools/tplist.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#
+# tplist    Display kernel tracepoints or USDT probes and their formats.
+#
+# USAGE:    tplist [-p PID] [-l LIB] [-v] [filter]
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+# Copyright (C) 2016 Sasha Goldshtein.
+
+import argparse
+import fnmatch
+import os
+import re
+import sys
+
+from bcc import USDT
+
+trace_root = "/sys/kernel/debug/tracing"
+event_root = os.path.join(trace_root, "events")
+
+parser = argparse.ArgumentParser(
+        description="Display kernel tracepoints or USDT probes " +
+                    "and their formats.",
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-p", "--pid", type=int, default=None,
+        help="List USDT probes in the specified process")
+parser.add_argument("-l", "--lib", default="",
+        help="List USDT probes in the specified library or executable")
+parser.add_argument("-v", dest="verbosity", action="count", default=0,
+        help="Increase verbosity level (print variables, arguments, etc.)")
+parser.add_argument(dest="filter", nargs="?",
+        help="A filter that specifies which probes/tracepoints to print")
+args = parser.parse_args()
+
+def print_tpoint_format(category, event):
+        fmt = open(os.path.join(event_root, category, event, "format")) \
+              .readlines()
+        for line in fmt:
+                match = re.search(r'field:([^;]*);', line)
+                if match is None:
+                        continue
+                parts = match.group(1).split()
+                field_name = parts[-1:][0]
+                field_type = " ".join(parts[:-1])
+                if field_name.startswith("common_"):
+                        continue
+                print("    %s %s;" % (field_type, field_name))
+
+def print_tpoint(category, event):
+        tpoint = "%s:%s" % (category, event)
+        if not args.filter or fnmatch.fnmatch(tpoint, args.filter):
+                print(tpoint)
+                if args.verbosity > 0:
+                        print_tpoint_format(category, event)
+
+def print_tracepoints():
+        for category in os.listdir(event_root):
+                cat_dir = os.path.join(event_root, category)
+                if not os.path.isdir(cat_dir):
+                        continue
+                for event in os.listdir(cat_dir):
+                        evt_dir = os.path.join(cat_dir, event)
+                        if os.path.isdir(evt_dir):
+                                print_tpoint(category, event)
+
+def print_usdt_argument_details(location):
+        for idx in range(0, location.num_arguments):
+                arg = location.get_argument(idx)
+                print("    argument #%d %s" % (idx + 1, arg))
+
+def print_usdt_details(probe):
+        if args.verbosity > 0:
+                print(probe)
+                if args.verbosity > 1:
+                        for idx in range(0, probe.num_locations):
+                                loc = probe.get_location(idx)
+                                print("  location #%d %s" % (idx + 1, loc))
+                                print_usdt_argument_details(loc)
+                else:
+                        print("  %d location(s)" % probe.num_locations)
+                        print("  %d argument(s)" % probe.num_arguments)
+        else:
+                print("%s %s:%s" %
+                      (probe.bin_path, probe.provider, probe.name))
+
+def print_usdt(pid, lib):
+        reader = USDT(path=lib, pid=pid)
+        probes_seen = []
+        for probe in reader.enumerate_probes():
+                probe_name = probe.short_name()
+                if not args.filter or fnmatch.fnmatch(probe_name, args.filter):
+                        if probe_name in probes_seen:
+                                continue
+                        probes_seen.append(probe_name)
+                        print_usdt_details(probe)
+
+if __name__ == "__main__":
+        try:
+                if args.pid or args.lib != "":
+                        print_usdt(args.pid, args.lib)
+                else:
+                        print_tracepoints()
+        except:
+                if sys.exc_info()[0] is not SystemExit:
+                        print(sys.exc_info()[1])
diff --git a/tools/tplist_example.txt b/tools/tplist_example.txt
new file mode 100644
index 0000000..b541bcf
--- /dev/null
+++ b/tools/tplist_example.txt
@@ -0,0 +1,131 @@
+Demonstrations of tplist.
+
+
+tplist displays kernel tracepoints and USDT probes, including their
+format. It can be used to discover probe points for use with the trace
+and argdist tools. Kernel tracepoints are scattered around the kernel
+and provide valuable static tracing on block and network I/O, scheduling,
+power events, and many other subjects. USDT probes are placed in libraries
+(such as libc) and executables (such as node) and provide static tracing
+information that can (optionally) be turned on and off at runtime.
+
+For example, suppose you want to discover which USDT probes a particular
+executable contains. Just run tplist on that executable (or library):
+
+$ tplist -l basic_usdt
+/home/vagrant/basic_usdt basic_usdt:start_main
+/home/vagrant/basic_usdt basic_usdt:loop_iter
+/home/vagrant/basic_usdt basic_usdt:end_main
+
+The loop_iter probe sounds interesting. How many arguments are available?
+
+$ tplist '*loop_iter' -l basic_usdt -v
+basic_usdt:loop_iter [sema 0x601036]
+  2 location(s)
+  2 argument(s)
+
+This output indicates that the loop_iter probe is used in two locations
+in the basic_usdt executable, and that it has two arguments. Fortunately,
+the argdist and trace tools understand the probe format and can print out
+the arguments automatically -- you can refer to them as arg1, arg2, and
+so on.
+
+Try to explore with some common libraries on your system and see if they
+contain USDT probes. Here are two examples you might find interesting:
+
+$ tplist -l pthread     # list probes in libpthread
+/lib64/libpthread.so.0 libpthread:pthread_start
+/lib64/libpthread.so.0 libpthread:pthread_create
+/lib64/libpthread.so.0 libpthread:pthread_join
+/lib64/libpthread.so.0 libpthread:pthread_join_ret
+/lib64/libpthread.so.0 libpthread:mutex_init
+... more output truncated
+
+$ tplist -l c           # list probes in libc
+/lib64/libc.so.6 libc:setjmp
+/lib64/libc.so.6 libc:longjmp
+/lib64/libc.so.6 libc:longjmp_target
+/lib64/libc.so.6 libc:memory_arena_reuse_free_list
+/lib64/libc.so.6 libc:memory_heap_new
+... more output truncated
+
+tplist also understands kernel tracepoints, and can list their format
+as well. For example, let's look for all block I/O-related tracepoints:
+
+# tplist 'block*'
+block:block_touch_buffer
+block:block_dirty_buffer
+block:block_rq_abort
+block:block_rq_requeue
+block:block_rq_complete
+block:block_rq_insert
+block:block_rq_issue
+block:block_bio_bounce
+block:block_bio_complete
+block:block_bio_backmerge
+block:block_bio_frontmerge
+block:block_bio_queue
+block:block_getrq
+block:block_sleeprq
+block:block_plug
+block:block_unplug
+block:block_split
+block:block_bio_remap
+block:block_rq_remap
+
+The block:block_rq_complete tracepoint sounds interesting. Let's print
+its format to see what we can trace with argdist and trace:
+
+$ tplist -v block:block_rq_complete
+block:block_rq_complete
+    dev_t dev;
+    sector_t sector;
+    unsigned int nr_sector;
+    int errors;
+    char rwbs[8];
+
+The dev, sector, nr_sector, etc. variables can now all be used in probes
+you specify with argdist or trace.
+
+
+For debugging USDT probes, it is sometimes useful to see the exact locations
+and arguments of the probes, including the registers or global variables
+from which their values come. In super-verbose mode, tplist will print
+this information (note the -vv):
+
+$ tplist -vv -l c *alloc*
+libc:memory_malloc_retry [sema 0x0]
+  location #0 /lib64/libc.so.6 0x835c0
+    argument #0 8 unsigned bytes @ bp
+  location #1 /lib64/libc.so.6 0x83778
+    argument #0 8 unsigned bytes @ bp
+  location #2 /lib64/libc.so.6 0x85a50
+    argument #0 8 unsigned bytes @ bp
+libc:memory_realloc_retry [sema 0x0]
+  location #0 /lib64/libc.so.6 0x84b90
+    argument #0 8 unsigned bytes @ r13
+    argument #1 8 unsigned bytes @ bp
+  location #1 /lib64/libc.so.6 0x85cf0
+    argument #0 8 unsigned bytes @ r13
+    argument #1 8 unsigned bytes @ bp
+libc:memory_calloc_retry [sema 0x0]
+  location #0 /lib64/libc.so.6 0x850f0
+    argument #0 8 unsigned bytes @ bp
+
+
+USAGE message:
+
+$ tplist -h
+usage: tplist.py [-h] [-p PID] [-l LIB] [-v] [filter]
+
+Display kernel tracepoints or USDT probes and their formats.
+
+positional arguments:
+  filter             A filter that specifies which probes/tracepoints to print
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -p PID, --pid PID  List USDT probes in the specified process
+  -l LIB, --lib LIB  List USDT probes in the specified library or executable
+  -v                 Increase verbosity level (print variables, arguments, etc.)
+
diff --git a/tools/trace.py b/tools/trace.py
new file mode 100755
index 0000000..2233305
--- /dev/null
+++ b/tools/trace.py
@@ -0,0 +1,797 @@
+#!/usr/bin/env python
+#
+# trace         Trace a function and print a trace message based on its
+#               parameters, with an optional filter.
+#
+# usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S]
+#              [-M MAX_EVENTS] [-T] [-t] [-K] [-U] [-a] [-I header]
+#              probe [probe ...]
+#
+# Licensed under the Apache License, Version 2.0 (the "License")
+# Copyright (C) 2016 Sasha Goldshtein.
+
+from __future__ import print_function
+from bcc import BPF, USDT
+from functools import partial
+from time import sleep, strftime
+import argparse
+import re
+import ctypes as ct
+import os
+import traceback
+import sys
+
+class Probe(object):
+        probe_count = 0
+        streq_index = 0
+        max_events = None
+        event_count = 0
+        first_ts = 0
+        print_time = False
+        use_localtime = True
+        time_field = False
+        print_cpu = False
+        print_address = False
+        tgid = -1
+        pid = -1
+        page_cnt = None
+
+        @classmethod
+        def configure(cls, args):
+                cls.max_events = args.max_events
+                cls.print_time = args.timestamp or args.time
+                cls.use_localtime = not args.timestamp
+                cls.time_field = cls.print_time and (not cls.use_localtime)
+                cls.print_cpu = args.print_cpu
+                cls.print_address = args.address
+                cls.first_ts = BPF.monotonic_time()
+                cls.tgid = args.tgid or -1
+                cls.pid = args.pid or -1
+                cls.page_cnt = args.buffer_pages
+                cls.bin_cmp = args.bin_cmp
+
+        def __init__(self, probe, string_size, kernel_stack, user_stack):
+                self.usdt = None
+                self.streq_functions = ""
+                self.raw_probe = probe
+                self.string_size = string_size
+                self.kernel_stack = kernel_stack
+                self.user_stack = user_stack
+                Probe.probe_count += 1
+                self._parse_probe()
+                self.probe_num = Probe.probe_count
+                self.probe_name = "probe_%s_%d" % \
+                                (self._display_function(), self.probe_num)
+                self.probe_name = re.sub(r'[^A-Za-z0-9_]', '_',
+                                         self.probe_name)
+
+                # compiler can generate proper codes for function
+                # signatures with "syscall__" prefix
+                if self.is_syscall_kprobe:
+                        self.probe_name = "syscall__" + self.probe_name[6:]
+
+        def __str__(self):
+                return "%s:%s:%s FLT=%s ACT=%s/%s" % (self.probe_type,
+                        self.library, self._display_function(), self.filter,
+                        self.types, self.values)
+
+        def is_default_action(self):
+                return self.python_format == ""
+
+        def _bail(self, error):
+                raise ValueError("error in probe '%s': %s" %
+                                 (self.raw_probe, error))
+
+        def _parse_probe(self):
+                text = self.raw_probe
+
+                # There might be a function signature preceding the actual
+                # filter/print part, or not. Find the probe specifier first --
+                # it ends with either a space or an open paren ( for the
+                # function signature part.
+                #                                          opt. signature
+                #                               probespec       |      rest
+                #                               ---------  ----------   --
+                (spec, sig, rest) = re.match(r'([^ \t\(]+)(\([^\(]*\))?(.*)',
+                                             text).groups()
+
+                self._parse_spec(spec)
+                # Remove the parens
+                self.signature = sig[1:-1] if sig else None
+                if self.signature and self.probe_type in ['u', 't']:
+                        self._bail("USDT and tracepoint probes can't have " +
+                                   "a function signature; use arg1, arg2, " +
+                                   "... instead")
+
+                text = rest.lstrip()
+                # If we now have a (, wait for the balanced closing ) and that
+                # will be the predicate
+                self.filter = None
+                if len(text) > 0 and text[0] == "(":
+                        balance = 1
+                        for i in range(1, len(text)):
+                                if text[i] == "(":
+                                        balance += 1
+                                if text[i] == ")":
+                                        balance -= 1
+                                if balance == 0:
+                                        self._parse_filter(text[:i + 1])
+                                        text = text[i + 1:]
+                                        break
+                        if self.filter is None:
+                                self._bail("unmatched end of predicate")
+
+                if self.filter is None:
+                        self.filter = "1"
+
+                # The remainder of the text is the printf action
+                self._parse_action(text.lstrip())
+
+        def _parse_spec(self, spec):
+                parts = spec.split(":")
+                # Two special cases: 'func' means 'p::func', 'lib:func' means
+                # 'p:lib:func'. Other combinations need to provide an empty
+                # value between delimiters, e.g. 'r::func' for a kretprobe on
+                # the function func.
+                if len(parts) == 1:
+                        parts = ["p", "", parts[0]]
+                elif len(parts) == 2:
+                        parts = ["p", parts[0], parts[1]]
+                if len(parts[0]) == 0:
+                        self.probe_type = "p"
+                elif parts[0] in ["p", "r", "t", "u"]:
+                        self.probe_type = parts[0]
+                else:
+                        self._bail("probe type must be '', 'p', 't', 'r', " +
+                                   "or 'u', but got '%s'" % parts[0])
+                if self.probe_type == "t":
+                        self.tp_category = parts[1]
+                        self.tp_event = parts[2]
+                        self.library = ""       # kernel
+                        self.function = ""      # from TRACEPOINT_PROBE
+                elif self.probe_type == "u":
+                        self.library = ':'.join(parts[1:-1])
+                        self.usdt_name = parts[-1]
+                        self.function = ""      # no function, just address
+                        # We will discover the USDT provider by matching on
+                        # the USDT name in the specified library
+                        self._find_usdt_probe()
+                else:
+                        self.library = ':'.join(parts[1:-1])
+                        self.function = parts[-1]
+
+                # only x64 syscalls needs checking, no other syscall wrapper yet.
+                self.is_syscall_kprobe = False
+                if self.probe_type == "p" and len(self.library) == 0 and \
+                   self.function[:10] == "__x64_sys_":
+                        self.is_syscall_kprobe = True
+
+        def _find_usdt_probe(self):
+                target = Probe.pid if Probe.pid and Probe.pid != -1 \
+                                   else Probe.tgid
+                self.usdt = USDT(path=self.library, pid=target)
+                for probe in self.usdt.enumerate_probes():
+                        if probe.name == self.usdt_name.encode('ascii'):
+                                return  # Found it, will enable later
+                self._bail("unrecognized USDT probe %s" % self.usdt_name)
+
+        def _parse_filter(self, filt):
+                self.filter = self._rewrite_expr(filt)
+
+        def _parse_types(self, fmt):
+                for match in re.finditer(
+                            r'[^%]%(s|u|d|lu|llu|ld|lld|hu|hd|x|lx|llx|c|K|U)', fmt):
+                        self.types.append(match.group(1))
+                fmt = re.sub(r'([^%]%)(u|d|lu|llu|ld|lld|hu|hd)', r'\1d', fmt)
+                fmt = re.sub(r'([^%]%)(x|lx|llx)', r'\1x', fmt)
+                fmt = re.sub('%K|%U', '%s', fmt)
+                self.python_format = fmt.strip('"')
+
+        def _parse_action(self, action):
+                self.values = []
+                self.types = []
+                self.python_format = ""
+                if len(action) == 0:
+                        return
+
+                action = action.strip()
+                match = re.search(r'(\".*?\"),?(.*)', action)
+                if match is None:
+                        self._bail("expected format string in \"s")
+
+                self.raw_format = match.group(1)
+                self._parse_types(self.raw_format)
+                for part in re.split('(?<!"),', match.group(2)):
+                        part = self._rewrite_expr(part)
+                        if len(part) > 0:
+                                self.values.append(part)
+
+        aliases_arg = {
+                "arg1": "PT_REGS_PARM1(ctx)",
+                "arg2": "PT_REGS_PARM2(ctx)",
+                "arg3": "PT_REGS_PARM3(ctx)",
+                "arg4": "PT_REGS_PARM4(ctx)",
+                "arg5": "PT_REGS_PARM5(ctx)",
+                "arg6": "PT_REGS_PARM6(ctx)",
+        }
+
+        aliases_indarg = {
+                "arg1": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM1(ctx);"
+                        "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM1(_ctx))); _val;})",
+                "arg2": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM2(ctx);"
+                        "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM2(_ctx))); _val;})",
+                "arg3": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM3(ctx);"
+                        "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM3(_ctx))); _val;})",
+                "arg4": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM4(ctx);"
+                        "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM4(_ctx))); _val;})",
+                "arg5": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM5(ctx);"
+                        "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM5(_ctx))); _val;})",
+                "arg6": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM6(ctx);"
+                        "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM6(_ctx))); _val;})",
+        }
+
+        aliases_common = {
+                "retval": "PT_REGS_RC(ctx)",
+                "$uid": "(unsigned)(bpf_get_current_uid_gid() & 0xffffffff)",
+                "$gid": "(unsigned)(bpf_get_current_uid_gid() >> 32)",
+                "$pid": "(unsigned)(bpf_get_current_pid_tgid() & 0xffffffff)",
+                "$tgid": "(unsigned)(bpf_get_current_pid_tgid() >> 32)",
+                "$cpu": "bpf_get_smp_processor_id()",
+                "$task" : "((struct task_struct *)bpf_get_current_task())"
+        }
+
+        def _generate_streq_function(self, string):
+                fname = "streq_%d" % Probe.streq_index
+                Probe.streq_index += 1
+                self.streq_functions += """
+static inline bool %s(char const *ignored, uintptr_t str) {
+        char needle[] = %s;
+        char haystack[sizeof(needle)];
+        bpf_probe_read(&haystack, sizeof(haystack), (void *)str);
+        for (int i = 0; i < sizeof(needle) - 1; ++i) {
+                if (needle[i] != haystack[i]) {
+                        return false;
+                }
+        }
+        return true;
+}
+                """ % (fname, string)
+                return fname
+
+        def _rewrite_expr(self, expr):
+                if self.is_syscall_kprobe:
+                    for alias, replacement in Probe.aliases_indarg.items():
+                        expr = expr.replace(alias, replacement)
+                else:
+                    for alias, replacement in Probe.aliases_arg.items():
+                        # For USDT probes, we replace argN values with the
+                        # actual arguments for that probe obtained using
+                        # bpf_readarg_N macros emitted at BPF construction.
+                        if self.probe_type == "u":
+                                continue
+                        expr = expr.replace(alias, replacement)
+                for alias, replacement in Probe.aliases_common.items():
+                    expr = expr.replace(alias, replacement)
+                if self.bin_cmp:
+                    STRCMP_RE = 'STRCMP\\(\"([^"]+)\\"'
+                else:
+                    STRCMP_RE = 'STRCMP\\(("[^"]+\\")'
+                matches = re.finditer(STRCMP_RE, expr)
+                for match in matches:
+                        string = match.group(1)
+                        fname = self._generate_streq_function(string)
+                        expr = expr.replace("STRCMP", fname, 1)
+                return expr
+
+        p_type = {"u": ct.c_uint, "d": ct.c_int, "lu": ct.c_ulong,
+                  "ld": ct.c_long,
+                  "llu": ct.c_ulonglong, "lld": ct.c_longlong,
+                  "hu": ct.c_ushort, "hd": ct.c_short,
+                  "x": ct.c_uint, "lx": ct.c_ulong, "llx": ct.c_ulonglong,
+                  "c": ct.c_ubyte,
+                  "K": ct.c_ulonglong, "U": ct.c_ulonglong}
+
+        def _generate_python_field_decl(self, idx, fields):
+                field_type = self.types[idx]
+                if field_type == "s":
+                        ptype = ct.c_char * self.string_size
+                else:
+                        ptype = Probe.p_type[field_type]
+                fields.append(("v%d" % idx, ptype))
+
+        def _generate_python_data_decl(self):
+                self.python_struct_name = "%s_%d_Data" % \
+                                (self._display_function(), self.probe_num)
+                fields = []
+                if self.time_field:
+                    fields.append(("timestamp_ns", ct.c_ulonglong))
+                if self.print_cpu:
+                    fields.append(("cpu", ct.c_int))
+                fields.extend([
+                        ("tgid", ct.c_uint),
+                        ("pid", ct.c_uint),
+                        ("comm", ct.c_char * 16)       # TASK_COMM_LEN
+                ])
+                for i in range(0, len(self.types)):
+                        self._generate_python_field_decl(i, fields)
+                if self.kernel_stack:
+                        fields.append(("kernel_stack_id", ct.c_int))
+                if self.user_stack:
+                        fields.append(("user_stack_id", ct.c_int))
+                return type(self.python_struct_name, (ct.Structure,),
+                            dict(_fields_=fields))
+
+        c_type = {"u": "unsigned int", "d": "int",
+                  "lu": "unsigned long", "ld": "long",
+                  "llu": "unsigned long long", "lld": "long long",
+                  "hu": "unsigned short", "hd": "short",
+                  "x": "unsigned int", "lx": "unsigned long",
+                  "llx": "unsigned long long",
+                  "c": "char", "K": "unsigned long long",
+                  "U": "unsigned long long"}
+        fmt_types = c_type.keys()
+
+        def _generate_field_decl(self, idx):
+                field_type = self.types[idx]
+                if field_type == "s":
+                        return "char v%d[%d];\n" % (idx, self.string_size)
+                if field_type in Probe.fmt_types:
+                        return "%s v%d;\n" % (Probe.c_type[field_type], idx)
+                self._bail("unrecognized format specifier %s" % field_type)
+
+        def _generate_data_decl(self):
+                # The BPF program will populate values into the struct
+                # according to the format string, and the Python program will
+                # construct the final display string.
+                self.events_name = "%s_events" % self.probe_name
+                self.struct_name = "%s_data_t" % self.probe_name
+                self.stacks_name = "%s_stacks" % self.probe_name
+                stack_table = "BPF_STACK_TRACE(%s, 1024);" % self.stacks_name \
+                              if (self.kernel_stack or self.user_stack) else ""
+                data_fields = ""
+                for i, field_type in enumerate(self.types):
+                        data_fields += "        " + \
+                                       self._generate_field_decl(i)
+                time_str = "u64 timestamp_ns;" if self.time_field else ""
+                cpu_str = "int cpu;" if self.print_cpu else ""
+                kernel_stack_str = "       int kernel_stack_id;" \
+                                   if self.kernel_stack else ""
+                user_stack_str = "       int user_stack_id;" \
+                                 if self.user_stack else ""
+
+                text = """
+struct %s
+{
+%s
+%s
+        u32 tgid;
+        u32 pid;
+        char comm[TASK_COMM_LEN];
+%s
+%s
+%s
+};
+
+BPF_PERF_OUTPUT(%s);
+%s
+"""
+                return text % (self.struct_name, time_str, cpu_str, data_fields,
+                               kernel_stack_str, user_stack_str,
+                               self.events_name, stack_table)
+
+        def _generate_field_assign(self, idx):
+                field_type = self.types[idx]
+                expr = self.values[idx].strip()
+                text = ""
+                if self.probe_type == "u" and expr[0:3] == "arg":
+                        arg_index = int(expr[3])
+                        arg_ctype = self.usdt.get_probe_arg_ctype(
+                                self.usdt_name, arg_index - 1)
+                        text = ("        %s %s = 0;\n" +
+                                "        bpf_usdt_readarg(%s, ctx, &%s);\n") \
+                                % (arg_ctype, expr, expr[3], expr)
+
+                if field_type == "s":
+                        return text + """
+        if (%s != 0) {
+                void *__tmp = (void *)%s;
+                bpf_probe_read(&__data.v%d, sizeof(__data.v%d), __tmp);
+        }
+                """ % (expr, expr, idx, idx)
+                if field_type in Probe.fmt_types:
+                        return text + "        __data.v%d = (%s)%s;\n" % \
+                                        (idx, Probe.c_type[field_type], expr)
+                self._bail("unrecognized field type %s" % field_type)
+
+        def _generate_usdt_filter_read(self):
+            text = ""
+            if self.probe_type != "u":
+                    return text
+            for arg, _ in Probe.aliases_arg.items():
+                    if not (arg in self.filter):
+                            continue
+                    arg_index = int(arg.replace("arg", ""))
+                    arg_ctype = self.usdt.get_probe_arg_ctype(
+                            self.usdt_name, arg_index - 1)
+                    if not arg_ctype:
+                            self._bail("Unable to determine type of {} "
+                                       "in the filter".format(arg))
+                    text += """
+        {} {}_filter;
+        bpf_usdt_readarg({}, ctx, &{}_filter);
+                    """.format(arg_ctype, arg, arg_index, arg)
+                    self.filter = self.filter.replace(
+                            arg, "{}_filter".format(arg))
+            return text
+
+        def generate_program(self, include_self):
+                data_decl = self._generate_data_decl()
+                if Probe.pid != -1:
+                        pid_filter = """
+        if (__pid != %d) { return 0; }
+                """ % Probe.pid
+                # uprobes can have a built-in tgid filter passed to
+                # attach_uprobe, hence the check here -- for kprobes, we
+                # need to do the tgid test by hand:
+                elif len(self.library) == 0 and Probe.tgid != -1:
+                        pid_filter = """
+        if (__tgid != %d) { return 0; }
+                """ % Probe.tgid
+                elif not include_self:
+                        pid_filter = """
+        if (__tgid == %d) { return 0; }
+                """ % os.getpid()
+                else:
+                        pid_filter = ""
+
+                prefix = ""
+                signature = "struct pt_regs *ctx"
+                if self.signature:
+                        signature += ", " + self.signature
+
+                data_fields = ""
+                for i, expr in enumerate(self.values):
+                        data_fields += self._generate_field_assign(i)
+
+                if self.probe_type == "t":
+                        heading = "TRACEPOINT_PROBE(%s, %s)" % \
+                                  (self.tp_category, self.tp_event)
+                        ctx_name = "args"
+                else:
+                        heading = "int %s(%s)" % (self.probe_name, signature)
+                        ctx_name = "ctx"
+
+                time_str = """
+        __data.timestamp_ns = bpf_ktime_get_ns();""" if self.time_field else ""
+                cpu_str = """
+        __data.cpu = bpf_get_smp_processor_id();""" if self.print_cpu else ""
+                stack_trace = ""
+                if self.user_stack:
+                        stack_trace += """
+        __data.user_stack_id = %s.get_stackid(
+          %s, BPF_F_REUSE_STACKID | BPF_F_USER_STACK
+        );""" % (self.stacks_name, ctx_name)
+                if self.kernel_stack:
+                        stack_trace += """
+        __data.kernel_stack_id = %s.get_stackid(
+          %s, BPF_F_REUSE_STACKID
+        );""" % (self.stacks_name, ctx_name)
+
+                text = heading + """
+{
+        u64 __pid_tgid = bpf_get_current_pid_tgid();
+        u32 __tgid = __pid_tgid >> 32;
+        u32 __pid = __pid_tgid; // implicit cast to u32 for bottom half
+        %s
+        %s
+        %s
+        if (!(%s)) return 0;
+
+        struct %s __data = {0};
+        %s
+        %s
+        __data.tgid = __tgid;
+        __data.pid = __pid;
+        bpf_get_current_comm(&__data.comm, sizeof(__data.comm));
+%s
+%s
+        %s.perf_submit(%s, &__data, sizeof(__data));
+        return 0;
+}
+"""
+                text = text % (pid_filter, prefix,
+                               self._generate_usdt_filter_read(), self.filter,
+                               self.struct_name, time_str, cpu_str, data_fields,
+                               stack_trace, self.events_name, ctx_name)
+
+                return self.streq_functions + data_decl + "\n" + text
+
+        @classmethod
+        def _time_off_str(cls, timestamp_ns):
+                return "%.6f" % (1e-9 * (timestamp_ns - cls.first_ts))
+
+        def _display_function(self):
+                if self.probe_type == 'p' or self.probe_type == 'r':
+                        return self.function
+                elif self.probe_type == 'u':
+                        return self.usdt_name
+                else:   # self.probe_type == 't'
+                        return self.tp_event
+
+        def print_stack(self, bpf, stack_id, tgid):
+            if stack_id < 0:
+                print("        %d" % stack_id)
+                return
+
+            stack = list(bpf.get_table(self.stacks_name).walk(stack_id))
+            for addr in stack:
+                print("        ", end="")
+                if Probe.print_address:
+                    print("%16x " % addr, end="")
+                print("%s" % (bpf.sym(addr, tgid,
+                                     show_module=True, show_offset=True)))
+
+        def _format_message(self, bpf, tgid, values):
+                # Replace each %K with kernel sym and %U with user sym in tgid
+                kernel_placeholders = [i for i, t in enumerate(self.types)
+                                       if t == 'K']
+                user_placeholders = [i for i, t in enumerate(self.types)
+                                     if t == 'U']
+                for kp in kernel_placeholders:
+                        values[kp] = bpf.ksym(values[kp], show_offset=True)
+                for up in user_placeholders:
+                        values[up] = bpf.sym(values[up], tgid,
+                                           show_module=True, show_offset=True)
+                return self.python_format % tuple(values)
+
+        def print_event(self, bpf, cpu, data, size):
+                # Cast as the generated structure type and display
+                # according to the format string in the probe.
+                event = ct.cast(data, ct.POINTER(self.python_struct)).contents
+                values = map(lambda i: getattr(event, "v%d" % i),
+                             range(0, len(self.values)))
+                msg = self._format_message(bpf, event.tgid, values)
+                if Probe.print_time:
+                    time = strftime("%H:%M:%S") if Probe.use_localtime else \
+                           Probe._time_off_str(event.timestamp_ns)
+                    print("%-8s " % time[:8], end="")
+                if Probe.print_cpu:
+                    print("%-3s " % event.cpu, end="")
+                print("%-7d %-7d %-15s %-16s %s" %
+                      (event.tgid, event.pid,
+                       event.comm.decode('utf-8', 'replace'),
+                       self._display_function(), msg))
+
+                if self.kernel_stack:
+                        self.print_stack(bpf, event.kernel_stack_id, -1)
+                if self.user_stack:
+                        self.print_stack(bpf, event.user_stack_id, event.tgid)
+                if self.user_stack or self.kernel_stack:
+                        print("")
+
+                Probe.event_count += 1
+                if Probe.max_events is not None and \
+                   Probe.event_count >= Probe.max_events:
+                        exit()
+
+        def attach(self, bpf, verbose):
+                if len(self.library) == 0:
+                        self._attach_k(bpf)
+                else:
+                        self._attach_u(bpf)
+                self.python_struct = self._generate_python_data_decl()
+                callback = partial(self.print_event, bpf)
+                bpf[self.events_name].open_perf_buffer(callback,
+                        page_cnt=self.page_cnt)
+
+        def _attach_k(self, bpf):
+                if self.probe_type == "r":
+                        bpf.attach_kretprobe(event=self.function,
+                                             fn_name=self.probe_name)
+                elif self.probe_type == "p":
+                        bpf.attach_kprobe(event=self.function,
+                                          fn_name=self.probe_name)
+                # Note that tracepoints don't need an explicit attach
+
+        def _attach_u(self, bpf):
+                libpath = BPF.find_library(self.library)
+                if libpath is None:
+                        # This might be an executable (e.g. 'bash')
+                        libpath = BPF.find_exe(self.library)
+                if libpath is None or len(libpath) == 0:
+                        self._bail("unable to find library %s" % self.library)
+
+                if self.probe_type == "u":
+                        pass    # Was already enabled by the BPF constructor
+                elif self.probe_type == "r":
+                        bpf.attach_uretprobe(name=libpath,
+                                             sym=self.function,
+                                             fn_name=self.probe_name,
+                                             pid=Probe.tgid)
+                else:
+                        bpf.attach_uprobe(name=libpath,
+                                          sym=self.function,
+                                          fn_name=self.probe_name,
+                                          pid=Probe.tgid)
+
+class Tool(object):
+        DEFAULT_PERF_BUFFER_PAGES = 64
+        examples = """
+EXAMPLES:
+
+trace do_sys_open
+        Trace the open syscall and print a default trace message when entered
+trace 'do_sys_open "%s", arg2'
+        Trace the open syscall and print the filename being opened
+trace 'sys_read (arg3 > 20000) "read %d bytes", arg3'
+        Trace the read syscall and print a message for reads >20000 bytes
+trace 'r::do_sys_open "%llx", retval'
+        Trace the return from the open syscall and print the return value
+trace 'c:open (arg2 == 42) "%s %d", arg1, arg2'
+        Trace the open() call from libc only if the flags (arg2) argument is 42
+trace 'c:malloc "size = %d", arg1'
+        Trace malloc calls and print the size being allocated
+trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
+        Trace the write() call from libc to monitor writes to STDOUT
+trace 'r::__kmalloc (retval == 0) "kmalloc failed!"'
+        Trace returns from __kmalloc which returned a null pointer
+trace 'r:c:malloc (retval) "allocated = %x", retval'
+        Trace returns from malloc and print non-NULL allocated buffers
+trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
+        Trace the block_rq_complete kernel tracepoint and print # of tx sectors
+trace 'u:pthread:pthread_create (arg4 != 0)'
+        Trace the USDT probe pthread_create when its 4th argument is non-zero
+trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
+        Trace the nanosleep syscall and print the sleep duration in ns
+trace -I 'linux/fs.h' \\
+      'p::uprobe_register(struct inode *inode) "a_ops = %llx", inode->i_mapping->a_ops'
+        Trace the uprobe_register inode mapping ops, and the symbol can be found
+        in /proc/kallsyms
+trace -I 'kernel/sched/sched.h' \\
+      'p::__account_cfs_rq_runtime(struct cfs_rq *cfs_rq) "%d", cfs_rq->runtime_remaining'
+        Trace the cfs scheduling runqueue remaining runtime. The struct cfs_rq is defined
+        in kernel/sched/sched.h which is in kernel source tree and not in kernel-devel
+        package.  So this command needs to run at the kernel source tree root directory
+        so that the added header file can be found by the compiler.
+trace -I 'net/sock.h' \\
+      'udpv6_sendmsg(struct sock *sk) (sk->sk_dport == 13568)'
+        Trace udpv6 sendmsg calls only if socket's destination port is equal
+        to 53 (DNS; 13568 in big endian order)
+trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users'
+        Trace the number of users accessing the file system of the current task
+"""
+
+        def __init__(self):
+                parser = argparse.ArgumentParser(description="Attach to " +
+                  "functions and print trace messages.",
+                  formatter_class=argparse.RawDescriptionHelpFormatter,
+                  epilog=Tool.examples)
+                parser.add_argument("-b", "--buffer-pages", type=int,
+                  default=Tool.DEFAULT_PERF_BUFFER_PAGES,
+                  help="number of pages to use for perf_events ring buffer "
+                       "(default: %(default)d)")
+                # we'll refer to the userspace concepts of "pid" and "tid" by
+                # their kernel names -- tgid and pid -- inside the script
+                parser.add_argument("-p", "--pid", type=int, metavar="PID",
+                  dest="tgid", help="id of the process to trace (optional)")
+                parser.add_argument("-L", "--tid", type=int, metavar="TID",
+                  dest="pid", help="id of the thread to trace (optional)")
+                parser.add_argument("-v", "--verbose", action="store_true",
+                  help="print resulting BPF program code before executing")
+                parser.add_argument("-Z", "--string-size", type=int,
+                  default=80, help="maximum size to read from strings")
+                parser.add_argument("-S", "--include-self",
+                  action="store_true",
+                  help="do not filter trace's own pid from the trace")
+                parser.add_argument("-M", "--max-events", type=int,
+                  help="number of events to print before quitting")
+                parser.add_argument("-t", "--timestamp", action="store_true",
+                  help="print timestamp column (offset from trace start)")
+                parser.add_argument("-T", "--time", action="store_true",
+                  help="print time column")
+                parser.add_argument("-C", "--print_cpu", action="store_true",
+                  help="print CPU id")
+                parser.add_argument("-B", "--bin_cmp", action="store_true",
+                  help="allow to use STRCMP with binary values")
+                parser.add_argument("-K", "--kernel-stack",
+                  action="store_true", help="output kernel stack trace")
+                parser.add_argument("-U", "--user-stack",
+                  action="store_true", help="output user stack trace")
+                parser.add_argument("-a", "--address", action="store_true",
+                  help="print virtual address in stacks")
+                parser.add_argument(metavar="probe", dest="probes", nargs="+",
+                  help="probe specifier (see examples)")
+                parser.add_argument("-I", "--include", action="append",
+                  metavar="header",
+                  help="additional header files to include in the BPF program "
+                       "as either full path, "
+                       "or relative to current working directory, "
+                       "or relative to default kernel header search path")
+                parser.add_argument("--ebpf", action="store_true",
+                  help=argparse.SUPPRESS)
+                self.args = parser.parse_args()
+                if self.args.tgid and self.args.pid:
+                        parser.error("only one of -p and -L may be specified")
+
+        def _create_probes(self):
+                Probe.configure(self.args)
+                self.probes = []
+                for probe_spec in self.args.probes:
+                        self.probes.append(Probe(
+                                probe_spec, self.args.string_size,
+                                self.args.kernel_stack, self.args.user_stack))
+
+        def _generate_program(self):
+                self.program = """
+#include <linux/ptrace.h>
+#include <linux/sched.h>        /* For TASK_COMM_LEN */
+
+"""
+                for include in (self.args.include or []):
+                        if include.startswith((".", "/")):
+                                include = os.path.abspath(include)
+                                self.program += "#include \"%s\"\n" % include
+                        else:
+                                self.program += "#include <%s>\n" % include
+                self.program += BPF.generate_auto_includes(
+                        map(lambda p: p.raw_probe, self.probes))
+                for probe in self.probes:
+                        self.program += probe.generate_program(
+                                        self.args.include_self)
+
+                if self.args.verbose or self.args.ebpf:
+                        print(self.program)
+                        if self.args.ebpf:
+                                exit()
+
+        def _attach_probes(self):
+                usdt_contexts = []
+                for probe in self.probes:
+                    if probe.usdt:
+                        # USDT probes must be enabled before the BPF object
+                        # is initialized, because that's where the actual
+                        # uprobe is being attached.
+                        probe.usdt.enable_probe(
+                                probe.usdt_name, probe.probe_name)
+                        if self.args.verbose:
+                                print(probe.usdt.get_text())
+                        usdt_contexts.append(probe.usdt)
+                self.bpf = BPF(text=self.program, usdt_contexts=usdt_contexts)
+                for probe in self.probes:
+                        if self.args.verbose:
+                                print(probe)
+                        probe.attach(self.bpf, self.args.verbose)
+
+        def _main_loop(self):
+                all_probes_trivial = all(map(Probe.is_default_action,
+                                             self.probes))
+
+                # Print header
+                if self.args.timestamp or self.args.time:
+                    print("%-8s " % "TIME", end="");
+                if self.args.print_cpu:
+                    print("%-3s " % "CPU", end="");
+                print("%-7s %-7s %-15s %-16s %s" %
+                      ("PID", "TID", "COMM", "FUNC",
+                      "-" if not all_probes_trivial else ""))
+
+                while True:
+                        self.bpf.perf_buffer_poll()
+
+        def run(self):
+                try:
+                        self._create_probes()
+                        self._generate_program()
+                        self._attach_probes()
+                        self._main_loop()
+                except:
+                        exc_info = sys.exc_info()
+                        sys_exit = exc_info[0] is SystemExit
+                        if self.args.verbose:
+                                traceback.print_exc()
+                        elif not sys_exit:
+                                print(exc_info[1])
+                        exit(0 if sys_exit else 1)
+
+if __name__ == "__main__":
+        Tool().run()
diff --git a/tools/trace_example.txt b/tools/trace_example.txt
new file mode 100644
index 0000000..0b41d7a
--- /dev/null
+++ b/tools/trace_example.txt
@@ -0,0 +1,328 @@
+Demonstrations of trace.
+
+
+trace probes functions you specify and displays trace messages if a particular
+condition is met. You can control the message format to display function
+arguments and return values.
+
+For example, suppose you want to trace all commands being exec'd across the
+system:
+
+# trace 'sys_execve "%s", arg1'
+PID    COMM         FUNC             -
+4402   bash         sys_execve       /usr/bin/man
+4411   man          sys_execve       /usr/local/bin/less
+4411   man          sys_execve       /usr/bin/less
+4410   man          sys_execve       /usr/local/bin/nroff
+4410   man          sys_execve       /usr/bin/nroff
+4409   man          sys_execve       /usr/local/bin/tbl
+4409   man          sys_execve       /usr/bin/tbl
+4408   man          sys_execve       /usr/local/bin/preconv
+4408   man          sys_execve       /usr/bin/preconv
+4415   nroff        sys_execve       /usr/bin/locale
+4416   nroff        sys_execve       /usr/bin/groff
+4418   groff        sys_execve       /usr/bin/grotty
+4417   groff        sys_execve       /usr/bin/troff
+^C
+
+The ::sys_execve syntax specifies that you want an entry probe (which is the
+default), in a kernel function (which is the default) called sys_execve. Next,
+the format string to print is simply "%s", which prints a string. Finally, the
+value to print is the first argument to the sys_execve function, which happens
+to be the command that is exec'd. The above trace was generated by executing
+"man ls" in a separate shell. As you see, man executes a number of additional
+programs to finally display the man page.
+
+Next, suppose you are looking for large reads across the system. Let's trace
+the read system call and inspect the third argument, which is the number of
+bytes to be read:
+
+# trace 'sys_read (arg3 > 20000) "read %d bytes", arg3'
+PID    COMM         FUNC             -
+4490   dd           sys_read         read 1048576 bytes
+4490   dd           sys_read         read 1048576 bytes
+4490   dd           sys_read         read 1048576 bytes
+4490   dd           sys_read         read 1048576 bytes
+^C
+
+During the trace, I executed "dd if=/dev/zero of=/dev/null bs=1M count=4".
+The individual reads are visible, with the custom format message printed for
+each read. The parenthesized expression "(arg3 > 20000)" is a filter that is
+evaluated for each invocation of the probe before printing anything.
+
+You can also trace user functions. For example, let's simulate the bashreadline
+script, which attaches to the readline function in bash and prints its return
+value, effectively snooping all bash shell input across the system:
+
+# trace 'r:bash:readline "%s", retval'
+PID    COMM         FUNC             -
+2740   bash         readline         echo hi!
+2740   bash         readline         man ls
+^C
+
+The special retval keyword stands for the function's return value, and can
+be used only in a retprobe, specified by the 'r' prefix. The next component
+of the probe is the library that contains the desired function. It's OK to
+specify executables too, as long as they can be found in the PATH. Or, you
+can specify the full path to the executable (e.g. "/usr/bin/bash").
+
+Sometimes it can be useful to see where in code the events happen. There are
+flags to print the kernel stack (-K), the user stack (-U) and optionally
+include the virtual address in the stacks as well (-a):
+
+# trace.py -U -a 'r::sys_futex "%d", retval'
+PID     TID     COMM            FUNC             -
+793922  793951  poller          sys_futex        0
+        7f6c72b6497a __lll_unlock_wake+0x1a [libpthread-2.23.so]
+              627fef folly::FunctionScheduler::run()+0x46f [router]
+        7f6c7345f171 execute_native_thread_routine+0x21 [libstdc++.so.6.0.21]
+        7f6c72b5b7a9 start_thread+0xd9 [libpthread-2.23.so]
+        7f6c7223fa7d clone+0x6d [libc-2.23.so]
+
+Multiple probes can be combined on the same command line. For example, let's
+trace failed read and write calls on the libc level, and include a time column:
+
+# trace 'r:c:read ((int)retval < 0) "read failed: %d", retval' \
+        'r:c:write ((int)retval < 0) "write failed: %d", retval' -T
+TIME     PID    COMM         FUNC             -
+05:31:57 3388   bash         write            write failed: -1
+05:32:00 3388   bash         write            write failed: -1
+^C
+
+Note that the retval variable must be cast to int before comparing to zero.
+The reason is that the default type for argN and retval is an unsigned 64-bit
+integer, which can never be smaller than 0.
+
+trace also has some basic support for kernel tracepoints. For example, let's
+trace the block:block_rq_complete tracepoint and print out the number of sectors
+transferred:
+
+# trace 't:block:block_rq_complete "sectors=%d", args->nr_sector' -T
+TIME     PID    COMM         FUNC             -
+01:23:51 0      swapper/0    block_rq_complete sectors=8
+01:23:55 10017  kworker/u64: block_rq_complete sectors=1
+01:23:55 0      swapper/0    block_rq_complete sectors=8
+^C
+
+To discover the tracepoint structure format (which you can refer to as the "args"
+pointer variable), use the tplist tool. For example:
+
+# tplist -v block:block_rq_complete
+block:block_rq_complete
+    dev_t dev;
+    sector_t sector;
+    unsigned int nr_sector;
+    int errors;
+    char rwbs[8];
+
+This output tells you that you can use "args->dev", "args->sector", etc. in your
+predicate and trace arguments.
+
+
+More and more high-level libraries are instrumented with USDT probe support.
+These probes can be traced by trace just like kernel tracepoints. For example,
+trace new threads being created and their function name, include time column
+and on which CPU it happened:
+
+# trace 'u:pthread:pthread_create "%U", arg3' -T -C
+TIME     CPU PID     TID     COMM            FUNC             -
+13:22:01 25  2627    2629    automount       pthread_create   expire_proc_indirect+0x0 [automount]
+13:22:01 5   21360   21414   osqueryd        pthread_create   [unknown] [osqueryd]
+13:22:03 25  2627    2629    automount       pthread_create   expire_proc_indirect+0x0 [automount]
+13:22:04 15  21360   21414   osqueryd        pthread_create   [unknown] [osqueryd]
+13:22:07 25  2627    2629    automount       pthread_create   expire_proc_indirect+0x0 [automount]
+13:22:07 4   21360   21414   osqueryd        pthread_create   [unknown] [osqueryd]
+^C
+
+The "%U" format specifier tells trace to resolve arg3 as a user-space symbol,
+if possible. Similarly, use "%K" for kernel symbols.
+
+Ruby, Node, and OpenJDK are also instrumented with USDT. For example, let's
+trace Ruby methods being called (this requires a version of Ruby built with
+the --enable-dtrace configure flag):
+
+# trace 'u:ruby:method__entry "%s.%s", arg1, arg2' -p $(pidof irb) -T
+TIME     PID    COMM         FUNC             -
+12:08:43 18420  irb          method__entry    IRB::Context.verbose?
+12:08:43 18420  irb          method__entry    RubyLex.ungetc
+12:08:43 18420  irb          method__entry    RubyLex.debug?
+^C
+
+In the previous invocation, arg1 and arg2 are the class name and method name
+for the Ruby method being invoked.
+
+You can also trace exported functions from shared libraries, or an imported
+function on the actual executable:
+
+# sudo ./trace.py 'r:/usr/lib64/libtinfo.so:curses_version "Version=%s", retval'
+# tput -V
+
+PID    TID    COMM         FUNC             -
+21720  21720  tput         curses_version   Version=ncurses 6.0.20160709
+^C
+
+
+Occasionally, it can be useful to filter specific strings. For example, you
+might be interested in open() calls that open a specific file:
+
+# trace 'p:c:open (STRCMP("test.txt", arg1)) "opening %s", arg1' -T
+TIME     PID    COMM         FUNC             -
+01:43:15 10938  cat          open             opening test.txt
+01:43:20 10939  cat          open             opening test.txt
+^C
+
+
+In the preceding example, as well as in many others, readability may be
+improved by providing the function's signature, which names the arguments and
+lets you access structure sub-fields, which is hard with the "arg1", "arg2"
+convention. For example:
+
+# trace 'p:c:open(char *filename) "opening %s", filename'
+PID    TID    COMM         FUNC             -
+17507  17507  cat          open             opening FAQ.txt
+^C
+
+# trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
+PID    TID    COMM         FUNC             -
+777    785    automount    SyS_nanosleep    sleep for 500000000 ns
+777    785    automount    SyS_nanosleep    sleep for 500000000 ns
+777    785    automount    SyS_nanosleep    sleep for 500000000 ns
+777    785    automount    SyS_nanosleep    sleep for 500000000 ns
+^C
+
+Remember to use the -I argument to include the appropriate header file. We didn't
+need to do that here because `struct timespec` is used internally by the tool,
+so it always includes this header file.
+
+
+As a final example, let's trace open syscalls for a specific process. By
+default, tracing is system-wide, but the -p switch overrides this:
+
+# trace -p 2740 'do_sys_open "%s", arg2' -T
+TIME     PID    COMM         FUNC             -
+05:36:16 15872  ls           do_sys_open      /etc/ld.so.cache
+05:36:16 15872  ls           do_sys_open      /lib64/libselinux.so.1
+05:36:16 15872  ls           do_sys_open      /lib64/libcap.so.2
+05:36:16 15872  ls           do_sys_open      /lib64/libacl.so.1
+05:36:16 15872  ls           do_sys_open      /lib64/libc.so.6
+05:36:16 15872  ls           do_sys_open      /lib64/libpcre.so.1
+05:36:16 15872  ls           do_sys_open      /lib64/libdl.so.2
+05:36:16 15872  ls           do_sys_open      /lib64/libattr.so.1
+05:36:16 15872  ls           do_sys_open      /lib64/libpthread.so.0
+05:36:16 15872  ls           do_sys_open      /usr/lib/locale/locale-archive
+05:36:16 15872  ls           do_sys_open      /home/vagrant
+^C
+
+In this example, we traced the "ls ~" command as it was opening its shared
+libraries and then accessing the /home/vagrant directory listing.
+
+
+Lastly, if a high-frequency event is traced you may overflow the perf ring
+buffer. This shows as "Lost N samples":
+
+# trace sys_open
+5087   5087   pgrep        sys_open
+5087   5087   pgrep        sys_open
+5087   5087   pgrep        sys_open
+5087   5087   pgrep        sys_open
+5087   5087   pgrep        sys_open
+Lost 764896 samples
+Lost 764896 samples
+Lost 764896 samples
+
+The perf ring buffer size can be changed with -b. The unit is the per-CPU
+buffer size, measured in pages. The value must be a power of two and defaults
+to 64 pages.
+
+# trace.py 'sys_setsockopt(int fd, int level, int optname, char* optval, int optlen)(level==0 && optname == 1 && STRCMP("{0x6C, 0x00, 0x00, 0x00}", optval))' -U -M 1 --bin_cmp
+PID     TID     COMM            FUNC             -
+1855611 1863183 worker          sys_setsockopt   found
+
+In this example we are catching the setsockopt syscall to change the IPv4 IP_TOS
+value, only for the cases where the new TOS value is equal to 108. We are using
+the STRCMP helper in binary mode (--bin_cmp flag) to compare the optval array
+against the int value of 108 (a parameter of the setsockopt call) in hex
+representation (little endian format).
+
+
+
+
+USAGE message:
+
+usage: trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE]
+             [-S] [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-a] [-I header]
+             probe [probe ...]
+
+Attach to functions and print trace messages.
+
+positional arguments:
+  probe                 probe specifier (see examples)
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -b BUFFER_PAGES, --buffer-pages BUFFER_PAGES
+                        number of pages to use for perf_events ring buffer
+                        (default: 64)
+  -p PID, --pid PID     id of the process to trace (optional)
+  -L TID, --tid TID     id of the thread to trace (optional)
+  -v, --verbose         print resulting BPF program code before executing
+  -Z STRING_SIZE, --string-size STRING_SIZE
+                        maximum size to read from strings
+  -S, --include-self    do not filter trace's own pid from the trace
+  -M MAX_EVENTS, --max-events MAX_EVENTS
+                        number of events to print before quitting
+  -t, --timestamp       print timestamp column (offset from trace start)
+  -T, --time            print time column
+  -C, --print_cpu       print CPU id
+  -B, --bin_cmp         allow to use STRCMP with binary values
+  -K, --kernel-stack    output kernel stack trace
+  -U, --user-stack      output user stack trace
+  -a, --address         print virtual address in stacks
+  -I header, --include header
+                        additional header files to include in the BPF program
+                        as either full path, or relative to current working directory,
+                        or relative to default kernel header search path
+
+EXAMPLES:
+
+trace do_sys_open
+        Trace the open syscall and print a default trace message when entered
+trace 'do_sys_open "%s", arg2'
+        Trace the open syscall and print the filename being opened
+trace 'sys_read (arg3 > 20000) "read %d bytes", arg3'
+        Trace the read syscall and print a message for reads >20000 bytes
+trace 'r::do_sys_open "%llx", retval'
+        Trace the return from the open syscall and print the return value
+trace 'c:open (arg2 == 42) "%s %d", arg1, arg2'
+        Trace the open() call from libc only if the flags (arg2) argument is 42
+trace 'c:malloc "size = %d", arg1'
+        Trace malloc calls and print the size being allocated
+trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
+        Trace the write() call from libc to monitor writes to STDOUT
+trace 'r::__kmalloc (retval == 0) "kmalloc failed!"'
+        Trace returns from __kmalloc which returned a null pointer
+trace 'r:c:malloc (retval) "allocated = %x", retval'
+        Trace returns from malloc and print non-NULL allocated buffers
+trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
+        Trace the block_rq_complete kernel tracepoint and print # of tx sectors
+trace 'u:pthread:pthread_create (arg4 != 0)'
+        Trace the USDT probe pthread_create when its 4th argument is non-zero
+trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
+        Trace the nanosleep syscall and print the sleep duration in ns
+trace -I 'linux/fs.h' \
+      'p::uprobe_register(struct inode *inode) "a_ops = %llx", inode->i_mapping->a_ops'
+        Trace the uprobe_register inode mapping ops, and the symbol can be found
+        in /proc/kallsyms
+trace -I 'kernel/sched/sched.h' \
+      'p::__account_cfs_rq_runtime(struct cfs_rq *cfs_rq) "%d", cfs_rq->runtime_remaining'
+        Trace the cfs scheduling runqueue remaining runtime. The struct cfs_rq is defined
+        in kernel/sched/sched.h which is in kernel source tree and not in kernel-devel
+        package.  So this command needs to run at the kernel source tree root directory
+        so that the added header file can be found by the compiler.
+trace -I 'net/sock.h' \
+      'udpv6_sendmsg(struct sock *sk) (sk->sk_dport == 13568)'
+        Trace udpv6 sendmsg calls only if socket's destination port is equal
+        to 53 (DNS; 13568 in big endian order)
+trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users'
+        Trace the number of users accessing the file system of the current task
+"
diff --git a/tools/ttysnoop.py b/tools/ttysnoop.py
new file mode 100755
index 0000000..9780518
--- /dev/null
+++ b/tools/ttysnoop.py
@@ -0,0 +1,124 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ttysnoop   Watch live output from a tty or pts device.
+#            For Linux, uses BCC, eBPF. Embedded C.
+#
+# Due to a limited buffer size (see BUFSIZE), some commands (eg, a vim
+# session) are likely to be printed a little messed up.
+#
+# Copyright (c) 2016 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Idea: from ttywatcher.
+#
+# 15-Oct-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import ctypes as ct
+from subprocess import call
+import argparse
+from sys import argv
+import sys
+from os import stat
+
+def usage():
+    print("USAGE: %s [-Ch] {PTS | /dev/ttydev}  # try -h for help" % argv[0])
+    exit()
+
+# arguments
+examples = """examples:
+    ./ttysnoop /dev/pts/2    # snoop output from /dev/pts/2
+    ./ttysnoop 2             # snoop output from /dev/pts/2 (shortcut)
+    ./ttysnoop /dev/console  # snoop output from the system console
+    ./ttysnoop /dev/tty0     # snoop output from /dev/tty0
+"""
+parser = argparse.ArgumentParser(
+    description="Snoop output from a pts or tty device, eg, a shell",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("device", default="-1",
+    help="path to a tty device (eg, /dev/tty0) or pts number")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+
+if args.device == "-1":
+    usage()
+
+path = args.device
+if path.find('/') != 0:
+    path = "/dev/pts/" + path
+try:
+    pi = stat(path)
+except:
+    print("Unable to read device %s. Exiting." % path)
+    exit()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+
+#define BUFSIZE 256
+struct data_t {
+    int count;
+    char buf[BUFSIZE];
+};
+
+BPF_PERF_OUTPUT(events);
+
+int kprobe__tty_write(struct pt_regs *ctx, struct file *file,
+    const char __user *buf, size_t count)
+{
+    if (file->f_inode->i_ino != PTS)
+        return 0;
+
+    // bpf_probe_read() can only use a fixed size, so truncate to count
+    // in user space:
+    struct data_t data = {};
+    bpf_probe_read(&data.buf, BUFSIZE, (void *)buf);
+    if (count > BUFSIZE)
+        data.count = BUFSIZE;
+    else
+        data.count = count;
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+};
+"""
+
+bpf_text = bpf_text.replace('PTS', str(pi.st_ino))
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+BUFSIZE = 256
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("count", ct.c_int),
+        ("buf", ct.c_char * BUFSIZE)
+    ]
+
+if not args.noclear:
+    call("clear")
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%s" % event.buf[0:event.count].decode('utf-8', 'replace'), end="")
+    sys.stdout.flush()
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/ttysnoop_example.txt b/tools/ttysnoop_example.txt
new file mode 100644
index 0000000..1c29961
--- /dev/null
+++ b/tools/ttysnoop_example.txt
@@ -0,0 +1,83 @@
+Demonstrations of ttysnoop, the Linux eBPF/bcc version.
+
+
+ttysnoop watches a tty or pts device, and prints the same output that is
+appearing on that device. It can be used to mirror the output from a shell
+session, or the system console.
+
+Let's snoop /dev/pts/2:
+
+# ./ttysnoop 2
+<screen clears>
+date
+Sun Oct 16 01:28:47 UTC 2016
+# uname -a
+Linux bgregg-xenial-bpf-i-xxx 4.8.0-rc4-virtual #1 SMP Wed Aug 31 22:54:37 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux
+# df -h
+Filesystem      Size  Used Avail Use% Mounted on
+udev            7.4G     0  7.4G   0% /dev
+tmpfs           1.5G   89M  1.4G   6% /run
+/dev/xvda1      7.8G  4.5G  3.3G  59% /
+tmpfs           7.4G     0  7.4G   0% /dev/shm
+tmpfs           5.0M     0  5.0M   0% /run/lock
+tmpfs           7.4G     0  7.4G   0% /sys/fs/cgroup
+tmpfs           250M     0  250M   0% /run/shm
+/dev/md0        160G   20G  141G  13% /mnt
+tmpfs           1.5G     0  1.5G   0% /run/user/0
+# ^C
+
+What we're seeing is another shell session. The first line was "date" without
+the shell prompt ("#") because we began tracing after the prompt was printed.
+The other commands appeared, keystroke by keystroke, as the user was typing
+them. Spooky!
+
+Remember to Ctrl-C to exit ttysnoop.
+
+
+To figure out which pts device number to use, you can check your own with "ps"
+and other's with "w". For example:
+
+# ps -p $$
+  PID TTY          TIME CMD
+ 9605 pts/1    00:00:00 bash
+# w
+ 01:26:37 up 9 days, 35 min,  2 users,  load average: 0.22, 0.22, 0.15
+USER     TTY      FROM             LOGIN@   IDLE   JCPU   PCPU WHAT
+root     pts/1    100.127.65.241   00:39    2.00s  0.33s  0.33s -bash
+root     pts/2    100.127.65.241   00:40   16.00s  1.06s  1.06s -bash
+
+So I'm pts/1, and there's another session that's pts/2.
+
+
+This can also snoop tty devices using their full path. Eg, snooping the system
+console:
+
+# ./ttysnoop /dev/console
+Oct 16 01:32:06 bgregg-xenial-bpf-i-xxx kernel: [780087.407428] bash (9888): drop_caches: 1
+Oct 16 01:32:38 bgregg-xenial-bpf-i-xxx snmpd[2708]: Cannot statfs /sys/kernel/debug/tracing: Permission denied
+Oct 16 01:33:32 bgregg-xenial-bpf-i-xxx snmpd[2708]: Cannot statfs /sys/kernel/debug/tracing: Permission denied
+Oct 16 01:34:26 bgregg-xenial-bpf-i-xxx snmpd[2708]: Cannot statfs /sys/kernel/debug/tracing: Permission denied
+^C
+
+Neat!
+
+
+USAGE:
+
+# ./ttysnoop.py -h
+usage: ttysnoop.py [-h] [-C] device
+
+Snoop output from a pts or tty device, eg, a shell
+
+positional arguments:
+  device         path to a tty device (eg, /dev/tty0) or pts number
+
+optional arguments:
+  -h, --help     show this help message and exit
+  -C, --noclear  don't clear the screen
+
+examples:
+    ./ttysnoop /dev/pts/2    # snoop output from /dev/pts/2
+    ./ttysnoop 2             # snoop output from /dev/pts/2 (shortcut)
+    ./ttysnoop /dev/console  # snoop output from the system console
+    ./ttysnoop /dev/tty0     # snoop output from /dev/tty0
diff --git a/tools/vfscount.py b/tools/vfscount.py
new file mode 100755
index 0000000..10c6b1e
--- /dev/null
+++ b/tools/vfscount.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# vfscount  Count VFS calls ("vfs_*").
+#           For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of counting functions.
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Aug-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+
+struct key_t {
+    u64 ip;
+};
+
+BPF_HASH(counts, struct key_t, u64, 256);
+
+int do_count(struct pt_regs *ctx) {
+    struct key_t key = {};
+    key.ip = PT_REGS_IP(ctx);
+    counts.increment(key);
+    return 0;
+}
+""")
+b.attach_kprobe(event_re="^vfs_.*", fn_name="do_count")
+
+# header
+print("Tracing... Ctrl-C to end.")
+
+# output
+try:
+    sleep(99999999)
+except KeyboardInterrupt:
+    pass
+
+print("\n%-16s %-26s %8s" % ("ADDR", "FUNC", "COUNT"))
+counts = b.get_table("counts")
+for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
+    print("%-16x %-26s %8d" % (k.ip, b.ksym(k.ip), v.value))
diff --git a/tools/vfscount_example.txt b/tools/vfscount_example.txt
new file mode 100644
index 0000000..1012bff
--- /dev/null
+++ b/tools/vfscount_example.txt
@@ -0,0 +1,26 @@
+Demonstrations of vfscount, the Linux eBPF/bcc version.
+
+
+This counts VFS calls, by tracing all kernel functions beginning with "vfs_":
+
+# ./vfscount
+Tracing... Ctrl-C to end.
+^C
+ADDR             FUNC                          COUNT
+ffffffff811f3c01 vfs_create                        1
+ffffffff8120be71 vfs_getxattr                      2
+ffffffff811f5f61 vfs_unlink                        2
+ffffffff81236ca1 vfs_lock_file                     6
+ffffffff81218fb1 vfs_fsync_range                   6
+ffffffff811ecaf1 vfs_fstat                       319
+ffffffff811e6f01 vfs_open                        475
+ffffffff811ecb51 vfs_fstatat                     488
+ffffffff811ecac1 vfs_getattr                     704
+ffffffff811ec9f1 vfs_getattr_nosec               704
+ffffffff811e80a1 vfs_write                      1764
+ffffffff811e7f71 vfs_read                       2283
+
+This can be useful for workload characterization, to see what types of
+operations are in use.
+
+You can edit the script to customize what kernel functions are matched.
diff --git a/tools/vfsstat.py b/tools/vfsstat.py
new file mode 100755
index 0000000..1764c60
--- /dev/null
+++ b/tools/vfsstat.py
@@ -0,0 +1,110 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# vfsstat   Count some VFS calls.
+#           For Linux, uses BCC, eBPF. See .c file.
+#
+# Written as a basic example of counting multiple events as a stat tool.
+#
+# USAGE: vfsstat [interval [count]]
+#
+# Copyright (c) 2015 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Aug-2015   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from ctypes import c_int
+from time import sleep, strftime
+from sys import argv
+
+def usage():  # print a usage message and terminate the process
+    print("USAGE: %s [interval [count]]" % argv[0])
+    exit()
+
+# arguments
+interval = 1
+count = -1  # -1 means no limit: run until interrupted
+if len(argv) > 1:
+    try:
+        interval = int(argv[1])
+        if interval == 0:
+            raise  # zero interval is invalid; bare raise trips the except below
+        if len(argv) > 2:
+            count = int(argv[2])
+    except:  # also catches -h, --help
+        usage()
+
+# load BPF program
+b = BPF(text="""
+#include <uapi/linux/ptrace.h>
+
+enum stat_types {
+    S_READ = 1,
+    S_WRITE,
+    S_FSYNC,
+    S_OPEN,
+    S_CREATE,
+    S_MAXSTAT
+};
+
+BPF_ARRAY(stats, u64, S_MAXSTAT);
+
+static void stats_increment(int key) {
+    u64 *leaf = stats.lookup(&key);
+    if (leaf) (*leaf)++;
+}
+
+void do_read(struct pt_regs *ctx) { stats_increment(S_READ); }
+void do_write(struct pt_regs *ctx) { stats_increment(S_WRITE); }
+void do_fsync(struct pt_regs *ctx) { stats_increment(S_FSYNC); }
+void do_open(struct pt_regs *ctx) { stats_increment(S_OPEN); }
+void do_create(struct pt_regs *ctx) { stats_increment(S_CREATE); }
+""")  # end of embedded BPF C program
+b.attach_kprobe(event="vfs_read", fn_name="do_read")  # one kprobe per counted VFS function
+b.attach_kprobe(event="vfs_write", fn_name="do_write")
+b.attach_kprobe(event="vfs_fsync", fn_name="do_fsync")
+b.attach_kprobe(event="vfs_open", fn_name="do_open")
+b.attach_kprobe(event="vfs_create", fn_name="do_create")
+
+# stat column labels and indexes
+stat_types = {
+    "READ": 1,
+    "WRITE": 2,
+    "FSYNC": 3,
+    "OPEN": 4,
+    "CREATE": 5
+}
+
+# header
+print("%-8s  " % "TIME", end="")
+for stype in stat_types.keys():  # same dict drives header and rows, so columns line up
+    print(" %8s" % (stype + "/s"), end="")
+    idx = stat_types[stype]  # NOTE(review): idx is unused in this loop
+print("")
+
+# output
+i = 0
+while (1):
+    if count > 0:
+        i += 1
+        if i > count:
+            exit()  # printed the requested number of intervals
+    try:
+        sleep(interval)
+    except KeyboardInterrupt:
+        pass  # NOTE(review): redundant before the exit() on the next line
+        exit()
+
+    print("%-8s: " % strftime("%H:%M:%S"), end="")
+    # print each statistic as a column
+    for stype in stat_types.keys():
+        idx = stat_types[stype]
+        try:
+            val = b["stats"][c_int(idx)].value / interval  # per-second rate over the interval
+            print(" %8d" % val, end="")
+        except:  # lookup failed: report zero for this column
+            print(" %8d" % 0, end="")
+    b["stats"].clear()  # reset counters for the next interval
+    print("")
diff --git a/tools/vfsstat_example.txt b/tools/vfsstat_example.txt
new file mode 100644
index 0000000..eba0343
--- /dev/null
+++ b/tools/vfsstat_example.txt
@@ -0,0 +1,36 @@
+Demonstrations of vfsstat, the Linux eBPF/bcc version.
+
+
+This traces some common VFS calls and prints per-second summaries. By default,
+the output interval is one second:
+
+# ./vfsstat
+TIME         READ/s  WRITE/s CREATE/s   OPEN/s  FSYNC/s
+18:35:32:       231       12        4       98        0
+18:35:33:       274       13        4      106        0
+18:35:34:       586       86        4      251        0
+18:35:35:       241       15        4       99        0
+18:35:36:       232       10        4       98        0
+18:35:37:       244       10        4      107        0
+18:35:38:       235       13        4       97        0
+18:35:39:      6749     2633        4     1446        0
+18:35:40:       277       31        4      115        0
+18:35:41:       238       16        6      102        0
+18:35:42:       284       50        8      114        0
+^C
+
+
+Here we are using an output interval of five seconds, and printing three output
+lines:
+
+# ./vfsstat 5 3
+TIME         READ/s  WRITE/s CREATE/s   OPEN/s  FSYNC/s
+18:35:55:       238        8        3      101        0
+18:36:00:       962      233        4      247        0
+18:36:05:       241        8        3      100        0
+
+
+Full usage:
+
+# ./vfsstat -h
+USAGE: ./vfsstat [interval [count]]
diff --git a/tools/wakeuptime.py b/tools/wakeuptime.py
new file mode 100755
index 0000000..18e70e4
--- /dev/null
+++ b/tools/wakeuptime.py
@@ -0,0 +1,230 @@
+#!/usr/bin/python
+#
+# wakeuptime    Summarize sleep to wakeup time by waker kernel stack
+#               For Linux, uses BCC, eBPF.
+#
+# USAGE: wakeuptime [-h] [-u] [-p PID] [-v] [-f] [duration]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Jan-2016	Brendan Gregg	Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from bcc.utils import printb
+from time import sleep, strftime
+import argparse
+import signal
+import errno
+from sys import stderr
+
+# arg validation
+def positive_int(val):  # argparse type: integer >= 0 (0 is accepted despite the name)
+    try:
+        ival = int(val)
+    except ValueError:
+        raise argparse.ArgumentTypeError("must be an integer")
+
+    if ival < 0:  # only negatives rejected here; zero passes through
+        raise argparse.ArgumentTypeError("must be positive")
+    return ival
+
+def positive_nonzero_int(val):  # argparse type: strictly positive integer
+    ival = positive_int(val)
+    if ival == 0:
+        raise argparse.ArgumentTypeError("must be nonzero")
+    return ival
+
+# arguments
+examples = """examples:
+    ./wakeuptime             # trace blocked time with waker stacks
+    ./wakeuptime 5           # trace for 5 seconds only
+    ./wakeuptime -f 5        # 5 seconds, and output in folded format
+    ./wakeuptime -u          # don't include kernel threads (user only)
+    ./wakeuptime -p 185      # trace for PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize sleep to wakeup time by waker kernel stack",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-u", "--useronly", action="store_true",
+    help="user threads only (no kernel threads)")
+parser.add_argument("-p", "--pid",
+    type=positive_int,
+    help="trace this PID only")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="show raw addresses")  # NOTE(review): args.verbose is not referenced below; confirm intended
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format")
+parser.add_argument("--stack-storage-size", default=1024,
+    type=positive_nonzero_int,
+    help="the number of unique stack traces that can be stored and "
+         "displayed (default 1024)")
+parser.add_argument("duration", nargs="?", default=99999999,  # 99999999 is treated as "forever" below
+    type=positive_nonzero_int,
+    help="duration of trace, in seconds")
+parser.add_argument("-m", "--min-block-time", default=1,
+    type=positive_nonzero_int,
+    help="the amount of time in microseconds over which we " +
+         "store traces (default 1)")
+parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1,  # U64_MAX
+    type=positive_nonzero_int,
+    help="the amount of time in microseconds under which we " +
+         "store traces (default U64_MAX)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)  # hidden flag: dump the generated C program and exit
+args = parser.parse_args()
+folded = args.folded
+duration = int(args.duration)
+debug = 0  # set nonzero to print the generated BPF program
+if args.pid and args.useronly:
+    parser.error("use either -p or -u.")
+
+# signal handler
+def signal_ignore(signal, frame):  # SIGINT handler installed during output: swallow Ctrl-C
+    print()  # just emit a newline so ^C does not mangle the output
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#define MINBLOCK_US    MINBLOCK_US_VALUEULL
+#define MAXBLOCK_US    MAXBLOCK_US_VALUEULL
+
+struct key_t {
+    int  w_k_stack_id;
+    char waker[TASK_COMM_LEN];
+    char target[TASK_COMM_LEN];
+};
+BPF_HASH(counts, struct key_t);
+BPF_HASH(start, u32);
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
+
+int offcpu(struct pt_regs *ctx) {
+    u32 pid = bpf_get_current_pid_tgid();
+    struct task_struct *p = (struct task_struct *) bpf_get_current_task();
+    u64 ts;
+
+    if (FILTER)
+        return 0;
+
+    ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+int waker(struct pt_regs *ctx, struct task_struct *p) {
+    u32 pid = p->pid;
+    u64 delta, *tsp, ts;
+
+    tsp = start.lookup(&pid);
+    if (tsp == 0)
+        return 0;        // missed start
+    start.delete(&pid);
+
+    if (FILTER)
+        return 0;
+
+    // calculate delta time
+    delta = bpf_ktime_get_ns() - *tsp;
+    delta = delta / 1000;
+    if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US))
+        return 0;
+
+    struct key_t key = {};
+
+    key.w_k_stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
+    bpf_probe_read(&key.target, sizeof(key.target), p->comm);
+    bpf_get_current_comm(&key.waker, sizeof(key.waker));
+
+    counts.increment(key, delta);
+    return 0;
+}
+"""  # end of BPF C template; FILTER and *_VALUE placeholders substituted below
+if args.pid:
+    filter = 'pid != %s' % args.pid  # offcpu/waker return early for non-matching PIDs
+elif args.useronly:
+    filter = 'p->flags & PF_KTHREAD'  # skip kernel threads
+else:
+    filter = '0'  # no filtering
+bpf_text = bpf_text.replace('FILTER', filter)  # textual substitution into the C source
+
+# set stack storage size
+bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
+bpf_text = bpf_text.replace('MINBLOCK_US_VALUE', str(args.min_block_time))
+bpf_text = bpf_text.replace('MAXBLOCK_US_VALUE', str(args.max_block_time))
+
+if debug or args.ebpf:
+    print(bpf_text)  # dump the fully-substituted C program
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)  # compile and load the program
+b.attach_kprobe(event="schedule", fn_name="offcpu")  # record timestamp when a thread blocks
+b.attach_kprobe(event="try_to_wake_up", fn_name="waker")  # sum blocked time when it is woken
+matched = b.num_open_kprobes()
+if matched == 0:
+    print("0 functions traced. Exiting.")
+    exit()
+
+# header
+if not folded:
+    print("Tracing blocked time (us) by kernel stack", end="")
+    if duration < 99999999:
+        print(" for %d secs." % duration)
+    else:
+        print("... Hit Ctrl-C to end.")
+
+# output
+while (1):  # single pass: the unconditional exit() at the bottom ends the loop
+    try:
+        sleep(duration)
+    except KeyboardInterrupt:
+        # as cleanup can take many seconds, trap Ctrl-C:
+        signal.signal(signal.SIGINT, signal_ignore)
+
+    if not folded:
+        print()
+    missing_stacks = 0
+    has_enomem = False  # NOTE(review): set but never read below
+    counts = b.get_table("counts")
+    stack_traces = b.get_table("stack_traces")
+    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):  # ascending by blocked time (us)
+        # handle get_stackid errors
+        # check for an ENOMEM error
+        if k.w_k_stack_id == -errno.ENOMEM:  # stack table ran out of space
+            missing_stacks += 1
+            continue
+
+        waker_kernel_stack = [] if k.w_k_stack_id < 1 else \
+            reversed(list(stack_traces.walk(k.w_k_stack_id))[1:])
+
+        if folded:
+            # print folded stack output
+            line = \
+                [k.waker] + \
+                [b.ksym(addr)
+                    for addr in reversed(list(waker_kernel_stack))] + \
+                [k.target]
+            printb(b"%s %d" % (b";".join(line), v.value))
+        else:
+            # print default multi-line stack output
+            printb(b"    %-16s %s" % (b"target:", k.target))
+            for addr in waker_kernel_stack:
+                printb(b"    %-16x %s" % (addr, b.ksym(addr)))
+            printb(b"    %-16s %s" % (b"waker:", k.waker))
+            print("        %d\n" % v.value)
+    counts.clear()  # release the map entries
+
+    if missing_stacks > 0:
+        enomem_str = " Consider increasing --stack-storage-size."
+        print("WARNING: %d stack traces could not be displayed.%s" %
+            (missing_stacks, enomem_str),
+            file=stderr)
+
+    if not folded:
+        print("Detaching...")
+    exit()  # end of the single pass
diff --git a/tools/wakeuptime_example.txt b/tools/wakeuptime_example.txt
new file mode 100644
index 0000000..dbade29
--- /dev/null
+++ b/tools/wakeuptime_example.txt
@@ -0,0 +1,481 @@
+Demonstrations of wakeuptime, the Linux eBPF/bcc version.
+
+
+This program measures when threads block, and shows the stack traces for the
+threads that performed the wakeup, along with the process names of the waker
+and target processes, and the total blocked time. This blocked time is measured
+from when a thread blocks, to when the wakeup signal is sent. Note that this
+time excludes some run queue latency from the target thread, which may not
+immediately execute if it needs to wait its turn on-CPU. All the data shown,
+stack traces, process names, and times, are summarized in-kernel using an eBPF
+map for efficiency.
+
+This tool is intended to be used after offcputime, which shows the directly
+blocked stacks. wakeuptime can then be used to show the stacks that performed
+the wakeups.
+
+Here is some example (truncated) output. To explain what we are seeing: the
+first stack trace shown is for a "vmstat" thread, which was woken up by
+"swapper/1". The stack trace is for swapper/1, which shows a timer interrupt.
+The total time is 4.0 seconds: this actually corresponds to a "vmstat 1"
+printing 4 x 1 second summaries -- we're seeing the interrupt stack that
+wakes up vmstat:
+
+# ./wakeuptime
+Tracing blocked time (us) by kernel stack... Hit Ctrl-C to end.
+^C
+[...truncated...]
+
+    target:          vmstat
+    ffffffff810df082 hrtimer_wakeup
+    ffffffff810df494 __hrtimer_run_queues
+    ffffffff810dfba8 hrtimer_interrupt
+    ffffffff8100b9e1 xen_timer_interrupt
+    ffffffff810cb9c8 handle_irq_event_percpu
+    ffffffff810cf1ca handle_percpu_irq
+    ffffffff810cb0c2 generic_handle_irq
+    ffffffff814766f7 evtchn_2l_handle_events
+    ffffffff81473e83 __xen_evtchn_do_upcall
+    ffffffff81475cf0 xen_evtchn_do_upcall
+    ffffffff8178adee xen_do_hypervisor_callback
+    waker:           swapper/1
+        4000415
+
+    target:          sshd
+    ffffffff812037b6 pollwake
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff814939fd n_tty_receive_buf_common
+    ffffffff81494424 n_tty_receive_buf2
+    ffffffff81496df5 flush_to_ldisc
+    ffffffff8108c80a process_one_work
+    ffffffff8108caeb worker_thread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           kworker/u16:2
+        4001028
+
+    target:          rcuos/0
+    ffffffff810b5b12 autoremove_wake_function
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff810d8043 rcu_gp_kthread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    ffffffff81ca9420 ddebug_tables
+    waker:           rcu_sched
+        4009976
+
+    target:          rcuos/7
+    ffffffff810b5b12 autoremove_wake_function
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff810d6f28 rcu_nocb_kthread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           rcuos/6
+        4095781
+
+    target:          rcuos/6
+    ffffffff810b5b12 autoremove_wake_function
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff810d8043 rcu_gp_kthread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    ffffffff81ca9420 ddebug_tables
+    waker:           rcu_sched
+        4101075
+
+    target:          rcuos/5
+    ffffffff810b5b12 autoremove_wake_function
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff810d6f28 rcu_nocb_kthread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           rcuos/4
+        4103492
+
+    target:          rcuos/3
+    ffffffff810b5b12 autoremove_wake_function
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff810d6f28 rcu_nocb_kthread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           rcuos/2
+        4107785
+
+    target:          rcuos/2
+    ffffffff810b5b12 autoremove_wake_function
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff810d8043 rcu_gp_kthread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    ffffffff81ca9420 ddebug_tables
+    waker:           rcu_sched
+        4113308
+
+    target:          wakeuptime
+    ffffffff8108109e signal_wake_up_state
+    ffffffff810811e6 complete_signal
+    ffffffff8108186b __send_signal
+    ffffffff81081b0e send_signal
+    ffffffff810824e3 do_send_sig_info
+    ffffffff81082955 group_send_sig_info
+    ffffffff810829b4 __kill_pgrp_info
+    ffffffff81082a15 kill_pgrp
+    ffffffff8149081f __isig
+    ffffffff814912b4 isig
+    ffffffff81491f7c n_tty_receive_signal_char
+    ffffffff81493528 n_tty_receive_char_special
+    ffffffff8149419f n_tty_receive_buf_common
+    ffffffff81494424 n_tty_receive_buf2
+    ffffffff81496df5 flush_to_ldisc
+    ffffffff8108c80a process_one_work
+    ffffffff8108caeb worker_thread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           kworker/u16:2
+        4125162
+
+    target:          sshd
+    ffffffff812037b6 pollwake
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b58d5 __wake_up_sync_key
+    ffffffff816707ba sock_def_readable
+    ffffffff816d9f87 tcp_data_queue
+    ffffffff816dd465 tcp_rcv_established
+    ffffffff816e7ec5 tcp_v4_do_rcv
+    ffffffff816e8ff7 tcp_v4_rcv
+    ffffffff816c3a84 ip_local_deliver_finish
+    ffffffff816c3d80 ip_local_deliver
+    ffffffff816c3762 ip_rcv_finish
+    ffffffff816c4062 ip_rcv
+    ffffffff816885be __netif_receive_skb_core
+    ffffffff81688928 __netif_receive_skb
+    ffffffff81688993 netif_receive_skb_internal
+    ffffffff816894c5 napi_gro_receive
+    ffffffff81593111 xennet_poll
+    ffffffff81688e0e net_rx_action
+    ffffffff8107932b __do_softirq
+    ffffffff810796b2 irq_exit
+    waker:           swapper/0
+        4515762
+
+    target:          supervise
+    ffffffff810df082 hrtimer_wakeup
+    ffffffff810df494 __hrtimer_run_queues
+    ffffffff810dfba8 hrtimer_interrupt
+    ffffffff8100b9e1 xen_timer_interrupt
+    ffffffff810cb9c8 handle_irq_event_percpu
+    ffffffff810cf1ca handle_percpu_irq
+    ffffffff810cb0c2 generic_handle_irq
+    ffffffff814766f7 evtchn_2l_handle_events
+    ffffffff81473e83 __xen_evtchn_do_upcall
+    ffffffff81475cf0 xen_evtchn_do_upcall
+    ffffffff8178adee xen_do_hypervisor_callback
+    waker:           swapper/0
+        25523344
+
+Detaching...
+
+The second-to-last stack trace shows sshd being woken up by packets being received.
+Near the bottom of the stack is driver processing, then IP, TCP, and finally
+socket processing as we work up the stack. The total time sshd (all sshd's)
+were blocked and woken up in this way was 4.5 seconds.
+
+Do be somewhat careful with overhead: this is tracing scheduler functions, which
+can be called very frequently. While this uses in-kernel summaries for
+efficiency, the rate of scheduler functions can be very high (> 1,000,000/sec),
+and this is performing stack walks when threads return to CPU. At some point
+the overhead will be measurable.
+
+
+A -p option can be used to filter (in-kernel) on a single process ID. For
+example, only matching PID 19169, which is a bash shell:
+
+# ./wakeuptime -p 19169
+Tracing blocked time (us) by kernel stack... Hit Ctrl-C to end.
+^C
+    target:          bash
+    ffffffff81075eb8 child_wait_callback
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b58d5 __wake_up_sync_key
+    ffffffff81078046 __wake_up_parent
+    ffffffff810831b3 do_notify_parent
+    ffffffff81077eaf do_exit
+    ffffffff81077f93 do_group_exit
+    ffffffff81078014 sys_exit_group
+    ffffffff81789076 entry_SYSCALL_64_fastpath
+    waker:           ls
+        2015
+
+    target:          bash
+    ffffffff81075eb8 child_wait_callback
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b58d5 __wake_up_sync_key
+    ffffffff81078046 __wake_up_parent
+    ffffffff810831b3 do_notify_parent
+    ffffffff81077eaf do_exit
+    ffffffff81077f93 do_group_exit
+    ffffffff81078014 sys_exit_group
+    ffffffff81789076 entry_SYSCALL_64_fastpath
+    waker:           sleep
+        1001347
+
+    target:          bash
+    ffffffff810b5921 woken_wake_function
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff814939fd n_tty_receive_buf_common
+    ffffffff81494424 n_tty_receive_buf2
+    ffffffff81496df5 flush_to_ldisc
+    ffffffff8108c80a process_one_work
+    ffffffff8108caeb worker_thread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           kworker/u16:0
+        1871024
+
+    target:          bash
+    ffffffff810b5921 woken_wake_function
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff814939fd n_tty_receive_buf_common
+    ffffffff81494424 n_tty_receive_buf2
+    ffffffff81496df5 flush_to_ldisc
+    ffffffff8108c80a process_one_work
+    ffffffff8108caeb worker_thread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           kworker/u16:2
+        3019839
+
+Detaching...
+
+These stack traces are fascinating! The first two show bash waiting on child
+processes, an "ls" and a "sleep". The sleep stack was responsible for 1.0
+seconds of blocked time: I'd run a "sleep 1".
+
+The last two stacks show bash waking up to service tty input (keystrokes).
+
+
+A duration can be added, for example, tracing PID 19097 (sshd) for 5 seconds
+only:
+
+# ./wakeuptime -p 19097 5
+Tracing blocked time (us) by kernel stack for 5 secs.
+
+    target:          sshd
+    ffffffff812037b6 pollwake
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff814939fd n_tty_receive_buf_common
+    ffffffff81494424 n_tty_receive_buf2
+    ffffffff81496df5 flush_to_ldisc
+    ffffffff8108c80a process_one_work
+    ffffffff8108caeb worker_thread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           kworker/u16:1
+        785
+
+    target:          sshd
+    ffffffff812037b6 pollwake
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b54d9 __wake_up
+    ffffffff814939fd n_tty_receive_buf_common
+    ffffffff81494424 n_tty_receive_buf2
+    ffffffff81496df5 flush_to_ldisc
+    ffffffff8108c80a process_one_work
+    ffffffff8108caeb worker_thread
+    ffffffff81092979 kthread
+    ffffffff8178940f ret_from_fork
+    waker:           kworker/u16:2
+        2843
+
+    target:          sshd
+    ffffffff812037b6 pollwake
+    ffffffff810b5462 __wake_up_common
+    ffffffff810b58d5 __wake_up_sync_key
+    ffffffff816707ba sock_def_readable
+    ffffffff816d9f87 tcp_data_queue
+    ffffffff816dd465 tcp_rcv_established
+    ffffffff816e7ec5 tcp_v4_do_rcv
+    ffffffff816e8ff7 tcp_v4_rcv
+    ffffffff816c3a84 ip_local_deliver_finish
+    ffffffff816c3d80 ip_local_deliver
+    ffffffff816c3762 ip_rcv_finish
+    ffffffff816c4062 ip_rcv
+    ffffffff816884be __netif_receive_skb_core
+    ffffffff81688928 __netif_receive_skb
+    ffffffff81688993 netif_receive_skb_internal
+    ffffffff816894c5 napi_gro_receive
+    ffffffff81593111 xennet_poll
+    ffffffff81688e0e net_rx_action
+    ffffffff8107932b __do_softirq
+    ffffffff810796b2 irq_exit
+    waker:           swapper/0
+        276103
+
+Detaching...
+
+
+A -f option will emit output using the "folded stacks" format, which can be
+read directly by flamegraph.pl from the FlameGraph open source software
+(https://github.com/brendangregg/FlameGraph). Eg:
+
+# ./wakeuptime -f 5
+run;entry_SYSCALL_64_fastpath;sys_mmap;sys_mmap_pgoff;vm_mmap_pgoff;do_mmap;mmap_region;do_munmap;__split_vma.isra.35;vma_adjust;call_rwsem_wake;rwsem_wake;__rwsem_do_wake;run 1
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;unlock_page;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;run 1
+chmod;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;unlock_page;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;mkdir 2
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;flush_old_exec;mmput;exit_mmap;free_pgtables;unlink_file_vma;call_rwsem_wake;rwsem_wake;__rwsem_do_wake;supervise 2
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_script;search_binary_handler;load_elf_binary;clear_user;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;unlock_page;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;run 2
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;flush_old_exec;mmput;exit_mmap;free_pgtables;unlink_file_vma;call_rwsem_wake;rwsem_wake;__rwsem_do_wake;run 3
+mkdir;return_from_execve;sys_execve;do_execveat_common.isra.33;search_binary_handler;load_elf_binary;clear_user;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;unlock_page;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;mkdir 3
+mkdir;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;unlock_page;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;mkdir 4
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;unlock_page;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;mkdir 4
+supervise;entry_SYSCALL_64_fastpath;sys_clone;_do_fork;copy_process;call_rwsem_wake;rwsem_wake;__rwsem_do_wake;supervise 5
+rcuos/0;ddebug_tables;ret_from_fork;kthread;rcu_nocb_kthread;rcu_gp_kthread_wake;__wake_up;__wake_up_common;autoremove_wake_function;rcu_sched 10
+swapper/7;cpu_bringup_and_idle;cpu_startup_entry;rcu_idle_enter;rcu_eqs_enter;rcu_eqs_enter_common;wake_nocb_leader;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/6 12
+swapper/0;xen_start_kernel;x86_64_start_reservations;start_kernel;rest_init;cpu_startup_entry;rcu_idle_enter;rcu_eqs_enter;rcu_eqs_enter_common;wake_nocb_leader;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/0 16
+rcuos/6;ret_from_fork;kthread;rcu_nocb_kthread;rcu_gp_kthread_wake;__wake_up;__wake_up_common;autoremove_wake_function;rcu_sched 19
+run;page_fault;do_page_fault;__do_page_fault;handle_mm_fault;unlock_page;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;run 25
+bash;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;do_output_char;pty_write;tty_flip_buffer_push;queue_work_on;__queue_work;insert_work;kworker/u16:2 26
+swapper/4;cpu_bringup_and_idle;cpu_startup_entry;rcu_idle_enter;rcu_eqs_enter;rcu_eqs_enter_common;wake_nocb_leader;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/4 30
+pickup;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;sock_write_iter;sock_sendmsg;unix_stream_sendmsg;sock_def_readable;__wake_up_sync_key;__wake_up_common;ep_poll_callback;__wake_up_locked;__wake_up_common;master 36
+swapper/1;cpu_bringup_and_idle;cpu_startup_entry;rcu_idle_enter;rcu_eqs_enter;rcu_eqs_enter_common;wake_nocb_leader;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/0 52
+chown;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;ext4_end_bio;ext4_finish_bio;end_page_writeback;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;supervise 189
+supervise;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;end_bio_bh_io_sync;journal_end_buffer_io_sync;wake_up_bit;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;supervise 371
+supervise;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;ext4_end_bio;ext4_finish_bio;end_page_writeback;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;supervise 3093
+chown;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 3985
+supervise;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 3997
+supervise;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;end_bio_bh_io_sync;journal_end_buffer_io_sync;unlock_buffer;wake_up_bit;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;jbd2/xvda1-8 4511
+chmod;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;ext4_end_bio;ext4_finish_bio;end_page_writeback;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;supervise 4646
+swapper/4;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 7971
+readproctitle;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;end_bio_bh_io_sync;journal_end_buffer_io_sync;unlock_buffer;wake_up_bit;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;jbd2/xvda1-8 8249
+swapper/2;cpu_bringup_and_idle;cpu_startup_entry;rcu_idle_enter;rcu_eqs_enter;rcu_eqs_enter_common;wake_nocb_leader;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/2 12016
+run;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;ext4_end_bio;ext4_finish_bio;end_page_writeback;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;supervise 13973
+swapper/0;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;end_bio_bh_io_sync;journal_end_buffer_io_sync;wake_up_bit;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;supervise 15736
+run;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 19916
+swapper/1;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 31877
+mkdir;entry_SYSCALL_64_fastpath;sys_exit_group;do_group_exit;do_exit;do_notify_parent;__wake_up_parent;__wake_up_sync_key;__wake_up_common;child_wait_callback;run 39619
+swapper/5;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 39837
+chown;entry_SYSCALL_64_fastpath;sys_exit_group;do_group_exit;do_exit;do_notify_parent;__wake_up_parent;__wake_up_sync_key;__wake_up_common;child_wait_callback;run 42190
+chmod;entry_SYSCALL_64_fastpath;sys_exit_group;do_group_exit;do_exit;do_notify_parent;__wake_up_parent;__wake_up_sync_key;__wake_up_common;child_wait_callback;run 43486
+swapper/0;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_edge_irq;handle_irq_event;handle_irq_event_percpu;blkif_interrupt;blk_mq_complete_request;__blk_mq_complete_request;blk_mq_end_request;blk_update_request;bio_endio;ext4_end_bio;ext4_finish_bio;end_page_writeback;__wake_up_bit;__wake_up;__wake_up_common;wake_bit_function;supervise 47810
+bash;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;pty_write;tty_flip_buffer_push;queue_work_on;__queue_work;insert_work;kworker/u16:2 86794
+vmstat;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;pty_write;tty_flip_buffer_push;queue_work_on;__queue_work;insert_work;kworker/u16:2 210848
+swapper/0;irq_exit;__do_softirq;net_rx_action;xennet_poll;napi_gro_receive;netif_receive_skb_internal;__netif_receive_skb;__netif_receive_skb_core;ip_rcv;ip_rcv_finish;ip_local_deliver;ip_local_deliver_finish;tcp_v4_rcv;tcp_v4_do_rcv;tcp_rcv_established;tcp_data_queue;sock_def_readable;__wake_up_sync_key;__wake_up_common;pollwake;sshd 543295
+kworker/u16:2;ret_from_fork;kthread;worker_thread;process_one_work;flush_to_ldisc;n_tty_receive_buf2;n_tty_receive_buf_common;__wake_up;__wake_up_common;woken_wake_function;bash 543570
+swapper/7;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/u16:2 741234
+sshd;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;tty_write;n_tty_write;pty_write;tty_flip_buffer_push;queue_work_on;__queue_work;insert_work;kworker/u16:2 855436
+bash;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/7 942685
+swapper/0;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 969059
+swapper/4;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;add_interrupt_randomness;credit_entropy_bits;queue_work_on;__queue_work;insert_work;kworker/4:0 999981
+swapper/0;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;sleep 1000103
+sleep;entry_SYSCALL_64_fastpath;sys_exit_group;do_group_exit;do_exit;do_notify_parent;__wake_up_parent;__wake_up_sync_key;__wake_up_common;child_wait_callback;bash 1001564
+swapper/3;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 1016980
+mkdir;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 1019302
+chown;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 1019908
+swapper/7;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 1021074
+swapper/4;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 1021075
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/6 1030506
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/4 1032424
+swapper/6;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 1036908
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/1 1040207
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/5 1044756
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/3 1044986
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/2 1046347
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/7 1093598
+swapper/7;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;process_timeout;rcu_sched 1858510
+supervise;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 2041736
+swapper/6;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 2042028
+swapper/1;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 2042149
+swapper/2;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 2042152
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/6 2042698
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/0 2044085
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/0 2047386
+run;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/7 2065637
+swapper/4;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/4:0 2999930
+swapper/6;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;snmpd 2999999
+swapper/1;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/u16:2 3010848
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/5 3050881
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/1 3051454
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/3 3054844
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/2 3059548
+supervise;return_from_execve;sys_execve;do_execveat_common.isra.33;sched_exec;stop_one_cpu;cpu_stop_queue_work;migration/4 3061480
+swapper/3;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 3062666
+swapper/5;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 3063222
+swapper/7;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;it_real_fn;kill_pid_info;group_send_sig_info;do_send_sig_info;send_signal;__send_signal;complete_signal;signal_wake_up_state;ntpd 3999835
+swapper/1;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/1:0 3999933
+swapper/6;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/6:0 3999938
+swapper/3;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/3:0 3999938
+swapper/7;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/7:2 3999939
+swapper/5;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/5:3 3999942
+swapper/2;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/2:1 3999946
+swapper/0;xen_do_hypervisor_callback;xen_evtchn_do_upcall;irq_exit;__do_softirq;run_timer_softirq;call_timer_fn;delayed_work_timer_fn;__queue_work;insert_work;kworker/0:1 3999953
+swapper/0;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;tail 4000414
+swapper/4;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;vmstat 4000417
+chmod;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 4083231
+run;entry_SYSCALL_64_fastpath;sys_write;vfs_write;__vfs_write;pipe_write;__wake_up_sync_key;__wake_up_common;autoremove_wake_function;readproctitle 4096457
+rcuos/4;ret_from_fork;kthread;rcu_nocb_kthread;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/5 4973072
+rcuos/0;ddebug_tables;ret_from_fork;kthread;rcu_nocb_kthread;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/1 4973898
+rcu_sched;ddebug_tables;ret_from_fork;kthread;rcu_gp_kthread;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/0 4976731
+rcu_sched;ddebug_tables;ret_from_fork;kthread;rcu_gp_kthread;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/4 4976755
+rcuos/2;ret_from_fork;kthread;rcu_nocb_kthread;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/3 4980207
+rcu_sched;ddebug_tables;ret_from_fork;kthread;rcu_gp_kthread;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/2 4980502
+rcuos/6;ret_from_fork;kthread;rcu_nocb_kthread;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/7 4981025
+rcu_sched;ddebug_tables;ret_from_fork;kthread;rcu_gp_kthread;__wake_up;__wake_up_common;autoremove_wake_function;rcuos/6 4983110
+kworker/u16:2;ret_from_fork;kthread;worker_thread;process_one_work;flush_to_ldisc;n_tty_receive_buf2;n_tty_receive_buf_common;__wake_up;__wake_up_common;pollwake;sshd 5004430
+swapper/6;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;wakeuptime 5005051
+run;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 7144088
+swapper/0;xen_do_hypervisor_callback;xen_evtchn_do_upcall;__xen_evtchn_do_upcall;evtchn_2l_handle_events;generic_handle_irq;handle_percpu_irq;handle_irq_event_percpu;xen_timer_interrupt;hrtimer_interrupt;__hrtimer_run_queues;hrtimer_wakeup;supervise 11229310
+
+The stack traces are shown as single lines, with functions separated by
+semicolons. The first entry is the waker task name, followed by the waker stack,
+and then the last entry is the target task name. As a flame graph, this puts the
+waker name on the bottom, followed by the waker stack, and then the target
+task name on top. The second column is the total blocked time.
+
+I'd save this output to a file, then move it to the system where you'll be
+creating your "wakeup time flame graphs".
+
+
+USAGE message:
+
+# ./wakeuptime -h
+usage: wakeuptime [-h] [-u] [-p PID] [-v] [-f]
+                  [--stack-storage-size STACK_STORAGE_SIZE]
+                  [-m MIN_BLOCK_TIME] [-M MAX_BLOCK_TIME]
+                  [duration]
+
+Summarize sleep to wakeup time by waker kernel stack
+
+positional arguments:
+  duration              duration of trace, in seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -u, --useronly        user threads only (no kernel threads)
+  -p PID, --pid PID     trace this PID only
+  -v, --verbose         show raw addresses
+  -f, --folded          output folded format
+  --stack-storage-size STACK_STORAGE_SIZE
+                        the number of unique stack traces that can be stored
+                        and displayed (default 1024)
+  -m MIN_BLOCK_TIME, --min-block-time MIN_BLOCK_TIME
+                        the amount of time in microseconds over which we store
+                        traces (default 1)
+  -M MAX_BLOCK_TIME, --max-block-time MAX_BLOCK_TIME
+                        the amount of time in microseconds under which we
+                        store traces (default U64_MAX)
+examples:
+    ./wakeuptime             # trace blocked time with waker stacks
+    ./wakeuptime 5           # trace for 5 seconds only
+    ./wakeuptime -f 5        # 5 seconds, and output in folded format
+    ./wakeuptime -u          # don't include kernel threads (user only)
+    ./wakeuptime -p 185      # trace for PID 185 only
diff --git a/tools/xfsdist.py b/tools/xfsdist.py
new file mode 100755
index 0000000..f409f90
--- /dev/null
+++ b/tools/xfsdist.py
@@ -0,0 +1,172 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# xfsdist  Summarize XFS operation latency.
+#          For Linux, uses BCC, eBPF.
+#
+# USAGE: xfsdist [-h] [-T] [-m] [-p PID] [interval] [count]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+    ./xfsdist            # show operation latency as a histogram
+    ./xfsdist -p 181     # trace PID 181 only
+    ./xfsdist 1 10       # print 1 second summaries, 10 times
+    ./xfsdist -m 5       # 5s summaries, milliseconds
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize XFS operation latency",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--notimestamp", action="store_true",
+    help="don't include timestamp on interval output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="output in milliseconds")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?",
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+pid = args.pid
+countdown = int(args.count)
+if args.milliseconds:
+    factor = 1000000
+    label = "msecs"
+else:
+    factor = 1000
+    label = "usecs"
+if args.interval and int(args.interval) == 0:
+    print("ERROR: interval 0. Exiting.")
+    exit()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+#define OP_NAME_LEN 8
+typedef struct dist_key {
+    char op[OP_NAME_LEN];
+    u64 slot;
+} dist_key_t;
+BPF_HASH(start, u32);
+BPF_HISTOGRAM(dist, dist_key_t);
+
+// time operation
+int trace_entry(struct pt_regs *ctx)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    if (FILTER_PID)
+        return 0;
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+static int trace_return(struct pt_regs *ctx, const char *op)
+{
+    u64 *tsp;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed start or filtered
+    }
+    u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
+
+    // store as histogram
+    dist_key_t key = {.slot = bpf_log2l(delta)};
+    __builtin_memcpy(&key.op, op, sizeof(key.op));
+    dist.increment(key);
+
+    start.delete(&pid);
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    char *op = "read";
+    return trace_return(ctx, op);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    char *op = "write";
+    return trace_return(ctx, op);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    char *op = "open";
+    return trace_return(ctx, op);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+    char *op = "fsync";
+    return trace_return(ctx, op);
+}
+"""
+bpf_text = bpf_text.replace('FACTOR', str(factor))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+# common file functions
+b.attach_kprobe(event="xfs_file_read_iter", fn_name="trace_entry")
+b.attach_kprobe(event="xfs_file_write_iter", fn_name="trace_entry")
+b.attach_kprobe(event="xfs_file_open", fn_name="trace_entry")
+b.attach_kprobe(event="xfs_file_fsync", fn_name="trace_entry")
+b.attach_kretprobe(event="xfs_file_read_iter", fn_name="trace_read_return")
+b.attach_kretprobe(event="xfs_file_write_iter", fn_name="trace_write_return")
+b.attach_kretprobe(event="xfs_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="xfs_file_fsync", fn_name="trace_fsync_return")
+
+print("Tracing XFS operation latency... Hit Ctrl-C to end.")
+
+# output
+exiting = 0
+dist = b.get_table("dist")
+while (1):
+    try:
+        if args.interval:
+            sleep(int(args.interval))
+        else:
+            sleep(99999999)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.interval and (not args.notimestamp):
+        print(strftime("%H:%M:%S:"))
+
+    dist.print_log2_hist(label, "operation")
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/xfsdist_example.txt b/tools/xfsdist_example.txt
new file mode 100644
index 0000000..c646501
--- /dev/null
+++ b/tools/xfsdist_example.txt
@@ -0,0 +1,155 @@
+Demonstrations of xfsdist, the Linux eBPF/bcc version.
+
+
+xfsdist traces XFS reads, writes, opens, and fsyncs, and summarizes their
+latency as a power-of-2 histogram. For example:
+
+# ./xfsdist 
+Tracing XFS operation latency... Hit Ctrl-C to end.
+^C
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 362      |                                        |
+         4 -> 7          : 807      |*                                       |
+         8 -> 15         : 20686    |****************************************|
+        16 -> 31         : 512      |                                        |
+        32 -> 63         : 4        |                                        |
+        64 -> 127        : 2744     |*****                                   |
+       128 -> 255        : 7127     |*************                           |
+       256 -> 511        : 2483     |****                                    |
+       512 -> 1023       : 1281     |**                                      |
+      1024 -> 2047       : 39       |                                        |
+      2048 -> 4095       : 5        |                                        |
+      4096 -> 8191       : 1        |                                        |
+
+operation = 'open'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 3        |****************************************|
+
+This output shows a bi-modal distribution for read latency, with a faster
+mode of 20,686 reads that took between 8 and 15 microseconds, and a slower
+mode of over 10,000 reads that took between 64 and 1023 microseconds. It's
+likely that the faster mode was a hit from the in-memory file system cache,
+and the slower mode is a read from a storage device (disk).
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+block device I/O (disk I/O), file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from the file system than measuring this down at the
+block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+An optional interval and a count can be provided, as well as -m to show the
+distributions in milliseconds. For example:
+
+# ./xfsdist -m 1 5
+Tracing XFS operation latency... Hit Ctrl-C to end.
+
+10:14:15:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 1366     |****************************************|
+         2 -> 3          : 86       |**                                      |
+         4 -> 7          : 95       |**                                      |
+         8 -> 15         : 132      |***                                     |
+        16 -> 31         : 72       |**                                      |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 685      |****************************************|
+
+10:14:16:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 984      |****************************************|
+         2 -> 3          : 66       |**                                      |
+         4 -> 7          : 67       |**                                      |
+         8 -> 15         : 104      |****                                    |
+        16 -> 31         : 70       |**                                      |
+        32 -> 63         : 12       |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 536      |****************************************|
+
+10:14:17:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 1262     |****************************************|
+         2 -> 3          : 75       |**                                      |
+         4 -> 7          : 80       |**                                      |
+         8 -> 15         : 119      |***                                     |
+        16 -> 31         : 75       |**                                      |
+        32 -> 63         : 3        |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 639      |****************************************|
+
+10:14:18:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 1070     |****************************************|
+         2 -> 3          : 58       |**                                      |
+         4 -> 7          : 74       |**                                      |
+         8 -> 15         : 140      |*****                                   |
+        16 -> 31         : 60       |**                                      |
+        32 -> 63         : 5        |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 556      |****************************************|
+
+10:14:19:
+
+operation = 'read'
+     msecs               : count     distribution
+         0 -> 1          : 1176     |****************************************|
+         2 -> 3          : 53       |*                                       |
+         4 -> 7          : 94       |***                                     |
+         8 -> 15         : 112      |***                                     |
+        16 -> 31         : 77       |**                                      |
+        32 -> 63         : 3        |                                        |
+
+operation = 'write'
+     msecs               : count     distribution
+         0 -> 1          : 613      |****************************************|
+
+This shows a mixed read/write workload, where the slower read mode was around
+10 ms.
+
+
+USAGE message:
+
+# ./xfsdist -h
+usage: xfsdist [-h] [-T] [-m] [-p PID] [interval] [count]
+
+Summarize XFS operation latency
+
+positional arguments:
+  interval            output interval, in seconds
+  count               number of outputs
+
+optional arguments:
+  -h, --help          show this help message and exit
+  -T, --notimestamp   don't include timestamp on interval output
+  -m, --milliseconds  output in milliseconds
+  -p PID, --pid PID   trace this PID only
+
+examples:
+    ./xfsdist            # show operation latency as a histogram
+    ./xfsdist -p 181     # trace PID 181 only
+    ./xfsdist 1 10       # print 1 second summaries, 10 times
+    ./xfsdist -m 5       # 5s summaries, milliseconds
diff --git a/tools/xfsslower.py b/tools/xfsslower.py
new file mode 100755
index 0000000..c70721a
--- /dev/null
+++ b/tools/xfsslower.py
@@ -0,0 +1,306 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# xfsslower  Trace slow XFS operations.
+#            For Linux, uses BCC, eBPF.
+#
+# USAGE: xfsslower [-h] [-j] [-p PID] [min_ms]
+#
+# This script traces common XFS file operations: reads, writes, opens, and
+# syncs. It measures the time spent in these operations, and prints details
+# for each that exceeded a threshold.
+#
+# WARNING: This adds low-overhead instrumentation to these XFS operations,
+# including reads and writes from the file system cache. Such reads and writes
+# can be very frequent (depending on the workload; eg, 1M/sec), at which
+# point the overhead of this tool (even if it prints no "slower" events) can
+# begin to become significant.
+#
+# By default, a minimum millisecond threshold of 10 is used.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 11-Feb-2016   Brendan Gregg   Created this.
+# 16-Oct-2016   Dina Goldshtein -p to filter by process ID.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./xfsslower             # trace operations slower than 10 ms (default)
+    ./xfsslower 1           # trace operations slower than 1 ms
+    ./xfsslower -j 1        # ... 1 ms, parsable output (csv)
+    ./xfsslower 0           # trace all operations (warning: verbose)
+    ./xfsslower -p 185      # trace PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Trace common XFS file operations slower than a threshold",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-j", "--csv", action="store_true",
+    help="just print fields: comma-separated values")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("min_ms", nargs="?", default='10',
+    help="minimum I/O duration to trace, in ms (default 10)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+min_ms = int(args.min_ms)
+pid = args.pid
+csv = args.csv
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/dcache.h>
+
+// XXX: switch these to char's when supported
+#define TRACE_READ      0
+#define TRACE_WRITE     1
+#define TRACE_OPEN      2
+#define TRACE_FSYNC     3
+
+struct val_t {
+    u64 ts;
+    u64 offset;
+    struct file *fp;
+};
+
+struct data_t {
+    // XXX: switch some to u32's when supported
+    u64 ts_us;
+    u64 type;
+    u64 size;
+    u64 offset;
+    u64 delta_us;
+    u64 pid;
+    char task[TASK_COMM_LEN];
+    char file[DNAME_INLINE_LEN];
+};
+
+BPF_HASH(entryinfo, u64, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+//
+// Store timestamp and size on entry
+//
+
+// xfs_file_read_iter(), xfs_file_write_iter():
+int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = iocb->ki_filp;
+    val.offset = iocb->ki_pos;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// xfs_file_open():
+int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
+    struct file *file)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = file;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// xfs_file_fsync():
+int trace_fsync_entry(struct pt_regs *ctx, struct file *file)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = file;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+//
+// Output
+//
+
+static int trace_return(struct pt_regs *ctx, int type)
+{
+    struct val_t *valp;
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    valp = entryinfo.lookup(&id);
+    if (valp == 0) {
+        // missed tracing issue or filtered
+        return 0;
+    }
+
+    // calculate delta
+    u64 ts = bpf_ktime_get_ns();
+    u64 delta_us = ts - valp->ts;
+    entryinfo.delete(&id);
+
+    // Skip entries with backwards time: temp workaround for #728
+    if ((s64) delta_us < 0)
+        return 0;
+
+    delta_us /= 1000;
+
+    if (FILTER_US)
+        return 0;
+
+    // populate output struct
+    u32 size = PT_REGS_RC(ctx);
+    struct data_t data = {.type = type, .size = size, .delta_us = delta_us,
+        .pid = pid};
+    data.ts_us = ts / 1000;
+    data.offset = valp->offset;
+    bpf_get_current_comm(&data.task, sizeof(data.task));
+
+    // workaround (rewriter should handle file to d_name in one step):
+    struct qstr qs = valp->fp->f_path.dentry->d_name;
+    if (qs.len == 0)
+        return 0;
+    bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name);
+
+    // output
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_READ);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_WRITE);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_OPEN);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_FSYNC);
+}
+
+"""
+if min_ms == 0:
+    bpf_text = bpf_text.replace('FILTER_US', '0')
+else:
+    bpf_text = bpf_text.replace('FILTER_US',
+        'delta_us <= %s' % str(min_ms * 1000))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# kernel->user event data: struct data_t
+DNAME_INLINE_LEN = 32   # linux/dcache.h
+TASK_COMM_LEN = 16      # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("type", ct.c_ulonglong),
+        ("size", ct.c_ulonglong),
+        ("offset", ct.c_ulonglong),
+        ("delta_us", ct.c_ulonglong),
+        ("pid", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN),
+        ("file", ct.c_char * DNAME_INLINE_LEN)
+    ]
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    type = 'R'
+    if event.type == 1:
+        type = 'W'
+    elif event.type == 2:
+        type = 'O'
+    elif event.type == 3:
+        type = 'S'
+
+    if (csv):
+        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
+            event.ts_us, event.task, event.pid, type, event.size,
+            event.offset, event.delta_us, event.file))
+        return
+    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"),
+        event.task, event.pid, type, event.size, event.offset / 1024,
+        float(event.delta_us) / 1000, event.file))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# common file functions
+b.attach_kprobe(event="xfs_file_read_iter", fn_name="trace_rw_entry")
+b.attach_kprobe(event="xfs_file_write_iter", fn_name="trace_rw_entry")
+b.attach_kprobe(event="xfs_file_open", fn_name="trace_open_entry")
+b.attach_kprobe(event="xfs_file_fsync", fn_name="trace_fsync_entry")
+b.attach_kretprobe(event="xfs_file_read_iter", fn_name="trace_read_return")
+b.attach_kretprobe(event="xfs_file_write_iter", fn_name="trace_write_return")
+b.attach_kretprobe(event="xfs_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="xfs_file_fsync", fn_name="trace_fsync_return")
+
+# header
+if (csv):
+    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
+else:
+    if min_ms == 0:
+        print("Tracing XFS operations")
+    else:
+        print("Tracing XFS operations slower than %d ms" % min_ms)
+    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T",
+        "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
+
+# read events
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/xfsslower_example.txt b/tools/xfsslower_example.txt
new file mode 100644
index 0000000..4c6ae33
--- /dev/null
+++ b/tools/xfsslower_example.txt
@@ -0,0 +1,148 @@
+Demonstrations of xfsslower, the Linux eBPF/bcc version.
+
+
+xfsslower shows XFS reads, writes, opens, and fsyncs, slower than a threshold.
+For example:
+
+# ./xfsslower
+Tracing XFS operations slower than 10 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:23:06 randread.pl    32497  R 8192    24938024   17.93 data1
+06:23:06 randread.pl    32521  R 8192    13431528   18.27 data1
+06:23:08 randread.pl    32497  R 8192    5070904    16.37 data1
+06:23:08 randread.pl    32521  R 8192    12693016   16.06 data1
+06:23:18 randread.pl    32521  R 8192    27049136   21.68 data1
+06:23:18 randread.pl    32497  R 8192    257864     21.74 data1
+06:23:20 randread.pl    32497  R 8192    17797208   13.37 data1
+06:23:20 randread.pl    32497  R 8192    6088224    19.74 data1
+
+This shows several reads from a "randread.pl" program, each 8 Kbytes in size,
+and from a "data1" file. These all had over 10 ms latency.
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+block device I/O (disk I/O), file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from the file system than measuring this down at the
+block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+The threshold can be provided as an argument. Eg, I/O slower than 1 ms:
+
+# ./xfsslower 1
+Tracing XFS operations slower than 1 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:26:59 randread.pl    5394   R 8192    9045728     1.24 data1
+06:26:59 randread.pl    5394   R 8192    23532136    1.17 data1
+06:26:59 randread.pl    5442   R 8192    2192376     2.06 data1
+06:27:00 randread.pl    5394   R 8192    3535176     1.27 data1
+06:27:00 randread.pl    5442   R 8192    21361784    3.18 data1
+06:27:00 randread.pl    5394   R 8192    2556336     3.23 data1
+06:27:00 randread.pl    5394   R 8192    20020880    2.87 data1
+06:27:00 randread.pl    5442   R 8192    20708888    3.32 data1
+06:27:00 randread.pl    5394   R 8192    4654680     2.00 data1
+06:27:00 randread.pl    5442   R 8192    5591744     1.98 data1
+06:27:00 randread.pl    5394   R 8192    2431056     1.22 data1
+06:27:00 randread.pl    5394   R 8192    384288      2.95 data1
+06:27:00 randread.pl    5442   R 8192    29277672    3.07 data1
+06:27:00 randread.pl    5442   R 8192    29508216    3.23 data1
+06:27:00 randread.pl    5394   R 8192    17200008    2.86 data1
+06:27:00 randread.pl    5442   R 8192    20693088    1.06 data1
+06:27:00 randread.pl    5394   R 8192    28124192    1.38 data1
+06:27:00 randread.pl    5442   R 8192    23821184    1.28 data1
+06:27:00 randread.pl    5394   R 8192    1623200     1.47 data1
+[...]
+
+There's now much more output (this spans only 2 seconds, the previous output
+spanned 14 seconds), as the lower threshold is catching more I/O.
+
+
+A threshold of 0 will trace all operations. Warning: the output will be
+verbose, as it will include all file system cache hits.
+
+# ./xfsslower 0
+Tracing XFS operations
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:29:43 ls             9291   O 0       0           0.00 bench
+06:29:47 cat            9361   O 0       0           0.00 date.txt
+06:29:47 cat            9361   R 29      0           0.01 date.txt
+06:29:47 cat            9361   R 0       0           0.00 date.txt
+06:29:50 bash           20500  O 0       0           0.00 bench
+06:29:50 bash           20500  O 0       0           0.00 bench
+06:29:50 bash           20500  O 0       0           0.00 bench
+06:29:50 bash           9431   O 0       0           0.00 bench
+06:29:50 bash           9432   O 0       0           0.00 bench
+06:29:50 bash           9456   O 0       0           0.00 newdate.txt
+06:29:50 date           9456   W 29      0           0.01 newdate.txt
+06:29:53 cksum          9503   O 0       0           0.00 data1
+06:29:53 cksum          9503   R 65536   0           0.06 data1
+06:29:53 cksum          9503   R 65536   64          0.01 data1
+06:29:53 cksum          9503   R 65536   128         0.02 data1
+06:29:53 cksum          9503   R 65536   192         0.01 data1
+06:29:53 cksum          9503   R 65536   256         0.01 data1
+06:29:53 cksum          9503   R 65536   320         0.01 data1
+06:29:53 cksum          9503   R 65536   384         0.01 data1
+06:29:53 cksum          9503   R 65536   448         0.04 data1
+06:29:53 cksum          9503   R 65536   512         0.01 data1
+06:29:53 cksum          9503   R 65536   576         0.02 data1
+06:29:53 cksum          9503   R 65536   640         0.01 data1
+06:29:53 cksum          9503   R 65536   704         0.01 data1
+06:29:53 cksum          9503   R 65536   768         0.01 data1
+06:29:53 cksum          9503   R 65536   832         0.01 data1
+06:29:53 cksum          9503   R 65536   896         0.01 data1
+06:29:53 cksum          9503   R 65536   960         0.01 data1
+06:29:53 cksum          9503   R 65536   1024        0.01 data1
+06:29:53 cksum          9503   R 65536   1088        0.02 data1
+06:29:53 cksum          9503   R 65536   1152        0.01 data1
+06:29:53 cksum          9503   R 65536   1216        0.01 data1
+[...]
+
+The output now includes open operations ("O"), and writes ("W"). A cksum(1)
+command can be seen reading from a data1 file, from progressively increasing
+offsets: a sequential workload.
+
+
+A -j option will print just the fields (parsable output, csv):
+
+# ./xfsslower -j 1 
+ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE
+125563830632,randread.pl,12155,R,8192,27824193536,1057,data1
+125565050578,randread.pl,12155,R,8192,16908525568,1969,data1
+125566331140,randread.pl,12202,R,8192,16310689792,1738,data1
+125566427955,randread.pl,12155,R,8192,11127439360,1058,data1
+125567223494,randread.pl,12202,R,8192,8422031360,1131,data1
+125567331145,randread.pl,12155,R,8192,9233088512,1230,data1
+125567331220,randread.pl,12202,R,8192,12716326912,1148,data1
+125567334983,randread.pl,12155,R,8192,24545206272,2182,data1
+[...]
+
+This may be useful for visualizing with another tool, for example, for
+producing a scatter plot of ENDTIME vs LATENCY, to look for time-based
+patterns.
+
+
+USAGE message:
+
+# ./xfsslower -h
+usage: xfsslower [-h] [-j] [-p PID] [min_ms]
+
+Trace common XFS file operations slower than a threshold
+
+positional arguments:
+  min_ms             minimum I/O duration to trace, in ms (default 10)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -j, --csv          just print fields: comma-separated values
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./xfsslower             # trace operations slower than 10 ms (default)
+    ./xfsslower 1           # trace operations slower than 1 ms
+    ./xfsslower -j 1        # ... 1 ms, parsable output (csv)
+    ./xfsslower 0           # trace all operations (warning: verbose)
+    ./xfsslower -p 185      # trace PID 185 only
diff --git a/tools/zfsdist.py b/tools/zfsdist.py
new file mode 100755
index 0000000..6b29b99
--- /dev/null
+++ b/tools/zfsdist.py
@@ -0,0 +1,186 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# zfsdist  Summarize ZFS operation latency.
+#          For Linux, uses BCC, eBPF.
+#
+# USAGE: zfsdist [-h] [-T] [-m] [-p PID] [interval] [count]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Feb-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+    ./zfsdist            # show operation latency as a histogram
+    ./zfsdist -p 181     # trace PID 181 only
+    ./zfsdist 1 10       # print 1 second summaries, 10 times
+    ./zfsdist -m 5       # 5s summaries, milliseconds
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize ZFS operation latency",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--notimestamp", action="store_true",
+    help="don't include timestamp on interval output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="output in milliseconds")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("interval", nargs="?",
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+pid = args.pid
+countdown = int(args.count)
+if args.milliseconds:
+    factor = 1000000
+    label = "msecs"
+else:
+    factor = 1000
+    label = "usecs"
+if args.interval and int(args.interval) == 0:
+    print("ERROR: interval 0. Exiting.")
+    exit()
+debug = 0
+
+# define BPF program (C source, compiled by BCC at runtime). FACTOR and
+# FILTER_PID are textual placeholders patched in below, before compilation.
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+#define OP_NAME_LEN 8
+typedef struct dist_key {
+    char op[OP_NAME_LEN];
+    u64 slot;
+} dist_key_t;
+BPF_HASH(start, u32);
+BPF_HISTOGRAM(dist, dist_key_t);
+
+// time operation
+int trace_entry(struct pt_regs *ctx)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    if (FILTER_PID)
+        return 0;
+    u64 ts = bpf_ktime_get_ns();
+    start.update(&pid, &ts);
+    return 0;
+}
+
+static int trace_return(struct pt_regs *ctx, const char *op)
+{
+    u64 *tsp;
+    u32 pid = bpf_get_current_pid_tgid();
+
+    // fetch timestamp and calculate delta
+    tsp = start.lookup(&pid);
+    if (tsp == 0) {
+        return 0;   // missed start or filtered
+    }
+    u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
+
+    // store as histogram
+    dist_key_t key = {.slot = bpf_log2l(delta)};
+    __builtin_memcpy(&key.op, op, sizeof(key.op));
+    dist.increment(key);
+
+    start.delete(&pid);
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    char *op = "read";
+    return trace_return(ctx, op);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    char *op = "write";
+    return trace_return(ctx, op);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    char *op = "open";
+    return trace_return(ctx, op);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+    char *op = "fsync";
+    return trace_return(ctx, op);
+}
+"""
+# patch placeholders: FACTOR scales nanoseconds to usecs/msecs per -m;
+# FILTER_PID drops events from other PIDs when -p was given ('0' = no filter)
+bpf_text = bpf_text.replace('FACTOR', str(factor))
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+# with --ebpf (or debug), dump the generated C; --ebpf exits after dumping
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+# common file functions
+if BPF.get_kprobe_functions(b'zpl_iter'):
+    b.attach_kprobe(event="zpl_iter_read", fn_name="trace_entry")
+    b.attach_kprobe(event="zpl_iter_write", fn_name="trace_entry")
+elif BPF.get_kprobe_functions(b'zpl_aio'):
+    b.attach_kprobe(event="zpl_aio_read", fn_name="trace_entry")
+    b.attach_kprobe(event="zpl_aio_write", fn_name="trace_entry")
+else:
+    b.attach_kprobe(event="zpl_read", fn_name="trace_entry")
+    b.attach_kprobe(event="zpl_write", fn_name="trace_entry")
+b.attach_kprobe(event="zpl_open", fn_name="trace_entry")
+b.attach_kprobe(event="zpl_fsync", fn_name="trace_entry")
+if BPF.get_kprobe_functions(b'zpl_iter'):
+    b.attach_kretprobe(event="zpl_iter_read", fn_name="trace_read_return")
+    b.attach_kretprobe(event="zpl_iter_write", fn_name="trace_write_return")
+elif BPF.get_kprobe_functions(b'zpl_aio'):
+    b.attach_kretprobe(event="zpl_aio_read", fn_name="trace_read_return")
+    b.attach_kretprobe(event="zpl_aio_write", fn_name="trace_write_return")
+else:
+    b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return")
+    b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return")
+b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return")
+
+print("Tracing ZFS operation latency... Hit Ctrl-C to end.")
+
+# output
+exiting = 0
+dist = b.get_table("dist")
+while (1):
+    try:
+        if args.interval:
+            sleep(int(args.interval))
+        else:
+            sleep(99999999)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    print()
+    if args.interval and (not args.notimestamp):
+        print(strftime("%H:%M:%S:"))
+
+    dist.print_log2_hist(label, "operation")
+    dist.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/zfsdist_example.txt b/tools/zfsdist_example.txt
new file mode 100644
index 0000000..a02d4dc
--- /dev/null
+++ b/tools/zfsdist_example.txt
@@ -0,0 +1,183 @@
+Demonstrations of zfsdist, the Linux eBPF/bcc version.
+
+
+zfsdist traces ZFS reads, writes, opens, and fsyncs, and summarizes their
+latency as a power-of-2 histogram. It has been written to work on ZFS on Linux
+(http://zfsonlinux.org). For example:
+
+# ./zfsdist 
+Tracing ZFS operation latency... Hit Ctrl-C to end.
+^C
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 4479     |****************************************|
+         8 -> 15         : 1028     |*********                               |
+        16 -> 31         : 14       |                                        |
+        32 -> 63         : 1        |                                        |
+        64 -> 127        : 2        |                                        |
+       128 -> 255        : 6        |                                        |
+       256 -> 511        : 1        |                                        |
+       512 -> 1023       : 1256     |***********                             |
+      1024 -> 2047       : 9        |                                        |
+      2048 -> 4095       : 1        |                                        |
+      4096 -> 8191       : 2        |                                        |
+
+operation = 'write'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 75       |****************************************|
+       256 -> 511        : 11       |*****                                   |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 0        |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 0        |                                        |
+     65536 -> 131071     : 13       |******                                  |
+    131072 -> 262143     : 1        |                                        |
+
+operation = 'open'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 2        |****************************************|
+
+This output shows a bimodal distribution for read latency, with a faster
+mode of around 5 thousand reads that took between 4 and 15 microseconds, and a
+slower mode of 1256 reads that took between 512 and 1023 microseconds. It's
+likely that the faster mode was a hit from the in-memory file system cache,
+and the slower mode is a read from a storage device (disk).
+
+The write latency is also bimodal, with a faster mode between 128 and 511 us,
+and the slower mode between 65 and 131 ms.
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system (via the ZFS POSIX layer), to when it completed.
+This spans everything: block device I/O (disk I/O), file system CPU cycles,
+file system locks, run queue latency, etc. This is a better measure of the
+latency suffered by applications reading from the file system than measuring
+this down at the block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+An optional interval and a count can be provided, as well as -m to show the
+distributions in milliseconds. For example:
+
+# ./zfsdist 1 5
+Tracing ZFS operation latency... Hit Ctrl-C to end.
+
+06:55:41:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 3976     |****************************************|
+         8 -> 15         : 1181     |***********                             |
+        16 -> 31         : 18       |                                        |
+        32 -> 63         : 4        |                                        |
+        64 -> 127        : 17       |                                        |
+       128 -> 255        : 16       |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1275     |************                            |
+      1024 -> 2047       : 36       |                                        |
+      2048 -> 4095       : 3        |                                        |
+      4096 -> 8191       : 0        |                                        |
+      8192 -> 16383      : 1        |                                        |
+     16384 -> 32767      : 1        |                                        |
+
+06:55:42:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 12751    |****************************************|
+         8 -> 15         : 1190     |***                                     |
+        16 -> 31         : 38       |                                        |
+        32 -> 63         : 7        |                                        |
+        64 -> 127        : 85       |                                        |
+       128 -> 255        : 47       |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1010     |***                                     |
+      1024 -> 2047       : 49       |                                        |
+      2048 -> 4095       : 12       |                                        |
+
+06:55:43:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 80925    |****************************************|
+         8 -> 15         : 1645     |                                        |
+        16 -> 31         : 251      |                                        |
+        32 -> 63         : 24       |                                        |
+        64 -> 127        : 16       |                                        |
+       128 -> 255        : 12       |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 80       |                                        |
+      1024 -> 2047       : 1        |                                        |
+
+06:55:44:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 81207    |****************************************|
+         8 -> 15         : 2075     |*                                       |
+        16 -> 31         : 2005     |                                        |
+        32 -> 63         : 177      |                                        |
+        64 -> 127        : 3        |                                        |
+
+06:55:45:
+
+operation = 'read'
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 74364    |****************************************|
+         8 -> 15         : 865      |                                        |
+        16 -> 31         : 4960     |**                                      |
+        32 -> 63         : 625      |                                        |
+        64 -> 127        : 2        |                                        |
+
+This workload was randomly reading from a file that became cached. The slower
+mode can be seen to disappear by the final summaries.
+
+
+USAGE message:
+
+# ./zfsdist -h
+usage: zfsdist [-h] [-T] [-m] [-p PID] [interval] [count]
+
+Summarize ZFS operation latency
+
+positional arguments:
+  interval            output interval, in seconds
+  count               number of outputs
+
+optional arguments:
+  -h, --help          show this help message and exit
+  -T, --notimestamp   don't include timestamp on interval output
+  -m, --milliseconds  output in milliseconds
+  -p PID, --pid PID   trace this PID only
+
+examples:
+    ./zfsdist            # show operation latency as a histogram
+    ./zfsdist -p 181     # trace PID 181 only
+    ./zfsdist 1 10       # print 1 second summaries, 10 times
+    ./zfsdist -m 5       # 5s summaries, milliseconds
diff --git a/tools/zfsslower.py b/tools/zfsslower.py
new file mode 100755
index 0000000..8ab283a
--- /dev/null
+++ b/tools/zfsslower.py
@@ -0,0 +1,318 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# zfsslower  Trace slow ZFS operations.
+#            For Linux, uses BCC, eBPF.
+#
+# USAGE: zfsslower [-h] [-j] [-p PID] [min_ms]
+#
+# This script traces common ZFS file operations: reads, writes, opens, and
+# syncs. It measures the time spent in these operations, and prints details
+# for each that exceeded a threshold.
+#
+# WARNING: This adds low-overhead instrumentation to these ZFS operations,
+# including reads and writes from the file system cache. Such reads and writes
+# can be very frequent (depending on the workload; eg, 1M/sec), at which
+# point the overhead of this tool (even if it prints no "slower" events) can
+# begin to become significant.
+#
+# This works by using kernel dynamic tracing of the ZPL interface, and will
+# need updates to match any changes to this interface.
+#
+# By default, a minimum millisecond threshold of 10 is used.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Feb-2016   Brendan Gregg   Created this.
+# 16-Oct-2016   Dina Goldshtein -p to filter by process ID.
+
+from __future__ import print_function
+from bcc import BPF
+import argparse
+from time import strftime
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./zfsslower             # trace operations slower than 10 ms (default)
+    ./zfsslower 1           # trace operations slower than 1 ms
+    ./zfsslower -j 1        # ... 1 ms, parsable output (csv)
+    ./zfsslower 0           # trace all operations (warning: verbose)
+    ./zfsslower -p 185      # trace PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Trace common ZFS file operations slower than a threshold",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-j", "--csv", action="store_true",
+    help="just print fields: comma-separated values")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("min_ms", nargs="?", default='10',
+    help="minimum I/O duration to trace, in ms (default 10)")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+min_ms = int(args.min_ms)
+pid = args.pid
+csv = args.csv
+debug = 0
+
+# define BPF program (C source, compiled by BCC at runtime). FILTER_PID and
+# FILTER_US are textual placeholders patched in below, before compilation.
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/dcache.h>
+
+// XXX: switch these to char's when supported
+#define TRACE_READ      0
+#define TRACE_WRITE     1
+#define TRACE_OPEN      2
+#define TRACE_FSYNC     3
+
+struct val_t {
+    u64 ts;
+    u64 offset;
+    struct file *fp;
+};
+
+struct data_t {
+    // XXX: switch some to u32's when supported
+    u64 ts_us;
+    u64 type;
+    u64 size;
+    u64 offset;
+    u64 delta_us;
+    u64 pid;
+    char task[TASK_COMM_LEN];
+    char file[DNAME_INLINE_LEN];
+};
+
+BPF_HASH(entryinfo, u64, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+//
+// Store timestamp and size on entry
+//
+
+// zpl_read(), zpl_write():
+int trace_rw_entry(struct pt_regs *ctx, struct file *filp, char __user *buf,
+    size_t len, loff_t *ppos)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = filp;
+    val.offset = *ppos;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// zpl_open():
+int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
+    struct file *filp)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filep and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = filp;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+// zpl_fsync():
+int trace_fsync_entry(struct pt_regs *ctx, struct file *filp)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    if (FILTER_PID)
+        return 0;
+
+    // store filp and timestamp by id
+    struct val_t val = {};
+    val.ts = bpf_ktime_get_ns();
+    val.fp = filp;
+    val.offset = 0;
+    if (val.fp)
+        entryinfo.update(&id, &val);
+
+    return 0;
+}
+
+//
+// Output
+//
+
+static int trace_return(struct pt_regs *ctx, int type)
+{
+    struct val_t *valp;
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
+    valp = entryinfo.lookup(&id);
+    if (valp == 0) {
+        // missed tracing issue or filtered
+        return 0;
+    }
+
+    // calculate delta
+    u64 ts = bpf_ktime_get_ns();
+    u64 delta_us = (ts - valp->ts) / 1000;
+    entryinfo.delete(&id);
+    if (FILTER_US)
+        return 0;
+
+    // populate output struct
+    u32 size = PT_REGS_RC(ctx);
+    struct data_t data = {.type = type, .size = size, .delta_us = delta_us,
+        .pid = pid};
+    data.ts_us = ts / 1000;
+    data.offset = valp->offset;
+    bpf_get_current_comm(&data.task, sizeof(data.task));
+
+    struct qstr qs = valp->fp->f_path.dentry->d_name;
+    if (qs.len == 0)
+        return 0;
+    bpf_probe_read(&data.file, sizeof(data.file), (void *)qs.name);
+
+    // output
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_READ);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_WRITE);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_OPEN);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+    return trace_return(ctx, TRACE_FSYNC);
+}
+
+"""
+# patch FILTER_US: with a zero threshold ('0') nothing is filtered; otherwise
+# drop events at or below min_ms (converted to microseconds)
+if min_ms == 0:
+    bpf_text = bpf_text.replace('FILTER_US', '0')
+else:
+    bpf_text = bpf_text.replace('FILTER_US',
+        'delta_us <= %s' % str(min_ms * 1000))
+# patch FILTER_PID: drop events from other PIDs when -p was given
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+# with --ebpf (or debug), dump the generated C; --ebpf exits after dumping
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# kernel->user event data: Python (ctypes) mirror of struct data_t in the
+# BPF program above; field order and widths must match that struct exactly
+DNAME_INLINE_LEN = 32   # linux/dcache.h
+TASK_COMM_LEN = 16      # linux/sched.h
+class Data(ct.Structure):
+    _fields_ = [
+        ("ts_us", ct.c_ulonglong),
+        ("type", ct.c_ulonglong),
+        ("size", ct.c_ulonglong),
+        ("offset", ct.c_ulonglong),
+        ("delta_us", ct.c_ulonglong),
+        ("pid", ct.c_ulonglong),
+        ("task", ct.c_char * TASK_COMM_LEN),
+        ("file", ct.c_char * DNAME_INLINE_LEN)
+    ]
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+
+    type = 'R'
+    if event.type == 1:
+        type = 'W'
+    elif event.type == 2:
+        type = 'O'
+    elif event.type == 3:
+        type = 'S'
+
+    if (csv):
+        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
+            event.ts_us, event.task.decode('utf-8', 'replace'), event.pid,
+            type, event.size, event.offset, event.delta_us,
+            event.file.decode('utf-8', 'replace')))
+        return
+    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"),
+        event.task.decode('utf-8', 'replace'), event.pid, type, event.size,
+        event.offset / 1024, float(event.delta_us) / 1000,
+        event.file.decode('utf-8', 'replace')))
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+# common file functions
+if BPF.get_kprobe_functions(b'zpl_iter'):
+    b.attach_kprobe(event="zpl_iter_read", fn_name="trace_rw_entry")
+    b.attach_kprobe(event="zpl_iter_write", fn_name="trace_rw_entry")
+elif BPF.get_kprobe_functions(b'zpl_aio'):
+    b.attach_kprobe(event="zpl_aio_read", fn_name="trace_rw_entry")
+    b.attach_kprobe(event="zpl_aio_write", fn_name="trace_rw_entry")
+else:
+    b.attach_kprobe(event="zpl_read", fn_name="trace_rw_entry")
+    b.attach_kprobe(event="zpl_write", fn_name="trace_rw_entry")
+b.attach_kprobe(event="zpl_open", fn_name="trace_open_entry")
+b.attach_kprobe(event="zpl_fsync", fn_name="trace_fsync_entry")
+if BPF.get_kprobe_functions(b'zpl_iter'):
+    b.attach_kretprobe(event="zpl_iter_read", fn_name="trace_read_return")
+    b.attach_kretprobe(event="zpl_iter_write", fn_name="trace_write_return")
+elif BPF.get_kprobe_functions(b'zpl_aio'):
+    b.attach_kretprobe(event="zpl_aio_read", fn_name="trace_read_return")
+    b.attach_kretprobe(event="zpl_aio_write", fn_name="trace_write_return")
+else:
+    b.attach_kretprobe(event="zpl_read", fn_name="trace_read_return")
+    b.attach_kretprobe(event="zpl_write", fn_name="trace_write_return")
+b.attach_kretprobe(event="zpl_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="zpl_fsync", fn_name="trace_fsync_return")
+
+# header
+if (csv):
+    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
+else:
+    if min_ms == 0:
+        print("Tracing ZFS operations")
+    else:
+        print("Tracing ZFS operations slower than %d ms" % min_ms)
+    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T",
+        "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
+
+# read events
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+while 1:
+    b.perf_buffer_poll()
diff --git a/tools/zfsslower_example.txt b/tools/zfsslower_example.txt
new file mode 100644
index 0000000..fddae6e
--- /dev/null
+++ b/tools/zfsslower_example.txt
@@ -0,0 +1,157 @@
+Demonstrations of zfsslower, the Linux eBPF/bcc version.
+
+
+zfsslower shows ZFS reads, writes, opens, and fsyncs, slower than a threshold.
+It has been written to work on ZFS on Linux (http://zfsonlinux.org). For
+example:
+
+# ./zfsslower 
+Tracing ZFS operations slower than 10 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:31:28 dd             25570  W 131072  38784     303.92 data1
+06:31:34 dd             25686  W 131072  38784     388.28 data1
+06:31:35 dd             25686  W 131072  78720     519.66 data1
+06:31:35 dd             25686  W 131072  116992    405.94 data1
+06:31:35 dd             25686  W 131072  153600    433.52 data1
+06:31:36 dd             25686  W 131072  188672    314.37 data1
+06:31:36 dd             25686  W 131072  222336    372.33 data1
+06:31:36 dd             25686  W 131072  254592    309.59 data1
+06:31:37 dd             25686  W 131072  285440    304.52 data1
+06:31:37 dd             25686  W 131072  315008    236.45 data1
+06:31:37 dd             25686  W 131072  343424    193.54 data1
+06:31:38 dd             25686  W 131072  370560    286.07 data1
+06:31:38 dd             25686  W 131072  396672    251.92 data1
+[...]
+
+This shows writes to a "data1" file, each taking well over the 10 ms threshold.
+The slowest, on the 3rd line of output, reached 519.66 ms for a 128 Kbyte
+write by the "dd" command.
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system (via the ZFS POSIX layer), to when it completed.
+This spans everything: block device I/O (disk I/O), file system CPU cycles,
+file system locks, run queue latency, etc. This is a better measure of the
+latency suffered by applications reading from the file system than measuring
+this down at the block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+A threshold of 0 will trace all operations. Warning: the output will be
+verbose, as it will include all file system cache hits.
+
+# ./zfsslower 0
+Tracing ZFS operations
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:36:07 dd             32242  O 0       0           0.01 data1
+06:36:07 dd             32242  W 131072  0           0.25 data1
+06:36:07 dd             32242  W 131072  128         0.03 data1
+06:36:07 dd             32242  W 131072  256         0.04 data1
+06:36:07 dd             32242  W 131072  384         0.04 data1
+06:36:07 dd             32242  W 131072  512         0.04 data1
+06:36:07 dd             32242  W 131072  640         0.03 data1
+06:36:07 dd             32242  W 131072  768         0.03 data1
+06:36:07 dd             32242  W 131072  896         0.04 data1
+06:36:07 dd             32242  W 131072  1024        0.28 data1
+06:36:07 dd             32242  W 131072  1152        0.04 data1
+06:36:07 dd             32242  W 131072  1280        0.03 data1
+[...]
+06:36:07 dd             32242  W 131072  13824       0.04 data1
+06:36:07 dd             32242  W 131072  13952       0.04 data1
+06:36:07 dd             32242  W 131072  14080       0.04 data1
+06:36:07 dd             32242  W 131072  14208     398.92 data1
+06:36:07 dd             32242  W 131072  14336       0.04 data1
+06:36:07 dd             32242  W 131072  14464       0.04 data1
+06:36:07 dd             32242  W 131072  15104       0.03 data1
+[...]
+
+The output now includes the open operation for this file ("O"), and then the
+writes. Most of the writes are very fast, with only an occasional outlier that
+is in the hundreds of milliseconds.
+
+Fortunately this is not a real-world environment: I set up a zpool on top of
+an XFS file system for testing purposes. More debugging using other tools will
+explain these outliers: possibly XFS flushing.
+
+
+Here's a random read workload, and showing operations slower than 1 ms:
+
+# ./zfsslower 1
+Tracing ZFS operations slower than 1 ms
+TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
+06:47:30 randread.pl    15431  R 8192    97840       1.03 data1
+06:47:30 randread.pl    15431  R 8192    416744      1.12 data1
+06:47:31 randread.pl    15431  R 8192    228856      1.96 data1
+06:47:31 randread.pl    15431  R 8192    452248      1.02 data1
+06:47:31 randread.pl    15431  R 8192    315288      5.90 data1
+06:47:31 randread.pl    15431  R 8192    752696      1.20 data1
+06:47:31 randread.pl    15431  R 8192    481832      1.39 data1
+06:47:31 randread.pl    15431  R 8192    673752      1.39 data1
+06:47:31 randread.pl    15431  R 8192    691736      1.01 data1
+06:47:31 randread.pl    15431  R 8192    694776      1.78 data1
+06:47:31 randread.pl    15431  R 8192    403328      3.75 data1
+06:47:31 randread.pl    15431  R 8192    567688      1.08 data1
+06:47:31 randread.pl    15431  R 8192    694280      1.31 data1
+06:47:31 randread.pl    15431  R 8192    669280      1.06 data1
+06:47:31 randread.pl    15431  R 8192    426608      1.56 data1
+06:47:31 randread.pl    15431  R 8192    42512       1.01 data1
+06:47:31 randread.pl    15431  R 8192    22944       1.33 data1
+06:47:31 randread.pl    15431  R 8192    427432      1.48 data1
+06:47:31 randread.pl    15431  R 8192    261320      1.28 data1
+06:47:31 randread.pl    15431  R 8192    132248      1.23 data1
+06:47:31 randread.pl    15431  R 8192    96936       1.04 data1
+06:47:31 randread.pl    15431  R 8192    482800      2.63 data1
+[...]
+
+
+A -j option will print just the fields (parsable output, csv):
+
+# ./zfsslower -j 1
+ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE
+252305490911,randread.pl,17922,R,8192,163446784,1156,data1
+252305493852,randread.pl,17922,R,8192,321437696,1129,data1
+252305498839,randread.pl,17922,R,8192,475152384,1154,data1
+252305505515,randread.pl,17922,R,8192,49094656,1082,data1
+252305506774,randread.pl,17922,R,8192,470401024,1245,data1
+252305509265,randread.pl,17922,R,8192,553246720,2412,data1
+252305512365,randread.pl,17922,R,8192,20963328,1093,data1
+252305513755,randread.pl,17922,R,8192,304111616,1350,data1
+252305583330,randread.pl,17922,R,8192,166174720,1154,data1
+252305593913,randread.pl,17922,R,8192,175079424,1241,data1
+252305602833,randread.pl,17922,R,8192,305340416,3307,data1
+252305608663,randread.pl,17922,R,8192,655958016,2704,data1
+252305611212,randread.pl,17922,R,8192,40951808,1033,data1
+252305614609,randread.pl,17922,R,8192,318922752,2687,data1
+252305623800,randread.pl,17922,R,8192,246734848,2983,data1
+252305711125,randread.pl,17922,R,8192,581795840,1091,data1
+252305728694,randread.pl,17922,R,8192,710483968,1034,data1
+252305762046,randread.pl,17922,R,8192,329367552,1405,data1
+252305798215,randread.pl,17922,R,8192,44482560,1030,data1
+252305806748,randread.pl,17922,R,8192,660602880,1069,data1
+252305826360,randread.pl,17922,R,8192,616144896,2327,data1
+[...]
+
+
+USAGE message:
+
+# ./zfsslower -h
+usage: zfsslower [-h] [-j] [-p PID] [min_ms]
+
+Trace common ZFS file operations slower than a threshold
+
+positional arguments:
+  min_ms             minimum I/O duration to trace, in ms (default 10)
+
+optional arguments:
+  -h, --help         show this help message and exit
+  -j, --csv          just print fields: comma-separated values
+  -p PID, --pid PID  trace this PID only
+
+examples:
+    ./zfsslower             # trace operations slower than 10 ms (default)
+    ./zfsslower 1           # trace operations slower than 1 ms
+    ./zfsslower -j 1        # ... 1 ms, parsable output (csv)
+    ./zfsslower 0           # trace all operations (warning: verbose)
+    ./zfsslower -p 185      # trace PID 185 only