Merge "Upgrade bcc to v0.8.0" am: ee971d503a am: c910c866b3
am: 1d9c1f90bc

Change-Id: I067074ec0b499d1349dabd106638c463ad0a6192
diff --git a/.gitignore b/.gitignore
index 2e39a80..65a3946 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 *.swo
 *.pyc
 .idea
+*~
 
 # Build artifacts
 /build/
diff --git a/.travis.yml b/.travis.yml
index dd36669..9835547 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,5 +3,6 @@
   - sudo apt-get install -y python-pip
   - sudo pip install pep8
 script:
+  - set -euo pipefail
   - ./scripts/check-helpers.sh
-  - find tools/ -type f -name "*.py" | xargs pep8 -r --show-source --ignore=E123,E125,E126,E127,E128,E302
+  - ./scripts/py-style-check.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bb3f53b..610e153 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,8 @@
 include(CheckCXXCompilerFlag)
 include(cmake/FindCompilerFlag.cmake)
 
+option(ENABLE_LLVM_NATIVECODEGEN "Enable use of llvm nativecodegen module (needed by rw-engine)" ON)
+option(ENABLE_RTTI "Enable compiling with run-time type information" OFF)
 option(ENABLE_LLVM_SHARED "Enable linking LLVM as a shared library" OFF)
 option(ENABLE_CLANG_JIT "Enable Loading BPF through Clang Frontend" ON)
 option(ENABLE_USDT "Enable User-level Statically Defined Tracing" ON)
diff --git a/INSTALL.md b/INSTALL.md
index 25df2ec..7c1934b 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -7,7 +7,7 @@
   - [Arch](#arch---aur)
   - [Gentoo](#gentoo---portage)
   - [openSUSE](#opensuse---binary)
-  - [RHEL](#redhat---binary)
+  - [RHEL](#rhel---binary)
 * [Source](#source)
   - [Debian](#debian---source)
   - [Ubuntu](#ubuntu---source)
@@ -58,11 +58,11 @@
 
 ```bash
 sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 4052245BD4284CDD
-echo "deb https://repo.iovisor.org/apt/xenial xenial main" | sudo tee /etc/apt/sources.list.d/iovisor.list
+echo "deb https://repo.iovisor.org/apt/$(lsb_release -cs) $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/iovisor.list
 sudo apt-get update
 sudo apt-get install bcc-tools libbcc-examples linux-headers-$(uname -r)
 ```
-(replace `xenial` with `artful` or `bionic` as appropriate)
+(`$(lsb_release -cs)` expands to your release codename, e.g. `xenial`, `artful`, or `bionic`.) Tools will be installed under /usr/share/bcc/tools.
 
 **Nightly Packages**
 
@@ -73,6 +73,16 @@
 ```
 (replace `xenial` with `artful` or `bionic` as appropriate)
 
+**Ubuntu Packages**
+
+The previous commands will install the latest bcc from the iovisor repositories. It is also available from the standard Ubuntu multiverse repository, under the package name `bpfcc-tools`.
+
+```bash
+sudo apt-get install bpfcc-tools linux-headers-$(uname -r)
+```
+
+The tools are installed in /sbin with a -bpfcc extension. Try running `sudo opensnoop-bpfcc`.
+
 ## Fedora - Binary
 
 Ensure that you are running a 4.2+ kernel with `uname -r`. If not, install a 4.2+ kernel from
@@ -148,7 +158,7 @@
 
 ## RHEL - Binary
 
-For Redhat 7.6 (Beta) bcc is already included in the official yum repository as bcc-tools. As part of the install the following dependencies are installed: bcc.x86_64 0:0.6.0-3.el7 ,llvm-private.x86_64 0:6.0.1-2.el7 ,python-bcc.x86_64 0:0.6.0-3.el7,python-netaddr.noarch 0:0.7.5-9.el7
+For RHEL 7.6, bcc is already included in the official yum repository as bcc-tools. As part of the install, the following dependencies are installed: bcc.x86_64 0:0.6.1-2.el7 ,llvm-private.x86_64 0:6.0.1-2.el7 ,python-bcc.x86_64 0:0.6.1-2.el7,python-netaddr.noarch 0:0.7.5-9.el7
 
 ```
 yum install bcc-tools
@@ -258,7 +268,11 @@
 wget -O - http://llvm.org/apt/llvm-snapshot.gpg.key | sudo apt-key add -
 sudo apt-get update
 
-# All versions
+# For bionic
+sudo apt-get -y install bison build-essential cmake flex git libedit-dev \
+  libllvm6.0 llvm-6.0-dev libclang-6.0-dev python zlib1g-dev libelf-dev
+
+# For other versions
 sudo apt-get -y install bison build-essential cmake flex git libedit-dev \
   libllvm3.7 llvm-3.7-dev libclang-3.7-dev python zlib1g-dev libelf-dev
 
diff --git a/METADATA b/METADATA
index d2d1a3c..495af06 100644
--- a/METADATA
+++ b/METADATA
@@ -1,7 +1,5 @@
 name: "BCC"
-description:
-    "BCC is a toolkit for creating efficient kernel tracing and manipulation programs"
-
+description: "BCC is a toolkit for creating efficient kernel tracing and manipulation programs"
 third_party {
   url {
     type: HOMEPAGE
@@ -11,6 +9,10 @@
     type: GIT
     value: "https://github.com/iovisor/bcc.git"
   }
-  version: "b998421b18a34d0b47a6bda996c91bad12fa5da0"
-  last_upgrade_date { year: 2018 month: 10 day: 31 }
+  version: "v0.8.0"
+  last_upgrade_date {
+    year: 2019
+    month: 2
+    day: 1
+  }
 }
diff --git a/README.md b/README.md
index 50d6db0..54733eb 100644
--- a/README.md
+++ b/README.md
@@ -132,6 +132,8 @@
 - tools/[runqlat](tools/runqlat.py): Run queue (scheduler) latency as a histogram. [Examples](tools/runqlat_example.txt).
 - tools/[runqlen](tools/runqlen.py): Run queue length as a histogram. [Examples](tools/runqlen_example.txt).
 - tools/[runqslower](tools/runqslower.py): Trace long process scheduling delays. [Examples](tools/runqslower_example.txt).
+- tools/[shmsnoop](tools/shmsnoop.py): Trace System V shared memory syscalls. [Examples](tools/shmsnoop_example.txt).
+- tools/[sofdsnoop](tools/sofdsnoop.py): Trace FDs passed through unix sockets. [Examples](tools/sofdsnoop_example.txt).
 - tools/[slabratetop](tools/slabratetop.py): Kernel SLAB/SLUB memory cache allocation rate top. [Examples](tools/slabratetop_example.txt).
 - tools/[softirqs](tools/softirqs.py):  Measure soft IRQ (soft interrupt) event time. [Examples](tools/softirqs_example.txt).
 - tools/[solisten](tools/solisten.py): Trace TCP socket listen. [Examples](tools/solisten_example.txt).
diff --git a/SPECS/bcc.spec b/SPECS/bcc.spec
index f74bb61..691ab3d 100644
--- a/SPECS/bcc.spec
+++ b/SPECS/bcc.spec
@@ -88,10 +88,6 @@
 %install
 pushd build
 make install/strip DESTDIR=%{buildroot}
-# mangle shebangs
-find %{buildroot}/usr/share/bcc/{tools,examples} -type f -exec \
-    sed -i -e '1 s|^#!/usr/bin/python$|#!'%{__python}'|' \
-           -e '1 s|^#!/usr/bin/env python$|#!'%{__python}'|' {} \;
 
 %package -n libbcc
 Summary: Shared Library for BPF Compiler Collection (BCC)
diff --git a/cmake/clang_libs.cmake b/cmake/clang_libs.cmake
index 12aa9fd..5ebfaa5 100644
--- a/cmake/clang_libs.cmake
+++ b/cmake/clang_libs.cmake
@@ -2,7 +2,10 @@
 set(llvm_libs "LLVM")
 else()
 set(llvm_raw_libs bitwriter bpfcodegen debuginfodwarf irreader linker
-  mcjit objcarcopts option passes nativecodegen lto)
+  mcjit objcarcopts option passes lto)
+if(ENABLE_LLVM_NATIVECODEGEN)
+set(llvm_raw_libs ${llvm_raw_libs} nativecodegen)
+endif()
 list(FIND LLVM_AVAILABLE_LIBS "LLVMCoverage" _llvm_coverage)
 if (${_llvm_coverage} GREATER -1)
   list(APPEND llvm_raw_libs coverage)
diff --git a/debian/changelog b/debian/changelog
index 1f5be87..0f0e89b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+bcc (0.8.0-1) unstable; urgency=low
+
+  * Support for kernel up to 5.0
+
+ -- Brenden Blanco <bblanco@gmail.com>  Fri, 11 Jan 2019 17:00:00 +0000
+
 bcc (0.7.0-1) unstable; urgency=low
 
   * Support for kernel up to 4.18
diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md
index af42c8f..48e6d2e 100644
--- a/docs/kernel-versions.md
+++ b/docs/kernel-versions.md
@@ -181,12 +181,13 @@
 `BPF_FUNC_lwt_seg6_store_bytes()` | 4.18 |  | [`fe94cc290f53`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=fe94cc290f535709d3c5ebd1e472dfd0aec7ee79)
 `BPF_FUNC_map_delete_elem()` | 3.19 |  | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
 `BPF_FUNC_map_lookup_elem()` | 3.19 |  | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
-`BPF_FUNC_map_peek_elem()` | 3.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
-`BPF_FUNC_map_pop_elem()` | 3.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
-`BPF_FUNC_map_push_elem()` | 3.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
+`BPF_FUNC_map_peek_elem()` | 4.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
+`BPF_FUNC_map_pop_elem()` | 4.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
+`BPF_FUNC_map_push_elem()` | 4.20 |  | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92)
 `BPF_FUNC_map_update_elem()` | 3.19 |  | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5)
 `BPF_FUNC_msg_apply_bytes()` | 4.17 |  | [`2a100317c9eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a100317c9ebc204a166f16294884fbf9da074ce)
 `BPF_FUNC_msg_cork_bytes()` | 4.17 |  | [`91843d540a13`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91843d540a139eb8070bcff8aa10089164436deb)
+`BPF_FUNC_msg_pop_data()` | 5.0 |  | [`7246d8ed4dcc`](https://github.com/torvalds/linux/commit/7246d8ed4dcce23f7509949a77be15fa9f0e3d28)
 `BPF_FUNC_msg_pull_data()` | 4.17 |  | [`015632bb30da`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=015632bb30daaaee64e1bcac07570860e0bf3092)
 `BPF_FUNC_msg_push_data()` | 4.20 |  | [`6fff607e2f14`](https://github.com/torvalds/linux/commit/6fff607e2f14bd7c63c06c464a6f93b8efbabe28)
 `BPF_FUNC_msg_redirect_hash()` | 4.18 |  | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4)
@@ -199,6 +200,7 @@
 `BPF_FUNC_probe_read_str()` | 4.11 | GPL | [`a5e8c07059d0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a5e8c07059d0f0b31737408711d44794928ac218)
 `BPF_FUNC_probe_write_user()` | 4.8 | GPL | [`96ae52279594`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=96ae52279594470622ff0585621a13e96b700600)
 `BPF_FUNC_rc_keydown()` | 4.18 | GPL | [`f4364dcfc86d`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936)
+`BPF_FUNC_rc_pointer_rel()` | 5.0 | GPL | [`01d3240a04f4`](https://github.com/torvalds/linux/commit/01d3240a04f4c09392e13c77b54d4423ebce2d72)
 `BPF_FUNC_rc_repeat()` | 4.18 | GPL | [`f4364dcfc86d`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936)
 `BPF_FUNC_redirect()` | 4.4 |  | [`27b29f63058d`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=27b29f63058d26c6c1742f1993338280d5a41dc6)
 `BPF_FUNC_redirect_map()` | 4.14 |  | [`97f91a7cf04f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=97f91a7cf04ff605845c20948b8a80e54cbd3376)
@@ -272,11 +274,11 @@
 |`BPF_PROG_TYPE_SOCK_OPS`|`BPF_FUNC_setsockopt()` <br> `BPF_FUNC_getsockopt()` <br> `BPF_FUNC_sock_ops_cb_flags_set()` <br> `BPF_FUNC_sock_map_update()` <br> `BPF_FUNC_sock_hash_update()` <br> `BPF_FUNC_get_socket_cookie()` <br> `Base functions`|
 |`BPF_PROG_TYPE_SK_SKB`|`BPF_FUNC_skb_store_bytes()` <br> `BPF_FUNC_skb_load_bytes()` <br> `BPF_FUNC_skb_pull_data()` <br> `BPF_FUNC_skb_change_tail()` <br> `BPF_FUNC_skb_change_head()` <br> `BPF_FUNC_get_socket_cookie()` <br> `BPF_FUNC_get_socket_uid()` <br> `BPF_FUNC_sk_redirect_map()` <br> `BPF_FUNC_sk_redirect_hash()` <br> `BPF_FUNC_sk_lookup_tcp()` <br> `BPF_FUNC_sk_lookup_udp()` <br> `BPF_FUNC_sk_release()` <br> `Base functions`|
 |`BPF_PROG_TYPE_CGROUP_DEVICE`|`BPF_FUNC_map_lookup_elem()` <br> `BPF_FUNC_map_update_elem()` <br> `BPF_FUNC_map_delete_elem()` <br> `BPF_FUNC_get_current_uid_gid()` <br> `BPF_FUNC_trace_printk()`|
-|`BPF_PROG_TYPE_SK_MSG`|`BPF_FUNC_msg_redirect_map()` <br> `BPF_FUNC_msg_redirect_hash()` <br> `BPF_FUNC_msg_apply_bytes()` <br> `BPF_FUNC_msg_cork_bytes()` <br> `BPF_FUNC_msg_pull_data()` <br> `BPF_FUNC_msg_push_data()` <br> `Base functions`|
+|`BPF_PROG_TYPE_SK_MSG`|`BPF_FUNC_msg_redirect_map()` <br> `BPF_FUNC_msg_redirect_hash()` <br> `BPF_FUNC_msg_apply_bytes()` <br> `BPF_FUNC_msg_cork_bytes()` <br> `BPF_FUNC_msg_pull_data()` <br> `BPF_FUNC_msg_push_data()` <br> `BPF_FUNC_msg_pop_data()` <br> `Base functions`|
 |`BPF_PROG_TYPE_RAW_TRACEPOINT`|`BPF_FUNC_perf_event_output()` <br> `BPF_FUNC_get_stackid()` <br> `BPF_FUNC_get_stack()` <br> `Tracing functions`|
 |`BPF_PROG_TYPE_CGROUP_SOCK_ADDR`|`BPF_FUNC_get_current_uid_gid()` <br> `BPF_FUNC_bind()` <br> `BPF_FUNC_get_socket_cookie()` <br> `Base functions`|
 |`BPF_PROG_TYPE_LWT_SEG6LOCAL`|`BPF_FUNC_lwt_seg6_store_bytes()` <br> `BPF_FUNC_lwt_seg6_action()` <br> `BPF_FUNC_lwt_seg6_adjust_srh()` <br> `LWT functions`|
-|`BPF_PROG_TYPE_LIRC_MODE2`|`BPF_FUNC_rc_repeat()` <br> `BPF_FUNC_rc_keydown()` <br> `BPF_FUNC_map_lookup_elem()` <br> `BPF_FUNC_map_update_elem()` <br> `BPF_FUNC_map_delete_elem()` <br> `BPF_FUNC_ktime_get_ns()` <br> `BPF_FUNC_tail_call()` <br> `BPF_FUNC_get_prandom_u32()` <br> `BPF_FUNC_trace_printk()`|
+|`BPF_PROG_TYPE_LIRC_MODE2`|`BPF_FUNC_rc_repeat()` <br> `BPF_FUNC_rc_keydown()` <br> `BPF_FUNC_rc_pointer_rel()` <br> `BPF_FUNC_map_lookup_elem()` <br> `BPF_FUNC_map_update_elem()` <br> `BPF_FUNC_map_delete_elem()` <br> `BPF_FUNC_ktime_get_ns()` <br> `BPF_FUNC_tail_call()` <br> `BPF_FUNC_get_prandom_u32()` <br> `BPF_FUNC_trace_printk()`|
 |`BPF_PROG_TYPE_SK_REUSEPORT`|`BPF_FUNC_sk_select_reuseport()` <br> `BPF_FUNC_skb_load_bytes()` <br> `BPF_FUNC_load_bytes_relative()` <br> `Base functions`|
 |`BPF_PROG_TYPE_FLOW_DISSECTOR`|`BPF_FUNC_skb_load_bytes()` <br> `Base functions`|
 
diff --git a/docs/reference_guide.md b/docs/reference_guide.md
index a90cf31..79d5103 100644
--- a/docs/reference_guide.md
+++ b/docs/reference_guide.md
@@ -433,11 +433,11 @@
 
 ### 1. bpf_trace_printk()
 
-Syntax: ```int bpf_trace_printk(const char *fmt, int fmt_size, ...)```
+Syntax: ```int bpf_trace_printk(const char *fmt, ...)```
 
 Return: 0 on success
 
-A simple kernel facility for printf() to the common trace_pipe (/sys/kernel/debug/tracing/trace_pipe). This is ok for some quick examples, but has limitations: 3 args max, 1 %s only, and trace_pipe is globally shared, so concurrent programs will have clashing output. A better interface is via BPF_PERF_OUTPUT().
+A simple kernel facility for printf() to the common trace_pipe (/sys/kernel/debug/tracing/trace_pipe). This is ok for some quick examples, but has limitations: 3 args max, 1 %s only, and trace_pipe is globally shared, so concurrent programs will have clashing output. A better interface is via BPF_PERF_OUTPUT(). Note that in bcc this helper is simpler to call than the original kernel version, which takes ```fmt_size``` as the second parameter.
 
 Examples in situ:
 [search /examples](https://github.com/iovisor/bcc/search?q=bpf_trace_printk+path%3Aexamples&type=Code),
diff --git a/docs/tutorial.md b/docs/tutorial.md
index e00c79d..09de4a2 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -63,7 +63,7 @@
 
 ```
 # ./opensnoop
-PID    COMM      FD ERR PATH
+PID    COMM               FD ERR PATH
 1565   redis-server        5   0 /proc/1565/stat
 1565   redis-server        5   0 /proc/1565/stat
 1565   redis-server        5   0 /proc/1565/stat
diff --git a/docs/tutorial_bcc_python_developer.md b/docs/tutorial_bcc_python_developer.md
index 40d4985..192902e 100644
--- a/docs/tutorial_bcc_python_developer.md
+++ b/docs/tutorial_bcc_python_developer.md
@@ -67,14 +67,14 @@
 # define BPF program
 prog = """
 int hello(void *ctx) {
-	bpf_trace_printk("Hello, World!\\n");
-	return 0;
+    bpf_trace_printk("Hello, World!\\n");
+    return 0;
 }
 """
 
 # load BPF program
 b = BPF(text=prog)
-b.attach_kprobe(event="sys_clone", fn_name="hello")
+b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")
 
 # header
 print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE"))
@@ -94,7 +94,7 @@
 
 1. ```hello()```: Now we're just declaring a C function, instead of the ```kprobe__``` shortcut. We'll refer to this later. All C functions declared in the BPF program are expected to be executed on a probe, hence they all need to take a ```pt_reg* ctx``` as first argument. If you need to define some helper function that will not be executed on a probe, they need to be defined as ```static inline``` in order to be inlined by the compiler. Sometimes you would also need to add ```_always_inline``` function attribute to it.
 
-1. ```b.attach_kprobe(event="sys_clone", fn_name="hello")```: Creates a kprobe for the sys_clone() kernel function, which will execute our defined hello() function. You can call attach_kprobe() more than once, and attach your C function to multiple kernel functions.
+1. ```b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")```: Creates a kprobe for the kernel clone system call function, which will execute our defined hello() function. You can call attach_kprobe() more than once, and attach your C function to multiple kernel functions.
 
 1. ```b.trace_fields()```: Returns a fixed set of fields from trace_pipe. Similar to trace_print(), this is handy for hacking, but for real tooling we should switch to BPF_PERF_OUTPUT().
 
@@ -114,37 +114,37 @@
 This program is [examples/tracing/sync_timing.py](../examples/tracing/sync_timing.py):
 
 ```Python
+from __future__ import print_function
 from bcc import BPF
 
 # load BPF program
 b = BPF(text="""
 #include <uapi/linux/ptrace.h>
-#include <linux/blkdev.h>
 
 BPF_HASH(last);
 
 int do_trace(struct pt_regs *ctx) {
-	u64 ts, *tsp, delta, key = 0;
+    u64 ts, *tsp, delta, key = 0;
 
-	// attempt to read stored timestamp
-	tsp = last.lookup(&key);
-	if (tsp != 0) {
-		delta = bpf_ktime_get_ns() - *tsp;
-		if (delta < 1000000000) {
-			// output if time is less than 1 second
-			bpf_trace_printk("%d\\n", delta / 1000000);
-		}
-		last.delete(&key);
-	}
+    // attempt to read stored timestamp
+    tsp = last.lookup(&key);
+    if (tsp != 0) {
+        delta = bpf_ktime_get_ns() - *tsp;
+        if (delta < 1000000000) {
+            // output if time is less than 1 second
+            bpf_trace_printk("%d\\n", delta / 1000000);
+        }
+        last.delete(&key);
+    }
 
-	// update stored timestamp
-	ts = bpf_ktime_get_ns();
-	last.update(&key, &ts);
-	return 0;
+    // update stored timestamp
+    ts = bpf_ktime_get_ns();
+    last.update(&key, &ts);
+    return 0;
 }
 """)
 
-b.attach_kprobe(event="sys_sync", fn_name="do_trace")
+b.attach_kprobe(event=b.get_syscall_fnname("sync"), fn_name="do_trace")
 print("Tracing for quick sync's... Ctrl-C to end")
 
 # format output
@@ -168,7 +168,7 @@
 
 ### Lesson 5. sync_count.py
 
-Modify the sync_timing.py program (prior lesson) to store the count of all sys_sync() calls (both fast and slow), and print it with the output. This count can be recorded in the BPF program by adding a new key index to the existing hash.
+Modify the sync_timing.py program (prior lesson) to store the count of all kernel sync system calls (both fast and slow), and print it with the output. This count can be recorded in the BPF program by adding a new key index to the existing hash.
 
 ### Lesson 6. disksnoop.py
 
@@ -211,7 +211,7 @@
 	if (tsp != 0) {
 		delta = bpf_ktime_get_ns() - *tsp;
 		bpf_trace_printk("%d %x %d\\n", req->__data_len,
-			req->cmd_flags, delta / 1000);
+		    req->cmd_flags, delta / 1000);
 		start.delete(&req);
 	}
 }
@@ -258,33 +258,33 @@
 
 // define output data structure in C
 struct data_t {
-	u32 pid;
-	u64 ts;
-	char comm[TASK_COMM_LEN];
+    u32 pid;
+    u64 ts;
+    char comm[TASK_COMM_LEN];
 };
 BPF_PERF_OUTPUT(events);
 
 int hello(struct pt_regs *ctx) {
-	struct data_t data = {};
+    struct data_t data = {};
 
-	data.pid = bpf_get_current_pid_tgid();
-	data.ts = bpf_ktime_get_ns();
-	bpf_get_current_comm(&data.comm, sizeof(data.comm));
+    data.pid = bpf_get_current_pid_tgid();
+    data.ts = bpf_ktime_get_ns();
+    bpf_get_current_comm(&data.comm, sizeof(data.comm));
 
-	events.perf_submit(ctx, &data, sizeof(data));
+    events.perf_submit(ctx, &data, sizeof(data));
 
-	return 0;
+    return 0;
 }
 """
 
 # load BPF program
 b = BPF(text=prog)
-b.attach_kprobe(event="sys_clone", fn_name="hello")
+b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello")
 
 # define output data structure in Python
 TASK_COMM_LEN = 16    # linux/sched.h
 class Data(ct.Structure):
-    _fields_ = [("pid", ct.c_ulonglong),
+    _fields_ = [("pid", ct.c_uint),
                 ("ts", ct.c_ulonglong),
                 ("comm", ct.c_char * TASK_COMM_LEN)]
 
@@ -349,6 +349,7 @@
 Code is [examples/tracing/bitehist.py](../examples/tracing/bitehist.py):
 
 ```Python
+from __future__ import print_function
 from bcc import BPF
 from time import sleep
 
@@ -371,9 +372,9 @@
 
 # trace until Ctrl-C
 try:
-    sleep(99999999)
+	sleep(99999999)
 except KeyboardInterrupt:
-    print
+	print()
 
 # output
 b["dist"].print_log2_hist("kbytes")
@@ -462,15 +463,16 @@
 Hah! I caught smtp by accident. Code is [examples/tracing/urandomread.py](../examples/tracing/urandomread.py):
 
 ```Python
+from __future__ import print_function
 from bcc import BPF
 
 # load BPF program
 b = BPF(text="""
 TRACEPOINT_PROBE(random, urandom_read) {
-	// args is from /sys/kernel/debug/tracing/events/random/urandom_read/format
-	bpf_trace_printk("%d\\n", args->got_bits);
-	return 0;
-};
+    // args is from /sys/kernel/debug/tracing/events/random/urandom_read/format
+    bpf_trace_printk("%d\\n", args->got_bits);
+    return 0;
+}
 """)
 
 # header
@@ -544,6 +546,7 @@
 Code is [examples/tracing/strlen_count.py](../examples/tracing/strlen_count.py):
 
 ```Python
+from __future__ import print_function
 from bcc import BPF
 from time import sleep
 
@@ -552,24 +555,22 @@
 #include <uapi/linux/ptrace.h>
 
 struct key_t {
-	char c[80];
+    char c[80];
 };
 BPF_HASH(counts, struct key_t);
 
 int count(struct pt_regs *ctx) {
-  if (!PT_REGS_PARM1(ctx))
+    if (!PT_REGS_PARM1(ctx))
+        return 0;
+
+    struct key_t key = {};
+    u64 zero = 0, *val;
+
+    bpf_probe_read(&key.c, sizeof(key.c), (void *)PT_REGS_PARM1(ctx));
+    // could also use `counts.increment(key)`
+    val = counts.lookup_or_init(&key, &zero);
+    (*val)++;
     return 0;
-
-  struct key_t key = {};
-  u64 zero = 0, *val;
-
-  bpf_probe_read(&key.c, sizeof(key.c), (void *)PT_REGS_PARM1(ctx));
-
-  // another possibility is using `counts.increment(key);`. It allows a second
-  //   optional parameter to specify the increment step
-  val = counts.lookup_or_init(&key, &zero);
-  (*val)++;
-  return 0;
 };
 """)
 b.attach_uprobe(name="c", sym="strlen", fn_name="count")
@@ -610,27 +611,35 @@
 Relevant code from [examples/tracing/nodejs_http_server.py](../examples/tracing/nodejs_http_server.py):
 
 ```Python
+from __future__ import print_function
+from bcc import BPF, USDT
+import sys
+
 if len(sys.argv) < 2:
-	print("USAGE: nodejs_http_server PID")
-	exit()
+    print("USAGE: nodejs_http_server PID")
+    exit()
 pid = sys.argv[1]
+debug = 0
 
 # load BPF program
 bpf_text = """
 #include <uapi/linux/ptrace.h>
 int do_trace(struct pt_regs *ctx) {
-	uint64_t addr;
-	char path[128];
-	bpf_usdt_readarg(6, ctx, &addr);
-	bpf_probe_read(&path, sizeof(path), (void *)addr);
-	bpf_trace_printk("path:%s\\n", path);
-	return 0;
+    uint64_t addr;
+    char path[128]={0};
+    bpf_usdt_readarg(6, ctx, &addr);
+    bpf_probe_read(&path, sizeof(path), (void *)addr);
+    bpf_trace_printk("path:%s\\n", path);
+    return 0;
 };
 """
 
 # enable USDT probe from given PID
 u = USDT(pid=int(pid))
 u.enable_probe(probe="http__server__request", fn_name="do_trace")
+if debug:
+    print(u.get_text())
+    print(bpf_text)
 
 # initialize BPF
 b = BPF(text=bpf_text, usdt_contexts=[u])
@@ -674,9 +683,6 @@
 };
 // map_type, key_type, leaf_type, table_name, num_entry
 BPF_HASH(stats, struct key_t, u64, 1024);
-// attach to finish_task_switch in kernel/sched/core.c, which has the following
-// prototype:
-//   struct rq *finish_task_switch(struct task_struct *prev)
 int count_sched(struct pt_regs *ctx, struct task_struct *prev) {
   struct key_t key = {};
   u64 zero = 0, *val;
@@ -684,8 +690,7 @@
   key.curr_pid = bpf_get_current_pid_tgid();
   key.prev_pid = prev->pid;
 
-  // another possibility is using `counts.increment(key);`. It allows a second
-  //   optional parameter to specify the increment step
+  // could also use `stats.increment(key);`
   val = stats.lookup_or_init(&key, &zero);
   (*val)++;
   return 0;
diff --git a/examples/cpp/RandomRead.cc b/examples/cpp/RandomRead.cc
index 5e0609a..7b42626 100644
--- a/examples/cpp/RandomRead.cc
+++ b/examples/cpp/RandomRead.cc
@@ -117,6 +117,12 @@
     return 1;
   }
 
+  // done with all initial work, free bcc memory
+  if (bpf->free_bcc_memory()) {
+    std::cerr << "Failed to free llvm/clang memory" << std::endl;
+    return 1;
+  }
+
   signal(SIGINT, signal_handler);
   std::cout << "Started tracing, hit Ctrl-C to terminate." << std::endl;
   while (true)
diff --git a/examples/networking/dns_matching/dns_matching.py b/examples/networking/dns_matching/dns_matching.py
index 943dca5..c8625cd 100755
--- a/examples/networking/dns_matching/dns_matching.py
+++ b/examples/networking/dns_matching/dns_matching.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 
 from __future__ import print_function
 from bcc import BPF
diff --git a/examples/networking/http_filter/http-parse-complete.c b/examples/networking/http_filter/http-parse-complete.c
index dff16b9..8c6dbf8 100644
--- a/examples/networking/http_filter/http-parse-complete.c
+++ b/examples/networking/http_filter/http-parse-complete.c
@@ -83,7 +83,7 @@
 	//e.g. tcp->offset = 5 ; TCP Header Length = 5 x 4 byte = 20 byte
 	tcp_header_length = tcp->offset << 2; //SHL 2 -> *4 multiply
 
-	//calculate patload offset and length
+	//calculate payload offset and length
 	payload_offset = ETH_HLEN + ip_header_length + tcp_header_length;
 	payload_length = ip->tlen - ip_header_length - tcp_header_length;
 
@@ -99,11 +99,8 @@
 	//direct access to skb not allowed
 	unsigned long p[7];
 	int i = 0;
-	int j = 0;
-	const int last_index = payload_offset + 7;
-	for (i = payload_offset ; i < last_index ; i++) {
-		p[j] = load_byte(skb , i);
-		j++;
+	for (i = 0; i < 7; i++) {
+		p[i] = load_byte(skb , payload_offset + i);
 	}
 
 	//find a match with an HTTP message
diff --git a/examples/networking/http_filter/http-parse-complete.py b/examples/networking/http_filter/http-parse-complete.py
index f1e5e0a..1218cb2 100644
--- a/examples/networking/http_filter/http-parse-complete.py
+++ b/examples/networking/http_filter/http-parse-complete.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 #Bertrone Matteo - Polytechnic of Turin
 #November 2015
diff --git a/examples/networking/http_filter/http-parse-simple.c b/examples/networking/http_filter/http-parse-simple.c
index b4e49cc..6ec5302 100644
--- a/examples/networking/http_filter/http-parse-simple.c
+++ b/examples/networking/http_filter/http-parse-simple.c
@@ -54,7 +54,7 @@
 	//e.g. tcp->offset = 5 ; TCP Header Length = 5 x 4 byte = 20 byte
 	tcp_header_length = tcp->offset << 2; //SHL 2 -> *4 multiply
 
-	//calculate patload offset and length
+	//calculate payload offset and length
 	payload_offset = ETH_HLEN + ip_header_length + tcp_header_length;
 	payload_length = ip->tlen - ip_header_length - tcp_header_length;
 
@@ -70,11 +70,8 @@
 	//direct access to skb not allowed
 	unsigned long p[7];
 	int i = 0;
-	int j = 0;
-	const int last_index = payload_offset + 7;
-	for (i = payload_offset ; i < last_index ; i++) {
-		p[j] = load_byte(skb , i);
-		j++;
+	for (i = 0; i < 7; i++) {
+		p[i] = load_byte(skb , payload_offset + i);
 	}
 
 	//find a match with an HTTP message
diff --git a/examples/networking/http_filter/http-parse-simple.py b/examples/networking/http_filter/http-parse-simple.py
index b702393..1fad0d8 100644
--- a/examples/networking/http_filter/http-parse-simple.py
+++ b/examples/networking/http_filter/http-parse-simple.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 #Bertrone Matteo - Polytechnic of Turin
 #November 2015
diff --git a/examples/networking/simple_tc.py b/examples/networking/simple_tc.py
index ec0a3e7..4dd8aa5 100755
--- a/examples/networking/simple_tc.py
+++ b/examples/networking/simple_tc.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # Copyright (c) PLUMgrid, Inc.
 # Licensed under the Apache License, Version 2.0 (the "License")
 
diff --git a/examples/networking/vlan_filter/data-plane-tracing.py b/examples/networking/vlan_filter/data-plane-tracing.py
index efaa7f1..975552f 100755
--- a/examples/networking/vlan_filter/data-plane-tracing.py
+++ b/examples/networking/vlan_filter/data-plane-tracing.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 from __future__ import print_function
 from bcc import BPF
 
diff --git a/examples/networking/xdp/xdp_drop_count.py b/examples/networking/xdp/xdp_drop_count.py
index ff0af0f..f04cb15 100755
--- a/examples/networking/xdp/xdp_drop_count.py
+++ b/examples/networking/xdp/xdp_drop_count.py
@@ -96,23 +96,18 @@
 
     h_proto = eth->h_proto;
 
-    if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
-        struct vlan_hdr *vhdr;
+    // parse double vlans
+    #pragma unroll
+    for (int i=0; i<2; i++) {
+        if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+            struct vlan_hdr *vhdr;
 
-        vhdr = data + nh_off;
-        nh_off += sizeof(struct vlan_hdr);
-        if (data + nh_off > data_end)
-            return rc;
-            h_proto = vhdr->h_vlan_encapsulated_proto;
-    }
-    if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
-        struct vlan_hdr *vhdr;
-
-        vhdr = data + nh_off;
-        nh_off += sizeof(struct vlan_hdr);
-        if (data + nh_off > data_end)
-            return rc;
-            h_proto = vhdr->h_vlan_encapsulated_proto;
+            vhdr = data + nh_off;
+            nh_off += sizeof(struct vlan_hdr);
+            if (data + nh_off > data_end)
+                return rc;
+                h_proto = vhdr->h_vlan_encapsulated_proto;
+        }
     }
 
     if (h_proto == htons(ETH_P_IP))
diff --git a/examples/networking/xdp/xdp_macswap_count.py b/examples/networking/xdp/xdp_macswap_count.py
new file mode 100755
index 0000000..145d004
--- /dev/null
+++ b/examples/networking/xdp/xdp_macswap_count.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+#
+# xdp_macswap_count.py Swap Source and Destination MAC addresses on
+#                      incoming packets and transmit packets back on
+#                      same interface in XDP layer and count for which
+#                      protocol type
+#
+# Copyright (c) 2016 PLUMgrid
+# Copyright (c) 2016 Jan Ruth
+# Copyright (c) 2018 Andy Gospodarek
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+import pyroute2
+import time
+import sys
+
+# Flags handed to attach_xdp()/remove_xdp(); "-S" ORs in XDP_FLAGS_SKB_MODE.
+flags = 0
+def usage():
+    """Print the command-line synopsis and exit with status 1."""
+    print("Usage: {0} [-S] <ifdev>".format(sys.argv[0]))
+    print("       -S: use skb mode\n")
+    print("e.g.: {0} eth0\n".format(sys.argv[0]))
+    sys.exit(1)
+
+if len(sys.argv) < 2 or len(sys.argv) > 3:
+    usage()
+
+if len(sys.argv) == 2:
+    device = sys.argv[1]
+
+if len(sys.argv) == 3:
+    if "-S" in sys.argv:
+        # XDP_FLAGS_SKB_MODE
+        flags |= 2 << 0
+
+    # The flag may come before or after the interface name.
+    if "-S" == sys.argv[1]:
+        device = sys.argv[2]
+    else:
+        device = sys.argv[1]
+
+# Program type to load; swap the two lines to attach via tc instead of XDP.
+mode = BPF.XDP
+#mode = BPF.SCHED_CLS
+
+# Default verdict (RETURNCODE) and context struct (CTXTYPE) that are passed
+# to the C program below as -D compiler flags.
+if mode == BPF.XDP:
+    ret = "XDP_TX"
+    ctxtype = "xdp_md"
+else:
+    ret = "TC_ACT_SHOT"
+    ctxtype = "__sk_buff"
+
+# load BPF program
+b = BPF(text = """
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+
+// per-CPU packet counters, indexed by IP protocol number
+BPF_TABLE("percpu_array", uint32_t, long, dropcnt, 256);
+
+// return the IPv4 protocol field, or 0 if the header runs past data_end
+static inline int parse_ipv4(void *data, u64 nh_off, void *data_end) {
+    struct iphdr *iph = data + nh_off;
+
+    if ((void*)&iph[1] > data_end)
+        return 0;
+    return iph->protocol;
+}
+
+// return the IPv6 next-header field, or 0 if the header runs past data_end
+static inline int parse_ipv6(void *data, u64 nh_off, void *data_end) {
+    struct ipv6hdr *ip6h = data + nh_off;
+
+    if ((void*)&ip6h[1] > data_end)
+        return 0;
+    return ip6h->nexthdr;
+}
+
+// exchange the first two 6-byte fields at data (destination and source MAC
+// of an Ethernet frame), copied as three 16-bit words each
+static void swap_src_dst_mac(void *data)
+{
+    unsigned short *p = data;
+    unsigned short dst[3];
+
+    dst[0] = p[0];
+    dst[1] = p[1];
+    dst[2] = p[2];
+    p[0] = p[3];
+    p[1] = p[4];
+    p[2] = p[5];
+    p[3] = dst[0];
+    p[4] = dst[1];
+    p[5] = dst[2];
+}
+
+int xdp_prog1(struct CTXTYPE *ctx) {
+
+    void* data_end = (void*)(long)ctx->data_end;
+    void* data = (void*)(long)ctx->data;
+
+    struct ethhdr *eth = data;
+
+    // default verdict, set at compile time through -DRETURNCODE
+    int rc = RETURNCODE;
+    long *value;
+    uint16_t h_proto;
+    uint64_t nh_off = 0;
+    uint32_t index;
+
+    nh_off = sizeof(*eth);
+
+    if (data + nh_off  > data_end)
+        return rc;
+
+    h_proto = eth->h_proto;
+
+    // parse double vlans
+    #pragma unroll
+    for (int i=0; i<2; i++) {
+        if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+            struct vlan_hdr *vhdr;
+
+            vhdr = data + nh_off;
+            nh_off += sizeof(struct vlan_hdr);
+            if (data + nh_off > data_end)
+                return rc;
+            h_proto = vhdr->h_vlan_encapsulated_proto;
+        }
+    }
+
+    // index becomes the IP protocol number (0 if not IP or truncated)
+    if (h_proto == htons(ETH_P_IP))
+        index = parse_ipv4(data, nh_off, data_end);
+    else if (h_proto == htons(ETH_P_IPV6))
+        index = parse_ipv6(data, nh_off, data_end);
+    else
+        index = 0;
+
+    // h_proto is the ethertype; the IP protocol number lives in index
+    if (index == IPPROTO_UDP) {
+        swap_src_dst_mac(data);
+        rc = XDP_TX;
+    }
+
+    value = dropcnt.lookup(&index);
+    if (value)
+        *value += 1;
+
+    return rc;
+}
+""", cflags=["-w", "-DRETURNCODE=%s" % ret, "-DCTXTYPE=%s" % ctxtype])
+
+fn = b.load_func("xdp_prog1", mode)
+
+if mode == BPF.XDP:
+    b.attach_xdp(device, fn, flags)
+else:
+    ip = pyroute2.IPRoute()
+    ipdb = pyroute2.IPDB(nl=ip)
+    idx = ipdb.interfaces[device].index
+    ip.tc("add", "clsact", idx)
+    ip.tc("add-filter", "bpf", idx, ":1", fd=fn.fd, name=fn.name,
+          parent="ffff:fff2", classid=1, direct_action=True)
+
+dropcnt = b.get_table("dropcnt")
+# counter value last seen per key, used to print per-interval deltas
+prev = [0] * 256
+print("Printing drops per IP protocol-number, hit CTRL+C to stop")
+while 1:
+    try:
+        for k in dropcnt.keys():
+            val = dropcnt.sum(k).value
+            i = k.value
+            if val:
+                delta = val - prev[i]
+                prev[i] = val
+                print("{}: {} pkt/s".format(i, delta))
+        time.sleep(1)
+    except KeyboardInterrupt:
+        print("Removing filter from device")
+        break
+
+# detach the program before exiting
+if mode == BPF.XDP:
+    b.remove_xdp(device, flags)
+else:
+    ip.tc("del", "clsact", idx)
+    ipdb.release()
diff --git a/examples/tracing/bitehist.py b/examples/tracing/bitehist.py
index c8c7f7a..410424b 100755
--- a/examples/tracing/bitehist.py
+++ b/examples/tracing/bitehist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # bitehist.py	Block I/O size histogram.
 #		For Linux, uses BCC, eBPF. Embedded C.
diff --git a/examples/tracing/disksnoop.py b/examples/tracing/disksnoop.py
index ed3dd81..c30ac0a 100755
--- a/examples/tracing/disksnoop.py
+++ b/examples/tracing/disksnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # disksnoop.py	Trace block device I/O: basic version of iosnoop.
 #		For Linux, uses BCC, eBPF. Embedded C.
diff --git a/examples/tracing/mallocstacks.py b/examples/tracing/mallocstacks.py
index 2f3eb25..4820447 100644
--- a/examples/tracing/mallocstacks.py
+++ b/examples/tracing/mallocstacks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # mallocstacks  Trace malloc() calls in a process and print the full
 #               stack trace for all callsites.
diff --git a/examples/tracing/mysqld_query.py b/examples/tracing/mysqld_query.py
index 15ff297..cf877d1 100755
--- a/examples/tracing/mysqld_query.py
+++ b/examples/tracing/mysqld_query.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # mysqld_query    Trace MySQL server queries. Example of USDT tracing.
 #                 For Linux, uses BCC, BPF. Embedded C.
diff --git a/examples/tracing/nodejs_http_server.py b/examples/tracing/nodejs_http_server.py
index 1017de5..367e9d7 100755
--- a/examples/tracing/nodejs_http_server.py
+++ b/examples/tracing/nodejs_http_server.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # nodejs_http_server    Basic example of node.js USDT tracing.
 #                       For Linux, uses BCC, BPF. Embedded C.
diff --git a/examples/tracing/stacksnoop.py b/examples/tracing/stacksnoop.py
index bced93f..d16b59d 100755
--- a/examples/tracing/stacksnoop.py
+++ b/examples/tracing/stacksnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # stacksnoop    Trace a kernel function and print all kernel stack traces.
 #               For Linux, uses BCC, eBPF, and currently x86_64 only. Inline C.
diff --git a/examples/tracing/strlen_count.py b/examples/tracing/strlen_count.py
index 49d7080..103464f 100755
--- a/examples/tracing/strlen_count.py
+++ b/examples/tracing/strlen_count.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # strlen_count  Trace strlen() and print a frequency count of strings.
 #               For Linux, uses BCC, eBPF. Embedded C.
diff --git a/examples/tracing/strlen_hist.py b/examples/tracing/strlen_hist.py
index dda1cb2..4652c4a 100755
--- a/examples/tracing/strlen_hist.py
+++ b/examples/tracing/strlen_hist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 
 #
 # strlen_hist.py   Histogram of system-wide strlen return values
diff --git a/examples/tracing/strlen_snoop.py b/examples/tracing/strlen_snoop.py
index c3c7199..44be1ac 100755
--- a/examples/tracing/strlen_snoop.py
+++ b/examples/tracing/strlen_snoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # strlen_snoop  Trace strlen() library function for a given PID.
 #               For Linux, uses BCC, eBPF. Embedded C.
diff --git a/examples/tracing/sync_timing.py b/examples/tracing/sync_timing.py
index 675ad14..a00bf5a 100755
--- a/examples/tracing/sync_timing.py
+++ b/examples/tracing/sync_timing.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # sync_timing.py    Trace time between syncs.
 #                   For Linux, uses BCC, eBPF. Embedded C.
diff --git a/examples/tracing/task_switch.py b/examples/tracing/task_switch.py
index 161edfb..46d43ba 100755
--- a/examples/tracing/task_switch.py
+++ b/examples/tracing/task_switch.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # Copyright (c) PLUMgrid, Inc.
 # Licensed under the Apache License, Version 2.0 (the "License")
 
diff --git a/examples/tracing/tcpv4connect.py b/examples/tracing/tcpv4connect.py
index 8a89469..5b03717 100755
--- a/examples/tracing/tcpv4connect.py
+++ b/examples/tracing/tcpv4connect.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # tcpv4connect	Trace TCP IPv4 connect()s.
 #		For Linux, uses BCC, eBPF. Embedded C.
diff --git a/examples/tracing/urandomread-explicit.py b/examples/tracing/urandomread-explicit.py
index 448ffdf..7be545a 100755
--- a/examples/tracing/urandomread-explicit.py
+++ b/examples/tracing/urandomread-explicit.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # urandomread-explicit  Example of instrumenting a kernel tracepoint.
 #                       For Linux, uses BCC, BPF. Embedded C.
diff --git a/examples/tracing/urandomread.py b/examples/tracing/urandomread.py
index 319db2c..80ea9de 100755
--- a/examples/tracing/urandomread.py
+++ b/examples/tracing/urandomread.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # urandomread  Example of instrumenting a kernel tracepoint.
 #              For Linux, uses BCC, BPF. Embedded C.
diff --git a/examples/tracing/vfsreadlat.py b/examples/tracing/vfsreadlat.py
index b2c4156..f4daae5 100755
--- a/examples/tracing/vfsreadlat.py
+++ b/examples/tracing/vfsreadlat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # vfsreadlat.py		VFS read latency distribution.
 #			For Linux, uses BCC, eBPF. See .c file.
diff --git a/examples/usdt_sample/scripts/lat_avg.py b/examples/usdt_sample/scripts/lat_avg.py
index be473d1..36c4dbb 100755
--- a/examples/usdt_sample/scripts/lat_avg.py
+++ b/examples/usdt_sample/scripts/lat_avg.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import argparse
 from time import sleep, strftime
 from sys import argv
diff --git a/examples/usdt_sample/scripts/lat_dist.py b/examples/usdt_sample/scripts/lat_dist.py
index af13e89..647f295 100755
--- a/examples/usdt_sample/scripts/lat_dist.py
+++ b/examples/usdt_sample/scripts/lat_dist.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import argparse
 from time import sleep, strftime
 from sys import argv
diff --git a/examples/usdt_sample/scripts/latency.py b/examples/usdt_sample/scripts/latency.py
index 4170592..d46f2ef 100755
--- a/examples/usdt_sample/scripts/latency.py
+++ b/examples/usdt_sample/scripts/latency.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import argparse
 from time import sleep
 from sys import argv
diff --git a/man/man8/biotop.8 b/man/man8/biotop.8
index 8b872aa..ed25521 100644
--- a/man/man8/biotop.8
+++ b/man/man8/biotop.8
@@ -30,9 +30,6 @@
 \-r MAXROWS
 Maximum number of rows to print. Default is 20.
 .TP
-\-p PID
-Trace this PID only.
-.TP
 interval
 Interval between updates, seconds.
 .TP
diff --git a/man/man8/capable.8 b/man/man8/capable.8
index c847ff0..3be7571 100644
--- a/man/man8/capable.8
+++ b/man/man8/capable.8
@@ -2,7 +2,7 @@
 .SH NAME
 capable \- Trace security capability checks (cap_capable()).
 .SH SYNOPSIS
-.B capable [\-h] [\-v] [\-p PID]
+.B capable [\-h] [\-v] [\-p PID] [\-K] [\-U]
 .SH DESCRIPTION
 This traces security capability checks in the kernel, and prints details for
 each call. This can be useful for general debugging, and also security
@@ -19,6 +19,12 @@
 Include non-audit capability checks. These are those deemed not interesting and
 not necessary to audit, such as CAP_SYS_ADMIN checks on memory allocation to
 affect the behavior of overcommit.
+.TP
+\-K
+Include kernel stack traces in the output.
+.TP
+\-U
+Include user-space stack traces in the output.
 .SH EXAMPLES
 .TP
 Trace all capability checks system-wide:
diff --git a/man/man8/dbslower.8 b/man/man8/dbslower.8
index 740fdb6..c21e6fa 100644
--- a/man/man8/dbslower.8
+++ b/man/man8/dbslower.8
@@ -2,7 +2,7 @@
 .SH NAME
 dbslower \- Trace MySQL/PostgreSQL server queries slower than a threshold.
 .SH SYNOPSIS
-.B dbslower [-v] [-p PID [PID ...]] [-m THRESHOLD] {mysql,postgres}
+.B dbslower [-v] [-p PID [PID ...]] [-x PATH] [-m THRESHOLD] {mysql,postgres}
 .SH DESCRIPTION
 This traces queries served by a MySQL or PostgreSQL server, and prints
 those that exceed a latency (query time) threshold. By default a threshold of
@@ -11,6 +11,8 @@
 This uses User Statically-Defined Tracing (USDT) probes, a feature added to
 MySQL and PostgreSQL for DTrace support, but which may not be enabled on a
 given installation. See requirements.
+Alternatively, MySQL queries can be traced without the USDT support using the
+-x option.
 
 Since this uses BPF, only the root user can use this tool.
 .SH REQUIREMENTS
@@ -25,6 +27,10 @@
 Trace this PID. If no PID is specified, the tool will attempt to automatically
 detect the MySQL or PostgreSQL processes running on the system.
 .TP
+\-x PATH
+Path to MySQL binary. This option allows tracing MySQL queries even when USDT
+probes aren't enabled on the MySQL server.
+.TP
 \-m THRESHOLD
 Minimum query latency (duration) to trace, in milliseconds. Default is 1 ms.
 .TP
diff --git a/man/man8/inject.8 b/man/man8/inject.8
index e97613b..0cf729e 100644
--- a/man/man8/inject.8
+++ b/man/man8/inject.8
@@ -3,7 +3,7 @@
 inject \- injects appropriate error into function if input call chain and
 predicates are satisfied. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B inject -h [-I header] [-P probability] [-v] mode spec
+.B inject -h [-I header] [-P probability] [-v] [-C count] mode spec
 .SH DESCRIPTION
 inject injects errors into specified kernel functionality when a given call
 chain and associated predicates are satsified.
@@ -29,6 +29,9 @@
 .TP
 \-P probability
 Optional probability of failure, default 1.
+.TP
+\-C count
+Number of errors to inject before stopping, default never stops.
 .SH EXAMPLES
 Please see inject_example.txt
 .SH SOURCE
diff --git a/man/man8/opensnoop.8 b/man/man8/opensnoop.8
index f7b74c1..9d99a90 100644
--- a/man/man8/opensnoop.8
+++ b/man/man8/opensnoop.8
@@ -27,6 +27,9 @@
 \-T
 Include a timestamp column.
 .TP
+\-U
+Show UID.
+.TP
 \-x
 Only print failed opens.
 .TP
@@ -36,11 +39,20 @@
 \-t TID
 Trace this thread ID only (filtered in-kernel).
 .TP
+\-u UID
+Trace this UID only (filtered in-kernel).
+.TP
 \-d DURATION
 Total duration of trace in seconds.
 .TP
 \-n name
 Only print processes where its name partially matches 'name'
+.TP
+\-e
+Show extended fields.
+.TP
+\-f FLAG
+Filter on open() flags, e.g., O_WRONLY.
 .SH EXAMPLES
 .TP
 Trace all open() syscalls:
@@ -55,6 +67,10 @@
 #
 .B opensnoop \-T
 .TP
+Show UID:
+#
+.B opensnoop \-U
+.TP
 Trace only open() syscalls that failed:
 #
 .B opensnoop \-x
@@ -63,14 +79,29 @@
 #
 .B opensnoop \-p 181
 .TP
+Trace UID 1000 only:
+#
+.B opensnoop \-u 1000
+.TP
 Trace all open() syscalls from processes where its name partially matches 'ed':
 #
 .B opensnoop \-n ed
+.TP
+Show extended fields:
+#
+.B opensnoop \-e
+.TP
+Only print calls for writing:
+#
+.B opensnoop \-f O_WRONLY \-f O_RDWR
 .SH FIELDS
 .TP
 TIME(s)
 Time of the call, in seconds.
 .TP
+UID
+User ID
+.TP
 PID
 Process ID
 .TP
@@ -86,6 +117,9 @@
 ERR
 Error number (see the system's errno.h)
 .TP
+FLAGS
+Flags passed to open(2), in octal
+.TP
 PATH
 Open path
 .SH OVERHEAD
diff --git a/man/man8/shmsnoop.8 b/man/man8/shmsnoop.8
new file mode 100644
index 0000000..390974f
--- /dev/null
+++ b/man/man8/shmsnoop.8
@@ -0,0 +1,74 @@
+.TH shmsnoop 8  "2018-09-24" "USER COMMANDS"
+.SH NAME
+shmsnoop \- Trace System V shared memory syscalls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B shmsnoop [\-h] [\-T] [\-p PID] [\-t TID] [\-d DURATION] [\-n NAME]
+.SH DESCRIPTION
+shmsnoop traces System V shared memory syscalls: shmget, shmat, shmdt, shmctl
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include a timestamp column.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-t TID
+Trace this thread ID only (filtered in-kernel).
+.TP
+\-d DURATION
+Total duration of trace in seconds.
+.TP
+\-n NAME
+Only print command lines matching this command name (regex)
+.SH EXAMPLES
+.TP
+Trace all shm* syscalls:
+#
+.B shmsnoop
+.TP
+Trace all shm* syscalls, and include timestamps:
+#
+.B shmsnoop \-T
+.TP
+Only trace shm* syscalls where the process contains "server":
+#
+.B shmsnoop \-n server
+.SH FIELDS
+.TP
+TIME(s)
+Time of shm syscall return, in seconds.
+.TP
+PID
+Process ID
+.TP
+COMM
+Parent process/command name.
+.TP
+RET
+Return value of shm syscall.
+.TP
+ARGS
+"arg: value" couples that represent given syscall arguments as described in their manpage
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Jiri Olsa
+.SH SEE ALSO
+opensnoop(8)
diff --git a/man/man8/spfdsnoop.8 b/man/man8/spfdsnoop.8
new file mode 100644
index 0000000..ffad57c
--- /dev/null
+++ b/man/man8/spfdsnoop.8
@@ -0,0 +1,85 @@
+.TH sofdsnoop 8  "2018-11-08" "USER COMMANDS"
+.SH NAME
+sofdsnoop \- Trace FDs passed through unix sockets. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B sofdsnoop [-h] [-T] [-p PID] [-t TID] [-n NAME] [-d DURATION]
+.SH DESCRIPTION
+sofdsnoop traces FDs passed through unix sockets
+
+Every file descriptor that is passed via unix sockets is displayed
+on a separate line together with process info (TID/COMM columns),
+ACTION details (SEND/RECV), file descriptor number (FD) and its
+translation to file if available (NAME).
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include a timestamp column.
+.TP
+\-p PID
+Trace this process ID only (filtered in-kernel).
+.TP
+\-t TID
+Trace this thread ID only (filtered in-kernel).
+.TP
+\-d DURATION
+Total duration of trace in seconds.
+.TP
+\-n NAME
+Only print command lines matching this command name (regex)
+.SH EXAMPLES
+.TP
+Trace all sockets:
+#
+.B sofdsnoop
+.TP
+Trace all sockets, and include timestamps:
+#
+.B sofdsnoop \-T
+.TP
+Only trace sockets where the process contains "server":
+#
+.B sofdsnoop \-n server
+.SH FIELDS
+.TP
+TIME(s)
+Time of SEND/RECV actions, in seconds.
+.TP
+ACTION
+Operation on the fd SEND/RECV.
+.TP
+TID
+Process TID
+.TP
+COMM
+Parent process/command name.
+.TP
+SOCKET
+The socket carrier.
+.TP
+FD
+file descriptor number
+.TP
+NAME
+file name for SEND lines
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Jiri Olsa
+.SH SEE ALSO
+opensnoop(8)
diff --git a/man/man8/tcpconnect.8 b/man/man8/tcpconnect.8
index eb1f4ad..60de372 100644
--- a/man/man8/tcpconnect.8
+++ b/man/man8/tcpconnect.8
@@ -32,6 +32,12 @@
 Comma-separated list of destination ports to trace (filtered in-kernel).
+.TP
+\-U
+Include a UID column.
+.TP
+\-u UID
+Trace this UID only (filtered in-kernel).
 .SH EXAMPLES
 .TP
 Trace all active TCP connections:
 #
 .B tcpconnect
@@ -49,9 +55,20 @@
 .B tcpconnect \-P 80,81
+.TP
+Trace all TCP connects, and include UID:
+#
+.B tcpconnect \-U
+.TP
+Trace UID 1000 only:
+#
+.B tcpconnect \-u 1000
 .SH FIELDS
 .TP
 TIME(s)
 Time of the call, in seconds.
 .TP
+UID
+User ID
+.TP
 PID
 Process ID
 .TP
diff --git a/man/man8/tcpstates.8 b/man/man8/tcpstates.8
index b31fd64..d78161b 100644
--- a/man/man8/tcpstates.8
+++ b/man/man8/tcpstates.8
@@ -2,7 +2,7 @@
 .SH NAME
 tcpstates \- Trace TCP session state changes with durations. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B tcpstates [\-h] [\-T] [\-t] [\-w] [\-s] [\-D PORTS] [\-L PORTS]
+.B tcpstates [\-h] [\-T] [\-t] [\-w] [\-s] [\-D PORTS] [\-L PORTS] [\-Y]
 .SH DESCRIPTION
 This tool traces TCP session state changes while tracing, and prints details
 including the duration in each state. This can help explain the latency of
@@ -41,6 +41,9 @@
 .TP
 \-D PORTS
 Comma-separated list of destination ports to trace (filtered in-kernel).
+.TP
+\-Y
+Log session state changes to the systemd journal.
 .SH EXAMPLES
 .TP
 Trace all TCP sessions, and show all state changes:
diff --git a/scripts/style-check.sh b/scripts/c-style-check.sh
similarity index 100%
rename from scripts/style-check.sh
rename to scripts/c-style-check.sh
diff --git a/scripts/py-style-check.sh b/scripts/py-style-check.sh
new file mode 100755
index 0000000..d8c5ece
--- /dev/null
+++ b/scripts/py-style-check.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Style checks for the Python tools and examples: runs pep8 (advisory only) and
+# fails if any executable *.py file lacks the "#!/usr/bin/env python" shebang.
+
+set -euo pipefail
+
+# TODO: stop ignoring this. Maybe autopep8 existing stuff?
+find tools -type f -name "*.py" | xargs pep8 -r --show-source --ignore=E123,E125,E126,E127,E128,E302 || \
+    echo "pep8 run failed, please fix it" >&2
+
+# grep -L lists the files that do NOT contain the shebang. Exit status 1 from
+# grep only means "no match" and is tolerated, so "set -e" does not abort the
+# script before the report below; any other failure still propagates.
+# "-exec ... {} +" keeps paths with whitespace intact and never runs grep
+# without file arguments.
+NO_PROPER_SHEBANG="$(find tools examples -type f -executable -name '*.py' \
+    -exec sh -c 'grep -L "#!/usr/bin/env python" "$@" || [ "$?" -eq 1 ]' sh {} +)"
+if [ -n "$NO_PROPER_SHEBANG" ]; then
+    echo "bad shebangs found:"
+    echo "$NO_PROPER_SHEBANG"
+    echo
+    echo "either add proper shebang or remove executable bit" >&2
+
+    exit 1
+fi
diff --git a/snapcraft/Makefile b/snapcraft/Makefile
index 92d1f6c..2bf9f97 100644
--- a/snapcraft/Makefile
+++ b/snapcraft/Makefile
@@ -29,7 +29,7 @@
 	snapcraft
 
 set_version:
-	cat snapcraft.yaml | sed 's/version: .*/version: $(V)/' > snapcraft-tmp.yaml
+	cat snapcraft.yaml | sed 's/^version: .*/version: $(V)/' > snapcraft-tmp.yaml
 	mv snapcraft-tmp.yaml snapcraft.yaml
 
 install:
diff --git a/snapcraft/snapcraft.yaml b/snapcraft/snapcraft.yaml
index e4acdb2..93a2adc 100644
--- a/snapcraft/snapcraft.yaml
+++ b/snapcraft/snapcraft.yaml
@@ -16,7 +16,7 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 #
 name: bcc
-version: 0.3.0-20170401-1747-c5f48c9
+version: 0.7.0-20181122-2831-166fba57
 summary: BPF Compiler Collection (BCC)
 description: A toolkit for creating efficient kernel tracing and manipulation programs
 confinement: strict
@@ -26,319 +26,14 @@
     system-observe: null
     system-trace: null
 assumes: [snapd2.23]
-apps:
-    argdist:
-        command: wrapper argdist
-        aliases: [argdist]
-    bashreadline:
-        command: wrapper bashreadline
-        aliases: [bashreadline]
-    biolatency:
-        command: wrapper biolatency
-        aliases: [biolatency]
-    biosnoop:
-        command: wrapper biosnoop
-        aliases: [biosnoop]
-    biotop:
-        command: wrapper biotop
-        aliases: [biotop]
-    bitesize:
-        command: wrapper bitesize
-        aliases: [bitesize]
-    bpflist:
-        command: wrapper bpflist
-        aliases: [bpflist]
-    btrfsdist:
-        command: wrapper btrfsdist
-        aliases: [btrfsdist]
-    btrfsslower:
-        command: wrapper btrfsslower
-        aliases: [btrfsslower]
-    cachestat:
-        command: wrapper cachestat
-        aliases: [cachestat]
-    cachetop:
-        command: wrapper cachetop
-        aliases: [cachetop]
-    capable:
-        command: wrapper capable
-        aliases: [capable]
-    cobjnew:
-        command: wrapper cobjnew
-        aliases: [cobjnew]
-    cpudist:
-        command: wrapper cpudist
-        aliases: [cpudist]
-    cpuunclaimed:
-        command: wrapper cpuunclaimed
-        aliases: [cpuunclaimed]
-    dbslower:
-        command: wrapper dbslower
-        aliases: [dbslower]
-    dbstat:
-        command: wrapper dbstat
-        aliases: [dbstat]
-    dcsnoop:
-        command: wrapper dcsnoop
-        aliases: [dcsnoop]
-    dcstat:
-        command: wrapper dcstat
-        aliases: [dcstat]
-    deadlock-detector:
-        command: wrapper deadlock_detector
-        aliases: [deadlock-detector]
-    execsnoop:
-        command: wrapper execsnoop
-        aliases: [execsnoop]
-    ext4dist:
-        command: wrapper ext4dist
-        aliases: [ext4dist]
-    ext4slower:
-        command: wrapper ext4slower
-        aliases: [ext4slower]
-    filelife:
-        command: wrapper filelife
-        aliases: [filelife]
-    fileslower:
-        command: wrapper fileslower
-        aliases: [fileslower]
-    filetop:
-        command: wrapper filetop
-        aliases: [filetop]
-    funccount:
-        command: wrapper funccount
-        aliases: [funccount]
-    funclatency:
-        command: wrapper funclatency
-        aliases: [funclatency]
-    funcslower:
-        command: wrapper funcslower
-        aliases: [funcslower]
-    gethostlatency:
-        command: wrapper gethostlatency
-        aliases: [gethostlatency]
-    hardirqs:
-        command: wrapper hardirqs
-        aliases: [hardirqs]
-    javacalls:
-        command: wrapper javacalls
-        aliases: [javacalls]
-    javaflow:
-        command: wrapper javaflow
-        aliases: [javaflow]
-    javagc:
-        command: wrapper javagc
-        aliases: [javagc]
-    javaobjnew:
-        command: wrapper javaobjnew
-        aliases: [javaobjnew]
-    javastat:
-        command: wrapper javastat
-        aliases: [javastat]
-    javathreads:
-        command: wrapper javathreads
-        aliases: [javathreads]
-    killsnoop:
-        command: wrapper killsnoop
-        aliases: [killsnoop]
-    llcstat:
-        command: wrapper llcstat
-        aliases: [llcstat]
-    mdflush:
-        command: wrapper mdflush
-        aliases: [mdflush]
-    memleak:
-        command: wrapper memleak
-        aliases: [memleak]
-    mountsnoop:
-        command: wrapper mountsnoop
-        aliases: [mountsnoop]
-    mysqld-qslower:
-        command: wrapper mysqld_qslower
-        aliases: [mysqld-qslower]
-    nfsdist:
-        command: wrapper nfsdist
-        aliases: [nfsdist]
-    nfsslower:
-        command: wrapper nfsslower
-        aliases: [nfsslower]
-    nodegc:
-        command: wrapper nodegc
-        aliases: [nodegc]
-    nodestat:
-        command: wrapper nodestat
-        aliases: [nodestat]
-    offcputime:
-        command: wrapper offcputime
-        aliases: [offcputime]
-    offwaketime:
-        command: wrapper offwaketime
-        aliases: [offwaketime]
-    oomkill:
-        command: wrapper oomkill
-        aliases: [oomkill]
-    opensnoop:
-        command: wrapper opensnoop
-        aliases: [opensnoop]
-    perlcalls:
-        command: wrapper perlcalls
-        aliases: [perlcalls]
-    perlflow:
-        command: wrapper perlflow
-        aliases: [perlflow]
-    perlstat:
-        command: wrapper perlstat
-        aliases: [perlstat]
-    phpcalls:
-        command: wrapper phpcalls
-        aliases: [phpcalls]
-    phpflow:
-        command: wrapper phpflow
-        aliases: [phpflow]
-    phpstat:
-        command: wrapper phpstat
-        aliases: [phpstat]
-    pidpersec:
-        command: wrapper pidpersec
-        aliases: [pidpersec]
-    profile:
-        command: wrapper profile
-        aliases: [profile]
-    pythoncalls:
-        command: wrapper pythoncalls
-        aliases: [pythoncalls]
-    pythonflow:
-        command: wrapper pythonflow
-        aliases: [pythonflow]
-    pythongc:
-        command: wrapper pythongc
-        aliases: [pythongc]
-    pythonstat:
-        command: wrapper pythonstat
-        aliases: [pythonstat]
-    rubycalls:
-        command: wrapper rubycalls
-        aliases: [rubycalls]
-    rubyflow:
-        command: wrapper rubyflow
-        aliases: [rubyflow]
-    rubygc:
-        command: wrapper rubygc
-        aliases: [rubygc]
-    rubyobjnew:
-        command: wrapper rubyobjnew
-        aliases: [rubyobjnew]
-    rubystat:
-        command: wrapper rubystat
-        aliases: [rubystat]
-    runqlat:
-        command: wrapper runqlat
-        aliases: [runqlat]
-    runqlen:
-        command: wrapper runqlen
-        aliases: [runqlen]
-    slabratetop:
-        command: wrapper slabratetop
-        aliases: [slabratetop]
-    softirqs:
-        command: wrapper softirqs
-        aliases: [softirqs]
-    solisten:
-        command: wrapper solisten
-        aliases: [solisten]
-    sslsniff:
-        command: wrapper sslsniff
-        aliases: [sslsniff]
-    stackcount:
-        command: wrapper stackcount
-        aliases: [stackcount]
-    stacksnoop:
-        command: wrapper stacksnoop
-        aliases: [stacksnoop]
-    statsnoop:
-        command: wrapper statsnoop
-        aliases: [statsnoop]
-    syncsnoop:
-        command: wrapper syncsnoop
-        aliases: [syncsnoop]
-    syscount:
-        command: wrapper syscount
-        aliases: [syscount]
-    tcpaccept:
-        command: wrapper tcpaccept
-        aliases: [tcpaccept]
-    tcpconnect:
-        command: wrapper tcpconnect
-        aliases: [tcpconnect]
-    tcpconnlat:
-        command: wrapper tcpconnlat
-        aliases: [tcpconnlat]
-    tcplife:
-        command: wrapper tcplife
-        aliases: [tcplife]
-    tcpretrans:
-        command: wrapper tcpretrans
-        aliases: [tcpretrans]
-    tcptop:
-        command: wrapper tcptop
-        aliases: [tcptop]
-    tcptracer:
-        command: wrapper tcptracer
-        aliases: [tcptracer]
-    tplist:
-        command: wrapper tplist
-        aliases: [tplist]
-    trace:
-        command: wrapper trace
-        aliases: [trace]
-    ttysnoop:
-        command: wrapper ttysnoop
-        aliases: [ttysnoop]
-    ucalls:
-        command: wrapper lib/ucalls
-        aliases: [ucalls]
-    uflow:
-        command: wrapper lib/uflow
-        aliases: [uflow]
-    ugc:
-        command: wrapper lib/ugc
-        aliases: [ugc]
-    uobjnew:
-        command: wrapper lib/uobjnew
-        aliases: [uobjnew]
-    ustat:
-        command: wrapper lib/ustat
-        aliases: [ustat]
-    uthreads:
-        command: wrapper lib/uthreads
-        aliases: [uthreads]
-    vfscount:
-        command: wrapper vfscount
-        aliases: [vfscount]
-    vfsstat:
-        command: wrapper vfsstat
-        aliases: [vfsstat]
-    wakeuptime:
-        command: wrapper wakeuptime
-        aliases: [wakeuptime]
-    xfsdist:
-        command: wrapper xfsdist
-        aliases: [xfsdist]
-    xfsslower:
-        command: wrapper xfsslower
-        aliases: [xfsslower]
-    zfsdist:
-        command: wrapper zfsdist
-        aliases: [zfsdist]
-    zfsslower:
-        command: wrapper zfsslower
-        aliases: [zfsslower]
+
 parts:
     bcc:
         plugin: cmake
         configflags:
-            - -DCMAKE_INSTALL_PREFIX=/usr
+            - '-DCMAKE_INSTALL_PREFIX=/usr'
         source: ..
+        source-type: git
         build-packages:
             - bison
             - build-essential
@@ -351,18 +46,228 @@
             - python
             - zlib1g-dev
             - libelf-dev
+            - iperf
         stage-packages:
-            - python
-        snap:
-            - usr/bin/python*
+            - libc6
+        prime:
             - usr/share/bcc/tools
             - usr/lib/*/lib*.so*
             - usr/lib/python2.7
+
             - -usr/share/bcc/tools/doc
-    wrapper:
-        source: .
-        plugin: copy
-        files:
-            wrapper: bin/wrapper
+
+    python-deps:
+        plugin: python
+        python-version: python2
+        stage-packages:
+            - libc6
+
+apps:
+    argdist:
+        command: usr/share/bcc/tools/argdist
+    bashreadline:
+        command: usr/share/bcc/tools/bashreadline
+    biolatency:
+        command: usr/share/bcc/tools/biolatency
+    biosnoop:
+        command: usr/share/bcc/tools/biosnoop
+    biotop:
+        command: usr/share/bcc/tools/biotop
+    bitesize:
+        command: usr/share/bcc/tools/bitesize
+    bpflist:
+        command: usr/share/bcc/tools/bpflist
+    btrfsdist:
+        command: usr/share/bcc/tools/btrfsdist
+    btrfsslower:
+        command: usr/share/bcc/tools/btrfsslower
+    cachestat:
+        command: usr/share/bcc/tools/cachestat
+    cachetop:
+        command: usr/share/bcc/tools/cachetop
+    capable:
+        command: usr/share/bcc/tools/capable
+    cobjnew:
+        command: usr/share/bcc/tools/cobjnew
+    cpudist:
+        command: usr/share/bcc/tools/cpudist
+    cpuunclaimed:
+        command: usr/share/bcc/tools/cpuunclaimed
+    dbslower:
+        command: usr/share/bcc/tools/dbslower
+    dbstat:
+        command: usr/share/bcc/tools/dbstat
+    dcsnoop:
+        command: usr/share/bcc/tools/dcsnoop
+    dcstat:
+        command: usr/share/bcc/tools/dcstat
+    deadlock-detector:
+        command: usr/share/bcc/tools/deadlock_detector
+    execsnoop:
+        command: usr/share/bcc/tools/execsnoop
+    ext4dist:
+        command: usr/share/bcc/tools/ext4dist
+    ext4slower:
+        command: usr/share/bcc/tools/ext4slower
+    filelife:
+        command: usr/share/bcc/tools/filelife
+    fileslower:
+        command: usr/share/bcc/tools/fileslower
+    filetop:
+        command: usr/share/bcc/tools/filetop
+    funccount:
+        command: usr/share/bcc/tools/funccount
+    funclatency:
+        command: usr/share/bcc/tools/funclatency
+    funcslower:
+        command: usr/share/bcc/tools/funcslower
+    gethostlatency:
+        command: usr/share/bcc/tools/gethostlatency
+    hardirqs:
+        command: usr/share/bcc/tools/hardirqs
+    javacalls:
+        command: usr/share/bcc/tools/javacalls
+    javaflow:
+        command: usr/share/bcc/tools/javaflow
+    javagc:
+        command: usr/share/bcc/tools/javagc
+    javaobjnew:
+        command: usr/share/bcc/tools/javaobjnew
+    javastat:
+        command: usr/share/bcc/tools/javastat
+    javathreads:
+        command: usr/share/bcc/tools/javathreads
+    killsnoop:
+        command: usr/share/bcc/tools/killsnoop
+    llcstat:
+        command: usr/share/bcc/tools/llcstat
+    mdflush:
+        command: usr/share/bcc/tools/mdflush
+    memleak:
+        command: usr/share/bcc/tools/memleak
+    mountsnoop:
+        command: usr/share/bcc/tools/mountsnoop
+    mysqld-qslower:
+        command: usr/share/bcc/tools/mysqld_qslower
+    nfsdist:
+        command: usr/share/bcc/tools/nfsdist
+    nfsslower:
+        command: usr/share/bcc/tools/nfsslower
+    nodegc:
+        command: usr/share/bcc/tools/nodegc
+    nodestat:
+        command: usr/share/bcc/tools/nodestat
+    offcputime:
+        command: usr/share/bcc/tools/offcputime
+    offwaketime:
+        command: usr/share/bcc/tools/offwaketime
+    oomkill:
+        command: usr/share/bcc/tools/oomkill
+    opensnoop:
+        command: usr/share/bcc/tools/opensnoop
+    perlcalls:
+        command: usr/share/bcc/tools/perlcalls
+    perlflow:
+        command: usr/share/bcc/tools/perlflow
+    perlstat:
+        command: usr/share/bcc/tools/perlstat
+    shmsnoop:
+        command: usr/share/bcc/tools/shmsnoop
+    sofdsnoop:
+        command: usr/share/bcc/tools/sofdsnoop
+    phpcalls:
+        command: usr/share/bcc/tools/phpcalls
+    phpflow:
+        command: usr/share/bcc/tools/phpflow
+    phpstat:
+        command: usr/share/bcc/tools/phpstat
+    pidpersec:
+        command: usr/share/bcc/tools/pidpersec
+    profile:
+        command: usr/share/bcc/tools/profile
+    pythoncalls:
+        command: usr/share/bcc/tools/pythoncalls
+    pythonflow:
+        command: usr/share/bcc/tools/pythonflow
+    pythongc:
+        command: usr/share/bcc/tools/pythongc
+    pythonstat:
+        command: usr/share/bcc/tools/pythonstat
+    rubycalls:
+        command: usr/share/bcc/tools/rubycalls
+    rubyflow:
+        command: usr/share/bcc/tools/rubyflow
+    rubygc:
+        command: usr/share/bcc/tools/rubygc
+    rubyobjnew:
+        command: usr/share/bcc/tools/rubyobjnew
+    rubystat:
+        command: usr/share/bcc/tools/rubystat
+    runqlat:
+        command: usr/share/bcc/tools/runqlat
+    runqlen:
+        command: usr/share/bcc/tools/runqlen
+    slabratetop:
+        command: usr/share/bcc/tools/slabratetop
+    softirqs:
+        command: usr/share/bcc/tools/softirqs
+    solisten:
+        command: usr/share/bcc/tools/solisten
+    sslsniff:
+        command: usr/share/bcc/tools/sslsniff
+    stackcount:
+        command: usr/share/bcc/tools/stackcount
+    statsnoop:
+        command: usr/share/bcc/tools/statsnoop
+    syncsnoop:
+        command: usr/share/bcc/tools/syncsnoop
+    syscount:
+        command: usr/share/bcc/tools/syscount
+    tcpaccept:
+        command: usr/share/bcc/tools/tcpaccept
+    tcpconnect:
+        command: usr/share/bcc/tools/tcpconnect
+    tcpconnlat:
+        command: usr/share/bcc/tools/tcpconnlat
+    tcplife:
+        command: usr/share/bcc/tools/tcplife
+    tcpretrans:
+        command: usr/share/bcc/tools/tcpretrans
+    tcptop:
+        command: usr/share/bcc/tools/tcptop
+    tcptracer:
+        command: usr/share/bcc/tools/tcptracer
+    tplist:
+        command: usr/share/bcc/tools/tplist
+    trace:
+        command: usr/share/bcc/tools/trace
+    ttysnoop:
+        command: usr/share/bcc/tools/ttysnoop
+    ucalls:
+        command: usr/share/bcc/tools/lib/ucalls
+    uflow:
+        command: usr/share/bcc/tools/lib/uflow
+    ugc:
+        command: usr/share/bcc/tools/lib/ugc
+    uobjnew:
+        command: usr/share/bcc/tools/lib/uobjnew
+    ustat:
+        command: usr/share/bcc/tools/lib/ustat
+    uthreads:
+        command: usr/share/bcc/tools/lib/uthreads
+    vfscount:
+        command: usr/share/bcc/tools/vfscount
+    vfsstat:
+        command: usr/share/bcc/tools/vfsstat
+    wakeuptime:
+        command: usr/share/bcc/tools/wakeuptime
+    xfsdist:
+        command: usr/share/bcc/tools/xfsdist
+    xfsslower:
+        command: usr/share/bcc/tools/xfsslower
+    zfsdist:
+        command: usr/share/bcc/tools/zfsdist
+    zfsslower:
+        command: usr/share/bcc/tools/zfsslower
 
 # vim: set ai et sts=4 tabstop=4 sw=4:
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7daca5b..37b7e28 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,7 +2,12 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+
+if (ENABLE_RTTI)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -frtti")
+else()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
+endif()
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt
index fda165d..242c420 100644
--- a/src/cc/CMakeLists.txt
+++ b/src/cc/CMakeLists.txt
@@ -15,7 +15,8 @@
 configure_file(libbcc.pc.in ${CMAKE_CURRENT_BINARY_DIR}/libbcc.pc @ONLY)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -DBCC_PROG_TAG_DIR='\"${BCC_PROG_TAG_DIR}\"'")
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -Wno-unused-result")
 
 string(REGEX MATCH "^([0-9]+).*" _ ${LLVM_PACKAGE_VERSION})
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_MAJOR_VERSION=${CMAKE_MATCH_1}")
@@ -33,12 +34,18 @@
   set(bcc_common_sources ${bcc_common_sources} bcc_debug.cc)
 endif()
 
+if(ENABLE_LLVM_NATIVECODEGEN)
+set(bcc_common_sources ${bcc_common_sources} bpf_module_rw_engine.cc)
+else()
+set(bcc_common_sources ${bcc_common_sources} bpf_module_rw_engine_disabled.cc)
+endif()
+
 set(bcc_table_sources table_storage.cc shared_table.cc bpffs_table.cc json_map_decl_visitor.cc)
 set(bcc_util_sources ns_guard.cc common.cc)
 set(bcc_sym_sources bcc_syms.cc bcc_elf.c bcc_perf_map.c bcc_proc.c)
 set(bcc_common_headers libbpf.h perf_reader.h)
 set(bcc_table_headers file_desc.h table_desc.h table_storage.h)
-set(bcc_api_headers bpf_common.h bpf_module.h bcc_exception.h bcc_syms.h)
+set(bcc_api_headers bpf_common.h bpf_module.h bcc_exception.h bcc_syms.h bcc_elf.h)
 
 if(ENABLE_CLANG_JIT)
 add_library(bcc-shared SHARED
diff --git a/src/cc/api/BPF.cc b/src/cc/api/BPF.cc
index 5f451f7..c6f843f 100644
--- a/src/cc/api/BPF.cc
+++ b/src/cc/api/BPF.cc
@@ -27,6 +27,7 @@
 #include <vector>
 
 #include "bcc_exception.h"
+#include "bcc_elf.h"
 #include "bcc_syms.h"
 #include "bpf_module.h"
 #include "common.h"
@@ -707,6 +708,10 @@
   return StatusTuple(0);
 }
 
+int BPF::free_bcc_memory() {
+  return bcc_free_memory();
+}
+
 USDT::USDT(const std::string& binary_path, const std::string& provider,
            const std::string& name, const std::string& probe_func)
     : initialized_(false),
diff --git a/src/cc/api/BPF.h b/src/cc/api/BPF.h
index 21fb42d..fcf0db4 100644
--- a/src/cc/api/BPF.h
+++ b/src/cc/api/BPF.h
@@ -47,7 +47,7 @@
   static const int BPF_MAX_STACK_DEPTH = 127;
 
   explicit BPF(unsigned int flag = 0, TableStorage* ts = nullptr,
-               bool rw_engine_enabled = true, const std::string &maps_ns = "")
+               bool rw_engine_enabled = bpf_module_rw_engine_enabled(), const std::string &maps_ns = "")
       : flag_(flag),
       bpf_module_(new BPFModule(flag, ts, rw_engine_enabled, maps_ns)) {}
   StatusTuple init(const std::string& bpf_program,
@@ -176,6 +176,8 @@
                         int& fd);
   StatusTuple unload_func(const std::string& func_name);
 
+  int free_bcc_memory();
+
  private:
   std::string get_kprobe_event(const std::string& kernel_func,
                                bpf_probe_attach_type type);
diff --git a/src/cc/bcc_elf.c b/src/cc/bcc_elf.c
index c425db6..c6745a2 100644
--- a/src/cc/bcc_elf.c
+++ b/src/cc/bcc_elf.c
@@ -184,11 +184,50 @@
   return res;
 }
 
+static Elf_Scn * get_section(Elf *e, const char *section_name,
+                             GElf_Shdr *section_hdr, size_t *section_idx) {
+  Elf_Scn *section = NULL;
+  GElf_Shdr header;
+  char *name;
+
+  size_t stridx;
+  if (elf_getshdrstrndx(e, &stridx) != 0)
+    return NULL;
+
+  size_t index;
+  for (index = 1; (section = elf_nextscn(e, section)) != 0; index++) {
+    if (!gelf_getshdr(section, &header))
+      continue;
+
+    name = elf_strptr(e, stridx, header.sh_name);
+    if (name && !strcmp(name, section_name)) {
+      if (section_hdr)
+        *section_hdr = header;
+      if (section_idx)
+        *section_idx = index;
+      return section;
+    }
+  }
+
+  return NULL;
+}
+
 static int list_in_scn(Elf *e, Elf_Scn *section, size_t stridx, size_t symsize,
                        struct bcc_symbol_option *option,
                        bcc_elf_symcb callback, void *payload) {
   Elf_Data *data = NULL;
 
+#if defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  size_t opdidx = 0;
+  Elf_Scn *opdsec = NULL;
+  GElf_Shdr opdshdr = {};
+  Elf_Data *opddata = NULL;
+
+  opdsec = get_section(e, ".opd", &opdshdr, &opdidx);
+  if (opdsec && opdshdr.sh_type == SHT_PROGBITS)
+    opddata = elf_getdata(opdsec, NULL);
+#endif
+
   while ((data = elf_getdata(section, data)) != 0) {
     size_t i, symcount = data->d_size / symsize;
 
@@ -214,6 +253,40 @@
       if (!(option->use_symbol_type & (1 << st_type)))
         continue;
 
+#ifdef __powerpc64__
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+      if (opddata && sym.st_shndx == opdidx) {
+        size_t offset = sym.st_value - opdshdr.sh_addr;
+        /* Find the function descriptor */
+        uint64_t *descr = opddata->d_buf + offset;
+        /* Read the actual entry point address from the descriptor */
+        sym.st_value = *descr;
+      }
+#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      if (option->use_symbol_type & (1 << STT_PPC64LE_SYM_LEP)) {
+        /*
+         * The PowerPC 64-bit ELF v2 ABI says that the 3 most significant bits
+         * in the st_other field of the symbol table specifies the number of
+         * instructions between a function's Global Entry Point (GEP) and Local
+         * Entry Point (LEP).
+         */
+        switch (sym.st_other >> 5) {
+          /* GEP and LEP are the same for 0 or 1, usage is reserved for 7 */
+          /* If 2, LEP is 1 instruction past the GEP */
+          case 2: sym.st_value += 4; break;
+          /* If 3, LEP is 2 instructions past the GEP */
+          case 3: sym.st_value += 8; break;
+          /* If 4, LEP is 4 instructions past the GEP */
+          case 4: sym.st_value += 16; break;
+          /* If 5, LEP is 8 instructions past the GEP */
+          case 5: sym.st_value += 32; break;
+          /* If 6, LEP is 16 instructions past the GEP */
+          case 6: sym.st_value += 64; break;
+        }
+      }
+#endif
+#endif
+
       if (callback(name, sym.st_value, sym.st_size, payload) < 0)
         return 1;      // signal termination to caller
     }
@@ -248,24 +321,9 @@
 }
 
 static Elf_Data * get_section_elf_data(Elf *e, const char *section_name) {
-  Elf_Scn *section = NULL;
-  GElf_Shdr header;
-  char *name;
-
-  size_t stridx;
-  if (elf_getshdrstrndx(e, &stridx) != 0)
-    return NULL;
-
-  while ((section = elf_nextscn(e, section)) != 0) {
-    if (!gelf_getshdr(section, &header))
-      continue;
-
-    name = elf_strptr(e, stridx, header.sh_name);
-    if (name && !strcmp(name, section_name)) {
-      return elf_getdata(section, NULL);
-    }
-  }
-
+  Elf_Scn *section = get_section(e, section_name, NULL, NULL);
+  if (section)
+    return elf_getdata(section, NULL);
   return NULL;
 }
 
@@ -398,6 +456,7 @@
 static char *find_debug_via_debuglink(Elf *e, const char *binpath,
                                       int check_crc) {
   char fullpath[PATH_MAX];
+  char *tmppath;
   char *bindir = NULL;
   char *res = NULL;
   unsigned int crc;
@@ -406,8 +465,8 @@
   if (!find_debuglink(e, &name, &crc))
     return NULL;
 
-  bindir = strdup(binpath);
-  bindir = dirname(bindir);
+  tmppath = strdup(binpath);
+  bindir = dirname(tmppath);
 
   // Search for the file in 'binpath', but ignore the file we find if it
   // matches the binary itself: the binary will always be probed later on,
@@ -434,9 +493,11 @@
   }
 
 DONE:
-  free(bindir);
-  if (res && check_crc && !verify_checksum(res, crc))
+  free(tmppath);
+  if (res && check_crc && !verify_checksum(res, crc)) {
+    free(res);
     return NULL;
+  }
   return res;
 }
 
@@ -665,6 +726,162 @@
   return listsymbols(elf, callback, payload, &default_option);
 }
 
+// Drop the resident pages of the section of `path` holding bcc_free_memory.
+// return value: 0   : success
+//               < 0 : error and no bcc lib found
+//               > 0 : error and bcc lib found
+static int bcc_free_memory_with_file(const char *path) {
+  unsigned long sym_addr = 0, sym_shndx = 0, sh_idx = 0;
+  Elf_Scn *section = NULL;
+  int fd = -1, err;
+  GElf_Shdr header;
+  Elf *e = NULL;
+
+  if ((err = openelf(path, &e, &fd)) < 0)
+    goto exit;
+
+  // get symbol address of "bcc_free_memory", which
+  // will be used to calculate runtime .text address
+  // range, esp. for shared libraries.
+  err = -1;
+  while (sym_addr == 0 && (section = elf_nextscn(e, section)) != 0) {
+    Elf_Data *data = NULL;
+    size_t symsize;
+
+    if (!gelf_getshdr(section, &header))
+      continue;
+    if (header.sh_type != SHT_SYMTAB && header.sh_type != SHT_DYNSYM)
+      continue;
+
+    // a zero entry size (malformed table) would divide by zero below
+    symsize = header.sh_entsize;
+    if (symsize == 0)
+      continue;
+    /* iterate all symbols, stopping at the first definition found */
+    while (sym_addr == 0 && (data = elf_getdata(section, data)) != 0) {
+      size_t i, symcount = data->d_size / symsize;
+
+      for (i = 0; i < symcount; ++i) {
+        GElf_Sym sym;
+        const char *name;
+
+        if (!gelf_getsym(data, (int)i, &sym))
+          continue;
+        // an undefined (imported) reference does not locate the code
+        if (GELF_ST_TYPE(sym.st_info) != STT_FUNC || sym.st_shndx == SHN_UNDEF)
+          continue;
+        if ((name = elf_strptr(e, header.sh_link, sym.st_name)) == NULL)
+          continue;
+        if (strcmp(name, "bcc_free_memory") == 0) {
+          sym_addr = sym.st_value;
+          sym_shndx = sym.st_shndx;
+          break;
+        }
+      }
+    }
+  }
+
+  // Didn't find bcc_free_memory in the ELF file.
+  if (sym_addr == 0)
+    goto exit;
+
+  section = NULL;
+  err = 1;
+  while ((section = elf_nextscn(e, section)) != 0) {
+    sh_idx++;
+    if (!gelf_getshdr(section, &header))
+      continue;
+
+    if (sh_idx == sym_shndx) {
+      unsigned long saddr, saddr_n, eaddr;
+      long page_size = sysconf(_SC_PAGESIZE);
+
+      saddr = (unsigned long)bcc_free_memory - sym_addr + header.sh_addr;
+      eaddr = saddr + header.sh_size;
+      // adjust saddr and eaddr, start addr needs to be page aligned
+      saddr_n = (saddr + page_size - 1) & ~(page_size - 1);
+      eaddr -= saddr_n - saddr;
+
+      // nothing left after alignment: the unsigned length would wrap
+      if (eaddr > saddr_n &&
+          madvise((void *)saddr_n, eaddr - saddr_n, MADV_DONTNEED)) {
+        fprintf(stderr, "madvise failed, saddr %lx, eaddr %lx\n", saddr, eaddr);
+        goto exit;
+      }
+      err = 0;
+      break;
+    }
+  }
+
+exit:
+  if (e)
+    elf_end(e);
+  if (fd >= 0)
+    close(fd);
+  return err;
+}
+
+// Free bcc memory
+//
+// The main purpose of this function is to free llvm/clang text memory
+// through madvise MADV_DONTNEED.
+//
+// bcc could be linked statically or dynamically into the application.
+// If it is static linking, there is no easy way to know which region
+// inside .text section belongs to llvm/clang, so the whole .text section
+// is freed. Otherwise, the process map is searched to find libbcc.so
+// library and the whole .text section for that shared library is
+// freed.
+//
+// Note that the text memory used by bcc (mainly llvm/clang) is reclaimable
+// in the kernel as it is file backed. But the reclaim process
+// may take some time if no memory pressure. So this API is mostly
+// used by applications that need to immediately lower their RssFile
+// metric right after loading BPF program. Returns 0 on success, < 0 on error.
+int bcc_free_memory() {
+  // First try whether bcc is statically linked or not
+  int err = bcc_free_memory_with_file("/proc/self/exe");
+  if (err >= 0)
+    return -err;
+
+  // Not statically linked, let us find the libbcc.so
+  FILE *maps = fopen("/proc/self/maps", "r");
+  if (!maps)
+    return -1;
+
+  char *line = NULL;
+  size_t size = 0;
+  while (getline(&line, &size, maps) > 0) {
+    if (!strstr(line, "libbcc.so"))
+      continue;
+
+    // Parse the line and get the full libbcc.so path (dev numbers are hex)
+    unsigned long addr_start, addr_end, offset, inode;
+    int path_start = 0, path_end = 0, path_len;
+    unsigned int devmajor, devminor;
+    char perms[8], libbcc_path[4096];
+    if (sscanf(line, "%lx-%lx %7s %lx %x:%x %lu %n%*[^\n]%n",
+               &addr_start, &addr_end, perms, &offset,
+               &devmajor, &devminor, &inode,
+               &path_start, &path_end) < 7)
+      break;
+
+    path_len = path_end - path_start;
+    if (path_len <= 0 || path_len >= (int)sizeof(libbcc_path))
+      continue;  // no path parsed, or it would overrun libbcc_path
+
+    // Free the text in the bcc dynamic library.
+    memcpy(libbcc_path, line + path_start, path_len);
+    libbcc_path[path_len] = '\0';
+    err = bcc_free_memory_with_file(libbcc_path);
+    err = (err <= 0) ? err : -err;
+  }
+
+  fclose(maps);
+  free(line);
+  return err;
+}
+
 #if 0
 #include <stdio.h>
 
diff --git a/src/cc/bcc_elf.h b/src/cc/bcc_elf.h
index bbe2494..0d10259 100644
--- a/src/cc/bcc_elf.h
+++ b/src/cc/bcc_elf.h
@@ -68,6 +68,7 @@
 int bcc_elf_is_shared_obj(const char *path);
 int bcc_elf_is_exe(const char *path);
 int bcc_elf_is_vdso(const char *name);
+int bcc_free_memory();
 
 #ifdef __cplusplus
 }
diff --git a/src/cc/bcc_proc.c b/src/cc/bcc_proc.c
index d694eb9..ccec0fc 100644
--- a/src/cc/bcc_proc.c
+++ b/src/cc/bcc_proc.c
@@ -99,7 +99,7 @@
   while (true) {
     buf[0] = '\0';
     // From fs/proc/task_mmu.c:show_map_vma
-    if (fscanf(procmap, "%lx-%lx %s %llx %s %lu%[^\n]", &begin, &end, perm,
+    if (fscanf(procmap, "%lx-%lx %4s %llx %7s %lu%[^\n]", &begin, &end, perm,
                &offset, dev, &inode, buf) != 7)
       break;
 
diff --git a/src/cc/bcc_syms.cc b/src/cc/bcc_syms.cc
index be9781a..116cf0d 100644
--- a/src/cc/bcc_syms.cc
+++ b/src/cc/bcc_syms.cc
@@ -122,13 +122,13 @@
 }
 
 void ProcSyms::load_exe() {
+  ProcMountNSGuard g(mount_ns_instance_.get());
   std::string exe = ebpf::get_pid_exe(pid_);
   Module module(exe.c_str(), mount_ns_instance_.get(), &symbol_option_);
 
   if (module.type_ != ModuleType::EXEC)
     return;
 
-  ProcMountNSGuard g(mount_ns_instance_.get());
 
   bcc_elf_foreach_load_section(exe.c_str(), &_add_load_sections, &module);
 
@@ -163,6 +163,7 @@
     // It only gives the mmap offset. We need the real offset for symbol
     // lookup.
     if (module.type_ == ModuleType::SO) {
+      ProcMountNSGuard g(ps->mount_ns_instance_.get());
       if (bcc_elf_get_text_scn_info(modname, &module.elf_so_addr_,
                                     &module.elf_so_offset_) < 0) {
         fprintf(stderr, "WARNING: Couldn't find .text section in %s\n", modname);
@@ -499,7 +500,11 @@
   static struct bcc_symbol_option default_option = {
     .use_debug_file = 1,
     .check_debug_file_crc = 1,
+#if defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    .use_symbol_type = BCC_SYM_ALL_TYPES | (1 << STT_PPC64LE_SYM_LEP),
+#else
     .use_symbol_type = BCC_SYM_ALL_TYPES,
+#endif
   };
 
   if (module == NULL)
diff --git a/src/cc/bcc_syms.h b/src/cc/bcc_syms.h
index d617c1d..42a1cf3 100644
--- a/src/cc/bcc_syms.h
+++ b/src/cc/bcc_syms.h
@@ -34,6 +34,13 @@
 #ifndef STT_GNU_IFUNC
 #define STT_GNU_IFUNC 10
 #endif
+
+#if defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+// Indicate if the Local Entry Point (LEP) should be used as a symbol's
+// start address
+#define STT_PPC64LE_SYM_LEP 31
+#endif
+
 static const uint32_t BCC_SYM_ALL_TYPES = 65535;
 struct bcc_symbol_option {
   int use_debug_file;
diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc
index a8174be..73235bf 100644
--- a/src/cc/bpf_module.cc
+++ b/src/cc/bpf_module.cc
@@ -13,32 +13,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <algorithm>
 #include <fcntl.h>
-#include <ftw.h>
 #include <map>
-#include <stdio.h>
 #include <string>
 #include <sys/stat.h>
-#include <sys/utsname.h>
 #include <unistd.h>
 #include <vector>
 #include <linux/bpf.h>
 
-#include <llvm/ADT/STLExtras.h>
 #include <llvm/ExecutionEngine/MCJIT.h>
 #include <llvm/ExecutionEngine/SectionMemoryManager.h>
-#include <llvm/IRReader/IRReader.h>
-#include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/IRPrintingPasses.h>
 #include <llvm/IR/LegacyPassManager.h>
 #include <llvm/IR/LLVMContext.h>
 #include <llvm/IR/Module.h>
 #include <llvm/IR/Verifier.h>
-#include <llvm/Object/ObjectFile.h>
-#include <llvm/Support/FormattedStream.h>
-#include <llvm/Support/Host.h>
-#include <llvm/Support/SourceMgr.h>
 #include <llvm/Support/TargetSelect.h>
 #include <llvm/Transforms/IPO.h>
 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
@@ -46,13 +35,11 @@
 
 #include "common.h"
 #include "bcc_debug.h"
-#include "bcc_exception.h"
 #include "frontends/b/loader.h"
 #include "frontends/clang/loader.h"
 #include "frontends/clang/b_frontend_action.h"
 #include "bpf_module.h"
 #include "exported_files.h"
-#include "kbuild_helper.h"
 #include "libbpf.h"
 
 namespace ebpf {
@@ -102,14 +89,13 @@
 BPFModule::BPFModule(unsigned flags, TableStorage *ts, bool rw_engine_enabled,
                      const std::string &maps_ns)
     : flags_(flags),
-      rw_engine_enabled_(rw_engine_enabled),
+      rw_engine_enabled_(rw_engine_enabled && bpf_module_rw_engine_enabled()),
       used_b_loader_(false),
       ctx_(new LLVMContext),
       id_(std::to_string((uintptr_t)this)),
       maps_ns_(maps_ns),
       ts_(ts) {
-  InitializeNativeTarget();
-  InitializeNativeTargetAsmPrinter();
+  initialize_rw_engine();
   LLVMInitializeBPFTarget();
   LLVMInitializeBPFTargetMC();
   LLVMInitializeBPFTargetInfo();
@@ -148,329 +134,13 @@
   }
 
   engine_.reset();
-  rw_engine_.reset();
+  cleanup_rw_engine();
   ctx_.reset();
   func_src_.reset();
 
   ts_->DeletePrefix(Path({id_}));
 }
 
-static void debug_printf(Module *mod, IRBuilder<> &B, const string &fmt, vector<Value *> args) {
-  GlobalVariable *fmt_gvar = B.CreateGlobalString(fmt, "fmt");
-  args.insert(args.begin(), B.CreateInBoundsGEP(fmt_gvar, vector<Value *>({B.getInt64(0), B.getInt64(0)})));
-  args.insert(args.begin(), B.getInt64((uintptr_t)stderr));
-  Function *fprintf_fn = mod->getFunction("fprintf");
-  if (!fprintf_fn) {
-    vector<Type *> fprintf_fn_args({B.getInt64Ty(), B.getInt8PtrTy()});
-    FunctionType *fprintf_fn_type = FunctionType::get(B.getInt32Ty(), fprintf_fn_args, /*isvarArg=*/true);
-    fprintf_fn = Function::Create(fprintf_fn_type, GlobalValue::ExternalLinkage, "fprintf", mod);
-    fprintf_fn->setCallingConv(CallingConv::C);
-    fprintf_fn->addFnAttr(Attribute::NoUnwind);
-  }
-  B.CreateCall(fprintf_fn, args);
-}
-
-static void finish_sscanf(IRBuilder<> &B, vector<Value *> *args, string *fmt,
-                          const map<string, Value *> &locals, bool exact_args) {
-  // fmt += "%n";
-  // int nread = 0;
-  // int n = sscanf(s, fmt, args..., &nread);
-  // if (n < 0) return -1;
-  // s = &s[nread];
-  Value *sptr = locals.at("sptr");
-  Value *nread = locals.at("nread");
-  Function *cur_fn = B.GetInsertBlock()->getParent();
-  Function *sscanf_fn = B.GetInsertBlock()->getModule()->getFunction("sscanf");
-  *fmt += "%n";
-  B.CreateStore(B.getInt32(0), nread);
-  GlobalVariable *fmt_gvar = B.CreateGlobalString(*fmt, "fmt");
-  (*args)[1] = B.CreateInBoundsGEP(fmt_gvar, {B.getInt64(0), B.getInt64(0)});
-  (*args)[0] = B.CreateLoad(sptr);
-  args->push_back(nread);
-  CallInst *call = B.CreateCall(sscanf_fn, *args);
-  call->setTailCall(true);
-
-  BasicBlock *label_true = BasicBlock::Create(B.getContext(), "", cur_fn);
-  BasicBlock *label_false = BasicBlock::Create(B.getContext(), "", cur_fn);
-
-  // exact_args means fail if don't consume exact number of "%" inputs
-  // exact_args is disabled for string parsing (empty case)
-  Value *cond = exact_args ? B.CreateICmpNE(call, B.getInt32(args->size() - 3))
-                           : B.CreateICmpSLT(call, B.getInt32(0));
-  B.CreateCondBr(cond, label_true, label_false);
-
-  B.SetInsertPoint(label_true);
-  B.CreateRet(B.getInt32(-1));
-
-  B.SetInsertPoint(label_false);
-  // s = &s[nread];
-  B.CreateStore(
-      B.CreateInBoundsGEP(B.CreateLoad(sptr), B.CreateLoad(nread, true)), sptr);
-
-  args->resize(2);
-  fmt->clear();
-}
-
-// recursive helper to capture the arguments
-static void parse_type(IRBuilder<> &B, vector<Value *> *args, string *fmt,
-                       Type *type, Value *out,
-                       const map<string, Value *> &locals, bool is_writer) {
-  if (StructType *st = dyn_cast<StructType>(type)) {
-    *fmt += "{ ";
-    unsigned idx = 0;
-    for (auto field : st->elements()) {
-      parse_type(B, args, fmt, field, B.CreateStructGEP(type, out, idx++),
-                 locals, is_writer);
-      *fmt += " ";
-    }
-    *fmt += "}";
-  } else if (ArrayType *at = dyn_cast<ArrayType>(type)) {
-    if (at->getElementType() == B.getInt8Ty()) {
-      // treat i8[] as a char string instead of as an array of u8's
-      if (is_writer) {
-        *fmt += "\"%s\"";
-        args->push_back(out);
-      } else {
-        // When reading strings, scanf doesn't support empty "", so we need to
-        // break this up into multiple scanf calls. To understand it, let's take
-        // an example:
-        // struct Event {
-        //   u32 a;
-        //   struct {
-        //     char x[64];
-        //     int y;
-        //   } b[2];
-        //   u32 c;
-        // };
-        // The writer string would look like:
-        //  "{ 0x%x [ { \"%s\" 0x%x } { \"%s\" 0x%x } ] 0x%x }"
-        // But the reader string needs to restart at each \"\".
-        //  reader0(const char *s, struct Event *val) {
-        //    int nread, rc;
-        //    nread = 0;
-        //    rc = sscanf(s, "{ %i [ { \"%n", &val->a, &nread);
-        //    if (rc != 1) return -1;
-        //    s += nread; nread = 0;
-        //    rc = sscanf(s, "%[^\"]%n", &val->b[0].x, &nread);
-        //    if (rc < 0) return -1;
-        //    s += nread; nread = 0;
-        //    rc = sscanf(s, "\" %i } { \"%n", &val->b[0].y, &nread);
-        //    if (rc != 1) return -1;
-        //    s += nread; nread = 0;
-        //    rc = sscanf(s, "%[^\"]%n", &val->b[1].x, &nread);
-        //    if (rc < 0) return -1;
-        //    s += nread; nread = 0;
-        //    rc = sscanf(s, "\" %i } ] %i }%n", &val->b[1].y, &val->c, &nread);
-        //    if (rc != 2) return -1;
-        //    s += nread; nread = 0;
-        //    return 0;
-        //  }
-        *fmt += "\"";
-        finish_sscanf(B, args, fmt, locals, true);
-
-        *fmt = "%[^\"]";
-        args->push_back(out);
-        finish_sscanf(B, args, fmt, locals, false);
-
-        *fmt = "\"";
-      }
-    } else {
-      *fmt += "[ ";
-      for (size_t i = 0; i < at->getNumElements(); ++i) {
-        parse_type(B, args, fmt, at->getElementType(),
-                   B.CreateStructGEP(type, out, i), locals, is_writer);
-        *fmt += " ";
-      }
-      *fmt += "]";
-    }
-  } else if (isa<PointerType>(type)) {
-    *fmt += "0xl";
-    if (is_writer)
-      *fmt += "x";
-    else
-      *fmt += "i";
-  } else if (IntegerType *it = dyn_cast<IntegerType>(type)) {
-    if (is_writer)
-      *fmt += "0x";
-    if (it->getBitWidth() <= 8)
-      *fmt += "%hh";
-    else if (it->getBitWidth() <= 16)
-      *fmt += "%h";
-    else if (it->getBitWidth() <= 32)
-      *fmt += "%";
-    else
-      *fmt += "%l";
-    if (is_writer)
-      *fmt += "x";
-    else
-      *fmt += "i";
-    args->push_back(is_writer ? B.CreateLoad(out) : out);
-  }
-}
-
-// make_reader generates a dynamic function in the instruction set of the host
-// (not bpf) that is able to convert c-strings in the pretty-print format of
-// make_writer back into binary representations. The encoding of the string
-// takes the llvm ir structure format, which closely maps the c structure but
-// not exactly (no support for unions for instance).
-// The general algorithm is:
-//  pod types (u8..u64)                <= %i
-//  array types
-//   u8[]  no nested quotes :(         <= "..."
-//   !u8[]                             <= [ %i %i ... ]
-//  struct types
-//   struct { u8 a; u64 b; }           <= { %i %i }
-//  nesting is supported
-//   struct { struct { u8 a[]; }; }    <= { "" }
-//   struct { struct { u64 a[]; }; }   <= { [ %i %i .. ] }
-string BPFModule::make_reader(Module *mod, Type *type) {
-  auto fn_it = readers_.find(type);
-  if (fn_it != readers_.end())
-    return fn_it->second;
-
-  // int read(const char *in, Type *out) {
-  //   int n = sscanf(in, "{ %i ... }", &out->field1, ...);
-  //   if (n != num_fields) return -1;
-  //   return 0;
-  // }
-
-  IRBuilder<> B(*ctx_);
-
-  FunctionType *sscanf_fn_type = FunctionType::get(
-      B.getInt32Ty(), {B.getInt8PtrTy(), B.getInt8PtrTy()}, /*isVarArg=*/true);
-  Function *sscanf_fn = mod->getFunction("sscanf");
-  if (!sscanf_fn) {
-    sscanf_fn = Function::Create(sscanf_fn_type, GlobalValue::ExternalLinkage,
-                                 "sscanf", mod);
-    sscanf_fn->setCallingConv(CallingConv::C);
-    sscanf_fn->addFnAttr(Attribute::NoUnwind);
-  }
-
-  string name = "reader" + std::to_string(readers_.size());
-  vector<Type *> fn_args({B.getInt8PtrTy(), PointerType::getUnqual(type)});
-  FunctionType *fn_type = FunctionType::get(B.getInt32Ty(), fn_args, /*isVarArg=*/false);
-  Function *fn =
-      Function::Create(fn_type, GlobalValue::ExternalLinkage, name, mod);
-  auto arg_it = fn->arg_begin();
-  Argument *arg_in = &*arg_it;
-  ++arg_it;
-  arg_in->setName("in");
-  Argument *arg_out = &*arg_it;
-  ++arg_it;
-  arg_out->setName("out");
-
-  BasicBlock *label_entry = BasicBlock::Create(*ctx_, "entry", fn);
-  B.SetInsertPoint(label_entry);
-
-  Value *nread = B.CreateAlloca(B.getInt32Ty());
-  Value *sptr = B.CreateAlloca(B.getInt8PtrTy());
-  map<string, Value *> locals{{"nread", nread}, {"sptr", sptr}};
-  B.CreateStore(arg_in, sptr);
-  vector<Value *> args({nullptr, nullptr});
-  string fmt;
-  parse_type(B, &args, &fmt, type, arg_out, locals, false);
-
-  if (0)
-    debug_printf(mod, B, "%p %p\n", vector<Value *>({arg_in, arg_out}));
-
-  finish_sscanf(B, &args, &fmt, locals, true);
-
-  B.CreateRet(B.getInt32(0));
-
-  readers_[type] = name;
-  return name;
-}
-
-// make_writer generates a dynamic function in the instruction set of the host
-// (not bpf) that is able to pretty-print key/leaf entries as a c-string. The
-// encoding of the string takes the llvm ir structure format, which closely maps
-// the c structure but not exactly (no support for unions for instance).
-// The general algorithm is:
-//  pod types (u8..u64)                => 0x%x
-//  array types
-//   u8[]                              => "..."
-//   !u8[]                             => [ 0x%x 0x%x ... ]
-//  struct types
-//   struct { u8 a; u64 b; }           => { 0x%x 0x%x }
-//  nesting is supported
-//   struct { struct { u8 a[]; }; }    => { "" }
-//   struct { struct { u64 a[]; }; }   => { [ 0x%x 0x%x .. ] }
-string BPFModule::make_writer(Module *mod, Type *type) {
-  auto fn_it = writers_.find(type);
-  if (fn_it != writers_.end())
-    return fn_it->second;
-
-  // int write(int len, char *out, Type *in) {
-  //   return snprintf(out, len, "{ %i ... }", out->field1, ...);
-  // }
-
-  IRBuilder<> B(*ctx_);
-
-  string name = "writer" + std::to_string(writers_.size());
-  vector<Type *> fn_args({B.getInt8PtrTy(), B.getInt64Ty(), PointerType::getUnqual(type)});
-  FunctionType *fn_type = FunctionType::get(B.getInt32Ty(), fn_args, /*isVarArg=*/false);
-  Function *fn =
-      Function::Create(fn_type, GlobalValue::ExternalLinkage, name, mod);
-  auto arg_it = fn->arg_begin();
-  Argument *arg_out = &*arg_it;
-  ++arg_it;
-  arg_out->setName("out");
-  Argument *arg_len = &*arg_it;
-  ++arg_it;
-  arg_len->setName("len");
-  Argument *arg_in = &*arg_it;
-  ++arg_it;
-  arg_in->setName("in");
-
-  BasicBlock *label_entry = BasicBlock::Create(*ctx_, "entry", fn);
-  B.SetInsertPoint(label_entry);
-
-  map<string, Value *> locals{
-      {"nread", B.CreateAlloca(B.getInt64Ty())},
-  };
-  vector<Value *> args({arg_out, B.CreateZExt(arg_len, B.getInt64Ty()), nullptr});
-  string fmt;
-  parse_type(B, &args, &fmt, type, arg_in, locals, true);
-
-  GlobalVariable *fmt_gvar = B.CreateGlobalString(fmt, "fmt");
-
-  args[2] = B.CreateInBoundsGEP(fmt_gvar, vector<Value *>({B.getInt64(0), B.getInt64(0)}));
-
-  if (0)
-    debug_printf(mod, B, "%d %p %p\n", vector<Value *>({arg_len, arg_out, arg_in}));
-
-  vector<Type *> snprintf_fn_args({B.getInt8PtrTy(), B.getInt64Ty(), B.getInt8PtrTy()});
-  FunctionType *snprintf_fn_type = FunctionType::get(B.getInt32Ty(), snprintf_fn_args, /*isVarArg=*/true);
-  Function *snprintf_fn = mod->getFunction("snprintf");
-  if (!snprintf_fn)
-    snprintf_fn = Function::Create(snprintf_fn_type, GlobalValue::ExternalLinkage, "snprintf", mod);
-  snprintf_fn->setCallingConv(CallingConv::C);
-  snprintf_fn->addFnAttr(Attribute::NoUnwind);
-
-  CallInst *call = B.CreateCall(snprintf_fn, args);
-  call->setTailCall(true);
-
-  B.CreateRet(call);
-
-  writers_[type] = name;
-  return name;
-}
-
-unique_ptr<ExecutionEngine> BPFModule::finalize_rw(unique_ptr<Module> m) {
-  Module *mod = &*m;
-
-  run_pass_manager(*mod);
-
-  string err;
-  EngineBuilder builder(move(m));
-  builder.setErrorStr(&err);
-  builder.setUseOrcMCJITReplacement(false);
-  auto engine = unique_ptr<ExecutionEngine>(builder.create());
-  if (!engine)
-    fprintf(stderr, "Could not create ExecutionEngine: %s\n", err.c_str());
-  return engine;
-}
-
 // load an entire c file as a module
 int BPFModule::load_cfile(const string &file, bool in_memory, const char *cflags[], int ncflags) {
   ClangLoader clang_loader(&*ctx_, flags_);
@@ -507,79 +177,6 @@
   }
 }
 
-int BPFModule::annotate() {
-  for (auto fn = mod_->getFunctionList().begin(); fn != mod_->getFunctionList().end(); ++fn)
-    if (!fn->hasFnAttribute(Attribute::NoInline))
-      fn->addFnAttr(Attribute::AlwaysInline);
-
-  // separate module to hold the reader functions
-  auto m = ebpf::make_unique<Module>("sscanf", *ctx_);
-
-  size_t id = 0;
-  Path path({id_});
-  for (auto it = ts_->lower_bound(path), up = ts_->upper_bound(path); it != up; ++it) {
-    TableDesc &table = it->second;
-    tables_.push_back(&it->second);
-    table_names_[table.name] = id++;
-    GlobalValue *gvar = mod_->getNamedValue(table.name);
-    if (!gvar) continue;
-    if (PointerType *pt = dyn_cast<PointerType>(gvar->getType())) {
-      if (StructType *st = dyn_cast<StructType>(pt->getElementType())) {
-        if (st->getNumElements() < 2) continue;
-        Type *key_type = st->elements()[0];
-        Type *leaf_type = st->elements()[1];
-
-        using std::placeholders::_1;
-        using std::placeholders::_2;
-        using std::placeholders::_3;
-        table.key_sscanf = std::bind(&BPFModule::sscanf, this,
-                                     make_reader(&*m, key_type), _1, _2);
-        table.leaf_sscanf = std::bind(&BPFModule::sscanf, this,
-                                      make_reader(&*m, leaf_type), _1, _2);
-        table.key_snprintf = std::bind(&BPFModule::snprintf, this,
-                                       make_writer(&*m, key_type), _1, _2, _3);
-        table.leaf_snprintf =
-            std::bind(&BPFModule::snprintf, this, make_writer(&*m, leaf_type),
-                      _1, _2, _3);
-      }
-    }
-  }
-
-  rw_engine_ = finalize_rw(move(m));
-  if (!rw_engine_)
-    return -1;
-  return 0;
-}
-
-StatusTuple BPFModule::sscanf(string fn_name, const char *str, void *val) {
-  if (!rw_engine_enabled_)
-    return StatusTuple(-1, "rw_engine not enabled");
-  auto fn =
-      (int (*)(const char *, void *))rw_engine_->getFunctionAddress(fn_name);
-  if (!fn)
-    return StatusTuple(-1, "sscanf not available");
-  int rc = fn(str, val);
-  if (rc < 0)
-    return StatusTuple(rc, "error in sscanf: %s", std::strerror(errno));
-  return StatusTuple(rc);
-}
-
-StatusTuple BPFModule::snprintf(string fn_name, char *str, size_t sz,
-                                const void *val) {
-  if (!rw_engine_enabled_)
-    return StatusTuple(-1, "rw_engine not enabled");
-  auto fn = (int (*)(char *, size_t,
-                     const void *))rw_engine_->getFunctionAddress(fn_name);
-  if (!fn)
-    return StatusTuple(-1, "snprintf not available");
-  int rc = fn(str, sz, val);
-  if (rc < 0)
-    return StatusTuple(rc, "error in snprintf: %s", std::strerror(errno));
-  if ((size_t)rc == sz)
-    return StatusTuple(-1, "buffer of size %zd too small", sz);
-  return StatusTuple(0);
-}
-
 void BPFModule::dump_ir(Module &mod) {
   legacy::PassManager PM;
   PM.add(createPrintModulePass(errs()));
@@ -618,6 +215,11 @@
       *sections_p;
 
   mod->setTargetTriple("bpf-pc-linux");
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  mod->setDataLayout("e-m:e-p:64:64-i64:64-n32:64-S128");
+#else
+  mod->setDataLayout("E-m:e-p:64:64-i64:64-n32:64-S128");
+#endif
   sections_p = rw_engine_enabled_ ? &sections_ : &tmp_sections;
 
   string err;
diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h
index ff237a5..d4bc54e 100644
--- a/src/cc/bpf_module.h
+++ b/src/cc/bpf_module.h
@@ -54,10 +54,14 @@
 class ClangLoader;
 class FuncSource;
 
+bool bpf_module_rw_engine_enabled(void);
+
 class BPFModule {
  private:
   static const std::string FN_PREFIX;
   int init_engine();
+  void initialize_rw_engine();
+  void cleanup_rw_engine();
   int parse(llvm::Module *mod);
   int finalize();
   int annotate();
diff --git a/src/cc/bpf_module_rw_engine.cc b/src/cc/bpf_module_rw_engine.cc
new file mode 100644
index 0000000..418355d
--- /dev/null
+++ b/src/cc/bpf_module_rw_engine.cc
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <map>
+#include <string>
+#include <vector>
+
+#include <llvm/ExecutionEngine/MCJIT.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/Support/TargetSelect.h>
+
+#include "common.h"
+#include "bpf_module.h"
+#include "table_storage.h"
+
+namespace ebpf {
+
+using std::map;
+using std::move;
+using std::string;
+using std::unique_ptr;
+using std::vector;
+using namespace llvm;
+
+bool bpf_module_rw_engine_enabled(void) {
+  return true;
+}
+
+void BPFModule::initialize_rw_engine() {
+  InitializeNativeTarget();
+  InitializeNativeTargetAsmPrinter();
+}
+
+void BPFModule::cleanup_rw_engine() {
+  rw_engine_.reset();
+}
+
+static void debug_printf(Module *mod, IRBuilder<> &B, const string &fmt, vector<Value *> args) {
+  GlobalVariable *fmt_gvar = B.CreateGlobalString(fmt, "fmt");
+  args.insert(args.begin(), B.CreateInBoundsGEP(fmt_gvar, vector<Value *>({B.getInt64(0), B.getInt64(0)})));
+  args.insert(args.begin(), B.getInt64((uintptr_t)stderr));
+  Function *fprintf_fn = mod->getFunction("fprintf");
+  if (!fprintf_fn) {
+    vector<Type *> fprintf_fn_args({B.getInt64Ty(), B.getInt8PtrTy()});
+    FunctionType *fprintf_fn_type = FunctionType::get(B.getInt32Ty(), fprintf_fn_args, /*isvarArg=*/true);
+    fprintf_fn = Function::Create(fprintf_fn_type, GlobalValue::ExternalLinkage, "fprintf", mod);
+    fprintf_fn->setCallingConv(CallingConv::C);
+    fprintf_fn->addFnAttr(Attribute::NoUnwind);
+  }
+  B.CreateCall(fprintf_fn, args);
+}
+
+static void finish_sscanf(IRBuilder<> &B, vector<Value *> *args, string *fmt,
+                          const map<string, Value *> &locals, bool exact_args) {
+  // fmt += "%n";
+  // int nread = 0;
+  // int n = sscanf(s, fmt, args..., &nread);
+  // if (n < 0) return -1;
+  // s = &s[nread];
+  Value *sptr = locals.at("sptr");
+  Value *nread = locals.at("nread");
+  Function *cur_fn = B.GetInsertBlock()->getParent();
+  Function *sscanf_fn = B.GetInsertBlock()->getModule()->getFunction("sscanf");
+  *fmt += "%n";
+  B.CreateStore(B.getInt32(0), nread);
+  GlobalVariable *fmt_gvar = B.CreateGlobalString(*fmt, "fmt");
+  (*args)[1] = B.CreateInBoundsGEP(fmt_gvar, {B.getInt64(0), B.getInt64(0)});
+  (*args)[0] = B.CreateLoad(sptr);
+  args->push_back(nread);
+  CallInst *call = B.CreateCall(sscanf_fn, *args);
+  call->setTailCall(true);
+
+  BasicBlock *label_true = BasicBlock::Create(B.getContext(), "", cur_fn);
+  BasicBlock *label_false = BasicBlock::Create(B.getContext(), "", cur_fn);
+
+  // exact_args means fail unless sscanf consumed the exact number of "%" inputs
+  // exact_args is disabled for string parsing (the string may be empty)
+  Value *cond = exact_args ? B.CreateICmpNE(call, B.getInt32(args->size() - 3))
+                           : B.CreateICmpSLT(call, B.getInt32(0));
+  B.CreateCondBr(cond, label_true, label_false);
+
+  B.SetInsertPoint(label_true);
+  B.CreateRet(B.getInt32(-1));
+
+  B.SetInsertPoint(label_false);
+  // s = &s[nread];
+  B.CreateStore(
+      B.CreateInBoundsGEP(B.CreateLoad(sptr), B.CreateLoad(nread, true)), sptr);
+
+  args->resize(2);
+  fmt->clear();
+}
+
+// recursive helper to capture the arguments
+static void parse_type(IRBuilder<> &B, vector<Value *> *args, string *fmt,
+                       Type *type, Value *out,
+                       const map<string, Value *> &locals, bool is_writer) {
+  if (StructType *st = dyn_cast<StructType>(type)) {
+    *fmt += "{ ";
+    unsigned idx = 0;
+    for (auto field : st->elements()) {
+      parse_type(B, args, fmt, field, B.CreateStructGEP(type, out, idx++),
+                 locals, is_writer);
+      *fmt += " ";
+    }
+    *fmt += "}";
+  } else if (ArrayType *at = dyn_cast<ArrayType>(type)) {
+    if (at->getElementType() == B.getInt8Ty()) {
+      // treat i8[] as a char string instead of as an array of u8's
+      if (is_writer) {
+        *fmt += "\"%s\"";
+        args->push_back(out);
+      } else {
+        // When reading strings, scanf doesn't support empty "", so we need to
+        // break this up into multiple scanf calls. To understand it, let's take
+        // an example:
+        // struct Event {
+        //   u32 a;
+        //   struct {
+        //     char x[64];
+        //     int y;
+        //   } b[2];
+        //   u32 c;
+        // };
+        // The writer string would look like:
+        //  "{ 0x%x [ { \"%s\" 0x%x } { \"%s\" 0x%x } ] 0x%x }"
+        // But the reader string needs to restart at each \"\".
+        //  reader0(const char *s, struct Event *val) {
+        //    int nread, rc;
+        //    nread = 0;
+        //    rc = sscanf(s, "{ %i [ { \"%n", &val->a, &nread);
+        //    if (rc != 1) return -1;
+        //    s += nread; nread = 0;
+        //    rc = sscanf(s, "%[^\"]%n", &val->b[0].x, &nread);
+        //    if (rc < 0) return -1;
+        //    s += nread; nread = 0;
+        //    rc = sscanf(s, "\" %i } { \"%n", &val->b[0].y, &nread);
+        //    if (rc != 1) return -1;
+        //    s += nread; nread = 0;
+        //    rc = sscanf(s, "%[^\"]%n", &val->b[1].x, &nread);
+        //    if (rc < 0) return -1;
+        //    s += nread; nread = 0;
+        //    rc = sscanf(s, "\" %i } ] %i }%n", &val->b[1].y, &val->c, &nread);
+        //    if (rc != 2) return -1;
+        //    s += nread; nread = 0;
+        //    return 0;
+        //  }
+        *fmt += "\"";
+        finish_sscanf(B, args, fmt, locals, true);
+
+        *fmt = "%[^\"]";
+        args->push_back(out);
+        finish_sscanf(B, args, fmt, locals, false);
+
+        *fmt = "\"";
+      }
+    } else {
+      *fmt += "[ ";
+      for (size_t i = 0; i < at->getNumElements(); ++i) {
+        parse_type(B, args, fmt, at->getElementType(),
+                   B.CreateStructGEP(type, out, i), locals, is_writer);
+        *fmt += " ";
+      }
+      *fmt += "]";
+    }
+  } else if (isa<PointerType>(type)) {
+    *fmt += "0xl";
+    if (is_writer)
+      *fmt += "x";
+    else
+      *fmt += "i";
+  } else if (IntegerType *it = dyn_cast<IntegerType>(type)) {
+    if (is_writer)
+      *fmt += "0x";
+    if (it->getBitWidth() <= 8)
+      *fmt += "%hh";
+    else if (it->getBitWidth() <= 16)
+      *fmt += "%h";
+    else if (it->getBitWidth() <= 32)
+      *fmt += "%";
+    else
+      *fmt += "%l";
+    if (is_writer)
+      *fmt += "x";
+    else
+      *fmt += "i";
+    args->push_back(is_writer ? B.CreateLoad(out) : out);
+  }
+}
+
+// make_reader generates a dynamic function in the instruction set of the host
+// (not bpf) that is able to convert c-strings in the pretty-print format of
+// make_writer back into binary representations. The encoding of the string
+// takes the llvm ir structure format, which closely maps the c structure but
+// not exactly (no support for unions for instance).
+// The general algorithm is:
+//  pod types (u8..u64)                <= %i
+//  array types
+//   u8[]  no nested quotes :(         <= "..."
+//   !u8[]                             <= [ %i %i ... ]
+//  struct types
+//   struct { u8 a; u64 b; }           <= { %i %i }
+//  nesting is supported
+//   struct { struct { u8 a[]; }; }    <= { "" }
+//   struct { struct { u64 a[]; }; }   <= { [ %i %i .. ] }
+string BPFModule::make_reader(Module *mod, Type *type) {
+  auto fn_it = readers_.find(type);
+  if (fn_it != readers_.end())
+    return fn_it->second;
+
+  // int read(const char *in, Type *out) {
+  //   int n = sscanf(in, "{ %i ... }", &out->field1, ...);
+  //   if (n != num_fields) return -1;
+  //   return 0;
+  // }
+
+  IRBuilder<> B(*ctx_);
+
+  FunctionType *sscanf_fn_type = FunctionType::get(
+      B.getInt32Ty(), {B.getInt8PtrTy(), B.getInt8PtrTy()}, /*isVarArg=*/true);
+  Function *sscanf_fn = mod->getFunction("sscanf");
+  if (!sscanf_fn) {
+    sscanf_fn = Function::Create(sscanf_fn_type, GlobalValue::ExternalLinkage,
+                                 "sscanf", mod);
+    sscanf_fn->setCallingConv(CallingConv::C);
+    sscanf_fn->addFnAttr(Attribute::NoUnwind);
+  }
+
+  string name = "reader" + std::to_string(readers_.size());
+  vector<Type *> fn_args({B.getInt8PtrTy(), PointerType::getUnqual(type)});
+  FunctionType *fn_type = FunctionType::get(B.getInt32Ty(), fn_args, /*isVarArg=*/false);
+  Function *fn =
+      Function::Create(fn_type, GlobalValue::ExternalLinkage, name, mod);
+  auto arg_it = fn->arg_begin();
+  Argument *arg_in = &*arg_it;
+  ++arg_it;
+  arg_in->setName("in");
+  Argument *arg_out = &*arg_it;
+  ++arg_it;
+  arg_out->setName("out");
+
+  BasicBlock *label_entry = BasicBlock::Create(*ctx_, "entry", fn);
+  B.SetInsertPoint(label_entry);
+
+  Value *nread = B.CreateAlloca(B.getInt32Ty());
+  Value *sptr = B.CreateAlloca(B.getInt8PtrTy());
+  map<string, Value *> locals{{"nread", nread}, {"sptr", sptr}};
+  B.CreateStore(arg_in, sptr);
+  vector<Value *> args({nullptr, nullptr});
+  string fmt;
+  parse_type(B, &args, &fmt, type, arg_out, locals, false);
+
+  if (0)
+    debug_printf(mod, B, "%p %p\n", vector<Value *>({arg_in, arg_out}));
+
+  finish_sscanf(B, &args, &fmt, locals, true);
+
+  B.CreateRet(B.getInt32(0));
+
+  readers_[type] = name;
+  return name;
+}
+
+// make_writer generates a dynamic function in the instruction set of the host
+// (not bpf) that is able to pretty-print key/leaf entries as a c-string. The
+// encoding of the string takes the llvm ir structure format, which closely maps
+// the c structure but not exactly (no support for unions for instance).
+// The general algorithm is:
+//  pod types (u8..u64)                => 0x%x
+//  array types
+//   u8[]                              => "..."
+//   !u8[]                             => [ 0x%x 0x%x ... ]
+//  struct types
+//   struct { u8 a; u64 b; }           => { 0x%x 0x%x }
+//  nesting is supported
+//   struct { struct { u8 a[]; }; }    => { "" }
+//   struct { struct { u64 a[]; }; }   => { [ 0x%x 0x%x .. ] }
+string BPFModule::make_writer(Module *mod, Type *type) {
+  auto fn_it = writers_.find(type);
+  if (fn_it != writers_.end())
+    return fn_it->second;
+
+  // int write(char *out, size_t len, Type *in) {
+  //   return snprintf(out, len, "{ 0x%x ... }", in->field1, ...);
+  // }
+
+  IRBuilder<> B(*ctx_);
+
+  string name = "writer" + std::to_string(writers_.size());
+  vector<Type *> fn_args({B.getInt8PtrTy(), B.getInt64Ty(), PointerType::getUnqual(type)});
+  FunctionType *fn_type = FunctionType::get(B.getInt32Ty(), fn_args, /*isVarArg=*/false);
+  Function *fn =
+      Function::Create(fn_type, GlobalValue::ExternalLinkage, name, mod);
+  auto arg_it = fn->arg_begin();
+  Argument *arg_out = &*arg_it;
+  ++arg_it;
+  arg_out->setName("out");
+  Argument *arg_len = &*arg_it;
+  ++arg_it;
+  arg_len->setName("len");
+  Argument *arg_in = &*arg_it;
+  ++arg_it;
+  arg_in->setName("in");
+
+  BasicBlock *label_entry = BasicBlock::Create(*ctx_, "entry", fn);
+  B.SetInsertPoint(label_entry);
+
+  map<string, Value *> locals{
+      {"nread", B.CreateAlloca(B.getInt64Ty())},
+  };
+  vector<Value *> args({arg_out, B.CreateZExt(arg_len, B.getInt64Ty()), nullptr});
+  string fmt;
+  parse_type(B, &args, &fmt, type, arg_in, locals, true);
+
+  GlobalVariable *fmt_gvar = B.CreateGlobalString(fmt, "fmt");
+
+  args[2] = B.CreateInBoundsGEP(fmt_gvar, vector<Value *>({B.getInt64(0), B.getInt64(0)}));
+
+  if (0)
+    debug_printf(mod, B, "%d %p %p\n", vector<Value *>({arg_len, arg_out, arg_in}));
+
+  vector<Type *> snprintf_fn_args({B.getInt8PtrTy(), B.getInt64Ty(), B.getInt8PtrTy()});
+  FunctionType *snprintf_fn_type = FunctionType::get(B.getInt32Ty(), snprintf_fn_args, /*isVarArg=*/true);
+  Function *snprintf_fn = mod->getFunction("snprintf");
+  if (!snprintf_fn)
+    snprintf_fn = Function::Create(snprintf_fn_type, GlobalValue::ExternalLinkage, "snprintf", mod);
+  snprintf_fn->setCallingConv(CallingConv::C);
+  snprintf_fn->addFnAttr(Attribute::NoUnwind);
+
+  CallInst *call = B.CreateCall(snprintf_fn, args);
+  call->setTailCall(true);
+
+  B.CreateRet(call);
+
+  writers_[type] = name;
+  return name;
+}
+
+unique_ptr<ExecutionEngine> BPFModule::finalize_rw(unique_ptr<Module> m) {
+  Module *mod = &*m;
+
+  run_pass_manager(*mod);
+
+  string err;
+  EngineBuilder builder(move(m));
+  builder.setErrorStr(&err);
+  builder.setUseOrcMCJITReplacement(false);
+  auto engine = unique_ptr<ExecutionEngine>(builder.create());
+  if (!engine)
+    fprintf(stderr, "Could not create ExecutionEngine: %s\n", err.c_str());
+  return engine;
+}
+
+int BPFModule::annotate() {
+  for (auto fn = mod_->getFunctionList().begin(); fn != mod_->getFunctionList().end(); ++fn)
+    if (!fn->hasFnAttribute(Attribute::NoInline))
+      fn->addFnAttr(Attribute::AlwaysInline);
+
+  // separate module to hold the reader functions
+  auto m = ebpf::make_unique<Module>("sscanf", *ctx_);
+
+  size_t id = 0;
+  Path path({id_});
+  for (auto it = ts_->lower_bound(path), up = ts_->upper_bound(path); it != up; ++it) {
+    TableDesc &table = it->second;
+    tables_.push_back(&it->second);
+    table_names_[table.name] = id++;
+    GlobalValue *gvar = mod_->getNamedValue(table.name);
+    if (!gvar) continue;
+    if (PointerType *pt = dyn_cast<PointerType>(gvar->getType())) {
+      if (StructType *st = dyn_cast<StructType>(pt->getElementType())) {
+        if (st->getNumElements() < 2) continue;
+        Type *key_type = st->elements()[0];
+        Type *leaf_type = st->elements()[1];
+
+        using std::placeholders::_1;
+        using std::placeholders::_2;
+        using std::placeholders::_3;
+        table.key_sscanf = std::bind(&BPFModule::sscanf, this,
+                                     make_reader(&*m, key_type), _1, _2);
+        table.leaf_sscanf = std::bind(&BPFModule::sscanf, this,
+                                      make_reader(&*m, leaf_type), _1, _2);
+        table.key_snprintf = std::bind(&BPFModule::snprintf, this,
+                                       make_writer(&*m, key_type), _1, _2, _3);
+        table.leaf_snprintf =
+            std::bind(&BPFModule::snprintf, this, make_writer(&*m, leaf_type),
+                      _1, _2, _3);
+      }
+    }
+  }
+
+  rw_engine_ = finalize_rw(move(m));
+  if (!rw_engine_)
+    return -1;
+  return 0;
+}
+
+StatusTuple BPFModule::sscanf(string fn_name, const char *str, void *val) {
+  if (!rw_engine_enabled_)
+    return StatusTuple(-1, "rw_engine not enabled");
+  auto fn =
+      (int (*)(const char *, void *))rw_engine_->getFunctionAddress(fn_name);
+  if (!fn)
+    return StatusTuple(-1, "sscanf not available");
+  int rc = fn(str, val);
+  if (rc < 0)
+    return StatusTuple(rc, "error in sscanf: %s", std::strerror(errno));
+  return StatusTuple(rc);
+}
+
+StatusTuple BPFModule::snprintf(string fn_name, char *str, size_t sz,
+                                const void *val) {
+  if (!rw_engine_enabled_)
+    return StatusTuple(-1, "rw_engine not enabled");
+  auto fn = (int (*)(char *, size_t,
+                     const void *))rw_engine_->getFunctionAddress(fn_name);
+  if (!fn)  // no generated writer registered under fn_name
+    return StatusTuple(-1, "snprintf not available");
+  int rc = fn(str, sz, val);  // writer returns the result of libc snprintf
+  if (rc < 0)
+    return StatusTuple(rc, "error in snprintf: %s", std::strerror(errno));
+  if ((size_t)rc >= sz)  // would-be length >= sz means output was truncated
+    return StatusTuple(-1, "buffer of size %zd too small", sz);
+  return StatusTuple(0);
+}
+
+} // namespace ebpf
diff --git a/src/cc/bpf_module_rw_engine_disabled.cc b/src/cc/bpf_module_rw_engine_disabled.cc
new file mode 100644
index 0000000..567d2fa
--- /dev/null
+++ b/src/cc/bpf_module_rw_engine_disabled.cc
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2015 PLUMgrid, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bpf_module.h"
+
+namespace ebpf {
+
+bool bpf_module_rw_engine_enabled(void) {
+  return false;
+}
+
+void BPFModule::initialize_rw_engine() {
+}
+
+void BPFModule::cleanup_rw_engine() {
+}
+
+int BPFModule::annotate() {
+  return -1;
+}
+
+} // namespace ebpf
diff --git a/src/cc/common.cc b/src/cc/common.cc
index c8370a3..ab7528c 100644
--- a/src/cc/common.cc
+++ b/src/cc/common.cc
@@ -57,7 +57,7 @@
   res = readlink(exe_link.c_str(), exe_path, sizeof(exe_path));
   if (res == -1)
     return "";
-  if (res >= sizeof(exe_path))
+  if (res >= static_cast<int>(sizeof(exe_path)))
     res = sizeof(exe_path) - 1;
   exe_path[res] = '\0';
   return std::string(exe_path);
diff --git a/src/cc/compat/linux/bpf.h b/src/cc/compat/linux/bpf.h
index f780fd4..bc3bcdd 100644
--- a/src/cc/compat/linux/bpf.h
+++ b/src/cc/compat/linux/bpf.h
@@ -133,6 +133,14 @@
 	BPF_MAP_TYPE_STACK,
 };
 
+/* Note that tracing related programs such as
+ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
+ * are not subject to a stable API since kernel internal data
+ * structures can change from release to release and may
+ * therefore break existing tracing BPF programs. Tracing BPF
+ * programs correspond to /a/ specific kernel which is to be
+ * analyzed, and not /a/ specific kernel /and/ all future ones.
+ */
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
@@ -232,6 +240,20 @@
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
+ * verifier will allow any alignment whatsoever.  On platforms
+ * with strict alignment requirements for loads and stores (such
+ * as sparc and mips) the verifier validates that all loads and
+ * stores provably follow this requirement.  This flag turns that
+ * checking and enforcement off.
+ *
+ * It is mostly used for testing when we want to validate the
+ * context and memory access aspects of the verifier, but because
+ * of an unaligned access the alignment check would trigger before
+ * the one we are interested in.
+ */
+#define BPF_F_ANY_ALIGNMENT	(1U << 1)
+
 /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
@@ -257,9 +279,6 @@
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
-/* flags for BPF_PROG_QUERY */
-#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
-
 #define BPF_OBJ_NAME_LEN 16U
 
 /* Flags for accessing BPF object */
@@ -269,6 +288,12 @@
 /* Flag for stack_map, store build_id+offset instead of pointer */
 #define BPF_F_STACK_BUILD_ID	(1U << 5)
 
+/* Zero-initialize hash function seed. This should only be used for testing. */
+#define BPF_F_ZERO_SEED		(1U << 6)
+
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -326,7 +351,7 @@
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
-		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		kern_version;	/* not used */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
 		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
@@ -335,6 +360,13 @@
 		 * (context accesses, allowed helpers, etc).
 		 */
 		__u32		expected_attach_type;
+		__u32		prog_btf_fd;	/* fd pointing to BTF type data */
+		__u32		func_info_rec_size;	/* userspace bpf_func_info size */
+		__aligned_u64	func_info;	/* func info */
+		__u32		func_info_cnt;	/* number of bpf_func_info records */
+		__u32		line_info_rec_size;	/* userspace bpf_line_info size */
+		__aligned_u64	line_info;	/* line info */
+		__u32		line_info_cnt;	/* number of bpf_line_info records */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -353,8 +385,11 @@
 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
 		__u32		prog_fd;
 		__u32		retval;
-		__u32		data_size_in;
-		__u32		data_size_out;
+		__u32		data_size_in;	/* input: len of data_in */
+		__u32		data_size_out;	/* input/output: len of data_out
+						 *   returns ENOSPC if data_out
+						 *   is too small.
+						 */
 		__aligned_u64	data_in;
 		__aligned_u64	data_out;
 		__u32		repeat;
@@ -475,18 +510,6 @@
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_map_pop_elem(struct bpf_map *map, void *value)
- * 	Description
- * 		Pop an element from *map*.
- * Return
- * 		0 on success, or a negative error in case of failure.
- *
- * int bpf_map_peek_elem(struct bpf_map *map, void *value)
- * 	Description
- * 		Get an element from *map* without removing it.
- * Return
- * 		0 on success, or a negative error in case of failure.
- *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -1910,9 +1933,9 @@
  *		is set to metric from route (IPv4/IPv6 only), and ifindex
  *		is set to the device index of the nexthop from the FIB lookup.
  *
- *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be a combination of one or more of the
- *             following values:
+ *		*plen* argument is the size of the passed in struct.
+ *		*flags* argument can be a combination of one or more of the
+ *		following values:
  *
  *		**BPF_FIB_LOOKUP_DIRECT**
  *			Do a direct table lookup vs full lookup using FIB
@@ -1921,9 +1944,9 @@
  *			Perform lookup from an egress perspective (default is
  *			ingress).
  *
- *             *ctx* is either **struct xdp_md** for XDP programs or
- *             **struct sk_buff** tc cls_act programs.
- *     Return
+ *		*ctx* is either **struct xdp_md** for XDP programs or
+ *		**struct sk_buff** tc cls_act programs.
+ *	Return
  *		* < 0 if any input argument is invalid
  *		*   0 on success (packet is forwarded, nexthop neighbor exists)
  *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
@@ -2068,8 +2091,8 @@
  *		translated to a keycode using the rc keymap, and reported as
  *		an input key down event. After a period a key up event is
  *		generated. This period can be extended by calling either
- *		**bpf_rc_keydown** () again with the same values, or calling
- *		**bpf_rc_repeat** ().
+ *		**bpf_rc_keydown**\ () again with the same values, or calling
+ *		**bpf_rc_repeat**\ ().
  *
  *		Some protocols include a toggle bit, in case the button	was
  *		released and pressed again between consecutive scancodes.
@@ -2152,29 +2175,30 @@
  *		The *flags* meaning is specific for each map type,
  *		and has to be 0 for cgroup local storage.
  *
- *		Depending on the bpf program type, a local storage area
- *		can be shared between multiple instances of the bpf program,
+ *		Depending on the BPF program type, a local storage area
+ *		can be shared between multiple instances of the BPF program,
  *		running simultaneously.
  *
  *		A user should care about the synchronization by himself.
- *		For example, by using the BPF_STX_XADD instruction to alter
+ *		For example, by using the **BPF_STX_XADD** instruction to alter
  *		the shared data.
  *	Return
- *		Pointer to the local storage area.
+ *		A pointer to the local storage area.
  *
  * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
  *	Description
- *		Select a SO_REUSEPORT sk from a	BPF_MAP_TYPE_REUSEPORT_ARRAY map
- *		It checks the selected sk is matching the incoming
- *		request in the skb.
+ *		Select a **SO_REUSEPORT** socket from a
+ *		**BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
+ *		It checks the selected socket is matching the incoming
+ *		request in the socket buffer.
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
- * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
  *	Description
  *		Look for TCP socket matching *tuple*, optionally in a child
  *		network namespace *netns*. The return value must be checked,
- *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
  *
  *		The *ctx* should point to the context of the program, such as
  *		the skb or socket (depending on the hook in use). This is used
@@ -2187,12 +2211,14 @@
  *		**sizeof**\ (*tuple*\ **->ipv6**)
  *			Look for an IPv6 socket.
  *
- *		If the *netns* is zero, then the socket lookup table in the
- *		netns associated with the *ctx* will be used. For the TC hooks,
- *		this in the netns of the device in the skb. For socket hooks,
- *		this in the netns of the socket. If *netns* is non-zero, then
- *		it specifies the ID of the netns relative to the netns
- *		associated with the *ctx*.
+ *		If the *netns* is a negative signed 32-bit integer, then the
+ *		socket lookup table in the netns associated with the *ctx* will
+ *		be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For socket hooks, this is the netns of the socket.
+ *		If *netns* is any other signed 32-bit value greater than or
+ *		equal to zero then it specifies the ID of the netns relative to
+ *		the netns associated with the *ctx*. *netns* values beyond the
+ *		range of 32-bit integers are reserved for future use.
  *
  *		All values for *flags* are reserved for future usage, and must
  *		be left at zero.
@@ -2200,13 +2226,15 @@
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  *
- * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
  *	Description
  *		Look for UDP socket matching *tuple*, optionally in a child
  *		network namespace *netns*. The return value must be checked,
- *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
  *
  *		The *ctx* should point to the context of the program, such as
  *		the skb or socket (depending on the hook in use). This is used
@@ -2219,12 +2247,14 @@
  *		**sizeof**\ (*tuple*\ **->ipv6**)
  *			Look for an IPv6 socket.
  *
- *		If the *netns* is zero, then the socket lookup table in the
- *		netns associated with the *ctx* will be used. For the TC hooks,
- *		this in the netns of the device in the skb. For socket hooks,
- *		this in the netns of the socket. If *netns* is non-zero, then
- *		it specifies the ID of the netns relative to the netns
- *		associated with the *ctx*.
+ *		If the *netns* is a negative signed 32-bit integer, then the
+ *		socket lookup table in the netns associated with the *ctx* will
+ *		be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For socket hooks, this is the netns of the socket.
+ *		If *netns* is any other signed 32-bit value greater than or
+ *		equal to zero then it specifies the ID of the netns relative to
+ *		the netns associated with the *ctx*. *netns* values beyond the
+ *		range of 32-bit integers are reserved for future use.
  *
  *		All values for *flags* are reserved for future usage, and must
  *		be left at zero.
@@ -2232,31 +2262,71 @@
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  *
- * int bpf_sk_release(struct bpf_sock *sk)
+ * int bpf_sk_release(struct bpf_sock *sock)
  *	Description
- *		Release the reference held by *sock*. *sock* must be a non-NULL
- *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *		Release the reference held by *sock*. *sock* must be a
+ *		non-**NULL** pointer that was returned from
+ *		**bpf_sk_lookup_xxx**\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
  *	Description
- *		For socket policies, insert *len* bytes into msg at offset
+ *		For socket policies, insert *len* bytes into *msg* at offset
  *		*start*.
  *
  *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
- *		*msg* it may want to insert metadata or options into the msg.
+ *		*msg* it may want to insert metadata or options into the *msg*.
  *		This can later be read and used by any of the lower layer BPF
  *		hooks.
  *
  *		This helper may fail if under memory pressure (a malloc
  *		fails) in these cases BPF programs will get an appropriate
  *		error and BPF programs will need to handle them.
- *
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ *	Description
+ *		Will remove *pop* bytes from a *msg* starting at byte *start*.
+ *		This may result in **ENOMEM** errors under certain situations if
+ *		an allocation and copy are required due to a full ring buffer.
+ *		However, the helper will try to avoid doing the allocation
+ *		if possible. Other errors can occur if input parameters are
+ *		invalid either due to *start* byte not being valid part of *msg*
+ *		payload and/or *pop* value being too large.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded pointer movement.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available if the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2349,7 +2419,9 @@
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
-	FN(msg_push_data),
+	FN(msg_push_data),		\
+	FN(msg_pop_data),		\
+	FN(rc_pointer_rel),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2405,6 +2477,9 @@
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
 #define BPF_F_CTXLEN_MASK		(0xfffffULL << 32)
 
+/* Current network namespace */
+#define BPF_F_CURRENT_NETNS		(-1L)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
@@ -2422,6 +2497,12 @@
 	BPF_LWT_ENCAP_SEG6_INLINE
 };
 
+#define __bpf_md_ptr(type, name)	\
+union {					\
+	type name;			\
+	__u64 :64;			\
+} __attribute__((aligned(8)))
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
@@ -2456,7 +2537,9 @@
 	/* ... here. */
 
 	__u32 data_meta;
-	struct bpf_flow_keys *flow_keys;
+	__bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
+	__u64 tstamp;
+	__u32 wire_len;
 };
 
 struct bpf_tunnel_key {
@@ -2572,8 +2655,8 @@
  * be added to the end of this structure
  */
 struct sk_msg_md {
-	void *data;
-	void *data_end;
+	__bpf_md_ptr(void *, data);
+	__bpf_md_ptr(void *, data_end);
 
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
@@ -2582,6 +2665,7 @@
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 size;		/* Total size of sk_msg */
 };
 
 struct sk_reuseport_md {
@@ -2589,8 +2673,9 @@
 	 * Start of directly accessible data. It begins from
 	 * the tcp/udp header.
 	 */
-	void *data;
-	void *data_end;		/* End of directly accessible data */
+	__bpf_md_ptr(void *, data);
+	/* End of directly accessible data */
+	__bpf_md_ptr(void *, data_end);
 	/*
 	 * Total length of packet (starting from the tcp/udp header).
 	 * Note that the directly accessible bytes (data_end - data)
@@ -2631,6 +2716,18 @@
 	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
 	__aligned_u64 jited_func_lens;
+	__u32 btf_id;
+	__u32 func_info_rec_size;
+	__aligned_u64 func_info;
+	__u32 nr_func_info;
+	__u32 nr_line_info;
+	__aligned_u64 line_info;
+	__aligned_u64 jited_line_info;
+	__u32 nr_jited_line_info;
+	__u32 line_info_rec_size;
+	__u32 jited_line_info_rec_size;
+	__u32 nr_prog_tags;
+	__aligned_u64 prog_tags;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -2942,4 +3039,19 @@
 	};
 };
 
+struct bpf_func_info {
+	__u32	insn_off;
+	__u32	type_id;
+};
+
+#define BPF_LINE_INFO_LINE_NUM(line_col)	((line_col) >> 10)
+#define BPF_LINE_INFO_LINE_COL(line_col)	((line_col) & 0x3ff)
+
+struct bpf_line_info {
+	__u32	insn_off;
+	__u32	file_name_off;
+	__u32	line_off;
+	__u32	line_col;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/src/cc/compat/linux/bpf_common.h b/src/cc/compat/linux/bpf_common.h
index a5c220e..ee97668 100644
--- a/src/cc/compat/linux/bpf_common.h
+++ b/src/cc/compat/linux/bpf_common.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 #ifndef _UAPI__LINUX_BPF_COMMON_H__
 #define _UAPI__LINUX_BPF_COMMON_H__
 
@@ -14,9 +15,10 @@
 
 /* ld/ldx fields */
 #define BPF_SIZE(code)  ((code) & 0x18)
-#define		BPF_W		0x00
-#define		BPF_H		0x08
-#define		BPF_B		0x10
+#define		BPF_W		0x00 /* 32-bit */
+#define		BPF_H		0x08 /* 16-bit */
+#define		BPF_B		0x10 /*  8-bit */
+/* eBPF		BPF_DW		0x18    64-bit */
 #define BPF_MODE(code)  ((code) & 0xe0)
 #define		BPF_IMM		0x00
 #define		BPF_ABS		0x20
diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h
index 26039d5..e4be3c4 100644
--- a/src/cc/compat/linux/virtual_bpf.h
+++ b/src/cc/compat/linux/virtual_bpf.h
@@ -134,6 +134,14 @@
 	BPF_MAP_TYPE_STACK,
 };
 
+/* Note that tracing related programs such as
+ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
+ * are not subject to a stable API since kernel internal data
+ * structures can change from release to release and may
+ * therefore break existing tracing BPF programs. Tracing BPF
+ * programs correspond to /a/ specific kernel which is to be
+ * analyzed, and not /a/ specific kernel /and/ all future ones.
+ */
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
@@ -233,6 +241,20 @@
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
+ * verifier will allow any alignment whatsoever.  On platforms
+ * with strict alignment requirements for loads and stores (such
+ * as sparc and mips) the verifier validates that all loads and
+ * stores provably follow this requirement.  This flag turns that
+ * checking and enforcement off.
+ *
+ * It is mostly used for testing when we want to validate the
+ * context and memory access aspects of the verifier, but because
+ * of an unaligned access the alignment check would trigger before
+ * the one we are interested in.
+ */
+#define BPF_F_ANY_ALIGNMENT	(1U << 1)
+
 /* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
 #define BPF_PSEUDO_MAP_FD	1
 
@@ -258,9 +280,6 @@
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
-/* flags for BPF_PROG_QUERY */
-#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
-
 #define BPF_OBJ_NAME_LEN 16U
 
 /* Flags for accessing BPF object */
@@ -270,6 +289,12 @@
 /* Flag for stack_map, store build_id+offset instead of pointer */
 #define BPF_F_STACK_BUILD_ID	(1U << 5)
 
+/* Zero-initialize hash function seed. This should only be used for testing. */
+#define BPF_F_ZERO_SEED		(1U << 6)
+
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -327,7 +352,7 @@
 		__u32		log_level;	/* verbosity level of verifier */
 		__u32		log_size;	/* size of user buffer */
 		__aligned_u64	log_buf;	/* user supplied buffer */
-		__u32		kern_version;	/* checked when prog_type=kprobe */
+		__u32		kern_version;	/* not used */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
 		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
@@ -336,6 +361,13 @@
 		 * (context accesses, allowed helpers, etc).
 		 */
 		__u32		expected_attach_type;
+		__u32		prog_btf_fd;	/* fd pointing to BTF type data */
+		__u32		func_info_rec_size;	/* userspace bpf_func_info size */
+		__aligned_u64	func_info;	/* func info */
+		__u32		func_info_cnt;	/* number of bpf_func_info records */
+		__u32		line_info_rec_size;	/* userspace bpf_line_info size */
+		__aligned_u64	line_info;	/* line info */
+		__u32		line_info_cnt;	/* number of bpf_line_info records */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -354,8 +386,11 @@
 	struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
 		__u32		prog_fd;
 		__u32		retval;
-		__u32		data_size_in;
-		__u32		data_size_out;
+		__u32		data_size_in;	/* input: len of data_in */
+		__u32		data_size_out;	/* input/output: len of data_out
+						 *   returns ENOSPC if data_out
+						 *   is too small.
+						 */
 		__aligned_u64	data_in;
 		__aligned_u64	data_out;
 		__u32		repeat;
@@ -476,18 +511,6 @@
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_map_pop_elem(struct bpf_map *map, void *value)
- * 	Description
- * 		Pop an element from *map*.
- * Return
- * 		0 on success, or a negative error in case of failure.
- *
- * int bpf_map_peek_elem(struct bpf_map *map, void *value)
- * 	Description
- * 		Get an element from *map* without removing it.
- * Return
- * 		0 on success, or a negative error in case of failure.
- *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -1911,9 +1934,9 @@
  *		is set to metric from route (IPv4/IPv6 only), and ifindex
  *		is set to the device index of the nexthop from the FIB lookup.
  *
- *             *plen* argument is the size of the passed in struct.
- *             *flags* argument can be a combination of one or more of the
- *             following values:
+ *		*plen* argument is the size of the passed in struct.
+ *		*flags* argument can be a combination of one or more of the
+ *		following values:
  *
  *		**BPF_FIB_LOOKUP_DIRECT**
  *			Do a direct table lookup vs full lookup using FIB
@@ -1922,9 +1945,9 @@
  *			Perform lookup from an egress perspective (default is
  *			ingress).
  *
- *             *ctx* is either **struct xdp_md** for XDP programs or
- *             **struct sk_buff** tc cls_act programs.
- *     Return
+ *		*ctx* is either **struct xdp_md** for XDP programs or
+ *		**struct sk_buff** tc cls_act programs.
+ *	Return
  *		* < 0 if any input argument is invalid
  *		*   0 on success (packet is forwarded, nexthop neighbor exists)
  *		* > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
@@ -2069,8 +2092,8 @@
  *		translated to a keycode using the rc keymap, and reported as
  *		an input key down event. After a period a key up event is
  *		generated. This period can be extended by calling either
- *		**bpf_rc_keydown** () again with the same values, or calling
- *		**bpf_rc_repeat** ().
+ *		**bpf_rc_keydown**\ () again with the same values, or calling
+ *		**bpf_rc_repeat**\ ().
  *
  *		Some protocols include a toggle bit, in case the button	was
  *		released and pressed again between consecutive scancodes.
@@ -2153,29 +2176,30 @@
  *		The *flags* meaning is specific for each map type,
  *		and has to be 0 for cgroup local storage.
  *
- *		Depending on the bpf program type, a local storage area
- *		can be shared between multiple instances of the bpf program,
+ *		Depending on the BPF program type, a local storage area
+ *		can be shared between multiple instances of the BPF program,
  *		running simultaneously.
  *
  *		A user should care about the synchronization by himself.
- *		For example, by using the BPF_STX_XADD instruction to alter
+ *		For example, by using the **BPF_STX_XADD** instruction to alter
  *		the shared data.
  *	Return
- *		Pointer to the local storage area.
+ *		A pointer to the local storage area.
  *
  * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
  *	Description
- *		Select a SO_REUSEPORT sk from a	BPF_MAP_TYPE_REUSEPORT_ARRAY map
- *		It checks the selected sk is matching the incoming
- *		request in the skb.
+ *		Select a **SO_REUSEPORT** socket from a
+ *		**BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*.
+ *		It checks the selected socket is matching the incoming
+ *		request in the socket buffer.
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
- * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
  *	Description
  *		Look for TCP socket matching *tuple*, optionally in a child
  *		network namespace *netns*. The return value must be checked,
- *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
  *
  *		The *ctx* should point to the context of the program, such as
  *		the skb or socket (depending on the hook in use). This is used
@@ -2188,12 +2212,14 @@
  *		**sizeof**\ (*tuple*\ **->ipv6**)
  *			Look for an IPv6 socket.
  *
- *		If the *netns* is zero, then the socket lookup table in the
- *		netns associated with the *ctx* will be used. For the TC hooks,
- *		this in the netns of the device in the skb. For socket hooks,
- *		this in the netns of the socket. If *netns* is non-zero, then
- *		it specifies the ID of the netns relative to the netns
- *		associated with the *ctx*.
+ *		If the *netns* is a negative signed 32-bit integer, then the
+ *		socket lookup table in the netns associated with the *ctx* will
+ *		be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For socket hooks, this is the netns of the socket.
+ *		If *netns* is any other signed 32-bit value greater than or
+ *		equal to zero then it specifies the ID of the netns relative to
+ *		the netns associated with the *ctx*. *netns* values beyond the
+ *		range of 32-bit integers are reserved for future use.
  *
  *		All values for *flags* are reserved for future usage, and must
  *		be left at zero.
@@ -2201,13 +2227,15 @@
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  *
- * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags)
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
  *	Description
  *		Look for UDP socket matching *tuple*, optionally in a child
  *		network namespace *netns*. The return value must be checked,
- *		and if non-NULL, released via **bpf_sk_release**\ ().
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
  *
  *		The *ctx* should point to the context of the program, such as
  *		the skb or socket (depending on the hook in use). This is used
@@ -2220,12 +2248,14 @@
  *		**sizeof**\ (*tuple*\ **->ipv6**)
  *			Look for an IPv6 socket.
  *
- *		If the *netns* is zero, then the socket lookup table in the
- *		netns associated with the *ctx* will be used. For the TC hooks,
- *		this in the netns of the device in the skb. For socket hooks,
- *		this in the netns of the socket. If *netns* is non-zero, then
- *		it specifies the ID of the netns relative to the netns
- *		associated with the *ctx*.
+ *		If the *netns* is a negative signed 32-bit integer, then the
+ *		socket lookup table in the netns associated with the *ctx* will
+ *		be used. For the TC hooks, this is the netns of the device
+ *		in the skb. For socket hooks, this is the netns of the socket.
+ *		If *netns* is any other signed 32-bit value greater than or
+ *		equal to zero then it specifies the ID of the netns relative to
+ *		the netns associated with the *ctx*. *netns* values beyond the
+ *		range of 32-bit integers are reserved for future use.
  *
  *		All values for *flags* are reserved for future usage, and must
  *		be left at zero.
@@ -2233,31 +2263,71 @@
  *		This helper is available only if the kernel was compiled with
  *		**CONFIG_NET** configuration option.
  *	Return
- *		Pointer to *struct bpf_sock*, or NULL in case of failure.
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  *
- * int bpf_sk_release(struct bpf_sock *sk)
+ * int bpf_sk_release(struct bpf_sock *sock)
  *	Description
- *		Release the reference held by *sock*. *sock* must be a non-NULL
- *		pointer that was returned from bpf_sk_lookup_xxx\ ().
+ *		Release the reference held by *sock*. *sock* must be a
+ *		non-**NULL** pointer that was returned from
+ *		**bpf_sk_lookup_xxx**\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags)
  *	Description
- *		For socket policies, insert *len* bytes into msg at offset
+ *		For socket policies, insert *len* bytes into *msg* at offset
  *		*start*.
  *
  *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
- *		*msg* it may want to insert metadata or options into the msg.
+ *		*msg* it may want to insert metadata or options into the *msg*.
  *		This can later be read and used by any of the lower layer BPF
  *		hooks.
  *
  *		This helper may fail if under memory pressure (a malloc
  *		fails) in these cases BPF programs will get an appropriate
  *		error and BPF programs will need to handle them.
- *
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+ *	Description
+ *		Will remove *pop* bytes from a *msg* starting at byte *start*.
+ *		This may result in **ENOMEM** errors under certain situations if
+ *		an allocation and copy are required due to a full ring buffer.
+ *		However, the helper will try to avoid doing the allocation
+ *		if possible. Other errors can occur if input parameters are
+ *		invalid either due to *start* byte not being valid part of *msg*
+ *		payload and/or *pop* value being too large.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
+ *	Description
+ *		This helper is used in programs implementing IR decoding, to
+ *		report a successfully decoded pointer movement.
+ *
+ *		The *ctx* should point to the lirc sample as passed into
+ *		the program.
+ *
+ *		This helper is only available if the kernel was compiled with
+ *		the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ *		"**y**".
+ *	Return
+ *		0
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2350,7 +2420,9 @@
 	FN(map_push_elem),		\
 	FN(map_pop_elem),		\
 	FN(map_peek_elem),		\
-	FN(msg_push_data),
+	FN(msg_push_data),		\
+	FN(msg_pop_data),		\
+	FN(rc_pointer_rel),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2406,6 +2478,9 @@
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
 #define BPF_F_CTXLEN_MASK		(0xfffffULL << 32)
 
+/* Current network namespace */
+#define BPF_F_CURRENT_NETNS		(-1L)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
@@ -2423,6 +2498,12 @@
 	BPF_LWT_ENCAP_SEG6_INLINE
 };
 
+#define __bpf_md_ptr(type, name)	\
+union {					\
+	type name;			\
+	__u64 :64;			\
+} __attribute__((aligned(8)))
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
@@ -2457,7 +2538,9 @@
 	/* ... here. */
 
 	__u32 data_meta;
-	struct bpf_flow_keys *flow_keys;
+	__bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
+	__u64 tstamp;
+	__u32 wire_len;
 };
 
 struct bpf_tunnel_key {
@@ -2573,8 +2656,8 @@
  * be added to the end of this structure
  */
 struct sk_msg_md {
-	void *data;
-	void *data_end;
+	__bpf_md_ptr(void *, data);
+	__bpf_md_ptr(void *, data_end);
 
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
@@ -2583,6 +2666,7 @@
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 size;		/* Total size of sk_msg */
 };
 
 struct sk_reuseport_md {
@@ -2590,8 +2674,9 @@
 	 * Start of directly accessible data. It begins from
 	 * the tcp/udp header.
 	 */
-	void *data;
-	void *data_end;		/* End of directly accessible data */
+	__bpf_md_ptr(void *, data);
+	/* End of directly accessible data */
+	__bpf_md_ptr(void *, data_end);
 	/*
 	 * Total length of packet (starting from the tcp/udp header).
 	 * Note that the directly accessible bytes (data_end - data)
@@ -2632,6 +2717,18 @@
 	__u32 nr_jited_func_lens;
 	__aligned_u64 jited_ksyms;
 	__aligned_u64 jited_func_lens;
+	__u32 btf_id;
+	__u32 func_info_rec_size;
+	__aligned_u64 func_info;
+	__u32 nr_func_info;
+	__u32 nr_line_info;
+	__aligned_u64 line_info;
+	__aligned_u64 jited_line_info;
+	__u32 nr_jited_line_info;
+	__u32 line_info_rec_size;
+	__u32 jited_line_info_rec_size;
+	__u32 nr_prog_tags;
+	__aligned_u64 prog_tags;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -2943,5 +3040,20 @@
 	};
 };
 
+struct bpf_func_info {
+	__u32	insn_off;
+	__u32	type_id;
+};
+
+#define BPF_LINE_INFO_LINE_NUM(line_col)	((line_col) >> 10)
+#define BPF_LINE_INFO_LINE_COL(line_col)	((line_col) & 0x3ff)
+
+struct bpf_line_info {
+	__u32	insn_off;
+	__u32	file_name_off;
+	__u32	line_off;
+	__u32	line_col;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
 )********"
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
index 882a79d..00b9160 100755
--- a/src/cc/export/helpers.h
+++ b/src/cc/export/helpers.h
@@ -17,6 +17,21 @@
 #ifndef __BPF_HELPERS_H
 #define __BPF_HELPERS_H
 
+/* Before bpf_helpers.h is included, uapi bpf.h has been
+ * included, which references linux/types.h. This will bring
+ * in asm_volatile_goto definition if permitted based on
+ * compiler setup and kernel configs.
+ *
+ * clang does not support "asm volatile goto" yet.
+ * So redefine asm_volatile_goto to some invalid asm code.
+ * If asm_volatile_goto is actually used by the bpf program,
+ * a compilation error will appear.
+ */
+#ifdef asm_volatile_goto
+#undef asm_volatile_goto
+#define asm_volatile_goto(x...) asm volatile("invalid use of asm_volatile_goto")
+#endif
+
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/if_packet.h>
 #include <linux/version.h>
@@ -435,6 +450,10 @@
   (void *) BPF_FUNC_map_peek_elem;
 static int (*bpf_msg_push_data)(void *skb, u32 start, u32 len, u64 flags) =
   (void *) BPF_FUNC_msg_push_data;
+static int (*bpf_msg_pop_data)(void *msg, u32 start, u32 pop, u64 flags) =
+  (void *) BPF_FUNC_msg_pop_data;
+static int (*bpf_rc_pointer_rel)(void *ctx, s32 rel_x, s32 rel_y) =
+  (void *) BPF_FUNC_rc_pointer_rel;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
@@ -737,6 +756,7 @@
 #define PT_REGS_PARM4(ctx)	((ctx)->cx)
 #define PT_REGS_PARM5(ctx)	((ctx)->r8)
 #define PT_REGS_PARM6(ctx)	((ctx)->r9)
+#define PT_REGS_RET(ctx)	((ctx)->sp)
 #define PT_REGS_FP(ctx)         ((ctx)->bp) /* Works only with CONFIG_FRAME_POINTER */
 #define PT_REGS_RC(ctx)		((ctx)->ax)
 #define PT_REGS_IP(ctx)		((ctx)->ip)
diff --git a/src/cc/frontends/b/type_check.cc b/src/cc/frontends/b/type_check.cc
index 8d49de9..7c5b7ce 100644
--- a/src/cc/frontends/b/type_check.cc
+++ b/src/cc/frontends/b/type_check.cc
@@ -204,6 +204,7 @@
     case Tok::TCGT:
     case Tok::TCGE:
       n->bit_width_ = 1;
+      break;
     default:
       n->bit_width_ = std::max(n->lhs_->bit_width_, n->rhs_->bit_width_);
   }
diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc
index 12095e6..20af23b 100644
--- a/src/cc/frontends/clang/b_frontend_action.cc
+++ b/src/cc/frontends/clang/b_frontend_action.cc
@@ -1080,6 +1080,26 @@
   return C.getDiagnostics().Report(loc, diag_id);
 }
 
+int64_t BTypeVisitor::getFieldValue(VarDecl *Decl, FieldDecl *FDecl, int64_t OrigFValue) {
+  unsigned idx = FDecl->getFieldIndex();
+
+  if (auto I = dyn_cast_or_null<InitListExpr>(Decl->getInit())) {
+#if LLVM_MAJOR_VERSION >= 8
+    Expr::EvalResult res;
+    if (I->getInit(idx)->EvaluateAsInt(res, C)) {
+      return res.Val.getInt().getExtValue();
+    }
+#else
+    llvm::APSInt res;
+    if (I->getInit(idx)->EvaluateAsInt(res, C)) {
+      return res.getExtValue();
+    }
+#endif
+  }
+
+  return OrigFValue;
+}
+
 // Open table FDs when bpf tables (as denoted by section("maps*") attribute)
 // are declared.
 bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
@@ -1124,21 +1144,9 @@
         table.leaf_size = sz;
         leaf_type = F->getType();
       } else if (F->getName() == "max_entries") {
-        unsigned idx = F->getFieldIndex();
-        if (auto I = dyn_cast_or_null<InitListExpr>(Decl->getInit())) {
-          llvm::APSInt res;
-          if (I->getInit(idx)->EvaluateAsInt(res, C)) {
-            table.max_entries = res.getExtValue();
-          }
-        }
+            table.max_entries = getFieldValue(Decl, F, table.max_entries);
       } else if (F->getName() == "flags") {
-        unsigned idx = F->getFieldIndex();
-        if (auto I = dyn_cast_or_null<InitListExpr>(Decl->getInit())) {
-          llvm::APSInt res;
-          if (I->getInit(idx)->EvaluateAsInt(res, C)) {
-            table.flags = res.getExtValue();
-          }
-        }
+            table.flags = getFieldValue(Decl, F, table.flags);
       }
       ++i;
     }
diff --git a/src/cc/frontends/clang/b_frontend_action.h b/src/cc/frontends/clang/b_frontend_action.h
index 4559d11..37aea82 100644
--- a/src/cc/frontends/clang/b_frontend_action.h
+++ b/src/cc/frontends/clang/b_frontend_action.h
@@ -75,6 +75,8 @@
   void genParamIndirectAssign(clang::FunctionDecl *D, std::string& preamble,
                               const char **calling_conv_regs);
   void rewriteFuncParam(clang::FunctionDecl *D);
+  int64_t getFieldValue(clang::VarDecl *Decl, clang::FieldDecl *FDecl,
+                        int64_t OrigFValue);
   template <unsigned N>
   clang::DiagnosticBuilder error(clang::SourceLocation loc, const char (&fmt)[N]);
   template <unsigned N>
diff --git a/src/cc/frontends/clang/kbuild_helper.cc b/src/cc/frontends/clang/kbuild_helper.cc
index 63bb7d2..acacdd2 100644
--- a/src/cc/frontends/clang/kbuild_helper.cc
+++ b/src/cc/frontends/clang/kbuild_helper.cc
@@ -34,35 +34,35 @@
   //               -e s/ppc.*/powerpc/ -e s/mips.*/mips/ -e s/sh[234].*/sh/
   //               -e s/aarch64.*/arm64/
 
-  string arch = uname_machine;
-  const char *archenv;
-
-  if (!strncmp(uname_machine, "x86_64", 6)) {
-    arch = "x86";
-  } else if (uname_machine[0] == 'i' && !strncmp(&uname_machine[2], "86", 2)) {
-    arch = "x86";
-  } else if (!strncmp(uname_machine, "arm", 3)) {
-    arch = "arm";
-  } else if (!strncmp(uname_machine, "sa110", 5)) {
-    arch = "arm";
-  } else if (!strncmp(uname_machine, "s390x", 5)) {
-    arch = "s390";
-  } else if (!strncmp(uname_machine, "parisc64", 8)) {
-    arch = "parisc";
-  } else if (!strncmp(uname_machine, "ppc", 3)) {
-    arch = "powerpc";
-  } else if (!strncmp(uname_machine, "mips", 4)) {
-    arch = "mips";
-  } else if (!strncmp(uname_machine, "sh", 2)) {
-    arch = "sh";
-  } else if (!strncmp(uname_machine, "aarch64", 7)) {
-    arch = "arm64";
-  }
-
+  string arch;
+  const char *archenv = getenv("ARCH");
   // If ARCH env is defined, use it over uname
-  archenv = getenv("ARCH");
   if (archenv)
     arch = string(archenv);
+  else
+    arch = string(uname_machine);
+
+  if (!arch.compare(0, 6, "x86_64")) {
+    arch = "x86";
+  } else if (arch.size() >= 2 && arch[0] == 'i' && !arch.compare(2, 2, "86")) {
+    arch = "x86";  // size guard: compare(2, ...) throws when pos > size()
+  } else if (!arch.compare(0, 7, "aarch64") || !arch.compare(0, 5, "arm64")) {
+    arch = "arm64";  // checked before "arm" so ARCH=arm64 is not cut to arm
+  } else if (!arch.compare(0, 3, "arm")) {
+    arch = "arm";
+  } else if (!arch.compare(0, 5, "sa110")) {
+    arch = "arm";
+  } else if (!arch.compare(0, 5, "s390x")) {
+    arch = "s390";
+  } else if (!arch.compare(0, 8, "parisc64")) {
+    arch = "parisc";
+  } else if (!arch.compare(0, 3, "ppc")) {
+    arch = "powerpc";
+  } else if (!arch.compare(0, 4, "mips")) {
+    arch = "mips";
+  } else if (!arch.compare(0, 2, "sh")) {
+    arch = "sh";
+  }
 
   cflags->push_back("-nostdinc");
   cflags->push_back("-isystem");
diff --git a/src/cc/frontends/p4/compiler/ebpfTable.py b/src/cc/frontends/p4/compiler/ebpfTable.py
index eb1efd9..4b7e023 100644
--- a/src/cc/frontends/p4/compiler/ebpfTable.py
+++ b/src/cc/frontends/p4/compiler/ebpfTable.py
@@ -110,7 +110,7 @@
                 ebpfHeader = program.getInstance(instance.name)
                 assert isinstance(ebpfHeader, ebpfInstance.SimpleInstance)
                 basetype = ebpfHeader.type
-                eInstance = program.getInstance(instance.base_name)
+                eInstance = program.getInstance(instance.name)
 
             ebpfField = basetype.getField(fieldname)
             assert isinstance(ebpfField, ebpfStructType.EbpfField)
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index 1cf315a..90ae2fd 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -180,6 +180,8 @@
   {"map_pop_elem", "4.20"},
   {"map_peak_elem", "4.20"},
   {"msg_push_data", "4.20"},
+  {"msg_pop_data", "4.21"},
+  {"rc_pointer_rel", "4.21"},
 };
 
 static uint64_t ptr_to_u64(void *ptr)
@@ -342,6 +344,14 @@
       "you'll need to be explicit.\n\n");
   }
 
+  // referencing global/static variables or read only data
+  if (strstr(log, "unknown opcode") != NULL) {
+    fprintf(stderr, "HINT: The 'unknown opcode' can happen if you reference "
+      "a global or static variable, or data in read-only section. For example,"
+      " 'char *p = \"hello\"' will result in p referencing a read-only section,"
+      " and 'char p[] = \"hello\"' will have \"hello\" stored on the stack.\n\n");
+  }
+
   // helper function not found in kernel
   char *helper_str = strstr(log, "invalid func ");
   if (helper_str != NULL) {
@@ -521,14 +531,16 @@
     }
   }
 
-  if (strncmp(name, "kprobe__", 8) == 0)
-    name_offset = 8;
-  else if (strncmp(name, "tracepoint__", 12) == 0)
-    name_offset = 12;
-  else if (strncmp(name, "raw_tracepoint__", 16) == 0)
-    name_offset = 16;
-  memcpy(attr.prog_name, name + name_offset,
-         min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1));
+  if (name_len) {
+    if (strncmp(name, "kprobe__", 8) == 0)
+      name_offset = 8;
+    else if (strncmp(name, "tracepoint__", 12) == 0)
+      name_offset = 12;
+    else if (strncmp(name, "raw_tracepoint__", 16) == 0)
+      name_offset = 16;
+    memcpy(attr.prog_name, name + name_offset,
+           min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1));
+  }
 
   ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
   // BPF object name is not supported on older Kernels.
@@ -698,7 +710,7 @@
   close(fd);
   if (ret < 0 || ret >= (int)sizeof(buf))
     return -1;
-  if (strlen(buf) < strlen("config:"))
+  if (strncmp(buf, "config:", strlen("config:")))
     return -1;
   errno = 0;
   ret = (int)strtol(buf + strlen("config:"), NULL, 10);
diff --git a/src/lua/bcc/bpf.lua b/src/lua/bcc/bpf.lua
index fa987f3..44b801c 100644
--- a/src/lua/bcc/bpf.lua
+++ b/src/lua/bcc/bpf.lua
@@ -211,9 +211,10 @@
   local event = args.event or ""
   local ptype = args.retprobe and "r" or "p"
   local ev_name = string.format("%s_%s", ptype, event:gsub("[%+%.]", "_"))
+  local offset = args.fn_offset or 0
   local retprobe = args.retprobe and 1 or 0
 
-  local res = libbcc.bpf_attach_kprobe(fn.fd, retprobe, ev_name, event)
+  local res = libbcc.bpf_attach_kprobe(fn.fd, retprobe, ev_name, event, offset)
 
   assert(res >= 0, "failed to attach BPF to kprobe")
   self:probe_store("kprobe", ev_name, res)
diff --git a/src/lua/bcc/libbcc.lua b/src/lua/bcc/libbcc.lua
index c518a89..47a2d3b 100644
--- a/src/lua/bcc/libbcc.lua
+++ b/src/lua/bcc/libbcc.lua
@@ -43,7 +43,7 @@
 typedef void (*perf_reader_lost_cb)(void *cb_cookie, uint64_t lost);
 
 int bpf_attach_kprobe(int progfd, int attach_type, const char *ev_name,
-                      const char *fn_name);
+                      const char *fn_name, uint64_t fn_offset);
 
 int bpf_detach_kprobe(const char *ev_name);
 
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 7ce8366..fa60239 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -1,42 +1,46 @@
 # Copyright (c) PLUMgrid, Inc.
 # Licensed under the Apache License, Version 2.0 (the "License")
 
-set(PYTHON_SRC __init__.py perf.py tcp.py utils.py libbcc.py table.py usdt.py)
-
-foreach (PY_SRC ${PYTHON_SRC})
-  configure_file(bcc/${PY_SRC} ${CMAKE_CURRENT_BINARY_DIR}/bcc/${PY_SRC} COPYONLY)
-endforeach()
-
 if(NOT PYTHON_CMD)
   set(PYTHON_CMD "python")
 endif()
 
-configure_file(setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py @ONLY)
-configure_file(bcc/version.py.in ${CMAKE_CURRENT_BINARY_DIR}/bcc/version.py @ONLY)
 if(EXISTS "/etc/debian_version")
   set(PYTHON_FLAGS "${PYTHON_FLAGS} --install-layout deb")
 endif()
 
+file(GLOB_RECURSE PYTHON_SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/*.py)
+file(GLOB_RECURSE PYTHON_INCLUDES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/*.py.in)
+
 foreach(PY_CMD ${PYTHON_CMD})
   string(REPLACE "/" "-" PY_CMD_ESCAPED ${PY_CMD})
+  set(PY_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bcc-${PY_CMD_ESCAPED})
 
-  set(PY_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/bcc/__init__.py ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
-  if (PREVIOUS_PY)
-    set(PY_DEPENDS ${PY_DEPENDS} ${PREVIOUS_PY})
-  endif()
+  foreach(PY_SRC ${PYTHON_SOURCES})
+    configure_file(${PY_SRC} ${PY_DIRECTORY}/${PY_SRC} COPYONLY)
+  endforeach()
 
-  set(PIP_INSTALLABLE "${CMAKE_CURRENT_BINARY_DIR}/dist-${PY_CMD_ESCAPED}/bcc-${REVISION}.tar.gz")
-  # build the pip installable
-  add_custom_command(OUTPUT ${PIP_INSTALLABLE}
-    COMMAND ${PY_CMD} setup.py sdist --dist-dir dist-${PY_CMD_ESCAPED}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS ${PY_DEPENDS}
-    )
+  foreach(PY_INC ${PYTHON_INCLUDES})
+    string(REPLACE ".py.in" ".py" PY_INC_REPLACED ${PY_INC})
+    configure_file(${PY_INC} ${PY_DIRECTORY}/${PY_INC_REPLACED} @ONLY)
+  endforeach()
+
+  set(PIP_INSTALLABLE "${PY_DIRECTORY}/dist/bcc-${REVISION}.tar.gz")
+  # Build the pip-installable sdist from the per-interpreter staging directory.
+  add_custom_command(
+    OUTPUT ${PIP_INSTALLABLE}
+    COMMAND ${PY_CMD} setup.py sdist
+    WORKING_DIRECTORY ${PY_DIRECTORY}
+    DEPENDS ${PYTHON_SOURCES} ${PYTHON_INCLUDES}
+    COMMENT "Building sdist for ${PY_CMD}" VERBATIM)
   add_custom_target(bcc_py_${PY_CMD_ESCAPED} ALL DEPENDS ${PIP_INSTALLABLE})
 
-  install(CODE "execute_process(COMMAND ${PY_CMD} setup.py install -f ${PYTHON_FLAGS}
-    --prefix=${CMAKE_INSTALL_PREFIX} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})"
+  install(
+    CODE "
+      execute_process(
+        COMMAND ${PY_CMD} setup.py install -f ${PYTHON_FLAGS} --prefix=${CMAKE_INSTALL_PREFIX}
+        WORKING_DIRECTORY ${PY_DIRECTORY})"
     COMPONENT python)
-
-  set(PREVIOUS_PY ${PIP_INSTALLABLE})
 endforeach()
diff --git a/src/python/MANIFEST b/src/python/MANIFEST
deleted file mode 100644
index f6e1add..0000000
--- a/src/python/MANIFEST
+++ /dev/null
@@ -1,5 +0,0 @@
-# file GENERATED by distutils, do NOT edit
-setup.py
-bcc/__init__.py
-bcc/table.py
-bcc/libbcc.py
diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py
index 1dfd830..1d99afd 100644
--- a/src/python/bcc/__init__.py
+++ b/src/python/bcc/__init__.py
@@ -27,6 +27,7 @@
 from .libbcc import lib, bcc_symbol, bcc_symbol_option, _SYM_CB_TYPE
 from .table import Table, PerfEventArray
 from .perf import Perf
+from .syscall import syscall_name
 from .utils import get_online_cpus, printb, _assert_is_bytes, ArgString
 from .version import __version__
 
@@ -171,6 +172,7 @@
         b"__x64_sys_",
         b"__x32_compat_sys_",
         b"__ia32_compat_sys_",
+        b"__arm64_sys_",
     ]
 
     # BPF timestamps come from the monotonic clock. To be able to filter
@@ -514,9 +516,12 @@
         fns = []
 
         in_init_section = 0
+        in_irq_section = 0
         with open("/proc/kallsyms", "rb") as avail_file:
             for line in avail_file:
                 (t, fn) = line.rstrip().split()[1:3]
+                # Skip all functions defined between __init_begin and
+                # __init_end
                 if in_init_section == 0:
                     if fn == b'__init_begin':
                         in_init_section = 1
@@ -525,6 +530,26 @@
                     if fn == b'__init_end':
                         in_init_section = 2
                     continue
+                # Skip all functions defined between __irqentry_text_start and
+                # __irqentry_text_end
+                if in_irq_section == 0:
+                    if fn == b'__irqentry_text_start':
+                        in_irq_section = 1
+                        continue
+                elif in_irq_section == 1:
+                    if fn == b'__irqentry_text_end':
+                        in_irq_section = 2
+                    continue
+                # All functions defined as NOKPROBE_SYMBOL() start with the
+                # prefix _kbl_addr_*; blacklisting them by looking at the name
+                # also catches those symbols that are defined in kernel
+                # modules.
+                if fn.startswith(b'_kbl_addr_'):
+                    continue
+                # Explicitly blacklist perf-related functions, they are all
+                # non-attachable.
+                elif fn.startswith(b'__perf') or fn.startswith(b'perf_'):
+                    continue
                 if (t.lower() in [b't', b'w']) and re.match(event_re, fn) \
                     and fn not in blacklist:
                     fns.append(fn)
@@ -602,7 +627,8 @@
         ev_name = b"p_" + event.replace(b"+", b"_").replace(b".", b"_")
         fd = lib.bpf_attach_kprobe(fn.fd, 0, ev_name, event, event_off)
         if fd < 0:
-            raise Exception("Failed to attach BPF to kprobe")
+            raise Exception("Failed to attach BPF program %s to kprobe %s" %
+                            (fn_name, event))
         self._add_kprobe_fd(ev_name, fd)
         return self
 
@@ -625,13 +651,14 @@
         ev_name = b"r_" + event.replace(b"+", b"_").replace(b".", b"_")
         fd = lib.bpf_attach_kprobe(fn.fd, 1, ev_name, event, 0)
         if fd < 0:
-            raise Exception("Failed to attach BPF to kretprobe")
+            raise Exception("Failed to attach BPF program %s to kretprobe %s" %
+                            (fn_name, event))
         self._add_kprobe_fd(ev_name, fd)
         return self
 
     def detach_kprobe_event(self, ev_name):
         if ev_name not in self.kprobe_fds:
-            raise Exception("Kprobe %s is not attached" % event)
+            raise Exception("Kprobe %s is not attached" % ev_name)
         res = lib.bpf_close_perf_event_fd(self.kprobe_fds[ev_name])
         if res < 0:
             raise Exception("Failed to close kprobe FD")
@@ -766,7 +793,8 @@
         (tp_category, tp_name) = tp.split(b':')
         fd = lib.bpf_attach_tracepoint(fn.fd, tp_category, tp_name)
         if fd < 0:
-            raise Exception("Failed to attach BPF to tracepoint")
+            raise Exception("Failed to attach BPF program %s to tracepoint %s" %
+                            (fn_name, tp))
         self.tracepoint_fds[tp] = fd
         return self
 
@@ -1231,6 +1259,9 @@
         """
         self.perf_buffer_poll(timeout)
 
+    def free_bcc_memory(self):
+        return lib.bcc_free_memory()
+
     def donothing(self):
         """the do nothing exit handler"""
 
diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py
index e61227e..6b5be77 100644
--- a/src/python/bcc/libbcc.py
+++ b/src/python/bcc/libbcc.py
@@ -88,7 +88,8 @@
 _RAW_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_void_p, ct.c_int)
 _LOST_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_ulonglong)
 lib.bpf_attach_kprobe.restype = ct.c_int
-lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_int, ct.c_char_p, ct.c_char_p]
+lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_int, ct.c_char_p, ct.c_char_p,
+        ct.c_ulonglong]
 lib.bpf_detach_kprobe.restype = ct.c_int
 lib.bpf_detach_kprobe.argtypes = [ct.c_char_p]
 lib.bpf_attach_uprobe.restype = ct.c_int
@@ -176,6 +177,9 @@
 lib.bcc_symcache_refresh.restype = None
 lib.bcc_symcache_refresh.argtypes = [ct.c_void_p]
 
+lib.bcc_free_memory.restype = ct.c_int
+lib.bcc_free_memory.argtypes = None
+
 lib.bcc_usdt_new_frompid.restype = ct.c_void_p
 lib.bcc_usdt_new_frompid.argtypes = [ct.c_int, ct.c_char_p]
 
diff --git a/src/python/bcc/syscall.py b/src/python/bcc/syscall.py
new file mode 100644
index 0000000..752b64e
--- /dev/null
+++ b/src/python/bcc/syscall.py
@@ -0,0 +1,391 @@
+# Copyright 2017 Sasha Goldshtein
+# Copyright 2018 Red Hat, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""syscall.py contains functions useful for mapping between syscall names and numbers"""
+
+import subprocess
+import platform
+
+#
+# Syscall table for Linux x86_64, not very recent.
+# Automatically generated from strace/linux/x86_64/syscallent.h using the
+# following command:
+#
+# cat syscallent.h | awk -F, '{ gsub(/[ \t"}]/, "", $4);
+#                               gsub(/[\[\] \t{]/, "", $1); split($1, a, "=");
+#                               print "    "a[1]": b\""$4"\","; }
+#                               BEGIN { print "syscalls = {" }
+#                               END { print "}" '}
+syscalls = {
+    0: b"read",
+    1: b"write",
+    2: b"open",
+    3: b"close",
+    4: b"stat",
+    5: b"fstat",
+    6: b"lstat",
+    7: b"poll",
+    8: b"lseek",
+    9: b"mmap",
+    10: b"mprotect",
+    11: b"munmap",
+    12: b"brk",
+    13: b"rt_sigaction",
+    14: b"rt_sigprocmask",
+    15: b"rt_sigreturn",
+    16: b"ioctl",
+    17: b"pread64",
+    18: b"pwrite64",
+    19: b"readv",
+    20: b"writev",
+    21: b"access",
+    22: b"pipe",
+    23: b"select",
+    24: b"sched_yield",
+    25: b"mremap",
+    26: b"msync",
+    27: b"mincore",
+    28: b"madvise",
+    29: b"shmget",
+    30: b"shmat",
+    31: b"shmctl",
+    32: b"dup",
+    33: b"dup2",
+    34: b"pause",
+    35: b"nanosleep",
+    36: b"getitimer",
+    37: b"alarm",
+    38: b"setitimer",
+    39: b"getpid",
+    40: b"sendfile",
+    41: b"socket",
+    42: b"connect",
+    43: b"accept",
+    44: b"sendto",
+    45: b"recvfrom",
+    46: b"sendmsg",
+    47: b"recvmsg",
+    48: b"shutdown",
+    49: b"bind",
+    50: b"listen",
+    51: b"getsockname",
+    52: b"getpeername",
+    53: b"socketpair",
+    54: b"setsockopt",
+    55: b"getsockopt",
+    56: b"clone",
+    57: b"fork",
+    58: b"vfork",
+    59: b"execve",
+    60: b"exit",
+    61: b"wait4",
+    62: b"kill",
+    63: b"uname",
+    64: b"semget",
+    65: b"semop",
+    66: b"semctl",
+    67: b"shmdt",
+    68: b"msgget",
+    69: b"msgsnd",
+    70: b"msgrcv",
+    71: b"msgctl",
+    72: b"fcntl",
+    73: b"flock",
+    74: b"fsync",
+    75: b"fdatasync",
+    76: b"truncate",
+    77: b"ftruncate",
+    78: b"getdents",
+    79: b"getcwd",
+    80: b"chdir",
+    81: b"fchdir",
+    82: b"rename",
+    83: b"mkdir",
+    84: b"rmdir",
+    85: b"creat",
+    86: b"link",
+    87: b"unlink",
+    88: b"symlink",
+    89: b"readlink",
+    90: b"chmod",
+    91: b"fchmod",
+    92: b"chown",
+    93: b"fchown",
+    94: b"lchown",
+    95: b"umask",
+    96: b"gettimeofday",
+    97: b"getrlimit",
+    98: b"getrusage",
+    99: b"sysinfo",
+    100: b"times",
+    101: b"ptrace",
+    102: b"getuid",
+    103: b"syslog",
+    104: b"getgid",
+    105: b"setuid",
+    106: b"setgid",
+    107: b"geteuid",
+    108: b"getegid",
+    109: b"setpgid",
+    110: b"getppid",
+    111: b"getpgrp",
+    112: b"setsid",
+    113: b"setreuid",
+    114: b"setregid",
+    115: b"getgroups",
+    116: b"setgroups",
+    117: b"setresuid",
+    118: b"getresuid",
+    119: b"setresgid",
+    120: b"getresgid",
+    121: b"getpgid",
+    122: b"setfsuid",
+    123: b"setfsgid",
+    124: b"getsid",
+    125: b"capget",
+    126: b"capset",
+    127: b"rt_sigpending",
+    128: b"rt_sigtimedwait",
+    129: b"rt_sigqueueinfo",
+    130: b"rt_sigsuspend",
+    131: b"sigaltstack",
+    132: b"utime",
+    133: b"mknod",
+    134: b"uselib",
+    135: b"personality",
+    136: b"ustat",
+    137: b"statfs",
+    138: b"fstatfs",
+    139: b"sysfs",
+    140: b"getpriority",
+    141: b"setpriority",
+    142: b"sched_setparam",
+    143: b"sched_getparam",
+    144: b"sched_setscheduler",
+    145: b"sched_getscheduler",
+    146: b"sched_get_priority_max",
+    147: b"sched_get_priority_min",
+    148: b"sched_rr_get_interval",
+    149: b"mlock",
+    150: b"munlock",
+    151: b"mlockall",
+    152: b"munlockall",
+    153: b"vhangup",
+    154: b"modify_ldt",
+    155: b"pivot_root",
+    156: b"_sysctl",
+    157: b"prctl",
+    158: b"arch_prctl",
+    159: b"adjtimex",
+    160: b"setrlimit",
+    161: b"chroot",
+    162: b"sync",
+    163: b"acct",
+    164: b"settimeofday",
+    165: b"mount",
+    166: b"umount2",
+    167: b"swapon",
+    168: b"swapoff",
+    169: b"reboot",
+    170: b"sethostname",
+    171: b"setdomainname",
+    172: b"iopl",
+    173: b"ioperm",
+    174: b"create_module",
+    175: b"init_module",
+    176: b"delete_module",
+    177: b"get_kernel_syms",
+    178: b"query_module",
+    179: b"quotactl",
+    180: b"nfsservctl",
+    181: b"getpmsg",
+    182: b"putpmsg",
+    183: b"afs_syscall",
+    184: b"tuxcall",
+    185: b"security",
+    186: b"gettid",
+    187: b"readahead",
+    188: b"setxattr",
+    189: b"lsetxattr",
+    190: b"fsetxattr",
+    191: b"getxattr",
+    192: b"lgetxattr",
+    193: b"fgetxattr",
+    194: b"listxattr",
+    195: b"llistxattr",
+    196: b"flistxattr",
+    197: b"removexattr",
+    198: b"lremovexattr",
+    199: b"fremovexattr",
+    200: b"tkill",
+    201: b"time",
+    202: b"futex",
+    203: b"sched_setaffinity",
+    204: b"sched_getaffinity",
+    205: b"set_thread_area",
+    206: b"io_setup",
+    207: b"io_destroy",
+    208: b"io_getevents",
+    209: b"io_submit",
+    210: b"io_cancel",
+    211: b"get_thread_area",
+    212: b"lookup_dcookie",
+    213: b"epoll_create",
+    214: b"epoll_ctl_old",
+    215: b"epoll_wait_old",
+    216: b"remap_file_pages",
+    217: b"getdents64",
+    218: b"set_tid_address",
+    219: b"restart_syscall",
+    220: b"semtimedop",
+    221: b"fadvise64",
+    222: b"timer_create",
+    223: b"timer_settime",
+    224: b"timer_gettime",
+    225: b"timer_getoverrun",
+    226: b"timer_delete",
+    227: b"clock_settime",
+    228: b"clock_gettime",
+    229: b"clock_getres",
+    230: b"clock_nanosleep",
+    231: b"exit_group",
+    232: b"epoll_wait",
+    233: b"epoll_ctl",
+    234: b"tgkill",
+    235: b"utimes",
+    236: b"vserver",
+    237: b"mbind",
+    238: b"set_mempolicy",
+    239: b"get_mempolicy",
+    240: b"mq_open",
+    241: b"mq_unlink",
+    242: b"mq_timedsend",
+    243: b"mq_timedreceive",
+    244: b"mq_notify",
+    245: b"mq_getsetattr",
+    246: b"kexec_load",
+    247: b"waitid",
+    248: b"add_key",
+    249: b"request_key",
+    250: b"keyctl",
+    251: b"ioprio_set",
+    252: b"ioprio_get",
+    253: b"inotify_init",
+    254: b"inotify_add_watch",
+    255: b"inotify_rm_watch",
+    256: b"migrate_pages",
+    257: b"openat",
+    258: b"mkdirat",
+    259: b"mknodat",
+    260: b"fchownat",
+    261: b"futimesat",
+    262: b"newfstatat",
+    263: b"unlinkat",
+    264: b"renameat",
+    265: b"linkat",
+    266: b"symlinkat",
+    267: b"readlinkat",
+    268: b"fchmodat",
+    269: b"faccessat",
+    270: b"pselect6",
+    271: b"ppoll",
+    272: b"unshare",
+    273: b"set_robust_list",
+    274: b"get_robust_list",
+    275: b"splice",
+    276: b"tee",
+    277: b"sync_file_range",
+    278: b"vmsplice",
+    279: b"move_pages",
+    280: b"utimensat",
+    281: b"epoll_pwait",
+    282: b"signalfd",
+    283: b"timerfd_create",
+    284: b"eventfd",
+    285: b"fallocate",
+    286: b"timerfd_settime",
+    287: b"timerfd_gettime",
+    288: b"accept4",
+    289: b"signalfd4",
+    290: b"eventfd2",
+    291: b"epoll_create1",
+    292: b"dup3",
+    293: b"pipe2",
+    294: b"inotify_init1",
+    295: b"preadv",
+    296: b"pwritev",
+    297: b"rt_tgsigqueueinfo",
+    298: b"perf_event_open",
+    299: b"recvmmsg",
+    300: b"fanotify_init",
+    301: b"fanotify_mark",
+    302: b"prlimit64",
+    303: b"name_to_handle_at",
+    304: b"open_by_handle_at",
+    305: b"clock_adjtime",
+    306: b"syncfs",
+    307: b"sendmmsg",
+    308: b"setns",
+    309: b"getcpu",
+    310: b"process_vm_readv",
+    311: b"process_vm_writev",
+    312: b"kcmp",
+    313: b"finit_module",
+    314: b"sched_setattr",
+    315: b"sched_getattr",
+    316: b"renameat2",
+    317: b"seccomp",
+    318: b"getrandom",
+    319: b"memfd_create",
+    320: b"kexec_file_load",
+    321: b"bpf",
+    322: b"execveat",
+    323: b"userfaultfd",
+    324: b"membarrier",
+    325: b"mlock2",
+    326: b"copy_file_range",
+    327: b"preadv2",
+    328: b"pwritev2",
+    329: b"pkey_mprotect",
+    330: b"pkey_alloc",
+    331: b"pkey_free",
+    332: b"statx",
+    333: b"io_pgetevents",
+    334: b"rseq",
+}
+
+# Try to use ausyscall if it is available, because it can give us an up-to-date
+# list of syscalls for various architectures, rather than the x86-64 hardcoded
+# list above.
+def _parse_syscall(line):
+    parts = line.split()
+    return (int(parts[0]), parts[1].strip())
+
+try:
+    # `ausyscall --dump` prints one header line followed by
+    # SYSCALL_NUM\tSYSCALL_NAME pairs, one per line.
+    out = subprocess.check_output(['ausyscall', '--dump'], stderr=subprocess.STDOUT)
+    # Discard the header line before parsing.
+    out = out.split(b'\n', 1)[1]
+    syscalls = dict(_parse_syscall(entry) for entry in out.strip().split(b'\n'))
+except Exception:
+    # On any failure keep the hardcoded table above, which is only valid for
+    # x86_64; every other machine type gets an error instead.
+    if platform.machine() != "x86_64":
+        raise Exception("ausyscall: command not found")
+
+def syscall_name(syscall_num):
+    """Return the syscall name (bytes) for syscall_num, or b"[unknown: N]"."""
+    return syscalls.get(syscall_num, ("[unknown: %d]" % syscall_num).encode())
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 86abec9..11960a7 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,7 +4,7 @@
 
 set(TEST_WRAPPER ${CMAKE_CURRENT_BINARY_DIR}/wrapper.sh)
 
-add_test(NAME style-check COMMAND ${CMAKE_SOURCE_DIR}/scripts/style-check.sh)
+add_test(NAME style-check COMMAND ${CMAKE_SOURCE_DIR}/scripts/c-style-check.sh)
 set_tests_properties(style-check PROPERTIES PASS_REGULAR_EXPRESSION ".*")
 
 if(ENABLE_CLANG_JIT)
diff --git a/tests/cc/CMakeLists.txt b/tests/cc/CMakeLists.txt
index 335b428..d28060b 100644
--- a/tests/cc/CMakeLists.txt
+++ b/tests/cc/CMakeLists.txt
@@ -9,6 +9,9 @@
 
 add_test(NAME c_test_static COMMAND ${TEST_WRAPPER} c_test_static sudo ${CMAKE_CURRENT_BINARY_DIR}/test_static)
 
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unused-result")
+
 if(ENABLE_USDT)
 add_executable(test_libbcc
 	test_libbcc.cc
diff --git a/tests/lua/test_standalone.sh b/tests/lua/test_standalone.sh
index 7786ac9..bea35c3 100755
--- a/tests/lua/test_standalone.sh
+++ b/tests/lua/test_standalone.sh
@@ -15,7 +15,8 @@
     exit 0
 fi
 
-if ldd bcc-lua | grep -q luajit; then
+LIBRARY=$(ldd bcc-lua | grep luajit)
+if [ $? -ne 0 -o -z "$LIBRARY" ] ; then
     fail "bcc-lua depends on libluajit"
 fi
 
diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt
index 468c700..4a233bb 100644
--- a/tests/python/CMakeLists.txt
+++ b/tests/python/CMakeLists.txt
@@ -77,3 +77,5 @@
   COMMAND ${TEST_WRAPPER} py_test_usdt3 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_usdt3.py)
 add_test(NAME py_test_license WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
   COMMAND ${TEST_WRAPPER} py_test_license sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_license.py)
+add_test(NAME py_test_free_bcc_memory WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  COMMAND ${TEST_WRAPPER} py_test_free_bcc_memory sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_free_bcc_memory.py)
diff --git a/tests/python/test_free_bcc_memory.py b/tests/python/test_free_bcc_memory.py
new file mode 100755
index 0000000..bb2c8fb
--- /dev/null
+++ b/tests/python/test_free_bcc_memory.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+#
+# USAGE: test_free_bcc_memory.py
+#
+# Copyright 2018 Facebook, Inc
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+from __future__ import print_function
+from bcc import BPF
+from unittest import main, skipUnless, TestCase
+from subprocess import Popen, PIPE
+import distutils.version
+import os
+
+def kernel_version_ge(major, minor):
+    # True if running kernel is >= X.Y
+    version = distutils.version.LooseVersion(os.uname()[2]).version
+    if version[0] > major:
+        return True
+    if version[0] < major:
+        return False
+    if minor and version[1] < minor:
+        return False
+    return True
+
+class TestFreeLLVMMemory(TestCase):
+    def getRssFile(self):
+        """Return [value, unit] from the RssFile line of this process's
+        /proc/<pid>/status, with value as an int; [None, None] if absent."""
+        p = Popen(["cat", "/proc/" + str(os.getpid()) + "/status"],
+                  stdout=PIPE)
+        # communicate() reads all output and reaps the child process.
+        out = p.communicate()[0]
+        rss = None
+        unit = None
+        for line in out.splitlines():
+            if (line.find(b'RssFile') >= 0):
+                # Convert to int so callers compare sizes numerically rather
+                # than byte strings lexicographically.
+                rss  = int(line.split(b' ')[-2])
+                unit = line.split(b' ')[-1].rstrip()
+                break
+
+        return [rss, unit]
+
+    @skipUnless(kernel_version_ge(4,5), "requires kernel >= 4.5")
+    def testFreeLLVMMemory(self):
+        text = "int test() { return 0; }"
+        b = BPF(text=text)
+
+        # get the RssFile before freeing bcc memory
+        [rss1, unit1] = self.getRssFile()
+        self.assertTrue(rss1 != None)
+
+        # free the bcc memory
+        self.assertTrue(b.free_bcc_memory() == 0)
+
+        # get the RssFile after freeing bcc memory
+        [rss2, unit2] = self.getRssFile()
+        self.assertTrue(rss2 != None)
+
+        self.assertTrue(unit1 == unit2)
+
+        print("Before freeing llvm memory: RssFile: ", rss1, unit1)
+        print("After  freeing llvm memory: RssFile: ", rss2, unit2)
+        self.assertTrue(rss1 > rss2)
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py
index ab80ecf..211dbdb 100755
--- a/tests/python/test_tools_smoke.py
+++ b/tests/python/test_tools_smoke.py
@@ -262,6 +262,14 @@
     def test_runqlen(self):
         self.run_with_duration("runqlen.py 1 1")
 
+    @skipUnless(kernel_version_ge(4,8), "requires kernel >= 4.8")
+    def test_shmsnoop(self):
+        self.run_with_int("shmsnoop.py")
+
+    @skipUnless(kernel_version_ge(4,8), "requires kernel >= 4.8")
+    def test_sofdsnoop(self):
+        self.run_with_int("sofdsnoop.py")
+
     def test_slabratetop(self):
         self.run_with_duration("slabratetop.py 1 1")
 
@@ -332,12 +340,9 @@
     def test_ttysnoop(self):
         self.run_with_int("ttysnoop.py /dev/console")
 
-    @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
+    @skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7")
     def test_ucalls(self):
-        # This attaches a large number (300+) kprobes, which can be slow,
-        # so use an increased timeout value.
-        self.run_with_int("lib/ucalls.py -l none -S %d" % os.getpid(),
-                          timeout=60, kill_timeout=60)
+        self.run_with_int("lib/ucalls.py -l none -S %d" % os.getpid())
 
     @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4")
     def test_uflow(self):
diff --git a/tools/bashreadline.py b/tools/bashreadline.py
index 89c37c3..3d74c93 100755
--- a/tools/bashreadline.py
+++ b/tools/bashreadline.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # bashreadline  Print entered bash commands from all running shells.
 #               For Linux, uses BCC, eBPF. Embedded C.
@@ -61,4 +61,7 @@
 
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/biolatency.py b/tools/biolatency.py
index 3879af1..dcb6d26 100755
--- a/tools/biolatency.py
+++ b/tools/biolatency.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # biolatency    Summarize block device I/O latency as a histogram.
@@ -67,7 +67,7 @@
 }
 
 // output
-int trace_req_completion(struct pt_regs *ctx, struct request *req)
+int trace_req_done(struct pt_regs *ctx, struct request *req)
 {
     u64 *tsp, delta;
 
@@ -116,10 +116,11 @@
 if args.queued:
     b.attach_kprobe(event="blk_account_io_start", fn_name="trace_req_start")
 else:
-    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+    if BPF.get_kprobe_functions(b'blk_start_request'):
+        b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
     b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
-b.attach_kprobe(event="blk_account_io_completion",
-    fn_name="trace_req_completion")
+b.attach_kprobe(event="blk_account_io_done",
+    fn_name="trace_req_done")
 
 print("Tracing block device I/O... Hit Ctrl-C to end.")
 
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index 2b1e77d..51b3a7f 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # biosnoop  Trace block device I/O and print details including issuing PID.
@@ -122,7 +122,8 @@
 }
 """, debug=0)
 b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
-b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+if BPF.get_kprobe_functions(b'blk_start_request'):
+    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
 b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
 b.attach_kprobe(event="blk_account_io_completion",
     fn_name="trace_req_completion")
@@ -186,4 +187,7 @@
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/biotop.py b/tools/biotop.py
index c6e1ca2..3fe454c 100755
--- a/tools/biotop.py
+++ b/tools/biotop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # biotop  block device (disk) I/O by process.
@@ -53,7 +53,7 @@
 diskstats = "/proc/diskstats"
 
 # signal handler
-def signal_ignore(signal, frame):
+def signal_ignore(signal_value, frame):
     print()
 
 # load BPF program
@@ -173,7 +173,8 @@
 
 b = BPF(text=bpf_text)
 b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
-b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
+if BPF.get_kprobe_functions(b'blk_start_request'):
+    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
 b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
 b.attach_kprobe(event="blk_account_io_completion",
     fn_name="trace_req_completion")
diff --git a/tools/bitesize.py b/tools/bitesize.py
index f70f091..e57185d 100755
--- a/tools/bitesize.py
+++ b/tools/bitesize.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # bitehist.py   Block I/O size histogram.
 #               For Linux, uses BCC, eBPF. See .c file.
diff --git a/tools/bpflist.py b/tools/bpflist.py
index f73e945..85220b6 100755
--- a/tools/bpflist.py
+++ b/tools/bpflist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # bpflist   Display processes currently using BPF programs and maps,
 #           pinned BPF programs and maps, and enabled probes.
diff --git a/tools/btrfsdist.py b/tools/btrfsdist.py
index 4659ab4..a0aeb24 100755
--- a/tools/btrfsdist.py
+++ b/tools/btrfsdist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # btrfsdist  Summarize btrfs operation latency.
diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py
index d48e04c..cff61b8 100755
--- a/tools/btrfsslower.py
+++ b/tools/btrfsslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # btrfsslower  Trace slow btrfs operations.
@@ -352,4 +352,7 @@
 # read events
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/cachestat.py b/tools/cachestat.py
index b00c804..90a55b0 100755
--- a/tools/cachestat.py
+++ b/tools/cachestat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # cachestat     Count cache kernel function calls.
 #               For Linux, uses BCC, eBPF. See .c file.
diff --git a/tools/cachetop.py b/tools/cachetop.py
index 1013675..51ddace 100755
--- a/tools/cachetop.py
+++ b/tools/cachetop.py
@@ -40,7 +40,7 @@
     "WRITE_HIT%"
 )
 DEFAULT_FIELD = "HITS"
-
+DEFAULT_SORT_FIELD = FIELDS.index(DEFAULT_FIELD)
 
 # signal handler
 def signal_ignore(signal, frame):
@@ -61,7 +61,7 @@
 
 def get_processes_stats(
         bpf,
-        sort_field=FIELDS.index(DEFAULT_FIELD),
+        sort_field=DEFAULT_SORT_FIELD,
         sort_reverse=False):
     '''
     Return a tuple containing:
@@ -223,7 +223,7 @@
             uid = int(stat[1])
             try:
                 username = pwd.getpwuid(uid)[0]
-            except KeyError as ex:
+            except KeyError:
                 # `pwd` throws a KeyError if the user cannot be found. This can
                 # happen e.g. when the process is running in a cgroup that has
                 # different users from the host.
diff --git a/tools/capable.py b/tools/capable.py
index efcff4d..368f4b0 100755
--- a/tools/capable.py
+++ b/tools/capable.py
@@ -1,12 +1,10 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # capable   Trace security capabilitiy checks (cap_capable()).
 #           For Linux, uses BCC, eBPF. Embedded C.
 #
-# USAGE: capable [-h] [-v] [-p PID]
-#
-# ToDo: add -s for kernel stacks.
+# USAGE: capable [-h] [-v] [-p PID] [-K] [-U]
 #
 # Copyright 2016 Netflix, Inc.
 # Licensed under the Apache License, Version 2.0 (the "License")
@@ -14,7 +12,10 @@
 # 13-Sep-2016   Brendan Gregg   Created this.
 
 from __future__ import print_function
+from os import getpid
+from functools import partial
 from bcc import BPF
+import errno
 import argparse
 from time import strftime
 import ctypes as ct
@@ -24,6 +25,8 @@
     ./capable             # trace capability checks
     ./capable -v          # verbose: include non-audit checks
     ./capable -p 181      # only trace PID 181
+    ./capable -K          # add kernel stacks to trace
+    ./capable -U          # add user-space stacks to trace
 """
 parser = argparse.ArgumentParser(
     description="Trace security capability checks",
@@ -33,6 +36,10 @@
     help="include non-audit checks")
 parser.add_argument("-p", "--pid",
     help="trace this PID only")
+parser.add_argument("-K", "--kernel-stack", action="store_true",
+    help="output kernel stack trace")
+parser.add_argument("-U", "--user-stack", action="store_true",
+    help="output user stack trace")
 args = parser.parse_args()
 debug = 0
 
@@ -80,31 +87,59 @@
     37: "CAP_AUDIT_READ",
 }
 
+class Enum(set):
+    def __getattr__(self, name):
+        if name in self:
+            return name
+        raise AttributeError
+
+# Stack trace types
+StackType = Enum(("Kernel", "User",))
+
 # define BPF program
 bpf_text = """
 #include <uapi/linux/ptrace.h>
 #include <linux/sched.h>
 
 struct data_t {
-   // switch to u32s when supported
-   u64 pid;
-   u64 uid;
+   u32 tgid;
+   u32 pid;
+   u32 uid;
    int cap;
    int audit;
    char comm[TASK_COMM_LEN];
+#ifdef KERNEL_STACKS
+   int kernel_stack_id;
+#endif
+#ifdef USER_STACKS
+   int user_stack_id;
+#endif
 };
 
 BPF_PERF_OUTPUT(events);
 
+#if defined(USER_STACKS) || defined(KERNEL_STACKS)
+BPF_STACK_TRACE(stacks, 2048);
+#endif
+
 int kprobe__cap_capable(struct pt_regs *ctx, const struct cred *cred,
     struct user_namespace *targ_ns, int cap, int audit)
 {
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 __pid_tgid = bpf_get_current_pid_tgid();
+    u32 tgid = __pid_tgid >> 32;
+    u32 pid = __pid_tgid;
     FILTER1
     FILTER2
+    FILTER3
 
     u32 uid = bpf_get_current_uid_gid();
-    struct data_t data = {.pid = pid, .uid = uid, .cap = cap, .audit = audit};
+    struct data_t data = {.tgid = tgid, .pid = pid, .uid = uid, .cap = cap, .audit = audit};
+#ifdef KERNEL_STACKS
+    data.kernel_stack_id = stacks.get_stackid(ctx, 0);
+#endif
+#ifdef USER_STACKS
+    data.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
+#endif
     bpf_get_current_comm(&data.comm, sizeof(data.comm));
     events.perf_submit(ctx, &data, sizeof(data));
 
@@ -116,8 +151,14 @@
         'if (pid != %s) { return 0; }' % args.pid)
 if not args.verbose:
     bpf_text = bpf_text.replace('FILTER2', 'if (audit == 0) { return 0; }')
+if args.kernel_stack:
+    bpf_text = "#define KERNEL_STACKS\n" + bpf_text
+if args.user_stack:
+    bpf_text = "#define USER_STACKS\n" + bpf_text
 bpf_text = bpf_text.replace('FILTER1', '')
 bpf_text = bpf_text.replace('FILTER2', '')
+bpf_text = bpf_text.replace('FILTER3',
+    'if (pid == %s) { return 0; }' % getpid())
 if debug:
     print(bpf_text)
 
@@ -128,30 +169,54 @@
 
 class Data(ct.Structure):
     _fields_ = [
-        ("pid", ct.c_ulonglong),
-        ("uid", ct.c_ulonglong),
+        ("tgid", ct.c_uint32),
+        ("pid", ct.c_uint32),
+        ("uid", ct.c_uint32),
         ("cap", ct.c_int),
         ("audit", ct.c_int),
-        ("comm", ct.c_char * TASK_COMM_LEN)
-    ]
+        ("comm", ct.c_char * TASK_COMM_LEN),
+    ] + ([("kernel_stack_id", ct.c_int)] if args.kernel_stack else []) \
+      + ([("user_stack_id", ct.c_int)] if args.user_stack else [])
 
 # header
-print("%-9s %-6s %-6s %-16s %-4s %-20s %s" % (
-    "TIME", "UID", "PID", "COMM", "CAP", "NAME", "AUDIT"))
+print("%-9s %-6s %-6s %-6s %-16s %-4s %-20s %s" % (
+    "TIME", "UID", "PID", "TID", "COMM", "CAP", "NAME", "AUDIT"))
+
+def stack_id_err(stack_id):
+    # -EFAULT in get_stackid normally means the stack-trace is not available,
+    # such as getting kernel stack trace in userspace code
+    return (stack_id < 0) and (stack_id != -errno.EFAULT)
+
+def print_stack(bpf, stack_id, stack_type, tgid):
+    if stack_id_err(stack_id):
+        print("    [Missed %s Stack]" % stack_type)
+        return
+    stack = bpf.get_table("stacks").walk(stack_id) if stack_id >= 0 else []
+    for addr in stack:  # empty for -EFAULT: there is no stack to walk
+        print("        ", end="")
+        print("%s" % (bpf.sym(addr, tgid, show_module=True, show_offset=True)))
 
 # process event
-def print_event(cpu, data, size):
+def print_event(bpf, cpu, data, size):
     event = ct.cast(data, ct.POINTER(Data)).contents
 
     if event.cap in capabilities:
         name = capabilities[event.cap]
     else:
         name = "?"
-    print("%-9s %-6d %-6d %-16s %-4d %-20s %d" % (strftime("%H:%M:%S"),
-        event.uid, event.pid, event.comm.decode('utf-8', 'replace'),
+    print("%-9s %-6d %-6d %-6d %-16s %-4d %-20s %d" % (strftime("%H:%M:%S"),
+        event.uid, event.tgid, event.pid, event.comm.decode('utf-8', 'replace'),
         event.cap, name, event.audit))
+    if args.kernel_stack:
+        print_stack(bpf, event.kernel_stack_id, StackType.Kernel, -1)
+    if args.user_stack:
+        print_stack(bpf, event.user_stack_id, StackType.User, event.tgid)
 
 # loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+callback = partial(print_event, b)
+b["events"].open_perf_buffer(callback)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/capable_example.txt b/tools/capable_example.txt
index 0a63765..c17c9b3 100644
--- a/tools/capable_example.txt
+++ b/tools/capable_example.txt
@@ -44,36 +44,45 @@
 To see what each of these capabilities does, check the capabilities(7) man
 page and the kernel source.
 
+It is possible to include a kernel stack trace with each capable event by
+passing -K to the command:
 
-Sometimes capable catches itself starting up:
-
-# ./capable.py 
+# ./capable.py -K
 TIME      UID    PID    COMM             CAP  NAME                 AUDIT
-22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
-22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
-22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
-22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
-22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
-22:22:19  0      21949  capable.py       21   CAP_SYS_ADMIN        1
-22:22:19  0      21952  run              24   CAP_SYS_RESOURCE     1
-[...]
+15:32:21  1000   10708  fetchmail        7    CAP_SETUID           1
+        cap_capable+0x1 [kernel]
+        ns_capable_common+0x7a [kernel]
+        __sys_setresuid+0xc8 [kernel]
+        do_syscall_64+0x56 [kernel]
+        entry_SYSCALL_64_after_hwframe+0x49 [kernel]
+15:32:21  1000   30047  procmail         6    CAP_SETGID           1
+        cap_capable+0x1 [kernel]
+        ns_capable_common+0x7a [kernel]
+        may_setgroups+0x2f [kernel]
+        __x64_sys_setgroups+0x18 [kernel]
+        do_syscall_64+0x56 [kernel]
+        entry_SYSCALL_64_after_hwframe+0x49 [kernel]
 
-These are capability checks from BPF and perf_events syscalls.
-
+Similarly, it is possible to include the user-space stack with -U (or both
+options can be used together to include user and kernel stacks).
 
 USAGE:
 
 # ./capable.py -h
-usage: capable.py [-h] [-v] [-p PID]
+usage: capable.py [-h] [-v] [-p PID] [-K] [-U]
 
 Trace security capability checks
 
 optional arguments:
-  -h, --help         show this help message and exit
-  -v, --verbose      include non-audit checks
-  -p PID, --pid PID  trace this PID only
+  -h, --help          show this help message and exit
+  -v, --verbose       include non-audit checks
+  -p PID, --pid PID   trace this PID only
+  -K, --kernel-stack  output kernel stack trace
+  -U, --user-stack    output user stack trace
 
 examples:
     ./capable             # trace capability checks
     ./capable -v          # verbose: include non-audit checks
     ./capable -p 181      # only trace PID 181
+    ./capable -K          # add kernel stacks to trace
+    ./capable -U          # add user-space stacks to trace
diff --git a/tools/cpudist.py b/tools/cpudist.py
index 4d7c9eb..9e61341 100755
--- a/tools/cpudist.py
+++ b/tools/cpudist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # cpudist   Summarize on- and off-CPU time per task as a histogram.
diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py
index b862bad..c899398 100755
--- a/tools/cpuunclaimed.py
+++ b/tools/cpuunclaimed.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # cpuunclaimed   Sample CPU run queues and calculate unclaimed idle CPU.
diff --git a/tools/criticalstat.py b/tools/criticalstat.py
index e45731c..8126b49 100755
--- a/tools/criticalstat.py
+++ b/tools/criticalstat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # criticalstat  Trace long critical sections (IRQs or preemption disabled)
@@ -319,7 +319,7 @@
             print("NO STACK FOUND DUE TO COLLISION")
         print("===================================")
         print("")
-    except:
+    except Exception:
         sys.exit(0)
 
 b["events"].open_perf_buffer(print_event, page_cnt=256)
@@ -328,4 +328,7 @@
     ('preempt' if preemptoff else 'IRQ'), args.duration))
 
 while 1:
-    b.perf_buffer_poll();
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/dbslower.py b/tools/dbslower.py
index c523d7a..a42df87 100755
--- a/tools/dbslower.py
+++ b/tools/dbslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # dbslower      Trace MySQL and PostgreSQL queries slower than a threshold.
 #
@@ -69,7 +69,7 @@
 
         (mysql_func_name, addr) = symbols[0]
 
-        if mysql_func_name.find("COM_DATA") >= 0:
+        if mysql_func_name.find(b'COM_DATA') >= 0:
             mode = "MYSQL57"
         else:
             mode = "MYSQL56"
@@ -230,4 +230,7 @@
 
 bpf["events"].open_perf_buffer(print_event, page_cnt=64)
 while True:
-    bpf.perf_buffer_poll()
+    try:
+        bpf.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/dbslower_example.txt b/tools/dbslower_example.txt
index 88cbab0..756701c 100644
--- a/tools/dbslower_example.txt
+++ b/tools/dbslower_example.txt
@@ -67,7 +67,7 @@
 
 USAGE:
 # dbslower -h
-usage: dbslower.py [-h] [-v] [-p [PIDS [PIDS ...]]] [-m THRESHOLD]
+usage: dbslower.py [-h] [-v] [-p [PIDS [PIDS ...]]] [-x PATH] [-m THRESHOLD]
                    {mysql,postgres}
 
 positional arguments:
@@ -78,6 +78,7 @@
   -v, --verbose         print the BPF program
   -p [PID [PID ...]], --pid [PID [PID ...]]
                         the pid(s) to trace
+  -x PATH, --exe PATH   path to binary
   -m THRESHOLD, --threshold THRESHOLD
                         trace queries slower than this threshold (ms)
 
@@ -86,3 +87,4 @@
     dbslower postgres -p 188 322 # trace specific PostgreSQL processes
     dbslower mysql -p 480 -m 30  # trace MySQL queries slower than 30ms
     dbslower mysql -p 480 -v     # trace MySQL queries and print the BPF program
+    dbslower mysql -x $(which mysqld)  # trace MySQL queries with uprobes
diff --git a/tools/dbstat.py b/tools/dbstat.py
index a89b097..1d98436 100755
--- a/tools/dbstat.py
+++ b/tools/dbstat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # dbstat        Display a histogram of MySQL and PostgreSQL query latencies.
 #
diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py
index 13152c2..1452191 100755
--- a/tools/dcsnoop.py
+++ b/tools/dcsnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # dcsnoop   Trace directory entry cache (dcache) lookups.
@@ -162,4 +162,7 @@
 
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/dcstat.py b/tools/dcstat.py
index 5ecddd1..2009a19 100755
--- a/tools/dcstat.py
+++ b/tools/dcstat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # dcstat   Directory entry cache (dcache) stats.
diff --git a/tools/execsnoop.py b/tools/execsnoop.py
index 6fdde97..e27e50e 100755
--- a/tools/execsnoop.py
+++ b/tools/execsnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # execsnoop Trace new processes via exec() syscalls.
@@ -246,4 +246,7 @@
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/ext4dist.py b/tools/ext4dist.py
index bc797fb..b71cfda 100755
--- a/tools/ext4dist.py
+++ b/tools/ext4dist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # ext4dist  Summarize ext4 operation latency.
diff --git a/tools/ext4slower.py b/tools/ext4slower.py
index 88db831..344e68f 100755
--- a/tools/ext4slower.py
+++ b/tools/ext4slower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # ext4slower  Trace slow ext4 operations.
@@ -356,4 +356,7 @@
 # read events
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/filelife.py b/tools/filelife.py
index 410659d..4095273 100755
--- a/tools/filelife.py
+++ b/tools/filelife.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # filelife    Trace the lifespan of short-lived files.
@@ -141,4 +141,7 @@
 
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/fileslower.py b/tools/fileslower.py
index 25443a2..219a94a 100755
--- a/tools/fileslower.py
+++ b/tools/fileslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # fileslower  Trace slow synchronous file reads and writes.
@@ -205,7 +205,7 @@
 try:
     b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry")
     b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return")
-except:
+except Exception:
     # older kernels don't have __vfs_write so try vfs_write instead
     b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry")
     b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return")
@@ -250,4 +250,7 @@
 
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/filetop.py b/tools/filetop.py
index 4c7a28a..91e098e 100755
--- a/tools/filetop.py
+++ b/tools/filetop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # filetop  file reads and writes by process.
@@ -60,7 +60,7 @@
 loadavg = "/proc/loadavg"
 
 # signal handler
-def signal_ignore(signal, frame):
+def signal_ignore(signal_value, frame):
     print()
 
 # define BPF program
diff --git a/tools/funccount.py b/tools/funccount.py
index 69dd01c..fcb96b8 100755
--- a/tools/funccount.py
+++ b/tools/funccount.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # funccount Count functions, tracepoints, and USDT probes.
diff --git a/tools/funclatency.py b/tools/funclatency.py
index 3f08a7e..f23d8f0 100755
--- a/tools/funclatency.py
+++ b/tools/funclatency.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # funclatency   Time functions and print latency as a histogram.
diff --git a/tools/funcslower.py b/tools/funcslower.py
index 261869e..2143583 100755
--- a/tools/funcslower.py
+++ b/tools/funcslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # funcslower  Trace slow kernel or user function calls.
@@ -94,6 +94,9 @@
     u64 duration_ns;
     u64 retval;
     char comm[TASK_COMM_LEN];
+#ifdef GRAB_ARGS
+    u64 args[6];
+#endif
 #ifdef USER_STACKS
     int user_stack_id;
 #endif
@@ -101,9 +104,6 @@
     int kernel_stack_id;
     u64 kernel_ip;
 #endif
-#ifdef GRAB_ARGS
-    u64 args[6];
-#endif
 };
 
 BPF_HASH(entryinfo, u64, struct entry_t);
@@ -330,4 +330,7 @@
 
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while True:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/gethostlatency.py b/tools/gethostlatency.py
index 3a967ae..84c7988 100755
--- a/tools/gethostlatency.py
+++ b/tools/gethostlatency.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # gethostlatency  Show latency for getaddrinfo/gethostbyname[2] calls.
 #                 For Linux, uses BCC, eBPF. Embedded C.
@@ -135,4 +135,7 @@
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/hardirqs.py b/tools/hardirqs.py
index 589a890..1f5983a 100755
--- a/tools/hardirqs.py
+++ b/tools/hardirqs.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # hardirqs  Summarize hard IRQ (interrupt) event time.
diff --git a/tools/inject.py b/tools/inject.py
index 031679b..2cf9909 100755
--- a/tools/inject.py
+++ b/tools/inject.py
@@ -41,12 +41,14 @@
     errno_mapping = {
         "kmalloc": "-ENOMEM",
         "bio": "-EIO",
+        "alloc_page": "true",
     }
 
     @classmethod
-    def configure(cls, mode, probability):
+    def configure(cls, mode, probability, count):
         cls.mode = mode
         cls.probability = probability
+        cls.count = count
 
     def __init__(self, func, preds, length, entry):
         # length of call chain
@@ -207,6 +209,14 @@
         pred = self.preds[0][0]
         text = self._get_heading() + """
 {
+        u32 overriden = 0;
+        int zero = 0;
+        u32* val;
+
+        val = count.lookup(&zero);
+        if (val)
+            overriden = *val;
+
         /*
          * preparation for predicate, if necessary
          */
@@ -214,7 +224,8 @@
         /*
          * If this is the only call in the chain and predicate passes
          */
-        if (%s == 1 && %s) {
+        if (%s == 1 && %s && overriden < %s) {
+                count.increment(zero);
                 bpf_override_return(ctx, %s);
                 return 0;
         }
@@ -228,12 +239,15 @@
         /*
          * If all conds have been met and predicate passes
          */
-        if (p->conds_met == %s && %s)
+        if (p->conds_met == %s && %s && overriden < %s) {
+                count.increment(zero);
                 bpf_override_return(ctx, %s);
+        }
         return 0;
 }"""
-        return text % (self.prep, self.length, pred, self._get_err(),
-                    self.length - 1, pred, self._get_err())
+        return text % (self.prep, self.length, pred, Probe.count,
+                self._get_err(), self.length - 1, pred, Probe.count,
+                self._get_err())
 
     # presently parses and replaces STRCMP
     # STRCMP exists because string comparison is inconvenient and somewhat buggy
@@ -314,6 +328,7 @@
     error_injection_mapping = {
         "kmalloc": "should_failslab(struct kmem_cache *s, gfp_t gfpflags)",
         "bio": "should_fail_bio(struct bio *bio)",
+        "alloc_page": "should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)",
     }
 
     def __init__(self):
@@ -321,7 +336,7 @@
                 " functionality when call chain and predicates are met",
                 formatter_class=argparse.RawDescriptionHelpFormatter,
                 epilog=Tool.examples)
-        parser.add_argument(dest="mode", choices=['kmalloc','bio'],
+        parser.add_argument(dest="mode", choices=["kmalloc", "bio", "alloc_page"],
                 help="indicate which base kernel function to fail")
         parser.add_argument(metavar="spec", dest="spec",
                 help="specify call chain")
@@ -333,6 +348,8 @@
                 help="probability that this call chain will fail")
         parser.add_argument("-v", "--verbose", action="store_true",
                 help="print BPF program")
+        parser.add_argument("-c", "--count", type=int, default=-1,
+                help="Number of fails before bypassing the override")
         self.args = parser.parse_args()
 
         self.program = ""
@@ -344,7 +361,7 @@
     # create_probes and associated stuff
     def _create_probes(self):
         self._parse_spec()
-        Probe.configure(self.args.mode, self.args.probability)
+        Probe.configure(self.args.mode, self.args.probability, self.args.count)
         # self, func, preds, total, entry
 
         # create all the pair probes
@@ -482,6 +499,8 @@
 
         self.program += self._def_pid_struct()
         self.program += "BPF_HASH(m, u32, struct pid_struct);\n"
+        self.program += "BPF_ARRAY(count, u32, 1);\n"
+
         for p in self.probes:
             self.program += p.generate_program() + "\n"
 
@@ -490,7 +509,10 @@
 
     def _main_loop(self):
         while True:
-            self.bpf.perf_buffer_poll()
+            try:
+                self.bpf.perf_buffer_poll()
+            except KeyboardInterrupt:
+                exit()
 
     def run(self):
         self._create_probes()
diff --git a/tools/killsnoop.py b/tools/killsnoop.py
index d60c72e..ce03d37 100755
--- a/tools/killsnoop.py
+++ b/tools/killsnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # killsnoop Trace signals issued by the kill() syscall.
@@ -145,4 +145,7 @@
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/lib/ucalls.py b/tools/lib/ucalls.py
index 18ca22c..3b90b91 100755
--- a/tools/lib/ucalls.py
+++ b/tools/lib/ucalls.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # ucalls  Summarize method calls in high-level languages and/or system calls.
@@ -14,9 +14,8 @@
 
 from __future__ import print_function
 import argparse
-from bcc import BPF, USDT, utils
 from time import sleep
-import os
+from bcc import BPF, USDT, utils, syscall_name
 
 languages = ["java", "perl", "php", "python", "ruby", "tcl"]
 
@@ -130,7 +129,7 @@
 };
 struct syscall_entry_t {
     u64 timestamp;
-    u64 ip;
+    u64 id;
 };
 
 #ifndef LATENCY
@@ -198,37 +197,35 @@
 #endif  // NOLANG
 
 #ifdef SYSCALLS
-int syscall_entry(struct pt_regs *ctx) {
+TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
     u64 pid = bpf_get_current_pid_tgid();
-    u64 *valp, ip = PT_REGS_IP(ctx), val = 0;
+    u64 *valp, id = args->id, val = 0;
     PID_FILTER
 #ifdef LATENCY
     struct syscall_entry_t data = {};
     data.timestamp = bpf_ktime_get_ns();
-    data.ip = ip;
+    data.id = id;
+    sysentry.update(&pid, &data);
 #endif
 #ifndef LATENCY
-    valp = syscounts.lookup_or_init(&ip, &val);
+    valp = syscounts.lookup_or_init(&id, &val);
     ++(*valp);
 #endif
-#ifdef LATENCY
-    sysentry.update(&pid, &data);
-#endif
     return 0;
 }
 
 #ifdef LATENCY
-int syscall_return(struct pt_regs *ctx) {
+TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
     struct syscall_entry_t *e;
     struct info_t *info, zero = {};
-    u64 pid = bpf_get_current_pid_tgid(), ip;
+    u64 pid = bpf_get_current_pid_tgid(), id;
     PID_FILTER
     e = sysentry.lookup(&pid);
     if (!e) {
         return 0;   // missed the entry event
     }
-    ip = e->ip;
-    info = systimes.lookup_or_init(&ip, &zero);
+    id = e->id;
+    info = systimes.lookup_or_init(&id, &zero);
     info->num_calls += 1;
     info->total_ns += bpf_ktime_get_ns() - e->timestamp;
     sysentry.delete(&pid);
@@ -260,12 +257,7 @@
 
 bpf = BPF(text=program, usdt_contexts=[usdt] if usdt else [])
 if args.syscalls:
-    syscall_regex = "^[Ss]y[Ss]_.*"
-    bpf.attach_kprobe(event_re=syscall_regex, fn_name="syscall_entry")
-    if args.latency:
-        bpf.attach_kretprobe(event_re=syscall_regex, fn_name="syscall_return")
-    print("Attached %d kernel probes for syscall tracing." %
-          bpf.num_open_kprobes())
+    print("Attached kernel tracepoints for syscall tracing.")
 
 def get_data():
     # Will be empty when no language was specified for tracing
@@ -284,12 +276,12 @@
 
     if args.syscalls:
         if args.latency:
-            syscalls = map(lambda kv: (bpf.ksym(kv[0].value),
-                                           (kv[1].num_calls, kv[1].total_ns)),
+            syscalls = map(lambda kv: (syscall_name(kv[0].value).decode('utf-8', 'replace'),
+                                       (kv[1].num_calls, kv[1].total_ns)),
                            bpf["systimes"].items())
             data.extend(syscalls)
         else:
-            syscalls = map(lambda kv: (bpf.ksym(kv[0].value),
+            syscalls = map(lambda kv: (syscall_name(kv[0].value).decode('utf-8', 'replace'),
                                        (kv[1].value, 0)),
                            bpf["syscounts"].items())
             data.extend(syscalls)
diff --git a/tools/lib/ucalls_example.txt b/tools/lib/ucalls_example.txt
index 7191fb8..31b3bc8 100644
--- a/tools/lib/ucalls_example.txt
+++ b/tools/lib/ucalls_example.txt
@@ -27,22 +27,21 @@
 To trace only syscalls in a particular process and print the top 10 most 
 frequently-invoked ones:
 
-# ucalls -ST 10 3018
-Attached 375 kernel probes for syscall tracing.
-Tracing calls in process 3018 (language: none)... Ctrl-C to quit.
-
-METHOD                                              # CALLS
-sys_rt_sigaction                                          4
-SyS_rt_sigprocmask                                        4
-sys_mprotect                                              5
-sys_read                                                 22
-SyS_write                                                39
-SyS_epoll_wait                                           42
-sys_futex                                               177
-SyS_mmap                                                180
-sys_mmap_pgoff                                          181
-sys_munmap                                              817
+# ucalls -l none -ST 10 7913
+Attached kernel tracepoints for syscall tracing.
+Tracing calls in process 7913 (language: none)... Ctrl-C to quit.
 ^C
+METHOD                                              # CALLS
+timerfd_settime                                           9
+tgkill                                                   10
+getpid                                                   10
+stat                                                     80
+writev                                                  158
+pselect6                                                204
+rt_sigreturn                                            301
+rt_sigprocmask                                          872
+poll                                                   1387
+recvmsg                                                1417
 Detaching kernel probes, please wait...
 
 
diff --git a/tools/lib/uflow.py b/tools/lib/uflow.py
index 02cad55..2bfe36b 100755
--- a/tools/lib/uflow.py
+++ b/tools/lib/uflow.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # uflow  Trace method execution flow in high-level languages.
@@ -206,4 +206,7 @@
 
 bpf["calls"].open_perf_buffer(print_event)
 while 1:
-    bpf.perf_buffer_poll()
+    try:
+        bpf.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/lib/uflow_example.txt b/tools/lib/uflow_example.txt
index 5dccb8f..c7621f5 100644
--- a/tools/lib/uflow_example.txt
+++ b/tools/lib/uflow_example.txt
@@ -48,8 +48,8 @@
 3   27722  27731  3.144          <- java/lang/ThreadGroup.checkAccess
 3   27722  27731  3.144          -> java/lang/ThreadGroup.addUnstarted
 3   27722  27731  3.144          <- java/lang/ThreadGroup.addUnstarted
-3   27722  27731  3.145          -> java/lang/Thread.isDaemon     
-3   27722  27731  3.145          <- java/lang/Thread.isDaemon     
+3   27722  27731  3.145          -> java/lang/Thread.isDaemon
+3   27722  27731  3.145          <- java/lang/Thread.isDaemon
 3   27722  27731  3.145          -> java/lang/Thread.getPriority   
 3   27722  27731  3.145          <- java/lang/Thread.getPriority   
 3   27722  27731  3.145          -> java/lang/Thread.getContextClassLoader
diff --git a/tools/lib/ugc.py b/tools/lib/ugc.py
index 8288910..77f125e 100755
--- a/tools/lib/ugc.py
+++ b/tools/lib/ugc.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # ugc  Summarize garbage collection events in high-level languages.
@@ -244,4 +244,7 @@
 
 bpf["gcs"].open_perf_buffer(print_event)
 while 1:
-    bpf.perf_buffer_poll()
+    try:
+        bpf.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/lib/uobjnew.py b/tools/lib/uobjnew.py
index 85f5768..8159f9a 100755
--- a/tools/lib/uobjnew.py
+++ b/tools/lib/uobjnew.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # uobjnew  Summarize object allocations in high-level languages.
diff --git a/tools/lib/ustat.py b/tools/lib/ustat.py
index 3661a14..ef29d76 100755
--- a/tools/lib/ustat.py
+++ b/tools/lib/ustat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # ustat  Activity stats from high-level languages, including exceptions,
@@ -20,8 +20,9 @@
 
 from __future__ import print_function
 import argparse
-from bcc import BPF, USDT
+from bcc import BPF, USDT, USDTException
 import os
+import sys
 from subprocess import call
 from time import sleep, strftime
 
@@ -62,7 +63,12 @@
     def _enable_probes(self):
         self.usdts = []
         for pid in self.targets:
-            usdt = USDT(pid=pid)
+            try:
+                usdt = USDT(pid=pid)
+            except USDTException:
+                # avoid race condition on pid going away.
+                print("failed to instrument %d" % pid, file=sys.stderr)
+                continue
             for event in self.events:
                 try:
                     usdt.enable_probe(event, "%s_%s" % (self.language, event))
@@ -111,6 +117,9 @@
         for event, category in self.events.items():
             counts = bpf["%s_%s_counts" % (self.language, event)]
             for pid, count in counts.items():
+                if pid.value not in result:
+                    print("result was not found for %d" % pid.value, file=sys.stderr)
+                    continue
                 result[pid.value][category] = count.value
             counts.clear()
         return result
diff --git a/tools/lib/uthreads.py b/tools/lib/uthreads.py
index 71e9c6a..00dd68b 100755
--- a/tools/lib/uthreads.py
+++ b/tools/lib/uthreads.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # uthreads  Trace thread creation/destruction events in high-level languages.
@@ -128,4 +128,7 @@
 
 bpf["threads"].open_perf_buffer(print_event)
 while 1:
-    bpf.perf_buffer_poll()
+    try:
+        bpf.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/llcstat.py b/tools/llcstat.py
index ec2c1f8..01a63fd 100755
--- a/tools/llcstat.py
+++ b/tools/llcstat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # llcstat.py Summarize cache references and cache misses by PID.
 #            Cache reference and cache miss are corresponding events defined in
@@ -85,7 +85,7 @@
     b.attach_perf_event(
         ev_type=PerfType.HARDWARE, ev_config=PerfHWConfig.CACHE_REFERENCES,
         fn_name="on_cache_ref", sample_period=args.sample_period)
-except:
+except Exception:
     print("Failed to attach to a hardware event. Is this a virtual machine?")
     exit()
 
diff --git a/tools/mdflush.py b/tools/mdflush.py
index 70afc4d..507614b 100755
--- a/tools/mdflush.py
+++ b/tools/mdflush.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # mdflush  Trace md flush events.
@@ -78,4 +78,7 @@
 # read events
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/mountsnoop.py b/tools/mountsnoop.py
index e9b5865..b6f96ca 100755
--- a/tools/mountsnoop.py
+++ b/tools/mountsnoop.py
@@ -110,22 +110,22 @@
     events.perf_submit(ctx, &event, sizeof(event));
 
     event.type = EVENT_MOUNT_SOURCE;
-    memset(event.str, 0, sizeof(event.str));
+    __builtin_memset(event.str, 0, sizeof(event.str));
     bpf_probe_read(event.str, sizeof(event.str), source);
     events.perf_submit(ctx, &event, sizeof(event));
 
     event.type = EVENT_MOUNT_TARGET;
-    memset(event.str, 0, sizeof(event.str));
+    __builtin_memset(event.str, 0, sizeof(event.str));
     bpf_probe_read(event.str, sizeof(event.str), target);
     events.perf_submit(ctx, &event, sizeof(event));
 
     event.type = EVENT_MOUNT_TYPE;
-    memset(event.str, 0, sizeof(event.str));
+    __builtin_memset(event.str, 0, sizeof(event.str));
     bpf_probe_read(event.str, sizeof(event.str), type);
     events.perf_submit(ctx, &event, sizeof(event));
 
     event.type = EVENT_MOUNT_DATA;
-    memset(event.str, 0, sizeof(event.str));
+    __builtin_memset(event.str, 0, sizeof(event.str));
     bpf_probe_read(event.str, sizeof(event.str), data);
     events.perf_submit(ctx, &event, sizeof(event));
 
@@ -165,7 +165,7 @@
     events.perf_submit(ctx, &event, sizeof(event));
 
     event.type = EVENT_UMOUNT_TARGET;
-    memset(event.str, 0, sizeof(event.str));
+    __builtin_memset(event.str, 0, sizeof(event.str));
     bpf_probe_read(event.str, sizeof(event.str), target);
     events.perf_submit(ctx, &event, sizeof(event));
 
@@ -414,7 +414,11 @@
     print('{:16} {:<7} {:<7} {:<11} {}'.format(
         'COMM', 'PID', 'TID', 'MNT_NS', 'CALL'))
     while True:
-        b.perf_buffer_poll()
+        try:
+            b.perf_buffer_poll()
+        except KeyboardInterrupt:
+            exit()
+
 
 
 if __name__ == '__main__':
diff --git a/tools/mysqld_qslower.py b/tools/mysqld_qslower.py
index d760773..5737d18 100755
--- a/tools/mysqld_qslower.py
+++ b/tools/mysqld_qslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # mysqld_qslower    MySQL server queries slower than a threshold.
 #                   For Linux, uses BCC, BPF. Embedded C.
@@ -130,4 +130,7 @@
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/nfsdist.py b/tools/nfsdist.py
index ff78506..e3317a4 100755
--- a/tools/nfsdist.py
+++ b/tools/nfsdist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # nfsdist   Summarize NFS operation latency
diff --git a/tools/nfsslower.py b/tools/nfsslower.py
index 2f92c90..8113eff 100755
--- a/tools/nfsslower.py
+++ b/tools/nfsslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # nfsslower     Trace slow NFS operations
@@ -325,4 +325,7 @@
 
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
+    try:
         b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/offcputime.py b/tools/offcputime.py
index d84ae52..6440260 100755
--- a/tools/offcputime.py
+++ b/tools/offcputime.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # offcputime    Summarize off-CPU time by stack trace
 #               For Linux, uses BCC, eBPF.
diff --git a/tools/offwaketime.py b/tools/offwaketime.py
index 0e4f35e..3c4f0f3 100755
--- a/tools/offwaketime.py
+++ b/tools/offwaketime.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # offwaketime   Summarize blocked time by kernel off-CPU stack + waker stack
 #               For Linux, uses BCC, eBPF.
@@ -342,7 +342,7 @@
         if not args.kernel_stacks_only:
             line.extend(["-"] if (need_delimiter and k.w_u_stack_id > 0 and k.w_k_stack_id > 0) else [])
             if stack_id_err(k.w_u_stack_id):
-                line.extend("[Missed User Stack]")
+                line.append("[Missed User Stack]")
             else:
                 line.extend([b.sym(addr, k.w_tgid)
                     for addr in reversed(list(waker_user_stack))])
diff --git a/tools/old/bashreadline.py b/tools/old/bashreadline.py
index 571b662..c4b8ec2 100755
--- a/tools/old/bashreadline.py
+++ b/tools/old/bashreadline.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # bashreadline  Print entered bash commands from all running shells.
 #               For Linux, uses BCC, eBPF. Embedded C.
diff --git a/tools/old/biosnoop.py b/tools/old/biosnoop.py
index 37ee3f9..96db56b 100755
--- a/tools/old/biosnoop.py
+++ b/tools/old/biosnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # biosnoop  Trace block device I/O and print details including issuing PID.
diff --git a/tools/old/filelife.py b/tools/old/filelife.py
index 075be08..af64b53 100755
--- a/tools/old/filelife.py
+++ b/tools/old/filelife.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # filelife    Trace the lifespan of short-lived files.
diff --git a/tools/old/gethostlatency.py b/tools/old/gethostlatency.py
index 7d32cb8..4d87c83 100755
--- a/tools/old/gethostlatency.py
+++ b/tools/old/gethostlatency.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # gethostlatency  Show latency for getaddrinfo/gethostbyname[2] calls.
 #                 For Linux, uses BCC, eBPF. Embedded C.
diff --git a/tools/old/killsnoop.py b/tools/old/killsnoop.py
index ddf9d5a..e2d4cb5 100755
--- a/tools/old/killsnoop.py
+++ b/tools/old/killsnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # killsnoop Trace signals issued by the kill() syscall.
diff --git a/tools/old/offcputime.py b/tools/old/offcputime.py
index 38d12a2..dc89063 100755
--- a/tools/old/offcputime.py
+++ b/tools/old/offcputime.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # offcputime    Summarize off-CPU time by kernel stack trace
 #               For Linux, uses BCC, eBPF.
diff --git a/tools/old/offwaketime.py b/tools/old/offwaketime.py
index 3b5bb36..b5fdd0f 100755
--- a/tools/old/offwaketime.py
+++ b/tools/old/offwaketime.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # offwaketime   Summarize blocked time by kernel off-CPU stack + waker stack
 #               For Linux, uses BCC, eBPF.
diff --git a/tools/old/opensnoop.py b/tools/old/opensnoop.py
index 5df3b41..3736ec2 100755
--- a/tools/old/opensnoop.py
+++ b/tools/old/opensnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # opensnoop Trace open() syscalls.
diff --git a/tools/old/profile.py b/tools/old/profile.py
index e308208..f0328d2 100755
--- a/tools/old/profile.py
+++ b/tools/old/profile.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # profile  Profile CPU usage by sampling stack traces at a timed interval.
diff --git a/tools/old/softirqs.py b/tools/old/softirqs.py
index 3b40b1a..30495bc 100755
--- a/tools/old/softirqs.py
+++ b/tools/old/softirqs.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # softirqs  Summarize soft IRQ (interrupt) event time.
diff --git a/tools/old/stackcount.py b/tools/old/stackcount.py
index 108c800..6eee27f 100755
--- a/tools/old/stackcount.py
+++ b/tools/old/stackcount.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # stackcount    Count kernel function calls and their stack traces.
 #               For Linux, uses BCC, eBPF.
diff --git a/tools/old/stacksnoop.py b/tools/old/stacksnoop.py
index 9fcc12b..238ab82 100755
--- a/tools/old/stacksnoop.py
+++ b/tools/old/stacksnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # stacksnoop    Trace a kernel function and print all kernel stack traces.
 #               For Linux, uses BCC, eBPF, and currently x86_64 only. Inline C.
diff --git a/tools/old/statsnoop.py b/tools/old/statsnoop.py
index ad54ac7..82128c2 100755
--- a/tools/old/statsnoop.py
+++ b/tools/old/statsnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # statsnoop Trace stat() syscalls.
diff --git a/tools/old/syncsnoop.py b/tools/old/syncsnoop.py
index cae57ea..b14309d 100755
--- a/tools/old/syncsnoop.py
+++ b/tools/old/syncsnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # syncsnoop Trace sync() syscall.
diff --git a/tools/old/tcpaccept.py b/tools/old/tcpaccept.py
index 8125eaa..cc0c240 100755
--- a/tools/old/tcpaccept.py
+++ b/tools/old/tcpaccept.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpaccept Trace TCP accept()s.
diff --git a/tools/old/tcpconnect.py b/tools/old/tcpconnect.py
index 579a85f..e0a59e9 100755
--- a/tools/old/tcpconnect.py
+++ b/tools/old/tcpconnect.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpconnect    Trace TCP connect()s.
diff --git a/tools/old/wakeuptime.py b/tools/old/wakeuptime.py
index 783c7ff..512e4f4 100644
--- a/tools/old/wakeuptime.py
+++ b/tools/old/wakeuptime.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # wakeuptime    Summarize sleep to wakeup time by waker kernel stack
 #               For Linux, uses BCC, eBPF.
diff --git a/tools/oomkill.py b/tools/oomkill.py
index 0677e49..db3a537 100755
--- a/tools/oomkill.py
+++ b/tools/oomkill.py
@@ -77,4 +77,7 @@
 print("Tracing OOM kills... Ctrl-C to stop.")
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/opensnoop.py b/tools/opensnoop.py
index 418d47b..4cb4dbb 100755
--- a/tools/opensnoop.py
+++ b/tools/opensnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # opensnoop Trace open() syscalls.
@@ -12,22 +12,30 @@
 # 17-Sep-2015   Brendan Gregg   Created this.
 # 29-Apr-2016   Allan McAleavy  Updated for BPF_PERF_OUTPUT.
 # 08-Oct-2016   Dina Goldshtein Support filtering by PID and TID.
+# 28-Dec-2018   Tim Douglas     Print flags argument, enable filtering
+# 06-Jan-2019   Takuma Kume     Support filtering by UID
 
 from __future__ import print_function
 from bcc import ArgString, BPF
+from bcc.utils import printb
 import argparse
 import ctypes as ct
 from datetime import datetime, timedelta
+import os
 
 # arguments
 examples = """examples:
     ./opensnoop           # trace all open() syscalls
     ./opensnoop -T        # include timestamps
+    ./opensnoop -U        # include UID
     ./opensnoop -x        # only show failed opens
     ./opensnoop -p 181    # only trace PID 181
     ./opensnoop -t 123    # only trace TID 123
+    ./opensnoop -u 1000   # only trace UID 1000
     ./opensnoop -d 10     # trace for 10 seconds only
     ./opensnoop -n main   # only print process names containing "main"
+    ./opensnoop -e        # show extended fields
+    ./opensnoop -f O_WRONLY -f O_RDWR  # only print calls for writing
 """
 parser = argparse.ArgumentParser(
     description="Trace open() syscalls",
@@ -35,12 +43,16 @@
     epilog=examples)
 parser.add_argument("-T", "--timestamp", action="store_true",
     help="include timestamp on output")
+parser.add_argument("-U", "--print-uid", action="store_true",
+    help="print UID column")
 parser.add_argument("-x", "--failed", action="store_true",
     help="only show failed opens")
 parser.add_argument("-p", "--pid",
     help="trace this PID only")
 parser.add_argument("-t", "--tid",
     help="trace this TID only")
+parser.add_argument("-u", "--uid",
+    help="trace this UID only")
 parser.add_argument("-d", "--duration",
     help="total duration of trace in seconds")
 parser.add_argument("-n", "--name",
@@ -48,10 +60,22 @@
     help="only print process names containing this name")
 parser.add_argument("--ebpf", action="store_true",
     help=argparse.SUPPRESS)
+parser.add_argument("-e", "--extended_fields", action="store_true",
+    help="show extended fields")
+parser.add_argument("-f", "--flag_filter", action="append",
+    help="filter on flags argument (e.g., O_WRONLY)")
 args = parser.parse_args()
 debug = 0
 if args.duration:
     args.duration = timedelta(seconds=int(args.duration))
+flag_filter_mask = 0
+for flag in args.flag_filter or []:
+    if not flag.startswith('O_'):
+        exit("Bad flag: %s" % flag)
+    try:
+        flag_filter_mask |= getattr(os, flag)
+    except AttributeError:
+        exit("Bad flag: %s" % flag)
 
 # define BPF program
 bpf_text = """
@@ -63,30 +87,37 @@
     u64 id;
     char comm[TASK_COMM_LEN];
     const char *fname;
+    int flags; // EXTENDED_STRUCT_MEMBER
 };
 
 struct data_t {
     u64 id;
     u64 ts;
+    u32 uid;
     int ret;
     char comm[TASK_COMM_LEN];
     char fname[NAME_MAX];
+    int flags; // EXTENDED_STRUCT_MEMBER
 };
 
 BPF_HASH(infotmp, u64, struct val_t);
 BPF_PERF_OUTPUT(events);
 
-int trace_entry(struct pt_regs *ctx, int dfd, const char __user *filename)
+int trace_entry(struct pt_regs *ctx, int dfd, const char __user *filename, int flags)
 {
     struct val_t val = {};
     u64 id = bpf_get_current_pid_tgid();
     u32 pid = id >> 32; // PID is higher part
     u32 tid = id;       // Cast and get the lower part
+    u32 uid = bpf_get_current_uid_gid();
 
-    FILTER
+    PID_TID_FILTER
+    UID_FILTER
+    FLAGS_FILTER
     if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) == 0) {
         val.id = id;
         val.fname = filename;
+        val.flags = flags; // EXTENDED_STRUCT_MEMBER
         infotmp.update(&id, &val);
     }
 
@@ -110,6 +141,8 @@
     bpf_probe_read(&data.fname, sizeof(data.fname), (void *)valp->fname);
     data.id = valp->id;
     data.ts = tsp / 1000;
+    data.uid = bpf_get_current_uid_gid();
+    data.flags = valp->flags; // EXTENDED_STRUCT_MEMBER
     data.ret = PT_REGS_RC(ctx);
 
     events.perf_submit(ctx, &data, sizeof(data));
@@ -119,13 +152,26 @@
 }
 """
 if args.tid:  # TID trumps PID
-    bpf_text = bpf_text.replace('FILTER',
+    bpf_text = bpf_text.replace('PID_TID_FILTER',
         'if (tid != %s) { return 0; }' % args.tid)
 elif args.pid:
-    bpf_text = bpf_text.replace('FILTER',
+    bpf_text = bpf_text.replace('PID_TID_FILTER',
         'if (pid != %s) { return 0; }' % args.pid)
 else:
-    bpf_text = bpf_text.replace('FILTER', '')
+    bpf_text = bpf_text.replace('PID_TID_FILTER', '')
+if args.uid:
+    bpf_text = bpf_text.replace('UID_FILTER',
+        'if (uid != %s) { return 0; }' % args.uid)
+else:
+    bpf_text = bpf_text.replace('UID_FILTER', '')
+if args.flag_filter:
+    bpf_text = bpf_text.replace('FLAGS_FILTER',
+        'if (!(flags & %d)) { return 0; }' % flag_filter_mask)
+else:
+    bpf_text = bpf_text.replace('FLAGS_FILTER', '')
+if not (args.extended_fields or args.flag_filter):
+    bpf_text = '\n'.join(x for x in bpf_text.split('\n')
+        if 'EXTENDED_STRUCT_MEMBER' not in x)
 if debug or args.ebpf:
     print(bpf_text)
     if args.ebpf:
@@ -143,9 +189,11 @@
     _fields_ = [
         ("id", ct.c_ulonglong),
         ("ts", ct.c_ulonglong),
+        ("uid", ct.c_uint32),
         ("ret", ct.c_int),
         ("comm", ct.c_char * TASK_COMM_LEN),
-        ("fname", ct.c_char * NAME_MAX)
+        ("fname", ct.c_char * NAME_MAX),
+        ("flags", ct.c_int),
     ]
 
 initial_ts = 0
@@ -153,8 +201,13 @@
 # header
 if args.timestamp:
     print("%-14s" % ("TIME(s)"), end="")
-print("%-6s %-16s %4s %3s %s" %
-      ("TID" if args.tid else "PID", "COMM", "FD", "ERR", "PATH"))
+if args.print_uid:
+    print("%-6s" % ("UID"), end="")
+print("%-6s %-16s %4s %3s " %
+      ("TID" if args.tid else "PID", "COMM", "FD", "ERR"), end="")
+if args.extended_fields:
+    print("%-9s" % ("FLAGS"), end="")
+print("PATH")
 
 # process event
 def print_event(cpu, data, size):
@@ -182,13 +235,23 @@
         delta = event.ts - initial_ts
         print("%-14.9f" % (float(delta) / 1000000), end="")
 
-    print("%-6d %-16s %4d %3d %s" %
+    if args.print_uid:
+        print("%-6d" % event.uid, end="")
+
+    print("%-6d %-16s %4d %3d " %
           (event.id & 0xffffffff if args.tid else event.id >> 32,
-           event.comm.decode('utf-8', 'replace'), fd_s, err,
-           event.fname.decode('utf-8', 'replace')))
+           event.comm.decode('utf-8', 'replace'), fd_s, err), end="")
+
+    if args.extended_fields:
+        print("%08o " % event.flags, end="")
+
+    printb(b'%s' % event.fname)
 
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 start_time = datetime.now()
 while not args.duration or datetime.now() - start_time < args.duration:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/opensnoop_example.txt b/tools/opensnoop_example.txt
index 1d00f12..c504ba4 100644
--- a/tools/opensnoop_example.txt
+++ b/tools/opensnoop_example.txt
@@ -65,6 +65,27 @@
 second.
 
 
+The -U option includes the UID in the output:
+
+# ./opensnoop -U
+UID   PID    COMM               FD ERR PATH
+0     27063  vminfo              5   0 /var/run/utmp
+103   628    dbus-daemon        -1   2 /usr/local/share/dbus-1/system-services
+103   628    dbus-daemon        18   0 /usr/share/dbus-1/system-services
+103   628    dbus-daemon        -1   2 /lib/dbus-1/system-services
+
+
+The -u option filters by UID:
+
+# ./opensnoop -Uu 1000
+UID   PID    COMM               FD ERR PATH
+1000  30240  ls                  3   0 /etc/ld.so.cache
+1000  30240  ls                  3   0 /lib/x86_64-linux-gnu/libselinux.so.1
+1000  30240  ls                  3   0 /lib/x86_64-linux-gnu/libc.so.6
+1000  30240  ls                  3   0 /lib/x86_64-linux-gnu/libpcre.so.3
+1000  30240  ls                  3   0 /lib/x86_64-linux-gnu/libdl.so.2
+1000  30240  ls                  3   0 /lib/x86_64-linux-gnu/libpthread.so.0
+
 The -x option only prints failed opens:
 
 # ./opensnoop -x
@@ -132,28 +153,68 @@
 to the '-n' option.
 
 
+The -e option prints out extra columns; for example, the following output
+contains the flags passed to open(2), in octal:
+
+# ./opensnoop -e
+PID    COMM               FD ERR FLAGS    PATH
+28512  sshd               10   0 00101101 /proc/self/oom_score_adj
+28512  sshd                3   0 02100000 /etc/ld.so.cache
+28512  sshd                3   0 02100000 /lib/x86_64-linux-gnu/libwrap.so.0
+28512  sshd                3   0 02100000 /lib/x86_64-linux-gnu/libaudit.so.1
+28512  sshd                3   0 02100000 /lib/x86_64-linux-gnu/libpam.so.0
+28512  sshd                3   0 02100000 /lib/x86_64-linux-gnu/libselinux.so.1
+28512  sshd                3   0 02100000 /lib/x86_64-linux-gnu/libsystemd.so.0
+28512  sshd                3   0 02100000 /usr/lib/x86_64-linux-gnu/libcrypto.so.1.0.2
+28512  sshd                3   0 02100000 /lib/x86_64-linux-gnu/libutil.so.1
+
+
+The -f option filters based on flags to the open(2) call, for example:
+
+# ./opensnoop -e -f O_WRONLY -f O_RDWR
+PID    COMM               FD ERR FLAGS    PATH
+28084  clear_console       3   0 00100002 /dev/tty
+28084  clear_console      -1  13 00100002 /dev/tty0
+28084  clear_console      -1  13 00100001 /dev/tty0
+28084  clear_console      -1  13 00100002 /dev/console
+28084  clear_console      -1  13 00100001 /dev/console
+28051  sshd                8   0 02100002 /var/run/utmp
+28051  sshd                7   0 00100001 /var/log/wtmp
+
+
 USAGE message:
 
 # ./opensnoop -h
-usage: opensnoop [-h] [-T] [-x] [-p PID] [-t TID] [-d DURATION] [-n NAME]
+usage: opensnoop [-h] [-T] [-U] [-x] [-p PID] [-t TID] [-u UID] [-d DURATION]
+                 [-n NAME] [-e] [-f FLAG_FILTER]
 
 Trace open() syscalls
 
 optional arguments:
   -h, --help            show this help message and exit
   -T, --timestamp       include timestamp on output
+  -U, --print-uid       print UID column
   -x, --failed          only show failed opens
   -p PID, --pid PID     trace this PID only
   -t TID, --tid TID     trace this TID only
+  -u UID, --uid UID     trace this UID only
   -d DURATION, --duration DURATION
                         total duration of trace in seconds
   -n NAME, --name NAME  only print process names containing this name
+  -e, --extended_fields
+                        show extended fields
+  -f FLAG_FILTER, --flag_filter FLAG_FILTER
+                        filter on flags argument (e.g., O_WRONLY)
 
 examples:
     ./opensnoop           # trace all open() syscalls
     ./opensnoop -T        # include timestamps
+    ./opensnoop -U        # include UID
     ./opensnoop -x        # only show failed opens
     ./opensnoop -p 181    # only trace PID 181
     ./opensnoop -t 123    # only trace TID 123
+    ./opensnoop -u 1000   # only trace UID 1000
     ./opensnoop -d 10     # trace for 10 seconds only
     ./opensnoop -n main   # only print process names containing "main"
+    ./opensnoop -e        # show extended fields
+    ./opensnoop -f O_WRONLY -f O_RDWR  # only print calls for writing
diff --git a/tools/pidpersec.py b/tools/pidpersec.py
index c449004..aff1219 100755
--- a/tools/pidpersec.py
+++ b/tools/pidpersec.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # pidpersec Count new processes (via fork).
diff --git a/tools/profile.py b/tools/profile.py
index d1d3d26..084ac63 100755
--- a/tools/profile.py
+++ b/tools/profile.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # profile  Profile CPU usage by sampling stack traces at a timed interval.
diff --git a/tools/runqlat.py b/tools/runqlat.py
index 9fd4064..9c56d22 100755
--- a/tools/runqlat.py
+++ b/tools/runqlat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # runqlat   Run queue (scheduler) latency as a histogram.
diff --git a/tools/runqlen.py b/tools/runqlen.py
index b56a591..4ff515d 100755
--- a/tools/runqlen.py
+++ b/tools/runqlen.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # runqlen    Summarize scheduler run queue length as a histogram.
diff --git a/tools/runqslower.py b/tools/runqslower.py
index 7a1869c..0b3e1c1 100755
--- a/tools/runqslower.py
+++ b/tools/runqslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # runqslower    Trace long process scheduling delays.
@@ -218,7 +218,7 @@
 else:
     bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_us))
 if args.pid:
-    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % args.pid)
 else:
     bpf_text = bpf_text.replace('FILTER_PID', '0')
 if debug or args.ebpf:
@@ -254,4 +254,7 @@
 # read events
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/shmsnoop.py b/tools/shmsnoop.py
new file mode 100755
index 0000000..20fcbd7
--- /dev/null
+++ b/tools/shmsnoop.py
@@ -0,0 +1,322 @@
+#!/usr/bin/env python
+# @lint-avoid-python-3-compatibility-imports
+#
+# shmsnoop Trace shm*() syscalls.
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: shmsnoop [-h] [-T] [-p PID] [-t TID] [-d DURATION] [-n NAME]
+#
+# Copyright (c) 2018 Jiri Olsa.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 08-Oct-2018   Jiri Olsa   Created this.
+
+from __future__ import print_function
+from bcc import ArgString, BPF
+import argparse
+import ctypes as ct
+from datetime import datetime, timedelta
+
+# arguments
+examples = """examples:
+    ./shmsnoop           # trace all shm*() syscalls
+    ./shmsnoop -T        # include timestamps
+    ./shmsnoop -p 181    # only trace PID 181
+    ./shmsnoop -t 123    # only trace TID 123
+    ./shmsnoop -d 10     # trace for 10 seconds only
+    ./shmsnoop -n main   # only print process names containing "main"
+"""
+parser = argparse.ArgumentParser(
+    description="Trace shm*() syscalls",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-t", "--tid",
+    help="trace this TID only")
+parser.add_argument("-d", "--duration",
+    help="total duration of trace in seconds")
+parser.add_argument("-n", "--name",
+    type=ArgString,
+    help="only print process names containing this name")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+debug = 0
+if args.duration:
+    args.duration = timedelta(seconds=int(args.duration))
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/limits.h>
+#include <linux/sched.h>
+
+struct val_t {
+    u64            id;
+    u64            ts;
+    int            sys;
+    unsigned long  key;
+    unsigned long  size;
+    unsigned long  shmflg;
+    unsigned long  shmid;
+    unsigned long  cmd;
+    unsigned long  buf;
+    unsigned long  shmaddr;
+    unsigned long  ret;
+    char           comm[TASK_COMM_LEN];
+};
+
+BPF_HASH(infotmp, u64, struct val_t);
+BPF_PERF_OUTPUT(events);
+
+enum {
+    SYS_SHMGET,
+    SYS_SHMAT,
+    SYS_SHMDT,
+    SYS_SHMCTL,
+};
+
+static int enter(struct val_t *val)
+{
+    u64 id  = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+    u32 tid = id;       // Cast and get the lower part
+
+    FILTER
+
+    val->id = id;
+    infotmp.update(&id, val);
+    return 0;
+}
+
+int trace_return(struct pt_regs *ctx)
+{
+    u64 id  = bpf_get_current_pid_tgid();
+    u64 tsp = bpf_ktime_get_ns();
+    struct val_t *val;
+
+    val = infotmp.lookup(&id);
+    if (val == 0)
+        return 0;
+
+    if (bpf_get_current_comm(&val->comm, sizeof(val->comm)) != 0)
+        goto out;
+
+    val->ts  = tsp / 1000;
+    val->ret = PT_REGS_RC(ctx);
+    events.perf_submit(ctx, val, sizeof(*val));
+
+out:
+    infotmp.delete(&id);
+    return 0;
+}
+
+int syscall__shmget(struct pt_regs *ctx, u64 key, u64 size, u64 shmflg)
+{
+    struct val_t val = {
+        .sys = SYS_SHMGET,
+    };
+
+    val.key    = key;
+    val.size   = size;
+    val.shmflg = shmflg;
+    return enter(&val);
+};
+
+int syscall__shmat(struct pt_regs *ctx, u64 shmid, u64 shmaddr, u64 shmflg)
+{
+    struct val_t val = {
+        .sys = SYS_SHMAT,
+    };
+
+    val.shmid   = shmid;
+    val.shmaddr = shmaddr;
+    val.shmflg  = shmflg;
+    return enter(&val);
+};
+
+int syscall__shmdt(struct pt_regs *ctx, u64 shmaddr)
+{
+    struct val_t val = {
+        .sys = SYS_SHMDT,
+    };
+
+    val.shmaddr = shmaddr;
+    return enter(&val);
+};
+
+int syscall__shmctl(struct pt_regs *ctx, u64 shmid, u64 cmd, u64 buf)
+{
+    struct val_t val = {
+        .sys = SYS_SHMCTL,
+    };
+
+    val.shmid = shmid;
+    val.cmd   = cmd;
+    val.buf   = buf;
+    return enter(&val);
+};
+
+"""
+if args.tid:  # TID trumps PID
+    bpf_text = bpf_text.replace('FILTER',
+        'if (tid != %s) { return 0; }' % args.tid)
+elif args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+syscall_fnname = b.get_syscall_fnname("shmget")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__shmget")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")
+
+syscall_fnname = b.get_syscall_fnname("shmat")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__shmat")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")
+
+syscall_fnname = b.get_syscall_fnname("shmdt")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__shmdt")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")
+
+syscall_fnname = b.get_syscall_fnname("shmctl")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__shmctl")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return")
+
+TASK_COMM_LEN = 16    # linux/sched.h
+
+SYS_SHMGET = 0
+SYS_SHMAT  = 1
+SYS_SHMDT  = 2
+SYS_SHMCTL = 3
+
+initial_ts = 0
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("id",      ct.c_ulonglong),
+        ("ts",      ct.c_ulonglong),
+        ("sys",     ct.c_int),
+        ("key",     ct.c_ulong),
+        ("size",    ct.c_ulong),
+        ("shmflg",  ct.c_ulong),
+        ("shmid",   ct.c_ulong),
+        ("cmd",     ct.c_ulong),
+        ("buf",     ct.c_ulong),
+        ("shmaddr", ct.c_ulong),
+        ("ret",     ct.c_ulong),
+        ("comm",    ct.c_char * TASK_COMM_LEN),
+    ]
+
+# header
+if args.timestamp:
+    print("%-14s" % ("TIME(s)"), end="")
+print("%-6s %-16s %6s %16s ARGs" %
+      ("TID" if args.tid else "PID", "COMM", "SYS", "RET"))
+
+def sys_name(sys):
+    switcher = {
+        SYS_SHMGET: "SHMGET",
+        SYS_SHMAT:  "SHMAT",
+        SYS_SHMDT:  "SHMDT",
+        SYS_SHMCTL: "SHMCTL",
+    }
+    return switcher.get(sys, "N/A")
+
+shmget_flags = [
+  { 'name' : 'IPC_CREAT',     'value' :    0o1000 },
+  { 'name' : 'IPC_EXCL',      'value' :    0o2000 },
+  { 'name' : 'SHM_HUGETLB',   'value' :    0o4000 },
+  { 'name' : 'SHM_HUGE_2MB',  'value' :  21 << 26 },
+  { 'name' : 'SHM_HUGE_1GB',  'value' :  30 << 26 },
+  { 'name' : 'SHM_NORESERVE', 'value' :   0o10000 },
+  { 'name' : 'SHM_EXEC',      'value' :  0o100000 }
+]
+
+shmat_flags = [
+  { 'name' : 'SHM_RDONLY', 'value' :  0o10000 },
+  { 'name' : 'SHM_RND',    'value' :  0o20000 },
+  { 'name' : 'SHM_REMAP',  'value' :  0o40000 },
+  { 'name' : 'SHM_EXEC',   'value' : 0o100000 },
+]
+
+def shmflg_str(val, flags):
+    """Format shmflg as hex followed by its symbolic flag names, e.g.
+    "0x3b6 (IPC_CREAT|0666)"; bits without a name are appended in octal."""
+
+    text = "0x%x" % val
+    if not val:
+        return text
+
+    # Match eagerly against the original value and require all bits of a
+    # flag, so multi-bit values that share bits (SHM_HUGE_2MB/SHM_HUGE_1GB)
+    # are told apart and the result is the same on Python 2 and Python 3.
+    names = []
+    rest = val
+    for flag in flags:
+        if (val & flag['value']) == flag['value']:
+            names.append(flag['name'])
+            rest &= ~flag['value']
+
+    # leftover bits (or the whole value when nothing matched) go out in octal
+    if rest or not names:
+        names.append("0%o" % rest)
+
+    return text + " (" + "|".join(names) + ")"
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    global initial_ts
+
+    if not initial_ts:
+        initial_ts = event.ts
+
+    if args.name and bytes(args.name) not in event.comm:
+        return
+
+    if args.timestamp:
+        delta = event.ts - initial_ts
+        print("%-14.9f" % (float(delta) / 1000000), end="")
+
+    print("%-6d %-16s %6s %16lx " %
+          (event.id & 0xffffffff if args.tid else event.id >> 32,
+           event.comm.decode(), sys_name(event.sys), event.ret), end = '')
+
+    if event.sys == SYS_SHMGET:
+        print("key: 0x%lx, size: %lu, shmflg: %s" %
+              (event.key, event.size, shmflg_str(event.shmflg, shmget_flags)))
+
+    if event.sys == SYS_SHMAT:
+        print("shmid: 0x%lx, shmaddr: 0x%lx, shmflg: %s" %
+              (event.shmid, event.shmaddr, shmflg_str(event.shmflg, shmat_flags)))
+
+    if event.sys == SYS_SHMDT:
+        print("shmaddr: 0x%lx" % (event.shmaddr))
+
+    if event.sys == SYS_SHMCTL:
+        print("shmid: 0x%lx, cmd: %lu, buf: 0x%x" % (event.shmid, event.cmd, event.buf))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+start_time = datetime.now()
+while not args.duration or datetime.now() - start_time < args.duration:
+    try:
+        b.perf_buffer_poll(timeout=1000)
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/shmsnoop_example.txt b/tools/shmsnoop_example.txt
new file mode 100644
index 0000000..53bbe70
--- /dev/null
+++ b/tools/shmsnoop_example.txt
@@ -0,0 +1,66 @@
+Demonstrations of shmsnoop, the Linux eBPF/bcc version.
+
+shmsnoop traces shm*() syscalls, for example:
+
+# ./shmsnoop.py
+PID    COMM                SYS              RET ARGs
+19813  server           SHMGET            10000 key: 0x78020001, size: 20, shmflg: 0x3b6 (IPC_CREAT|0666)
+19813  server            SHMAT     7f1cf8b1f000 shmid: 0x10000, shmaddr: 0x0, shmflg: 0x0
+19816  client           SHMGET            10000 key: 0x78020001, size: 20, shmflg: 0x1b6 (0666)
+19816  client            SHMAT     7f4fd8ee7000 shmid: 0x10000, shmaddr: 0x0, shmflg: 0x0
+19816  client            SHMDT                0 shmaddr: 0x7f4fd8ee7000
+19813  server            SHMDT                0 shmaddr: 0x7f1cf8b1f000
+19813  server           SHMCTL                0 shmid: 0x10000, cmd: 0, buf: 0x0
+
+
+Every call to a shm* syscall (SYS column) is displayed
+on a separate line together with process info (PID/COMM
+columns) and argument details: return value (RET column)
+and syscall arguments (ARGs column).
+
+The ARGs column contains 'arg: value' couples that represent
+given syscall arguments as described in their manpage.
+
+This works by tracing shm* system calls and sending
+argument details to the python script.
+
+A -T option can be used to include a timestamp column,
+and a -n option to match on a command name. The name is
+matched as a plain substring.  For example, matching commands
+containing "server" with timestamps:
+
+# ./shmsnoop.py -T -n server
+TIME(s)       PID    COMM                SYS              RET ARGs
+0.563194000   19825  server            SHMDT                0 shmaddr: 0x7f74362e4000
+0.563237000   19825  server           SHMCTL                0 shmid: 0x18000, cmd: 0, buf: 0x0
+
+
+A -p option can be used to trace only selected process:
+
+# ./shmsnoop.py -p 19855
+PID    COMM                SYS              RET ARGs
+19855  server            SHMDT                0 shmaddr: 0x7f4329ff8000
+19855  server           SHMCTL                0 shmid: 0x20000, cmd: 0, buf: 0x0
+
+USAGE message:
+# ./shmsnoop.py -h
+usage: shmsnoop.py [-h] [-T] [-p PID] [-t TID] [-d DURATION] [-n NAME]
+
+Trace shm*() syscalls
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -T, --timestamp       include timestamp on output
+  -p PID, --pid PID     trace this PID only
+  -t TID, --tid TID     trace this TID only
+  -d DURATION, --duration DURATION
+                        total duration of trace in seconds
+  -n NAME, --name NAME  only print process names containing this name
+
+examples:
+    ./shmsnoop           # trace all shm*() syscalls
+    ./shmsnoop -T        # include timestamps
+    ./shmsnoop -p 181    # only trace PID 181
+    ./shmsnoop -t 123    # only trace TID 123
+    ./shmsnoop -d 10     # trace for 10 seconds only
+    ./shmsnoop -n main   # only print process names containing "main"
diff --git a/tools/slabratetop.py b/tools/slabratetop.py
index 101c585..ab6c08c 100755
--- a/tools/slabratetop.py
+++ b/tools/slabratetop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # slabratetop  Summarize kmem_cache_alloc() calls.
diff --git a/tools/sofdsnoop.py b/tools/sofdsnoop.py
new file mode 100755
index 0000000..f633107
--- /dev/null
+++ b/tools/sofdsnoop.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env python
+# @lint-avoid-python-3-compatibility-imports
+#
+# sofdsnoop traces file descriptors passed via socket
+#           For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: sofdsnoop
+#
+# Copyright (c) 2018 Jiri Olsa.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 30-Jul-2018   Jiri Olsa   Created this.
+
+from __future__ import print_function
+from bcc import ArgString, BPF
+import os
+import argparse
+import ctypes as ct
+from datetime import datetime, timedelta
+
+# arguments
+examples = """examples:
+    ./sofdsnoop           # trace file descriptors passes
+    ./sofdsnoop -T        # include timestamps
+    ./sofdsnoop -p 181    # only trace PID 181
+    ./sofdsnoop -t 123    # only trace TID 123
+    ./sofdsnoop -d 10     # trace for 10 seconds only
+    ./sofdsnoop -n main   # only print process names containing "main"
+
+"""
+parser = argparse.ArgumentParser(
+    description="Trace file descriptors passed via socket",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-t", "--tid",
+    help="trace this TID only")
+parser.add_argument("-n", "--name",
+    type=ArgString,
+    help="only print process names containing this name")
+parser.add_argument("-d", "--duration",
+    help="total duration of trace in seconds")
+args = parser.parse_args()
+debug = 0
+
+ACTION_SEND=0
+ACTION_RECV=1
+MAX_FD=10
+
+if args.duration:
+    args.duration = timedelta(seconds=int(args.duration))
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/limits.h>
+#include <linux/sched.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+
+#define MAX_FD 10
+#define ACTION_SEND   0
+#define ACTION_RECV   1
+
+struct val_t {
+    u64  id;
+    u64  ts;
+    int  action;
+    int  sock_fd;
+    int  fd_cnt;
+    int  fd[MAX_FD];
+    char comm[TASK_COMM_LEN];
+};
+
+BPF_HASH(detach_ptr, u64, struct cmsghdr *);
+BPF_HASH(sock_fd, u64, int);
+BPF_PERF_OUTPUT(events);
+
+static void set_fd(int fd)
+{
+    u64 id = bpf_get_current_pid_tgid();
+
+    sock_fd.update(&id, &fd);
+}
+
+static int get_fd(void)
+{
+    u64 id = bpf_get_current_pid_tgid();
+    int *fd;
+
+    fd = sock_fd.lookup(&id);
+    return fd ? *fd : -1;
+}
+
+static void put_fd(void)
+{
+    u64 id = bpf_get_current_pid_tgid();
+
+    sock_fd.delete(&id);
+}
+
+static int sent_1(struct pt_regs *ctx, struct val_t *val, int num, void *data)
+{
+    val->fd_cnt = min(num, MAX_FD);
+
+    if (bpf_probe_read(&val->fd[0], MAX_FD * sizeof(int), data))
+        return -1;
+
+    events.perf_submit(ctx, val, sizeof(*val));
+    return 0;
+}
+
+#define SEND_1                                  \
+    if (sent_1(ctx, &val, num, (void *) data))  \
+        return 0;                               \
+    /* stop once every fd was reported */       \
+    num -= MAX_FD;                              \
+    if (num <= 0)                               \
+        return 0;                               \
+                                                \
+    data += MAX_FD;
+
+#define SEND_2   SEND_1 SEND_1
+#define SEND_4   SEND_2 SEND_2
+#define SEND_8   SEND_4 SEND_4
+#define SEND_260 SEND_8 SEND_8 SEND_8 SEND_2
+
+static int send(struct pt_regs *ctx, struct cmsghdr *cmsg, int action)
+{
+    struct val_t val = { 0 };
+    int *data, num, fd;
+    u64 tsp = bpf_ktime_get_ns();
+
+    data = (void *) ((char *) cmsg + sizeof(struct cmsghdr));
+    num  = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
+
+    val.id      = bpf_get_current_pid_tgid();
+    val.action  = action;
+    val.sock_fd = get_fd();
+    val.ts      = tsp / 1000;
+
+    if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) != 0)
+        return 0;
+
+    SEND_260
+    return 0;
+}
+
+static bool allow_pid(u64 id)
+{
+    u32 pid = id >> 32; // PID is higher part
+    u32 tid = id;       // Cast and get the lower part
+
+    FILTER
+
+    return 1;
+}
+
+int trace_scm_send_entry(struct pt_regs *ctx, struct socket *sock, struct msghdr *hdr)
+{
+    struct cmsghdr *cmsg = NULL;
+
+    if (!allow_pid(bpf_get_current_pid_tgid()))
+        return 0;
+
+    if (hdr->msg_controllen >= sizeof(struct cmsghdr))
+        cmsg = hdr->msg_control;
+
+    if (!cmsg || (cmsg->cmsg_type != SCM_RIGHTS))
+        return 0;
+
+    return send(ctx, cmsg, ACTION_SEND);
+};
+
+int trace_scm_detach_fds_entry(struct pt_regs *ctx, struct msghdr *hdr)
+{
+    struct cmsghdr *cmsg = NULL;
+    u64 id = bpf_get_current_pid_tgid();
+
+    if (!allow_pid(id))
+        return 0;
+
+    if (hdr->msg_controllen >= sizeof(struct cmsghdr))
+        cmsg = hdr->msg_control;
+
+    if (!cmsg)
+        return 0;
+
+    detach_ptr.update(&id, &cmsg);
+    return 0;
+};
+
+/* kretprobe: emit a RECV event for the cmsg saved at entry, then forget it */
+int trace_scm_detach_fds_return(struct pt_regs *ctx)
+{
+    struct cmsghdr **cmsgp;
+    u64 id = bpf_get_current_pid_tgid();
+    if (!allow_pid(id))
+        return 0;
+
+    cmsgp = detach_ptr.lookup(&id);
+    if (!cmsgp)
+        return 0;
+    send(ctx, *cmsgp, ACTION_RECV);
+    detach_ptr.delete(&id);  /* a later call must not replay a stale pointer */
+    return 0;
+}
+
+/* kprobe on sendmsg(): remember which socket fd this thread is sending on */
+int syscall__sendmsg(struct pt_regs *ctx, u64 fd, u64 msg, u64 flags)
+{
+    if (!allow_pid(bpf_get_current_pid_tgid()))
+        return 0;
+
+    /* stored per pid_tgid; send() reads it back through get_fd() */
+    set_fd(fd);
+    return 0;
+}
+
+int trace_sendmsg_return(struct pt_regs *ctx)
+{
+    if (!allow_pid(bpf_get_current_pid_tgid()))
+        return 0;
+
+    put_fd();
+    return 0;
+}
+
+/*
+ * kprobe on recvmsg(): remember which socket fd this thread is receiving on.
+ */
+int syscall__recvmsg(struct pt_regs *ctx, u64 fd, u64 msg, u64 flags)
+{
+    if (!allow_pid(bpf_get_current_pid_tgid()))
+        return 0;
+
+    /* stored per pid_tgid; send() reads it back through get_fd() */
+    set_fd(fd);
+    return 0;
+}
+
+int trace_recvmsg_return(struct pt_regs *ctx)
+{
+    if (!allow_pid(bpf_get_current_pid_tgid()))
+        return 0;
+
+    put_fd();
+    return 0;
+}
+
+"""
+
+if args.tid:  # TID trumps PID
+    bpf_text = bpf_text.replace('FILTER',
+        'if (tid != %s) { return 0; }' % args.tid)
+elif args.pid:
+    bpf_text = bpf_text.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER', '')
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+syscall_fnname = b.get_syscall_fnname("sendmsg")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__sendmsg")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_sendmsg_return")
+
+syscall_fnname = b.get_syscall_fnname("recvmsg")
+if BPF.ksymname(syscall_fnname) != -1:
+    b.attach_kprobe(event=syscall_fnname, fn_name="syscall__recvmsg")
+    b.attach_kretprobe(event=syscall_fnname, fn_name="trace_recvmsg_return")
+
+b.attach_kprobe(event="__scm_send", fn_name="trace_scm_send_entry")
+b.attach_kprobe(event="scm_detach_fds", fn_name="trace_scm_detach_fds_entry")
+b.attach_kretprobe(event="scm_detach_fds", fn_name="trace_scm_detach_fds_return")
+
+TASK_COMM_LEN = 16    # linux/sched.h
+
+initial_ts = 0
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("id",      ct.c_ulonglong),
+        ("ts",      ct.c_ulonglong),
+        ("action",  ct.c_int),
+        ("sock_fd", ct.c_int),
+        ("fd_cnt",  ct.c_int),
+        ("fd",      ct.c_int  * MAX_FD),
+        ("comm",    ct.c_char * TASK_COMM_LEN),
+    ]
+
+# header
+if args.timestamp:
+    print("%-14s" % ("TIME(s)"), end="")
+print("%-6s %-6s %-16s %-25s %-5s %s" %
+      ("ACTION", "TID", "COMM", "SOCKET", "FD", "NAME"))
+
+def get_file(pid, fd):
+    """Resolve /proc/<pid>/fd/<fd> to its link target, or "N/A" on OSError."""
+    try:
+        return os.readlink("/proc/%d/fd/%d" % (pid, fd))
+    except OSError:
+        return "N/A"
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    tid = event.id & 0xffffffff;
+
+    cnt = min(MAX_FD, event.fd_cnt);
+
+    if args.name and bytes(args.name) not in event.comm:
+        return
+
+    for i in range(0, cnt):
+        global initial_ts
+
+        if not initial_ts:
+            initial_ts = event.ts
+
+        if args.timestamp:
+            delta = event.ts - initial_ts
+            print("%-14.9f" % (float(delta) / 1000000), end="")
+
+        print("%-6s %-6d %-16s " %
+              ("SEND" if event.action == ACTION_SEND else "RECV",
+               tid, event.comm.decode()), end = '')
+
+        sock = "%d:%s" % (event.sock_fd, get_file(tid, event.sock_fd))
+        print("%-25s " % sock, end = '')
+
+        fd = event.fd[i]
+        fd_file = get_file(tid, fd) if event.action == ACTION_SEND else ""
+        print("%-5d %s" % (fd, fd_file))
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event, page_cnt=64)
+start_time = datetime.now()
+while not args.duration or datetime.now() - start_time < args.duration:
+    try:
+        b.perf_buffer_poll(timeout=1000)
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/sofdsnoop_example.txt b/tools/sofdsnoop_example.txt
new file mode 100644
index 0000000..740a26f
--- /dev/null
+++ b/tools/sofdsnoop_example.txt
@@ -0,0 +1,69 @@
+Demonstrations of sofdsnoop, the Linux eBPF/bcc version.
+
+sofdsnoop traces FDs passed through unix sockets
+
+# ./sofdsnoop.py
+ACTION TID    COMM             SOCKET                    FD    NAME
+SEND   2576   Web Content      24:socket:[39763]         51    /dev/shm/org.mozilla.ipc.2576.23874
+RECV   2576   Web Content      49:socket:[809997]        51
+SEND   2576   Web Content      24:socket:[39763]         58    N/A
+RECV   2464   Gecko_IOThread   75:socket:[39753]         55
+
+Every file descriptor that is passed via unix sockets is displayed
+on a separate line together with process info (TID/COMM columns),
+ACTION details (SEND/RECV), file descriptor number (FD) and its
+translation to file if available (NAME).
+
+The file descriptor (fd) value is bound to a process. The SEND
+lines display the fd value within the sending process. The RECV
+lines display the fd value of the sending process. That's why
+there's translation to name only on SEND lines, where we are
+able to find it in task proc records.
+
+This works by tracing sendmsg/recvmsg system calls to provide
+the socket fds, and __scm_send/scm_detach_fds to provide
+the file descriptor details.
+
+A -T option can be used to include a timestamp column,
+and a -n option to match on a command name. The name is
+matched as a plain substring.  For example, matching commands
+containing "Web" with timestamps:
+
+# ./sofdsnoop.py -T -n Web
+TIME(s)       ACTION TID    COMM             SOCKET                    FD    NAME
+0.000000000   SEND   2576   Web Content      24:socket:[39763]         51    /dev/shm/org.mozilla.ipc.2576.25404 (deleted)
+0.000413000   RECV   2576   Web Content      49:/dev/shm/org.mozilla.ipc.2576.25404 (deleted) 51
+0.000558000   SEND   2576   Web Content      24:socket:[39763]         58    N/A
+0.000952000   SEND   2576   Web Content      24:socket:[39763]         58    socket:[817962]
+
+
+A -p option can be used to trace only selected process:
+
+# ./sofdsnoop.py -p 2576 -T
+TIME(s)       ACTION TID    COMM             SOCKET                    FD    NAME
+0.000000000   SEND   2576   Web Content      24:socket:[39763]         51    N/A
+0.000138000   RECV   2576   Web Content      49:N/A                    5
+0.000191000   SEND   2576   Web Content      24:socket:[39763]         58    N/A
+0.000424000   RECV   2576   Web Content      51:/dev/shm/org.mozilla.ipc.2576.25319 (deleted) 49
+
+USAGE message:
+usage: sofdsnoop.py [-h] [-T] [-p PID] [-t TID] [-n NAME] [-d DURATION]
+
+Trace file descriptors passed via socket
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -T, --timestamp       include timestamp on output
+  -p PID, --pid PID     trace this PID only
+  -t TID, --tid TID     trace this TID only
+  -n NAME, --name NAME  only print process names containing this name
+  -d DURATION, --duration DURATION
+                        total duration of trace in seconds
+
+examples:
+    ./sofdsnoop           # trace file descriptors passes
+    ./sofdsnoop -T        # include timestamps
+    ./sofdsnoop -p 181    # only trace PID 181
+    ./sofdsnoop -t 123    # only trace TID 123
+    ./sofdsnoop -d 10     # trace for 10 seconds only
+    ./sofdsnoop -n main   # only print process names containing "main"
diff --git a/tools/softirqs.py b/tools/softirqs.py
index 1e2daf5..10ebc38 100755
--- a/tools/softirqs.py
+++ b/tools/softirqs.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # softirqs  Summarize soft IRQ (interrupt) event time.
diff --git a/tools/solisten.py b/tools/solisten.py
index 6a35f82..81e82e0 100755
--- a/tools/solisten.py
+++ b/tools/solisten.py
@@ -23,6 +23,7 @@
 import argparse
 from bcc import BPF
 import ctypes as ct
+from bcc.utils import printb
 
 # Arguments
 examples = """Examples:
@@ -165,12 +166,12 @@
 
         # Display
         if show_netns:
-            print("%-6d %-12.12s %-12s %-6s %-8s %-5s %-39s" % (
+            printb(b"%-6d %-12.12s %-12s %-6s %-8s %-5s %-39s" % (
                 pid, event.task, event.netns, protocol, event.backlog,
                 event.lport, address,
             ))
         else:
-            print("%-6d %-12.12s %-6s %-8s %-5s %-39s" % (
+            printb(b"%-6d %-12.12s %-6s %-8s %-5s %-39s" % (
                 pid, event.task, protocol, event.backlog,
                 event.lport, address,
             ))
@@ -210,4 +211,7 @@
 
     # Read events
     while 1:
-        b.perf_buffer_poll()
+        try:
+            b.perf_buffer_poll()
+        except KeyboardInterrupt:
+            exit()
diff --git a/tools/sslsniff.py b/tools/sslsniff.py
index 0c9f976..f7bc117 100755
--- a/tools/sslsniff.py
+++ b/tools/sslsniff.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # sslsniff  Captures data on read/recv or write/send functions of OpenSSL,
 #           GnuTLS and NSS
@@ -228,4 +228,7 @@
 b["perf_SSL_write"].open_perf_buffer(print_event_write)
 b["perf_SSL_read"].open_perf_buffer(print_event_read)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/statsnoop.py b/tools/statsnoop.py
index 4e62ebd..9e585be 100755
--- a/tools/statsnoop.py
+++ b/tools/statsnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # statsnoop Trace stat() syscalls.
@@ -179,4 +179,7 @@
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/syncsnoop.py b/tools/syncsnoop.py
index ba3f1d3..eb892ba 100755
--- a/tools/syncsnoop.py
+++ b/tools/syncsnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # syncsnoop Trace sync() syscall.
@@ -50,4 +50,7 @@
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/syscount.py b/tools/syscount.py
index 191511c..58039be 100755
--- a/tools/syscount.py
+++ b/tools/syscount.py
@@ -9,368 +9,21 @@
 #
 # 15-Feb-2017   Sasha Goldshtein    Created this.
 
-from bcc import BPF
-from bcc.utils import printb
 from time import sleep, strftime
 import argparse
 import errno
 import itertools
-import subprocess
 import sys
 import signal
-import platform
+from bcc import BPF
+from bcc.utils import printb
+from bcc.syscall import syscall_name
 
 if sys.version_info.major < 3:
     izip_longest = itertools.izip_longest
 else:
     izip_longest = itertools.zip_longest
 
-#
-# Syscall table for Linux x86_64, not very recent.
-# Automatically generated from strace/linux/x86_64/syscallent.h using the
-# following command:
-#
-#  cat syscallent.h | awk -F, '{ gsub(/[ \t"}]/, "", $4);
-#                                gsub(/[ \t/*]/, "", $5);
-#                                print "    "$5": \""$4"\","; }
-#                              BEGIN { print "syscalls = {" }
-#                              END { print "}" }'
-#
-syscalls = {
-    0: b"read",
-    1: b"write",
-    2: b"open",
-    3: b"close",
-    4: b"stat",
-    5: b"fstat",
-    6: b"lstat",
-    7: b"poll",
-    8: b"lseek",
-    9: b"mmap",
-    10: b"mprotect",
-    11: b"munmap",
-    12: b"brk",
-    13: b"rt_sigaction",
-    14: b"rt_sigprocmask",
-    15: b"rt_sigreturn",
-    16: b"ioctl",
-    17: b"pread",
-    18: b"pwrite",
-    19: b"readv",
-    20: b"writev",
-    21: b"access",
-    22: b"pipe",
-    23: b"select",
-    24: b"sched_yield",
-    25: b"mremap",
-    26: b"msync",
-    27: b"mincore",
-    28: b"madvise",
-    29: b"shmget",
-    30: b"shmat",
-    31: b"shmctl",
-    32: b"dup",
-    33: b"dup2",
-    34: b"pause",
-    35: b"nanosleep",
-    36: b"getitimer",
-    37: b"alarm",
-    38: b"setitimer",
-    39: b"getpid",
-    40: b"sendfile",
-    41: b"socket",
-    42: b"connect",
-    43: b"accept",
-    44: b"sendto",
-    45: b"recvfrom",
-    46: b"sendmsg",
-    47: b"recvmsg",
-    48: b"shutdown",
-    49: b"bind",
-    50: b"listen",
-    51: b"getsockname",
-    52: b"getpeername",
-    53: b"socketpair",
-    54: b"setsockopt",
-    55: b"getsockopt",
-    56: b"clone",
-    57: b"fork",
-    58: b"vfork",
-    59: b"execve",
-    60: b"_exit",
-    61: b"wait4",
-    62: b"kill",
-    63: b"uname",
-    64: b"semget",
-    65: b"semop",
-    66: b"semctl",
-    67: b"shmdt",
-    68: b"msgget",
-    69: b"msgsnd",
-    70: b"msgrcv",
-    71: b"msgctl",
-    72: b"fcntl",
-    73: b"flock",
-    74: b"fsync",
-    75: b"fdatasync",
-    76: b"truncate",
-    77: b"ftruncate",
-    78: b"getdents",
-    79: b"getcwd",
-    80: b"chdir",
-    81: b"fchdir",
-    82: b"rename",
-    83: b"mkdir",
-    84: b"rmdir",
-    85: b"creat",
-    86: b"link",
-    87: b"unlink",
-    88: b"symlink",
-    89: b"readlink",
-    90: b"chmod",
-    91: b"fchmod",
-    92: b"chown",
-    93: b"fchown",
-    94: b"lchown",
-    95: b"umask",
-    96: b"gettimeofday",
-    97: b"getrlimit",
-    98: b"getrusage",
-    99: b"sysinfo",
-    100: b"times",
-    101: b"ptrace",
-    102: b"getuid",
-    103: b"syslog",
-    104: b"getgid",
-    105: b"setuid",
-    106: b"setgid",
-    107: b"geteuid",
-    108: b"getegid",
-    109: b"setpgid",
-    110: b"getppid",
-    111: b"getpgrp",
-    112: b"setsid",
-    113: b"setreuid",
-    114: b"setregid",
-    115: b"getgroups",
-    116: b"setgroups",
-    117: b"setresuid",
-    118: b"getresuid",
-    119: b"setresgid",
-    120: b"getresgid",
-    121: b"getpgid",
-    122: b"setfsuid",
-    123: b"setfsgid",
-    124: b"getsid",
-    125: b"capget",
-    126: b"capset",
-    127: b"rt_sigpending",
-    128: b"rt_sigtimedwait",
-    129: b"rt_sigqueueinfo",
-    130: b"rt_sigsuspend",
-    131: b"sigaltstack",
-    132: b"utime",
-    133: b"mknod",
-    134: b"uselib",
-    135: b"personality",
-    136: b"ustat",
-    137: b"statfs",
-    138: b"fstatfs",
-    139: b"sysfs",
-    140: b"getpriority",
-    141: b"setpriority",
-    142: b"sched_setparam",
-    143: b"sched_getparam",
-    144: b"sched_setscheduler",
-    145: b"sched_getscheduler",
-    146: b"sched_get_priority_max",
-    147: b"sched_get_priority_min",
-    148: b"sched_rr_get_interval",
-    149: b"mlock",
-    150: b"munlock",
-    151: b"mlockall",
-    152: b"munlockall",
-    153: b"vhangup",
-    154: b"modify_ldt",
-    155: b"pivot_root",
-    156: b"_sysctl",
-    157: b"prctl",
-    158: b"arch_prctl",
-    159: b"adjtimex",
-    160: b"setrlimit",
-    161: b"chroot",
-    162: b"sync",
-    163: b"acct",
-    164: b"settimeofday",
-    165: b"mount",
-    166: b"umount",
-    167: b"swapon",
-    168: b"swapoff",
-    169: b"reboot",
-    170: b"sethostname",
-    171: b"setdomainname",
-    172: b"iopl",
-    173: b"ioperm",
-    174: b"create_module",
-    175: b"init_module",
-    176: b"delete_module",
-    177: b"get_kernel_syms",
-    178: b"query_module",
-    179: b"quotactl",
-    180: b"nfsservctl",
-    181: b"getpmsg",
-    182: b"putpmsg",
-    183: b"afs_syscall",
-    184: b"tuxcall",
-    185: b"security",
-    186: b"gettid",
-    187: b"readahead",
-    188: b"setxattr",
-    189: b"lsetxattr",
-    190: b"fsetxattr",
-    191: b"getxattr",
-    192: b"lgetxattr",
-    193: b"fgetxattr",
-    194: b"listxattr",
-    195: b"llistxattr",
-    196: b"flistxattr",
-    197: b"removexattr",
-    198: b"lremovexattr",
-    199: b"fremovexattr",
-    200: b"tkill",
-    201: b"time",
-    202: b"futex",
-    203: b"sched_setaffinity",
-    204: b"sched_getaffinity",
-    205: b"set_thread_area",
-    206: b"io_setup",
-    207: b"io_destroy",
-    208: b"io_getevents",
-    209: b"io_submit",
-    210: b"io_cancel",
-    211: b"get_thread_area",
-    212: b"lookup_dcookie",
-    213: b"epoll_create",
-    214: b"epoll_ctl_old",
-    215: b"epoll_wait_old",
-    216: b"remap_file_pages",
-    217: b"getdents64",
-    218: b"set_tid_address",
-    219: b"restart_syscall",
-    220: b"semtimedop",
-    221: b"fadvise64",
-    222: b"timer_create",
-    223: b"timer_settime",
-    224: b"timer_gettime",
-    225: b"timer_getoverrun",
-    226: b"timer_delete",
-    227: b"clock_settime",
-    228: b"clock_gettime",
-    229: b"clock_getres",
-    230: b"clock_nanosleep",
-    231: b"exit_group",
-    232: b"epoll_wait",
-    233: b"epoll_ctl",
-    234: b"tgkill",
-    235: b"utimes",
-    236: b"vserver",
-    237: b"mbind",
-    238: b"set_mempolicy",
-    239: b"get_mempolicy",
-    240: b"mq_open",
-    241: b"mq_unlink",
-    242: b"mq_timedsend",
-    243: b"mq_timedreceive",
-    244: b"mq_notify",
-    245: b"mq_getsetattr",
-    246: b"kexec_load",
-    247: b"waitid",
-    248: b"add_key",
-    249: b"request_key",
-    250: b"keyctl",
-    251: b"ioprio_set",
-    252: b"ioprio_get",
-    253: b"inotify_init",
-    254: b"inotify_add_watch",
-    255: b"inotify_rm_watch",
-    256: b"migrate_pages",
-    257: b"openat",
-    258: b"mkdirat",
-    259: b"mknodat",
-    260: b"fchownat",
-    261: b"futimesat",
-    262: b"newfstatat",
-    263: b"unlinkat",
-    264: b"renameat",
-    265: b"linkat",
-    266: b"symlinkat",
-    267: b"readlinkat",
-    268: b"fchmodat",
-    269: b"faccessat",
-    270: b"pselect6",
-    271: b"ppoll",
-    272: b"unshare",
-    273: b"set_robust_list",
-    274: b"get_robust_list",
-    275: b"splice",
-    276: b"tee",
-    277: b"sync_file_range",
-    278: b"vmsplice",
-    279: b"move_pages",
-    280: b"utimensat",
-    281: b"epoll_pwait",
-    282: b"signalfd",
-    283: b"timerfd_create",
-    284: b"eventfd",
-    285: b"fallocate",
-    286: b"timerfd_settime",
-    287: b"timerfd_gettime",
-    288: b"accept4",
-    289: b"signalfd4",
-    290: b"eventfd2",
-    291: b"epoll_create1",
-    292: b"dup3",
-    293: b"pipe2",
-    294: b"inotify_init1",
-    295: b"preadv",
-    296: b"pwritev",
-    297: b"rt_tgsigqueueinfo",
-    298: b"perf_event_open",
-    299: b"recvmmsg",
-    300: b"fanotify_init",
-    301: b"fanotify_mark",
-    302: b"prlimit64",
-    303: b"name_to_handle_at",
-    304: b"open_by_handle_at",
-    305: b"clock_adjtime",
-    306: b"syncfs",
-    307: b"sendmmsg",
-    308: b"setns",
-    309: b"getcpu",
-    310: b"process_vm_readv",
-    311: b"process_vm_writev",
-    312: b"kcmp",
-    313: b"finit_module",
-}
-
-# Try to use ausyscall if it is available, because it can give us an up-to-date
-# list of syscalls for various architectures, rather than the x86-64 hardcoded
-# list above.
-def parse_syscall(line):
-    parts = line.split()
-    return (int(parts[0]), parts[1].strip())
-
-try:
-    # Skip the first line, which is a header. The rest of the lines are simply
-    # SYSCALL_NUM\tSYSCALL_NAME pairs.
-    out = subprocess.check_output('ausyscall --dump | tail -n +2', shell=True)
-    syscalls = dict(map(parse_syscall, out.strip().split(b'\n')))
-except Exception as e:
-    if platform.machine() == "x86_64":
-        pass
-    else:
-        raise Exception("ausyscall: command not found")
-
 # signal handler
 def signal_ignore(signal, frame):
     print()
@@ -526,7 +179,7 @@
     if args.process:
         return b"%-6d %-15s" % (key.value, comm_for_pid(key.value))
     else:
-        return syscalls.get(key.value, b"[unknown: %d]" % key.value)
+        return syscall_name(key.value)
 
 def print_count_stats():
     data = bpf["data"]
diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py
index 884b0c5..5a7bbb8 100755
--- a/tools/tcpaccept.py
+++ b/tools/tcpaccept.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpaccept Trace TCP accept()s.
@@ -21,6 +21,7 @@
 from struct import pack
 import argparse
 import ctypes as ct
+from bcc.utils import printb
 
 # arguments
 examples = """examples:
@@ -83,6 +84,8 @@
     struct sock *newsk = (struct sock *)PT_REGS_RC(ctx);
     u32 pid = bpf_get_current_pid_tgid();
 
+    ##FILTER_PID##
+
     if (newsk == NULL)
         return 0;
 
@@ -160,6 +163,9 @@
     if (args->protocol != IPPROTO_TCP)
         return 0;
     u32 pid = bpf_get_current_pid_tgid();
+
+    ##FILTER_PID##
+
     // pull in details
     u16 family = 0, lport = 0;
     family = args->family;
@@ -196,10 +202,10 @@
 
 # code substitutions
 if args.pid:
-    bpf_text = bpf_text.replace('FILTER',
+    bpf_text = bpf_text.replace('##FILTER_PID##',
         'if (pid != %s) { return 0; }' % args.pid)
 else:
-    bpf_text = bpf_text.replace('FILTER', '')
+    bpf_text = bpf_text.replace('##FILTER_PID##', '')
 if debug or args.ebpf:
     print(bpf_text)
     if args.ebpf:
@@ -238,10 +244,11 @@
         if start_ts == 0:
             start_ts = event.ts_us
         print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
-    print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
-        event.task.decode('utf-8', 'replace'), event.ip,
-        inet_ntop(AF_INET, pack("I", event.daddr)),
-        inet_ntop(AF_INET, pack("I", event.saddr)), event.lport))
+    printb(b"%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
+        event.task, event.ip,
+        inet_ntop(AF_INET, pack("I", event.daddr)).encode(),
+        inet_ntop(AF_INET, pack("I", event.saddr)).encode(),
+        event.lport))
 
 def print_ipv6_event(cpu, data, size):
     event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
@@ -250,9 +257,10 @@
         if start_ts == 0:
             start_ts = event.ts_us
         print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
-    print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
-        event.task.decode('utf-8', 'replace'), event.ip,
-        inet_ntop(AF_INET6, event.daddr),inet_ntop(AF_INET6, event.saddr),
+    printb(b"%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
+        event.task, event.ip,
+        inet_ntop(AF_INET6, event.daddr).encode(),
+        inet_ntop(AF_INET6, event.saddr).encode(),
         event.lport))
 
 # initialize BPF
@@ -270,4 +278,7 @@
 b["ipv4_events"].open_perf_buffer(print_ipv4_event)
 b["ipv6_events"].open_perf_buffer(print_ipv6_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/tcpconnect.py b/tools/tcpconnect.py
index ac84326..5ca6851 100755
--- a/tools/tcpconnect.py
+++ b/tools/tcpconnect.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpconnect    Trace TCP connect()s.
@@ -16,9 +16,11 @@
 #
 # 25-Sep-2015   Brendan Gregg   Created this.
 # 14-Feb-2016      "      "     Switch to bpf_perf_output.
+# 09-Jan-2019   Takuma Kume     Support filtering by UID
 
 from __future__ import print_function
 from bcc import BPF
+from bcc.utils import printb
 import argparse
 from socket import inet_ntop, ntohs, AF_INET, AF_INET6
 from struct import pack
@@ -31,6 +33,8 @@
     ./tcpconnect -p 181    # only trace PID 181
     ./tcpconnect -P 80     # only trace port 80
     ./tcpconnect -P 80,81  # only trace port 80 and 81
+    ./tcpconnect -U        # include UID
+    ./tcpconnect -u 1000   # only trace UID 1000
 """
 parser = argparse.ArgumentParser(
     description="Trace TCP connects",
@@ -42,6 +46,10 @@
     help="trace this PID only")
 parser.add_argument("-P", "--port",
     help="comma-separated list of destination ports to trace.")
+parser.add_argument("-U", "--print-uid", action="store_true",
+    help="include UID on output")
+parser.add_argument("-u", "--uid",
+    help="trace this UID only")
 parser.add_argument("--ebpf", action="store_true",
     help=argparse.SUPPRESS)
 args = parser.parse_args()
@@ -59,6 +67,7 @@
 struct ipv4_data_t {
     u64 ts_us;
     u32 pid;
+    u32 uid;
     u32 saddr;
     u32 daddr;
     u64 ip;
@@ -70,6 +79,7 @@
 struct ipv6_data_t {
     u64 ts_us;
     u32 pid;
+    u32 uid;
     unsigned __int128 saddr;
     unsigned __int128 daddr;
     u64 ip;
@@ -83,6 +93,9 @@
     u32 pid = bpf_get_current_pid_tgid();
     FILTER_PID
 
+    u32 uid = bpf_get_current_uid_gid();
+    FILTER_UID
+
     // stash the sock ptr for lookup on return
     currsock.update(&pid, &sk);
 
@@ -115,6 +128,7 @@
 
     if (ipver == 4) {
         struct ipv4_data_t data4 = {.pid = pid, .ip = ipver};
+        data4.uid = bpf_get_current_uid_gid();
         data4.ts_us = bpf_ktime_get_ns() / 1000;
         data4.saddr = skp->__sk_common.skc_rcv_saddr;
         data4.daddr = skp->__sk_common.skc_daddr;
@@ -124,6 +138,7 @@
 
     } else /* 6 */ {
         struct ipv6_data_t data6 = {.pid = pid, .ip = ipver};
+        data6.uid = bpf_get_current_uid_gid();
         data6.ts_us = bpf_ktime_get_ns() / 1000;
         bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
             skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
@@ -159,9 +174,13 @@
     dports_if = ' && '.join(['dport != %d' % ntohs(dport) for dport in dports])
     bpf_text = bpf_text.replace('FILTER_PORT',
         'if (%s) { currsock.delete(&pid); return 0; }' % dports_if)
+if args.uid:
+    bpf_text = bpf_text.replace('FILTER_UID',
+        'if (uid != %s) { return 0; }' % args.uid)
 
 bpf_text = bpf_text.replace('FILTER_PID', '')
 bpf_text = bpf_text.replace('FILTER_PORT', '')
+bpf_text = bpf_text.replace('FILTER_UID', '')
 
 if debug or args.ebpf:
     print(bpf_text)
@@ -175,6 +194,7 @@
     _fields_ = [
         ("ts_us", ct.c_ulonglong),
         ("pid", ct.c_uint),
+        ("uid", ct.c_uint),
         ("saddr", ct.c_uint),
         ("daddr", ct.c_uint),
         ("ip", ct.c_ulonglong),
@@ -186,6 +206,7 @@
     _fields_ = [
         ("ts_us", ct.c_ulonglong),
         ("pid", ct.c_uint),
+        ("uid", ct.c_uint),
         ("saddr", (ct.c_ulonglong * 2)),
         ("daddr", (ct.c_ulonglong * 2)),
         ("ip", ct.c_ulonglong),
@@ -201,10 +222,12 @@
         if start_ts == 0:
             start_ts = event.ts_us
         print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
-    print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
-        event.task.decode('utf-8', 'replace'), event.ip,
-        inet_ntop(AF_INET, pack("I", event.saddr)),
-        inet_ntop(AF_INET, pack("I", event.daddr)), event.dport))
+    if args.print_uid:
+        print("%-6d" % event.uid, end="")
+    printb(b"%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
+        event.task, event.ip,
+        inet_ntop(AF_INET, pack("I", event.saddr)).encode(),
+        inet_ntop(AF_INET, pack("I", event.daddr)).encode(), event.dport))
 
 def print_ipv6_event(cpu, data, size):
     event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
@@ -213,9 +236,11 @@
         if start_ts == 0:
             start_ts = event.ts_us
         print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
-    print("%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
-        event.task.decode('utf-8', 'replace'), event.ip,
-        inet_ntop(AF_INET6, event.saddr), inet_ntop(AF_INET6, event.daddr),
+    if args.print_uid:
+        print("%-6d" % event.uid, end="")
+    printb(b"%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid,
+        event.task, event.ip,
+        inet_ntop(AF_INET6, event.saddr).encode(), inet_ntop(AF_INET6, event.daddr).encode(),
         event.dport))
 
 # initialize BPF
@@ -228,6 +253,8 @@
 # header
 if args.timestamp:
     print("%-9s" % ("TIME(s)"), end="")
+if args.print_uid:
+    print("%-6s" % ("UID"), end="")
 print("%-6s %-12s %-2s %-16s %-16s %-4s" % ("PID", "COMM", "IP", "SADDR",
     "DADDR", "DPORT"))
 
@@ -237,4 +264,7 @@
 b["ipv4_events"].open_perf_buffer(print_ipv4_event)
 b["ipv6_events"].open_perf_buffer(print_ipv6_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/tcpconnect_example.txt b/tools/tcpconnect_example.txt
index 6d2f8f8..15f6e71 100644
--- a/tools/tcpconnect_example.txt
+++ b/tools/tcpconnect_example.txt
@@ -38,6 +38,24 @@
 process to various other addresses. A few connections occur every minute.
 
 
+The -U option prints a UID column:
+
+# ./tcpconnect -U
+UID   PID    COMM         IP SADDR            DADDR            DPORT
+0     31333  telnet       6  ::1              ::1              23
+0     31333  telnet       4  127.0.0.1        127.0.0.1        23
+1000  31322  curl         4  127.0.0.1        127.0.0.1        80
+1000  31322  curl         6  ::1              ::1              80
+
+
+The -u option filters by UID:
+
+# ./tcpconnect -Uu 1000
+UID   PID    COMM         IP SADDR            DADDR            DPORT
+1000  31338  telnet       6  ::1              ::1              23
+1000  31338  telnet       4  127.0.0.1        127.0.0.1        23
+
+
 USAGE message:
 
 # ./tcpconnect -h
@@ -51,6 +69,8 @@
   -p PID, --pid PID  trace this PID only
   -P PORT, --port PORT
                      comma-separated list of destination ports to trace.
+  -U, --print-uid    include UID on output
+  -u UID, --uid UID  trace this UID only
 
 examples:
     ./tcpconnect           # trace all TCP connect()s
@@ -58,3 +78,5 @@
     ./tcpconnect -p 181    # only trace PID 181
     ./tcpconnect -P 80     # only trace port 80
     ./tcpconnect -P 80,81  # only trace port 80 and 81
+    ./tcpconnect -U        # include UID
+    ./tcpconnect -u 1000   # only trace UID 1000
diff --git a/tools/tcpconnlat.py b/tools/tcpconnlat.py
index 0d21b83..92dc2c1 100755
--- a/tools/tcpconnlat.py
+++ b/tools/tcpconnlat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpconnlat    Trace TCP active connection latency (connect).
@@ -264,4 +264,7 @@
 b["ipv4_events"].open_perf_buffer(print_ipv4_event)
 b["ipv6_events"].open_perf_buffer(print_ipv6_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/tcpdrop.py b/tools/tcpdrop.py
index d9fbdf5..82f66a7 100755
--- a/tools/tcpdrop.py
+++ b/tools/tcpdrop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpdrop   Trace TCP kernel-dropped packets/segments.
@@ -221,4 +221,7 @@
 b["ipv4_events"].open_perf_buffer(print_ipv4_event)
 b["ipv6_events"].open_perf_buffer(print_ipv6_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/tcplife.py b/tools/tcplife.py
index 51ed7ae..e7d29d4 100755
--- a/tools/tcplife.py
+++ b/tools/tcplife.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcplife   Trace the lifespan of TCP sessions and summarize.
@@ -506,4 +506,7 @@
 b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
 b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/tcpretrans.py b/tools/tcpretrans.py
index 4400483..442fd3e 100755
--- a/tools/tcpretrans.py
+++ b/tools/tcpretrans.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpretrans    Trace or count TCP retransmits and TLPs.
@@ -303,4 +303,7 @@
     b["ipv4_events"].open_perf_buffer(print_ipv4_event)
     b["ipv6_events"].open_perf_buffer(print_ipv6_event)
     while 1:
-        b.perf_buffer_poll()
+        try:
+            b.perf_buffer_poll()
+        except KeyboardInterrupt:
+            exit()
diff --git a/tools/tcpstates.py b/tools/tcpstates.py
index 381a6d5..736de97 100755
--- a/tools/tcpstates.py
+++ b/tools/tcpstates.py
@@ -1,4 +1,5 @@
-#!/usr/bin/python
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpstates   Trace the TCP session state changes with durations.
@@ -20,7 +21,8 @@
 from socket import inet_ntop, AF_INET, AF_INET6
 from struct import pack
 import ctypes as ct
-from time import strftime
+from time import strftime, time
+from os import getuid
 
 # arguments
 examples = """examples:
@@ -29,6 +31,7 @@
     ./tcpstates -T        # include time column (HH:MM:SS)
     ./tcpstates -w        # wider colums (fit IPv6)
     ./tcpstates -stT      # csv output, with times & timestamps
+    ./tcpstates -Y        # log events to the systemd journal
     ./tcpstates -L 80     # only trace local port 80
     ./tcpstates -L 80,81  # only trace local ports 80 and 81
     ./tcpstates -D 80     # only trace remote port 80
@@ -51,6 +54,8 @@
     help="comma-separated list of remote ports to trace.")
 parser.add_argument("--ebpf", action="store_true",
     help=argparse.SUPPRESS)
+parser.add_argument("-Y", "--journal", action="store_true",
+    help="log session state changes to the systemd journal")
 args = parser.parse_args()
 debug = 0
 
@@ -237,6 +242,14 @@
     header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
     format_string = "%x,%d,%s,%s,%s,%s,%s,%d,%s,%s,%.3f"
 
+if args.journal:
+    try:
+        from systemd import journal
+    except ImportError:
+        print("ERROR: Journal logging requires the systemd.journal module")
+        exit(1)
+
+
 def tcpstate2str(state):
     # from include/net/tcp_states.h:
     tcpstate = {
@@ -259,6 +272,44 @@
     else:
         return str(state)
 
+def journal_fields(event, addr_family):
+    addr_pfx, saddr, daddr = 'IPV6', event.saddr, event.daddr
+    if addr_family != AF_INET6:  # IPv4 only: pack as print_ipv4_event does
+        addr_pfx, saddr, daddr = 'IPV4', pack("I", saddr), pack("I", daddr)
+
+    fields = {
+        # Standard fields described in systemd.journal-fields(7). journal.send
+        # will fill in CODE_LINE, CODE_FILE, and CODE_FUNC for us. If we're
+        # root and specify OBJECT_PID, systemd-journald will add other OBJECT_*
+        # fields for us.
+        'SYSLOG_IDENTIFIER': 'tcpstates',
+        'PRIORITY': 5,
+        '_SOURCE_REALTIME_TIMESTAMP': time() * 1000000,
+        'OBJECT_PID': str(event.pid),
+        'OBJECT_COMM': event.task.decode('utf-8', 'replace'),
+        # Custom fields, aka "stuff we sort of made up".
+        'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, saddr),
+        'OBJECT_TCP_SOURCE_PORT': str(event.ports >> 32),
+        'OBJECT_' + addr_pfx + '_DESTINATION_ADDRESS': inet_ntop(addr_family, daddr),
+        'OBJECT_TCP_DESTINATION_PORT': str(event.ports & 0xffffffff),
+        'OBJECT_TCP_OLD_STATE': tcpstate2str(event.oldstate),
+        'OBJECT_TCP_NEW_STATE': tcpstate2str(event.newstate),
+        'OBJECT_TCP_SPAN_TIME': str(event.span_us)
+        }
+
+    msg_format_string = (u"%(OBJECT_COMM)s " +
+        u"%(OBJECT_" + addr_pfx + "_SOURCE_ADDRESS)s " +
+        u"%(OBJECT_TCP_SOURCE_PORT)s → " +
+        u"%(OBJECT_" + addr_pfx + "_DESTINATION_ADDRESS)s " +
+        u"%(OBJECT_TCP_DESTINATION_PORT)s " +
+        u"%(OBJECT_TCP_OLD_STATE)s → %(OBJECT_TCP_NEW_STATE)s")
+    fields['MESSAGE'] = msg_format_string % (fields)
+
+    if getuid() == 0:
+        del fields['OBJECT_COMM'] # Handled by systemd-journald
+
+    return fields
+
 # process event
 def print_ipv4_event(cpu, data, size):
     event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
@@ -282,6 +333,8 @@
         inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
         tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
         float(event.span_us) / 1000))
+    if args.journal:
+        journal.send(**journal_fields(event, AF_INET))
 
 def print_ipv6_event(cpu, data, size):
     event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
@@ -305,6 +358,8 @@
         inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff,
         tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
         float(event.span_us) / 1000))
+    if args.journal:
+        journal.send(**journal_fields(event, AF_INET6))
 
 # initialize BPF
 b = BPF(text=bpf_text)
@@ -331,4 +386,7 @@
 b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
 b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/tcpstates_example.txt b/tools/tcpstates_example.txt
index aca857a..05df8b6 100644
--- a/tools/tcpstates_example.txt
+++ b/tools/tcpstates_example.txt
@@ -26,7 +26,8 @@
 USAGE:
 
 # tcpstates -h
-usage: tcpstates [-h] [-T] [-t] [-w] [-s] [-L LOCALPORT] [-D REMOTEPORT]
+usage: tcpstates.py [-h] [-T] [-t] [-w] [-s] [-L LOCALPORT] [-D REMOTEPORT]
+                    [-Y]
 
 Trace TCP session state changes and durations
 
@@ -40,6 +41,7 @@
                         comma-separated list of local ports to trace.
   -D REMOTEPORT, --remoteport REMOTEPORT
                         comma-separated list of remote ports to trace.
+  -Y, --journal         log session state changes to the systemd journal
 
 examples:
     ./tcpstates           # trace all TCP state changes
@@ -47,6 +49,7 @@
     ./tcpstates -T        # include time column (HH:MM:SS)
     ./tcpstates -w        # wider colums (fit IPv6)
     ./tcpstates -stT      # csv output, with times & timestamps
+    ./tcpstates -Y        # log events to the systemd journal
     ./tcpstates -L 80     # only trace local port 80
     ./tcpstates -L 80,81  # only trace local ports 80 and 81
     ./tcpstates -D 80     # only trace remote port 80
diff --git a/tools/tcpsubnet.py b/tools/tcpsubnet.py
index 5f2a806..bf944e1 100755
--- a/tools/tcpsubnet.py
+++ b/tools/tcpsubnet.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcpsubnet   Summarize TCP bytes sent to different subnets.
diff --git a/tools/tcptop.py b/tools/tcptop.py
index e1eb241..a8451d2 100755
--- a/tools/tcptop.py
+++ b/tools/tcptop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # tcptop    Summarize TCP send/recv throughput by host.
diff --git a/tools/tcptracer.py b/tools/tcptracer.py
index 16bb4b1..8f272eb 100755
--- a/tools/tcptracer.py
+++ b/tools/tcptracer.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # tcpv4tracer   Trace TCP connections.
 #               For Linux, uses BCC, eBPF. Embedded C.
@@ -662,4 +662,7 @@
 b["tcp_ipv4_event"].open_perf_buffer(print_ipv4_event)
 b["tcp_ipv6_event"].open_perf_buffer(print_ipv6_event)
 while True:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/trace.py b/tools/trace.py
index 2233305..e1845da 100755
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -216,17 +216,17 @@
         }
 
         aliases_indarg = {
-                "arg1": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM1(ctx);"
+                "arg1": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);"
                         "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM1(_ctx))); _val;})",
-                "arg2": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM2(ctx);"
+                "arg2": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM2(ctx);"
                         "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM2(_ctx))); _val;})",
-                "arg3": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM3(ctx);"
+                "arg3": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM3(ctx);"
                         "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM3(_ctx))); _val;})",
-                "arg4": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM4(ctx);"
+                "arg4": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM4(ctx);"
                         "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM4(_ctx))); _val;})",
-                "arg5": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM5(ctx);"
+                "arg5": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM5(ctx);"
                         "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM5(_ctx))); _val;})",
-                "arg6": "({u64 _val; struct pt_regs *_ctx = PT_REGS_PARM6(ctx);"
+                "arg6": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM6(ctx);"
                         "  bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM6(_ctx))); _val;})",
         }
 
diff --git a/tools/ttysnoop.py b/tools/ttysnoop.py
index 9780518..aa18d24 100755
--- a/tools/ttysnoop.py
+++ b/tools/ttysnoop.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # ttysnoop   Watch live output from a tty or pts device.
@@ -121,4 +121,7 @@
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/vfscount.py b/tools/vfscount.py
index 10c6b1e..285cd8b 100755
--- a/tools/vfscount.py
+++ b/tools/vfscount.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # vfscount  Count VFS calls ("vfs_*").
diff --git a/tools/vfsstat.py b/tools/vfsstat.py
index 1764c60..4a55f8c 100755
--- a/tools/vfsstat.py
+++ b/tools/vfsstat.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # vfsstat   Count some VFS calls.
diff --git a/tools/wakeuptime.py b/tools/wakeuptime.py
index 18e70e4..68e8857 100755
--- a/tools/wakeuptime.py
+++ b/tools/wakeuptime.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # wakeuptime    Summarize sleep to wakeup time by waker kernel stack
 #               For Linux, uses BCC, eBPF.
diff --git a/tools/xfsdist.py b/tools/xfsdist.py
index f409f90..1a7fdd9 100755
--- a/tools/xfsdist.py
+++ b/tools/xfsdist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # xfsdist  Summarize XFS operation latency.
diff --git a/tools/xfsslower.py b/tools/xfsslower.py
index c70721a..5b4e0a2 100755
--- a/tools/xfsslower.py
+++ b/tools/xfsslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # xfsslower  Trace slow XFS operations.
@@ -303,4 +303,7 @@
 # read events
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()
diff --git a/tools/zfsdist.py b/tools/zfsdist.py
index 6b29b99..9330739 100755
--- a/tools/zfsdist.py
+++ b/tools/zfsdist.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # zfsdist  Summarize ZFS operation latency.
diff --git a/tools/zfsslower.py b/tools/zfsslower.py
index 8ab283a..6f0382a 100755
--- a/tools/zfsslower.py
+++ b/tools/zfsslower.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 # @lint-avoid-python-3-compatibility-imports
 #
 # zfsslower  Trace slow ZFS operations.
@@ -315,4 +315,7 @@
 # read events
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
-    b.perf_buffer_poll()
+    try:
+        b.perf_buffer_poll()
+    except KeyboardInterrupt:
+        exit()