|  | #!/usr/bin/python | 
|  | # @lint-avoid-python-3-compatibility-imports | 
|  | # | 
|  | # drsnoop  Trace direct reclaim and print details including issuing PID. | 
|  | #       For Linux, uses BCC, eBPF. | 
|  | # | 
# This uses an in-kernel eBPF map to cache process details (PID and comm)
# when direct reclaim begins, along with a starting timestamp used to
# calculate latency when it ends.
|  | # | 
|  | # Copyright (c) 2019 Ethercflow | 
|  | # Licensed under the Apache License, Version 2.0 (the "License") | 
|  | # | 
|  | # 20-Feb-2019   Ethercflow   Created this. | 
|  | # 09-Mar-2019   Ethercflow   Updated for show sys mem info. | 
|  |  | 
|  | from __future__ import print_function | 
|  | from bcc import ArgString, BPF | 
|  | import argparse | 
|  | from datetime import datetime, timedelta | 
|  | import os | 
|  | import math | 
|  |  | 
# symbols
# Kernel symbol table; scanned later to find the VM counter array address.
kallsyms = "/proc/kallsyms"

# arguments
# Usage examples, shown verbatim at the end of --help (the parser uses
# RawDescriptionHelpFormatter so the layout is preserved).
# NOTE: the PID example uses "-p", matching the option defined below.
examples = """examples:
./drsnoop           # trace all direct reclaim
./drsnoop -T        # include timestamps
./drsnoop -U        # include UID
./drsnoop -p 181    # only trace PID 181
./drsnoop -t 123    # only trace TID 123
./drsnoop -u 1000   # only trace UID 1000
./drsnoop -d 10     # trace for 10 seconds only
./drsnoop -n main   # only print process names containing "main"
"""
parser = argparse.ArgumentParser(
    description="Trace direct reclaim",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
                    help="include timestamp on output")
parser.add_argument("-U", "--print-uid", action="store_true",
                    help="print UID column")
# pid/tid/uid are kept as strings: they are substituted textually into the
# BPF program, and an unset option must stay distinguishable from "0".
parser.add_argument("-p", "--pid",
                    help="trace this PID only")
parser.add_argument("-t", "--tid",
                    help="trace this TID only")
parser.add_argument("-u", "--uid",
                    help="trace this UID only")
parser.add_argument("-d", "--duration",
                    help="total duration of trace in seconds")
parser.add_argument("-n", "--name",
                    type=ArgString,
                    help="only print process names containing this name")
parser.add_argument("-v", "--verbose", action="store_true",
                    help="show system memory state")
# Hidden option: print the generated BPF program and exit.
parser.add_argument("--ebpf", action="store_true",
                    help=argparse.SUPPRESS)
args = parser.parse_args()
# Set to a non-zero value to print the generated BPF program before loading.
debug = 0
if args.duration:
    # Convert the seconds given on the command line into a timedelta so it
    # can be compared directly against elapsed wall-clock time.
    args.duration = timedelta(seconds=int(args.duration))
|  |  | 
|  |  | 
# vm_stat
# Find the address of the kernel's VM counter array by scanning the symbol
# table for "vm_stat" or "vm_zone_stat", whichever appears first. The address
# is kept as a "0x..."-prefixed string because it is pasted into the BPF
# program text.
vm_stat_addr = ''
with open(kallsyms) as syms:
    for line in syms:
        # Three space-separated fields; the last one may carry a
        # tab-separated suffix, which is dropped.
        (addr, size, name) = line.rstrip().split(" ", 2)
        name = name.split("\t")[0]
        if name in ("vm_stat", "vm_zone_stat"):
            vm_stat_addr = "0x" + addr
            break
if vm_stat_addr == '':
    print("ERROR: no vm_stat or vm_zone_stat in /proc/kallsyms. Exiting.")
    print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.")
    # Report the failure to the caller; a bare exit() would return status 0.
    exit(1)
|  |  | 
# Index of the free-page counter inside the captured vm_stat array.
NR_FREE_PAGES = 0

PAGE_SIZE = os.sysconf("SC_PAGE_SIZE")
# log2(PAGE_SIZE) computed with exact integer arithmetic; dividing two float
# logarithms and truncating can land one below the true value.
PAGE_SHIFT = PAGE_SIZE.bit_length() - 1


def K(x):
    """Convert a count of pages into kilobytes.

    x: number of pages (int).
    Returns x shifted left by (PAGE_SHIFT - 10), i.e. x * PAGE_SIZE / 1024.
    """
    return x << (PAGE_SHIFT - 10)
|  |  | 
# load BPF program
#
# Two tracepoint handlers cooperate through the `start` hash, which is keyed
# by the 64-bit value returned by bpf_get_current_pid_tgid():
#   * vmscan:mm_vmscan_direct_reclaim_begin stores the id, a start timestamp,
#     the task comm and a copy of the counters read from the vm_stat address.
#   * vmscan:mm_vmscan_direct_reclaim_end looks that entry up, computes the
#     elapsed time, submits a data_t record on the `events` perf buffer and
#     deletes the entry. If no entry exists the event is silently skipped.
# PID_TID_FILTER and UID_FILTER are placeholders that are replaced textually
# further down, before the program is compiled. The single %s is filled with
# vm_stat_addr.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/mmzone.h>

struct val_t {
u64 id;
u64 ts; // start time
char name[TASK_COMM_LEN];
u64 vm_stat[NR_VM_ZONE_STAT_ITEMS];
};

struct data_t {
u64 id;
u32 uid;
u64 nr_reclaimed;
u64 delta;
u64 ts;    // end time
char name[TASK_COMM_LEN];
u64 vm_stat[NR_VM_ZONE_STAT_ITEMS];
};

BPF_HASH(start, u64, struct val_t);
BPF_PERF_OUTPUT(events);

TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_begin) {
struct val_t val = {};
u64 id = bpf_get_current_pid_tgid();
u32 pid = id >> 32; // PID is higher part
u32 tid = id;       // Cast and get the lower part
u32 uid = bpf_get_current_uid_gid();
u64 ts;

PID_TID_FILTER
UID_FILTER
if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
val.id = id;
val.ts = bpf_ktime_get_ns();
bpf_probe_read(&val.vm_stat, sizeof(val.vm_stat), (const void *)%s);
start.update(&id, &val);
}
return 0;
}

TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_end) {
u64 id = bpf_get_current_pid_tgid();
struct val_t *valp;
struct data_t data = {};
u64 ts = bpf_ktime_get_ns();

valp = start.lookup(&id);
if (valp == NULL) {
// missed entry
return 0;
}

data.delta = ts - valp->ts;
data.ts = ts / 1000;
data.id = valp->id;
data.uid = bpf_get_current_uid_gid();
bpf_probe_read(&data.name, sizeof(data.name), valp->name);
bpf_probe_read(&data.vm_stat, sizeof(data.vm_stat), valp->vm_stat);
data.nr_reclaimed = args->nr_reclaimed;

events.perf_submit(args, &data, sizeof(data));
start.delete(&id);

return 0;
}
""" % vm_stat_addr
|  |  | 
def _id_filter(c_var, value, option):
    """Build the C guard that drops events whose id differs from `value`.

    c_var:  name of the C variable to compare ('pid', 'tid' or 'uid').
    value:  the id as given on the command line (string).
    option: option name used in the error message.

    The value is spliced into C source, so it is validated as an integer
    first; invalid input ends the program through parser.error() instead of
    surfacing later as a BPF compile failure.
    """
    try:
        number = int(value)
    except ValueError:
        parser.error("%s expects an integer, got %r" % (option, value))
    return 'if (%s != %d) { return 0; }' % (c_var, number)


# Substitute the filter placeholders in the BPF program text. The options are
# tested as strings on purpose, so that an explicit "0" (e.g. -u 0) still
# enables its filter.
if args.tid:  # TID trumps PID
    bpf_text = bpf_text.replace('PID_TID_FILTER',
                                _id_filter('tid', args.tid, '-t/--tid'))
elif args.pid:
    bpf_text = bpf_text.replace('PID_TID_FILTER',
                                _id_filter('pid', args.pid, '-p/--pid'))
else:
    bpf_text = bpf_text.replace('PID_TID_FILTER', '')
if args.uid:
    bpf_text = bpf_text.replace('UID_FILTER',
                                _id_filter('uid', args.uid, '-u/--uid'))
else:
    bpf_text = bpf_text.replace('UID_FILTER', '')
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        # --ebpf only dumps the generated program; nothing is loaded.
        exit()
|  |  | 
# initialize BPF
b = BPF(text=bpf_text)

# Timestamp of the first event received; later events are printed relative
# to it. Zero means "no event seen yet".
initial_ts = 0

# header
# Assemble the optional leading columns, the fixed columns and the optional
# trailing column, then emit the whole header line in one go.
header_parts = []
if args.timestamp:
    header_parts.append("%-14s" % ("TIME(s)"))
if args.print_uid:
    header_parts.append("%-6s" % ("UID"))
id_label = "TID" if args.tid else "PID"
header_parts.append("%-14s %-6s %8s %5s" %
                    ("COMM", id_label, "LAT(ms)", "PAGES"))
if args.verbose:
    header_parts.append("%10s" % ("FREE(KB)"))
print("".join(header_parts))
|  |  | 
# process event
def print_event(cpu, data, size):
    """Perf-buffer callback: print one output row for a finished reclaim.

    cpu, size: supplied by the perf buffer; not used here.
    data:      raw record, decoded through the "events" table.

    The first event received fixes the time origin for the TIME(s) column,
    even if that event is then hidden by the -n name filter.
    """
    global initial_ts

    event = b["events"].event(data)

    if not initial_ts:
        initial_ts = event.ts

    # Name filter: skip events whose comm does not contain the given bytes.
    if args.name and bytes(args.name) not in event.name:
        return

    columns = []

    if args.timestamp:
        # event.ts is in microseconds (the BPF side divides ns by 1000).
        elapsed_us = event.ts - initial_ts
        columns.append("%-14.9f" % (float(elapsed_us) / 1000000))

    if args.print_uid:
        columns.append("%-6d" % event.uid)

    # The 64-bit id packs the PID in the upper half and the TID in the lower.
    if args.tid:
        shown_id = event.id & 0xffffffff
    else:
        shown_id = event.id >> 32
    comm = event.name.decode('utf-8', 'replace')
    latency_ms = float(event.delta) / 1000000
    columns.append("%-14.14s %-6s %8.2f %5d" %
                   (comm, shown_id, latency_ms, event.nr_reclaimed))

    if args.verbose:
        columns.append("%10d" % K(event.vm_stat[NR_FREE_PAGES]))

    print("".join(columns))
|  |  | 
|  |  | 
# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
while True:
    if args.duration:
        # Bound each poll by the time left. An untimed poll blocks until the
        # next event arrives, so on an idle system -d would never expire.
        remaining = args.duration - (datetime.now() - start_time)
        if remaining.total_seconds() <= 0:
            break
        # Timeout is in milliseconds; keep it >= 1 to avoid a busy spin
        # during the final sub-millisecond.
        poll_timeout = max(1, int(remaining.total_seconds() * 1000))
    else:
        poll_timeout = -1  # no duration given: block until an event arrives
    try:
        b.perf_buffer_poll(timeout=poll_timeout)
    except KeyboardInterrupt:
        exit()