|  | #!/usr/bin/python | 
|  | # @lint-avoid-python-3-compatibility-imports | 
|  | # | 
|  | # runqlen    Summarize scheduler run queue length as a histogram. | 
|  | #            For Linux, uses BCC, eBPF. | 
|  | # | 
|  | # This counts the length of the run queue, excluding the currently running | 
|  | # thread, and shows it as a histogram. | 
|  | # | 
|  | # Also answers run queue occupancy. | 
|  | # | 
|  | # USAGE: runqlen [-h] [-T] [-O] [-C] [interval] [count] | 
|  | # | 
|  | # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is | 
|  | # a version of this tool that may work on Linux 4.6 - 4.8. | 
|  | # | 
|  | # Copyright 2016 Netflix, Inc. | 
|  | # Licensed under the Apache License, Version 2.0 (the "License") | 
|  | # | 
|  | # 12-Dec-2016   Brendan Gregg   Created this. | 
|  |  | 
|  | from __future__ import print_function | 
|  | from bcc import BPF, PerfType, PerfSWConfig | 
|  | from time import sleep, strftime | 
|  | from tempfile import NamedTemporaryFile | 
|  | from os import open, close, dup, unlink, O_WRONLY | 
|  | import argparse | 
|  |  | 
# Tunables. A non-zero debug value makes the script print the generated BPF
# program; frequency is handed to the perf event as its sample frequency.
debug = 0
frequency = 99

# Command-line interface. The epilog is printed verbatim (raw formatter)
# underneath the generated option list.
examples = """examples:
./runqlen            # summarize run queue length as a histogram
./runqlen 1 10       # print 1 second summaries, 10 times
./runqlen -T 1       # 1s summaries and timestamps
./runqlen -O         # report run queue occupancy
./runqlen -C         # show each CPU separately
"""
parser = argparse.ArgumentParser(
    epilog=examples,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Summarize scheduler run queue length as a histogram",
)

# Boolean switches.
parser.add_argument(
    "-T", "--timestamp",
    help="include timestamp on output",
    action="store_true",
)
parser.add_argument(
    "-O", "--runqocc",
    help="report run queue occupancy",
    action="store_true",
)
parser.add_argument(
    "-C", "--cpus",
    help="print output for each CPU separately",
    action="store_true",
)

# Optional positionals; both fall back to a very large number when omitted.
parser.add_argument(
    "interval",
    help="output interval, in seconds",
    default=99999999,
    nargs="?",
)
parser.add_argument(
    "count",
    help="number of outputs",
    default=99999999,
    nargs="?",
)

# Hidden switch (suppressed from --help): print the BPF program and exit.
parser.add_argument(
    "--ebpf",
    help=argparse.SUPPRESS,
    action="store_true",
)

args = parser.parse_args()

# Number of reports still to print.
countdown = int(args.count)
|  |  | 
|  | # Linux 4.15 introduced a new field runnable_weight | 
|  | # in linux_src:kernel/sched/sched.h as | 
|  | #     struct cfs_rq { | 
|  | #         struct load_weight load; | 
|  | #         unsigned long runnable_weight; | 
|  | #         unsigned int nr_running, h_nr_running; | 
|  | #         ...... | 
|  | #     } | 
|  | # and this tool needs to access nr_running to get | 
|  | # run queue length information. | 
|  | # | 
|  | # The commit which introduces cfs_rq->runnable_weight | 
|  | # field also introduces the field sched_entity->runnable_weight | 
|  | # where sched_entity is defined in linux_src:include/linux/sched.h. | 
|  | # | 
|  | # To cope with pre-4.15 and 4.15/post-4.15 releases, | 
|  | # we run a simple BPF program to detect whether | 
|  | # field sched_entity->runnable_weight exists. The existence of | 
|  | # this field should imply the existence of cfs_rq->runnable_weight. | 
|  | # | 
|  | # This will need maintenance as the relationship between these | 
|  | # two fields may change in the future. | 
|  | # | 
def check_runnable_weight_field():
    """Report whether ``struct sched_entity`` has a ``runnable_weight`` member.

    A tiny BPF program that reads ``entity->runnable_weight`` is compiled.
    While it compiles, file descriptor 2 is pointed at a scratch file so that
    compiler diagnostics from a failed build do not reach the terminal.
    Descriptor 2 is restored and the scratch file removed on every exit path.

    Returns:
        True if the probe program compiled, False if building it raised an
        exception.
    """
    # dup2 is not among the os names imported at the top of this file.
    from os import dup2

    # Probe program: it only compiles if the member exists.
    bpf_check_text = """
#include <linux/sched.h>
unsigned long dummy(struct sched_entity *entity)
{
    return entity->runnable_weight;
}
"""

    # Reserve a scratch file name; the file is only a sink for stderr output.
    tmp_file = NamedTemporaryFile(delete=False)
    tmp_file.close()

    # Keep a handle on the real stderr so it can be put back afterwards.
    old_stderr = dup(2)
    try:
        # Note: open here is os.open (imported at file level), not the builtin.
        fd = open(tmp_file.name, O_WRONLY)
        try:
            # Redirect explicitly instead of relying on the new descriptor
            # happening to be allocated as number 2.
            dup2(fd, 2)
        finally:
            close(fd)

        try:
            BPF(text=bpf_check_text)
            success_compile = True
        except Exception:
            # A failed build means the field is absent. KeyboardInterrupt and
            # SystemExit are deliberately not caught here.
            success_compile = False
    finally:
        dup2(old_stderr, 2)
        close(old_stderr)
        unlink(tmp_file.name)

    return success_compile
|  |  | 
|  |  | 
# define BPF program
#
# C source for the sampling handler, kept as a template string. It contains
# three placeholder tokens that are swapped for real C text (via str.replace)
# before the program is compiled:
#   RUNNABLE_WEIGHT_FIELD  optional struct member placed ahead of nr_running
#   STORAGE                declaration of the "dist" histogram
#   STORE                  statement(s) recording one sample into "dist"
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    RUNNABLE_WEIGHT_FIELD
    unsigned int nr_running, h_nr_running;
};

typedef struct cpu_key {
    int cpu;
    unsigned int slot;
} cpu_key_t;
STORAGE

int do_perf_event()
{
    unsigned int len = 0;
    pid_t pid = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;

    // Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
    // unstable interface and may need maintenance. Perhaps a future version
    // of BPF will support task_rq(p) or something similar as a more reliable
    // interface.
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    // Calculate run queue length by subtracting the currently running task,
    // if present. len 0 == idle, len 1 == one running task.
    if (len > 0)
        len--;

    STORE

    return 0;
}
"""
|  |  | 
# Fill in the template placeholders. STORAGE is handled before STORE, and the
# struct-layout placeholder last, mirroring the order the tokens are defined.

# Histogram declaration: keyed by (cpu, slot) with -C, by queue length without.
bpf_text = bpf_text.replace(
    'STORAGE',
    'BPF_HISTOGRAM(dist, cpu_key_t);' if args.cpus
    else 'BPF_HISTOGRAM(dist, unsigned int);')

# Statement(s) that record one sample.
bpf_text = bpf_text.replace(
    'STORE',
    ('cpu_key_t key = {.slot = len}; '
     'key.cpu = bpf_get_smp_processor_id(); '
     'dist.increment(key);') if args.cpus
    else 'dist.atomic_increment(len);')

# Only declare runnable_weight in the partial struct when the probe compile
# says the running kernel has that field.
bpf_text = bpf_text.replace(
    'RUNNABLE_WEIGHT_FIELD',
    'unsigned long runnable_weight;' if check_runnable_weight_field() else '')

# --ebpf prints the final program and stops; debug prints it and carries on.
if args.ebpf:
    print(bpf_text)
    exit()
elif debug:
    print(bpf_text)
|  |  | 
# Compile the finished program, then hook do_perf_event onto the software
# CPU-clock perf event, sampled by frequency rather than by period.
b = BPF(text=bpf_text)
b.attach_perf_event(
    fn_name="do_perf_event",
    ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.CPU_CLOCK,
    sample_freq=frequency,
    sample_period=0,
)

print("Sampling run queue length... Hit Ctrl-C to end.")
|  |  | 
# output
#
# Each pass sleeps for one interval, prints a report built from the "dist"
# table, then clears the table so the next report starts from zero. The loop
# ends after `countdown` reports, on Ctrl-C, or immediately after the first
# report when the interval is zero.


def _occupancy(idle_samples, queued_samples):
    """Return queued / (idle + queued), or 0 when there are no samples."""
    total = idle_samples + queued_samples
    if not total:
        return 0
    return float(queued_samples) / total


# Convert once, up front: a value taken from argv is a string, so testing
# args.interval directly would treat "0" as true.
interval = int(args.interval)
exiting = 0 if interval else 1
dist = b.get_table("dist")
while True:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # Ctrl-C: still print the report for the partial interval, then exit.
        exiting = 1

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")

    if args.runqocc:
        if args.cpus:
            # run queue occupancy, per-CPU summary
            # Slot 0 counts as idle; any other slot counts as queued.
            idle = {}
            queued = {}
            for k, v in dist.items():
                counts = idle if k.slot == 0 else queued
                counts[k.cpu] = counts.get(k.cpu, 0) + v.value

            # Report every CPU from 0 up to the highest one seen; CPUs with
            # no samples are shown as 0.00%.
            cpumax = max([0] + list(idle) + list(queued))
            for c in range(cpumax + 1):
                runqocc = _occupancy(idle.get(c, 0), queued.get(c, 0))
                print("runqocc, CPU %-3d %6.2f%%" % (c, 100 * runqocc))

        else:
            # run queue occupancy, system-wide summary
            idle = 0
            queued = 0
            for k, v in dist.items():
                if k.value == 0:
                    idle += v.value
                else:
                    queued += v.value
            print("runqocc: %0.2f%%" % (100 * _occupancy(idle, queued)))

    else:
        # run queue length histograms
        dist.print_linear_hist("runqlen", "cpu")

    dist.clear()

    countdown -= 1
    if exiting or countdown == 0:
        exit()