| """ NNAPI Systrace parser - tracking of call tree based on trace lines |
| |
| See contract-between-code-and-parser.txt for the |
| specification (cases in the specification are referred to with SPEC). |
| """ |
| |
| import re |
| import sys |
| from parser.naming import (subphases, translate_hidl_mark_to_nn_and_tag, |
| get_function_name_from_mark, make_tag) |
| from parser.naming import LAYER_CPU, LAYER_DRIVER, LAYER_RUNTIME, LAYER_APPLICATION |
| from parser.naming import MARKER_SWITCH, MARKER_SUBTRACT |
| from parser.naming import PHASE_EXECUTION, PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK |
| from parser.tree import SingleThreadCallTree |
| |
class AppPhase(object):
  """ Tracks the overall phase of the program (e.g. warmup vs. benchmark).

      Kept separate from the per-thread call trees so that the phase
      distinction can be propagated down to driver statistics as well.
  """
  def __init__(self):
    self.reset()

  def current(self):
    """ Return the innermost active phase, or PHASE_OVERALL when none is active. """
    return self.stack[-1] if self.stack else PHASE_OVERALL

  def push(self, phase):
    """ Enter a nested phase. """
    self.stack.append(phase)

  def pop(self):
    """ Leave the innermost phase. """
    self.stack.pop()

  def reset(self):
    """ Drop all phase state. """
    self.stack = []
| |
class Tracker(object):
  """ Class to track the stack trace of a single thread and feed it into a SingleThreadCallTree
      as well as keeping track of entry and exit times for functions.

      Exposes statistics for a single thread, transforming the call tree as needed.
      All statistics are in milliseconds.

      Layer Runtime, Phase Execution (LR_PE) is special-cased, see comment in get_stat().

      Subphases of Execution are aggregated towards the overall Execution phase as needed.
  """
  def __init__(self, tgid, is_driver, app_phase):
    self.tgid = tgid            # Thread group id (process) this thread belongs to.
    self.is_driver = is_driver  # True when this process is the driver, False for an application.
    self.app_phase = app_phase  # Shared AppPhase instance (warmup/benchmark tracking).

    # Match the trace string
    # "[NN_LA_PP]funcE1" in "B|<thread1>|[NN_LA_PP]funcE1"
    # "[NN_LC_PCO]funcC1" in "B|<thread1>|[SW][NN_LC_PCO]funcC1"
    # Only group(1), the NNTRACE tag, is consumed below; group(2) is unused.
    self.matcher = re.compile(r"B\|\d+\|.*\[([^]]+)\]\[?([^]])\]?")

    self.reset()

  def reset(self):
    """ Clear all accumulated state so the tracker can process a new run. """
    self.stats = {}  # tag ("<app_phase>_<layer>_<phase>") -> total elapsed ms.
    self.items = {}  # tag -> list of human-readable per-call entries (for print_stats()).
    self.mytree = SingleThreadCallTree()
    # "<app_phase>::<function>" -> list of [begin_ms, end_ms] pairs.
    self.begins_and_ends_ms = {}
    # app phase -> count of Application-layer, Execution-phase tracepoints seen.
    self.la_pe_counts = {}
    self.debugstring = "\n"

  def handle_mark(self, time, mark):
    """ Handle a single trace item (scoped entry and exit).
        Translates:
        - Automatically generated HIDL traces into NNTRACE layers and phases
        - SPEC:Switch phase during function into dummy items
        - SPEC:Subtracting time when nesting is violated into "subtract"
          markers
        - CPU/Driver layer distinction based on whether the process is the
          driver or an application
        This function is called multiple times for a single application run,
        afterwards the statistics can be calculated.

        time: timestamp of the trace line (seconds, converted to ms on exit).
        mark: trace payload, e.g. "B|<tgid>|[NN_LA_PP]func" or "E".
    """
    if mark[0] == "B":
      # Scope entry ("B|<tgid>|<name>").
      switch = False
      subtract = False
      # Workarounds for wrong tracepoints in early versions
      # TODO(mikie): remove later
      if ("ANeuralNetworksEvent_free" in mark) or ("ANeuralNetworksExecution_free" in mark):
        mark = mark.replace("_PT", "_PE")
      # Workarounds for trace marker for getSupportedExtensions (fixed in ag/9484333)
      if ("getSupportedExtensions" in mark):
        mark = mark.replace("_PC", "_PI")
      elif ("[SW][NN_LA_PR]executeWithCompilation" in mark):
        mark = mark.replace("[SW]", "")
      if MARKER_SWITCH in mark:
        switch = True
      if MARKER_SUBTRACT in mark:
        subtract = True
      if switch:
        # End previous item
        self.handle_mark(time, "E")
        # Push a placeholder item that will get popped by the 'real' end of the
        # previous item.
        self.mytree.push_dummy(time)
      m = self.matcher.search(mark)
      if m is None:
        # Not an explicit NNTRACE mark; try to map an auto-generated HIDL
        # trace onto the corresponding NN layer and phase.
        tag = translate_hidl_mark_to_nn_and_tag(mark)
        if tag is None:
          raise Exception("Couldn't parse mark " + mark)
      else:
        tag = m.group(1)
      # Tag format is "NN_<layer>_<phase>".
      [_, layer, phase] = tag.split("_")
      if layer == LAYER_APPLICATION and phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
        self.app_phase.push(phase)
      # The same code can run in either process; rewrite the layer according
      # to whether this process is the driver or an application.
      if not self.is_driver:
        layer = layer.replace(LAYER_DRIVER, LAYER_CPU)
      else:
        layer = layer.replace(LAYER_CPU, LAYER_DRIVER)
      if layer == LAYER_APPLICATION and phase == PHASE_EXECUTION:
        self.la_pe_counts[self.app_phase.current()] = (
            self.la_pe_counts.get(self.app_phase.current(), 0) + 1)
      self.mytree.push(time, mark, layer, phase, self.app_phase.current(), subtract)
    elif mark[0] == "E":
      # Scope exit: pops the innermost open node.
      try:
        node = self.mytree.pop(time)
        if node.is_dummy():  # Placeholder item
          pass
        else:
          if node.layer == LAYER_APPLICATION and node.phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
            self.app_phase.pop()
          # Record the begin/end wall-clock times (in ms), keyed by
          # "<app_phase>::<function>" for get_begins()/get_ends().
          function = node.app_phase + "::" + get_function_name_from_mark(node.mark)
          self.begins_and_ends_ms[function] = (self.begins_and_ends_ms.get(function, []) +
                                               [[float(node.start_time_s) * 1000.0,
                                                 float(node.end_time_s) * 1000.0]])
      except IndexError as e:
        # More "E" than "B" marks: the trace was truncated mid-run.
        raise Exception("Unable to process a trace termination mark, please check that the collected trace are including full application lifecycles.\n") from e

  def is_complete(self):
    """ Checks if we've seen all end tracepoints for the begin tracepoints.
    """
    return self.mytree.current.is_root()

  def calculate_stats(self):
    """ Transform the call tree and aggregate elapsed times into self.stats
        and self.items. Only valid once all marks have been handled, i.e.
        is_complete() is True.
    """
    assert self.is_complete()
    self.mytree.remove_ignored()
    self.mytree.remove_dummies()
    self.mytree.copy_subtracted_init_and_wrong_la()
    self.mytree.add_missing_la_nodes()
    # self.mytree.print()
    self.mytree.validate_nesting()

    def recurse(node, prev_layer, prev_phase, indent, in_pe_layers):
      """ Depth-first walk accumulating per-tag statistics.
          in_pe_layers counts, per layer, how deeply we are nested inside an
          Execution-phase node, so Execution subphases are not double-counted
          into the overall Execution tag.
      """
      # NOTE(review): begun and time are assigned but not referenced below;
      # presumably kept for debugging.
      [begun, mark, layer, phase] = [
          node.start_time_s, node.mark, node.layer, node.phase()]
      time = node.end_time_s
      tag = None
      elapsed0 = "DETAIL"  # Placeholder shown for added-detail/subtract nodes.
      elapsed1 = node.elapsed_less_subtracted_ms()
      if elapsed1 is None:
        raise Exception("Elapsed for {} returned None".format(node.to_str()))

      if not node.is_added_detail() and not node.subtract:
        tag = node.app_phase + "_" + layer + "_" + phase
        elapsed0 = elapsed1
        self.stats[tag] = self.stats.get(tag, 0.0) + elapsed0
        self.items[tag] = self.items.get(tag, []) + [
            mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + tag]
        if phase in subphases[PHASE_EXECUTION]:
          # Aggregate an Execution subphase into the overall Execution tag,
          # unless already inside an Execution node for this layer (which
          # would double-count the time).
          if not in_pe_layers.get(layer):
            pe_tag = node.app_phase + "_" + make_tag(layer, PHASE_EXECUTION)
            self.stats[pe_tag] = self.stats.get(pe_tag, 0.0) + elapsed0
            self.items[pe_tag] = self.items.get(pe_tag, []) + [
                mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + pe_tag]
      if phase == PHASE_EXECUTION:
        in_pe_layers[layer] = in_pe_layers.get(layer, 0) + 1
      for c in node.children:
        recurse(c, layer or prev_layer, phase or prev_phase,
                indent + " ", in_pe_layers)
      if phase == PHASE_EXECUTION:
        in_pe_layers[layer] = in_pe_layers[layer] - 1
      return

    for top in self.mytree.root.children:
      recurse(top, None, None, "", {})
    self.debugstring = self.mytree.to_str()

  # We need to special case the driver execution time because:
  # - The existing drivers don't have tracing, so we rely on HIDL traces
  # - Best we can do is to take the start of the HIDL server side call as
  #   the starting point (which includes a bit of overhead, but not much) and
  #   the start of the callback as the end point (which should be pretty
  #   accurate)
  # Note that the begin and end may be on different threads, hence the
  # calculation needs to happen in aggregation rather than here.
  def get_ld_pe_begins(self, app_phase):
    """ Begin timestamps (ms) for driver-side execution, see comment above. """
    return self.get_begins(app_phase, "HIDL::IPreparedModel::execute::server")

  def get_ld_pe_ends(self, app_phase):
    """ End timestamps (ms) for driver-side execution: deliberately the
        *begin* of the completion callback, see comment above.
    """
    return self.get_begins(app_phase, "HIDL::IExecutionCallback::notify::client")

  def get_stat(self, tag, app_phase, special_case_pe=True):
    """ Return accumulated time (ms) for the given tag within app_phase.

        Runtime Execution (LR_PE) is special-cased when special_case_pe is
        True; see the inline comment below. Lazily triggers
        calculate_stats() on first use.
    """
    if not self.stats and not self.mytree.is_empty():
      self.calculate_stats()
    if tag == make_tag(LAYER_RUNTIME, PHASE_EXECUTION) and special_case_pe:
      # Execution is exposed as an asynchronous event from the runtime, we
      # calculate the runtime time as starting from when the async operation is
      # kicked off until wait finishes + synchronous setup and teardown calls.
      # This has two limitations:
      # - multithreaded usage will not work correctly
      # - should the application spend so much time before calling wait that
      #   execution has already finished, the time would get allocated to the
      #   runtime incorrectly
      async_starts = self.get_begins(app_phase, "ANeuralNetworksExecution_startCompute")
      async_ends = self.get_ends(app_phase, "ANeuralNetworksEvent_wait")
      elapsed = 0.0
      # NOTE(review): assumes starts/ends pair up one-to-one in order, which
      # holds only for single-threaded, non-overlapping executions.
      for i in range(0, len(async_starts)):
        elapsed = elapsed + (async_ends[i] - async_starts[i])
      for sync in ["ANeuralNetworksExecution_create", "ANeuralNetworksExecution_free",
                   "ANeuralNetworksEvent_create", "ANeuralNetworksEvent_free",
                   "ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution_setOutput",
                   "ANeuralNetworksExecution_setInputFromMemory",
                   "ANeuralNetworksExecution_setOutputFromMemory"]:
        sync_starts = self.get_begins(app_phase, sync)
        sync_ends = self.get_ends(app_phase, sync)
        for i in range(0, len(sync_starts)):
          elapsed = elapsed + (sync_ends[i] - sync_starts[i])
      return elapsed
    return self.stats.get(app_phase + "_" + tag, 0.0)

  def get_execution_count(self, app_phase):
    """ Number of executions observed in the given app phase. """
    # ANeuralNetworksExecution_create is reliable and comes from the runtime,
    # but not available pre-P
    count = len(self.get_begins(app_phase, "ANeuralNetworksExecution_create"))
    if count > 0:
      return count
    # Application may have added tracepoints
    return self.la_pe_counts.get(app_phase, 0)

  def get_begins(self, app_phase, function):
    """ All begin timestamps (ms) recorded for app_phase::function. """
    name = app_phase + "::" + function
    return [begin_and_end[0] for begin_and_end in self.begins_and_ends_ms.get(name, [])]

  def get_ends(self, app_phase, function):
    """ All end timestamps (ms) recorded for app_phase::function. """
    name = app_phase + "::" + function
    return [begin_and_end[1] for begin_and_end in self.begins_and_ends_ms.get(name, [])]

  def print_stats(self):
    """ Print per-tag statistics, calculating them first if needed. """
    if not self.stats:
      self.calculate_stats()
    print(self.tgid, "Driver" if self.is_driver else "App")
    for tag in self.stats:
      print(tag, self.stats[tag])
      if self.items.get(tag):
        for item in self.items[tag]:
          print(" ", item)
      else:
        # Tag was only produced by subphase aggregation, no direct entries.
        print(" ", "calculated only")

  def print(self):
    """ Print the raw call tree, for debugging. """
    self.mytree.print()