| """ NNAPI Systrace parser - tracking of call tree based on trace lines |
| |
| See contract-between-code-and-parser.txt for the |
| specification (cases in the specification are referred to with SPEC). |
| """ |
| |
| import re |
| import sys |
| from parser.naming import (subphases, translate_hidl_mark_to_nn_and_tag, |
| get_function_name_from_mark, make_tag) |
| from parser.naming import LAYER_CPU, LAYER_DRIVER, LAYER_RUNTIME, LAYER_APPLICATION |
| from parser.naming import MARKER_SWITCH, MARKER_SUBTRACT |
| from parser.naming import PHASE_EXECUTION, PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK |
| from parser.tree import SingleThreadCallTree |
| |
class AppPhase(object):
  """ Tracks the overall phase of the program (e.g. warmup vs. benchmark).

      Kept separate from the per-thread call trees so that the phase
      distinction can be propagated down to driver statistics as well.
  """
  def __init__(self):
    self.reset()

  def current(self):
    """ Return the innermost active phase, or PHASE_OVERALL when none is active. """
    return self.stack[-1] if self.stack else PHASE_OVERALL

  def push(self, phase):
    """ Enter a nested phase. """
    self.stack.append(phase)

  def pop(self):
    """ Leave the innermost phase. """
    self.stack.pop()

  def reset(self):
    """ Drop all phase state. """
    self.stack = []
| |
class Tracker(object):
  """ Class to track the stack trace of a single thread and feed it into a SingleThreadCallTree
      as well as keeping track of entry and exit times for functions.

      Exposes statistics for a single thread, transforming the call tree as needed.
      All statistics are in milliseconds.

      Layer Runtime, Phase Execution (LR_PE) is special-cased, see comment in get_stat().

      Subphases of Execution are aggregated towards the overall Execution phase as needed.
  """
  def __init__(self, tgid, is_driver, app_phase):
    self.tgid = tgid            # Thread group id (process) this thread belongs to.
    self.is_driver = is_driver  # True when this process is the driver, False for an application.
    self.app_phase = app_phase  # Shared AppPhase instance (warmup/benchmark tracking).

    # Match the trace string
    # "[NN_LA_PP]funcE1" in "B|<thread1>|[NN_LA_PP]funcE1"
    # "[NN_LC_PCO]funcC1" in "B|<thread1>|[SW][NN_LC_PCO]funcC1"
    # Only group(1), the NNTRACE tag, is consumed below; group(2) is unused.
    self.matcher = re.compile(r"B\|\d+\|.*\[([^]]+)\]\[?([^]])\]?")

    self.reset()

  def reset(self):
    """ Clear all accumulated state so the tracker can process a new run. """
    self.stats = {}  # tag ("<app_phase>_<layer>_<phase>") -> total elapsed ms.
    self.items = {}  # tag -> list of human-readable per-call entries (for print_stats()).
    self.mytree = SingleThreadCallTree()
    # "<app_phase>::<function>" -> list of [begin_ms, end_ms] pairs.
    self.begins_and_ends_ms = {}
    # app phase -> count of Application-layer, Execution-phase tracepoints seen.
    self.la_pe_counts = {}
    self.debugstring = "\n"

  def handle_mark(self, time, mark):
    """ Handle a single trace item (scoped entry and exit).
        Translates:
        - Automatically generated HIDL traces into NNTRACE layers and phases
        - SPEC:Switch phase during function into dummy items
        - SPEC:Subtracting time when nesting is violated into "subtract"
          markers
        - CPU/Driver layer distinction based on whether the process is the
          driver or an application
        This function is called multiple times for a single application run,
        afterwards the statistics can be calculated.

        time: timestamp of the trace line (seconds, converted to ms on exit).
        mark: trace payload, e.g. "B|<tgid>|[NN_LA_PP]func" or "E".
    """
    if mark[0] == "B":
      # Scope entry ("B|<tgid>|<name>").
      switch = False
      subtract = False
      # Workarounds for wrong tracepoints in early versions
      # TODO(mikie): remove later
      if ("ANeuralNetworksEvent_free" in mark) or ("ANeuralNetworksExecution_free" in mark):
        mark = mark.replace("_PT", "_PE")
      # Workarounds for trace marker for getSupportedExtensions (fixed in ag/9484333)
      if ("getSupportedExtensions" in mark):
        mark = mark.replace("_PC", "_PI")
      elif ("[SW][NN_LA_PR]executeWithCompilation" in mark):
        mark = mark.replace("[SW]", "")
      if MARKER_SWITCH in mark:
        switch = True
      if MARKER_SUBTRACT in mark:
        subtract = True
      if switch:
        # End previous item
        self.handle_mark(time, "E")
        # Push a placeholder item that will get popped by the 'real' end of the
        # previous item.
        self.mytree.push_dummy(time)
      m = self.matcher.search(mark)
      if m is None:
        # Not an explicit NNTRACE mark; try to map an auto-generated HIDL
        # trace onto the corresponding NN layer and phase.
        tag = translate_hidl_mark_to_nn_and_tag(mark)
        if tag is None:
          raise Exception("Couldn't parse mark " + mark)
      else:
        tag = m.group(1)
      # Tag format is "NN_<layer>_<phase>".
      [_, layer, phase] = tag.split("_")
      if layer == LAYER_APPLICATION and phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
        self.app_phase.push(phase)
      # The same code can run in either process; rewrite the layer according
      # to whether this process is the driver or an application.
      if not self.is_driver:
        layer = layer.replace(LAYER_DRIVER, LAYER_CPU)
      else:
        layer = layer.replace(LAYER_CPU, LAYER_DRIVER)
      if layer == LAYER_APPLICATION and phase == PHASE_EXECUTION:
        self.la_pe_counts[self.app_phase.current()] = (
            self.la_pe_counts.get(self.app_phase.current(), 0) + 1)
      self.mytree.push(time, mark, layer, phase, self.app_phase.current(), subtract)
    elif mark[0] == "E":
      # Scope exit: pops the innermost open node.
      try:
        node = self.mytree.pop(time)
        if node.is_dummy():  # Placeholder item
          pass
        else:
          if node.layer == LAYER_APPLICATION and node.phase in [PHASE_WARMUP, PHASE_BENCHMARK]:
            self.app_phase.pop()
          # Record the begin/end wall-clock times (in ms), keyed by
          # "<app_phase>::<function>" for get_begins()/get_ends().
          function = node.app_phase + "::" + get_function_name_from_mark(node.mark)
          self.begins_and_ends_ms[function] = (self.begins_and_ends_ms.get(function, []) +
                                               [[float(node.start_time_s) * 1000.0,
                                                 float(node.end_time_s) * 1000.0]])
      except IndexError as e:
        # More "E" than "B" marks: the trace was truncated mid-run.
        raise Exception("Unable to process a trace termination mark, please check that the collected trace are including full application lifecycles.\n") from e

  def is_complete(self):
    """ Checks if we've seen all end tracepoints for the begin tracepoints.
    """
    return self.mytree.current.is_root()

  def calculate_stats(self):
    """ Transform the call tree and aggregate elapsed times into self.stats
        and self.items. Only valid once all marks have been handled, i.e.
        is_complete() is True.
    """
    assert self.is_complete()
    self.mytree.remove_ignored()
    self.mytree.remove_dummies()
    self.mytree.copy_subtracted_init_and_wrong_la()
    self.mytree.add_missing_la_nodes()
    # self.mytree.print()
    self.mytree.validate_nesting()

    def recurse(node, prev_layer, prev_phase, indent, in_pe_layers):
      """ Depth-first walk accumulating per-tag statistics.
          in_pe_layers counts, per layer, how deeply we are nested inside an
          Execution-phase node, so Execution subphases are not double-counted
          into the overall Execution tag.
      """
      # NOTE(review): begun and time are assigned but not referenced below;
      # presumably kept for debugging.
      [begun, mark, layer, phase] = [
          node.start_time_s, node.mark, node.layer, node.phase()]
      time = node.end_time_s
      tag = None
      elapsed0 = "DETAIL"  # Placeholder shown for added-detail/subtract nodes.
      elapsed1 = node.elapsed_less_subtracted_ms()
      if elapsed1 is None:
        raise Exception("Elapsed for {} returned None".format(node.to_str()))

      if not node.is_added_detail() and not node.subtract:
        tag = node.app_phase + "_" + layer + "_" + phase
        elapsed0 = elapsed1
        self.stats[tag] = self.stats.get(tag, 0.0) + elapsed0
        self.items[tag] = self.items.get(tag, []) + [
            mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + tag]
        if phase in subphases[PHASE_EXECUTION]:
          # Aggregate an Execution subphase into the overall Execution tag,
          # unless already inside an Execution node for this layer (which
          # would double-count the time).
          if not in_pe_layers.get(layer):
            pe_tag = node.app_phase + "_" + make_tag(layer, PHASE_EXECUTION)
            self.stats[pe_tag] = self.stats.get(pe_tag, 0.0) + elapsed0
            self.items[pe_tag] = self.items.get(pe_tag, []) + [
                mark + " " + str(elapsed0) + " " + str(elapsed1) + " " + pe_tag]
      if phase == PHASE_EXECUTION:
        in_pe_layers[layer] = in_pe_layers.get(layer, 0) + 1
      for c in node.children:
        recurse(c, layer or prev_layer, phase or prev_phase,
                indent + " ", in_pe_layers)
      if phase == PHASE_EXECUTION:
        in_pe_layers[layer] = in_pe_layers[layer] - 1
      return

    for top in self.mytree.root.children:
      recurse(top, None, None, "", {})
    self.debugstring = self.mytree.to_str()

  # We need to special case the driver execution time because:
  # - The existing drivers don't have tracing, so we rely on HIDL traces
  # - Best we can do is to take the start of the HIDL server side call as
  #   the starting point (which includes a bit of overhead, but not much) and
  #   the start of the callback as the end point (which should be pretty
  #   accurate)
  # Note that the begin and end may be on different threads, hence the
  # calculation needs to happen in aggregation rather than here.
  def get_ld_pe_begins(self, app_phase):
    """ Begin timestamps (ms) for driver-side execution, see comment above. """
    return self.get_begins(app_phase, "HIDL::IPreparedModel::execute::server")

  def get_ld_pe_ends(self, app_phase):
    """ End timestamps (ms) for driver-side execution: deliberately the
        *begin* of the completion callback, see comment above.
    """
    return self.get_begins(app_phase, "HIDL::IExecutionCallback::notify::client")

  def get_stat(self, tag, app_phase, special_case_pe=True):
    """ Return accumulated time (ms) for the given tag within app_phase.

        Runtime Execution (LR_PE) is special-cased when special_case_pe is
        True; see the inline comment below. Lazily triggers
        calculate_stats() on first use.
    """
    if not self.stats and not self.mytree.is_empty():
      self.calculate_stats()
    if tag == make_tag(LAYER_RUNTIME, PHASE_EXECUTION) and special_case_pe:
      # Execution is exposed as an asynchronous event from the runtime, we
      # calculate the runtime time as starting from when the async operation is
      # kicked off until wait finishes + synchronous setup and teardown calls.
      # This has two limitations:
      # - multithreaded usage will not work correctly
      # - should the application spend so much time before calling wait that
      #   execution has already finished, the time would get allocated to the
      #   runtime incorrectly
      async_starts = self.get_begins(app_phase, "ANeuralNetworksExecution_startCompute")
      async_ends = self.get_ends(app_phase, "ANeuralNetworksEvent_wait")
      elapsed = 0.0
      # NOTE(review): assumes starts/ends pair up one-to-one in order, which
      # holds only for single-threaded, non-overlapping executions.
      for i in range(0, len(async_starts)):
        elapsed = elapsed + (async_ends[i] - async_starts[i])
      for sync in ["ANeuralNetworksExecution_create", "ANeuralNetworksExecution_free",
                   "ANeuralNetworksEvent_create", "ANeuralNetworksEvent_free",
                   "ANeuralNetworksExecution_setInput", "ANeuralNetworksExecution_setOutput",
                   "ANeuralNetworksExecution_setInputFromMemory",
                   "ANeuralNetworksExecution_setOutputFromMemory"]:
        sync_starts = self.get_begins(app_phase, sync)
        sync_ends = self.get_ends(app_phase, sync)
        for i in range(0, len(sync_starts)):
          elapsed = elapsed + (sync_ends[i] - sync_starts[i])
      return elapsed
    return self.stats.get(app_phase + "_" + tag, 0.0)

  def get_execution_count(self, app_phase):
    """ Number of executions observed in the given app phase. """
    # ANeuralNetworksExecution_create is reliable and comes from the runtime,
    # but not available pre-P
    count = len(self.get_begins(app_phase, "ANeuralNetworksExecution_create"))
    if count > 0:
      return count
    # Application may have added tracepoints
    return self.la_pe_counts.get(app_phase, 0)

  def get_begins(self, app_phase, function):
    """ All begin timestamps (ms) recorded for app_phase::function. """
    name = app_phase + "::" + function
    return [begin_and_end[0] for begin_and_end in self.begins_and_ends_ms.get(name, [])]

  def get_ends(self, app_phase, function):
    """ All end timestamps (ms) recorded for app_phase::function. """
    name = app_phase + "::" + function
    return [begin_and_end[1] for begin_and_end in self.begins_and_ends_ms.get(name, [])]

  def print_stats(self):
    """ Print per-tag statistics, calculating them first if needed. """
    if not self.stats:
      self.calculate_stats()
    print(self.tgid, "Driver" if self.is_driver else "App")
    for tag in self.stats:
      print(tag, self.stats[tag])
      if self.items.get(tag):
        for item in self.items[tag]:
          print(" ", item)
      else:
        # Tag was only produced by subphase aggregation, no direct entries.
        print(" ", "calculated only")

  def print(self):
    """ Print the raw call tree, for debugging. """
    self.mytree.print()