# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""A tool to compare functions for finding anomalies to the current default.
This tool provides a way to benchmark an anomaly detection algorithm against
the current find_change_points (base) by running simulations and comparing the
results to the base results and to the existing anomalies in the datastore.
Usage:
1. Run SetupBaseDataForBench() if the base data has not been set up yet.
2. Add an implementation of find_change_points that takes
(test_entity, chart_series) arguments and returns a list of
find_change_points.ChangePoint entities.
See find_change_points_exp.RunFindChangePoints.
3. Add that function to _EXPERIMENTAL_FUNCTIONS under a new key name.
4. Call BenchFindChangePoints(name, description) to add a bench job, where
   name is one of the keys in _EXPERIMENTAL_FUNCTIONS. The (name, description)
   pair must be unique for each run. The bench results are logged in
   quick_logger at:
chromeperf.appspot.com/get_logs?namespace=bench_find_anomalies&name=report
If you want to clear the base data, you can run DeleteAllTestBenchEntities().
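For example, steps 2-4 might look like this (a sketch; the function name, key
name, and threshold value below are illustrative only):
  def MyFindChangePoints(test, series):
    # Delegate to the shared experimental runner with a custom threshold.
    return find_change_points_exp.RunFindChangePoints(
        test, series, min_steppiness=0.45)
  _EXPERIMENTAL_FUNCTIONS['steppiness_0_45'] = MyFindChangePoints
  BenchFindChangePoints('steppiness_0_45', 'Try steppiness threshold 0.45')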
Results:
Invalid alerts: Number of change points found by the experimental function
which correspond to invalid alerts, over total invalid alerts.
Confirmed alerts: Number of change points found by the experimental function
which correspond to alerts the sheriff filed a bug for, over the total
number of alerts with bug ID.
New alerts: Number of alerts found by the experimental function that the base
find_change_points algorithm did not find.
Total alerts: Total change points found by the experimental function,
over the total number of base alerts.
"""
import logging
from pipeline import common as pipeline_common
from pipeline import pipeline
from google.appengine.api import app_identity
from google.appengine.ext import deferred
from google.appengine.ext import ndb
from dashboard import debug_alert
from dashboard import find_change_points
from dashboard import find_change_points_exp
from dashboard import layered_cache
from dashboard import quick_logger
from dashboard import utils
from dashboard.models import anomaly
from dashboard.models import anomaly_config
from dashboard.models import graph_data
_TASK_QUEUE_NAME = 'find-anomalies-bench-queue'
_FIND_ANOMALIES_BENCH_CACHE_KEY = 'find-anomalies-bench'
# Maps a bench name to an allowed find-anomalies function to benchmark.
_EXPERIMENTAL_FUNCTIONS = {
'find_change_points_default': find_change_points_exp.RunFindChangePoints,
'steppiness_0_3': lambda test, series:
find_change_points_exp.RunFindChangePoints(
test, series, min_steppiness=0.3),
'steppiness_0_4': lambda test, series:
find_change_points_exp.RunFindChangePoints(
test, series, min_steppiness=0.4),
'steppiness_0_5': lambda test, series:
find_change_points_exp.RunFindChangePoints(
test, series, min_steppiness=0.5),
'steppiness_0_6': lambda test, series:
find_change_points_exp.RunFindChangePoints(
test, series, min_steppiness=0.6),
}
_TEST_DATA_SHERIFF = 'Chromium Perf Sheriff'
# 1000 tests and 300 rows take about 3 hours to run SetupBaseDataForBench.
_NUM_TEST_TO_BENCH = 3000
# 250 rows takes about 5 minutes to run find_change_points per task queue task.
# (The App Engine task queue deadline is 10 minutes.)
_NUM_ROWS_TO_BENCH = 300
# Number of points on each side of an anomaly to include in the window of
# revisions recorded around it. If an anomaly's end revision falls within
# another anomaly's window, they are considered the same anomaly.
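# For example, if an anomaly's window covers revisions 196 through 204, any
# change point whose end revision falls in that range is counted as matching
# that anomaly rather than as a new alert.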
_MAX_SEGMENT_SIZE_AROUND_ANOMALY = 4
_REPORT_TEMPLATE = """%(bench_name)s: %(description)s
Invalid alerts: %(invalid_alerts)s
Confirmed alerts: %(confirmed_alerts)s
New alerts: %(new_alerts)s
Total alerts: %(total_alerts)s
"Unconfirmed" alerts, i.e. "valid" alerts that were not found by
the experimental function:
%(unconfirmed_alert_links)s
"Extra" alerts, i.e. new alerts found by the experimental function
that weren't found before:
%(extra_alert_links)s
"""
class TestBench(ndb.Model):
"""Reference anomaly data for one Test."""
# Test key.
test = ndb.KeyProperty()
# List of tuples of (x_value, y_value) for test.
data_series = ndb.PickleProperty()
# List of lists of revisions around Anomaly entities from base run.
base_anomaly_revs = ndb.PickleProperty()
# List of lists of revisions around Anomaly entities marked invalid.
invalid_anomaly_revs = ndb.PickleProperty()
# List of lists of revisions around Anomaly entities with bug IDs.
confirmed_anomaly_revs = ndb.PickleProperty()
class SimulateAlertProcessingPipeline(pipeline.Pipeline):
def run(self, bench_name, test_bench_id): # pylint: disable=invalid-name
"""Runs one experimental alerting function for one TestBench entity.
Args:
bench_name: A string bench name.
test_bench_id: Integer ID of a TestBench entity.
Returns:
      A pair (TestBench ID, list of ChangePoint results). If the Test no
      longer exists, this returns (None, None).
"""
all_change_points = []
test_bench = TestBench.get_by_id(test_bench_id)
test = test_bench.test.get()
# If test doesn't exist anymore, just remove this TestBench entity.
if not test:
test_bench.key.delete()
return None, None
    # Clear the last_alerted_revision property because it will be used by
    # the experimental alerting function.
test.last_alerted_revision = None
data_series = test_bench.data_series
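    # Simulate alert processing over time: run the experimental function on
    # each successive prefix of the data series, as if rows were arriving one
    # at a time.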
for i in xrange(1, len(data_series)):
find_change_points_func = _EXPERIMENTAL_FUNCTIONS[bench_name]
change_points = find_change_points_func(test, data_series[0:i])
change_points = [c for c in change_points if _IsRegression(c, test)]
all_change_points.extend(change_points)
logging.debug('Completed alert processing simulation task for bench_name: '
'%s, bench_id: %s.', bench_name, test_bench_id)
return test_bench_id, all_change_points
class GenerateComparisonReportPipeline(pipeline.Pipeline):
def run( # pylint: disable=invalid-name
      self, bench_name, description, simulation_results):
    """Generates a comparison report between experimental and base results.
Args:
bench_name: A string bench name.
description: A string description of this bench job.
      simulation_results: A list of pairs, each of which is the return value
          of SimulateAlertProcessingPipeline.run, i.e. (TestBench ID, change
          point results). Note that the ChangePoint namedtuples are received
          as plain lists because pipeline results are serialized as JSON.
"""
bench_id_to_change_points_as_lists = dict(simulation_results)
results = {
'bench_name': bench_name,
'description': description,
}
total_invalid_alerts = 0
total_confirmed_alerts = 0
total_new_alerts = 0
total_alerts = 0
total_base_alerts = 0
total_base_invalid_alerts = 0
total_base_confirmed_alerts = 0
unconfirmed_alert_links = []
extra_alert_links = []
for bench in TestBench.query().fetch():
bench_id = bench.key.integer_id()
if bench_id not in bench_id_to_change_points_as_lists:
continue
change_points_as_lists = bench_id_to_change_points_as_lists[bench_id]
invalid_anomaly_rev_set = _Flatten(bench.invalid_anomaly_revs)
confirmed_anomaly_rev_set = _Flatten(bench.confirmed_anomaly_revs)
base_anomaly_rev_set = _Flatten(bench.base_anomaly_revs)
unconfirmed_alert_links.extend(
_UnconfirmedAlertLinks(bench, change_points_as_lists))
extra_alert_links.extend(
_ExtraAlertLinks(bench, change_points_as_lists))
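      # Classify each experimental change point by where its end revision
      # falls: within the window of an invalid alert, within the window of a
      # confirmed alert, or, if it does not match any base alert, as new.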
for change_point_as_list in change_points_as_lists:
change_point = find_change_points.ChangePoint(*change_point_as_list)
end_rev = change_point.x_value
if end_rev in invalid_anomaly_rev_set:
total_invalid_alerts += 1
elif end_rev in confirmed_anomaly_rev_set:
total_confirmed_alerts += 1
elif end_rev not in base_anomaly_rev_set:
total_new_alerts += 1
total_alerts += len(change_points_as_lists)
total_base_alerts += len(bench.base_anomaly_revs)
total_base_invalid_alerts += len(bench.invalid_anomaly_revs)
total_base_confirmed_alerts += len(bench.confirmed_anomaly_revs)
results['invalid_alerts'] = (
'%s/%s' % (total_invalid_alerts, total_base_invalid_alerts))
results['confirmed_alerts'] = (
'%s/%s' % (total_confirmed_alerts, total_base_confirmed_alerts))
results['new_alerts'] = total_new_alerts
results['total_alerts'] = '%s/%s' % (total_alerts, total_base_alerts)
results['unconfirmed_alert_links'] = '\n'.join(
unconfirmed_alert_links[:10])
results['extra_alert_links'] = '\n'.join(
extra_alert_links[:10])
_AddReportToLog(results)
logging.debug('Completed comparison report for bench_name: %s, '
'description: %s. Results: %s', bench_name, description,
results)
def _UnconfirmedAlertLinks(bench, change_points_as_lists):
"""Makes a list of URLs to view graphs for "unconfirmed" alerts.
  Here, "unconfirmed" alerts refer to alerts that are in the TestBench
  object (i.e. they were found before and "confirmed") but were not found
  by the experimental find-anomalies function -- they were not "confirmed"
  again by the experimental function, hence "unconfirmed".
  Below, bench.confirmed_anomaly_revs is a list of lists of revisions *around*
  each confirmed alert. For example, if alerts were previously found at
  revisions 200 and 300, this list might look like: [[199, 200, 201],
  [299, 300, 301]]. For each of these groups where the experimental function
  found no corresponding alert, the central revision of the group is reported
  as an alert that was "confirmed" before but not found this time.
  Ideally, for a good experimental function, these "unconfirmed" alerts are
  all cases where the sheriff triaged the alert incorrectly and it was
  actually invalid.
Args:
bench: One TestBench entity.
change_points_as_lists: List of lists (which are JSONified ChangePoints).
Returns:
A list of URLs, each of which is for a graph for one unconfirmed alert.
"""
anomaly_revs = {c[0] for c in change_points_as_lists}
unconfirmed_revs = []
for confirmed_rev_group in bench.confirmed_anomaly_revs:
if not anomaly_revs.intersection(confirmed_rev_group):
      # The alert for this confirmed rev group is "unconfirmed" by the
      # experimental function, so it should be added to the list.
mid_index = len(confirmed_rev_group) / 2
unconfirmed_revs.append(confirmed_rev_group[mid_index])
return [_GraphLink(bench.test, rev) for rev in unconfirmed_revs]
def _ExtraAlertLinks(bench, change_points_as_lists):
"""Makes a list of links to view "extra" alerts found.
Here, an "extra" alert means an alert that was found by the experimental
function but doesn't coincide with any Anomaly in the datastore, regardless
of whether that Anomaly would be found by the current default alerting
function.
Args:
bench: A TestBench entity.
change_points_as_lists: List of lists (which are JSONified ChangePoints).
Returns:
A list of URLs, each of which is for a graph for one extra alert.
"""
anomaly_revs = {c[0] for c in change_points_as_lists}
confirmed_revs = _Flatten(bench.confirmed_anomaly_revs)
invalid_revs = _Flatten(bench.invalid_anomaly_revs)
# Both "confirmed revs" and "invalid revs" are previously fired alerts.
extra_revs = anomaly_revs.difference(confirmed_revs, invalid_revs)
return [_GraphLink(bench.test, rev) for rev in extra_revs]
def _GraphLink(test_key, rev):
"""Returns an HTML link to view the graph for an alert."""
test_path = utils.TestPath(test_key)
master, bot, test = test_path.split('/', 2)
query = '?masters=%s&bots=%s&tests=%s&rev=%s' % (master, bot, test, rev)
return '<a href="https://%s/report%s">%s/%s@%s</a>' % (
app_identity.get_default_version_hostname(), query, bot, test, rev)
class RunExperimentalChunkPipeline(pipeline.Pipeline):
def run(self, bench_name, test_bench_ids): # pylint: disable=invalid-name
"""Runs the experimental find_change_points on each TestBench entity.
This runs SimulateAlertProcessing in parallel and returns a list of
the combined results.
Args:
bench_name: A string bench name.
test_bench_ids: List of TestBench IDs.
Yields:
Pipeline instance.
"""
results = []
for bench_id in test_bench_ids:
result_future = yield SimulateAlertProcessingPipeline(
bench_name, bench_id)
results.append(result_future)
yield pipeline_common.List(*results)
class RunExperimentalPipeline(pipeline.Pipeline):
  def run(self, bench_name, description): # pylint: disable=invalid-name
    """The root pipeline, which starts simulation tasks and report generation.
    This spawns tasks that in turn spawn more tasks to run the simulations,
    then executes the report generation task on the aggregated results.
Args:
bench_name: A string bench name.
description: A string description of this bench job.
Yields:
Pipeline instance.
"""
test_bench_keys = TestBench.query().fetch(keys_only=True)
test_bench_ids = [k.integer_id() for k in test_bench_keys]
results = []
    # Number of TestBench IDs to handle per chunk pipeline; each ID results in
    # one simulation task spawned by RunExperimentalChunkPipeline.
pipeline_chunk_size = 1000
for i in xrange(0, len(test_bench_ids), pipeline_chunk_size):
id_chunk = test_bench_ids[i:i + pipeline_chunk_size]
result_future = yield RunExperimentalChunkPipeline(
bench_name, id_chunk)
results.append(result_future)
combined_results = yield pipeline_common.Extend(*results)
yield GenerateComparisonReportPipeline(
bench_name, description, combined_results)
def SetupBaseDataForBench():
"""Adds tasks to queue to create base data for bench."""
if TestBench.query().fetch(keys_only=True, limit=1):
raise Exception('Base data already exist.')
# This will take a while, so we do it in a task queue.
deferred.defer(_AddCreateTestBenchTasks, _queue=_TASK_QUEUE_NAME)
def BenchFindChangePoints(bench_name, description):
"""Submits a bench job for a bench_name and description.
  Requires an implementation of find_change_points to be registered in
  _EXPERIMENTAL_FUNCTIONS. The (bench_name, description) pair must be unique
  for each job.
Args:
    bench_name: A string bench name which should exist in the keys of
_EXPERIMENTAL_FUNCTIONS.
description: A string description of this bench job.
Raises:
ValueError: The input was not valid.
Exception: Not enough data available.
"""
if bench_name not in _EXPERIMENTAL_FUNCTIONS:
raise ValueError('%s is not a valid find anomalies bench function.' %
bench_name)
bench_key = '%s.%s' % (bench_name, description)
submitted_benches = layered_cache.Get(_FIND_ANOMALIES_BENCH_CACHE_KEY)
if not submitted_benches:
submitted_benches = {}
if bench_key in submitted_benches:
    raise ValueError('Bench job for "%s. %s" was already submitted.' %
(bench_name, description))
submitted_benches[bench_key] = True
layered_cache.Set(_FIND_ANOMALIES_BENCH_CACHE_KEY, submitted_benches)
# Check if base bench data exist.
if not TestBench.query().fetch(keys_only=True, limit=1):
raise Exception('No base data available to bench against.')
# Add to taskqueue to run simulation.
stage = RunExperimentalPipeline(bench_name, description)
stage.start(queue_name=_TASK_QUEUE_NAME)
def DeleteAllTestBenchEntities():
"""Deletes all TestBench data."""
ndb.delete_multi(TestBench.query().fetch(keys_only=True))
def _AddCreateTestBenchTasks():
"""Adds _CreateTestBench tasks to queue."""
sheriff_key = ndb.Key('Sheriff', _TEST_DATA_SHERIFF)
query = graph_data.Test.query(
graph_data.Test.sheriff == sheriff_key,
graph_data.Test.has_rows == True,
graph_data.Test.deprecated == False)
tests = query.fetch(limit=_NUM_TEST_TO_BENCH)
tests = [t for t in tests if _GetSheriffForTest(t) and not _IsRefBuild(t)]
for test in tests:
deferred.defer(_CreateTestBench, test.key, _queue=_TASK_QUEUE_NAME)
def _CreateTestBench(test_key):
"""Fetches and stores test and row data that would be used to run bench."""
  # Fetch Row entities for this test.
query = graph_data.Row.query(projection=['revision', 'value'])
query = query.filter(graph_data.Row.parent_test == test_key)
query = query.order(-graph_data.Row.revision)
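  # Fetch the most recent _NUM_ROWS_TO_BENCH rows (the query is in descending
  # revision order), then reverse them so data_series is in ascending order.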
rows = list(reversed(query.fetch(limit=_NUM_ROWS_TO_BENCH)))
data_series = [(row.revision, row.value) for row in rows]
# Add TestBench entity.
test_bench = TestBench(test=test_key, data_series=data_series)
_UpdateInvalidAndConfirmedAnomalyRevs(test_bench)
_RunBaseAlertProcessing(test_bench)
test_bench.put()
def _AddReportToLog(report_dict):
"""Adds a log for bench results."""
report = _REPORT_TEMPLATE % report_dict
formatter = quick_logger.Formatter()
logger = quick_logger.QuickLogger(
'bench_find_anomalies', 'report', formatter)
logger.Log(report)
logger.Save()
def _Flatten(list_of_list):
  """Returns a set of all items in the given sublists."""
flattened = set()
for item in list_of_list:
flattened.update(item)
return flattened
def _UpdateInvalidAndConfirmedAnomalyRevs(test_bench):
  """Updates a TestBench entity with invalid and confirmed anomaly revs."""
  # The start rev for fetching Anomalies should correspond to the
  # min_segment_size-th point of the data series.
test = test_bench.test.get()
config_dict = anomaly_config.GetAnomalyConfigDict(test)
min_segment_size = config_dict.get(
'min_segment_size', find_change_points.MIN_SEGMENT_SIZE)
start_index = min(min_segment_size, len(test_bench.data_series)) - 1
start_rev = test_bench.data_series[start_index][0]
query = anomaly.Anomaly.query(anomaly.Anomaly.test == test_bench.test)
anomalies = query.fetch()
anomalies.sort(key=lambda a: a.end_revision)
anomalies = [a for a in anomalies if a.end_revision >= start_rev and
not a.is_improvement]
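  # A bug_id of -1 means the alert was triaged as invalid; a positive bug_id
  # means a sheriff filed or associated a bug, i.e. the alert was confirmed.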
test_bench.invalid_anomaly_revs = [
_GetRevsAroundRev(test_bench.data_series, a.end_revision)
for a in anomalies if a.bug_id == -1]
test_bench.confirmed_anomaly_revs = [
_GetRevsAroundRev(test_bench.data_series, a.end_revision)
for a in anomalies if a.bug_id > 0]
def _RunBaseAlertProcessing(test_bench):
"""Runs base alert processing simulation on TestBench entity.
This function runs the current find_change_points.FindChangePoints
implementation and saves the revisions around the found anomalies to
a TestBench entity.
Args:
test_bench: A TestBench entity.
"""
test = test_bench.test.get()
config_dict = anomaly_config.GetAnomalyConfigDict(test)
change_points = debug_alert.SimulateAlertProcessing(
test_bench.data_series, **config_dict)
test_bench.base_anomaly_revs = [
_GetRevsAroundRev(test_bench.data_series, change_point.x_value)
for change_point in change_points if _IsRegression(change_point, test)]
def _GetRevsAroundRev(data_series, revision):
"""Gets a list of revisions from before to after a given revision.
Args:
data_series: A list of (revision, value).
revision: A revision number.
Returns:
A list of revisions.
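
  For example, with _MAX_SEGMENT_SIZE_AROUND_ANOMALY = 4, a data_series whose
  revisions are 1 through 10, and revision=6, this returns
  [2, 3, 4, 5, 6, 7, 8, 9, 10] (up to 4 revisions on each side).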
"""
if not _MAX_SEGMENT_SIZE_AROUND_ANOMALY:
return [revision]
middle_index = 0
for i in xrange(len(data_series)):
if data_series[i][0] == revision:
middle_index = i
break
start_index = max(0, middle_index - _MAX_SEGMENT_SIZE_AROUND_ANOMALY)
end_index = middle_index + _MAX_SEGMENT_SIZE_AROUND_ANOMALY + 1
series_around_rev = data_series[start_index:end_index]
return [s[0] for s in series_around_rev]
def _IsRefBuild(test):
"""Returns True if test is a reference build."""
  test_name = test.key.string_id().split('/')[-1]
  return test_name == 'ref' or test_name.endswith('_ref')
def _GetSheriffForTest(test):
"""Gets the Sheriff for a test, or None if no sheriff."""
if test.sheriff:
return test.sheriff.get()
return None
def _IsRegression(change_point, test):
"""Returns whether the alert is a regression for the given test.
Args:
change_point: A find_change_points.ChangePoint object.
test: Test to get the regression direction for.
Returns:
True if it is a regression anomaly, otherwise False.
"""
median_before = change_point.median_before
median_after = change_point.median_after
if (median_before < median_after and
test.improvement_direction == anomaly.UP):
return False
if (median_before >= median_after and
test.improvement_direction == anomaly.DOWN):
return False
return True