catapult/dashboard/dashboard/find_change_points.py - platform/external/chromium-trace - Git at Google

 # Copyright 2015 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """A simplified change-point detection algorithm.

 Historically, the performance dashboard has used the GASP service for
 detection. Brett Schein wrote a simplified version of this algorithm
 for the dashboard in Matlab, and this was ported to Python by Dave Tu.

 The general goal is to find any increase or decrease which is likely to
 represent a real change in the underlying data source.

 See: http://en.wikipedia.org/wiki/Step_detection
 """

 import collections

 from dashboard import find_step
 from dashboard import math_utils
 from dashboard import ttest

 # Maximum number of points to consider at one time.
 _MAX_WINDOW_SIZE = 50

 # Minimum number of points in a segment. This can help filter out erroneous
 # results by ignoring results that were found from looking at too few points.
 MIN_SEGMENT_SIZE = 6

 # Minimum absolute difference between medians before and after.
 _MIN_ABSOLUTE_CHANGE = 0

 # Minimum relative difference between medians before and after.
 _MIN_RELATIVE_CHANGE = 0.01

 # "Steppiness" is a number between 0 and 1 that indicates how similar the
 # shape is to a perfect step function, where 1 represents a step function.
 _MIN_STEPPINESS = 0.5

 # The "standard deviation" is based on a subset of points in the series.
 # This parameter is the minimum acceptable ratio of the relative change
 # and this standard deviation.
 _MULTIPLE_OF_STD_DEV = 2.5


 class ChangePoint(collections.namedtuple(
     'ChangePoint', (
         # The x-value of the first point after a step.
         'x_value',
         # Median of the segments before and after the change point.
         'median_before', 'median_after',
         # Number of points before and after the change point.
         'size_before', 'size_after',
         # X-values of the first and last point in the series window used.
         'window_start', 'window_end',
         # Relative change from before to after.
         'relative_change',
         # Standard deviation of points before.
         'std_dev_before',
         # Results of the Welch's t-test for values before and after.
         't_statistic', 'degrees_of_freedom', 'p_value'
     ))):
   """A ChangePoint represents a change in a series -- a potential alert."""

   def AsDict(self):
     """Returns a dictionary mapping attributes to values."""
     return self._asdict()


 def FindChangePoints(
     series,
     max_window_size=_MAX_WINDOW_SIZE,
     min_segment_size=MIN_SEGMENT_SIZE,
     min_absolute_change=_MIN_ABSOLUTE_CHANGE,
     min_relative_change=_MIN_RELATIVE_CHANGE,
     min_steppiness=_MIN_STEPPINESS,
     multiple_of_std_dev=_MULTIPLE_OF_STD_DEV):
   """Finds at most one change point in the given series.

   Only the last |max_window_size| points are examined, regardless of
   how many points are passed in. The reason why it might make sense to
   limit the number of points to look at is that if there are multiple
   change-points in the window that's looked at, then this function will
   be less likely to find any of them.

   First, the "most likely" split is found. The most likely split is defined as
   a split that minimizes the sum of the variances on either side. Then the
   split point is checked to see whether it passes the thresholds.

   Args:
     series: A list of (x, y) pairs.
     max_window_size: Number of points to analyze.
     min_segment_size: Min size of segments before or after change point.
     min_absolute_change: Absolute change threshold.
     min_relative_change: Relative change threshold.
     min_steppiness: Threshold for how similar to a step a change point must be.
     multiple_of_std_dev: Threshold for change as multiple of std. deviation.

   Returns:
     A list with one ChangePoint object, or an empty list.
   """
   if len(series) < 2:
     return []  # Not enough points to possibly contain a valid split point.
   series = series[-max_window_size:]
   _, y_values = zip(*series)
   split_index = _FindSplit(y_values)
   if _PassesThresholds(
       y_values, split_index,
       min_segment_size=min_segment_size,
       min_absolute_change=min_absolute_change,
       min_relative_change=min_relative_change,
       min_steppiness=min_steppiness,
       multiple_of_std_dev=multiple_of_std_dev):
     return [MakeChangePoint(series, split_index)]
   return []


 def MakeChangePoint(series, split_index):
   """Makes a ChangePoint object for the given series at the given point.

   Args:
     series: A list of (x, y) pairs.
     split_index: Index of the first point after the split.

   Returns:
     A ChangePoint object.
   """
   assert 0 <= split_index < len(series)
   x_values, y_values = zip(*series)
   left, right = y_values[:split_index], y_values[split_index:]
   left_median, right_median = math_utils.Median(left), math_utils.Median(right)
   ttest_results = ttest.WelchsTTest(left, right)
   return ChangePoint(
       x_value=x_values[split_index],
       median_before=left_median,
       median_after=right_median,
       size_before=len(left),
       size_after=len(right),
       window_start=x_values[0],
       window_end=x_values[-1],  # inclusive bound
       relative_change=math_utils.RelativeChange(left_median, right_median),
       std_dev_before=math_utils.StandardDeviation(left),
       t_statistic=ttest_results.t,
       degrees_of_freedom=ttest_results.df,
       p_value=ttest_results.p)


 def _FindSplit(values):
   """Finds the index of the "most interesting" split of a sample of data.

   Currently, the most interesting split is considered to be the split that
   minimizes the standard deviation of the two sides concatenated together
   (after modifying both sides by shifting all the numbers in the left and
   right sides by the median of the left and right sides respectively).

   The reason why this is done is that normalizing the two segments on either
   side of a point so that both have the same center essentially removes any
   jump or step that occurs at that point.

   Args:
     values: A list of numbers.

   Returns:
     The index of the "most interesting" point.
   """
   def StdDevOfTwoNormalizedSides(index):
     left, right = values[:index], values[index:]
     return math_utils.StandardDeviation(_ZeroMedian(left) + _ZeroMedian(right))
   return min(range(1, len(values)), key=StdDevOfTwoNormalizedSides)


 def _ZeroMedian(values):
   """Subtracts the median value in the list from all values in the list."""
   median = math_utils.Median(values)
   return [val - median for val in values]


 def _PassesThresholds(
     values, split_index, min_segment_size, min_absolute_change,
     min_relative_change, min_steppiness, multiple_of_std_dev):
   """Checks whether a point in a series appears to be an change point.

   Args:
     values: A list of numbers.
     split_index: An index in the list of numbers.
     min_segment_size: Threshold for size of segments before or after a point.
     min_absolute_change: Minimum absolute median change threshold.
     min_relative_change: Minimum relative median change threshold.
     min_steppiness: Threshold for how similar to a step a change point must be.
     multiple_of_std_dev: Threshold for change as multiple of std. deviation.

   Returns:
     True if it passes all of the thresholds, False otherwise.
   """
   left, right = values[:split_index], values[split_index:]
   left_median, right_median = math_utils.Median(left), math_utils.Median(right)

   # 1. Segment size filter.
   if len(left) < min_segment_size or len(right) < min_segment_size:
     return False

   # 2. Absolute change filter.
   absolute_change = abs(left_median - right_median)
   if absolute_change < min_absolute_change:
     return False

   # 3. Relative change filter.
   relative_change = math_utils.RelativeChange(left_median, right_median)
   if relative_change < min_relative_change:
     return False

   # 4. Multiple of standard deviation filter.
   min_std_dev = min(
       math_utils.StandardDeviation(left),
       math_utils.StandardDeviation(right))
   if absolute_change < multiple_of_std_dev * min_std_dev:
     return False

   # 5. Steppiness filter.
   steppiness = find_step.Steppiness(values, split_index)
   if steppiness < min_steppiness:
     return False

   # Passed all filters!
   return True
	# Copyright 2015 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""A simplified change-point detection algorithm.

	Historically, the performance dashboard has used the GASP service for
	detection. Brett Schein wrote a simplified version of this algorithm
	for the dashboard in Matlab, and this was ported to Python by Dave Tu.

	The general goal is to find any increase or decrease which is likely to
	represent a real change in the underlying data source.

	See: http://en.wikipedia.org/wiki/Step_detection
	"""

	import collections

	from dashboard import find_step
	from dashboard import math_utils
	from dashboard import ttest

	# Maximum number of points to consider at one time.
	_MAX_WINDOW_SIZE = 50

	# Minimum number of points in a segment. This can help filter out erroneous
	# results by ignoring results that were found from looking at too few points.
	MIN_SEGMENT_SIZE = 6

	# Minimum absolute difference between medians before and after.
	_MIN_ABSOLUTE_CHANGE = 0

	# Minimum relative difference between medians before and after.
	_MIN_RELATIVE_CHANGE = 0.01

	# "Steppiness" is a number between 0 and 1 that indicates how similar the
	# shape is to a perfect step function, where 1 represents a step function.
	_MIN_STEPPINESS = 0.5

	# The "standard deviation" is based on a subset of points in the series.
	# This parameter is the minimum acceptable ratio of the relative change
	# and this standard deviation.
	_MULTIPLE_OF_STD_DEV = 2.5


	class ChangePoint(collections.namedtuple(
	'ChangePoint', (
	# The x-value of the first point after a step.
	'x_value',
	# Median of the segments before and after the change point.
	'median_before', 'median_after',
	# Number of points before and after the change point.
	'size_before', 'size_after',
	# X-values of the first and last point in the series window used.
	'window_start', 'window_end',
	# Relative change from before to after.
	'relative_change',
	# Standard deviation of points before.
	'std_dev_before',
	# Results of the Welch's t-test for values before and after.
	't_statistic', 'degrees_of_freedom', 'p_value'
	))):
	"""A ChangePoint represents a change in a series -- a potential alert."""

	def AsDict(self):
	"""Returns a dictionary mapping attributes to values."""
	return self._asdict()


	def FindChangePoints(
	series,
	max_window_size=_MAX_WINDOW_SIZE,
	min_segment_size=MIN_SEGMENT_SIZE,
	min_absolute_change=_MIN_ABSOLUTE_CHANGE,
	min_relative_change=_MIN_RELATIVE_CHANGE,
	min_steppiness=_MIN_STEPPINESS,
	multiple_of_std_dev=_MULTIPLE_OF_STD_DEV):
	"""Finds at most one change point in the given series.

	Only the last \|max_window_size\| points are examined, regardless of
	how many points are passed in. The reason why it might make sense to
	limit the number of points to look at is that if there are multiple
	change-points in the window that's looked at, then this function will
	be less likely to find any of them.

	First, the "most likely" split is found. The most likely split is defined as
	a split that minimizes the sum of the variances on either side. Then the
	split point is checked to see whether it passes the thresholds.

	Args:
	series: A list of (x, y) pairs.
	max_window_size: Number of points to analyze.
	min_segment_size: Min size of segments before or after change point.
	min_absolute_change: Absolute change threshold.
	min_relative_change: Relative change threshold.
	min_steppiness: Threshold for how similar to a step a change point must be.
	multiple_of_std_dev: Threshold for change as multiple of std. deviation.

	Returns:
	A list with one ChangePoint object, or an empty list.
	"""
	if len(series) < 2:
	return [] # Not enough points to possibly contain a valid split point.
	series = series[-max_window_size:]
	_, y_values = zip(*series)
	split_index = _FindSplit(y_values)
	if _PassesThresholds(
	y_values, split_index,
	min_segment_size=min_segment_size,
	min_absolute_change=min_absolute_change,
	min_relative_change=min_relative_change,
	min_steppiness=min_steppiness,
	multiple_of_std_dev=multiple_of_std_dev):
	return [MakeChangePoint(series, split_index)]
	return []


	def MakeChangePoint(series, split_index):
	"""Makes a ChangePoint object for the given series at the given point.

	Args:
	series: A list of (x, y) pairs.
	split_index: Index of the first point after the split.

	Returns:
	A ChangePoint object.
	"""
	assert 0 <= split_index < len(series)
	x_values, y_values = zip(*series)
	left, right = y_values[:split_index], y_values[split_index:]
	left_median, right_median = math_utils.Median(left), math_utils.Median(right)
	ttest_results = ttest.WelchsTTest(left, right)
	return ChangePoint(
	x_value=x_values[split_index],
	median_before=left_median,
	median_after=right_median,
	size_before=len(left),
	size_after=len(right),
	window_start=x_values[0],
	window_end=x_values[-1], # inclusive bound
	relative_change=math_utils.RelativeChange(left_median, right_median),
	std_dev_before=math_utils.StandardDeviation(left),
	t_statistic=ttest_results.t,
	degrees_of_freedom=ttest_results.df,
	p_value=ttest_results.p)


	def _FindSplit(values):
	"""Finds the index of the "most interesting" split of a sample of data.

	Currently, the most interesting split is considered to be the split that
	minimizes the standard deviation of the two sides concatenated together
	(after modifying both sides by shifting all the numbers in the left and
	right sides by the median of the left and right sides respectively).

	The reason why this is done is that normalizing the two segments on either
	side of a point so that both have the same center essentially removes any
	jump or step that occurs at that point.

	Args:
	values: A list of numbers.

	Returns:
	The index of the "most interesting" point.
	"""
	def StdDevOfTwoNormalizedSides(index):
	left, right = values[:index], values[index:]
	return math_utils.StandardDeviation(_ZeroMedian(left) + _ZeroMedian(right))
	return min(range(1, len(values)), key=StdDevOfTwoNormalizedSides)


	def _ZeroMedian(values):
	"""Subtracts the median value in the list from all values in the list."""
	median = math_utils.Median(values)
	return [val - median for val in values]


	def _PassesThresholds(
	values, split_index, min_segment_size, min_absolute_change,
	min_relative_change, min_steppiness, multiple_of_std_dev):
	"""Checks whether a point in a series appears to be an change point.

	Args:
	values: A list of numbers.
	split_index: An index in the list of numbers.
	min_segment_size: Threshold for size of segments before or after a point.
	min_absolute_change: Minimum absolute median change threshold.
	min_relative_change: Minimum relative median change threshold.
	min_steppiness: Threshold for how similar to a step a change point must be.
	multiple_of_std_dev: Threshold for change as multiple of std. deviation.

	Returns:
	True if it passes all of the thresholds, False otherwise.
	"""
	left, right = values[:split_index], values[split_index:]
	left_median, right_median = math_utils.Median(left), math_utils.Median(right)

	# 1. Segment size filter.
	if len(left) < min_segment_size or len(right) < min_segment_size:
	return False

	# 2. Absolute change filter.
	absolute_change = abs(left_median - right_median)
	if absolute_change < min_absolute_change:
	return False

	# 3. Relative change filter.
	relative_change = math_utils.RelativeChange(left_median, right_median)
	if relative_change < min_relative_change:
	return False

	# 4. Multiple of standard deviation filter.
	min_std_dev = min(
	math_utils.StandardDeviation(left),
	math_utils.StandardDeviation(right))
	if absolute_change < multiple_of_std_dev * min_std_dev:
	return False

	# 5. Steppiness filter.
	steppiness = find_step.Steppiness(values, split_index)
	if steppiness < min_steppiness:
	return False

	# Passed all filters!
	return True