blob: 782f199a60f6aba0a256b68008caf5a818353c55 [file] [log] [blame]
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""A simplified change-point detection algorithm.
Historically, the performance dashboard has used the GASP service for
detection. Brett Schein wrote a simplified version of this algorithm
for the dashboard in Matlab, and this was ported to Python by Dave Tu.
The general goal is to find any increase or decrease which is likely to
represent a real change in the underlying data source.
See: http://en.wikipedia.org/wiki/Step_detection
"""
import collections
from dashboard import find_step
from dashboard import math_utils
from dashboard import ttest
# Maximum number of points to consider at one time.
_MAX_WINDOW_SIZE = 50
# Minimum number of points in a segment. This can help filter out erroneous
# results by ignoring results that were found from looking at too few points.
MIN_SEGMENT_SIZE = 6
# Minimum absolute difference between medians before and after.
_MIN_ABSOLUTE_CHANGE = 0
# Minimum relative difference between medians before and after.
_MIN_RELATIVE_CHANGE = 0.01
# "Steppiness" is a number between 0 and 1 that indicates how similar the
# shape is to a perfect step function, where 1 represents a step function.
_MIN_STEPPINESS = 0.5
# The "standard deviation" is based on a subset of points in the series.
# This parameter is the minimum acceptable ratio of the relative change
# and this standard deviation.
_MULTIPLE_OF_STD_DEV = 2.5
class ChangePoint(collections.namedtuple(
'ChangePoint', (
# The x-value of the first point after a step.
'x_value',
# Median of the segments before and after the change point.
'median_before', 'median_after',
# Number of points before and after the change point.
'size_before', 'size_after',
# X-values of the first and last point in the series window used.
'window_start', 'window_end',
# Relative change from before to after.
'relative_change',
# Standard deviation of points before.
'std_dev_before',
# Results of the Welch's t-test for values before and after.
't_statistic', 'degrees_of_freedom', 'p_value'
))):
"""A ChangePoint represents a change in a series -- a potential alert."""
def AsDict(self):
"""Returns a dictionary mapping attributes to values."""
return self._asdict()
def FindChangePoints(
series,
max_window_size=_MAX_WINDOW_SIZE,
min_segment_size=MIN_SEGMENT_SIZE,
min_absolute_change=_MIN_ABSOLUTE_CHANGE,
min_relative_change=_MIN_RELATIVE_CHANGE,
min_steppiness=_MIN_STEPPINESS,
multiple_of_std_dev=_MULTIPLE_OF_STD_DEV):
"""Finds at most one change point in the given series.
Only the last |max_window_size| points are examined, regardless of
how many points are passed in. The reason why it might make sense to
limit the number of points to look at is that if there are multiple
change-points in the window that's looked at, then this function will
be less likely to find any of them.
First, the "most likely" split is found. The most likely split is defined as
a split that minimizes the sum of the variances on either side. Then the
split point is checked to see whether it passes the thresholds.
Args:
series: A list of (x, y) pairs.
max_window_size: Number of points to analyze.
min_segment_size: Min size of segments before or after change point.
min_absolute_change: Absolute change threshold.
min_relative_change: Relative change threshold.
min_steppiness: Threshold for how similar to a step a change point must be.
multiple_of_std_dev: Threshold for change as multiple of std. deviation.
Returns:
A list with one ChangePoint object, or an empty list.
"""
if len(series) < 2:
return [] # Not enough points to possibly contain a valid split point.
series = series[-max_window_size:]
_, y_values = zip(*series)
split_index = _FindSplit(y_values)
if _PassesThresholds(
y_values, split_index,
min_segment_size=min_segment_size,
min_absolute_change=min_absolute_change,
min_relative_change=min_relative_change,
min_steppiness=min_steppiness,
multiple_of_std_dev=multiple_of_std_dev):
return [MakeChangePoint(series, split_index)]
return []
def MakeChangePoint(series, split_index):
"""Makes a ChangePoint object for the given series at the given point.
Args:
series: A list of (x, y) pairs.
split_index: Index of the first point after the split.
Returns:
A ChangePoint object.
"""
assert 0 <= split_index < len(series)
x_values, y_values = zip(*series)
left, right = y_values[:split_index], y_values[split_index:]
left_median, right_median = math_utils.Median(left), math_utils.Median(right)
ttest_results = ttest.WelchsTTest(left, right)
return ChangePoint(
x_value=x_values[split_index],
median_before=left_median,
median_after=right_median,
size_before=len(left),
size_after=len(right),
window_start=x_values[0],
window_end=x_values[-1], # inclusive bound
relative_change=math_utils.RelativeChange(left_median, right_median),
std_dev_before=math_utils.StandardDeviation(left),
t_statistic=ttest_results.t,
degrees_of_freedom=ttest_results.df,
p_value=ttest_results.p)
def _FindSplit(values):
"""Finds the index of the "most interesting" split of a sample of data.
Currently, the most interesting split is considered to be the split that
minimizes the standard deviation of the two sides concatenated together
(after modifying both sides by shifting all the numbers in the left and
right sides by the median of the left and right sides respectively).
The reason why this is done is that normalizing the two segments on either
side of a point so that both have the same center essentially removes any
jump or step that occurs at that point.
Args:
values: A list of numbers.
Returns:
The index of the "most interesting" point.
"""
def StdDevOfTwoNormalizedSides(index):
left, right = values[:index], values[index:]
return math_utils.StandardDeviation(_ZeroMedian(left) + _ZeroMedian(right))
return min(range(1, len(values)), key=StdDevOfTwoNormalizedSides)
def _ZeroMedian(values):
"""Subtracts the median value in the list from all values in the list."""
median = math_utils.Median(values)
return [val - median for val in values]
def _PassesThresholds(
values, split_index, min_segment_size, min_absolute_change,
min_relative_change, min_steppiness, multiple_of_std_dev):
"""Checks whether a point in a series appears to be an change point.
Args:
values: A list of numbers.
split_index: An index in the list of numbers.
min_segment_size: Threshold for size of segments before or after a point.
min_absolute_change: Minimum absolute median change threshold.
min_relative_change: Minimum relative median change threshold.
min_steppiness: Threshold for how similar to a step a change point must be.
multiple_of_std_dev: Threshold for change as multiple of std. deviation.
Returns:
True if it passes all of the thresholds, False otherwise.
"""
left, right = values[:split_index], values[split_index:]
left_median, right_median = math_utils.Median(left), math_utils.Median(right)
# 1. Segment size filter.
if len(left) < min_segment_size or len(right) < min_segment_size:
return False
# 2. Absolute change filter.
absolute_change = abs(left_median - right_median)
if absolute_change < min_absolute_change:
return False
# 3. Relative change filter.
relative_change = math_utils.RelativeChange(left_median, right_median)
if relative_change < min_relative_change:
return False
# 4. Multiple of standard deviation filter.
min_std_dev = min(
math_utils.StandardDeviation(left),
math_utils.StandardDeviation(right))
if absolute_change < multiple_of_std_dev * min_std_dev:
return False
# 5. Steppiness filter.
steppiness = find_step.Steppiness(values, split_index)
if steppiness < min_steppiness:
return False
# Passed all filters!
return True