catapult/telemetry/telemetry/util/statistics.py - platform/external/chromium-trace - Git at Google

 # Copyright 2014 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """A collection of statistical utility functions to be used by metrics."""

 import math


 def Clamp(value, low=0.0, high=1.0):
   """Clamp a value between some low and high value."""
   return min(max(value, low), high)


 def NormalizeSamples(samples):
   """Sorts the samples, and map them linearly to the range [0,1].

   They're mapped such that for the N samples, the first sample is 0.5/N and the
   last sample is (N-0.5)/N.

   Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
   twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
   we don't want to distinguish between these two cases, as our original domain
   is not bounded (it is for Monte Carlo integration, where discrepancy was
   first used).
   """
   if not samples:
     return samples, 1.0
   samples = sorted(samples)
   low = min(samples)
   high = max(samples)
   new_low = 0.5 / len(samples)
   new_high = (len(samples)-0.5) / len(samples)
   if high-low == 0.0:
     return [0.5] * len(samples), 1.0
   scale = (new_high - new_low) / (high - low)
   for i in xrange(0, len(samples)):
     samples[i] = float(samples[i] - low) * scale + new_low
   return samples, scale


 def Discrepancy(samples, location_count=None):
   """Computes the discrepancy of a set of 1D samples from the interval [0,1].

   The samples must be sorted. We define the discrepancy of an empty set
   of samples to be zero.

   http://en.wikipedia.org/wiki/Low-discrepancy_sequence
   http://mathworld.wolfram.com/Discrepancy.html
   """
   if not samples:
     return 0.0

   max_local_discrepancy = 0
   inv_sample_count = 1.0 / len(samples)
   locations = []
   # For each location, stores the number of samples less than that location.
   count_less = []
   # For each location, stores the number of samples less than or equal to that
   # location.
   count_less_equal = []

   if location_count:
     # Generate list of equally spaced locations.
     sample_index = 0
     for i in xrange(0, int(location_count)):
       location = float(i) / (location_count-1)
       locations.append(location)
       while sample_index < len(samples) and samples[sample_index] < location:
         sample_index += 1
       count_less.append(sample_index)
       while  sample_index < len(samples) and samples[sample_index] <= location:
         sample_index += 1
       count_less_equal.append(sample_index)
   else:
     # Populate locations with sample positions. Append 0 and 1 if necessary.
     if samples[0] > 0.0:
       locations.append(0.0)
       count_less.append(0)
       count_less_equal.append(0)
     for i in xrange(0, len(samples)):
       locations.append(samples[i])
       count_less.append(i)
       count_less_equal.append(i+1)
     if samples[-1] < 1.0:
       locations.append(1.0)
       count_less.append(len(samples))
       count_less_equal.append(len(samples))

   # Compute discrepancy as max(overshoot, -undershoot), where
   # overshoot = max(count_closed(i, j)/N - length(i, j)) for all i < j,
   # undershoot = min(count_open(i, j)/N - length(i, j)) for all i < j,
   # N = len(samples),
   # count_closed(i, j) is the number of points between i and j including ends,
   # count_open(i, j) is the number of points between i and j excluding ends,
   # length(i, j) is locations[i] - locations[j].

   # The following algorithm is modification of Kadane's algorithm,
   # see https://en.wikipedia.org/wiki/Maximum_subarray_problem.

   # The maximum of (count_closed(k, i-1)/N - length(k, i-1)) for any k < i-1.
   max_diff = 0
   # The minimum of (count_open(k, i-1)/N - length(k, i-1)) for any k < i-1.
   min_diff = 0
   for i in xrange(1, len(locations)):
     length = locations[i] - locations[i - 1]
     count_closed = count_less_equal[i] - count_less[i - 1]
     count_open = count_less[i] - count_less_equal[i - 1]
     # Number of points that are added if we extend a closed range that
     # ends at location (i-1).
     count_closed_increment = count_less_equal[i] - count_less_equal[i - 1]
     # Number of points that are added if we extend an open range that
     # ends at location (i-1).
     count_open_increment = count_less[i] - count_less[i - 1]

     # Either extend the previous optimal range or start a new one.
     max_diff = max(
         float(count_closed_increment) * inv_sample_count - length + max_diff,
         float(count_closed) * inv_sample_count - length)
     min_diff = min(
         float(count_open_increment) * inv_sample_count - length + min_diff,
         float(count_open) * inv_sample_count - length)

     max_local_discrepancy = max(max_diff, -min_diff, max_local_discrepancy)
   return max_local_discrepancy


 def TimestampsDiscrepancy(timestamps, absolute=True,
                           location_count=None):
   """A discrepancy based metric for measuring timestamp jank.

   TimestampsDiscrepancy quantifies the largest area of jank observed in a series
   of timestamps.  Note that this is different from metrics based on the
   max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
   B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
   Discrepancy(B) > Discrepancy(A).

   Two variants of discrepancy can be computed:

   Relative discrepancy is following the original definition of
   discrepancy. It characterized the largest area of jank, relative to the
   duration of the entire time stamp series.  We normalize the raw results,
   because the best case discrepancy for a set of N samples is 1/N (for
   equally spaced samples), and we want our metric to report 0.0 in that
   case.

   Absolute discrepancy also characterizes the largest area of jank, but its
   value wouldn't change (except for imprecisions due to a low
   |interval_multiplier|) if additional 'good' intervals were added to an
   exisiting list of time stamps.  Its range is [0,inf] and the unit is
   milliseconds.

   The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
   absolute discrepancy, but D has lower relative discrepancy than C.

   |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each
   S_i is a time stamp series. In that case, the discrepancy D(S) is:
   D(S) = max(D(S_1), D(S_2), ..., D(S_N))
   """
   if not timestamps:
     return 0.0

   if isinstance(timestamps[0], list):
     range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps]
     return max(range_discrepancies)

   samples, sample_scale = NormalizeSamples(timestamps)
   discrepancy = Discrepancy(samples, location_count)
   inv_sample_count = 1.0 / len(samples)
   if absolute:
     # Compute absolute discrepancy
     discrepancy /= sample_scale
   else:
     # Compute relative discrepancy
     discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count))
   return discrepancy


 def DurationsDiscrepancy(durations, absolute=True,
                          location_count=None):
   """A discrepancy based metric for measuring duration jank.

   DurationsDiscrepancy computes a jank metric which measures how irregular a
   given sequence of intervals is. In order to minimize jank, each duration
   should be equally long. This is similar to how timestamp jank works,
   and we therefore reuse the timestamp discrepancy function above to compute a
   similar duration discrepancy number.

   Because timestamp discrepancy is defined in terms of timestamps, we first
   convert the list of durations to monotonically increasing timestamps.

   Args:
     durations: List of interval lengths in milliseconds.
     absolute: See TimestampsDiscrepancy.
     interval_multiplier: See TimestampsDiscrepancy.
   """
   if not durations:
     return 0.0

   timestamps = reduce(lambda x, y: x + [x[-1] + y], durations, [0])
   return TimestampsDiscrepancy(timestamps, absolute, location_count)


 def ArithmeticMean(data):
   """Calculates arithmetic mean.

   Args:
     data: A list of samples.

   Returns:
     The arithmetic mean value, or 0 if the list is empty.
   """
   numerator_total = Total(data)
   denominator_total = Total(len(data))
   return DivideIfPossibleOrZero(numerator_total, denominator_total)


 def StandardDeviation(data):
   """Calculates the standard deviation.

   Args:
     data: A list of samples.

   Returns:
     The standard deviation of the samples provided.
   """
   if len(data) == 1:
     return 0.0

   mean = ArithmeticMean(data)
   variances = [float(x) - mean for x in data]
   variances = [x * x for x in variances]
   std_dev = math.sqrt(ArithmeticMean(variances))

   return std_dev


 def TrapezoidalRule(data, dx):
   """ Calculate the integral according to the trapezoidal rule

   TrapezoidalRule approximates the definite integral of f from a to b by
   the composite trapezoidal rule, using n subintervals.
   http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid

   Args:
     data: A list of samples
     dx: The uniform distance along the x axis between any two samples

   Returns:
     The area under the curve defined by the samples and the uniform distance
     according to the trapezoidal rule.
   """

   n = len(data) - 1
   s = data[0] + data[n]

   if n == 0:
     return 0.0

   for i in range(1, n):
     s += 2 * data[i]

   return s * dx / 2.0

 def Total(data):
   """Returns the float value of a number or the sum of a list."""
   if type(data) == float:
     total = data
   elif type(data) == int:
     total = float(data)
   elif type(data) == list:
     total = float(sum(data))
   else:
     raise TypeError
   return total


 def DivideIfPossibleOrZero(numerator, denominator):
   """Returns the quotient, or zero if the denominator is zero."""
   return (float(numerator) / float(denominator)) if denominator else 0


 def GeneralizedMean(values, exponent):
   """See http://en.wikipedia.org/wiki/Generalized_mean"""
   if not values:
     return 0.0
   sum_of_powers = 0.0
   for v in values:
     sum_of_powers += v ** exponent
   return (sum_of_powers / len(values)) ** (1.0/exponent)


 def Median(values):
   """Gets the median of a list of values."""
   return Percentile(values, 50)


 def Percentile(values, percentile):
   """Calculates the value below which a given percentage of values fall.

   For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
   percentile for this set of values. When the percentage doesn't exactly
   match a rank in the list of values, the percentile is computed using linear
   interpolation between closest ranks.

   Args:
     values: A list of numerical values.
     percentile: A number between 0 and 100.

   Returns:
     The Nth percentile for the list of values, where N is the given percentage.
   """
   if not values:
     return 0.0
   sorted_values = sorted(values)
   n = len(values)
   percentile /= 100.0
   if percentile <= 0.5 / n:
     return sorted_values[0]
   elif percentile >= (n - 0.5) / n:
     return sorted_values[-1]
   else:
     floor_index = int(math.floor(n * percentile -  0.5))
     floor_value = sorted_values[floor_index]
     ceil_value = sorted_values[floor_index+1]
     alpha = n * percentile - 0.5 - floor_index
     return floor_value + alpha * (ceil_value - floor_value)


 def GeometricMean(values):
   """Compute a rounded geometric mean from an array of values."""
   if not values:
     return None
   # To avoid infinite value errors, make sure no value is less than 0.001.
   new_values = []
   for value in values:
     if value > 0.001:
       new_values.append(value)
     else:
       new_values.append(0.001)
   # Compute the sum of the log of the values.
   log_sum = sum(map(math.log, new_values))
   # Raise e to that sum over the number of values.
   mean = math.pow(math.e, (log_sum / len(new_values)))
   # Return the rounded mean.
   return int(round(mean))
	# Copyright 2014 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""A collection of statistical utility functions to be used by metrics."""

	import math


	def Clamp(value, low=0.0, high=1.0):
	"""Clamp a value between some low and high value."""
	return min(max(value, low), high)


	def NormalizeSamples(samples):
	"""Sorts the samples, and map them linearly to the range [0,1].

	They're mapped such that for the N samples, the first sample is 0.5/N and the
	last sample is (N-0.5)/N.

	Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
	twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
	we don't want to distinguish between these two cases, as our original domain
	is not bounded (it is for Monte Carlo integration, where discrepancy was
	first used).
	"""
	if not samples:
	return samples, 1.0
	samples = sorted(samples)
	low = min(samples)
	high = max(samples)
	new_low = 0.5 / len(samples)
	new_high = (len(samples)-0.5) / len(samples)
	if high-low == 0.0:
	return [0.5] * len(samples), 1.0
	scale = (new_high - new_low) / (high - low)
	for i in xrange(0, len(samples)):
	samples[i] = float(samples[i] - low) * scale + new_low
	return samples, scale


	def Discrepancy(samples, location_count=None):
	"""Computes the discrepancy of a set of 1D samples from the interval [0,1].

	The samples must be sorted. We define the discrepancy of an empty set
	of samples to be zero.

	http://en.wikipedia.org/wiki/Low-discrepancy_sequence
	http://mathworld.wolfram.com/Discrepancy.html
	"""
	if not samples:
	return 0.0

	max_local_discrepancy = 0
	inv_sample_count = 1.0 / len(samples)
	locations = []
	# For each location, stores the number of samples less than that location.
	count_less = []
	# For each location, stores the number of samples less than or equal to that
	# location.
	count_less_equal = []

	if location_count:
	# Generate list of equally spaced locations.
	sample_index = 0
	for i in xrange(0, int(location_count)):
	location = float(i) / (location_count-1)
	locations.append(location)
	while sample_index < len(samples) and samples[sample_index] < location:
	sample_index += 1
	count_less.append(sample_index)
	while sample_index < len(samples) and samples[sample_index] <= location:
	sample_index += 1
	count_less_equal.append(sample_index)
	else:
	# Populate locations with sample positions. Append 0 and 1 if necessary.
	if samples[0] > 0.0:
	locations.append(0.0)
	count_less.append(0)
	count_less_equal.append(0)
	for i in xrange(0, len(samples)):
	locations.append(samples[i])
	count_less.append(i)
	count_less_equal.append(i+1)
	if samples[-1] < 1.0:
	locations.append(1.0)
	count_less.append(len(samples))
	count_less_equal.append(len(samples))

	# Compute discrepancy as max(overshoot, -undershoot), where
	# overshoot = max(count_closed(i, j)/N - length(i, j)) for all i < j,
	# undershoot = min(count_open(i, j)/N - length(i, j)) for all i < j,
	# N = len(samples),
	# count_closed(i, j) is the number of points between i and j including ends,
	# count_open(i, j) is the number of points between i and j excluding ends,
	# length(i, j) is locations[i] - locations[j].

	# The following algorithm is modification of Kadane's algorithm,
	# see https://en.wikipedia.org/wiki/Maximum_subarray_problem.

	# The maximum of (count_closed(k, i-1)/N - length(k, i-1)) for any k < i-1.
	max_diff = 0
	# The minimum of (count_open(k, i-1)/N - length(k, i-1)) for any k < i-1.
	min_diff = 0
	for i in xrange(1, len(locations)):
	length = locations[i] - locations[i - 1]
	count_closed = count_less_equal[i] - count_less[i - 1]
	count_open = count_less[i] - count_less_equal[i - 1]
	# Number of points that are added if we extend a closed range that
	# ends at location (i-1).
	count_closed_increment = count_less_equal[i] - count_less_equal[i - 1]
	# Number of points that are added if we extend an open range that
	# ends at location (i-1).
	count_open_increment = count_less[i] - count_less[i - 1]

	# Either extend the previous optimal range or start a new one.
	max_diff = max(
	float(count_closed_increment) * inv_sample_count - length + max_diff,
	float(count_closed) * inv_sample_count - length)
	min_diff = min(
	float(count_open_increment) * inv_sample_count - length + min_diff,
	float(count_open) * inv_sample_count - length)

	max_local_discrepancy = max(max_diff, -min_diff, max_local_discrepancy)
	return max_local_discrepancy


	def TimestampsDiscrepancy(timestamps, absolute=True,
	location_count=None):
	"""A discrepancy based metric for measuring timestamp jank.

	TimestampsDiscrepancy quantifies the largest area of jank observed in a series
	of timestamps. Note that this is different from metrics based on the
	max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
	B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
	Discrepancy(B) > Discrepancy(A).

	Two variants of discrepancy can be computed:

	Relative discrepancy is following the original definition of
	discrepancy. It characterized the largest area of jank, relative to the
	duration of the entire time stamp series. We normalize the raw results,
	because the best case discrepancy for a set of N samples is 1/N (for
	equally spaced samples), and we want our metric to report 0.0 in that
	case.

	Absolute discrepancy also characterizes the largest area of jank, but its
	value wouldn't change (except for imprecisions due to a low
	\|interval_multiplier\|) if additional 'good' intervals were added to an
	exisiting list of time stamps. Its range is [0,inf] and the unit is
	milliseconds.

	The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
	absolute discrepancy, but D has lower relative discrepancy than C.

	\|timestamps\| may be a list of lists S = [S_1, S_2, ..., S_N], where each
	S_i is a time stamp series. In that case, the discrepancy D(S) is:
	D(S) = max(D(S_1), D(S_2), ..., D(S_N))
	"""
	if not timestamps:
	return 0.0

	if isinstance(timestamps[0], list):
	range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps]
	return max(range_discrepancies)

	samples, sample_scale = NormalizeSamples(timestamps)
	discrepancy = Discrepancy(samples, location_count)
	inv_sample_count = 1.0 / len(samples)
	if absolute:
	# Compute absolute discrepancy
	discrepancy /= sample_scale
	else:
	# Compute relative discrepancy
	discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count))
	return discrepancy


	def DurationsDiscrepancy(durations, absolute=True,
	location_count=None):
	"""A discrepancy based metric for measuring duration jank.

	DurationsDiscrepancy computes a jank metric which measures how irregular a
	given sequence of intervals is. In order to minimize jank, each duration
	should be equally long. This is similar to how timestamp jank works,
	and we therefore reuse the timestamp discrepancy function above to compute a
	similar duration discrepancy number.

	Because timestamp discrepancy is defined in terms of timestamps, we first
	convert the list of durations to monotonically increasing timestamps.

	Args:
	durations: List of interval lengths in milliseconds.
	absolute: See TimestampsDiscrepancy.
	interval_multiplier: See TimestampsDiscrepancy.
	"""
	if not durations:
	return 0.0

	timestamps = reduce(lambda x, y: x + [x[-1] + y], durations, [0])
	return TimestampsDiscrepancy(timestamps, absolute, location_count)


	def ArithmeticMean(data):
	"""Calculates arithmetic mean.

	Args:
	data: A list of samples.

	Returns:
	The arithmetic mean value, or 0 if the list is empty.
	"""
	numerator_total = Total(data)
	denominator_total = Total(len(data))
	return DivideIfPossibleOrZero(numerator_total, denominator_total)


	def StandardDeviation(data):
	"""Calculates the standard deviation.

	Args:
	data: A list of samples.

	Returns:
	The standard deviation of the samples provided.
	"""
	if len(data) == 1:
	return 0.0

	mean = ArithmeticMean(data)
	variances = [float(x) - mean for x in data]
	variances = [x * x for x in variances]
	std_dev = math.sqrt(ArithmeticMean(variances))

	return std_dev


	def TrapezoidalRule(data, dx):
	""" Calculate the integral according to the trapezoidal rule

	TrapezoidalRule approximates the definite integral of f from a to b by
	the composite trapezoidal rule, using n subintervals.
	http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid

	Args:
	data: A list of samples
	dx: The uniform distance along the x axis between any two samples

	Returns:
	The area under the curve defined by the samples and the uniform distance
	according to the trapezoidal rule.
	"""

	n = len(data) - 1
	s = data[0] + data[n]

	if n == 0:
	return 0.0

	for i in range(1, n):
	s += 2 * data[i]

	return s * dx / 2.0

	def Total(data):
	"""Returns the float value of a number or the sum of a list."""
	if type(data) == float:
	total = data
	elif type(data) == int:
	total = float(data)
	elif type(data) == list:
	total = float(sum(data))
	else:
	raise TypeError
	return total


	def DivideIfPossibleOrZero(numerator, denominator):
	"""Returns the quotient, or zero if the denominator is zero."""
	return (float(numerator) / float(denominator)) if denominator else 0


	def GeneralizedMean(values, exponent):
	"""See http://en.wikipedia.org/wiki/Generalized_mean"""
	if not values:
	return 0.0
	sum_of_powers = 0.0
	for v in values:
	sum_of_powers += v ** exponent
	return (sum_of_powers / len(values)) ** (1.0/exponent)


	def Median(values):
	"""Gets the median of a list of values."""
	return Percentile(values, 50)


	def Percentile(values, percentile):
	"""Calculates the value below which a given percentage of values fall.

	For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
	percentile for this set of values. When the percentage doesn't exactly
	match a rank in the list of values, the percentile is computed using linear
	interpolation between closest ranks.

	Args:
	values: A list of numerical values.
	percentile: A number between 0 and 100.

	Returns:
	The Nth percentile for the list of values, where N is the given percentage.
	"""
	if not values:
	return 0.0
	sorted_values = sorted(values)
	n = len(values)
	percentile /= 100.0
	if percentile <= 0.5 / n:
	return sorted_values[0]
	elif percentile >= (n - 0.5) / n:
	return sorted_values[-1]
	else:
	floor_index = int(math.floor(n * percentile - 0.5))
	floor_value = sorted_values[floor_index]
	ceil_value = sorted_values[floor_index+1]
	alpha = n * percentile - 0.5 - floor_index
	return floor_value + alpha * (ceil_value - floor_value)


	def GeometricMean(values):
	"""Compute a rounded geometric mean from an array of values."""
	if not values:
	return None
	# To avoid infinite value errors, make sure no value is less than 0.001.
	new_values = []
	for value in values:
	if value > 0.001:
	new_values.append(value)
	else:
	new_values.append(0.001)
	# Compute the sum of the log of the values.
	log_sum = sum(map(math.log, new_values))
	# Raise e to that sum over the number of values.
	mean = math.pow(math.e, (log_sum / len(new_values)))
	# Return the rounded mean.
	return int(round(mean))