guava/src/com/google/common/math/PairedStatsAccumulator.java - platform/external/guava - Git at Google

 /*
  * Copyright (C) 2012 The Guava Authors
  *
  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
  * in compliance with the License. You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software distributed under the License
  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
  * or implied. See the License for the specific language governing permissions and limitations under
  * the License.
  */

 package com.google.common.math;

 import static com.google.common.base.Preconditions.checkState;
 import static com.google.common.primitives.Doubles.isFinite;
 import static java.lang.Double.NaN;
 import static java.lang.Double.isNaN;

 import com.google.common.annotations.Beta;
 import com.google.common.annotations.GwtIncompatible;

 /**
  * A mutable object which accumulates paired double values (e.g. points on a plane) and tracks some
  * basic statistics over all the values added so far. This class is not thread safe.
  *
  * @author Pete Gillin
  * @since 20.0
  */
 @Beta
 @GwtIncompatible
 public final class PairedStatsAccumulator {

   // These fields must satisfy the requirements of PairedStats' constructor as well as those of the
   // stat methods of this class.
   private final StatsAccumulator xStats = new StatsAccumulator();
   private final StatsAccumulator yStats = new StatsAccumulator();
   private double sumOfProductsOfDeltas = 0.0;

   /** Adds the given pair of values to the dataset. */
   public void add(double x, double y) {
     // We extend the recursive expression for the one-variable case at Art of Computer Programming
     // vol. 2, Knuth, 4.2.2, (16) to the two-variable case. We have two value series x_i and y_i.
     // We define the arithmetic means X_n = 1/n \sum_{i=1}^n x_i, and Y_n = 1/n \sum_{i=1}^n y_i.
     // We also define the sum of the products of the differences from the means
     //           C_n = \sum_{i=1}^n x_i y_i - n X_n Y_n
     // for all n >= 1. Then for all n > 1:
     //       C_{n-1} = \sum_{i=1}^{n-1} x_i y_i - (n-1) X_{n-1} Y_{n-1}
     // C_n - C_{n-1} = x_n y_n - n X_n Y_n + (n-1) X_{n-1} Y_{n-1}
     //               = x_n y_n - X_n [ y_n + (n-1) Y_{n-1} ] + [ n X_n - x_n ] Y_{n-1}
     //               = x_n y_n - X_n y_n - x_n Y_{n-1} + X_n Y_{n-1}
     //               = (x_n - X_n) (y_n - Y_{n-1})
     xStats.add(x);
     if (isFinite(x) && isFinite(y)) {
       if (xStats.count() > 1) {
         sumOfProductsOfDeltas += (x - xStats.mean()) * (y - yStats.mean());
       }
     } else {
       sumOfProductsOfDeltas = NaN;
     }
     yStats.add(y);
   }

   /**
    * Adds the given statistics to the dataset, as if the individual values used to compute the
    * statistics had been added directly.
    */
   public void addAll(PairedStats values) {
     if (values.count() == 0) {
       return;
     }

     xStats.addAll(values.xStats());
     if (yStats.count() == 0) {
       sumOfProductsOfDeltas = values.sumOfProductsOfDeltas();
     } else {
       // This is a generalized version of the calculation in add(double, double) above. Note that
       // non-finite inputs will have sumOfProductsOfDeltas = NaN, so non-finite values will result
       // in NaN naturally.
       sumOfProductsOfDeltas +=
           values.sumOfProductsOfDeltas()
               + (values.xStats().mean() - xStats.mean())
                   * (values.yStats().mean() - yStats.mean())
                   * values.count();
     }
     yStats.addAll(values.yStats());
   }

   /** Returns an immutable snapshot of the current statistics. */
   public PairedStats snapshot() {
     return new PairedStats(xStats.snapshot(), yStats.snapshot(), sumOfProductsOfDeltas);
   }

   /** Returns the number of pairs in the dataset. */
   public long count() {
     return xStats.count();
   }

   /** Returns an immutable snapshot of the statistics on the {@code x} values alone. */
   public Stats xStats() {
     return xStats.snapshot();
   }

   /** Returns an immutable snapshot of the statistics on the {@code y} values alone. */
   public Stats yStats() {
     return yStats.snapshot();
   }

   /**
    * Returns the population covariance of the values. The count must be non-zero.
    *
    * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It
    * is not guaranteed to return zero when the dataset consists of the same pair of values multiple
    * times, due to numerical errors.
    *
    * <h3>Non-finite values</h3>
    *
    * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
    * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
    *
    * @throws IllegalStateException if the dataset is empty
    */
   public double populationCovariance() {
     checkState(count() != 0);
     return sumOfProductsOfDeltas / count();
   }

   /**
    * Returns the sample covariance of the values. The count must be greater than one.
    *
    * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values
    * multiple times, due to numerical errors.
    *
    * <h3>Non-finite values</h3>
    *
    * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
    * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
    *
    * @throws IllegalStateException if the dataset is empty or contains a single pair of values
    */
   public final double sampleCovariance() {
     checkState(count() > 1);
     return sumOfProductsOfDeltas / (count() - 1);
   }

   /**
    * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or
    * product-moment correlation coefficient</a> of the values. The count must greater than one, and
    * the {@code x} and {@code y} values must both have non-zero population variance (i.e. {@code
    * xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result is not
    * guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to
    * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1].
    *
    * <h3>Non-finite values</h3>
    *
    * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
    * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
    *
    * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
    *     either the {@code x} and {@code y} dataset has zero population variance
    */
   public final double pearsonsCorrelationCoefficient() {
     checkState(count() > 1);
     if (isNaN(sumOfProductsOfDeltas)) {
       return NaN;
     }
     double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas();
     double ySumOfSquaresOfDeltas = yStats.sumOfSquaresOfDeltas();
     checkState(xSumOfSquaresOfDeltas > 0.0);
     checkState(ySumOfSquaresOfDeltas > 0.0);
     // The product of two positive numbers can be zero if the multiplication underflowed. We
     // force a positive value by effectively rounding up to MIN_VALUE.
     double productOfSumsOfSquaresOfDeltas =
         ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas);
     return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas));
   }

   /**
    * Returns a linear transformation giving the best fit to the data according to <a
    * href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear
    * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and
    * either the {@code x} or {@code y} data must have a non-zero population variance (i.e. {@code
    * xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result is
    * guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y}
    * data, and vertical if there is variance in the {@code y} data but not the {@code x} data.
    *
    * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This
    * error is defined as the square root of the mean of the squares of the differences between the
    * actual {@code y} values of the data and the values predicted by the fit for the {@code x}
    * values (i.e. it is the square root of the mean of the squares of the vertical distances between
    * the data points and the best fit line). For this fit, this error is a fraction {@code sqrt(1 -
    * R*R)} of the population standard deviation of {@code y}, where {@code R} is the Pearson's
    * correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}).
    *
    * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a
    * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit
    * does not normally minimize that error: to do that, you should swap the roles of {@code x} and
    * {@code y}.
    *
    * <h3>Non-finite values</h3>
    *
    * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
    * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link
    * LinearTransformation#forNaN()}.
    *
    * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
    *     both the {@code x} and {@code y} dataset have zero population variance
    */
   public final LinearTransformation leastSquaresFit() {
     checkState(count() > 1);
     if (isNaN(sumOfProductsOfDeltas)) {
       return LinearTransformation.forNaN();
     }
     double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas();
     if (xSumOfSquaresOfDeltas > 0.0) {
       if (yStats.sumOfSquaresOfDeltas() > 0.0) {
         return LinearTransformation.mapping(xStats.mean(), yStats.mean())
             .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas);
       } else {
         return LinearTransformation.horizontal(yStats.mean());
       }
     } else {
       checkState(yStats.sumOfSquaresOfDeltas() > 0.0);
       return LinearTransformation.vertical(xStats.mean());
     }
   }

   private double ensurePositive(double value) {
     if (value > 0.0) {
       return value;
     } else {
       return Double.MIN_VALUE;
     }
   }

   private static double ensureInUnitRange(double value) {
     if (value >= 1.0) {
       return 1.0;
     }
     if (value <= -1.0) {
       return -1.0;
     }
     return value;
   }
 }
	/*
	* Copyright (C) 2012 The Guava Authors
	*
	* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
	* in compliance with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software distributed under the License
	* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
	* or implied. See the License for the specific language governing permissions and limitations under
	* the License.
	*/

	package com.google.common.math;

	import static com.google.common.base.Preconditions.checkState;
	import static com.google.common.primitives.Doubles.isFinite;
	import static java.lang.Double.NaN;
	import static java.lang.Double.isNaN;

	import com.google.common.annotations.Beta;
	import com.google.common.annotations.GwtIncompatible;

	/**
	* A mutable object which accumulates paired double values (e.g. points on a plane) and tracks some
	* basic statistics over all the values added so far. This class is not thread safe.
	*
	* @author Pete Gillin
	* @since 20.0
	*/
	@Beta
	@GwtIncompatible
	public final class PairedStatsAccumulator {

	// These fields must satisfy the requirements of PairedStats' constructor as well as those of the
	// stat methods of this class.
	private final StatsAccumulator xStats = new StatsAccumulator();
	private final StatsAccumulator yStats = new StatsAccumulator();
	private double sumOfProductsOfDeltas = 0.0;

	/** Adds the given pair of values to the dataset. */
	public void add(double x, double y) {
	// We extend the recursive expression for the one-variable case at Art of Computer Programming
	// vol. 2, Knuth, 4.2.2, (16) to the two-variable case. We have two value series x_i and y_i.
	// We define the arithmetic means X_n = 1/n \sum_{i=1}^n x_i, and Y_n = 1/n \sum_{i=1}^n y_i.
	// We also define the sum of the products of the differences from the means
	// C_n = \sum_{i=1}^n x_i y_i - n X_n Y_n
	// for all n >= 1. Then for all n > 1:
	// C_{n-1} = \sum_{i=1}^{n-1} x_i y_i - (n-1) X_{n-1} Y_{n-1}
	// C_n - C_{n-1} = x_n y_n - n X_n Y_n + (n-1) X_{n-1} Y_{n-1}
	// = x_n y_n - X_n [ y_n + (n-1) Y_{n-1} ] + [ n X_n - x_n ] Y_{n-1}
	// = x_n y_n - X_n y_n - x_n Y_{n-1} + X_n Y_{n-1}
	// = (x_n - X_n) (y_n - Y_{n-1})
	xStats.add(x);
	if (isFinite(x) && isFinite(y)) {
	if (xStats.count() > 1) {
	sumOfProductsOfDeltas += (x - xStats.mean()) * (y - yStats.mean());
	}
	} else {
	sumOfProductsOfDeltas = NaN;
	}
	yStats.add(y);
	}

	/**
	* Adds the given statistics to the dataset, as if the individual values used to compute the
	* statistics had been added directly.
	*/
	public void addAll(PairedStats values) {
	if (values.count() == 0) {
	return;
	}

	xStats.addAll(values.xStats());
	if (yStats.count() == 0) {
	sumOfProductsOfDeltas = values.sumOfProductsOfDeltas();
	} else {
	// This is a generalized version of the calculation in add(double, double) above. Note that
	// non-finite inputs will have sumOfProductsOfDeltas = NaN, so non-finite values will result
	// in NaN naturally.
	sumOfProductsOfDeltas +=
	values.sumOfProductsOfDeltas()
	+ (values.xStats().mean() - xStats.mean())
	* (values.yStats().mean() - yStats.mean())
	* values.count();
	}
	yStats.addAll(values.yStats());
	}

	/** Returns an immutable snapshot of the current statistics. */
	public PairedStats snapshot() {
	return new PairedStats(xStats.snapshot(), yStats.snapshot(), sumOfProductsOfDeltas);
	}

	/** Returns the number of pairs in the dataset. */
	public long count() {
	return xStats.count();
	}

	/** Returns an immutable snapshot of the statistics on the {@code x} values alone. */
	public Stats xStats() {
	return xStats.snapshot();
	}

	/** Returns an immutable snapshot of the statistics on the {@code y} values alone. */
	public Stats yStats() {
	return yStats.snapshot();
	}

	/**
	* Returns the population covariance of the values. The count must be non-zero.
	*
	* <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It
	* is not guaranteed to return zero when the dataset consists of the same pair of values multiple
	* times, due to numerical errors.
	*
	* <h3>Non-finite values</h3>
	*
	* <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
	* Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
	*
	* @throws IllegalStateException if the dataset is empty
	*/
	public double populationCovariance() {
	checkState(count() != 0);
	return sumOfProductsOfDeltas / count();
	}

	/**
	* Returns the sample covariance of the values. The count must be greater than one.
	*
	* <p>This is not guaranteed to return zero when the dataset consists of the same pair of values
	* multiple times, due to numerical errors.
	*
	* <h3>Non-finite values</h3>
	*
	* <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
	* Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
	*
	* @throws IllegalStateException if the dataset is empty or contains a single pair of values
	*/
	public final double sampleCovariance() {
	checkState(count() > 1);
	return sumOfProductsOfDeltas / (count() - 1);
	}

	/**
	* Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or
	* product-moment correlation coefficient</a> of the values. The count must greater than one, and
	* the {@code x} and {@code y} values must both have non-zero population variance (i.e. {@code
	* xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result is not
	* guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to
	* numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1].
	*
	* <h3>Non-finite values</h3>
	*
	* <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
	* Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}.
	*
	* @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
	* either the {@code x} and {@code y} dataset has zero population variance
	*/
	public final double pearsonsCorrelationCoefficient() {
	checkState(count() > 1);
	if (isNaN(sumOfProductsOfDeltas)) {
	return NaN;
	}
	double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas();
	double ySumOfSquaresOfDeltas = yStats.sumOfSquaresOfDeltas();
	checkState(xSumOfSquaresOfDeltas > 0.0);
	checkState(ySumOfSquaresOfDeltas > 0.0);
	// The product of two positive numbers can be zero if the multiplication underflowed. We
	// force a positive value by effectively rounding up to MIN_VALUE.
	double productOfSumsOfSquaresOfDeltas =
	ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas);
	return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas));
	}

	/**
	* Returns a linear transformation giving the best fit to the data according to <a
	* href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear
	* regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and
	* either the {@code x} or {@code y} data must have a non-zero population variance (i.e. {@code
	* xStats().populationVariance() > 0.0 \|\| yStats().populationVariance() > 0.0}). The result is
	* guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y}
	* data, and vertical if there is variance in the {@code y} data but not the {@code x} data.
	*
	* <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This
	* error is defined as the square root of the mean of the squares of the differences between the
	* actual {@code y} values of the data and the values predicted by the fit for the {@code x}
	* values (i.e. it is the square root of the mean of the squares of the vertical distances between
	* the data points and the best fit line). For this fit, this error is a fraction {@code sqrt(1 -
	* R*R)} of the population standard deviation of {@code y}, where {@code R} is the Pearson's
	* correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}).
	*
	* <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a
	* fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit
	* does not normally minimize that error: to do that, you should swap the roles of {@code x} and
	* {@code y}.
	*
	* <h3>Non-finite values</h3>
	*
	* <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link
	* Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link
	* LinearTransformation#forNaN()}.
	*
	* @throws IllegalStateException if the dataset is empty or contains a single pair of values, or
	* both the {@code x} and {@code y} dataset have zero population variance
	*/
	public final LinearTransformation leastSquaresFit() {
	checkState(count() > 1);
	if (isNaN(sumOfProductsOfDeltas)) {
	return LinearTransformation.forNaN();
	}
	double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas();
	if (xSumOfSquaresOfDeltas > 0.0) {
	if (yStats.sumOfSquaresOfDeltas() > 0.0) {
	return LinearTransformation.mapping(xStats.mean(), yStats.mean())
	.withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas);
	} else {
	return LinearTransformation.horizontal(yStats.mean());
	}
	} else {
	checkState(yStats.sumOfSquaresOfDeltas() > 0.0);
	return LinearTransformation.vertical(xStats.mean());
	}
	}

	private double ensurePositive(double value) {
	if (value > 0.0) {
	return value;
	} else {
	return Double.MIN_VALUE;
	}
	}

	private static double ensureInUnitRange(double value) {
	if (value >= 1.0) {
	return 1.0;
	}
	if (value <= -1.0) {
	return -1.0;
	}
	return value;
	}
	}