| /* |
| * Copyright (C) 2012 The Guava Authors |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
| * in compliance with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software distributed under the License |
| * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
| * or implied. See the License for the specific language governing permissions and limitations under |
| * the License. |
| */ |
| |
| package com.google.common.math; |
| |
| import static com.google.common.base.Preconditions.checkState; |
| import static com.google.common.primitives.Doubles.isFinite; |
| import static java.lang.Double.NaN; |
| import static java.lang.Double.isNaN; |
| |
| import com.google.common.annotations.Beta; |
| import com.google.common.annotations.GwtIncompatible; |
| |
| /** |
| * A mutable object which accumulates paired double values (e.g. points on a plane) and tracks some |
| * basic statistics over all the values added so far. This class is not thread safe. |
| * |
| * @author Pete Gillin |
| * @since 20.0 |
| */ |
| @Beta |
| @GwtIncompatible |
| public final class PairedStatsAccumulator { |
| |
| // These fields must satisfy the requirements of PairedStats' constructor as well as those of the |
| // stat methods of this class. |
| private final StatsAccumulator xStats = new StatsAccumulator(); |
| private final StatsAccumulator yStats = new StatsAccumulator(); |
| private double sumOfProductsOfDeltas = 0.0; |
| |
| /** Adds the given pair of values to the dataset. */ |
| public void add(double x, double y) { |
| // We extend the recursive expression for the one-variable case at Art of Computer Programming |
| // vol. 2, Knuth, 4.2.2, (16) to the two-variable case. We have two value series x_i and y_i. |
| // We define the arithmetic means X_n = 1/n \sum_{i=1}^n x_i, and Y_n = 1/n \sum_{i=1}^n y_i. |
| // We also define the sum of the products of the differences from the means |
| // C_n = \sum_{i=1}^n x_i y_i - n X_n Y_n |
| // for all n >= 1. Then for all n > 1: |
| // C_{n-1} = \sum_{i=1}^{n-1} x_i y_i - (n-1) X_{n-1} Y_{n-1} |
| // C_n - C_{n-1} = x_n y_n - n X_n Y_n + (n-1) X_{n-1} Y_{n-1} |
| // = x_n y_n - X_n [ y_n + (n-1) Y_{n-1} ] + [ n X_n - x_n ] Y_{n-1} |
| // = x_n y_n - X_n y_n - x_n Y_{n-1} + X_n Y_{n-1} |
| // = (x_n - X_n) (y_n - Y_{n-1}) |
| xStats.add(x); |
| if (isFinite(x) && isFinite(y)) { |
| if (xStats.count() > 1) { |
| sumOfProductsOfDeltas += (x - xStats.mean()) * (y - yStats.mean()); |
| } |
| } else { |
| sumOfProductsOfDeltas = NaN; |
| } |
| yStats.add(y); |
| } |
| |
| /** |
| * Adds the given statistics to the dataset, as if the individual values used to compute the |
| * statistics had been added directly. |
| */ |
| public void addAll(PairedStats values) { |
| if (values.count() == 0) { |
| return; |
| } |
| |
| xStats.addAll(values.xStats()); |
| if (yStats.count() == 0) { |
| sumOfProductsOfDeltas = values.sumOfProductsOfDeltas(); |
| } else { |
| // This is a generalized version of the calculation in add(double, double) above. Note that |
| // non-finite inputs will have sumOfProductsOfDeltas = NaN, so non-finite values will result |
| // in NaN naturally. |
| sumOfProductsOfDeltas += |
| values.sumOfProductsOfDeltas() |
| + (values.xStats().mean() - xStats.mean()) |
| * (values.yStats().mean() - yStats.mean()) |
| * values.count(); |
| } |
| yStats.addAll(values.yStats()); |
| } |
| |
| /** Returns an immutable snapshot of the current statistics. */ |
| public PairedStats snapshot() { |
| return new PairedStats(xStats.snapshot(), yStats.snapshot(), sumOfProductsOfDeltas); |
| } |
| |
| /** Returns the number of pairs in the dataset. */ |
| public long count() { |
| return xStats.count(); |
| } |
| |
| /** Returns an immutable snapshot of the statistics on the {@code x} values alone. */ |
| public Stats xStats() { |
| return xStats.snapshot(); |
| } |
| |
| /** Returns an immutable snapshot of the statistics on the {@code y} values alone. */ |
| public Stats yStats() { |
| return yStats.snapshot(); |
| } |
| |
| /** |
| * Returns the population covariance of the values. The count must be non-zero. |
| * |
| * <p>This is guaranteed to return zero if the dataset contains a single pair of finite values. It |
| * is not guaranteed to return zero when the dataset consists of the same pair of values multiple |
| * times, due to numerical errors. |
| * |
| * <h3>Non-finite values</h3> |
| * |
| * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link |
| * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. |
| * |
| * @throws IllegalStateException if the dataset is empty |
| */ |
| public double populationCovariance() { |
| checkState(count() != 0); |
| return sumOfProductsOfDeltas / count(); |
| } |
| |
| /** |
| * Returns the sample covariance of the values. The count must be greater than one. |
| * |
| * <p>This is not guaranteed to return zero when the dataset consists of the same pair of values |
| * multiple times, due to numerical errors. |
| * |
| * <h3>Non-finite values</h3> |
| * |
| * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link |
| * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. |
| * |
| * @throws IllegalStateException if the dataset is empty or contains a single pair of values |
| */ |
| public final double sampleCovariance() { |
| checkState(count() > 1); |
| return sumOfProductsOfDeltas / (count() - 1); |
| } |
| |
| /** |
| * Returns the <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">Pearson's or |
| * product-moment correlation coefficient</a> of the values. The count must greater than one, and |
| * the {@code x} and {@code y} values must both have non-zero population variance (i.e. {@code |
| * xStats().populationVariance() > 0.0 && yStats().populationVariance() > 0.0}). The result is not |
| * guaranteed to be exactly +/-1 even when the data are perfectly (anti-)correlated, due to |
| * numerical errors. However, it is guaranteed to be in the inclusive range [-1, +1]. |
| * |
| * <h3>Non-finite values</h3> |
| * |
| * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link |
| * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link Double#NaN}. |
| * |
| * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or |
| * either the {@code x} and {@code y} dataset has zero population variance |
| */ |
| public final double pearsonsCorrelationCoefficient() { |
| checkState(count() > 1); |
| if (isNaN(sumOfProductsOfDeltas)) { |
| return NaN; |
| } |
| double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); |
| double ySumOfSquaresOfDeltas = yStats.sumOfSquaresOfDeltas(); |
| checkState(xSumOfSquaresOfDeltas > 0.0); |
| checkState(ySumOfSquaresOfDeltas > 0.0); |
| // The product of two positive numbers can be zero if the multiplication underflowed. We |
| // force a positive value by effectively rounding up to MIN_VALUE. |
| double productOfSumsOfSquaresOfDeltas = |
| ensurePositive(xSumOfSquaresOfDeltas * ySumOfSquaresOfDeltas); |
| return ensureInUnitRange(sumOfProductsOfDeltas / Math.sqrt(productOfSumsOfSquaresOfDeltas)); |
| } |
| |
| /** |
| * Returns a linear transformation giving the best fit to the data according to <a |
| * href="http://mathworld.wolfram.com/LeastSquaresFitting.html">Ordinary Least Squares linear |
| * regression</a> of {@code y} as a function of {@code x}. The count must be greater than one, and |
| * either the {@code x} or {@code y} data must have a non-zero population variance (i.e. {@code |
| * xStats().populationVariance() > 0.0 || yStats().populationVariance() > 0.0}). The result is |
| * guaranteed to be horizontal if there is variance in the {@code x} data but not the {@code y} |
| * data, and vertical if there is variance in the {@code y} data but not the {@code x} data. |
| * |
| * <p>This fit minimizes the root-mean-square error in {@code y} as a function of {@code x}. This |
| * error is defined as the square root of the mean of the squares of the differences between the |
| * actual {@code y} values of the data and the values predicted by the fit for the {@code x} |
| * values (i.e. it is the square root of the mean of the squares of the vertical distances between |
| * the data points and the best fit line). For this fit, this error is a fraction {@code sqrt(1 - |
| * R*R)} of the population standard deviation of {@code y}, where {@code R} is the Pearson's |
| * correlation coefficient (as given by {@link #pearsonsCorrelationCoefficient()}). |
| * |
| * <p>The corresponding root-mean-square error in {@code x} as a function of {@code y} is a |
| * fraction {@code sqrt(1/(R*R) - 1)} of the population standard deviation of {@code x}. This fit |
| * does not normally minimize that error: to do that, you should swap the roles of {@code x} and |
| * {@code y}. |
| * |
| * <h3>Non-finite values</h3> |
| * |
| * <p>If the dataset contains any non-finite values ({@link Double#POSITIVE_INFINITY}, {@link |
| * Double#NEGATIVE_INFINITY}, or {@link Double#NaN}) then the result is {@link |
| * LinearTransformation#forNaN()}. |
| * |
| * @throws IllegalStateException if the dataset is empty or contains a single pair of values, or |
| * both the {@code x} and {@code y} dataset have zero population variance |
| */ |
| public final LinearTransformation leastSquaresFit() { |
| checkState(count() > 1); |
| if (isNaN(sumOfProductsOfDeltas)) { |
| return LinearTransformation.forNaN(); |
| } |
| double xSumOfSquaresOfDeltas = xStats.sumOfSquaresOfDeltas(); |
| if (xSumOfSquaresOfDeltas > 0.0) { |
| if (yStats.sumOfSquaresOfDeltas() > 0.0) { |
| return LinearTransformation.mapping(xStats.mean(), yStats.mean()) |
| .withSlope(sumOfProductsOfDeltas / xSumOfSquaresOfDeltas); |
| } else { |
| return LinearTransformation.horizontal(yStats.mean()); |
| } |
| } else { |
| checkState(yStats.sumOfSquaresOfDeltas() > 0.0); |
| return LinearTransformation.vertical(xStats.mean()); |
| } |
| } |
| |
| private double ensurePositive(double value) { |
| if (value > 0.0) { |
| return value; |
| } else { |
| return Double.MIN_VALUE; |
| } |
| } |
| |
| private static double ensureInUnitRange(double value) { |
| if (value >= 1.0) { |
| return 1.0; |
| } |
| if (value <= -1.0) { |
| return -1.0; |
| } |
| return value; |
| } |
| } |