/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math.stat.descriptive.rank;
import java.io.Serializable;
import java.util.Arrays;
import org.apache.commons.math.MathRuntimeException;
import org.apache.commons.math.exception.util.LocalizedFormats;
import org.apache.commons.math.stat.descriptive.AbstractUnivariateStatistic;
import org.apache.commons.math.util.FastMath;
/**
* Provides percentile computation.
* <p>
* There are several commonly used methods for estimating percentiles (a.k.a.
* quantiles) based on sample data. For large samples, the different methods
* agree closely, but when sample sizes are small, different methods will give
* significantly different results. The algorithm implemented here works as follows:
* <ol>
* <li>Let <code>n</code> be the length of the (sorted) array and
* <code>0 < p <= 100</code> be the desired percentile.</li>
* <li>If <code> n = 1 </code> return the unique array element (regardless of
* the value of <code>p</code>); otherwise </li>
* <li>Compute the estimated percentile position
* <code> pos = p * (n + 1) / 100</code> and the difference, <code>d</code>,
* between <code>pos</code> and <code>floor(pos)</code> (i.e. the fractional
* part of <code>pos</code>). If <code>pos < 1</code> return the smallest
* element in the array; if <code>pos >= n</code> return the largest
* element in the array; otherwise</li>
* <li>Let <code>lower</code> be the element in position
* <code>floor(pos)</code> in the array and let <code>upper</code> be the
* next element in the array. Return <code>lower + d * (upper - lower)</code>
* </li>
* </ol></p>
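* <p>
* For example (an illustrative application of the steps above), for the sorted
* array <code>{1, 2, 3, 4}</code> and <code>p = 75</code>:
* <code>pos = 75 * 5 / 100 = 3.75</code>, so <code>d = 0.75</code>,
* <code>lower</code> is the element in position 3 (i.e. <code>3</code>),
* <code>upper</code> is <code>4</code>, and the returned estimate is
* <code>3 + 0.75 * (4 - 3) = 3.75</code>.</p>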
* <p>
* To compute percentiles, the data must be at least partially ordered. Input
* arrays are copied and recursively partitioned using an ordering definition.
* The ordering used by <code>Arrays.sort(double[])</code> is the one determined
* by {@link java.lang.Double#compareTo(Double)}. This ordering makes
* <code>Double.NaN</code> larger than any other value (including
* <code>Double.POSITIVE_INFINITY</code>). Therefore, for example, the median
* (50th percentile) of
* <code>{0, 1, 2, 3, 4, Double.NaN}</code> evaluates to <code>2.5</code>.</p>
* <p>
* Since percentile estimation usually involves interpolation between array
* elements, arrays containing <code>NaN</code> or infinite values will often
* result in <code>NaN</code> or infinite values being returned.</p>
* <p>
* Since 2.2, the Percentile implementation uses only selection instead of
* complete sorting and caches selection algorithm state between calls to the
* various {@code evaluate} methods when several percentiles are to be computed
* on the same data. This greatly improves efficiency, both for single
* percentile and multiple percentile computations. However, it also requires
* making sure that the data passed to one call to {@code evaluate} is the same
* as the data for which the algorithm state was cached by the previous calls.
* Percentile does this by checking the array reference itself and a checksum
* of its content by default. If the user already knows that the array passed to
* {@code evaluate} is immutable, this checking time can be saved by calling the
* {@code evaluate} methods that do <em>not</em> perform this check.
* </p>
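* <p>
* A minimal usage sketch (variable names and data are illustrative only; the
* results shown in the comments follow the estimation algorithm described
* above): to compute several percentiles of the same data while reusing the
* cached selection state, set the data once and then call
* {@code evaluate(double)} for each percentile:
* <pre>
* double[] data = new double[] { 2.0, 7.0, 1.0, 5.0, 3.0 };
* Percentile percentile = new Percentile();
* percentile.setData(data);
* double q25    = percentile.evaluate(25.0); // 1.5
* double median = percentile.evaluate(50.0); // 3.0
* double q75    = percentile.evaluate(75.0); // 6.0
* </pre>
* </p>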
* <p>
* <strong>Note that this implementation is not synchronized.</strong> If
* multiple threads access an instance of this class concurrently, and at least
* one of the threads invokes the <code>setData()</code> or
* <code>evaluate()</code> methods, access must be synchronized externally.</p>
*
* @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $
*/
public class Percentile extends AbstractUnivariateStatistic implements Serializable {
/** Serializable version identifier */
private static final long serialVersionUID = -8091216485095130416L;
/** Minimum size under which we use a simple insertion sort rather than Hoare's select. */
private static final int MIN_SELECT_SIZE = 15;
/** Maximum number of partitioning pivots cached (each level doubles the number of pivots). */
private static final int MAX_CACHED_LEVELS = 10;
/** Determines what percentile is computed when evaluate() is called
* with no quantile argument */
private double quantile = 0.0;
/** Cached pivots. */
private int[] cachedPivots;
/**
* Constructs a Percentile with a default quantile
* value of 50.0.
*/
public Percentile() {
this(50.0);
}
/**
* Constructs a Percentile with the specified quantile value.
* @param p the quantile
* @throws IllegalArgumentException if p is not greater than 0 and less
* than or equal to 100
*/
public Percentile(final double p) {
setQuantile(p);
cachedPivots = null;
}
/**
* Copy constructor, creates a new {@code Percentile} identical
* to the {@code original}
*
* @param original the {@code Percentile} instance to copy
*/
public Percentile(Percentile original) {
copy(original, this);
}
/** {@inheritDoc} */
@Override
public void setData(final double[] values) {
if (values == null) {
cachedPivots = null;
} else {
cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
Arrays.fill(cachedPivots, -1);
}
super.setData(values);
}
/** {@inheritDoc} */
@Override
public void setData(final double[] values, final int begin, final int length) {
if (values == null) {
cachedPivots = null;
} else {
cachedPivots = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
Arrays.fill(cachedPivots, -1);
}
super.setData(values, begin, length);
}
/**
* Returns the result of evaluating the statistic over the stored data.
* <p>
* The stored array is the one which was set by previous calls to
* {@link #setData(double[])}.
* </p>
* @param p the percentile value to compute
* @return the value of the statistic applied to the stored data
*/
public double evaluate(final double p) {
return evaluate(getDataRef(), p);
}
/**
* Returns an estimate of the <code>p</code>th percentile of the values
* in the <code>values</code> array.
* <p>
* Calls to this method do not modify the internal <code>quantile</code>
* state of this statistic.</p>
* <p>
* <ul>
* <li>Returns <code>Double.NaN</code> if <code>values</code> has length
* <code>0</code></li>
* <li>Returns (for any value of <code>p</code>) <code>values[0]</code>
* if <code>values</code> has length <code>1</code></li>
* <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
* is null or p is not a valid quantile value (p must be greater than 0
* and less than or equal to 100) </li>
* </ul></p>
* <p>
* See {@link Percentile} for a description of the percentile estimation
* algorithm used.</p>
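* <p>
* A brief illustration (the data values are arbitrary; the commented result
* follows the estimation algorithm described in the class documentation):
* <pre>
* double[] data = new double[] { 10.0, 20.0, 30.0, 40.0, 50.0 };
* double q40 = new Percentile().evaluate(data, 40.0); // 24.0
* </pre>
* </p>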
*
* @param values input array of values
* @param p the percentile value to compute
* @return the percentile value or Double.NaN if the array is empty
* @throws IllegalArgumentException if <code>values</code> is null
* or p is invalid
*/
public double evaluate(final double[] values, final double p) {
test(values, 0, 0);
return evaluate(values, 0, values.length, p);
}
/**
* Returns an estimate of the <code>quantile</code>th percentile of the
* designated values in the <code>values</code> array. The quantile
* estimated is determined by the <code>quantile</code> property.
* <p>
* <ul>
* <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
* <li>Returns (for any value of <code>quantile</code>)
* <code>values[begin]</code> if <code>length = 1 </code></li>
* <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
* is null, or <code>start</code> or <code>length</code>
* is invalid</li>
* </ul></p>
* <p>
* See {@link Percentile} for a description of the percentile estimation
* algorithm used.</p>
*
* @param values the input array
* @param start index of the first array element to include
* @param length the number of elements to include
* @return the percentile value
* @throws IllegalArgumentException if the parameters are not valid
*
*/
@Override
public double evaluate(final double[] values, final int start, final int length) {
return evaluate(values, start, length, quantile);
}
/**
* Returns an estimate of the <code>p</code>th percentile of the values
* in the <code>values</code> array, starting with the element in (0-based)
* position <code>begin</code> in the array and including <code>length</code>
* values.
* <p>
* Calls to this method do not modify the internal <code>quantile</code>
* state of this statistic.</p>
* <p>
* <ul>
* <li>Returns <code>Double.NaN</code> if <code>length = 0</code></li>
* <li>Returns (for any value of <code>p</code>) <code>values[begin]</code>
* if <code>length = 1 </code></li>
* <li>Throws <code>IllegalArgumentException</code> if <code>values</code>
* is null, <code>begin</code> or <code>length</code> is invalid, or
* <code>p</code> is not a valid quantile value (p must be greater than 0
* and less than or equal to 100)</li>
* </ul></p>
* <p>
* See {@link Percentile} for a description of the percentile estimation
* algorithm used.</p>
*
* @param values array of input values
* @param begin the first (0-based) element to include in the computation
* @param length the number of array elements to include
* @param p the percentile to compute
* @return the percentile value
* @throws IllegalArgumentException if the parameters are not valid or the
* input array is null
*/
public double evaluate(final double[] values, final int begin,
final int length, final double p) {
test(values, begin, length);
if ((p > 100) || (p <= 0)) {
throw MathRuntimeException.createIllegalArgumentException(
LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p);
}
if (length == 0) {
return Double.NaN;
}
if (length == 1) {
return values[begin]; // always return single value for n = 1
}
double n = length;
double pos = p * (n + 1) / 100;
double fpos = FastMath.floor(pos);
int intPos = (int) fpos;
double dif = pos - fpos;
double[] work;
int[] pivotsHeap;
if (values == getDataRef()) {
work = getDataRef();
pivotsHeap = cachedPivots;
} else {
work = new double[length];
System.arraycopy(values, begin, work, 0, length);
pivotsHeap = new int[(0x1 << MAX_CACHED_LEVELS) - 1];
Arrays.fill(pivotsHeap, -1);
}
if (pos < 1) {
return select(work, pivotsHeap, 0);
}
if (pos >= n) {
return select(work, pivotsHeap, length - 1);
}
double lower = select(work, pivotsHeap, intPos - 1);
double upper = select(work, pivotsHeap, intPos);
return lower + dif * (upper - lower);
}
/**
* Select the k<sup>th</sup> smallest element from the work array
* @param work work array (will be reorganized during the call)
* @param pivotsHeap set of pivot indices corresponding to elements that
* are already at their sorted location, stored as an implicit heap
* (i.e. a binary tree stored in a flat array, where the
* children of the node at index n are at indices 2n+1 for the left
* child and 2n+2 for the right child, with 0-based indices)
* @param k index of the desired element
* @return k<sup>th</sup> smallest element
*/
private double select(final double[] work, final int[] pivotsHeap, final int k) {
int begin = 0;
int end = work.length;
int node = 0;
while (end - begin > MIN_SELECT_SIZE) {
final int pivot;
if ((node < pivotsHeap.length) && (pivotsHeap[node] >= 0)) {
// the pivot has already been found in a previous call
// and the array has already been partitioned around it
pivot = pivotsHeap[node];
} else {
// select a pivot and partition work array around it
pivot = partition(work, begin, end, medianOf3(work, begin, end));
if (node < pivotsHeap.length) {
pivotsHeap[node] = pivot;
}
}
if (k == pivot) {
// the pivot was exactly the element we wanted
return work[k];
} else if (k < pivot) {
// the element is in the left partition
end = pivot;
node = Math.min(2 * node + 1, pivotsHeap.length); // the min is here to avoid integer overflow
} else {
// the element is in the right partition
begin = pivot + 1;
node = Math.min(2 * node + 2, pivotsHeap.length); // the min is here to avoid integer overflow
}
}
// the element is somewhere in the small sub-array
// sort the sub-array using insertion sort
insertionSort(work, begin, end);
return work[k];
}
/** Select a pivot index as the median of three
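* (for example, under this scheme the three candidates for the slice
* <code>{7, 2, 5}</code> are <code>7</code>, <code>2</code> and <code>5</code>;
* their median is <code>5</code>, so the index of the last element is returned)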
* @param work data array
* @param begin index of the first element of the slice
* @param end index after the last element of the slice
* @return the index of the median element chosen between the
* first, the middle and the last element of the array slice
*/
int medianOf3(final double[] work, final int begin, final int end) {
final int inclusiveEnd = end - 1;
final int middle = begin + (inclusiveEnd - begin) / 2;
final double wBegin = work[begin];
final double wMiddle = work[middle];
final double wEnd = work[inclusiveEnd];
if (wBegin < wMiddle) {
if (wMiddle < wEnd) {
return middle;
} else {
return (wBegin < wEnd) ? inclusiveEnd : begin;
}
} else {
if (wBegin < wEnd) {
return begin;
} else {
return (wMiddle < wEnd) ? inclusiveEnd : middle;
}
}
}
/**
* Partition an array slice around a pivot
* <p>
* Partitioning exchanges array elements such that all elements
* smaller than pivot are before it and all elements larger than
* pivot are after it
* </p>
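* <p>
* For example (an illustrative call; the relative order of elements on each
* side of the pivot is implementation-dependent): partitioning
* <code>{3, 1, 4, 1, 5, 9, 2, 6}</code> around the element with value
* <code>4</code> moves <code>4</code> to the index that is returned, with the
* values <code>{1, 1, 2, 3}</code> placed before it and <code>{5, 6, 9}</code>
* placed after it, each group in some unspecified order.
* </p>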
* @param work data array
* @param begin index of the first element of the slice
* @param end index after the last element of the slice
* @param pivot initial index of the pivot
* @return index of the pivot after partition
*/
private int partition(final double[] work, final int begin, final int end, final int pivot) {
final double value = work[pivot];
work[pivot] = work[begin];
int i = begin + 1;
int j = end - 1;
while (i < j) {
while ((i < j) && (work[j] >= value)) {
--j;
}
while ((i < j) && (work[i] <= value)) {
++i;
}
if (i < j) {
final double tmp = work[i];
work[i++] = work[j];
work[j--] = tmp;
}
}
if ((i >= end) || (work[i] > value)) {
--i;
}
work[begin] = work[i];
work[i] = value;
return i;
}
/**
* Sort in place a (small) array slice using insertion sort
* @param work array to sort
* @param begin index of the first element of the slice to sort
* @param end index after the last element of the slice to sort
*/
private void insertionSort(final double[] work, final int begin, final int end) {
for (int j = begin + 1; j < end; j++) {
final double saved = work[j];
int i = j - 1;
while ((i >= begin) && (saved < work[i])) {
work[i + 1] = work[i];
i--;
}
work[i + 1] = saved;
}
}
/**
* Returns the value of the quantile field (determines what percentile is
* computed when evaluate() is called with no quantile argument).
*
* @return quantile
*/
public double getQuantile() {
return quantile;
}
/**
* Sets the value of the quantile field (determines what percentile is
* computed when evaluate() is called with no quantile argument).
*
* @param p a value such that 0 < p <= 100
* @throws IllegalArgumentException if p is not greater than 0 and less
* than or equal to 100
*/
public void setQuantile(final double p) {
if (p <= 0 || p > 100) {
throw MathRuntimeException.createIllegalArgumentException(
LocalizedFormats.OUT_OF_BOUNDS_QUANTILE_VALUE, p);
}
quantile = p;
}
/**
* {@inheritDoc}
*/
@Override
public Percentile copy() {
Percentile result = new Percentile();
copy(this, result);
return result;
}
/**
* Copies source to dest.
* <p>Neither source nor dest can be null.</p>
*
* @param source Percentile to copy
* @param dest Percentile to copy to
* @throws NullPointerException if either source or dest is null
*/
public static void copy(Percentile source, Percentile dest) {
dest.setData(source.getDataRef());
if (source.cachedPivots != null) {
System.arraycopy(source.cachedPivots, 0, dest.cachedPivots, 0, source.cachedPivots.length);
}
dest.quantile = source.quantile;
}
}