View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.math.stat.descriptive;
18  
19  import java.io.Serializable;
20  import java.util.Arrays;
21  
22  import org.apache.commons.math.DimensionMismatchException;
23  import org.apache.commons.math.MathRuntimeException;
24  import org.apache.commons.math.linear.RealMatrix;
25  import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
26  import org.apache.commons.math.stat.descriptive.moment.Mean;
27  import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance;
28  import org.apache.commons.math.stat.descriptive.rank.Max;
29  import org.apache.commons.math.stat.descriptive.rank.Min;
30  import org.apache.commons.math.stat.descriptive.summary.Sum;
31  import org.apache.commons.math.stat.descriptive.summary.SumOfLogs;
32  import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
33  import org.apache.commons.math.util.MathUtils;
34  
35  /**
36   * <p>Computes summary statistics for a stream of n-tuples added using the 
37   * {@link #addValue(double[]) addValue} method. The data values are not stored
38   * in memory, so this class can be used to compute statistics for very large
39   * n-tuple streams.</p>
40   * 
41   * <p>The {@link StorelessUnivariateStatistic} instances used to maintain
42   * summary state and compute statistics are configurable via setters.
43   * For example, the default implementation for the mean can be overridden by
44   * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual
45   * parameters to these methods must implement the 
46   * {@link StorelessUnivariateStatistic} interface and configuration must be
47   * completed before <code>addValue</code> is called. No configuration is
48   * necessary to use the default, commons-math provided implementations.</p>
49   * 
50   * <p>To compute statistics for a stream of n-tuples, construct a
51   * MultivariateStatistics instance with dimension n and then use 
52   * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code>
53   * methods where Xxx is a statistic return an array of <code>double</code>
54   * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the
55   * value of the given statistic for data range consisting of the i<sup>th</sup> element of
56   * each of the input n-tuples.  For example, if <code>addValue</code> is called
57   * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8},
58   * <code>getSum</code> will return a three-element array with values
59   * {0+3+6, 1+4+7, 2+5+8}</p>
60   * 
61   * <p>Note: This class is not thread-safe. Use 
62   * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple
63   * threads is required.</p>
64   *
65   * @since 1.2
66   * @version $Revision: 762116 $ $Date: 2009-04-05 12:48:53 -0400 (Sun, 05 Apr 2009) $
67   */
68  public class MultivariateSummaryStatistics
69    implements StatisticalMultivariateSummary, Serializable {
70  
71      /** Serialization UID */
72      private static final long serialVersionUID = 2271900808994826718L;
73  
74      /**
75       * Construct a MultivariateSummaryStatistics instance
76       * @param k dimension of the data
77       * @param isCovarianceBiasCorrected if true, the unbiased sample
78       * covariance is computed, otherwise the biased population covariance
79       * is computed
80       */
81      public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) {
82          this.k = k;
83  
84          sumImpl     = new StorelessUnivariateStatistic[k];
85          sumSqImpl   = new StorelessUnivariateStatistic[k];
86          minImpl     = new StorelessUnivariateStatistic[k];
87          maxImpl     = new StorelessUnivariateStatistic[k];
88          sumLogImpl  = new StorelessUnivariateStatistic[k];
89          geoMeanImpl = new StorelessUnivariateStatistic[k];
90          meanImpl    = new StorelessUnivariateStatistic[k];
91  
92          for (int i = 0; i < k; ++i) {
93              sumImpl[i]     = new Sum();
94              sumSqImpl[i]   = new SumOfSquares();
95              minImpl[i]     = new Min();
96              maxImpl[i]     = new Max();
97              sumLogImpl[i]  = new SumOfLogs();
98              geoMeanImpl[i] = new GeometricMean();
99              meanImpl[i]    = new Mean();
100         }
101 
102         covarianceImpl =
103             new VectorialCovariance(k, isCovarianceBiasCorrected);
104 
105     }
106 
107     /** Dimension of the data. */
108     private int k;
109 
110     /** Count of values that have been added */
111     private long n = 0;
112     
113     /** Sum statistic implementation - can be reset by setter. */
114     private StorelessUnivariateStatistic[] sumImpl;
115     
116     /** Sum of squares statistic implementation - can be reset by setter. */
117     private StorelessUnivariateStatistic[] sumSqImpl;
118     
119     /** Minimum statistic implementation - can be reset by setter. */
120     private StorelessUnivariateStatistic[] minImpl;
121     
122     /** Maximum statistic implementation - can be reset by setter. */
123     private StorelessUnivariateStatistic[] maxImpl;
124     
125     /** Sum of log statistic implementation - can be reset by setter. */
126     private StorelessUnivariateStatistic[] sumLogImpl;
127     
128     /** Geometric mean statistic implementation - can be reset by setter. */
129     private StorelessUnivariateStatistic[] geoMeanImpl;
130     
131     /** Mean statistic implementation - can be reset by setter. */
132     private StorelessUnivariateStatistic[] meanImpl;
133     
134     /** Covariance statistic implementation - cannot be reset. */
135     private VectorialCovariance covarianceImpl;
136 
137     /**
138      * Add an n-tuple to the data
139      * 
140      * @param value  the n-tuple to add
141      * @throws DimensionMismatchException if the length of the array
142      * does not match the one used at construction
143      */
144     public void addValue(double[] value)
145       throws DimensionMismatchException {
146         checkDimension(value.length);
147         for (int i = 0; i < k; ++i) {
148             double v = value[i];
149             sumImpl[i].increment(v);
150             sumSqImpl[i].increment(v);
151             minImpl[i].increment(v);
152             maxImpl[i].increment(v);
153             sumLogImpl[i].increment(v);
154             geoMeanImpl[i].increment(v);
155             meanImpl[i].increment(v);
156         }
157         covarianceImpl.increment(value);
158         n++;
159     }
160 
161     /** 
162      * Returns the dimension of the data
163      * @return The dimension of the data
164      */
165     public int getDimension() {
166         return k;
167     }
168 
169     /** 
170      * Returns the number of available values
171      * @return The number of available values
172      */
173     public long getN() {
174         return n;
175     }
176 
177     /**
178      * Returns an array of the results of a statistic.
179      * @param stats univariate statistic array
180      * @return results array
181      */
182     private double[] getResults(StorelessUnivariateStatistic[] stats) {
183         double[] results = new double[stats.length];
184         for (int i = 0; i < results.length; ++i) {
185             results[i] = stats[i].getResult();
186         }
187         return results;
188     }
189 
190     /**
191      * Returns an array whose i<sup>th</sup> entry is the sum of the
192      * i<sup>th</sup> entries of the arrays that have been added using 
193      * {@link #addValue(double[])}
194      * 
195      * @return the array of component sums
196      */
197     public double[] getSum() {
198         return getResults(sumImpl);
199     }
200 
201     /**
202      * Returns an array whose i<sup>th</sup> entry is the sum of squares of the
203      * i<sup>th</sup> entries of the arrays that have been added using 
204      * {@link #addValue(double[])}
205      * 
206      * @return the array of component sums of squares
207      */
208     public double[] getSumSq() {
209         return getResults(sumSqImpl);
210     }
211 
212     /**
213      * Returns an array whose i<sup>th</sup> entry is the sum of logs of the
214      * i<sup>th</sup> entries of the arrays that have been added using 
215      * {@link #addValue(double[])}
216      * 
217      * @return the array of component log sums
218      */
219     public double[] getSumLog() {
220         return getResults(sumLogImpl);
221     }
222 
223     /**
224      * Returns an array whose i<sup>th</sup> entry is the mean of the
225      * i<sup>th</sup> entries of the arrays that have been added using 
226      * {@link #addValue(double[])}
227      * 
228      * @return the array of component means
229      */
230     public double[] getMean() {
231         return getResults(meanImpl);
232     }
233 
234     /**
235      * Returns an array whose i<sup>th</sup> entry is the standard deviation of the
236      * i<sup>th</sup> entries of the arrays that have been added using 
237      * {@link #addValue(double[])}
238      * 
239      * @return the array of component standard deviations
240      */
241     public double[] getStandardDeviation() {
242         double[] stdDev = new double[k];
243         if (getN() < 1) {
244             Arrays.fill(stdDev, Double.NaN);
245         } else if (getN() < 2) {
246             Arrays.fill(stdDev, 0.0);
247         } else {
248             RealMatrix matrix = covarianceImpl.getResult();
249             for (int i = 0; i < k; ++i) {
250                 stdDev[i] = Math.sqrt(matrix.getEntry(i, i));
251             }
252         }
253         return stdDev;
254     }
255 
256     /**
257      * Returns the covariance matrix of the values that have been added.
258      *
259      * @return the covariance matrix 
260      */
261     public RealMatrix getCovariance() {
262         return covarianceImpl.getResult();
263     }
264 
265     /**
266      * Returns an array whose i<sup>th</sup> entry is the maximum of the
267      * i<sup>th</sup> entries of the arrays that have been added using 
268      * {@link #addValue(double[])}
269      * 
270      * @return the array of component maxima
271      */
272     public double[] getMax() {
273         return getResults(maxImpl);
274     }
275 
276     /**
277      * Returns an array whose i<sup>th</sup> entry is the minimum of the
278      * i<sup>th</sup> entries of the arrays that have been added using 
279      * {@link #addValue(double[])}
280      * 
281      * @return the array of component minima
282      */
283     public double[] getMin() {
284         return getResults(minImpl);
285     }
286 
287     /**
288      * Returns an array whose i<sup>th</sup> entry is the geometric mean of the
289      * i<sup>th</sup> entries of the arrays that have been added using 
290      * {@link #addValue(double[])}
291      * 
292      * @return the array of component geometric means
293      */
294     public double[] getGeometricMean() {
295         return getResults(geoMeanImpl);
296     }
297     
298     /**
299      * Generates a text report displaying
300      * summary statistics from values that
301      * have been added.
302      * @return String with line feeds displaying statistics
303      */
304     @Override
305     public String toString() {
306         StringBuffer outBuffer = new StringBuffer();
307         outBuffer.append("MultivariateSummaryStatistics:\n");
308         outBuffer.append("n: " + getN() + "\n");
309         append(outBuffer, getMin(), "min: ", ", ", "\n");
310         append(outBuffer, getMax(), "max: ", ", ", "\n");
311         append(outBuffer, getMean(), "mean: ", ", ", "\n");
312         append(outBuffer, getGeometricMean(), "geometric mean: ", ", ", "\n");
313         append(outBuffer, getSumSq(), "sum of squares: ", ", ", "\n");
314         append(outBuffer, getSumLog(), "sum of logarithms: ", ", ", "\n");
315         append(outBuffer, getStandardDeviation(), "standard deviation: ", ", ", "\n");
316         outBuffer.append("covariance: " + getCovariance().toString() + "\n");
317         return outBuffer.toString();
318     }
319 
320     /**
321      * Append a text representation of an array to a buffer.
322      * @param buffer buffer to fill
323      * @param data data array
324      * @param prefix text prefix
325      * @param separator elements separator
326      * @param suffix text suffix
327      */
328     private void append(StringBuffer buffer, double[] data,
329                         String prefix, String separator, String suffix) {
330         buffer.append(prefix);
331         for (int i = 0; i < data.length; ++i) {
332             if (i > 0) {
333                 buffer.append(separator);
334             }
335             buffer.append(data[i]);
336         }
337         buffer.append(suffix);
338     }
339 
340     /** 
341      * Resets all statistics and storage
342      */
343     public void clear() {
344         this.n = 0;
345         for (int i = 0; i < k; ++i) {
346             minImpl[i].clear();
347             maxImpl[i].clear();
348             sumImpl[i].clear();
349             sumLogImpl[i].clear();
350             sumSqImpl[i].clear();
351             geoMeanImpl[i].clear();
352             meanImpl[i].clear();
353         }
354         covarianceImpl.clear();
355     }
356     
357     /**
358      * Returns true iff <code>object</code> is a <code>SummaryStatistics</code>
359      * instance and all statistics have the same values as this.
360      * @param object the object to test equality against.
361      * @return true if object equals this
362      */
363     @Override
364     public boolean equals(Object object) {
365         if (object == this ) {
366             return true;
367         }
368         if (object instanceof MultivariateSummaryStatistics == false) {
369             return false;
370         }
371         MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object;
372         return (MathUtils.equals(stat.getGeometricMean(), 
373                 this.getGeometricMean()) &&
374                 MathUtils.equals(stat.getMax(), this.getMax()) && 
375                 MathUtils.equals(stat.getMean(),this.getMean()) &&
376                 MathUtils.equals(stat.getMin(),this.getMin()) &&
377                 MathUtils.equals(stat.getN(), this.getN()) &&
378                 MathUtils.equals(stat.getSum(), this.getSum()) &&
379                 MathUtils.equals(stat.getSumSq(),this.getSumSq()) &&
380                 MathUtils.equals(stat.getSumLog(),this.getSumLog()) &&
381                 stat.getCovariance().equals(this.getCovariance()));
382     }
383     
384     /**
385      * Returns hash code based on values of statistics
386      * 
387      * @return hash code
388      */
389     @Override
390     public int hashCode() {
391         int result = 31 + MathUtils.hash(getGeometricMean());
392         result = result * 31 + MathUtils.hash(getGeometricMean());
393         result = result * 31 + MathUtils.hash(getMax());
394         result = result * 31 + MathUtils.hash(getMean());
395         result = result * 31 + MathUtils.hash(getMin());
396         result = result * 31 + MathUtils.hash(getN());
397         result = result * 31 + MathUtils.hash(getSum());
398         result = result * 31 + MathUtils.hash(getSumSq());
399         result = result * 31 + MathUtils.hash(getSumLog());
400         result = result * 31 + getCovariance().hashCode();
401         return result;
402     }
403 
404     // Getters and setters for statistics implementations
405     /**
406      * Sets statistics implementations.
407      * @param newImpl new implementations for statistics
408      * @param oldImpl old implementations for statistics
409      * @throws DimensionMismatchException if the array dimension
410      * does not match the one used at construction
411      * @throws IllegalStateException if data has already been added
412      *  (i.e if n > 0)
413      */
414     private void setImpl(StorelessUnivariateStatistic[] newImpl,
415                          StorelessUnivariateStatistic[] oldImpl)
416        throws DimensionMismatchException, IllegalStateException {
417         checkEmpty();
418         checkDimension(newImpl.length);
419         System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length);
420     }
421 
422     /**
423      * Returns the currently configured Sum implementation
424      * 
425      * @return the StorelessUnivariateStatistic implementing the sum
426      */
427     public StorelessUnivariateStatistic[] getSumImpl() {
428         return sumImpl.clone();
429     }
430 
431     /**
432      * <p>Sets the implementation for the Sum.</p>
433      * <p>This method must be activated before any data has been added - i.e.,
434      * before {@link #addValue(double[]) addValue} has been used to add data; 
435      * otherwise an IllegalStateException will be thrown.</p>
436      * 
437      * @param sumImpl the StorelessUnivariateStatistic instance to use
438      * for computing the Sum
439      * @throws DimensionMismatchException if the array dimension
440      * does not match the one used at construction
441      * @throws IllegalStateException if data has already been added
442      *  (i.e if n > 0)
443      */
444     public void setSumImpl(StorelessUnivariateStatistic[] sumImpl)
445       throws DimensionMismatchException {
446         setImpl(sumImpl, this.sumImpl);
447     }
448 
449     /**
450      * Returns the currently configured sum of squares implementation
451      * 
452      * @return the StorelessUnivariateStatistic implementing the sum of squares
453      */
454     public StorelessUnivariateStatistic[] getSumsqImpl() {
455         return sumSqImpl.clone();
456     }
457 
458     /**
459      * <p>Sets the implementation for the sum of squares.</p>
460      * <p>This method must be activated before any data has been added - i.e.,
461      * before {@link #addValue(double[]) addValue} has been used to add data; 
462      * otherwise an IllegalStateException will be thrown.</p>
463      * 
464      * @param sumsqImpl the StorelessUnivariateStatistic instance to use
465      * for computing the sum of squares
466      * @throws DimensionMismatchException if the array dimension
467      * does not match the one used at construction
468      * @throws IllegalStateException if data has already been added
469      *  (i.e if n > 0)
470      */
471     public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl)
472       throws DimensionMismatchException {
473         setImpl(sumsqImpl, this.sumSqImpl);
474     }
475 
476     /**
477      * Returns the currently configured minimum implementation
478      * 
479      * @return the StorelessUnivariateStatistic implementing the minimum
480      */
481     public StorelessUnivariateStatistic[] getMinImpl() {
482         return minImpl.clone();
483     }
484 
485     /**
486      * <p>Sets the implementation for the minimum.</p>
487      * <p>This method must be activated before any data has been added - i.e.,
488      * before {@link #addValue(double[]) addValue} has been used to add data; 
489      * otherwise an IllegalStateException will be thrown.</p>
490      * 
491      * @param minImpl the StorelessUnivariateStatistic instance to use
492      * for computing the minimum
493      * @throws DimensionMismatchException if the array dimension
494      * does not match the one used at construction
495      * @throws IllegalStateException if data has already been added
496      *  (i.e if n > 0)
497      */
498     public void setMinImpl(StorelessUnivariateStatistic[] minImpl)
499       throws DimensionMismatchException {
500         setImpl(minImpl, this.minImpl);
501     }
502 
503     /**
504      * Returns the currently configured maximum implementation
505      * 
506      * @return the StorelessUnivariateStatistic implementing the maximum
507      */
508     public StorelessUnivariateStatistic[] getMaxImpl() {
509         return maxImpl.clone();
510     }
511 
512     /**
513      * <p>Sets the implementation for the maximum.</p>
514      * <p>This method must be activated before any data has been added - i.e.,
515      * before {@link #addValue(double[]) addValue} has been used to add data; 
516      * otherwise an IllegalStateException will be thrown.</p>
517      * 
518      * @param maxImpl the StorelessUnivariateStatistic instance to use
519      * for computing the maximum
520      * @throws DimensionMismatchException if the array dimension
521      * does not match the one used at construction
522      * @throws IllegalStateException if data has already been added
523      *  (i.e if n > 0)
524      */
525     public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl)
526       throws DimensionMismatchException {
527         setImpl(maxImpl, this.maxImpl);
528     }
529 
530     /**
531      * Returns the currently configured sum of logs implementation
532      * 
533      * @return the StorelessUnivariateStatistic implementing the log sum
534      */
535     public StorelessUnivariateStatistic[] getSumLogImpl() {
536         return sumLogImpl.clone();
537     }
538 
539     /**
540      * <p>Sets the implementation for the sum of logs.</p>
541      * <p>This method must be activated before any data has been added - i.e.,
542      * before {@link #addValue(double[]) addValue} has been used to add data; 
543      * otherwise an IllegalStateException will be thrown.</p>
544      * 
545      * @param sumLogImpl the StorelessUnivariateStatistic instance to use
546      * for computing the log sum
547      * @throws DimensionMismatchException if the array dimension
548      * does not match the one used at construction
549      * @throws IllegalStateException if data has already been added 
550      *  (i.e if n > 0)
551      */
552     public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl)
553       throws DimensionMismatchException {
554         setImpl(sumLogImpl, this.sumLogImpl);
555     }
556 
557     /**
558      * Returns the currently configured geometric mean implementation
559      * 
560      * @return the StorelessUnivariateStatistic implementing the geometric mean
561      */
562     public StorelessUnivariateStatistic[] getGeoMeanImpl() {
563         return geoMeanImpl.clone();
564     }
565 
566     /**
567      * <p>Sets the implementation for the geometric mean.</p>
568      * <p>This method must be activated before any data has been added - i.e.,
569      * before {@link #addValue(double[]) addValue} has been used to add data; 
570      * otherwise an IllegalStateException will be thrown.</p>
571      * 
572      * @param geoMeanImpl the StorelessUnivariateStatistic instance to use
573      * for computing the geometric mean
574      * @throws DimensionMismatchException if the array dimension
575      * does not match the one used at construction
576      * @throws IllegalStateException if data has already been added
577      *  (i.e if n > 0)
578      */
579     public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl)
580       throws DimensionMismatchException {
581         setImpl(geoMeanImpl, this.geoMeanImpl);
582     }
583 
584     /**
585      * Returns the currently configured mean implementation
586      * 
587      * @return the StorelessUnivariateStatistic implementing the mean
588      */
589     public StorelessUnivariateStatistic[] getMeanImpl() {
590         return meanImpl.clone();
591     }
592 
593     /**
594      * <p>Sets the implementation for the mean.</p>
595      * <p>This method must be activated before any data has been added - i.e.,
596      * before {@link #addValue(double[]) addValue} has been used to add data; 
597      * otherwise an IllegalStateException will be thrown.</p>
598      * 
599      * @param meanImpl the StorelessUnivariateStatistic instance to use
600      * for computing the mean
601      * @throws DimensionMismatchException if the array dimension
602      * does not match the one used at construction
603      * @throws IllegalStateException if data has already been added
604      *  (i.e if n > 0)
605      */
606     public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl)
607       throws DimensionMismatchException {
608         setImpl(meanImpl, this.meanImpl);
609     }
610 
611     /**
612      * Throws IllegalStateException if n > 0.
613      */
614     private void checkEmpty() {
615         if (n > 0) {
616             throw MathRuntimeException.createIllegalStateException(
617                     "{0} values have been added before statistic is configured",
618                     n);
619         }
620     }
621 
622     /**
623      * Throws DimensionMismatchException if dimension != k.
624      * @param dimension dimension to check
625      * @throws DimensionMismatchException if dimension != k
626      */
627     private void checkDimension(int dimension)
628       throws DimensionMismatchException {
629         if (dimension != k) {
630             throw new DimensionMismatchException(dimension, k);
631         }
632     }
633 
634 }