001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.descriptive;
018    
019    import java.io.Serializable;
020    import java.util.Arrays;
021    
022    import org.apache.commons.math.DimensionMismatchException;
023    import org.apache.commons.math.MathRuntimeException;
024    import org.apache.commons.math.linear.RealMatrix;
025    import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
026    import org.apache.commons.math.stat.descriptive.moment.Mean;
027    import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance;
028    import org.apache.commons.math.stat.descriptive.rank.Max;
029    import org.apache.commons.math.stat.descriptive.rank.Min;
030    import org.apache.commons.math.stat.descriptive.summary.Sum;
031    import org.apache.commons.math.stat.descriptive.summary.SumOfLogs;
032    import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
033    import org.apache.commons.math.util.MathUtils;
034    
035    /**
036     * <p>Computes summary statistics for a stream of n-tuples added using the 
037     * {@link #addValue(double[]) addValue} method. The data values are not stored
038     * in memory, so this class can be used to compute statistics for very large
039     * n-tuple streams.</p>
040     * 
041     * <p>The {@link StorelessUnivariateStatistic} instances used to maintain
042     * summary state and compute statistics are configurable via setters.
043     * For example, the default implementation for the mean can be overridden by
044     * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual
045     * parameters to these methods must implement the 
046     * {@link StorelessUnivariateStatistic} interface and configuration must be
047     * completed before <code>addValue</code> is called. No configuration is
048     * necessary to use the default, commons-math provided implementations.</p>
049     * 
050     * <p>To compute statistics for a stream of n-tuples, construct a
051     * MultivariateStatistics instance with dimension n and then use 
052     * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code>
053     * methods where Xxx is a statistic return an array of <code>double</code>
054     * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the
055     * value of the given statistic for data range consisting of the i<sup>th</sup> element of
056     * each of the input n-tuples.  For example, if <code>addValue</code> is called
057     * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8},
058     * <code>getSum</code> will return a three-element array with values
059     * {0+3+6, 1+4+7, 2+5+8}</p>
060     * 
061     * <p>Note: This class is not thread-safe. Use 
062     * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple
063     * threads is required.</p>
064     *
065     * @since 1.2
066     * @version $Revision: 762116 $ $Date: 2009-04-05 12:48:53 -0400 (Sun, 05 Apr 2009) $
067     */
068    public class MultivariateSummaryStatistics
069      implements StatisticalMultivariateSummary, Serializable {
070    
071        /** Serialization UID */
072        private static final long serialVersionUID = 2271900808994826718L;
073    
074        /**
075         * Construct a MultivariateSummaryStatistics instance
076         * @param k dimension of the data
077         * @param isCovarianceBiasCorrected if true, the unbiased sample
078         * covariance is computed, otherwise the biased population covariance
079         * is computed
080         */
081        public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) {
082            this.k = k;
083    
084            sumImpl     = new StorelessUnivariateStatistic[k];
085            sumSqImpl   = new StorelessUnivariateStatistic[k];
086            minImpl     = new StorelessUnivariateStatistic[k];
087            maxImpl     = new StorelessUnivariateStatistic[k];
088            sumLogImpl  = new StorelessUnivariateStatistic[k];
089            geoMeanImpl = new StorelessUnivariateStatistic[k];
090            meanImpl    = new StorelessUnivariateStatistic[k];
091    
092            for (int i = 0; i < k; ++i) {
093                sumImpl[i]     = new Sum();
094                sumSqImpl[i]   = new SumOfSquares();
095                minImpl[i]     = new Min();
096                maxImpl[i]     = new Max();
097                sumLogImpl[i]  = new SumOfLogs();
098                geoMeanImpl[i] = new GeometricMean();
099                meanImpl[i]    = new Mean();
100            }
101    
102            covarianceImpl =
103                new VectorialCovariance(k, isCovarianceBiasCorrected);
104    
105        }
106    
107        /** Dimension of the data. */
108        private int k;
109    
110        /** Count of values that have been added */
111        private long n = 0;
112        
113        /** Sum statistic implementation - can be reset by setter. */
114        private StorelessUnivariateStatistic[] sumImpl;
115        
116        /** Sum of squares statistic implementation - can be reset by setter. */
117        private StorelessUnivariateStatistic[] sumSqImpl;
118        
119        /** Minimum statistic implementation - can be reset by setter. */
120        private StorelessUnivariateStatistic[] minImpl;
121        
122        /** Maximum statistic implementation - can be reset by setter. */
123        private StorelessUnivariateStatistic[] maxImpl;
124        
125        /** Sum of log statistic implementation - can be reset by setter. */
126        private StorelessUnivariateStatistic[] sumLogImpl;
127        
128        /** Geometric mean statistic implementation - can be reset by setter. */
129        private StorelessUnivariateStatistic[] geoMeanImpl;
130        
131        /** Mean statistic implementation - can be reset by setter. */
132        private StorelessUnivariateStatistic[] meanImpl;
133        
134        /** Covariance statistic implementation - cannot be reset. */
135        private VectorialCovariance covarianceImpl;
136    
137        /**
138         * Add an n-tuple to the data
139         * 
140         * @param value  the n-tuple to add
141         * @throws DimensionMismatchException if the length of the array
142         * does not match the one used at construction
143         */
144        public void addValue(double[] value)
145          throws DimensionMismatchException {
146            checkDimension(value.length);
147            for (int i = 0; i < k; ++i) {
148                double v = value[i];
149                sumImpl[i].increment(v);
150                sumSqImpl[i].increment(v);
151                minImpl[i].increment(v);
152                maxImpl[i].increment(v);
153                sumLogImpl[i].increment(v);
154                geoMeanImpl[i].increment(v);
155                meanImpl[i].increment(v);
156            }
157            covarianceImpl.increment(value);
158            n++;
159        }
160    
161        /** 
162         * Returns the dimension of the data
163         * @return The dimension of the data
164         */
165        public int getDimension() {
166            return k;
167        }
168    
169        /** 
170         * Returns the number of available values
171         * @return The number of available values
172         */
173        public long getN() {
174            return n;
175        }
176    
177        /**
178         * Returns an array of the results of a statistic.
179         * @param stats univariate statistic array
180         * @return results array
181         */
182        private double[] getResults(StorelessUnivariateStatistic[] stats) {
183            double[] results = new double[stats.length];
184            for (int i = 0; i < results.length; ++i) {
185                results[i] = stats[i].getResult();
186            }
187            return results;
188        }
189    
190        /**
191         * Returns an array whose i<sup>th</sup> entry is the sum of the
192         * i<sup>th</sup> entries of the arrays that have been added using 
193         * {@link #addValue(double[])}
194         * 
195         * @return the array of component sums
196         */
197        public double[] getSum() {
198            return getResults(sumImpl);
199        }
200    
201        /**
202         * Returns an array whose i<sup>th</sup> entry is the sum of squares of the
203         * i<sup>th</sup> entries of the arrays that have been added using 
204         * {@link #addValue(double[])}
205         * 
206         * @return the array of component sums of squares
207         */
208        public double[] getSumSq() {
209            return getResults(sumSqImpl);
210        }
211    
212        /**
213         * Returns an array whose i<sup>th</sup> entry is the sum of logs of the
214         * i<sup>th</sup> entries of the arrays that have been added using 
215         * {@link #addValue(double[])}
216         * 
217         * @return the array of component log sums
218         */
219        public double[] getSumLog() {
220            return getResults(sumLogImpl);
221        }
222    
223        /**
224         * Returns an array whose i<sup>th</sup> entry is the mean of the
225         * i<sup>th</sup> entries of the arrays that have been added using 
226         * {@link #addValue(double[])}
227         * 
228         * @return the array of component means
229         */
230        public double[] getMean() {
231            return getResults(meanImpl);
232        }
233    
234        /**
235         * Returns an array whose i<sup>th</sup> entry is the standard deviation of the
236         * i<sup>th</sup> entries of the arrays that have been added using 
237         * {@link #addValue(double[])}
238         * 
239         * @return the array of component standard deviations
240         */
241        public double[] getStandardDeviation() {
242            double[] stdDev = new double[k];
243            if (getN() < 1) {
244                Arrays.fill(stdDev, Double.NaN);
245            } else if (getN() < 2) {
246                Arrays.fill(stdDev, 0.0);
247            } else {
248                RealMatrix matrix = covarianceImpl.getResult();
249                for (int i = 0; i < k; ++i) {
250                    stdDev[i] = Math.sqrt(matrix.getEntry(i, i));
251                }
252            }
253            return stdDev;
254        }
255    
256        /**
257         * Returns the covariance matrix of the values that have been added.
258         *
259         * @return the covariance matrix 
260         */
261        public RealMatrix getCovariance() {
262            return covarianceImpl.getResult();
263        }
264    
265        /**
266         * Returns an array whose i<sup>th</sup> entry is the maximum of the
267         * i<sup>th</sup> entries of the arrays that have been added using 
268         * {@link #addValue(double[])}
269         * 
270         * @return the array of component maxima
271         */
272        public double[] getMax() {
273            return getResults(maxImpl);
274        }
275    
276        /**
277         * Returns an array whose i<sup>th</sup> entry is the minimum of the
278         * i<sup>th</sup> entries of the arrays that have been added using 
279         * {@link #addValue(double[])}
280         * 
281         * @return the array of component minima
282         */
283        public double[] getMin() {
284            return getResults(minImpl);
285        }
286    
287        /**
288         * Returns an array whose i<sup>th</sup> entry is the geometric mean of the
289         * i<sup>th</sup> entries of the arrays that have been added using 
290         * {@link #addValue(double[])}
291         * 
292         * @return the array of component geometric means
293         */
294        public double[] getGeometricMean() {
295            return getResults(geoMeanImpl);
296        }
297        
298        /**
299         * Generates a text report displaying
300         * summary statistics from values that
301         * have been added.
302         * @return String with line feeds displaying statistics
303         */
304        @Override
305        public String toString() {
306            StringBuffer outBuffer = new StringBuffer();
307            outBuffer.append("MultivariateSummaryStatistics:\n");
308            outBuffer.append("n: " + getN() + "\n");
309            append(outBuffer, getMin(), "min: ", ", ", "\n");
310            append(outBuffer, getMax(), "max: ", ", ", "\n");
311            append(outBuffer, getMean(), "mean: ", ", ", "\n");
312            append(outBuffer, getGeometricMean(), "geometric mean: ", ", ", "\n");
313            append(outBuffer, getSumSq(), "sum of squares: ", ", ", "\n");
314            append(outBuffer, getSumLog(), "sum of logarithms: ", ", ", "\n");
315            append(outBuffer, getStandardDeviation(), "standard deviation: ", ", ", "\n");
316            outBuffer.append("covariance: " + getCovariance().toString() + "\n");
317            return outBuffer.toString();
318        }
319    
320        /**
321         * Append a text representation of an array to a buffer.
322         * @param buffer buffer to fill
323         * @param data data array
324         * @param prefix text prefix
325         * @param separator elements separator
326         * @param suffix text suffix
327         */
328        private void append(StringBuffer buffer, double[] data,
329                            String prefix, String separator, String suffix) {
330            buffer.append(prefix);
331            for (int i = 0; i < data.length; ++i) {
332                if (i > 0) {
333                    buffer.append(separator);
334                }
335                buffer.append(data[i]);
336            }
337            buffer.append(suffix);
338        }
339    
340        /** 
341         * Resets all statistics and storage
342         */
343        public void clear() {
344            this.n = 0;
345            for (int i = 0; i < k; ++i) {
346                minImpl[i].clear();
347                maxImpl[i].clear();
348                sumImpl[i].clear();
349                sumLogImpl[i].clear();
350                sumSqImpl[i].clear();
351                geoMeanImpl[i].clear();
352                meanImpl[i].clear();
353            }
354            covarianceImpl.clear();
355        }
356        
357        /**
358         * Returns true iff <code>object</code> is a <code>SummaryStatistics</code>
359         * instance and all statistics have the same values as this.
360         * @param object the object to test equality against.
361         * @return true if object equals this
362         */
363        @Override
364        public boolean equals(Object object) {
365            if (object == this ) {
366                return true;
367            }
368            if (object instanceof MultivariateSummaryStatistics == false) {
369                return false;
370            }
371            MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object;
372            return (MathUtils.equals(stat.getGeometricMean(), 
373                    this.getGeometricMean()) &&
374                    MathUtils.equals(stat.getMax(), this.getMax()) && 
375                    MathUtils.equals(stat.getMean(),this.getMean()) &&
376                    MathUtils.equals(stat.getMin(),this.getMin()) &&
377                    MathUtils.equals(stat.getN(), this.getN()) &&
378                    MathUtils.equals(stat.getSum(), this.getSum()) &&
379                    MathUtils.equals(stat.getSumSq(),this.getSumSq()) &&
380                    MathUtils.equals(stat.getSumLog(),this.getSumLog()) &&
381                    stat.getCovariance().equals(this.getCovariance()));
382        }
383        
384        /**
385         * Returns hash code based on values of statistics
386         * 
387         * @return hash code
388         */
389        @Override
390        public int hashCode() {
391            int result = 31 + MathUtils.hash(getGeometricMean());
392            result = result * 31 + MathUtils.hash(getGeometricMean());
393            result = result * 31 + MathUtils.hash(getMax());
394            result = result * 31 + MathUtils.hash(getMean());
395            result = result * 31 + MathUtils.hash(getMin());
396            result = result * 31 + MathUtils.hash(getN());
397            result = result * 31 + MathUtils.hash(getSum());
398            result = result * 31 + MathUtils.hash(getSumSq());
399            result = result * 31 + MathUtils.hash(getSumLog());
400            result = result * 31 + getCovariance().hashCode();
401            return result;
402        }
403    
404        // Getters and setters for statistics implementations
405        /**
406         * Sets statistics implementations.
407         * @param newImpl new implementations for statistics
408         * @param oldImpl old implementations for statistics
409         * @throws DimensionMismatchException if the array dimension
410         * does not match the one used at construction
411         * @throws IllegalStateException if data has already been added
412         *  (i.e if n > 0)
413         */
414        private void setImpl(StorelessUnivariateStatistic[] newImpl,
415                             StorelessUnivariateStatistic[] oldImpl)
416           throws DimensionMismatchException, IllegalStateException {
417            checkEmpty();
418            checkDimension(newImpl.length);
419            System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length);
420        }
421    
422        /**
423         * Returns the currently configured Sum implementation
424         * 
425         * @return the StorelessUnivariateStatistic implementing the sum
426         */
427        public StorelessUnivariateStatistic[] getSumImpl() {
428            return sumImpl.clone();
429        }
430    
431        /**
432         * <p>Sets the implementation for the Sum.</p>
433         * <p>This method must be activated before any data has been added - i.e.,
434         * before {@link #addValue(double[]) addValue} has been used to add data; 
435         * otherwise an IllegalStateException will be thrown.</p>
436         * 
437         * @param sumImpl the StorelessUnivariateStatistic instance to use
438         * for computing the Sum
439         * @throws DimensionMismatchException if the array dimension
440         * does not match the one used at construction
441         * @throws IllegalStateException if data has already been added
442         *  (i.e if n > 0)
443         */
444        public void setSumImpl(StorelessUnivariateStatistic[] sumImpl)
445          throws DimensionMismatchException {
446            setImpl(sumImpl, this.sumImpl);
447        }
448    
449        /**
450         * Returns the currently configured sum of squares implementation
451         * 
452         * @return the StorelessUnivariateStatistic implementing the sum of squares
453         */
454        public StorelessUnivariateStatistic[] getSumsqImpl() {
455            return sumSqImpl.clone();
456        }
457    
458        /**
459         * <p>Sets the implementation for the sum of squares.</p>
460         * <p>This method must be activated before any data has been added - i.e.,
461         * before {@link #addValue(double[]) addValue} has been used to add data; 
462         * otherwise an IllegalStateException will be thrown.</p>
463         * 
464         * @param sumsqImpl the StorelessUnivariateStatistic instance to use
465         * for computing the sum of squares
466         * @throws DimensionMismatchException if the array dimension
467         * does not match the one used at construction
468         * @throws IllegalStateException if data has already been added
469         *  (i.e if n > 0)
470         */
471        public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl)
472          throws DimensionMismatchException {
473            setImpl(sumsqImpl, this.sumSqImpl);
474        }
475    
476        /**
477         * Returns the currently configured minimum implementation
478         * 
479         * @return the StorelessUnivariateStatistic implementing the minimum
480         */
481        public StorelessUnivariateStatistic[] getMinImpl() {
482            return minImpl.clone();
483        }
484    
485        /**
486         * <p>Sets the implementation for the minimum.</p>
487         * <p>This method must be activated before any data has been added - i.e.,
488         * before {@link #addValue(double[]) addValue} has been used to add data; 
489         * otherwise an IllegalStateException will be thrown.</p>
490         * 
491         * @param minImpl the StorelessUnivariateStatistic instance to use
492         * for computing the minimum
493         * @throws DimensionMismatchException if the array dimension
494         * does not match the one used at construction
495         * @throws IllegalStateException if data has already been added
496         *  (i.e if n > 0)
497         */
498        public void setMinImpl(StorelessUnivariateStatistic[] minImpl)
499          throws DimensionMismatchException {
500            setImpl(minImpl, this.minImpl);
501        }
502    
503        /**
504         * Returns the currently configured maximum implementation
505         * 
506         * @return the StorelessUnivariateStatistic implementing the maximum
507         */
508        public StorelessUnivariateStatistic[] getMaxImpl() {
509            return maxImpl.clone();
510        }
511    
512        /**
513         * <p>Sets the implementation for the maximum.</p>
514         * <p>This method must be activated before any data has been added - i.e.,
515         * before {@link #addValue(double[]) addValue} has been used to add data; 
516         * otherwise an IllegalStateException will be thrown.</p>
517         * 
518         * @param maxImpl the StorelessUnivariateStatistic instance to use
519         * for computing the maximum
520         * @throws DimensionMismatchException if the array dimension
521         * does not match the one used at construction
522         * @throws IllegalStateException if data has already been added
523         *  (i.e if n > 0)
524         */
525        public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl)
526          throws DimensionMismatchException {
527            setImpl(maxImpl, this.maxImpl);
528        }
529    
530        /**
531         * Returns the currently configured sum of logs implementation
532         * 
533         * @return the StorelessUnivariateStatistic implementing the log sum
534         */
535        public StorelessUnivariateStatistic[] getSumLogImpl() {
536            return sumLogImpl.clone();
537        }
538    
539        /**
540         * <p>Sets the implementation for the sum of logs.</p>
541         * <p>This method must be activated before any data has been added - i.e.,
542         * before {@link #addValue(double[]) addValue} has been used to add data; 
543         * otherwise an IllegalStateException will be thrown.</p>
544         * 
545         * @param sumLogImpl the StorelessUnivariateStatistic instance to use
546         * for computing the log sum
547         * @throws DimensionMismatchException if the array dimension
548         * does not match the one used at construction
549         * @throws IllegalStateException if data has already been added 
550         *  (i.e if n > 0)
551         */
552        public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl)
553          throws DimensionMismatchException {
554            setImpl(sumLogImpl, this.sumLogImpl);
555        }
556    
557        /**
558         * Returns the currently configured geometric mean implementation
559         * 
560         * @return the StorelessUnivariateStatistic implementing the geometric mean
561         */
562        public StorelessUnivariateStatistic[] getGeoMeanImpl() {
563            return geoMeanImpl.clone();
564        }
565    
566        /**
567         * <p>Sets the implementation for the geometric mean.</p>
568         * <p>This method must be activated before any data has been added - i.e.,
569         * before {@link #addValue(double[]) addValue} has been used to add data; 
570         * otherwise an IllegalStateException will be thrown.</p>
571         * 
572         * @param geoMeanImpl the StorelessUnivariateStatistic instance to use
573         * for computing the geometric mean
574         * @throws DimensionMismatchException if the array dimension
575         * does not match the one used at construction
576         * @throws IllegalStateException if data has already been added
577         *  (i.e if n > 0)
578         */
579        public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl)
580          throws DimensionMismatchException {
581            setImpl(geoMeanImpl, this.geoMeanImpl);
582        }
583    
584        /**
585         * Returns the currently configured mean implementation
586         * 
587         * @return the StorelessUnivariateStatistic implementing the mean
588         */
589        public StorelessUnivariateStatistic[] getMeanImpl() {
590            return meanImpl.clone();
591        }
592    
593        /**
594         * <p>Sets the implementation for the mean.</p>
595         * <p>This method must be activated before any data has been added - i.e.,
596         * before {@link #addValue(double[]) addValue} has been used to add data; 
597         * otherwise an IllegalStateException will be thrown.</p>
598         * 
599         * @param meanImpl the StorelessUnivariateStatistic instance to use
600         * for computing the mean
601         * @throws DimensionMismatchException if the array dimension
602         * does not match the one used at construction
603         * @throws IllegalStateException if data has already been added
604         *  (i.e if n > 0)
605         */
606        public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl)
607          throws DimensionMismatchException {
608            setImpl(meanImpl, this.meanImpl);
609        }
610    
611        /**
612         * Throws IllegalStateException if n > 0.
613         */
614        private void checkEmpty() {
615            if (n > 0) {
616                throw MathRuntimeException.createIllegalStateException(
617                        "{0} values have been added before statistic is configured",
618                        n);
619            }
620        }
621    
622        /**
623         * Throws DimensionMismatchException if dimension != k.
624         * @param dimension dimension to check
625         * @throws DimensionMismatchException if dimension != k
626         */
627        private void checkDimension(int dimension)
628          throws DimensionMismatchException {
629            if (dimension != k) {
630                throw new DimensionMismatchException(dimension, k);
631            }
632        }
633    
634    }