1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.math.stat.descriptive; 18 19 import java.io.Serializable; 20 import java.util.Arrays; 21 22 import org.apache.commons.math.DimensionMismatchException; 23 import org.apache.commons.math.MathRuntimeException; 24 import org.apache.commons.math.linear.RealMatrix; 25 import org.apache.commons.math.stat.descriptive.moment.GeometricMean; 26 import org.apache.commons.math.stat.descriptive.moment.Mean; 27 import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance; 28 import org.apache.commons.math.stat.descriptive.rank.Max; 29 import org.apache.commons.math.stat.descriptive.rank.Min; 30 import org.apache.commons.math.stat.descriptive.summary.Sum; 31 import org.apache.commons.math.stat.descriptive.summary.SumOfLogs; 32 import org.apache.commons.math.stat.descriptive.summary.SumOfSquares; 33 import org.apache.commons.math.util.MathUtils; 34 35 /** 36 * <p>Computes summary statistics for a stream of n-tuples added using the 37 * {@link #addValue(double[]) addValue} method. The data values are not stored 38 * in memory, so this class can be used to compute statistics for very large 39 * n-tuple streams.</p> 40 * 41 * <p>The {@link StorelessUnivariateStatistic} instances used to maintain 42 * summary state and compute statistics are configurable via setters. 43 * For example, the default implementation for the mean can be overridden by 44 * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual 45 * parameters to these methods must implement the 46 * {@link StorelessUnivariateStatistic} interface and configuration must be 47 * completed before <code>addValue</code> is called. No configuration is 48 * necessary to use the default, commons-math provided implementations.</p> 49 * 50 * <p>To compute statistics for a stream of n-tuples, construct a 51 * MultivariateStatistics instance with dimension n and then use 52 * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code> 53 * methods where Xxx is a statistic return an array of <code>double</code> 54 * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the 55 * value of the given statistic for data range consisting of the i<sup>th</sup> element of 56 * each of the input n-tuples. For example, if <code>addValue</code> is called 57 * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8}, 58 * <code>getSum</code> will return a three-element array with values 59 * {0+3+6, 1+4+7, 2+5+8}</p> 60 * 61 * <p>Note: This class is not thread-safe. Use 62 * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple 63 * threads is required.</p> 64 * 65 * @since 1.2 66 * @version $Revision: 762116 $ $Date: 2009-04-05 12:48:53 -0400 (Sun, 05 Apr 2009) $ 67 */ 68 public class MultivariateSummaryStatistics 69 implements StatisticalMultivariateSummary, Serializable { 70 71 /** Serialization UID */ 72 private static final long serialVersionUID = 2271900808994826718L; 73 74 /** 75 * Construct a MultivariateSummaryStatistics instance 76 * @param k dimension of the data 77 * @param isCovarianceBiasCorrected if true, the unbiased sample 78 * covariance is computed, otherwise the biased population covariance 79 * is computed 80 */ 81 public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) { 82 this.k = k; 83 84 sumImpl = new StorelessUnivariateStatistic[k]; 85 sumSqImpl = new StorelessUnivariateStatistic[k]; 86 minImpl = new StorelessUnivariateStatistic[k]; 87 maxImpl = new StorelessUnivariateStatistic[k]; 88 sumLogImpl = new StorelessUnivariateStatistic[k]; 89 geoMeanImpl = new StorelessUnivariateStatistic[k]; 90 meanImpl = new StorelessUnivariateStatistic[k]; 91 92 for (int i = 0; i < k; ++i) { 93 sumImpl[i] = new Sum(); 94 sumSqImpl[i] = new SumOfSquares(); 95 minImpl[i] = new Min(); 96 maxImpl[i] = new Max(); 97 sumLogImpl[i] = new SumOfLogs(); 98 geoMeanImpl[i] = new GeometricMean(); 99 meanImpl[i] = new Mean(); 100 } 101 102 covarianceImpl = 103 new VectorialCovariance(k, isCovarianceBiasCorrected); 104 105 } 106 107 /** Dimension of the data. */ 108 private int k; 109 110 /** Count of values that have been added */ 111 private long n = 0; 112 113 /** Sum statistic implementation - can be reset by setter. */ 114 private StorelessUnivariateStatistic[] sumImpl; 115 116 /** Sum of squares statistic implementation - can be reset by setter. */ 117 private StorelessUnivariateStatistic[] sumSqImpl; 118 119 /** Minimum statistic implementation - can be reset by setter. */ 120 private StorelessUnivariateStatistic[] minImpl; 121 122 /** Maximum statistic implementation - can be reset by setter. */ 123 private StorelessUnivariateStatistic[] maxImpl; 124 125 /** Sum of log statistic implementation - can be reset by setter. */ 126 private StorelessUnivariateStatistic[] sumLogImpl; 127 128 /** Geometric mean statistic implementation - can be reset by setter. */ 129 private StorelessUnivariateStatistic[] geoMeanImpl; 130 131 /** Mean statistic implementation - can be reset by setter. */ 132 private StorelessUnivariateStatistic[] meanImpl; 133 134 /** Covariance statistic implementation - cannot be reset. */ 135 private VectorialCovariance covarianceImpl; 136 137 /** 138 * Add an n-tuple to the data 139 * 140 * @param value the n-tuple to add 141 * @throws DimensionMismatchException if the length of the array 142 * does not match the one used at construction 143 */ 144 public void addValue(double[] value) 145 throws DimensionMismatchException { 146 checkDimension(value.length); 147 for (int i = 0; i < k; ++i) { 148 double v = value[i]; 149 sumImpl[i].increment(v); 150 sumSqImpl[i].increment(v); 151 minImpl[i].increment(v); 152 maxImpl[i].increment(v); 153 sumLogImpl[i].increment(v); 154 geoMeanImpl[i].increment(v); 155 meanImpl[i].increment(v); 156 } 157 covarianceImpl.increment(value); 158 n++; 159 } 160 161 /** 162 * Returns the dimension of the data 163 * @return The dimension of the data 164 */ 165 public int getDimension() { 166 return k; 167 } 168 169 /** 170 * Returns the number of available values 171 * @return The number of available values 172 */ 173 public long getN() { 174 return n; 175 } 176 177 /** 178 * Returns an array of the results of a statistic. 179 * @param stats univariate statistic array 180 * @return results array 181 */ 182 private double[] getResults(StorelessUnivariateStatistic[] stats) { 183 double[] results = new double[stats.length]; 184 for (int i = 0; i < results.length; ++i) { 185 results[i] = stats[i].getResult(); 186 } 187 return results; 188 } 189 190 /** 191 * Returns an array whose i<sup>th</sup> entry is the sum of the 192 * i<sup>th</sup> entries of the arrays that have been added using 193 * {@link #addValue(double[])} 194 * 195 * @return the array of component sums 196 */ 197 public double[] getSum() { 198 return getResults(sumImpl); 199 } 200 201 /** 202 * Returns an array whose i<sup>th</sup> entry is the sum of squares of the 203 * i<sup>th</sup> entries of the arrays that have been added using 204 * {@link #addValue(double[])} 205 * 206 * @return the array of component sums of squares 207 */ 208 public double[] getSumSq() { 209 return getResults(sumSqImpl); 210 } 211 212 /** 213 * Returns an array whose i<sup>th</sup> entry is the sum of logs of the 214 * i<sup>th</sup> entries of the arrays that have been added using 215 * {@link #addValue(double[])} 216 * 217 * @return the array of component log sums 218 */ 219 public double[] getSumLog() { 220 return getResults(sumLogImpl); 221 } 222 223 /** 224 * Returns an array whose i<sup>th</sup> entry is the mean of the 225 * i<sup>th</sup> entries of the arrays that have been added using 226 * {@link #addValue(double[])} 227 * 228 * @return the array of component means 229 */ 230 public double[] getMean() { 231 return getResults(meanImpl); 232 } 233 234 /** 235 * Returns an array whose i<sup>th</sup> entry is the standard deviation of the 236 * i<sup>th</sup> entries of the arrays that have been added using 237 * {@link #addValue(double[])} 238 * 239 * @return the array of component standard deviations 240 */ 241 public double[] getStandardDeviation() { 242 double[] stdDev = new double[k]; 243 if (getN() < 1) { 244 Arrays.fill(stdDev, Double.NaN); 245 } else if (getN() < 2) { 246 Arrays.fill(stdDev, 0.0); 247 } else { 248 RealMatrix matrix = covarianceImpl.getResult(); 249 for (int i = 0; i < k; ++i) { 250 stdDev[i] = Math.sqrt(matrix.getEntry(i, i)); 251 } 252 } 253 return stdDev; 254 } 255 256 /** 257 * Returns the covariance matrix of the values that have been added. 258 * 259 * @return the covariance matrix 260 */ 261 public RealMatrix getCovariance() { 262 return covarianceImpl.getResult(); 263 } 264 265 /** 266 * Returns an array whose i<sup>th</sup> entry is the maximum of the 267 * i<sup>th</sup> entries of the arrays that have been added using 268 * {@link #addValue(double[])} 269 * 270 * @return the array of component maxima 271 */ 272 public double[] getMax() { 273 return getResults(maxImpl); 274 } 275 276 /** 277 * Returns an array whose i<sup>th</sup> entry is the minimum of the 278 * i<sup>th</sup> entries of the arrays that have been added using 279 * {@link #addValue(double[])} 280 * 281 * @return the array of component minima 282 */ 283 public double[] getMin() { 284 return getResults(minImpl); 285 } 286 287 /** 288 * Returns an array whose i<sup>th</sup> entry is the geometric mean of the 289 * i<sup>th</sup> entries of the arrays that have been added using 290 * {@link #addValue(double[])} 291 * 292 * @return the array of component geometric means 293 */ 294 public double[] getGeometricMean() { 295 return getResults(geoMeanImpl); 296 } 297 298 /** 299 * Generates a text report displaying 300 * summary statistics from values that 301 * have been added. 302 * @return String with line feeds displaying statistics 303 */ 304 @Override 305 public String toString() { 306 StringBuffer outBuffer = new StringBuffer(); 307 outBuffer.append("MultivariateSummaryStatistics:\n"); 308 outBuffer.append("n: " + getN() + "\n"); 309 append(outBuffer, getMin(), "min: ", ", ", "\n"); 310 append(outBuffer, getMax(), "max: ", ", ", "\n"); 311 append(outBuffer, getMean(), "mean: ", ", ", "\n"); 312 append(outBuffer, getGeometricMean(), "geometric mean: ", ", ", "\n"); 313 append(outBuffer, getSumSq(), "sum of squares: ", ", ", "\n"); 314 append(outBuffer, getSumLog(), "sum of logarithms: ", ", ", "\n"); 315 append(outBuffer, getStandardDeviation(), "standard deviation: ", ", ", "\n"); 316 outBuffer.append("covariance: " + getCovariance().toString() + "\n"); 317 return outBuffer.toString(); 318 } 319 320 /** 321 * Append a text representation of an array to a buffer. 322 * @param buffer buffer to fill 323 * @param data data array 324 * @param prefix text prefix 325 * @param separator elements separator 326 * @param suffix text suffix 327 */ 328 private void append(StringBuffer buffer, double[] data, 329 String prefix, String separator, String suffix) { 330 buffer.append(prefix); 331 for (int i = 0; i < data.length; ++i) { 332 if (i > 0) { 333 buffer.append(separator); 334 } 335 buffer.append(data[i]); 336 } 337 buffer.append(suffix); 338 } 339 340 /** 341 * Resets all statistics and storage 342 */ 343 public void clear() { 344 this.n = 0; 345 for (int i = 0; i < k; ++i) { 346 minImpl[i].clear(); 347 maxImpl[i].clear(); 348 sumImpl[i].clear(); 349 sumLogImpl[i].clear(); 350 sumSqImpl[i].clear(); 351 geoMeanImpl[i].clear(); 352 meanImpl[i].clear(); 353 } 354 covarianceImpl.clear(); 355 } 356 357 /** 358 * Returns true iff <code>object</code> is a <code>SummaryStatistics</code> 359 * instance and all statistics have the same values as this. 360 * @param object the object to test equality against. 361 * @return true if object equals this 362 */ 363 @Override 364 public boolean equals(Object object) { 365 if (object == this ) { 366 return true; 367 } 368 if (object instanceof MultivariateSummaryStatistics == false) { 369 return false; 370 } 371 MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object; 372 return (MathUtils.equals(stat.getGeometricMean(), 373 this.getGeometricMean()) && 374 MathUtils.equals(stat.getMax(), this.getMax()) && 375 MathUtils.equals(stat.getMean(),this.getMean()) && 376 MathUtils.equals(stat.getMin(),this.getMin()) && 377 MathUtils.equals(stat.getN(), this.getN()) && 378 MathUtils.equals(stat.getSum(), this.getSum()) && 379 MathUtils.equals(stat.getSumSq(),this.getSumSq()) && 380 MathUtils.equals(stat.getSumLog(),this.getSumLog()) && 381 stat.getCovariance().equals(this.getCovariance())); 382 } 383 384 /** 385 * Returns hash code based on values of statistics 386 * 387 * @return hash code 388 */ 389 @Override 390 public int hashCode() { 391 int result = 31 + MathUtils.hash(getGeometricMean()); 392 result = result * 31 + MathUtils.hash(getGeometricMean()); 393 result = result * 31 + MathUtils.hash(getMax()); 394 result = result * 31 + MathUtils.hash(getMean()); 395 result = result * 31 + MathUtils.hash(getMin()); 396 result = result * 31 + MathUtils.hash(getN()); 397 result = result * 31 + MathUtils.hash(getSum()); 398 result = result * 31 + MathUtils.hash(getSumSq()); 399 result = result * 31 + MathUtils.hash(getSumLog()); 400 result = result * 31 + getCovariance().hashCode(); 401 return result; 402 } 403 404 // Getters and setters for statistics implementations 405 /** 406 * Sets statistics implementations. 407 * @param newImpl new implementations for statistics 408 * @param oldImpl old implementations for statistics 409 * @throws DimensionMismatchException if the array dimension 410 * does not match the one used at construction 411 * @throws IllegalStateException if data has already been added 412 * (i.e if n > 0) 413 */ 414 private void setImpl(StorelessUnivariateStatistic[] newImpl, 415 StorelessUnivariateStatistic[] oldImpl) 416 throws DimensionMismatchException, IllegalStateException { 417 checkEmpty(); 418 checkDimension(newImpl.length); 419 System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length); 420 } 421 422 /** 423 * Returns the currently configured Sum implementation 424 * 425 * @return the StorelessUnivariateStatistic implementing the sum 426 */ 427 public StorelessUnivariateStatistic[] getSumImpl() { 428 return sumImpl.clone(); 429 } 430 431 /** 432 * <p>Sets the implementation for the Sum.</p> 433 * <p>This method must be activated before any data has been added - i.e., 434 * before {@link #addValue(double[]) addValue} has been used to add data; 435 * otherwise an IllegalStateException will be thrown.</p> 436 * 437 * @param sumImpl the StorelessUnivariateStatistic instance to use 438 * for computing the Sum 439 * @throws DimensionMismatchException if the array dimension 440 * does not match the one used at construction 441 * @throws IllegalStateException if data has already been added 442 * (i.e if n > 0) 443 */ 444 public void setSumImpl(StorelessUnivariateStatistic[] sumImpl) 445 throws DimensionMismatchException { 446 setImpl(sumImpl, this.sumImpl); 447 } 448 449 /** 450 * Returns the currently configured sum of squares implementation 451 * 452 * @return the StorelessUnivariateStatistic implementing the sum of squares 453 */ 454 public StorelessUnivariateStatistic[] getSumsqImpl() { 455 return sumSqImpl.clone(); 456 } 457 458 /** 459 * <p>Sets the implementation for the sum of squares.</p> 460 * <p>This method must be activated before any data has been added - i.e., 461 * before {@link #addValue(double[]) addValue} has been used to add data; 462 * otherwise an IllegalStateException will be thrown.</p> 463 * 464 * @param sumsqImpl the StorelessUnivariateStatistic instance to use 465 * for computing the sum of squares 466 * @throws DimensionMismatchException if the array dimension 467 * does not match the one used at construction 468 * @throws IllegalStateException if data has already been added 469 * (i.e if n > 0) 470 */ 471 public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl) 472 throws DimensionMismatchException { 473 setImpl(sumsqImpl, this.sumSqImpl); 474 } 475 476 /** 477 * Returns the currently configured minimum implementation 478 * 479 * @return the StorelessUnivariateStatistic implementing the minimum 480 */ 481 public StorelessUnivariateStatistic[] getMinImpl() { 482 return minImpl.clone(); 483 } 484 485 /** 486 * <p>Sets the implementation for the minimum.</p> 487 * <p>This method must be activated before any data has been added - i.e., 488 * before {@link #addValue(double[]) addValue} has been used to add data; 489 * otherwise an IllegalStateException will be thrown.</p> 490 * 491 * @param minImpl the StorelessUnivariateStatistic instance to use 492 * for computing the minimum 493 * @throws DimensionMismatchException if the array dimension 494 * does not match the one used at construction 495 * @throws IllegalStateException if data has already been added 496 * (i.e if n > 0) 497 */ 498 public void setMinImpl(StorelessUnivariateStatistic[] minImpl) 499 throws DimensionMismatchException { 500 setImpl(minImpl, this.minImpl); 501 } 502 503 /** 504 * Returns the currently configured maximum implementation 505 * 506 * @return the StorelessUnivariateStatistic implementing the maximum 507 */ 508 public StorelessUnivariateStatistic[] getMaxImpl() { 509 return maxImpl.clone(); 510 } 511 512 /** 513 * <p>Sets the implementation for the maximum.</p> 514 * <p>This method must be activated before any data has been added - i.e., 515 * before {@link #addValue(double[]) addValue} has been used to add data; 516 * otherwise an IllegalStateException will be thrown.</p> 517 * 518 * @param maxImpl the StorelessUnivariateStatistic instance to use 519 * for computing the maximum 520 * @throws DimensionMismatchException if the array dimension 521 * does not match the one used at construction 522 * @throws IllegalStateException if data has already been added 523 * (i.e if n > 0) 524 */ 525 public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl) 526 throws DimensionMismatchException { 527 setImpl(maxImpl, this.maxImpl); 528 } 529 530 /** 531 * Returns the currently configured sum of logs implementation 532 * 533 * @return the StorelessUnivariateStatistic implementing the log sum 534 */ 535 public StorelessUnivariateStatistic[] getSumLogImpl() { 536 return sumLogImpl.clone(); 537 } 538 539 /** 540 * <p>Sets the implementation for the sum of logs.</p> 541 * <p>This method must be activated before any data has been added - i.e., 542 * before {@link #addValue(double[]) addValue} has been used to add data; 543 * otherwise an IllegalStateException will be thrown.</p> 544 * 545 * @param sumLogImpl the StorelessUnivariateStatistic instance to use 546 * for computing the log sum 547 * @throws DimensionMismatchException if the array dimension 548 * does not match the one used at construction 549 * @throws IllegalStateException if data has already been added 550 * (i.e if n > 0) 551 */ 552 public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl) 553 throws DimensionMismatchException { 554 setImpl(sumLogImpl, this.sumLogImpl); 555 } 556 557 /** 558 * Returns the currently configured geometric mean implementation 559 * 560 * @return the StorelessUnivariateStatistic implementing the geometric mean 561 */ 562 public StorelessUnivariateStatistic[] getGeoMeanImpl() { 563 return geoMeanImpl.clone(); 564 } 565 566 /** 567 * <p>Sets the implementation for the geometric mean.</p> 568 * <p>This method must be activated before any data has been added - i.e., 569 * before {@link #addValue(double[]) addValue} has been used to add data; 570 * otherwise an IllegalStateException will be thrown.</p> 571 * 572 * @param geoMeanImpl the StorelessUnivariateStatistic instance to use 573 * for computing the geometric mean 574 * @throws DimensionMismatchException if the array dimension 575 * does not match the one used at construction 576 * @throws IllegalStateException if data has already been added 577 * (i.e if n > 0) 578 */ 579 public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl) 580 throws DimensionMismatchException { 581 setImpl(geoMeanImpl, this.geoMeanImpl); 582 } 583 584 /** 585 * Returns the currently configured mean implementation 586 * 587 * @return the StorelessUnivariateStatistic implementing the mean 588 */ 589 public StorelessUnivariateStatistic[] getMeanImpl() { 590 return meanImpl.clone(); 591 } 592 593 /** 594 * <p>Sets the implementation for the mean.</p> 595 * <p>This method must be activated before any data has been added - i.e., 596 * before {@link #addValue(double[]) addValue} has been used to add data; 597 * otherwise an IllegalStateException will be thrown.</p> 598 * 599 * @param meanImpl the StorelessUnivariateStatistic instance to use 600 * for computing the mean 601 * @throws DimensionMismatchException if the array dimension 602 * does not match the one used at construction 603 * @throws IllegalStateException if data has already been added 604 * (i.e if n > 0) 605 */ 606 public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl) 607 throws DimensionMismatchException { 608 setImpl(meanImpl, this.meanImpl); 609 } 610 611 /** 612 * Throws IllegalStateException if n > 0. 613 */ 614 private void checkEmpty() { 615 if (n > 0) { 616 throw MathRuntimeException.createIllegalStateException( 617 "{0} values have been added before statistic is configured", 618 n); 619 } 620 } 621 622 /** 623 * Throws DimensionMismatchException if dimension != k. 624 * @param dimension dimension to check 625 * @throws DimensionMismatchException if dimension != k 626 */ 627 private void checkDimension(int dimension) 628 throws DimensionMismatchException { 629 if (dimension != k) { 630 throw new DimensionMismatchException(dimension, k); 631 } 632 } 633 634 }