1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.math.stat.regression;
19 import java.io.Serializable;
20
21 import org.apache.commons.math.MathException;
22 import org.apache.commons.math.MathRuntimeException;
23 import org.apache.commons.math.distribution.TDistribution;
24 import org.apache.commons.math.distribution.TDistributionImpl;
25
26 /**
27 * Estimates an ordinary least squares regression model
28 * with one independent variable.
29 * <p>
30 * <code> y = intercept + slope * x </code></p>
31 * <p>
32 * Standard errors for <code>intercept</code> and <code>slope</code> are
33 * available as well as ANOVA, r-square and Pearson's r statistics.</p>
34 * <p>
35 * Observations (x,y pairs) can be added to the model one at a time or they
36 * can be provided in a 2-dimensional array. The observations are not stored
37 * in memory, so there is no limit to the number of observations that can be
38 * added to the model.</p>
39 * <p>
40 * <strong>Usage Notes</strong>: <ul>
41 * <li> When there are fewer than two observations in the model, or when
42 * there is no variation in the x values (i.e. all x values are the same)
43 * all statistics return <code>NaN</code>. At least two observations with
44 * different x coordinates are requred to estimate a bivariate regression
45 * model.
46 * </li>
47 * <li> getters for the statistics always compute values based on the current
48 * set of observations -- i.e., you can get statistics, then add more data
49 * and get updated statistics without using a new instance. There is no
50 * "compute" method that updates all statistics. Each of the getters performs
51 * the necessary computations to return the requested statistic.</li>
52 * </ul></p>
53 *
54 * @version $Revision: 773189 $ $Date: 2009-05-09 05:57:04 -0400 (Sat, 09 May 2009) $
55 */
56 public class SimpleRegression implements Serializable {
57
58 /** Serializable version identifier */
59 private static final long serialVersionUID = -3004689053607543335L;
60
61 /** the distribution used to compute inference statistics. */
62 private TDistribution distribution;
63
64 /** sum of x values */
65 private double sumX = 0d;
66
67 /** total variation in x (sum of squared deviations from xbar) */
68 private double sumXX = 0d;
69
70 /** sum of y values */
71 private double sumY = 0d;
72
73 /** total variation in y (sum of squared deviations from ybar) */
74 private double sumYY = 0d;
75
76 /** sum of products */
77 private double sumXY = 0d;
78
79 /** number of observations */
80 private long n = 0;
81
82 /** mean of accumulated x values, used in updating formulas */
83 private double xbar = 0;
84
85 /** mean of accumulated y values, used in updating formulas */
86 private double ybar = 0;
87
88 // ---------------------Public methods--------------------------------------
89
90 /**
91 * Create an empty SimpleRegression instance
92 */
93 public SimpleRegression() {
94 this(new TDistributionImpl(1.0));
95 }
96
97 /**
98 * Create an empty SimpleRegression using the given distribution object to
99 * compute inference statistics.
100 * @param t the distribution used to compute inference statistics.
101 * @since 1.2
102 */
103 public SimpleRegression(TDistribution t) {
104 super();
105 setDistribution(t);
106 }
107
108 /**
109 * Adds the observation (x,y) to the regression data set.
110 * <p>
111 * Uses updating formulas for means and sums of squares defined in
112 * "Algorithms for Computing the Sample Variance: Analysis and
113 * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
114 * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
115 * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
116 *
117 *
118 * @param x independent variable value
119 * @param y dependent variable value
120 */
121 public void addData(double x, double y) {
122 if (n == 0) {
123 xbar = x;
124 ybar = y;
125 } else {
126 double dx = x - xbar;
127 double dy = y - ybar;
128 sumXX += dx * dx * (double) n / (n + 1d);
129 sumYY += dy * dy * (double) n / (n + 1d);
130 sumXY += dx * dy * (double) n / (n + 1d);
131 xbar += dx / (n + 1.0);
132 ybar += dy / (n + 1.0);
133 }
134 sumX += x;
135 sumY += y;
136 n++;
137
138 if (n > 2) {
139 distribution.setDegreesOfFreedom(n - 2);
140 }
141 }
142
143
144 /**
145 * Removes the observation (x,y) from the regression data set.
146 * <p>
147 * Mirrors the addData method. This method permits the use of
148 * SimpleRegression instances in streaming mode where the regression
149 * is applied to a sliding "window" of observations, however the caller is
150 * responsible for maintaining the set of observations in the window.</p>
151 *
152 * The method has no effect if there are no points of data (i.e. n=0)
153 *
154 * @param x independent variable value
155 * @param y dependent variable value
156 */
157 public void removeData(double x, double y) {
158 if (n > 0) {
159 double dx = x - xbar;
160 double dy = y - ybar;
161 sumXX -= dx * dx * (double) n / (n - 1d);
162 sumYY -= dy * dy * (double) n / (n - 1d);
163 sumXY -= dx * dy * (double) n / (n - 1d);
164 xbar -= dx / (n - 1.0);
165 ybar -= dy / (n - 1.0);
166 sumX -= x;
167 sumY -= y;
168 n--;
169
170 if (n > 2) {
171 distribution.setDegreesOfFreedom(n - 2);
172 }
173 }
174 }
175
176 /**
177 * Adds the observations represented by the elements in
178 * <code>data</code>.
179 * <p>
180 * <code>(data[0][0],data[0][1])</code> will be the first observation, then
181 * <code>(data[1][0],data[1][1])</code>, etc.</p>
182 * <p>
183 * This method does not replace data that has already been added. The
184 * observations represented by <code>data</code> are added to the existing
185 * dataset.</p>
186 * <p>
187 * To replace all data, use <code>clear()</code> before adding the new
188 * data.</p>
189 *
190 * @param data array of observations to be added
191 */
192 public void addData(double[][] data) {
193 for (int i = 0; i < data.length; i++) {
194 addData(data[i][0], data[i][1]);
195 }
196 }
197
198
199 /**
200 * Removes observations represented by the elements in <code>data</code>.
201 * <p>
202 * If the array is larger than the current n, only the first n elements are
203 * processed. This method permits the use of SimpleRegression instances in
204 * streaming mode where the regression is applied to a sliding "window" of
205 * observations, however the caller is responsible for maintaining the set
206 * of observations in the window.</p>
207 * <p>
208 * To remove all data, use <code>clear()</code>.</p>
209 *
210 * @param data array of observations to be removed
211 */
212 public void removeData(double[][] data) {
213 for (int i = 0; i < data.length && n > 0; i++) {
214 removeData(data[i][0], data[i][1]);
215 }
216 }
217
218 /**
219 * Clears all data from the model.
220 */
221 public void clear() {
222 sumX = 0d;
223 sumXX = 0d;
224 sumY = 0d;
225 sumYY = 0d;
226 sumXY = 0d;
227 n = 0;
228 }
229
230 /**
231 * Returns the number of observations that have been added to the model.
232 *
233 * @return n number of observations that have been added.
234 */
235 public long getN() {
236 return n;
237 }
238
239 /**
240 * Returns the "predicted" <code>y</code> value associated with the
241 * supplied <code>x</code> value, based on the data that has been
242 * added to the model when this method is activated.
243 * <p>
244 * <code> predict(x) = intercept + slope * x </code></p>
245 * <p>
246 * <strong>Preconditions</strong>: <ul>
247 * <li>At least two observations (with at least two different x values)
248 * must have been added before invoking this method. If this method is
249 * invoked before a model can be estimated, <code>Double,NaN</code> is
250 * returned.
251 * </li></ul></p>
252 *
253 * @param x input <code>x</code> value
254 * @return predicted <code>y</code> value
255 */
256 public double predict(double x) {
257 double b1 = getSlope();
258 return getIntercept(b1) + b1 * x;
259 }
260
261 /**
262 * Returns the intercept of the estimated regression line.
263 * <p>
264 * The least squares estimate of the intercept is computed using the
265 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
266 * The intercept is sometimes denoted b0.</p>
267 * <p>
268 * <strong>Preconditions</strong>: <ul>
269 * <li>At least two observations (with at least two different x values)
270 * must have been added before invoking this method. If this method is
271 * invoked before a model can be estimated, <code>Double,NaN</code> is
272 * returned.
273 * </li></ul></p>
274 *
275 * @return the intercept of the regression line
276 */
277 public double getIntercept() {
278 return getIntercept(getSlope());
279 }
280
281 /**
282 * Returns the slope of the estimated regression line.
283 * <p>
284 * The least squares estimate of the slope is computed using the
285 * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
286 * The slope is sometimes denoted b1.</p>
287 * <p>
288 * <strong>Preconditions</strong>: <ul>
289 * <li>At least two observations (with at least two different x values)
290 * must have been added before invoking this method. If this method is
291 * invoked before a model can be estimated, <code>Double.NaN</code> is
292 * returned.
293 * </li></ul></p>
294 *
295 * @return the slope of the regression line
296 */
297 public double getSlope() {
298 if (n < 2) {
299 return Double.NaN; //not enough data
300 }
301 if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
302 return Double.NaN; //not enough variation in x
303 }
304 return sumXY / sumXX;
305 }
306
307 /**
308 * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
309 * sum of squared errors</a> (SSE) associated with the regression
310 * model.
311 * <p>
312 * The sum is computed using the computational formula</p>
313 * <p>
314 * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
315 * <p>
316 * where <code>SYY</code> is the sum of the squared deviations of the y
317 * values about their mean, <code>SXX</code> is similarly defined and
318 * <code>SXY</code> is the sum of the products of x and y mean deviations.
319 * </p><p>
320 * The sums are accumulated using the updating algorithm referenced in
321 * {@link #addData}.</p>
322 * <p>
323 * The return value is constrained to be non-negative - i.e., if due to
324 * rounding errors the computational formula returns a negative result,
325 * 0 is returned.</p>
326 * <p>
327 * <strong>Preconditions</strong>: <ul>
328 * <li>At least two observations (with at least two different x values)
329 * must have been added before invoking this method. If this method is
330 * invoked before a model can be estimated, <code>Double,NaN</code> is
331 * returned.
332 * </li></ul></p>
333 *
334 * @return sum of squared errors associated with the regression model
335 */
336 public double getSumSquaredErrors() {
337 return Math.max(0d, sumYY - sumXY * sumXY / sumXX);
338 }
339
340 /**
341 * Returns the sum of squared deviations of the y values about their mean.
342 * <p>
343 * This is defined as SSTO
344 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
345 * <p>
346 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
347 *
348 * @return sum of squared deviations of y values
349 */
350 public double getTotalSumSquares() {
351 if (n < 2) {
352 return Double.NaN;
353 }
354 return sumYY;
355 }
356
357 /**
358 * Returns the sum of squared deviations of the x values about their mean.
359 *
360 * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
361 *
362 * @return sum of squared deviations of x values
363 */
364 public double getXSumSquares() {
365 if (n < 2) {
366 return Double.NaN;
367 }
368 return sumXX;
369 }
370
371 /**
372 * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
373 *
374 * @return sum of cross products
375 */
376 public double getSumOfCrossProducts() {
377 return sumXY;
378 }
379
380 /**
381 * Returns the sum of squared deviations of the predicted y values about
382 * their mean (which equals the mean of y).
383 * <p>
384 * This is usually abbreviated SSR or SSM. It is defined as SSM
385 * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
386 * <p>
387 * <strong>Preconditions</strong>: <ul>
388 * <li>At least two observations (with at least two different x values)
389 * must have been added before invoking this method. If this method is
390 * invoked before a model can be estimated, <code>Double.NaN</code> is
391 * returned.
392 * </li></ul></p>
393 *
394 * @return sum of squared deviations of predicted y values
395 */
396 public double getRegressionSumSquares() {
397 return getRegressionSumSquares(getSlope());
398 }
399
400 /**
401 * Returns the sum of squared errors divided by the degrees of freedom,
402 * usually abbreviated MSE.
403 * <p>
404 * If there are fewer than <strong>three</strong> data pairs in the model,
405 * or if there is no variation in <code>x</code>, this returns
406 * <code>Double.NaN</code>.</p>
407 *
408 * @return sum of squared deviations of y values
409 */
410 public double getMeanSquareError() {
411 if (n < 3) {
412 return Double.NaN;
413 }
414 return getSumSquaredErrors() / (n - 2);
415 }
416
417 /**
418 * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
419 * Pearson's product moment correlation coefficient</a>,
420 * usually denoted r.
421 * <p>
422 * <strong>Preconditions</strong>: <ul>
423 * <li>At least two observations (with at least two different x values)
424 * must have been added before invoking this method. If this method is
425 * invoked before a model can be estimated, <code>Double,NaN</code> is
426 * returned.
427 * </li></ul></p>
428 *
429 * @return Pearson's r
430 */
431 public double getR() {
432 double b1 = getSlope();
433 double result = Math.sqrt(getRSquare());
434 if (b1 < 0) {
435 result = -result;
436 }
437 return result;
438 }
439
440 /**
441 * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
442 * coefficient of determination</a>,
443 * usually denoted r-square.
444 * <p>
445 * <strong>Preconditions</strong>: <ul>
446 * <li>At least two observations (with at least two different x values)
447 * must have been added before invoking this method. If this method is
448 * invoked before a model can be estimated, <code>Double,NaN</code> is
449 * returned.
450 * </li></ul></p>
451 *
452 * @return r-square
453 */
454 public double getRSquare() {
455 double ssto = getTotalSumSquares();
456 return (ssto - getSumSquaredErrors()) / ssto;
457 }
458
459 /**
460 * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
461 * standard error of the intercept estimate</a>,
462 * usually denoted s(b0).
463 * <p>
464 * If there are fewer that <strong>three</strong> observations in the
465 * model, or if there is no variation in x, this returns
466 * <code>Double.NaN</code>.</p>
467 *
468 * @return standard error associated with intercept estimate
469 */
470 public double getInterceptStdErr() {
471 return Math.sqrt(
472 getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
473 }
474
475 /**
476 * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
477 * error of the slope estimate</a>,
478 * usually denoted s(b1).
479 * <p>
480 * If there are fewer that <strong>three</strong> data pairs in the model,
481 * or if there is no variation in x, this returns <code>Double.NaN</code>.
482 * </p>
483 *
484 * @return standard error associated with slope estimate
485 */
486 public double getSlopeStdErr() {
487 return Math.sqrt(getMeanSquareError() / sumXX);
488 }
489
490 /**
491 * Returns the half-width of a 95% confidence interval for the slope
492 * estimate.
493 * <p>
494 * The 95% confidence interval is</p>
495 * <p>
496 * <code>(getSlope() - getSlopeConfidenceInterval(),
497 * getSlope() + getSlopeConfidenceInterval())</code></p>
498 * <p>
499 * If there are fewer that <strong>three</strong> observations in the
500 * model, or if there is no variation in x, this returns
501 * <code>Double.NaN</code>.</p>
502 * <p>
503 * <strong>Usage Note</strong>:<br>
504 * The validity of this statistic depends on the assumption that the
505 * observations included in the model are drawn from a
506 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
507 * Bivariate Normal Distribution</a>.</p>
508 *
509 * @return half-width of 95% confidence interval for the slope estimate
510 * @throws MathException if the confidence interval can not be computed.
511 */
512 public double getSlopeConfidenceInterval() throws MathException {
513 return getSlopeConfidenceInterval(0.05d);
514 }
515
516 /**
517 * Returns the half-width of a (100-100*alpha)% confidence interval for
518 * the slope estimate.
519 * <p>
520 * The (100-100*alpha)% confidence interval is </p>
521 * <p>
522 * <code>(getSlope() - getSlopeConfidenceInterval(),
523 * getSlope() + getSlopeConfidenceInterval())</code></p>
524 * <p>
525 * To request, for example, a 99% confidence interval, use
526 * <code>alpha = .01</code></p>
527 * <p>
528 * <strong>Usage Note</strong>:<br>
529 * The validity of this statistic depends on the assumption that the
530 * observations included in the model are drawn from a
531 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
532 * Bivariate Normal Distribution</a>.</p>
533 * <p>
534 * <strong> Preconditions:</strong><ul>
535 * <li>If there are fewer that <strong>three</strong> observations in the
536 * model, or if there is no variation in x, this returns
537 * <code>Double.NaN</code>.
538 * </li>
539 * <li><code>(0 < alpha < 1)</code>; otherwise an
540 * <code>IllegalArgumentException</code> is thrown.
541 * </li></ul></p>
542 *
543 * @param alpha the desired significance level
544 * @return half-width of 95% confidence interval for the slope estimate
545 * @throws MathException if the confidence interval can not be computed.
546 */
547 public double getSlopeConfidenceInterval(double alpha)
548 throws MathException {
549 if (alpha >= 1 || alpha <= 0) {
550 throw MathRuntimeException.createIllegalArgumentException(
551 "out of bounds significance level {0}, must be between {1} and {2}",
552 alpha, 0.0, 1.0);
553 }
554 return getSlopeStdErr() *
555 distribution.inverseCumulativeProbability(1d - alpha / 2d);
556 }
557
558 /**
559 * Returns the significance level of the slope (equiv) correlation.
560 * <p>
561 * Specifically, the returned value is the smallest <code>alpha</code>
562 * such that the slope confidence interval with significance level
563 * equal to <code>alpha</code> does not include <code>0</code>.
564 * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
565 * </p><p>
566 * <strong>Usage Note</strong>:<br>
567 * The validity of this statistic depends on the assumption that the
568 * observations included in the model are drawn from a
569 * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
570 * Bivariate Normal Distribution</a>.</p>
571 * <p>
572 * If there are fewer that <strong>three</strong> observations in the
573 * model, or if there is no variation in x, this returns
574 * <code>Double.NaN</code>.</p>
575 *
576 * @return significance level for slope/correlation
577 * @throws MathException if the significance level can not be computed.
578 */
579 public double getSignificance() throws MathException {
580 return 2d * (1.0 - distribution.cumulativeProbability(
581 Math.abs(getSlope()) / getSlopeStdErr()));
582 }
583
584 // ---------------------Private methods-----------------------------------
585
586 /**
587 * Returns the intercept of the estimated regression line, given the slope.
588 * <p>
589 * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
590 *
591 * @param slope current slope
592 * @return the intercept of the regression line
593 */
594 private double getIntercept(double slope) {
595 return (sumY - slope * sumX) / n;
596 }
597
598 /**
599 * Computes SSR from b1.
600 *
601 * @param slope regression slope estimate
602 * @return sum of squared deviations of predicted y values
603 */
604 private double getRegressionSumSquares(double slope) {
605 return slope * slope * sumXX;
606 }
607
608 /**
609 * Modify the distribution used to compute inference statistics.
610 * @param value the new distribution
611 * @since 1.2
612 */
613 public void setDistribution(TDistribution value) {
614 distribution = value;
615
616 // modify degrees of freedom
617 if (n > 2) {
618 distribution.setDegreesOfFreedom(n - 2);
619 }
620 }
621 }