001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math.stat.regression;
019    import java.io.Serializable;
020    
021    import org.apache.commons.math.MathException;
022    import org.apache.commons.math.MathRuntimeException;
023    import org.apache.commons.math.distribution.TDistribution;
024    import org.apache.commons.math.distribution.TDistributionImpl;
025    
026    /**
027     * Estimates an ordinary least squares regression model
028     * with one independent variable.
029     * <p>
030     * <code> y = intercept + slope * x  </code></p>
031     * <p>
032     * Standard errors for <code>intercept</code> and <code>slope</code> are 
033     * available as well as ANOVA, r-square and Pearson's r statistics.</p>
034     * <p>
035     * Observations (x,y pairs) can be added to the model one at a time or they 
036     * can be provided in a 2-dimensional array.  The observations are not stored
037     * in memory, so there is no limit to the number of observations that can be
038     * added to the model.</p> 
039     * <p>
040     * <strong>Usage Notes</strong>: <ul>
041     * <li> When there are fewer than two observations in the model, or when
042     * there is no variation in the x values (i.e. all x values are the same) 
043     * all statistics return <code>NaN</code>. At least two observations with
044     * different x coordinates are requred to estimate a bivariate regression 
045     * model.
046     * </li>
047     * <li> getters for the statistics always compute values based on the current
048     * set of observations -- i.e., you can get statistics, then add more data
049     * and get updated statistics without using a new instance.  There is no 
050     * "compute" method that updates all statistics.  Each of the getters performs
051     * the necessary computations to return the requested statistic.</li>
052     * </ul></p>
053     *
054     * @version $Revision: 773189 $ $Date: 2009-05-09 05:57:04 -0400 (Sat, 09 May 2009) $
055     */
056    public class SimpleRegression implements Serializable {
057    
058        /** Serializable version identifier */
059        private static final long serialVersionUID = -3004689053607543335L;
060    
061        /** the distribution used to compute inference statistics. */
062        private TDistribution distribution;
063        
064        /** sum of x values */
065        private double sumX = 0d;
066    
067        /** total variation in x (sum of squared deviations from xbar) */
068        private double sumXX = 0d;
069    
070        /** sum of y values */
071        private double sumY = 0d;
072    
073        /** total variation in y (sum of squared deviations from ybar) */
074        private double sumYY = 0d;
075    
076        /** sum of products */
077        private double sumXY = 0d;
078    
079        /** number of observations */
080        private long n = 0;
081    
082        /** mean of accumulated x values, used in updating formulas */
083        private double xbar = 0;
084    
085        /** mean of accumulated y values, used in updating formulas */
086        private double ybar = 0;
087    
088        // ---------------------Public methods--------------------------------------
089    
090        /**
091         * Create an empty SimpleRegression instance
092         */
093        public SimpleRegression() {
094            this(new TDistributionImpl(1.0));
095        }
096        
097        /**
098         * Create an empty SimpleRegression using the given distribution object to
099         * compute inference statistics.
100         * @param t the distribution used to compute inference statistics.
101         * @since 1.2
102         */
103        public SimpleRegression(TDistribution t) {
104            super();
105            setDistribution(t);
106        }
107        
108        /**
109         * Adds the observation (x,y) to the regression data set.
110         * <p>
111         * Uses updating formulas for means and sums of squares defined in 
112         * "Algorithms for Computing the Sample Variance: Analysis and
113         * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J. 
114         * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
115         * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
116         *
117         *
118         * @param x independent variable value
119         * @param y dependent variable value
120         */
121        public void addData(double x, double y) {
122            if (n == 0) {
123                xbar = x;
124                ybar = y;
125            } else {
126                double dx = x - xbar;
127                double dy = y - ybar;
128                sumXX += dx * dx * (double) n / (n + 1d);
129                sumYY += dy * dy * (double) n / (n + 1d);
130                sumXY += dx * dy * (double) n / (n + 1d);
131                xbar += dx / (n + 1.0);
132                ybar += dy / (n + 1.0);
133            }
134            sumX += x;
135            sumY += y;
136            n++;
137            
138            if (n > 2) {
139                distribution.setDegreesOfFreedom(n - 2);
140            }
141        }
142    
143        
144        /**
145         * Removes the observation (x,y) from the regression data set.
146         * <p>
147         * Mirrors the addData method.  This method permits the use of 
148         * SimpleRegression instances in streaming mode where the regression 
149         * is applied to a sliding "window" of observations, however the caller is 
150         * responsible for maintaining the set of observations in the window.</p>
151         * 
152         * The method has no effect if there are no points of data (i.e. n=0)
153         *
154         * @param x independent variable value
155         * @param y dependent variable value
156         */
157        public void removeData(double x, double y) {
158            if (n > 0) {
159                double dx = x - xbar;
160                double dy = y - ybar;
161                sumXX -= dx * dx * (double) n / (n - 1d);
162                sumYY -= dy * dy * (double) n / (n - 1d);
163                sumXY -= dx * dy * (double) n / (n - 1d);
164                xbar -= dx / (n - 1.0);
165                ybar -= dy / (n - 1.0);
166                sumX -= x;
167                sumY -= y;
168                n--;
169                
170                if (n > 2) {
171                    distribution.setDegreesOfFreedom(n - 2);
172                } 
173            }
174        }
175    
176        /**
177         * Adds the observations represented by the elements in 
178         * <code>data</code>.
179         * <p>
180         * <code>(data[0][0],data[0][1])</code> will be the first observation, then
181         * <code>(data[1][0],data[1][1])</code>, etc.</p>
182         * <p> 
183         * This method does not replace data that has already been added.  The
184         * observations represented by <code>data</code> are added to the existing
185         * dataset.</p>
186         * <p> 
187         * To replace all data, use <code>clear()</code> before adding the new 
188         * data.</p>
189         * 
190         * @param data array of observations to be added
191         */
192        public void addData(double[][] data) {
193            for (int i = 0; i < data.length; i++) {
194                addData(data[i][0], data[i][1]);
195            }
196        }
197    
198    
199        /**
200         * Removes observations represented by the elements in <code>data</code>.
201          * <p> 
202         * If the array is larger than the current n, only the first n elements are 
203         * processed.  This method permits the use of SimpleRegression instances in 
204         * streaming mode where the regression is applied to a sliding "window" of 
205         * observations, however the caller is responsible for maintaining the set 
206         * of observations in the window.</p>
207         * <p> 
208         * To remove all data, use <code>clear()</code>.</p>
209         * 
210         * @param data array of observations to be removed
211         */
212        public void removeData(double[][] data) {
213            for (int i = 0; i < data.length && n > 0; i++) {
214                removeData(data[i][0], data[i][1]);
215            }
216        }
217    
218        /**
219         * Clears all data from the model.
220         */
221        public void clear() {
222            sumX = 0d;
223            sumXX = 0d;
224            sumY = 0d;
225            sumYY = 0d;
226            sumXY = 0d;
227            n = 0;
228        }
229    
230        /**
231         * Returns the number of observations that have been added to the model.
232         *
233         * @return n number of observations that have been added.
234         */
235        public long getN() {
236            return n;
237        }
238    
239        /**
240         * Returns the "predicted" <code>y</code> value associated with the 
241         * supplied <code>x</code> value,  based on the data that has been
242         * added to the model when this method is activated.
243         * <p>
244         * <code> predict(x) = intercept + slope * x </code></p>
245         * <p>
246         * <strong>Preconditions</strong>: <ul>
247         * <li>At least two observations (with at least two different x values)
248         * must have been added before invoking this method. If this method is 
249         * invoked before a model can be estimated, <code>Double,NaN</code> is
250         * returned.
251         * </li></ul></p>
252         *
253         * @param x input <code>x</code> value
254         * @return predicted <code>y</code> value
255         */
256        public double predict(double x) {
257            double b1 = getSlope();
258            return getIntercept(b1) + b1 * x;
259        }
260    
261        /**
262         * Returns the intercept of the estimated regression line.
263         * <p>
264         * The least squares estimate of the intercept is computed using the 
265         * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
266         * The intercept is sometimes denoted b0.</p>
267         * <p>
268         * <strong>Preconditions</strong>: <ul>
269         * <li>At least two observations (with at least two different x values)
270         * must have been added before invoking this method. If this method is 
271         * invoked before a model can be estimated, <code>Double,NaN</code> is
272         * returned.
273         * </li></ul></p>
274         *
275         * @return the intercept of the regression line
276         */
277        public double getIntercept() {
278            return getIntercept(getSlope());
279        }
280    
281        /**
282        * Returns the slope of the estimated regression line.  
283        * <p>
284        * The least squares estimate of the slope is computed using the 
285        * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
286        * The slope is sometimes denoted b1.</p>
287        * <p>
288        * <strong>Preconditions</strong>: <ul>
289        * <li>At least two observations (with at least two different x values)
290        * must have been added before invoking this method. If this method is 
291        * invoked before a model can be estimated, <code>Double.NaN</code> is
292        * returned.
293        * </li></ul></p>
294        *
295        * @return the slope of the regression line
296        */
297        public double getSlope() {
298            if (n < 2) {
299                return Double.NaN; //not enough data 
300            }
301            if (Math.abs(sumXX) < 10 * Double.MIN_VALUE) {
302                return Double.NaN; //not enough variation in x
303            }
304            return sumXY / sumXX;
305        }
306    
307        /**
308         * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
309         * sum of squared errors</a> (SSE) associated with the regression 
310         * model.
311         * <p>
312         * The sum is computed using the computational formula</p>
313         * <p>
314         * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
315         * <p>
316         * where <code>SYY</code> is the sum of the squared deviations of the y
317         * values about their mean, <code>SXX</code> is similarly defined and
318         * <code>SXY</code> is the sum of the products of x and y mean deviations.
319         * </p><p>
320         * The sums are accumulated using the updating algorithm referenced in 
321         * {@link #addData}.</p>
322         * <p>
323         * The return value is constrained to be non-negative - i.e., if due to 
324         * rounding errors the computational formula returns a negative result, 
325         * 0 is returned.</p>
326         * <p>
327         * <strong>Preconditions</strong>: <ul>
328         * <li>At least two observations (with at least two different x values)
329         * must have been added before invoking this method. If this method is 
330         * invoked before a model can be estimated, <code>Double,NaN</code> is
331         * returned.
332         * </li></ul></p>
333         *
334         * @return sum of squared errors associated with the regression model
335         */
336        public double getSumSquaredErrors() {
337            return Math.max(0d, sumYY - sumXY * sumXY / sumXX);
338        }
339    
340        /**
341         * Returns the sum of squared deviations of the y values about their mean.
342         * <p>
343         * This is defined as SSTO 
344         * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
345         * <p>
346         * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
347         *
348         * @return sum of squared deviations of y values
349         */
350        public double getTotalSumSquares() {
351            if (n < 2) {
352                return Double.NaN;
353            }
354            return sumYY;
355        }
356        
357        /**
358         * Returns the sum of squared deviations of the x values about their mean.
359         *
360         * If <code>n < 2</code>, this returns <code>Double.NaN</code>.</p>
361         *
362         * @return sum of squared deviations of x values
363         */
364        public double getXSumSquares() {
365            if (n < 2) {
366                return Double.NaN;
367            }
368            return sumXX;
369        }
370        
371        /**
372         * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
373         *
374         * @return sum of cross products
375         */
376        public double getSumOfCrossProducts() {
377            return sumXY;
378        }
379    
380        /**
381         * Returns the sum of squared deviations of the predicted y values about 
382         * their mean (which equals the mean of y).
383         * <p>
384         * This is usually abbreviated SSR or SSM.  It is defined as SSM 
385         * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
386         * <p>
387         * <strong>Preconditions</strong>: <ul>
388         * <li>At least two observations (with at least two different x values)
389         * must have been added before invoking this method. If this method is 
390         * invoked before a model can be estimated, <code>Double.NaN</code> is
391         * returned.
392         * </li></ul></p>
393         *
394         * @return sum of squared deviations of predicted y values
395         */
396        public double getRegressionSumSquares() {
397            return getRegressionSumSquares(getSlope());
398        }
399    
400        /**
401         * Returns the sum of squared errors divided by the degrees of freedom,
402         * usually abbreviated MSE. 
403         * <p>
404         * If there are fewer than <strong>three</strong> data pairs in the model,
405         * or if there is no variation in <code>x</code>, this returns 
406         * <code>Double.NaN</code>.</p>
407         *
408         * @return sum of squared deviations of y values
409         */
410        public double getMeanSquareError() {
411            if (n < 3) {
412                return Double.NaN;
413            }
414            return getSumSquaredErrors() / (n - 2);
415        }
416    
417        /**
418         * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
419         * Pearson's product moment correlation coefficient</a>,
420         * usually denoted r. 
421         * <p>
422         * <strong>Preconditions</strong>: <ul>
423         * <li>At least two observations (with at least two different x values)
424         * must have been added before invoking this method. If this method is 
425         * invoked before a model can be estimated, <code>Double,NaN</code> is
426         * returned.
427         * </li></ul></p>
428         *
429         * @return Pearson's r
430         */
431        public double getR() {
432            double b1 = getSlope();
433            double result = Math.sqrt(getRSquare());
434            if (b1 < 0) {
435                result = -result;
436            }
437            return result;
438        }
439    
440        /** 
441         * Returns the <a href="http://www.xycoon.com/coefficient1.htm"> 
442         * coefficient of determination</a>,
443         * usually denoted r-square. 
444         * <p>
445         * <strong>Preconditions</strong>: <ul>
446         * <li>At least two observations (with at least two different x values)
447         * must have been added before invoking this method. If this method is 
448         * invoked before a model can be estimated, <code>Double,NaN</code> is
449         * returned.
450         * </li></ul></p>
451         *
452         * @return r-square
453         */
454        public double getRSquare() {
455            double ssto = getTotalSumSquares();
456            return (ssto - getSumSquaredErrors()) / ssto;
457        }
458    
459        /**
460         * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
461         * standard error of the intercept estimate</a>, 
462         * usually denoted s(b0). 
463         * <p>
464         * If there are fewer that <strong>three</strong> observations in the 
465         * model, or if there is no variation in x, this returns 
466         * <code>Double.NaN</code>.</p>
467         *
468         * @return standard error associated with intercept estimate
469         */
470        public double getInterceptStdErr() {
471            return Math.sqrt(
472                getMeanSquareError() * ((1d / (double) n) + (xbar * xbar) / sumXX));
473        }
474    
475        /**
476         * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
477         * error of the slope estimate</a>,
478         * usually denoted s(b1). 
479         * <p>
480         * If there are fewer that <strong>three</strong> data pairs in the model,
481         * or if there is no variation in x, this returns <code>Double.NaN</code>.
482         * </p>
483         * 
484         * @return standard error associated with slope estimate
485         */
486        public double getSlopeStdErr() {
487            return Math.sqrt(getMeanSquareError() / sumXX);
488        }
489    
490        /**
491         * Returns the half-width of a 95% confidence interval for the slope
492         * estimate.
493         * <p>
494         * The 95% confidence interval is</p>
495         * <p>
496         * <code>(getSlope() - getSlopeConfidenceInterval(), 
497         * getSlope() + getSlopeConfidenceInterval())</code></p>
498         * <p>
499         * If there are fewer that <strong>three</strong> observations in the 
500         * model, or if there is no variation in x, this returns 
501         * <code>Double.NaN</code>.</p>
502         * <p>
503         * <strong>Usage Note</strong>:<br>
504         * The validity of this statistic depends on the assumption that the 
505         * observations included in the model are drawn from a
506         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
507         * Bivariate Normal Distribution</a>.</p>
508         *
509         * @return half-width of 95% confidence interval for the slope estimate
510         * @throws MathException if the confidence interval can not be computed.
511         */
512        public double getSlopeConfidenceInterval() throws MathException {
513            return getSlopeConfidenceInterval(0.05d);
514        }
515    
516        /**
517         * Returns the half-width of a (100-100*alpha)% confidence interval for 
518         * the slope estimate.
519         * <p>
520         * The (100-100*alpha)% confidence interval is </p>
521         * <p>
522         * <code>(getSlope() - getSlopeConfidenceInterval(), 
523         * getSlope() + getSlopeConfidenceInterval())</code></p>
524         * <p>
525         * To request, for example, a 99% confidence interval, use 
526         * <code>alpha = .01</code></p>
527         * <p>
528         * <strong>Usage Note</strong>:<br>
529         * The validity of this statistic depends on the assumption that the 
530         * observations included in the model are drawn from a
531         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
532         * Bivariate Normal Distribution</a>.</p>
533         * <p>
534         * <strong> Preconditions:</strong><ul>
535         * <li>If there are fewer that <strong>three</strong> observations in the 
536         * model, or if there is no variation in x, this returns 
537         * <code>Double.NaN</code>.
538         * </li>
539         * <li><code>(0 < alpha < 1)</code>; otherwise an 
540         * <code>IllegalArgumentException</code> is thrown.
541         * </li></ul></p> 
542         *
543         * @param alpha the desired significance level 
544         * @return half-width of 95% confidence interval for the slope estimate
545         * @throws MathException if the confidence interval can not be computed.
546         */
547        public double getSlopeConfidenceInterval(double alpha)
548            throws MathException {
549            if (alpha >= 1 || alpha <= 0) {
550                throw MathRuntimeException.createIllegalArgumentException(
551                      "out of bounds significance level {0}, must be between {1} and {2}",
552                      alpha, 0.0, 1.0);
553            }
554            return getSlopeStdErr() *
555                distribution.inverseCumulativeProbability(1d - alpha / 2d);
556        }
557    
558        /**
559         * Returns the significance level of the slope (equiv) correlation. 
560         * <p>
561         * Specifically, the returned value is the smallest <code>alpha</code>
562         * such that the slope confidence interval with significance level
563         * equal to <code>alpha</code> does not include <code>0</code>.
564         * On regression output, this is often denoted <code>Prob(|t| > 0)</code>
565         * </p><p>
566         * <strong>Usage Note</strong>:<br>
567         * The validity of this statistic depends on the assumption that the 
568         * observations included in the model are drawn from a
569         * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
570         * Bivariate Normal Distribution</a>.</p>
571         * <p>
572         * If there are fewer that <strong>three</strong> observations in the 
573         * model, or if there is no variation in x, this returns 
574         * <code>Double.NaN</code>.</p>
575         *
576         * @return significance level for slope/correlation
577         * @throws MathException if the significance level can not be computed.
578         */
579        public double getSignificance() throws MathException {
580            return 2d * (1.0 - distribution.cumulativeProbability(
581                        Math.abs(getSlope()) / getSlopeStdErr()));
582        }
583    
584        // ---------------------Private methods-----------------------------------
585    
586        /**
587        * Returns the intercept of the estimated regression line, given the slope.
588        * <p>
589        * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
590        *
591        * @param slope current slope
592        * @return the intercept of the regression line
593        */
594        private double getIntercept(double slope) {
595            return (sumY - slope * sumX) / n;
596        }
597    
598        /**
599         * Computes SSR from b1.
600         * 
601         * @param slope regression slope estimate
602         * @return sum of squared deviations of predicted y values
603         */
604        private double getRegressionSumSquares(double slope) {
605            return slope * slope * sumXX;
606        }
607        
608        /**
609         * Modify the distribution used to compute inference statistics.
610         * @param value the new distribution
611         * @since 1.2
612         */
613        public void setDistribution(TDistribution value) {
614            distribution = value;
615            
616            // modify degrees of freedom
617            if (n > 2) {
618                distribution.setDegreesOfFreedom(n - 2);
619            }
620        }
621    }