ESCI 340 Biostatistical Analysis John McLaughlin 12 Feb. 2014 Simple Linear Regression: Example with fabricated "data" Note: steps below use R as a calculator; you should obtain similar results, for similar effort, using any other calculating device. > # Generate X and Y data, using model: > # Y_i = alpha + beta*X_i + epsilon_i > # where epsilon is normally distributed (mean=0, sd=1) > xx <- c(0:10) > xx [1] 0 1 2 3 4 5 6 7 8 9 10 > epsilon <- rnorm(11, 0, 1) # rnorm generates random deviates > yy <- 0 + 1*xx + epsilon > yy [1] 0.7006623 1.2315538 1.4135857 2.7472831 3.8411617 4.9782227 7.5647158 [8] 5.4765739 8.8717807 8.2626596 9.2053993 > round(yy, 2) # show Y values w/ 2 decimal degrees of precision [1] 0.70 1.23 1.41 2.75 3.84 4.98 7.56 5.48 8.87 8.26 9.21 > plot(xx, yy) > lines(c(0, 10), c(0,10)) # Plot the line of the "actual" relationship > mean(xx) [1] 5 > mean(yy) [1] 4.935782 > sum((xx - 5)^2) # SS(X) [1] 110 > sum((yy - 4.936)^2) # SS(Y) [1] 102.0465 > sum((xx - 5)*(yy - 4.936)) # SXY [1] 102.2048 > bb <- 102.2 / 110 # estimate slope, b > bb [1] 0.929091 > aa <- mean(yy) - bb*mean(xx) # estimate intercept, a > aa [1] 0.2903271 > y1 <- aa + bb*0 # determine coodinates for endpoints of line > y1 [1] 0.2903271 > y2 <- aa + bb*10 > y2 [1] 9.581236 > plot(xx, yy) # Plot "data" points > lines(c(0, 10), c(0,10)) # Plot "true" line or relationship > lines(c(0, 10), c(y1, y2), lty=2) # Plot fitted line > # Test null hypothesis, H_o: beta = 0 > # Test with ANOVA > sxy <- sum((xx-5)*(yy - mean(yy))) > sxy [1] 102.2048 > ss.reg <- bb*sxy # Regression SS > ss.reg [1] 94.95758 > ss.tot <- 10*var(yy) # Total SS > ss.tot [1] 102.0465 > ss.res <- ss.tot - ss.reg # Residual SS > ss.res [1] 7.088938 > ms.res <- ss.res / 9 > ms.res [1] 0.7876598 > fcalc <- ss.reg / ms.res > fcalc [1] 120.5566 > # P < 0.0005 > # Test with t-test > sb <- sqrt(ms.res/110) # standard error of b > sb [1] 0.08462 > tcalc <- bb / sb > tcalc [1] 10.97957 > # P < 0.001 > # Calcuate 95% confidence limits for b > 2.262*sb [1] 0.1914104 > # 95% confidence interval: [0.73, 1.11] > # Calculate regression coefficient of determination > r2 <- ss.reg / ss.tot > r2 [1] 0.9305323 > # very high value > # Calculate XY correlation coefficient > rcor <- sxy / sqrt((sum((xx - 5)^2)*sum((yy - 4.936)^2))) > rcor [1] 0.9646638