ESCI 340 computer lab: Friday 5 February 2016 Bartlett's test for homoscedasticity, Nonparametric Analysis of Variance (Kruskal-Wallis test) > # *************************************************** > # Bartlett's test for homoscedasticity > # m&m location data from class 2/1/2016 > plain <- c(4, 24, 32, 44, 33, 14, 66, 49, 41, 29) > dark <- c(84, 86, 96, 99, 77, 71, 79, 90, 97, 84) > pnut <- c(26, 22, 25, 32, 49, 35, 52, 32, 35, 74) > par(mfrow = c(3,1)) > hist(plain, xlim=c(0,100)) > hist(dark, xlim=c(0,100)) > hist(pnut, xlim=c(0,100)) > # Variances for each sample > vp <- var(pnut) > vd <- var(dark) > vpl <- var(plain) > vp [1] 252.4 > vd [1] 85.34444 > vpl [1] 314.0444 > # Pooled variance = [ Sum(SS_i) / Sum(DF_i) ] > # SS_i = (n_i)*Var_i > vmm <- c(vp, vd, vpl) > spool <- sum(9*vmm) / 27 # n_i = 10 for all samples > spool [1] 217.263 > # in R, log = natural log (base e), log10 = base 10 logarithm > # B_calc = (log s_p^2)[Sum(df_i)] - Sum[df_i * log(s_i^2)] > > bcalc <- 27*log(spool) - sum(9*log(vmm)) > bcalc [1] 3.744723 > # Correction factor: C = 1 + [1/3(k-1)]*[Sum(1/df_i) - 1/Sum(df_i)] > ccalc <- 1 + (1/(3*2))*(3/9 - 1/27) > ccalc [1] 1.049383 > bc.calc <- bcalc / ccalc > bc.calc [1] 3.5685 # B_c is Chi-squared w/ k-1 DF # => 0.25 > P > 0.10 # Precise P-value: > 1 - pchisq(bc.calc, 2) [1] 0.1679229 > # *************************************************** > # Kruskal-Wallis nonparametric ANOVA > # m&m location data from class 2/1/2016 > sort(plain) [1] 4 14 24 29 32 33 41 44 49 66 > sort(dark) [1] 71 77 79 84 84 86 90 96 97 99 > sort(pnut) [1] 22 25 26 32 32 35 35 49 52 74 > # m&m location data from class 2/1/2016 > sort(c(plain, dark, pnut)) [1] 4 14 22 24 25 26 29 32 32 32 33 35 35 41 44 49 49 52 66 71 74 77 79 84 84 [26] 86 90 96 97 99 > # Ranks > r.plain <- c(1, 2, 4, 7, 9, 11, 14, 15, 16.5, 19) > r.dark <- c(20, 22, 23, 24.5, 24.5, 26, 27, 28, 29, 30) > r.pnut <- c(3, 5, 6, 9, 9, 12.5, 12.5, 16.5, 18, 21) > R.plain <- sum(r.plain) > R.dark <- sum(r.dark) > R.pnut <- sum(r.pnut) > R.plain [1] 98.5 > R.dark [1] 254 > R.pnut [1] 112.5 > # H_calc = [12 / N(N+1)] Sum(R_i^2/n_i) - 3(N+1) > h.calc <- (12/(30*31)) * sum((c(R.plain, R.dark, R.pnut)^2)/10) - 3*31 > h.calc [1] 19.09613 > # Correction factor: C = 1 - (Sum(t) / (N^3 - N), > # Sum(t) = Sum(t_i^3 - t_i), where t_i = no. ties in i_th group > # t_i = 3, 2, 2, 2 > t.i <- c(3, 2, 2, 2) > t.sum <- sum(t.i^3 - t.i) > t.sum [1] 42 > c.calc <- 1 - t.sum / (30^3 - 30) > c.calc [1] 0.9984427 > H.c <- h.calc / c.calc > H.c [1] 19.12591 > # H.c is Chi-squared w/ k-1 DF > # P < 0.001 > # Precise P-value > 1 - pchisq(H.c, 2) [1] 7.028468e-05 > # ****************************************************************** > # ANOVA example with "data" sampled from 3 normal distributions > # Generate sample "data" by sampling at random from normal distributions > d1 <- rnorm(10, 10, 1) # mean=10, sd=1 > d1 [1] 9.437802 9.871616 10.337045 9.353637 11.255971 11.519821 9.756238 [8] 7.844987 10.478557 9.541979 > d2 <- rnorm(10, 30, 5) # mean=30, sd=5 > d3 <- rnorm(10, 20, 5) # mean=20, sd=5 > d2 [1] 37.69680 41.14481 24.92904 21.63567 38.82300 25.43807 27.22682 24.75699 [9] 39.67105 42.61349 > d3 [1] 19.78196 21.07187 27.34954 11.98093 19.81077 18.47410 19.42451 24.65529 [9] 11.47386 22.67759 > range(c(d1, d2, d3)) [1] 7.844987 42.613489 > # Plot all 3 samples in a single histogram > hist(c(d1,d2,d3)) > # Plot 3 samples in individual histograms > par(mfrow = c(3,1)) > hist(d1, xlim=c(0,50)) > hist(d2, xlim=c(0,50)) > hist(d3, xlim=c(0,50)) > m.grand <- mean(c(d1, d2, d3)) > m.grand [1] 20.66779 > ss.tot <- 29*var(c(d1,d2,d3)) > ss.tot [1] 3376.616 > m1 <- mean(d1) > m2 <- mean(d2) > m3 <- mean(d3) > m.all <- c(m1,m2,m3) > m.all [1] 9.939765 32.393574 19.670043 > # Two ways to calculate groups SS: > ssg <- 10*2*var(m.all) > ssg [1] 2535.8 > ssgx <- 10*((m1-m.grand)^2 + (m2-m.grand)^2 + (m3-m.grand)^2) > ssgx [1] 2535.8 > sse <- ss.tot - ssg > sse [1] 840.8157 > msg <- ssg / 2 > mse <- sse / 27 > msg [1] 1267.9 > mse [1] 31.14132 > fcalc <- msg / mse > fcalc [1] 40.7144 > 1 - pf(fcalc, 2, 27) # P-value [1] 7.062499e-09 # ************************************************************* # ************************************************************* # R commands w/out R output # ************************************************************* # ************************************************************* # Bartlett's test for homoscedasticity # m&m location data from class 2/1/2016 plain <- c(4, 24, 32, 44, 33, 14, 66, 49, 41, 29) dark <- c(84, 86, 96, 99, 77, 71, 79, 90, 97, 84) pnut <- c(26, 22, 25, 32, 49, 35, 52, 32, 35, 74) par(mfrow = c(3,1)) hist(plain, xlim=c(0,100)) hist(dark, xlim=c(0,100)) hist(pnut, xlim=c(0,100)) # Variances for each sample vp <- var(pnut) vd <- var(dark) vpl <- var(plain) vp vd vpl # Pooled variance = [ Sum(SS_i) / Sum(DF_i) ] # SS_i = (n_i)*Var_i vmm <- c(vp, vd, vpl) spool <- sum(9*vmm) / 27 # n_i = 10 for all samples spool # in R, log = natural log (base e), log10 = base 10 logarithm > log(10) [1] 2.302585 > log10(10) [1] 1 # B_calc = (log s_p^2)[Sum(df_i)] - Sum[df_i * log(s_i^2)] bcalc <- 27*log(spool) - sum(9*log(vmm)) bcalc # Correction factor: C = 1 + [1/3(k-1)]*[Sum(1/df_i) - 1/Sum(df_i)] ccalc <- 1 + (1/(3*2))*(3/9 - 1/27) ccalc bc.calc <- bcalc / ccalc bc.calc # B_c is Chi-squared w/ k-1 DF # => 0.25 > P > 0.10 # Precise P-value: > 1 - pchisq(bc.calc, 2) [1] 0.1679229 # *************************************************** # Kruskal-Wallis nonparametric ANOVA # m&m location data from class 2/1/2016 plain <- c(4, 24, 32, 44, 33, 14, 66, 49, 41, 29) dark <- c(84, 86, 96, 99, 77, 71, 79, 90, 97, 84) pnut <- c(26, 22, 25, 32, 49, 35, 52, 32, 35, 74) sort(plain) sort(dark) sort(pnut) sort(c(plain, dark, pnut)) > sort(plain) [1] 4 14 24 29 32 33 41 44 49 66 > sort(dark) [1] 71 77 79 84 84 86 90 96 97 99 > sort(pnut) [1] 22 25 26 32 32 35 35 49 52 74 # Ranks r.plain <- c(1, 2, 4, 7, 9, 11, 14, 15, 16.5, 19) r.dark <- c(20, 22, 23, 24.5, 24.5, 26, 27, 28, 29, 30) r.pnut <- c(3, 5, 6, 9, 9, 12.5, 12.5, 16.5, 18, 21) R.plain <- sum(r.plain) R.dark <- sum(r.dark) R.pnut <- sum(r.pnut) R.plain R.dark R.pnut # H_calc = [12 / N(N+1)] Sum(R_i^2/n_i) - 3(N+1) h.calc <- (12/(30*31)) * sum((c(R.plain, R.dark, R.pnut)^2)/10) - 3*31 h.calc # Correction factor: C = 1 - (Sum(t) / (N^3 - N), # Sum(t) = Sum(t_i^3 - t_i), where t_i = no. ties in i_th group # t_i = 3, 2, 2, 2 t.i <- c(3, 2, 2, 2) t.sum <- sum(t.i^3 - t.i) t.sum c.calc <- 1 - t.sum / (30^3 - 30) c.calc H.c <- h.calc / c.calc # H.c is Chi-squared w/ k-1 DF # P < 0.001 # Precise P-value 1 - pchisq(H.c, 2) # ****************************************************************** # ANOVA example with "data" sampled from 3 normal distributions # Generate sample "data" by sampling at random from normal distributions d1 <- rnorm(10, 10, 1) d2 <- rnorm(10, 30, 5) d3 <- rnorm(10, 20, 5) hist(c(d1,d2,d3)) hist(d1, xlim=c(0,50)) hist(d2, xlim=c(0,50)) hist(d3, xlim=c(0,50)) m.grand <- mean(c(d1, d2, d3)) m.grand ss.tot <- 29*var(c(d1,d2,d3)) ss.tot m1 <- mean(d1) m2 <- mean(d2) m3 <- mean(d3) m.all <- c(m1,m2,m3) m.all ssg <- 10*2*var(m.all) ssgx <- 10*((m1-m.grand)^2 + (m2-m.grand)^2 + (m3-m.grand)^2) sse <- ss.tot - ssg msg <- ssg / 2 mse <- sse / 27 fcalc <- msg / mse pf(fcalc, 2, 27) 1 - pf(fcalc, 2, 27) # P-value