###########################
# MS-C1620
# Statistical inference
# Lecture 3
library(scales)
library(EnvStats)
# The significance level is always taken to be equal to 0.05 below.
####################################################################################
# One-sample sign test
faithful
# Claim: the location of the eruption times equals 3 minutes 35 seconds
plot(faithful)
hist(faithful$eruptions, breaks = 25)
# Is the median of the eruption times equal to 3 minutes 35 seconds (3.583 min)?
# H0: med == 3.583
# H1: med != 3.583
n_obs <- length(faithful$eruptions)
# The test statistic is the number of "red" points, i.e. observations above 3.583.
plot(faithful$eruptions, rep(1, n_obs),
     col = as.factor(faithful$eruptions > 3.583))
abline(v = 3.583)
n_above <- sum(faithful$eruptions > 3.583)
# Under H0 the count is Bin(n, 1/2), so we expect about n_obs/2 = 136
# observations above 3.583.
grid <- (n_obs/2 - 50):(n_obs/2 + 50)
plot(grid, dbinom(grid, n_obs, 0.5), type = "b")
abline(h = 0)
abline(v = 0.5 * n_obs, lwd = 2)
# The corresponding observed count is 164.
# The p-value is the probability mass outside the green lines.
abline(v = 0.5 * n_obs + abs(n_above - 0.5 * n_obs), col = 3, lwd = 2) # observed
abline(v = 0.5 * n_obs - abs(n_above - 0.5 * n_obs), col = 3, lwd = 2)
# The one-sample sign test is exactly an "Exact Binomial Test" on the sign count.
# Significance level 5%
binom.test(n_above, n_obs)
# Extremely small p-value -> reject H0
# The same test using a normal approximation to Bin(n, 1/2):
z_stat <- (n_above - n_obs/2)/sqrt(n_obs/4)
curve(dnorm, from = -5, to = 5)
abline(v = 0, lwd = 2)
abline(h = 0)
# The p-value is the probability mass outside the green lines.
abline(v = -abs(z_stat), col = 3, lwd = 2) # observed
abline(v = abs(z_stat), col = 3, lwd = 2)
2*(1 - pnorm(abs(z_stat)))
# Reject H0
####################################################################################
# One-sample sign test vs. one-sample t-test
# The t-test for the same data, at significance level 5%:
t.test(faithful$eruptions, mu = 3.583)
# Both tests target the "location" 3.583 -- why do the conclusions differ?
# The histogram below (strongly bimodal, far from normal) suggests an answer.
hist(faithful$eruptions, breaks = 25)
####################################################################################
# Power comparison
# We have a sample of size 20 from a N(mu, 1)-distribution
# H0: mu == 0
# H1: mu != 0
# Tests: one-sample t-test vs. one-sample sign test, both on significance level 5%
#
# Type I error rates:
# we generate data where H0 is true and check whether the test "sees" this.
# The null distribution of the t-statistic with the 5% critical points:
t_density <- function(x) dt(x, df = 19)
curve(t_density, from = -5, to = 5)
abline(h = 0)
abline(v = qt(0.025, df = 19), col = 2, lwd = 2)
abline(v = qt(0.975, df = 19), col = 2, lwd = 2)
# Observed value for a single sample drawn under H0
x <- rnorm(20, 0, sqrt(1))
abline(v = t.test(x)$statistic, lty = 2)
# Repeat multiple times
replicate(20, abline(v = t.test(rnorm(20, 0, sqrt(1)))$statistic, lty = 2))
# Simulated rejection probabilities under H0 (both should be close to 0.05)
mean(replicate(10000, t.test(rnorm(20, 0, sqrt(1)))$p.value < 0.05))
mean(replicate(10000, binom.test(sum(rnorm(20, 0, sqrt(1)) > 0), 20)$p.value < 0.05))
# Type II error when the true value is actually mu = 1
# The null distribution
curve(t_density, from = -4, to = 6)
abline(h = 0)
abline(v = qt(0.025, df = 19), col = 2, lwd = 2)
abline(v = qt(0.975, df = 19), col = 2, lwd = 2)
# Observed value for a single sample drawn with mu = 1
x <- rnorm(20, 1, sqrt(1))
abline(v = t.test(x)$statistic, lty = 2)
# Repeat multiple times
replicate(20, abline(v = t.test(rnorm(20, 1, sqrt(1)))$statistic, lty = 2))
# Simulated probabilities of NOT rejecting when mu = 1 (type II error)
mean(replicate(10000, t.test(rnorm(20, 1, sqrt(1)))$p.value >= 0.05))
mean(replicate(10000, binom.test(sum(rnorm(20, 1, sqrt(1)) > 0), 20)$p.value >= 0.05))
# Sign test has larger type II error for normally distributed data!
####################################################################################
# Discrete data and the one-sample sign test
# We test whether the population median equals 5 for the following simulated sample
set.seed(012019)
x <- sample(0:10, 30, replace = TRUE)
# With discrete data some observations can equal the hypothesized median exactly.
# Conservative approach: of the two possible sign counts (> or >=) we use the
# one that gives the LARGER p-value, i.e. the zero differences are placed on
# the side that "balances" the signs.
# The two counts may differ:
sum(x - 5 > 0)
sum(x - 5 >= 0)
# 17 negative, 2 zeroes, 11 positive -> the zeroes join the positive side
# in the conservative approach.
binom.test(sum(x - 5 > 0), 30)$p.value
binom.test(sum(x - 5 >= 0), 30)$p.value # Conservative choice
####################################################################################
# Paired sign test
# Weights of ten mice before the treatment
before <- c(200.1, 190.9, 192.7, 213, 241.4, 196.9, 172.2, 185.5, 205.2, 193.7)
# Weights of the same mice after the treatment
after <- c(392.9, 393.2, 345.1, 393, 434, 427.9, 422, 383.9, 392.3, 352.2)
# Does the treatment affect the weight?
x <- cbind(before, after)
matplot(t(x), type = "l")
abline(v = c(1, 2))
# Work with the within-pair differences
d <- after - before
plot(d, rep(1, 10))
# Hard to say whether the differences are normally distributed -> sign test
# H0: med_d == 0, i.e. the treatment had no effect
# H1: med_d != 0, i.e. the treatment had an effect
binom.test(sum(d > 0), 10)
# -> The treatment had an effect
####################################################################################
# One-sample signed rank test
# Assumes that the distribution is symmetric
nottem
# Average January temperatures at Nottingham, 1920-1939
# (nottem is a monthly series starting January 1920, so every 12th value is a January)
nottem_jan <- nottem[seq(from = 1, length.out = 20, by = 12)]
# The sample looks symmetric:
hist(nottem_jan)
boxplot(nottem_jan)
# Is the (population) January average equal to 40 degrees Fahrenheit?
# H0: med == 40
# H1: med != 40
# Significance level 5%
# Ranks of the absolute deviations from the hypothesized median 40
nottem_r <- rank(abs(nottem_jan - 40))
n_jan <- length(nottem_jan)
nottem_jan
nottem_r
plot(nottem_jan, rep(1, n_jan))
abline(v = 40)
text(nottem_jan, rep(0.9, n_jan), labels = as.character(nottem_r))
# One-sample signed rank test
wilcox.test(nottem_jan, mu = 40)
# No evidence against the null
####################################################################################
# Power comparison continued
# Type II error of the one-sample signed rank test in the same setting as before.
# Simulated null distribution of the signed rank statistic for H0: med = 0
nulls <- replicate(1000, wilcox.test(rnorm(20, 0, sqrt(1)))$statistic)
hist(nulls, breaks = 30, xlim = c(0, 250))
crit <- quantile(nulls, c(0.025, 0.975))
abline(v = crit[1], lwd = 2, col = 2)
abline(v = crit[2], lwd = 2, col = 2)
# Test statistic values for samples drawn with med = 1
abline(v = replicate(20, wilcox.test(rnorm(20, 1, sqrt(1)))$statistic), lty = 2)
mean(replicate(10000, wilcox.test(rnorm(20, 1, sqrt(1)))$p.value >= 0.05))
# Better type II error rate than the sign test thanks to the extra rank information!
####################################################################################
# Two-sample rank test
# Data which show the effect of two soporific drugs
# (increase in hours of sleep compared to control) on 10 patients.
sleep
# Do the drugs differ in their effect? Significance level 5%.
# H0: med1 == med2
# H1: med1 != med2
# Are the two distributions the same up to a shift?
group1 <- sleep[sleep$group == 1, 1]
group2 <- sleep[sleep$group == 2, 1]
hist(group1)
hist(group2)
plot(group1, rep(1, 10), xlim = c(-2, 6), cex = 1.5)
abline(h = 1.0)
points(group2, rep(0.8, 10), col = 2, cex = 1.5)
abline(h = 0.8, col = 2)
# Maybe...
wilcox.test(group1, group2)
# p-value 0.069 - not enough evidence against H0
# Recall that the t-test gave a p-value of 0.079:
t.test(group1, group2)
####################################################################################
# Two-sample rank test with ranked data
# The 10 best selling albums of all time with the corresponding nationalities (US, non-US) are:
# (https://www.pastemagazine.com/articles/2018/08/the-best-selling-albums-of-all-time.html)
# 10. Alanis Morissette: Jagged Little Pill (non-US)
# 9. Adele: 21 (non-US)
# 8. AC/DC: Back in Black (non-US)
# 7. Fleetwood Mac: Rumours (non-US)
# 6. Whitney Houston (Various Artists): The Bodyguard Soundtrack (US)
# 5. Led Zeppelin: Led Zeppelin IV (non-US)
# 4. Shania Twain: Come on Over (non-US)
# 3. Eagles: Hotel California (US)
# 2. Eagles: Their Greatest Hits 1971-1975 (US)
# 1. Michael Jackson: Thriller (US)
# Is US music more popular (in the sense of more albums sold)? Significance level 5%.
# H0: med_US == med_nonUS
# H1: med_US != med_nonUS
# The medians can be thought of as the medians of album sales in the two regions.
albums <- data.frame(rank = 1:10,
                     origin = c("US", "US", "US", "non-US", "non-US",
                                "US", "non-US", "non-US", "non-US", "non-US"))
us_ranks <- albums$rank[albums$origin == "US"]
nonus_ranks <- albums$rank[albums$origin == "non-US"]
us_ranks
nonus_ranks
wilcox.test(us_ranks, nonus_ranks)
# US albums are more popular (p-value = 0.0381)