########################### # MS-C1620 # Statistical inference # Lecture 4 library(catdata) library(boot) # Significance level 0.05 thorughout # Note. We will here try to use "theta" as the name for the # probability parameter for a binomial distribution. # (It is often called "p", but this causes confusion with # p-values.) ################################################################# # Bernoulli and Binomial distributions # Prob. mass function for Beroulli theta <- 0.6 barplot(c(1 - theta, theta), names.arg = c(0, 1)) # Prob. mass function for Binomial theta <- 0.6 n <- 20 barplot(dbinom(0:20, n, theta), names.arg = 0:20) # Small-sample and/or skewed binomials are not quite normal # so it is better to work with the exact binomial distribution # in those cases theta <- 0.9 n <- 10 barplot(dbinom(0:10, n, theta), names.arg = 0:10) ################################################################# # Confidence intervals ################################################################# # Unemployment data(unemployment) head(unemployment) help("unemployment") # A random (?) sample of 982 subjects amongst unemployed persons. # What is the true proportion of long-term unemployed people among unemployed? barplot(table(unemployment$durbin), names.arg = c("Short-term", "Long-term")) # A point estimate and a 95% confidence interval longterm <- 1*(unemployment$durbin == 2) longterm n <- length(longterm) m <- mean(longterm) m ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n)) ci plot(0, col = 0, xlim = c(0, 1)) abline(h = 0, lwd = 2) abline(v = c(m, ci), col = c(1, 2, 2)) #################################################### # Women with no offspring data(children) head(children) help("children") # A random (?) sample of 1761 subjects amongst all German women of certain age. # What is the true proportion of women without children? barplot(table(children$child)) # A point estimate and a 95% confidence interval (bootstrap) nochildren <- 1*(children$child == 0) nochildren n <- length(nochildren) m <- mean(nochildren) m B <- 1000 mean_boot <- function(x, indices) mean(x[indices]) boots <- sort(boot(nochildren, mean_boot, B)$t) boots ci <- c(boots[floor(B*0.025)], boots[floor(B*0.975)]) ci plot(0, col = 0, xlim = c(0, 1)) abline(h = 0, lwd = 2) abline(v = c(m, ci), col = c(1, 2, 2)) ################################################################# # Comparison of the coverage probabilities of bootstrap and CLT confidence intervals for binary data # We will be setting the seed of the random generator # before generating the random numbers, to ensure # fair comparison between CLT and bootstrap. # (They will be working with the same numbers.) theseed <- 876362 ########### CLT n <- 10 theta <- 0.6 set.seed(theseed) x <- rbinom(n, 1, theta) x m <- mean(x) m # applying "magic formula" that uses normal approximation ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n)) ci # Is the true value inside the interval? theta >= ci[1] & theta <= ci[2] # The previous into function: clt_ci <- function(n, theta){ m <- mean(rbinom(n, 1, theta)) ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n)) theta >= ci[1] & theta <= ci[2] } # Large number of replications: proportion of intervals which cover the true value: set.seed(theseed) mean(replicate(10000, clt_ci(n,theta))) # Coverage only about 90%, not 95% as desired. # That is, we are not covering the true value # about 10% of the time. ########### Bootstrap set.seed(theseed) x <- rbinom(n, 1, theta) mean(x) B <- 1000 mean_boot <- function(x, indices) mean(x[indices]) boots <- sort(boot(x, mean_boot, B)$t) ci <- c(boots[floor(B*0.025)], boots[floor(B*0.975)]) ci # Is the true value in the interval? theta >= ci[1] & theta <= ci[2] # The previous into function: clt_bs <- function(n, theta, B){ x <- rbinom(n, 1, theta) boots <- sort(boot(x, mean_boot, B)$t) ci <- c(boots[floor(B*0.025)], boots[floor(B*0.975)]) theta >= ci[1] & theta <= ci[2] } # Large number of replications: proportion of intervals which cover the true value: set.seed(theseed) mean(replicate(1000, clt_bs(n, theta, 1000))) # (takes about 10 seconds) ########### Comparison set.seed(theseed) mean(replicate(1000, clt_ci(n, theta))) set.seed(theseed) mean(replicate(1000, clt_bs(n, theta, 1000))) # Bootstrap CI gives a better coverage (close to the 95%) but at the cost of requiring more computation ############################################################## # One-sample proportion test # The data set "faithful" contains the waiting times between eruptions and the durations of the eruption # for the Old Faithful geyser in Yellowstone National Park, Wyoming, USA. faithful # A park ranger claims that less than every fourth eruption has a duration longer than 4 minutes 30 seconds # Test her claim using the one-sample proportion test and the significance level 5% # theta = probability of long eruption (unknown parameter) # H0: theta >= 1/4 Composite null hypothesis! # H1: theta < 1/4 # Binary data 1*(faithful$eruptions > 4.5) n <- length(faithful$eruptions) n # Number of eruptions longer than 4 min 30 sec. x <- sum(faithful$eruptions > 4.5) x x/n # If the null hypotehsis is true (the prob. of long eruptions is 1/4) # then the distribution of the test statistic is: barplot(dbinom(0:n, n, 1/4), names.arg = 0:n, width = 1, space = 0) # We observed: abline(v = x, col = 2, lwd = 2) # Deviations toward H1 are in the low end of the graph (small numbers of long eruptions imply small prob.) # Proportion test prop.test(x, n, 0.25, alternative = "less", correct = FALSE) # -> Not enough evidence against the null -> not enough evidence for the ranger's claim # WARNING. # The third argument to the R function "prop.test" is confusingly named "p", # but it really is the probability parameter of the null hypothesis, # which we are calling "theta". # # DO NOT CONFUSE this parameter with the p-value. # # The p-value is a RESULT of the test, and quite different from the # probability parameter, even though R confusingly calls both "p". ############################################################################### # Unemployment data data("unemployment") head(unemployment) # Claim: more than 30% of unemployed are long-term unemployed # theta = probability of long-term unemployment # H0: theta <= 0.30 Composite null hypothesis! # H1: theta > 0.30 longterm <- 1*(unemployment$durbin == 2) x <- sum(longterm) n <- length(longterm) x/n # Is it likely to get this large proportion even if the true probability is 0.30? prop.test(x, n, 0.30, alternative = "greater", correct = FALSE) # H0 not plausible -> more than 30% of unemployed are long-term unemployed # Above we rejected the hypothesis theta=0.30 at p-value 0.0004821. # What would we do with hypotheses with even smaller p's? prop.test(x, n, 0.29, alternative = "greater", correct = FALSE) prop.test(x, n, 0.27, alternative = "greater", correct = FALSE) prop.test(x, n, 0.25, alternative = "greater", correct = FALSE) # The p-value would be even smaller. # So we can, at significance level 0.05, # reject not just the point hypothesis "theta=0.30", # but also all of the composite hypothesis "theta<=0.30". ############################################################## # Two-sample proportion test head(iris) # An iris flower is said to have odd-shaped petals if the petal # length/width ratio is smaller than 3. # A botanist claims that it is more common for the iris of species virginica to have odd-shaped petals # than for the iris of species versicolor. # Test his claim using the two-sample proportion test and the significance level 5% # theta1 = probability of odd-shaped petals for virginica # theta2 = probability of odd-shaped petals for versicolor # H0: theta1 = theta2 # H1: theta1 > theta2 virginica <- iris[iris$Species == "virginica", ] x1 <- sum(virginica$Petal.Length/virginica$Petal.Width < 3.0) n1 <- nrow(virginica) x1/n1 versicolor <- iris[iris$Species == "versicolor", ] x2 <- sum(versicolor$Petal.Length/versicolor$Petal.Width < 3.0) n2 <- nrow(versicolor) x2/n2 prop.test(c(x1, x2), c(n1, n2), alternative = "greater", correct = FALSE) # Strong evidence against the null -> virginica has more frequently odd-shaped petals ##################################################################### # Quiz # 1 x <- c(5, -4, -2, 2) mean(x) sd(x) median(x) median(abs(x - median(x))) c(min(x), max(x)) sign(x) rank(x) rank(abs(x))*sign(x) # 2 # a. data with no outliers or with a bounded range # b. data with small amounts of outliers # c. discrete data with a small amount of possible values # 3 # a. data with no outliers or with a bounded range # b. data with small amouns of outliers # c. data with bounded range # 4 # a. the confidence interval is accurate, in the sense that it pinpoints the true value accurately (does not necessary have a large coverage probability, though) # b. we require lots of "evidence" to reject the null (accept any "effects") # c. no evidence against null (no "effect" is seen) # d. we think there is an effect when there is none # e. we fail to see an effect # 5 # a. Two boxplots or histograms set.seed(123) heights <- data.frame(height = c(rnorm(40, 175, 5), rnorm(40, 165, 5)), sex = c(rep(0, 40), rep(1, 40))) boxplot(height ~ sex, data = heights) h1 <- hist(heights[heights$sex == 0, 1], breaks = 8) h2 <- hist(heights[heights$sex == 1, 1], breaks = 8) plot(h1, xlim = c(150, 190), col = rgb(0,0,1,1/4)) plot(h2, add = TRUE, col = rgb(1,0,0,1/4)) # or par(mfrow = c(2, 1)) plot(h1, xlim = c(150, 190), col = rgb(0,0,1,1/4)) plot(h2, xlim = c(150, 190), col = rgb(1,0,0,1/4)) par(mfrow = c(1, 1)) # b. Bar chart? barplot(table(sample(0:24, 300, TRUE))) # c. Bar chart again? props <- c(0.10, 0.04, 0.03, 0.01, 0.09) barplot(props, names.arg = 1:5) # d. Time series plot x <- cbind(20 + arima.sim(n = 100, list(ar = c(0.85)), sd = sqrt(10)), 1000 + arima.sim(n = 100, list(ar = c(0.65)), sd = sqrt(1000)), 420 + arima.sim(n = 100, list(ar = c(0.55)), sd = sqrt(500))) plot.ts(x) # e. A map with the postal areas colored based on the median salary in the region? # ... # 6. single_t <- function(){ x <- rnorm(10) sqrt(10)*x/sd(x) } single_sign <- function(){ x <- rnorm(10) sum(x > 0) } single_rank <- function(){ x <- rnorm(10) sum((sign(x) == 1)*rank(abs(x))) } par(mfrow = c(1, 3)) hist(replicate(1000, single_rank()), xlab = "", main = "") hist(replicate(1000, single_t()), xlab = "", main = "") hist(replicate(1000, single_sign()), xlab = "", main = "") par(mfrow = c(1, 1)) data("psych") library(psych) data() ############################################################# # Comparison of t-test, sign test and one-sample signed rank test data(cd4) cd4 help(cd4) # Does the drug increase the amount of CD4 cells in the blood? # mu = the location of the distribution of the difference between oneyear and baseline # H0: mu = 0 # H1: mu > 0 # Visualization matplot(t(cd4), type = "l") difference <- cd4$oneyear - cd4$baseline difference hist(difference) # Paired t-test (assumes normality) t.test(difference, mu = 0, alternative = "greater") # small p-value -> drug works (assuming normality holds) # Paired sign test (assumes continuity) # No zero values sum(difference == 0) binom.test(sum(difference > 0), length(difference), alternative = "greater") # Small p-value -> drug works # Paired signed rank test (assumes continuity and symmetry) wilcox.test(x, mu = 0, alternative = "greater") # Small p-value -> drug works