########################### # MS-C1620 # Statistical inference # Lecture 5 # We use the significance level 0.05 throughout the script ################################################ # Normality testing # Consider three data sets: # 1. The sepal lengths of 100 irises of the species Versicolor and Virginica # 2. The January sunspot numbers for the years 1749-1848 # 3. A random sample of size 100 from the standard normal distribution x1 <- iris[51:150, 1] x2 <- sunspots[seq(1, by = 12, length.out = 100)] x3 <- rnorm(100) # Do the samples look normal? hist(x1, breaks = 10) hist(x2, breaks = 10) hist(x3, breaks = 10) # Multiple histograms of normal samples par(mfrow = c(4, 2)) invisible(replicate(8, hist(rnorm(100), breaks = 10))) par(mfrow = c(1, 1)) # Bowman-Shenton/Jarque-Bera test library(tseries) jarque.bera.test(x1) # No evidence against normality jarque.bera.test(x2) # Strong evidence aganist normality jarque.bera.test(x3) # QQ-plot qqnorm(x1) qqline(x1) # The sample is slightly right-skew qqnorm(x2) qqline(x2) # The sample is heavily right-skew qqnorm(x3) qqline(x3) # Shapiro-Wilk test shapiro.test(x1) # No evidence against normality shapiro.test(x2) # Strong evidence against normality shapiro.test(x3) ##################################################################### # Multinomial distribution # Probability of having eye color # BROWN | GREEN | BLUE # 0.43 | 0.32 | 0.25 # We observe 292 people # A sample from Multinomial(292, 0.43, 0.32, 0.25) rmultinom(1, 292, c(0.43, 0.32, 0.25)) ##################################################################### # Chi-squared tests ########################## # X^2 goodness-of-fit for discrete data #Distribution of hair and eye color and sex in 592 statistics students. HairEyeColor # H0: the distribution (probabilities) of hair colors is (0.20, 0.50, 0.10, 0.20) for # (Black, Brown, Red, Blond)? hair <- margin.table(HairEyeColor, 1) hair hair/sum(hair) # Observed and expected hair sum(hair)*c(0.2, 0.5, 0.1, 0.2) chisq.test(hair, p = c(0.20, 0.50, 0.10, 0.20)) # No evidence against H0 ########################## # X^2 goodness-of-fit for discretized data # Is the sunspot data we used before from an exponential distribution x2 <- sunspots[seq(1, by = 12, length.out = 100)] hist(x2, breaks = 15, probability = TRUE, ylim = c(0, 0.022)) # Estimate the parameters lambdahat <- 1/mean(x2) lambdahat expfit <- function(x){ dexp(x, rate = lambdahat) } curve(expfit, add = TRUE, col = 3, lwd = 2) # Break the distribution into 8 categories, (0, 25, 50,...) + leftover category hist(x2, breaks = 15, xlim = c(-1, 201), ylim = c(0, 23)) abline(v = seq(0, 200, by = 25), col = "red", lwd = 2) # Observed frequencies cum_x2_obs <- sapply(seq(0, 200, by = 25), function(i) sum(x2 <= i))[-1] x2_obs <- c(cum_x2_obs[1], diff(cum_x2_obs), 0) x2_obs # Expected frequencies curve(expfit, col = 3, lwd = 2, from = 0, to = 201) abline(h = 0, lwd = 1) segments(seq(0, 200, by = 25), rep(0, 9), seq(0, 200, by = 25), expfit(seq(0, 200, by = 25)), col = 2, lwd = 2) p_exp <- diff(pexp(seq(0, 200, by = 25), rate = lambdahat)) x2_exp <- 100*p_exp c(x2_exp, 100 - sum(x2_exp)) # The probabilities don't sum to one so we add one extra category # Degrees of freedom is "categories" - 1 - "est. parameters" = 9 - 1 - 1 = 7 pchisq(chisq.test(x2_obs, p = c(p_exp, 1 - sum(p_exp)))$statistic, df = 7, lower.tail = FALSE) # No evidence against the null that the data comes from an exponential distribution ########################## # X^2 test of homogeneity # This data set provides information on the fate of passengers on the fatal maiden voyage # of the ocean liner 'Titanic', summarized according to economic status (class), sex, age and survival. Titanic # Let's assume that the 1st, 2nd and 3rd class ticket amounts (and the crew size) were fixed and # each category had a specific probability to survive Titanic_2 <- margin.table(Titanic, c(1, 4)) Titanic_2 # Is the probability to survive the same in each category? chisq.test(Titanic_2) # Strong evidence against H0 ########################## # X^2 test of independence # Cross-classification of a sample of British males according to each subject's # occupational status and his father's occupational status. occupationalStatus heatmap(occupationalStatus, Colv = NA, Rowv = NA, revC = TRUE) # Are the occupational statuses of a father and his son independent? chisq.test(occupationalStatus) # Strong evidence against H0 # (the warning has to do with some cell counts being very low, # and the extreme classes could be combined to give a more reliable result)