################################
# MS-C1620
# Summary lecture
#
# Four small worked examples: simple linear regression, two-sample
# location tests, a two-sample proportion test, and a bootstrap
# confidence interval for the Gini coefficient.

library(carData)   # Highway1, Vocab, SLID data sets
library(tidyverse) # %>%, filter(), ggplot()
library(MASS)      # survey data set
library(reldist)   # gini()
# Note: MASS is attached after tidyverse, so select() resolves to
# MASS::select(); call dplyr::select() explicitly if it is ever needed.


####################
# Example problem 1

# Data with 39 highways in Minnesota
# Explore the connection between
# - Accident rate per million vehicle miles
# - Percentage of trucks (out of all vehicles)

data_1 <- data.frame(
  accident_rate = Highway1$rate,
  truck_perc = Highway1$trks
)
head(data_1)

# Scatter plot of the response against the explanatory variable
plot(accident_rate ~ truck_perc, data = data_1)

# Simple linear regression of accident rate on truck percentage
lm_1 <- lm(accident_rate ~ truck_perc, data = data_1)
summary(lm_1)

# Residuals vs. fitted values, with a horizontal reference line at zero
resid_1 <- data.frame(res = residuals(lm_1), fit = fitted(lm_1))
plot(res ~ fit, data = resid_1)
abline(h = 0)


####################
# Example problem 2

# Data with 30351 subjects
# Explore the connection between
# - Education in years, categorical variable with groups (5, 10], (10, 15]
# - Vocabulary test score, 1-10

# cut() assigns NA to education values outside (5, 15]; those rows are
# dropped so that exactly two groups remain.
data_2 <- data.frame(
  education = cut(Vocab$education, breaks = c(5, 10, 15)),
  test_score = Vocab$vocabulary
) %>%
  filter(!is.na(education))
head(data_2)

# Score distribution per education group: box plots and, per facet,
# the within-group proportion of each score.
boxplot(test_score ~ education, data = data_2)
ggplot(data_2, aes(x = test_score)) +
  # after_stat(prop) replaces the dot-dot notation (..prop..) that was
  # deprecated in ggplot2 3.4.0.
  geom_bar(aes(y = after_stat(prop), group = 1)) +
  facet_wrap(. ~ education)

# Two-sample t-test (t.test() defaults to the Welch version)
ttest_2 <- t.test(test_score ~ education, data = data_2)
ttest_2

# Rank-based alternative: Wilcoxon rank-sum test
ranktest_2 <- wilcox.test(test_score ~ education, data = data_2)
ranktest_2


####################
# Example problem 3

# Data with 237 Statistics I students at the University of Adelaide
# Research question:
# Is smoking equally common among both sexes?

# smoking is TRUE for every answer other than "Never"; rows with a
# missing sex or a missing smoking answer are removed.
data_3 <- data.frame(
  sex = survey$Sex,
  smoking = !(survey$Smoke == "Never")
) %>%
  filter(!is.na(sex), !is.na(smoking))
head(data_3)

# Mosaic plot of the sex x smoking contingency table
plot(table(data_3))

# Number of smokers and group size per sex, then a test of equal
# proportions between the two groups.
counts_3 <- c(by(data_3$smoking, data_3$sex, sum))
totals_3 <- c(table(data_3$sex))
prop.test(counts_3, totals_3)


####################
# Example problem 4

# Data with 4147 subjects
# Research question:
# Is the Gini-coefficient of wages in Canada smaller than 0.25?
#
# Gini-coefficient measures the inequality in the income distribution
# * 0 perfect equality
# * 1 perfect inequality

data_4 <- data.frame(wages = SLID$wages) %>%
  filter(!is.na(wages))
head(data_4)

# Wage distribution and the point estimate of the Gini coefficient
hist(data_4$wages, breaks = 30)
gini(data_4$wages)

# Nonparametric bootstrap: resample the wages with replacement B times
# and recompute the Gini coefficient for each resample.
# No seed is set in this script, so the results vary slightly between runs.
n <- nrow(data_4)
B <- 1000
res <- numeric(B) # preallocated instead of growing a NULL vector
for (b in seq_len(B)) {
  res[b] <- gini(sample(data_4$wages, n, replace = TRUE))
}
hist(res)

# 99% percentile bootstrap confidence interval
quantile(res, c(0.005, 0.995))