# Exercise 2.1 # Read data and estimate linear regression model smoking <- read.table("data/tobacco.txt", header = TRUE, sep = "\t", row.names = "COUNTRY") smoking <- smoking[, c("CONSUMPTION", "ILL")] head(smoking) str(smoking) View(smoking) model <- lm(ILL ~ CONSUMPTION, data = smoking) # a) Scatter plot and regression line # Country labels countries <- c("Iceland", "Norway", "Sweden", "Canada", "Denmark", "Austria", "USA", "Netherlands", "Switzerland", "Finland", "England") # Plotting plot(smoking$CONSUMPTION, smoking$ILL, ylab = "Cases in 1950", xlab = "CONSUMPTION in 1930", main = "CONSUMPTION/ILL per 100 000 individuals", pch = 16, cex = 1.5, col = "midnightblue", xlim = c(0, max(smoking$CONSUMPTION)), ylim = c(min(smoking$ILL), max(smoking$ILL) + 50)) abline(model, lty = 2, lwd = 2) # Add labels for points text(smoking$CONSUMPTION, smoking$ILL, labels = countries, cex = 0.5, pos = 3) # b) fitted values and residuals. names(model) fit <- model$fitted.values res <- model$residuals # c) # Scatter plot (ILL, FIT) plot(smoking$ILL, fit, pch = 21, bg = "skyblue", cex = 1.5, ylab = "Fits", xlab = "Sick") abline(a = 0, b = 1, col = "grey") # label USA with ifelse function text(smoking$ILL, fit, labels = ifelse(countries == "USA", "USA", NA), pos = 1) cor(smoking$ILL, fit)^2 summary(model)$r.squared # Scatter plot (FIT, RES) plot(fit, res, pch = 21, bg = "skyblue", cex = 1.5, xlab = "Fits", ylab = "Residuals") abline(h = 0, col = "grey") # Another way to label USA text(fit[7], res[7], labels = "USA", pos = 3) # Useful diagnostics plots par(mfrow = c(2, 2)) plot(model) par(mfrow = c(1, 1)) # d) The outlierness of USA is visible especially in (FIT,RES) plot. # e) cooksd <- cooks.distance(model) # xaxt = "n" => No ticks at x-axis # type = "h" => Plot vertical lines instead of points plot(cooksd, xaxt = "n", type = "h", lwd = 3, xlab = NA, ylab = "Cook's distances") # side = 1 => Modify x-axis (bottom axis) # at = 1:11 => Add tick marks to places 1:11 # las = 2 => Rotate text 90 degrees # cex.axis = 0.9 => Smaller x-axis labels axis(side = 1, at = 1:11, labels = countries, las = 2, cex.axis = 0.9) # f) Linear model without USA. smoking2 <- smoking[-7, ] model2 <- lm(ILL ~ CONSUMPTION, data = smoking2) # Compare models summary(model) summary(model2) plot(smoking$CONSUMPTION, smoking$ILL, ylab = "Cases in 1950", xlab = "CONSUMPTION in 1930", main = "CONSUMPTION/ILL per 100 000 individuals", pch = 16, cex = 1.5, col = ifelse(countries != "USA", "blue", "red"), xlim = c(0, max(smoking$CONSUMPTION))) abline(model2, lty = 1, lwd = 2) abline(model, lty = 2, lwd = 2) legend("topleft", legend = c("No USA", "With USA"), lty = 1:2) # Exercise 2.2 # install.packages('car') library(car) ?vif hald <- read.table("data/hald.txt", header = TRUE, sep = "\t") str(hald) # a) Estimation of the full model fullmodel <- lm(HEAT ~ CHEM1 + CHEM2 + CHEM3 + CHEM4, data = hald) summary(fullmodel) # Model is statistically significant according to F-test. # With 5% level of significance none of the variables are statistically # significant. # Remember normality assumptions for t-test and F-test! # Residuals do not seem to be normally distributed. res <- fullmodel$residuals b <- seq(-4, 4, length.out = 9) hist(res, breaks = b, border = FALSE, col = "red") # Example: Calculate VIF of CHEM1 model1 <- lm(CHEM1 ~ CHEM2 + CHEM3 + CHEM4, data = hald) r1 <- summary(model1)$r.squared vif1 <- 1 / (1 - r1) vif1 # Use function vif from package car vif(fullmodel) # VIF_i > 10 for all i = 1,..,4 # => Strong multicollinearity is present in the model. # b) ?step # Use AIC for variable selection step(fullmodel) model <- lm(HEAT ~ CHEM1 + CHEM2 + CHEM4, data = hald) summary(model)