# Exercise 2.1

# Read data and estimate linear regression model
smoking <- read.table("data/tobacco.txt", header = TRUE, sep = "\t",
                      row.names = "COUNTRY")
smoking <- smoking[, c("CONSUMPTION", "ILL")]

head(smoking)
str(smoking)
View(smoking)

model <- lm(ILL ~ CONSUMPTION, data = smoking)

# a) Scatter plot and regression line

# Country labels
countries <- c("Iceland", "Norway", "Sweden", "Canada", "Denmark", "Austria",
               "USA", "Netherlands", "Switzerland", "Finland", "England")

# Plotting
plot(smoking$CONSUMPTION, smoking$ILL,
     ylab = "Cases in 1950",
     xlab = "CONSUMPTION in 1930",
     main = "CONSUMPTION/ILL per 100 000 individuals",
     pch = 16, cex = 1.5, col = "midnightblue",
     xlim = c(0, max(smoking$CONSUMPTION)),
     ylim = c(min(smoking$ILL), max(smoking$ILL) + 50))
abline(model, lty = 2, lwd = 2)

# Add labels for points
text(smoking$CONSUMPTION, smoking$ILL, labels = countries, cex = 0.5,
     pos = 3)

# b) fitted values and residuals.
names(model)
fit <- model$fitted.values
res <- model$residuals

# c)
# Scatter plot (ILL, FIT)
plot(smoking$ILL, fit, pch = 21, bg = "skyblue", cex = 1.5, ylab = "Fits",
     xlab = "Sick")
abline(a = 0, b = 1, col = "grey")

# label USA with ifelse function
text(smoking$ILL, fit, labels = ifelse(countries == "USA", "USA", NA), pos = 1)

cor(smoking$ILL, fit)^2
summary(model)$r.squared

# Scatter plot (FIT, RES)
plot(fit, res, pch = 21, bg = "skyblue", cex = 1.5,
     xlab = "Fits", ylab = "Residuals")
abline(h = 0, col = "grey")

# Another way to label USA
text(fit[7], res[7], labels = "USA", pos = 3)

# Useful diagnostics plots
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))

# d) The outlierness of USA is visible especially in (FIT,RES) plot.

# e)
cooksd <- cooks.distance(model)

# xaxt = "n" => No ticks at x-axis
# type = "h" => Plot vertical lines instead of points
plot(cooksd, xaxt = "n", type = "h", lwd = 3, xlab = NA,
     ylab = "Cook's distances")

# side = 1 => Modify x-axis (bottom axis)
# at = 1:11 => Add tick marks to places 1:11
# las = 2 => Rotate text 90 degrees
# cex.axis = 0.9 => Smaller x-axis labels
axis(side = 1, at = 1:11, labels = countries, las = 2, cex.axis = 0.9)

# f) Linear model without USA.

smoking2 <- smoking[-7, ]
model2 <- lm(ILL ~ CONSUMPTION, data = smoking2)

# Compare models
summary(model)
summary(model2)

plot(smoking$CONSUMPTION, smoking$ILL,
     ylab = "Cases in 1950",
     xlab = "CONSUMPTION in 1930",
     main = "CONSUMPTION/ILL per 100 000 individuals",
     pch = 16,
     cex = 1.5,
     col = ifelse(countries != "USA", "blue", "red"),
     xlim = c(0, max(smoking$CONSUMPTION)))
abline(model2, lty = 1, lwd = 2)
abline(model, lty = 2, lwd = 2)
legend("topleft", legend = c("No USA", "With USA"), lty = 1:2)

# Exercise 2.2

# install.packages('car')
library(car)
?vif
hald <- read.table("data/hald.txt", header = TRUE, sep = "\t")
str(hald)

# a) Estimation of the full model

fullmodel <- lm(HEAT ~ CHEM1 + CHEM2 + CHEM3 + CHEM4, data = hald)
summary(fullmodel)

# Model is statistically significant according to F-test.
# With 5% level of significance none of the variables are statistically
# significant.
# Remember normality assumptions for t-test and F-test!

# Residuals do not seem to be normally distributed.
res <- fullmodel$residuals
b <- seq(-4, 4, length.out = 9)
hist(res, breaks = b, border = FALSE, col = "red")

# Example: Calculate VIF of CHEM1
model1 <- lm(CHEM1 ~ CHEM2 + CHEM3 + CHEM4, data = hald)
r1 <- summary(model1)$r.squared
vif1 <- 1 / (1 - r1)
vif1

# Use function vif from package car
vif(fullmodel)
# VIF_i > 10 for all i = 1,..,4
# => Strong multicollinearity is present in the model.

# b)
?step

# Use AIC for variable selection
step(fullmodel)
model <- lm(HEAT ~ CHEM1 + CHEM2 + CHEM4, data = hald)
summary(model)