setwd("~/School/Prediction/Week_5")

library(car)
library(lmtest) # install.packages("lmtest")

alko <- read.table("data/alcohol.txt", header = TRUE, sep = "\t")
alko <- alko[, 1:5][,-2]
colnames(alko)[2:4] <- c('Alcohol_C', 'Price_I', 'Total_C')

lm_plotter <- function(data, model, response_name, lag_max) {
  par(mfrow = c(3,2))
  
  start_year <- data$YEAR[1]
  
  y <- ts(data$Alcohol_C, start = start_year)
  
  # Fitted vs observed
  plot(y, xlab = 'Time', col = 'red')
  lines(ts(model$fitted.values, start = start_year), col = 'blue')
  legend("topleft", legend = c(response_name, "Fit"), col = c("red", "blue"),
         lty = c(1, 1))
  
  # Cook's distances
  cooksd <- cooks.distance(model)
  plot(cooksd, xaxt = "n", type = "h", lwd = 3, xlab = NA, ylab = "Cook's distances")
  axis(side = 1, at = seq(1, length(y), 5), cex.axis = 0.9,
       labels = seq(start(y)[1], (start(y)[1] + length(y) - 1), 5))
  
  # Residual plot
  res <- ts(model$residuals, start = start_year)
  plot(res, type = "p", pch = 16)
  abline(h = 0)
  
  # ACF of residuals
  acf(res, main = "", lag.max = lag_max)
  
  # Histogram of the residuals
  breaks <- seq(min(res), max(res), length.out = 9)
  hist(res, xlab = "Residuals", ylab = "Frequency", main = "",
       breaks = breaks)

  # Quantile-Quantile plot 
  qqnorm(res, pch = 16, main = "")
  qqline(res, col = "red", lwd = 2)
  
}

# 5.1

log_log_model <- lm(Alcohol_C ~ Price_I + Total_C, data = alko)
summary(log_log_model)

# Interpreting log-log model parameters
# One can show that for a model log(y) = beta * log(x),
# when the change in x is small, the parameter beta
# estimates the proportional change in y given 1 % change in x.
# This can be shown by analyzing y_1 = (1 + p/100) * y_0 when
# x_1 = (1 + 1/100)*x_0 -> p \approx beta

# So in this case
# 1 % increase in Real price of alcohol causes a 1 % reduction in alcohol consumption
# expenditures
# 1 % increase in Total Consumption expenditures causes a 1.5 % increase in
# alcohol consumption expenditures

# Model looks reasonable from the summary

lm_plotter(alko, log_log_model, 'Alcohol_C', lag_max = 15)
vif(log_log_model)

# The residuals exhibit substantial autocorrelation which can also
# be observed in the shape of the residual plot
# The model systematically over- and underpredicts
# There are shocks in the observed series that can cause
# such systematic bias
# Changes in alcohol consumption are likely to be delayed
# Thus this static model is not sufficient

# 5.2

alko$LAW <- c(rep(0,19),rep(1,13))

log_law_model <- lm(Alcohol_C ~ Price_I + Total_C + LAW, data = alko)
summary(log_law_model)
# 1 % increase in Real price of alcohol causes a 0.9 % reduction in alcohol consumption
# expenditures
# 1 % increase in Total Consumption expenditures causes a 1 % increase in
# alcohol consumption expenditures
# For LAW, we can approximate by exponentiation that approximately 37 %
# increase in alcohol consumption expenditures occurred after passing
# the legislation

# Summary looks reasonable

lm_plotter(alko, log_law_model, 'Alcohol_C', lag_max = 15)
vif(log_law_model)
# While somewhat better than the previous model, the same issues
# still persist

# 5.3

# As noted before, the current level is probably influenced by the previous
# level, thereby causing the autocorrelations. If this is stable,
# we can eliminate it through differencing.

alko_diff <- as.data.frame(apply(alko[,-1],2,diff))
alko_diff$YEAR <- alko$YEAR[-1]

diff_model <- lm(Alcohol_C ~ Price_I + Total_C + LAW, data = alko_diff)
summary(diff_model)

# The interpretation remains the same for the parameter estimates
# 1 % increase in Real price of alcohol causes a 0.8 % reduction in alcohol consumption
# expenditures
# 1 % increase in Total Consumption expenditures causes a 1.4 % increase in
# alcohol consumption expenditures
# For LAW, we can approximate by exponentiation that approximately 22 %
# increase in alcohol consumption expenditures occurred after passing
# the legislation

# Summary looks reasonable, though not directly comparable due to a
# different response variable

lm_plotter(alko_diff, diff_model, 'D(Alcohol_C)', lag_max = 15)
vif(diff_model)

# While there are some patterns in the residuals, the individual autocorrelations
# are not significant. We can also formally test these using the
# Breusch-Godfrey test. While for this model Ljung-Box would suffice,
# for the sake of consistency we use BG as a more general test which
# also can be used for the next model. The null is of the same
# form as in Ljung-Box.
?bgtest

bg_p_vals <- function(model) {
  sapply(seq(1, nrow(model$model) - length(coef(model))),
         function(order) bgtest(model, order = order)$p.value)
}
which(bg_p_vals(diff_model) < 0.05)
# The null is not rejected for any order -> The residual process is WN
# -> The model is sufficient.

# 5.4

n <- nrow(alko)

dist_lag_model <- lm(alko$Alcohol_C[-1] ~ alko$Alcohol_C[-n] + alko$Price_I[-1] + 
                       alko$Price_I[-n] + alko$Total_C[-1] + alko$Total_C[-n] +
                       alko$LAW[-1] + alko$LAW[-n])
summary(dist_lag_model)

# The addition of the auto-regressive term complicates the analysis
# but we can draw some rough conclusions from the estimates

# The instantaneous effects are as before
# The auto-regressive parameter is quite close to one
# indicating that the previous level has substantial 
# influence on the current level. The effect of the covariates
# on the response are thus quite immediate

# The lag 1 terms have an opposite sign, indicating
# possible adjustment

# Long-term elasticities can be estimated, check
# the solution pdf for those

lm_plotter(data.frame(Alcohol_C = alko$Alcohol_C[-1], YEAR = alko$YEAR[-1]), dist_lag_model,
           'Alcohol_C', lag_max = 15)
vif(dist_lag_model) # Substantial variance inflation as expected

# No apparent structure to the residual process and no autocorrelation detected
# The model fit looks good.

which(bg_p_vals(dist_lag_model) < 0.05)
# The null is not rejected for any order -> The residual process is WN
# -> The model is sufficient.

# HW 5.5
# Similar analysis to the demos, consider whether the parameter estimates
# make sense and whether the residual process is WN or not.