setwd('~/Prediction/Week_5')

library(car)
library(forecast)
library(lmtest)

alcohol_data <-read.table("alcohol.txt",header=T,sep="\t")
alcohol_data <- alcohol_data[,c('YEAR', 'LR1C', 'LQ1CPC','LQTOTALPC')]
colnames(alcohol_data) <- c('Year', 'Price_Index','Alcohol_Consump', 'Total_Consump')
# The data is already log-transformed

# 5.1

model_a <- lm(Alcohol_Consump ~  . - Year, data = alcohol_data)
summary(model_a)

# Interpreting the model coefficients:
# The additive effects on the logarithmic scale are multiplicative on the 
# original scale. If the effect is small enough, we can approximately say
# that b_i represents the percentage increase/decrease in the expected response
# when the explanatory variables increases by 1 percent.

# Thus in this case:
# Price_Index: A percentage increase in the real prices is associated with a percentage reduction
# in alcohol consumption expenditures. This is to be expected since demand usually drops when
# the price levels increase and since alcohol products are not infinitely divisible but
# have to be purchased in discrete units.

# Total_Consump: A percentage increase in total consumption expenditures is associated with
# 1.46 % increase in alcohol consumption expenditures. Again, this is to be expected since
# alcohol consumption is part of the total consumption and might be related to the general
# increase in household wealth from which more money can be used for non-essential consumption.

# The individual t-tests and the R^2 of the model indicate a well-estimated model (5 % threshold) which
# also accounts most of the variation present in the response.

lm_diag_func <- function(data, model) {
  par(mfrow = c(3,2))
  
  res <- resid(model)
  
  qqnorm(res)
  qqline(res)
  hist(res, xlab = 'Residual')
  plot(data$Year, res, xlab = 'Year', ylab = 'Residual')
  abline(b = 0, a = 0)
  acf(res, main = 'Residual ACF')
  
  fit <- fitted(model)
  plot(data$Year,data$Alcohol_Consump, type = 'l', col = 'Red',
       xlab = 'Year', ylab = 'Alcohol expend.')
  lines(data$Year ,fit, col = 'Blue')
  legend("topleft", legend=c("Observed", "Fitted"),
         col=c("red","blue"),lty=c(1,1),cex=0.8)
  
  plot(data$Year ,cooks.distance(model),
       ylab="Cook's distances",xlab="Year",
       pch=16)
}

lm_diag_func(alcohol_data, model_a)
vif(model_a)

# Some observations from the diagnostics:
# (1) Residuals do not seems to be normally distributed, thus the parametric tests might not be reliable.
# (2) Residuals exhibit substantial autocorrelation, violating one of the standard assumptions.
# (3) Influential observations are detected at the few years after the change in legislation.
# (4) Variance inflation is low.
# Due to the shift after the change in legislation and the autocorrelation,
# the model systematically over-predicts and under-predicts
# across different time periods.

# Given these observations, model (1) is found quite lacking.

# b)

alcohol_data$Legis <- c(rep(0,19), rep(1,13))

model_b <- lm(Alcohol_Consump ~  . - Year, data = alcohol_data)
summary(model_b)

# The dummy variable is statistically significant and has a positive sign as expected.
# Interpretation of the effect:
# When legislation becomes active, the expected alcohol consumption expenditure increases by a factor of
exp(0.317)
# The addition of the dummy has a substantial effect on the parameter estimates.

# Other observations remain similar to model from part (b)

lm_diag_func(alcohol_data, model_b)
vif(model_b)

# Some observations from the diagnostics:
# (1) Residuals are still not normal.
# (2) Residuals exhibit substantial autocorrelation, violating one of the standard assumptions.
# (3) Influence is now a bit more evenly distributed across observations
# (4) Variance inflation is relatively low.
# Due the autocorrelation, the model still systematically over-predicts and under-predicts
# across different time periods.

# Given these observations, model (2) is found lacking.

# c)

# If the underlying autoregressive structure is stable across time and linear,
# then taking the first order difference should eliminate it from the time series.

alcohol_data_diff <- as.data.frame(
  apply(alcohol_data[,!(colnames(alcohol_data) %in% c('Year'))],2, diff, simplify = T))

alcohol_data_diff$Year <- alcohol_data$Year[-1]

model_c <- lm(Alcohol_Consump ~ . - Year, data = alcohol_data_diff)
summary(model_c)

# Note that model (3) is not directly comparable to the previous models since the response is different

# Interpreting the difference effects:
# A 1 % change in the difference is associated with a b % change in the difference of the response, not
# the response itself.
# So for example for the price index, a 1 % increase in the real price
# log(x_t) - log(x_{t-1}) = log(x_t/x_{t-1})
# is associated with a 0.082 % reduction in the total consumption
# log(y_t/y_{t-1})

# Otherwise, the observations are similar as before.

lm_diag_func(alcohol_data_diff, model_c)
vif(model_c)

# Some observations from the diagnostics:
# (1) Residuals are still a bit asymmetric.
# (2) Residuals do not seem to exhibit any autocorrelation
# (3) Influence is still relatively uniform across observations.
# (4) Variance inflation is low.
# While considering differences might be more noisy and sometimes more difficult to model
# due to the reduction in the range of the observed values,
# the autoregressive elements impart a lesser impact on the difference scale, making
# it possible to estimate the effects with a smaller bias.

# We can also test the residuals using the Breusch-Godfrey test
# H_0: ACF is zero up to lag p, H_1: ACF is not zero for some lag <= p
?bgtest

bg_function <- function(model, n, n_params) {
  lags <- seq(1,n-n_params)
  sapply(lags, function(x) bgtest(model, order = x)$p.value)
}

which(bg_function(model_c, nrow(alcohol_data_diff), 4) < 0.05)
# Null is not reject for any testable autocorrelation

# Given these observations, model (3) can be considered satisfactory.

# 5.4

n <- nrow(alcohol_data)

alcohol <- alcohol_data$Alcohol_Consump
price_index <- alcohol_data$Price_Index
total_consump <- alcohol_data$Total_Consump
legis <- alcohol_data$Legis

model_d <- lm(alcohol[-1] ~ alcohol[-n] + price_index[-1] + price_index[-n] +
                total_consump[-1] + total_consump[-n] + legis[-1] + legis[-n])
summary(model_d)

# Do note that since one of the explanatory variables is endogenous (i.e. not independent of the error term),
# the significance tests and likely the standard deviations are erroneous. Same with R^2.

# However, a substantial autoregressive structure is observed, implying that the effect on the current level of the time
# series is heavily influenced by the preceding levels. Thus any exogenous effect on the previous observations
# substantially influences the following one. So even if the exogenous variables do not change in the current step,
# their effects from the previous step are observed in the current step which makes sense since it takes
# some time (though not long) for consumers to adjust to new economic conditions.

# The signs of the instantaneous changes make sense for price index and total consumption.
# The lagged variables having an opposite sign also makes sense since the consumers are likely to
# "overreact" to the changes in economic conditions and then arrive to a better equilibrium.

# Once we introduced the autoregressive element to the model, the effect of the dummy variable became
# rather small relative to other effects.

lm_diag_func(alcohol_data[-1,], model_d)
vif(model_d)

# Some observations from the diagnostics:
# (1) Residuals are still a bit asymmetric.
# (2) Residuals do not seem to exhibit any autocorrelation
# (3) Influence again peaks near the change in legislation, likely due to the AR term
# (4) Variance inflation is high due to the lagged variables.

which(bg_function(model_d, n - 1, 8) < 0.05)
# The null is accepted for all testable lags.

# Aside from the issues introduced by the endogenous explanatory variable,
# this model seems to be able to account for most of the behavior
# of the time series that was quite lacking in the first two models.

# HW
# The most important diagnostic criterion for our purposes is to check whether autocorrelation
# is present in the residuals. Thus the figures related to the residuals are the most useful ones.
# You can also use the function found in this script to plot all the information that we used in the demos,
# though you might have to modify it a bit to work with the data of the exercise.