setwd("~/School/Prediction/Week_5") library(car) library(lmtest) # install.packages("lmtest") alko <- read.table("data/alcohol.txt", header = TRUE, sep = "\t") alko <- alko[, 1:5][,-2] colnames(alko)[2:4] <- c('Alcohol_C', 'Price_I', 'Total_C') lm_plotter <- function(data, model, response_name, lag_max) { par(mfrow = c(3,2)) start_year <- data$YEAR[1] y <- ts(data$Alcohol_C, start = start_year) # Fitted vs observed plot(y, xlab = 'Time', col = 'red') lines(ts(model$fitted.values, start = start_year), col = 'blue') legend("topleft", legend = c(response_name, "Fit"), col = c("red", "blue"), lty = c(1, 1)) # Cook's distances cooksd <- cooks.distance(model) plot(cooksd, xaxt = "n", type = "h", lwd = 3, xlab = NA, ylab = "Cook's distances") axis(side = 1, at = seq(1, length(y), 5), cex.axis = 0.9, labels = seq(start(y)[1], (start(y)[1] + length(y) - 1), 5)) # Residual plot res <- ts(model$residuals, start = start_year) plot(res, type = "p", pch = 16) abline(h = 0) # ACF of residuals acf(res, main = "", lag.max = lag_max) # Histogram of the residuals breaks <- seq(min(res), max(res), length.out = 9) hist(res, xlab = "Residuals", ylab = "Frequency", main = "", breaks = breaks) # Quantile-Quantile plot qqnorm(res, pch = 16, main = "") qqline(res, col = "red", lwd = 2) } # 5.1 log_log_model <- lm(Alcohol_C ~ Price_I + Total_C, data = alko) summary(log_log_model) # Interpreting log-log model parameters # One can show that for a model log(y) = beta * log(x), # when the change in x is small, the parameter beta # estimates the proportional change in y given 1 % change in x. # This can be shown by analyzing y_1 = (1 + p/100) * y_0 when # x_1 = (1 + 1/100)*x_0 -> p \approx beta # So in this case # 1 % increase in Real price of alcohol causes a 1 % reduction in alcohol consumption # expenditures # 1 % increase in Total Consumption expenditures causes a 1.5 % increase in # alcohol consumption expenditures # Model looks reasonable from the summary lm_plotter(alko, log_log_model, 'Alcohol_C', lag_max = 15) vif(log_log_model) # The residuals exhibit substantial autocorrelation which can also # be observed in the shape of the residual plot # The model systematically over- and underpredicts # There are shocks in the observed series that can cause # such systematic bias # Changes in alcohol consumption are likely to be delayed # Thus this static model is not sufficient # 5.2 alko$LAW <- c(rep(0,19),rep(1,13)) log_law_model <- lm(Alcohol_C ~ Price_I + Total_C + LAW, data = alko) summary(log_law_model) # 1 % increase in Real price of alcohol causes a 0.9 % reduction in alcohol consumption # expenditures # 1 % increase in Total Consumption expenditures causes a 1 % increase in # alcohol consumption expenditures # For LAW, we can approximate by exponentiation that approximately 37 % # increase in alcohol consumption expenditures occurred after passing # the legislation # Summary looks reasonable lm_plotter(alko, log_law_model, 'Alcohol_C', lag_max = 15) vif(log_law_model) # While somewhat better than the previous model, the same issues # still persist # 5.3 # As noted before, the current level is probably influenced by the previous # level, thereby causing the autocorrelations. If this is stable, # we can eliminate it through differencing. alko_diff <- as.data.frame(apply(alko[,-1],2,diff)) alko_diff$YEAR <- alko$YEAR[-1] diff_model <- lm(Alcohol_C ~ Price_I + Total_C + LAW, data = alko_diff) summary(diff_model) # The interpretation remains the same for the parameter estimates # 1 % increase in Real price of alcohol causes a 0.8 % reduction in alcohol consumption # expenditures # 1 % increase in Total Consumption expenditures causes a 1.4 % increase in # alcohol consumption expenditures # For LAW, we can approximate by exponentiation that approximately 22 % # increase in alcohol consumption expenditures occurred after passing # the legislation # Summary looks reasonable, though not directly comparable due to a # different response variable lm_plotter(alko_diff, diff_model, 'D(Alcohol_C)', lag_max = 15) vif(diff_model) # While there are some patterns in the residuals, the individual autocorrelations # are not significant. We can also formally test these using the # Breusch-Godfrey test. While for this model Ljung-Box would suffice, # for the sake of consistency we use BG as a more general test which # also can be used for the next model. The null is of the same # form as in Ljung-Box. ?bgtest bg_p_vals <- function(model) { sapply(seq(1, nrow(model$model) - length(coef(model))), function(order) bgtest(model, order = order)$p.value) } which(bg_p_vals(diff_model) < 0.05) # The null is not rejected for any order -> The residual process is WN # -> The model is sufficient. # 5.4 n <- nrow(alko) dist_lag_model <- lm(alko$Alcohol_C[-1] ~ alko$Alcohol_C[-n] + alko$Price_I[-1] + alko$Price_I[-n] + alko$Total_C[-1] + alko$Total_C[-n] + alko$LAW[-1] + alko$LAW[-n]) summary(dist_lag_model) # The addition of the auto-regressive term complicates the analysis # but we can draw some rough conclusions from the estimates # The instantaneous effects are as before # The auto-regressive parameter is quite close to one # indicating that the previous level has substantial # influence on the current level. The effect of the covariates # on the response are thus quite immediate # The lag 1 terms have an opposite sign, indicating # possible adjustment # Long-term elasticities can be estimated, check # the solution pdf for those lm_plotter(data.frame(Alcohol_C = alko$Alcohol_C[-1], YEAR = alko$YEAR[-1]), dist_lag_model, 'Alcohol_C', lag_max = 15) vif(dist_lag_model) # Substantial variance inflation as expected # No apparent structure to the residual process and no autocorrelation detected # The model fit looks good. which(bg_p_vals(dist_lag_model) < 0.05) # The null is not rejected for any order -> The residual process is WN # -> The model is sufficient. # HW 5.5 # Similar analysis to the demos, consider whether the parameter estimates # make sense and whether the residual process is WN or not.