setwd('~/Prediction/Week_5') library(car) library(forecast) library(lmtest) alcohol_data <-read.table("alcohol.txt",header=T,sep="\t") alcohol_data <- alcohol_data[,c('YEAR', 'LR1C', 'LQ1CPC','LQTOTALPC')] colnames(alcohol_data) <- c('Year', 'Price_Index','Alcohol_Consump', 'Total_Consump') # The data is already log-transformed # 5.1 model_a <- lm(Alcohol_Consump ~ . - Year, data = alcohol_data) summary(model_a) # Interpreting the model coefficients: # The additive effects on the logarithmic scale are multiplicative on the # original scale. If the effect is small enough, we can approximately say # that b_i represents the percentage increase/decrease in the expected response # when the explanatory variables increases by 1 percent. # Thus in this case: # Price_Index: A percentage increase in the real prices is associated with a percentage reduction # in alcohol consumption expenditures. This is to be expected since demand usually drops when # the price levels increase and since alcohol products are not infinitely divisible but # have to be purchased in discrete units. # Total_Consump: A percentage increase in total consumption expenditures is associated with # 1.46 % increase in alcohol consumption expenditures. Again, this is to be expected since # alcohol consumption is part of the total consumption and might be related to the general # increase in household wealth from which more money can be used for non-essential consumption. # The individual t-tests and the R^2 of the model indicate a well-estimated model (5 % threshold) which # also accounts most of the variation present in the response. lm_diag_func <- function(data, model) { par(mfrow = c(3,2)) res <- resid(model) qqnorm(res) qqline(res) hist(res, xlab = 'Residual') plot(data$Year, res, xlab = 'Year', ylab = 'Residual') abline(b = 0, a = 0) acf(res, main = 'Residual ACF') fit <- fitted(model) plot(data$Year,data$Alcohol_Consump, type = 'l', col = 'Red', xlab = 'Year', ylab = 'Alcohol expend.') lines(data$Year ,fit, col = 'Blue') legend("topleft", legend=c("Observed", "Fitted"), col=c("red","blue"),lty=c(1,1),cex=0.8) plot(data$Year ,cooks.distance(model), ylab="Cook's distances",xlab="Year", pch=16) } lm_diag_func(alcohol_data, model_a) vif(model_a) # Some observations from the diagnostics: # (1) Residuals do not seems to be normally distributed, thus the parametric tests might not be reliable. # (2) Residuals exhibit substantial autocorrelation, violating one of the standard assumptions. # (3) Influential observations are detected at the few years after the change in legislation. # (4) Variance inflation is low. # Due to the shift after the change in legislation and the autocorrelation, # the model systematically over-predicts and under-predicts # across different time periods. # Given these observations, model (1) is found quite lacking. # b) alcohol_data$Legis <- c(rep(0,19), rep(1,13)) model_b <- lm(Alcohol_Consump ~ . - Year, data = alcohol_data) summary(model_b) # The dummy variable is statistically significant and has a positive sign as expected. # Interpretation of the effect: # When legislation becomes active, the expected alcohol consumption expenditure increases by a factor of exp(0.317) # The addition of the dummy has a substantial effect on the parameter estimates. # Other observations remain similar to model from part (b) lm_diag_func(alcohol_data, model_b) vif(model_b) # Some observations from the diagnostics: # (1) Residuals are still not normal. # (2) Residuals exhibit substantial autocorrelation, violating one of the standard assumptions. # (3) Influence is now a bit more evenly distributed across observations # (4) Variance inflation is relatively low. # Due the autocorrelation, the model still systematically over-predicts and under-predicts # across different time periods. # Given these observations, model (2) is found lacking. # c) # If the underlying autoregressive structure is stable across time and linear, # then taking the first order difference should eliminate it from the time series. alcohol_data_diff <- as.data.frame( apply(alcohol_data[,!(colnames(alcohol_data) %in% c('Year'))],2, diff, simplify = T)) alcohol_data_diff$Year <- alcohol_data$Year[-1] model_c <- lm(Alcohol_Consump ~ . - Year, data = alcohol_data_diff) summary(model_c) # Note that model (3) is not directly comparable to the previous models since the response is different # Interpreting the difference effects: # A 1 % change in the difference is associated with a b % change in the difference of the response, not # the response itself. # So for example for the price index, a 1 % increase in the real price # log(x_t) - log(x_{t-1}) = log(x_t/x_{t-1}) # is associated with a 0.082 % reduction in the total consumption # log(y_t/y_{t-1}) # Otherwise, the observations are similar as before. lm_diag_func(alcohol_data_diff, model_c) vif(model_c) # Some observations from the diagnostics: # (1) Residuals are still a bit asymmetric. # (2) Residuals do not seem to exhibit any autocorrelation # (3) Influence is still relatively uniform across observations. # (4) Variance inflation is low. # While considering differences might be more noisy and sometimes more difficult to model # due to the reduction in the range of the observed values, # the autoregressive elements impart a lesser impact on the difference scale, making # it possible to estimate the effects with a smaller bias. # We can also test the residuals using the Breusch-Godfrey test # H_0: ACF is zero up to lag p, H_1: ACF is not zero for some lag <= p ?bgtest bg_function <- function(model, n, n_params) { lags <- seq(1,n-n_params) sapply(lags, function(x) bgtest(model, order = x)$p.value) } which(bg_function(model_c, nrow(alcohol_data_diff), 4) < 0.05) # Null is not reject for any testable autocorrelation # Given these observations, model (3) can be considered satisfactory. # 5.4 n <- nrow(alcohol_data) alcohol <- alcohol_data$Alcohol_Consump price_index <- alcohol_data$Price_Index total_consump <- alcohol_data$Total_Consump legis <- alcohol_data$Legis model_d <- lm(alcohol[-1] ~ alcohol[-n] + price_index[-1] + price_index[-n] + total_consump[-1] + total_consump[-n] + legis[-1] + legis[-n]) summary(model_d) # Do note that since one of the explanatory variables is endogenous (i.e. not independent of the error term), # the significance tests and likely the standard deviations are erroneous. Same with R^2. # However, a substantial autoregressive structure is observed, implying that the effect on the current level of the time # series is heavily influenced by the preceding levels. Thus any exogenous effect on the previous observations # substantially influences the following one. So even if the exogenous variables do not change in the current step, # their effects from the previous step are observed in the current step which makes sense since it takes # some time (though not long) for consumers to adjust to new economic conditions. # The signs of the instantaneous changes make sense for price index and total consumption. # The lagged variables having an opposite sign also makes sense since the consumers are likely to # "overreact" to the changes in economic conditions and then arrive to a better equilibrium. # Once we introduced the autoregressive element to the model, the effect of the dummy variable became # rather small relative to other effects. lm_diag_func(alcohol_data[-1,], model_d) vif(model_d) # Some observations from the diagnostics: # (1) Residuals are still a bit asymmetric. # (2) Residuals do not seem to exhibit any autocorrelation # (3) Influence again peaks near the change in legislation, likely due to the AR term # (4) Variance inflation is high due to the lagged variables. which(bg_function(model_d, n - 1, 8) < 0.05) # The null is accepted for all testable lags. # Aside from the issues introduced by the endogenous explanatory variable, # this model seems to be able to account for most of the behavior # of the time series that was quite lacking in the first two models. # HW # The most important diagnostic criterion for our purposes is to check whether autocorrelation # is present in the residuals. Thus the figures related to the residuals are the most useful ones. # You can also use the function found in this script to plot all the information that we used in the demos, # though you might have to modify it a bit to work with the data of the exercise.