library(car) # Calculate VIF library(lmtest) # Breusch–Godfrey test # Read data alko <- read.table("data/alcohol.txt", header = TRUE, sep = "\t") alko <- alko[, 1:5] head(alko) consumption <- ts(alko$LQ1CPC, start = 1950) price <- ts(alko$LR1C, start = 1950) total <- ts(alko$LQTOTALPC, start = 1950) # Define some functions for diagnostics #' Plot original time series and fit #' #' @param y Response variable. #' @param model Linear regression model object of class lm. #' @param name Name of the response variable. plot_fit <- function(y, model, name) { fit <- ts(model$fitted.values, start = start(y)[1]) plot(y, col = "red", xlab = "Time", ylab = "") lines(fit, col = "blue") legend("topleft", legend = c(name, "Fit"), col = c("red", "blue"), lty = c(1, 1)) } #' Plot Cook's distances #' #' @param y Response variable. #' @param model Linear regression model object of class lm. plot_cook <- function(y, model) { cooksd <- cooks.distance(model) plot(cooksd, xaxt = "n", type = "h", lwd = 3, xlab = NA, ylab = "Cook's distances") axis(side = 1, at = seq(1, length(y), 5), cex.axis = 0.9, labels = seq(start(y)[1], (start(y)[1] + length(y) - 1), 5)) } #' Plot residuals versus time #' #' @param y Response variable. #' @param model Linear regression model object of class lm. plot_res <- function(y, model) { res <- ts(model$residuals, start = start(y)[1]) plot(res, type = "p", pch = 16) } #' ACF plot of residuals #' #' @param model Linear regression model object of class lm. #' @param lag_max Maximum lag at which to calculate the acf. plot_acf <- function(model, lag_max = NULL){ acf(model$residuals, main = "", lag.max = lag_max) } #' Histogram of residuals (8 bins) #' #' @param model Linear regression model object of class lm. plot_hist <- function(model) { res <- model$residuals breaks <- seq(min(res), max(res), length.out = 9) hist(model$residuals, xlab = "Residuals", ylab = "Frequency", main = "", breaks = breaks) } #' QQ plot #' #' @param model Linear regression model object of class lm. plot_qq <- function(model) { res <- model$residuals qqnorm(res, pch = 16, main = "") qqline(res, col = "red", lwd = 2) } #' Perform Breusch-Godfrey test #' #' Breusch-Godfrey can be performed up to order #' 'sample size' - 'number of estimated parameters'. #' #' @param model Linear regression model object of class lm. #' @param m Number of estimated parameters. #' #' @return Vector of p-values. res_test <- function(model, m) { n <- length(model$residuals) pvalues <- rep(NA, n - m) for (i in 1:(n - m)) { pvalues[i] <- bgtest(model, order = i)$p.value } pvalues } # 5.1 # Fit the first model model_log <- lm(consumption ~ price + total) summary(model_log) # Comments about summary # - The regression coefficients corresponding to the variable price and variable # total are statistically significant with 5% level of significance. # # - The signs of the regression coefficients of the price and total expenditures # variables are as expected: the coefficient of the price variable (price) is # negative and the coefficient of the total expenditures variable (total) is # positive. # # - Interpretations of the regression coefficients as elasticities: # - If the price goes up by 1%, then the alcohol expenditures are reduced by # 1.003%. # - If the total expenditures are increased by 1%, then the alcohol # expenditures are increased by 1.465%. # # - The coefficient of determination of the model is 96.42%. # Diagnostics par(mfrow = c(3, 2)) plot_fit(consumption, model_log, name = "Consumption") plot_cook(consumption, model_log) plot_res(consumption, model_log) plot_acf(model_log) plot_hist(model_log) plot_qq(model_log) # Comments about diagnostics: # - Residuals do not look normally distributed. # # - Residuals seem to be heavily correlated. # # - By the variance inflation factor (VIF), multicollinearity is not a problem # here. # # - Reason for the correlatedness of the residuals: # => Fitted curve stays above and below the response variable consumption for # long time periods. # # - The model does not take account of the change in the legislation # (the beginning of the year 1969). This is also visible in the Cook's # distances. # Residuals do not seem to be white noise # => Model is not sufficient # 5.2 # Fit the second model law <- ts(c(rep(0, 19), rep(1, 13)), start = 1950) model_law <- lm(consumption ~ price + total + law) summary(model_law) # Comments about summary # - The regression coefficients corresponding to the price variable (price) and # the total expenditures variable (total) are statistically significant with # 5% level of significance. # # - The estimates for the regression coefficients differ from the estimates of # the second model. # # - The signs of the regression coefficients for the price and total # expenditures are as expected: the coefficient of the price variable is # negative and the coefficient of the total expenditures variable is positive. # # - Interpretations of the regression coefficients as elasticities: # - If the price goes up by 1\%, then the alcohol expenditures are reduced # by 0.886%. # # - If the total expenditures are increased by 1\%, then the alcohol # expenditures are increased by 1.044%. # # - The regression coefficient 0.317 corresponding to law is statistically # significant with 5\% level of significance. # # - The sign of the regression coefficient of law is as expected. # # - The coefficient of determination has increased to 98.49%. # Diagnostics plot_fit(consumption, model_law, name = "Consumption") plot_cook(consumption, model_law) plot_res(consumption, model_law) plot_acf(model_law) plot_hist(model_law) plot_qq(model_law) # Comments about diagnostics: # - Distribution of the residuals does not seem to be normal. The histogram of # the residuals is skewed, which is evidence against normality. # # - Residuals are strongly correlated. # # - By VIF, there is no problem with multicollinearity. # # - Reason for the correlatedness of the residuals: # => Fitted curve stays above and below the response variable consumption for # long time periods. # # - The model takes into account the change in the legislation # (beginning of 1969). # 5.3 # Compute differenced variables and fit the third model. consumption_d <- diff(consumption) price_d <- diff(price) total_d <- diff(total) law_d <- diff(law) model_diff <- lm(consumption_d ~ price_d + total_d + law_d) summary(model_diff) # Comments about summary: # - All the coefficients of the third model are statistically significant # (with the exception of the constant term) with 5\% level of significance. # # - The estimates for Model (3) are clearly different when compared to the # estimates of Model (3). # # - The signs of the regression coefficients of the price variable (price_d) and # total expenditures variable (total_d) are as expected: the coefficient of # the price variable is negative and the coefficient of the total expenditures # variable is positive. # # - Interpretations of the regression coefficients as elasticities: # - If the price goes up by 1%, then the alcohol expenditures are reduced # by 0.817%. # - If the total expenditures are increased by 1%, then the alcohol # expenditures are increased by 1.372%. # - The coefficient of the instant effect of the dummy variable law_d # is 0.196. # # - The coefficient of determination is 82.64%. # # - The coefficient of determination of the Model (3) is not comparable with the # coefficients of determinations corresponding to Models (1) and (2), since # the response variable is not the same. # Diagnostics plot_fit(consumption_d, model_diff, name = "D(Consumption)") plot_cook(consumption_d, model_diff) plot_res(consumption_d, model_diff) plot_acf(model_diff) plot_hist(model_diff) plot_qq(model_diff) vif(model_diff) res_test(model_diff, m = 4) > 0.05 # Comments about diagnostics: # - Residuals could be normally distributed. # # - Residuals are not correlated. # # - By the Breusch-Godfrey test, the residuals are not correlated, since the # null hypothesis is accepted with 5% level of significance for all lags. # # - By VIF, multicollinearity is not a problem. # # - By plotting residuals against time, there does not seem to be evidence of # heteroscedasticity. # All in all, Model (3) is sufficient. # 5.4 # Fit Model (4) n <- length(consumption) model_lag <- lm(consumption[-1] ~ consumption[-n] + price[-1] + price[-n] + total[-1] + total[-n] + law[-1] + law[-n]) summary(model_lag) # Comments about summary: # # - It is not possible to draw direct conclusions regarding the significance of # the regression coefficients based on the t-tests. However, the results give # some general direction, and the results indicate that all regression # coefficients would be statistically significant # (with the exception of the constant). # # - The coefficient of the variable consumption with lag 1 is 0.912, which # implies that the adjustment to changes in prices and total expenditures # is rather fast. # # - The signs of the coefficients of variables price and total with lag 0 are as # expected: the coefficient -0.829 of the price variable is negative and the # coefficient 1.469 of the total expenditures variable is positive. These # coefficients describe the instant effects of changes in prices and total # expenditures. # # - The signs of the coefficients of the price and total expenditures variables # with lag 1 are also as expected. # # - Long term elasticities are: # Price: (beta_2 + beta_3)(1 - beta_1) approx -1.31 # Total expenditures: (beta_4 + beta_5)(1 - beta_1) approx 1.75. # # - Interpretations of the regression coefficients of price and total # expenditures variables with lag 0: # - If the price goes up by 1%, then the alcohol expenditures are instantly # reduced by (without a lag) 0.829%. # - If the total expenditures are increased by 1\%, then the alcohol # expenditures are increased by 1.75%. # # Interpretations of the long term elasticities of price and total expenditures # variables: # # - If the price goes up by 1%, then the alcohol expenditures are reduced # by 1.31% in the long term. # # - If the total expenditures are increased by 1%, then the alcohol expenditures # are increased by 1.75% in the long term. # # - The coefficient of the instant effect of the dummy variable law is 0.177 and # the long term coefficient is small # (beta_6 + beta_7)/(1 - beta_1) approx -0.16. # Hence, the change in the legislation has rather minor effect on the # behaviour of the consumers in a long term, which seems plausible. # # - Additionally, it is not possible to draw conclusions from the coefficient # of the determination. # Diagnostics y <- ts(consumption[-1], start = 1951) plot_fit(y, model_lag, name = "Consumption") plot_cook(y, model_lag) plot_res(y, model_lag) plot_acf(model_lag) plot_hist(model_lag) plot_qq(model_lag) # Comments about diagnostics of Model \@ref(eq:diff): # # - Residuals could be normally distributed. However, tails of the # distribution seem to be heavier than tails of the normal distribution. # # - Residuals are not correlated. # # - By the Breusch-Godfrey test, the residuals are not correlated. The null # hypothesis is accepted with 5% level of significance for all lags. # # - By VIF, there is strong multicollinearity in the model. This is # unsurprising, as the model involves same variables with different lags. # # - There is no evidence of heteroscedasticity. # # - The model takes into account the change in the legislation. # # - The fitted model coincides better with the original time series than the # fits of Models (2) and (3). # We consider this model to be sufficient in explaining the alcohol # expenditures.