setwd('~/Prediction/Week_2') # 2.1 tobacco_data <- read.table('tobacco.txt', header = T, sep = '\t') tobacco_data <- tobacco_data[,c('CONSUMPTION', 'ILL')] countries <- c("Iceland","Norway","Sweden","Canada","Denmark", "Austria","USA","Netherlands","Switzerland","Finland", "England") tobacco_data$Country <- countries # a) tobacco_fit <- lm(ILL ~ CONSUMPTION, data = tobacco_data) summary(tobacco_fit) ?plot ?abline plot(tobacco_data$CONSUMPTION, tobacco_data$ILL, xlab = 'Cigarette consumption per capita in 1930', ylab = 'Lung cancer cases per 100k in 1950', pch = 16) abline(tobacco_fit, col = '#FF000075', lwd = 4) text(x = tobacco_data$CONSUMPTION, y = tobacco_data$ILL, labels = tobacco_data$Country, pos = 1, cex = 0.8) # b) FIT <- fitted(tobacco_fit) RES <- resid(tobacco_fit) # c) # Here we would desire a linear one-to-one relationship between the fitted and observed values plot(tobacco_data$ILL, FIT, xlab = 'Observed value', ylab = 'Fitted value', pch = 16) abline(b = 1, a = 0, col = '#FF000075', lwd = 4) text(x = tobacco_data$ILL, y = FIT, labels = tobacco_data$Country, pos = 1, cex = 0.8) # R^2 cor(tobacco_data$ILL, FIT)^2 # Couple of things to look for in this residual plot: # If there is any systematic behavior in the residuals as a function of the fitted value, # there is some kind of model misspecification. # There are two main sources for it: the systematic part is misspecified or the random part # is misspecified. Sometimes it can be hard to tell where the misspecification lies. # If the systematic part is misspecified, then usually the estimated residuals behave in such a way # that their sign changes systematically in different parts of fitted value range (i.e. E[e_i] != 0 but instead E[e_i](X)). # This usually indicates the presence of some kind of non-linear structure # (can sometimes be remedied with a transformation of certain explanatory variables) # Example: Non-linear structure ---- x <- runif(60, min = 0, max = 7) y <- 2 + exp(0.5 *x) + rnorm(60, sd = 1) exampl_fit <- lm(y ~ x) summary(exampl_fit) plot(x,y, pch = 16) abline(exampl_fit) # Clearly the linear model fails to account for the exponential behavior plot(fitted(exampl_fit), resid(exampl_fit), xlab = 'Fitted value', ylab = 'Estimated residual', pch = 16) abline(b = 0, a = 0) # The systematic change in the sign of the estimated residuals # One can try to log-transform the response log_y <- log(y) exampl_fit_log <- lm(log_y ~ x) summary(exampl_fit_log) plot(x,log_y, pch=16) abline(exampl_fit_log) # Better but the non-linear transformation seems to have introduced some heteroscedasticity plot(fitted(exampl_fit_log), resid(exampl_fit_log), xlab = 'Fitted value', ylab = 'Estimated residual', pch = 16) abline(b = 0, a = 0) # ---- # If the random part is misspecified, then the range of the estimated residuals varies as a function # of the fitted value (i.e. Var[e_i] != sigma^2 but Var[e_i](X)). # Example: Heteroscedasticity ---- x <- runif(80, min = 100, max = 1000) y <- 10 + 10*x + rnorm(80, sd = x) exampl_fit <- lm(y ~ x) plot(x,y, pch = 16) abline(exampl_fit) summary(exampl_fit) # The deviations from the fitted line seem to increase as a function of x plot(fitted(exampl_fit), resid(exampl_fit), xlab = 'Fitted value', ylab = 'Estimated residual', pch = 16) abline(a = 0, b = 0) # The range of variation of the estimated residuals seems to increase as a function of the fitted value. # ---- plot(FIT ,RES, xlab = 'Fitted value', ylab = 'Estimated residual', pch = 16) abline(b = 0, a = 0) text(FIT, RES, labels = tobacco_data$Country, pos = 1, cex = 0.8) # d) # The estimated residual of observation USA deviates substantially from the other estimated residuals (with maybe # England having a similar value) # e) ?cooks.distance cooks_d <- cooks.distance(tobacco_fit) plot(cooks_d, xaxt="n", xlab=" ", ylab="Cook's distance", pch = 16) axis(side=1, at=1:11, labels=countries,las=2) # Clearly the Cook's Dist. of USA differs substantially from the other observations. Thus it is an influential observation. # f) tobacco_data_filt <- tobacco_data[!(tobacco_data$Country %in% c('USA')),] tobacco_filt_fit <- lm(ILL ~ CONSUMPTION, data = tobacco_data_filt) summary(tobacco_fit) summary(tobacco_filt_fit) # Substantial improvements are observed in model quality measures, a larger effect on lung cancer cases is also observed. plot(tobacco_data$CONSUMPTION,tobacco_data$ILL, xlab = 'Cigarette consumption per capita in 1930', ylab = 'Lung cancer cases per 100k in 1950', pch = 16) abline(tobacco_fit, col = '#FF000075', lwd = 4) abline(tobacco_filt_fit, col = '#0000FF75', lwd = 4) legend('topleft', legend = c('With USA', 'Without USA'), lty = 1, col = c('#FF000075','#0000FF75' )) text(x = tobacco_data$CONSUMPTION, y = tobacco_data$ILL, labels = tobacco_data$Country, pos = 1, cex = 0.8) cooks_d_filt <- cooks.distance(tobacco_filt_fit) plot(cooks_d_filt, xaxt="n", xlab=" ", ylab="Cook's distance", pch = 16) axis(side=1, at=1:10, labels=tobacco_data_filt$Country,las=2) # Can the removal of observation USA be justified? According to the model solution, during 1930 # tobacco was milder in the USA in comparison to the other nations. Additionally, filters had been # introduced in the United States unlike in the other countries. The source for the claims is shrouded # in mystery (something to do with Mellins' materials) but if we assume this to be true, then the carcinogenic # "load" of individual cigarettes are not the same between United States and other nations. Thus the metric # "Cigarette Consumption per Capita" cannot be used to compare countries in a meaningful fashion. # Without knowledge of the exact carcinogenic "load", it is quite reasonable to remove the incommensurate observation. # 2.2 library(car) cement_data <- read.table('hald.txt', header = T) # It might be useful to calculate some simple descriptive statistics before estimating the model cor(cement_data) # Each chemical has a substantial correlation with the response # However, the chemicals also exhibit substantial pair-wise correlation # This can indicate (though not imply) possible issues with precision of the LS estimates # a) cement_full_m <- lm(HEAT ~ . - SUM, data = cement_data) summary(cement_full_m) # While the model seems to explain the variation of the response very well (R^2, F-test etc.), # none of the LS estimates are significant at the 5 percent significance level, at least according to the t-test. # This situation tends to arise when the explanatory variables exhibit strong linear dependencies between themselves, # referred to as multicollinearity (in a sense, the generalization of correlation to multiple variables) # Some geometric intuition about the phenomenon ---- library(plotly) multi_col_helper <- function(n, corr, sigma_e) { x <- rnorm(n) z <- corr * x + sqrt(1 - corr^2) * rnorm(n) y <- 0.5 * x + 0.5*z + rnorm(n, sd = sigma_e) model <- lm(y ~x + z) print(summary(model)) eval_points <- seq(-8, 10, by = 0.1) obj_f_val <- outer(eval_points, eval_points, Vectorize(function(b_1,b_2) sum((y - b_1 * x - b_2 * z)^2))) plot_ly(x = eval_points, y = eval_points, z = obj_f_val, type = 'contour') } # Manipulate the sample size, correlation and residual SD to observe the effects on the SD of the estimators multi_col_helper(50, 0, 0.1) # ---- # For higher dimensional cases, we can use the R^2 of the model of explanatory variables to estimate the degree of # linear dependency model_CHEM2 <- lm(CHEM2 ~ . - SUM - HEAT, data = cement_data) summary(model_CHEM2) # R^2 is high, so the linear dependency is substantial. Its effect on the standard deviations of the LS estimators can # be measured using the Variance Inflation Factor (VIF) which after some standardization of the model variables, can be # shown to multiplicatively inflate the SD of the estimators i.e. SD[beta_i] = VIF_i * ... # However, it is important to note that other factors, such as R^2 of the model, sample size etc. also affect # the SDs of the LS estimators. Thus high VIF is not an issue on its own but only if it can be identified as # a substantial factor in reducing the precision of the estimates. # Inspired by: A Caution Regarding Rules of Thumb for Variance Inflation Factors # For CHEM2, the VIF is 1/(1 - summary(model_CHEM2)$r.squared) vif(cement_full_m) # In this case, the issue is that cement_data$SUM # is nearly 100 for each batch. Thus given three chemicals the proportion of the remaining chemical # can be quite accurately predicted by 100 - rowSums(cement_data[,!(names(cement_data) %in% c('CHEM1', 'HEAT', 'SUM'))]) cement_data$CHEM1 # Example of prediction uncertainty with bootstrap ---- pred_helper <- function(X, y, og_resid, x) { # Sample with replacement inds <- sample(1:nrow(X), replace = T) X <- X[inds,] y <- y[inds] # Bootstrap LS estimate beta_b <- solve((t(X) %*% X)) %*% t(X) %*% y # Bootstrap prediction x %*% beta_b + sample(og_resid, 1) } n_boot <- 2000 alpha <- 0.05 n <- nrow(cement_data) Intercept <- rep(1,n) X <- cbind(Intercept, as.matrix(cement_data[,-c(5,6)])) pred_x <- apply(X, 2, function(x) quantile(x, probs = 0.3)) #colMeans(X) y <- cement_data$HEAT # Construct n_boot bootstrap estimates boot_samples <- replicate(n_boot, pred_helper(X, y, resid(cement_full_m), pred_x)) pred_y <- coef(cement_full_m) %*% pred_x boot_samples <- c(pred_y, boot_samples) # Select the desired quantiles for the confidence interval quantile(boot_samples, probs = c(alpha/2, 1 - alpha/2)) hist(boot_samples, xlab = 'Prediction', main = 'Bootstrap distribution') abline(v = pred_y, col = 'red', lwd = 2) # ---- # b) # To perform model selection, we need a metric to evaluate the quality of the model. There are roughly two aspects # to consider: quality of fit and size of the model. Quality of fit measures how well the model fits the observed # data (e.g. Sum of Squared Residuals loss function). Minimizing this loss is clearly desirable. # However, increasing the capacity of the model by e.g. adding additional expl. variables # can cause the model to overfit to the observed data. # Overfitting tends to cause substantial increases in Out-of-Sample prediction errors (generalization error) # and increase the uncertainty of the estimated parameters. Thus model size/capacity should be penalized. # Akaike Information Criterion (AIC) is one such model selection criterion. It can be used to compare # models fitted using the same likelihood function. The smaller the AIC, the better. It can be shown that # AIC is asymptotically equivalent to Leave-One-Out Cross-Validation (CV) in which the one data point is removed, # the remaining are used to fit the model and the prediction error is calculated using the omitted data point. # This CV error estimates the Out-of-Sample prediction error. Thus models with lower AIC should perform better # in Out-of-Sample evaluations, at least asymptotically. ?extractAIC ?step # Performs greedy search for the model with the smallest AIC (by default, backward elimination). step(cement_full_m) # The greedy search results in cement_AIC <- lm(HEAT ~ . - SUM - CHEM3, data = cement_data) summary(cement_AIC) # One way of implementing this by hand AIC_helper <- function(model_formula, excluded_var) { new_model <- paste0(c(model_formula, ' - ', excluded_var), collapse = '') model <- lm(as.formula(new_model), data = cement_data) extractAIC(model)[2] } expl_vars <- colnames(cement_data)[-c(5,6)] while (length(expl_vars) > 0) { linear_part <- paste0(expl_vars, collapse = ' + ') model_formula <- paste0(c('HEAT', ' ~ ', linear_part), collapse = '') model <- lm(as.formula(model_formula), data = cement_data) AIC_vals <- extractAIC(model)[2] AIC_vals <- c(AIC_vals, sapply(expl_vars, function(name) AIC_helper(model_formula, name))) print(AIC_vals) min_index <- which.min(AIC_vals) if (min_index == 1) { break } else { expl_vars <- expl_vars[-(min_index-1)] } } linear_part <- paste0(expl_vars, collapse = ' + ') model_formula <- paste0(c('HEAT', ' ~ ', linear_part), collapse = '') cement_AIC_self <- lm(as.formula(model_formula), data = cement_data) summary(cement_AIC_self) vif(cement_AIC) # Some variance inflation seems to remain and the corresponding variables are still insignificant at 5 % level. # AIC only evaluates the full model so it makes sense that sometimes even small improvements in the fit compensate # for the increased model size. hist(resid(cement_full_m), breaks = 8) # Residual distribution does not seem normal, thus AIC might not have # the properties that motivate its usage. # 2.3 A good idea would be to write a function that performs the permutation test for a given model for each # explanatory variable. Then you can either automate the process or do it manually one model at a time. # 2.4 Scatter plots of the original data with fitted curves # and residual plots are the most useful diagnostics. For b), check ?curve for plotting the fit.