###########################
# MS-C1620
# Statistical inference
# Lecture 8
# Linear regression II

library(car)

# We use the significance level 0.05 throughout the script

###########################################################
# Examples of data sets with a response and multiple
# predictors/explanatory variables

###################
# Swiss fertility data
# Standardized fertility measure and socio-economic indicators
# for each of 47 French-speaking provinces of Switzerland at about 1888.

head(swiss)
?swiss

# With two predictors, their joint effect on the response can still be
# visualized in full with a 3D scatter plot.
library(rgl)

# (Opens in a new window)
open3d()
plot3d(swiss[, c(2, 3, 1)])
# Is the surface linear?

# For more than two predictors, we need to resort to pair-wise plotting
pairs(swiss)
# Which factors influence the fertility?

###################
# Marketing data
# install.packages("devtools")
# library(devtools)
# devtools::install_github("kassambara/datarium")
library(datarium)

# A data frame containing the impact of three advertising media
# (youtube, facebook and newspaper) on sales
head(marketing)
?marketing

# Linear relationships?
pairs(marketing)
# Do the advertisements have an effect on the sales and if so, how large?

#####################
# Diabetes data
# install.packages("lars")
library(lars)

# Data on 442 diabetes patients. For each, their disease progression one
# year after the baseline was measured (the response "dia"), along with
# 10 baseline covariates/explanatory variables.
data(diabetes)

# Assemble a single data frame: the response "dia" first, followed by
# the 10 baseline covariates.
diabetes_1 <- cbind(diabetes$y, diabetes$x)
colnames(diabetes_1)[1] <- "dia"
diabetes_1 <- data.frame(diabetes_1)

# (The variables have been standardized)
head(diabetes_1)
pairs(diabetes_1)

# We want to predict the diabetes progression using the baseline variables

##########################################################
# Fitting a multiple linear model

###################
# Swiss fertility data

head(swiss)

# Regress Fertility on all remaining columns of the data frame
swiss_lm <- lm(Fertility ~ ., data = swiss)
summary(swiss_lm)

# Interpretation: when the percentage of males involved in agriculture
# as occupation (Agriculture) grows by one, the expected fertility goes
# down by 0.17 units, etc.

# VIFs show no significant multicollinearity
vif(swiss_lm)

# R2-value: 0.7067 -> a very good fit

# Diagnostics: residuals against fitted values
plot(predict(swiss_lm), resid(swiss_lm),
     xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)

# The points seem to be approximately evenly distributed around the line y = 0
# Their variance seems not to depend on the fitted value
# No non-linear patterns are visible
# -> The regression model assumptions look plausible!
###################
# Marketing data

head(marketing)

marketing_lm <- lm(sales ~ ., data = marketing)
summary(marketing_lm)

# When the Facebook advertising budget is increased by 1000$ (check the
# units from the help file), the expected sales go up 0.19 units

# No significant multicollinearity
vif(marketing_lm)

# R2-value: 0.8972 -> an extremely good fit

# Diagnostics:
plot(predict(marketing_lm), resid(marketing_lm),
     xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)

# A clear non-linear pattern is visible
# -> The model assumptions do not hold

# We change the model
pairs(marketing)

# Adding a log(youtube) term does not help
marketing_lm_2 <- lm(sales ~ youtube + I(log(youtube)) + facebook,
                     data = marketing)
summary(marketing_lm_2)
plot(predict(marketing_lm_2), resid(marketing_lm_2),
     xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)

# What does help is adding also a product of youtube*facebook
# (coded below as youtube:facebook)
marketing_lm_3 <- lm(sales ~ I(log(youtube)) + facebook + youtube:facebook,
                     data = marketing)
summary(marketing_lm_3)
plot(predict(marketing_lm_3), resid(marketing_lm_3),
     xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)

# Interpretation:
# 1. there's a non-linear dependency between youtube and sales
# 2. the more you spend on youtube, the more benefit you gain from
#    spending on facebook

# Inference:
# log(youtube), facebook and facebook:youtube are significant predictors
# (the dependency between sales and them is not just random noise).

#####################
# Diabetes data

head(diabetes_1)

diabetes_lm <- lm(dia ~ ., data = diabetes_1)
summary(diabetes_lm)

# Interpretations as before...
# ...however, this time multicollinearity is present
vif(diabetes_lm)

# We can see the multicollinearity clearly here: tc is nearly a linear
# combination of ldl and hdl.
plot(diabetes_1$ldl + diabetes_1$hdl, diabetes_1$tc)

# We remove the variable with highest VIF, which is tc (total cholesterol)
diabetes_lm_2 <- lm(dia ~ age + sex + bmi + map + ldl + hdl + tch + ltg + glu,
                    data = diabetes_1)
vif(diabetes_lm_2)
# No more significant multicollinearities

summary(diabetes_lm_2)
# Coefficients for ldl and hdl changed signs!

# R2-value: 0.5137 -> a quite good fit

# Diagnostics:
plot(predict(diabetes_lm_2), resid(diabetes_lm_2),
     xlab = "Fitted values", ylab = "Residuals")
abline(h = 0)

# The points seem to be approximately evenly distributed around the line y = 0
# Their variance depends on the fitted value
# A non-linear pattern seems to be visible
# -> The regression model assumptions do not look plausible!

# The next step could be to remove the non-significant predictors and
# try transforming the rest...

###########################################
# Bootstrap confidence interval for the R-squared in the fertility data

# Point estimate
swiss_lm <- lm(Fertility ~ ., data = swiss)
summary(swiss_lm)$r.squared

# Bootstrapping.
# We do bootstrapping for three different quantities:
# for R^2, and for two coefficients (Agriculture and Examination).
# set.seed(123)  # uncomment for a reproducible bootstrap

n <- nrow(swiss)
B <- 1000

# Preallocate the result vectors -- growing them element by element
# inside the loop copies the vector on every iteration (O(B^2)).
res <- numeric(B)
beta_agri <- numeric(B)
beta_exam <- numeric(B)

for (i in seq_len(B)) {
  # Resample the rows with replacement and refit the full model
  swiss_boot <- swiss[sample(seq_len(n), n, replace = TRUE), ]
  model <- lm(Fertility ~ ., data = swiss_boot)

  # Compute the summary once per iteration instead of three times
  model_summary <- summary(model)
  res[i] <- model_summary$r.squared
  beta_agri[i] <- model_summary$coefficients["Agriculture", "Estimate"]
  beta_exam[i] <- model_summary$coefficients["Examination", "Estimate"]
}

# Bootstrap distributions of the three statistics
hist(res, breaks = 20)
hist(beta_agri, breaks = 20)
hist(beta_exam, breaks = 20)

# Bootstrap 95% confidence intervals
quantile(res, probs = c(0.025, 0.975))
# In some sense with 95% probability the true R^2 is in the interval (0.59, 0.85)
quantile(beta_agri, probs = c(0.025, 0.975))
quantile(beta_exam, probs = c(0.025, 0.975))