setwd('~/Prediction/Week_1') # Basic Syntax and Commands # For packages # install.packages("ggplot2") Anti-virus software might cause some issues... Disable/create an exception if necessary. # Installation might also complain about Rtools4 on Windows, only needed if you want to build from source. library(ggplot2) # You might want to install car for the next computer exercise (though don't do it now) # Assignment y <- 1 # Prefer = to <- when using functions due to scoping rules mean(x = 2) # Only internally to the function call mean(x <- 2) # Assigned globally # Vectors and matrices ?c z <- c(1,2,3,4) x <- c(5,6,7,8) ?cbind m <- cbind(z,x) # Indexing (Starts at 1) ?'[' z[1] m[1,1] m[1,] # Select first row m[,'x'] # Select column with name 'x' z[z>2] # Select elements that are greater than 2 # For strings, you can use either " " or ' '. Use ' ' if you want quotes in the text. # String manipulation a <- c('Hip','Joint') b <- c(1,2) ?paste names <- paste(a, b, sep="_") paste(names, collapse = " + ") ?read.table emission_data_og <- read.table('emissions.txt', header = T, sep = "\t") emission_data_og$ObsNo # Let's drop the first column emission_data <- emission_data_og[,-1] # Set seed to get deterministic results set.seed(123) # a) ?hist ?par par(mfrow=c(2,2)) hist(emission_data$NOx) hist(emission_data$Humidity) hist(emission_data$Temp) hist(emission_data$Pressure) ?apply # This is extremely useful apply(emission_data,2,hist) # But not that pretty this time ?lapply # Same but with lists var_names <- colnames(emission_data) # Anonymous function needed since we are passing the same value to multiple arguments of hist. sapply(var_names, function(x) hist(emission_data[, x], main = x, xlab = 'Unit')) # Clearly all variables are not normally distributed (Skewness, general asymmetry) # Use this to reset graphical parameters dev.off() # b) ?lm # Some useful string manipulation features linear_part <- paste0(var_names[-1], collapse = ' + ') model_formula <- paste0(c('NOx', '~', linear_part),collapse = '') model_formula model_b <- lm(as.formula(model_formula), data = emission_data) lm(NOx ~ ., data = emission_data) # A shorthand for above # Useful function all around ?summary summary(model_b) # All elements of the summary # Call: Model specification and data source # Residuals: Certain descriptive statistics of the estimated residuals # Coefficients: # Estimate: The least squares (LS) estimates for the explanatory variables and the intercept # Std. Error: Estimates for the standard deviations of the LS estimator # t value: The value of the test statistic for the Wald test (a parametric test for maximum likelihood estimates) # The test is: H_0: b_i = 0, H_1: b_i != 0 and is reasonable when residuals are normally distributed # Pr(>|t|): The associated two-tailed p-value # Signif. codes: Traditional significance level thresholds # Residual standard error: Estimate for the residual standard deviation # Multiple R-squared: Coefficient of Determination # Adjusted R-squared: Coeff. of Deter. adjusted for model degrees of freedom (i.e. the number of explanatory variables) # F-statistic: A parametric test for the entire model against a base model that only has the intercept # The test is: H_0: b_i = 0 for all i (excluding the intercept), H_1: b_i != 0 for some i # Is reasonable if residuals are normally distributed # c) summary(model_b)$r.squared # Or we can also do this by hand SST <- sum((emission_data$NOx - mean(emission_data$NOx))^2) SSE <- sum((resid(model_b)^2)) 1-SSE/SST SSM <- sum((fitted(model_b) - mean(emission_data$NOx))^2) # R^2 measures the proportion of variation in the response that can be "explained" by the variation of the model library(dplyr) ?near near(SSM + SSE, SST) # This decomposition is the important observation # Or alternatively r_squared_og <- cor(fitted(model_b), emission_data$NOx)^2 r_squared_og # d) # Permutation test: H_0: b_i = 0, H_1: b_i != 0 # The high level idea: Permute the values of a given explanatory variable. This breaks the dependency structure of the response # to that variable which essentially means that there is no effect. Using many different permutations, we can estimate the distribution # of R^2 under the null hypothesis. The p-value is then the probability of observing at least as good of an R^2 as that computed # from the original data under the null hypothesis. # X: Data matrix, y: Response fit_helper_d <- function(X, y, perm_var) { # Permute the values of perm_var X[,perm_var] <- sample(X[, perm_var]) # LS estimate beta <- solve((t(X) %*% X)) %*% t(X) %*% y # Fitted values y_hat <- X %*% beta # R^2 cor(y_hat, y)^2 } perm_replicator_d <- function(n_perm, X, y, var_name) { # Generate n_perm permutation estimates of R^2 for var_name replicate(n_perm, fit_helper_d(X, y, var_name)) } # Sanity check ?rep n <- nrow(emission_data) Intercept <- rep(1, n) X <- cbind(as.matrix(emission_data[,-1]), Intercept) y <- emission_data$NOx fit_helper_d(X,y, 'Temp') n_perm <- 2000 alpha <- 0.05 expl_var <- var_names[-1] # Compute the permutation estimates for each expl. variable individually r_squares <- sapply(expl_var, function(name) perm_replicator_d(n_perm, X, y, name)) p_values_perm <- apply(r_squares, 2, function(x) sum(x > r_squared_og)/length(x)) p_values_perm # Not exactly the same as in the model solutions since here things are done in a different order. p_values_perm < alpha # The null is rejected for Humidity and Temperature # e) model_e <- lm(NOx ~ . - Pressure, data = emission_data) summary(model_e) summary(model_b) # The only substantial change is the estimate and precision of the intercept # f) # Drop Pressure X_f <- X[, !(colnames(X) %in% c('Pressure'))] # Residual variance res_e <- resid(model_e) res_var <- 1/(n-ncol(X_f)) * sum(res_e^2) sqrt(res_var) # Standard deviations of the LS estimators LS_std <- sqrt(diag(res_var * solve((t(X_f) %*% X_f)))) LS_std <- LS_std[c(3,1,2)] # g) confint(model_e, level = 1-alpha) # h) beta <- coef(model_e) ?qt beta - qt(1-alpha/2, n - ncol(X_f)) * LS_std beta + qt(1-alpha/2, n - ncol(X_f)) * LS_std # i) # Bootstrapping. The high level idea: When we sample with replacement from the original data set, # we are in a sense approximating the real sampling distribution of the random vectors. Using this bootstrap # distribution allows us to approximate the desired quantiles of the true sampling distribution. # X: Data matrix, y: Response fit_helper_i <- function(X, y) { # Sample with replacement inds <- sample(1:nrow(X), replace = T) X <- X[inds,] y <- y[inds] # Bootstrap LS estimate solve((t(X) %*% X)) %*% t(X) %*% y } n_boot <- 2000 # Construct n_boot bootstrap estimates boot_samples <- replicate(n_boot, fit_helper_i(X_f,y), simplify = 'matrix') boot_samples <- cbind(boot_samples, beta[c(2,3,1)]) # Select the desired quantiles for the confidence interval apply(boot_samples, 1, function(x) quantile(x, probs = c(alpha/2, 1 - alpha/2))) par(mfrow = c(2,2)) var_names <- rownames(boot_samples) sapply(var_names, function(x) hist(boot_samples[x,], main = x, xlab = 'Unit')) # If we are a bit lazy... library(boot) ?boot ?boot.ci boot_helper <- function(data, inds) { data <- data[inds,] model <- lm(NOx ~ . - Pressure, data = data) coef(model) } boot_samps <- boot(emission_data, boot_helper, n_boot) # Intercept, Humidity, Temp lapply(seq(1,3), function(ind) boot.ci(boot_samps, conf = 1 - alpha, type = 'perc', index = ind)$perc) # For HW ?plot ?abline