# A short intro to R

# Assignment operators
x <- 1
x = 1
# These are not exactly the same:
mean(x = 2)  # Passes 2 as the argument x; nothing is assigned in the global env
mean(x <- 2) # Assigns 2 to x in the global env and then passes it to mean()

# Vectors
?c
z <- c(1, 2, 3, 4)

# Matrices
?matrix
m <- matrix(z, nrow = 2)
colnames(m) <- c('alpha', 'beta')
?'['
m[, 1]       # First column
m[, -1]      # Drop the first column
m[, 'alpha']
z[z > 2]     # Logical indexing

# Lists
?list
l <- list(x = x, z = z, m = m)
l$x          # Note the difference in output
l[[1]]
l1 <- l[1:2]
?data.frame

# Function definition
func <- function(x, y = 1) {
  x + y
}
func(x = 2) # 3 (y takes its default value 1)

# Setting up the workspace
getwd()
setwd("~/School/Prediction/Week_1") # Add your own path here

# Import data
?read.table
emissions_data <- read.table('emissions.txt', header = T, sep = "\t",
                             row.names = 1)
?str
str(emissions_data)
?head
head(emissions_data)

# a)
?summary
summary(emissions_data)
?hist
hist(emissions_data$NOx, xlab = 'NOx', main = 'NOx histogram',
     xlim = c(min(emissions_data$NOx) - 0.1, max(emissions_data$NOx) + 0.1))

# Add all figures to a grid
?par
par(mfrow = c(2, 2))
# dev.off()

# For loop
for (name in colnames(emissions_data)) {
  hist(emissions_data[, name], xlab = name, main = paste0(name, " histogram"))
}

# Apply family of functions
?apply
apply(emissions_data, 2, hist)
?lapply
sapply(colnames(emissions_data), function(name)
  hist(emissions_data[, name], xlab = name, main = paste0(name, " histogram")))

# The histograms do not indicate normality for any variable (judging by symmetry
# and tail behavior), though the small sample size limits how much we can conclude.

# b)
# First, we check the bivariate dependencies
cor(emissions_data) # Strengths of the linear dependencies
# NOx has a negative linear relationship with humidity, a positive one with
# temperature, and a weak one with pressure

# Linear model
?lm
emis_model <- lm(NOx ~ Humidity + Temp + Pressure, data = emissions_data)
# lm(NOx ~ ., data = emissions_data)
summary(emis_model)
# Residuals: Summary statistics of the estimated residuals e_i = y_i - y_hat_i
# Coefficients:
#   Estimate: Least squares estimate (LSE). E[y_i] = x_i' beta, so each value
#     tells how much the expected value of NOx increases/decreases when the
#     corresponding explanatory variable increases by one unit (others held fixed).
#   Std. Error: Standard deviation of the LSE.
#   t value and Pr(>|t|): Test statistic and p-value associated with a parametric
#     significance test: H_0: beta_i = 0, H_1: beta_i != 0.
#     Assumes that the residuals are normally distributed.
# Residual standard error: Standard deviation of the residuals
# Multiple R-squared: Coefficient of determination
# Adjusted R-squared: R^2 adjusted by the number of explanatory variables
# F-statistic and p-value: Test statistic and p-value associated with a parametric
#   test: H_0: beta_i = 0 for all i (excluding the intercept),
#   H_1: beta_i != 0 for some i.
#   Assumes that the residuals are normally distributed.
hist(emis_model$residuals)

# c)
# Coefficient of determination
# Variance decomposition
y <- emissions_data$NOx
y_fitted <- fitted(emis_model)
resids <- resid(emis_model)
SST <- sum((y - mean(y))^2)               # Total Sum of Squares: constant, does not depend on the model
SSM <- sum((y_fitted - mean(y_fitted))^2) # Model Sum of Squares
SSE <- sum(resids^2)                      # Error Sum of Squares
SST
SSM + SSE
all.equal(SST, SSM + SSE) # The decomposition SST = SSM + SSE
SSM/SST            # Proportion of variation "explained" by the model
cor(y, y_fitted)^2 # Alternative way of computing R^2
# Note: For single-variable models, you can just square the correlation
# between the explanatory variable and the response
cor(emissions_data)[1, -1]^2
R_sqrd <- summary(emis_model)$r.squared
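# A quick manual check of the adjusted R^2 reported by summary() (a sketch;
# uses the y, SST and SSE objects defined above)
p <- length(coef(emis_model)) - 1 # Number of explanatory variables
n_obs <- length(y)
R_sqrd_adj <- 1 - (SSE/(n_obs - p - 1)) / (SST/(n_obs - 1))
all.equal(R_sqrd_adj, summary(emis_model)$adj.r.squared)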
# d)
# Permutation test
# A non-parametric numerical significance test, often used to test the
# hypothesis H_0: beta_i = 0, H_1: beta_i != 0. The basic idea is that by
# permuting variable i, we break the dependency between that variable and
# the response, so the permuted samples are generated under the null
# hypothesis. The test statistic is R^2: we estimate the probability of
# observing at least as large an R^2 under the null hypothesis.
perm_sample <- function(data, variable) {
  perm_inds <- sample(nrow(data))
  data[, variable] <- data[, variable][perm_inds]
  perm_model <- lm(NOx ~ ., data = data)
  summary(perm_model)$r.squared
}

perm_generator <- function(data, variable, n_perm) {
  replicate(n_perm, perm_sample(data, variable))
}

n_perm <- 2000
set.seed(123)
alpha <- 0.05
perm_samples <- sapply(colnames(emissions_data)[-1],
                       function(var) perm_generator(emissions_data, var, n_perm))
# Proportion of R^2's under the null that are at least as large as the observed value
p_vals <- apply(perm_samples, 2, function(x) sum(x >= R_sqrd)/length(x))
p_vals < alpha # Reject the null?
apply(perm_samples, 2, function(x) quantile(x, 1 - alpha)) < R_sqrd # Alternative

# e)
emis_model_filt <- lm(NOx ~ . - Pressure, data = emissions_data)
summary(emis_model_filt)

# f)
# Cov(b) = s^2 (X'X)^-1
X <- as.matrix(emissions_data[, -c(1, 4)]) # Explanatory variables of the filtered model
n <- nrow(X)
X <- cbind(Intercept = rep(1, n), X)
resid_var <- sum(resid(emis_model_filt)^2)/(n - ncol(X))
LSE_cov_m <- resid_var * solve(t(X) %*% X)
LSE_SD <- sqrt(diag(LSE_cov_m))
LSE_SD

# g)
?confint
confint(emis_model_filt, level = 1 - alpha)

# h)
LSE <- coef(emis_model_filt)
upper <- LSE + qt(1 - alpha/2, n - ncol(X)) * LSE_SD
lower <- LSE - qt(1 - alpha/2, n - ncol(X)) * LSE_SD
cbind(lower, upper)

# i)
# Bootstrapping: Non-parametric numerical confidence interval estimator.
# Basic idea: Generate a new sample from the original data set by sampling
# with replacement, compute the estimate on this bootstrap sample, repeat,
# and take the desired empirical quantiles to construct the bootstrap
# confidence interval. The intuition is that the empirical distribution
# function of the sample estimates the underlying distribution and thus
# provides an estimate of the sampling variation of the quantity of interest.
boot_sample <- function(X, y) {
  boot_inds <- sample(nrow(X), replace = T)
  X_boot <- X[boot_inds, ]
  y_boot <- y[boot_inds]
  # b = (X'X)^-1 X'y
  solve(t(X_boot) %*% X_boot) %*% t(X_boot) %*% y_boot
}

boot_generator <- function(X, y, n_boot) {
  replicate(n_boot, boot_sample(X, y), simplify = 'matrix')
}

n_boot <- 1999
boot_samples <- boot_generator(X, y, n_boot)
boot_samples <- cbind(LSE, boot_samples) # Include the original estimate as one sample
apply(boot_samples, 1, function(x) quantile(x, c(alpha/2, 1 - alpha/2)))
par(mfrow = c(2, 2))
sapply(rownames(boot_samples), function(name)
  hist(boot_samples[name, ], xlab = name, main = paste0(name, " histogram")))

# Notes about homework 1.2
# f)
?plot
?abline
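# A minimal sketch of how plot() and abline() can be used together: a scatter
# plot of NOx against one explanatory variable with a fitted simple regression
# line. Using Humidity as the predictor here is only an illustrative choice,
# not necessarily the setup required by the homework.
hw_model <- lm(NOx ~ Humidity, data = emissions_data)
par(mfrow = c(1, 1)) # Back to a single-panel layout
plot(emissions_data$Humidity, emissions_data$NOx,
     xlab = 'Humidity', ylab = 'NOx', main = 'NOx vs Humidity')
abline(hw_model, col = 'red') # Add the fitted regression line to the scatter plot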