# A short intro to R

# Assignment operators
x <- 1
x = 1
# These are not exactly the same:
mean(x = 2)  # Passes 2 as the argument x; nothing is assigned in the global env
mean(x <- 2) # Assigns 2 to x in the global env and then passes it to mean()

# Vectors
?c
z <- c(1, 2, 3, 4)

# Matrices
?matrix
m <- matrix(z, nrow = 2)
colnames(m) <- c('alpha', 'beta')
?'['
m[, 1]       # First column
m[, -1]      # Drop the first column
m[, 'alpha']
z[z > 2]     # Logical indexing

# Lists
?list
l <- list(x = x, z = z, m = m)
l$x          # Note the difference in output
l[[1]]
l1 <- l[1:2]
?data.frame

# Function definition
func <- function(x, y = 1) {
  x + y
}
func(x = 2) # 3 (y takes its default value 1)

# Setting up the workspace
getwd()
setwd("~/School/Prediction/Week_1") # Add your own path here

# Import data
?read.table
emissions_data <- read.table('emissions.txt', header = T, sep = "\t",
                             row.names = 1)
?str
str(emissions_data)
?head
head(emissions_data)

# a)
?summary
summary(emissions_data)
?hist
hist(emissions_data$NOx, xlab = 'NOx', main = 'NOx histogram',
     xlim = c(min(emissions_data$NOx) - 0.1, max(emissions_data$NOx) + 0.1))

# Add all figures to a grid
?par
par(mfrow = c(2, 2))
# dev.off()

# For loop
for (name in colnames(emissions_data)) {
  hist(emissions_data[, name], xlab = name, main = paste0(name, " histogram"))
}

# Apply family of functions
?apply
apply(emissions_data, 2, hist)
?lapply
sapply(colnames(emissions_data), function(name)
  hist(emissions_data[, name], xlab = name, main = paste0(name, " histogram")))

# The histograms do not indicate normality for any variable (judging by symmetry
# and tail behavior), though the small sample size limits how much we can conclude.

# b)
# First, we check the bivariate dependencies
cor(emissions_data) # Strengths of the linear dependencies
# NOx has a negative linear relationship with humidity, a positive one with
# temperature, and a weak one with pressure

# Linear model
?lm
emis_model <- lm(NOx ~ Humidity + Temp + Pressure, data = emissions_data)
# lm(NOx ~ ., data = emissions_data)
summary(emis_model)
# Residuals: Summary statistics of the estimated residuals e_i = y_i - y_hat_i
# Coefficients:
#   Estimate: Least squares estimate (LSE). E[y_i] = x_i' beta, so each value
#     tells how much the expected value of NOx increases/decreases when the
#     corresponding explanatory variable increases by one unit (others held fixed).
#   Std. Error: Standard deviation of the LSE.
#   t value and Pr(>|t|): Test statistic and p-value associated with a parametric
#     significance test: H_0: beta_i = 0, H_1: beta_i != 0.
#     Assumes that the residuals are normally distributed.
# Residual standard error: Standard deviation of the residuals
# Multiple R-squared: Coefficient of determination
# Adjusted R-squared: R^2 adjusted by the number of explanatory variables
# F-statistic and p-value: Test statistic and p-value associated with a parametric
#   test: H_0: beta_i = 0 for all i (excluding the intercept),
#   H_1: beta_i != 0 for some i.
#   Assumes that the residuals are normally distributed.
hist(emis_model$residuals)

# c)
# Coefficient of determination
# Variance decomposition
y <- emissions_data$NOx
y_fitted <- fitted(emis_model)
resids <- resid(emis_model)
SST <- sum((y - mean(y))^2)               # Total Sum of Squares: constant, does not depend on the model
SSM <- sum((y_fitted - mean(y_fitted))^2) # Model Sum of Squares
SSE <- sum(resids^2)                      # Error Sum of Squares
SST
SSM + SSE
all.equal(SST, SSM + SSE) # The decomposition SST = SSM + SSE
SSM/SST            # Proportion of variation "explained" by the model
cor(y, y_fitted)^2 # Alternative way of computing R^2
# Note: For single-variable models, you can just square the correlation
# between the explanatory variable and the response
cor(emissions_data)[1, -1]^2
R_sqrd <- summary(emis_model)$r.squared
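# A quick manual check of the adjusted R^2 reported by summary() (a sketch;
# uses the y, SST and SSE objects defined above)
p <- length(coef(emis_model)) - 1 # Number of explanatory variables
n_obs <- length(y)
R_sqrd_adj <- 1 - (SSE/(n_obs - p - 1)) / (SST/(n_obs - 1))
all.equal(R_sqrd_adj, summary(emis_model)$adj.r.squared)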
# d)
# Permutation test
# A non-parametric numerical significance test, often used to test the
# hypothesis H_0: beta_i = 0, H_1: beta_i != 0. The basic idea is that by
# permuting variable i, we break the dependency between that variable and
# the response, so the permuted samples are generated under the null
# hypothesis. The test statistic is R^2: we estimate the probability of
# observing at least as large an R^2 under the null hypothesis.
perm_sample <- function(data, variable) {
  perm_inds <- sample(nrow(data))
  data[, variable] <- data[, variable][perm_inds]
  perm_model <- lm(NOx ~ ., data = data)
  summary(perm_model)$r.squared
}

perm_generator <- function(data, variable, n_perm) {
  replicate(n_perm, perm_sample(data, variable))
}

n_perm <- 2000
set.seed(123)
alpha <- 0.05
perm_samples <- sapply(colnames(emissions_data)[-1],
                       function(var) perm_generator(emissions_data, var, n_perm))
# Proportion of R^2's under the null that are at least as large as the observed value
p_vals <- apply(perm_samples, 2, function(x) sum(x >= R_sqrd)/length(x))
p_vals < alpha # Reject the null?
apply(perm_samples, 2, function(x) quantile(x, 1 - alpha)) < R_sqrd # Alternative

# e)
emis_model_filt <- lm(NOx ~ . - Pressure, data = emissions_data)
summary(emis_model_filt)

# f)
# Cov(b) = s^2 (X'X)^-1
X <- as.matrix(emissions_data[, -c(1, 4)]) # Explanatory variables of the filtered model
n <- nrow(X)
X <- cbind(Intercept = rep(1, n), X)
resid_var <- sum(resid(emis_model_filt)^2)/(n - ncol(X))
LSE_cov_m <- resid_var * solve(t(X) %*% X)
LSE_SD <- sqrt(diag(LSE_cov_m))
LSE_SD

# g)
?confint
confint(emis_model_filt, level = 1 - alpha)

# h)
LSE <- coef(emis_model_filt)
upper <- LSE + qt(1 - alpha/2, n - ncol(X)) * LSE_SD
lower <- LSE - qt(1 - alpha/2, n - ncol(X)) * LSE_SD
cbind(lower, upper)

# i)
# Bootstrapping: Non-parametric numerical confidence interval estimator.
# Basic idea: Generate a new sample from the original data set by sampling
# with replacement, compute the estimate on this bootstrap sample, repeat,
# and take the desired empirical quantiles to construct the bootstrap
# confidence interval. The intuition is that the empirical distribution
# function of the sample estimates the underlying distribution and thus
# provides an estimate of the sampling variation of the quantity of interest.
boot_sample <- function(X, y) {
  boot_inds <- sample(nrow(X), replace = T)
  X_boot <- X[boot_inds, ]
  y_boot <- y[boot_inds]
  # b = (X'X)^-1 X'y
  solve(t(X_boot) %*% X_boot) %*% t(X_boot) %*% y_boot
}

boot_generator <- function(X, y, n_boot) {
  replicate(n_boot, boot_sample(X, y), simplify = 'matrix')
}

n_boot <- 1999
boot_samples <- boot_generator(X, y, n_boot)
boot_samples <- cbind(LSE, boot_samples) # Include the original estimate as one sample
apply(boot_samples, 1, function(x) quantile(x, c(alpha/2, 1 - alpha/2)))
par(mfrow = c(2, 2))
sapply(rownames(boot_samples), function(name)
  hist(boot_samples[name, ], xlab = name, main = paste0(name, " histogram")))

# Notes about homework 1.2
# f)
?plot
?abline
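# A minimal sketch of how plot() and abline() can be used together: a scatter
# plot of NOx against one explanatory variable with a fitted simple regression
# line. Using Humidity as the predictor here is only an illustrative choice,
# not necessarily the setup required by the homework.
hw_model <- lm(NOx ~ Humidity, data = emissions_data)
par(mfrow = c(1, 1)) # Back to a single-panel layout
plot(emissions_data$Humidity, emissions_data$NOx,
     xlab = 'Humidity', ylab = 'NOx', main = 'NOx vs Humidity')
abline(hw_model, col = 'red') # Add the fitted regression line to the scatter plot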