# 1.1
# (a)
# setwd("path")

emis <- read.table("emissions.txt",header=T,sep="\t")
# Note that the imported data is of the data type called "data.frame"
# data.frame behaves usually as matrix, but some matrix operations are
# not available for data frames, e.g. matrix multiplication

emis.matrix <- as.matrix(emis)
# More about different data types on Introduction to R-programming course

set.seed(123)
# set.seed fixes the random number generator in a certain "position"

colnames(emis) #returns the names of the variables
emis$NOx
emis[,"NOx"]
emis[,2] 
# All 3 commands above give the observations of NOx

# With View() it is possible to examine the imported data
View(emis)

# summary() can be used to extract useful information from different objects in R
summary(emis)

# Note that histogram is not the same thing as barplot
hist(emis[,"NOx"],main="NOx", xlab="kg/cm^2")
hist(emis[,"Humidity"],main="Humidity",xlab="unit")
hist(emis[,"Pressure"],main="Pressure",xlab="unit")
hist(emis$Temp,main="Temperature",xlab="unit")

# The number of bins in a histogram can be changed. For example
b1 <- c(seq(min(emis$NOx),max(emis$NOx),length.out=10))
# The command above divides the interval between
# the smallest and largest value of NOx with 10 points.

hist(emis[,"NOx"],breaks=b1)

# Try also
hist(emis[,"NOx"],main="NOx", xlab=expression(kg/cm^2))

# By the histograms, the variables do not seem to be normally distributed

# (b), (c)

fit1 <- lm(NOx~Humidity+Temp+Pressure, data=emis) #linear model = lm

summary(fit1) 

summary(fit1)$r.squared # coefficient of determination

# (d)
# Remember to run set.seed() every time before running the permutation-loop to
# obtain identical results

# The results may vary depending on the operating 
# system and the version of R
set.seed(123)

k <- 2000 # Number of repetitions
y.mean <- mean(emis$NOx) 

# Mathematical formulations of the formulas below can be found from the lecture slides
SST <- sum((emis$NOx-y.mean)^2)
SSE <- sum((fit1$res)^2)
Rsquare1 <- 1 - SSE/SST

# Create an empty matrix, NA = not available.
# The number of rows is k  (number of permutations)
# The number of columns is 3 (number of explanatory variables)
perm<- matrix(NA,nrow=k,ncol=3)

for(i in 1:k){
  # Create 3 temporary variables to permute one explanatory variable at a time
  # Could be written more generally for m explanatory variables
  tmp1 <- emis
  tmp2 <- emis
  tmp3 <- emis
  
  # By default sample() draws a sample of the size of the input without replacement
  # try help(sample) 
  tmp1$Pressure <- sample(tmp1$Pressure)
  tmp2$Humidity <- sample(tmp2$Humidity)
  tmp3$Temp <- sample(tmp3$Temp)
  
  # LS-estimates
  tmpfit1 <- lm(NOx ~ Humidity+Temp+Pressure, data=tmp1)
  tmpfit2 <- lm(NOx ~ Humidity+Temp+Pressure, data=tmp2)
  tmpfit3 <- lm(NOx ~ Humidity+Temp+Pressure, data=tmp3)

  perm[i,1] <- summary(tmpfit1)$r.squared
  perm[i,2] <- summary(tmpfit2)$r.squared
  perm[i,3] <- summary(tmpfit3)$r.squared
}
# Note that it is not feasible to calculate all possible permutations,
# since 20! is of the magnitude 10^18:
factorial(20)

# The perm-matrix now contains three columns with k observations, such that the elements of the
# perm-matrix are values for the coefficient of determination (R-squared values).
# The first column is formed such that the variables Humidity and Temp are exactly as in the original
# data set in every k iterations. The variable Pressure is given a random permutation in every iteration
# and the R-squared value is calculated again in every iteration.

# The second column is formed such that the variables Pressure and Temp are left untouched and
# the variable Humidity is permutated in every iteration.

# Similarly, the third column is formed such that the variables Pressure and Humidity are left untouched
# and the variable Temp is permutated in every iteration.

# Choose the significance level alpha = 5% and order the observations:
preord <- sort(perm[,1])
humord <- sort(perm[,2])
tempord <- sort(perm[,3])

# (note that in R, the sorting is not necessary for the next step)

# The null hypothesis (H_0) of beta_j = 0 is rejected if the 
# original R-squared is larger than the calculated 95th percentile
Rsquare1 > quantile(preord,0.95)  # H_0 Accepted (at a significance level of 5%)
Rsquare1 > quantile(humord,0.95)  # H_0 Rejected
Rsquare1 > quantile(tempord,0.95) # H_0 Rejected

# Alternatively, you get the same results from:
pre <- 1-sum(perm[,1] < Rsquare1)/k #0.4635, H_0 Accepted
hum <- 1-sum(perm[,2] < Rsquare1)/k #0, H_0 Rejected
temp <- 1-sum(perm[,3] < Rsquare1)/k #0.0115, H_0 Rejected


# (e)
fit2 <- lm(NOx~Humidity+Temp, data=emis)
summary(fit2)

# (f)
colnames(emis) #The variables Humidity and Temp are in the third and fourth columns
tmp <-as.matrix(emis[,c(3,4)])

# dim, nrow and ncol commands can be used to find out the dimensions of data
n <- nrow(emis)

Intercept <- rep(1,n) #vector of ones

emis2 <- cbind(Intercept,tmp)
p <- ncol(emis2)

res <- fit2$residuals

s2 <- sum(res^2)/(n-p)

vari <- s2*diag( solve( t(emis2)%*%emis2) )
stdev <- sqrt(vari)

# (g)

confint(fit2,level=0.95)

# (h)
coef <- fit2$coef
up <- coef + stdev * qt(0.975,n-p)
down <- coef - stdev * qt(0.975,n-p)

# (i)
# The logic in bootstrapping is mainly the same as in permutation test

# Below are steps 1-5 from the lecture slides

k <- 2000
bootmat <- matrix(NA,nrow=k,ncol=3) # Empty matrix for the results

y <- emis$NOx
X <- emis2

set.seed(123)

for(i in 1:(k-1)){
  # 1. Select n data points randomly with replacement from the 
  # original observations
  # (Note that in the permutation test the data points are selected
  # without replacement)
  ind <- sample(1:n,replace = TRUE) # Indices for the random rows
  
  Xtmp <- X[ind,]
  ytmp <- y[ind]
  
  # 2. Calculate a new parameter vector estimate from the new sample
  btmp <- solve(t(Xtmp)%*%Xtmp)%*%t(Xtmp)%*%ytmp
  
  # 3. Save the results to bootmat and repeat k-1 times
  bootmat[i,] <- t(btmp)
}

# 4. Include the original estimate and order the obtained estimates 
# from the smallest to the largest
boriginal <- solve(t(X)%*%X)%*%t(X)%*%y

bootmat[k,] <- t(boriginal)

boot1 <- sort(bootmat[,1])
boot2 <- sort(bootmat[,2])
boot3 <- sort(bootmat[,3])

# 5. Set alpha=5% and set the lower end of the confidence interval
# to be smaller or equal to the [0.025*2000] ordered estimate.
# Set the upper end to be larger or equal to the [0.975*2000] ordered estimate.

qconst <- quantile(boot1, probs = c(0.025,0.975))
qhum <- quantile(boot2, probs = c(0.025,0.975))
qtemp <- quantile(boot3, probs = c(0.025,0.975))

# Note that step 4 is not necessary in R, 
# you would get the same results by inputting the
# columns of bootmat into the quantile-function. 

# Here, we have wanted to emphasize the steps of the lecture slides and thus 
# have included step 4 separately
  
# Plot the histograms of the bootstrap estimates
hist(bootmat[,1])
hist(bootmat[,2])
hist(bootmat[,3])