###########################
# MS-C1620
# Statistical inference
# Lecture 2
# updated 18.2.2023

library(scales)
library(EnvStats)
library(car)

####################################################################################
# Comparison of confidence intervals for expected value

# Highly skew data set
# (Used to be "rivers" = 142 major NA rivers, but bad example since not a random sample)
# Now using "salaries" = 397 randomly(?) chosen people's salaries [in some population]

data("Salaries")
x <- Salaries$salary
n <- length(x)
boxplot(x)

# Manual bootstrapping for the expected value
B <- 1000
res <- rep(0, B)
# here we just allocated space for the B different values

for(b in 1:B){
  res[b] <- mean(sample(x, n, replace = TRUE))
}

# Distribution of the bootstrap means
hist(res)

# 95% BS confidence interval
ci_bs <- quantile(res, probs = c(0.025, 0.975))
abline(v = ci_bs, lwd = 2, col = 2)
ci_bs

# Parametric 95% confidence interval assuming normality
ci_ex <- c(mean(x) - 1.96*sd(x)/sqrt(n), mean(x) + 1.96*sd(x)/sqrt(n))
abline(v = ci_ex, lwd = 2, col = 3)
ci_ex






####################################################################################
# BS confidence interval for SKEWNESS

library(moments)

# Does the raw data (sample) look skewed?
boxplot(x)
hist(x)
skewness(x)

# Sample skewness is positive ... but could it be just sample randomness?
# If we take a small sample from NORMAL or UNIFORM distributions, it will
# also be skewed, RANDOMLY, left or right.  So how much is too much?

x2 <- rnorm(20)
boxplot(x2)
skewness(x2)

x3 <- rnorm(100)
boxplot(x3)
skewness(x3)

# Manual bootstrapping
B <- 1000
res <- rep(0, B)

for(b in 1:B){
  res[b] <- skewness(sample(x, n, replace = TRUE))
}

# Distribution of the bootstrap skewnesses
hist(res)

# 95% BS confidence interval
ci_bs <- quantile(res, probs = c(0.025, 0.975))
abline(v = ci_bs, lwd = 2, col = 2)
ci_bs

# Bootstrapping clearly indicates that salary skewness is positive
# (right tail longer than left)



####################################################################
# Illustration of Type I and Type II errors using t-test (optional extra example)

# Single run. Very small data.
n <- 15

theta <- 0
y <- rnorm(n, theta, 3)

# Our test statistic for H0: theta = 0
t_observed_0 <- (mean(y) - 0)/(sd(y)/sqrt(n))
t_observed_0
# Is this too large to happen under H0?




# Find The distribution of t under H0

t_generate <- function(theta){
  n <- 15
  y <- rnorm(n, theta, 3)
  (mean(y) - 0)/(sd(y)/sqrt(n))
}

theta_0 <- replicate(10000, t_generate(0))

hist(theta_0, breaks = 30)

# We choose cut-off points for what is still considered as "normal behavior" under H0 (95% interval)
# (note that this is an approximation and we could also use the true distribution, Student's t with n - 1 degrees of freedom)
quant_0 <- quantile(theta_0, c(0.025, 0.975))
abline(v = quant_0[1], col = "red", lwd = 2)
abline(v = quant_0[2], col = "red", lwd = 2)
# Type I error rate is now roughly 5%

# Where does the observed value fall?
abline(v = t_observed_0, col = "black", lwd = 2)
# p-value measures how far away we are in the tail/how weird the observation is.
mean((theta_0 >= abs(t_observed_0) | theta_0 <= -abs(t_observed_0)))

# Type I error rate is 5% (probability of claiming there is an effect when there is not)




# What about type II errors?

# The distribution of t when H1: theta = 1 is true
theta_1 <- replicate(10000, t_generate(1))

hist(theta_0, breaks = 30, xlim = c(-5, 7))
abline(v = quant_0[1], col = "red", lwd = 2)
abline(v = quant_0[2], col = "red", lwd = 2)

hist(theta_1, breaks = 30, add = TRUE, col = alpha("lightblue", 0.4), lty = 2)
# Type II error rate for theta = 2
mean(theta_1 < quant_0[2] & theta_1 > quant_0[1])

# Conduct a single study where H1: theta = 1 is true
t_observed_1 <- t_generate(1)
abline(v = t_observed_1, col = "black", lwd = 2)
# p-value
mean((theta_0 >= abs(t_observed_1) | theta_0 <= -abs(t_observed_1)))

# Making the "normally behaving" area wider makes true H0 easier to recognize (lower type I)
# but deviations from H0 more difficult to recognize (higher type II) and vice versa



# Recognizing deviations from H0 gets more difficult as we approach theta = 0
theta_01 <- replicate(10000, t_generate(0.1))

hist(theta_0, breaks = 30, xlim = c(-5, 5))
abline(v = quant_0[1], col = "red", lwd = 2)
abline(v = quant_0[2], col = "red", lwd = 2)
hist(theta_01, breaks = 30, add = TRUE, col = alpha("salmon", 0.4), lty = 2)
# Type II error rate for theta = 1
mean(theta_01 < quant_0[2] & theta_01 > quant_0[1])

# Conduct a single study where H0 is false
t_observed_01 <- t_generate(0.1)
abline(v = t_observed_01, col = "black", lwd = 2)
# p-value
mean((theta_0 >= abs(t_observed_01) | theta_0 <= -abs(t_observed_01)))









####################################################################################
# One sample t-test

# Data which show the effect of two soporific drugs (increase in hours of sleep compared to control) on 10 patients.
sleep

# Compare whether the first drugs differs in its effect from placebo at significance level 5%
# H0: mu1 == 0
# H1: mu1 != 0

plot(sleep[sleep$group == 1, 1], rep(1, 10))
t.test(sleep[sleep$group == 1, 1])

# p-value 0.218 - not enough evidence against H0





####################################################################################
# Two-sample t-test

# Data which show the effect of two soporific drugs (increase in hours of sleep compared to control) on 10 patients.
sleep

# Compare whether the drugs differ in their effect at significance level 5%
# H0: mu1 == mu2
# H1: mu1 != mu 2

plot(sleep[sleep$group == 1, 1], rep(1, 10))
plot(sleep[sleep$group == 2, 1], rep(1, 10))
t.test(sleep[sleep$group == 1, 1], sleep[sleep$group == 2, 1])

# p-value 0.079 - not enough evidence against H0




####################################################################################
# Variance test

# This data set provides measurements of the girth, height and volume of timber in 31 felled black cherry trees.
trees

# We test the conjecture that the standard deviation of the height of the felled trees equals 5

# Test the following hypotheses with the significance level 5%
# H0: sigma^2 == 5^2 == 25
# H1: sigma^2 != 25

varTest(trees$Height, sigma.squared = 25)

# p-value 0.034 - H0 not plausible




####################################################################################
# Variance comparison test

# Compare whether the variance of the drug effect differ in the sleep data at significance level 5%
# H0: sigma_1^2 == sigma_2^2
# H1: sigma_1^2 != sigma_2^2

var.test(sleep[sleep$group == 1, 1], sleep[sleep$group == 2, 1])

# p-value 0.743 - no evidence against H0