###########################
# MS-C1620
# Statistical inference
# Lecture 4


library(catdata)
library(boot)


# Significance level 0.05 thorughout


#################################################################
# Bernoulli and Binomial distributions

# Prob. mass function for Beroulli
p <- 0.6
barplot(c(1 - p, p), names.arg = c(0, 1))

# Prob. mass function for Binomial
p <- 0.6
n <- 20
barplot(dbinom(0:20, n, p), names.arg = 0:20)







#################################################################
# Confidence intervals



#################################################################
# Unemployment

data(unemployment)
head(unemployment)
help("unemployment")

# A random (?) sample of 982 subjects amongst unemployed persons.
# What is the true proportion of long-term unemployed people among unemployed?

barplot(table(unemployment$durbin), names.arg = c("Short-term", "Long-term"))

# A point estimate and a 95% confidence interval

longterm <- 1*(unemployment$durbin == 2)
longterm

n <- length(longterm)

m <- mean(longterm)
m

ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n))
ci

plot(0, col = 0, xlim = c(0, 1))
abline(h = 0, lwd = 2)
abline(v = c(m, ci), col = c(1, 2, 2))






####################################################
# Women with no offspring

data(children)
head(children)
help("children")

# A random (?) sample of 1761 subjects amongst all German women of certain age.
# What is the true proportion of women without children? 

barplot(table(children$child))

# A point estimate and a 95% confidence interval (bootstrap)

nochildren <- 1*(children$child == 0)
nochildren

n <- length(nochildren)

m <- mean(nochildren)
m


B <- 1000
mean_boot <- function(x, indices) mean(x[indices])
boots <- sort(boot(nochildren, mean_boot, B)$t)
ci <- c(boots[floor(B*0.025)], boots[floor(B*0.975)])
ci

plot(0, col = 0, xlim = c(0, 1))
abline(h = 0, lwd = 2)
abline(v = c(m, ci), col = c(1, 2, 2))






#################################################################
# Comparison of the coverage probabilities of bootstrap and CLT confidence intervals for binary data


########### CLT

n <- 10
p <- 0.5

x <- rbinom(n, 1, p)
x

m <- mean(x)
m

ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n))
ci

# The true value in the interval?
p >= ci[1] & p <= ci[2]


# The previous into function:
clt_ci <- function(n, p){
  m <- mean(rbinom(n, 1, p))
  ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n))
  p >= ci[1] & p <= ci[2]
}

# Large number of replications: proportion of intervals which cover the true value:
mean(replicate(10000, clt_ci(10, 0.5)))



########### Bootstrap


n <- 10
p <- 0.5
x <- rbinom(n, 1, p)

m <- 1000
mean_boot <- function(x, indices) mean(x[indices])

boots <- sort(boot(x, mean_boot, m)$t)
ci <- c(boots[floor(m*0.025)], boots[floor(m*0.975)])
ci

# The true value in the interval?
p >= ci[1] & p <= ci[2]



# The previous into function:
clt_bs <- function(n, p, m){
  x <- rbinom(n, 1, p)
  boots <- sort(boot(x, mean_boot, m)$t)
  ci <- c(boots[floor(m*0.025)], boots[floor(m*0.975)])
  p >= ci[1] & p <= ci[2]
}

# Large number of replications: proportion of intervals which cover the true value:
mean(replicate(1000, clt_bs(10, 0.5, 1000)))



########### Comparison
mean(replicate(1000, clt_ci(10, 0.5)))
mean(replicate(1000, clt_bs(10, 0.5, 1000)))

# Bootstrap CI gives a better coverage (very close to the 95%) but at the cost of requiring more computation 











##############################################################
# One-sample proportion test

# The data set "faithful" contains the waiting times between eruptions and the durations of the eruption
# for the Old Faithful geyser in Yellowstone National Park, Wyoming, USA.
faithful

# A park ranger claims that less than every fourth eruption has a duration longer than 4 minutes 30 seconds
# Test her claim using the one-sample proportion test and the significance level 5%

# p = probability of long eruption
# H0: p = 1/4
# H1: p < 1/4

# Binary data
1*(faithful$eruptions > 4.5)

n <- length(faithful$eruptions)
n

# Number of eruptions longer than 4 min 30 sec.
x <- sum(faithful$eruptions > 4.5)
x
x/n

# If the null hypotehsis is true (the prob. of long eruptions is 1/4)
# then the distribution of the test statistic is:
barplot(dbinom(0:n, n, 1/4), names.arg = 0:n, width = 1, space = 0)

# We observed:
abline(v = x, col = 2, lwd = 2)

# Deviations toward H1 are in the low end of the graph (small numbers of long eruptions imply small prob.)

# Propotion test
prop.test(x, n, p = 0.25, alternative = "less", correct = FALSE)
# -> Not enough evidence against the null -> not enough evidence for the ranger's claim






###############################################################################
# Unemployment data

data("unemployment")
head(unemployment)


# Claim: more than 30% of unemployed are long-term unemployed

# p = probability of long-term unemployment
# H0: p = 0.30
# H1: p > 0.30

longterm <- 1*(unemployment$durbin == 2)
x <- sum(longterm)
n <- length(longterm)
x/n
# Is it likely to get this large proportion even if the true probability is 0.30?


prop.test(x, n, p = 0.30, alternative = "greater", correct = FALSE)
# H0 not plausible -> more than 30% of unemployed are long-term unemployed








##############################################################
# Two-sample proportion test

head(iris)

# An iris flower is said to have odd-shaped petals if the ratio of petal
# length to petal width is smallerthan 3.

# A botanist claims that it is more common for the iris of species virginica to have odd-shaped petals
# than for the iris of species versicolor.

# Test his claim using the two-sample proportion test and the significance level 5%

# p1 = probability of odd-shaped petals for virginica
# p2 = probability of odd-shaped petals for versicolor

# H0: p1 = p2
# H1: p1 > p2


virginica <- iris[iris$Species == "virginica", ]
x1 <- sum(virginica$Petal.Length/virginica$Petal.Width < 3.0)
n1 <- nrow(virginica)

x1/n1

versicolor <- iris[iris$Species == "versicolor", ]
x2 <- sum(versicolor$Petal.Length/versicolor$Petal.Width < 3.0)
n2 <- nrow(versicolor)

x2/n2



prop.test(c(x1, x2), c(n1, n2), alternative = "greater", correct = FALSE)
# Strong evidence against the null -> virginica has more frequently odd-shaped petals









#####################################################################
# Quiz


# 1
x <- c(5, -4, -2, 2)

mean(x)
sd(x)
median(x)
median(abs(x - median(x)))
c(min(x), max(x))
sign(x)
rank(x)
rank(abs(x))*sign(x)


# 2
# a. data with no outliers or with a bounded range
# b. data with small amounts of outliers
# c. discrete data with a small amount of possible values


# 3
# a. data with no outliers or with a bounded range
# b. data with small amouns of outliers
# c. data with bounded range


# 4
# a. the confidence interval is accurate, in the sense that it pinpoints the true value accurately (does not necessary have a large coverage probability, though)
# b. we require lots of "evidence" to reject the null (accept any "effects")
# c. no evidence against null (no "effect" is seen)
# d. we think there is an effect when there is none
# e. we fail to see an effect


# 5
# a. Two boxplots or histograms
set.seed(123)
heights <- data.frame(height = c(rnorm(40, 175, 5), rnorm(40, 165, 5)), sex = c(rep(0, 40), rep(1, 40)))
boxplot(height ~ sex, data = heights)
h1 <- hist(heights[heights$sex == 0, 1], breaks = 8)
h2 <- hist(heights[heights$sex == 1, 1], breaks = 8)

plot(h1, xlim = c(150, 190), col = rgb(0,0,1,1/4))
plot(h2, add = TRUE, col = rgb(1,0,0,1/4))

# or
par(mfrow = c(2, 1))
plot(h1, xlim = c(150, 190), col = rgb(0,0,1,1/4))
plot(h2, xlim = c(150, 190), col = rgb(1,0,0,1/4))
par(mfrow = c(1, 1))


# b. Bar chart?
barplot(table(sample(0:24, 300, TRUE)))


# c. Bar chart again?
props <- c(0.10, 0.04, 0.03, 0.01, 0.09)
barplot(props, names.arg = 1:5)


# d. Time series plot
x <- cbind(20 + arima.sim(n = 100, list(ar = c(0.85)), sd = sqrt(10)),
           1000 + arima.sim(n = 100, list(ar = c(0.65)), sd = sqrt(1000)),
           420 + arima.sim(n = 100, list(ar = c(0.55)), sd = sqrt(500)))
plot.ts(x)


# e. A map with the postal areas colored based on the median salary in the region?
# ...




# 6.

single_t <- function(){
  x <- rnorm(10)
  sqrt(10)*x/sd(x)
}

single_sign <- function(){
  x <- rnorm(10)
  sum(x > 0)
}

single_rank <- function(){
  x <- rnorm(10)
  sum((sign(x) == 1)*rank(abs(x)))
}


par(mfrow = c(1, 3))
hist(replicate(1000, single_rank()), xlab = "", main = "")
hist(replicate(1000, single_t()), xlab = "", main = "")
hist(replicate(1000, single_sign()), xlab = "", main = "")
par(mfrow = c(1, 1))


data("psych")
library(psych)
data()




#############################################################
# Comparison of t-test, sign test and one-sample signed rank test


data(cd4)
cd4
help(cd4)

# Does the drug increase the amount of CD4 cells in the blood?

# mu = the location of the distribution of the difference between oneyear and baseline
# H0: mu = 0
# H1: mu > 0


# Visualization
matplot(t(cd4), type = "l")

difference <- cd4$oneyear - cd4$baseline
difference
hist(difference)



# Paired t-test (assumes normality)
t.test(difference, mu = 0, alternative = "greater")
# small p-value -> drug works (assuming normality holds)


# Paired sign test (assumes continuity)

# No zero values
sum(difference == 0)

binom.test(sum(difference > 0), length(difference), alternative = "greater")
# Small p-value -> drug works



# Paired signed rank test (assumes continuity and symmetry)

wilcox.test(x, mu = 0, alternative = "greater")
# Small p-value -> drug works