###########################
# MS-C1620
# Statistical inference
# Lecture 4


library(catdata)
library(boot)


# Significance level 0.05 thorughout

# Note. We will here try to use "theta" as the name for the
# probability parameter for a binomial distribution.
# (It is often called "p", but this causes confusion with
#  p-values.)

#################################################################
# Bernoulli and Binomial distributions

# Prob. mass function for Beroulli
theta <- 0.6
barplot(c(1 - theta, theta), names.arg = c(0, 1))

# Prob. mass function for Binomial
theta <- 0.6
n <- 20
barplot(dbinom(0:20, n, theta), names.arg = 0:20)

# Small-sample and/or skewed binomials are not quite normal
# so it is better to work with the exact binomial distribution
# in those cases
theta <- 0.9
n <- 10
barplot(dbinom(0:10, n, theta), names.arg = 0:10)






#################################################################
# Confidence intervals



#################################################################
# Unemployment

data(unemployment)
head(unemployment)
help("unemployment")

# A random (?) sample of 982 subjects amongst unemployed persons.
# What is the true proportion of long-term unemployed people among unemployed?

barplot(table(unemployment$durbin), names.arg = c("Short-term", "Long-term"))

# A point estimate and a 95% confidence interval

longterm <- 1*(unemployment$durbin == 2)
longterm

n <- length(longterm)

m <- mean(longterm)
m

ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n))
ci

plot(0, col = 0, xlim = c(0, 1))
abline(h = 0, lwd = 2)
abline(v = c(m, ci), col = c(1, 2, 2))






####################################################
# Women with no offspring

data(children)
head(children)
help("children")

# A random (?) sample of 1761 subjects amongst all German women of certain age.
# What is the true proportion of women without children? 

barplot(table(children$child))

# A point estimate and a 95% confidence interval (bootstrap)

nochildren <- 1*(children$child == 0)
nochildren

n <- length(nochildren)

m <- mean(nochildren)
m


B <- 1000
mean_boot <- function(x, indices) mean(x[indices])
boots <- sort(boot(nochildren, mean_boot, B)$t)
boots
ci <- c(boots[floor(B*0.025)], boots[floor(B*0.975)])
ci

plot(0, col = 0, xlim = c(0, 1))
abline(h = 0, lwd = 2)
abline(v = c(m, ci), col = c(1, 2, 2))






#################################################################
# Comparison of the coverage probabilities of bootstrap and CLT confidence intervals for binary data

# We will be setting the seed of the random generator
# before generating the random numbers, to ensure
# fair comparison between CLT and bootstrap.
# (They will be working with the same numbers.)
theseed <- 876362

########### CLT

n <- 10
theta <- 0.6
set.seed(theseed)
x <- rbinom(n, 1, theta)
x

m <- mean(x)
m

# applying "magic formula" that uses normal approximation
ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n))
ci

# Is the true value inside the interval?
theta >= ci[1] & theta <= ci[2]


# The previous into function:
clt_ci <- function(n, theta){
  m <- mean(rbinom(n, 1, theta))
  ci <- c(m - 1.96*sqrt(m*(1 - m)/n), m + 1.96*sqrt(m*(1 - m)/n))
  theta >= ci[1] & theta <= ci[2]
}

# Large number of replications: proportion of intervals which cover the true value:
set.seed(theseed)
mean(replicate(10000, clt_ci(n,theta)))

# Coverage only about 90%, not 95% as desired.
# That is, we are not covering the true value
# about 10% of the time.


########### Bootstrap

set.seed(theseed)
x <- rbinom(n, 1, theta)
mean(x)

B <- 1000
mean_boot <- function(x, indices) mean(x[indices])

boots <- sort(boot(x, mean_boot, B)$t)
ci <- c(boots[floor(B*0.025)], boots[floor(B*0.975)])
ci

# Is the true value in the interval?
theta >= ci[1] & theta <= ci[2]



# The previous into function:
clt_bs <- function(n, theta, B){
  x <- rbinom(n, 1, theta)
  boots <- sort(boot(x, mean_boot, B)$t)
  ci <- c(boots[floor(B*0.025)], boots[floor(B*0.975)])
  theta >= ci[1] & theta <= ci[2]
}

# Large number of replications: proportion of intervals which cover the true value:
set.seed(theseed)
mean(replicate(1000, clt_bs(n, theta, 1000)))
# (takes about 10 seconds)


########### Comparison
set.seed(theseed)
mean(replicate(1000, clt_ci(n, theta)))
set.seed(theseed)
mean(replicate(1000, clt_bs(n, theta, 1000)))

# Bootstrap CI gives a better coverage (close to the 95%) but at the cost of requiring more computation 











##############################################################
# One-sample proportion test

# The data set "faithful" contains the waiting times between eruptions and the durations of the eruption
# for the Old Faithful geyser in Yellowstone National Park, Wyoming, USA.
faithful

# A park ranger claims that less than every fourth eruption has a duration longer than 4 minutes 30 seconds
# Test her claim using the one-sample proportion test and the significance level 5%

# theta = probability of long eruption (unknown parameter)
# H0: theta >= 1/4         Composite null hypothesis!
# H1: theta <  1/4

# Binary data
1*(faithful$eruptions > 4.5)

n <- length(faithful$eruptions)
n

# Number of eruptions longer than 4 min 30 sec.
x <- sum(faithful$eruptions > 4.5)
x
x/n

# If the null hypotehsis is true (the prob. of long eruptions is 1/4)
# then the distribution of the test statistic is:
barplot(dbinom(0:n, n, 1/4), names.arg = 0:n, width = 1, space = 0)

# We observed:
abline(v = x, col = 2, lwd = 2)

# Deviations toward H1 are in the low end of the graph (small numbers of long eruptions imply small prob.)

# Proportion test
prop.test(x, n, 0.25, alternative = "less", correct = FALSE)
# -> Not enough evidence against the null -> not enough evidence for the ranger's claim

# WARNING.
# The third argument to the R function "prop.test" is confusingly named "p",
# but it really is the probability parameter of the null hypothesis,
# which we are calling "theta".
#
# DO NOT CONFUSE this parameter with the p-value.
#
# The p-value is a RESULT of the test, and quite different from the
# probability parameter, even though R confusingly calls both "p".




###############################################################################
# Unemployment data

data("unemployment")
head(unemployment)


# Claim: more than 30% of unemployed are long-term unemployed

# theta = probability of long-term unemployment
# H0: theta <= 0.30         Composite null hypothesis!
# H1: theta >  0.30

longterm <- 1*(unemployment$durbin == 2)
x <- sum(longterm)
n <- length(longterm)
x/n
# Is it likely to get this large proportion even if the true probability is 0.30?


prop.test(x, n, 0.30, alternative = "greater", correct = FALSE)
# H0 not plausible -> more than 30% of unemployed are long-term unemployed

# Above we rejected the hypothesis theta=0.30 at p-value 0.0004821.
# What would we do with hypotheses with even smaller p's?
prop.test(x, n, 0.29, alternative = "greater", correct = FALSE)
prop.test(x, n, 0.27, alternative = "greater", correct = FALSE)
prop.test(x, n, 0.25, alternative = "greater", correct = FALSE)
# The p-value would be even smaller.
# So we can, at significance level 0.05,
# reject not just the point hypothesis "theta=0.30",
# but also all of the composite hypothesis "theta<=0.30".





##############################################################
# Two-sample proportion test

head(iris)

# An iris flower is said to have odd-shaped petals if the petal
# length/width ratio is smaller than 3.

# A botanist claims that it is more common for the iris of species virginica to have odd-shaped petals
# than for the iris of species versicolor.

# Test his claim using the two-sample proportion test and the significance level 5%

# theta1 = probability of odd-shaped petals for virginica
# theta2 = probability of odd-shaped petals for versicolor

# H0: theta1 = theta2
# H1: theta1 > theta2


virginica <- iris[iris$Species == "virginica", ]
x1 <- sum(virginica$Petal.Length/virginica$Petal.Width < 3.0)
n1 <- nrow(virginica)

x1/n1

versicolor <- iris[iris$Species == "versicolor", ]
x2 <- sum(versicolor$Petal.Length/versicolor$Petal.Width < 3.0)
n2 <- nrow(versicolor)

x2/n2



prop.test(c(x1, x2), c(n1, n2), alternative = "greater", correct = FALSE)
# Strong evidence against the null -> virginica has more frequently odd-shaped petals









#####################################################################
# Quiz


# 1
x <- c(5, -4, -2, 2)

mean(x)
sd(x)
median(x)
median(abs(x - median(x)))
c(min(x), max(x))
sign(x)
rank(x)
rank(abs(x))*sign(x)


# 2
# a. data with no outliers or with a bounded range
# b. data with small amounts of outliers
# c. discrete data with a small amount of possible values


# 3
# a. data with no outliers or with a bounded range
# b. data with small amouns of outliers
# c. data with bounded range


# 4
# a. the confidence interval is accurate, in the sense that it pinpoints the true value accurately (does not necessary have a large coverage probability, though)
# b. we require lots of "evidence" to reject the null (accept any "effects")
# c. no evidence against null (no "effect" is seen)
# d. we think there is an effect when there is none
# e. we fail to see an effect


# 5
# a. Two boxplots or histograms
set.seed(123)
heights <- data.frame(height = c(rnorm(40, 175, 5), rnorm(40, 165, 5)), sex = c(rep(0, 40), rep(1, 40)))
boxplot(height ~ sex, data = heights)
h1 <- hist(heights[heights$sex == 0, 1], breaks = 8)
h2 <- hist(heights[heights$sex == 1, 1], breaks = 8)

plot(h1, xlim = c(150, 190), col = rgb(0,0,1,1/4))
plot(h2, add = TRUE, col = rgb(1,0,0,1/4))

# or
par(mfrow = c(2, 1))
plot(h1, xlim = c(150, 190), col = rgb(0,0,1,1/4))
plot(h2, xlim = c(150, 190), col = rgb(1,0,0,1/4))
par(mfrow = c(1, 1))


# b. Bar chart?
barplot(table(sample(0:24, 300, TRUE)))


# c. Bar chart again?
props <- c(0.10, 0.04, 0.03, 0.01, 0.09)
barplot(props, names.arg = 1:5)


# d. Time series plot
x <- cbind(20 + arima.sim(n = 100, list(ar = c(0.85)), sd = sqrt(10)),
           1000 + arima.sim(n = 100, list(ar = c(0.65)), sd = sqrt(1000)),
           420 + arima.sim(n = 100, list(ar = c(0.55)), sd = sqrt(500)))
plot.ts(x)


# e. A map with the postal areas colored based on the median salary in the region?
# ...




# 6.

single_t <- function(){
  x <- rnorm(10)
  sqrt(10)*x/sd(x)
}

single_sign <- function(){
  x <- rnorm(10)
  sum(x > 0)
}

single_rank <- function(){
  x <- rnorm(10)
  sum((sign(x) == 1)*rank(abs(x)))
}


par(mfrow = c(1, 3))
hist(replicate(1000, single_rank()), xlab = "", main = "")
hist(replicate(1000, single_t()), xlab = "", main = "")
hist(replicate(1000, single_sign()), xlab = "", main = "")
par(mfrow = c(1, 1))


data("psych")
library(psych)
data()




#############################################################
# Comparison of t-test, sign test and one-sample signed rank test


data(cd4)
cd4
help(cd4)

# Does the drug increase the amount of CD4 cells in the blood?

# mu = the location of the distribution of the difference between oneyear and baseline
# H0: mu = 0
# H1: mu > 0


# Visualization
matplot(t(cd4), type = "l")

difference <- cd4$oneyear - cd4$baseline
difference
hist(difference)



# Paired t-test (assumes normality)
t.test(difference, mu = 0, alternative = "greater")
# small p-value -> drug works (assuming normality holds)


# Paired sign test (assumes continuity)

# No zero values
sum(difference == 0)

binom.test(sum(difference > 0), length(difference), alternative = "greater")
# Small p-value -> drug works



# Paired signed rank test (assumes continuity and symmetry)

wilcox.test(x, mu = 0, alternative = "greater")
# Small p-value -> drug works