###########################
# MS-C1620
# Statistical inference
# Lecture 6
# We use the significance level 0.05 throughout the script
###########################

# Forms of dependency

# EX1
# Compare the following forms of dependency

### Dependency between continuous and continuous variable
### Sepal length vs. sepal width of setosa irises
iris
plot(iris[1:50, 1], iris[1:50, 2], xlab = "Sepal length", ylab = "Sepal width")

### Dependency between continuous and continuous variable
### Ozone content vs. temperature
airquality
plot(airquality[, 1], airquality[, 4], xlab = "Ozone content", ylab = "Temperature")

### Dependency between continuous and continuous variable
### Artificial data with circular shape
phi <- runif(400, 0, 2*pi)
r <- runif(400, 1, 1.5)
x <- r*cos(phi)
y <- r*sin(phi)
plot(x, y, asp = 1)

# Note: no dependency in polar coordinates!
plot(phi, r)

### Dependency between categorical and continuous variable
### Chicken weight vs. feed type
boxplot(weight ~ feed, data = chickwts)

### What if we treat feed as numerical 1...6?
chickmatrix <- data.matrix(chickwts)
chickx <- chickmatrix[, 2]
chicky <- chickmatrix[, 1]
plot(chickx, chicky)

### Dependency within a single sample of a continuous variable
### Approval ratings of US presidents
plot(presidents)

### Dependency within and between multiple samples of continuous variables
### Daily closing prices of major European stock indices
plot(EuStockMarkets)

###########################
# Linear dependence

### Perfect linear dependence
x <- rnorm(100)
y <- 10 - 2*x
plot(x, y)

### Zero linear dependence (but some other dependence still)
x <- rnorm(10000)
y <- x^2
plot(x, y)
cor(x, y)

### Correlations for some of the previous examples

# Iris
plot(iris[1:50, 1], iris[1:50, 2], xlab = "Sepal length", ylab = "Sepal width")
cor(iris[1:50, 1], iris[1:50, 2])
# (Note also the correlation matrix:)
cor(iris[1:50, 1:4])

# Air quality
plot(airquality[, 1], airquality[, 4], xlab = "Ozone content", ylab = "Temperature")
cor(airquality[, 1], airquality[, 4], use = "complete.obs")

# Circle data
phi <- runif(400, 0, 2*pi)
r <- runif(400, 1, 1.5)
x <- r*cos(phi)
y <- r*sin(phi)
plot(x, y)
cor(x, y)

# Correlation with the series itself!
plot(presidents)
delay <- 1
plot(presidents[1:(120 - delay)], presidents[(1 + delay):120])
cor(presidents[1:(120 - delay)], presidents[(1 + delay):120], use = "complete.obs")
delay <- 2
cor(presidents[1:(120 - delay)], presidents[(1 + delay):120], use = "complete.obs")
delay <- 4
cor(presidents[1:(120 - delay)], presidents[(1 + delay):120], use = "complete.obs")
delay <- 8
cor(presidents[1:(120 - delay)], presidents[(1 + delay):120], use = "complete.obs")

###########################
# EX2 Bivariate normal distribution

library(MASS)
library(KernSmooth)
# For rotatable plots (persp3d)
library(rgl)

# Bivariate normal with variances 1, means 0, and correlation 0.5
x <- mvrnorm(10000, c(0, 0), matrix(c(1, 0.5, 0.5, 1), 2, 2))
plot(x)
persp(bkde2D(x, 0.25)$fhat)
persp3d(bkde2D(x, 0.25)$fhat, col = "green") # rotatable plot

# Bivariate normal with variances 1, means 0, and correlation 0.8
x <- mvrnorm(10000, c(0, 0), matrix(c(1, 0.8, 0.8, 1), 2, 2))
plot(x)
persp(bkde2D(x, 0.25)$fhat)
persp3d(bkde2D(x, 0.25)$fhat, col = "green")

###########################
# EX3 95% confidence intervals for Pearson correlation

### Sepal length vs. sepal width of setosa irises
x <- iris[1:50, 1]
y <- iris[1:50, 2]
n <- 50

# Is this bivariate normal?
plot(x, y)
persp(bkde2D(cbind(x, y), 0.25)$fhat, theta = 45)
persp3d(bkde2D(cbind(x, y), 0.25)$fhat, col = "green")
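
# (Added sketch, not in the original exercise: full bivariate normality is hard
# to verify, but normal marginals are a necessary condition, so a quick first
# check is a Shapiro-Wilk test and a normal QQ-plot for each coordinate.)
shapiro.test(x)
shapiro.test(y)
qqnorm(x); qqline(x)
qqnorm(y); qqline(y)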

cor(x, y)

### Parametric confidence interval under the assumption of normality
ci_par <- c(tanh(atanh(cor(x, y)) - 1.96/sqrt(n - 3)),
            tanh(atanh(cor(x, y)) + 1.96/sqrt(n - 3)))
ci_par

# Note: not symmetric around the estimate
plot(c(ci_par, cor(x, y)), rep(1, 3), xlim = c(0, 1))

### Non-parametric confidence interval
B <- 1000
res <- rep(0, B)
for(b in 1:B){
  res[b] <- cor(cbind(x, y)[sample(1:n, n, replace = TRUE), ])[1, 2]
}

# Distribution of the bootstrap correlations
hist(res, breaks = 20)

# 95% BS confidence interval
ci_bs <- quantile(res, probs = c(0.025, 0.975))
abline(v = ci_bs, lwd = 2, col = 2)
ci_bs

# Parametric 95% confidence interval assuming normality
abline(v = ci_par, lwd = 2, col = 3)

# Quite different answers by the parametric and non-parametric approaches
# -> safer to trust the non-parametric one

###################################
# Two-sample test for Pearson correlation

### Sepal length/sepal width correlations of setosa vs. versicolor
# Claim: the correlations differ for the two species

# Setosa
x_setosa <- iris[1:50, 1]
y_setosa <- iris[1:50, 2]
n <- 50
plot(x_setosa, y_setosa)
cor(x_setosa, y_setosa)

# Versicolor
x_versicolor <- iris[51:100, 1]
y_versicolor <- iris[51:100, 2]
m <- 50
plot(x_versicolor, y_versicolor)
cor(x_versicolor, y_versicolor)

# Test whether the population correlations differ
# H0: rho1 == rho2
# H1: rho1 != rho2
z <- (atanh(cor(x_setosa, y_setosa)) - atanh(cor(x_versicolor, y_versicolor)))/
  sqrt(1/(n - 3) + 1/(m - 3))
2*pnorm(abs(z), lower.tail = FALSE)
# Not quite enough evidence against H0
# -> we continue to believe that the correlations could be equally large

##################################
# Significance tests for correlation

### Petal length vs. petal width of setosa irises
# Is the correlation significant?
# H0: rho == 0
# H1: rho != 0
x <- iris[1:50, 3]
y <- iris[1:50, 4]
n <- 50
plot(x, y) # Is this bivariate normal?
cor(x, y)

### Parametric test (null = zero correlation)
cor.test(x, y)
# -> enough evidence against the null -> the correlation is significant
#    if we can trust the normality of the data

### EX4 Non-parametric permutation test
B <- 1000
res <- rep(0, B)
for(b in 1:B){
  res[b] <- cor(x, sample(y, n, replace = FALSE))
}

# Distribution of the permutation test replicates
hist(res, breaks = 20)

# Estimated probability of observing a more deviating value for the correlation under H0
abline(v = abs(cor(x, y)), col = 2, lwd = 2)
abline(v = -1*abs(cor(x, y)), col = 2, lwd = 2)
mean(abs(res) >= abs(cor(x, y)))

# Very similar p-value as in the parametric test -> same conclusion

##################################
# EX5 Spearman correlation

### Visualization
x1 <- c(1, 2, 3, 4, 5)
y1 <- c(1, 8, 27, 64, 125)
plot(x1, y1, type = "b")

# Spearman correlation ignores the "magnitude" by first "straightening" the data
plot(rank(x1), rank(y1), type = "b")

x2 <- c(1, 2, 3, 4, 5)
y2 <- c(1, 8, 27, 16, 125)
plot(x2, y2, type = "b")
plot(rank(x2), rank(y2), type = "b")

x3 <- c(1, 2, 3, 4, 5)
y3 <- c(1, 1.2, 4, 4.2, 7)
plot(x3, y3, type = "b")
plot(rank(x3), rank(y3), type = "b")
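
# (Added note, not in the original script: R computes the Spearman correlation
# directly via cor(..., method = "spearman"); on the toy data x3, y3 above it
# matches the Pearson correlation of the hand-computed ranks.)
cor(x3, y3, method = "spearman")
cor(rank(x3), rank(y3)) # same value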

### Ozone content vs. temperature
full_obs <- (!is.na(airquality[, 1])) & (!is.na(airquality[, 4]))
x <- airquality[full_obs, 1]
y <- airquality[full_obs, 4]
n <- length(x)
plot(x, y, xlab = "Ozone content", ylab = "Temperature")

### Pearson correlation, bootstrap confidence interval and a permutation test
cor(x, y)

### Bootstrap
B <- 1000
res <- rep(0, B)
for(b in 1:B){
  res[b] <- cor(cbind(x, y)[sample(1:n, n, replace = TRUE), ])[1, 2]
}

# Distribution of the bootstrap correlations
hist(res, breaks = 20)

# 95% BS confidence interval
ci_bs <- quantile(res, probs = c(0.025, 0.975))
abline(v = ci_bs, lwd = 2, col = 2)
ci_bs

### Permutation test
B <- 1000
res <- rep(0, B)
for(b in 1:B){
  res[b] <- cor(x, sample(y, n, replace = FALSE))
}

# Distribution of the permutation test replicates
hist(res, breaks = 20)

# Estimated probability of observing a more deviating value for the correlation under H0
mean(abs(res) >= abs(cor(x, y)))
# Estimated p-value is 0 (no replicate exceeded the observed value)
# -> Pearson correlation differs significantly from zero

### Spearman correlation, bootstrap confidence interval and a permutation test
rx <- rank(x)
ry <- rank(y)
plot(rx, ry, xlab = "rank(Ozone content)", ylab = "rank(Temperature)")
cor(rx, ry)
# Spearman correlation is larger than Pearson
# -> there is more monotonic dependency in the data than linear dependency

### Bootstrap
B <- 1000
res <- rep(0, B)
for(b in 1:B){
  res[b] <- cor(cbind(rx, ry)[sample(1:n, n, replace = TRUE), ])[1, 2]
}

# Distribution of the bootstrap correlations
hist(res, breaks = 20)

# 95% BS confidence interval
ci_bs <- quantile(res, probs = c(0.025, 0.975))
abline(v = ci_bs, lwd = 2, col = 2)
ci_bs

### Permutation test
B <- 1000
res <- rep(0, B)
for(b in 1:B){
  res[b] <- cor(rx, sample(ry, n, replace = FALSE))
}

# Distribution of the permutation test replicates
hist(res, breaks = 20)

# Estimated probability of observing a more deviating value for the correlation under H0
mean(abs(res) >= abs(cor(rx, ry)))
# Estimated p-value is 0 -> Spearman correlation differs significantly from zero

# Conclusions:
# There is both linear and monotonic dependency in the data
# The monotonic dependency is somewhat stronger than the linear dependency
# -> When we move to linear models, maybe some transformations should be done
#    to achieve linearity
plot(x, y, xlab = "Ozone content", ylab = "Temperature")
plot(log(x), y, xlab = "log(Ozone content)", ylab = "Temperature")
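
# (Added cross-check, not in the original script: cor.test() provides the
# corresponding asymptotic tests for both coefficients; their conclusions
# should agree with the permutation tests above. With ties in the data,
# R warns that the exact Spearman p-value cannot be computed.)
cor.test(x, y, method = "pearson")
cor.test(x, y, method = "spearman")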