Problem 1: Principal Component Analysis
Upload the file DECATHLON.txt into your R-workspace. The file contains the results of 48 decathletes from 1973. Familiarize yourself with the data and perform the CORRELATION matrix based PCA transformation. Conduct the analysis without the variables: points, height and weight.
Try visualizing the correlation structure first.
# Read the decathlon results; the first column (athlete name) becomes row names
DATA <- read.table("DECATHLON.txt", header = TRUE, sep = "\t", row.names = 1)
# Exclude the variables points (col 1), height (col 12) and weight (col 13)
drop_cols <- c(1, 12, 13)
DEC <- DATA[, -drop_cols]
# The correlation matrix can be nicely visualized as a heatmap.
# Here with the corrgram package:
# install.packages('corrgram')
# library(corrgram)
corrgram(DEC)
# Correlations of every event with the 7th variable (Discus_throw):
DEC_cor <- cor(DEC)
DEC_cor[, 7]
##        R100m    Long_jump     Shot_put    High_jump        R400m
##   0.01434309   0.02087518   0.72732985   0.21699480  -0.34464723
##      Hurdles Discus_throw   Pole_vault      Javelin       R1500m
##   0.04770710   1.00000000  -0.18180013   0.13556064  -0.57350141
# Alternatively with the base package:
# (symm = TRUE spelled out -- T is an ordinary variable and can be reassigned)
heatmap(cor(DEC), Rowv = NA, Colv = NA, symm = TRUE,
        col = colorRampPalette(c('red', 'white', 'blue'))(50))
# Correlation matrix of the first five covariates:
cor(DEC[, seq_len(5)])
##                 R100m    Long_jump    Shot_put    High_jump      R400m
## R100m       1.0000000  0.171985363 -0.02795230 -0.411699539  0.4560816
## Long_jump   0.1719854  1.000000000 -0.03439298 -0.003324598  0.1334633
## Shot_put   -0.0279523 -0.034392979  1.00000000  0.162541961 -0.3037074
## High_jump  -0.4116995 -0.003324598  0.16254196  1.000000000 -0.3388272
## R400m       0.4560816  0.133463323 -0.30370738 -0.338827204  1.0000000
# Further tuning of the visualization with the corrgram package:
# a blue-to-red palette, pie panels and min/max values on the diagonal
blue_red <- colorRampPalette(c("blue4", "blue3", "blue2", "blue1", "blue",
                               "red", "red1", "red2", "red3", "red4"))
corrgram(DEC, panel = panel.pie, diag.panel = panel.minmax,
         col.regions = blue_red)
Now, apply the PC transformation using the function princomp(), and set the argument cor = TRUE to perform a correlation-matrix-based PC transformation:
a) How much of the variation of the original data is explained by k principal components, where k = 1,2,…,10.
# Correlation-matrix-based PC transformation of the decathlon events
DEC.PCA <- princomp(DEC, cor = TRUE)
# Standard deviations and (cumulative) proportions of explained variance:
summary(DEC.PCA)
## Importance of components:
##                           Comp.1    Comp.2   Comp.3    Comp.4     Comp.5
## Standard deviation     1.6130891 1.4169733 1.098463 1.0329820 0.96516226
## Proportion of Variance 0.2602056 0.2007813 0.120662 0.1067052 0.09315382
## Cumulative Proportion  0.2602056 0.4609870 0.581649 0.6883542 0.78150800
##                            Comp.6     Comp.7     Comp.8     Comp.9
## Standard deviation     0.77116160 0.75437360 0.73395022 0.49482809
## Proportion of Variance 0.05946902 0.05690795 0.05386829 0.02448548
## Cumulative Proportion  0.84097702 0.89788497 0.95175326 0.97623875
##                           Comp.10
## Standard deviation     0.48745515
## Proportion of Variance 0.02376125
## Cumulative Proportion  1.00000000
# Approximately 69% of the variation is explained by the first 4 principal components
# One (more) way of visualizing this (similar plot to the week 2 exercises)
var_explained <- DEC.PCA$sdev^2 / sum(DEC.PCA$sdev^2)
cum_explained <- cumsum(var_explained)
plot(cum_explained, type = 'b', pch = 21, lty = 3, bg = 2, cex = 1.5,
     ylim = c(0, 1), xlab = 'Principal component',
     ylab = 'Cumulative proportion of variance explained',
     xaxt = 'n', yaxt = 'n')
axis(1, at = 1:10, tck = 0.025)
axis(2, at = 0:10 / 10, tck = 0.025, las = 2)
# Reference line: cumulative proportion if all 10 components explained equal shares
abline(0, 1 / 10, lty = 3)
b) Choose a sufficient number of principal components and try to interpret them. Visualize the scores of the observations with respect to the first two principal components.
# PC visualization (similar plot to what biplot.princomp would achieve).
# Scores of the observations on PC1/PC2, with loadings overlaid as arrows.
# Fixed: use TRUE/FALSE instead of the reassignable shorthands T/F.
PC1PC2 <- DEC.PCA$scores[,1:2]
LD1LD2 <- DEC.PCA$loadings[,1:2]
# Symmetric axis limits so the origin sits in the middle of the plot
pc.axis <- c(-max(abs(PC1PC2)),max(abs(PC1PC2)))
ld.axis <- c(-0.8,0.8)
plot(PC1PC2, xlim = pc.axis, ylim = pc.axis, pch = 21, bg = 8, cex = 1.25)
title(main = 'Correlation matrix based PC transformation',sub = '1st and 2nd principal component', adj = 0)
# Overlay a second coordinate system (loading scale) on the same device
par(new = TRUE)
plot(LD1LD2, axes = FALSE, type = 'n', xlab = '', ylab = '', xlim = ld.axis, ylim = ld.axis)
axis(3, col = 2, tck = 0.025)
axis(4, col = 2, tck = 0.025)
arrows(0,0,LD1LD2[,1], LD1LD2[,2], length = 0.1, col = 2)
text(LD1LD2[,1], LD1LD2[,2], rownames(LD1LD2), pos = 3)
abline(h = 0, lty = 3)
abline(v = 0, lty = 3)
# Note the different scales in the axes
# Display only the loadings of the first 4 principal components,
# rounded to 2 decimals for clarity
first_four <- DEC.PCA$loadings[, 1:4]
print(round(first_four, 2))
##              Comp.1 Comp.2 Comp.3 Comp.4
## R100m          0.14   0.58   0.15   0.03
## Long_jump     -0.01   0.32  -0.65  -0.21
## Shot_put      -0.48   0.14   0.24   0.13
## High_jump     -0.30  -0.27  -0.27  -0.07
## R400m          0.40   0.29  -0.08   0.32
## Hurdles       -0.02   0.46  -0.19   0.07
## Discus_throw  -0.52   0.16   0.14   0.05
## Pole_vault     0.17   0.01   0.08  -0.86
## Javelin       -0.16  -0.17  -0.60   0.15
## R1500m         0.41  -0.35   0.00   0.25
c) Add one clear outlier into the data set. Use PCA and try to detect the outlier.
# Contaminate the data with a clear outlier observation
s <- 9
outlier_row <- c(rep(1200, s), rep(18000, 10 - s))
DEC[49, ] <- outlier_row
rownames(DEC)[49] <- "outlier"
# Visualize the contaminated data:
pairs(DEC)
# Fit a new PC transformation using the contaminated data
# (cor = TRUE spelled out -- T is an ordinary variable and can be reassigned)
DEC.PCA <- princomp(DEC, cor = TRUE)
# Remember, the first principal component displays the highest variance,
# therefore it should also 'detect' the outlier the best.
# Fixed: use TRUE/FALSE instead of the reassignable shorthands T/F.
PC1PC2 <- DEC.PCA$scores[,1:2]
LD1LD2 <- DEC.PCA$loadings[,1:2]
# Symmetric axis limits so the origin sits in the middle of the plot
pc.axis <- c(-max(abs(PC1PC2)),max(abs(PC1PC2)))
ld.axis <- c(-0.8,0.8)
plot(PC1PC2, xlim = pc.axis, ylim = pc.axis, pch = 21, bg = 8, cex = 1.25)
title(main = 'PC transformation with outlier observation', sub = '1st and 2nd principal component', adj = 0)
# Overlay a second coordinate system (loading scale) on the same device
par(new = TRUE)
plot(LD1LD2, axes = FALSE, type = 'n', xlab = '', ylab = '', xlim = ld.axis, ylim = ld.axis)
axis(3, col = 2, tck = 0.025)
axis(4, col = 2, tck = 0.025)
arrows(0,0,LD1LD2[,1], LD1LD2[,2], length = 0.1, col = 2)
text(LD1LD2[,1], LD1LD2[,2], rownames(LD1LD2), pos = 3)
abline(h = 0, lty = 3)
abline(v = 0, lty = 3)
# Note the different scales in the axes.
# Display only the loadings of the first 4 principal components,
# rounded to 2 decimals for clarity
first_four <- DEC.PCA$loadings[, 1:4]
print(round(first_four, 2))
##              Comp.1 Comp.2 Comp.3 Comp.4
## R100m          0.31   0.43   0.32   0.15
## Long_jump      0.33   0.13  -0.07  -0.30
## Shot_put       0.33  -0.35   0.29   0.36
## High_jump      0.28  -0.45  -0.27  -0.16
## R400m          0.31   0.42   0.05  -0.25
## Hurdles        0.32   0.19   0.31  -0.07
## Discus_throw   0.33  -0.37   0.24   0.32
## Pole_vault     0.23   0.27  -0.71   0.56
## Javelin        0.30  -0.23  -0.26  -0.49
## R1500m         0.39   0.01  -0.11  -0.02
# The first principal component is now dominated by the outlier; plot the
# 2nd and 3rd components to inspect the structure of the remaining observations.
# Fixed: use TRUE/FALSE instead of the reassignable shorthands T/F.
PC1PC2 <- DEC.PCA$scores[,c(2,3)]
LD1LD2 <- DEC.PCA$loadings[,c(2,3)]
# Symmetric axis limits so the origin sits in the middle of the plot
pc.axis <- c(-max(abs(PC1PC2)),max(abs(PC1PC2)))
ld.axis <- c(-0.8,0.8)
plot(PC1PC2, xlim = pc.axis, ylim = pc.axis, pch = 21, bg = 8, cex = 1.25)
title(main = 'PC transformation with outlier observation', sub = '2nd and 3rd principal component', adj = 0)
# Overlay a second coordinate system (loading scale) on the same device
par(new = TRUE)
plot(LD1LD2, axes = FALSE, type = 'n', xlab = '', ylab = '', xlim = ld.axis, ylim = ld.axis)
axis(3, col = 2, tck = 0.025)
axis(4, col = 2, tck = 0.025)
arrows(0,0,LD1LD2[,1], LD1LD2[,2], length = 0.1, col = 2)
text(LD1LD2[,1], LD1LD2[,2], rownames(LD1LD2), pos = 3)
abline(h = 0, lty = 3)
abline(v = 0, lty = 3)
# Note the different scales in the axes.
# Display only the loadings of the first 4 principal components,
# rounded to 2 decimals for clarity
first_four <- DEC.PCA$loadings[, 1:4]
print(round(first_four, 2))
##              Comp.1 Comp.2 Comp.3 Comp.4
## R100m          0.31   0.43   0.32   0.15
## Long_jump      0.33   0.13  -0.07  -0.30
## Shot_put       0.33  -0.35   0.29   0.36
## High_jump      0.28  -0.45  -0.27  -0.16
## R400m          0.31   0.42   0.05  -0.25
## Hurdles        0.32   0.19   0.31  -0.07
## Discus_throw   0.33  -0.37   0.24   0.32
## Pole_vault     0.23   0.27  -0.71   0.56
## Javelin        0.30  -0.23  -0.26  -0.49
## R1500m         0.39   0.01  -0.11  -0.02