Problem 1: Principal Component Analysis

Upload the file DECATHLON.txt into your R-workspace. The file contains the results of 48 decathletes from 1973. Familiarize yourself with the data and perform the CORRELATION matrix based PCA transformation. Conduct the analysis without the variables: points, height and weight.

Try visualizing the correlation structure first.

# Read the data; the first column of the file holds the athlete names,
# which become the row names of the data frame.
DATA <- read.table("DECATHLON.txt", header = TRUE, sep = "\t", row.names = 1)
# Drop the variables not used in the analysis: points (col 1),
# height (col 12) and weight (col 13).
DEC <- DATA[, -c(1, 12, 13)]

# Correlation matrix can be nicely visualized with a heatmap:
# With corrgram package:
# install.packages('corrgram')
# library(corrgram)
corrgram(DEC)

# Correlations of every event with the 7th variable (Discus_throw):
cor(DEC)[, 7]
##        R100m    Long_jump     Shot_put    High_jump        R400m 
##   0.01434309   0.02087518   0.72732985   0.21699480  -0.34464723 
##      Hurdles Discus_throw   Pole_vault      Javelin       R1500m 
##   0.04770710   1.00000000  -0.18180013   0.13556064  -0.57350141
# Alternatively with base package:
# (use TRUE, not the reassignable shorthand T)
heatmap(cor(DEC), Rowv = NA, Colv = NA, symm = TRUE,
        col = colorRampPalette(c('red', 'white', 'blue'))(50))

# Correlation matrix of the first five covariates:
cor(DEC[, 1:5])
##                R100m    Long_jump    Shot_put    High_jump      R400m
## R100m      1.0000000  0.171985363 -0.02795230 -0.411699539  0.4560816
## Long_jump  0.1719854  1.000000000 -0.03439298 -0.003324598  0.1334633
## Shot_put  -0.0279523 -0.034392979  1.00000000  0.162541961 -0.3037074
## High_jump -0.4116995 -0.003324598  0.16254196  1.000000000 -0.3388272
## R400m      0.4560816  0.133463323 -0.30370738 -0.338827204  1.0000000
# Further tuning of the visualization with corrgram package:
corrgram(DEC, panel = panel.pie, diag.panel = panel.minmax,
         col.regions = colorRampPalette(c("blue4", "blue3", "blue2", "blue1",
                                          "blue", "red", "red1", "red2",
                                          "red3", "red4")))


Now, apply the PC transformation using the function princomp(), and set the argument cor = TRUE to obtain the correlation matrix based PC transformation:

a) How much of the variation of the original data is explained by k principal components, where k = 1,2,…,10.

# Correlation matrix based PC transformation: cor = TRUE makes princomp()
# work on standardized variables (the correlation matrix).
DEC.PCA <- princomp(DEC, cor = TRUE)
summary(DEC.PCA)
## Importance of components:
##                           Comp.1    Comp.2   Comp.3    Comp.4     Comp.5
## Standard deviation     1.6130891 1.4169733 1.098463 1.0329820 0.96516226
## Proportion of Variance 0.2602056 0.2007813 0.120662 0.1067052 0.09315382
## Cumulative Proportion  0.2602056 0.4609870 0.581649 0.6883542 0.78150800
##                            Comp.6     Comp.7     Comp.8     Comp.9
## Standard deviation     0.77116160 0.75437360 0.73395022 0.49482809
## Proportion of Variance 0.05946902 0.05690795 0.05386829 0.02448548
## Cumulative Proportion  0.84097702 0.89788497 0.95175326 0.97623875
##                           Comp.10
## Standard deviation     0.48745515
## Proportion of Variance 0.02376125
## Cumulative Proportion  1.00000000
# Approx 69% of the variation explained by the first 4 principal components
# One (more) way of visualizing this (similar plot we did in the week 2 exercises)
# Proportion of total variance carried by each component
# (squared standard deviations = eigenvalues, normalized to sum to 1).
prop.expl <- DEC.PCA$sdev^2 / sum(DEC.PCA$sdev^2)
plot(cumsum(prop.expl), type = 'b', pch = 21, lty = 3, bg = 2, cex = 1.5,
     ylim = c(0, 1), xlab = 'Principal component',
     ylab = 'Cumulative proportion of variance explained',
     xaxt = 'n', yaxt = 'n')
axis(1, at = 1:10, tck = 0.025)
axis(2, at = 0:10 / 10, tck = 0.025, las = 2)
# Reference line: cumulative proportion if every component explained
# an equal share (1/10) of the variance.
abline(0, 1 / 10, lty = 3)


b) Choose a sufficient number of principal components and try to interpret them. Visualize the scores of the observations with respect to the first two principal components.

# PC visualization (Similar plot than biplot.princomp would achieve)
# Scores of the observations and loadings of the variables on PC1/PC2.
PC1PC2 <- DEC.PCA$scores[, 1:2]
LD1LD2 <- DEC.PCA$loadings[, 1:2]
# Symmetric axis limits so the origin sits in the middle of the plot.
pc.axis <- c(-max(abs(PC1PC2)), max(abs(PC1PC2)))
ld.axis <- c(-0.8, 0.8)

plot(PC1PC2, xlim = pc.axis, ylim = pc.axis, pch = 21, bg = 8, cex = 1.25)
title(main = 'Correlation matrix based PC transformation',
      sub = '1st and 2nd principal component', adj = 0)
# Overlay a second coordinate system (red axes on top/right) for the
# loading vectors. Use TRUE/FALSE, not the reassignable shorthands T/F.
par(new = TRUE)
plot(LD1LD2, axes = FALSE, type = 'n', xlab = '', ylab = '',
     xlim = ld.axis, ylim = ld.axis)
axis(3, col = 2, tck = 0.025)
axis(4, col = 2, tck = 0.025)
arrows(0, 0, LD1LD2[, 1], LD1LD2[, 2], length = 0.1, col = 2)
text(LD1LD2[, 1], LD1LD2[, 2], rownames(LD1LD2), pos = 3)
abline(h = 0, lty = 3)
abline(v = 0, lty = 3)

# Note the different scales in the axes

# Let's only display the loadings of the first 4 principal components for clarity. Moreover, use rounding to increase clarity.
print(round(DEC.PCA$loadings[, 1:4], 2))
##              Comp.1 Comp.2 Comp.3 Comp.4
## R100m          0.14   0.58   0.15   0.03
## Long_jump     -0.01   0.32  -0.65  -0.21
## Shot_put      -0.48   0.14   0.24   0.13
## High_jump     -0.30  -0.27  -0.27  -0.07
## R400m          0.40   0.29  -0.08   0.32
## Hurdles       -0.02   0.46  -0.19   0.07
## Discus_throw  -0.52   0.16   0.14   0.05
## Pole_vault     0.17   0.01   0.08  -0.86
## Javelin       -0.16  -0.17  -0.60   0.15
## R1500m         0.41  -0.35   0.00   0.25

c) Add one clear outlier into the data set. Use PCA and try to detect the outlier.

# Contaminate your data with a clear outlier:
# the new row has absurd values (1200 in the first s events, 18000 in
# the rest), far outside the range of any real decathlon result.
s <- 9
# Append after the last existing row (row 49 for the original 48 athletes)
# instead of hard-coding the index, so this also works for other data sizes.
out.row <- nrow(DEC) + 1
DEC[out.row, ] <- c(rep(1200, s), rep(18000, 10 - s))

rownames(DEC)[out.row] <- "outlier"

# Visualize the contaminated data:
pairs(DEC)

# Fit new PC transformation using the contaminated data:
DEC.PCA <- princomp(DEC, cor = TRUE)
# Remember, the first principal component displays highest variance, therefore should also 'detect' the outlier the best:
PC1PC2 <- DEC.PCA$scores[, 1:2]
LD1LD2 <- DEC.PCA$loadings[, 1:2]
# Symmetric axis limits so the origin sits in the middle of the plot.
pc.axis <- c(-max(abs(PC1PC2)), max(abs(PC1PC2)))
ld.axis <- c(-0.8, 0.8)

plot(PC1PC2, xlim = pc.axis, ylim = pc.axis, pch = 21, bg = 8, cex = 1.25)
title(main = 'PC transformation with outlier observation',
      sub = '1st and 2nd principal component', adj = 0)
# Overlay a second coordinate system (red axes) for the loading vectors.
par(new = TRUE)
plot(LD1LD2, axes = FALSE, type = 'n', xlab = '', ylab = '',
     xlim = ld.axis, ylim = ld.axis)
axis(3, col = 2, tck = 0.025)
axis(4, col = 2, tck = 0.025)
arrows(0, 0, LD1LD2[, 1], LD1LD2[, 2], length = 0.1, col = 2)
text(LD1LD2[, 1], LD1LD2[, 2], rownames(LD1LD2), pos = 3)
abline(h = 0, lty = 3)
abline(v = 0, lty = 3)

# Note the different scales in the axes

# Let's only display the loadings of the first 4 principal components for clarity. Moreover, use rounding to increase clarity.
print(round(DEC.PCA$loadings[, 1:4], 2))
##              Comp.1 Comp.2 Comp.3 Comp.4
## R100m          0.31   0.43   0.32   0.15
## Long_jump      0.33   0.13  -0.07  -0.30
## Shot_put       0.33  -0.35   0.29   0.36
## High_jump      0.28  -0.45  -0.27  -0.16
## R400m          0.31   0.42   0.05  -0.25
## Hurdles        0.32   0.19   0.31  -0.07
## Discus_throw   0.33  -0.37   0.24   0.32
## Pole_vault     0.23   0.27  -0.71   0.56
## Javelin        0.30  -0.23  -0.26  -0.49
## R1500m         0.39   0.01  -0.11  -0.02
# The outlier dominates the first principal component; to see the structure
# among the remaining (regular) observations, look at the 2nd and 3rd
# principal components instead:
PC1PC2 <- DEC.PCA$scores[, c(2, 3)]
LD1LD2 <- DEC.PCA$loadings[, c(2, 3)]
# Symmetric axis limits so the origin sits in the middle of the plot.
pc.axis <- c(-max(abs(PC1PC2)), max(abs(PC1PC2)))
ld.axis <- c(-0.8, 0.8)

plot(PC1PC2, xlim = pc.axis, ylim = pc.axis, pch = 21, bg = 8, cex = 1.25)
title(main = 'PC transformation with outlier observation',
      sub = '2nd and 3rd principal component', adj = 0)
# Overlay a second coordinate system (red axes) for the loading vectors.
par(new = TRUE)
plot(LD1LD2, axes = FALSE, type = 'n', xlab = '', ylab = '',
     xlim = ld.axis, ylim = ld.axis)
axis(3, col = 2, tck = 0.025)
axis(4, col = 2, tck = 0.025)
arrows(0, 0, LD1LD2[, 1], LD1LD2[, 2], length = 0.1, col = 2)
text(LD1LD2[, 1], LD1LD2[, 2], rownames(LD1LD2), pos = 3)
abline(h = 0, lty = 3)
abline(v = 0, lty = 3)

# Note the different scales in the axes

# Let's only display the loadings of the first 4 principal components for clarity. Moreover, use rounding to increase clarity.
print(round(DEC.PCA$loadings[, 1:4], 2))
##              Comp.1 Comp.2 Comp.3 Comp.4
## R100m          0.31   0.43   0.32   0.15
## Long_jump      0.33   0.13  -0.07  -0.30
## Shot_put       0.33  -0.35   0.29   0.36
## High_jump      0.28  -0.45  -0.27  -0.16
## R400m          0.31   0.42   0.05  -0.25
## Hurdles        0.32   0.19   0.31  -0.07
## Discus_throw   0.33  -0.37   0.24   0.32
## Pole_vault     0.23   0.27  -0.71   0.56
## Javelin        0.30  -0.23  -0.26  -0.49
## R1500m         0.39   0.01  -0.11  -0.02