#1.1
#a
setwd("C:/Users/Omistaja/Documents/R")
DATA <- read.table("DECATHLON.txt",header=TRUE,sep="\t",row.names=1)
# Nice way to check that the upload was succesful
head(DATA)
View(DATA)
# Check the size of the data matrix
dim(DATA)
colnames(DATA)
# Remove total points, height and weight from the analysis
DEC <- DATA[,-c(1,12,13)]
help("princomp")
# Visualize the data
plot(DEC) # Note that since DEC is of the type data.frame R automatically uses the function pairs to plot the variables
plot(as.matrix(DEC))# see the difference
# More related to different data types in the course Introduction to R-programming
pairs(DEC)
# A way to plot two specific variables with the names of the athletes:
plot(DEC$R100m,DEC$R400m,xlab="Running 100m",ylab="Running 400m",type="n")
text(DEC$R100m,DEC$R400m,labels=rownames(DEC))
DEC.PCA <- princomp(DEC,cor=FALSE)
DEC.PCA$n.obs # number of observations
names(DEC.PCA)
DEC.PCA$call # input of the function
DEC.PCA$scores # Y from lecture slides
DEC.PCA$scale # Relevant when cor=TRUE
DEC.PCA$center # The sample mean
colMeans(DEC) # Same as above
DEC.PCA$loadings #matrix of eigenvectors (G-matrix) (columns are eigenvectors)
DEC.PCA$loadings[1,1] #How to access a single value from G
DEC.PCA$sdev # The standard deviation of the principal components
plot(DEC.PCA) # Plots the variances of the principal components
(DEC.PCA$sdev)^2
# Note that the variances of the principal components are equal to the eigenvalues
# of the covariance matrix of the original data matrix
n <- nrow(DEC)
DEC_cov <- (n-1)/n*cov(DEC)
# Note that the princomp package uses the maximum likelihood estimator
# of the covariance matrix (1/n divisor instead of 1/(n-1))
DEC_cov_eval <- eigen(DEC_cov)$values
(DEC.PCA$sdev)^2 #Same values as above
# DEC_cov_eval -(DEC.PCA$sdev)^2
#b
summary(DEC.PCA)
sum(DEC_cov_eval[1:4])/sum(DEC_cov_eval)
# Approx 70% of the variation explained with 4 principal components
#c
# We mainly use the loadings to interpret the principal components
DEC.PCA$loadings #Note that values close to 0 are not visible here
#We choose the first 4 components by looking at the values of loadings and the cumulative proportional variance
# Check what the sports are from wikipedia if you are not familiar with them.
# PC visualization (Similar plot than biplot.princomp would achieve)
PC1PC2 <- DEC.PCA$scores[,1:2]
LD1LD2 <- DEC.PCA$loadings[,1:2]
pc.axis <- c(-max(abs(PC1PC2)),max(abs(PC1PC2)))
ld.axis <- c(-0.8,0.8)
plot(PC1PC2, xlim = pc.axis, ylim = pc.axis, pch = 21, bg = 8, cex = 1.25)
text(PC1PC2[,1], PC1PC2[,2],labels=rownames(DEC))
par(new = T)
plot(LD1LD2, axes = F, type = 'n', xlab = '', ylab = '', xlim = ld.axis, ylim = ld.axis)
axis(3, col = 2, tck = 0.025)
axis(4, col = 2, tck = 0.025)
arrows(0,0,LD1LD2[,1], LD1LD2[,2], length = 0.1, col = 2)
text(LD1LD2[,1], LD1LD2[,2], rownames(LD1LD2), pos = 3)
abline(h = 0, lty = 3)
abline(v = 0, lty = 3)
# 3rd and 4th component: Technique 1 and Technique 2
# These components explain sports that require a special technique to perform well.
# Look the loadings yourself and determine which sports are related to these components.
# NOTE: Often the best possible interpretations require the help of an expert related to the
# phenomenon at hand.
# d
cov(DEC.PCA$scores) # Diagonal, as expected
(n-1)/n*diag(cov(DEC.PCA$scores)) # The diagonal elements are equal to the variances of the principal components
colMeans(DEC.PCA$scores) # Zero as expected