# 1.1 Principal component analysis (PCA) of the decathlon data ----
#
# Reads the decathlon results, runs a covariance-based PCA with princomp()
# and walks through the returned object: eigenvalues, proportion of variance
# explained, interpretation of the loadings and properties of the scores.

# (a) Read the data and run the PCA ----

DATA <- read.table("data/decathlon.txt", header = TRUE, sep = "\t",
                   row.names = 1)

# Nice way to check that the upload was successful
head(DATA)
# View() opens a spreadsheet-style viewer, so only call it in an interactive
# session; in a batch run (e.g. Rscript) there is nothing to show it in.
if (interactive()) {
  View(DATA)
}

# Check the size of the data matrix
dim(DATA)
colnames(DATA)

# Remove total points, height and weight from the analysis
DEC <- DATA[, -c(1, 12, 13)]

help("princomp")

# Visualize the data
plot(DEC)
# Note that since DEC is of the type data.frame, R automatically uses the
# function pairs() to plot the variables.
# Use plot(as.matrix(DEC)) to see the difference.
# More on different data types in the course Introduction to R-programming.
pairs(DEC)
# The above is not very informative here, but it sometimes reveals if there
# is something weird in the data.

# A way to plot two specific variables with the names of the athletes:
# draw an empty frame first (type = "n"), then put the row names at the
# observation coordinates.
plot(DEC$R100m, DEC$R400m, xlab = "Running 100m", ylab = "Running 400m",
     type = "n")
text(DEC$R100m, DEC$R400m, labels = rownames(DEC))

# cor = FALSE: the PCA is based on the covariance matrix, not the
# correlation matrix.
DEC.PCA <- princomp(DEC, cor = FALSE)
names(DEC.PCA)

DEC.PCA$call     # Input of the function
DEC.PCA$scores   # Y from lecture slides
DEC.PCA$n.obs    # Number of observations
DEC.PCA$scale    # Relevant when cor = TRUE
DEC.PCA$center   # The sample mean
colMeans(DEC)    # Same as above

# Matrix of eigenvectors (G-matrix); the columns are the eigenvectors
DEC.PCA$loadings
DEC.PCA$loadings[1, 1]  # How to access a single value from G

DEC.PCA$sdev  # The standard deviations of the principal components
plot(DEC.PCA) # Plots the variances of the principal components
(DEC.PCA$sdev)^2

# Note that the variances of the principal components are equal to the
# eigenvalues of the covariance matrix of the original data matrix.
n <- nrow(DEC)
# Note that princomp() uses the maximum likelihood estimator of the
# covariance matrix (1/n divisor instead of 1/(n-1)), hence the rescaling.
DEC_cov <- (n - 1) / n * cov(DEC)

DEC_cov_eval <- eigen(DEC_cov)$values
DEC_cov_eval
(DEC.PCA$sdev)^2  # Same values as above
# The differences should be zero up to floating-point rounding.
DEC_cov_eval - (DEC.PCA$sdev)^2

# (b) Proportion of variation explained ----

summary(DEC.PCA)
# Approx 70% of the variation is explained with 4 principal components
sum(DEC_cov_eval[1:4]) / sum(DEC_cov_eval)

# (c) Interpretation of the principal components ----

# We mainly use the loadings to interpret the principal components.
DEC.PCA$loadings  # Note that values close to 0 are not visible here
# We choose the first 4 components by looking at the values of the loadings
# and the cumulative proportional variance.
DEC.PCA$loadings[, 1:4]

# Check what the sports are from Wikipedia if you are not familiar with them.

# 1st component: Strength
# High negative loadings with shot put and discus throw. Furthermore, a
# negative loading with high jump.
# High positive loading with running 1500m and a positive loading with
# running 400m.
# The high jump is a bit mysterious here. Maybe it is the jumping power?
# Interpretation: Strength (weakness). Here a large negative value means that
# the sport in question requires strength. However, athletes with a high body
# muscle mass are bad in "long" running distances, when compared to other
# decathletes.
# The first component explains particularly well the behaviour of the
# variables shot put, running 1500m and discus throw (it explains the
# variation of the variable).

# 2nd component: Speed
# High negative loading with running 100m and a negative loading with
# hurdles. A negative loading with running 400m.
# Positive loading with high jump, javelin and running 1500m.
# Interpretation: Speed (slowness). Running 100m, 110m hurdles and running
# 400m require speed from the athlete. However, running 1500m is more about
# stamina and not about top speed.
# Here javelin and high jump are a bit mysterious.
# The second component explains particularly well the behaviour of the
# variable running 100m.

# Note that the scales of strength and speed are reversed, see above.
plot(DEC.PCA$scores[, 1], DEC.PCA$scores[, 2], type = "n",
     xlab = "Strength", ylab = "Speed")
text(DEC.PCA$scores[, 1:2], labels = rownames(DEC))

# 3rd and 4th component: Technique 1 and Technique 2
# These components explain sports that require a special technique to
# perform well. Look at the loadings yourself and determine which sports are
# related to these components.
# You should note that from summary(DEC.PCA) we see that the 3rd and 4th
# components together explain less variation than the 1st component alone.
# Hence, they are not as important as the first two.

# NOTE: Often the best possible interpretations require the help of an
# expert related to the phenomenon at hand.

# (d) Properties of the scores ----

cov(DEC.PCA$scores)  # Diagonal, as expected
# The rescaled diagonal elements are equal to the variances of the principal
# components (cov() divides by n - 1, princomp() by n).
(n - 1) / n * diag(cov(DEC.PCA$scores))
colMeans(DEC.PCA$scores)  # Zero, as expected