# install.packages(c("MASS","rrcov"))
library(MASS)
library(rrcov)

# (a), (b)
D <- read.table("data/wood.txt", header=TRUE, sep="\t")
dim(D)
plot(D)
# Again, you might see some outliers.
# Note that when data is truly high dimensional,
# you cannot easily detect outliers visually.

# (c)
# Classical unbiased covariance
scov <- cov(D)
scov
# MCD covariance
# Try different values of alpha
# (previously there was a bug with alpha=1; it seems to be fixed now, 2017)
rcov <- CovMcd(D, alpha=1/2)
# Note that the above is an S4 class object.
# Most of the objects we have in this course are of class S3.
# We will not discuss the differences between S3 and S4 objects in this course.
# The difference relevant to this course is that
# to access slots of an S4 object you use @, not $:
rcov@cov
# The S3 alternative for CovMcd() is covMcd():
# covMcd(D, alpha=1/2)$cov
# However, since some of the later functions in these exercises do not have
# S3 alternatives, we use the CovMcd() function here.

# (d)
## Using the sample covariance
# Note that the next command returns squared Mahalanobis distances
s.maha <- mahalanobis(D, center=colMeans(D), cov=scov)
# If the original data is normally distributed,
# the squared Mahalanobis distance follows a chi^2 distribution with
# p degrees of freedom (p=5, since 5 variables)
#qqplot(qchisq(ppoints(20), df=5), s.maha, main="Using sample covariance matrix")
#qqline(s.maha, distribution = function(p) qchisq(p, df=5))
# Using the robust (MCD) estimates
r.maha <- mahalanobis(D, center=rcov@center, cov=rcov@cov)
#qqplot(qchisq(ppoints(20), df=5), r.maha, main="Using MCD")
#qqline(r.maha, distribution = function(p) qchisq(p, df=5), probs=c(0.075,0.875))
plot(c(1, nrow(D)), range(sqrt(c(s.maha, r.maha))), type="n",
     xlab="Observation", ylab="Mahalanobis distance")
points(1:nrow(D), sqrt(s.maha), col="blue")
points(1:nrow(D), sqrt(r.maha), col="red", pch=16)
legend("topleft", col=c("blue","red"), cex=0.8, legend=c("Classical","MCD"),
       pch=c(1,16))
# Clearly, when using MCD we see some potential outliers;
# when using the classical covariance the outliers are not visible.

# (e)
apply(D, 2, range)
# The scales of the variables seem to be different.
# Also, we do not know whether the variables are in the same units.
# Therefore, we use correlation-based PCA here.
# Correlation-based PCA
PCA1 <- princomp(D, cor=TRUE)
PCA1$loadings
# MCD-based PCA
PCA2 <- PcaCov(D, scale=TRUE, cov.control=CovControlMcd(alpha=1/2))
PCA2@loadings
# see help(PcaCov)
# You can verify that PCA1 and PCA2 are equal if alpha=1.
# scale=TRUE in PcaCov() means that we perform correlation-based PCA.
# You can see the components inside an S3 object with names():
names(PCA1)
# Likewise, for S4 objects use slotNames():
slotNames(PCA2)
# summary() and plotting work for both types
summary(PCA1)
summary(PCA2)
# From Exercise 3, recall what happens to regular PCA
# when outliers are present.
par(mfrow=c(1,2))
plot(PCA1)
screeplot(PCA2)
dev.off()
PCA1$loadings
PCA2@loadings
# Completely different loadings
# --> different interpretations for the principal components
pairs(PCA1$scores)
pairs(PCA2@scores)

#### 2 ####
set.seed(123)
n <- 200
library(mvtnorm)
# Bivariate standard normal data
D1 <- rmvnorm(n, mean=c(0,0), sigma=diag(2))
plot(D1)
# Heavy-tailed bivariate t data
D2 <- rmvt(n, df=5, sigma=diag(2))
plot(D2)
# Skewed data: independent Weibull and gamma marginals
x1 <- rweibull(n, shape=1, scale=2)
x2 <- rgamma(n, shape=2, scale=1)
D3 <- cbind(x1, x2)
plot(D3)
# Classical and MCD covariance estimates for each data set
cov1 <- cov(D1)
rcov1 <- CovMcd(D1, alpha=1/2)@cov
cov2 <- cov(D2)
rcov2 <- CovMcd(D2, alpha=1/2)@cov
cov3 <- cov(D3)
rcov3 <- CovMcd(D3, alpha=1/2)@cov
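
# Not part of the original solution: a minimal sketch of how one might compare
# the classical and MCD estimates computed above (it only assumes the objects
# cov1..cov3 and rcov1..rcov3 from the previous lines).
# For the clean normal data (D1) the two estimates should be close;
# for the heavy-tailed (D2) and skewed (D3) data they can differ noticeably.
cov1; rcov1
cov2; rcov2
cov3; rcov3
# Frobenius norms of the differences as a rough one-number summary
norm(cov1 - rcov1, type="F")
norm(cov2 - rcov2, type="F")
norm(cov3 - rcov3, type="F")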