# install.packages("MASS")
setwd("//home.org.aalto.fi/lietzen1/data/Desktop/Monim2016")
library(MASS)

#(a)

data(iris)
# View(iris)
head(iris)
help(iris)
help(lda)


D <- iris[51:150,]

cols <- rep(NA,100)

cols[D$Species == "versicolor"] <- "blue"
cols[D$Species == "virginica"] <- "red"
pairs(D[,1:4],col=cols,pch=16)

# You can similarly check that setosa is quite far from the other two in all variables


class(D[,5]) #now the fifth column contains factors
              # note that sometimes you have to manually set one of the columns to factors

help(lda)

# This gives an error since the variable "remembers" that there is a factor level
# called setosa. However, the error does not effect the results. 
D.lda <- lda(Species~. ,data=D) 

# You can remove the error by modifying the data set as follows
D[,5] <- droplevels(D[,5]) #This drops the empty factor level

D.lda <- lda(Species~. ,data=D) #used similarly as the function lm()
# Species~. is equivalent to Species~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width

names(D.lda)

# The important object in D.lda:
D.lda$prior # prior probabilities that you can manually assing (we do not use them here)
D.lda$N  # Total number of observations
m <- D.lda$means # colMeans for versicolor and virginica
a <- D.lda$scaling #the vector a
# note that if $scaling contains more than one column, the vector a that corresponds to 
# Fisher's linear discriminant analysis is always the first column. 

# Verify that it is equal to W^(-1)B

S1 <- cov(D[1:50,1:4])
S2 <- cov(D[51:100,1:4])
d1 <- colMeans(D[1:50,1:4])
d2 <- colMeans(D[51:100,1:4])

d <- d1 - d2
d = as.matrix(d,ncol=1)

B <- (50*50)/100 * d %*% t(d)
W <- 49*(S1 + S2)
L <- solve(W) %*% B
a2 <- eigen(L)$vectors[,1]

e <- eigen(L)$values[1]

# Note that eigenvectors are not unique and
norm(a,type="2") # not 1
norm(a2,type="2") # equal to 1

# Scale a to have length 1:
a.scaled <- a/norm(a,type="2")

a.scaled
a2
# Thus vector a is equal to the eigenvector corresponding 
# to the largest (only nonzero here) eigenvalue


#(b)

# new observation = (6, 3, 4, 1)

newdat <- data.frame(Sepal.Length=6,Sepal.Width=3,Petal.Length=4,
                     Petal.Width=1)

predict(D.lda,newdata=newdat)$class # New flower classified as versicolor

#(c)

D.cv <- lda(Species~. ,data=D,CV=T) #Results for leave-one out cross-validation

result <- data.frame(Est=D.cv$class,Truth=D[,5])

tab <- table(result) #truth table, here 2 versicolor are classified as virginica and
# 1 virginica is classified as versicolor
      
# Hereby, the missclassification rate is
3/100