library(MASS) #(a) data(iris) # View(iris) head(iris) help(iris) D <- iris[51:150,] cols <- rep(NA,100) cols[D$Species == "versicolor"] <- "blue" cols[D$Species == "virginica"] <- "red" pairs(D[,1:4],col=cols,pch=16) # You can similarly check that setosa is quite far from the other two in all variables class(D[,5]) #now the fifth column contains factors # note that sometimes you have to manually set one of the columns to factors help(lda) # This gives an error since the variable "remembers" that there is a factor level # called setosa. However, the error does not effect the results. D.lda <- lda(Species~. ,data=D) # You can remove the error by modifying the data set as follows D[,5] <- droplevels(D[,5]) #This drops the empty factor level D.lda <- lda(Species~. ,data=D) #used similarly as the function lm() # Species~. is equivalent to Species~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width names(D.lda) # The important object in D.lda: D.lda$prior # prior probabilities that you can manually assing (we do not use them here) D.lda$N # Total number of observations m <- D.lda$means # colMeans for versicolor and virginica a <- D.lda$scaling #the vector a # note that if $scaling contains more than one column, the vector a that corresponds to # Fisher's linear discriminant analysis is always the first column. # Verify that it is equal to the eigenvector of W^(-1)B S1 <- cov(D[1:50,1:4]) S2 <- cov(D[51:100,1:4]) d1 <- colMeans(D[1:50,1:4]) d2 <- colMeans(D[51:100,1:4]) d <- d1 - d2 d = as.matrix(d,ncol=1) B <- (50*50)/100 * d %*% t(d) W <- 49*(S1 + S2) L <- solve(W) %*% B a2 <- eigen(L)$vectors[,1] e <- eigen(L)$values[1] # Note that eigenvectors are not unique and norm(a,type="2") # not 1 norm(a2,type="2") # equal to 1 # Scale a to have length 1: a.scaled <- a/norm(a,type="2") a.scaled a2 as.numeric(a2) # Thus vector a is equal to the eigenvector corresponding # to the largest (only nonzero here) eigenvalue #(b) # new observation = (6, 3, 4, 1) newdat <- data.frame(Sepal.Length=6,Sepal.Width=3,Petal.Length=4, Petal.Width=1) predict(D.lda,newdata=newdat)$class # New flower classified as versicolor #(c) D.cv <- lda(Species~. ,data=D,CV=T) #Results for leave-one out cross-validation result <- data.frame(Est=D.cv$class,Truth=D[,5]) tab <- table(result) #truth table, here 2 versicolor are classified as virginica and # 1 virginica is classified as versicolor # Hereby, the missclassification rate is 3/100