library(MASS) #(a) data(iris) # View(iris) head(iris) help(iris) D <- iris[51:150,] cols <- rep(NA,100) cols[D\$Species == "versicolor"] <- "blue" cols[D\$Species == "virginica"] <- "red" pairs(D[,1:4],col=cols,pch=16) # You can similarly check that setosa is quite far from the other two in all variables class(D[,5]) #now the fifth column contains factors # note that sometimes you have to manually set one of the columns to factors help(lda) # This gives an error since the variable "remembers" that there is a factor level # called setosa. However, the error does not effect the results. D.lda <- lda(Species~. ,data=D) # You can remove the error by modifying the data set as follows D[,5] <- droplevels(D[,5]) #This drops the empty factor level D.lda <- lda(Species~. ,data=D) #used similarly as the function lm() # Species~. is equivalent to Species~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width names(D.lda) # The important object in D.lda: D.lda\$prior # prior probabilities that you can manually assing (we do not use them here) D.lda\$N # Total number of observations m <- D.lda\$means # colMeans for versicolor and virginica a <- D.lda\$scaling #the vector a # note that if \$scaling contains more than one column, the vector a that corresponds to # Fisher's linear discriminant analysis is always the first column. # Verify that it is equal to the eigenvector of W^(-1)B S1 <- cov(D[1:50,1:4]) S2 <- cov(D[51:100,1:4]) d1 <- colMeans(D[1:50,1:4]) d2 <- colMeans(D[51:100,1:4]) d <- d1 - d2 d = as.matrix(d,ncol=1) B <- (50*50)/100 * d %*% t(d) W <- 49*(S1 + S2) L <- solve(W) %*% B a2 <- eigen(L)\$vectors[,1] e <- eigen(L)\$values[1] # Note that eigenvectors are not unique and norm(a,type="2") # not 1 norm(a2,type="2") # equal to 1 # Scale a to have length 1: a.scaled <- a/norm(a,type="2") a.scaled a2 as.numeric(a2) # Thus vector a is equal to the eigenvector corresponding # to the largest (only nonzero here) eigenvalue #(b) # new observation = (6, 3, 4, 1) newdat <- data.frame(Sepal.Length=6,Sepal.Width=3,Petal.Length=4, Petal.Width=1) predict(D.lda,newdata=newdat)\$class # New flower classified as versicolor #(c) D.cv <- lda(Species~. ,data=D,CV=T) #Results for leave-one out cross-validation result <- data.frame(Est=D.cv\$class,Truth=D[,5]) tab <- table(result) #truth table, here 2 versicolor are classified as virginica and # 1 virginica is classified as versicolor # Hereby, the missclassification rate is 3/100