# We will use the MASS library
library(MASS)

# First, we read the data
poll <- read.table("data/polls.txt", header=T, sep="\t", row.names=1)
View(poll)

# Create labels for each region
label <- c("UM","VS","KH","PiM","PH","KL","PS","PK","PoM","K","L")

# a) Scatter plot the variables. Can you spot the different clusters?
plot(poll, panel = function(x,y) {text(x, y, labels=label, xpd=T)})

# b) Calculate the Euclidean distances between the regions.
poll.dist <- dist(poll, method="euclidean")
rounded <- round(poll.dist, 1)
View(as.matrix(rounded))

# c) Perform the "bottom-up" hierarchical clustering by hand.
#    Aggregate two clusters using the minimum distance (single linkage).
min(poll.dist)
sort(round(poll.dist, 1))
# Merge sequence (clusters after each step, with the merge height):
# (Paijat-Hame, Kymenlaakso) 3.6
# (Kanta-Hame, Paijat-Hame, Kymenlaakso) 4.8
# (Kanta-Hame, Paijat-Hame, Kymenlaakso), (Varsinais-Suomi, Pirkanmaa) 7.1
# (Kanta-Hame, Paijat-Hame, Kymenlaakso, Varsinais-Suomi, Pirkanmaa) 7.7
# (Kanta-Hame, Paijat-Hame, Kymenlaakso, Varsinais-Suomi, Pirkanmaa), (Lappi, Kainuu) 7.8
# (Kanta-Hame, Paijat-Hame, Kymenlaakso, Varsinais-Suomi, Pirkanmaa), (Lappi, Kainuu), (Pohjois-Savo, Pohjois-Karjala) 9.0
# (Kanta-Hame, Paijat-Hame, Kymenlaakso, Varsinais-Suomi, Pirkanmaa, Uusimaa), (Lappi, Kainuu), (Pohjois-Savo, Pohjois-Karjala) 12.3
# (Kanta-Hame, Paijat-Hame, Kymenlaakso, Varsinais-Suomi, Pirkanmaa, Uusimaa), (Lappi, Kainuu, Pohjois-Savo, Pohjois-Karjala) 12.8
# (Kanta-Hame, Paijat-Hame, Kymenlaakso, Varsinais-Suomi, Pirkanmaa, Uusimaa, Lappi, Kainuu, Pohjois-Savo, Pohjois-Karjala) 16.6
# (Kanta-Hame, Paijat-Hame, Kymenlaakso, Varsinais-Suomi, Pirkanmaa, Uusimaa, Lappi, Kainuu, Pohjois-Savo, Pohjois-Karjala, Pohjanmaa) 47.1

# d) Repeat (c) using the function hclust().
# Simply call the hclust function with single linkage:
poll.min <- hclust(poll.dist, method="single")

# e) Plot the classification tree (dendrogram).
# This can be done with the plot function:
plot(poll.min, main="Single")

# f) Repeat the steps by aggregating the clusters using the average
#    (average linkage) and the maximum (complete linkage) distance.
#    Compare the results.
# First, the maximum (complete) linkage
poll.max <- hclust(poll.dist, method="complete")
plot(poll.max, main="Complete")
# Second, the average linkage
poll.ave <- hclust(poll.dist, method="average")
plot(poll.ave, main="Average")

# g) Where would you cut the tree?
# Here is some code that can be used to examine the clusters produced by different cuts:
Minp <- cutree(poll.min, k=5)
Minp
Maxp <- cutree(poll.max, k=2)
Maxp
Avep <- cutree(poll.ave, k=4)
Avep

#### 2 ####
library(cluster)
BANK <- read.table("data/bank.txt", header=T, sep="\t")
n <- nrow(BANK)

# Color the observations by their known class (CODE)
cols <- rep(NA, n)
cols[BANK$CODE == 0] <- "blue"
cols[BANK$CODE == 1] <- "red"
plot(BANK[,-1], col=cols, pch=16)
plot(BANK[,-1], type='n')
text(BANK[,-1], label=1:n)

set.seed(100) # seed 500 yields nicely differing results for 3 centers
k.mean <- kmeans(BANK[,-1], centers=2)
table(k.mean$cluster, BANK[,1])
# Zero misclassified observations in the 1st category,
# 95 correctly classified in the 2nd category.
clusplot(BANK[,-1], k.mean$cluster, color=T, shade=T)
plot(BANK[,-1], col=k.mean$cluster, pch=16)
# Perhaps just using the standard plot() is simpler.

# With 3 centers
k.mean3 <- kmeans(BANK[,-1], centers=3)
table(k.mean3$cluster)
# A simple table of the number of elements in each cluster shows the difference.
clusplot(BANK[,-1], k.mean3$cluster, color=T, shade=TRUE)
plot(BANK[,-1], col=k.mean3$cluster, pch=16)
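
# To help decide between 2 and 3 centers, one common heuristic is the
# "elbow" plot: total within-cluster sum of squares against the number of
# clusters k. A minimal sketch using only base R; the range k = 1..6 and
# nstart = 25 are illustrative choices, not part of the exercise:
wss <- sapply(1:6, function(k) kmeans(BANK[,-1], centers=k, nstart=25)$tot.withinss)
plot(1:6, wss, type="b", xlab="Number of clusters k",
     ylab="Total within-cluster SS")
# A clear bend ("elbow") in the curve suggests a reasonable number of clusters.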
# If you want to manually set the colors for plot(), you have to specify
# a vector of colors. For example:
colvec <- c("blue", "green", "magenta")
cols <- rep(NA, nrow(BANK))
for(i in 1:length(colvec)){
  cols[k.mean3$cluster == i] <- colvec[i]
}
plot(BANK[,-1], col=cols, pch=16)

# The seed does not seem to affect the results when we have 2 clusters.
# However, it does have an effect for 3 clusters.
# If there is a problem with clusplot(), installing the package fpc might help:
# library(fpc)
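
# Because kmeans() starts from random centers, the 3-cluster solution can
# depend on the seed (as noted above). A sketch of one way to make it more
# stable: use the nstart argument to run several random initializations and
# keep the best one. The name k.mean3.stable and nstart = 25 are
# illustrative choices.
set.seed(100)
k.mean3.stable <- kmeans(BANK[,-1], centers=3, nstart=25)
table(k.mean3.stable$cluster)
# kmeans() returns the run with the lowest total within-cluster sum of
# squares, so with enough restarts different seeds should converge to
# (nearly) the same partition.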