library(ca)
library(psych)

data <- read.table("data/sciencedoctorates.txt", header=T, sep="\t", row.names=1)
# View(data)
dim(data)

# Again, remove the total row and column
SD <- data[-dim(data)[1], -dim(data)[2]]
dim(SD)

# To interpret a correspondence analysis, the first step is to evaluate whether
# there is a significant dependency between the rows and the columns.
n <- sum(SD)
v1 <- matrix(colSums(SD), nrow=1)
v2 <- matrix(rowSums(SD), ncol=1)

# theoretical frequencies under independence
E <- v2 %*% v1/n

I <- dim(SD)[1]
J <- dim(SD)[2]

# chi-squared statistic
s <- 0
for(i in 1:I){
  for(j in 1:J){
    s <- s + (SD[i,j] - E[i,j])^2/E[i,j]
  }
}
pchisq(s, df=(I-1)*(J-1), lower.tail=F)

# H0: Discipline and Year are independent
# H1: Discipline and Year are not independent
chisq.test(SD)
# There is evidence of a statistically significant association between
# discipline and year in the numbers of science doctorates awarded in the USA.

# ca function
help(ca)
SD.ca = ca(SD) # set nd=8 to make all columns visible in summary
names(SD.ca)
SD.ca$colnames
SD.ca$rownames
SD.ca$N
SD.ca$sv
# Singular values, i.e. the square roots of the eigenvalues of the PCA-type
# transformation for rows/cols (their squares give the variation explained
# by the principal components).
# For symmetric matrices, singular values = |eigenvalues|
# (sv's are reported here since the package uses svd instead of eigen).

########################
# Inertia
########################
# How much of the total "variation" a specific variable explains,
# i.e. how much it contributes to the chi-squared statistic.
# The total inertia is the chi-squared statistic divided by n.
# (A small sketch after the manual computation below verifies these values
# as sums of squared standardized residuals.)
SD.ca$rowinertia
SD.ca$colinertia

# You can get a single row-inertia value by fixing i (the row index).
s2 <- 0
# For physics, i = 3
i <- 3
for(j in 1:J){
  s2 <- s2 + (SD[i,j] - E[i,j])^2/E[i,j]
}
s2/n

# Note that
sum(SD.ca$rowinertia)
sum(SD.ca$colinertia)
# are the same as
sum(SD.ca$sv^2)
s/n

# These proportional values are the ones seen in summary(SD.ca)
SD.ca$rowinertia/sum(SD.ca$sv^2)
SD.ca$colinertia/(s/n)

###############################################
# ca manually
###############################################

# theoretical relative frequencies under independence
Ef <- E/n
# observed relative frequencies
SDf = SD/n

# the matrix Z of standardized residuals
Z = (SDf - Ef)/sqrt(Ef)
class(Z)
Z = as.matrix(Z)

# matrices V and W
V = t(Z) %*% Z
W = Z %*% t(Z)

# chi-squared test statistic (tr() is the trace from the psych package)
n*tr(V)

variances = eigen(V)$values
SD.ca$sv^2
components = eigen(V)$vectors

# V and W have the same non-zero eigenvalues
eigen(W)$values
components2 = eigen(W)$vectors

# forming the matrix R
f1 = v1/n # SD.ca$colmass
f2 = v2/n # SD.ca$rowmass
one = matrix(rep(1, nrow(SD)), ncol=1)
shifting = one %*% sqrt(f1)
scaling = f2 %*% sqrt(f1)
R = SDf/scaling - shifting

# row coordinates
rowcoord = as.matrix(R) %*% components
# omit the dimension corresponding to the zero eigenvalue
rowcoord = rowcoord[,-8]
summary(SD.ca)

# standardized row coordinates
stand = matrix(rep(1, nrow(SD)), ncol=1) %*% matrix(variances[-8], nrow=1)
standrowcoord = rowcoord/sqrt(stand)
SD.ca$rowcoord

# forming the matrix C
one2 = matrix(rep(1, ncol(SD)), nrow=1)
shifting2 = sqrt(f2) %*% one2
scaling2 = sqrt(f2) %*% f1
C = SDf/scaling2 - shifting2

# column coordinates
colcoord = t(C) %*% components2
# omit the dimensions corresponding to zero eigenvalues
colcoord = colcoord[,-c(8:ncol(colcoord))]
summary(SD.ca)

# standardized column coordinates
stand2 = matrix(rep(1, ncol(SD)), ncol=1) %*% matrix(variances[-8], nrow=1)
standcolcoord = colcoord/sqrt(stand2)
SD.ca$colcoord
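# Quick identity check (a minimal sketch, using the Z defined above): the row
# and column inertias from the Inertia section are just the row/column sums of
# the squared standardized residuals, and the total is the chi-squared / n.
rowSums(Z^2) # compare with SD.ca$rowinertia
colSums(Z^2) # compare with SD.ca$colinertia
sum(Z^2)     # compare with s/n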
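# Cross-check (a minimal sketch, using only objects defined above): a single
# svd() call on Z reproduces the same quantities, which is essentially what
# the ca package does internally. Singular vectors are only unique up to sign,
# so individual columns may be flipped relative to SD.ca; the comparisons
# below therefore use absolute values.
Z.svd <- svd(Z)
Z.svd$d # compare with SD.ca$sv
# principal row coordinates: divide each row of U %*% D by sqrt(row mass)
princ.row <- diag(1/sqrt(as.vector(f2))) %*% Z.svd$u %*% diag(Z.svd$d)
round(abs(princ.row[,1:2]) - abs(rowcoord[,1:2]), 10)
# standard column coordinates: divide each row of the right singular
# vectors by sqrt(column mass)
stand.col <- diag(1/sqrt(as.vector(f1))) %*% Z.svd$v
round(abs(stand.col[,1:2]) - abs(SD.ca$colcoord[,1:2]), 10)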
## Chi-squared distances from the "center" (the average profile); variables
## close to the center do not deviate much from the independence assumption.
# row distances
sqrt(rowSums(rowcoord^2))
SD.ca$rowdist
# column distances
sqrt(rowSums(colcoord^2))
SD.ca$coldist

####################################################
SD.ca
summary(SD.ca)
# Note that the quantities are multiplied by 1000.
# Quality of representation = as in the lecture slides, but here we consider
# the angle between the profiles and the plane spanned by the first two
# principal components.
# Squared correlations (cor) = quality of representation with respect to a
# single axis, as in the lecture slides; the sum of the squared correlations
# over the displayed axes is the quality of representation (qlt).
# ctr = contribution to forming that CA component (contributions sum to 1);
# variables important in forming a specific component have a high ctr.
# k=1 and k=2 are the coordinates on the plot.
# (A small sketch at the end of the script reproduces the qlt, cor and ctr
# columns by hand.)
names(summary(SD.ca)$rows)

# Contribution of engineering to the second axis
f2[1]*SD.ca$rowcoord[1,2]^2 # recall that these coordinates are already scaled
# If the rows and columns were independent, ctr would be the same for every variable

# Squared correlation of biology with the second component
d2 = rowcoord[6,]^2
d2[2]/sum(d2)

# Note that the following plots principal (unstandardized) coordinates for
# both rows and columns, and hence conclusions based on distances between
# row and column points are questionable.
plot(SD.ca, arrows=c(T,T), map="symmetric")
#plot(SD.ca, arrows=c(T,T), map="symmetric", dim=c(2,4))

# Instead, try e.g. the following commands
plot(SD.ca, arrows=c(T,T), map="rowprincipal") # standard column coordinates
plot(SD.ca, arrows=c(T,T), map="rowgreen") # standard column coordinates scaled by the square roots of the column masses
plot(SD.ca, arrows=c(T,T), map="colgab", dim=c(3,4)) # standard row coordinates scaled by the row masses
plot(SD.ca, arrows=c(T,T), map="rowgab") # standard column coordinates scaled by the column masses

# If two row variables are close in the picture, they have similar profiles;
# the same holds for column variables.
# Distant row/column variables have different profiles.
# Variables far from the origin differ from the average profile;
# these are usually the most interesting ones.

# Now you can again try to interpret the dimensions:
# the 1st dimension splits the sciences into soft/hard;
# the 2nd dimension splits the sciences into the more formula-heavy ones
# (mathematics, physics, engineering) vs. the more experimental ones
# (chemistry, agriculture, earth sciences);
# similarly for the different years.

# You can also try this (interactive 3-D plot; uses the rgl package):
plot3d.ca(SD.ca)
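# A small cross-check sketch (using only objects defined above) that reproduces
# the qlt, cor and ctr columns of summary(SD.ca) for the rows, to make the
# formulas explicit. summary() reports the same numbers multiplied by 1000.
d2.all <- rowcoord^2                            # squared principal coordinates
qlt    <- rowSums(d2.all[,1:2])/rowSums(d2.all) # quality in the 2-d solution
cor.k1 <- d2.all[,1]/rowSums(d2.all)            # squared correlation with axis 1
ctr.k1 <- as.vector(f2)*d2.all[,1]/variances[1] # contribution to axis 1
sum(ctr.k1)                                     # contributions sum to 1
round(1000*cbind(qlt, cor.k1, ctr.k1))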