library(ca)
library(psych)
“Correspondence analysis is a method of data analysis for representing tabular data graphically. Correspondence analysis is a generalization of a simple graphical concept with which we are all familiar, namely the scatterplot. -Michael Greenacre, Correspondence Analysis in Practice”

Install the package ca. The data set SCIENCEDOCTORATES.txt contains the number of doctors graduated from different fields of science. The data is from the USA between the years 1960-1975. Apply correspondence analysis to the data set by using the function ca. Also, write your own code that applies correspondence analysis to the data set as presented in the lecture slides. Compare and interpret the results.

0. Look at the data and calculate Chi-squared test statistic
# Read the tab-separated contingency table. The first line holds the column
# names and the first column is used as row names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data <- read.table("SCIENCEDOCTORATES.txt", header = TRUE, sep = "\t",
                   row.names = 1)
data
##               Y1960 Y1965 Y1970 Y1971 Y1972 Y1973 Y1974 Y1975  Total
## Engineering     794  2073  3432  3495  3475  3338  3144  2959  22710
## Mathematics     291   685  1222  1236  1281  1222  1196  1149   8282
## Physics         530  1046  1655  1740  1635  1590  1334  1293  10823
## Chemistry      1078  1444  2234  2204  2011  1849  1792  1762  14374
## EarthSciences   253   375   511   550   580   577   570   556   3972
## Biology        1245  1963  3360  3633  3580  3636  3473  3498  24388
## Agriculture     414   576   803   900   855   853   830   904   6135
## Psychology      772   954  1888  2116  2262  2444  2587  2749  15772
## Sociology       162   239   504   583   638   599   645   680   4050
## Economics       341   538   826   791   863   907   833   867   5966
## Anthropology     69    82   217   240   260   324   381   385   1958
## Others          314   502  1079  1392  1500  1609  1531  1550   9477
## Total          6263 10477 17731 18880 18940 18948 18316 18352 127907
# Drop the last row and the last column (the "Total" margins) so that only
# the discipline-by-year counts remain.
SD <- data[-nrow(data), -ncol(data)]
dim(SD)
## [1] 12  8
# To interpret correspondence analysis, the first step is to evaluate whether there is a significant
# dependency between the rows and columns.
# Grand total of the contingency table.
n <- sum(SD)

# Table dimensions: I row categories and J column categories.
I <- nrow(SD)
J <- ncol(SD)

# Marginal totals kept as explicit matrices: column totals as a 1 x J row
# vector, row totals as an I x 1 column vector.
v1 <- matrix(colSums(SD), nrow = 1)
v2 <- matrix(rowSums(SD), ncol = 1)

# Expected counts under independence: (row total * column total) / n.
E <- (v2 %*% v1) / n

# Pearson chi-square statistic: squared deviations from the expected counts,
# each divided by its expected count, summed over all cells.
chisq.statistic <- sum((SD - E)^2 / E)
chisq.statistic
## [1] 1686.083
# Note that, a larger value of the test statistic suggests high discrepancy between the observed and expected frequencies.

# Obtain the p-value as the upper tail of the chi-squared distribution.
# Degrees of freedom: (I-1) * (J-1)
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
pchisq(chisq.statistic, df = (I - 1) * (J - 1), lower.tail = FALSE)
## [1] 4.825946e-301
# H0: Discipline and Year are independent
# H1: Discipline and Year are not independent

# Cross-check the manual computation against the built-in test.
chisq.test(SD)
## 
##  Pearson's Chi-squared test
## 
## data:  SD
## X-squared = 1686.1, df = 77, p-value < 2.2e-16
# there is evidence of a statistically
# significant association between the discipline
# of the doctorates awarded and the year (in the USA)

1. Correspondence Analysis with ca package
help(ca)
## starting httpd help server ... done
# Fit the correspondence analysis on the discipline-by-year counts.
SD.ca <- ca(SD) # set nd=8 to make all columns visible in summary
names(SD.ca)
##  [1] "sv"         "nd"         "rownames"   "rowmass"    "rowdist"   
##  [6] "rowinertia" "rowcoord"   "rowsup"     "colnames"   "colmass"   
## [11] "coldist"    "colinertia" "colcoord"   "colsup"     "N"         
## [16] "call"
SD.ca$colnames
## [1] "Y1960" "Y1965" "Y1970" "Y1971" "Y1972" "Y1973" "Y1974" "Y1975"
SD.ca$rownames
##  [1] "Engineering"   "Mathematics"   "Physics"       "Chemistry"    
##  [5] "EarthSciences" "Biology"       "Agriculture"   "Psychology"   
##  [9] "Sociology"     "Economics"     "Anthropology"  "Others"
SD.ca$N
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
##  [1,]  794 2073 3432 3495 3475 3338 3144 2959
##  [2,]  291  685 1222 1236 1281 1222 1196 1149
##  [3,]  530 1046 1655 1740 1635 1590 1334 1293
##  [4,] 1078 1444 2234 2204 2011 1849 1792 1762
##  [5,]  253  375  511  550  580  577  570  556
##  [6,] 1245 1963 3360 3633 3580 3636 3473 3498
##  [7,]  414  576  803  900  855  853  830  904
##  [8,]  772  954 1888 2116 2262 2444 2587 2749
##  [9,]  162  239  504  583  638  599  645  680
## [10,]  341  538  826  791  863  907  833  867
## [11,]   69   82  217  240  260  324  381  385
## [12,]  314  502 1079 1392 1500 1609 1531 1550
SD.ca$sv
## [1] 0.096435719 0.057074704 0.017157030 0.013850149 0.008218175 0.006517176
## [7] 0.005341320
# The singular values are the square roots of the principal inertias (eigenvalues)
# printed by SD.ca, so sv^2 tells how much inertia each dimension explains.
# for symmetric matrices, singular values = |eigenvalues|
# (here sv's are used since the package uses svd instead of eigen)
SD.ca
## 
##  Principal inertias (eigenvalues):
##            1      2        3        4        5       6       7      
## Value      0.0093 0.003258 0.000294 0.000192 6.8e-05 4.2e-05 2.9e-05
## Percentage 70.55% 24.71%   2.23%    1.46%    0.52%   0.32%   0.22%  
## 
## 
##  Rows:
##         Engineering Mathematics  Physics Chemistry EarthSciences   Biology
## Mass       0.177551    0.064750 0.084616  0.112379      0.031054  0.190670
## ChiDist    0.089524    0.070493 0.110288  0.169574      0.088275  0.012359
## Inertia    0.001423    0.000322 0.001029  0.003232      0.000242  0.000029
## Dim. 1     0.322439   -0.077507 1.011402  1.585369      0.425654  0.008109
## Dim. 2     1.446807    1.182347 0.805056 -1.233691     -1.115307 -0.150126
##         Agriculture Psychology Sociology Economics Anthropology    Others
## Mass       0.047965   0.123308  0.031664  0.046643     0.015308  0.074093
## ChiDist    0.103939   0.139709  0.127069  0.063467     0.273803  0.169701
## Inertia    0.000518   0.002407  0.000511  0.000188     0.001148  0.002134
## Dim. 1     0.570837  -1.321263 -1.254533  0.233745    -2.719310 -1.683658
## Dim. 2    -1.441849  -0.982594 -0.023063 -0.677752    -0.855932  0.487289
## 
## 
##  Columns:
##             Y1960    Y1965    Y1970    Y1971     Y1972     Y1973     Y1974
## Mass     0.048965 0.081911 0.138624 0.147607  0.148076  0.148139  0.143198
## ChiDist  0.269772 0.192164 0.100931 0.057244  0.040987  0.065196  0.100495
## Inertia  0.003564 0.003025 0.001412 0.000484  0.000249  0.000630  0.001446
## Dim. 1   1.610663 1.943987 0.929941 0.397573 -0.105606 -0.588645 -1.013683
## Dim. 2  -3.849055 0.323144 0.659972 0.545106  0.598148  0.305754 -0.219357
##             Y1975
## Mass     0.143479
## ChiDist  0.128614
## Inertia  0.002373
## Dim. 1  -1.238515
## Dim. 2  -0.783409

2. Inertia
########################
# Inertia
########################

# The total "inertia" of a contingency table is chi^2 statistic divided by the total number of observations.
# Geometrically, the inertia measures how 'far' the row profiles (or column profiles) are from their average profile.
# Alternatively, how much of the total "variation" the specific variable explains
# i.e. how much it contributes to the chi-squared statistic

# From ca package:
SD.ca$rowinertia
##  [1] 1.422986e-03 3.217643e-04 1.029233e-03 3.231501e-03 2.419862e-04
##  [6] 2.912176e-05 5.181770e-04 2.406814e-03 5.112553e-04 1.878816e-04
## [11] 1.147611e-03 2.133771e-03
SD.ca$colinertia
## [1] 0.0035635436 0.0030247205 0.0014121599 0.0004836882 0.0002487642
## [6] 0.0006296666 0.0014461900 0.0023733687
# ...and manually calculated from the original data:
rowSums((SD-E)^2 / E) / sum(SD)
##   Engineering   Mathematics       Physics     Chemistry EarthSciences 
##  1.422986e-03  3.217643e-04  1.029233e-03  3.231501e-03  2.419862e-04 
##       Biology   Agriculture    Psychology     Sociology     Economics 
##  2.912176e-05  5.181770e-04  2.406814e-03  5.112553e-04  1.878816e-04 
##  Anthropology        Others 
##  1.147611e-03  2.133771e-03
colSums((SD-E)^2 / E) / sum(SD)
##        Y1960        Y1965        Y1970        Y1971        Y1972 
## 0.0035635436 0.0030247205 0.0014121599 0.0004836882 0.0002487642 
##        Y1973        Y1974        Y1975 
## 0.0006296666 0.0014461900 0.0023733687
# You can get the single inertia values:
((SD-E)^2 / E) / sum(SD)
##                      Y1960        Y1965        Y1970        Y1971
## Engineering   7.109807e-04 1.903211e-04 2.000845e-04 4.758578e-05
## Mathematics   2.528851e-04 5.039124e-07 3.720434e-05 1.168449e-06
## Physics       3.534831e-11 2.242892e-04 1.246624e-04 9.930145e-05
## Chemistry     1.555197e-03 4.719966e-04 2.286770e-04 2.495462e-05
## EarthSciences 1.376157e-04 5.923501e-05 2.228333e-05 1.756733e-05
## Biology       1.691880e-05 4.698143e-06 9.972256e-07 2.387297e-06
## Agriculture   3.358499e-04 8.399127e-05 2.070585e-05 2.678950e-07
## Psychology    7.944824e-10 6.909657e-04 3.183611e-04 1.510201e-04
## Sociology     5.197551e-05 2.026940e-04 4.592579e-05 2.868235e-06
## Economics     6.392585e-05 3.891346e-05 1.006250e-08 7.131345e-05
## Anthropology  5.889373e-05 2.994896e-04 8.532347e-05 6.498949e-05
## Others        3.793000e-04 7.576224e-04 3.279248e-04 2.640764e-07
##                      Y1972        Y1973        Y1974        Y1975
## Engineering   2.926060e-05 1.599370e-06 2.805276e-05 2.151012e-04
## Mathematics   1.902723e-05 1.521415e-07 6.639655e-07 1.015918e-05
## Physics       5.111550e-06 8.634898e-07 2.349872e-04 3.400172e-04
## Chemistry     5.066908e-05 2.885724e-04 2.694113e-04 3.420220e-04
## EarthSciences 8.849260e-07 1.729096e-06 2.040414e-08 2.650391e-06
## Biology       2.119023e-06 1.163650e-06 8.345584e-07 3.068295e-09
## Agriculture   2.458520e-05 2.681570e-05 2.094926e-05 5.011937e-06
## Psychology    1.806487e-05 3.870788e-05 3.735185e-04 8.161751e-04
## Sociology     1.911414e-05 1.207142e-08 5.704201e-05 1.316236e-04
## Economics     3.691424e-06 4.762749e-06 4.158916e-06 1.105709e-06
## Anthropology  2.416136e-05 3.105644e-05 2.823022e-04 3.013945e-04
## Others        5.207478e-05 2.342316e-04 1.742490e-04 2.081048e-04
# ...which sums to total inertia:
sum((SD-E)^2 / E / sum(SD))
## [1] 0.0131821
#Note that the total inertia is:
sum(SD.ca$rowinertia)
## [1] 0.0131821
sum(SD.ca$colinertia)
## [1] 0.0131821
#is the same as
sum(SD.ca$sv^2)
## [1] 0.0131821
# And same as Chi-squared test statistic divided by the grand total number of observations:
chisq.statistic / n 
## [1] 0.0131821
# Use summary function to obtain more details about ca:
summary(SD.ca)
## 
## Principal inertias (eigenvalues):
## 
##  dim    value      %   cum%   scree plot               
##  1      0.009300  70.5  70.5  ******************       
##  2      0.003258  24.7  95.3  ******                   
##  3      0.000294   2.2  97.5  *                        
##  4      0.000192   1.5  98.9                           
##  5      6.8e-050   0.5  99.5                           
##  6      4.2e-050   0.3  99.8                           
##  7      2.9e-050   0.2 100.0                           
##         -------- -----                                 
##  Total: 0.013182 100.0                                 
## 
## 
## Rows:
##      name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
## 1  | Engn |  178  971  108 |   31 121  18 |   83 851 372 |
## 2  | Mthm |   65  928   24 |   -7  11   0 |   67 916  91 |
## 3  | Phys |   85  956   78 |   98 782  87 |   46 174  55 |
## 4  | Chms |  112  985  245 |  153 813 282 |  -70 172 171 |
## 5  | ErtS |   31  736   18 |   41 216   6 |  -64 520  39 |
## 6  | Blgy |  191  485    2 |    1   4   0 |   -9 481   4 |
## 7  | Agrc |   48  907   39 |   55 281  16 |  -82 627 100 |
## 8  | Psyc |  123  993  183 | -127 832 215 |  -56 161 119 |
## 9  | Sclg |   32  907   39 | -121 906  50 |   -1   0   0 |
## 10 | Ecnm |   47  498   14 |   23 126   3 |  -39 371  21 |
## 11 | Anth |   15  949   87 | -262 917 113 |  -49  32  11 |
## 12 | Othr |   74  942  162 | -162 915 210 |   28  27  18 |
## 
## Columns:
##      name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
## 1 | Y1960 |   49  995  270 |  155 332 127 | -220 663 725 |
## 2 | Y1965 |   82  961  229 |  187 952 310 |   18   9   9 |
## 3 | Y1970 |  139  929  107 |   90 789 120 |   38 139  60 |
## 4 | Y1971 |  148  744   37 |   38 449  23 |   31 295  44 |
## 5 | Y1972 |  148  755   19 |  -10  62   2 |   34 694  53 |
## 6 | Y1973 |  148  830   48 |  -57 758  51 |   17  72  14 |
## 7 | Y1974 |  143  962  110 |  -98 946 147 |  -13  16   7 |
## 8 | Y1975 |  143  983  180 | -119 862 220 |  -45 121  88 |
# Note that, in the summary, everything is multiplied by 1000 to add visual clarity.

# These proportional values are the ones seen in summary(SD.ca)
SD.ca$rowinertia / sum(SD.ca$sv^2)
##  [1] 0.10794834 0.02440918 0.07807803 0.24514304 0.01835718 0.00220919
##  [7] 0.03930913 0.18258198 0.03878405 0.01425278 0.08705826 0.16186883
SD.ca$colinertia / (chisq.statistic/n)
## [1] 0.27033198 0.22945662 0.10712707 0.03669280 0.01887136 0.04776678
## [7] 0.10970861 0.18004479
SD.ca$rowinertia / sum(SD.ca$rowinertia)
##  [1] 0.10794834 0.02440918 0.07807803 0.24514304 0.01835718 0.00220919
##  [7] 0.03930913 0.18258198 0.03878405 0.01425278 0.08705826 0.16186883

3. CA manually
# Observed relative frequencies (cell counts divided by the grand total).
SDf <- SD / n
# Relative frequencies expected under independence.
Ef <- E / n

# The matrix Z: (observed - expected) / sqrt(expected), computed on
# relative frequencies.
Z <- (SDf - Ef) / sqrt(Ef)
class(Z)
## [1] "data.frame"
# Arithmetic on the data frame SD yields a data frame; convert it to a
# matrix before using it in matrix products.
Z <- as.matrix(Z)

# PAUSE: Recall, the chi-squared-distance:
sqrt(rowSums(Z^2) / (rowSums(SD)/n))
##   Engineering   Mathematics       Physics     Chemistry EarthSciences 
##    0.08952388    0.07049341    0.11028846    0.16957446    0.08827503 
##       Biology   Agriculture    Psychology     Sociology     Economics 
##    0.01235856    0.10393910    0.13970921    0.12706869    0.06346696 
##  Anthropology        Others 
##    0.27380295    0.16970149
# Or:
sqrt(rowSums(((SD/n - E/n)^2) / (E/n)) / (rowSums(SD)/n))
##   Engineering   Mathematics       Physics     Chemistry EarthSciences 
##    0.08952388    0.07049341    0.11028846    0.16957446    0.08827503 
##       Biology   Agriculture    Psychology     Sociology     Economics 
##    0.01235856    0.10393910    0.13970921    0.12706869    0.06346696 
##  Anthropology        Others 
##    0.27380295    0.16970149
# CONTINUE.

# Cross-product matrices of Z: V (J x J) is built from the columns of Z,
# W (I x I) from its rows.
V <- t(Z) %*% Z
W <- Z %*% t(Z)

# chi-squared test statistic: n times the trace of V
n * tr(V)
## [1] 1686.083

# Decompose each matrix once and reuse the result for both the eigenvalues
# and the eigenvectors.
eig_V <- eigen(V)
eig_W <- eigen(W)

# Eigenvalues of V; compare with the squared singular values from ca().
variances <- eig_V$values
SD.ca$sv^2
## [1] 9.299848e-03 3.257522e-03 2.943637e-04 1.918266e-04 6.753840e-05
## [6] 4.247359e-05 2.852970e-05
components <- eig_V$vectors

# V and W have the same non-zero eigenvalues
eig_W$values
##  [1]  9.299848e-03  3.257522e-03  2.943637e-04  1.918266e-04  6.753840e-05
##  [6]  4.247359e-05  2.852970e-05  1.220016e-18  4.337860e-19 -5.184921e-20
## [11] -1.130535e-19 -1.786060e-19
components2 <- eig_W$vectors

# Forming the matrix R from the row profiles.
# Column and row masses (cf. SD.ca$colmass and SD.ca$rowmass).
f1 <- v1 / n
f2 <- v2 / n
# I x 1 column of ones, used to repeat sqrt(f1) on every row.
one <- matrix(1, nrow = nrow(SD), ncol = 1)
# Cell-wise divisor: row mass times the square root of the column mass.
scaling <- f2 %*% sqrt(f1)
# Centring term: sqrt(column mass), identical in every row.
sifting <- one %*% sqrt(f1)
R <- SDf / scaling - sifting
# PAUSE: Chi-squared-distance:
sqrt(rowSums(R^2))
##   Engineering   Mathematics       Physics     Chemistry EarthSciences 
##    0.08952388    0.07049341    0.11028846    0.16957446    0.08827503 
##       Biology   Agriculture    Psychology     Sociology     Economics 
##    0.01235856    0.10393910    0.13970921    0.12706869    0.06346696 
##  Anthropology        Others 
##    0.27380295    0.16970149
# Row coordinates (think of them as PCA scores): project the rows of R onto
# the eigenvectors of V.
rowcoord <- as.matrix(R) %*% components
# Omit the dimension corresponding to the zero eigenvalue. At most
# min(I, J) - 1 dimensions carry inertia, so the count is derived from the
# table instead of hard-coding column 8. drop = FALSE keeps a matrix even if
# only one dimension remains.
rowcoord <- rowcoord[, seq_len(min(dim(SD)) - 1), drop = FALSE]

# PAUSE HERE:
# These are the same 'raw and unscaled' so-called principal co-ordinates as in the CA image shown at the beginning:
# (Without the years plotted)
# Draw the manually computed row coordinates next to the map from ca().
# Save the previous graphics settings so the two-panel layout does not leak
# into later plots.
old_par <- par(mfrow = c(1, 2))
# Both axes are negated; presumably this matches the orientation of the ca()
# map, since eigenvector signs are arbitrary.
plot(-rowcoord[, 1], -rowcoord[, 2], pch = 16,
     xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2),
     xlab = "Dimension 1", ylab = "Dimension 2")
text(-rowcoord[, 1], -rowcoord[, 2], rownames(rowcoord))
abline(h = 0, v = 0, lty = 3)
# Reuse the fitted object and dispatch through the plot() generic instead of
# refitting with ca(SD) and calling the S3 method plot.ca() directly.
plot(SD.ca, map = "symmetric")
par(old_par)

# Standardized row coordinates: divide every coordinate column by the square
# root of its eigenvalue.
# `ncol = 1` belongs to matrix(), not rep(). The eigenvalues are taken for
# exactly the dimensions kept in rowcoord, so no index is hard-coded.
stand <- matrix(rep(1, nrow(SD)), ncol = 1) %*%
  matrix(variances[seq_len(ncol(rowcoord))], nrow = 1)
standrowcoord <- rowcoord / sqrt(stand)
SD.ca$rowcoord
##                       Dim1        Dim2       Dim3         Dim4       Dim5
## Engineering    0.322438878  1.44680720  0.8482280 -0.007459002 -0.3661262
## Mathematics   -0.077507427  1.18234663  0.8452192 -0.675259459 -0.2595699
## Physics        1.011402351  0.80505558 -1.1958994  0.453545684  0.7139126
## Chemistry      1.585369258 -1.23369079 -0.3149825 -1.366044795  0.4105074
## EarthSciences  0.425654104 -1.11530691  0.5296933  2.714065450 -1.3022776
## Biology        0.008109185 -0.15012565 -0.4379684 -0.104110739  0.4176107
## Agriculture    0.570837241 -1.44184897  0.1000322  1.302055137 -2.5726441
## Psychology    -1.321263122 -0.98259417  0.5715345 -0.379061483  0.1089682
## Sociology     -1.254532552 -0.02306289 -0.4419189 -1.818647798 -2.8867506
## Economics      0.233745056 -0.67775239  1.3459936  2.322331238  1.6640115
## Anthropology  -2.719309874 -0.85593181  2.8853357 -1.277746350  2.6419221
## Others        -1.683657551  0.48728926 -2.2932582  0.659518061  0.2617515
##                     Dim6        Dim7
## Engineering    0.3550650  0.30347494
## Mathematics   -1.1612581 -0.03416783
## Physics        0.6578247 -0.95293042
## Chemistry     -0.5101158  0.69482903
## EarthSciences -1.2095653  3.66473773
## Biology        0.4378783 -0.09172852
## Agriculture    2.1324481 -1.06690138
## Psychology     0.1191914 -0.68409280
## Sociology     -2.2953312 -1.01846617
## Economics     -2.0849905 -1.71683848
## Anthropology   3.0650718  2.49932816
## Others        -0.3520651  0.86594977
# The matrix C is formed like the matrix R, but from the column profiles.
# 1 x J row of ones, used to repeat sqrt(f2) in every column.
one2 <- matrix(1, nrow = 1, ncol = ncol(SD))
# Cell-wise divisor: square root of the row mass times the column mass.
scaling2 <- sqrt(f2) %*% f1
# Centring term: sqrt(row mass), identical in every column.
sifting2 <- sqrt(f2) %*% one2
C <- SDf / scaling2 - sifting2

# Column co-ordinates: project the columns of C onto the eigenvectors of W.
colcoord <- t(C) %*% components2
# Omit dimensions corresponding to zero eigenvalues. At most min(I, J) - 1
# dimensions carry inertia, so the count is derived from the table instead of
# hard-coding column 8. drop = FALSE keeps a matrix even if only one
# dimension remains.
colcoord <- colcoord[, seq_len(min(dim(SD)) - 1), drop = FALSE]

# Standardized column coordinates: divide every coordinate column by the
# square root of its eigenvalue.
# `ncol = 1` belongs to matrix(), not rep(). The eigenvalues are taken for
# exactly the dimensions kept in colcoord, so no index is hard-coded.
stand2 <- matrix(rep(1, ncol(SD)), ncol = 1) %*%
  matrix(variances[seq_len(ncol(colcoord))], nrow = 1)
standcolcoord <- colcoord / sqrt(stand2)
SD.ca$colcoord
##             Dim1       Dim2       Dim3       Dim4        Dim5       Dim6
## Y1960  1.6106625 -3.8490551 -1.0809061 -0.1725293  0.55209387 -0.6163723
## Y1965  1.9439873  0.3231438  1.5425491  1.8375923 -1.04979904  0.6491194
## Y1970  0.9299411  0.6599718  0.8557550 -1.4208929  1.21979719 -0.4725652
## Y1971  0.3975732  0.5451063 -1.4168200 -0.8028717 -0.68105651  1.4714772
## Y1972 -0.1056064  0.5981483 -0.7718786  0.2352183 -0.97394826 -1.9448011
## Y1973 -0.5886455  0.3057542 -0.6865726  1.4522227  1.61145599  0.2752951
## Y1974 -1.0136835 -0.2193574  0.9065753 -0.4103387  0.08366545  0.1000611
## Y1975 -1.2385154 -0.7834090  0.7197212 -0.1240095 -0.80909988  0.4055471
##              Dim7
## Y1960  0.36101993
## Y1965  0.21263900
## Y1970 -0.67150608
## Y1971  0.19612414
## Y1972  0.04887571
## Y1973 -0.23983826
## Y1974  1.97494489
## Y1975 -1.57146779
## The chi-squared distances from the "center", where variables close to center do not deviate from the
# independence assumption:
# Row distances:
sqrt(rowSums(rowcoord^2))
##   Engineering   Mathematics       Physics     Chemistry EarthSciences 
##    0.08952388    0.07049341    0.11028846    0.16957446    0.08827503 
##       Biology   Agriculture    Psychology     Sociology     Economics 
##    0.01235856    0.10393910    0.13970921    0.12706869    0.06346696 
##  Anthropology        Others 
##    0.27380295    0.16970149
SD.ca$rowdist
##  [1] 0.08952388 0.07049341 0.11028846 0.16957446 0.08827503 0.01235856
##  [7] 0.10393910 0.13970921 0.12706869 0.06346696 0.27380295 0.16970149
# Column distances:
sqrt(rowSums(colcoord^2))
##      Y1960      Y1965      Y1970      Y1971      Y1972      Y1973 
## 0.26977207 0.19216368 0.10093051 0.05724386 0.04098747 0.06519598 
##      Y1974      Y1975 
## 0.10049502 0.12861395
SD.ca$coldist
## [1] 0.26977207 0.19216368 0.10093051 0.05724386 0.04098747 0.06519598
## [7] 0.10049502 0.12861395
# Now, the inertia can be also stated as MASS times Chi-squared-distance^2.
# For rows:
round((rowSums(SD) / n) * sqrt(rowSums(rowcoord^2))^2,5)
##   Engineering   Mathematics       Physics     Chemistry EarthSciences 
##       0.00142       0.00032       0.00103       0.00323       0.00024 
##       Biology   Agriculture    Psychology     Sociology     Economics 
##       0.00003       0.00052       0.00241       0.00051       0.00019 
##  Anthropology        Others 
##       0.00115       0.00213
SD.ca
## 
##  Principal inertias (eigenvalues):
##            1      2        3        4        5       6       7      
## Value      0.0093 0.003258 0.000294 0.000192 6.8e-05 4.2e-05 2.9e-05
## Percentage 70.55% 24.71%   2.23%    1.46%    0.52%   0.32%   0.22%  
## 
## 
##  Rows:
##         Engineering Mathematics  Physics Chemistry EarthSciences   Biology
## Mass       0.177551    0.064750 0.084616  0.112379      0.031054  0.190670
## ChiDist    0.089524    0.070493 0.110288  0.169574      0.088275  0.012359
## Inertia    0.001423    0.000322 0.001029  0.003232      0.000242  0.000029
## Dim. 1     0.322439   -0.077507 1.011402  1.585369      0.425654  0.008109
## Dim. 2     1.446807    1.182347 0.805056 -1.233691     -1.115307 -0.150126
##         Agriculture Psychology Sociology Economics Anthropology    Others
## Mass       0.047965   0.123308  0.031664  0.046643     0.015308  0.074093
## ChiDist    0.103939   0.139709  0.127069  0.063467     0.273803  0.169701
## Inertia    0.000518   0.002407  0.000511  0.000188     0.001148  0.002134
## Dim. 1     0.570837  -1.321263 -1.254533  0.233745    -2.719310 -1.683658
## Dim. 2    -1.441849  -0.982594 -0.023063 -0.677752    -0.855932  0.487289
## 
## 
##  Columns:
##             Y1960    Y1965    Y1970    Y1971     Y1972     Y1973     Y1974
## Mass     0.048965 0.081911 0.138624 0.147607  0.148076  0.148139  0.143198
## ChiDist  0.269772 0.192164 0.100931 0.057244  0.040987  0.065196  0.100495
## Inertia  0.003564 0.003025 0.001412 0.000484  0.000249  0.000630  0.001446
## Dim. 1   1.610663 1.943987 0.929941 0.397573 -0.105606 -0.588645 -1.013683
## Dim. 2  -3.849055 0.323144 0.659972 0.545106  0.598148  0.305754 -0.219357
##             Y1975
## Mass     0.143479
## ChiDist  0.128614
## Inertia  0.002373
## Dim. 1  -1.238515
## Dim. 2  -0.783409

4. Interpretation of a CA plot
SD.ca
## 
##  Principal inertias (eigenvalues):
##            1      2        3        4        5       6       7      
## Value      0.0093 0.003258 0.000294 0.000192 6.8e-05 4.2e-05 2.9e-05
## Percentage 70.55% 24.71%   2.23%    1.46%    0.52%   0.32%   0.22%  
## 
## 
##  Rows:
##         Engineering Mathematics  Physics Chemistry EarthSciences   Biology
## Mass       0.177551    0.064750 0.084616  0.112379      0.031054  0.190670
## ChiDist    0.089524    0.070493 0.110288  0.169574      0.088275  0.012359
## Inertia    0.001423    0.000322 0.001029  0.003232      0.000242  0.000029
## Dim. 1     0.322439   -0.077507 1.011402  1.585369      0.425654  0.008109
## Dim. 2     1.446807    1.182347 0.805056 -1.233691     -1.115307 -0.150126
##         Agriculture Psychology Sociology Economics Anthropology    Others
## Mass       0.047965   0.123308  0.031664  0.046643     0.015308  0.074093
## ChiDist    0.103939   0.139709  0.127069  0.063467     0.273803  0.169701
## Inertia    0.000518   0.002407  0.000511  0.000188     0.001148  0.002134
## Dim. 1     0.570837  -1.321263 -1.254533  0.233745    -2.719310 -1.683658
## Dim. 2    -1.441849  -0.982594 -0.023063 -0.677752    -0.855932  0.487289
## 
## 
##  Columns:
##             Y1960    Y1965    Y1970    Y1971     Y1972     Y1973     Y1974
## Mass     0.048965 0.081911 0.138624 0.147607  0.148076  0.148139  0.143198
## ChiDist  0.269772 0.192164 0.100931 0.057244  0.040987  0.065196  0.100495
## Inertia  0.003564 0.003025 0.001412 0.000484  0.000249  0.000630  0.001446
## Dim. 1   1.610663 1.943987 0.929941 0.397573 -0.105606 -0.588645 -1.013683
## Dim. 2  -3.849055 0.323144 0.659972 0.545106  0.598148  0.305754 -0.219357
##             Y1975
## Mass     0.143479
## ChiDist  0.128614
## Inertia  0.002373
## Dim. 1  -1.238515
## Dim. 2  -0.783409
summary(SD.ca)
## 
## Principal inertias (eigenvalues):
## 
##  dim    value      %   cum%   scree plot               
##  1      0.009300  70.5  70.5  ******************       
##  2      0.003258  24.7  95.3  ******                   
##  3      0.000294   2.2  97.5  *                        
##  4      0.000192   1.5  98.9                           
##  5      6.8e-050   0.5  99.5                           
##  6      4.2e-050   0.3  99.8                           
##  7      2.9e-050   0.2 100.0                           
##         -------- -----                                 
##  Total: 0.013182 100.0                                 
## 
## 
## Rows:
##      name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
## 1  | Engn |  178  971  108 |   31 121  18 |   83 851 372 |
## 2  | Mthm |   65  928   24 |   -7  11   0 |   67 916  91 |
## 3  | Phys |   85  956   78 |   98 782  87 |   46 174  55 |
## 4  | Chms |  112  985  245 |  153 813 282 |  -70 172 171 |
## 5  | ErtS |   31  736   18 |   41 216   6 |  -64 520  39 |
## 6  | Blgy |  191  485    2 |    1   4   0 |   -9 481   4 |
## 7  | Agrc |   48  907   39 |   55 281  16 |  -82 627 100 |
## 8  | Psyc |  123  993  183 | -127 832 215 |  -56 161 119 |
## 9  | Sclg |   32  907   39 | -121 906  50 |   -1   0   0 |
## 10 | Ecnm |   47  498   14 |   23 126   3 |  -39 371  21 |
## 11 | Anth |   15  949   87 | -262 917 113 |  -49  32  11 |
## 12 | Othr |   74  942  162 | -162 915 210 |   28  27  18 |
## 
## Columns:
##      name   mass  qlt  inr    k=1 cor ctr    k=2 cor ctr  
## 1 | Y1960 |   49  995  270 |  155 332 127 | -220 663 725 |
## 2 | Y1965 |   82  961  229 |  187 952 310 |   18   9   9 |
## 3 | Y1970 |  139  929  107 |   90 789 120 |   38 139  60 |
## 4 | Y1971 |  148  744   37 |   38 449  23 |   31 295  44 |
## 5 | Y1972 |  148  755   19 |  -10  62   2 |   34 694  53 |
## 6 | Y1973 |  148  830   48 |  -57 758  51 |   17  72  14 |
## 7 | Y1974 |  143  962  110 |  -98 946 147 |  -13  16   7 |
## 8 | Y1975 |  143  983  180 | -119 862 220 |  -45 121  88 |
# Note that the quantities are multiplied by 1000
# Quality of representation = as in the lecture slides, but here we consider the angle between profiles and 
# the plane spanned by the two first principal components
# Squared correlations = quality of representation from lecture slides
# also, the sum of the squared correlations is the quality of representation.

# ctr = contribution in forming that ca-component (contributions sum to 1)
# important variables related to forming the specific component have a high ctr 

# k=1 and k=2 are the coordinates on the plot 

names(summary(SD.ca)$rows)
##  [1] "name" "mass" " qlt" " inr" " k=1" "cor"  "ctr"  " k=2" "cor"  "ctr"
# Contribution of engineering to the second axis
f2[1]*SD.ca$rowcoord[1,2]^2 #recall that these coordinates are already scaled
## [1] 0.3716586
# If the rows and columns were independent, ctr would be same for every variable

# Squared correlation of Biology (row 6) with the second component: the share
# of its squared principal coordinates that falls on dimension 2.
d2 <- rowcoord[6, ]^2
d2[2] / sum(d2)
## [1] 0.4806858
# Note: the symmetric map plots both rows and columns in principal
# coordinates, so deductions about row-to-column distances based on this
# plot are questionable.
# TRUE is spelled out throughout because T is an ordinary variable that can
# be reassigned.
plot(SD.ca, arrows = c(TRUE, TRUE), map = "symmetric")

#plot(SD.ca,arrows=c(TRUE,TRUE),map="symmetric",dim=c(2,4))

# Instead try e.g. the following commands
plot(SD.ca, arrows = c(TRUE, TRUE), map = "rowprincipal") # standard column coordinates

plot(SD.ca, arrows = c(TRUE, TRUE), map = "colprincipal")

plot(SD.ca, arrows = c(TRUE, TRUE), map = "rowgreen") # standard column coordinates scaled with square root of the column masses

plot(SD.ca, arrows = c(TRUE, TRUE), map = "colgab", dim = c(1, 2)) # standard row coordinates scaled with the row masses

plot(SD.ca, arrows = c(TRUE, TRUE), map = "rowgab")

# If two row-variables are close on the picture, they have a similar profile,
# the same is true for column-variables

# Distant row/column-variables have different profiles

# Variables distant from the origin represent variables different from the average profile
# these are usually the most interesting ones

# Now you can again try to interpret the dimensions.
# 1st dim splits the sciences into soft/hard
# 2nd dim splits the sciences into more formula heavy(math,physics,engineering) vs
# the more experimental ones (chemistry,agriculture,earthsciences)

# same for different years