library(ca)
library(psych)
“Correspondence analysis is a method of data analysis for representing tabular data graphically. Correspondence analysis is a generalization of a simple graphical concept with which we are all familiar, namely the scatterplot. -Michael Greenacre, Correspondence Analysis in Practice”
Install the package ca. The data set SCIENCEDOCTORATES.txt contains the number of doctors graduated from different fields of science. The data is from the USA between the years 1960-1975. Apply correspondence analysis to the data set by using the function ca. Also, write your own code that applies correspondence analysis to the data set as presented in the lecture slides. Compare and interpret the results.
0. Look at the data and calculate Chi-squared test statistic
# Read the tab-separated contingency table; the first column of the file is
# used as row names. TRUE is spelled out because the shorthand T is an
# ordinary variable that can be reassigned.
data <- read.table(
  "SCIENCEDOCTORATES.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1
)
data
## Y1960 Y1965 Y1970 Y1971 Y1972 Y1973 Y1974 Y1975 Total
## Engineering 794 2073 3432 3495 3475 3338 3144 2959 22710
## Mathematics 291 685 1222 1236 1281 1222 1196 1149 8282
## Physics 530 1046 1655 1740 1635 1590 1334 1293 10823
## Chemistry 1078 1444 2234 2204 2011 1849 1792 1762 14374
## EarthSciences 253 375 511 550 580 577 570 556 3972
## Biology 1245 1963 3360 3633 3580 3636 3473 3498 24388
## Agriculture 414 576 803 900 855 853 830 904 6135
## Psychology 772 954 1888 2116 2262 2444 2587 2749 15772
## Sociology 162 239 504 583 638 599 645 680 4050
## Economics 341 538 826 791 863 907 833 867 5966
## Anthropology 69 82 217 240 260 324 381 385 1958
## Others 314 502 1079 1392 1500 1609 1531 1550 9477
## Total 6263 10477 17731 18880 18940 18948 18316 18352 127907
# Drop the last row and the last column (the "Total" margins) so that only
# the discipline-by-year counts remain.
SD <- data[-nrow(data), -ncol(data)]
dim(SD)
## [1] 12 8
# Before interpreting a correspondence analysis, first evaluate whether there
# is a significant dependency between the rows and the columns.
I <- nrow(SD)
J <- ncol(SD)
n <- sum(SD)
v1 <- matrix(colSums(SD), nrow = 1)  # column totals as a 1 x J row vector
v2 <- matrix(rowSums(SD), ncol = 1)  # row totals as an I x 1 column vector
# Expected frequencies under independence: row total * column total / n
E <- (v2 %*% v1) / n
# Chi-squared statistic: sum((observed - expected)^2 / expected)
chisq.statistic <- sum((SD - E)^2 / E)
chisq.statistic
## [1] 1686.083
# Note that a larger value of the test statistic indicates a larger
# discrepancy between the observed and the expected frequencies.
# Obtain the p-value as the upper tail of the chi-squared distribution with
# (I - 1) * (J - 1) degrees of freedom. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
pchisq(chisq.statistic, df = (I - 1) * (J - 1), lower.tail = FALSE)
## [1] 4.825946e-301
# H0: Discipline and Year are independent
# H1: Discipline and Year are not independent
chisq.test(SD)
##
## Pearson's Chi-squared test
##
## data: SD
## X-squared = 1686.1, df = 77, p-value < 2.2e-16
# there is evidence of a statistically
# significant association between the discipline
# of the graduated doctors and the year (in USA)
1. Correspondence Analysis with ca package
help(ca)
## starting httpd help server ... done
# Fit the correspondence analysis (set nd=8 to make all columns visible in
# summary), then list the components of the fitted object.
SD.ca <- ca(SD)
names(SD.ca)
## [1] "sv" "nd" "rownames" "rowmass" "rowdist"
## [6] "rowinertia" "rowcoord" "rowsup" "colnames" "colmass"
## [11] "coldist" "colinertia" "colcoord" "colsup" "N"
## [16] "call"
SD.ca$colnames
## [1] "Y1960" "Y1965" "Y1970" "Y1971" "Y1972" "Y1973" "Y1974" "Y1975"
SD.ca$rownames
## [1] "Engineering" "Mathematics" "Physics" "Chemistry"
## [5] "EarthSciences" "Biology" "Agriculture" "Psychology"
## [9] "Sociology" "Economics" "Anthropology" "Others"
SD.ca$N
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 794 2073 3432 3495 3475 3338 3144 2959
## [2,] 291 685 1222 1236 1281 1222 1196 1149
## [3,] 530 1046 1655 1740 1635 1590 1334 1293
## [4,] 1078 1444 2234 2204 2011 1849 1792 1762
## [5,] 253 375 511 550 580 577 570 556
## [6,] 1245 1963 3360 3633 3580 3636 3473 3498
## [7,] 414 576 803 900 855 853 830 904
## [8,] 772 954 1888 2116 2262 2444 2587 2749
## [9,] 162 239 504 583 638 599 645 680
## [10,] 341 538 826 791 863 907 833 867
## [11,] 69 82 217 240 260 324 381 385
## [12,] 314 502 1079 1392 1500 1609 1531 1550
SD.ca$sv
## [1] 0.096435719 0.057074704 0.017157030 0.013850149 0.008218175 0.006517176
## [7] 0.005341320
# The singular values, i.e. the square roots of the principal inertias (eigenvalues), related to the PCA transformation for rows/cols
# (how much variation explained by the principal components)
# for symmetric matrices, singular values = |eigenvalues|
# (here sv's are used since the package uses svd instead of eigen)
SD.ca
##
## Principal inertias (eigenvalues):
## 1 2 3 4 5 6 7
## Value 0.0093 0.003258 0.000294 0.000192 6.8e-05 4.2e-05 2.9e-05
## Percentage 70.55% 24.71% 2.23% 1.46% 0.52% 0.32% 0.22%
##
##
## Rows:
## Engineering Mathematics Physics Chemistry EarthSciences Biology
## Mass 0.177551 0.064750 0.084616 0.112379 0.031054 0.190670
## ChiDist 0.089524 0.070493 0.110288 0.169574 0.088275 0.012359
## Inertia 0.001423 0.000322 0.001029 0.003232 0.000242 0.000029
## Dim. 1 0.322439 -0.077507 1.011402 1.585369 0.425654 0.008109
## Dim. 2 1.446807 1.182347 0.805056 -1.233691 -1.115307 -0.150126
## Agriculture Psychology Sociology Economics Anthropology Others
## Mass 0.047965 0.123308 0.031664 0.046643 0.015308 0.074093
## ChiDist 0.103939 0.139709 0.127069 0.063467 0.273803 0.169701
## Inertia 0.000518 0.002407 0.000511 0.000188 0.001148 0.002134
## Dim. 1 0.570837 -1.321263 -1.254533 0.233745 -2.719310 -1.683658
## Dim. 2 -1.441849 -0.982594 -0.023063 -0.677752 -0.855932 0.487289
##
##
## Columns:
## Y1960 Y1965 Y1970 Y1971 Y1972 Y1973 Y1974
## Mass 0.048965 0.081911 0.138624 0.147607 0.148076 0.148139 0.143198
## ChiDist 0.269772 0.192164 0.100931 0.057244 0.040987 0.065196 0.100495
## Inertia 0.003564 0.003025 0.001412 0.000484 0.000249 0.000630 0.001446
## Dim. 1 1.610663 1.943987 0.929941 0.397573 -0.105606 -0.588645 -1.013683
## Dim. 2 -3.849055 0.323144 0.659972 0.545106 0.598148 0.305754 -0.219357
## Y1975
## Mass 0.143479
## ChiDist 0.128614
## Inertia 0.002373
## Dim. 1 -1.238515
## Dim. 2 -0.783409
2. Inertia
########################
# Inertia
########################
# The total "inertia" of a contingency table is chi^2 statistic divided by the total number of observations.
# Geometrically, the inertia measures how 'far' the row profiles (or column profiles) are from their average profile.
# Alternatively, how much of the total "variation" the specific variable explains
# i.e. how much it contributes to the chi-squared statistic
# From ca package:
SD.ca$rowinertia
## [1] 1.422986e-03 3.217643e-04 1.029233e-03 3.231501e-03 2.419862e-04
## [6] 2.912176e-05 5.181770e-04 2.406814e-03 5.112553e-04 1.878816e-04
## [11] 1.147611e-03 2.133771e-03
SD.ca$colinertia
## [1] 0.0035635436 0.0030247205 0.0014121599 0.0004836882 0.0002487642
## [6] 0.0006296666 0.0014461900 0.0023733687
# ...and manually calculated from the original data:
rowSums((SD-E)^2 / E) / sum(SD)
## Engineering Mathematics Physics Chemistry EarthSciences
## 1.422986e-03 3.217643e-04 1.029233e-03 3.231501e-03 2.419862e-04
## Biology Agriculture Psychology Sociology Economics
## 2.912176e-05 5.181770e-04 2.406814e-03 5.112553e-04 1.878816e-04
## Anthropology Others
## 1.147611e-03 2.133771e-03
colSums((SD-E)^2 / E) / sum(SD)
## Y1960 Y1965 Y1970 Y1971 Y1972
## 0.0035635436 0.0030247205 0.0014121599 0.0004836882 0.0002487642
## Y1973 Y1974 Y1975
## 0.0006296666 0.0014461900 0.0023733687
# You can get the single inertia values:
((SD-E)^2 / E) / sum(SD)
## Y1960 Y1965 Y1970 Y1971
## Engineering 7.109807e-04 1.903211e-04 2.000845e-04 4.758578e-05
## Mathematics 2.528851e-04 5.039124e-07 3.720434e-05 1.168449e-06
## Physics 3.534831e-11 2.242892e-04 1.246624e-04 9.930145e-05
## Chemistry 1.555197e-03 4.719966e-04 2.286770e-04 2.495462e-05
## EarthSciences 1.376157e-04 5.923501e-05 2.228333e-05 1.756733e-05
## Biology 1.691880e-05 4.698143e-06 9.972256e-07 2.387297e-06
## Agriculture 3.358499e-04 8.399127e-05 2.070585e-05 2.678950e-07
## Psychology 7.944824e-10 6.909657e-04 3.183611e-04 1.510201e-04
## Sociology 5.197551e-05 2.026940e-04 4.592579e-05 2.868235e-06
## Economics 6.392585e-05 3.891346e-05 1.006250e-08 7.131345e-05
## Anthropology 5.889373e-05 2.994896e-04 8.532347e-05 6.498949e-05
## Others 3.793000e-04 7.576224e-04 3.279248e-04 2.640764e-07
## Y1972 Y1973 Y1974 Y1975
## Engineering 2.926060e-05 1.599370e-06 2.805276e-05 2.151012e-04
## Mathematics 1.902723e-05 1.521415e-07 6.639655e-07 1.015918e-05
## Physics 5.111550e-06 8.634898e-07 2.349872e-04 3.400172e-04
## Chemistry 5.066908e-05 2.885724e-04 2.694113e-04 3.420220e-04
## EarthSciences 8.849260e-07 1.729096e-06 2.040414e-08 2.650391e-06
## Biology 2.119023e-06 1.163650e-06 8.345584e-07 3.068295e-09
## Agriculture 2.458520e-05 2.681570e-05 2.094926e-05 5.011937e-06
## Psychology 1.806487e-05 3.870788e-05 3.735185e-04 8.161751e-04
## Sociology 1.911414e-05 1.207142e-08 5.704201e-05 1.316236e-04
## Economics 3.691424e-06 4.762749e-06 4.158916e-06 1.105709e-06
## Anthropology 2.416136e-05 3.105644e-05 2.823022e-04 3.013945e-04
## Others 5.207478e-05 2.342316e-04 1.742490e-04 2.081048e-04
# ...which sums to total inertia:
sum((SD-E)^2 / E / sum(SD))
## [1] 0.0131821
#Note that the total inertia is:
sum(SD.ca$rowinertia)
## [1] 0.0131821
sum(SD.ca$colinertia)
## [1] 0.0131821
#is the same as
sum(SD.ca$sv^2)
## [1] 0.0131821
# And same as Chi-squared test statistic divided by the grand total number of observations:
chisq.statistic / n
## [1] 0.0131821
# Use summary function to obtain more details about ca:
summary(SD.ca)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.009300 70.5 70.5 ******************
## 2 0.003258 24.7 95.3 ******
## 3 0.000294 2.2 97.5 *
## 4 0.000192 1.5 98.9
## 5 6.8e-050 0.5 99.5
## 6 4.2e-050 0.3 99.8
## 7 2.9e-050 0.2 100.0
## -------- -----
## Total: 0.013182 100.0
##
##
## Rows:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Engn | 178 971 108 | 31 121 18 | 83 851 372 |
## 2 | Mthm | 65 928 24 | -7 11 0 | 67 916 91 |
## 3 | Phys | 85 956 78 | 98 782 87 | 46 174 55 |
## 4 | Chms | 112 985 245 | 153 813 282 | -70 172 171 |
## 5 | ErtS | 31 736 18 | 41 216 6 | -64 520 39 |
## 6 | Blgy | 191 485 2 | 1 4 0 | -9 481 4 |
## 7 | Agrc | 48 907 39 | 55 281 16 | -82 627 100 |
## 8 | Psyc | 123 993 183 | -127 832 215 | -56 161 119 |
## 9 | Sclg | 32 907 39 | -121 906 50 | -1 0 0 |
## 10 | Ecnm | 47 498 14 | 23 126 3 | -39 371 21 |
## 11 | Anth | 15 949 87 | -262 917 113 | -49 32 11 |
## 12 | Othr | 74 942 162 | -162 915 210 | 28 27 18 |
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Y1960 | 49 995 270 | 155 332 127 | -220 663 725 |
## 2 | Y1965 | 82 961 229 | 187 952 310 | 18 9 9 |
## 3 | Y1970 | 139 929 107 | 90 789 120 | 38 139 60 |
## 4 | Y1971 | 148 744 37 | 38 449 23 | 31 295 44 |
## 5 | Y1972 | 148 755 19 | -10 62 2 | 34 694 53 |
## 6 | Y1973 | 148 830 48 | -57 758 51 | 17 72 14 |
## 7 | Y1974 | 143 962 110 | -98 946 147 | -13 16 7 |
## 8 | Y1975 | 143 983 180 | -119 862 220 | -45 121 88 |
# Note that, in the summary, everything is multiplied by 1000 to add visual clarity.
# These proportional values are the ones seen in summary(SD.ca)
SD.ca$rowinertia / sum(SD.ca$sv^2)
## [1] 0.10794834 0.02440918 0.07807803 0.24514304 0.01835718 0.00220919
## [7] 0.03930913 0.18258198 0.03878405 0.01425278 0.08705826 0.16186883
SD.ca$colinertia / (chisq.statistic/n)
## [1] 0.27033198 0.22945662 0.10712707 0.03669280 0.01887136 0.04776678
## [7] 0.10970861 0.18004479
SD.ca$rowinertia / sum(SD.ca$rowinertia)
## [1] 0.10794834 0.02440918 0.07807803 0.24514304 0.01835718 0.00220919
## [7] 0.03930913 0.18258198 0.03878405 0.01425278 0.08705826 0.16186883
3. CA manually
# Theoretical relative frequencies under independence
Ef <- E / n
# Observed relative frequencies
SDf <- SD / n
# The matrix Z: (observed - expected) / sqrt(expected), on relative frequencies
Z <- (SDf - Ef) / sqrt(Ef)
class(Z)
## [1] "data.frame"
Z <- as.matrix(Z)
# PAUSE: Recall the chi-squared distance (row sums of Z^2 divided by the row
# masses, then square-rooted):
sqrt(rowSums(Z^2) / (rowSums(SD) / n))
## Engineering Mathematics Physics Chemistry EarthSciences
## 0.08952388 0.07049341 0.11028846 0.16957446 0.08827503
## Biology Agriculture Psychology Sociology Economics
## 0.01235856 0.10393910 0.13970921 0.12706869 0.06346696
## Anthropology Others
## 0.27380295 0.16970149
# Or:
sqrt(rowSums(((SD/n - E/n)^2) / (E/n)) / (rowSums(SD)/n))
## Engineering Mathematics Physics Chemistry EarthSciences
## 0.08952388 0.07049341 0.11028846 0.16957446 0.08827503
## Biology Agriculture Psychology Sociology Economics
## 0.01235856 0.10393910 0.13970921 0.12706869 0.06346696
## Anthropology Others
## 0.27380295 0.16970149
# CONTINUE.
# Cross-product matrices of Z: V = Z'Z and W = ZZ'
V <- t(Z) %*% Z
W <- Z %*% t(Z)
# n times the trace of V gives the chi-squared test statistic
n * tr(V)
## [1] 1686.083
# Eigen-decompose V once and reuse the result for both the eigenvalues and
# the eigenvectors instead of calling eigen(V) twice.
eig_V <- eigen(V)
variances <- eig_V$values
SD.ca$sv^2
## [1] 9.299848e-03 3.257522e-03 2.943637e-04 1.918266e-04 6.753840e-05
## [6] 4.247359e-05 2.852970e-05
components <- eig_V$vectors
# V and W have the same non-zero eigenvalues; again decompose W only once.
eig_W <- eigen(W)
eig_W$values
## [1] 9.299848e-03 3.257522e-03 2.943637e-04 1.918266e-04 6.753840e-05
## [6] 4.247359e-05 2.852970e-05 1.220016e-18 4.337860e-19 -5.184921e-20
## [11] -1.130535e-19 -1.786060e-19
components2 <- eig_W$vectors
# Forming the matrix R from the row profiles
f1 <- v1 / n  # column masses, cf. SD.ca$colmass
f2 <- v2 / n  # row masses, cf. SD.ca$rowmass
one <- matrix(1, nrow = nrow(SD), ncol = 1)
# Every row of `sifting` is sqrt(f1); it is subtracted to centre the profiles
sifting <- one %*% sqrt(f1)
# Entry (i, j) of `scaling` is (row mass i) * sqrt(column mass j)
scaling <- f2 %*% sqrt(f1)
R <- SDf / scaling - sifting
# PAUSE: Chi-squared distance:
sqrt(rowSums(R^2))
## Engineering Mathematics Physics Chemistry EarthSciences
## 0.08952388 0.07049341 0.11028846 0.16957446 0.08827503
## Biology Agriculture Psychology Sociology Economics
## 0.01235856 0.10393910 0.13970921 0.12706869 0.06346696
## Anthropology Others
## 0.27380295 0.16970149
# Row coordinates (think of them as PCA scores)
rowcoord <- as.matrix(R) %*% components
# Omit the dimension corresponding to the zero eigenvalue
rowcoord <- rowcoord[, -8]
# PAUSE HERE:
# These are the same 'raw and unscaled' so-called principal coordinates as in
# the CA image shown at the beginning (without the years plotted).
# Both axes are multiplied by -1, presumably so that the orientation matches
# the ca plot drawn next to it (the sign of an eigenvector is arbitrary).
# Remember the previous graphics layout and restore it afterwards so that the
# two-panel setting does not leak into later plots.
old_par <- par(mfrow = c(1, 2))
plot(rowcoord[, 1] * -1, rowcoord[, 2] * -1, pch = 16,
     xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2))
text(rowcoord[, 1] * -1, rowcoord[, 2] * -1, rownames(rowcoord))
abline(h = 0, v = 0, lty = 3)
# Dispatch through the plot() generic rather than calling the method directly
plot(ca(SD), map = "symmetric")
par(old_par)
# Standardized row coordinates: divide each dimension by the square root of
# its eigenvalue. ncol = 1 now goes to matrix(); it was previously passed to
# rep() because of a misplaced parenthesis.
stand <- matrix(rep(1, nrow(SD)), ncol = 1) %*% matrix(variances[-8], nrow = 1)
standrowcoord <- rowcoord / sqrt(stand)
SD.ca$rowcoord
## Dim1 Dim2 Dim3 Dim4 Dim5
## Engineering 0.322438878 1.44680720 0.8482280 -0.007459002 -0.3661262
## Mathematics -0.077507427 1.18234663 0.8452192 -0.675259459 -0.2595699
## Physics 1.011402351 0.80505558 -1.1958994 0.453545684 0.7139126
## Chemistry 1.585369258 -1.23369079 -0.3149825 -1.366044795 0.4105074
## EarthSciences 0.425654104 -1.11530691 0.5296933 2.714065450 -1.3022776
## Biology 0.008109185 -0.15012565 -0.4379684 -0.104110739 0.4176107
## Agriculture 0.570837241 -1.44184897 0.1000322 1.302055137 -2.5726441
## Psychology -1.321263122 -0.98259417 0.5715345 -0.379061483 0.1089682
## Sociology -1.254532552 -0.02306289 -0.4419189 -1.818647798 -2.8867506
## Economics 0.233745056 -0.67775239 1.3459936 2.322331238 1.6640115
## Anthropology -2.719309874 -0.85593181 2.8853357 -1.277746350 2.6419221
## Others -1.683657551 0.48728926 -2.2932582 0.659518061 0.2617515
## Dim6 Dim7
## Engineering 0.3550650 0.30347494
## Mathematics -1.1612581 -0.03416783
## Physics 0.6578247 -0.95293042
## Chemistry -0.5101158 0.69482903
## EarthSciences -1.2095653 3.66473773
## Biology 0.4378783 -0.09172852
## Agriculture 2.1324481 -1.06690138
## Psychology 0.1191914 -0.68409280
## Sociology -2.2953312 -1.01846617
## Economics -2.0849905 -1.71683848
## Anthropology 3.0650718 2.49932816
## Others -0.3520651 0.86594977
# The matrix C is formed in the same way as the matrix R, but from the column
# profiles instead.
one2 <- matrix(rep(1, ncol(SD)), nrow = 1)
sifting2 <- sqrt(f2) %*% one2
scaling2 <- sqrt(f2) %*% f1
C <- SDf / scaling2 - sifting2
# Column coordinates
colcoord <- t(C) %*% components2
# Omit dimensions corresponding to zero eigenvalues
colcoord <- colcoord[, -c(8:ncol(colcoord))]
# Standardized column coordinates: divide each dimension by the square root
# of its eigenvalue. ncol = 1 now goes to matrix(); it was previously passed
# to rep() because of a misplaced parenthesis.
stand2 <- matrix(rep(1, ncol(SD)), ncol = 1) %*% matrix(variances[-8], nrow = 1)
standcolcoord <- colcoord / sqrt(stand2)
SD.ca$colcoord
## Dim1 Dim2 Dim3 Dim4 Dim5 Dim6
## Y1960 1.6106625 -3.8490551 -1.0809061 -0.1725293 0.55209387 -0.6163723
## Y1965 1.9439873 0.3231438 1.5425491 1.8375923 -1.04979904 0.6491194
## Y1970 0.9299411 0.6599718 0.8557550 -1.4208929 1.21979719 -0.4725652
## Y1971 0.3975732 0.5451063 -1.4168200 -0.8028717 -0.68105651 1.4714772
## Y1972 -0.1056064 0.5981483 -0.7718786 0.2352183 -0.97394826 -1.9448011
## Y1973 -0.5886455 0.3057542 -0.6865726 1.4522227 1.61145599 0.2752951
## Y1974 -1.0136835 -0.2193574 0.9065753 -0.4103387 0.08366545 0.1000611
## Y1975 -1.2385154 -0.7834090 0.7197212 -0.1240095 -0.80909988 0.4055471
## Dim7
## Y1960 0.36101993
## Y1965 0.21263900
## Y1970 -0.67150608
## Y1971 0.19612414
## Y1972 0.04887571
## Y1973 -0.23983826
## Y1974 1.97494489
## Y1975 -1.57146779
## The chi-squared distances from the "center", where variables close to center do not deviate from the
# independence assumption:
# Row distances:
sqrt(rowSums(rowcoord^2))
## Engineering Mathematics Physics Chemistry EarthSciences
## 0.08952388 0.07049341 0.11028846 0.16957446 0.08827503
## Biology Agriculture Psychology Sociology Economics
## 0.01235856 0.10393910 0.13970921 0.12706869 0.06346696
## Anthropology Others
## 0.27380295 0.16970149
SD.ca$rowdist
## [1] 0.08952388 0.07049341 0.11028846 0.16957446 0.08827503 0.01235856
## [7] 0.10393910 0.13970921 0.12706869 0.06346696 0.27380295 0.16970149
# Column distances:
sqrt(rowSums(colcoord^2))
## Y1960 Y1965 Y1970 Y1971 Y1972 Y1973
## 0.26977207 0.19216368 0.10093051 0.05724386 0.04098747 0.06519598
## Y1974 Y1975
## 0.10049502 0.12861395
SD.ca$coldist
## [1] 0.26977207 0.19216368 0.10093051 0.05724386 0.04098747 0.06519598
## [7] 0.10049502 0.12861395
# Now, the inertia can be also stated as MASS times Chi-squared-distance^2.
# For rows:
round((rowSums(SD) / n) * sqrt(rowSums(rowcoord^2))^2,5)
## Engineering Mathematics Physics Chemistry EarthSciences
## 0.00142 0.00032 0.00103 0.00323 0.00024
## Biology Agriculture Psychology Sociology Economics
## 0.00003 0.00052 0.00241 0.00051 0.00019
## Anthropology Others
## 0.00115 0.00213
SD.ca
##
## Principal inertias (eigenvalues):
## 1 2 3 4 5 6 7
## Value 0.0093 0.003258 0.000294 0.000192 6.8e-05 4.2e-05 2.9e-05
## Percentage 70.55% 24.71% 2.23% 1.46% 0.52% 0.32% 0.22%
##
##
## Rows:
## Engineering Mathematics Physics Chemistry EarthSciences Biology
## Mass 0.177551 0.064750 0.084616 0.112379 0.031054 0.190670
## ChiDist 0.089524 0.070493 0.110288 0.169574 0.088275 0.012359
## Inertia 0.001423 0.000322 0.001029 0.003232 0.000242 0.000029
## Dim. 1 0.322439 -0.077507 1.011402 1.585369 0.425654 0.008109
## Dim. 2 1.446807 1.182347 0.805056 -1.233691 -1.115307 -0.150126
## Agriculture Psychology Sociology Economics Anthropology Others
## Mass 0.047965 0.123308 0.031664 0.046643 0.015308 0.074093
## ChiDist 0.103939 0.139709 0.127069 0.063467 0.273803 0.169701
## Inertia 0.000518 0.002407 0.000511 0.000188 0.001148 0.002134
## Dim. 1 0.570837 -1.321263 -1.254533 0.233745 -2.719310 -1.683658
## Dim. 2 -1.441849 -0.982594 -0.023063 -0.677752 -0.855932 0.487289
##
##
## Columns:
## Y1960 Y1965 Y1970 Y1971 Y1972 Y1973 Y1974
## Mass 0.048965 0.081911 0.138624 0.147607 0.148076 0.148139 0.143198
## ChiDist 0.269772 0.192164 0.100931 0.057244 0.040987 0.065196 0.100495
## Inertia 0.003564 0.003025 0.001412 0.000484 0.000249 0.000630 0.001446
## Dim. 1 1.610663 1.943987 0.929941 0.397573 -0.105606 -0.588645 -1.013683
## Dim. 2 -3.849055 0.323144 0.659972 0.545106 0.598148 0.305754 -0.219357
## Y1975
## Mass 0.143479
## ChiDist 0.128614
## Inertia 0.002373
## Dim. 1 -1.238515
## Dim. 2 -0.783409
4. Interpretation of a CA plot
SD.ca
##
## Principal inertias (eigenvalues):
## 1 2 3 4 5 6 7
## Value 0.0093 0.003258 0.000294 0.000192 6.8e-05 4.2e-05 2.9e-05
## Percentage 70.55% 24.71% 2.23% 1.46% 0.52% 0.32% 0.22%
##
##
## Rows:
## Engineering Mathematics Physics Chemistry EarthSciences Biology
## Mass 0.177551 0.064750 0.084616 0.112379 0.031054 0.190670
## ChiDist 0.089524 0.070493 0.110288 0.169574 0.088275 0.012359
## Inertia 0.001423 0.000322 0.001029 0.003232 0.000242 0.000029
## Dim. 1 0.322439 -0.077507 1.011402 1.585369 0.425654 0.008109
## Dim. 2 1.446807 1.182347 0.805056 -1.233691 -1.115307 -0.150126
## Agriculture Psychology Sociology Economics Anthropology Others
## Mass 0.047965 0.123308 0.031664 0.046643 0.015308 0.074093
## ChiDist 0.103939 0.139709 0.127069 0.063467 0.273803 0.169701
## Inertia 0.000518 0.002407 0.000511 0.000188 0.001148 0.002134
## Dim. 1 0.570837 -1.321263 -1.254533 0.233745 -2.719310 -1.683658
## Dim. 2 -1.441849 -0.982594 -0.023063 -0.677752 -0.855932 0.487289
##
##
## Columns:
## Y1960 Y1965 Y1970 Y1971 Y1972 Y1973 Y1974
## Mass 0.048965 0.081911 0.138624 0.147607 0.148076 0.148139 0.143198
## ChiDist 0.269772 0.192164 0.100931 0.057244 0.040987 0.065196 0.100495
## Inertia 0.003564 0.003025 0.001412 0.000484 0.000249 0.000630 0.001446
## Dim. 1 1.610663 1.943987 0.929941 0.397573 -0.105606 -0.588645 -1.013683
## Dim. 2 -3.849055 0.323144 0.659972 0.545106 0.598148 0.305754 -0.219357
## Y1975
## Mass 0.143479
## ChiDist 0.128614
## Inertia 0.002373
## Dim. 1 -1.238515
## Dim. 2 -0.783409
summary(SD.ca)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.009300 70.5 70.5 ******************
## 2 0.003258 24.7 95.3 ******
## 3 0.000294 2.2 97.5 *
## 4 0.000192 1.5 98.9
## 5 6.8e-050 0.5 99.5
## 6 4.2e-050 0.3 99.8
## 7 2.9e-050 0.2 100.0
## -------- -----
## Total: 0.013182 100.0
##
##
## Rows:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Engn | 178 971 108 | 31 121 18 | 83 851 372 |
## 2 | Mthm | 65 928 24 | -7 11 0 | 67 916 91 |
## 3 | Phys | 85 956 78 | 98 782 87 | 46 174 55 |
## 4 | Chms | 112 985 245 | 153 813 282 | -70 172 171 |
## 5 | ErtS | 31 736 18 | 41 216 6 | -64 520 39 |
## 6 | Blgy | 191 485 2 | 1 4 0 | -9 481 4 |
## 7 | Agrc | 48 907 39 | 55 281 16 | -82 627 100 |
## 8 | Psyc | 123 993 183 | -127 832 215 | -56 161 119 |
## 9 | Sclg | 32 907 39 | -121 906 50 | -1 0 0 |
## 10 | Ecnm | 47 498 14 | 23 126 3 | -39 371 21 |
## 11 | Anth | 15 949 87 | -262 917 113 | -49 32 11 |
## 12 | Othr | 74 942 162 | -162 915 210 | 28 27 18 |
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Y1960 | 49 995 270 | 155 332 127 | -220 663 725 |
## 2 | Y1965 | 82 961 229 | 187 952 310 | 18 9 9 |
## 3 | Y1970 | 139 929 107 | 90 789 120 | 38 139 60 |
## 4 | Y1971 | 148 744 37 | 38 449 23 | 31 295 44 |
## 5 | Y1972 | 148 755 19 | -10 62 2 | 34 694 53 |
## 6 | Y1973 | 148 830 48 | -57 758 51 | 17 72 14 |
## 7 | Y1974 | 143 962 110 | -98 946 147 | -13 16 7 |
## 8 | Y1975 | 143 983 180 | -119 862 220 | -45 121 88 |
# Note that the quantities are multiplied by 1000
# Quality of representation = as in the lecture slides, but here we consider the angle between profiles and
# the plane spanned by the two first principal components
# Squared correlations = quality of representation from lecture slides
# also, the sum of the squared correlations is the quality of representation.
# ctr = contribution in forming that ca-component (contributions sum to 1)
# important variables related to forming the specific component have a high ctr
# k=1 and k=2 are the coordinates on the plot
names(summary(SD.ca)$rows)
## [1] "name" "mass" " qlt" " inr" " k=1" "cor" "ctr" " k=2" "cor" "ctr"
# Contribution of engineering to the second axis
f2[1]*SD.ca$rowcoord[1,2]^2 #recall that these coordinates are already scaled
## [1] 0.3716586
# If the rows and columns were independent, ctr would be same for every variable
# Squared correlation of biology (row 6) with the second component: the
# squared coordinate on dimension 2 as a share of the sum of all squared
# coordinates of that row.
d2 <- rowcoord[6, ]^2
d2[2] / sum(d2)
## [1] 0.4806858
# Note that the following plots unscaled coordinates (principal coordinates)
# and hence, deduction based on the plot is questionable.
# TRUE is spelled out throughout because the shorthand T is an ordinary
# variable that can be reassigned.
plot(SD.ca, arrows = c(TRUE, TRUE), map = "symmetric")
#plot(SD.ca,arrows=c(T,T),map="symmetric",dim=c(2,4))
# Instead try e.g. the following commands
plot(SD.ca, arrows = c(TRUE, TRUE), map = "rowprincipal") #standard column coordinates
plot(SD.ca, arrows = c(TRUE, TRUE), map = "colprincipal")
plot(SD.ca, arrows = c(TRUE, TRUE), map = "rowgreen") #standard column coordinates scaled with square root of the column masses
plot(SD.ca, arrows = c(TRUE, TRUE), map = "colgab", dim = c(1, 2)) #standard row coordinates scaled with the row masses
plot(SD.ca, arrows = c(TRUE, TRUE), map = "rowgab")
# If two row-variables are close on the picture, they have a similar profile,
# the same is true for column-variables
# Distant row/column-variables have different profiles
# Variables distant from the origin represent variables different from the average profile
# these are usually the most interesting ones
# Now you can again try to interpret the dimensions.
# 1st dim splits the sciences into soft/hard
# 2nd dim splits the sciences into more formula heavy(math,physics,engineering) vs
# the more experimental ones (chemistry,agriculture,earthsciences)
# same for different years