3 DIMENSIONALITY REDUCTION IN R

Since we are not trying to predict the value of any target variable like in supervised approaches, the value of unsupervised machine learning can be to see how data can be separated based solely on the nature of their features. This is of major value, as we can include all of the data at once, and just see how it sorts! Unsupervised approaches are also useful for optimizing other machine learning algorithms.

Principal component analysis (PCA) is a powerful linear transformation technique used to explore patterns in data and highly correlated variables. It is useful for distilling variation across many variables onto a reduced feature space, such as a two-dimensional scatterplot.

3.1 Reclass variables

dplyr is essential for changing the classes of multiple features at once :^)

3.2 Load data

Reimport the heart disease dataset.

load("data/preprocessed.RData")

3.3 Scale numeric variables

vars_to_scale = c("age", "trestbps", "chol", "thalach", "oldpeak")
h = data_original %>% mutate_at(scale, .vars = vars(vars_to_scale))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(vars_to_scale)` instead of `vars_to_scale` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
h = h %>% mutate_at(as.numeric, .vars = vars(vars_to_scale))
head(h)
##          age sex cp    trestbps        chol fbs restecg     thalach exang
## 1  0.9506240   1  3  0.76269408 -0.25591036   1       0  0.01541728     0
## 2 -1.9121497   1  2 -0.09258463  0.07208025   0       1  1.63077374     0
## 3 -1.4717230   0  1 -0.09258463 -0.81542377   0       0  0.97589950     0
## 4  0.1798773   1  1 -0.66277043 -0.19802967   0       1  1.23784920     0
## 5  0.2899839   0  0 -0.66277043  2.07861109   0       1  0.58297496     1
## 6  0.2899839   1  0  0.47760118 -1.04694656   0       1 -0.07189928     0
##      oldpeak slope ca thal target
## 1  1.0855423     0  0    1      1
## 2  2.1190672     0  0    2      1
## 3  0.3103986     2  0    2      1
## 4 -0.2063639     2  0    2      1
## 5 -0.3786180     2  0    2      1
## 6 -0.5508722     1  0    1      1

3.4 Factorize categorical variables

# Quick rename target outcomes
h$target = ifelse(h$target == 1, "yes", "no")

vars_to_fac = c("sex", "cp", "fbs", "restecg", "exang", 
                "slope", "ca", "thal", "target")

h = h %>% mutate_at(as.factor, .vars = vars(vars_to_fac))
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(vars_to_fac)` instead of `vars_to_fac` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Awesome!
sapply(h, class)
##       age       sex        cp  trestbps      chol       fbs   restecg   thalach 
## "numeric"  "factor"  "factor" "numeric" "numeric"  "factor"  "factor" "numeric" 
##     exang   oldpeak     slope        ca      thal    target 
##  "factor" "numeric"  "factor"  "factor"  "factor"  "factor"
# Create subset of numeric-only data (along with h.target)
# Combine the scaled numeric data and the original target feature
ml_num = data.frame(subset(h, select = vars_to_scale), h$target)
head(ml_num)
##          age    trestbps        chol     thalach    oldpeak h.target
## 1  0.9506240  0.76269408 -0.25591036  0.01541728  1.0855423      yes
## 2 -1.9121497 -0.09258463  0.07208025  1.63077374  2.1190672      yes
## 3 -1.4717230 -0.09258463 -0.81542377  0.97589950  0.3103986      yes
## 4  0.1798773 -0.66277043 -0.19802967  1.23784920 -0.2063639      yes
## 5  0.2899839 -0.66277043  2.07861109  0.58297496 -0.3786180      yes
## 6  0.2899839  0.47760118 -1.04694656 -0.07189928 -0.5508722      yes

3.5 Fit model

split = splitmix(h)
X1 = split$X.quanti 
X2 = split$X.quali 
res.pcamix = PCAmix(X.quanti = X1, 
                    X.quali = X2,
                    rename.level = TRUE,
                    graph = TRUE)

# Stuff to unpack
names(res.pcamix)
##  [1] "call"         "eig"          "ind"          "quanti"       "levels"      
##  [6] "quali"        "sqload"       "coef"         "Z"            "M"           
## [11] "quanti.sup"   "levels.sup"   "sqload.sup"   "rec.sup"      "scores.stand"
## [16] "scores"       "V"            "A"            "categ.coord"  "quanti.cor"  
## [21] "quali.eta2"   "rec"          "ndim"         "W"            "rename.level"
res.pcamix$eig
##        Eigenvalue Proportion Cumulative
## dim 1   3.8787886  16.864298   16.86430
## dim 2   1.6879338   7.338843   24.20314
## dim 3   1.4908837   6.482103   30.68524
## dim 4   1.3312321   5.787966   36.47321
## dim 5   1.2157241   5.285757   41.75897
## dim 6   1.1179188   4.860517   46.61948
## dim 7   1.0869543   4.725888   51.34537
## dim 8   1.0522819   4.575138   55.92051
## dim 9   1.0343524   4.497184   60.41769
## dim 10  0.9604933   4.176058   64.59375
## dim 11  0.9299643   4.043323   68.63707
## dim 12  0.8961699   3.896391   72.53347
## dim 13  0.8255254   3.589241   76.12271
## dim 14  0.8045898   3.498217   79.62092
## dim 15  0.7339800   3.191217   82.81214
## dim 16  0.7237726   3.146838   85.95898
## dim 17  0.6862958   2.983895   88.94287
## dim 18  0.5911739   2.570321   91.51319
## dim 19  0.5082960   2.209983   93.72318
## dim 20  0.4234490   1.841083   95.56426
## dim 21  0.3820072   1.660901   97.22516
## dim 22  0.3484562   1.515027   98.74019
## dim 23  0.2897568   1.259812  100.00000

3.6 Screeplot

barplot(res.pcamix$eig[,2], 
        ylim = c(0, 20), las = 2)

3.7 ggplot coordinates

# ?plot.PCAmix

# Convert the coordinates to a dataframe, and add the original target column
pca1 = data.frame(res.pcamix$ind$coord, h$target)

ggplot(pca1, aes(x = dim.1, y = dim.2, color = h.target)) + 
  geom_point() + 
  theme_bw() +
  guides(color = guide_legend(title = "Has heart \n disease?")) + 
  ggtitle("PCA of heart disease") +
  xlab(paste("Dimension 1", paste0("(", 
                                   round(res.pcamix$eig[1, 2], 2), 
                                   "%", ")"))) + 
  ylab(paste("Dimension 2", paste0("(", 
                                   round(res.pcamix$eig[2, 2], 2), 
                                   "%", ")")))

3.8 View factor loadings

pca2 = data.frame(res.pcamix$sqload)
pca2
##               dim.1        dim.2       dim.3       dim.4       dim.5
## age      0.21337352 0.2477799674 0.041744496 0.031009752 0.034038015
## trestbps 0.06834390 0.2419102430 0.082816187 0.051728438 0.001589446
## chol     0.02082961 0.2066356059 0.152307150 0.043233560 0.049610585
## thalach  0.44360573 0.0002023463 0.047419425 0.058275497 0.046975232
## oldpeak  0.43930006 0.0193048486 0.082960245 0.086332841 0.025271527
## sex      0.05563966 0.3243727308 0.160043366 0.069228609 0.007629010
## cp       0.43389618 0.0855538209 0.231154112 0.046455629 0.376511477
## fbs      0.01052402 0.0676379198 0.110885644 0.133605806 0.178892828
## restecg  0.09350096 0.1355432393 0.002523635 0.325035267 0.007697222
## exang    0.35569689 0.0645725466 0.015438480 0.001024528 0.010754496
## slope    0.38393372 0.0324258579 0.318500790 0.183487912 0.019374687
## ca       0.31595887 0.1136921858 0.137120105 0.208372158 0.342211693
## thal     0.40934316 0.1045544271 0.104108768 0.073799024 0.112059111
## target   0.63484228 0.0437480745 0.003861254 0.019643099 0.003108742
# Dimension 1
ggplot(pca2, aes(x = reorder(rownames(pca2), -dim.1), y = dim.1)) + 
  geom_bar(stat = "identity") + 
  theme_bw() + ggtitle("Dimension 1 loadings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Dimension 2
ggplot(pca2, aes(x = reorder(rownames(pca2), -dim.2), y = dim.2)) + 
  geom_bar(stat = "identity") + 
  theme_bw() + ggtitle("Dimension 2 loadings") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))