4 PCA FOR MACHINE LEARNING IN R

Create a stratified 70/30 training/test split of the ml_num data.

# Set seed for reproducibility
set.seed(1)

# Create a stratified random split
training_rows = caret::createDataPartition(ml_num$h.target, 
                                           p = 0.70, list = FALSE) 

# Partition training dataset
train_x_class = ml_num[training_rows, ] 

# Partition test dataset
test_x_class = ml_num[-training_rows, ] 

dim(train_x_class)
## [1] 213   6
dim(test_x_class)
## [1] 90  6
table(train_x_class$h.target)
## 
##  no yes 
##  97 116
table(test_x_class$h.target)
## 
##  no yes 
##  41  49
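
Because createDataPartition() stratifies on h.target, both partitions keep roughly the same class balance (about 46% no / 54% yes in each). A quick check using the objects created above:

# Compare the class proportions of the two partitions
round(prop.table(table(train_x_class$h.target)), 2)
round(prop.table(table(test_x_class$h.target)), 2)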

4.1 Fit a PCA model to the numeric features of the training set

?prcomp
pca_ml = prcomp(subset(train_x_class, select = -h.target), 
                retx = TRUE, 
                center = FALSE, scale. = FALSE)
pca_ml
## Standard deviations (1, .., p=5):
## [1] 1.3433029 1.0702577 1.0024705 0.8699753 0.7086122
## 
## Rotation (n x k) = (5 x 5):
##                 PC1        PC2        PC3         PC4        PC5
## age       0.5518423  0.1059808  0.1358815  0.49852917  0.6459436
## trestbps  0.3295905  0.5611404 -0.7087114  0.05481389 -0.2668618
## chol      0.3127980  0.5539657  0.6586925 -0.30551601 -0.2608901
## thalach  -0.5286582  0.5219200 -0.0809096 -0.28283066  0.6013156
## oldpeak   0.4577315 -0.3075892 -0.1970845 -0.75837384  0.2861776
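
Setting center = FALSE and scale. = FALSE assumes the features in ml_num were already centered and scaled in an earlier step (the component standard deviations close to 1 are consistent with that). A minimal sanity check under that assumption:

# If the features were standardized upstream, column means should be
# near 0 and standard deviations near 1 (values will not be exact
# because we are looking at the training subset only)
colMeans(subset(train_x_class, select = -h.target))
apply(subset(train_x_class, select = -h.target), 2, sd)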
# View the proportion of variance explained by each component
summary(pca_ml)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.3433 1.0703 1.0025 0.8700 0.70861
## Proportion of Variance 0.3461 0.2197 0.1928 0.1452 0.09631
## Cumulative Proportion  0.3461 0.5658 0.7585 0.9037 1.00000
# Or compute the percentage of variance explained directly from the standard deviations
expl.var = round(pca_ml$sdev ^ 2 / sum(pca_ml$sdev ^ 2) * 100, 4) 
expl.var
## [1] 34.6090 21.9694 19.2746 14.5163  9.6307
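
The same information is often easier to read as a scree plot; a minimal base-R sketch using the expl.var vector computed above:

# Scree plot: % variance explained by each principal component
plot(expl.var, type = "b", xlab = "Principal component",
     ylab = "% variance explained", main = "Scree plot")

# Cumulative % variance explained, useful for choosing how many PCs to keep
plot(cumsum(expl.var), type = "b", xlab = "Number of components",
     ylab = "Cumulative % variance explained")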

4.2 Generate predicted PC scores for the test dataset

predicted_values = predict(pca_ml, 
                           newdata = subset(test_x_class, select = -h.target))
head(predicted_values)
##           PC1         PC2        PC3        PC4         PC5
## 1   1.1846594  0.06110625 -0.7951150 -0.2337026  0.79720858
## 3  -1.4715750 -0.24577784 -0.8116120 -1.0010592 -0.03755396
## 7   0.5700275  0.80468713  0.2365695 -0.3770120 -0.09933266
## 8  -1.6961573  0.49353662  0.6212806 -0.3133308 -0.28781615
## 11 -0.0844052  0.37950489 -0.5000747 -0.1837164  0.19438368
## 19 -0.6521884  0.82777404 -1.0568587 -1.1352000 -0.41767546
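
Because the test scores live in the same PC space as the training scores, the components can be used directly as features for a supervised model. The sketch below is not part of the original workflow: it fits a logistic regression on the first two PCs and computes test-set accuracy, assuming h.target is a two-level factor (no/yes); the object names pc_train, pc_test, and fit_glm are illustrative.

# Hypothetical downstream model: predict h.target from the first two PCs
pc_train = data.frame(pca_ml$x[, 1:2], h.target = train_x_class$h.target)
pc_test  = data.frame(predicted_values[, 1:2], h.target = test_x_class$h.target)

fit_glm = glm(h.target ~ PC1 + PC2, data = pc_train, family = binomial)

# Predicted probabilities of "yes" on the test set
pred_prob  = predict(fit_glm, newdata = pc_test, type = "response")
pred_class = ifelse(pred_prob > 0.5, "yes", "no")

# Simple test-set accuracy
mean(pred_class == pc_test$h.target)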

4.3 Define plotting parameters

# Assign one color to each condition
target_colors = 1:2

# Assign one shape for the training data and another shape for the test data
target_shapes = c(0, 16)
# 0 = open squares (training data)
# 16 = filled circles (test data)

# Which PCs to plot?
target_PCs = 1:2

4.4 Store the PC scores in data frames

# Store the scores for the chosen PCs in data frames, as before
gg_train = data.frame(pca_ml$x[, target_PCs])
head(gg_train)
##           PC1         PC2
## 2  -0.9553317 -0.01534275
## 4  -0.9299807  0.24698821
## 5   0.1102680  1.23102981
## 6  -0.2241860 -0.14932181
## 9  -0.1680809  1.18351579
## 10 -0.3081090  0.18865942
gg_test = data.frame(predicted_values[, target_PCs])
head(gg_test)
##           PC1         PC2
## 1   1.1846594  0.06110625
## 3  -1.4715750 -0.24577784
## 7   0.5700275  0.80468713
## 8  -1.6961573  0.49353662
## 11 -0.0844052  0.37950489
## 19 -0.6521884  0.82777404
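
If you prefer to keep the class labels alongside the scores, rather than pulling them from train_x_class and test_x_class inside the plot call below, they can optionally be added as columns:

# Optional: attach the class labels to the score data frames
gg_train$h.target = train_x_class$h.target
gg_test$h.target  = test_x_class$h.target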

4.5 Visualize

We can plot the training and test data on the same plot, since the test scores were projected into the PC space learned from the training data.

ggplot(
  # training data
  gg_train, aes(x = PC1, y = PC2, 
                color = train_x_class$h.target)) + 
  geom_point(shape = 0, alpha = 0.5, stroke = 1, size = 3) + 
  stat_ellipse(show.legend = FALSE, lwd = 0.5) + 
  labs(color = "Has heart disease?", 
       caption = "Squares = training data\nCircles = test data\nEllipses are 95% confidence ellipses for training data") + 
  xlab("Dimension 1") + 
  ylab("Dimension 2") + 
  xlim(c(-4, 4)) + 
  ylim(c(-4, 4)) +
  theme_bw() + 
  
  # test data (size and alpha are constants, so they go outside of aes)
  geom_point(data = gg_test, 
             mapping = aes(x = PC1, y = PC2, 
                           color = test_x_class$h.target), 
             size = 3, alpha = 0.75) + 
  theme(legend.position = "top") + 
  ggtitle("Heart disease training/test data") + 
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10))
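
To save the figure to disk, ggsave() writes the most recently displayed plot; the file path and dimensions below are just examples:

# Save the last plot (example path and size)
ggsave("figures/pca_train_test.png", width = 6, height = 5, dpi = 300)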

4.6 Save ml_num

# Save ml_num to disk as an .RData file
save(ml_num, file = "data/unsupervised.RData")
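
In a later session, load() restores ml_num to the global environment:

# Reload the saved object
load("data/unsupervised.RData")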