4 PCA FOR MACHINE LEARNING IN R

Create a stratified 70/30 training/test split of the ml_num data.

# Set seed for reproducibility
set.seed(1)

# Create a stratified random split
training_rows = caret::createDataPartition(ml_num$h.target, 
                                           p = 0.70, list = FALSE) 

# Partition training dataset
train_x_class = ml_num[training_rows, ] 

# Partition test dataset
test_x_class = ml_num[-training_rows, ] 

dim(train_x_class)
## [1] 213   6
dim(test_x_class)
## [1] 90  6
table(train_x_class$h.target)
## 
##  no yes 
##  97 116
table(test_x_class$h.target)
## 
##  no yes 
##  41  49
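
Because createDataPartition() stratifies on h.target, both partitions keep roughly the same class balance (about 46% no / 54% yes in each). A quick check using the objects created above:

# Compare the class proportions of the two partitions
round(prop.table(table(train_x_class$h.target)), 2)
round(prop.table(table(test_x_class$h.target)), 2)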

4.1 Fit a PCA model to the numeric features of the training set

?prcomp
pca_ml = prcomp(subset(train_x_class, select = -h.target), 
                retx = TRUE, 
                center = FALSE, scale. = FALSE)
pca_ml
## Standard deviations (1, .., p=5):
## [1] 1.3433029 1.0702577 1.0024705 0.8699753 0.7086122
## 
## Rotation (n x k) = (5 x 5):
##                 PC1        PC2        PC3         PC4        PC5
## age       0.5518423  0.1059808  0.1358815  0.49852917  0.6459436
## trestbps  0.3295905  0.5611404 -0.7087114  0.05481389 -0.2668618
## chol      0.3127980  0.5539657  0.6586925 -0.30551601 -0.2608901
## thalach  -0.5286582  0.5219200 -0.0809096 -0.28283066  0.6013156
## oldpeak   0.4577315 -0.3075892 -0.1970845 -0.75837384  0.2861776
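
Setting center = FALSE and scale. = FALSE assumes the features in ml_num were already centered and scaled in an earlier step (the component standard deviations close to 1 are consistent with that). A minimal sanity check under that assumption:

# If the features were standardized upstream, column means should be
# near 0 and standard deviations near 1 (values will not be exact
# because we are looking at the training subset only)
colMeans(subset(train_x_class, select = -h.target))
apply(subset(train_x_class, select = -h.target), 2, sd)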
# View the proportion of variance explained by each component
summary(pca_ml)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5
## Standard deviation     1.3433 1.0703 1.0025 0.8700 0.70861
## Proportion of Variance 0.3461 0.2197 0.1928 0.1452 0.09631
## Cumulative Proportion  0.3461 0.5658 0.7585 0.9037 1.00000
# Or compute the percentage of variance explained directly from the standard deviations
expl.var = round(pca_ml$sdev ^ 2 / sum(pca_ml$sdev ^ 2) * 100, 4) 
expl.var
## [1] 34.6090 21.9694 19.2746 14.5163  9.6307
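
The same information is often easier to read as a scree plot; a minimal base-R sketch using the expl.var vector computed above:

# Scree plot: % variance explained by each principal component
plot(expl.var, type = "b", xlab = "Principal component",
     ylab = "% variance explained", main = "Scree plot")

# Cumulative % variance explained, useful for choosing how many PCs to keep
plot(cumsum(expl.var), type = "b", xlab = "Number of components",
     ylab = "Cumulative % variance explained")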

4.2 Generate predicted PC scores for the test dataset

predicted_values = predict(pca_ml, 
                           newdata = subset(test_x_class, select = -h.target))
head(predicted_values)
##           PC1         PC2        PC3        PC4         PC5
## 1   1.1846594  0.06110625 -0.7951150 -0.2337026  0.79720858
## 3  -1.4715750 -0.24577784 -0.8116120 -1.0010592 -0.03755396
## 7   0.5700275  0.80468713  0.2365695 -0.3770120 -0.09933266
## 8  -1.6961573  0.49353662  0.6212806 -0.3133308 -0.28781615
## 11 -0.0844052  0.37950489 -0.5000747 -0.1837164  0.19438368
## 19 -0.6521884  0.82777404 -1.0568587 -1.1352000 -0.41767546
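
Because the test scores live in the same PC space as the training scores, the components can be used directly as features for a supervised model. The sketch below is not part of the original workflow: it fits a logistic regression on the first two PCs and computes test-set accuracy, assuming h.target is a two-level factor (no/yes); the object names pc_train, pc_test, and fit_glm are illustrative.

# Hypothetical downstream model: predict h.target from the first two PCs
pc_train = data.frame(pca_ml$x[, 1:2], h.target = train_x_class$h.target)
pc_test  = data.frame(predicted_values[, 1:2], h.target = test_x_class$h.target)

fit_glm = glm(h.target ~ PC1 + PC2, data = pc_train, family = binomial)

# Predicted probabilities of "yes" on the test set
pred_prob  = predict(fit_glm, newdata = pc_test, type = "response")
pred_class = ifelse(pred_prob > 0.5, "yes", "no")

# Simple test-set accuracy
mean(pred_class == pc_test$h.target)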

4.3 Define plotting parameters

# Assign one color to each condition
target_colors = 1:2

# Assign one shape for the training data and another shape for the test data
target_shapes = c(0, 16)
# 0 = open squares (training data)
# 16 = filled circles (test data)

# Which PCs to plot?
target_PCs = 1:2

4.4 Store the PC scores in data frames

# Store the scores for the chosen PCs in data frames, as before
gg_train = data.frame(pca_ml$x[, target_PCs])
head(gg_train)
##           PC1         PC2
## 2  -0.9553317 -0.01534275
## 4  -0.9299807  0.24698821
## 5   0.1102680  1.23102981
## 6  -0.2241860 -0.14932181
## 9  -0.1680809  1.18351579
## 10 -0.3081090  0.18865942
gg_test = data.frame(predicted_values[, target_PCs])
head(gg_test)
##           PC1         PC2
## 1   1.1846594  0.06110625
## 3  -1.4715750 -0.24577784
## 7   0.5700275  0.80468713
## 8  -1.6961573  0.49353662
## 11 -0.0844052  0.37950489
## 19 -0.6521884  0.82777404
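
If you prefer to keep the class labels alongside the scores, rather than pulling them from train_x_class and test_x_class inside the plot call below, they can optionally be added as columns:

# Optional: attach the class labels to the score data frames
gg_train$h.target = train_x_class$h.target
gg_test$h.target  = test_x_class$h.target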

4.5 Visualize

We can plot the training and test data on the same plot, since the test scores were projected into the PC space learned from the training data.

ggplot(
  # training data
  gg_train, aes(x = PC1, y = PC2, 
                color = train_x_class$h.target)) + 
  geom_point(shape = 0, alpha = 0.5, stroke = 1, size = 3) + 
  stat_ellipse(show.legend = FALSE, lwd = 0.5) + 
  labs(color = "Has heart disease?", 
       caption = "Squares = training data\nCircles = test data\nEllipses are 95% confidence ellipses for training data") + 
  xlab("Dimension 1") + 
  ylab("Dimension 2") + 
  xlim(c(-4, 4)) + 
  ylim(c(-4, 4)) +
  theme_bw() + 
  
  # test data (size and alpha are constants, so they go outside of aes)
  geom_point(data = gg_test, 
             mapping = aes(x = PC1, y = PC2, 
                           color = test_x_class$h.target), 
             size = 3, alpha = 0.75) + 
  theme(legend.position = "top") + 
  ggtitle("Heart disease training/test data") + 
  theme(plot.title = element_text(hjust = 0.5, size = 10),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10))
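
To save the figure to disk, ggsave() writes the most recently displayed plot; the file path and dimensions below are just examples:

# Save the last plot (example path and size)
ggsave("figures/pca_train_test.png", width = 6, height = 5, dpi = 300)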

4.6 Save ml_num

# Save ml_num to disk as an .RData file
save(ml_num, file = "data/unsupervised.RData")
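
In a later session, load() restores ml_num to the global environment:

# Reload the saved object
load("data/unsupervised.RData")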