PCA and UMAP Analysis Workflow

Biostat 212A

Author

Dr. Jin Zhou @ UCLA

Published

March 9, 2025

Display system information for reproducibility.

sessionInfo()

R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS 15.3.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] htmlwidgets_1.6.4 compiler_4.4.1    fastmap_1.2.0     cli_3.6.4        
 [5] tools_4.4.1       htmltools_0.5.8.1 rstudioapi_0.17.1 yaml_2.3.10      
 [9] rmarkdown_2.29    knitr_1.49        jsonlite_1.9.1    xfun_0.51        
[13] digest_0.6.37     rlang_1.1.5       evaluate_1.0.3

library(tidyverse)
library(tidymodels)
# install.packages("tidytext")
library(tidytext)

1 Gene expression data

The goal is to evaluate whether gene expression can separate three disease types.

expression <- read_csv("expression.csv")

2 PCA

pca_rec <- recipe(~., data = expression) %>%
  update_role(ID, disease, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())
pca_prep <- prep(pca_rec)
pca_prep

library(tidytext)
tidied_pca <- tidy(pca_prep, 2)


tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  top_n(8, abs(value)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  )

juice(pca_prep) %>%
  ggplot(aes(PC1, PC2, label = ID)) +
  geom_point(aes(color = disease), alpha = 0.7, size = 2) +
  #geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)

3 UMAP

library(embed)
umap_rec <- recipe(~., data = expression) %>%
  update_role(ID, disease, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())
umap_prep <- prep(umap_rec)
umap_prep

juice(umap_prep) %>%
  ggplot(aes(UMAP1, UMAP2, label = ID)) +
  geom_point(aes(color = disease), alpha = 0.7, size = 2) +
#  geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)