Display system information for reproducibility.
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS 15.3.1
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/Los_Angeles
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] htmlwidgets_1.6.4 compiler_4.4.1 fastmap_1.2.0 cli_3.6.4
[5] tools_4.4.1 htmltools_0.5.8.1 rstudioapi_0.17.1 yaml_2.3.10
[9] rmarkdown_2.29 knitr_1.49 jsonlite_1.9.1 xfun_0.51
[13] digest_0.6.37 rlang_1.1.5 evaluate_1.0.3
library(tidyverse)
library(tidymodels)
# install.packages("tidytext")
library(tidytext)
Gene expression data
The goal is to evaluate whether gene expression can separate three disease types.
expression <- read_csv("expression.csv")
PCA
pca_rec <- recipe(~., data = expression) %>%
update_role(ID, disease, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
pca_prep <- prep(pca_rec)
pca_prep
library(tidytext)
tidied_pca <- tidy(pca_prep, 2)
tidied_pca %>%
filter(component %in% paste0("PC", 1:4)) %>%
group_by(component) %>%
top_n(8, abs(value)) %>%
ungroup() %>%
mutate(terms = reorder_within(terms, abs(value), component)) %>%
ggplot(aes(abs(value), terms, fill = value > 0)) +
geom_col() +
facet_wrap(~component, scales = "free_y") +
scale_y_reordered() +
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
)
juice(pca_prep) %>%
ggplot(aes(PC1, PC2, label = ID)) +
geom_point(aes(color = disease), alpha = 0.7, size = 2) +
#geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)
UMAP
library(embed)
umap_rec <- recipe(~., data = expression) %>%
update_role(ID, disease, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_umap(all_predictors())
umap_prep <- prep(umap_rec)
umap_prep
juice(umap_prep) %>%
ggplot(aes(UMAP1, UMAP2, label = ID)) +
geom_point(aes(color = disease), alpha = 0.7, size = 2) +
# geom_text(check_overlap = TRUE, hjust = "inward") +
labs(color = NULL)