# install packages
if (!("umap" %in% installed.packages())) {
install.packages("umap")
}
if (!("preprocessCore" %in% installed.packages())) {
BiocManager::install("preprocessCore", update = FALSE)
}
if (!("sva" %in% installed.packages())) {
BiocManager::install("sva", update = FALSE)
}
# load packages
library(ggplot2)
Registered S3 method overwritten by 'dplyr':
method from
print.rowwise_df
Need help? Try Stackoverflow: https://stackoverflow.com/tags/ggplot2.
# magrittr pipe
`%>%` <- dplyr::`%>%`
# set seed
set.seed(2019)
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
output_dir <- file.path(root_dir,
"analyses",
"selection-strategy-comparison",
"plots")
# Create directory to hold the output.
if (!dir.exists(output_dir)) {
dir.create(output_dir)
}
#' Plot of a umap model
#'
#' The umap_plot function takes a umap model and associated metadata and
#' returns a ggplot object that plots the first two UMAP components,
#' colored by disease type and with shapes determined by selection method.
#' Optionally writes the plot to a file.
#'
#' @param umap_model A umap model object output from [umap::umap]
#' @param metadata A data frame of metadata for the samples. Must contain a
#' column of `disease_comparable` and `RNA_library` as in the
#' "pbta-histologies.tsv" file.
#' @param sample_id The name of the column containing sample ids. Defaults to
#' "Kids_First_Biospecimen_ID", as in the "pbta-histologies.tsv" file.
#' @param comparable Boolean to determine whether to include only diseases
#' present in both strategies in plots. Requires the presence of a
#' `comparable` column in the metadata function.
#' @param filename File for writinge the output plot. Default `NA` will not
#' write to a file, but still returns the plot.
#'
#' @return A ggplot object containing a plot of the first two UMAP components
umap_plot <- function(umap_model,
metadata,
sample_id = "Kids_First_Biospecimen_ID",
filter_comparable = TRUE,
filename = NA){
layout <- data.frame(umap_model$layout) %>%
dplyr::rename_all(.funs = gsub, pattern = "^X", replacement = "UMAP") %>%
tibble::rownames_to_column(var = sample_id)
plot_data <- layout %>%
dplyr::left_join(metadata, by = sample_id)
if (filter_comparable) {
plot_data <- plot_data %>% dplyr::filter(comparable)
}
u_plot <- ggplot(plot_data,
aes(x = UMAP1,
y = UMAP2,
color = disease_comparable,
shape = RNA_library)) +
geom_point(alpha = 0.3) +
labs(color = "Disease Type",
shape = "Selection Method")
if (is.character(filename)) {
ggsave(filename,
u_plot,
width = 11,
height = 7)
}
return(u_plot)
}
For now we will read in only the rsem data; we may examine the kallisto data later.
exp_rsem_polyA <- readRDS(file.path(root_dir,
"data",
"pbta-gene-expression-rsem-fpkm.polya.rds"))
exp_rsem_stranded <- readRDS(file.path(root_dir,
"data",
"pbta-gene-expression-rsem-fpkm.stranded.rds"))
# join polyA and stranded data together
exp_rsem <- dplyr::bind_cols(exp_rsem_polyA,
exp_rsem_stranded[,-1]) %>%
dplyr::filter(complete.cases(.))
# transpose the expression values to a matrix
exp_rsem_t <- t(exp_rsem[,-1])
colnames(exp_rsem_t) <- exp_rsem[[1]]
metadata_df <- readr::read_tsv(file.path(root_dir,
"data",
"pbta-histologies.tsv")) %>%
dplyr::right_join(dplyr::tibble(Kids_First_Biospecimen_ID = rownames(exp_rsem_t)),
by = "Kids_First_Biospecimen_ID")
Parsed with column specification:
cols(
.default = col_character(),
age_at_diagnosis_days = [32mcol_double()[39m,
OS_days = [32mcol_double()[39m
)
See spec(...) for full column specifications.
Looking at metadata first, separating out the poly-A and stranded samples.
polyA_meta_df <- metadata_df %>%
dplyr::filter(RNA_library == "poly-A")
stranded_meta_df <- metadata_df %>%
dplyr::filter(RNA_library == "stranded")
There are 58 samples and 970 stranded samples, so the chances of much meaningful comparison seem slim. But just to compare the kinds of samples between the two:
polyA_counts <- polyA_meta_df %>%
dplyr::group_by(disease_type_new) %>%
dplyr::summarise(polyA_n = dplyr::n())
polyA_diseases <- polyA_counts$disease_type_new
stranded_counts <- stranded_meta_df %>%
dplyr::group_by(disease_type_new) %>%
dplyr::summarise(stranded_n = dplyr::n())
strategy_counts <- dplyr::left_join(polyA_counts, stranded_counts)
Joining, by = "disease_type_new"
strategy_counts
NA
Looks like there is possible comparison among the gliomas, if those might cluster together on some scale. Focus on those first to see how they compare in analyses.
First, add a flag for comparable datasets (those disease types with both poly-A and stranded samples) to the metadata table for later use.
metadata_df <- metadata_df %>%
dplyr::mutate(comparable = disease_type_new %in% polyA_diseases,
disease_comparable = ifelse(comparable, disease_type_new, "other"))
To start, we will reproduce an naive dimensionality reduction analysis to show the problem of selection method that we are facing. Since UMAP seems to work well overall, that is what we will focus on first.
# set neighbors parameter
neighbors <- params$neighbors
# remove low counts
dm_set <- exp_rsem_t[, colSums(exp_rsem_t) > 100]
rsem_umap_raw <- umap::umap(dm_set, n_neighbors = neighbors)
plot_raw <- umap_plot(rsem_umap_raw,
metadata_df,
filename = file.path(output_dir,
"umap_raw.png"))
plot_raw
So even with more dimensions, we are not surprised to see that the poly-A samples are still clustering together.
As a first pass, lets see what happens when we remove genes are simply not expressed in the poly-A samples (or vice versa?).
polyA_samples <- rownames(exp_rsem_t) %in% polyA_meta_df$Kids_First_Biospecimen_ID
polyA_exp <- exp_rsem_t[polyA_samples, ]
# get genes expressed at all in all samples
polyA_gene_filter <- colSums(polyA_exp <= 0) == 0
polyA_expressed <- exp_rsem_t[, polyA_gene_filter]
ncol(polyA_expressed)
[1] 17521
rsem_umap_polyA <- umap::umap(polyA_expressed, n_neighbors = neighbors)
plot_polyA <- umap_plot(rsem_umap_polyA,
metadata_df,
filename = file.path(output_dir,
"umap_polyA.png"))
plot_polyA
Since simple elimination of samples doesn’t work, lets see what happens with some simple normalization. Using the poly-A expressed data set as the base.
First, just trying a simple ranking of genes by expression per sample.
polyA_expressed_rank <- t(apply(polyA_expressed, 1, rank))
rsem_umap_polyA_rank <- umap::umap(polyA_expressed_rank, n_neighbors = neighbors)
plot_polyA_rank <- umap_plot(rsem_umap_polyA_rank,
metadata_df,
filename = file.path(output_dir,
"umap_polyA_ranks.png"))
plot_polyA_rank
This substantially improves clustering by cancer type, but poly-A is still a quite divergent cluster.
An alternative to the simple ranking is to do quantile normalization, which might have different results.
polyA_expressed_norm <- t(preprocessCore::normalize.quantiles(t(polyA_expressed)))
# normalize.quantiles strips names
rownames(polyA_expressed_norm) <- rownames(polyA_expressed)
rsem_umap_polyA_norm <- umap::umap(polyA_expressed_norm, n_neighbors = neighbors)
plot_polyA_norm <- umap_plot(rsem_umap_polyA_norm,
metadata_df,
filename = file.path(output_dir,
"umap_polyA_normalized.png"))
plot_polyA_norm
That is surprisingly worse than the raw ranks. I would have expected similar results to the ranks, but this actually seems to perform somewhat worse, especially in differentiating different disease types. Either way though the poly-A and stranded samples are well separated in any component.
One more principled effort would be to see if we can apply ComBat and get anything reasonable.
For the combat model we will create a model matrix with our factors of interest, which in this case will be disease type. We will go with the parametric adjustment first, to see if that is effective at all. If wanted, we could try non-parametric, but that takes much longer.
combat_model <- model.matrix(~as.factor(disease_type_new),
data = metadata_df)
combat_exp <- sva::ComBat(dat = t(polyA_expressed),
batch = metadata_df$RNA_library,
mod = combat_model,
par.prior = TRUE) # parametric adjustment
Found2batches
Adjusting for70covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
rsem_umap_polyA_combat <- umap::umap(t(combat_exp), n_neighbors = neighbors)
plot_polyA_combat <- umap_plot(rsem_umap_polyA_combat,
metadata_df,
filename = file.path(output_dir,
"umap_polyA_combat.png"))
plot_polyA_combat
This seems to be more effective than anything else, in that it pulls the astrocytomas together (though the poly-A remain far more mightly clustered), but it still leaves the poly-A DIPG samples quite distinct.
While with a (much) larger data set, and perhaps more balanced arrangements among the tumor types, it might be possible to implement a principled correction for different library selection methods, it seems unlikely that this will be done with the current data set. It is possible that some specific analyses may be salvageable with ComBat or a similar method, but the overall recommendation is likely to be to do most of the expression analysis treating the stranded and poly-A selected libraries separately, and not attempt to draw conclusions about comparisons between these two sets.
It would be ideal if there were sufficient additional tissue and resources to perform stranded RNAseq on the poly-A selected samples, but note that we might still be concerned about batch effects. Indeed, batch effects are potentially a larger problem in general. Information on the technical processing of the samples (including dates, kits, Illumina model, etc.) should be included in the metadata to allow examination and/or mitigation of some of these more subtle effects.
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_3.2.0
loaded via a namespace (and not attached):
[1] reticulate_1.12 genefilter_1.68.0 tidyselect_0.2.5
[4] xfun_0.8 purrr_0.3.2 splines_3.6.0
[7] lattice_0.20-38 colorspace_1.4-1 htmltools_0.3.6
[10] stats4_3.6.0 mgcv_1.8-28 yaml_2.2.0
[13] base64enc_0.1-3 survival_2.44-1.1 blob_1.1.1
[16] XML_3.98-1.20 rlang_0.4.0 pillar_1.4.2
[19] glue_1.3.1 withr_2.1.2 DBI_1.0.0
[22] BiocParallel_1.20.0 bit64_0.9-7 BiocGenerics_0.32.0
[25] matrixStats_0.55.0 sva_3.34.0 umap_0.2.2.0
[28] munsell_0.5.0 gtable_0.3.0 memoise_1.1.0
[31] evaluate_0.14 labeling_0.3 Biobase_2.46.0
[34] knitr_1.23 IRanges_2.20.1 parallel_3.6.0
[37] AnnotationDbi_1.48.0 preprocessCore_1.48.0 Rcpp_1.0.1
[40] xtable_1.8-4 readr_1.3.1 backports_1.1.4
[43] scales_1.0.0 limma_3.42.0 S4Vectors_0.24.1
[46] jsonlite_1.6 annotate_1.64.0 bit_1.1-14
[49] RSpectra_0.15-0 hms_0.4.2 digest_0.6.20
[52] dplyr_0.8.3 grid_3.6.0 rprojroot_1.3-2
[55] tools_3.6.0 bitops_1.0-6 magrittr_1.5
[58] lazyeval_0.2.2 RCurl_1.95-4.12 tibble_2.1.3
[61] RSQLite_2.1.1 crayon_1.3.4 pkgconfig_2.0.2
[64] Matrix_1.2-17 assertthat_0.2.1 rmarkdown_1.13
[67] rstudioapi_0.10 R6_2.4.0 nlme_3.1-140
[70] compiler_3.6.0