If we want to use PLIER
, we need to construct prior information matrices for zebrafish. We’ll do that using two sources: WikiPathways and ZFIN
`%>%` <- dplyr::`%>%`
r
r # BiocManager::install(c(, , .Dr.eg.db), # update = FALSE) # install.packages() library(rWikiPathways) library(XML) library(org.Dr.eg.db)
Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap, parApply,
parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colMeans, colnames, colSums, dirname, do.call,
duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
rowMeans, rownames, rowSums, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which,
which.max, which.min
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see
'citation(\Biobase\)', and for packages 'citation(\pkgname\)'.
Loading required package: IRanges
Loading required package: S4Vectors
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:base’:
expand.grid
r
r pathway_dir <- file.path(, ) dir.create(pathway_dir, showWarnings = FALSE, recursive = TRUE)
r
r # download current version of WikiPathway pathways # this is CC0 downloadPathwayArchive(organism = rerio, format = , destpath = pathway_dir, date = 0190110)
trying URL 'http://data.wikipathways.org/20190110/gmt/wikipathways-20190110-gmt-Danio_rerio.gmt'
Content type 'unknown' length 40448 bytes (39 KB)
==================================================
downloaded 39 KB
[1] \wikipathways-20190110-gmt-Danio_rerio.gmt\
Read the .gmt
file in and convert to binary matrix
r
r zebrafish_wp_list <- GSA::GSA.read.gmt(file.path(pathway_dir, -20190110-gmt-Danio_rerio.gmt))
Read 83 records
Read 4510 items
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182831
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
Clean up the list to make it more amenable to the binary prior information matrix we’ll need for use with PLIER.
r
r names(zebrafish_wp_list\(genesets) <- make.names( # remove hyphens gsub(\ \, \_\, # convert spaces to underscores gsub(\%WikiPathways_20190110%\, \ \, # remove date, WP information gsub(\%Danio rerio\, \\, # remove species information zebrafish_wp_list\)geneset.names)) ) ) # remove pathway names and descriptions zebrafish_wp_list <- zebrafish_wp_list$genesets
We’re going to use ZFIN identifiers instead of Entrez IDs, which are what are used in WikiPathways.
r
r # our goal is to get a binary matrix that represents gene-pathway relationships # to do this, we first need to melt the list into a data.frame zebrafish_wp_binary_df <- reshape2::melt(zebrafish_wp_list) %>% # get sensible column names dplyr::mutate(Gene = value, Pathway = L1) %>% dplyr::select(Gene:Pathway) %>% # conversion from Entrez IDs to ZFIN IDs with mapIds – this should be 1:1 dplyr::mutate(ZFIN = mapIds(org.Dr.eg.db, keys = as.character(Gene), keytype = , column = )) %>% # drop Entrez IDs dplyr::select(-Gene) %>% # drop records missing ZFIN ids dplyr::filter(!is.na(ZFIN)) %>% # unique records only, just in case dplyr::distinct() %>% # get binary matrix, where the first column will be gene and the columns are # pathways reshape2::dcast(ZFIN ~ Pathway, fun.aggregate = length)
'select()' returned 1:1 mapping between keys and columns
Using ZFIN as value column: use value.var to override.
r
r # now a more PLIER friendly format zebrafish_wp_binary_mat <- zebrafish_wp_binary_df %>% tibble::column_to_rownames() %>% as.matrix() saveRDS(zebrafish_wp_binary_mat, file.path(pathway_dir, _rerio_wikipathways_prior_info.RDS))
Given the licensing of the ZFIN resource and the fact that downloading via the web interface has the advantage of including the archive date in the file name/header and including the column names, we’ve obtained these files via the downloads webpage using the date that is currently the most recent archive (2019-01-25
).
r
r wildtype_expression_file <- file.path(pathway_dir, -expression_fish_2019.01.25.txt) # we’re only going to use the in situ data wildtype_expression_df <- readr::read_tsv(wildtype_expression_file, skip = 1) %>% dplyr::filter(Assay == in situ hybridization) %>% dplyr::select(-X15)
Missing column names filled in: 'X15' [15]Parsed with column specification:
cols(
`Gene ID` = col_character(),
`Gene Symbol` = col_character(),
`Fish Name` = col_character(),
`Super Structure ID` = col_character(),
`Super Structure Name` = col_character(),
`Sub Structure ID` = col_character(),
`Sub Structure Name` = col_character(),
`Start Stage` = col_character(),
`End Stage` = col_character(),
Assay = col_character(),
`Publication ID` = col_character(),
`Probe ID` = col_character(),
`Antibody ID` = col_character(),
`Fish ID` = col_character(),
X15 = col_character()
)
number of columns of result is not a multiple of vector length (arg 1)201048 parsing failures.
row [38;5;246m# A tibble: 5 x 5[39m col row col expected actual file expected [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m actual [38;5;250m1[39m 1 [31mNA[39m 15 columns 14 columns 'data/pathways/wildtype-expression_fish_2019.01.25.txt' file [38;5;250m2[39m 2 [31mNA[39m 15 columns 14 columns 'data/pathways/wildtype-expression_fish_2019.01.25.txt' row [38;5;250m3[39m 3 [31mNA[39m 15 columns 14 columns 'data/pathways/wildtype-expression_fish_2019.01.25.txt' col [38;5;250m4[39m 4 [31mNA[39m 15 columns 14 columns 'data/pathways/wildtype-expression_fish_2019.01.25.txt' expected [38;5;250m5[39m 5 [31mNA[39m 15 columns 14 columns 'data/pathways/wildtype-expression_fish_2019.01.25.txt'
... ................................. ... ........................................................................................... ........ ........................................................................................................................................................................................................................ ...... ..................................................................................................................... .... ..................................................................................................................... ... ..................................................................................................................... ... ..................................................................................................................... ........ .....................................................................................................................
See problems(...) for more details.
We’re hoping to get two gene sets from this ZFIN information: 1) anatomical structure and 2) developmental stage. Parsing this by stage-tissue pairs ends up being extremely sparse, so we can’t approach it that way and get anything useful.
r
r stage_file <- file.path(pathway_dir, _ontology_2019.01.25.txt) stage_df <- readr::read_tsv(stage_file, skip = 1) %>% # drop column associated with parsing error dplyr::select(-X6)
Missing column names filled in: 'X6' [6]Parsed with column specification:
cols(
`Stage ID` = col_character(),
`Stage OBO ID` = col_character(),
`Stage Name` = col_character(),
`Begin Hours` = col_double(),
`End Hours` = col_double(),
X6 = col_character()
)
number of columns of result is not a multiple of vector length (arg 1)45 parsing failures.
row [38;5;246m# A tibble: 5 x 5[39m col row col expected actual file expected [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m actual [38;5;250m1[39m 1 [31mNA[39m 6 columns 5 columns 'data/pathways/stage_ontology_2019.01.25.txt' file [38;5;250m2[39m 2 [31mNA[39m 6 columns 5 columns 'data/pathways/stage_ontology_2019.01.25.txt' row [38;5;250m3[39m 3 [31mNA[39m 6 columns 5 columns 'data/pathways/stage_ontology_2019.01.25.txt' col [38;5;250m4[39m 4 [31mNA[39m 6 columns 5 columns 'data/pathways/stage_ontology_2019.01.25.txt' expected [38;5;250m5[39m 5 [31mNA[39m 6 columns 5 columns 'data/pathways/stage_ontology_2019.01.25.txt'
... ................................. ... ............................................................................... ........ ............................................................................................................................................................................................................ ...... ......................................................................................................... .... ......................................................................................................... ... ......................................................................................................... ... ......................................................................................................... ........ .........................................................................................................
See problems(...) for more details.
r
r stage_ordered_factor <- factor(stage_df\(`Stage Name`[-1], # drop unknown levels = stage_df\)Stage Name
[-1], ordered = TRUE) stage_ordered_factor
[1] Zygote:1-cell Cleavage:2-cell Cleavage:4-cell Cleavage:8-cell
[5] Cleavage:16-cell Cleavage:32-cell Cleavage:64-cell Blastula:128-cell
[9] Blastula:256-cell Blastula:512-cell Blastula:1k-cell Blastula:High
[13] Blastula:Oblong Blastula:Sphere Blastula:Dome Blastula:30%-epiboly
[17] Gastrula:50%-epiboly Gastrula:Germ-ring Gastrula:Shield Gastrula:75%-epiboly
[21] Gastrula:90%-epiboly Gastrula:Bud Segmentation:1-4 somites Segmentation:5-9 somites
[25] Segmentation:10-13 somites Segmentation:14-19 somites Segmentation:20-25 somites Segmentation:26+ somites
[29] Pharyngula:Prim-5 Pharyngula:Prim-15 Pharyngula:Prim-25 Pharyngula:High-pec
[33] Hatching:Long-pec Hatching:Pec-fin Larval:Protruding-mouth Larval:Day 4
[37] Larval:Day 5 Larval:Day 6 Larval:Days 7-13 Larval:Days 14-20
[41] Larval:Days 21-29 Juvenile:Days 30-44 Juvenile:Days 45-89 Adult
44 Levels: Zygote:1-cell < Cleavage:2-cell < Cleavage:4-cell < Cleavage:8-cell < ... < Adult
r
r gene_stages_list <- list() for (current_gene in unique(wildtype_expression_df$Gene ID
)) {
# initialize vector to hold all the stages for the current gene current_gene_stages <- vector()
# data.frame filtered to current gene only current_gene_df <- wildtype_expression_df %>% dplyr::filter(Gene ID
== current_gene)
# all rows in current_gene_df for (gene_row_iter in 1:nrow(current_gene_df)) { # pull pertinent stage information from this row start_stage <- current_gene_df\(`Start Stage`[gene_row_iter] end_stage <- current_gene_df\)End Stage
[gene_row_iter]
# find the indices corresponding to the stages and extract the stage
# names from the ordered factor vector
start_index <- which(stage_ordered_factor == start_stage)
end_index <- which(stage_ordered_factor == end_stage)
covered_stages <- as.character(stage_ordered_factor[start_index:end_index])
current_gene_stages <- append(current_gene_stages,
covered_stages)
} gene_stages_list[[current_gene]] <- current_gene_stages }
We want to go from this list to a binary matrix for use with PLIER
r
r gene_stages_binary_mat <- reshape2::melt(gene_stages_list) %>% # remove any duplicate stages dplyr::distinct() %>% # sensible column names dplyr::mutate(Stage = value, Gene = L1) %>% dplyr::select(Stage:Gene) %>% # and finally, get a PLIER-friendly binary matrix reshape2::dcast(Gene ~ Stage, fun.aggregate = length) %>% tibble::column_to_rownames() %>% as.matrix()
Using Gene as value column: use value.var to override.
r
r # save to file saveRDS(gene_stages_binary_mat, file = file.path(pathway_dir, _stages_prior_info.RDS))
We’ll use the adult stage only to do the gene-tissue mapping.
r
r # filter to only adult adult_wt_df <- wildtype_expression_df %>% dplyr::filter(End Stage
== ) # then we can get a list by tissue -> binary matrix adult_tissue_wt_binary_mat <- reshape2::melt( # this gives us a list that we’re melting into a data.frame split(adult_wt_df\(`Gene ID`, adult_wt_df\)Super Structure Name
)) %>% # setting informative column names dplyr::mutate(Gene = value, Tissue = L1) %>% dplyr::select(Gene:Tissue) %>% # filtering out any duplicate records dplyr::distinct() %>% # and finally, get a PLIER-friendly binary matrix reshape2::dcast(Gene ~ Tissue, fun.aggregate = length) %>% tibble::column_to_rownames() %>% as.matrix()
Using Tissue as value column: use value.var to override.
r
r # save the RDS to file saveRDS(adult_tissue_wt_binary_mat, file.path(pathway_dir, _adult_tissue_prior_info.RDS))
r
r sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS: /usr/lib/openblas-base/libblas.so.3
LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2.2 org.Dr.eg.db_3.7.0 AnnotationDbi_1.44.0 IRanges_2.16.0 S4Vectors_0.20.1
[6] Biobase_2.42.0 BiocGenerics_0.28.0 XML_3.98-1.16 rWikiPathways_1.2.0
loaded via a namespace (and not attached):
[1] Rcpp_0.12.18 plyr_1.8.4 pillar_1.3.0 compiler_3.5.1 bindr_0.1.1 bitops_1.0-6
[7] tools_3.5.1 digest_0.6.17 bit_1.1-14 RSQLite_2.1.1 memoise_1.1.0 tibble_1.4.2
[13] pkgconfig_2.0.2 rlang_0.2.2 cli_1.0.1 DBI_1.0.0 rstudioapi_0.7 curl_3.2
[19] yaml_2.2.0 stringr_1.3.1 dplyr_0.7.6 httr_1.3.1 knitr_1.20 hms_0.4.2
[25] caTools_1.17.1.1 bit64_0.9-7 tidyselect_0.2.4 glue_1.3.0 R6_2.2.2 fansi_0.3.0
[31] RJSONIO_1.3-1.1 GSA_1.03 readr_1.1.1 reshape2_1.4.3 purrr_0.2.5 blob_1.1.1
[37] magrittr_1.5 assertthat_0.2.0 utf8_1.1.4 stringi_1.2.4 crayon_1.3.4