This notebook identifies samples to include in subset files for the purpose of molecularly subtyping embryonal tumors (AlexsLemonade/OpenPBTA-analysis#251).
This closed pull request is also relevant to the task at hand: [AlexsLemonade/OpenPBTA-analysis#401](https://github.com/AlexsLemonade/OpenPBTA-analysis/pull/401).
We’ll use the subset files generated in 02-generate-subset-files.R
to construct tables that summarize the data on the subtyping issue.
This notebook is intended to be run via the command line from the top directory of the repository as follows:
Rscript -e "rmarkdown::render('analyses/molecular-subtyping-embryonal/04-table-prep.Rmd', clean = TRUE)"
library(tidyverse)
── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.0 ✔ purrr 0.3.2
✔ tibble 2.1.3 ✔ dplyr 0.8.3
✔ tidyr 0.8.3 ✔ stringr 1.4.0
✔ readr 1.3.1 ✔ forcats 0.4.0
── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
# TODO: consider moving this out of this notebook to use with ependymoma
# subtyping?
wrangle_fusions <- function(fusions_df, string_to_match, column_name) {
  # Summarize, for every biospecimen in the fusion summary binary matrix, which
  # "relevant" fusions are present.
  #
  # A "relevant fusion" is any column of fusions_df whose name is selected when
  # string_to_match is passed to dplyr::matches. To specify that a gene symbol
  # should be a 5' partner, you can pass "<gene symbol>--".
  #
  # Args:
  #   fusions_df: a fusion_summary data.frame from `fusion-summary`; must
  #               contain a Kids_First_Biospecimen_ID column and one column per
  #               fusion, where a value greater than 0 marks the fusion as
  #               present in that biospecimen
  #   string_to_match: character; passed to dplyr::matches to select relevant
  #                    columns
  #   column_name: character; the name of the relevant fusion column in the
  #                data.frame that is returned
  #
  # Returns: a data.frame with Kids_First_Biospecimen_ID and a column named
  #   column_name holding a comma-separated 'list' of the relevant fusions;
  #   biospecimens without any relevant fusions have "None" in that column

  # only include relevant fusions using string_to_match
  relevant_df <- fusions_df %>%
    select(Kids_First_Biospecimen_ID, matches(string_to_match))

  # if no fusion column matched there is nothing to reshape -- every
  # biospecimen gets "None" (previously this case failed inside the pipeline)
  if (ncol(relevant_df) < 2) {
    all_ids <- unique(fusions_df$Kids_First_Biospecimen_ID)
    return(tibble::tibble(
      Kids_First_Biospecimen_ID = all_ids,
      !!column_name := rep("None", length(all_ids))
    ))
  }

  specific_fusions <- relevant_df %>%
    # name the identifier column explicitly instead of letting melt guess it
    reshape2::melt(id.vars = "Kids_First_Biospecimen_ID") %>%
    # a value of 0 means that fusion is not present in that biospecimen
    filter(value > 0) %>%
    select(-value) %>%
    group_by(Kids_First_Biospecimen_ID) %>%
    # get the comma separated 'list' of relevant fusions; `variable` is the
    # factor of fusion column names created by melt
    summarize(!!column_name := paste(sort(unique(variable)), collapse = ", "))

  # if a sample had none of the relevant fusions, it's missing from
  # specific_fusions -- let's add them back in with "None" in the second
  # column
  missing_ids <- setdiff(fusions_df$Kids_First_Biospecimen_ID,
                         specific_fusions$Kids_First_Biospecimen_ID)

  specific_fusions <- specific_fusions %>%
    # a tibble keeps the identifiers as character, so bind_rows does not have
    # to coerce a factor column (and warn about it)
    bind_rows(tibble::tibble(Kids_First_Biospecimen_ID = missing_ids)) %>%
    # only fill the fusion column; identifiers are left untouched
    tidyr::replace_na(stats::setNames(list("None"), column_name))

  return(specific_fusions)
}
# directories are relative to this module's directory
data_dir <- file.path("..", "..", "data")
subset_dir <- "subset-files"
results_dir <- "results"
Set up filenames.
# inputs that live in the shared data directory
# full clinical file
histologies_file <- file.path(data_dir, "pbta-histologies-base.tsv")
# full fusions of interest file
fusion_file <- file.path(data_dir, "fusion_summary_embryonal_foi.tsv")
# annotated sex chromosomes copy number alterations
sex_chr_consensus <- file.path(
  data_dir,
  "consensus_seg_annotated_cn_x_and_y.tsv.gz"
)

# subset files
manta_file <- file.path(subset_dir, "embryonal_manta_sv.tsv")
polya_file <- file.path(subset_dir, "embryonal_zscored_exp.polya.rds")
stranded_file <- file.path(subset_dir, "embryonal_zscored_exp.stranded.rds")

# inputs that live in this module's results directory
# file that contains the biospecimen IDs that met criteria for subtyping
biospecimen_file <- file.path(
  results_dir,
  "biospecimen_ids_embryonal_subtyping.tsv"
)
# cleaned chr19 data
chr19_file <- file.path(results_dir, "cleaned_chr19_cn.tsv")
Read in files.
# clinical file; molecular_subtype is read as character rather than letting
# readr guess its type
histologies_df <- read_tsv(histologies_file,
                           col_types = cols(
                             molecular_subtype = col_character()
                           ))
# manta data for BCOR tandem duplication
# TriS_CGscore contains values such as 3, but readr guesses the column as
# logical and turns those entries into NA (reported as parsing failures), so
# we read it as a number; all other columns are still guessed
manta_sv_df <- read_tsv(manta_file,
                        col_types = cols(
                          TriS_CGscore = col_double()
                        ))
Warning: 55 parsing failures.
row col expected actual file
1158 TriS_CGscore 1/0/T/F/TRUE/FALSE 3 'subset-files/embryonal_manta_sv.tsv'
2665 TriS_CGscore 1/0/T/F/TRUE/FALSE 3 'subset-files/embryonal_manta_sv.tsv'
5738 TriS_CGscore 1/0/T/F/TRUE/FALSE 3 'subset-files/embryonal_manta_sv.tsv'
5740 TriS_CGscore 1/0/T/F/TRUE/FALSE 3 'subset-files/embryonal_manta_sv.tsv'
5743 TriS_CGscore 1/0/T/F/TRUE/FALSE 3 'subset-files/embryonal_manta_sv.tsv'
.... ............ .................. ...... .....................................
See problems(...) for more details.
# z-scored expression for each RNA-seq selection strategy
polya_exp <- read_rds(polya_file)
stranded_exp <- read_rds(stranded_file)

# fusions of interest
fusions_df <- read_tsv(fusion_file)

# biospecimen IDs that met the criteria for subtyping, kept as a plain vector
# instead of a one-column data frame
biospecimen_ids <- pull(read_tsv(biospecimen_file), Kids_First_Biospecimen_ID)

# cleaned chr19 amplification data
chr19_cn_df <- read_tsv(chr19_file)

# the annotated copy number file covers more than the biospecimens we are
# subtyping, so restrict it to the relevant ones
consensus_sex_chr <- filter(
  read_tsv(sex_chr_consensus),
  biospecimen_id %in% biospecimen_ids
)
# table with all of the data relevant for subtyping
output_all <- file.path(
  results_dir,
  "embryonal_tumor_subtyping_relevant_data.tsv"
)
# table with the molecular subtype calls
output_subtypes <- file.path(
  results_dir,
  "embryonal_tumor_molecular_subtypes.tsv"
)
Summarizing the salient points from AlexsLemonade/OpenPBTA-analysis#251
:
ETMR, C19MC-altered: These tumors have focal amplification of the miRNA cluster on chr19 (denoted C19MC) and often have gene fusions involving TTYH1 and chr19 miRNA cluster genes
CNS HGNET-MN1: Contain gene fusions involving 5’ MN1. 3’ fusion partners can include BEND2 and CXXC5.
CNS NB-FOXR2: Over-expression and/or gene fusions in FOXR2
CNS EFT-CIC: Alterations in CIC, commonly fused with NUTM1
colnames(fusions_df)
[1] "Kids_First_Biospecimen_ID" "CXXC5--MN1"
[3] "MACF1--TTYH1" "MN1--AC008667.1"
[5] "MN1--CXXC5" "MN1--PATZ1"
[7] "MN1--PSD2" "NPAS3--AC245884.4/TTYH1"
[9] "RLIM--AL627224.1/FOXR2" "TTYH1--DPRX/RNU6-698P"
[11] "TTYH1--MIR512-2/MIR1323" "TTYH1--MIR515-2/MIR519C"
[13] "ZC4H2--AL627224.1/FOXR2" "ZC4H2--FOXR2"
[15] "CIC--NUTM1" "MN1--BEND2"
Kids_First_Biospecimen_ID
CXXC5--MN1
MACF1--TTYH1
MN1--AC008667.1
MN1--CXXC5
MN1--PATZ1
MN1--PSD2
NPAS3--AC245884.4/TTYH1
RLIM--AL627224.1/FOXR2
TTYH1--DPRX/RNU6-698P
TTYH1--MIR512-2/MIR1323
TTYH1--MIR515-2/MIR519C
ZC4H2--AL627224.1/FOXR2
ZC4H2--FOXR2
CIC--NUTM1
MN1--BEND2
ttyh1_fusions <- wrangle_fusions(fusions_df, "TTYH1--", "TTYH1_fusions")
Warning in bind_rows_(x, .id): binding character and factor vector,
coercing into character vector
mn1_fusions <- wrangle_fusions(fusions_df, "MN1--", "MN1_fusions")
Warning in bind_rows_(x, .id): binding character and factor vector,
coercing into character vector
foxr2_fusions <- wrangle_fusions(fusions_df, "FOXR2", "FOXR2_fusions")
Warning in bind_rows_(x, .id): binding character and factor vector,
coercing into character vector
cic_fusions <- wrangle_fusions(fusions_df, "CIC--", "CIC_fusions")
Warning in bind_rows_(x, .id): binding character and factor vector,
coercing into character vector
# combine the per-gene fusion tables into one table with one fusion column per
# gene, keyed on the biospecimen identifier
fusions_summary_df <- purrr::reduce(
  list(ttyh1_fusions, mn1_fusions, foxr2_fusions, cic_fusions),
  dplyr::inner_join,
  by = "Kids_First_Biospecimen_ID"
) %>%
  # Filter out specific MN1--PATZ1 fusion based on comment: https://github.com/AlexsLemonade/OpenPBTA-analysis/pull/788#discussion_r495212879
  # this drops rows whose MN1_fusions entry is exactly "MN1--PATZ1"
  filter(MN1_fusions != "MN1--PATZ1")

# the individual tables and the binary matrix are no longer needed
rm(fusions_df, ttyh1_fusions, mn1_fusions, foxr2_fusions, cic_fusions)

# preview, sorted so that entries in TTYH1_fusions appear in descending order
fusions_summary_df %>%
  arrange(desc(TTYH1_fusions)) %>%
  head(n = 10)
Summarizing salient points from AlexsLemonade/OpenPBTA-analysis#251
:
ETMR, C19MC-altered: These tumors have high expression of LIN28A, which serves as a biomarker for ETMRs
CNS NB-FOXR2: Over-expression and/or gene fusions in FOXR2
ETMR, NOS: These tumors have high expression of LIN28A, which serves as a biomarker for ETMRs, but do not show focal amplification of C19MC.
# genes whose z-scored expression informs the subtype calls
exp_genes_of_interest <- c("LIN28A", "FOXR2")

# for each selection strategy, keep only the genes of interest and move the
# row names (biospecimen identifiers) into a regular column
polya_exp <- tibble::rownames_to_column(
  as.data.frame(polya_exp[, exp_genes_of_interest]),
  "Kids_First_Biospecimen_ID"
)
stranded_exp <- tibble::rownames_to_column(
  as.data.frame(stranded_exp[, exp_genes_of_interest]),
  "Kids_First_Biospecimen_ID"
)

# stack the two datasets; exp_dataset records which selection strategy each
# row came from and is moved to the last column
exp_df <- list(polya = polya_exp, stranded = stranded_exp) %>%
  bind_rows(.id = "exp_dataset") %>%
  select(-exp_dataset, everything())

# these objects are no longer needed
rm(exp_genes_of_interest, polya_exp, stranded_exp)
# add the fusion summary to the expression data; the join key is spelled out
# so the join cannot silently pick up additional shared columns
# biospecimens with expression data that are absent from fusions_summary_df
# are kept and get NA in the fusion columns
rna_df <- exp_df %>%
  left_join(fusions_summary_df, by = "Kids_First_Biospecimen_ID") %>%
  # keep exp_dataset as the last column
  select(-exp_dataset, everything())
Joining, by = "Kids_First_Biospecimen_ID"
# remove data.frame we no longer need
rm(exp_df)
head(rna_df, n = 10)
We see some repeated negative values for the expression z-scores here. Generally that comes from small or zero log2(FPKM + 1) values. For example, all of the poly-A values for FOXR2 are zero prior to scaling.
CNS HGNET-BCOR
- CNS high-grade neuroepithelial tumor with BCOR alteration
- Tumors have internal tandem duplication of BCOR
The definition of internal tandem duplication from Rustagi et al. BMC Bioinformatics. 2016.:
Detection of tandem duplication within coding exons, referred to as internal tandem duplication (ITD)
The annotated files from the focal-cn-file-preparation
module use exons to check for overlaps. So, we can try to supplement the Manta calls with the annotated sex chromosome files.
# Keep Manta calls that
#   * mention BCOR in the gene annotation -- this is a substring match, so any
#     gene symbol containing "BCOR" also qualifies
#   * pass all filters
#     https://github.com/Illumina/manta/blob/75b5c38d4fcd2f6961197b28a41eb61856f2d976/docs/userGuide/README.md#vcf-format-fields
#   * are recorded as tandem duplication events
bcor_sv_df <- manta_sv_df %>%
  filter(
    str_detect(Gene.name, "BCOR") &
      FILTER == "PASS" &
      ALT == "<DUP:TANDEM>"
  )

# inspect the gene annotation of the first two calls
pull(head(bcor_sv_df, n = 2), Gene.name)
[1] "ABCB7/ACE2/ACOT9/ADGRG2/AKAP4/ALAS2/AMELX/AMER1/ANOS1/AP1S2/APEX2/APOO/APOOL/AR/ARAF/ARHGAP6/ARHGEF9/ARHGEF9-IT1/ARR3/ARX/ASB11/ASB12/ASB9/ATP6AP2/ATP7A/ATRX/ATXN3L/AWAT1/AWAT2/BCLAF3/BCOR/BEND2/BMP15/BMX/BRDTP1/BRWD3/CA5B/CA5BP1/CA5BP1-CA5B/CACNA1F/CASK/CBLL2/CCDC120/CCDC22/CCNB3/CDK16/CDKL5/CDX4/CENPVL1/CENPVL2/CENPVL3/CFAP47/CFP/CHIC1/CHM/CHMP1B2P/CHST7/CITED1/CLCN4/CLCN5/CLDN34/CLTRN/CNKSR2/COX7B/CPXCR1/CTPS2/CXCR3/CXXC1P1/CXorf21/CXorf38/CXorf49/CXorf49B/CXorf58/CXorf65/CXorf67/CYBB/CYLC1/CYSLTR1/DACH2/DCAF8L1/DCAF8L2/DDX3X/DDX53/DGAT2L6/DGKK/DIAPH2/DIAPH2-AS1/DIPK2B/DLG3/DLG3-AS1/DMD/DMRTC1/DMRTC1B/DUSP21/DYNLT3/EBP/EDA/EDA2R/EFHC2/EFNB1/EGFL6/EIF1AX/EIF1AX-AS1/EIF2S3/ELK1/ERAS/ERCC6L/FAAH2/FAM104B/FAM120C/FAM133A/FAM155B/FAM156A/FAM156B/FAM226A/FAM226B/FAM236A/FAM236B/FAM236C/FAM236D/FAM47A/FAM47B/FAM47C/FAM9A/FAM9B/FAM9C/FANCB/FGD1/FGF16/FLICR/FLJ44635/FOXO4/FOXP3/FOXR2/FRMD8P1/FRMPD4/FTH1P18/FTHL17/FTSJ1/FTX/FUNDC1/GAGE1/GAGE10/GAGE12B/GAGE12C/GAGE12D/GAGE12E/GAGE12F/GAGE12G/GAGE12H/GAGE12I/GAGE12J/GAGE13/GAGE2A/GAGE2B/GAGE2C/GAGE2D/GAGE2E/GAGE4/GAGE5/GAGE6/GAGE7/GAGE8/GATA1/GCNA/GDPD2/GEMIN8/GJB1/GK/GLOD5/GLRA2/GNL3L/GPKOW/GPM6B/GPR143/GPR173/GPR174/GPR34/GPR82/GRIPAP1/GRPR/GS1-594A7.3/GS1-600G8.3/GSPT2/HCCS/HDAC6/HDAC8/HDX/HEPH/HMGN5/HSD17B10/HUWE1/HYPM/IGBP1/IL1RAPL1/IL2RG/INE1/INE2/INGX/IQSEC2/ITGB1BP2/ITIH6/ITM2A/JADE3/JPX/KANTR/KCND1/KDM5C/KDM6A/KIF4A/KLF8/KLHL15/KLHL34/KLHL4/KRBOX4/LANCL3/LAS1L/LINC00269/LINC00891/LINC01186/LINC01203/LINC01204/LINC01278/LINC01281/LINC01282/LINC01284/LINC01456/LINC01496/LINC01545/LINC01560/LINC02154/LINC02595/LINC02601/LOC100129291/LOC100132741/LOC100132831/LOC100421746/LOC101059915/LOC101927476/LOC101927635/LOC101928128/LOC101928201/LOC101928389/LOC101928627/LOC105373156/LOC392452/LOC729609/LPAR4/MAGEB1/MAGEB10/MAGEB16/MAGEB17/MAGEB18/MAGEB2/MAGEB3/MAGEB4/MAGEB5/MAGEB6/MAGED1/MAGED2/MAGED4/MAGED4B/MAGEE1/MAGEE2/MAGEH1/MAGIX/MAGT1/MAOA/MAOB/MAP2K4P1/MAP3K15/MAP7D2/MBTPS2/MED12/MED14/MED14OS/MID1/MID1IP1/MID1IP1-A
S1/MIR1321/MIR1468/MIR1587/MIR188/MIR221/MIR222/MIR223/MIR23C/MIR325/MIR325HG/MIR361/MIR362/MIR374A/MIR374B/MIR374C/MIR384/MIR3915/MIR3937/MIR421/MIR4328/MIR4454/MIR4536-1/MIR4536-2/MIR4666B/MIR4767/MIR4768/MIR4769/MIR4770/MIR500A/MIR500B/MIR501/MIR502/MIR532/MIR545/MIR548AJ2/MIR548AM/MIR548AX/MIR548F5/MIR548I4/MIR548M/MIR6086/MIR6134/MIR651/MIR660/MIR676/MIR6857/MIR6894/MIR6895/MIR8088/MIR98/MIRLET7F2/MOSPD2/MPC1L/MSL3/MSN/MTMR8/MTRNR2L10/NAP1L2/NAP1L3/NAP1L6/NBDY/NDP/NDP-AS1/NDUFB11/NEXMIF/NHS/NHS-AS1/NHSL2/NLGN3/NLGN4X/NLRP2B/NONO/NR0B1/NUDT10/NUDT11/NYX/OFD1/OGT/OPHN1/OTC/OTUD5/OTUD6A/P2RY10/P2RY4/PABPC1L2A/PABPC1L2B/PABPC1L2B-AS1/PABPC5/PABPC5-AS1/PAGE1/PAGE2/PAGE2B/PAGE3/PAGE4/PAGE5/PBDC1/PCDH11X/PCSK1N/PCYT1B/PCYT1B-AS1/PDHA1/PDK3/PDZD11/PFKFB1/PGAM4/PGK1/PHEX/PHEX-AS1/PHF8/PHKA1/PHKA1-AS1/PHKA2/PHKA2-AS1/PIGA/PIM2/PIN4/PINCR/PIR/PIR-FIGF/PJA1/PLP2/PNPLA4/POF1B/POLA1/PORCN/POU3F4/PPEF1/PPEF1-AS1/PPP1R2C/PPP1R3F/PPP4R3C/PQBP1/PRAF2/PRDX4/PRICKLE3/PRPS2/PRRG1/PTCHD1/PTCHD1-AS/PUDP/RAB41/RAB9A/RAI2/RBBP7/RBM10/RBM3/REPS2/RGN/RIBC1/RLIM/RP2/RPA4/RPGR/RPS26P11/RPS4X/RPS6KA3/RPS6KA6/RRAGB/RS1/RTL3/RTL5/S100G/SAT1/SATL1/SCARNA23/SCARNA9L/SCML1/SCML2/SH3BGRL/SH3KBP1/SHROOM2/SHROOM4/SLC16A2/SLC35A2/SLC38A5/SLC7A3/SLC9A7/SMC1A/SMPX/SMS/SNORA109/SNORA11/SNORA11C/SNORA11D/SNORA11E/SNORA11G/SNORD3E/SNX12/SPACA5/SPACA5B/SPANXN5/SPIN2A/SPIN2B/SPIN3/SPIN4/SRPX/SSX1/SSX2/SSX2B/SSX3/SSX4/SSX4B/SSX5/SSX6P/SSX7/SSX8P/SSX9P/STARD8/STS/SUPT20HL1/SUPT20HL2/SUV39H1/SYAP1/SYN1/SYP/SYP-AS1/SYTL5/TAB3/TAB3-AS1/TAF1/TAF9B/TBC1D25/TBL1X/TBX22/TCEANC/TENT5D/TEX11/TFE3/TGIF2LX/TIMM17B/TIMP1/TLR7/TLR8/TLR8-AS1/TMEM47/TMSB4X/TRAPPC2/TRO/TSIX/TSPAN7/TSPYL2/TSR2/TTC3P1/TXLNG/UBA1/UBE2DNL/UBE2E4P/UBQLN2/UPRT/UQCRBP1/USP11/USP27X/USP27X-AS1/USP51/USP9X/UXT/UXT-AS1/VCX/VCX2/VCX3A/VCX3B/VEGFD/VENTXP1/VSIG4/WAS/WDR13/WDR45/WNK3/WWC3/XAGE1A/XAGE1B/XAGE2/XAGE3/XAGE5/XIST/XK/YIPF6/YY2/ZC3H12B/ZC4H2/ZCCHC13/ZDHHC15/ZFX/ZFX-AS1/ZMYM3/ZNF157/ZNF182/ZNF41/ZNF630/ZNF630-AS1/ZNF674/ZNF674-AS1/ZNF711/ZNF81/Z
RSR2/ZXDA/ZXDB"
[2] "ACE2/ACOT9/ADGRG2/AMELX/ANOS1/AP1S2/APOO/ARHGAP6/ARX/ASB11/ASB9/ATP6AP2/ATXN3L/BCLAF3/BCOR/BEND2/BMX/CA5B/CA5BP1/CA5BP1-CA5B/CASK/CBLL2/CDKL5/CFAP47/CLCN4/CLDN34/CLTRN/CNKSR2/CTPS2/CXorf21/CXorf38/CXorf58/CYBB/DCAF8L1/DCAF8L2/DDX3X/DDX53/DIPK2B/DMD/DUSP21/DYNLT3/EFHC2/EGFL6/EIF1AX/EIF1AX-AS1/EIF2S3/FAM47A/FAM47B/FAM47C/FAM9A/FAM9B/FAM9C/FANCB/FRMPD4/FTH1P18/FTHL17/FUNDC1/GEMIN8/GK/GLRA2/GPM6B/GPR143/GPR34/GPR82/GRPR/GS1-594A7.3/GS1-600G8.3/HCCS/HYPM/IL1RAPL1/INE2/KDM6A/KLHL15/KLHL34/KRBOX4/LANCL3/LINC01186/LINC01203/LINC01204/LINC01281/LINC01282/LINC01456/LINC02154/LINC02595/LINC02601/LOC100132831/LOC101927476/LOC101928389/LOC101928627/LOC392452/LOC729609/MAGEB1/MAGEB10/MAGEB16/MAGEB17/MAGEB18/MAGEB2/MAGEB3/MAGEB4/MAGEB5/MAGEB6/MAOA/MAOB/MAP3K15/MAP7D2/MBTPS2/MED14/MED14OS/MID1/MID1IP1/MID1IP1-AS1/MIR1587/MIR221/MIR222/MIR23C/MIR3915/MIR3937/MIR4666B/MIR4767/MIR4768/MIR548AJ2/MIR548AM/MIR548AX/MIR548F5/MIR6086/MIR6134/MIR651/MOSPD2/MPC1L/MSL3/NDP/NDP-AS1/NHS/NHS-AS1/NR0B1/NYX/OFD1/OTC/PCYT1B/PCYT1B-AS1/PDHA1/PDK3/PHEX/PHEX-AS1/PHKA2/PHKA2-AS1/PIGA/PINCR/PIR/PIR-FIGF/PNPLA4/POLA1/PPEF1/PPEF1-AS1/PPP1R2C/PPP4R3C/PRDX4/PRPS2/PRRG1/PTCHD1/PTCHD1-AS/PUDP/RAB9A/RAI2/RBBP7/REPS2/RPGR/RPS6KA3/RS1/S100G/SAT1/SCARNA23/SCARNA9L/SCML1/SCML2/SH3KBP1/SHROOM2/SMPX/SMS/SRPX/STS/SUPT20HL1/SUPT20HL2/SYAP1/SYTL5/TAB3/TAB3-AS1/TBL1X/TCEANC/TLR7/TLR8/TLR8-AS1/TMEM47/TMSB4X/TRAPPC2/TSPAN7/TXLNG/UBE2E4P/USP9X/VCX/VCX2/VCX3B/VEGFD/VENTXP1/WWC3/XK/YY2/ZFX/ZFX-AS1/ZNF674/ZNF674-AS1/ZRSR2"
ABCB7/ACE2/ACOT9/ADGRG2/AKAP4/ALAS2/AMELX/AMER1/ANOS1/AP1S2/APEX2/APOO/APOOL/AR/ARAF/ARHGAP6/ARHGEF9/ARHGEF9-IT1/ARR3/ARX/ASB11/ASB12/ASB9/ATP6AP2/ATP7A/ATRX/ATXN3L/AWAT1/AWAT2/BCLAF3/BCOR/BEND2/BMP15/BMX/BRDTP1/BRWD3/CA5B/CA5BP1/CA5BP1-CA5B/CACNA1F/CASK/CBLL2/CCDC120/CCDC22/CCNB3/CDK16/CDKL5/CDX4/CENPVL1/CENPVL2/CENPVL3/CFAP47/CFP/CHIC1/CHM/CHMP1B2P/CHST7/CITED1/CLCN4/CLCN5/CLDN34/CLTRN/CNKSR2/COX7B/CPXCR1/CTPS2/CXCR3/CXXC1P1/CXorf21/CXorf38/CXorf49/CXorf49B/CXorf58/CXorf65/CXorf67/CYBB/CYLC1/CYSLTR1/DACH2/DCAF8L1/DCAF8L2/DDX3X/DDX53/DGAT2L6/DGKK/DIAPH2/DIAPH2-AS1/DIPK2B/DLG3/DLG3-AS1/DMD/DMRTC1/DMRTC1B/DUSP21/DYNLT3/EBP/EDA/EDA2R/EFHC2/EFNB1/EGFL6/EIF1AX/EIF1AX-AS1/EIF2S3/ELK1/ERAS/ERCC6L/FAAH2/FAM104B/FAM120C/FAM133A/FAM155B/FAM156A/FAM156B/FAM226A/FAM226B/FAM236A/FAM236B/FAM236C/FAM236D/FAM47A/FAM47B/FAM47C/FAM9A/FAM9B/FAM9C/FANCB/FGD1/FGF16/FLICR/FLJ44635/FOXO4/FOXP3/FOXR2/FRMD8P1/FRMPD4/FTH1P18/FTHL17/FTSJ1/FTX/FUNDC1/GAGE1/GAGE10/GAGE12B/GAGE12C/GAGE12D/GAGE12E/GAGE12F/GAGE12G/GAGE12H/GAGE12I/GAGE12J/GAGE13/GAGE2A/GAGE2B/GAGE2C/GAGE2D/GAGE2E/GAGE4/GAGE5/GAGE6/GAGE7/GAGE8/GATA1/GCNA/GDPD2/GEMIN8/GJB1/GK/GLOD5/GLRA2/GNL3L/GPKOW/GPM6B/GPR143/GPR173/GPR174/GPR34/GPR82/GRIPAP1/GRPR/GS1-594A7.3/GS1-600G8.3/GSPT2/HCCS/HDAC6/HDAC8/HDX/HEPH/HMGN5/HSD17B10/HUWE1/HYPM/IGBP1/IL1RAPL1/IL2RG/INE1/INE2/INGX/IQSEC2/ITGB1BP2/ITIH6/ITM2A/JADE3/JPX/KANTR/KCND1/KDM5C/KDM6A/KIF4A/KLF8/KLHL15/KLHL34/KLHL4/KRBOX4/LANCL3/LAS1L/LINC00269/LINC00891/LINC01186/LINC01203/LINC01204/LINC01278/LINC01281/LINC01282/LINC01284/LINC01456/LINC01496/LINC01545/LINC01560/LINC02154/LINC02595/LINC02601/LOC100129291/LOC100132741/LOC100132831/LOC100421746/LOC101059915/LOC101927476/LOC101927635/LOC101928128/LOC101928201/LOC101928389/LOC101928627/LOC105373156/LOC392452/LOC729609/LPAR4/MAGEB1/MAGEB10/MAGEB16/MAGEB17/MAGEB18/MAGEB2/MAGEB3/MAGEB4/MAGEB5/MAGEB6/MAGED1/MAGED2/MAGED4/MAGED4B/MAGEE1/MAGEE2/MAGEH1/MAGIX/MAGT1/MAOA/MAOB/MAP2K4P1/MAP3K15/MAP7D2/MBTPS2/MED12/MED14/MED14OS/MID1/MID1IP1/MID1IP1-AS1/MI
R1321/MIR1468/MIR1587/MIR188/MIR221/MIR222/MIR223/MIR23C/MIR325/MIR325HG/MIR361/MIR362/MIR374A/MIR374B/MIR374C/MIR384/MIR3915/MIR3937/MIR421/MIR4328/MIR4454/MIR4536-1/MIR4536-2/MIR4666B/MIR4767/MIR4768/MIR4769/MIR4770/MIR500A/MIR500B/MIR501/MIR502/MIR532/MIR545/MIR548AJ2/MIR548AM/MIR548AX/MIR548F5/MIR548I4/MIR548M/MIR6086/MIR6134/MIR651/MIR660/MIR676/MIR6857/MIR6894/MIR6895/MIR8088/MIR98/MIRLET7F2/MOSPD2/MPC1L/MSL3/MSN/MTMR8/MTRNR2L10/NAP1L2/NAP1L3/NAP1L6/NBDY/NDP/NDP-AS1/NDUFB11/NEXMIF/NHS/NHS-AS1/NHSL2/NLGN3/NLGN4X/NLRP2B/NONO/NR0B1/NUDT10/NUDT11/NYX/OFD1/OGT/OPHN1/OTC/OTUD5/OTUD6A/P2RY10/P2RY4/PABPC1L2A/PABPC1L2B/PABPC1L2B-AS1/PABPC5/PABPC5-AS1/PAGE1/PAGE2/PAGE2B/PAGE3/PAGE4/PAGE5/PBDC1/PCDH11X/PCSK1N/PCYT1B/PCYT1B-AS1/PDHA1/PDK3/PDZD11/PFKFB1/PGAM4/PGK1/PHEX/PHEX-AS1/PHF8/PHKA1/PHKA1-AS1/PHKA2/PHKA2-AS1/PIGA/PIM2/PIN4/PINCR/PIR/PIR-FIGF/PJA1/PLP2/PNPLA4/POF1B/POLA1/PORCN/POU3F4/PPEF1/PPEF1-AS1/PPP1R2C/PPP1R3F/PPP4R3C/PQBP1/PRAF2/PRDX4/PRICKLE3/PRPS2/PRRG1/PTCHD1/PTCHD1-AS/PUDP/RAB41/RAB9A/RAI2/RBBP7/RBM10/RBM3/REPS2/RGN/RIBC1/RLIM/RP2/RPA4/RPGR/RPS26P11/RPS4X/RPS6KA3/RPS6KA6/RRAGB/RS1/RTL3/RTL5/S100G/SAT1/SATL1/SCARNA23/SCARNA9L/SCML1/SCML2/SH3BGRL/SH3KBP1/SHROOM2/SHROOM4/SLC16A2/SLC35A2/SLC38A5/SLC7A3/SLC9A7/SMC1A/SMPX/SMS/SNORA109/SNORA11/SNORA11C/SNORA11D/SNORA11E/SNORA11G/SNORD3E/SNX12/SPACA5/SPACA5B/SPANXN5/SPIN2A/SPIN2B/SPIN3/SPIN4/SRPX/SSX1/SSX2/SSX2B/SSX3/SSX4/SSX4B/SSX5/SSX6P/SSX7/SSX8P/SSX9P/STARD8/STS/SUPT20HL1/SUPT20HL2/SUV39H1/SYAP1/SYN1/SYP/SYP-AS1/SYTL5/TAB3/TAB3-AS1/TAF1/TAF9B/TBC1D25/TBL1X/TBX22/TCEANC/TENT5D/TEX11/TFE3/TGIF2LX/TIMM17B/TIMP1/TLR7/TLR8/TLR8-AS1/TMEM47/TMSB4X/TRAPPC2/TRO/TSIX/TSPAN7/TSPYL2/TSR2/TTC3P1/TXLNG/UBA1/UBE2DNL/UBE2E4P/UBQLN2/UPRT/UQCRBP1/USP11/USP27X/USP27X-AS1/USP51/USP9X/UXT/UXT-AS1/VCX/VCX2/VCX3A/VCX3B/VEGFD/VENTXP1/VSIG4/WAS/WDR13/WDR45/WNK3/WWC3/XAGE1A/XAGE1B/XAGE2/XAGE3/XAGE5/XIST/XK/YIPF6/YY2/ZC3H12B/ZC4H2/ZCCHC13/ZDHHC15/ZFX/ZFX-AS1/ZMYM3/ZNF157/ZNF182/ZNF41/ZNF630/ZNF630-AS1/ZNF674/ZNF674-AS1/ZNF711/ZNF81/ZRSR2/
ZXDA/ZXDB
ACE2/ACOT9/ADGRG2/AMELX/ANOS1/AP1S2/APOO/ARHGAP6/ARX/ASB11/ASB9/ATP6AP2/ATXN3L/BCLAF3/BCOR/BEND2/BMX/CA5B/CA5BP1/CA5BP1-CA5B/CASK/CBLL2/CDKL5/CFAP47/CLCN4/CLDN34/CLTRN/CNKSR2/CTPS2/CXorf21/CXorf38/CXorf58/CYBB/DCAF8L1/DCAF8L2/DDX3X/DDX53/DIPK2B/DMD/DUSP21/DYNLT3/EFHC2/EGFL6/EIF1AX/EIF1AX-AS1/EIF2S3/FAM47A/FAM47B/FAM47C/FAM9A/FAM9B/FAM9C/FANCB/FRMPD4/FTH1P18/FTHL17/FUNDC1/GEMIN8/GK/GLRA2/GPM6B/GPR143/GPR34/GPR82/GRPR/GS1-594A7.3/GS1-600G8.3/HCCS/HYPM/IL1RAPL1/INE2/KDM6A/KLHL15/KLHL34/KRBOX4/LANCL3/LINC01186/LINC01203/LINC01204/LINC01281/LINC01282/LINC01456/LINC02154/LINC02595/LINC02601/LOC100132831/LOC101927476/LOC101928389/LOC101928627/LOC392452/LOC729609/MAGEB1/MAGEB10/MAGEB16/MAGEB17/MAGEB18/MAGEB2/MAGEB3/MAGEB4/MAGEB5/MAGEB6/MAOA/MAOB/MAP3K15/MAP7D2/MBTPS2/MED14/MED14OS/MID1/MID1IP1/MID1IP1-AS1/MIR1587/MIR221/MIR222/MIR23C/MIR3915/MIR3937/MIR4666B/MIR4767/MIR4768/MIR548AJ2/MIR548AM/MIR548AX/MIR548F5/MIR6086/MIR6134/MIR651/MOSPD2/MPC1L/MSL3/NDP/NDP-AS1/NHS/NHS-AS1/NR0B1/NYX/OFD1/OTC/PCYT1B/PCYT1B-AS1/PDHA1/PDK3/PHEX/PHEX-AS1/PHKA2/PHKA2-AS1/PIGA/PINCR/PIR/PIR-FIGF/PNPLA4/POLA1/PPEF1/PPEF1-AS1/PPP1R2C/PPP4R3C/PRDX4/PRPS2/PRRG1/PTCHD1/PTCHD1-AS/PUDP/RAB9A/RAI2/RBBP7/REPS2/RPGR/RPS6KA3/RS1/S100G/SAT1/SCARNA23/SCARNA9L/SCML1/SCML2/SH3KBP1/SHROOM2/SMPX/SMS/SRPX/STS/SUPT20HL1/SUPT20HL2/SYAP1/SYTL5/TAB3/TAB3-AS1/TBL1X/TCEANC/TLR7/TLR8/TLR8-AS1/TMEM47/TMSB4X/TRAPPC2/TSPAN7/TXLNG/UBE2E4P/USP9X/VCX/VCX2/VCX3B/VEGFD/VENTXP1/WWC3/XK/YY2/ZFX/ZFX-AS1/ZNF674/ZNF674-AS1/ZRSR2
Not necessarily specific to BCOR alterations.
consensus_sex_chr %>% filter(gene_symbol == "BCOR")
This is not conclusive.
# remove all the sv and cnv data we will not use
rm(bcor_sv_df, manta_sv_df, consensus_sex_chr)
Kids_First_Participant_ID
and sample_id
# participant and sample identifiers for every biospecimen in the clinical
# file
identifiers_df <- histologies_df %>%
  select(Kids_First_Participant_ID,
         sample_id,
         Kids_First_Biospecimen_ID)

# add IDs to chromosome 19 data
chr19_cn_df <- identifiers_df %>%
  inner_join(chr19_cn_df, by = "Kids_First_Biospecimen_ID")

# add IDs to RNA data; the key is stated explicitly, consistent with the
# chromosome 19 join above, rather than relying on shared column names
rna_df <- identifiers_df %>%
  inner_join(rna_df, by = "Kids_First_Biospecimen_ID")
Joining, by = "Kids_First_Biospecimen_ID"
# combine DNA (chr19 copy number) and RNA data on participant and sample;
# a full join keeps samples that only have one of the two data types
molecular_data_df <- chr19_cn_df %>%
  full_join(rna_df, by = c("Kids_First_Participant_ID", "sample_id")) %>%
  # both tables carry Kids_First_Biospecimen_ID, so the join suffixes them:
  # .x comes from the chr19 (DNA) table and .y from the RNA table
  rename(
    Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID.x,
    Kids_First_Biospecimen_ID_RNA = Kids_First_Biospecimen_ID.y
  )
The following were mentioned in AlexsLemonade/OpenPBTA-analysis#251
:
CNS HGNET-BCOR: Median age of diagnosis less than 10 years
CNS HGNET-MN1: Predominantly female patients.
CNS Embryonal, NOS: Tumors previously called PNET that do not fit into other groups above.
# clinical fields for the biospecimens that met the criteria for subtyping,
# reduced to one row per distinct combination of the selected fields
relevant_clinical_data <- histologies_df %>%
  filter(Kids_First_Biospecimen_ID %in% biospecimen_ids) %>%
  select(Kids_First_Participant_ID,
         sample_id,
         age_at_diagnosis_days,
         germline_sex_estimate,
         primary_site,
         pathology_diagnosis,
         pathology_free_text_diagnosis) %>%
  distinct() %>%
  # convert age to years (whole years, using 365 days per year)
  mutate(age_at_diagnosis_yrs =
           floor(as.numeric(age_at_diagnosis_days) / 365)) %>%
  select(-age_at_diagnosis_days)

# attach the molecular data; the keys are stated explicitly so the join does
# not depend on which column names the two tables happen to share
all_data_df <- inner_join(relevant_clinical_data,
                          molecular_data_df,
                          by = c("Kids_First_Participant_ID", "sample_id"))
Joining, by = c("Kids_First_Participant_ID", "sample_id")
Write to file.
# write the table of all subtyping-relevant data, ordered by participant and
# sample, keeping only the columns listed below in this order
all_data_df %>%
  arrange(Kids_First_Participant_ID, sample_id) %>%
  select(
    # identifiers
    Kids_First_Participant_ID,
    sample_id,
    Kids_First_Biospecimen_ID_DNA,
    Kids_First_Biospecimen_ID_RNA,
    # clinical fields
    age_at_diagnosis_yrs,
    germline_sex_estimate,
    pathology_diagnosis,
    pathology_free_text_diagnosis,
    # molecular features
    LIN28A,
    TTYH1_fusions,
    chr19_amplification,
    FOXR2,
    FOXR2_fusions,
    MN1_fusions,
    CIC_fusions
  ) %>%
  write_tsv(output_all)
The difference between ETMR, C19MC-altered and ETMR, NOS is C19MC amplification. We’ll filter on LIN28A overexpression, as that is a feature of both.
Recall that we z-scored these expression values before subsetting the matrix so they are in the context of all samples for a given selection strategy.
# show samples with LIN28A expression more than 3 standard deviations above
# the mean or with evidence of C19MC amplification
select(
  filter(all_data_df, LIN28A > 3 | chr19_amplification == "Yes"),
  ends_with("ID"),
  ends_with("diagnosis"),
  LIN28A,
  TTYH1_fusions,
  chr19_amplification
)
For some samples that have a TTYH1 fusion, we do not have DNA data to check for C19MC amplification, but that’s okay per this comment, which references Kleinman et al. Nat Genet. 2014. We will call all samples with a TTYH1 fusion ETMR, C19MC-altered.
# ETMR, C19MC-altered: a TTYH1 fusion together with either LIN28A
# overexpression (z-score > 3) or C19MC amplification
# NOTE(review): rows where the combined condition evaluates to NA (e.g. a
# missing chr19_amplification with LIN28A <= 3) are dropped by filter --
# confirm that is intended
etmr_c19mc_df <- all_data_df %>%
  filter(
    (LIN28A > 3 | chr19_amplification == "Yes") &
      TTYH1_fusions != "None"
  ) %>%
  # keep only the identifier columns
  select(contains("ID")) %>%
  mutate(molecular_subtype = "ETMR, C19MC-altered")
For now, we will call the sample with a LIN28A z-score > 3 that has no TTYH1 fusion and no copy number data ETMR, NOS.
# TODO: is this the correct call or should this be a CNS Embryonal, NOS call?
# ETMR, NOS: LIN28A overexpression (z-score > 3) without a TTYH1 fusion
# NOTE(review): chr19_amplification is not consulted here -- confirm that a
# sample with C19MC amplification cannot end up in this group
etmr_nos_df <- all_data_df %>%
  filter(LIN28A > 3 & TTYH1_fusions == "None") %>%
  # keep only the identifier columns
  select(contains("ID")) %>%
  mutate(molecular_subtype = "ETMR, NOS")
- CNS high-grade neuroepithelial tumor with MN1 alteration
- Likely previously diagnosed as PNET.
- Contain gene fusions involving 5’ MN1. 3’ fusion partners can include BEND2 and CXXC5.
- Predominantly female patients.
# show samples that have an MN1 fusion, along with diagnosis and sex
select(
  filter(all_data_df, MN1_fusions != "None"),
  ends_with("ID"),
  ends_with("diagnosis"),
  germline_sex_estimate,
  MN1_fusions
)
We will call the sample with the MN1–CXXC5 fusion CNS HGNET-MN1.
# CNS HGNET-MN1: every sample with an entry other than "None" in MN1_fusions
hgnet_mn1_df <- mutate(
  select(
    filter(all_data_df, MN1_fusions != "None"),
    # keep only the identifier columns
    contains("ID")
  ),
  molecular_subtype = "CNS HGNET-MN1"
)
- CNS high-grade neuroepithelial tumor with BCOR alteration
- Tumors have internal tandem duplication of BCOR
- Median age of diagnosis less than 10 years
We have no clear evidence of BCOR alterations.
- Central nervous system (CNS) neuroblastoma with FOXR2 activation
- Over-expression and/or gene fusions in FOXR2
# show samples with a FOXR2 fusion or FOXR2 overexpression (z-score > 3)
select(
  filter(all_data_df, FOXR2_fusions != "None" | FOXR2 > 3),
  ends_with("ID"),
  FOXR2,
  FOXR2_fusions
)
# CNS NB-FOXR2: a FOXR2 fusion or FOXR2 overexpression (z-score > 3)
nb_foxr2_df <- mutate(
  select(
    filter(all_data_df, FOXR2_fusions != "None" | FOXR2 > 3),
    # keep only the identifier columns
    contains("ID")
  ),
  molecular_subtype = "CNS NB-FOXR2"
)
- CNS Ewing sarcoma family tumor with CIC alteration
- Alterations in CIC, commonly fused with NUTM1
# show samples that have a CIC fusion
filter(all_data_df, CIC_fusions != "None")
- CNS Embryonal tumor, not otherwise specified
- Tumors previously called PNET that do not fit into other groups above.
All other samples will be called CNS Embryonal, NOS with an exception (see below).
If neuroblastoma samples were included on the basis of pathology_diagnosis
, pathology_free_text_diagnosis
, and primary_site
(see 01-samples-to-subset
), but do not meet the criteria for FOXR2 alterations (CNS NB-FOXR2), those samples should not get a subtype label.
# neuroblastoma samples that were not called CNS NB-FOXR2
other_nbl_df <- all_data_df %>%
  filter(
    # All other criteria would have been met to be included in all_data_df
    pathology_diagnosis == "Neuroblastoma" &
      # Exclude samples that *do* meet FOXR2 criteria
      !(sample_id %in% nb_foxr2_df$sample_id)
  )

# display the samples
other_nbl_df
We’re going to use the sample_id
to exclude from the subtyping table.
# sample IDs of the neuroblastoma samples that should not get a subtype label
exclude_sample_id <- pull(other_nbl_df, sample_id)

# stack all of the subtype calls made above
subtypes_df <- bind_rows(
  list(
    etmr_c19mc_df,
    etmr_nos_df,
    hgnet_mn1_df,
    nb_foxr2_df
  )
)
# start from the identifiers of every sample, add the subtype calls, and label
# everything without a call CNS Embryonal, NOS
subtypes_df <- all_data_df %>%
  select(contains("ID")) %>%
  # Remove any NBL samples that did not meet CNS NB-FOXR2 criteria
  filter(!(sample_id %in% exclude_sample_id)) %>%
  # join on all four identifier columns, stated explicitly rather than
  # inferred from shared column names
  # NOTE(review): because this is a full join, a sample present in the
  # subtype calls is kept even if it was filtered out above -- confirm
  full_join(subtypes_df,
            by = c("Kids_First_Participant_ID",
                   "sample_id",
                   "Kids_First_Biospecimen_ID_DNA",
                   "Kids_First_Biospecimen_ID_RNA")) %>%
  replace_na(list(molecular_subtype = "CNS Embryonal, NOS"))
Joining, by = c("Kids_First_Participant_ID", "sample_id", "Kids_First_Biospecimen_ID_DNA", "Kids_First_Biospecimen_ID_RNA")
# write the subtype calls, ordered by participant and sample
write_tsv(
  arrange(subtypes_df, Kids_First_Participant_ID, sample_id),
  output_subtypes
)
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)
Matrix products: default
BLAS/LAPACK: /usr/lib/libopenblasp-r0.2.19.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2
[5] readr_1.3.1 tidyr_0.8.3 tibble_2.1.3 ggplot2_3.2.0
[9] tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.1 plyr_1.8.4 cellranger_1.1.0 pillar_1.4.2
[5] compiler_3.6.0 base64enc_0.1-3 tools_3.6.0 digest_0.6.20
[9] lubridate_1.7.4 jsonlite_1.6 evaluate_0.14 nlme_3.1-140
[13] gtable_0.3.0 lattice_0.20-38 pkgconfig_2.0.2 rlang_0.4.0
[17] cli_1.1.0 rstudioapi_0.10 yaml_2.2.0 haven_2.1.1
[21] xfun_0.8 withr_2.1.2 xml2_1.2.0 httr_1.4.0
[25] knitr_1.23 generics_0.0.2 hms_0.4.2 grid_3.6.0
[29] tidyselect_0.2.5 glue_1.3.1 R6_2.4.0 readxl_1.3.1
[33] rmarkdown_1.13 reshape2_1.4.3 modelr_0.1.4 magrittr_1.5
[37] backports_1.1.4 scales_1.0.0 htmltools_0.3.6 rvest_0.3.4
[41] assertthat_0.2.1 colorspace_1.4-1 stringi_1.4.3 lazyeval_0.2.2
[45] munsell_0.5.0 broom_0.5.2 crayon_1.3.4