Purpose: This notebook illustrates one way that you can use harmonized data from refine.bio in downstream analyses.

1) Install libraries

This script uses the bioconductor R package ComplexHeatmap for clustering and creating a heatmap.
Citation: Gu Z, Eils R, Schlesner M (2016). “Complex heatmaps reveal patterns and correlations in multidimensional genomic data.” Bioinformatics.

if (!("ComplexHeatmap" %in% installed.packages())) {
  # Install ComplexHeatmap
  BiocManager::install("ComplexHeatmap", update = FALSE)
}

Attach the ComplexHeatmap library:

# Attach the library
library(ComplexHeatmap)
# Magrittr pipe
`%>%` <- dplyr::`%>%`
# Set the seed so our results are reproducible:
set.seed(12345)

Create output folder.

# Create the plots folder if it doesn't exist
if (!dir.exists("plots")) {
  dir.create("plots")
}

2) Import and set up data

Data downloaded from refine.bio include a metadata tab separated values (“tsv”) file and a data tsv file. This chunk of code will read the both tsv files and add them as data.frames to your environment.

# Read in metadata tsv file
metadata <- readr::read_tsv(file.path("data", "metadata_GSE12955.tsv"))
Parsed with column specification:
cols(
  .default = col_character(),
  refinebio_age = col_logical(),
  refinebio_cell_line = col_logical(),
  refinebio_compound = col_logical(),
  refinebio_disease = col_logical(),
  refinebio_disease_stage = col_logical(),
  refinebio_genetic_information = col_logical(),
  refinebio_race = col_logical(),
  refinebio_sex = col_logical(),
  refinebio_source_archive_url = col_logical(),
  refinebio_specimen_part = col_logical(),
  refinebio_subject = col_logical(),
  refinebio_time = col_logical(),
  refinebio_treatment = col_logical(),
  channel_count = col_double(),
  data_row_count = col_double(),
  taxid_ch1 = col_double()
)
See spec(...) for full column specifications.
# Read in data tsv file
df <- readr::read_tsv(file.path("data", "GSE12955.tsv")) %>%
  tibble::column_to_rownames('Gene')
Parsed with column specification:
cols(
  Gene = col_character(),
  GSM324905 = col_double(),
  GSM324904 = col_double(),
  GSM324903 = col_double(),
  GSM324906 = col_double()
)

Let’s ensure that the metadata and data are in the same sample order.

# Make the data in the order of the metadata
df <- df %>% dplyr::select(metadata$geo_accession)
# Check if this is in the same order
all.equal(colnames(df), metadata$geo_accession)
[1] TRUE

3) Choose genes of interest

Although you may want to create a heatmap including all of the genes in the set, alternatively, the heatmap could be created using only genes of interest. For this example, we will sort genes by variance, but there are many alternative criterion by which you may want to sort your genes eg fold change, t-statistic, membership to a particular gene ontology, so on.

# Calculate the variance for each gene
variances <- apply(df, 1, var)
# Determine summary statistics for gene variances
sum.stats.var <- summary(variances)
# Subset the data choosing only genes whose variances are in the upper quartile
df.by.var <- df[which(variances > sum.stats.var[5]), ]

4) Create a heatmap

To further customize the heatmap, see the vignettes on Bioconductor for a guide at this link.

# Reference the ComplexHeatmap guide for further customizing your heatmap
browseVignettes("ComplexHeatmap")
# Create an annotation that labels samples' groups by color
annot <- HeatmapAnnotation(df = data.frame(Groups = rep(c("SP", "MP"), each = 2)),
    col = list(Groups = c("SP" = "green", "MP" = "orange")))
# Create the heatmap object
heatmap <- Heatmap(df.by.var, 
        name = "Gene_Expression",
        show_row_names = FALSE,
        show_row_dend = FALSE,   # Can show the gene/row cluster if this is 
        #changed to TRUE
        column_dend_height = unit(4, "cm"),
        bottom_annotation = annot) # assign the previously made annotation object
# Print out the heatmap
heatmap

5) Save heatmap as a png

You can easily switch this to save to a jpeg or tiff by changing the function and file name within the function to the respective file suffix.

# Open a png file
png(file.path("plots", "HeatmapGSE12955.png"))
# Print your heatmap
heatmap
# Close the png file:
dev.off()
null device 
          1 

Print session info:

LS0tCnRpdGxlOiAiQ2x1c3RlcmluZyBEYXRhIgphdXRob3I6ICJBTFNGIENDREwgLSBDYW5kYWNlIFNhdm9uZW4iCm91dHB1dDogICAKICBodG1sX25vdGVib29rOiAKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCi0tLQoKKlB1cnBvc2UqOiBUaGlzIG5vdGVib29rIGlsbHVzdHJhdGVzIG9uZSB3YXkgdGhhdCB5b3UgY2FuIHVzZSBoYXJtb25pemVkIGRhdGEgCmZyb20gcmVmaW5lLmJpbyBpbiBkb3duc3RyZWFtIGFuYWx5c2VzLgoKIyMgMSkgSW5zdGFsbCBsaWJyYXJpZXMKVGhpcyBzY3JpcHQgdXNlcyB0aGUgYmlvY29uZHVjdG9yIFIgcGFja2FnZSBDb21wbGV4SGVhdG1hcCBmb3IgY2x1c3RlcmluZyBhbmQgCmNyZWF0aW5nIGEgaGVhdG1hcC4KPGJyLz4gPGk+IENpdGF0aW9uOjwvaT4gR3UgWiwgRWlscyBSLCBTY2hsZXNuZXIgTSAoMjAxNikuIOKAnENvbXBsZXggaGVhdG1hcHMgCnJldmVhbCBwYXR0ZXJucyBhbmQgY29ycmVsYXRpb25zIGluIG11bHRpZGltZW5zaW9uYWwgZ2Vub21pYyBkYXRhLuKAnSBCaW9pbmZvcm1hdGljcy4KCmBgYHtyIEluc3RhbGwgYW5kIGF0dGFjaCB0aGUgQ29tcGxleEhlYXRtYXAgbGlicmFyeX0KaWYgKCEoIkNvbXBsZXhIZWF0bWFwIiAlaW4lIGluc3RhbGxlZC5wYWNrYWdlcygpKSkgewogICMgSW5zdGFsbCBDb21wbGV4SGVhdG1hcAogIEJpb2NNYW5hZ2VyOjppbnN0YWxsKCJDb21wbGV4SGVhdG1hcCIsIHVwZGF0ZSA9IEZBTFNFKQp9CmBgYAoKQXR0YWNoIHRoZSBgQ29tcGxleEhlYXRtYXBgIGxpYnJhcnk6CgpgYGB7cn0KIyBBdHRhY2ggdGhlIGxpYnJhcnkKbGlicmFyeShDb21wbGV4SGVhdG1hcCkKCiMgTWFncml0dHIgcGlwZQpgJT4lYCA8LSBkcGx5cjo6YCU+JWAKCiMgU2V0IHRoZSBzZWVkIHNvIG91ciByZXN1bHRzIGFyZSByZXByb2R1Y2libGU6CnNldC5zZWVkKDEyMzQ1KQpgYGAKCkNyZWF0ZSBvdXRwdXQgZm9sZGVyLgoKYGBge3J9CiMgQ3JlYXRlIHRoZSBwbG90cyBmb2xkZXIgaWYgaXQgZG9lc24ndCBleGlzdAppZiAoIWRpci5leGlzdHMoInBsb3RzIikpIHsKICBkaXIuY3JlYXRlKCJwbG90cyIpCn0KYGBgCgojIyAyKSBJbXBvcnQgYW5kIHNldCB1cCBkYXRhCkRhdGEgZG93bmxvYWRlZCBmcm9tIHJlZmluZS5iaW8gaW5jbHVkZSBhIG1ldGFkYXRhIHRhYiBzZXBhcmF0ZWQgdmFsdWVzICgidHN2IikKZmlsZSBhbmQgYSBkYXRhIHRzdiBmaWxlLiBUaGlzIGNodW5rIG9mIGNvZGUgd2lsbCByZWFkIHRoZSBib3RoIHRzdiBmaWxlcyBhbmQgCmFkZCB0aGVtIGFzIGRhdGEuZnJhbWVzIHRvIHlvdXIgZW52aXJvbm1lbnQuCgpgYGB7ciBJbXBvcnQgZGF0YSBmcm9tIC50c3YgZmlsZXN9CiMgUmVhZCBpbiBtZXRhZGF0YSB0c3YgZmlsZQptZXRhZGF0YSA8LSByZWFkcjo6cmVhZF90c3YoZmlsZS5wYXRoKCJkYXRhIiwgIm1ldGFkYXRhX0dTRTEyOTU1LnRzdiIpKQoKIyBSZWFkIGluIGRhdGEgdHN2IGZpbGUKZGYgPC0gcmVhZHI6OnJlYWRfdHN2KGZpbGUucGF0aCgiZGF0YSIsICJHU0UxMjk1NS50c3YiKSkgJT4lCiAgdGliYmxlOjpjb2x1bW5fdG9fcm93bmFtZXMoJ0dlbmUnKQpgYGAKCkxldCdzIGVuc3VyZSB0aGF0IHRoZSBtZXRhZGF0YSBhbmQgZGF0YSBhcmUgaW4gdGhlIHNhbWUgc2FtcGxlIG9yZGVyLiAKCmBgYHtyfQojIE1ha2UgdGhlIGRhdGEgaW4gdGhlIG9yZGVyIG9mIHRoZSBtZXRhZGF0YQpkZiA8LSBkZiAlPiUgZHBseXI6OnNlbGVjdChtZXRhZGF0YSRnZW9fYWNjZXNzaW9uKQoKIyBDaGVjayBpZiB0aGlzIGlzIGluIHRoZSBzYW1lIG9yZGVyCmFsbC5lcXVhbChjb2xuYW1lcyhkZiksIG1ldGFkYXRhJGdlb19hY2Nlc3Npb24pCmBgYAoKIyMgMykgQ2hvb3NlIGdlbmVzIG9mIGludGVyZXN0CkFsdGhvdWdoIHlvdSBtYXkgd2FudCB0byBjcmVhdGUgYSBoZWF0bWFwIGluY2x1ZGluZyBhbGwgb2YgdGhlIGdlbmVzIGluIHRoZSBzZXQsCmFsdGVybmF0aXZlbHksIHRoZSBoZWF0bWFwIGNvdWxkIGJlIGNyZWF0ZWQgdXNpbmcgb25seSBnZW5lcyBvZiBpbnRlcmVzdC4gCkZvciB0aGlzIGV4YW1wbGUsIHdlIHdpbGwgc29ydCBnZW5lcyBieSB2YXJpYW5jZSwgYnV0IHRoZXJlIGFyZSBtYW55IGFsdGVybmF0aXZlCmNyaXRlcmlvbiBieSB3aGljaCB5b3UgbWF5IHdhbnQgdG8gc29ydCB5b3VyIGdlbmVzIDxpPmVnPC9pPiBmb2xkIGNoYW5nZSwKdC1zdGF0aXN0aWMsIG1lbWJlcnNoaXAgdG8gYSBwYXJ0aWN1bGFyIGdlbmUgb250b2xvZ3ksIHNvIG9uLiAKCmBgYHtyIENob29zZSBnZW5lc30KIyBDYWxjdWxhdGUgdGhlIHZhcmlhbmNlIGZvciBlYWNoIGdlbmUKdmFyaWFuY2VzIDwtIGFwcGx5KGRmLCAxLCB2YXIpCgojIERldGVybWluZSBzdW1tYXJ5IHN0YXRpc3RpY3MgZm9yIGdlbmUgdmFyaWFuY2VzCnN1bS5zdGF0cy52YXIgPC0gc3VtbWFyeSh2YXJpYW5jZXMpCgojIFN1YnNldCB0aGUgZGF0YSBjaG9vc2luZyBvbmx5IGdlbmVzIHdob3NlIHZhcmlhbmNlcyBhcmUgaW4gdGhlIHVwcGVyIHF1YXJ0aWxlCmRmLmJ5LnZhciA8LSBkZlt3aGljaCh2YXJpYW5jZXMgPiBzdW0uc3RhdHMudmFyWzVdKSwgXQpgYGAKCiMjIDQpIENyZWF0ZSBhIGhlYXRtYXAKVG8gZnVydGhlciBjdXN0b21pemUgdGhlIGhlYXRtYXAsIHNlZSB0aGUgdmlnbmV0dGVzIG9uIEJpb2NvbmR1Y3RvciBmb3IgYSBndWlkZSAKYXQgdGhpcyBbbGlua10oaHR0cHM6Ly9iaW9jb25kdWN0b3Iub3JnL3BhY2thZ2VzL2RldmVsL2Jpb2MvdmlnbmV0dGVzL0NvbXBsZXhIZWF0bWFwL2luc3QvZG9jL3MxLmludHJvZHVjdGlvbi5odG1sKS4KCmBgYHtyIENyZWF0ZSBhIGhlYXRtYXB9CiMgUmVmZXJlbmNlIHRoZSBDb21wbGV4SGVhdG1hcCBndWlkZSBmb3IgZnVydGhlciBjdXN0b21pemluZyB5b3VyIGhlYXRtYXAKYnJvd3NlVmlnbmV0dGVzKCJDb21wbGV4SGVhdG1hcCIpCgojIENyZWF0ZSBhbiBhbm5vdGF0aW9uIHRoYXQgbGFiZWxzIHNhbXBsZXMnIGdyb3VwcyBieSBjb2xvcgphbm5vdCA8LSBIZWF0bWFwQW5ub3RhdGlvbihkZiA9IGRhdGEuZnJhbWUoR3JvdXBzID0gcmVwKGMoIlNQIiwgIk1QIiksIGVhY2ggPSAyKSksCiAgICBjb2wgPSBsaXN0KEdyb3VwcyA9IGMoIlNQIiA9ICJncmVlbiIsICJNUCIgPSAib3JhbmdlIikpKQoKIyBDcmVhdGUgdGhlIGhlYXRtYXAgb2JqZWN0CmhlYXRtYXAgPC0gSGVhdG1hcChkZi5ieS52YXIsIAogICAgICAgIG5hbWUgPSAiR2VuZV9FeHByZXNzaW9uIiwKICAgICAgICBzaG93X3Jvd19uYW1lcyA9IEZBTFNFLAogICAgICAgIHNob3dfcm93X2RlbmQgPSBGQUxTRSwgICAjIENhbiBzaG93IHRoZSBnZW5lL3JvdyBjbHVzdGVyIGlmIHRoaXMgaXMgCiAgICAgICAgI2NoYW5nZWQgdG8gVFJVRQogICAgICAgIGNvbHVtbl9kZW5kX2hlaWdodCA9IHVuaXQoNCwgImNtIiksCiAgICAgICAgYm90dG9tX2Fubm90YXRpb24gPSBhbm5vdCkgIyBhc3NpZ24gdGhlIHByZXZpb3VzbHkgbWFkZSBhbm5vdGF0aW9uIG9iamVjdAoKIyBQcmludCBvdXQgdGhlIGhlYXRtYXAKaGVhdG1hcApgYGAKCiMjIDUpIFNhdmUgaGVhdG1hcCBhcyBhIHBuZwpZb3UgY2FuIGVhc2lseSBzd2l0Y2ggdGhpcyB0byBzYXZlIHRvIGEganBlZyBvciB0aWZmIGJ5IGNoYW5naW5nIHRoZSBmdW5jdGlvbiAKYW5kIGZpbGUgbmFtZSB3aXRoaW4gdGhlIGZ1bmN0aW9uIHRvIHRoZSByZXNwZWN0aXZlIGZpbGUgc3VmZml4LgoKYGBge3IgU2F2ZSBoZWF0bWFwIGFzIGEgcG5nfQojIE9wZW4gYSBwbmcgZmlsZQpwbmcoZmlsZS5wYXRoKCJwbG90cyIsICJIZWF0bWFwR1NFMTI5NTUucG5nIikpCgojIFByaW50IHlvdXIgaGVhdG1hcApoZWF0bWFwCgojIENsb3NlIHRoZSBwbmcgZmlsZToKZGV2Lm9mZigpCmBgYAoKUHJpbnQgc2Vzc2lvbiBpbmZvOgoKYGBge3IgUHJpbnQgc2Vzc2lvbiBpbmZvfQojIFByaW50IHNlc3Npb24gaW5mbyAKc2Vzc2lvbkluZm8oKQpgYGAK