Library preparation and alignment

Detailed description of the RNASeq data generation and preprocessing can be found in the supplementary material of the paper by Zhernakova et al. (Zhernakova et al. 2017)

Feature quantification read counts

Gene read counts are generated using htseq using mode union for handling overlapping exons.

Reference annotation used is from ensembl: but only chromosomes 1-22 and X, Y (and MT) are retained.

Prepare gene annotation

For convenience we create a TxDb-package. For more information on using a TxDb-package see GenomicFeatures.

*** Note this requires sudo rights to install and use sudo -i to load /etc/profile with global R_SITE_LIB =/opt/… ***

library(GenomicFeatures)
library(rtracklayer)
gtf <- "ftp://ftp.ensembl.org/pub/release-71/gtf/homo_sapiens/Homo_sapiens.GRCh37.71.gtf.gz"
txdb <- makeTxDbFromGFF(gtf, organism="Homo sapiens")
##Hack by James MacDonald (https://support.bioconductor.org/p/89855/)
con <- dbconn(txdb)
## insert a Resource URL entry in the metadata table,
## pointing to the ftp site where the data were procured
DBI::dbGetQuery(con, "INSERT INTO metadata VALUES ('Resource URL', 'ftp://ftp.ensembl.org/pub/release-71/gtf/homo_sapiens/Homo_sapiens.GRCh37.71.gtf.gz');")
makeTxDbPackage(txdb,
                version = "0.0.1",
                maintainer = "m. van Iterson <m.van_iterson@lumc.nl>",
                author = "m. van Iterson",
                destDir= ".",
                license = "Artistic-2.0",
                pkgname = "TxDb.Hsapiens.Ensembl.v71")
library(devtools)
install("./TxDb.Hsapiens.Ensembl.v71")

Now can use the Ensembl v71 annoation like any other TxDb-package.

library(TxDb.Hsapiens.Ensembl.v71)
(txdb <- TxDb.Hsapiens.Ensembl.v71)
genes(txdb)

Prepare sample metadata

library(BBMRIomics)
colData <- read.table(file=file.path(VM_BASE_ANALYSIS, "BBMRIomics/BBMRIomics/data", "freeze2_rna_identifiers.csv"), header=TRUE, sep=",", as.is=TRUE)

##add methylation samplesheet
rdb <- getView("getStats", usrpwd=RP3_MDB_USRPWD, url=RP3_RDB)
rdb <- do.call('cbind', rdb)
colData <- merge(colData, rdb, by.x="run_id", by.y="ids", all.x=TRUE)

##harmonize phenotypes
source(file.path(path.package("BBMRIomics"), "scripts/Phenotype_Helpers.R"), verbose=FALSE)
phenotypes <- cleanPhenotypes()

##add phenotypes
colData <- merge(colData, phenotypes, by="ids", all.x=TRUE) 

##add genotype ids
imputation <- getView("getImputations", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
hrc <- subset(imputation, imputation_reference == "HRC")
colData <- merge(colData, hrc[, c("ids", "imputation_id")], by="ids", all.x=TRUE)

##add uuid and gonl_id
ids <- getView("getIds", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
colData <- merge(colData, ids[, c("ids", "uuid", "gonl_id", "biobank_id")], by="ids", all.x=TRUE)

##ordering
first <- match(c("ids", "uuid", "biobank_id", "gonl_id", "run_id"), colnames(colData))

colData <- cbind(colData[, first], colData[, -first])
rownames(colData) <- colData$uuid

##add flowcell maybe other derived lane, index?
colData$flowcell <- gsub("-.*$", "", colData$run_id)

dim(colData)
head(colData)

table(!is.na(colData$gonl_id))
table(!is.na(colData$imputation_id))
table(!(is.na(colData$imputation_id) & is.na(colData$gonl_id)))

Prepare count table

library(BiocParallel)
register(MulticoreParam(32, log=TRUE))
files <- list.files(file.path(VM_BASE_DATA, "RNASeq/v2.1.3/", "gene_read"), full.names=TRUE, pattern="gene.read.count.gz$")
##select runs

files <- files[gsub(".gene.read.count.gz", "", basename(files)) %in% colData$run_id]

head(read.table(files[1], header=TRUE, check.names=FALSE)) ##checking the first few lines
tmp <- bplapply(files, read.table, header=TRUE, check.names=FALSE)
table(unlist(lapply(tmp, nrow))) ##checking all the same length
for(i in 2:length(tmp)) { ##checking all genes same position
    if(!all.equal(tmp[[1]]$gene, tmp[[i]]$gene))
        message("not equal:", i)
}
rownames <- as.character(tmp[[1]]$gene)
colnames <- gsub("\\.gene.read.count.gz", "", basename(files))
counts <- matrix(nrow=nrow(tmp[[1]]), ncol=length(tmp),
                 dimnames=list(rownames, colnames))
for(i in 1:length(tmp)) counts[,i] <- tmp[[i]][,2]
counts[1:5, 1:5]
mid <- match(colnames(counts), colData$run_id)
colnames(counts) <- colData$uuid[mid]

library(TxDb.Hsapiens.Ensembl.v71)
rowRanges <- genes(TxDb.Hsapiens.Ensembl.v71)
library(GenomeInfoDb)
rowRanges <- keepSeqlevels(rowRanges, c(1:22, "X", "Y", "MT"))
rowRanges
head(colData)
counts[1:5, 1:5]
counts <- makeSE(counts, colData[,-1], rowRanges, note="A maximal set of unrelated individuals for which both good quality RNA sequencing and DNA methylation data could be generated.")
save(counts, file = file.path(VM_BASE_DATA, "RNASeq/v2.1.3/gene_read/", paste0("rnaSeqData_ReadCounts_cleaned.RData")))
tmp <- counts
for(biobank in unique(tmp$biobank_id)) {
    counts <- tmp[, tmp$biobank_id == biobank]
    save(counts, file = file.path(VM_BASE_DATA, "RNASeq/v2.1.3/gene_read/", paste0("rnaSeqData_ReadCounts_", biobank, "_cleaned.RData")))
}

Create GoNL-RNAseq dataset

Prepare sample metadata

library(BBMRIomics)
ids <- getView("getIds", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
gonl <- subset(ids, !is.na(gonl_id))

rna <- getView("getRNASeqRuns", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
rna <- rna[!duplicated(rna$run_id),] ##drop overlapping freezes
rna <- subset(rna, qc == "passed") ##keep those passing qc
rna <- subset(rna, type != "replicate") ##drop replicated
##now we have still original, reruns and merged runs
merged <- subset(rna, type == "merged") ##select
merged <- subset(merged, !(run_id %in% c("BD1NYRACXX-5-27_AC1JL5ACXX-6-27", "AC1C40ACXX-6-15_BD2D5MACXX-6-15"))) ##these have were already merged in freeze one and have new merges so drop
original <- subset(rna, !(ids %in% merged$ids) & type == "original")
rna <- rbind(merged, original)
rna <- subset(rna, ids %in% gonl$ids)

colData <- rna

##add methylation samplesheet
rdb <- getView("getStats", usrpwd=RP3_MDB_USRPWD, url=RP3_RDB)
rdb <- do.call('cbind', rdb)
colData <- merge(colData, rdb, by.x="run_id", by.y="ids", all.x=TRUE)

##harmonize phenotypes
source(file.path(path.package("BBMRIomics"), "scripts/Phenotype_Helpers.R"), verbose=FALSE)
phenotypes <- cleanPhenotypes()

##add phenotypes
colData <- merge(colData, phenotypes, by="ids", all.x=TRUE) 

##add uuid and gonl_id
ids <- getView("getIds", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
colData <- merge(colData, ids[, c("ids", "uuid", "gonl_id")], by="ids", all.x=TRUE)

##ordering
first <- match(c("ids", "uuid", "biobank_id", "gonl_id", "run_id"), colnames(colData))

colData <- cbind(colData[, first], colData[, -first])
rownames(colData) <- colData$uuid

##add flowcell maybe other derived lane, index?
colData$flowcell <- gsub("-.*$", "", colData$run_id)

dim(colData)
head(colData)

table(!is.na(colData$gonl_id))

Prepare count table

library(BiocParallel)
register(MulticoreParam(16, log=TRUE))
files <- list.files(file.path(VM_BASE_DATA, "RNASeq/v2.1.3/", "gene_read"), full.names=TRUE, pattern="gene.read.count.gz$")
##select runs

files <- files[gsub(".gene.read.count.gz", "", basename(files)) %in% colData$run_id]

head(read.table(files[1], header=TRUE, check.names=FALSE)) ##checking the first few lines
tmp <- bplapply(files, read.table, header=TRUE, check.names=FALSE)
table(unlist(lapply(tmp, nrow))) ##checking all the same length
for(i in 2:length(tmp)) { ##checking all genes same position
    if(!all.equal(tmp[[1]]$gene, tmp[[i]]$gene))
        message("not equal:", i)
}
rownames <- as.character(tmp[[1]]$gene)
colnames <- gsub("\\.gene.read.count.gz", "", basename(files))
counts <- matrix(nrow=nrow(tmp[[1]]), ncol=length(tmp),
                 dimnames=list(rownames, colnames))
for(i in 1:length(tmp)) counts[,i] <- tmp[[i]][,2]
counts[1:5, 1:5]
mid <- match(colnames(counts), colData$run_id)
colnames(counts) <- colData$uuid[mid]

library(TxDb.Hsapiens.Ensembl.v71)
rowRanges <- genes(TxDb.Hsapiens.Ensembl.v71)
library(GenomeInfoDb)
rowRanges <- keepSeqlevels(rowRanges, c(1:22, "X", "Y", "MT"))
rowRanges
head(colData)
counts[1:5, 1:5]
counts <- makeSE(counts, colData[,-1], rowRanges, note="GoNL subset RNAseq data")
save(counts, file = file.path(VM_BASE_DATA, "RNASeq/v2.1.3/gene_read/", paste0("rnaSeqData_ReadCounts_GoNL.RData")))

Miscellaneous

Code to calculate gene-level GC-content.

library(AnnotationHub)
ah <- AnnotationHub()
query(ah, c("ensembl", "sapiens", "71"))

gtf <- rtracklayer::import(ah["AH7666"]$sourceurl)

library(GenomeInfoDb)
gtf <- keepSeqlevels(gtf, c(1:22, "X", "Y", "MT"))
gtf <- gtf[order(gtf)] ##now we have `cut and sorted` as on the VM

seqlevels(gtf) <- mapSeqlevels(seqlevels(gtf), "UCSC")  ##for use in combination with `BSgenome.Hsapiens.UCSC.hg19`
genome(gtf) <- "hg19" ##for use in combination with `BSgenome.Hsapiens.UCSC.hg19`

genes <- split(gtf, mcols(gtf)$gene_id)

register(MulticoreParam(8))
gc <- bplapply(genes, function(x) {
    require(BSgenome.Hsapiens.UCSC.hg19)
    require(Rsamtools)
    require(Biostrings)
    seqs <- getSeq(Hsapiens, reduce(x))
    alf <- as.matrix(alphabetFrequency(seqs, as.prob=TRUE)[, c("G", "C")], ncol=2)
    gc <- rowSums(alf)
    mean(gc)
})
gc <- do.call("rbind", gc)
all.equal(names(genes), rownames(gc))
mcols(genes)$gc <- gc
genes

Zhernakova, D. V., P. Deelen, M. Vermaat, M. van Iterson, M. van Galen, W. Arindrarto, P. van ’t Hof, et al. 2017. “Identification of context-dependent expression quantitative trait loci in whole blood.” Nat. Genet. 49 (1): 139–45.