Library preparation and alignment

Detailed description of the RNASeq data generation and preprocessing can be found in the supplementary material of the paper by Zhernakova et al. (Zhernakova et al. 2017)

Feature quantification read counts

Gene read counts are generated using htseq using mode union for handling overlapping exons.

Reference annotation used is from ensembl: but only chromosomes 1-22 and X, Y (and MT) are retained.

Prepare count table

Create GoNL-RNAseq dataset

Prepare sample metadata

library(BBMRIomics)
ids <- getView("getIds", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
gonl <- subset(ids, !is.na(gonl_id))

rna <- getView("getRNASeqRuns", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
rna <- rna[!duplicated(rna$run_id),] ##drop overlapping freezes
rna <- subset(rna, qc == "passed") ##keep those passing qc
rna <- subset(rna, type != "replicate") ##drop replicated
##now we have still original, reruns and merged runs
merged <- subset(rna, type == "merged") ##select
merged <- subset(merged, !(run_id %in% c("BD1NYRACXX-5-27_AC1JL5ACXX-6-27", "AC1C40ACXX-6-15_BD2D5MACXX-6-15"))) ##these have were already merged in freeze one and have new merges so drop
original <- subset(rna, !(ids %in% merged$ids) & type == "original")
rna <- rbind(merged, original)
rna <- subset(rna, ids %in% gonl$ids)

colData <- rna

##add methylation samplesheet
rdb <- getView("getStats", usrpwd=RP3_MDB_USRPWD, url=RP3_RDB)
rdb <- do.call('cbind', rdb)
colData <- merge(colData, rdb, by.x="run_id", by.y="ids", all.x=TRUE)

##harmonize phenotypes
source(file.path(path.package("BBMRIomics"), "scripts/Phenotype_Helpers.R"), verbose=FALSE)
phenotypes <- cleanPhenotypes()

##add phenotypes
colData <- merge(colData, phenotypes, by="ids", all.x=TRUE) 

##add uuid and gonl_id
ids <- getView("getIds", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
colData <- merge(colData, ids[, c("ids", "uuid", "gonl_id")], by="ids", all.x=TRUE)

##ordering
first <- match(c("ids", "uuid", "biobank_id", "gonl_id", "run_id"), colnames(colData))

colData <- cbind(colData[, first], colData[, -first])
rownames(colData) <- colData$uuid

##add flowcell maybe other derived lane, index?
colData$flowcell <- gsub("-.*$", "", colData$run_id)

dim(colData)
head(colData)

table(!is.na(colData$gonl_id))