Detailed description of the RNASeq data generation and preprocessing can be found in the supplementary material of the paper by Zhernakova et al. (Zhernakova et al. 2017)
Gene read counts are generated using featurecounts.
Reference annotation used is from ensembl: release 91
For convenience we create a TxDb
-package. For more information on using a TxDb
-package see GenomicFeatures.
*** Note this requires sudo rights to install and use sudo -i to load /etc/profile with global R_SITE_LIB =/opt/… ***
library(GenomicFeatures)
library(rtracklayer)
gtf <- "ftp://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens/Homo_sapiens.GRCh38.91.gtf.gz"
txdb <- makeTxDbFromGFF(gtf, organism="Homo sapiens")
##Hack by James MacDonald (https://support.bioconductor.org/p/89855/)
con <- dbconn(txdb)
## insert a Resource URL entry in the metadata table,
## pointing to the ftp site where the data were procured
DBI::dbGetQuery(con, "INSERT INTO metadata VALUES ('Resource URL', 'ftp://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens/Homo_sapiens.GRCh38.91.gtf.gz');")
makeTxDbPackage(txdb,
version = "0.0.1",
maintainer = "D. Cats <d.cats@lumc.nl>",
author = "d. cats",
destDir= ".",
license = "Artistic-2.0",
pkgname = "TxDb.Hsapiens.Ensembl.v91")
library(devtools)
install("./TxDb.Hsapiens.Ensembl.v91")
Now we can use the Ensembl v91 annoation like any other TxDb
-package.
Metadata for the samples is retrieved from the metadatabase. This is done only for the samples which were part of maximum set of unrelated individuals from Freeze2. These are the same samples as were uploaded to EGA (Fastq files: EGAD00001003785).
library(BBMRIomics)
freeze2_unrelated <- runQuery(paste0(
"SELECT uuid, biobank as biobank_id, bios_id AS ids, gonl_id, ",
"rna_run.rna_run_id AS run_id, age AS sampling_age, ",
"COALESCE(rna_sampling_date, dna_bloodsampling_date, ",
"cellcount_bloodsampling_date, lipids_bloodsampling_date, ",
"crp_bloodSampling_date) AS sampling_date, ",
"COALESCE(rna_sampling_time, dna_bloodsampling_time, ",
"lipids_bloodsampling_time, crp_bloodsampling_time, ",
"cellcount_bloodsampling_time) AS sampling_time, ",
"sex, birth_year, weight, height, anthropometry_age, smoking, smoking_age, ",
"ascertainment_criterion, totchol, triglycerides, hdlchol, ldlchol, ",
"ldlcholmethod, hscrp, lipidmed, lipidsmed_age, lipids_bloodsampling_fasting, ",
"rna_source, rna_rin, rna_a260a280ratio, rna_extraction_method, ",
"rna_extraction_date, dna_source, dna_quantificationmethod, dna_a260a280ratio, ",
"dna_extraction_method, dna_extraction_date, gwas_chip, ",
"gwas_datageneration_date, baso, baso_perc, eos, eos_perc, granulocyte, ",
"granulocyte_perc, luc, luc_perc, lymph, lymph_perc, mono, mono_perc, neut, ",
"neut_perc, hct, hgb, mch, mchc, mcv, mpv, plt, rbc, rdw, wbc, ",
"flowcell_num, machine, run_num, insert_size, rna_id ",
"FROM rna_freeze ",
"LEFT JOIN rna_run ON rna_freeze.rna_run_id = rna_run.rna_run_id ",
"LEFT JOIN rna_sample ON rna_run.rna_sample_id = rna_sample.rna_sample_id ",
"LEFT JOIN visit ON rna_sample.visit_id = visit.visit_id ",
"LEFT JOIN person ON visit.person_id = person.person_id ",
"LEFT JOIN gwas on person.person_id = gwas.person_id ",
"LEFT JOIN dna_sample on visit.visit_id = dna_sample.visit_id ",
"WHERE set = 'unrelated' AND freeze_num = 2;"),
usrpwd = RP3_MDB_USRPWD)
Some of the biobank ids need to be relabeled to be conformant with the previous datasets.
Information about genotypes are still missing from the metadata of the samples. Genotype imputation ids are retrieved from the meta database and added to the sample metadata.
imputation <- getSQLview("getimputations")
hrc <- subset(imputation, imputation_reference == "HRC")
hrc <- hrc[, c("ids", "imputations_id")]
names(hrc) <- c("ids", "imputation_id_hrc")
hrcv1.1 <- subset(imputation, imputation_reference == "HRCv1.1")
hrcv1.1 <- hrcv1.1[, c("ids", "imputations_id")]
names(hrcv1.1) <- c("ids", "imputation_id_hrcv1.1")
freeze2_unrelated <- merge(freeze2_unrelated, hrc[, c("ids", "imputation_id_hrc")],
by="ids", all.x=TRUE)
freeze2_unrelated <- merge(freeze2_unrelated, hrcv1.1[, c("ids", "imputation_id_hrcv1.1")],
by="ids", all.x=TRUE)
There are a number of fields which contain inconsistent values. The potentially affected fields are checked and adjusted accordingly.
table(freeze2_unrelated$biobank)
table(freeze2_unrelated$sex)
table(freeze2_unrelated$smoking)
table(freeze2_unrelated$ascertainment_criterion) #needs some normalization
table(freeze2_unrelated$ldlcholmethod)
table(freeze2_unrelated$lipidmed)
table(freeze2_unrelated$lipids_bloodsampling_fasting)
table(freeze2_unrelated$rna_source)
table(freeze2_unrelated$rna_extraction_method) # needs some normalization
table(freeze2_unrelated$dna_source)
table(freeze2_unrelated$dna_quantificationmethod) # needs some normalization
table(freeze2_unrelated$dna_extraction_method) # needs some normalization
table(freeze2_unrelated$gwas_chip) # needs some normalization
## ascertainment_criterion
freeze2_unrelated$ascertainment_criterion <- gsub("^GoNL$", "GONL_subject",
freeze2_unrelated$ascertainment_criterion)
freeze2_unrelated$ascertainment_criterion <- gsub("^Complete genomics sequencing$",
"CG_subject",
freeze2_unrelated$ascertainment_criterion)
freeze2_unrelated$ascertainment_criterion <- gsub("^GoNL / Complete genomics sequencing",
"GoNL/CG_subject",
freeze2_unrelated$ascertainment_criterion)
table(freeze2_unrelated$ascertainment_criterion)
## rna_extraction_method
freeze2_unrelated$rna_extraction_method[!is.na(freeze2_unrelated$rna_extraction_method)] <-
"PAXgene Blood RNA Kit (Qiagen)"
table(freeze2_unrelated$rna_extraction_method)
## dna_quantificationmethod
freeze2_unrelated$dna_quantificationmethod <- gsub("nanodrop|nano", "Nanodrop",
freeze2_unrelated$dna_quantificationmethod)
freeze2_unrelated$dna_quantificationmethod <- gsub(
"spectrofotometer|Spectofotometer|spectofotometer", "Spectrofotometer",
freeze2_unrelated$dna_quantificationmethod)
table(freeze2_unrelated$dna_quantificationmethod)
## dna_extraction_method
freeze2_unrelated$dna_extraction_method <- gsub("salting out|Saltingout", "Salting out",
freeze2_unrelated$dna_extraction_method)
freeze2_unrelated$dna_extraction_method <- gsub("QIAamp DNA minikit", "DNA Mini Kit (Qiaamp)",
freeze2_unrelated$dna_extraction_method)
table(freeze2_unrelated$dna_extraction_method)
## gwas_chip
freeze2_unrelated$gwas_chip <- gsub(" *, *", "/", freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("Illumina human omni express", "Illumina OmniExpress",
freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("Illumina660", "Illumina Human660-Quad",
freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("OverlappingSNPsfrom", "Overlapping SNPs from ",
freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("GONLSequenceData", "GONL", freeze2_unrelated$gwas_chip)
table(freeze2_unrelated$gwas_chip)
For some samples the LDLchol values are missing, but can be inferred from the totChol, HDLchol and TriGlycerides using the Friedewald estimation.
table(is.na(freeze2_unrelated$ldlchol))
##Calculate LDLchol, for those missing, from Tot, HDL and TriGlycerides
##http://www.gpnotebook.co.uk/simplepage.cfm?ID=x20030114211535665170
LDLchol <- freeze2_unrelated$totchol - freeze2_unrelated$hdlchol - (freeze2_unrelated$triglycerides/2.2)
freeze2_unrelated$ldlcholmethod[is.na(freeze2_unrelated$ldlchol)] <- "Friedewald estimation"
freeze2_unrelated$ldlchol[is.na(freeze2_unrelated$ldlchol)] <- LDLchol[is.na(freeze2_unrelated$ldlchol)]
## Friedewald estimation should not be used when triglycerides exceed 4.52
freeze2_unrelated$ldlcholmethod[freeze2_unrelated$triglycerides > 4.52 &
freeze2_unrelated$ldlcholmethod == "Friedewald estimation"] <- NA
freeze2_unrelated$ldlchol[freeze2_unrelated$triglycerides > 4.52 &
freeze2_unrelated$ldlcholmethod == "Friedewald estimation"] <- NA
table(is.na(freeze2_unrelated$ldlchol))
# prepare count table
library(BiocParallel)
register(MulticoreParam(32, log=TRUE))
files <- list.files(file.path(VM_BASE_DATA, "RNASeq/GRCh38/", "gene_read"),
full.names=TRUE, pattern=".tsv$")
##select runs
files <- files[gsub("\\.tsv$", "", basename(files)) %in% freeze2_unrelated$run_id]
head(read.table(files[1], header=TRUE, check.names=FALSE)) ##checking the first few lines
tmp <- bplapply(files, read.table, header=TRUE, check.names=FALSE)
table(unlist(lapply(tmp, nrow))) ##checking all the same length
for(i in 2:length(tmp)) { ##checking all genes same position
if(!all.equal(tmp[[1]]$gene, tmp[[i]]$gene))
message("not equal:", i)
}
rownames <- as.character(tmp[[1]]$feature)
colnames <- gsub("\\.tsv$", "", basename(files))
countsMat <- matrix(nrow=nrow(tmp[[1]]), ncol=length(tmp),
dimnames=list(rownames, colnames))
for(i in 1:length(tmp)) countsMat[,i] <- tmp[[i]][,2]
countsMat[1:5, 1:5]
mid <- match(colnames(countsMat), freeze2_unrelated$run_id)
colnames(countsMat) <- freeze2_unrelated$uuid[mid]
# remove unassigned counts
countsMat <- countsMat[grep("^Unassigned_", rownames(countsMat), invert=T),]
# prepare annotation
library(TxDb.Hsapiens.Ensembl.v91)
rowRanges <- genes(TxDb.Hsapiens.Ensembl.v91)
rownames(freeze2_unrelated) <- freeze2_unrelated$uuid
rowRanges
head(freeze2_unrelated)
countsMat[1:5, 1:5]
# make SE
counts <- makeSE(countsMat, freeze2_unrelated[,-1], rowRanges,
author="D. Cats", dbVersion=mdbVersion(), note=paste0(
"A maximal set of unrelated individuals for which both good quality RNA",
"sequencing and DNA methylation data could be generated."))
save(counts, file = file.path(VM_BASE_DATA, "RNASeq/v2.1.3/gene_read/",
"rnaSeqData_ReadCounts_BIOS_Freeze2_unrelated_GRCh38.RData"),
compress = "xz")
tmp <- counts
table(counts$biobank_id)
for(biobank in unique(tmp$biobank_id)) {
counts <- tmp[, tmp$biobank_id == biobank]
print(table(counts$biobank_id))
save(counts, file = file.path(VM_BASE_DATA, "RNASeq/v2.1.3/gene_read/", paste0(
"rnaSeqData_ReadCounts_", biobank, "_Freeze2_unrelated_GRCh38.RData")),
compress = "xz")
}
Zhernakova, D. V., P. Deelen, M. Vermaat, M. van Iterson, M. van Galen, W. Arindrarto, P. van ’t Hof, et al. 2017. “Identification of context-dependent expression quantitative trait loci in whole blood.” Nat. Genet. 49 (1): 139–45.