Metadata for the samples is retrieved from the metadatabase. This is done only for the samples which were part of maximum set of unrelated individuals from Freeze2. These are the same samples as were uploaded to EGA (EGAD00010001416).
library(BBMRIomics)
freeze2_unrelated <- runQuery(paste0(
"SELECT uuid, biobank AS biobank_id, bios_id AS ids, gonl_id, ",
"methylation_450k_run.methylation_450k_run_id AS run_id, ",
"age AS sampling_age, COALESCE(dna_bloodsampling_date, rna_sampling_date, ",
"cellcount_bloodsampling_date, lipids_bloodsampling_date, ",
"crp_bloodSampling_date) AS sampling_date, ",
"COALESCE(dna_bloodsampling_time, rna_sampling_time, ",
"lipids_bloodsampling_time, crp_bloodsampling_time, ",
"cellcount_bloodsampling_time) AS sampling_time, ",
"sex, birth_year, weight, height, anthropometry_age, smoking, smoking_age, ",
"ascertainment_criterion, totchol, triglycerides, hdlchol, ldlchol, ",
"ldlcholmethod, hscrp, lipidmed, lipidsmed_age, lipids_bloodsampling_fasting, ",
"rna_source, rna_rin, rna_a260a280ratio, rna_extraction_method, ",
"rna_extraction_date, dna_source, dna_quantificationmethod, dna_a260a280ratio, ",
"dna_extraction_method, dna_extraction_date, gwas_chip, ",
"gwas_datageneration_date, baso, baso_perc, eos, eos_perc, granulocyte, ",
"granulocyte_perc, luc, luc_perc, lymph, lymph_perc, mono, mono_perc, neut, ",
"neut_perc, hct, hgb, mch, mchc, mcv, mpv, plt, rbc, rdw, wbc, ",
"c1_lotnumber, c1_barcode, c2_lotnumber, c2_barcode, sentrix_lotnumber, ",
"sentrix_barcode, sentrix_position, tem_lotnumber, tem_barcode, atm_lotnumber, ",
"stm_lotnumber, stm_barcode, sample_well, sample_plate, scan_date, ",
"scan_time, scanner_name, stain_date, library_date, hybridization_date ",
"FROM methylation_450k_freeze ",
"LEFT JOIN methylation_450k_run ON ",
"methylation_450k_freeze.methylation_450k_run_id = ",
"methylation_450k_run.methylation_450k_run_id ",
"LEFT JOIN dna_sample ON methylation_450k_run.dna_sample_id = dna_sample.dna_sample_id ",
"LEFT JOIN visit ON dna_sample.visit_id = visit.visit_id ",
"LEFT JOIN person ON visit.person_id = person.person_id ",
"LEFT JOIN gwas on person.person_id = gwas.person_id ",
"LEFT JOIN rna_sample on visit.visit_id = rna_sample.visit_id ",
"WHERE set = 'unrelated' AND freeze_num = 2;"),
usrpwd = RP3_MDB_USRPWD)
Some of the biobank ids need to be relabeled to be conformant with the previous datasets.
Information about genotypes are still missing from the metadata of the samples. Genotype imputation ids are retrieved from the meta database and added to the sample metadata.
imputation <- getSQLview("getimputations")
hrc <- subset(imputation, imputation_reference == "HRC")
hrc <- hrc[, c("ids", "imputations_id")]
names(hrc) <- c("ids", "imputation_id_hrc")
hrcv1.1 <- subset(imputation, imputation_reference == "HRCv1.1")
hrcv1.1 <- hrcv1.1[, c("ids", "imputations_id")]
names(hrcv1.1) <- c("ids", "imputation_id_hrcv1.1")
freeze2_unrelated <- merge(freeze2_unrelated, hrc[, c("ids", "imputation_id_hrc")],
by="ids", all.x=TRUE)
freeze2_unrelated <- merge(freeze2_unrelated, hrcv1.1[, c("ids", "imputation_id_hrcv1.1")],
by="ids", all.x=TRUE)
There are a number of fields which contain inconsistent values. The potentially affected fields are checked and adjusted accordingly.
table(freeze2_unrelated$biobank)
table(freeze2_unrelated$sex)
table(freeze2_unrelated$smoking)
table(freeze2_unrelated$ascertainment_criterion) #needs some normalization
table(freeze2_unrelated$ldlcholmethod)
table(freeze2_unrelated$lipidmed)
table(freeze2_unrelated$lipids_bloodsampling_fasting)
table(freeze2_unrelated$rna_source)
table(freeze2_unrelated$rna_extraction_method) # needs some normalization
table(freeze2_unrelated$dna_source)
table(freeze2_unrelated$dna_quantificationmethod) # needs some normalization
table(freeze2_unrelated$dna_extraction_method) # needs some normalization
table(freeze2_unrelated$gwas_chip) # needs some normalization
# ascertainment_criterion
freeze2_unrelated$ascertainment_criterion <- gsub("^GoNL$", "GONL_subject",
freeze2_unrelated$ascertainment_criterion)
freeze2_unrelated$ascertainment_criterion <- gsub("^Complete genomics sequencing$",
"CG_subject",
freeze2_unrelated$ascertainment_criterion)
freeze2_unrelated$ascertainment_criterion <- gsub("^GoNL / Complete genomics sequencing",
"GoNL/CG_subject",
freeze2_unrelated$ascertainment_criterion)
table(freeze2_unrelated$ascertainment_criterion)
# rna_extraction_method
freeze2_unrelated$rna_extraction_method[!is.na(
freeze2_unrelated$rna_extraction_method)] <-"PAXgene Blood RNA Kit (Qiagen)"
table(freeze2_unrelated$rna_extraction_method)
# dna_quantificationmethod
freeze2_unrelated$dna_quantificationmethod <- gsub("nanodrop|nano", "Nanodrop",
freeze2_unrelated$dna_quantificationmethod)
freeze2_unrelated$dna_quantificationmethod <- gsub(
"spectrofotometer|Spectofotometer|spectofotometer", "Spectrofotometer",
freeze2_unrelated$dna_quantificationmethod)
table(freeze2_unrelated$dna_quantificationmethod)
# dna_extraction_method
freeze2_unrelated$dna_extraction_method <- gsub("salting out|Saltingout", "Salting out",
freeze2_unrelated$dna_extraction_method)
freeze2_unrelated$dna_extraction_method <- gsub("QIAamp DNA minikit", "DNA Mini Kit (Qiaamp)",
freeze2_unrelated$dna_extraction_method)
table(freeze2_unrelated$dna_extraction_method)
# gwas_chip
freeze2_unrelated$gwas_chip <- gsub(" *, *", "/", freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("Illumina human omni express", "Illumina OmniExpress",
freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("Illumina660", "Illumina Human660-Quad",
freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("OverlappingSNPsfrom", "Overlapping SNPs from ",
freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("GONLSequenceData", "GONL", freeze2_unrelated$gwas_chip)
table(freeze2_unrelated$gwas_chip)
For some samples the LDLchol values are missing, but can be inferred from the totChol, HDLchol and TriGlycerides using the Friedewald estimation.
table(is.na(freeze2_unrelated$ldlchol))
# Calculate LDLchol, for those missing, from Tot, HDL and TriGlycerides
# http://www.gpnotebook.co.uk/simplepage.cfm?ID=x20030114211535665170
LDLchol <- freeze2_unrelated$totchol - freeze2_unrelated$hdlchol -
(freeze2_unrelated$triglycerides/2.2)
freeze2_unrelated$ldlcholmethod[is.na(freeze2_unrelated$ldlchol)] <- "Friedewald estimation"
freeze2_unrelated$ldlchol[is.na(
freeze2_unrelated$ldlchol)] <- LDLchol[is.na(freeze2_unrelated$ldlchol)]
# Friedewald estimation should not be used when triglycerides exceed 4.52
freeze2_unrelated$ldlcholmethod[freeze2_unrelated$triglycerides > 4.52 &
freeze2_unrelated$ldlcholmethod == "Friedewald estimation"] <- NA
freeze2_unrelated$ldlchol[freeze2_unrelated$triglycerides > 4.52 &
freeze2_unrelated$ldlcholmethod == "Friedewald estimation"] <- NA
table(is.na(freeze2_unrelated$ldlchol))
Summarized Experiments are generated per biobank first. In order to do this the data for the original idat files needs to be normalized, filtered and converted into M-values and beta values.
library(BiocParallel)
library(DNAmArray)
library(impute)
register(MulticoreParam(16, log=TRUE))
path450k <- file.path(VM_BASE_DATA, "IlluminaHumanMethylation450k")
for(biobank in unique(freeze2_unrelated$biobank_id)) {
targets <- subset(freeze2_unrelated, biobank_id == biobank)
targets$Basename <- with(targets,
file.path(path450k, "raw", sentrix_barcode,paste(sentrix_barcode,
sentrix_position, sep="_")))
# Read the data
RGset <- read.metharray.exp.par(targets, verbose=FALSE, extended=TRUE)
cat(timestamp(Sys.time()), "\treading done\n") # keep track of what's happening
# normalize
GRset <- preprocessFunnorm.DNAmArray(RGset, nPCs=3, keepCN=FALSE)
cat(timestamp(Sys.time()), "\tpreprocessing done\n") # keep track of what's happening
# filter
RGset <- probeFiltering(RGset)
cat(timestamp(Sys.time()), "\tfiltering done\n") # keep track of what's happening
# store M-values
mvalues <- reduce(GRset, RGset, what="M")
cat(timestamp(Sys.time()), "\treduce done\n") # keep track of what's happening
sink("/dev/null") ##to ignore impute.knn output
mvalues <- impute.knn(as.matrix(mvalues))$data
sink()
cat(timestamp(Sys.time()), "\timpute.knn done\n") # keep track of what's happening
mid <- match(colnames(mvalues), freeze2_unrelated$run_id)
colnames(mvalues) <- freeze2_unrelated$uuid[mid]
rownames(freeze2_unrelated) <- freeze2_unrelated$uuid
# make SE
rowData <- getPlatform(platform="HM450", genome="hg19")
mvalues <- makeSE(mvalues, freeze2_unrelated[,-1], rowData, author="D. Cats",
dbVersion=mdbVersion(),
note="No chen probe filtering, Functional normalization, knn imputed")
save(mvalues, file=file.path(path450k, "450k", biobank,
paste0("methData_Mvalues_", biobank,
"_Freeze2_unrelated.RData")),
compress = "xz")
rm(mvalues)
gc() #remove and clean
cat(timestamp(Sys.time()), "\tM values done\n") # keep track of what's happening
# store beta-values
betas <- reduce(GRset, RGset, what="beta")
cat(timestamp(Sys.time()), "\treduce done\n") # keep track of what's happening
sink("/dev/null") ##to ignore impute.knn output
betas <- impute.knn(as.matrix(betas))$data
sink()
cat(timestamp(Sys.time()), "\timpute.knn done\n") # keep track of what's happening
mid <- match(colnames(betas), freeze2_unrelated$run_id)
colnames(betas) <- freeze2_unrelated$uuid[mid]
betas <- makeSE(betas, freeze2_unrelated[,-1], rowData, author="D. Cats",
dbVersion=mdbVersion(),
note="No chen probe filtering, Functional normalization, knn imputed")
save(betas, file=file.path(path450k, "450k", biobank,
paste0("methData_Betas_", biobank,
"_Freeze2_unrelated.RData")),
compress = "xz")
rm(betas)
rm(GRset)
rm(RGset)
gc() # remove and clean
cat(timestamp(Sys.time()), "\tBetas done\n") # keep track of what's happening
}
library(GenomicRanges)
library(SummarizedExperiment)
path450k <- file.path(VM_BASE_DATA, "IlluminaHumanMethylation450k")
biobank <- "PAN"
load(file.path(path450k, "450k", biobank, paste0("methData_Betas_", biobank,
"_Freeze2_unrelated.RData")))
b <- betas
for (biobank in c("CODAM", "LLS", "LL", "RS", "NTR")) {
load(file.path(path450k, "450k", biobank, paste0("methData_Betas_", biobank,
"_Freeze2_unrelated.RData")))
betas <- subsetByOverlaps(betas, b)
b <- subsetByOverlaps(b, betas)
rowData(betas) <- NULL
b <- SummarizedExperiment::cbind(b, betas)
gc()
print(b)
}
betas <- b
save(betas, file=file.path(path450k, "450k",
"methData_Betas_BIOS_Freeze2_unrelated.RData"))
path450k <- file.path(VM_BASE_DATA, "IlluminaHumanMethylation450k")
biobank <- "PAN"
load(file.path(path450k, "450k", biobank, paste0("methData_Mvalues_", biobank,
"_Freeze2_unrelated.RData")))
m <- mvalues
for (biobank in c("CODAM", "LLS", "LL", "RS", "NTR")) {
load(file.path(path450k, "450k", biobank, paste0("methData_Mvalues_", biobank,
"_Freeze2_unrelated.RData")))
mvalues <- subsetByOverlaps(mvalues, m)
m <- subsetByOverlaps(m, mvalues)
rowData(mvalues) <- NULL
m <- SummarizedExperiment::cbind(m, mvalues)
gc()
print(m)
}
mvalues <- m
save(mvalues, file=file.path(path450k, "450k",
"methData_Mvalues_BIOS_Freeze2_unrelated.RData"))