Creating the SummarizedExperiments

Here we follow the approach described at DNAmArray for preprocessing and quality control.

library(BiocParallel)
library(DNAmArray)
library(impute)
register(MulticoreParam(16, log=TRUE))
path450k <- file.path(VM_BASE_DATA, "IlluminaHumanMethylation450k")

for(biobank in unique(colData$biobank_id)[-1]) {
    biobank <- "RS"
    targets <- subset(colData, biobank_id == biobank)
    targets$Basename <- with(targets,
                             file.path(path450k, "raw", Sentrix_Barcode,
                                       paste(Sentrix_Barcode, Sentrix_Position, sep="_")))
    ##reading the data
    RGset <- read.metharray.exp.par(targets, verbose=FALSE, extended=TRUE)
    ##normalize
    pc <- screeplot(RGset)
    GRset <- preprocessFunnorm.DNAmArray(RGset, nPCs=3, keepCN=FALSE)
    rm(pc)
    gc() #remove and clean
    ##filter
    RGset <- probeFiltering(RGset)
    ##store M-values
    mvalues <- reduce(GRset, RGset, what="M")
    sink("/dev/null") ##to ignore impute.knn output
    mvalues <- impute.knn(as.matrix(mvalues))$data
    sink()

    mid <- match(colnames(mvalues), colData$run_id)
    colnames(mvalues) <- colData$uuid[mid]
    rowData <- getPlatform(platform="HM450", genome="hg19")
    mvalues <- makeSE(mvalues, colData[,-1], rowData, note="No chen probe filtering, Functional normalization, knn imputed")
    save(mvalues, file=file.path(path450k, "450k", biobank, paste0("methData_Mvalues_", biobank, "_F2_cleaned.RData")))
    rm(mvalues)
    gc() #remove and clean
    ##store beta-values
    betas <- reduce(GRset, RGset, what="beta")
    sink("/dev/null") ##to ignore impute.knn output
    betas <- impute.knn(as.matrix(betas))$data
    sink()
    mid <- match(colnames(betas), colData$run_id)
    colnames(betas) <- colData$uuid[mid]
    betas <- makeSE(betas, colData[,-1], rowData, note="No chen probe filtering, Functional normalization, knn imputed")
    save(betas, file=file.path(path450k, "450k", biobank, paste0("methData_Betas_", biobank, "_F2_cleaned.RData")))
    rm(betas)
    rm(GRset)
    rm(RGset)
    gc() ##remove and clean
}

Create one SummarizedExperiment containg all cohorts

Here we merge the SummarizedExperiment of the separate cohorts in one single SummarizedExperiment.

NOTE: direct use of cbind currently doesn’t work with GRanges containing DNAStringSets problem has been reported to Bioconductor-developers who proposed to fix it!

path450k <- file.path(VM_BASE_DATA, "IlluminaHumanMethylation450k")
biobank <- "PAN"
load(file.path(path450k, "450k", biobank, paste0("methData_Betas_", biobank, "_F2_cleaned.RData")))
b <- betas
library(GenomicRanges)
for(biobank in c("CODAM", "LLS", "LL", "RS", "NTR")) {
    load(file.path(path450k, "450k", biobank, paste0("methData_Betas_", biobank, "_F2_cleaned.RData")))
    betas <- subsetByOverlaps(betas, b)
    b <- subsetByOverlaps(b, betas)
    rowData(betas) <- NULL
    b <- SummarizedExperiment::cbind(b, betas)
    gc()
    print(b)
}
betas <- b
save(betas, file=file.path(path450k, "450k", paste0("methData_Betas_BIOS_F2_cleaned.RData")))
##can we reduce the size?
tools::checkRdaFiles(file.path(path450k, "450k", paste0("methData_Betas_BIOS_F2_cleaned.RData")))


##Now mvalues-values
path450k <- file.path(VM_BASE_DATA, "IlluminaHumanMethylation450k")
biobank <- "PAN"
load(file.path(path450k, "450k", biobank, paste0("methData_Mvalues_", biobank, "_F2_cleaned.RData")))
b <- mvalues
library(GenomicRanges)
for(biobank in c("CODAM", "LLS", "LL", "RS", "NTR")) {
    load(file.path(path450k, "450k", biobank, paste0("methData_Mvalues_", biobank, "_F2_cleaned.RData")))
    mvalues <- subsetByOverlaps(mvalues, b)
    b <- subsetByOverlaps(b, mvalues)
    rowData(mvalues) <- NULL
    b <- SummarizedExperiment::cbind(b, mvalues)
    gc()
    print(b)
}
mvalues <- b
save(mvalues, file=file.path(path450k, "450k", paste0("methData_Mvalues_BIOS_F2_cleaned.RData")))
##can we reduce the size?
tools::checkRdaFiles(file.path(path450k, "450k", paste0("methData_Mvalues_BIOS_F2_cleaned.RData")))