Prepare sample metadata

Metadata for the samples is retrieved from the metadatabase. This is done only for the samples which were part of maximum set of unrelated individuals from Freeze2. These are the same samples as were uploaded to EGA (EGAD00010001416).

library(BBMRIomics)
freeze2_unrelated <- runQuery(paste0(
    "SELECT uuid, biobank AS biobank_id, bios_id AS ids, gonl_id, ",
        "methylation_450k_run.methylation_450k_run_id AS run_id, ",
        "age AS sampling_age, COALESCE(dna_bloodsampling_date, rna_sampling_date, ",
            "cellcount_bloodsampling_date, lipids_bloodsampling_date, ",
            "crp_bloodSampling_date) AS sampling_date, ",
        "COALESCE(dna_bloodsampling_time, rna_sampling_time, ",
            "lipids_bloodsampling_time, crp_bloodsampling_time, ",
            "cellcount_bloodsampling_time) AS sampling_time, ",
        "sex, birth_year, weight, height, anthropometry_age, smoking, smoking_age, ",
        "ascertainment_criterion, totchol, triglycerides, hdlchol, ldlchol, ",
        "ldlcholmethod, hscrp, lipidmed, lipidsmed_age, lipids_bloodsampling_fasting, ",
        "rna_source, rna_rin, rna_a260a280ratio, rna_extraction_method, ",
        "rna_extraction_date, dna_source, dna_quantificationmethod, dna_a260a280ratio, ",
        "dna_extraction_method, dna_extraction_date, gwas_chip, ",
        "gwas_datageneration_date, baso, baso_perc, eos, eos_perc, granulocyte, ",
        "granulocyte_perc, luc, luc_perc, lymph, lymph_perc, mono, mono_perc, neut, ",
        "neut_perc, hct, hgb, mch, mchc, mcv, mpv, plt, rbc, rdw, wbc, ",
        "c1_lotnumber, c1_barcode, c2_lotnumber, c2_barcode, sentrix_lotnumber, ",
        "sentrix_barcode, sentrix_position, tem_lotnumber, tem_barcode, atm_lotnumber, ",
        "stm_lotnumber, stm_barcode, sample_well, sample_plate, scan_date, ",
        "scan_time, scanner_name, stain_date, library_date, hybridization_date ",
    "FROM methylation_450k_freeze ",
    "LEFT JOIN methylation_450k_run ON ",
        "methylation_450k_freeze.methylation_450k_run_id = ",
            "methylation_450k_run.methylation_450k_run_id ",
    "LEFT JOIN dna_sample ON methylation_450k_run.dna_sample_id = dna_sample.dna_sample_id ",
    "LEFT JOIN visit ON dna_sample.visit_id = visit.visit_id ",
    "LEFT JOIN person ON visit.person_id = person.person_id ",
    "LEFT JOIN gwas on person.person_id = gwas.person_id ",
    "LEFT JOIN rna_sample on visit.visit_id = rna_sample.visit_id ",
    "WHERE set = 'unrelated' AND freeze_num = 2;"),
    usrpwd = RP3_MDB_USRPWD)

Normalize inconsistent phenotype values

There are a number of fields which contain inconsistent values. The potentially affected fields are checked and adjusted accordingly.

table(freeze2_unrelated$biobank)
table(freeze2_unrelated$sex)
table(freeze2_unrelated$smoking)
table(freeze2_unrelated$ascertainment_criterion) #needs some normalization
table(freeze2_unrelated$ldlcholmethod)
table(freeze2_unrelated$lipidmed)
table(freeze2_unrelated$lipids_bloodsampling_fasting)
table(freeze2_unrelated$rna_source)
table(freeze2_unrelated$rna_extraction_method) # needs some normalization
table(freeze2_unrelated$dna_source)
table(freeze2_unrelated$dna_quantificationmethod) # needs some normalization
table(freeze2_unrelated$dna_extraction_method) # needs some normalization
table(freeze2_unrelated$gwas_chip) # needs some normalization

# ascertainment_criterion
freeze2_unrelated$ascertainment_criterion <- gsub("^GoNL$", "GONL_subject", 
                                                  freeze2_unrelated$ascertainment_criterion)
freeze2_unrelated$ascertainment_criterion <- gsub("^Complete genomics sequencing$", 
                                                  "CG_subject", 
                                                  freeze2_unrelated$ascertainment_criterion)
freeze2_unrelated$ascertainment_criterion <- gsub("^GoNL / Complete genomics sequencing",
                                                  "GoNL/CG_subject",
                                                  freeze2_unrelated$ascertainment_criterion)
table(freeze2_unrelated$ascertainment_criterion)

# rna_extraction_method
freeze2_unrelated$rna_extraction_method[!is.na(
    freeze2_unrelated$rna_extraction_method)] <-"PAXgene Blood RNA Kit (Qiagen)"
table(freeze2_unrelated$rna_extraction_method)

# dna_quantificationmethod
freeze2_unrelated$dna_quantificationmethod <- gsub("nanodrop|nano", "Nanodrop",
                                                   freeze2_unrelated$dna_quantificationmethod)
freeze2_unrelated$dna_quantificationmethod <- gsub(
    "spectrofotometer|Spectofotometer|spectofotometer", "Spectrofotometer",
    freeze2_unrelated$dna_quantificationmethod)
table(freeze2_unrelated$dna_quantificationmethod)

# dna_extraction_method
freeze2_unrelated$dna_extraction_method <- gsub("salting out|Saltingout", "Salting out",
                                                freeze2_unrelated$dna_extraction_method)
freeze2_unrelated$dna_extraction_method <- gsub("QIAamp DNA minikit", "DNA Mini Kit (Qiaamp)",
                                                freeze2_unrelated$dna_extraction_method)
table(freeze2_unrelated$dna_extraction_method)

# gwas_chip
freeze2_unrelated$gwas_chip <- gsub(" *, *", "/", freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("Illumina human omni express", "Illumina OmniExpress",
                                    freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("Illumina660", "Illumina Human660-Quad",
                                    freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("OverlappingSNPsfrom", "Overlapping SNPs from ",
                                    freeze2_unrelated$gwas_chip)
freeze2_unrelated$gwas_chip <- gsub("GONLSequenceData", "GONL", freeze2_unrelated$gwas_chip)
table(freeze2_unrelated$gwas_chip)

Making SummarizedExperiments per biobank

Summarized Experiments are generated per biobank first. In order to do this the data for the original idat files needs to be normalized, filtered and converted into M-values and beta values.

library(BiocParallel)
library(DNAmArray)
library(impute)
register(MulticoreParam(16, log=TRUE))
path450k <- file.path(VM_BASE_DATA, "IlluminaHumanMethylation450k")

for(biobank in unique(freeze2_unrelated$biobank_id)) {
    targets <- subset(freeze2_unrelated, biobank_id == biobank)
    targets$Basename <- with(targets, 
        file.path(path450k, "raw", sentrix_barcode,paste(sentrix_barcode, 
                                                         sentrix_position, sep="_")))

    # Read the data
    RGset <- read.metharray.exp.par(targets, verbose=FALSE, extended=TRUE)
    cat(timestamp(Sys.time()), "\treading done\n") # keep track of what's happening

    # normalize
    GRset <- preprocessFunnorm.DNAmArray(RGset, nPCs=3, keepCN=FALSE)
    cat(timestamp(Sys.time()), "\tpreprocessing done\n") # keep track of what's happening

    # filter
    RGset <- probeFiltering(RGset)
    cat(timestamp(Sys.time()), "\tfiltering done\n") # keep track of what's happening

    # store M-values
    mvalues <- reduce(GRset, RGset, what="M")
    cat(timestamp(Sys.time()), "\treduce done\n") # keep track of what's happening

    sink("/dev/null") ##to ignore impute.knn output
    mvalues <- impute.knn(as.matrix(mvalues))$data
    sink()
    cat(timestamp(Sys.time()), "\timpute.knn done\n") # keep track of what's happening

    mid <- match(colnames(mvalues), freeze2_unrelated$run_id)
    colnames(mvalues) <- freeze2_unrelated$uuid[mid]

    rownames(freeze2_unrelated) <- freeze2_unrelated$uuid

    # make SE
    rowData <- getPlatform(platform="HM450", genome="hg19")
    mvalues <- makeSE(mvalues, freeze2_unrelated[,-1], rowData, author="D. Cats", 
                      dbVersion=mdbVersion(),
                      note="No chen probe filtering, Functional normalization, knn imputed")
    save(mvalues, file=file.path(path450k, "450k", biobank, 
                                 paste0("methData_Mvalues_", biobank,
                                        "_Freeze2_unrelated.RData")),
         compress = "xz")
    rm(mvalues)
    gc() #remove and clean
    cat(timestamp(Sys.time()), "\tM values done\n") # keep track of what's happening

    # store beta-values
    betas <- reduce(GRset, RGset, what="beta")
    cat(timestamp(Sys.time()), "\treduce done\n") # keep track of what's happening

    sink("/dev/null") ##to ignore impute.knn output
    betas <- impute.knn(as.matrix(betas))$data
    sink()
    cat(timestamp(Sys.time()), "\timpute.knn done\n") # keep track of what's happening

    mid <- match(colnames(betas), freeze2_unrelated$run_id)
    colnames(betas) <- freeze2_unrelated$uuid[mid]
    betas <- makeSE(betas, freeze2_unrelated[,-1], rowData, author="D. Cats", 
                    dbVersion=mdbVersion(),
                    note="No chen probe filtering, Functional normalization, knn imputed")
    save(betas, file=file.path(path450k, "450k", biobank, 
                               paste0("methData_Betas_", biobank,
                                      "_Freeze2_unrelated.RData")),
         compress = "xz")
  
    rm(betas)
    rm(GRset)
    rm(RGset)
    gc() # remove and clean
    cat(timestamp(Sys.time()), "\tBetas done\n") # keep track of what's happening
}