Library preparation and alignment

Detailed description of the RNASeq data generation and preprocessing can be found in the supplementary material of the paper by Zhernakova et al. (Zhernakova et al. 2017)

Feature quantification read counts

Gene read counts are generated using htseq using mode union for handling overlapping exons.

Reference annotation used is from ensembl: release 71 but only chromosomes 1-22 and X, Y (and MT) are retained.

Prepare sample metadata

Metadata for the samples is retrieved from the metadatabase. This is done only for the samples which were part of GoNL set from Freeze2. These are the same samples as were uploaded to EGA (BAM files: EGAD00001003786, Fastq files: EGAD00001003787).

library(BBMRIomics)
gonl <- runQuery(paste0(
    "SELECT uuid, biobank as biobank_id, bios_id AS ids, gonl_id, ", 
        "rna_run.rna_run_id AS run_id, age AS sampling_age, ",
        "COALESCE(rna_sampling_date, dna_bloodsampling_date, ",
            "cellcount_bloodsampling_date, lipids_bloodsampling_date, ",
            "crp_bloodSampling_date) AS sampling_date, ",
        "COALESCE(rna_sampling_time, dna_bloodsampling_time, ",
            "lipids_bloodsampling_time, crp_bloodsampling_time, ",
            "cellcount_bloodsampling_time) AS sampling_time, ",
        "sex, birth_year, weight, height, anthropometry_age, smoking, smoking_age, ",
        "ascertainment_criterion, totchol, triglycerides, hdlchol, ldlchol, ",
        "ldlcholmethod, hscrp, lipidmed, lipidsmed_age, lipids_bloodsampling_fasting, ",
        "rna_source, rna_rin, rna_a260a280ratio, rna_extraction_method, ",
        "rna_extraction_date, dna_source, dna_quantificationmethod, dna_a260a280ratio, ",
        "dna_extraction_method, dna_extraction_date, gwas_chip, ",
        "gwas_datageneration_date, baso, baso_perc, eos, eos_perc, granulocyte, ",
        "granulocyte_perc, luc, luc_perc, lymph, lymph_perc, mono, mono_perc, neut, ",
        "neut_perc, hct, hgb, mch, mchc, mcv, mpv, plt, rbc, rdw, wbc, ",
        "flowcell_num, machine, run_num, insert_size, rna_id ",
    "FROM rna_freeze ",
    "LEFT JOIN rna_run ON rna_freeze.rna_run_id = rna_run.rna_run_id ",
    "LEFT JOIN rna_sample ON rna_run.rna_sample_id = rna_sample.rna_sample_id ",
    "LEFT JOIN visit ON rna_sample.visit_id = visit.visit_id ",
    "LEFT JOIN person ON visit.person_id = person.person_id ",
    "LEFT JOIN gwas on person.person_id = gwas.person_id ",
    "LEFT JOIN dna_sample on visit.visit_id = dna_sample.visit_id ",
    "WHERE set = 'GoNL' AND freeze_num = 2;"),
    usrpwd = RP3_MDB_USRPWD)

Normalize inconsistent phenotype values

There are a number of fields which contain inconsistent values. The potentially affected fields are checked and adjusted accordingly.

table(gonl$biobank)
table(gonl$sex)
table(gonl$smoking)
table(gonl$ascertainment_criterion) #needs some normalization
table(gonl$ldlcholmethod)
table(gonl$lipidmed)
table(gonl$lipids_bloodsampling_fasting)
table(gonl$rna_source)
table(gonl$rna_extraction_method) # needs some normalization
table(gonl$dna_source)
table(gonl$dna_quantificationmethod) # needs some normalization
table(gonl$dna_extraction_method) # needs some normalization
table(gonl$gwas_chip) # needs some normalization

## ascertainment_criterion
gonl$ascertainment_criterion <- gsub("^GoNL$", "GONL_subject", gonl$ascertainment_criterion)
gonl$ascertainment_criterion <- gsub("^Complete genomics sequencing$", "CG_subject", 
                                     gonl$ascertainment_criterion)
gonl$ascertainment_criterion <- gsub("^GoNL / Complete genomics sequencing", 
                                     "GoNL/CG_subject", 
                                     gonl$ascertainment_criterion)
table(gonl$ascertainment_criterion)

# rna_extraction_method
gonl$rna_extraction_method[!is.na(
    gonl$rna_extraction_method)] <- "PAXgene Blood RNA Kit (Qiagen)"
table(gonl$rna_extraction_method)

# dna_quantificationmethod
gonl$dna_quantificationmethod <- gsub("nanodrop|nano", "Nanodrop",
                                      gonl$dna_quantificationmethod)
gonl$dna_quantificationmethod <- gsub("spectrofotometer|Spectofotometer|spectofotometer", 
                                      "Spectrofotometer", gonl$dna_quantificationmethod)
table(gonl$dna_quantificationmethod)

# dna_extraction_method
gonl$dna_extraction_method <- gsub("salting out|Saltingout", "Salting out", 
                                   gonl$dna_extraction_method)
gonl$dna_extraction_method <- gsub("QIAamp DNA minikit", "DNA Mini Kit (Qiaamp)", 
                                   gonl$dna_extraction_method)
table(gonl$dna_extraction_method)

# gwas_chip
gonl$gwas_chip <- gsub(" *, *", "/", gonl$gwas_chip)
gonl$gwas_chip <- gsub("Illumina human omni express", "Illumina OmniExpress", gonl$gwas_chip)
gonl$gwas_chip <- gsub("Illumina660", "Illumina Human660-Quad", gonl$gwas_chip)
gonl$gwas_chip <- gsub("OverlappingSNPsfrom", "Overlapping SNPs from ", gonl$gwas_chip)
gonl$gwas_chip <- gsub("GONLSequenceData", "GONL", gonl$gwas_chip)
table(gonl$gwas_chip)