The aim of the BIOS consortium is to generate RNA sequencing and DNA methylation data for 4000 individuals that have been selected for having array-based genotypes or whole sequencing data (GoNL) already available. Furthermore, the generated data should pass a set of quality control metrics.
The following code shows how to extract the RNA sequencing and DNA methylation samples that pass quality control.
## No username and password provided for the MDB use stored views!
## No username and password provided for the MDB use stored views!
rna <- rna[!duplicated(rna$run_id),] ##drop overlapping freezes
rna <- subset(rna, qc == "passed") ##keep those passing qc
rna <- subset(rna, type != "replicate") ##drop replicated
##now we have still original, reruns and merged runs
merged <- subset(rna, type == "merged" & freeze == 2) ##select
original <- subset(rna, !(ids %in% merged$ids) & type == "original")
rna <- rbind(merged, original)
dnam <- getView("getMethylationRuns", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
## No username and password provided for the MDB use stored views!
dnam <- dnam[!duplicated(dnam$run_id),] ##drop overlapping freezes
dnam <- subset(dnam, qc == "passed") ##keep those passing qc
dnam <- subset(dnam, type != "replicate") ##drop replicated
##small checks
dim(ids)
## [1] 6379 14
## [1] 4427 9
## [1] 6072 7
## [1] 0
## [1] 0
## [1] 0
library(limma)
df <- data.frame(rna = ids$ids %in% rna$ids,
dnam = ids$ids %in% dnam$ids)
colSums(df)
## rna dnam
## 4427 6072
## rna dnam Counts
## 1 0 0 92
## 2 0 1 1860
## 3 1 0 215
## 4 1 1 4212
## attr(,"class")
## [1] "VennCounts"
Another requirement for Freeze 2 is to have a maximal set of unrelated individuals for which both RNA sequencing and dna methylation data could be generated.
## No username and password provided for the MDB use stored views!
relations <- subset(relations, !is.na(relation_id))
all(relations$ids %in% relations$relation_id) ##relations consistent
## [1] TRUE
## [1] 4262
## [1] 3558
##
## 2nd degree family genetical 1st degree family
## 6 8
## has child has dizygotic twin
## 534 1028
## has monozygotic twin has parent
## 1881 548
## has repeated measurements has sib
## 88 99
## inferred 1st degree family
## 70
getRelation <- function(id, relations) as.character(relations$id[relations$relation_id %in% id])
getFamily <- function(id, relations) {
fam <- id
id <- getRelation(id, relations)
while( length(setdiff(id, fam)) > 0 ) {
id <- setdiff(id, fam)
fam <- c(fam, id)
id <- getRelation(id, relations)
}
fam
}
relations$family_id <- NA
famId <- 0
for(i in 1:nrow(relations)) {
id <- as.character(relations$ids)[i]
family <- getFamily(id, relations)
if(all(is.na(relations$family_id[relations$ids %in% family]))) {
##message("adding new family...")
famId <- famId + 1
relations$family_id[relations$ids %in% family] <- famId
}
}
max(relations$family_id)
## [1] 1616
## famSizes
## 2 4 6 8 10 12 17 18 21 22 26 28
## 1375 171 13 3 22 19 1 6 1 1 1 3
## [1] "1" "2" "3" "4" "6" "7"
## ids uuid biobank_id gonl_id gwas_id
## 17 CODAM-2037 BIOS71A89511 CODAM 2037
## 73 CODAM-2240 BIOS6601F3E0 CODAM 2240
## relation_type relation_id family_id
## 17 inferred 1st degree family CODAM-2240 1
## 73 inferred 1st degree family CODAM-2037 1
## [1] "9" "10" "11" "12" "13" "14"
## ids uuid biobank_id gonl_id gwas_id relation_type
## 956 LL-LLDeep_1600 BIOS82666E4 LL gonl-56a has child
## 960 LL-LLDeep_1603 BIOS855C858 LL gonl-56b has child
## 981 LL-LLDeep_1619 BIOS9542DD0 LL gonl-56c has parent
## 982 LL-LLDeep_1619 BIOS9542DD0 LL gonl-56c has parent
## relation_id family_id
## 956 LL-LLDeep_1619 9
## 960 LL-LLDeep_1619 9
## 981 LL-LLDeep_1600 9
## 982 LL-LLDeep_1603 9
## [1] "165" "194" "308" "325" "383" "387"
## ids uuid biobank_id gonl_id gwas_id
## 2148 NTR-A1083C-10146 BIOS7E1BFC2D NTR 10175
## 2149 NTR-A1083C-10146 BIOS7E1BFC2D NTR 10175
## 2150 NTR-A1083C-10146 BIOS7E1BFC2D NTR 10175
## 2151 NTR-A1083C-NTR15215-8666 BIOS59AC7812 NTR 10175
## 2152 NTR-A1083C-NTR15215-8666 BIOS59AC7812 NTR 10175
## 2153 NTR-A1083C-NTR15215-8666 BIOS59AC7812 NTR 10175
## 2154 NTR-A1083D-10175 BIOS0AC3A8CD NTR 10175
## 2155 NTR-A1083D-10175 BIOS0AC3A8CD NTR 10175
## 2156 NTR-A1083D-10175 BIOS0AC3A8CD NTR 10175
## 2157 NTR-A1083D-NTR15589-8860 BIOS063C99A7 NTR 10175
## 2158 NTR-A1083D-NTR15589-8860 BIOS063C99A7 NTR 10175
## 2159 NTR-A1083D-NTR15589-8860 BIOS063C99A7 NTR 10175
## relation_type relation_id family_id
## 2148 has monozygotic twin NTR-A1083D-10175 165
## 2149 has monozygotic twin NTR-A1083D-NTR15589-8860 165
## 2150 has repeated measurements NTR-A1083C-NTR15215-8666 165
## 2151 has monozygotic twin NTR-A1083D-10175 165
## 2152 has monozygotic twin NTR-A1083D-NTR15589-8860 165
## 2153 has repeated measurements NTR-A1083C-10146 165
## 2154 has monozygotic twin NTR-A1083C-10146 165
## 2155 has monozygotic twin NTR-A1083C-NTR15215-8666 165
## 2156 has repeated measurements NTR-A1083D-NTR15589-8860 165
## 2157 has monozygotic twin NTR-A1083C-10146 165
## 2158 has monozygotic twin NTR-A1083C-NTR15215-8666 165
## 2159 has repeated measurements NTR-A1083D-10175 165
## [1] "261" "692" "738"
## ids uuid biobank_id gonl_id gwas_id
## 2368 NTR-A118A-NTR01371-06D07229 BIOS29A0CC56 NTR gonl-96a 06D07229
## 2369 NTR-A118A-NTR01371-06D07229 BIOS29A0CC56 NTR gonl-96a 06D07229
## 2370 NTR-A118A-NTR01371-06D07229 BIOS29A0CC56 NTR gonl-96a 06D07229
## 2371 NTR-A118A-NTR01371-06D07229 BIOS29A0CC56 NTR gonl-96a 06D07229
## 2372 NTR-A118B-NTR01373-06D07230 BIOS13D06DE0 NTR gonl-96b 06D07230
## 2373 NTR-A118B-NTR01373-06D07230 BIOS13D06DE0 NTR gonl-96b 06D07230
## 2374 NTR-A118B-NTR01373-06D07230 BIOS13D06DE0 NTR gonl-96b 06D07230
## 2375 NTR-A118B-NTR01373-06D07230 BIOS13D06DE0 NTR gonl-96b 06D07230
## 2376 NTR-A118C-829 BIOS34C8B8A5 NTR gonl-96c 10375
## 2377 NTR-A118C-829 BIOS34C8B8A5 NTR gonl-96c 10375
## 2378 NTR-A118C-829 BIOS34C8B8A5 NTR gonl-96c 10375
## 2379 NTR-A118C-829 BIOS34C8B8A5 NTR gonl-96c 10375
## 2380 NTR-A118C-829 BIOS34C8B8A5 NTR gonl-96c 10375
## 2381 NTR-A118C-NT0027644-10375 BIOS125DBF1F NTR gonl-96c 10375
## 2382 NTR-A118C-NT0027644-10375 BIOS125DBF1F NTR gonl-96c 10375
## 2383 NTR-A118C-NT0027644-10375 BIOS125DBF1F NTR gonl-96c 10375
## 2384 NTR-A118C-NT0027644-10375 BIOS125DBF1F NTR gonl-96c 10375
## 2385 NTR-A118C-NT0027644-10375 BIOS125DBF1F NTR gonl-96c 10375
## 2386 NTR-A118D-NT0027645-10376 BIOSD15C4FC7 NTR 10375
## 2387 NTR-A118D-NT0027645-10376 BIOSD15C4FC7 NTR 10375
## 2388 NTR-A118D-NT0027645-10376 BIOSD15C4FC7 NTR 10375
## 2389 NTR-A118D-NT0027645-10376 BIOSD15C4FC7 NTR 10375
## 2390 NTR-A118D-NT0027645-10376 BIOSD15C4FC7 NTR 10375
## 2391 NTR-A118D-NTR00927-607 BIOSA249C90B NTR 10375
## 2392 NTR-A118D-NTR00927-607 BIOSA249C90B NTR 10375
## 2393 NTR-A118D-NTR00927-607 BIOSA249C90B NTR 10375
## 2394 NTR-A118D-NTR00927-607 BIOSA249C90B NTR 10375
## 2395 NTR-A118D-NTR00927-607 BIOSA249C90B NTR 10375
## relation_type relation_id family_id
## 2368 has child NTR-A118C-829 261
## 2369 has child NTR-A118C-NT0027644-10375 261
## 2370 has child NTR-A118D-NT0027645-10376 261
## 2371 has child NTR-A118D-NTR00927-607 261
## 2372 has child NTR-A118C-829 261
## 2373 has child NTR-A118C-NT0027644-10375 261
## 2374 has child NTR-A118D-NT0027645-10376 261
## 2375 has child NTR-A118D-NTR00927-607 261
## 2376 has monozygotic twin NTR-A118D-NT0027645-10376 261
## 2377 has monozygotic twin NTR-A118D-NTR00927-607 261
## 2378 has parent NTR-A118A-NTR01371-06D07229 261
## 2379 has parent NTR-A118B-NTR01373-06D07230 261
## 2380 has repeated measurements NTR-A118C-NT0027644-10375 261
## 2381 has monozygotic twin NTR-A118D-NT0027645-10376 261
## 2382 has monozygotic twin NTR-A118D-NTR00927-607 261
## 2383 has parent NTR-A118A-NTR01371-06D07229 261
## 2384 has parent NTR-A118B-NTR01373-06D07230 261
## 2385 has repeated measurements NTR-A118C-829 261
## 2386 has monozygotic twin NTR-A118C-829 261
## 2387 has monozygotic twin NTR-A118C-NT0027644-10375 261
## 2388 has parent NTR-A118A-NTR01371-06D07229 261
## 2389 has parent NTR-A118B-NTR01373-06D07230 261
## 2390 has repeated measurements NTR-A118D-NTR00927-607 261
## 2391 has monozygotic twin NTR-A118C-829 261
## 2392 has monozygotic twin NTR-A118C-NT0027644-10375 261
## 2393 has parent NTR-A118A-NTR01371-06D07229 261
## 2394 has parent NTR-A118B-NTR01373-06D07230 261
## 2395 has repeated measurements NTR-A118D-NT0027645-10376 261
Now we can selected the maximal unrelated individuals e.g. in case of GoNL trio’s if all have dnam and rna chose the parents; to maximize the number of individuals.
The rna or dnam freeze 2 is extended with unrelated individuals having only rna or dnam.
reduceRelations <- function(ids, relations) {
rels <- relations[relations$ids %in% ids,]
selection <- function(fam) {
if(sum(fam$relation_type == "has child") >= 2) { ##both parents have rna/dnam
##message("family having both parents with both rna/dnam!")
return( as.character(fam$ids[fam$relation_type == "has child"]) )
} else if( sum(grepl("twin", fam$relation_type)) >= 1 ) { ##return single twin
##message("family with at least one twin with both rna/dnam!")
return( as.character(fam$ids[which(grepl("twin", fam$relation_type))[1]]) )
} else {
##message("family with at least on member having both rna/dnam!")
return( as.character(fam$ids[1] ) ) ##return any other single family member with complete rna/dnam
}
}
selected <- c()
for(i in unique(rels$family_id)) {
fam <- subset(rels, family_id == i)
##print(fam)
select <- selection(fam)
##print(select)
selected <- c(selected, select)
}
selected <- unique(selected) ##relations of parents having two children
c(setdiff(ids, rels$ids), selected)
}
##unrelated having both rna and dnam
ovl <- intersect(rna$ids, dnam$ids)
ovlRed <- reduceRelations(ovl, relations)
length(ovlRed)
## [1] 3405
## [1] 3405
##remove all family members in remaining rna/dnam
ovlRel <- unlist(lapply(ovlRed, getFamily, relations))
srna <- subset(rna, !(ids %in% c(ovlRed, ovlRel)))
##and get the unrelated set
rnaOnly <- reduceRelations(srna$ids, relations)
rnaF2 <- subset(rna, ids %in% c(ovlRed, rnaOnly))
dim(rnaF2)
## [1] 3530 9
sdnam <- subset(dnam, !(ids %in% c(ovlRed, ovlRel)))
##and get the unrelated set
dnamOnly <- reduceRelations(sdnam$ids, relations)
dnamF2 <- subset(dnam, ids %in% c(ovlRed, dnamOnly))
dim(dnamF2)
## [1] 4386 7
freeze2 <- merge(rnaF2[,c("ids", "run_id")], dnamF2[,c("ids", "run_id")], by="ids", suffixes=c("_rna","_dnam"))
head(freeze2)
## ids run_id_rna run_id_dnam
## 1 CODAM-2001 BD1NYRACXX-5-1 8667053102_R05C02
## 2 CODAM-2002 AD10W1ACXX-4-1 8667053157_R01C02
## 3 CODAM-2013 AD10W1ACXX-4-2 8655685053_R04C02
## 4 CODAM-2016 BD1NYRACXX-5-3 8655685094_R01C01
## 5 CODAM-2017 AD10W1ACXX-4-3 8667053076_R02C01
## 6 CODAM-2020 BD1NYRACXX-5-4 8655685021_R04C02
## [1] 3405 3
##double check
relations <- getView("getRelations", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB)
dim(freeze2)
## No username and password provided for the MDB use stored views!
relations <- subset(relations, !is.na(relation_id))
relations <- subset(relations, ids %in% freeze2$ids)
intersect(relations$ids, relations$relation_id) ##intersect is empty!
## character(0)
write.table(freeze2, file=file.path(VM_BASE_ANALYSIS, "BBMRIomics/BBMRIomics/data", "freeze2_overlap_identifiers.csv"), row.names=FALSE, quote=FALSE, sep=",")
write.table(rnaF2[, c("ids", "run_id")], file=file.path(VM_BASE_ANALYSIS, "BBMRIomics/BBMRIomics/data", "freeze2_rna_identifiers.csv"), row.names=FALSE, quote=FALSE, sep=",")
write.table(dnamF2[, c("ids", "run_id")], file=file.path(VM_BASE_ANALYSIS, "BBMRIomics/BBMRIomics/data", "freeze2_dnam_identifiers.csv"), row.names=FALSE, quote=FALSE, sep=",")