This document describes how we try to impute some missing phenotypes, specifically, sex, age, smoking status and white blood cell composition.
preprocess <- function(sumExp) {
require(SummarizedExperiment)
require(edgeR)
y <- DGEList(assays(sumExp)$data)
message("[preprocess] drop features not expressed")
##https://www.bioconductor.org/packages/devel/bioc/vignettes/limma/inst/doc/usersguide.pdf
A <- rowSums(y$counts)
isexpr <- A > 50
y <- y[isexpr,]
message("[preprocess] normalize")
y <- calcNormFactors(y) ##normalize takes some time
message("[preprocess] calculate log counts per million")
y <- cpm(y, log=TRUE)
sumExp <- sumExp[isexpr,]
assays(sumExp)$data <- y ##replace data with filtered subset
sumExp
}
validate <- function(phenotype, features, train.frac = 2/3, methods = c("ridge", "elastic-net", "pls", "lasso", "gbm", "mboost"), ntop = c(50, 100, 250, 500, 1000, 2500, 5000), nrep = 25, verbose=TRUE) {
require(omicsPrint)
require(reshape2)
require(BiocParallel)
validations <- c()
for(ntopi in ntop) {
reps <- bplapply(1:nrep, function(x) phenotyping(phenotype, features, train.frac=train.frac, methods = methods, ntop=ntopi, verbose=verbose)$validation)
reps <- do.call('rbind', reps)
validation <- melt(reps, varnames=c("method", "measure"))
validation$ntop <- ntopi
validations <- rbind(validations, validation)
gc()
}
validations
}
path <- "/virdir/Scratch/RP3_analysis/imputed_phenotypes"
library(BBMRIomics)
library(omicsPrint)
data(rnaSeqData_ReadCounts_BIOS_cleaned)
covariates <- as.data.frame(colData(counts)[, c("Sex", "Sampling_Age", "Smoking", "Lymph_Perc","Mono_Perc", "Eos_Perc", "Neut_Perc")])
##missing covariate summary
missingCov <- apply(covariates, 2, function(x) round(100*table(is.na(x))/length(x),2))
missingCov
##missing sample summary
missingSam <- apply(covariates, 1, function(x) round(100*sum(is.na(x))/length(x),2))
table(missingSam)
tbl <- table(counts$biobank_id, is.na(counts$Sampling_Age))
round(100*tbl[,2]/rowSums(tbl),2)
biobank <- factor(colData(counts)$biobank_id)
missing <- c(4, 82,12,2,2,48)
levels(biobank) <- paste0(levels(biobank), " (", missing, ")")
colData(counts)$biobank_id <- biobank
library(ggplot2)
gp <- ggplot(as.data.frame(colData(counts)), aes(x=Sampling_Age))
gp <- gp + geom_histogram()
gp + facet_wrap(~biobank_id)
ggsave(file.path(path, "Age_RNAseq_distr.eps"))
First we need to find out which algorithm and which number of features to use.
data(rnaSeqData_ReadCounts_BIOS_cleaned)
dim(counts)
cnts <- preprocess(counts)
dim(cnts)
metadata(cnts)$formula <- ~0+Sex
phenotype <- unlist(get_all_vars(metadata(cnts)$formula, data=colData(cnts)))
features <- assays(cnts)$data
table(phenotype, useNA="always")
library(BiocParallel)
register(MulticoreParam(10))
validation <- validate(phenotype, features, methods = c("ridge", "elastic-net", "lasso"), ntop = c(5, 10, 25, 50, 100), nrep = 25, verbose=TRUE)
library(ggplot2)
library(ggsci)
gp <- ggplot(subset(validation, measure %in% c("accuracy (Overall)", "f1 (Male)", "f1 (Female)")),
aes(x=as.factor(ntop), y=value, col=method))
gp <- gp + geom_boxplot()
gp <- gp + facet_wrap(~measure, scales="free", nrow=3, ncol=1)
gp <- gp + xlab("Number of features") + ylab("") + ggtitle("Validation of predictors for `Sex`")
gp + theme_bw() + scale_color_npg() + theme(legend.position = "bottom", legend.direction = "horizontal")
ggsave(file.path(path, "Sex_RNAseq_eval.eps"))
phenotyped <- phenotyping(phenotype, features, methods = "lasso", ntop=25, verbose=TRUE)
phenotyped$validation
predicted <- phenotyped$predicted
rownames(predicted) <- colnames(features)
write.table(predicted, file=file.path(path, "Sex_RNAseq.csv"), sep=",", quote=FALSE)
TODO add diagnostic plots!
It could be interesting to see which genes are the top genes selected for predicting Age.
this does not include e.g., the lasso selection
library(org.Hs.eg.db)
map <- select(org.Hs.eg.db, keys = names(phenotyped$top), keytype="ENSEMBL", columns=c("SYMBOL", "CHR"))
top <- merge(map, phenotyped$top, by.x="ENSEMBL", by.y="row.names")
top <- top[order(abs(top$y), decreasing=TRUE),]
top
mid <- match(top$ENSEMBL, rownames(features))
y <- features[mid,]
rownames(y) <- top$SYMBOL
rownames(y)[is.na(top$SYMBOL)] <- top$ENSEMBL[is.na(top$SYMBOL)]
rownames(y) <- paste0(rownames(y), " ( chr", top$CHR, ")")
colnames(y) <- NULL
library(ComplexHeatmap)
sex <- colData(cnts)$Sex
sex[is.na(sex)] <- "Unknown"
ha <- HeatmapAnnotation(df = data.frame(Sex = factor(sex)), col = list(Sex = c("Male" = "blue", "Female" = "pink", "Unknown" = "grey")))
setEPS()
postscript(file.path(path, "Sex_RNAseq_feats.eps"))
Heatmap(y, top_annotation = ha, name="log CPM", row_names_gp = gpar(fontsize = 8))
dev.off()
Outlier or missclassified sample could potentially be wrongly labeled.
Since, we use a subset for training it could be accidentally be missclassified. Therefore, we run multiple models and collected the number and frequencies of missclassified samples.
library(BiocParallel)
register(MulticoreParam(10))
nrep <- 25
predicted <- bplapply(1:nrep, function(x) phenotyping(phenotype, features, methods = "lasso", ntop=25, verbose=FALSE)$predicted)
predicted <- do.call('cbind', predicted)
rownames(predicted) <- names(phenotype) <- colnames(features)
missclassified <- (predicted != phenotype)[!is.na(phenotype), ]
rownames(missclassified) <- colnames(features)[!is.na(phenotype)]
missclassified <- missclassified[rowSums(missclassified) > 0,]
missclass <- 100*rowSums(missclassified)/nrep
missclass[missclass > 50]
x <- cbind(predicted[rownames(predicted) %in% names(missclass[missclass > 50]), 1:5],
reported=as.character(phenotype[names(phenotype) %in% names(missclass[missclass > 50])])
)
ids <- getView("getIds", usrpwd=RP3_MDB_USRPWD, url=RP3_MDB, verbose=FALSE)
subset(ids, uuid %in% rownames(x))
First we need to find out which algorithm and which number of features to use:
metadata(cnts)$formula <- ~0+Sampling_Age
phenotype <- unlist(get_all_vars(metadata(cnts)$formula, data=colData(cnts)))
features <- assays(cnts)$data
summary(phenotype)
library(BiocParallel)
register(MulticoreParam(10, log=TRUE))
validation <- validate(phenotype, features, methods = c("ridge", "lasso", "elastic-net", "pls"),
ntop = c(100, 250, 500, 1000, 2500, 5000, 10000), nrep = 25, verbose=TRUE)
library(ggplot2)
library(ggsci)
gp <- ggplot(validation, aes(x=as.factor(ntop), y=value, col=method))
gp <- gp + geom_boxplot()
gp <- gp + facet_wrap(~measure, scales="free", nrow=3, ncol=1)
gp <- gp + xlab("Number of features") + ylab("") + ggtitle("Validation of predictors for `Age`")
gp + theme_bw() + scale_color_npg() + theme(legend.position = "bottom", legend.direction = "horizontal")
ggsave(file.path(path, "Age_RNAseq_eval.eps"))
some diagnostic plots:
phenotyped <- phenotyping(phenotype, features, methods = "lasso", ntop=5000, verbose=TRUE)
predicted <- phenotyped$predicted
rownames(predicted) <- colnames(features)
type <- rep("train", length(phenotype))
type[is.na(phenotype)] <- "unknown"
type[phenotyped$testid] <- "test"
phenotype[is.na(phenotype)] <- predicted[is.na(phenotype)]
d <- data.frame(predicted=predicted[,1], measured=phenotype, type=factor(type))
gp <- ggplot(d, aes(x=predicted, y=measured, col=type))
gp <- gp + geom_point() + geom_abline(intercept=0, slope=1, col=1) + ggtitle("Age prediction")
gp + theme_bw() + scale_color_npg() + theme(legend.position = "bottom", legend.direction = "horizontal")
ggsave(file.path(path, "Age_RNAseq_diagn.eps"))
library(org.Hs.eg.db)
map <- select(org.Hs.eg.db, keys = names(phenotyped$top), keytype="ENSEMBL", columns="SYMBOL")
top <- merge(map, phenotyped$top, by.x="ENSEMBL", by.y="row.names")
head(top[order(abs(top$y), decreasing=TRUE),])
library(irlba)
y <- features[rownames(features) %in% names(phenotyped$top),]
pc <- prcomp_irlba(t(y))
summary(pc)
cols <- rainbow(10)[cut(phenotype, breaks=10)]
plot(pc$x, col=adjustcolor(cols, alpha.f=0.3), pch=16)
library(BiocParallel)
register(MulticoreParam(10))
nrep <- 100
predicted <- bplapply(1:nrep, function(x) phenotyping(phenotype, features, methods = "lasso", ntop=5000, verbose=FALSE)$predicted)
predicted <- do.call('cbind', predicted)
residuals <- predicted - phenotype
popo <- which(rowSums(abs(residuals) > mean(residuals, na.rm=TRUE) + 3*sd(residuals, na.rm=TRUE)) > 50)
names(residuals) <- colnames(features)
names(residuals)[popo]
library(matrixStats)
mns <- rowMeans(predicted)
sds <- rowSds(predicted)
col <- rep(1, length(mns))
col[popo] <- 2
plot(phenotype, mns, pch=16, cex=sds, col=adjustcolor(col, alpha.f=0.3), bty="n", xlab="reported age", ylab="average predicted age")
abline(0, 1, col=4, lwd=2, lty=2)
grid(col=1)
metadata(cnts)$formula <- ~0 + Lymph_Perc + Mono_Perc + Eos_Perc + Neut_Perc
phenotypes <- get_all_vars(metadata(cnts)$formula, data=colData(cnts))
features <- assays(cnts)$data
complete <- apply(phenotypes, 1, function(x) all(!is.na(x)))
table(complete)
trainId <- sample(which(complete), size = (2/3)*sum(complete))
testId <- setdiff(which(complete), trainId)
length(trainId)
length(testId)
flowcell <- gsub("-.*$", "", cnts$run_id)
batches <- model.matrix(~flowcell)
library(pls)
pls.options(parallel = 5)
predictor <- plsr(log10(phenotypes+1)~batches+data, ncomp = 50,
data = list(phenotypes = as.matrix(phenotypes[trainId,]),
batches = batches[trainId, ],
data=t(features[, trainId])), validation = "CV", keep.model = TRUE)
str(predictor)
summary(predictor)
cumsum(explvar(predictor))
barplot(cumsum(explvar(predictor)), las = 2, ylab = "Cumulative Variance Explained")
validationplot(predictor, val.type = "R2", ncomp = 1:50)
setEPS()
postscript(file.path(path, "WBCC_eval.eps"))
validationplot(predictor, val.type = "RMSEP", ncomp = 1:50)
dev.off()
predicted <- predict(predictor, newdata = list(data=t(features[, testId]), batches = batches[testId,]) , ncomp = 20)
predicted <- 10^predicted - 1
library(reshape2)
data <- cbind(melt(phenotypes[testId,], value.name="measured"),
predicted=melt(predicted[,,1])[,3])
by(data, data$variable, function(x) round(cor(x$measured, x$predicted),2))
##hack to get plot area square
dummy <- t(simplify2array(by(data, data$variable, function(x) range(c(x$measured, x$predicted)))))
dummy <- data.frame(variable=rep(rownames(dummy), 2), measured=as.vector(dummy), predicted=as.vector(dummy))
##correlation plot
library(ggplot2)
library(ggsci)
gp <- ggplot(data, aes(x=measured, y=predicted))
gp <- gp + geom_point() + geom_abline(intercept=0, slope=1, col=2)
gp <- gp + geom_blank(data=dummy)
gp <- gp + facet_wrap(~variable, scales="free")
gp + theme_bw() + scale_color_npg()
ggsave(file.path(path, "WBCC_RNAseq_diagn1.eps"))
##bland altman plot
tdata <- data
tdata[,2] <- rowMeans(data[,-1])
tdata[,3] <- data[,2] - data[,3]
colnames(tdata)[2:3] <- c("Average", "Difference")
gp <- ggplot(tdata, aes(x=Average, y=Difference))
gp <- gp + geom_point() + geom_hline(yintercept=0, col=2)
gp <- gp + facet_wrap(~variable, scales="free")
gp + theme_bw() + scale_color_npg()
ggsave(file.path(path, "WBCC_RNAseq_diagn2.eps"))
predicted <- predict(predictor, newdata = list(data=t(features), batches = batches), ncomp = 20)[,,1]
predicted <- 10^predicted-1
hist(rowSums(predicted), n=50)
dim(predicted)
rownames(predicted) <- colnames(features)
head(predicted)
write.table(predicted, file=file.path(path, "WBCC_RNAseq.csv"),
sep=",", quote=FALSE)
First we need to find out which algorithm and which number of features to use.
data(rnaSeqData_ReadCounts_BIOS_cleaned)
dim(counts)
cnts <- preprocess(counts)
dim(cnts)
metadata(cnts)$formula <- ~0+Smoking
phenotype <- unlist(get_all_vars(metadata(cnts)$formula, data=colData(cnts)))
features <- assays(cnts)$data
round(100*table(phenotype, useNA="always")/length(phenotype),2)
levels(phenotype) <- c("current smoker", "non-smoker", "non-smoker")
round(100*table(phenotype, useNA="always")/length(phenotype),2)
library(BiocParallel)
register(MulticoreParam(5))
validation <- validate(phenotype, features, methods = c("ridge", "elastic-net", "lasso"), ntop = c(10, 25, 50, 100, 250, 500, 5000, 10000), nrep = 25, verbose=TRUE)
library(ggplot2)
library(ggsci)
gp <- ggplot(subset(validation, measure %in% c("accuracy (Overall)", "f1 (non-smoker)", "f1 (current smoker)")),
aes(x=as.factor(ntop), y=value, col=method))
gp <- gp + geom_boxplot()
gp <- gp + facet_wrap(~measure, scales="free", nrow=3, ncol=1)
gp <- gp + xlab("Number of features") + ylab("") + ggtitle("Validation of predictors for `Smoking`")
gp + theme_bw() + scale_color_npg() + theme(legend.position = "bottom", legend.direction = "horizontal")
ggsave(file.path(path, "Smoking_RNAseq_eval.eps"))
phenotyped <- phenotyping(phenotype, features, methods = "elastic-net", ntop=50, verbose=TRUE)
phenotyped$validation
predicted <- phenotyped$predicted
rownames(predicted) <- colnames(features)
write.table(predicted, file=file.path(path, "Smoking_RNAseq.csv"), sep=",", quote=FALSE)
TODO add diagnostic plots!
It could be interesting to see which genes are the top genes selected for predicting Age.
this does not include e.g., the lasso selection
library(org.Hs.eg.db)
map <- select(org.Hs.eg.db, keys = names(phenotyped$top), keytype="ENSEMBL", columns=c("SYMBOL", "CHR"))
top <- merge(map, phenotyped$top, by.x="ENSEMBL", by.y="row.names")
top <- top[order(abs(top$y), decreasing=TRUE),]
top
mid <- match(top$ENSEMBL, rownames(features))
y <- features[mid,]
rownames(y) <- top$SYMBOL
rownames(y)[is.na(top$SYMBOL)] <- top$ENSEMBL[is.na(top$SYMBOL)]
colnames(y) <- NULL
library(ComplexHeatmap)
smoking <- as.character(phenotype)
smoking[is.na(smoking)] <- "Unknown"
ha <- HeatmapAnnotation(df = data.frame(Smoking = factor(smoking)), col = list(Smoking = c("current smoker" = "blue", "non-smoker" = "pink", "Unknown" = "grey")))
setEPS()
postscript(file.path(path, "Smoking_RNAseq_feats.eps"))
Heatmap(y, top_annotation = ha, name="log CPM", row_names_gp = gpar(fontsize = 8))
dev.off()
sparklyr
##setup spark context
library(sparklyr)
sc <- spark_connect(master = "local", config = spark_config(), version = "1.6.2")
metadata(cnts)$formula <- ~0+Sampling_Age
phenotype <- unlist(get_all_vars(metadata(cnts)$formula, data=colData(cnts)))
features <- assays(cnts)$data
library(HDF5Array)
smallMatrix <- DelayedArray(features)
class(smallMatrix)
writeHDF5Dataset(smallMatrix, file="test.hdf5", name="features")
data <- t(features)
data <- cbind(phenotype=phenotype, data)
rownames(data) <- colnames(features)
data[1:5, 1:5]
dim(data)
library(SparkR, lib.loc=".cache/spark/spark-1.6.2-bin-hadoop2.6/R/lib/")
library(rhdf5)
data <- h5read("test.hdf5", name="features")
x <- as.DataFrame(data.frame(data))
write.parquet(data, path="~/test.parquet")
temp_parquet <- tempfile(fileext = ".parquet")
hdf5_to <- function(sc, file) {
spark_context(sc) %>%
invoke("org.apache.spark.ml.Utils.loadHDF5", file)
}
hdf5_to(sc, "test.hdf5")
partitions <- data %>%
sdf_partition(training = 2/3, test = 1/3, seed = 1099)
fit <- ml_linear_regression(response = "phenotype", features = c("wt", "cyl"))
alpha = 0,
lambda = 0