Condition
GA Control
12wk 96
20wk 96
28wk 95
36wk 96
4 Longitudinal Analysis
Here we sought to identify all cfRNAs that change in abundance as gestation progresses.
For this analysis, we used a total of 96 healthy samples from the validation cohort (i.e. term delivery) and carried out two types of analyses: 1) to identify cfRNAs that differ between specific gestational ages (i.e. 12wk to 20wk, 20wk to 28wk, and 28wk to 36wk), and 2) a longitudinal study of cfRNAs across the four gestational ages.
Below shows the number of (healthy) samples used in this analysis by gestational age (see Listing 1.3 for both cases and controls):
4.1 cfRNAs that differ between specific gestational ages
We used DESeq2
Bioconductor package for this analysis.
Code to find DEGs by gestational age via DESeq2
library(magrittr)
library(DESeq2)
library("BiocParallel")
register(MulticoreParam(12))
# set the file name to store the result of DEG by GA via `DESEq2
myRData="RData/dl.DEG.GA.control.term.POPS-2022.GRCh38.88.RData"
if(file.exists(myRData)){
load(myRData)
}else{
my.dds.RData<-paste0("RData/dds.term.",my.salmon.index,".RData")
load(my.dds.RData)
# use 15150 genes considered in this study
dds.f2.term<-DESeq(dds.term[rownames(dds.f2)], parallel=T)
# using only control samples (i.e. healthy)
keep.samples<-colData(dds.f2.term)$Condition=="Control"
dds.ga.term<-dds.f2.term[,keep.samples]
design(dds.ga.term) <- formula(~ Batch + Sex + GA) # NB, GA is the last term
dds.ga.term<-DESeq(dds.ga.term, parallel=TRUE) # isa 'DESeqDataSet'
GA.diff<-c("20wk-12wk","28wk-20wk","36wk-28wk")
li.DEG.GA<-lapply(GA.diff, function(i){
my.ga<-strsplit(i,"-")[[1]]
my.res<-results(dds.ga.term,
independentFiltering=FALSE,
lfcThreshold=log2(minFC),
contrast=c("GA",c(my.ga[1],my.ga[2])),
parallel=TRUE)
lfcShrink(dds.ga.term,
res=my.res, #li.res[[my.GA]],
lfcThreshold=log2(minFC), # not applicable for 'asher'
type="ashr",
parallel=TRUE)
})
names(li.DEG.GA)<-GA.diff
dl.DEG.GA<-lapply(li.DEG.GA, function(i)
data.table(`gene_id`=rownames(i), as.data.frame(i))[order(pvalue)][,`:=`("BH"=p.adjust(pvalue,"BH"),"BY"=p.adjust(pvalue,"BY"),"bf"=p.adjust(pvalue,"bonferroni"))]
)
save(dl.DEG.GA, file=myRData)
}
Now check the number of DEG by GA intervals.
4.2 A longitudinal study of cfRNAs across the four gestational ages
We used lme4
and lmerTest
R packages for this analysis.
4.2.1 Prepare the dataset
Below shows the exact gestational age (in weeks) and its normalised GA:
Code
library(magrittr)
dt.GA<-readxl::read_excel("static/R/data/cfRNA_samples_GA_exact.xlsx") %>% data.table
dt.samples<-merge(dt.samples, dt.GA[,.(SampleID,GAwk=GA_exact)]) # n=383
dt.samples$POPSID<-factor(dt.samples$POPSID)
# summary of GA (in weeks)
lapply(split(dt.samples, dt.samples$GA), function(DT) DT$GAwk %>% summary)
$`12wk`
Min. 1st Qu. Median Mean 3rd Qu. Max.
10.57 12.21 12.71 12.71 13.14 17.57
$`20wk`
Min. 1st Qu. Median Mean 3rd Qu. Max.
19.00 20.14 20.43 20.45 20.71 21.71
$`28wk`
Min. 1st Qu. Median Mean 3rd Qu. Max.
26.86 28.00 28.29 28.26 28.50 29.86
$`36wk`
Min. 1st Qu. Median Mean 3rd Qu. Max.
34.71 36.00 36.22 36.26 36.43 38.00
$`12wk`
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.8213 1.0269 1.0888 1.0882 1.1425 1.6963
$`20wk`
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.875 2.017 2.054 2.056 2.089 2.214
$`28wk`
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.857 3.000 3.036 3.032 3.062 3.232
$`36wk`
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.839 4.000 4.027 4.033 4.054 4.250
And we need the transcript quatification matrix (in CPM
) in Validation
cohort, which is also shown in Section 3.1.1.
Code to prepare CPM by transcript
#####################
## Based on logCPM ##
#####################
load("RData/dt.cpmZ.term.POPS-2022.GRCh38.88.RData") # dt.cpmZ.term (term)
dt.logCPM<-merge(dt.cpmZ.term[Condition=="Control"],
dt.samples[,.(SampleID,POPSID,GAwk,rGAwk)],
by="SampleID"
)
dl.logCPM<-split(dt.logCPM, dt.logCPM$geneName)
dl.logCPM %>% length # n=15150
rm(dt.logCPM) # free the memory
4.2.2 Build the models using quaratic term (i.e. \(GA^2\)) and linear term (\(GA\))
Code to run longitudinal analysis of cfRNA change by gestational age via lme4
if(file.exists("static/R/data/li.lmer.RData")){
load("static/R/data/li.lmer.RData")
}else{
library(lme4)
library(lmerTest)
library(MuMIn)
library(ggeffects)
li.lmer<-
lapply(dl.logCPM, function(dt.foo){ # by each gene
my.gene<-unique(dt.foo[["geneName"]])
message(my.gene)
#####################################################
## 1a. Mixed Effect Model via lmerTest (non-linear) #
#####################################################
my.lmer.GA2 <- lmerTest::lmer(logCPM ~ 1 + rGAwk + I(rGAwk^2) + (1 | POPSID), data = dt.foo)
dt.predict.GA2<-
my.lmer.GA2 %>% ggeffects::ggpredict(terms = list(rGAwk= seq(min(dt.foo$rGAwk), max(dt.foo$rGAwk), length = 20)), verbose = FALSE) %>% data.table %>% setnames("x","rGAwk") %>% setnames("predicted","logCPM")
dt.predict.GA2[,GAwk:=rGAwk*8 +4][,group:=NULL][,rGAwk:=NULL] %>% setcolorder("GAwk")
mat.coef<-lmerTest:::get_coefmat(my.lmer.GA2)
my.coeff.GA2<-mat.coef[2:3,1,drop=F] %>% t
my.pval.GA2<-mat.coef[2:3,5,drop=F] %>% t
my.r2.GA2<-MuMIn::r.squaredGLMM(my.lmer.GA2)
dt.lmer.GA2<-data.table(my.gene,my.r2.GA2,my.coeff.GA2, my.pval.GA2) %>% setnames(c("gene","r2m","r2c","coef.GA","coef.GA2","p.GA","p.GA2"))
##############################################
## 1b. Mixed Effect Model via lmer4 (linear) #
##############################################
my.lmer.GA <- lme4::lmer(logCPM ~ 1 + rGAwk + (1 | POPSID), data = dt.foo)
dt.predict.GA <-
my.lmer.GA %>% ggeffects::ggpredict(terms = list(rGAwk= seq(min(dt.foo$rGAwk), max(dt.foo$rGAwk), length = 20)), verbose = FALSE) %>% data.table %>% setnames("x","rGAwk") %>% setnames("predicted","logCPM")
dt.predict.GA[,GAwk:=rGAwk*8 +4][,group:=NULL][,rGAwk:=NULL] %>% setcolorder("GAwk")
mat.coef<-lmerTest:::get_coefmat(my.lmer.GA)
my.coeff.GA<-mat.coef[2:2,1,drop=F] %>% t
my.pval.GA<-mat.coef[2:2,5,drop=F] %>% t
my.r2.GA<-MuMIn::r.squaredGLMM(my.lmer.GA)
dt.lmer.GA<-data.table(my.gene,my.r2.GA,my.coeff.GA,NA,my.pval.GA,NA) %>% setnames(c("gene","r2m","r2c","coef.GA","coef.GA2","p.GA","p.GA2"))
###############
## Anova test #
###############
my.anova<-anova(my.lmer.GA2, my.lmer.GA)
## Summary
dt.lmer<-
rbind(
cbind("Model"="GA2",dt.lmer.GA2),
cbind("Model"="GA",dt.lmer.GA)
)[,p.anova:=my.anova[["Pr(>Chisq)"]][2]] %>% setcolorder("gene")
# return this list object
list(`stat`=dt.lmer, `predict`=dt.predict.GA2, `predict.GA`=dt.predict.GA)
})
save(li.lmer, file="static/R/data/li.lmer.RData")
}
4.2.3 Quadratic (\(GA^2\)) and linear (\(GA\)) models
Code to generate a table quadratic (\(GA^2\)) models per gene
library(DT)
dt.R2<-(lapply(li.lmer, function(li.model) li.model[["stat"]]) %>% rbindlist)[order(-r2m)]
## $GA2$
dt.R2.GA2<-dt.R2[Model=="GA2"]
dt.R2.GA2[,p.combined:=pchisq(-2 * (log(p.GA) + log(p.GA2)), 2 * 2, lower.tail = FALSE)]
dt.R2.GA2[,BH.GA:=p.adjust(p.GA,"BH")]
dt.R2.GA2[,BH.GA2:=p.adjust(p.GA2,"BH")]
dt.R2.GA2[,BH.combined:=p.adjust(p.combined,"BH")]
dt.R2.GA2[,BH.anova:=p.adjust(p.anova,"BH")]
dt.R2.GA2[order(-r2m)][1:100][,-"Model"] %>%
datatable(extensions = 'Buttons',options = list( dom = 'Bfrtip',
buttons =list('copy', list(extend = 'collection',buttons = c('csv', 'excel', 'pdf'),text = 'Download'))
)) %>%
formatRound(columns=c('r2m', 'r2c','coef.GA','coef.GA2'), digits=2) %>%
formatSignif(columns = c('p.GA', 'p.GA2','p.anova','p.combined','BH.GA','BH.GA2','BH.combined','BH.anova'), digits = 2)
R-squared (R²) is a statistical measure, also known as the coefficient of determination, that indicates how well a regression model predicts the values of a dependent variable. The marginal \(R^2\) considers only the variance of the fixed effects (without the random effects), while the conditional \(R^2\) takes both the fixed and random effects into account (i.e., the total model). We used MuMIn
R
package to calculate this.
Code to generate a table showing the quadratic (\(GA^2\)) models
dt.R2.GA2[order(p.combined)][1:100][,-"Model"] %>%
datatable(extensions = 'Buttons',options = list( dom = 'Bfrtip',
buttons =list('copy', list(extend = 'collection',buttons = c('csv', 'excel', 'pdf'),text = 'Download'))
)) %>%
formatRound(columns=c('r2m', 'r2c','coef.GA','coef.GA2'), digits=2) %>%
formatSignif(columns = c('p.GA', 'p.GA2','p.anova','p.combined','BH.GA','BH.GA2','BH.combined','BH.anova'), digits = 2)
Code to generate a table showing the linear (\(GA\)) models
# GA
dt.R2.GA<-dt.R2[Model=="GA"]
dt.R2.GA[,BH.GA:=p.adjust(p.GA,"BH")]
dt.R2.GA[,BH.anova:=p.adjust(p.anova,"BH")]
dt.R2.GA[BH.GA<0.05][order(p.GA)][coef.GA<0][1:100][,-c("Model","coef.GA2","p.GA2")] %>%
datatable(extensions = 'Buttons',options = list( dom = 'Bfrtip',
buttons =list('copy', list(extend = 'collection',buttons = c('csv', 'excel', 'pdf'),text = 'Download'))
)) %>%
formatRound(columns=c('r2m', 'r2c','coef.GA'), digits=2) %>%
formatSignif(columns = c('p.GA', 'p.anova','BH.GA','BH.anova'), digits = 2)