4  Longitudinal Analysis

Here we sought to identify all cfRNAs that change in abundance as gestation progresses.

For this analysis, we used a total of 96 healthy samples from the validation cohort (i.e. term delivery) and carried out two types of analyses: 1) to identify cfRNAs that differ between specific gestational ages (i.e. 12wk to 20wk, 20wk to 28wk, and 28wk to 36wk), and 2) a longitudinal study of cfRNAs across the four gestational ages.

Below shows the number of (healthy) samples used in this analysis by gestational age (see Listing 1.3 for both cases and controls):

Listing 4.1: Tabulate non-case samples in the validation cohort
Code
dt.samples<-data.table::fread("static/R/data/dt.samples.csv")
dt.samples<-dt.samples[!SampleID %in% c("GS-B-374-UW","GS-B-374-UW-b","GS-59-CX","GS-59-CX-b","GS-179-HQ")]
xtabs(~GA+Condition, dt.samples[Cohort=="term" & Condition=="Control"])
      Condition
GA     Control
  12wk      96
  20wk      96
  28wk      95
  36wk      96

4.1 cfRNAs that differ between specific gestational ages

We used DESeq2 Bioconductor package for this analysis.

Code to find DEGs by gestational age via DESeq2
library(magrittr)
library(DESeq2)
library("BiocParallel")
register(MulticoreParam(12))

# set the file name to store the result of DEG by GA via `DESEq2
myRData="RData/dl.DEG.GA.control.term.POPS-2022.GRCh38.88.RData"
if(file.exists(myRData)){
  load(myRData)
}else{
  my.dds.RData<-paste0("RData/dds.term.",my.salmon.index,".RData")
  load(my.dds.RData)
  # use 15150 genes considered in this study
  dds.f2.term<-DESeq(dds.term[rownames(dds.f2)], parallel=T)

  # using only control samples (i.e. healthy)
  keep.samples<-colData(dds.f2.term)$Condition=="Control"
  dds.ga.term<-dds.f2.term[,keep.samples] 
  design(dds.ga.term) <- formula(~ Batch + Sex + GA)        # NB, GA is the last term

  dds.ga.term<-DESeq(dds.ga.term, parallel=TRUE) # isa 'DESeqDataSet'

  GA.diff<-c("20wk-12wk","28wk-20wk","36wk-28wk")
  li.DEG.GA<-lapply(GA.diff, function(i){
    my.ga<-strsplit(i,"-")[[1]]
    my.res<-results(dds.ga.term, 
            independentFiltering=FALSE,
            lfcThreshold=log2(minFC), 
            contrast=c("GA",c(my.ga[1],my.ga[2])),
            parallel=TRUE)  

    lfcShrink(dds.ga.term, 
              res=my.res, #li.res[[my.GA]],
              lfcThreshold=log2(minFC), # not applicable for 'asher' 
              type="ashr",
              parallel=TRUE)  
  })
  names(li.DEG.GA)<-GA.diff

  dl.DEG.GA<-lapply(li.DEG.GA, function(i)
    data.table(`gene_id`=rownames(i), as.data.frame(i))[order(pvalue)][,`:=`("BH"=p.adjust(pvalue,"BH"),"BY"=p.adjust(pvalue,"BY"),"bf"=p.adjust(pvalue,"bonferroni"))]
  )
  save(dl.DEG.GA, file=myRData)
}

Now check the number of DEG by GA intervals.

Code to find the number of DEGs by gestational age via DESeq2
dl.DEG.GA[["20wk-12wk"]][BH<0.05] %>% nrow # n=76
dl.DEG.GA[["28wk-20wk"]][BH<0.05] %>% nrow # n=64
dl.DEG.GA[["36wk-28wk"]][BH<0.05] %>% nrow # n=55

4.2 A longitudinal study of cfRNAs across the four gestational ages

We used lme4 and lmerTest R packages for this analysis.

4.2.1 Prepare the dataset

Below shows the exact gestational age (in weeks) and its normalised GA:

Listing 4.2: Summary of the exact gestational age
Code
library(magrittr)
dt.GA<-readxl::read_excel("static/R/data/cfRNA_samples_GA_exact.xlsx") %>% data.table
dt.samples<-merge(dt.samples, dt.GA[,.(SampleID,GAwk=GA_exact)]) # n=383
dt.samples$POPSID<-factor(dt.samples$POPSID)

# summary of GA (in weeks)
lapply(split(dt.samples, dt.samples$GA), function(DT) DT$GAwk %>% summary)
$`12wk`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.57   12.21   12.71   12.71   13.14   17.57 

$`20wk`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  19.00   20.14   20.43   20.45   20.71   21.71 

$`28wk`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  26.86   28.00   28.29   28.26   28.50   29.86 

$`36wk`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  34.71   36.00   36.22   36.26   36.43   38.00 
Listing 4.3: Summary of the relative gestational age
Code
# normalise GA to make 12wk=>1, 20wk=>2, 28wk=>3, and 36wk=>4
dt.samples[,rGAwk:=(GAwk-4)/8] 
lapply(split(dt.samples, dt.samples$GA), function(DT) DT$rGAwk %>% summary)
$`12wk`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.8213  1.0269  1.0888  1.0882  1.1425  1.6963 

$`20wk`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.875   2.017   2.054   2.056   2.089   2.214 

$`28wk`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.857   3.000   3.036   3.032   3.062   3.232 

$`36wk`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  3.839   4.000   4.027   4.033   4.054   4.250 

And we need the transcript quatification matrix (in CPM) in Validation cohort, which is also shown in Section 3.1.1.

Code to prepare CPM by transcript
#####################
## Based on logCPM ##
#####################
load("RData/dt.cpmZ.term.POPS-2022.GRCh38.88.RData") # dt.cpmZ.term (term)

dt.logCPM<-merge(dt.cpmZ.term[Condition=="Control"],
                 dt.samples[,.(SampleID,POPSID,GAwk,rGAwk)],
                 by="SampleID"
              )

dl.logCPM<-split(dt.logCPM, dt.logCPM$geneName)
dl.logCPM %>% length # n=15150
rm(dt.logCPM) # free the memory

4.2.2 Build the models using quaratic term (i.e. \(GA^2\)) and linear term (\(GA\))

Code to run longitudinal analysis of cfRNA change by gestational age via lme4
if(file.exists("static/R/data/li.lmer.RData")){
  load("static/R/data/li.lmer.RData")
}else{
  library(lme4)
  library(lmerTest)
  library(MuMIn)
  library(ggeffects)

  li.lmer<-
  lapply(dl.logCPM, function(dt.foo){ # by each gene
    my.gene<-unique(dt.foo[["geneName"]])
    message(my.gene)

    #####################################################
    ## 1a. Mixed Effect Model via lmerTest (non-linear) #
    #####################################################
    my.lmer.GA2 <- lmerTest::lmer(logCPM ~ 1 + rGAwk + I(rGAwk^2) + (1 | POPSID), data = dt.foo)
    dt.predict.GA2<- 
      my.lmer.GA2 %>% ggeffects::ggpredict(terms = list(rGAwk= seq(min(dt.foo$rGAwk), max(dt.foo$rGAwk), length = 20)), verbose = FALSE) %>% data.table %>% setnames("x","rGAwk") %>% setnames("predicted","logCPM")
    dt.predict.GA2[,GAwk:=rGAwk*8 +4][,group:=NULL][,rGAwk:=NULL] %>% setcolorder("GAwk")

    mat.coef<-lmerTest:::get_coefmat(my.lmer.GA2)
    my.coeff.GA2<-mat.coef[2:3,1,drop=F] %>% t
    my.pval.GA2<-mat.coef[2:3,5,drop=F] %>% t
    my.r2.GA2<-MuMIn::r.squaredGLMM(my.lmer.GA2)

    dt.lmer.GA2<-data.table(my.gene,my.r2.GA2,my.coeff.GA2, my.pval.GA2) %>% setnames(c("gene","r2m","r2c","coef.GA","coef.GA2","p.GA","p.GA2"))

    ##############################################
    ## 1b. Mixed Effect Model via lmer4 (linear) #
    ##############################################
    my.lmer.GA <- lme4::lmer(logCPM ~ 1 + rGAwk + (1 | POPSID), data = dt.foo)
    dt.predict.GA <- 
      my.lmer.GA %>% ggeffects::ggpredict(terms = list(rGAwk= seq(min(dt.foo$rGAwk), max(dt.foo$rGAwk), length = 20)), verbose = FALSE) %>% data.table %>% setnames("x","rGAwk") %>% setnames("predicted","logCPM")
    dt.predict.GA[,GAwk:=rGAwk*8 +4][,group:=NULL][,rGAwk:=NULL] %>% setcolorder("GAwk")

    mat.coef<-lmerTest:::get_coefmat(my.lmer.GA)
    my.coeff.GA<-mat.coef[2:2,1,drop=F] %>% t
    my.pval.GA<-mat.coef[2:2,5,drop=F] %>% t
    my.r2.GA<-MuMIn::r.squaredGLMM(my.lmer.GA)

    dt.lmer.GA<-data.table(my.gene,my.r2.GA,my.coeff.GA,NA,my.pval.GA,NA) %>% setnames(c("gene","r2m","r2c","coef.GA","coef.GA2","p.GA","p.GA2"))

    ###############
    ## Anova test #
    ###############
    my.anova<-anova(my.lmer.GA2, my.lmer.GA)

    ## Summary
    dt.lmer<-
      rbind(
      cbind("Model"="GA2",dt.lmer.GA2),
      cbind("Model"="GA",dt.lmer.GA)
      )[,p.anova:=my.anova[["Pr(>Chisq)"]][2]] %>% setcolorder("gene")

    # return this list object
    list(`stat`=dt.lmer, `predict`=dt.predict.GA2, `predict.GA`=dt.predict.GA)
  })
  save(li.lmer, file="static/R/data/li.lmer.RData")
}

4.2.3 Quadratic (\(GA^2\)) and linear (\(GA\)) models

Code to generate a table quadratic (\(GA^2\)) models per gene
library(DT)

dt.R2<-(lapply(li.lmer, function(li.model) li.model[["stat"]]) %>% rbindlist)[order(-r2m)]

## $GA2$
dt.R2.GA2<-dt.R2[Model=="GA2"]
dt.R2.GA2[,p.combined:=pchisq(-2 * (log(p.GA) + log(p.GA2)), 2 * 2, lower.tail = FALSE)]
dt.R2.GA2[,BH.GA:=p.adjust(p.GA,"BH")]
dt.R2.GA2[,BH.GA2:=p.adjust(p.GA2,"BH")]
dt.R2.GA2[,BH.combined:=p.adjust(p.combined,"BH")]
dt.R2.GA2[,BH.anova:=p.adjust(p.anova,"BH")]

dt.R2.GA2[order(-r2m)][1:100][,-"Model"] %>%
  datatable(extensions = 'Buttons',options = list( dom = 'Bfrtip',
                                                  buttons =list('copy', list(extend = 'collection',buttons = c('csv', 'excel', 'pdf'),text = 'Download'))
                                                  )) %>%
  formatRound(columns=c('r2m', 'r2c','coef.GA','coef.GA2'), digits=2) %>%
  formatSignif(columns = c('p.GA', 'p.GA2','p.anova','p.combined','BH.GA','BH.GA2','BH.combined','BH.anova'), digits = 2)
Table 4.1: Top 100 quadratic (\(GA^2\)) model by the marginal \(R^2\)
Marginal and conditional \(R^2\)

R-squared (R²) is a statistical measure, also known as the coefficient of determination, that indicates how well a regression model predicts the values of a dependent variable. The marginal \(R^2\) considers only the variance of the fixed effects (without the random effects), while the conditional \(R^2\) takes both the fixed and random effects into account (i.e., the total model). We used MuMIn R package to calculate this.

Code to generate a table showing the quadratic (\(GA^2\)) models
dt.R2.GA2[order(p.combined)][1:100][,-"Model"] %>%
  datatable(extensions = 'Buttons',options = list( dom = 'Bfrtip',
                                                  buttons =list('copy', list(extend = 'collection',buttons = c('csv', 'excel', 'pdf'),text = 'Download'))
                                                  )) %>%
  formatRound(columns=c('r2m', 'r2c','coef.GA','coef.GA2'), digits=2) %>%
  formatSignif(columns = c('p.GA', 'p.GA2','p.anova','p.combined','BH.GA','BH.GA2','BH.combined','BH.anova'), digits = 2)
Table 4.2: Top 100 quadratic (\(GA^2\)) model by the combined p-value
Code to generate a table showing the linear (\(GA\)) models
# GA
dt.R2.GA<-dt.R2[Model=="GA"]
dt.R2.GA[,BH.GA:=p.adjust(p.GA,"BH")]
dt.R2.GA[,BH.anova:=p.adjust(p.anova,"BH")]

dt.R2.GA[BH.GA<0.05][order(p.GA)][coef.GA<0][1:100][,-c("Model","coef.GA2","p.GA2")] %>%
  datatable(extensions = 'Buttons',options = list( dom = 'Bfrtip',
                                                  buttons =list('copy', list(extend = 'collection',buttons = c('csv', 'excel', 'pdf'),text = 'Download'))
                                                  )) %>%
  formatRound(columns=c('r2m', 'r2c','coef.GA'), digits=2) %>%
  formatSignif(columns = c('p.GA', 'p.anova','BH.GA','BH.anova'), digits = 2)
Table 4.3: Top 100 linear (\(GA\)) model by the negative coefficient