7  Supplementary Text Figures

This chapter demonstrates the R codes that make supplementary text figures shown in the paper. Making the main figures are shown in Chapter 6.

7.1 Supplementary Text Figure A

Code
  lazyLoad("downsample.chrY.TPM_cache/beamer/load_ds_salmon_4b8dd727729fd0ae5deb423d69e7c16d") # dl.chrY.TPM and dl.stat

  dt.ds.chrY <- lapply(names(dl.stat), function(DS) dl.stat[[DS]][,DS:=DS]) %>% rbindlist
  dt.ds.chrY$DS<-factor(dt.ds.chrY$DS, level=c("1M","5M","10M"))
  dt.ds.chrY[,GA:=gsub(" weeks","wk",GA)]
  setnames(dt.ds.chrY, "Type", "Method")

  p1.chrY<-ggplot(dt.ds.chrY, aes(Method,SumCount)) +
  geom_rect(data = dt.ds.chrY[DS=="5M"],fill="grey60",xmin = -Inf,xmax = Inf, ymin = -Inf,ymax = Inf,alpha = 0.2) +
  geom_bar(aes(fill=Method), stat="identity",width=.5, alpha=.7) +
  ggsci::scale_fill_jama() +
  facet_grid(Sex~DS+GA,scales="free") +
  labs(y="Sum of read count on chrY genes") +
  theme_Publication() +
  theme(legend.position="top",
        panel.border=element_rect(colour="black"),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

  #
  pdf(file="Figures/Suppl/cfRNA.Suppl.Text.Fig1A.pdf", width=11, height=8, title="Abundance of chrY")
  p1.chrY
  dev.off()

7.2 Supplementary Text Figure B

Code
  lazyLoad("downsample.chrY.TPM_cache/beamer/load_ds_salmon_4b8dd727729fd0ae5deb423d69e7c16d") # dl.chrY.TPM and dl.stat

  lapply(dl.chrY.TPM, function(i) i[,.(sum(Counts),sum(TPM))])

  dl.chrY.TPM<-lapply(dl.chrY.TPM, function(DT) {
                  #DT[Sex=="Male",Class:=ifelse(TPM>0,"TP","FN")]
                  #DT[Sex=="Female",Class:=ifelse(TPM==0,"TN","FP")] 
                  DT[,`Predicted Sex`:=ifelse(TPM>0, "Male","Female"),Type]
                  DT$Sex<-factor(DT$Sex, level=c("Male","Female"))
                  DT$`Predicted Sex`<-factor(DT$`Predicted Sex`, level=c("Male","Female"))
                  DT 
  }) 

  #####################################
  # Predictive performance by DS & GA #
  #####################################
  dt.chrY.con.ga<-lapply(names(dl.chrY.TPM), function(DS){
    lapply(split(dl.chrY.TPM[[DS]], dl.chrY.TPM[[DS]]$GA), function(DT){
    my.GA<-DT[,.N,GA]$GA

    cm1<-caret::confusionMatrix(DT[Type=="HiSat2+Salmon"][["Predicted Sex"]], reference=DT[Type=="HiSat2+Salmon"][["Sex"]])
    cm2<-caret::confusionMatrix(DT[Type=="Salmon (SA mode)"][["Predicted Sex"]], reference=DT[Type=="Salmon (SA mode)"][["Sex"]])

    rbind(
    data.table(DS=DS, GA=my.GA,Type="HiSat2+Salmon", Measure=names(cm1$byClass), Value=cm1$byClass),
    data.table(DS=DS, GA=my.GA,Type="Salmon (SA mode)", Measure=names(cm2$byClass), Value=cm2$byClass)
    )[Measure %in% c("F1","Precision", "Recall")]
    }) %>% rbindlist 
  }) %>% rbindlist 

  dt.chrY.con.ga$DS<-factor(dt.chrY.con.ga$DS, level=c("1M","5M","10M"))
  dt.chrY.con.ga[,GA:=gsub(" weeks","wk",GA)]
  setnames(dt.chrY.con.ga, "Type", "Method")

  p1.chrY.con<-ggplot(dt.chrY.con.ga, aes(Method,Value)) +
  geom_rect(data = dt.chrY.con.ga[DS=="5M"],fill="grey60",xmin = -Inf,xmax = Inf, ymin = -Inf,ymax = Inf,alpha = 0.2) +
  geom_bar(aes(fill=Method), stat="identity",width=.5, alpha=.7) +
  ggsci::scale_fill_jama() +
  facet_grid(Measure~DS+GA,scales="free") +
  labs(y="Score") +
  theme_Publication() +
  theme(legend.position="none",
        panel.border=element_rect(colour="black"),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())


  # merge SI.Fig1A + SI.Fig1B
  pdf(file="Figures/Suppl/cfRNA.Suppl.Text.Fig1AB.pdf", width=11, height=14, title="Abundance of chrY")
  cowplot::plot_grid(p1.chrY, p1.chrY.con, labels="AUTO", ncol=1, label_size=27, align="v", axis="l")
  dev.off()

7.3 Supplementary Text Figure C

Code
  # load the transcript definition used in Salmon (SA)
  load("RData/gr.ensg.Homo_sapiens.GRCh38.88.cdna.all.ncrna.fa.gz.RData") %>% system.time # isa GRanges
  gr.ensg[seqnames(gr.ensg)=="Y" & gr.ensg$gene_biotype=="protein_coding",] # chrY protein-coding; n=42 
  this.gene<-(gr.ensg[seqnames(gr.ensg)=="Y" & gr.ensg$gene_biotype=="protein_coding",] %>% names) # chrY protein-coding; n=42 (without the training version e.g. ENSG00000176679)

  # load the TPM based om 10M-ds dataset
  load("RData/li.TPM.10M.RData")
  li.TPM[["Salmon_aln"]][["TPM"]] %>% dim # 62803 genes x 100 samples

  ## Heatamp of TPM using `HiSat2+Salmon` (based on 10M)
  li.TPM[["Salmon_aln"]][["counts"]][this.gene,] %>% rowSums %>% sort %>% rev # count of 42 chrY genes
  filter.g2<-li.TPM[["Salmon_aln"]][["counts"]][this.gene,] %>% rowSums %>% sort %>% rev >0 # filter for 42 chrY genes with TPM >0
  this.gene2<-(li.TPM[["Salmon_aln"]][["counts"]][this.gene,] %>% rowSums %>% sort %>% rev)[filter.g2] %>% names # n=24 genes
  this.sample2<-li.TPM[["Salmon_aln"]][["counts"]][this.gene,] %>% colSums %>% sort %>% rev %>% names # n=100 samples sorted 
  filter.s2<-li.TPM[["Salmon_aln"]][["counts"]][this.gene2,] %>% colSums %>% sort %>% rev >0 # filter for samples with TPM >0 
  predicted.male2<-(li.TPM[["Salmon_aln"]][["counts"]][this.gene,] %>% colSums %>% sort %>% rev)[filter.s2] %>% names # n=60 samples 

  my.mat2<-log2(li.TPM[["Salmon_aln"]][["counts"]][this.gene2,this.sample2]+1)
  rownames(my.mat2)<-gr.ensg[this.gene2]$gene_name

  col.anno<-mat.samples[this.sample2,c("GA","Sex"),drop=F] # isa `data.frame`
  col.anno$"Predicted Sex"<-ifelse(this.sample2 %in% predicted.male2,"Male","Female")
  col.anno<-col.anno[,c("Predicted Sex","Sex","GA")]

  GA.color<-ggsci::pal_jama("default",alpha=.9)(4)
  names(GA.color)<-c("12wk","20wk","28wk","36wk")
  my.color=list(
      `Sex`=c(Female="hotpink",Male='skyblue'),
      `Predicted Sex`=c(Female="hotpink3",Male='skyblue3'),
      #`GA`=c("12wk"=cbPalette2[1],"20wk"=cbPalette2[2],"28wk"=cbPalette2[3],"36wk"=cbPalette2[4])
      `GA`=GA.color
  )

  cph1<-ComplexHeatmap::pheatmap(my.mat2,
                                 annotation_col=col.anno, 
                                 annotation_colors=my.color, 
                                 cluster_cols=F,
                                 cluster_rows=F,
                                 show_colnames=F,
                                 border_color=NA,
                                 fontsize=12,
                                 name="log2(count)",
                                 row_title="\nHiSat2+Salmon",
                                 row_title_gp = gpar(fontsize = 20),
  )

  filter.g3<-li.TPM[["Salmon"]][["counts"]][this.gene,] %>% rowSums %>% sort %>% rev >0 # filter for 42 chrY genes with count >0
  this.gene3<-(li.TPM[["Salmon"]][["counts"]][this.gene,] %>% rowSums %>% sort %>% rev)[filter.g3] %>% names # n=16 genes (out of 42) >0 count 
  this.sample3<-li.TPM[["Salmon"]][["counts"]][this.gene,] %>% colSums %>% sort %>% rev %>% names # 100 sample names
  filter.s3<-li.TPM[["Salmon"]][["counts"]][this.gene3,] %>% colSums %>% sort %>% rev >0 # samples with count>0 
  #(li.TPM[["Salmon"]][["counts"]][this.gene,] %>% colSums %>% sort %>% rev)[filter.s3] # n=38 samples with count>0 (predicted as Male)
  predicted.male3<-(li.TPM[["Salmon"]][["counts"]][this.gene,] %>% colSums %>% sort %>% rev)[filter.s3] %>% names # n=38 samples with count>0 (predicted as Male)

  my.mat3<-log2(li.TPM[["Salmon"]][["counts"]][this.gene3,this.sample3]+1)
  rownames(my.mat3)<-gr.ensg[this.gene3]$gene_name

  col.anno3<-mat.samples[this.sample3,c("GA","Sex"),drop=F]
  col.anno3$"Predicted Sex"<-ifelse(this.sample3 %in% predicted.male3,"Male","Female")
  col.anno3<-col.anno3[,c("Predicted Sex","Sex","GA")]

  # via ComplexHeatmap
  cph3<-ComplexHeatmap::pheatmap(my.mat3,
                                 annotation_col=col.anno3, 
                                 annotation_colors=my.color, 
                                 cluster_cols=F,
                                 cluster_rows=F,
                                 show_colnames=F,
                                 border_color=NA,
                                 fontsize=12,
                                 name="log2(count)",
                                 row_title="\nSalmon (SA mode)",
                                 row_title_gp = gpar(fontsize = 20),
                                 heatmap_legend_param = list(
                                                            title_gp=gpar(fontsize=13,fontface="bold"),
                                                            labels_gp=gpar(fontsize=11),
                            annotation_legend_param = list(title_gp=gpar(fontsize=13,fontface="bold"),labels_gp = gpar(fontsize = 11))
                                )
  )


  pdf(file="Figures/Suppl/cfRNA.Suppl.Text.Fig1C.pdf", width=11, height=14, title="Abundance of chrY")
  draw(cph1 %v% cph3, main_heatmap="log2(count)", ht_gap=unit(.7,"cm"), auto_adjust=F)
  grid.text(label="C",gp=gpar(fontsize=27,fontface="bold"),x=0.02,y=.99,just="top")
  dev.off()

7.4 Supplementary Text Figure D

Code
  dl.chrY.TPM %>% names

  # load the TPM based om 10M-ds dataset
  load("RData/li.TPM.10M.trim.RData")
  li.TPM[["Salmon"]][["TPM"]] %>% dim # 59354 genes x 100 samples

  dt.trim.10M<-data.table(
      `Type`="Salmon (SA mode)",
      `SampleID`=li.TPM[["Salmon"]][["counts"]][this.gene,] %>% colSums %>% names,
      `Counts`=li.TPM[["Salmon"]][["counts"]][this.gene,] %>% colSums, 
      `TPM`=li.TPM[["Salmon"]][["TPM"]][this.gene,] %>% colSums 
      )
  # add Sex and GA
  dt.trim.10M<-merge(dt.trim.10M, dt.samples[,.(SampleID,GA,Sex,pn_female)])

  dt.trim.10M[,`Predicted Sex`:=ifelse(TPM>0, "Male","Female"),Type]
  dt.trim.10M$Sex<-factor(dt.trim.10M$Sex, level=c("Male","Female"))
  dt.trim.10M$`Predicted Sex`<-factor(dt.trim.10M$`Predicted Sex`, level=c("Male","Female"))

  dt.chrY.trim<-rbind(
        dl.chrY.TPM[["10M"]][Type=="Salmon (SA mode)"][,Trim:="No"],
        dt.trim.10M[,Trim:="Yes"]
        )
  ## Number of false positives: chrY signal detected from females 
  dt.chrY.trim[,.(Trim,Sex,`Predicted Sex`)] %>% ftable

  ##
  ## Read counts by trim or not
  dt.bar<-merge(
    dl.chrY.TPM[["10M"]][Type=="Salmon (SA mode)",.(SampleID,GA=gsub(" weeks","wk",GA),Sex,Counts,TPM)], # no-trim
    dt.trim.10M[Type=="Salmon (SA mode)",.(SampleID,GA,Sex,Counts,TPM)], # trim
    by=c("SampleID","Sex","GA")
  )
  cor.test(~Counts.x + Counts.y, dt.bar)
  dt.bar[Counts.x>Counts.y] # no-trim > trim
  dt.bar[Counts.x<Counts.y] # no-trim < trim
  dt.bar[Sex=="Female" & Counts.x<Counts.y] # more counts for Trimmed Salmon

  my.limit<-dt.bar[,max(Counts.x,Counts.y)]  %>% ceiling
  p1.chrY.trim<-ggplot(dt.bar, aes(Counts.x+1,Counts.y+1)) + 
  geom_point(data=dt.bar, size=5, position=position_jitter(width=0.2,height=0.2),shape=21) +
  scale_x_continuous(trans = scales::log2_trans(),
                     breaks = scales::trans_breaks("log2",function(x) 2^x),
                     labels = scales::trans_format("log2", math_format(2^.x)),
                     limits=c(.5,my.limit*2)) +
  scale_y_continuous(trans = scales::log2_trans(),
                     breaks = scales::trans_breaks("log2",function(x) 2^x),
                     labels = scales::trans_format("log2", math_format(2^.x)),
                     limits=c(.5,my.limit*2)) +
  #coord_fixed() + 
  facet_grid(Sex~GA,scales="free_x") +
  labs(x="Sum of read count (No trimmed reads)", y="Sum of read count (trimmed reads)") +
  theme_Publication() + theme(panel.border = element_rect(colour = "black"))

  pdf(file="Figures/Suppl/cfRNA.Suppl.Text.Fig1D.v2.pdf", width=11, height=7, title="Trim vs No trim")
  cowplot::plot_grid(p1.chrY.trim, labels=c("D"),label_size=27)
  dev.off()