Sourcecode

source('Rscripts/Utils.R')

## Warning: package 'valr' was built under R version 4.0.5

Cleaning tft_legacy MsigDB geneset names

This is necessary in order to make the gene identifiers the same across the analysis

#getting data that cleans gene names
ensembl.hg19 <- useEnsembl_GRCh37()
bm.hg19 <- getBM(attributes = c('ensembl_gene_id', 
                                'external_gene_name',
                                'hgnc_symbol',
                                'external_synonym',
                                'gene_biotype'), 
                 mart = ensembl.hg19, 
                 filters = 'chromosome_name', values = c(1:22,'X'))
bm.hg19.prot <- bm.hg19 %>%
  filter(gene_biotype == 'protein_coding')

ensembl_IDs <- melt(bm.hg19.prot, id = 1) %>%
  dplyr::rename(GENE = value)

#Adding ensembl annotation to TFs (TF list from Lambert et al.)
load(file = 'Data/merged-list-TFs.Rdata') 
tf_list <- merge(merged.list %>% dplyr::rename(GENE = NameTF), 
                 ensembl_IDs, 
                 by = 'GENE') 

##Cleaning MsigDB
tft_legacy <- getting_MsigDB_TF() 
tft_legacy_curated <- tft_legacy %>%
  #dplyr::filter(!str_detect(NAME, 'UNKNOWN')) %>%
  dplyr::mutate(GENE = sub("_.*", "", NAME)) 
#Have to manually re-name these as they are protein nomenclature
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "ER",
                                      "ESR1") 
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "EVI1",
                                      "MECOM") 
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "ARP1",
                                      "ACTR1A") 
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "CART1",
                                      "ALX1") 
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "DR3",
                                      "TNFRSF25")   
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "DR4",
                                      "TNFRSF10A")  
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "E12",
                                      "TCF3")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "HIF1",
                                      "HIF1A")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "OCT1",
                                      "POU2F1")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "OSF2",
                                      "POSTN")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "P300",
                                      "EP300")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "PIT1",
                                      "POU1F1")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "PR",
                                      "PGR")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "S8",
                                      "RPS8")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "TEL2",
                                      "TELO2")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
                                      "TFIIA",
                                      "GTF2A1")


cleaned_tf_legacy <- merge(tft_legacy_curated,
                            ensembl_IDs,
                            by = 'GENE') 
NAs <- merge(tft_legacy_curated,
             ensembl_IDs,
             by = 'GENE',
             all.x = TRUE) %>%
  filter(is.na(ensembl_gene_id)) 
NAs$GENE <- 'NA'

cleaned_tf_legacy <- rbind(cleaned_tf_legacy,NAs)

#counting # of unique tf_legacy
count(cleaned_tf_legacy[unique(cleaned_tf_legacy$NAME),])

##     n
## 1 574

#cleaned legacy tf dataset 304/324 unique gene-sets were found to be tfs
legacy_tfs <- cleaned_tf_legacy %>%
  group_by(GWAS_TYPE,FEATURE) %>%
  distinct(NAME,.keep_all = TRUE) %>%
  mutate(rank = dense_rank(desc(NES))) %>%
  ungroup() %>%
  dplyr::rename( MsigdB_NAME = NAME)  

#Supplementary Table 6
head(legacy_tfs)

## # A tibble: 6 x 17
##   GENE  MsigdB_NAME  SIZE    ES   NES NOM.p.val FDR.q.val FWER.p.val RANK.AT.MAX
##   <chr> <chr>       <int> <dbl> <dbl>     <dbl>     <dbl>      <dbl>       <int>
## 1 ACTR~ ARP1_01       156 0.393 1.10      0.171     0.565          1        6525
## 2 ACTR~ ARP1_01       148 0.262 1.00      0.511     0.628          1        2305
## 3 ACTR~ ARP1_01       148 0.278 1.10      0.249     0.431          1        7974
## 4 ACTR~ ARP1_01       148 0.259 0.911     0.736     0.814          1        3404
## 5 ACTR~ ARP1_01       156 0.403 1.04      0.328     0.600          1        5430
## 6 ACTR~ ARP1_01       148 0.215 0.819     0.91      0.939          1        6441
## # ... with 8 more variables: LEADING.EDGE <chr>, GWAS_TYPE <chr>,
## #   FEATURE <chr>, GENESET <chr>, TIDY_GWAS_TYPE <chr>, ensembl_gene_id <chr>,
## #   variable <fct>, rank <int>

UpsetPlot

histotype = c('HGSOC','LGSOC','CCOC','NMOC','MOC','EnOC')
for(i in histotype) {
  #chromMAGMA for each histotype
  assign(paste0('min_P_chromMAGMA_sig_legacy_',i),
         legacy_tfs %>%
           filter(FEATURE == 'chromMAGMA') %>%
           filter(!GENE == 'NA') %>%
           filter(GWAS_TYPE == paste(i)) %>%
  filter(NOM.p.val < 0.05) %>%
  filter(FDR.q.val < 0.25)%>%
  dplyr::group_by(FEATURE, GWAS_TYPE, GENE) %>%
  dplyr::mutate(min_Pvalue = min(NOM.p.val)) %>% 
  filter(NOM.p.val == min_Pvalue) 
  )
}


histotype_list <- list(
  CCOC = min_P_chromMAGMA_sig_legacy_CCOC$GENE,
  EnOC = min_P_chromMAGMA_sig_legacy_EnOC$GENE,
  HGSOC = min_P_chromMAGMA_sig_legacy_HGSOC$GENE,
  LGSOC = min_P_chromMAGMA_sig_legacy_LGSOC$GENE,
  MOC = min_P_chromMAGMA_sig_legacy_MOC$GENE
)


library(UpSetR)

## Warning: package 'UpSetR' was built under R version 4.0.5

upset(fromList(histotype_list),
      nsets = length(histotype_list),
      #$empty.intersections = 'on',
      order.by ='freq')

histotype specific dotplot based on upset Figure 5A

fromList <- function (input) {
  # Same as original fromList()...
  elements <- unique(unlist(input))
  data <- unlist(lapply(input, function(x) {
      x <- as.vector(match(elements, x))
      }))
  data[is.na(data)] <- as.integer(0)
  data[data != 0] <- as.integer(1)
  data <- data.frame(matrix(data, ncol = length(input), byrow = F))
  data <- data[which(rowSums(data) != 0), ]
  names(data) <- names(input)
  # ... Except now it conserves your original value names!
  row.names(data) <- elements
  return(data)
  }

# Binary table with colnames:
annotated_TFs<- fromList(histotype_list)
annotated_TFs$GENE <- rownames(annotated_TFs) 
annotated_TFs$binary <-  paste(annotated_TFs$CCOC,
                            annotated_TFs$EnOC,
                            annotated_TFs$HGSOC,
                            annotated_TFs$LGSOC,
                            annotated_TFs$MOC) 
annotated_TFs$anno <- ifelse(annotated_TFs$binary %in% "1 0 0 0 0",'CCOC Specific',
                             ifelse(annotated_TFs$binary %in% "0 1 0 0 0",'EnOC Specific',
                                    ifelse(annotated_TFs$binary %in% "0 0 1 0 0",'HGSOC Specific',
                                                  ifelse(annotated_TFs$binary %in% "0 0 0 1 0",'LGSOC Specific',
                                                         ifelse(annotated_TFs$binary %in% "0 0 0 0 1",'MOC Specific',
                                                                ifelse(annotated_TFs$binary %in% "1 1 1 1 1",'All Common','NA'))))))
annotated_TFs <- annotated_TFs %>% filter(!anno == 'NA')
annotated_TFs<- annotated_TFs[order(annotated_TFs$anno),]


legacy_tfs %>%
  filter(FEATURE == 'chromMAGMA') %>%
  filter(!GENE == 'NA') %>%
  group_by(GWAS_TYPE) %>% 
  filter(GENE %in% annotated_TFs$GENE) %>%
  mutate(NOM.p.val = replace(NOM.p.val, NOM.p.val == 0, 0.001)) %>% # Replacing values; GSEA's P-value is limited to <0.001
  mutate(NEG_LOG10_Pvalue = -log10(NOM.p.val)) %>%
  filter(!TIDY_GWAS_TYPE == 'Non-mucinous')  %>%
  dplyr::group_by(FEATURE, GWAS_TYPE, GENE) %>%
  dplyr::mutate(min_Pvalue = min(NOM.p.val)) %>% 
  filter(NOM.p.val == min_Pvalue) %>%
  {ggplot(.,aes(factor(GWAS_TYPE),GENE)) + 
  geom_point(aes(colour=NEG_LOG10_Pvalue,size= FDR.q.val)) + 
  scale_colour_gradient(low="blue", high="red", n.breaks = 4) + 
  scale_size_continuous(limits = c(0, 1.5), range = c(7,0)) + 
  theme_bw() +
  theme(axis.text=element_text(size=7), 
        axis.title=element_text(size=7,face="bold")) + 
  theme(axis.text.x = element_text(size = 7, angle = 90),
        axis.title.x=element_blank(),
        legend.title = element_blank()) +
  scale_y_discrete(limits = annotated_TFs$GENE) }

histotype specific dotplot

09 Cleaning legacy transcription factor MsigDB BROAD GSEA results - Supplementary Table 6

Sourcecode

Cleaning tft_legacy MsigDB geneset names

UpsetPlot

histotype specific dotplot based on upset Figure 5A

histotype specific dotplot