source('Rscripts/Utils.R')
## Warning: package 'valr' was built under R version 4.0.5
This is necessary in order to make the gene identifiers the same across the analysis
#getting data that cleans gene names
ensembl.hg19 <- useEnsembl_GRCh37()
bm.hg19 <- getBM(attributes = c('ensembl_gene_id',
'external_gene_name',
'hgnc_symbol',
'external_synonym',
'gene_biotype'),
mart = ensembl.hg19,
filters = 'chromosome_name', values = c(1:22,'X'))
bm.hg19.prot <- bm.hg19 %>%
filter(gene_biotype == 'protein_coding')
ensembl_IDs <- melt(bm.hg19.prot, id = 1) %>%
dplyr::rename(GENE = value)
#Adding ensembl annotation to TFs (TF list from Lambert et al.)
load(file = 'Data/merged-list-TFs.Rdata')
tf_list <- merge(merged.list %>% dplyr::rename(GENE = NameTF),
ensembl_IDs,
by = 'GENE')
##Cleaning MsigDB
tft_legacy <- getting_MsigDB_TF()
tft_legacy_curated <- tft_legacy %>%
#dplyr::filter(!str_detect(NAME, 'UNKNOWN')) %>%
dplyr::mutate(GENE = sub("_.*", "", NAME))
#Have to manually re-name these as they are protein nomenclature
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"ER",
"ESR1")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"EVI1",
"MECOM")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"ARP1",
"ACTR1A")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"CART1",
"ALX1")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"DR3",
"TNFRSF25")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"DR4",
"TNFRSF10A")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"E12",
"TCF3")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"HIF1",
"HIF1A")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"OCT1",
"POU2F1")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"OSF2",
"POSTN")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"P300",
"EP300")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"PIT1",
"POU1F1")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"PR",
"PGR")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"S8",
"RPS8")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"TEL2",
"TELO2")
tft_legacy_curated$GENE <- str_replace(tft_legacy_curated$GENE,
"TFIIA",
"GTF2A1")
cleaned_tf_legacy <- merge(tft_legacy_curated,
ensembl_IDs,
by = 'GENE')
NAs <- merge(tft_legacy_curated,
ensembl_IDs,
by = 'GENE',
all.x = TRUE) %>%
filter(is.na(ensembl_gene_id))
NAs$GENE <- 'NA'
cleaned_tf_legacy <- rbind(cleaned_tf_legacy,NAs)
#counting # of unique tf_legacy
count(cleaned_tf_legacy[unique(cleaned_tf_legacy$NAME),])
## n
## 1 574
#cleaned legacy tf dataset 304/324 unique gene-sets were found to be tfs
legacy_tfs <- cleaned_tf_legacy %>%
group_by(GWAS_TYPE,FEATURE) %>%
distinct(NAME,.keep_all = TRUE) %>%
mutate(rank = dense_rank(desc(NES))) %>%
ungroup() %>%
dplyr::rename( MsigdB_NAME = NAME)
#Supplementary Table 6
head(legacy_tfs)
## # A tibble: 6 x 17
## GENE MsigdB_NAME SIZE ES NES NOM.p.val FDR.q.val FWER.p.val RANK.AT.MAX
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 ACTR~ ARP1_01 156 0.393 1.10 0.171 0.565 1 6525
## 2 ACTR~ ARP1_01 148 0.262 1.00 0.511 0.628 1 2305
## 3 ACTR~ ARP1_01 148 0.278 1.10 0.249 0.431 1 7974
## 4 ACTR~ ARP1_01 148 0.259 0.911 0.736 0.814 1 3404
## 5 ACTR~ ARP1_01 156 0.403 1.04 0.328 0.600 1 5430
## 6 ACTR~ ARP1_01 148 0.215 0.819 0.91 0.939 1 6441
## # ... with 8 more variables: LEADING.EDGE <chr>, GWAS_TYPE <chr>,
## # FEATURE <chr>, GENESET <chr>, TIDY_GWAS_TYPE <chr>, ensembl_gene_id <chr>,
## # variable <fct>, rank <int>
histotype = c('HGSOC','LGSOC','CCOC','NMOC','MOC','EnOC')
for(i in histotype) {
#chromMAGMA for each histotype
assign(paste0('min_P_chromMAGMA_sig_legacy_',i),
legacy_tfs %>%
filter(FEATURE == 'chromMAGMA') %>%
filter(!GENE == 'NA') %>%
filter(GWAS_TYPE == paste(i)) %>%
filter(NOM.p.val < 0.05) %>%
filter(FDR.q.val < 0.25)%>%
dplyr::group_by(FEATURE, GWAS_TYPE, GENE) %>%
dplyr::mutate(min_Pvalue = min(NOM.p.val)) %>%
filter(NOM.p.val == min_Pvalue)
)
}
histotype_list <- list(
CCOC = min_P_chromMAGMA_sig_legacy_CCOC$GENE,
EnOC = min_P_chromMAGMA_sig_legacy_EnOC$GENE,
HGSOC = min_P_chromMAGMA_sig_legacy_HGSOC$GENE,
LGSOC = min_P_chromMAGMA_sig_legacy_LGSOC$GENE,
MOC = min_P_chromMAGMA_sig_legacy_MOC$GENE
)
library(UpSetR)
## Warning: package 'UpSetR' was built under R version 4.0.5
upset(fromList(histotype_list),
nsets = length(histotype_list),
#$empty.intersections = 'on',
order.by ='freq')
fromList <- function (input) {
# Same as original fromList()...
elements <- unique(unlist(input))
data <- unlist(lapply(input, function(x) {
x <- as.vector(match(elements, x))
}))
data[is.na(data)] <- as.integer(0)
data[data != 0] <- as.integer(1)
data <- data.frame(matrix(data, ncol = length(input), byrow = F))
data <- data[which(rowSums(data) != 0), ]
names(data) <- names(input)
# ... Except now it conserves your original value names!
row.names(data) <- elements
return(data)
}
# Binary table with colnames:
annotated_TFs<- fromList(histotype_list)
annotated_TFs$GENE <- rownames(annotated_TFs)
annotated_TFs$binary <- paste(annotated_TFs$CCOC,
annotated_TFs$EnOC,
annotated_TFs$HGSOC,
annotated_TFs$LGSOC,
annotated_TFs$MOC)
annotated_TFs$anno <- ifelse(annotated_TFs$binary %in% "1 0 0 0 0",'CCOC Specific',
ifelse(annotated_TFs$binary %in% "0 1 0 0 0",'EnOC Specific',
ifelse(annotated_TFs$binary %in% "0 0 1 0 0",'HGSOC Specific',
ifelse(annotated_TFs$binary %in% "0 0 0 1 0",'LGSOC Specific',
ifelse(annotated_TFs$binary %in% "0 0 0 0 1",'MOC Specific',
ifelse(annotated_TFs$binary %in% "1 1 1 1 1",'All Common','NA'))))))
annotated_TFs <- annotated_TFs %>% filter(!anno == 'NA')
annotated_TFs<- annotated_TFs[order(annotated_TFs$anno),]
legacy_tfs %>%
filter(FEATURE == 'chromMAGMA') %>%
filter(!GENE == 'NA') %>%
group_by(GWAS_TYPE) %>%
filter(GENE %in% annotated_TFs$GENE) %>%
mutate(NOM.p.val = replace(NOM.p.val, NOM.p.val == 0, 0.001)) %>% # Replacing values; GSEA's P-value is limited to <0.001
mutate(NEG_LOG10_Pvalue = -log10(NOM.p.val)) %>%
filter(!TIDY_GWAS_TYPE == 'Non-mucinous') %>%
dplyr::group_by(FEATURE, GWAS_TYPE, GENE) %>%
dplyr::mutate(min_Pvalue = min(NOM.p.val)) %>%
filter(NOM.p.val == min_Pvalue) %>%
{ggplot(.,aes(factor(GWAS_TYPE),GENE)) +
geom_point(aes(colour=NEG_LOG10_Pvalue,size= FDR.q.val)) +
scale_colour_gradient(low="blue", high="red", n.breaks = 4) +
scale_size_continuous(limits = c(0, 1.5), range = c(7,0)) +
theme_bw() +
theme(axis.text=element_text(size=7),
axis.title=element_text(size=7,face="bold")) +
theme(axis.text.x = element_text(size = 7, angle = 90),
axis.title.x=element_blank(),
legend.title = element_blank()) +
scale_y_discrete(limits = annotated_TFs$GENE) }