In the following workflow, we describe the main procedures to analyse the enrichment of differentially expressed genes (DEGs) close to experimentaly peaks, derived from Cistrome, for some interesting Trascription Factor (TF).
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(GenomicRanges)
The buildTable
function is used to select all genes close to 50000 bp of a given peak coordinates from all Cistrome peaks file. We used the distance
function from Genomic Ranges
package.
buildTable <- function(pTF, pInfo.GR, pDados.RNAseq.GR) {
max_len = 50000
table.all.genes = NULL
for (i in 1: length(info.GR)) {
dist = distance(pInfo.GR[i], pDados.RNAseq.GR)
info.gene = NULL
list = which(dist > 0 & dist <= max_len)
if (length(list) > 0 ) {
info = data.frame(tf = pTF, peak=i, distance=dist[list], bin=max_len, gene = pDados.RNAseq.GR$symbol[list])
info.gene = rbind(info.gene, info)
}
table.all.genes = rbind(table.all.genes, info.gene)
}
return(table.all.genes)
}
The getRandomGR
function is used to generate random peaks following the same chromosome proportion and size of the original cistrome peaks files.
getRandomGR <- function(gr){
seqnames <- factor(x = paste0('chr',c(1:22,'X', 'Y')), levels = paste0('chr',c(1:22,'X', 'Y')))
hg19.len <- read.table(file = 'files/hg19.len')
hg19.len <- hg19.len[hg19.len$V1 %in% seqnames,]
hg19.len$V1 <- factor(x = hg19.len$V1, levels = seqnames)
hg19.len <- hg19.len[order(hg19.len$V1),]
.seqnames <- factor(as.character(seqnames(gr)), levels = levels(seqnames))
gr.random <- foreach(i=1:length(seqnames), .combine = c) %do% {
.n <- sum(.seqnames == seqnames[i])
.max <- hg19.len$V2[i]
.start <- round(x = runif(n = .n, min = 1, max = .max), digits = 0)
.width <- width(gr[.seqnames == seqnames[i]])
.gr <- GRanges(seqnames = seqnames[i], ranges = IRanges(start = .start, width = .width))
return(.gr)
}
return(gr.random)
}
The differentially expressed genes are organized in a data frame. All genes are loaded and only genes up and down regulated are selected in the analysis.
load("RData/scenario-NOSExHGSOC-394-TCGA-all-genes.RData")
table(NOSExHGSOC$STATUS)
##
## Down NOT.SIG Up
## 243 20509 176
dados.RNAseq <- NOSExHGSOC[which(NOSExHGSOC$STATUS %in% c("Down","Up")),c(1:584)]
scenario = "OxH"
176-p6 | 179-Lp6 | 181-Lp7 | 185-p5 | 189-Lp6 | 191-Lp5 | 192-Rp4 | 197-Lp6 | 1mR-p4 | 200-p6 | |
---|---|---|---|---|---|---|---|---|---|---|
TMEM88B | -2.9695494 | -2.9695494 | -2.9695494 | -2.9695494 | -2.9695494 | -2.9695494 | -2.9695494 | -2.9695494 | -2.9695494 | -2.9695494 |
GABRD | -2.9953130 | -2.9953130 | -2.9953130 | -2.9953130 | -2.9953130 | -2.9953130 | -2.9953130 | -2.9953130 | -2.9953130 | 2.3529824 |
RNU5E-1 | 1.7090026 | 1.5927555 | 2.2206903 | 0.7731689 | 2.3052644 | 0.7503659 | 1.0112272 | -0.3709081 | 1.8898249 | 1.1523578 |
IGSF21 | -2.3670616 | -2.3670616 | -2.3670616 | -2.3670616 | -2.3670616 | -2.3670616 | -2.3670616 | -2.3670616 | -2.3670616 | -2.3670616 |
C1QA | 0.3977825 | 0.3977825 | 0.3977825 | 0.3977825 | 0.3977825 | 0.3977825 | 0.3977825 | 0.3977825 | 0.3977825 | 0.3977825 |
We selected the genomic annotation, regarding chromosome name, start, end coordinates for human hg19 build. After the annation is merged with our differentially expressed genes.
dados.RNAseq <- as.data.frame(dados.RNAseq)
txdb_hg19 <- TxDb.Hsapiens.UCSC.hg19.knownGene
genes <- genes(txdb_hg19)
symbols <- unlist(mapIds(org.Hs.eg.db, genes$gene_id, "SYMBOL", "ENTREZID", multiVals = "first"))
## 'select()' returned 1:1 mapping between keys and columns
genes <- as.data.frame(genes)
genes$symbol <- as.matrix(symbols)
genes <- genes[!(duplicated(genes$symbol)),]
head(genes)
dados.RNAseq <- merge(genes,dados.RNAseq, by.x="symbol",by.y="row.names")
Convert gene annotation into a Genomic Ranges object dados.RNAseq.GR
.
dados.RNAseq.GR <- makeGRangesFromDataFrame(dados.RNAseq[,1:7],TRUE)
Define the number of interaction for random peaks.
ninter = 10
all.rd.c.T = NULL
In this step we can select a cistrome peaks file for a given TF. We filter out non-canonical chromosomes. The information are converted to Genomic Ranges
object.
TF = "ASCL2"
cistrome.info <- read.delim("cistrome/all.ASCL2.peaks.bed", comment.char = "#", header = F)
colnames(cistrome.info) <- c("chr", "start", "end", "name", "score")
cistrome.info <- cistrome.info[which(cistrome.info$chr %in% c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY")),]
cistrome.info <- droplevels(cistrome.info)
cistrome.info$width <- cistrome.info$end - cistrome.info$start
info.GR <- makeGRangesFromDataFrame(cistrome.info, ignore.strand=TRUE)
The steps bellow call both auxiliary function.
table.TF = buildTable(TF, info.GR, dados.RNAseq.GR)
if (is.null(table.TF)) {
table.TF = data.frame(TF=TF, Cistrome=0)
}
for (inter in 1: ninter) {
#print(paste0("Interaction: ", inter))
rd.peaks = getRandomGR(info.GR)
temp_rd = buildTable(TF, rd.peaks, dados.RNAseq.GR)
if (!is.null(temp_rd)) {
all.rd.c.T = rbind(all.rd.c.T, data.frame(Var1=TF, Freq=dim(temp_rd)[1]))
}else {
all.rd.c.T = rbind(all.rd.c.T, data.frame(Var1=TF, Freq=0))
}
}
all.rd.c.T$Source = "Random"
head(all.rd.c.T)
## Var1 Freq Source
## 1 ASCL2 65 Random
## 2 ASCL2 84 Random
## 3 ASCL2 76 Random
## 4 ASCL2 80 Random
## 5 ASCL2 49 Random
## 6 ASCL2 67 Random
head(table.TF)
## tf peak distance bin gene
## 1 ASCL2 7 22135 50000 TMEM88B
## 2 ASCL2 94 29372 50000 ZNF683
## 3 ASCL2 95 49566 50000 ZNF683
## 4 ASCL2 106 34221 50000 IFI6
## 5 ASCL2 107 6779 50000 IFI6
## 6 ASCL2 126 34188 50000 ZBTB8B
counts.TF <- as.data.frame(table(table.TF$tf))
counts.TF$Source = "Cistrome"
table.all <- rbind(counts.TF, all.rd.c.T)
colnames(table.all) <- c("TF", "Freq", "Source")
head(table.all)
## TF Freq Source
## 1 ASCL2 128 Cistrome
## 2 ASCL2 65 Random
## 3 ASCL2 84 Random
## 4 ASCL2 76 Random
## 5 ASCL2 80 Random
## 6 ASCL2 49 Random
fill <- "#4271AE"
line <- "#1F3552"
p <- ggplot(table.all, aes(x=TF, y=Freq)) +
ggtitle(paste0("Gene list - OSEC vs. HGSOC - 243 DOWN 176 UP"))+xlab("Transcription Factors")+ylab("Number of matched peaks") +
geom_boxplot(outlier.shape = NA, fill = fill, colour = line, alpha = 0.7) +
scale_color_manual(values=c("#4271AE", "orange")) +
geom_jitter(shape=16, alpha=0.7, aes(colour=Source)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
p
We demonstrated how to analyse the enrichment of DEG close to TF using Cistrome data. The same procedures can be now applied to other DEG scenarios or different TFs by changuing Part 1 and Part 3. To increase the number of random interaction change Part 2.