In the following workflow, we describe the main procedures to analyse the gene expression profile of associated genes to superenghancers (SE) genomic regions.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(GenomicRanges)
Obtaining only distinct peaks from a given Genomic Ranges
Object
distinctGR <- function(objectGR) {
adj_hgsonc_union = objectGR
index = 1
final = length(adj_hgsonc_union)
distinct_table = NULL
while (index <= final) {
res = as.data.frame(findOverlaps(adj_hgsonc_union[index], adj_hgsonc_union))
n = dim(res)[1]
if (n > 1) {
adj_hgsonc_union = adj_hgsonc_union[-res$subjectHits[2:n]]
}
distinct_table = rbind(distinct_table, as.data.frame(adj_hgsonc_union[res$subjectHits[1]]))
index = index + 1
final = length(adj_hgsonc_union)
}
return(distinct_table)
}
Obtaining only distinct peaks by comparing two Genomic Ranges
objects
distinctGRPairs <- function(query, subject) {
index = 1
final = length(query)
distinct_table = NULL
while (index <= final) {
res = as.data.frame(findOverlaps(query[index], subject))
n = dim(res)[1]
if (n < 1) {
distinct_table = rbind(distinct_table, as.data.frame(query[index]))
}
index = index + 1
}
return(distinct_table)
}
The data is organized in a data frame with 387 samples (190 normals and 197 TCGA) as columns and 20928 unique exprerssed genes as rows, named as combat_alln_mean
. Sample annotation is organized in another data frame NormalSamplesTable
. The TCGA Sample annotation is also added in Normals sample table annotation as follow.
load("RData/190Normal-394TCGA-combatGCnorm_all_data.rda")
176-p6 | 179-Lp6 | 181-Lp7 | 185-p5 | 189-Lp6 | 191-Lp5 | 192-Rp4 | 197-Lp6 | 1mR-p4 | 200-p6 | |
---|---|---|---|---|---|---|---|---|---|---|
WASH7P | 4.466708 | 3.854710 | -1.635562 | -1.635562 | -1.635562 | -1.635562 | -1.635562 | -1.635562 | -1.635562 | -1.635562 |
OR4F5 | -5.317438 | -5.317438 | -5.317438 | -5.317438 | -5.317438 | -5.317438 | -5.317438 | -5.317438 | -5.317438 | -5.317438 |
OR4F29 | -5.276078 | -5.276078 | -5.276078 | -5.276078 | -5.276078 | -5.276078 | -5.276078 | -5.276078 | -5.276078 | -5.276078 |
OR4F16 | -5.308478 | -5.308478 | -5.308478 | -5.308478 | -5.308478 | -5.308478 | -5.308478 | -5.308478 | -5.308478 | -5.308478 |
LINC00115 | -3.934121 | 3.119557 | 1.581284 | -3.934121 | -3.934121 | 4.701893 | 2.901779 | 2.330278 | 2.346626 | 1.414174 |
load("RData/NormalsSamplesTableV3.RData")
head(NormalSamplesTable)
## Samples CellType Colors SP Batch
## 1 176_p6 NOSE #377EB8 NO B1
## 2 179_Lp6 NOSE #377EB8 NO B1
## 3 181_Lp7 NOSE #377EB8 NO B1
## 4 185_p5 NOSE #377EB8 NO B1
## 5 189_Lp6 NOSE #377EB8 NO B1
## 6 191_Lp5 NOSE #377EB8 NO B1
nTCGAsamples = 394
TCGAsamplesTable = data.frame(Samples = colnames(dados.RNAseq)[191:584], CellType = c(rep("HGSOC", nTCGAsamples)), Colors=c(rep("green", nTCGAsamples)), SP="None", Batch="None" )
SamplesTable = rbind(NormalSamplesTable, TCGAsamplesTable)
We selected the genomic annotation, regarding chromosome name, start, end coordinates for human hg19 build. After the annation is merged with our expressed genes.
dados.RNAseq <- as.data.frame(dados.RNAseq)
txdb_hg19 <- TxDb.Hsapiens.UCSC.hg19.knownGene
genes <- genes(txdb_hg19)
symbols <- unlist(mapIds(org.Hs.eg.db, genes$gene_id, "SYMBOL", "ENTREZID", multiVals = "first"))
## 'select()' returned 1:1 mapping between keys and columns
genes <- as.data.frame(genes)
genes$symbol <- as.matrix(symbols)
genes <- genes[!(duplicated(genes$symbol)),]
head(genes)
dados.RNAseq <- merge(genes,dados.RNAseq, by.x="symbol",by.y="row.names")
Convert gene annotation into a Genomic Ranges object dados.RNAseq.GR
. The merged data.frame also is splited into annotation dados.RNAseq.DF
and expression information dados.RNAseq.f
. After, the expression values are scaled as follows.
dados.RNAseq.GR <- makeGRangesFromDataFrame(dados.RNAseq[,1:7],TRUE)
dados.RNAseq.DF <- dados.RNAseq[,1:7]
dados.RNAseq.f <- dados.RNAseq[,c(8:591)]
dados.RNAseq.scaled <- t(scale(t(dados.RNAseq.f)))
rownames(dados.RNAseq.scaled) <- dados.RNAseq.DF$symbol
In this step we will processing the superenhancer bed files. First, the data is loaded and converted to Genomic Ranges Object. The experiments are combined in a single object. Then, using the aulixiliary function distinctGR
, we selected only those regions that are unique considering the entire combined set. The same procedures are applied to OSEC, FTSEC and HGSOC superenhancers.
########### OSEC #################
ose4 = read.table("se/IOSE4_ResultCount_C30H4ACXX_1_SHE1405A185.male.hg19.fa.nodup.superEnhancers.bed")
colnames(ose4) = c("chrom", "start", "end", "name", "score", "strand")
ose4 = makeGRangesFromDataFrame(ose4)
ose11 = read.table("se/IOSE11_ResultCount_D1AHBACXX_8_SHE1405A43.male.hg19.fa.nodup.superEnhancers.bed")
colnames(ose11) = c("chrom", "start", "end", "name", "score", "strand")
ose11 = makeGRangesFromDataFrame(ose11)
ose4.df <- as.data.frame(ose4)
ose11.df <- as.data.frame(ose11)
nose_total <- rbind(ose4.df, ose11.df)
nose_total <- makeGRangesFromDataFrame(nose_total)
distinct_table = distinctGR(nose_total)
nose_union = makeGRangesFromDataFrame(distinct_table)
########### FTSEC #################
ft33 = read.table("se/FT33_ResultCount_C26CUACXX_2_SHE1405A78.male.hg19.fa.nodup.superEnhancers.bed")
colnames(ft33) = c("chrom", "start", "end", "name", "score", "strand")
ft33 = makeGRangesFromDataFrame(ft33)
ft246 = read.table("se/FT246_ResultCount_C269LACXX_5_SHE1405A86.male.hg19.fa.nodup.superEnhancers.bed")
colnames(ft246) = c("chrom", "start", "end", "name", "score", "strand")
ft246 = makeGRangesFromDataFrame(ft246)
ft33.df <- as.data.frame(ft33)
ft246.df <- as.data.frame(ft246)
nfte_total = rbind(ft33.df, ft246.df)
nfte_total = makeGRangesFromDataFrame(nfte_total)
distinct_table = distinctGR(nfte_total)
nfte_union = makeGRangesFromDataFrame(distinct_table)
########### HGSOC #################
hgs229 = read.table("se/HGSerous_229_IP.nodup.superEnhancers.bed")
colnames(hgs229) = c("chrom", "start", "end", "name", "score", "strand")
hgs229 = makeGRangesFromDataFrame(hgs229)
hgs429 = read.table("se/HGSerous_429_IP.nodup.superEnhancers.bed")
colnames(hgs429) = c("chrom", "start", "end", "name", "score", "strand")
hgs429 = makeGRangesFromDataFrame(hgs429)
hgs561 = read.table("se/HGSerous_561_IP.nodup.superEnhancers.bed")
colnames(hgs561) = c("chrom", "start", "end", "name", "score", "strand")
hgs561 = makeGRangesFromDataFrame(hgs561)
hgs550 = read.table("se/HG_Serous_550_IP.nodup.superEnhancers.bed")
colnames(hgs550) = c("chrom", "start", "end", "name", "score", "strand")
hgs550 = makeGRangesFromDataFrame(hgs550)
hgs270 = read.table("se/HGSerous_270_IP.nodup.superEnhancers.bed")
colnames(hgs270) = c("chrom", "start", "end", "name", "score", "strand")
hgs270 = makeGRangesFromDataFrame(hgs270)
hgs229.df <- as.data.frame(hgs229)
hgs429.df <- as.data.frame(hgs429)
hgs561.df <- as.data.frame(hgs561)
hgs550.df <- as.data.frame(hgs550)
hgs270.df <- as.data.frame(hgs270)
hgsoc_total <- rbind(hgs229.df, hgs429.df, hgs561.df, hgs550.df, hgs270.df)
hgsoc_total = makeGRangesFromDataFrame(hgsoc_total)
distinct_table = distinctGR(hgsoc_total)
hgsoc_union = makeGRangesFromDataFrame(distinct_table)
In this step the SE are cross compared in order to obtain only distict ones per cell type. The auxiliary function distinctGRPairs
is used.
subj = makeGRangesFromDataFrame(rbind(as.data.frame(hgsoc_union), as.data.frame(nfte_union)))
tempN = distinctGRPairs(nose_union, subj)
nose_distinct = makeGRangesFromDataFrame(tempN)
subj = makeGRangesFromDataFrame(rbind(as.data.frame(hgsoc_union), as.data.frame(nose_union)))
tempF = distinctGRPairs(nfte_union, subj)
nfte_distinct = makeGRangesFromDataFrame(tempF)
subj = makeGRangesFromDataFrame(rbind(as.data.frame(nose_union), as.data.frame(nfte_union)))
tempR = distinctGRPairs(hgsoc_union, subj)
hgsoc_distinct = makeGRangesFromDataFrame(tempR)
Identify expression profile considering all samples into 3 groups: OSEC - 119 samples FTSEC - 71 samples HGSOC - 394 samples
We also defined a step size of 5000 bp and a window size of 1000000 bp.
OSECLines <- which(SamplesTable$CellType == "NOSE")
FTSECLines <- which(SamplesTable$CellType == "NFTE")
HGSOCLines <- which(SamplesTable$CellType == "HGSOC")
step = 5000
window = 1000000
range.neigh = seq(-window, window, step)
Obtaining the gene expression profile across the 3 cell types considering OSEC specific enhancers.
enhancer = nose_distinct
enh_name = "OSEC_enh_"
names <- paste0("OSEC_enh_", seq(1, length(enhancer)))
Now we can begin to find the genes and their correspondent expression in all 3 cell types with the following iteraction loop.
#####################
e.index = 1
genes.range.all = NULL
while (e.index <= length(enhancer)) {
#print(paste0("Enhancer ", e.index))
e.info = enhancer[e.index]
mid.pos = round(start(e.info) + ((end(e.info) - start(e.info)) / 2))
s.pos = mid.pos - window
e.pos = mid.pos + window
chr.pos = as.character(seqnames(e.info))
new.genes.range = seq(s.pos, e.pos, step)
new.enh.range.GR = GRanges(seqnames=chr.pos, ranges=IRanges(s.pos, e.pos))
res = findOverlaps(new.enh.range.GR, dados.RNAseq.GR, type = "any", select = "all", ignore.strand=TRUE)
if (length(res) > 0) {
genes.range = dados.RNAseq.DF[subjectHits(res),]
OSEC.mean = apply(dados.RNAseq.f[subjectHits(res),OSECLines],1,mean,na.rm=T)
FTSEC.mean = apply(dados.RNAseq.f[subjectHits(res),FTSECLines],1,mean,na.rm=T)
HGSOC.mean = apply(dados.RNAseq.f[subjectHits(res),HGSOCLines],1,mean,na.rm=T)
genes.range = cbind(Enh = paste0(enh_name, e.index), genes.range, Mean.OSEC=OSEC.mean, Mean.FTSEC=FTSEC.mean, Mean.HGSOC=HGSOC.mean)
for (gi in 1:nrow(genes.range)) {
gene = genes.range[gi,]
index = 1
for (value in new.genes.range) {
if (value > gene$start) {
break
}
index = index + 1
}
genes.range$pos[gi] = range.neigh[index]
} # end for
genes.range.all = rbind(genes.range.all, genes.range)
}
e.index = e.index + 1
}
#################
Calculate the z-score for each cell type gene list.
info.plot.nose = data.frame(Position = range.neigh, Type = "OSEC", Value=NA)
info.plot.ftsec = data.frame(Position = range.neigh, Type = "FTSEC", Value=NA)
info.plot.hgsoc = data.frame(Position = range.neigh, Type = "HGSOC", Value=NA)
index.pos = 1
for ( pos in range.neigh ) {
if (pos < 0) {
sub.range = seq(pos, 0, step)
} else if (pos > 0) {
sub.range = seq(0, pos, step)
}
rows.m <- which(genes.range.all$pos %in% sub.range)
enh.name <- unique(genes.range.all$Enh[rows.m])
enh.index <- which(names %in% enh.name)
genes.name = unique(genes.range.all$symbol[rows.m])
info.plot.nose$Value[index.pos] <- mean(dados.RNAseq.scaled[genes.name, OSECLines])
info.plot.ftsec$Value[index.pos] <- mean(dados.RNAseq.scaled[genes.name, FTSECLines])
info.plot.hgsoc$Value[index.pos] <- mean(dados.RNAseq.scaled[genes.name, HGSOCLines])
index.pos = index.pos + 1
}
##########################
Plotting the final result
nose.info.plot.all = rbind(info.plot.nose, info.plot.ftsec, info.plot.hgsoc)
p_osec = ggplot(data=nose.info.plot.all, aes(x=Position, y=Value, colour = Type)) +
labs(title = paste0("OSEC-specific SEs"), y = "Expression z-score", x = "Genomic positions") +
theme(plot.title = element_text(size=12)) +
scale_color_manual(values=c("#3333ffff", "#ff3333ff", "#33ff33ff" )) +
geom_line()
p_osec
We demonstrated how to obtain the expression profile of genes associated to SE genomic regions. The same procedures can be now applied to FTSEC and HGSOC specific SE by changuing Part 2 and running again Part 3 and 4.