Robbin Nameki, Anamay Shetty et al. August 2021
The following workflow describes the process of curating chromMAGMA and MAGMA outputs to the gene identifiers EnsemblgeneIDs available from biomaRt
library(biomaRt)
library(knitr)
source('Rscripts/Utils.R')
Gene identifiers ID'd as 'ensembl_gene_id', 'external_gene_name','external_synonym','hgnc_symbol', 'entrez_gene_id', and 'uniprot_gn_symbol' from BiomaRt are converted to EnsemblGeneIDs. The gene list is also limited here to protein coding genes.
#Cleaning chromMAGMA Outputs.
.ensembl <- useEnsembl_GRCh37()
x <- read_final_gene_results()
#x$GENE[x$FEATURE == 'snp_to_genes'] <- x$originalfeature[x$FEATURE == 'snp_to_genes']
genes <- unique(sort(x$GENE))
### ensembl_gene_id
i <- startsWith(x = genes, prefix = 'ENSG') & nchar(genes) == 15
df.1 <- data.frame(GENE = genes[i], ensembl_gene_id = genes[i])
genes <- genes[!i]
### external_gene_name
bm <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$external_gene_name),]
ensembl_gene_id <- lapply(genes, function(g){
return(bm$ensembl_gene_id[bm$external_gene_name == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.2 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]
### external_synonym
bm <- getBM(attributes = c('ensembl_gene_id', 'external_synonym', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$external_synonym),]
ensembl_gene_id <- lapply(genes, function(g){
return(bm$ensembl_gene_id[bm$external_synonym == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.3 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]
### hgnc_symbol
bm <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$hgnc_symbol),]
ensembl_gene_id <- lapply(genes, function(g){
return(bm$ensembl_gene_id[bm$hgnc_symbol == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.4 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]
### entrezgene_id
bm <- getBM(attributes = c('ensembl_gene_id', 'entrezgene_id', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$entrezgene_id),]
ensembl_gene_id <- lapply(genes, function(g){
return(bm$ensembl_gene_id[paste0('LOC',bm$entrezgene_id) == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.5 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]
### uniprot_gn_symbol
bm <- getBM(attributes = c('ensembl_gene_id', 'uniprot_gn_symbol', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$uniprot_gn_symbol),]
ensembl_gene_id <- lapply(genes, function(g){
return(bm$ensembl_gene_id[bm$uniprot_gn_symbol == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.6 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]
df <- rbind(df.1, df.2, df.3, df.4, df.5, df.6)
x <- merge(x = x, y = df, by = 'GENE', all.x = T)
bm <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype','chromosome_name','start_position','end_position'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
x <- merge(x = x, y = bm, by = 'ensembl_gene_id', all.x = T)
str(x)
## 'data.frame': 3673812 obs. of 19 variables:
## $ ensembl_gene_id : chr "ENSG00000000003" "ENSG00000000003" "ENSG00000000003" "ENSG00000000003" ...
## $ GENE : chr "TSPAN6" "TSPAN6" "TSPAN6" "TSPAN6" ...
## $ FEATURE : chr "snp_to_genes" "snp_to_subsetted_enhancers" "snp_to_subsetted_enhancers" "snp_to_subsetted_enhancers" ...
## $ GWAS_TYPE : chr "endometrioid" "all_non_mucinous" "ser_lg_lmp" "clearcell" ...
## $ originalfeature : chr "TSPAN6" "X:99890397-99892572" "X:99890397-99892572" "X:99890397-99892572" ...
## $ CHR : chr "X" "X" "X" "X" ...
## $ START : int 99882105 99890397 99890397 99890397 99890555 99882106 99882105 99882106 99882106 99897554 ...
## $ STOP : int 99892101 99892572 99892572 99892572 99890743 99884983 99892101 99884983 99884983 99906464 ...
## $ NSNPS : int 24 4 4 4 1 7 24 7 7 27 ...
## $ NPARAM : int 9 2 2 2 1 3 9 3 3 8 ...
## $ N : int 107853 129118 103765 106342 107853 106342 129118 129118 103765 106342 ...
## $ ZSTAT : num -0.00397 0.19104 -1.1948 -0.27256 -1.6306 ...
## $ P : num 0.502 0.424 0.884 0.607 0.949 ...
## $ ZSTAT_v2 : num 0.672 0.7991 0.146 0.5138 0.0646 ...
## $ external_gene_name: chr "TSPAN6" "TSPAN6" "TSPAN6" "TSPAN6" ...
## $ gene_biotype : chr "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
## $ chromosome_name : chr "X" "X" "X" "X" ...
## $ start_position : int 99883667 99883667 99883667 99883667 99883667 99883667 99883667 99883667 99883667 99883667 ...
## $ end_position : int 99894988 99894988 99894988 99894988 99894988 99894988 99894988 99894988 99894988 99894988 ...
This RDS file will be used in further analysis
#Output with cleaned names
#saveRDS(x = x, file = 'Data/final_gene_results_v1.08_wEnsemblGeneId.4.2.21.rds')