Robbin Nameki, Anamay Shetty et al. August 2021

Introduction

The following workflow describes the process of curating chromMAGMA and MAGMA outputs to the gene identifiers EnsemblgeneIDs available from biomaRt

Libraries

library(biomaRt)
library(knitr)

Sourcecode

source('Rscripts/Utils.R')

Curating chromMAGMA and MAGMA gene identifiers

Gene identifiers ID'd as 'ensembl_gene_id', 'external_gene_name','external_synonym','hgnc_symbol', 'entrez_gene_id', and 'uniprot_gn_symbol' from BiomaRt are converted to EnsemblGeneIDs. The gene list is also limited here to protein coding genes.

#Cleaning chromMAGMA Outputs. 

.ensembl <- useEnsembl_GRCh37()

x <- read_final_gene_results()

#x$GENE[x$FEATURE == 'snp_to_genes'] <- x$originalfeature[x$FEATURE == 'snp_to_genes']

genes <- unique(sort(x$GENE))

### ensembl_gene_id
i <- startsWith(x = genes, prefix = 'ENSG') & nchar(genes) == 15
df.1 <- data.frame(GENE = genes[i], ensembl_gene_id = genes[i])
genes <- genes[!i]

### external_gene_name
bm <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$external_gene_name),]
ensembl_gene_id <- lapply(genes, function(g){
  return(bm$ensembl_gene_id[bm$external_gene_name == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.2 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]

### external_synonym
bm <- getBM(attributes = c('ensembl_gene_id', 'external_synonym', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$external_synonym),]
ensembl_gene_id <- lapply(genes, function(g){
  return(bm$ensembl_gene_id[bm$external_synonym == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.3 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]

### hgnc_symbol
bm <- getBM(attributes = c('ensembl_gene_id', 'hgnc_symbol', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$hgnc_symbol),]
ensembl_gene_id <- lapply(genes, function(g){
  return(bm$ensembl_gene_id[bm$hgnc_symbol == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.4 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]

### entrezgene_id
bm <- getBM(attributes = c('ensembl_gene_id', 'entrezgene_id', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$entrezgene_id),]
ensembl_gene_id <- lapply(genes, function(g){
  return(bm$ensembl_gene_id[paste0('LOC',bm$entrezgene_id) == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.5 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]

### uniprot_gn_symbol
bm <- getBM(attributes = c('ensembl_gene_id', 'uniprot_gn_symbol', 'gene_biotype'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
bm <- bm[!is.na(bm$uniprot_gn_symbol),]
ensembl_gene_id <- lapply(genes, function(g){
  return(bm$ensembl_gene_id[bm$uniprot_gn_symbol == g])
})
i <- lapply(ensembl_gene_id, length) == 1
df.6 <- data.frame(GENE = genes[i], ensembl_gene_id = unlist(ensembl_gene_id[i]))
genes <- genes[!i]

df <- rbind(df.1, df.2, df.3, df.4, df.5, df.6)
x <- merge(x = x, y = df, by = 'GENE', all.x = T)

bm <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype','chromosome_name','start_position','end_position'), mart = .ensembl, filters = 'chromosome_name', values = c(1:22,'X')); bm <- bm[bm$gene_biotype == 'protein_coding',]
x <- merge(x = x, y = bm, by = 'ensembl_gene_id', all.x = T)

str(x)
## 'data.frame':    3673812 obs. of  19 variables:
##  $ ensembl_gene_id   : chr  "ENSG00000000003" "ENSG00000000003" "ENSG00000000003" "ENSG00000000003" ...
##  $ GENE              : chr  "TSPAN6" "TSPAN6" "TSPAN6" "TSPAN6" ...
##  $ FEATURE           : chr  "snp_to_genes" "snp_to_subsetted_enhancers" "snp_to_subsetted_enhancers" "snp_to_subsetted_enhancers" ...
##  $ GWAS_TYPE         : chr  "endometrioid" "all_non_mucinous" "ser_lg_lmp" "clearcell" ...
##  $ originalfeature   : chr  "TSPAN6" "X:99890397-99892572" "X:99890397-99892572" "X:99890397-99892572" ...
##  $ CHR               : chr  "X" "X" "X" "X" ...
##  $ START             : int  99882105 99890397 99890397 99890397 99890555 99882106 99882105 99882106 99882106 99897554 ...
##  $ STOP              : int  99892101 99892572 99892572 99892572 99890743 99884983 99892101 99884983 99884983 99906464 ...
##  $ NSNPS             : int  24 4 4 4 1 7 24 7 7 27 ...
##  $ NPARAM            : int  9 2 2 2 1 3 9 3 3 8 ...
##  $ N                 : int  107853 129118 103765 106342 107853 106342 129118 129118 103765 106342 ...
##  $ ZSTAT             : num  -0.00397 0.19104 -1.1948 -0.27256 -1.6306 ...
##  $ P                 : num  0.502 0.424 0.884 0.607 0.949 ...
##  $ ZSTAT_v2          : num  0.672 0.7991 0.146 0.5138 0.0646 ...
##  $ external_gene_name: chr  "TSPAN6" "TSPAN6" "TSPAN6" "TSPAN6" ...
##  $ gene_biotype      : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
##  $ chromosome_name   : chr  "X" "X" "X" "X" ...
##  $ start_position    : int  99883667 99883667 99883667 99883667 99883667 99883667 99883667 99883667 99883667 99883667 ...
##  $ end_position      : int  99894988 99894988 99894988 99894988 99894988 99894988 99894988 99894988 99894988 99894988 ...

Saving gene-level results with EnsemblGeneIds

This RDS file will be used in further analysis

#Output with cleaned names
#saveRDS(x = x, file = 'Data/final_gene_results_v1.08_wEnsemblGeneId.4.2.21.rds')