Robbin Nameki, Anamay Shetty et al. August 2021
The following code is for further cleanup/tidying of data from 01, followed by a crucial step to assign the lowest P-value to each gene for the generation of gene-level results.
library(tidyverse)
library(reshape2)
source('Rscripts/Utils.R')
## Warning: package 'valr' was built under R version 4.0.5
This chunk describes cleaning and tidying data from 01
#Reading_Full_chromMAGMA and MAGMA table with tidied names
full_stat <- read_final_gene_results_wEnsembl()
#taking out gene synonyms on different chromosomes
full_stat <- full_stat %>%
filter(CHR == chromosome_name)
#Selecting for only protein coding genes
full_stat_prot <- full_stat[complete.cases(full_stat),]
#Selecting Columns and tidying names of FEATURE & GWAS_DATA; to keep consistent language with manuscript.
full_stat_prot.gene_id_tidycols <- full_stat_prot %>%
dplyr::select(ensembl_gene_id,external_gene_name,FEATURE,GWAS_TYPE,originalfeature, CHR,START,STOP,NSNPS,NPARAM,N,ZSTAT,P,chromosome_name,start_position,end_position)
full_stat_prot.gene_id_tidycols <- full_stat_prot.gene_id_tidycols %>%
filter(!grepl('snp_to_individual_exons', FEATURE)) %>%
dplyr::mutate_if(is.character,
stringr::str_replace_all, pattern = "snp_to_subsetted_enhancers", replacement = "chromMAGMA")%>%
dplyr::mutate_if(is.character,
stringr::str_replace_all, pattern = "snp_to_genes", replacement = "MAGMA")%>%
dplyr::mutate_if(is.character,
stringr::str_replace_all, pattern = "all_non_mucinous", replacement = "NMOC") %>%
dplyr::mutate_if(is.character,
stringr::str_replace_all, pattern = "clearcell", replacement = "CCOC") %>%
dplyr::mutate_if(is.character,
stringr::str_replace_all, pattern = "endometrioid", replacement = "EnOC") %>%
dplyr::mutate_if(is.character,
stringr::str_replace_all, pattern = "mucinous_all", replacement = "MOC")%>%
dplyr::mutate_if(is.character,
stringr::str_replace_all, pattern = "ser_lg_lmp", replacement = "LGSOC")%>%
dplyr::mutate_if(is.character,
stringr::str_replace_all, pattern = "serous_hg_extra", replacement = "HGSOC") %>%
dplyr::rename(originalfeature_CHR = CHR) %>%
dplyr::rename(originalfeature_START = START) %>%
dplyr::rename(originalfeature_STOP = STOP) %>%
dplyr::rename(gene_body_chromosome_name = chromosome_name) %>%
dplyr::rename(gene_body_start_position = start_position) %>%
dplyr::rename(gene_body_end_position = end_position)
head(full_stat_prot.gene_id_tidycols)
## ensembl_gene_id external_gene_name FEATURE GWAS_TYPE originalfeature
## 1 ENSG00000000003 TSPAN6 chromMAGMA MOC X:99890397-99892572
## 2 ENSG00000000003 TSPAN6 chromMAGMA NMOC X:99897554-99906464
## 3 ENSG00000000003 TSPAN6 MAGMA LGSOC TSPAN6
## 4 ENSG00000000003 TSPAN6 MAGMA NMOC TSPAN6
## 5 ENSG00000000003 TSPAN6 MAGMA EnOC TSPAN6
## 6 ENSG00000000003 TSPAN6 chromMAGMA NMOC X:99890397-99892572
## originalfeature_CHR originalfeature_START originalfeature_STOP NSNPS NPARAM
## 1 X 99890397 99892572 4 2
## 2 X 99897554 99906464 27 8
## 3 X 99882105 99892101 24 9
## 4 X 99882105 99892101 24 9
## 5 X 99882105 99892101 24 9
## 6 X 99890397 99892572 4 2
## N ZSTAT P gene_body_chromosome_name gene_body_start_position
## 1 108197 -0.6865800 0.75383 X 99883667
## 2 129118 -0.4251800 0.66465 X 99883667
## 3 103765 -1.7218000 0.95744 X 99883667
## 4 129118 0.2937900 0.38446 X 99883667
## 5 107853 -0.0039745 0.50159 X 99883667
## 6 129118 0.1910400 0.42425 X 99883667
## gene_body_end_position
## 1 99894988
## 2 99894988
## 3 99894988
## 4 99894988
## 5 99894988
## 6 99894988
Since multiple enhancers can be assigned to genes, the most significant P-value is assigned to genes to generate gene-level outputs for MAGMA and chromMAGMA.
full_stat_prot.gene_id_tidycols_maximum <- full_stat_prot.gene_id_tidycols %>%
dplyr::group_by(FEATURE, GWAS_TYPE, external_gene_name) %>%
dplyr::mutate(min_Pvalue = min(P)) %>%
filter(P == min_Pvalue) %>%
#genehancer has duplicates of the same genes with different names so taking out dups
filter(!duplicated(external_gene_name))
#ranking by the NEG_LOG10P
full_stat_prot.gene_id_tidycols_maximum_arranged <- full_stat_prot.gene_id_tidycols_maximum %>%
dplyr::group_by(FEATURE, GWAS_TYPE) %>%
mutate(NEG_LOG10P = -log10(min_Pvalue)) %>%
dplyr::group_by(FEATURE, GWAS_TYPE) %>%
dplyr::arrange(desc(NEG_LOG10P),.by_group = TRUE)
#Cleaned table consisting of pcoding genes with most significant enhancer assigned
head(full_stat_prot.gene_id_tidycols_maximum_arranged)
## # A tibble: 6 x 18
## # Groups: FEATURE, GWAS_TYPE [1]
## ensembl_gene_id external_gene_n~ FEATURE GWAS_TYPE originalfeature
## <chr> <chr> <chr> <chr> <chr>
## 1 ENSG00000108753 HNF1B chromM~ CCOC 17:36098905-36~
## 2 ENSG00000132142 ACACA chromM~ CCOC 17:36100871-36~
## 3 ENSG00000141141 DDX52 chromM~ CCOC 17:36100871-36~
## 4 ENSG00000115008 IL1A chromM~ CCOC 2:113539251-11~
## 5 ENSG00000125538 IL1B chromM~ CCOC 2:113539251-11~
## 6 ENSG00000125571 IL37 chromM~ CCOC 2:113539251-11~
## # ... with 13 more variables: originalfeature_CHR <chr>,
## # originalfeature_START <int>, originalfeature_STOP <int>, NSNPS <int>,
## # NPARAM <int>, N <int>, ZSTAT <dbl>, P <dbl>,
## # gene_body_chromosome_name <chr>, gene_body_start_position <int>,
## # gene_body_end_position <int>, min_Pvalue <dbl>, NEG_LOG10P <dbl>
## Cleaned table consisting of pcoding genes with most significant enhancer assigned
#saveRDS(x = full_stat_prot.gene_id_tidycols_maximum_arranged, file = 'Data/Gene_Level.4.9.21.rds')