Robbin Nameki, Anamay Shetty et al. August 2021

Introduction

The following code is for further cleanup/tidying of data from 01, followed by a crucial step to assign the lowest P-value to each gene for the generation of gene-level results.

Libraries

library(tidyverse)
library(reshape2)

Sourcecode

source('Rscripts/Utils.R')
## Warning: package 'valr' was built under R version 4.0.5

Tidying data from 01

This chunk describes cleaning and tidying data from 01

#Reading_Full_chromMAGMA and MAGMA table with tidied names
full_stat <- read_final_gene_results_wEnsembl()
#taking out gene synonyms on different chromosomes
full_stat <- full_stat %>%
  filter(CHR == chromosome_name)

#Selecting for only protein coding genes
full_stat_prot <- full_stat[complete.cases(full_stat),]

#Selecting Columns and tidying names of FEATURE & GWAS_DATA; to keep consistent language with manuscript.
full_stat_prot.gene_id_tidycols <- full_stat_prot %>%
  dplyr::select(ensembl_gene_id,external_gene_name,FEATURE,GWAS_TYPE,originalfeature, CHR,START,STOP,NSNPS,NPARAM,N,ZSTAT,P,chromosome_name,start_position,end_position)

full_stat_prot.gene_id_tidycols <- full_stat_prot.gene_id_tidycols %>% 
  filter(!grepl('snp_to_individual_exons', FEATURE)) %>%
  dplyr::mutate_if(is.character, 
                   stringr::str_replace_all, pattern = "snp_to_subsetted_enhancers", replacement = "chromMAGMA")%>%
  dplyr::mutate_if(is.character, 
                   stringr::str_replace_all, pattern = "snp_to_genes", replacement = "MAGMA")%>%
  dplyr::mutate_if(is.character, 
                   stringr::str_replace_all, pattern = "all_non_mucinous", replacement = "NMOC") %>%
  dplyr::mutate_if(is.character, 
                   stringr::str_replace_all, pattern = "clearcell", replacement = "CCOC") %>%
  dplyr::mutate_if(is.character, 
                   stringr::str_replace_all, pattern = "endometrioid", replacement = "EnOC") %>%
  dplyr::mutate_if(is.character, 
                   stringr::str_replace_all, pattern = "mucinous_all", replacement = "MOC")%>%
  dplyr::mutate_if(is.character, 
                   stringr::str_replace_all, pattern = "ser_lg_lmp", replacement = "LGSOC")%>%
  dplyr::mutate_if(is.character, 
                   stringr::str_replace_all, pattern = "serous_hg_extra", replacement = "HGSOC") %>%
  dplyr::rename(originalfeature_CHR = CHR) %>%
  dplyr::rename(originalfeature_START = START) %>%
  dplyr::rename(originalfeature_STOP = STOP) %>%
  dplyr::rename(gene_body_chromosome_name = chromosome_name) %>%
  dplyr::rename(gene_body_start_position = start_position) %>%
  dplyr::rename(gene_body_end_position = end_position)
head(full_stat_prot.gene_id_tidycols)
##   ensembl_gene_id external_gene_name    FEATURE GWAS_TYPE     originalfeature
## 1 ENSG00000000003             TSPAN6 chromMAGMA       MOC X:99890397-99892572
## 2 ENSG00000000003             TSPAN6 chromMAGMA      NMOC X:99897554-99906464
## 3 ENSG00000000003             TSPAN6      MAGMA     LGSOC              TSPAN6
## 4 ENSG00000000003             TSPAN6      MAGMA      NMOC              TSPAN6
## 5 ENSG00000000003             TSPAN6      MAGMA      EnOC              TSPAN6
## 6 ENSG00000000003             TSPAN6 chromMAGMA      NMOC X:99890397-99892572
##   originalfeature_CHR originalfeature_START originalfeature_STOP NSNPS NPARAM
## 1                   X              99890397             99892572     4      2
## 2                   X              99897554             99906464    27      8
## 3                   X              99882105             99892101    24      9
## 4                   X              99882105             99892101    24      9
## 5                   X              99882105             99892101    24      9
## 6                   X              99890397             99892572     4      2
##        N      ZSTAT       P gene_body_chromosome_name gene_body_start_position
## 1 108197 -0.6865800 0.75383                         X                 99883667
## 2 129118 -0.4251800 0.66465                         X                 99883667
## 3 103765 -1.7218000 0.95744                         X                 99883667
## 4 129118  0.2937900 0.38446                         X                 99883667
## 5 107853 -0.0039745 0.50159                         X                 99883667
## 6 129118  0.1910400 0.42425                         X                 99883667
##   gene_body_end_position
## 1               99894988
## 2               99894988
## 3               99894988
## 4               99894988
## 5               99894988
## 6               99894988

Assigning most significant P-value (enhancers) to genes

Since multiple enhancers can be assigned to genes, the most significant P-value is assigned to genes to generate gene-level outputs for MAGMA and chromMAGMA.

full_stat_prot.gene_id_tidycols_maximum <- full_stat_prot.gene_id_tidycols %>%
  dplyr::group_by(FEATURE, GWAS_TYPE, external_gene_name) %>%
  dplyr::mutate(min_Pvalue = min(P)) %>% 
  filter(P == min_Pvalue) %>%
  #genehancer has duplicates of the same genes with different names so taking out dups
  filter(!duplicated(external_gene_name))
#ranking by the NEG_LOG10P
full_stat_prot.gene_id_tidycols_maximum_arranged <- full_stat_prot.gene_id_tidycols_maximum %>%
  dplyr::group_by(FEATURE, GWAS_TYPE) %>%
  mutate(NEG_LOG10P = -log10(min_Pvalue)) %>%
  dplyr::group_by(FEATURE, GWAS_TYPE) %>%
  dplyr::arrange(desc(NEG_LOG10P),.by_group = TRUE) 

#Cleaned table consisting of pcoding genes with most significant enhancer assigned
head(full_stat_prot.gene_id_tidycols_maximum_arranged)
## # A tibble: 6 x 18
## # Groups:   FEATURE, GWAS_TYPE [1]
##   ensembl_gene_id external_gene_n~ FEATURE GWAS_TYPE originalfeature
##   <chr>           <chr>            <chr>   <chr>     <chr>          
## 1 ENSG00000108753 HNF1B            chromM~ CCOC      17:36098905-36~
## 2 ENSG00000132142 ACACA            chromM~ CCOC      17:36100871-36~
## 3 ENSG00000141141 DDX52            chromM~ CCOC      17:36100871-36~
## 4 ENSG00000115008 IL1A             chromM~ CCOC      2:113539251-11~
## 5 ENSG00000125538 IL1B             chromM~ CCOC      2:113539251-11~
## 6 ENSG00000125571 IL37             chromM~ CCOC      2:113539251-11~
## # ... with 13 more variables: originalfeature_CHR <chr>,
## #   originalfeature_START <int>, originalfeature_STOP <int>, NSNPS <int>,
## #   NPARAM <int>, N <int>, ZSTAT <dbl>, P <dbl>,
## #   gene_body_chromosome_name <chr>, gene_body_start_position <int>,
## #   gene_body_end_position <int>, min_Pvalue <dbl>, NEG_LOG10P <dbl>
## Cleaned table consisting of pcoding genes with most significant enhancer assigned
#saveRDS(x = full_stat_prot.gene_id_tidycols_maximum_arranged, file = 'Data/Gene_Level.4.9.21.rds')