bios25328
lecture
Author

Haky Im

Published

April 10, 2024

Lecture 8 notes link

Download the GWAS catalog

Code
suppressMessages(library(tidyverse))
suppressMessages(library(glue))
##PRE = "/Users/haekyungim/Library/CloudStorage/Box-Box/LargeFiles/imlab-data/data-Github/web-data"
PRE="~/Downloads/"
DATA = glue("{PRE}/2024-04-10-gwas-catalog-analysis-2022")
if(!file.exists(DATA)) system(glue("mkdir -p {DATA}"))
WORK=DATA

DOWNLOAD_DATE="2024-04-10"

## download data from GWAS catalog
## https://www.ebi.ac.uk/gwas/docs/file-downloads

# - [ ] make sure the DATAFILE name is the same as the one you just downloaded
# DATAFILE = "gwas_catalog_v1.0.2-associations_e105_r2022-04-07.tsv"
DATAFILE="full"
filename = glue("{DATA}/{DATAFILE}")

if(!file.exists(filename)) system(glue("wget -P {DATA} https://www.ebi.ac.uk/gwas/api/search/downloads/full"))
Code
gwascatalog = read_tsv(filename)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 589865 Columns: 34
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (23): FIRST AUTHOR, JOURNAL, LINK, STUDY, DISEASE/TRAIT, INITIAL SAMPLE...
dbl   (9): PUBMEDID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, MERGE...
date  (2): DATE ADDED TO CATALOG, DATE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
dim(gwascatalog)
[1] 589865     34
Code
names(gwascatalog)
 [1] "DATE ADDED TO CATALOG"      "PUBMEDID"                  
 [3] "FIRST AUTHOR"               "DATE"                      
 [5] "JOURNAL"                    "LINK"                      
 [7] "STUDY"                      "DISEASE/TRAIT"             
 [9] "INITIAL SAMPLE SIZE"        "REPLICATION SAMPLE SIZE"   
[11] "REGION"                     "CHR_ID"                    
[13] "CHR_POS"                    "REPORTED GENE(S)"          
[15] "MAPPED_GENE"                "UPSTREAM_GENE_ID"          
[17] "DOWNSTREAM_GENE_ID"         "SNP_GENE_IDS"              
[19] "UPSTREAM_GENE_DISTANCE"     "DOWNSTREAM_GENE_DISTANCE"  
[21] "STRONGEST SNP-RISK ALLELE"  "SNPS"                      
[23] "MERGED"                     "SNP_ID_CURRENT"            
[25] "CONTEXT"                    "INTERGENIC"                
[27] "RISK ALLELE FREQUENCY"      "P-VALUE"                   
[29] "PVALUE_MLOG"                "P-VALUE (TEXT)"            
[31] "OR or BETA"                 "95% CI (TEXT)"             
[33] "PLATFORM [SNPS PASSING QC]" "CNV"                       
Code
## show number of entries in the GWAS catalog with cancer
gwascatalog %>% select(`DISEASE/TRAIT`, MAPPED_GENE) %>% filter(grepl("cancer",`DISEASE/TRAIT`)) %>% dim()
[1] 10169     2

Number of distinct loci reported in GWAS of various cancers

Number of entries in the GWAS catalog with cancer, unique loci (it’s a reasonable assumption that mapped genes will be a good way to count loci)

Code
gwascatalog %>% select(`DISEASE/TRAIT`, MAPPED_GENE) %>% filter(grepl("cancer",`DISEASE/TRAIT`)) %>% unique() %>% dim()
[1] 6047    2
Code
## plot histogram of GWAS loci by year
gwascat_sig = gwascatalog %>% mutate(year=as.factor(lubridate::year(lubridate::as_date(`DATE ADDED TO CATALOG`)))) %>% filter(`P-VALUE`<5e-8)

gwascat_sig %>% filter(year!="2024") %>% ggplot(aes(year)) + geom_bar() + theme_bw(base_size = 15) + scale_x_discrete(breaks=c("2008","2012","2016","2020","2022")) + xlab("year") + ylab("GWAS loci reported p<5e-8") + ggtitle(paste0("GWAS Catalog Downloaded ",DOWNLOAD_DATE) )

Code
##ggsave(glue::glue("{DATA}/gwas-catalog/gwas-catalog-by-year.pdf"))