# List of packages you want to install
<- c("tidyverse", "data.table", "BEDMatrix", "Rfast", "susieR", "coloc")
packages
# Function to check and install any missing packages
<- function(pkg){
check_and_install if (!require(pkg, character.only = TRUE)) {
install.packages(pkg, dependencies = TRUE)
library(pkg, character.only = TRUE)
}
}
# Use the function to check and install packages
sapply(packages, check_and_install)
Transcriptome QGT Training 2024
NEEDS UPDATE
Goals of Lab
- Predict whole blood expression from genotype
- Check how well the prediction works with GEUVADIS expression data
- Run association between predicted expression and a phenotype
- Calculate association between expression levels and coronary artery disease risk using s-predixcan
- Fine-map the coronary artery disease gwas results using torus
- Calculate colocalization probability using fastenloc
- Run cTWAS (fine-map SNPs and genes jointly)
Inital Remarks
- We ask you to actively participate in today’s hands on activities.
Notice that we may ask you to share your screen for pedagogic purposes.
Send Haky a private message if you really don’t want to be called to share your screen.
If you have any concerns about this, please ask me or one of the TAs for assistance. We are here to help you learn.
Open the questionnaire forms at the beginning of each section and answer the questions as you go along
- Name is a mandatory field. You can use your name or a pseudonymn. Just keep using the same name for each submission
- We will ask you to submit partial set of answers so that we can get a sense of where people are
Through out the lab you will see code chucks that are written with bash, eval=FALSE in place of R. These chunks are meant to be run in your Rstudio terminal and not in the Rmarkdown file itself.
R code chunks also have “eval=FALSE”. When you click on the green triangle, the code will run in the R console but not when we knit the document. This just happened to be more convenient for us to generate a knitted html.
Set Up
Questionnaire 01
Open and start filling questionnaire 01 Preliminary questionnaire https://forms.gle/fhNJAyjx7MJTy3yt8
Install packages as needed
- Load Rstudio Libraries
library(tidyverse)
## packages needed for susie+coloc
library(data.table)
library(BEDMatrix)
library(Rfast)
library(susieR)
library(coloc)
##library(tidyverse)
- Navigate to starting directory
cd "/cloud/project/"
- activate the the imlabtools environment, which will make sure that the right version of python modules are available
conda activate imlabtools
- To define some variables to access the data more easily within the R session, run the following r chunk
print(getwd())
="/cloud/project/QGT-Columbia-HKI-repo/"
lab=glue::glue("{lab}/code")
CODEsource(glue::glue("{CODE}/load_data_functions.R"))
source(glue::glue("{CODE}/plotting_utils_functions.R"))
="/cloud/project/QGT-Columbia-HKI-repo/box_files"
PRE=glue::glue("{PRE}/models")
MODEL=glue::glue("{PRE}/data")
DATA=glue::glue("{PRE}/results")
RESULTS=glue::glue("{PRE}/repos/MetaXcan-master/software")
METAXCAN=glue::glue("{PRE}/repos/fastenloc-master")
FASTENLOC
# This is a reference table we'll use a lot throughout the lab. It contains information about the genes.
= load_gencode_df() gencode_df
- check the values of the variables you just defined in R
MODEL
DATA
- define some variables to access the data more easily in the terminal. Remember we are running R code in the R console and command line code in the terminal.
export PRE="/cloud/project/QGT-Columbia-HKI-repo/box_files"
export LAB="/cloud/project/QGT-Columbia-HKI-repo/"
export CODE=$LAB/code
export DATA=$PRE/data
export MODEL=$PRE/models
export RESULTS=$PRE/results
export METAXCAN=$PRE/repos/MetaXcan-master/software
- check the values of the variables you just defined
echo $CODE
echo $RESULTS
Transcriptome-wide association (Review)
Now we will perform a transcriptome-wide association analysis using the PrediXcan suite of tools.
We start by predicting the expression levels of genes using the genotype data and the prediction weights and then perform an association between the predicted expression and the phenotype (denoted trait in the figure below).
Questionnaire 02
- Open and start filling questionnaire 02 Prediction https://forms.gle/T6kAHvFTxYfcQguW7
Predict Expression from genotype
In this section we will predict expression of genes in whole blood using the Predict.py code in the METAXCAN folder.
Prediction models (weights) are located in the MODEL folder. Additional models for different tissues and transcriptome studies can be downloaded from predictdb.org.
This run should take about one minute.
run the following code in the terminal.
printf "Predict expression\n\n"
python3 $METAXCAN/Predict.py \
$PRE/models/gtex_v8_en/en_Whole_Blood.db \
--model_db_path $DATA/predixcan/genotype/filtered.vcf.gz \
--vcf_genotypes \
--vcf_mode genotyped $DATA/predixcan/gtex_v8_eur_filtered_maf0.01_monoallelic_variants.txt.gz id rsid \
--variant_mapping "chr{}_{}_{}_{}_b38" \
--on_the_fly_mapping METADATA $RESULTS/predixcan/Whole_Blood__predict.txt \
--prediction_output $RESULTS/predixcan/Whole_Blood__summary.txt \
--prediction_summary_output \
--verbosity 9 --throw
Note we are only predicting chromosome 22 here (check by running “predicted_expression %>% count(chromosome)” in the console)
- run following code in the console to get information on reported prediction performance.
Find additional information in this wiki
= glue::glue("{RESULTS}/predixcan/Whole_Blood__predict.txt")
prediction_fp
## Read the Predict.py output into a dataframe. This function reorganizes the data and adds gene names.
= load_predicted_expression(prediction_fp, gencode_df)
predicted_expression
head(predicted_expression)
## read summary of prediction, number of SNPs per gene, cross validated prediction performance
= load_prediction_summary(glue::glue("{RESULTS}/predixcan/Whole_Blood__summary.txt"), gencode_df)
prediction_summary ## number of genes with a prediction model
dim(prediction_summary)
head(prediction_summary)
## how many unique genes were predicted?
%>% .[["gene_id"]] %>% unique() %>% length()
predicted_expression
## gene expression were predicted for how many people?
%>% .[["IID"]] %>% unique() %>% length()
predicted_expression
print("distribution of prediction performance r2")
summary(prediction_summary$pred_perf_r2)
hist(prediction_summary$pred_perf_r2)
## Note: this is what the prediction trainer reported as prediction performance
(Optional) Assess Actual Prediction Performance
## download and read observed expression data from GEUVADIS
## from https://uchicago.box.com/s/4y7xle5l0pnq9d1fwmthe2ewhogrnlrv
<- read_csv(glue::glue("{DATA}/predixcan/GEUVADIS.observed_df.csv.gz"))
obs_exp
## Note that the version of the ensemble id of the gene was removed
head(predicted_expression)
## Q: how many genes were predicted?
length(unique(predicted_expression$gene_id))
## inner join predicted expression with observed expression data (by IID and gene)
## common errors occur when ensemble id's have versions in one set and not the other set
=inner_join(predicted_expression, obs_exp, by = c("gene_id","IID"))
fullset
## calculate spearman correlation for all genes
= unique(predicted_expression$gene_id)
genelist = rep(NA,length(genelist))
corvec names(corvec) = genelist
for(gg in 1:length(genelist))
{= fullset$gene_id==genelist[gg]
ind = cor(fullset$predicted_expression[ind], fullset$observed_expression[ind])
corvec[gg]
}
## what's the best performing gene?
## plot the histogram of the prediction performance
hist(corvec)
## list the top 10 best performing genes
head(sort(corvec,decreasing = TRUE),2)
tail(sort(corvec,decreasing = TRUE),2)
## plot the correlation of the top 2 best performing genes bottom 2
= "ENSG00000100376"
geneid = gencode_df %>% filter(gene_id==geneid) %>% .[["gene_name"]]
genename %>% filter(gene_id==geneid) %>% ggplot(aes(observed_expression, predicted_expression))+geom_point()+ggtitle(paste(genename, "-",geneid))
fullset
= "ENSG00000075234"
geneid = gencode_df %>% filter(gene_id==geneid) %>% .[["gene_name"]]
genename %>% filter(gene_id==geneid) %>% ggplot(aes(observed_expression, predicted_expression))+geom_point()+ggtitle(paste(genename, "-", geneid))
fullset
= "ENSG00000070371"
geneid = gencode_df %>% filter(gene_id==geneid) %>% .[["gene_name"]]
genename %>% filter(gene_id==geneid) %>% ggplot(aes(observed_expression, predicted_expression))+geom_point()+ggtitle(paste(genename, "-", geneid))
fullset
= "ENSG00000184164"
geneid = gencode_df %>% filter(gene_id==geneid) %>% .[["gene_name"]]
genename %>% filter(gene_id==geneid) %>% ggplot(aes(observed_expression, predicted_expression))+geom_point()+ggtitle(paste(genename, "-", geneid)) fullset
Run PrediXcan association
Questionnaire 03
- Open and start filling questionnaire 03 PrediXcan https://forms.gle/3H319knWbLgnynNs9
We are going to use a simulated phenotype for which only UPK3A has an effect on the phenotype (\(\beta=-0.9887378\))
\(Y = \sum_k T_k \beta_k + \epsilon\)
with random effects \(\beta_k \sim (1-\pi)\cdot \delta_0 + \pi\cdot N(0,1)\)
export PHENO="sim.spike_n_slab_0.01_pve0.1"
printf "association\n\n"
python3 $METAXCAN/PrediXcanAssociation.py \
$RESULTS/predixcan/Whole_Blood__predict.txt \
--expression_file $DATA/predixcan/phenotype/$PHENO.txt \
--input_phenos_file \
--input_phenos_column pheno $RESULTS/predixcan/$PHENO/Whole_Blood__association.txt \
--output \
--verbosity 9 --throw
More predicted phenotypes can be found in $DATA/predixcan/phenotype/. The naming of the phenotypes provides information about the genetic architecture: the number after pve is the proportion of variance of Y explained by the genetic component of expression. The number after spike_n_slab represents the probability that a gene is causal π (i.e. prob β≠0)
Looking at Association Results
## read association results
="sim.spike_n_slab_0.01_pve0.1"
PHENO
= load_predixcan_association(glue::glue("{RESULTS}/predixcan/{PHENO}/Whole_Blood__association.txt"), gencode_df)
predixcan_association
## take a look at the results
dim(predixcan_association)
%>% arrange(pvalue) %>% select(gene_name,effect,se,pvalue,gene) %>% head
predixcan_association %>% arrange(pvalue) %>% ggplot(aes(pvalue)) + geom_histogram(bins=10)
predixcan_association ## compare distribution against the null (uniform)
gg_qqplot(predixcan_association$pvalue, max_yval = 40)
Comparing the Estimated Effect Size with True Effect Size
= load_truebetas(glue::glue("{DATA}/predixcan/phenotype/gene-effects/{PHENO}.txt"), gencode_df)
truebetas = (predixcan_association %>%
betas inner_join(truebetas,by=c("gene"="gene_id")) %>%
select(c('estimated_beta'='effect',
'true_beta'='effect_size',
'pvalue',
'gene_id'='gene',
'gene_name'='gene_name.x',
'region_id'='region_id.x')))
%>% arrange(pvalue) %>% select(gene_name,estimated_beta,true_beta,pvalue) %>% head
betas ## do you see examples of potential LD contamination?
%>% mutate(causal= true_beta!=0) %>% ggplot(aes(estimated_beta, true_beta,col=causal))+geom_point(alpha=0.6,size=5)+geom_abline()+theme_bw() betas
UPK3A is the causal gene and has the most significant pvalue. RIBC2 is also significantly associated but has no causal role (we know because we simulated the phenotype that way). Why?
Hint: correlation between the genes
Run Summary PrediXcan
Now we will use the summary results from a GWAS of coronary artery disease to calculate the association between the genetic component of the expression of genes and coronary artery disease risk. We will use the SPrediXcan.py.
The GWAS results (harmonized and imputed) for coronary artery disease are available in $PRE/spredixcan/data/
Questionnaire 04
- Open and start filling questionnaire 04 S-PrediXcan https://forms.gle/xJs2U66cnrqdb5cj6
Running S-PrediXcan
python $METAXCAN/SPrediXcan.py \
$DATA/spredixcan/imputed_CARDIoGRAM_C4D_CAD_ADDITIVE.txt.gz \
--gwas_file \
--snp_column panel_variant_id \
--effect_allele_column effect_allele \
--non_effect_allele_column non_effect_allele \
--zscore_column zscore $MODEL/gtex_v8_mashr/mashr_Whole_Blood.db \
--model_db_path $MODEL/gtex_v8_mashr/mashr_Whole_Blood.txt.gz \
--covariance \
--keep_non_rsid \
--additional_output \
--model_db_snp_key varID \
--throw $RESULTS/spredixcan/eqtl/CARDIoGRAM_C4D_CAD_ADDITIVE__PM__Whole_Blood.csv --output_file
We can run the full genome because the summary statistics based PrediXcan is much faster than individual level one.
Plot and Interpret Results
= load_spredixcan_association(glue::glue("{RESULTS}/spredixcan/eqtl/CARDIoGRAM_C4D_CAD_ADDITIVE__PM__Whole_Blood.csv"), gencode_df)
spredixcan_association dim(spredixcan_association)
%>% arrange(pvalue) %>% head
spredixcan_association %>% arrange(pvalue) %>% ggplot(aes(pvalue)) + geom_histogram(bins=20)
spredixcan_association
gg_qqplot(spredixcan_association$pvalue)
Question: SORT1, considered to be a causal gene for LDL cholesterol and as a consequence of coronary artery disease, is not found here. Why?
check whether SORT1 is expressed in whole blood GTEx portal
check whether SORT1 has eQTL in whole blood GTEx portal
Run S-PrediXcan using gene expression predicted in liver
-[] Run s-predixcan with liver model, do you find SORT1? Is it significant?
#loction Liver models
#/cloud/project/QGT-Columbia-HKI-repo/box_files/models/gtex_v8_mashr/
python $METAXCAN/SPrediXcan.py \
$DATA/spredixcan/imputed_CARDIoGRAM_C4D_CAD_ADDITIVE.txt.gz \
--gwas_file \
--snp_column panel_variant_id \
--effect_allele_column effect_allele \
--non_effect_allele_column non_effect_allele \
--zscore_column zscore $MODEL/gtex_v8_mashr/mashr_Liver.db \
--model_db_path $MODEL/gtex_v8_mashr/mashr_Liver.txt.gz \
--covariance \
--keep_non_rsid \
--additional_output \
--model_db_snp_key varID \
--throw $RESULTS/spredixcan/eqtl/CARDIoGRAM_C4D_CAD_ADDITIVE__PM__Liver.csv --output_file
= load_spredixcan_association(glue::glue("{RESULTS}/spredixcan/eqtl/CARDIoGRAM_C4D_CAD_ADDITIVE__PM__Liver.csv"), gencode_df)
spredixcan_association_Ldim(spredixcan_association_L)
%>% arrange(pvalue) %>% head
spredixcan_association_L %>% arrange(pvalue) %>% ggplot(aes(pvalue)) + geom_histogram(bins=20)
spredixcan_association_L
gg_qqplot(spredixcan_association_L$pvalue)
= c("gene_name","gene","zscore","effect_size","pvalue","var_g","pred_perf_r2", "pred_perf_pval","pred_perf_qval", "n_snps_used", "n_snps_in_cov", "n_snps_in_model","best_gwas_p","largest_weight")
col_order<- spredixcan_association_L[, col_order]
spredixcan_association_L filter(spredixcan_association_L, gene_name=="SORT1")
(Optional) Compare zscores in liver and whole blood.
Recall that zscore is the effect size divided by the standard error
=rename(spredixcan_association_L, zscore_liver = "zscore")
spredixcan_association_Lhead(spredixcan_association_L)
=left_join(spredixcan_association, spredixcan_association_L, by="gene_name")
test=select(test,"gene_name","zscore","zscore_liver")
test%>% arrange(zscore_liver) %>% head
test
%>% mutate(zscore_WB=zscore) %>% ggplot(aes(zscore_WB,zscore_liver)) + geom_point(size=3,alpha=.6) + geom_abline()
test
## S-PrediXcan association in liver and whole blood are significantly correlated
cor.test(test$zscore,test$zscore_liver)
(Optional) Run MultiXcan
multixcan aggregates information across multiple tissues to boost the power to detect association. It was developed movivated by the fact that eQTLs are shared across multiple tissues, i.e. many genetic variants that regulate expression are common across tissues.
before you run multixcan ensure you have run s-predixcan for all the tissues you want to multixcan. In this tutorial we have two tissues (liver and whole blood), ensure you have run s-predixcan with the two tissues before running multixcan.
One thing to note is to ensure similar naming pattern for the output files. This is to ensure the files are captured correctly when running multixcan’s filter.
python $METAXCAN/SMulTiXcan.py \
$MODEL/gtex_v8_mashr \
--models_folder "mashr_(.*).db" \
--models_name_pattern $MODEL/gtex_v8_expression_mashr_snp_smultixcan_covariance.txt.gz \
--snp_covariance $RESULTS/spredixcan/eqtl/ \
--metaxcan_folder "CARDIoGRAM_C4D_CAD_ADDITIVE__PM__(.*).csv" \
--metaxcan_filter "(.*)__PM__(.*).csv" \
--metaxcan_file_name_parse_pattern $DATA/spredixcan/imputed_CARDIoGRAM_C4D_CAD_ADDITIVE.txt.gz \
--gwas_file --effect_allele_column effect_allele --non_effect_allele_column non_effect_allele --zscore_column zscore --keep_non_rsid --model_db_snp_key varID \
--snp_column panel_variant_id \
--cutoff_condition_number 30 \
--verbosity 9 \
--throw $RESULTS/smultixcan/eqtl/CARDIoGRAM_C4D_CAD_ADDITIVE_smultixcan.txt --output
Colocalization Methods (Review)
Colocalization methods seek to estimate the probability that the complex trait and expression causal variants are the same. We favor methods that calculate the probability of causality for each trait (posterior inclusion probability), called fine-mapping methods. Here we use susie for fine-mapping and coloc for colocalization.
Questionnaire 5
- Open and start filling questionnaire 05 Colocalization https://forms.gle/NfH2MSdy4UyJzGAp7
Run colocalization
When you use coloc on your own data, you may want to check out coloc’s documentation, with good advice and tips on avoiding common mistakes https://cran.r-project.org/web/packages/coloc/vignettes
Due to time constraints, we will run one region and one gene only
- Finemap GWAS of CAD
- Finemap eQTL of SORT1
For finemapping, we need the summary statistics (effect size, standard errors, etc) and the correlation between SNPs (LD matrix)
load the genotype to calculate the ld matrix
# load the genotype to calculate the ld matrix
<- BEDMatrix(glue::glue("{DATA}/colocalization/geuvadis_chr1"))
X_mat colnames(X_mat) <- gsub("\\_.*", "",colnames(X_mat))
colnames(X_mat) <- str_replace_all(colnames(X_mat),":","_")
<- fread(glue::glue("{DATA}/colocalization/geuvadis_chr1.bim")) %>%
snp_info setnames(., colnames(.), c("chr", "snp", "cm", "pos", "alt", "ref"))
$snp <- str_replace_all(snp_info$snp,":","_") snp_info
Load the eqtl and gwas effect size files
# load the eqtl effect sizes
<- fread(glue::glue("{DATA}/colocalization/Liver_chr1.txt"))
gene_ss
# load gwas effect sizes
<- data.table::fread(glue::glue("{DATA}/spredixcan/imputed_CARDIoGRAM_C4D_CAD_ADDITIVE.txt.gz"))
gwas
# filter to select genome wide significant snps at 5 × 10−8
<- gwas %>% dplyr::filter(pvalue < 5e-8)
filtered_regions
# load the ld block
<-read_tsv(glue::glue("{DATA}/spredixcan/eur_ld.hg38.txt.gz")) ldblocks
Find regions with the strongest signal in the gwas
# get the loci where the significant snps are located
for (n in 1:nrow(filtered_regions)){
# extract genename, start and end
<- as.character(filtered_regions[n,"variant_id"])
variant_id <- as.character(filtered_regions[n,"chromosome"])
variant_chr <- as.numeric(filtered_regions[n,"position"])
variant_pos #gene_end <- as.numeric(genes[n,"end"])
<- ldblocks %>%
locus ::filter(chr == variant_chr) %>%
dplyrfilter(variant_pos >= start & variant_pos < stop) %>%
mutate(locus_name = paste0(chr,"_",start,"_",stop)) %>%
::rename(locus_start=start,locus_end=stop) %>%
dplyrmutate(variant_id = variant_id, position=variant_pos)
# create a data frame with info
if (exists('all_loci') && is.data.frame(get('all_loci'))) {
<- rbind(all_loci,locus)
all_loci else {
} <- locus
all_loci
}
}
# select uniq loci
<- all_loci %>%
d_loci ::select(locus_name,chr,locus_start,locus_end) %>%
dplyr::distinct() dplyr
select a region to run coloc
# select regions to fine map. we are going to use regions in chromosome 1
<- d_loci %>% dplyr::filter("chr1" == chr)
uniq_loci = 3 # chr1_107867043_109761309 region
n
# extract information
= as.numeric(str_remove(uniq_loci[n,"chr"],"chr"))
l_chr = uniq_loci[n,]$chr
s_chr = uniq_loci[n,]$locus_start
l_start = uniq_loci[n,]$locus_end
l_stop = uniq_loci[n,]$locus_name l_name
Prepare gwas data for coloc
# select snps for the region from the summary stats
<- gwas %>%
ss ::filter(chromosome == s_chr) %>%
dplyr::filter(position >= l_start & position <= l_stop) %>%
dplyr::filter(! is.na(effect_size))
dplyr
# find the snps in the genotype to calculate the correlation
<- ss %>% inner_join(snp_info %>% mutate(chr = glue::glue("chr{chr}")),
g.snps by=c("chromosome" = "chr","panel_variant_id" = "snp"))
# select genotype to calculate correlation
#f_mat <- X_mat[,g.snps$snp]
<- X_mat[,g.snps$panel_variant_id]
f_mat
# calculate corr
= cora(f_mat) # the package is for speed
R
## clean up
rm(f_mat)
<- g.snps %>% dplyr::filter(! is.na(effect_size)) %>% #select(-snp) %>%
ff ::rename(snp=panel_variant_id,beta=effect_size) %>%
dplyrmutate(varbeta = standard_error^2) %>%
::select(beta,varbeta,snp,position) %>% as.list()
dplyr
$type <- "cc"
ff$sdY <- 1
ff
$LD = R
ff$N = 184305
ff
## check the data (NULL means it's fine)
check_dataset(ff)
Prepare eqtl data for coloc
#Using SORT1 gene and liver tissue
<- gene_ss %>% dplyr::filter(gene_id == "ENSG00000134243.11") %>%
gene ::rename(snp = variant_id,beta = slope, MAF = maf,
dplyrpvalue = pval_nominal) %>%
mutate(varbeta = slope_se^2, name = snp) %>%
filter(! is.na(varbeta)) %>%
separate(name, into = c("chr", "position","ref","alt","build"),sep = "_")
## calculate the ld matrix
### get the snps
<- gene %>% mutate(position = as.integer(position)) %>%
gene.snps inner_join(snp_info %>% mutate(chr = glue::glue("chr{chr}")),
by=c("chr" = "chr","snp" = "snp"))
# select genotype to calculate correlation
<- X_mat[,gene.snps$snp]
g_mat # calculate corr
= cora(g_mat)
g.R
# clean up
rm(g_mat)
# format data for coloc
<- gene %>% dplyr::filter(snp %in% gene.snps$snp) %>%
gg mutate(position = as.integer(position)) %>%
::select(beta,varbeta,snp,position,MAF, pvalue) %>% as.list()
dplyr
$type <- "quant"
gg$LD = g.R
gg$N = 208 # 670 for blood
gg
## check the data
check_dataset(gg)
run older version of coloc which assumes single causal variant
coloc.abf makes the simplifying assumption that each trait has at most one causal variant in the region under consideration
<- coloc.abf(dataset1=ff, dataset2=gg)
my.res sensitivity(my.res,"H4 > 0.9")
run coloc allowing multiple causal variants
Multiple causal variants, using SuSiE to separate the signals
# Run susie fine maooing
= runsusie(ff)
S3 = runsusie(gg)
S4 #summary(S3)
# Run coloc
=coloc.susie(S3,S4)
susie.resprint(susie.res$summary)
SuSiE can take a while to run on larger datasets, so it is best to run once per dataset with the =runsusie= function, store the results and feed those into subsequent analyses.
plot the coloc result with the sensitivity function because weird effects are much easier to understand visually
sensitivity(susie.res,"H4 > 0.9",row=1,dataset1=ff,dataset2=gg,)
cTWAS
cTWAS stands for causal TWAS and seeks to calculate the posterior inclusion probability of genes and SNPs in a given region to be causal. It’s a modification of SUSIER that adds predicted gene expression levels in addition to SNPs in the analysis. It is not yet published, but the authors (Siming Zhao, Wesley Crouse, Matthew Stephens et al) have shared a vignette with us to try it out.
Thank you, Wes, for creating this portion of the lab!
Questionnaire 06
- Open and start filling out questionnaire 06 cTWAS https://forms.gle/A4evWkbhR7cXLy36A
#install.packages("R.utils")
#install.packages("remotes")
#remotes::install_github("simingz/ctwas", ref = "develop")
library(ctwas)
#get positions for region of interest (SORT1/PSRC1 locus)
<- unlist(strsplit(spredixcan_association$region_id[spredixcan_association$gene_name=="PSRC1"], "_"))
region <- region[2]
chr <- as.numeric(region[3])
start <- as.numeric(region[4])
end
#format summary statistics (and subset to variants in region to save memory)
<- data.table::fread(glue::glue("{DATA}/spredixcan/imputed_CARDIoGRAM_C4D_CAD_ADDITIVE.txt.gz"), select=c("chromosome", "position", "variant_id", "effect_allele", "non_effect_allele", "zscore", "sample_size"))
z_snp <- z_snp[z_snp$chromosome==chr & z_snp$position >= start & z_snp$position <= end,]
z_snp <- z_snp[!is.na(z_snp$variant_id),-(1:2)]
z_snp colnames(z_snp) <- c("id", "A1", "A2", "z", "ss")
#specify directories for LD matrices and weights
<- glue::glue("{DATA}/cTWAS/LD_matrices")
ld_R_dir <- glue::glue("{MODEL}/gtex_v8_mashr/mashr_Liver.db")
weight
#specify output locations and names for cTWAS
<- glue::glue("{DATA}/cTWAS/results/")
outputdir <- "CARDIoGRAM_Liver_expr"
outname.e <- "CARDIoGRAM_Liver_ctwas"
outname
#impute gene z scores using cTWAS and save the results
##################################
# NOTE: we are skipping this step and using the precomputed values
## takes ~10 minutes
##################################
# ctwas_imputation <- impute_expr_z(z_snp=z_snp, weight=weight, ld_R_dir=ld_R_dir, outputdir=outputdir, outname=outname.e, harmonize_z=F, harmonize_wgt=F)
# save(ctwas_imputation, file = paste0(outputdir, outname.e, "_output.Rd"))
load(paste0(outputdir, outname.e, "_output.Rd"))
<- ctwas_imputation$z_gene
z_gene <- ctwas_imputation$ld_exprfs
ld_exprfs <- ctwas_imputation$z_snp
z_snp
#make custom region file for single region
<- data.frame("chr" = chr, "start" = start, "stop" = end)
ld_regions_custom
write.table(ld_regions_custom,
file= paste0(outputdir, "ld_regions_custom.txt"),
row.names=F, col.names=T, sep="\t", quote = F)
<- paste0(outputdir, "ld_regions_custom.txt")
ld_regions_custom
#run cTWAS with pre-specified prior parameters at a single locus
#estimating prior requires genome-wide data, too slow for demonstration
#prior is 1% inclusion for genes and is 100x more likely than SNPs
#prior assumes genes have larger effect size than SNPs, in reasonable range for data we've looked at
ctwas_rss(z_gene=z_gene, z_snp=z_snp, ld_exprfs=ld_exprfs, ld_R_dir = ld_R_dir, ld_regions_custom=ld_regions_custom, outputdir = outputdir, outname = outname, thin = 0.01,
estimate_group_prior = F,
estimate_group_prior_var = F,
group_prior=c(0.01, 0.0001),
group_prior_var=c(50, 25))
#load results
<- data.table::fread(paste0(outputdir,outname,".susieIrss.txt"))
ctwas_results
#merge gene names into the results
<- RSQLite::dbDriver("SQLite")
sqlite = RSQLite::dbConnect(sqlite, weight)
db <- function(...) RSQLite::dbGetQuery(db, ...)
query <- query("select * from extra")
extra_table ::dbDisconnect(db)
RSQLite
$genename <- extra_table$genename[match(ctwas_results$id, extra_table$gene)]
ctwas_results
#show results with highest PIPs
= c("genename", "chrom", "id", "pos", "type", "region_tag1", "region_tag2", "cs_index", "susie_pip" , "mu2")
col_order_2<- ctwas_results[, ..col_order_2]
ctwas_results head(ctwas_results[order(-ctwas_results$susie_pip),])
FAQ and frequent errors
conda environment may not load correctly after resuming the project in Rstudio cloud
when inner joining by ensemble id of genes, make sure that both datasets have either the same versions or remove versions
Contributors
- Festus Nyasimi
- Margaret Perry
- Wes Crouse
- Owen Melia