# Defining root, data, and raw data directories
rootdir=/scratch/groups/abattle4/victor/WatershedAFR
datadir=${rootdir}/data
rawdir=${rootdir}/raw_data
Generate file mapping sample identifiers to tissues. Restrict to samples that pass RNA-seq QC (marked as RNASEQ in SMAFRZE column).
cat ${rawdir}/GTEx/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt | tail -n+2 | cut -f1,7,17 | \
sed 's/ - /_/' | sed 's/ /_/g' | sed 's/(//' | sed 's/)//' | sed 's/c-1/c1/' | \
awk '$3=="RNASEQ" {print $1"\t"$2}' | sort -k 1 > ${datadir}/data_prep/gtex_v8_samples_tissues.txt
Population specific TPM and read counts
# Subset `gtex_v8_samples_tissues.txt` by African and European populations
Rscript code/preprocessing/data_prep/population_sample_tissue_map.R \
--MAP ${datadir}/data_prep/gtex_v8_samples_tissues.txt \
--POP.LIST ${datadir}/data_prep/gtex_v8_individuals_AFR.txt \
--OUT ${datadir}/data_prep/gtex_v8_samples_tissues_AFR.txt
Rscript code/preprocessing/data_prep/population_sample_tissue_map.R \
--MAP ${datadir}/data_prep/gtex_v8_samples_tissues.txt \
--POP.LIST ${datadir}/data_prep/gtex_v8_individuals_EUR.txt \
--OUT ${datadir}/data_prep/gtex_v8_samples_tissues_EUR.txt
# split TMP
GTEX_tpm=${rawdir}/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
OUT_AFR=${datadir}/data_prep/PEER_AFR
SAMPLE_AFR=${datadir}/data_prep/gtex_v8_samples_tissues_AFR.txt
python code/preprocessing/data_prep/split_expr_by_tissues.py --gtex $GTEX_tpm --out $OUT_AFR --sample $SAMPLE_AFR --end .tpm.txt
OUT_EUR=${datadir}/data_prep/PEER_EUR
SAMPLE_EUR=${datadir}/data_prep/gtex_v8_samples_tissues_EUR.txt
python code/preprocessing/data_prep/split_expr_by_tissues.py --gtex $GTEX_tpm --out $OUT_EUR --sample $SAMPLE_EUR --end .tpm.txt
# split read counts
GTEX_reads=${rawdir}/GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz
python code/preprocessing/data_prep/split_expr_by_tissues.py --gtex $GTEX_reads --out $OUT_AFR --sample $SAMPLE_AFR --end .reads.txt
python code/preprocessing/data_prep/split_expr_by_tissues.py --gtex $GTEX_reads --out $OUT_EUR --sample $SAMPLE_EUR --end .reads.txt
Build covariate matrix with PC's 1 - 5 and sex from the eQTL covariates by tissue.
Rscript code/preprocessing/data_prep/combine_covariates_across_tissues.R \
--COV ${rawdir}/GTEx/GTEx_Analysis_v8_eQTL_covariates \
--OUT ${datadir}/data_prep
For each tissue, filter for genes with > 20% individuals with TPM > 0.1 and read count > 6, Log2(tpm + 2) transfrom the data, and then z-transform.
# African
Rscript code/preprocessing/data_prep/preprocess_expr.R \
--COV ${datadir}/data_prep/gtex_v8_eQTL_covariates.txt \
--MAP ${datadir}/data_prep/gtex_v8_samples_tissues_AFR.txt \
--MIN.SAMPLE 11 \
--PEER ${datadir}/data_prep/PEER_AFR
# European
Rscript code/preprocessing/data_prep/preprocess_expr.R \
--COV ${datadir}/data_prep/gtex_v8_eQTL_covariates.txt \
--MAP ${datadir}/data_prep/gtex_v8_samples_tissues_EUR.txt \
--PEER ${datadir}/data_prep/PEER_EUR
Rename eQTL files from "cervical_c-1" to "cervical_c1" for consistency
rename c-1 c1 ${rawdir}/GTEx/GTEx_Analysis_v8_eQTL/*c-1*
Generate list of top eQTLs for each gene in each tissue, extract from VCF, convert to number alternative alleles
bash code/preprocessing/data_prep/get_eqtl_genotypes.sh
Run PEER correction and compute residuals
## African
# Compute PEER factors (use PEER dockerimage if unable to install peer natively https://hub.docker.com/r/bryancquach/peer)
bash code/preprocessing/data_prep/calculate_PEER_factors.sh \
-p ${datadir}/data_prep/PEER_AFR \
-e ${rawdir}/GTEx/GTEx_Analysis_v8_eQTL \
-c ${datadir}/data_prep/gtex_v8_eQTL_covariates.txt \
-g ${datadir}/data_prep/gtex_2017-06-05_v8_genotypes_cis_eQTLs_012_processed.txt \
-t ~/scratch \
-d ~/code/PEER/peer-1.3.simg
# Compute residuals (does not need PEER package)
bash code/preprocessing/data_prep/calculate_PEER_residuals.sh \
-p ${datadir}/data_prep/PEER_AFR \
-e ${rawdir}/GTEx/GTEx_Analysis_v8_eQTL \
-c ${datadir}/data_prep/gtex_v8_eQTL_covariates.txt \
-g ${datadir}/data_prep/gtex_2017-06-05_v8_genotypes_cis_eQTLs_012_processed.txt \
-t ~/scratch
## European
# Compute PEER factors (use PEER dockerimage if unable to install peer natively https://hub.docker.com/r/bryancquach/peer)
bash code/preprocessing/data_prep/calculate_PEER_factors.sh \
-p ${datadir}/data_prep/PEER_EUR \
-e ${rawdir}/GTEx/GTEx_Analysis_v8_eQTL \
-c ${datadir}/data_prep/gtex_v8_eQTL_covariates.txt \
-g ${datadir}/data_prep/gtex_2017-06-05_v8_genotypes_cis_eQTLs_012_processed.txt \
-t ~/scratch \
-d ~/code/PEER/peer-1.3.simg
# Compute residuals (does not need PEER package)
bash code/preprocessing/data_prep/calculate_PEER_residuals.sh \
-p ${datadir}/data_prep/PEER_EUR \
-e ${rawdir}/GTEx/GTEx_Analysis_v8_eQTL \
-c ${datadir}/data_prep/gtex_v8_eQTL_covariates.txt \
-g ${datadir}/data_prep/gtex_2017-06-05_v8_genotypes_cis_eQTLs_012_processed.txt \
-t ~/scratch
Generate files with data on what tissues are available per individual after PEER correction
#African
bash code/preprocessing/data_prep/get_tissue_by_individual.sh \
-p ${datadir}/data_prep/PEER_AFR \
-o ${datadir}/data_prep/gtex_AFR_tissue_by_ind.txt \
-t ${datadir}/data_prep/gtex_AFR_tissues_all_normalized_samples.txt \
-i ${datadir}/data_prep/gtex_AFR_individuals_all_normalized_samples.txt
#European
bash code/preprocessing/data_prep/get_tissue_by_individual.sh \
-p ${datadir}/data_prep/PEER_EUR \
-o ${datadir}/data_prep/gtex_EUR_tissue_by_ind.txt \
-t ${datadir}/data_prep/gtex_EUR_tissues_all_normalized_samples.txt \
-i ${datadir}/data_prep/gtex_EUR_individuals_all_normalized_samples.txt
Combine the PEER-corrected data
Combined files will be located in ${datadir}/data_prep/gtex_AFR_normalized_expression.txt.gz and ${datadir}/data_prep/gtex_EUR_normalized_expression.txt.gz
#African
python code/preprocessing/data_prep/gather_filter_normalized_expression.py \
-p ${datadir}/data_prep/PEER_AFR \
-t ${datadir}/data_prep/gtex_AFR_tissues_all_normalized_samples.txt \
-i ${datadir}/data_prep/gtex_AFR_individuals_all_normalized_samples.txt \
-o ${datadir}/data_prep/gtex_AFR_normalized_expression.txt
gzip ${datadir}/data_prep/gtex_AFR_normalized_expression.txt
#European
python code/preprocessing/data_prep/gather_filter_normalized_expression.py \
-p ${datadir}/data_prep/PEER_EUR \
-t ${datadir}/data_prep/gtex_EUR_tissues_all_normalized_samples.txt \
-i ${datadir}/data_prep/gtex_EUR_individuals_all_normalized_samples.txt \
-o ${datadir}/data_prep/gtex_EUR_normalized_expression.txt
gzip ${datadir}/data_prep/gtex_EUR_normalized_expression.txt
Call outliers on African individuals
Saved to ${datadir}/outlier_calling/AFR/gtexV8.AFR.outlier.controls.v8ciseQTLs_globalOutliersRemoved.txt
Rscript code/preprocessing/outlier_calling/call_outliers.R \
--Z.SCORES=${datadir}/data_prep/gtex_AFR_normalized_expression.txt.gz \
--OUT=${datadir}/outlier_calling/AFR/gtexV8.AFR.outlier.controls.v8ciseQTLs.txt \
--POP=${datadir}/data_prep/gtex_v8_wgs_individuals_AFR.txt \
--N.PHEN=5 --ZTHRESH=3
# Remove global outliers
Rscript code/preprocessing/outlier_calling/identify_global_outliers.R \
--OUTLIERS=${datadir}/outlier_calling/AFR/gtexV8.AFR.outlier.controls.v8ciseQTLs.txt \
--METHOD=proportion
Call outliers on European individuals
Saved to ${datadir}/outlier_calling/EUR/gtexV8.EUR.outlier.controls.v8ciseQTLs_globalOutliersRemoved.txt
Rscript code/preprocessing/outlier_calling/call_outliers.R \
--Z.SCORES=${datadir}/data_prep/gtex_EUR_normalized_expression.txt.gz \
--OUT=${datadir}/outlier_calling/EUR/gtexV8.EUR.outlier.controls.v8ciseQTLs.txt \
--POP=${datadir}/data_prep/gtex_v8_wgs_individuals_EUR.txt \
--N.PHEN=5 --ZTHRESH=3
# Remove global outliers
Rscript code/preprocessing/outlier_calling/identify_global_outliers.R \
--OUTLIERS=${datadir}/outlier_calling/EUR/gtexV8.EUR.outlier.controls.v8ciseQTLs.txt \
--METHOD=proportion
Requires bcftools
and bedtools
Rare variants file will be saved to {datadir}/rare_variants/gene-AFR-rv.txt
bash code/preprocessing/rare_variants/find_rare_variants_changed_outliers_only.sh \
-d ${datadir}/rare_variants_pop_norm \
-g ${rawdir}/GTEx/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.SHAPEIT2_phased.vcf.gz \
-r ${datadir}/data_prep/gencode.v26.GRCh38.genes_padded10kb_PCandlinc_only.bed \
-l ${datadir}/data_prep/gtex_v8_wgs_individuals_AFR.txt \
-p "AFR" \
-o ${datadir}/outlier_calling/AFR/gtexV8.AFR.outlier.controls.v8ciseQTLs_globalOutliersRemoved.txt
Rare variants file will be saved to {datadir}/rare_variants/gene-EUR-rv.txt
bash code/preprocessing/rare_variants/find_rare_variants_changed_outliers_only.sh \
-d ${datadir}/rare_variants_pop_norm \
-g ${rawdir}/GTEx/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.SHAPEIT2_phased.vcf.gz \
-r ${datadir}/data_prep/gencode.v26.GRCh38.genes_padded10kb_PCandlinc_only.bed \
-l ${datadir}/data_prep/gtex_v8_wgs_individuals_EUR.txt \
-p "EUR" \
-o ${datadir}/outlier_calling/EUR/gtexV8.EUR.outlier.controls.v8ciseQTLs_globalOutliersRemoved.txt