forked from galerp/hpo_sim_gene
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhpo_dist.R
149 lines (95 loc) · 4.13 KB
/
hpo_dist.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
library(tidyverse)
library(memoise)
start <- Sys.time()
source("hpo_dist_helpers.R")
######
#STEP 1: Create base and prop table
######
message(" \n Step 1 is executed: Hpo Base and Prop Table is created \n \n ")
exp321 <- exp321 %>% mutate(famID = gsub("-","_", famID))
pat_table_base <- pat_base(exp321)
pat_table_prop <- pat_prop(pat_table_base)
######
#STEP 2: Calculate Local IC
######
message("\n Step 1 is done. Calculating Local Information Content \n \n ")
base_IC <- base_calc_IC(pat_table_base)
prop_IC <- prop_calc_IC(pat_table_prop)
local_IC <- local_calc_IC(allHPOs)
write.csv(local_IC, paste0(input.yaml$output_dir,"/Local_IC.csv"),row.names = F)
#############
#STEP 3: Simlilarity Analysis - sim_max or sim_av
############
message("\n Local_IC file is written. Calculating the similarity_matrix \n \n ")
ic <- local_IC[,c("term","Propagated.local.IC")]
hpo_all <- allHPOs[,c("term","definition")]
ic2 <- merge(ic,hpo_all,by='term',all.x=TRUE,all.y=FALSE)
names(local_IC)[1] = "HPO"
ic2 <- local_IC[,c("HPO","Propagated.local.IC")]
memo_mica <- memoise(mica)
############
#Run Similarity Analysis on entire cohort
############
sim_score <- Compare_Cohort(pat_table_base)
write.csv(sim_score,paste0(input.yaml$output_dir,"/sim_matrix.csv"),row.names = T)
###########
#STEP 4: Create Distribution of Similarity Scores
###########
message("\n Sim score file is written. Permutation Analysis is to be followed \n \n ")
rownames(sim_score) = names(sim_score)
num_iterations = input.yaml$num_of_iterations
n2_100k <- sim_pat_draw(sim_score, 2,num_iterations)
n3_100k <- sim_pat_draw(sim_score, 3,num_iterations)
n4_100k <- sim_pat_draw(sim_score, 4,num_iterations)
n5_100k <- sim_pat_draw(sim_score, 5,num_iterations)
n7_100k <- sim_pat_draw(sim_score, 7,num_iterations)
###########
#STEP 4: Generate P-values for Genes
###########
message("\n Permutation is done. p-value for the genes is calculated \n \n ")
names(variant)[1] <- "famID"
variant <- variant %>% mutate(famID = gsub("-","_", famID))
famIDs_var <- variant$famID %>% unique %>% as.data.frame %>%
dplyr::rename('famID' = '.') %>% dplyr::mutate(var = "variant")
famIDs_sim <- sim_score %>% rownames %>% as.data.frame %>%
dplyr::rename('famID' = '.') %>% dplyr::mutate(sim = "sim_score")
fam_combined <- famIDs_sim %>% dplyr::full_join(famIDs_var)
#Filtering data
##Trios without sim_scores and without variants
no_sim <- as.vector(fam_combined$famID[is.na(fam_combined$sim) == TRUE])
no_var <- as.vector(fam_combined$famID[is.na(fam_combined$var) == TRUE])
#Trios with sim_scores
all_sim <- as.vector(fam_combined$famID[is.na(fam_combined$sim) == FALSE])
variant_sim <- variant %>% filter(famID %in% all_sim)
denovo <- denovo_calc(variant_sim)
#Table of denovos with famID and gene
tab1 <- denovo %>% dplyr::select(famID, Gene.refGene) %>% unique
#list of all genes
all_genes <- tab1 %>% dplyr::count(Gene.refGene) %>%
dplyr::rename(gene = Gene.refGene, Freq = n) %>%
filter(Freq > 1)
#Creating dataframe of similarity comparisons with every combination of patient pairs
##with the same denove gene
#initialize with first gene
pair_corrected <- gene_df(all_genes$gene[1])
#Create for all genes
for (i in 2:nrow(all_genes)){
temp <- gene_df(all_genes$gene[i])
pair_corrected <- pair_corrected %>% bind_rows(temp)
}
#Determine number of pairs per gene
gene_x <- unique(pair_corrected$gene)
gene_count <- as.data.frame(matrix(ncol=9,nrow=length(gene_x)))
names(gene_count) <- c("gene","n_pats","pairs","av_sim","median_sim","mode_sim","p_av","p_median","p_mode")
gene_count <- gene_count %>% mutate(gene = gene_x)
#Finding the average, median, and mode similarity,for each gene in the cohort
#########
#Find the p_value for each gene's similarity
## using the average, median, and mode similarity,for each gene from gene_count table
#########
#loop through each gene for p_av, p_med
gene_stat = gene_compute(gene_count)
write.csv(gene_stat,paste0(input.yaml$output_dir,"/gene_count.csv"),row.names = F)
message("\n The Entire script is now run successfully. Please find the Final output file gene_count.csv in the output directory \n ")
stop = Sys.time()
stop - start