-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeneIDs_AnnoQ_updated.R
145 lines (127 loc) · 4.68 KB
/
geneIDs_AnnoQ_updated.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
if (!require("httr", quietly = TRUE))
install.packages("httr")
if (!require("jsonlite", quietly = TRUE))
install.packages("jsonlite")
library(httr)
library(jsonlite)
# Define the Base URL
Annotations_URL <- "http://annoq.org/api-v2/graphql"
# Convert a list of fields into a GraphQL query format
create_annotations_query_string <- function(annotations) {
paste(annotations, collapse = "\n")
}
# Perform a GraphQL query
perform_graphql_query <- function(query) {
response <- POST(
Annotations_URL,
content_type_json(),
body = list(query = query),
encode = "json"
)
stop_for_status(response)
content(response, "text", encoding = "UTF-8")
}
# Desired fields
annotations_to_retrieve <- c(
"rs_dbSNP151",
"ANNOVAR_ensembl_Gene_ID",
"ANNOVAR_ensembl_Closest_gene",
"SnpEff_ensembl_Gene_ID",
"VEP_ensembl_Gene_ID",
"ANNOVAR_refseq_Gene_ID",
"ANNOVAR_refseq_Closest_gene",
"SnpEff_refseq_Gene_ID",
"VEP_refseq_Gene_Name",
"enhancer_linked_genes"
)
# Get SNP by chromosome, start, and end
snpQuery <- function(chr, start, end, annotations_to_retrieve) {
annotations_query_string <- create_annotations_query_string(annotations_to_retrieve)
query <- sprintf('
query {
get_SNPs_by_chromosome(chr: "%s", start: %d, end: %d, query_type_option: SNPS) {
snps {
%s
}
}
}', chr, start, end, annotations_query_string)
response_content <- perform_graphql_query(query)
data <- fromJSON(response_content, flatten = TRUE)
data$data$get_SNPs_by_chromosome$snps
}
# Function to process and extract gene annotations for a single SNP position
process_annotations <- function(chr, start, end) {
query_result <- snpQuery(chr, start, end, annotations_to_retrieve)
if (!is.null(query_result) && length(query_result) > 0) {
# Process Ensembl genes
ensembl_genes <- unlist(query_result[grep("ensembl", names(query_result))])
ensembl_genes <- ensembl_genes[!is.na(ensembl_genes)]
ensembl_genes <- unlist(strsplit(as.character(ensembl_genes), "\\|"))
ensembl_genes <- unlist(strsplit(ensembl_genes, "[-:]"))
ensembl_genes <- ensembl_genes[!is.na(ensembl_genes)]
ensembl_genes <- ensembl_genes[grepl("^ENSG", ensembl_genes)]
ensembl_genes <- unique(ensembl_genes)
# Process RefSeq genes
refseq_genes <- unlist(query_result[grep("refseq", names(query_result))])
refseq_genes <- refseq_genes[!is.na(refseq_genes)]
refseq_genes <- unlist(strsplit(as.character(refseq_genes), "\\|"))
refseq_genes <- unlist(lapply(refseq_genes, function(gene) {
parts <- strsplit(gene, "-")[[1]]
parts <- parts[!is.na(parts)]
final_parts <- c()
current_part <- parts[1]
for (i in 2:length(parts)) {
if (grepl("\\d$", current_part) && grepl("^\\d", parts[i])) {
current_part <- paste0(current_part, "-", parts[i])
} else {
final_parts <- c(final_parts, current_part)
current_part <- parts[i]
}
}
final_parts <- c(final_parts, current_part)
return(final_parts)
}))
refseq_genes <- refseq_genes[!is.na(refseq_genes)]
refseq_genes <- unique(refseq_genes)
# Process enhancers
enhancers <- query_result$enhancer_linked_genes
enhancers <- enhancers[!is.na(enhancers)]
enhancers <- unlist(strsplit(as.character(enhancers), ";"))
enhancers <- unique(enhancers)
# Process rs_dbSNP151
rs_dbSNP151 <- query_result$rs_dbSNP151
rs_dbSNP151 <- unique(rs_dbSNP151)
# Combine and format output
ensembl_genes <- paste(ensembl_genes[!is.na(ensembl_genes)], collapse = ",")
refseq_genes <- paste(refseq_genes[!is.na(refseq_genes)], collapse = ",")
enhancers <- paste(enhancers[!is.na(enhancers)], collapse = ",")
rs_dbSNP151 <- paste(rs_dbSNP151[!is.na(rs_dbSNP151)], collapse = ",")
return(data.frame(
chr = chr,
start = start,
end = end,
Ensembl_Genes = ensembl_genes,
RefSeq_Genes = refseq_genes,
Enhancers = enhancers,
rs_dbSNP151 = rs_dbSNP151,
stringsAsFactors = FALSE
))
} else {
return(data.frame(
chr = chr,
start = start,
end = end,
Ensembl_Genes = "",
RefSeq_Genes = "",
Enhancers = "",
rs_dbSNP151 = "",
stringsAsFactors = FALSE
))
}
}
# Function to match gene annotations back to the original df by SNP position
match_annotations_to_df <- function(df, annotations_df) {
# Merge the annotations with the original df on SNP position
merged_df <- merge(df, annotations_df, by = c("chr", "start", "end"), all.x = TRUE)
return(merged_df)
}