Skip to content

Commit

Permalink
Migrate from GIs to Accessions. Fixes #14.
Browse files Browse the repository at this point in the history
  • Loading branch information
StuntsPT committed Oct 21, 2016
1 parent 00226a5 commit b70d434
Showing 1 changed file with 31 additions and 7 deletions.
38 changes: 31 additions & 7 deletions back_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,10 @@ def record_processor(self, record):
them.
"""
count = int(record["Count"]) # Int
IDs = record["IdList"] # List
if self.database != "genome":
IDs = self.gi_to_acc(record["IdList"]) # List
else:
IDs = record["IdList"] # List
webenv = record["WebEnv"] # String
query_key = record["QueryKey"] #String

Expand All @@ -69,6 +72,25 @@ def record_processor(self, record):
return count, IDs, webenv, query_key


def gi_to_acc(self, id_list):
    """
    Convert a list of deprecated GI identifiers into accession numbers.

    NCBI deprecated the GI identifiers in favour of only using accession
    numbers. However, esearch still returns a GI list, so a new efetch
    query (rettype="acc") is made against self.database to translate it.

    id_list: list of GI identifiers, as returned by esearch.
    Returns a list of strings; every non-blank line of the efetch
    responses is taken as one accession. An empty id_list performs no
    query and returns an empty list.
    """
    # There is a limit of 10k IDs for each query, so the data is sent in
    # slices of at most that size.
    chunk_size = 10000
    accessions = []
    for start in range(0, len(id_list), chunk_size):
        fetch_handle = Entrez.efetch(db=self.database,
                                     id=id_list[start:start + chunk_size],
                                     rettype="acc", retmode="text")
        try:
            for line in fetch_handle:
                accession = line.strip()
                # Skip blank lines so that no empty "accession" is returned.
                if accession:
                    accessions.append(accession)
        finally:
            # Always release the connection, even if reading fails midway.
            fetch_handle.close()

    return accessions


def main_organizer(self, count, IDs, webenv, query_key, b_size, Run):
"""
Defines what tasks need to be performed, handles NCBI server errors and
Expand Down Expand Up @@ -168,7 +190,7 @@ def error_finder(self, target_file):

for lines in target_handle:
if lines.startswith(">"):
ID = re.search("gi\|.*?\|", lines).group(0)[3:-1]
ID = lines[1:lines.index(" ")]
verified_ids.add(ID)

target_handle.close()
Expand Down Expand Up @@ -216,10 +238,10 @@ def translate_genome(self, gilist):
http://www.ncbi.nlm.nih.gov/books/NBK25499/
"""
import urllib
from re import search
nuc_gi_list = []
query_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" + \
"elink.fcgi?dbfrom=genome&db=nucleotide&id="
print(gilist)
query_url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
"elink.fcgi?dbfrom=genome&db=nucleotide&id=")
for genome_id in gilist:
tmplist = []
xml = urllib.request.urlopen(query_url + genome_id)
Expand All @@ -228,7 +250,10 @@ def translate_genome(self, gilist):
tmplist.append(re.search("<Id>.*</Id>", content.decode('utf-8')).group()[4:-5])
nuc_gi_list += tmplist[1:]

return nuc_gi_list
self.database = "nucleotide"
nuc_acc_list = self.gi_to_acc(nuc_gi_list)

return nuc_acc_list


def run_everything(self):
Expand All @@ -247,7 +272,6 @@ def run_everything(self):
if self.database == "genome":
IDs = self.translate_genome(IDs)
count = len(IDs)
self.database = "nucleotide"
self.run = 2
self.main_organizer(count, IDs, webenv, query_key, batch_size, self.run)

Expand Down

0 comments on commit b70d434

Please sign in to comment.