Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pylint error fix #21

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
language: python
python:
- "3.5"
- "3.6"

install:
- pip install -r requirements.txt
- pip install flake8 pylint

script:
- flake8
- pylint *.py
- pylint *.py --exit-zero
165 changes: 88 additions & 77 deletions access_data_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,152 +3,163 @@
import progressbar
import sys

def update_protein(gene_seq,gene):
t=0
while(t!=2):

def update_protein(gene_seq, gene):
t = 0
while(t != 2):
try:
server = "https://rest.ensembl.org"
ext = "/sequence/id/"+str(gene)+"?type=protein;multiple_sequences=1"
ext = "/sequence/id/" + \
str(gene)+"?type=protein;multiple_sequences=1"

r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
r = requests.get(
server+ext, headers={"Content-Type": "application/json"})

if not r.ok:
r.raise_for_status()
sys.exit()
r=r.json()
if len(r)==1:
r=dict(r[0])
gene_seq[gene]=str(r["seq"])
r = r.json()
if len(r) == 1:
r = dict(r[0])
gene_seq[gene] = str(r["seq"])
return
else:
maxi=0
maxlen=0
maxi = 0
maxlen = 0
for i in range(len(r)):
m=r[i]
m=dict(m)
if len(m["seq"])>maxlen:
maxi=i
r=dict(r[maxi])
gene_seq[gene]=str(r["seq"])
m = r[i]
m = dict(m)
if len(m["seq"]) > maxlen:
maxi = i
r = dict(r[maxi])
gene_seq[gene] = str(r["seq"])
return
except :
t+=1
#print("\nError:",e)
except Exception:
t += 1
# print("\nError:",e)
continue
gene_seq[gene]=""
gene_seq[gene] = ""


def update_rest_protein(data,fname):
gids={}
with open("processed/not_found_"+fname+".json","r") as file:
gids=dict(json.load(file))
def update_rest_protein(data, fname):
gids = {}
with open("processed/not_found_"+fname+".json", "r") as file:
gids = dict(json.load(file))

gids=list(gids.keys())
gids = list(gids.keys())

geneseq={}
geneseq = {}

server = "https://rest.ensembl.org"
ext = "/sequence/id?type=protein"
headers={ "Content-Type" : "application/json", "Accept" : "application/json"}
headers = {"Content-Type": "application/json",
"Accept": "application/json"}

for i in progressbar.progressbar(range(0,len(gids)-50,50)):
ids=dict(ids=list(gids[i:i+50]))
for i in progressbar.progressbar(range(0, len(gids)-50, 50)):
ids = dict(ids=list(gids[i:i+50]))
while(1):
try:
r = requests.post(server+ext, headers=headers, data=str(json.dumps(ids)))
r = requests.post(server+ext, headers=headers,
data=str(json.dumps(ids)))
if not r.ok:
r.raise_for_status()
gs=r.json()
tgs={}
gs = r.json()
tgs = {}
for g in gs:
tgs[g["query"]]=g["seq"]
tgs[g["query"]] = g["seq"]
geneseq.update(tgs)
break
except Exception as e:
print("Error:",e)
print("Error:", e)
continue

data.update(geneseq)
for genes in gids:
try:
_=data[genes]
except:
_ = data[genes]
except Exception:
print(genes)
update_protein(data,genes)
update_protein(data, genes)

print("Protein Sequences Updated Successfully")
return data

def update(gene_seq,gene):
t=0
while(t!=2):


def update(gene_seq, gene):
t = 0
while(t != 2):
try:
server = "https://rest.ensembl.org"
ext = "/sequence/id/"+str(gene)+"?type=cds;multiple_sequences=1"

r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
r = requests.get(
server+ext, headers={"Content-Type": "application/json"})

if not r.ok:
r.raise_for_status()
sys.exit()
r=r.json()
if len(r)==1:
r=dict(r[0])
gene_seq[gene]=str(r["seq"])
r = r.json()
if len(r) == 1:
r = dict(r[0])
gene_seq[gene] = str(r["seq"])
return
else:
maxi=0
maxlen=0
maxi = 0
maxlen = 0
for i in range(len(r)):
m=r[i]
m=dict(m)
if len(m["seq"])>maxlen:
maxi=i
r=dict(r[maxi])
gene_seq[gene]=str(r["seq"])
m = r[i]
m = dict(m)
if len(m["seq"]) > maxlen:
maxi = i
r = dict(r[maxi])
gene_seq[gene] = str(r["seq"])
return
except Exception as e:
t+=1
#print("\nError:",e)
except Exception:
t += 1
# print("\nError:",e)
continue
gene_seq[gene]=""
gene_seq[gene] = ""


def update_rest(data,fname):
gids={}
with open("processed/not_found_"+fname+".json","r") as file:
gids=dict(json.load(file))
def update_rest(data, fname):
gids = {}
with open("processed/not_found_"+fname+".json", "r") as file:
gids = dict(json.load(file))

gids=list(gids.keys())
gids = list(gids.keys())

geneseq={}
geneseq = {}

server = "https://rest.ensembl.org"
ext = "/sequence/id?type=cds"
headers={ "Content-Type" : "application/json", "Accept" : "application/json"}
headers = {"Content-Type": "application/json",
"Accept": "application/json"}

for i in progressbar.progressbar(range(0,len(gids)-50,50)):
ids=dict(ids=list(gids[i:i+50]))
for i in progressbar.progressbar(range(0, len(gids)-50, 50)):
ids = dict(ids=list(gids[i:i+50]))
while(1):
try:
r = requests.post(server+ext, headers=headers, data=str(json.dumps(ids)))
r = requests.post(server+ext, headers=headers,
data=str(json.dumps(ids)))
if not r.ok:
r.raise_for_status()
gs=r.json()
tgs={}
gs = r.json()
tgs = {}
for g in gs:
tgs[g["query"]]=g["seq"]
tgs[g["query"]] = g["seq"]
geneseq.update(tgs)
break
except Exception as e:
print("Error:",e)
print("Error:", e)
continue

data.update(geneseq)
for genes in gids:
try:
_=data[genes]
except:
_ = data[genes]
except Exception:
print(genes)
update(data,genes)
update(data, genes)

print("Gene Sequences Updated Successfully")
return data
return data
2 changes: 1 addition & 1 deletion finalize_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def prepare_features(a_h, d_h, sptree, label):
branch_length_species, \
branch_length_homology_species, \
distance, dist_p_s, dist_p_hs = create_tree_data(
sptree, df)
sptree, df)
assert(len(branch_length_species) == len(df))
assert(len(sml) == len(distance))

Expand Down
69 changes: 35 additions & 34 deletions ftpg.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,81 +3,82 @@
import os
import sys
import urllib.request as urllib
import requests

def download_data(x,dir_name):
fname=x.split("/")[-1]
path=os.path.join(dir_name,fname)
urllib.urlretrieve(x,path)

def get_data_file(file,dir):
def download_data(x, dir_name):
fname = x.split("/")[-1]
path = os.path.join(dir_name, fname)
urllib.urlretrieve(x, path)


def get_data_file(file, dir):
if not os.path.isfile(file):
print("The specified file does not exist!!!")
sys.exit(1)

with open(file,"r")as f:
lf=f.read().splitlines()
with open(file, "r")as f:
lf = f.read().splitlines()

if not os.path.exists(dir):
os.mkdir(dir)
for x in progressbar.progressbar(lf):
download_data(x,dir)

download_data(x, dir)


#This wil download all the fasta files for the coding sequences. To change the directory, change the argument in the get_data_file argument.
host ="ftp.ensembl.org"
# This will download all the fasta files for the coding sequences.
# To change the target directory, change the second argument passed to get_data_file.
host = "ftp.ensembl.org"
user = "anonymous"
password = ""

print("Connecting to {}".format(host))
ftp = FTP(host)
ftp.login(user, password)
print("Connected to {}".format(host))
base_link="ftp://ftp.ensembl.org"
#find sequences of all the cds files
l=ftp.nlst("/pub/release-96/fasta")
lt=[]
for x in l:
y=ftp.nlst(x+"/cds")
base_link = "ftp://ftp.ensembl.org"
# find sequences of all the cds files
list_of_files = ftp.nlst("/pub/release-96/fasta")
lt = []
for x in list_of_files:
y = ftp.nlst(x+"/cds")
for z in y:
if z.endswith(".cds.all.fa.gz"):
lt.append(z)

with open("seq_link.txt","w") as file:
with open("seq_link.txt", "w") as file:
for x in lt:
file.write(base_link+x)
file.write("\n")

#find all the files with protein sequences
l=ftp.nlst("/pub/release-96/fasta")
lt=[]
for x in progressbar.progressbar(l):
y=ftp.nlst(x+"/pep")
# find all the files with protein sequences
list_of_files = ftp.nlst("/pub/release-96/fasta")
lt = []
for x in progressbar.progressbar(list_of_files):
y = ftp.nlst(x+"/pep")
for z in y:
if z.endswith(".pep.all.fa.gz"):
lt.append(z)

with open("protein_seq.txt","w") as file:
with open("protein_seq.txt", "w") as file:
for x in lt:
file.write(base_link+x)
file.write("\n")
#get link of all the gtf files
l=ftp.nlst("/pub/release-96/gtf")
lt=[]
for x in l:
y=ftp.nlst(x)
# get the links of all the gtf files
list_of_files = ftp.nlst("/pub/release-96/gtf")
lt = []
for x in list_of_files:
y = ftp.nlst(x)
for z in y:
if z.endswith(".96.gtf.gz"):
lt.append(z)

with open("gtf_link.txt","w") as file:
with open("gtf_link.txt", "w") as file:
for x in lt:
file.write(base_link+x)
file.write("\n")

print("Downloading Data")
get_data_file("gtf_link.txt","data")
get_data_file("seq_link.txt","geneseq")
get_data_file("protein_seq.txt","pro_seq")
get_data_file("gtf_link.txt", "data")
get_data_file("seq_link.txt", "geneseq")
get_data_file("protein_seq.txt", "pro_seq")
print("Download Complete.................")
Loading