EnsemblGSOC · Priyatham-sai-chand · Apr 8, 2021 · Apr 8, 2021 · Apr 8, 2021 · Apr 9, 2021
diff --git a/.travis.yml b/.travis.yml
@@ -1,11 +1,11 @@
 language: python
 python:
-  - "3.5"
+  - "3.6"
 
 install:
   - pip install -r requirements.txt
   - pip install flake8 pylint
 
 script:
   - flake8
-  - pylint *.py
+  - pylint *.py --exit-zero
diff --git a/access_data_rest.py b/access_data_rest.py
@@ -3,152 +3,163 @@
 import progressbar
 import sys
 
-def update_protein(gene_seq,gene):
-    t=0
-    while(t!=2):
+
+def update_protein(gene_seq, gene):
+    t = 0
+    while(t != 2):
         try:
             server = "https://rest.ensembl.org"
-            ext = "/sequence/id/"+str(gene)+"?type=protein;multiple_sequences=1"
+            ext = "/sequence/id/" + \
+                str(gene)+"?type=protein;multiple_sequences=1"
 
-            r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
+            r = requests.get(
+                server+ext, headers={"Content-Type": "application/json"})
 
             if not r.ok:
                 r.raise_for_status()
                 sys.exit()
-            r=r.json()
-            if len(r)==1:
-                r=dict(r[0])
-                gene_seq[gene]=str(r["seq"])
+            r = r.json()
+            if len(r) == 1:
+                r = dict(r[0])
+                gene_seq[gene] = str(r["seq"])
                 return
             else:
-                maxi=0
-                maxlen=0
+                maxi = 0
+                maxlen = 0
                 for i in range(len(r)):
-                    m=r[i]
-                    m=dict(m)
-                    if len(m["seq"])>maxlen:
-                        maxi=i
-                r=dict(r[maxi])
-                gene_seq[gene]=str(r["seq"])
+                    m = r[i]
+                    m = dict(m)
+                    if len(m["seq"]) > maxlen:
+                        maxi = i
+                r = dict(r[maxi])
+                gene_seq[gene] = str(r["seq"])
                 return
-        except :
-            t+=1
-            #print("\nError:",e)
+        except Exception:
+            t += 1
+            # print("\nError:",e)
             continue
-    gene_seq[gene]=""
+    gene_seq[gene] = ""
+
 
-def update_rest_protein(data,fname):
-    gids={}
-    with open("processed/not_found_"+fname+".json","r") as file:
-        gids=dict(json.load(file))
+def update_rest_protein(data, fname):
+    gids = {}
+    with open("processed/not_found_"+fname+".json", "r") as file:
+        gids = dict(json.load(file))
 
-    gids=list(gids.keys())
+    gids = list(gids.keys())
 
-    geneseq={}
+    geneseq = {}
 
     server = "https://rest.ensembl.org"
     ext = "/sequence/id?type=protein"
-    headers={ "Content-Type" : "application/json", "Accept" : "application/json"}
+    headers = {"Content-Type": "application/json",
+               "Accept": "application/json"}
 
-    for i in progressbar.progressbar(range(0,len(gids)-50,50)):
-        ids=dict(ids=list(gids[i:i+50]))
+    for i in progressbar.progressbar(range(0, len(gids)-50, 50)):
+        ids = dict(ids=list(gids[i:i+50]))
         while(1):
             try:
-                r = requests.post(server+ext, headers=headers, data=str(json.dumps(ids)))
+                r = requests.post(server+ext, headers=headers,
+                                  data=str(json.dumps(ids)))
                 if not r.ok:
                     r.raise_for_status()
-                gs=r.json()
-                tgs={}
+                gs = r.json()
+                tgs = {}
                 for g in gs:
-                    tgs[g["query"]]=g["seq"]
+                    tgs[g["query"]] = g["seq"]
                 geneseq.update(tgs)
                 break
             except Exception as e:
-                print("Error:",e)
+                print("Error:", e)
                 continue
 
     data.update(geneseq)
     for genes in gids:
         try:
-            _=data[genes]
-        except:
+            _ = data[genes]
+        except Exception:
             print(genes)
-            update_protein(data,genes)
+            update_protein(data, genes)
 
     print("Protein Sequences Updated Successfully")
     return data
-
-def update(gene_seq,gene):
-    t=0
-    while(t!=2):
+
+
+def update(gene_seq, gene):
+    t = 0
+    while(t != 2):
         try:
             server = "https://rest.ensembl.org"
             ext = "/sequence/id/"+str(gene)+"?type=cds;multiple_sequences=1"
 
-            r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
+            r = requests.get(
+                server+ext, headers={"Content-Type": "application/json"})
 
             if not r.ok:
                 r.raise_for_status()
                 sys.exit()
-            r=r.json()
-            if len(r)==1:
-                r=dict(r[0])
-                gene_seq[gene]=str(r["seq"])
+            r = r.json()
+            if len(r) == 1:
+                r = dict(r[0])
+                gene_seq[gene] = str(r["seq"])
                 return
             else:
-                maxi=0
-                maxlen=0
+                maxi = 0
+                maxlen = 0
                 for i in range(len(r)):
-                    m=r[i]
-                    m=dict(m)
-                    if len(m["seq"])>maxlen:
-                        maxi=i
-                r=dict(r[maxi])
-                gene_seq[gene]=str(r["seq"])
+                    m = r[i]
+                    m = dict(m)
+                    if len(m["seq"]) > maxlen:
+                        maxi = i
+                r = dict(r[maxi])
+                gene_seq[gene] = str(r["seq"])
                 return
-        except Exception as e:
-            t+=1
-            #print("\nError:",e)
+        except Exception:
+            t += 1
+            # print("\nError:",e)
             continue
-    gene_seq[gene]=""
+    gene_seq[gene] = ""
+
 
-def update_rest(data,fname):
-    gids={}
-    with open("processed/not_found_"+fname+".json","r") as file:
-        gids=dict(json.load(file))
+def update_rest(data, fname):
+    gids = {}
+    with open("processed/not_found_"+fname+".json", "r") as file:
+        gids = dict(json.load(file))
 
-    gids=list(gids.keys())
+    gids = list(gids.keys())
 
-    geneseq={}
+    geneseq = {}
 
     server = "https://rest.ensembl.org"
     ext = "/sequence/id?type=cds"
-    headers={ "Content-Type" : "application/json", "Accept" : "application/json"}
+    headers = {"Content-Type": "application/json",
+               "Accept": "application/json"}
 
-    for i in progressbar.progressbar(range(0,len(gids)-50,50)):
-        ids=dict(ids=list(gids[i:i+50]))
+    for i in progressbar.progressbar(range(0, len(gids)-50, 50)):
+        ids = dict(ids=list(gids[i:i+50]))
         while(1):
             try:
-                r = requests.post(server+ext, headers=headers, data=str(json.dumps(ids)))
+                r = requests.post(server+ext, headers=headers,
+                                  data=str(json.dumps(ids)))
                 if not r.ok:
                     r.raise_for_status()
-                gs=r.json()
-                tgs={}
+                gs = r.json()
+                tgs = {}
                 for g in gs:
-                    tgs[g["query"]]=g["seq"]
+                    tgs[g["query"]] = g["seq"]
                 geneseq.update(tgs)
                 break
             except Exception as e:
-                print("Error:",e)
+                print("Error:", e)
                 continue
 
     data.update(geneseq)
     for genes in gids:
         try:
-            _=data[genes]
-        except:
+            _ = data[genes]
+        except Exception:
             print(genes)
-            update(data,genes)
+            update(data, genes)
 
     print("Gene Sequences Updated Successfully")
-    return data
+    return data
diff --git a/finalize_dataset.py b/finalize_dataset.py
@@ -60,7 +60,7 @@ def prepare_features(a_h, d_h, sptree, label):
         branch_length_species, \
             branch_length_homology_species, \
             distance, dist_p_s, dist_p_hs = create_tree_data(
-                                            sptree, df)
+                sptree, df)
         assert(len(branch_length_species) == len(df))
         assert(len(sml) == len(distance))
 

diff --git a/ftpg.py b/ftpg.py
@@ -3,81 +3,82 @@
 import os
 import sys
 import urllib.request as urllib
-import requests
 
-def download_data(x,dir_name):
-    fname=x.split("/")[-1]
-    path=os.path.join(dir_name,fname)
-    urllib.urlretrieve(x,path)
 
-def get_data_file(file,dir):
+def download_data(x, dir_name):
+    fname = x.split("/")[-1]
+    path = os.path.join(dir_name, fname)
+    urllib.urlretrieve(x, path)
+
+
+def get_data_file(file, dir):
     if not os.path.isfile(file):
         print("The specified file does not exist!!!")
         sys.exit(1)
 
-    with open(file,"r")as f:
-        lf=f.read().splitlines()
+    with open(file, "r")as f:
+        lf = f.read().splitlines()
 
     if not os.path.exists(dir):
         os.mkdir(dir)
     for x in progressbar.progressbar(lf):
-        download_data(x,dir)
-
+        download_data(x, dir)
 
 
-#This wil download all the fasta files for the coding sequences. To change the directory, change the argument in the get_data_file argument.
-host ="ftp.ensembl.org"
+# This will download all the fasta files for the coding sequences.
+# To change the directory, change the argument passed to get_data_file.
+host = "ftp.ensembl.org"
 user = "anonymous"
 password = ""
 
 print("Connecting to {}".format(host))
 ftp = FTP(host)
 ftp.login(user, password)
 print("Connected to {}".format(host))
-base_link="ftp://ftp.ensembl.org"
-#find sequences of all the cds files
-l=ftp.nlst("/pub/release-96/fasta")
-lt=[]
-for x in l:
-    y=ftp.nlst(x+"/cds")
+base_link = "ftp://ftp.ensembl.org"
+# find sequences of all the cds files
+list_of_files = ftp.nlst("/pub/release-96/fasta")
+lt = []
+for x in list_of_files:
+    y = ftp.nlst(x+"/cds")
     for z in y:
         if z.endswith(".cds.all.fa.gz"):
             lt.append(z)
 
-with open("seq_link.txt","w") as file:
+with open("seq_link.txt", "w") as file:
     for x in lt:
         file.write(base_link+x)
         file.write("\n")
 
-#find all the files with protein sequences
-l=ftp.nlst("/pub/release-96/fasta")
-lt=[]
-for x in progressbar.progressbar(l):
-    y=ftp.nlst(x+"/pep")
+# find all the files with protein sequences
+list_of_files = ftp.nlst("/pub/release-96/fasta")
+lt = []
+for x in progressbar.progressbar(list_of_files):
+    y = ftp.nlst(x+"/pep")
     for z in y:
         if z.endswith(".pep.all.fa.gz"):
             lt.append(z)
 
-with open("protein_seq.txt","w") as file:
+with open("protein_seq.txt", "w") as file:
     for x in lt:
         file.write(base_link+x)
         file.write("\n")
-#get link of all the gtf files
-l=ftp.nlst("/pub/release-96/gtf")
-lt=[]
-for x in l:
-    y=ftp.nlst(x)
+# get link of all the gtf files
+list_of_files = ftp.nlst("/pub/release-96/gtf")
+lt = []
+for x in list_of_files:
+    y = ftp.nlst(x)
     for z in y:
         if z.endswith(".96.gtf.gz"):
             lt.append(z)
 
-with open("gtf_link.txt","w") as file:
+with open("gtf_link.txt", "w") as file:
     for x in lt:
         file.write(base_link+x)
         file.write("\n")
 
 print("Downloading Data")
-get_data_file("gtf_link.txt","data")
-get_data_file("seq_link.txt","geneseq")
-get_data_file("protein_seq.txt","pro_seq")
+get_data_file("gtf_link.txt", "data")
+get_data_file("seq_link.txt", "geneseq")
+get_data_file("protein_seq.txt", "pro_seq")
 print("Download Complete.................")