Refactor coronavirus_biogrid for readability

DaddyWesker · May 15, 2020 · d1e43f6
1 parent 00c3a1a
commit d1e43f6
Showing 1 changed file with 105 additions and 68 deletions.
diff --git a/coronavirus_biogrid.py b/coronavirus_biogrid.py
@@ -17,35 +17,73 @@
 import metadata
 from datetime import date
 import zipfile
+from atomwrappers import *
+
 
 def checkdisc(diction, key, value):
-  try:
-    diction.setdefault(key,[]).append(value)
-  except KeyError:
-    return "key error"
+    try:
+        diction.setdefault(key, []).append(value)
+    except KeyError:
+        return "key error"
+
 
-def evaLink(node1, node1_type, node2, node2_type, predicate, prefix1="", prefix2="",symmetric=False, stv=""):
+def evaLink(node1, node1_type, node2, node2_type, predicate, prefix1="", prefix2="", symmetric=False, stv=""):
     if not (str(node1) in ["-", "nan"] or str(node2) in ["-", "nan"]):
         if symmetric:
             list_type = "SetLink"
         else:
             list_type = "ListLink"
         return ("(EvaluationLink {}\n".format(stv) +
-            "\t (PredicateNode \""+ predicate + "\")\n" +
-            "\t ({} \n".format(list_type) +
-            "\t\t ({}".format(node1_type) + " \"" + prefix1 + str(node1) + "\")\n" +
-            "\t\t ({}".format(node2_type) + " \"" + prefix2 + str(node2) + "\")))\n" )
+                "\t (PredicateNode \"" + predicate + "\")\n" +
+                "\t ({} \n".format(list_type) +
+                "\t\t ({}".format(node1_type) + " \"" + prefix1 + str(node1) + "\")\n" +
+                "\t\t ({}".format(node2_type) + " \"" + prefix2 + str(node2) + "\")))\n")
     else:
         return ""
 
+
 def member(node1, node1_type, node2, node2_type, prefix1="", prefix2=""):
     if not (str(node1) in ["-", "nan"] or str(node2) in ["-", "nan"]):
-        return ('(MemberLink\n' + 
-                '\t({} "'.format(node1_type) + prefix1 + str(node1) + '")\n'+
+        return ('(MemberLink\n' +
+                '\t({} "'.format(node1_type) + prefix1 + str(node1) + '")\n' +
                 '\t({} "'.format(node2_type) + prefix2 + str(node2) + '"))\n')
     else:
         return ""
 
+
+def add_taxonomy(taxonomy_id, node, fp):
+    member_ln = CMemberLink(node, CConceptNode("ncbi:" + str(taxonomy_id)))
+    fp.write(member_ln.recursive_print() + "\n")
+
+
+def add_protein_interaction(proteins_lst, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_id_1, bio_id_2, fp):
+    if not prot_node_1.name in proteins_lst:
+        express_ln = CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node_1, prot_node_1))
+        gene_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
+                                   CListLink(gene_node_1, CConceptNode("Bio:" + bio_id_1)))
+        prot_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
+                                   CListLink(prot_node_1, CConceptNode("Bio:" + bio_id_1)))
+
+        fp.write(express_ln.recursive_print() + "\n")
+        fp.write(gene_bio.recursive_print() + "\n")
+        fp.write(prot_bio.recursive_print() + "\n")
+
+        proteins_lst.append(prot_node_1.name)
+
+    if not prot_node_2.name in proteins_lst:
+        express_ln = CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node_2, prot_node_2))
+        gene_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
+                                   CListLink(gene_node_2, CConceptNode("Bio:" + bio_id_2)))
+        prot_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
+                                   CListLink(prot_node_2, CConceptNode("Bio:" + bio_id_2)))
+
+        fp.write(express_ln.recursive_print() + "\n")
+        fp.write(gene_bio.recursive_print() + "\n")
+        fp.write(prot_bio.recursive_print() + "\n")
+
+        proteins_lst.append(prot_node_2.name)
+
+
 def process_data(version, file_path):
     if file_path:
         try:
@@ -56,20 +94,21 @@ def process_data(version, file_path):
             print(e)
     else:
         if version:
-            source = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-'+ version +'/BIOGRID-CORONAVIRUS-'+ version +'.tab3.zip'
+            source = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-' + version + '/BIOGRID-CORONAVIRUS-' + version + '.tab3.zip'
         else:
-            source = 'https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-CORONAVIRUS-LATEST.tab3.zip'     
+            source = 'https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-CORONAVIRUS-LATEST.tab3.zip'
         try:
             dataset = wget.download(source, "raw_data")
             version = zipfile.ZipFile(dataset).namelist()[0].split('-')[-1].replace(".tab3.txt", "")
             print(version)
             data = pd.read_csv(dataset, low_memory=False, delimiter='\t')
         except:
             print("Error processing biogrid version {0}".format(version))
-            raise  
+            raise
 
         import_data(data, source, version, gene_level=True)
 
+
 def import_data(data, source, version, gene_level=False, form='tab2'):
     # Set the gene_level to True to get only the GGI without extra entrez and pubmedID info
     print("started importing")
@@ -79,98 +118,97 @@ def import_data(data, source, version, gene_level=False, form='tab2'):
     if gene_level:
         if not os.path.exists(os.path.join(os.getcwd(), 'gene-level')):
             os.makedirs('gene-level')
-        g = open('gene-level/COVID-19-biogrid_'+version+"_gene-level_"+str(date.today())+'.scm','w')
+        g = open('gene-level/COVID-19-biogrid_' + version + "_gene-level_" + str(date.today()) + '.scm', 'w')
 
-    with open('dataset/COVID-19-biogrid_'+version+"_"+str(date.today())+'.scm','w') as f:
+    with open('dataset/COVID-19-biogrid_' + version + "_" + str(date.today()) + '.scm', 'w') as f:
         gene_pairs = []
         protein_pairs = []
         entrez = []
-        genes = []
         covid_genes = []
         proteins = []
         for i in range(len(data)):
-            if not (pd.isnull(data.iloc[i]['Official Symbol Interactor A']) or pd.isnull(data.iloc[i]['Official Symbol Interactor B'])):
+            if not (pd.isnull(data.iloc[i]['Official Symbol Interactor A']) or pd.isnull(
+                    data.iloc[i]['Official Symbol Interactor B'])):
                 gene1 = str(data.iloc[i]['Official Symbol Interactor A']).upper().strip()
                 gene2 = str(data.iloc[i]['Official Symbol Interactor B']).upper().strip()
                 prot1 = str(data.iloc[i]['SWISS-PROT Accessions Interactor A']).strip()
                 prot2 = str(data.iloc[i]['SWISS-PROT Accessions Interactor B']).strip()
                 score = data.iloc[i]['Score']
                 entrez1 = str(data.iloc[i]['Entrez Gene Interactor A']).strip()
                 entrez2 = str(data.iloc[i]['Entrez Gene Interactor B']).strip()
-                stv = ""
-                if not str(score) in ["-", "nan"]:
-                    stv = '(stv 1.0 {})'.format(round(float(score),3))
                 taxonomy_id_1 = int(data.iloc[i]['Organism ID Interactor A'])
                 taxonomy_id_2 = int(data.iloc[i]['Organism ID Interactor B'])
 
-                if (gene1, gene2) not in gene_pairs or (gene2, gene1) not in genes:
+                gene_node_1 = CGeneNode(gene1)
+                gene_node_2 = CGeneNode(gene2)
+
+                prot_node_1 = CMoleculeNode("Uniprot:" + prot1)
+                prot_node_2 = CMoleculeNode("Uniprot:" + prot2)
+
+                stv_node = None
+                if not str(score) in ["-", "nan"]:
+                    stv_node = CStv(1.0, round(float(score), 3))
+
+                if (gene1, gene2) not in gene_pairs:
+
                     if not gene1 in entrez:
-                        f.write(evaLink(gene1, "GeneNode", entrez1,"ConceptNode", "has_entrez_id",prefix2="entrez:"))
+                        entrez_ln_1 = CEvaluationLink(CPredicateNode("has_entrez_id"),
+                                                      CListLink(gene_node_1, CConceptNode("entrez:" + entrez1)))
+                        f.write(entrez_ln_1.recursive_print() + "\n")
                         entrez.append(gene1)
 
                     if not gene2 in entrez:
-                        f.write(evaLink(gene2, "GeneNode", entrez2,"ConceptNode", "has_entrez_id",prefix2="entrez:"))
+                        eval_ln_2 = CEvaluationLink(CPredicateNode("has_entrez_id"),
+                                                    CListLink(gene_node_2, CConceptNode("entrez:" + entrez2)))
+                        f.write(eval_ln_2.recursive_print() + "\n")
                         entrez.append(gene2)
 
-                    f.write(evaLink(gene1, "GeneNode",gene2,"GeneNode", "interacts_with", symmetric=True, stv=stv))
+                    interacts_ln = CEvaluationLink(CPredicateNode("interacts_with"),
+                                                   CSetLink(gene_node_1, gene_node_2), stv=stv_node)
+                    f.write(interacts_ln.recursive_print() + "\n")
+
                     if gene_level:
-                        g.write(evaLink(gene1, "GeneNode",gene2,"GeneNode", "interacts_with", symmetric=True, stv=stv))
+                        g.write(interacts_ln.recursive_print() + "\n")
 
                     if taxonomy_id_1 == 2697049:
                         covid_genes.append(gene1)
-                        f.write(
-                            evaLink(gene1, "GeneNode", taxonomy_id_1, "ConceptNode", "from_organism", prefix2="ncbi:"))
-                        f.write(evaLink(prot1, "MoleculeNode", taxonomy_id_1, "ConceptNode", "from_organism",
-                                        prefix1="Uniprot:", prefix2="ncbi:"))
+                        add_taxonomy(taxonomy_id_1, gene_node_1, f)
+                        add_taxonomy(taxonomy_id_1, prot_node_1, f)
                         if gene_level:
-                            g.write(evaLink(gene1, "GeneNode", taxonomy_id_1, "ConceptNode", "from_organism",
-                                            prefix2="ncbi:"))
-                            g.write(evaLink(prot1, "MoleculeNode", taxonomy_id_1, "ConceptNode", "from_organism",
-                                            prefix1="Uniprot:", prefix2="ncbi:"))
+                            add_taxonomy(taxonomy_id_1, gene_node_1, g)
                     if taxonomy_id_2 == 2697049:
                         covid_genes.append(gene2)
-                        f.write(
-                            evaLink(gene2, "GeneNode", taxonomy_id_2, "ConceptNode", "from_organism", prefix2="ncbi:"))
-                        f.write(evaLink(prot2, "MoleculeNode", taxonomy_id_2, "ConceptNode", "from_organism",
-                                        prefix1="Uniprot:", prefix2="ncbi:"))
+
+                        add_taxonomy(taxonomy_id_2, gene_node_2, f)
+                        add_taxonomy(taxonomy_id_2, prot_node_2, f)
+
                         if gene_level:
-                            g.write(evaLink(gene2, "GeneNode", taxonomy_id_2, "ConceptNode", "from_organism",
-                                            prefix2="ncbi:"))
-                            g.write(evaLink(prot2, "MoleculeNode", taxonomy_id_2, "ConceptNode", "from_organism",
-                                            prefix1="Uniprot:", prefix2="ncbi:"))
+                            add_taxonomy(taxonomy_id_2, gene_node_2, g)
 
                     gene_pairs.append((gene1, gene2))
 
                 if (prot1, prot2) not in protein_pairs:
+                    interacts_ln = CEvaluationLink(CPredicateNode("interacts_with"),
+                                                   CSetLink(prot_node_1, prot_node_2), stv=stv_node)
 
-                    f.write(evaLink(prot1, "MoleculeNode", prot2, "MoleculeNode", "interacts_with", symmetric=True, stv=stv,
-                                    prefix1="Uniprot:", prefix2="Uniprot:"))
-
-                    if not prot1 in proteins:
-                        bio = str(data.iloc[i]['BioGRID ID Interactor A']).strip()
-                        f.write(evaLink(gene1, "GeneNode", prot1, "MoleculeNode", "expresses", prefix2="Uniprot:"))
-                        f.write(evaLink(gene1, "GeneNode", bio,"ConceptNode", "has_biogridID", prefix2="Bio:"))
-                        f.write(evaLink(prot1, "MoleculeNode", bio,"ConceptNode", "has_biogridID", prefix1="Uniprot:",prefix2="Bio:"))
-                        proteins.append(prot1)
+                    f.write(interacts_ln.recursive_print() + "\n")
 
-                    if not prot2 in proteins:
-                        bio = str(data.iloc[i]['BioGRID ID Interactor B']).strip()
-                        f.write(evaLink(gene2, "GeneNode", prot2,"MoleculeNode", "expresses", prefix2="Uniprot:"))
-                        f.write(evaLink(gene2, "GeneNode", bio,"ConceptNode", "has_biogridID", prefix2="Bio:"))
-                        f.write(evaLink(prot2, "MoleculeNode", bio,"ConceptNode", "has_biogridID", prefix1="Uniprot:",prefix2="Bio:"))
-                        proteins.append(prot2)
+                    bio_1 = str(data.iloc[i]['BioGRID ID Interactor A']).strip()
+                    bio_2 = str(data.iloc[i]['BioGRID ID Interactor B']).strip()
+                    add_protein_interaction(proteins, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_1, bio_2,
+                                            f)
 
-
                     protein_pairs.append((prot1, prot2))
 
-        f.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode","has_name",prefix1="ncbi:"))
-        g.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode","has_name",prefix1="ncbi:"))
-    gene_pairs = set((a,b) if a<=b else (b,a) for a,b in gene_pairs)
+        f.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode", "has_name", prefix1="ncbi:"))
+        g.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode", "has_name", prefix1="ncbi:"))
+    gene_pairs = set((a, b) if a <= b else (b, a) for a, b in gene_pairs)
     number_of_interactions = len(gene_pairs)
     script = "https://github.com/MOZI-AI/knowledge-import/coronavirus_biogrid.py"
-    metadata.update_meta("Coronavirus Biogrid:"+version, source,script,genes=str(len(set(genes))),prot=len(set(proteins)), interactions=str(number_of_interactions))
-    print("Done, check "+'dataset/COVID-19-biogrid_'+version+"_"+str(date.today())+'.scm')
-    with open("Covid19-genes","w") as co:
+    metadata.update_meta("Coronavirus Biogrid:" + version, source, script, genes=str(len(set(entrez))),
+                         prot=len(set(proteins)), interactions=str(number_of_interactions))
+    print("Done, check " + 'dataset/COVID-19-biogrid_' + version + "_" + str(date.today()) + '.scm')
+    with open("Covid19-genes", "w") as co:
         co.write("\n".join(list(set(covid_genes))))
 
 
@@ -186,12 +224,11 @@ def parse_args():
 
 
 if __name__ == "__main__":
-  """
+    """
   usage:
   run the script with the path to the source data (if downloaded)
         python coronavirus_biogrid.py --path=path/to/the/source_data 
   Or run the script and specify a version number you wanted or just hit enter (to get the latest)
   """
-  arguments = parse_args()
-  process_data(arguments.version, arguments.path)
-
+    arguments = parse_args()
+    process_data(arguments.version, arguments.path)