From d1e43f692444a51772f8ef1becd32ad889acbab5 Mon Sep 17 00:00:00 2001 From: Abdulrahman Semrie Date: Fri, 15 May 2020 12:51:19 +0300 Subject: [PATCH] Refactor coronavirus_biogrid for readability --- coronavirus_biogrid.py | 173 +++++++++++++++++++++++++---------------- 1 file changed, 105 insertions(+), 68 deletions(-) diff --git a/coronavirus_biogrid.py b/coronavirus_biogrid.py index f929d65..7e5a245 100644 --- a/coronavirus_biogrid.py +++ b/coronavirus_biogrid.py @@ -17,35 +17,73 @@ import metadata from datetime import date import zipfile +from atomwrappers import * + def checkdisc(diction, key, value): - try: - diction.setdefault(key,[]).append(value) - except KeyError: - return "key error" + try: + diction.setdefault(key, []).append(value) + except KeyError: + return "key error" + -def evaLink(node1, node1_type, node2, node2_type, predicate, prefix1="", prefix2="",symmetric=False, stv=""): +def evaLink(node1, node1_type, node2, node2_type, predicate, prefix1="", prefix2="", symmetric=False, stv=""): if not (str(node1) in ["-", "nan"] or str(node2) in ["-", "nan"]): if symmetric: list_type = "SetLink" else: list_type = "ListLink" return ("(EvaluationLink {}\n".format(stv) + - "\t (PredicateNode \""+ predicate + "\")\n" + - "\t ({} \n".format(list_type) + - "\t\t ({}".format(node1_type) + " \"" + prefix1 + str(node1) + "\")\n" + - "\t\t ({}".format(node2_type) + " \"" + prefix2 + str(node2) + "\")))\n" ) + "\t (PredicateNode \"" + predicate + "\")\n" + + "\t ({} \n".format(list_type) + + "\t\t ({}".format(node1_type) + " \"" + prefix1 + str(node1) + "\")\n" + + "\t\t ({}".format(node2_type) + " \"" + prefix2 + str(node2) + "\")))\n") else: return "" + def member(node1, node1_type, node2, node2_type, prefix1="", prefix2=""): if not (str(node1) in ["-", "nan"] or str(node2) in ["-", "nan"]): - return ('(MemberLink\n' + - '\t({} "'.format(node1_type) + prefix1 + str(node1) + '")\n'+ + return ('(MemberLink\n' + + '\t({} "'.format(node1_type) + prefix1 + str(node1) + '")\n' + '\t({} "'.format(node2_type) + prefix2 + str(node2) + '"))\n') else: return "" + +def add_taxonomy(taxonomy_id, node, fp): + member_ln = CMemberLink(node, CConceptNode("ncbi:" + str(taxonomy_id))) + fp.write(member_ln.recursive_print() + "\n") + + +def add_protein_interaction(proteins_lst, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_id_1, bio_id_2, fp): + if not prot_node_1.name in proteins_lst: + express_ln = CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node_1, prot_node_1)) + gene_bio = CEvaluationLink(CPredicateNode("has_biogridID"), + CListLink(gene_node_1, CConceptNode("Bio:" + bio_id_1))) + prot_bio = CEvaluationLink(CPredicateNode("has_biogridID"), + CListLink(prot_node_1, CConceptNode("Bio:" + bio_id_1))) + + fp.write(express_ln.recursive_print() + "\n") + fp.write(gene_bio.recursive_print() + "\n") + fp.write(prot_bio.recursive_print() + "\n") + + proteins_lst.append(prot_node_1.name) + + if not prot_node_2.name in proteins_lst: + express_ln = CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node_2, prot_node_2)) + gene_bio = CEvaluationLink(CPredicateNode("has_biogridID"), + CListLink(gene_node_2, CConceptNode("Bio:" + bio_id_2))) + prot_bio = CEvaluationLink(CPredicateNode("has_biogridID"), + CListLink(prot_node_2, CConceptNode("Bio:" + bio_id_2))) + + fp.write(express_ln.recursive_print() + "\n") + fp.write(gene_bio.recursive_print() + "\n") + fp.write(prot_bio.recursive_print() + "\n") + + proteins_lst.append(prot_node_2.name) + + def process_data(version, file_path): if file_path: try: @@ -56,9 +94,9 @@ def process_data(version, file_path): print(e) else: if version: - source = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-'+ version +'/BIOGRID-CORONAVIRUS-'+ version +'.tab3.zip' + source = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-' + version + '/BIOGRID-CORONAVIRUS-' + version + '.tab3.zip' else: - source = 'https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-CORONAVIRUS-LATEST.tab3.zip' + source = 'https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-CORONAVIRUS-LATEST.tab3.zip' try: dataset = wget.download(source, "raw_data") version = zipfile.ZipFile(dataset).namelist()[0].split('-')[-1].replace(".tab3.txt", "") @@ -66,10 +104,11 @@ def process_data(version, file_path): data = pd.read_csv(dataset, low_memory=False, delimiter='\t') except: print("Error processing biogrid version {0}".format(version)) - raise + raise import_data(data, source, version, gene_level=True) + def import_data(data, source, version, gene_level=False, form='tab2'): # Set the gene_level to True to get only the GGI without extra entrez and pubmedID info print("started importing") @@ -79,17 +118,17 @@ def import_data(data, source, version, gene_level=False, form='tab2'): if gene_level: if not os.path.exists(os.path.join(os.getcwd(), 'gene-level')): os.makedirs('gene-level') - g = open('gene-level/COVID-19-biogrid_'+version+"_gene-level_"+str(date.today())+'.scm','w') + g = open('gene-level/COVID-19-biogrid_' + version + "_gene-level_" + str(date.today()) + '.scm', 'w') - with open('dataset/COVID-19-biogrid_'+version+"_"+str(date.today())+'.scm','w') as f: + with open('dataset/COVID-19-biogrid_' + version + "_" + str(date.today()) + '.scm', 'w') as f: gene_pairs = [] protein_pairs = [] entrez = [] - genes = [] covid_genes = [] proteins = [] for i in range(len(data)): - if not (pd.isnull(data.iloc[i]['Official Symbol Interactor A']) or pd.isnull(data.iloc[i]['Official Symbol Interactor B'])): + if not (pd.isnull(data.iloc[i]['Official Symbol Interactor A']) or pd.isnull( + data.iloc[i]['Official Symbol Interactor B'])): gene1 = str(data.iloc[i]['Official Symbol Interactor A']).upper().strip() gene2 = str(data.iloc[i]['Official Symbol Interactor B']).upper().strip() prot1 = str(data.iloc[i]['SWISS-PROT Accessions Interactor A']).strip() @@ -97,80 +136,79 @@ def import_data(data, source, version, gene_level=False, form='tab2'): score = data.iloc[i]['Score'] entrez1 = str(data.iloc[i]['Entrez Gene Interactor A']).strip() entrez2 = str(data.iloc[i]['Entrez Gene Interactor B']).strip() - stv = "" - if not str(score) in ["-", "nan"]: - stv = '(stv 1.0 {})'.format(round(float(score),3)) taxonomy_id_1 = int(data.iloc[i]['Organism ID Interactor A']) taxonomy_id_2 = int(data.iloc[i]['Organism ID Interactor B']) - if (gene1, gene2) not in gene_pairs or (gene2, gene1) not in genes: + gene_node_1 = CGeneNode(gene1) + gene_node_2 = CGeneNode(gene2) + + prot_node_1 = CMoleculeNode("Uniprot:" + prot1) + prot_node_2 = CMoleculeNode("Uniprot:" + prot2) + + stv_node = None + if not str(score) in ["-", "nan"]: + stv_node = CStv(1.0, round(float(score), 3)) + + if (gene1, gene2) not in gene_pairs: + if not gene1 in entrez: - f.write(evaLink(gene1, "GeneNode", entrez1,"ConceptNode", "has_entrez_id",prefix2="entrez:")) + entrez_ln_1 = CEvaluationLink(CPredicateNode("has_entrez_id"), + CListLink(gene_node_1, CConceptNode("entrez:" + entrez1))) + f.write(entrez_ln_1.recursive_print() + "\n") entrez.append(gene1) if not gene2 in entrez: - f.write(evaLink(gene2, "GeneNode", entrez2,"ConceptNode", "has_entrez_id",prefix2="entrez:")) + eval_ln_2 = CEvaluationLink(CPredicateNode("has_entrez_id"), + CListLink(gene_node_2, CConceptNode("entrez:" + entrez2))) + f.write(eval_ln_2.recursive_print() + "\n") entrez.append(gene2) - f.write(evaLink(gene1, "GeneNode",gene2,"GeneNode", "interacts_with", symmetric=True, stv=stv)) + interacts_ln = CEvaluationLink(CPredicateNode("interacts_with"), + CSetLink(gene_node_1, gene_node_2), stv=stv_node) + f.write(interacts_ln.recursive_print() + "\n") + if gene_level: - g.write(evaLink(gene1, "GeneNode",gene2,"GeneNode", "interacts_with", symmetric=True, stv=stv)) + g.write(interacts_ln.recursive_print() + "\n") if taxonomy_id_1 == 2697049: covid_genes.append(gene1) - f.write( - evaLink(gene1, "GeneNode", taxonomy_id_1, "ConceptNode", "from_organism", prefix2="ncbi:")) - f.write(evaLink(prot1, "MoleculeNode", taxonomy_id_1, "ConceptNode", "from_organism", - prefix1="Uniprot:", prefix2="ncbi:")) + add_taxonomy(taxonomy_id_1, gene_node_1, f) + add_taxonomy(taxonomy_id_1, prot_node_1, f) if gene_level: - g.write(evaLink(gene1, "GeneNode", taxonomy_id_1, "ConceptNode", "from_organism", - prefix2="ncbi:")) - g.write(evaLink(prot1, "MoleculeNode", taxonomy_id_1, "ConceptNode", "from_organism", - prefix1="Uniprot:", prefix2="ncbi:")) + add_taxonomy(taxonomy_id_1, gene_node_1, g) if taxonomy_id_2 == 2697049: covid_genes.append(gene2) - f.write( - evaLink(gene2, "GeneNode", taxonomy_id_2, "ConceptNode", "from_organism", prefix2="ncbi:")) - f.write(evaLink(prot2, "MoleculeNode", taxonomy_id_2, "ConceptNode", "from_organism", - prefix1="Uniprot:", prefix2="ncbi:")) + + add_taxonomy(taxonomy_id_2, gene_node_2, f) + add_taxonomy(taxonomy_id_2, prot_node_2, f) + if gene_level: - g.write(evaLink(gene2, "GeneNode", taxonomy_id_2, "ConceptNode", "from_organism", - prefix2="ncbi:")) - g.write(evaLink(prot2, "MoleculeNode", taxonomy_id_2, "ConceptNode", "from_organism", - prefix1="Uniprot:", prefix2="ncbi:")) + add_taxonomy(taxonomy_id_2, gene_node_2, g) gene_pairs.append((gene1, gene2)) if (prot1, prot2) not in protein_pairs: + interacts_ln = CEvaluationLink(CPredicateNode("interacts_with"), + CSetLink(prot_node_1, prot_node_2), stv=stv_node) - f.write(evaLink(prot1, "MoleculeNode", prot2, "MoleculeNode", "interacts_with", symmetric=True, stv=stv, - prefix1="Uniprot:", prefix2="Uniprot:")) - - if not prot1 in proteins: - bio = str(data.iloc[i]['BioGRID ID Interactor A']).strip() - f.write(evaLink(gene1, "GeneNode", prot1, "MoleculeNode", "expresses", prefix2="Uniprot:")) - f.write(evaLink(gene1, "GeneNode", bio,"ConceptNode", "has_biogridID", prefix2="Bio:")) - f.write(evaLink(prot1, "MoleculeNode", bio,"ConceptNode", "has_biogridID", prefix1="Uniprot:",prefix2="Bio:")) - proteins.append(prot1) + f.write(interacts_ln.recursive_print() + "\n") - if not prot2 in proteins: - bio = str(data.iloc[i]['BioGRID ID Interactor B']).strip() - f.write(evaLink(gene2, "GeneNode", prot2,"MoleculeNode", "expresses", prefix2="Uniprot:")) - f.write(evaLink(gene2, "GeneNode", bio,"ConceptNode", "has_biogridID", prefix2="Bio:")) - f.write(evaLink(prot2, "MoleculeNode", bio,"ConceptNode", "has_biogridID", prefix1="Uniprot:",prefix2="Bio:")) - proteins.append(prot2) + bio_1 = str(data.iloc[i]['BioGRID ID Interactor A']).strip() + bio_2 = str(data.iloc[i]['BioGRID ID Interactor B']).strip() + add_protein_interaction(proteins, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_1, bio_2, + f) - protein_pairs.append((prot1, prot2)) - f.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode","has_name",prefix1="ncbi:")) - g.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode","has_name",prefix1="ncbi:")) - gene_pairs = set((a,b) if a<=b else (b,a) for a,b in gene_pairs) + f.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode", "has_name", prefix1="ncbi:")) + g.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode", "has_name", prefix1="ncbi:")) + gene_pairs = set((a, b) if a <= b else (b, a) for a, b in gene_pairs) number_of_interactions = len(gene_pairs) script = "https://github.com/MOZI-AI/knowledge-import/coronavirus_biogrid.py" - metadata.update_meta("Coronavirus Biogrid:"+version, source,script,genes=str(len(set(genes))),prot=len(set(proteins)), interactions=str(number_of_interactions)) - print("Done, check "+'dataset/COVID-19-biogrid_'+version+"_"+str(date.today())+'.scm') - with open("Covid19-genes","w") as co: + metadata.update_meta("Coronavirus Biogrid:" + version, source, script, genes=str(len(set(entrez))), + prot=len(set(proteins)), interactions=str(number_of_interactions)) + print("Done, check " + 'dataset/COVID-19-biogrid_' + version + "_" + str(date.today()) + '.scm') + with open("Covid19-genes", "w") as co: co.write("\n".join(list(set(covid_genes)))) @@ -186,12 +224,11 @@ def parse_args(): if __name__ == "__main__": - """ + """ usage: run the script with the path to the source data (if downloaded) python coronavirus_biogrid.py --path=path/to/the/source_data Or run the script and specify a version number you wanted or just hit enter (to get the latest) """ - arguments = parse_args() - process_data(arguments.version, arguments.path) - + arguments = parse_args() + process_data(arguments.version, arguments.path)