Skip to content

Commit

Permalink
Refactor coronavirus_biogrid for readability
Browse files Browse the repository at this point in the history
  • Loading branch information
Habush committed May 15, 2020
1 parent 00c3a1a commit d1e43f6
Showing 1 changed file with 105 additions and 68 deletions.
173 changes: 105 additions & 68 deletions coronavirus_biogrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,35 +17,73 @@
import metadata
from datetime import date
import zipfile
from atomwrappers import *


def checkdisc(diction, key, value):
    """Append ``value`` to the list stored under ``key`` in ``diction``.

    An empty list is created for ``key`` first when the key is not present,
    so repeated calls with the same key accumulate values in call order.

    Args:
        diction: Mutable mapping of key -> list; modified in place.
        key: Key whose list receives ``value``.
        value: Item to append.

    Returns:
        ``None`` on success, or the string ``"key error"`` if the mapping
        raises ``KeyError`` while the value is being stored.
    """
    try:
        diction.setdefault(key, []).append(value)
    except KeyError:
        # The built-in dict does not normally raise KeyError from
        # setdefault(); the handler is kept so other mapping types that do
        # still yield the historical sentinel string.
        return "key error"


def evaLink(node1, node1_type, node2, node2_type, predicate, prefix1="", prefix2="", symmetric=False, stv=""):
    """Build the Scheme text of an ``EvaluationLink`` relating two nodes.

    Args:
        node1: Name of the first node; converted with ``str()``.
        node1_type: Atom type written for the first node (e.g. "GeneNode").
        node2: Name of the second node; converted with ``str()``.
        node2_type: Atom type written for the second node.
        predicate: Name placed inside the ``PredicateNode``.
        prefix1: Text prepended to the first node name.
        prefix2: Text prepended to the second node name.
        symmetric: When true the two nodes are wrapped in a ``SetLink``,
            otherwise in a ``ListLink``.
        stv: Text inserted right after ``EvaluationLink`` (the callers pass
            a truth-value expression or an empty string).

    Returns:
        The link as a multi-line string ending in a newline, or ``""`` when
        either node stringifies to ``"-"`` or ``"nan"`` (treated as missing).
    """
    missing_markers = ("-", "nan")
    if str(node1) in missing_markers or str(node2) in missing_markers:
        return ""

    list_type = "SetLink" if symmetric else "ListLink"
    # The exact whitespace (tabs, the space after the list type, the space
    # left behind by an empty stv) is part of the emitted format; keep it.
    return ("(EvaluationLink {}\n".format(stv) +
            "\t (PredicateNode \"" + predicate + "\")\n" +
            "\t ({} \n".format(list_type) +
            "\t\t ({}".format(node1_type) + " \"" + prefix1 + str(node1) + "\")\n" +
            "\t\t ({}".format(node2_type) + " \"" + prefix2 + str(node2) + "\")))\n")


def member(node1, node1_type, node2, node2_type, prefix1="", prefix2=""):
    """Build the Scheme text of a ``MemberLink`` from ``node1`` to ``node2``.

    Args:
        node1: Name of the member node; converted with ``str()``.
        node1_type: Atom type written for the member node.
        node2: Name of the containing node; converted with ``str()``.
        node2_type: Atom type written for the containing node.
        prefix1: Text prepended to the first node name.
        prefix2: Text prepended to the second node name.

    Returns:
        The link as a three-line string ending in a newline, or ``""`` when
        either node stringifies to ``"-"`` or ``"nan"`` (treated as missing).
    """
    missing_markers = ("-", "nan")
    if str(node1) in missing_markers or str(node2) in missing_markers:
        return ""

    return ('(MemberLink\n' +
            '\t({} "'.format(node1_type) + prefix1 + str(node1) + '")\n' +
            '\t({} "'.format(node2_type) + prefix2 + str(node2) + '"))\n')


def add_taxonomy(taxonomy_id, node, fp):
    """Write a member link tying ``node`` to its taxonomy concept.

    Args:
        taxonomy_id: Taxonomy identifier; stringified and prefixed with
            ``"ncbi:"`` to name the concept node.
        node: Atom wrapper placed as the first element of the member link.
        fp: Writable text stream that receives the printed link followed by
            a newline.
    """
    taxonomy_concept = CConceptNode("ncbi:" + str(taxonomy_id))
    link_text = CMemberLink(node, taxonomy_concept).recursive_print()
    fp.write(link_text + "\n")


def add_protein_interaction(proteins_lst, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_id_1, bio_id_2, fp):
    """Write the per-protein links for both interactors, once per protein.

    For each interactor whose protein name is not yet in ``proteins_lst``,
    three evaluation links are written to ``fp`` in this order: the gene
    ``expresses`` the protein, the gene ``has_biogridID``, and the protein
    ``has_biogridID``. The protein name is then recorded in ``proteins_lst``
    so later calls (and the second interactor of this call, if it is the
    same protein) skip it.

    Args:
        proteins_lst: List of protein node names already written; mutated
            in place.
        prot_node_1: Protein node of interactor A (its ``name`` is the key).
        gene_node_1: Gene node of interactor A.
        prot_node_2: Protein node of interactor B.
        gene_node_2: Gene node of interactor B.
        bio_id_1: BioGRID id string of interactor A; prefixed with "Bio:".
        bio_id_2: BioGRID id string of interactor B; prefixed with "Bio:".
        fp: Writable text stream receiving one printed link per line.
    """
    interactors = (
        (prot_node_1, gene_node_1, bio_id_1),
        (prot_node_2, gene_node_2, bio_id_2),
    )
    for prot_node, gene_node, bio_id in interactors:
        if prot_node.name in proteins_lst:
            continue

        express_ln = CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node, prot_node))
        gene_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
                                   CListLink(gene_node, CConceptNode("Bio:" + bio_id)))
        prot_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
                                   CListLink(prot_node, CConceptNode("Bio:" + bio_id)))

        for link in (express_ln, gene_bio, prot_bio):
            fp.write(link.recursive_print() + "\n")

        # Record only after writing so an identical second interactor in
        # this same call is not emitted twice.
        proteins_lst.append(prot_node.name)


def process_data(version, file_path):
if file_path:
try:
Expand All @@ -56,20 +94,21 @@ def process_data(version, file_path):
print(e)
else:
if version:
source = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-'+ version +'/BIOGRID-CORONAVIRUS-'+ version +'.tab3.zip'
source = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-' + version + '/BIOGRID-CORONAVIRUS-' + version + '.tab3.zip'
else:
source = 'https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-CORONAVIRUS-LATEST.tab3.zip'
source = 'https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-CORONAVIRUS-LATEST.tab3.zip'
try:
dataset = wget.download(source, "raw_data")
version = zipfile.ZipFile(dataset).namelist()[0].split('-')[-1].replace(".tab3.txt", "")
print(version)
data = pd.read_csv(dataset, low_memory=False, delimiter='\t')
except:
print("Error processing biogrid version {0}".format(version))
raise
raise

import_data(data, source, version, gene_level=True)


def import_data(data, source, version, gene_level=False, form='tab2'):
# Set the gene_level to True to get only the GGI without extra entrez and pubmedID info
print("started importing")
Expand All @@ -79,98 +118,97 @@ def import_data(data, source, version, gene_level=False, form='tab2'):
if gene_level:
if not os.path.exists(os.path.join(os.getcwd(), 'gene-level')):
os.makedirs('gene-level')
g = open('gene-level/COVID-19-biogrid_'+version+"_gene-level_"+str(date.today())+'.scm','w')
g = open('gene-level/COVID-19-biogrid_' + version + "_gene-level_" + str(date.today()) + '.scm', 'w')

with open('dataset/COVID-19-biogrid_'+version+"_"+str(date.today())+'.scm','w') as f:
with open('dataset/COVID-19-biogrid_' + version + "_" + str(date.today()) + '.scm', 'w') as f:
gene_pairs = []
protein_pairs = []
entrez = []
genes = []
covid_genes = []
proteins = []
for i in range(len(data)):
if not (pd.isnull(data.iloc[i]['Official Symbol Interactor A']) or pd.isnull(data.iloc[i]['Official Symbol Interactor B'])):
if not (pd.isnull(data.iloc[i]['Official Symbol Interactor A']) or pd.isnull(
data.iloc[i]['Official Symbol Interactor B'])):
gene1 = str(data.iloc[i]['Official Symbol Interactor A']).upper().strip()
gene2 = str(data.iloc[i]['Official Symbol Interactor B']).upper().strip()
prot1 = str(data.iloc[i]['SWISS-PROT Accessions Interactor A']).strip()
prot2 = str(data.iloc[i]['SWISS-PROT Accessions Interactor B']).strip()
score = data.iloc[i]['Score']
entrez1 = str(data.iloc[i]['Entrez Gene Interactor A']).strip()
entrez2 = str(data.iloc[i]['Entrez Gene Interactor B']).strip()
stv = ""
if not str(score) in ["-", "nan"]:
stv = '(stv 1.0 {})'.format(round(float(score),3))
taxonomy_id_1 = int(data.iloc[i]['Organism ID Interactor A'])
taxonomy_id_2 = int(data.iloc[i]['Organism ID Interactor B'])

if (gene1, gene2) not in gene_pairs or (gene2, gene1) not in genes:
gene_node_1 = CGeneNode(gene1)
gene_node_2 = CGeneNode(gene2)

prot_node_1 = CMoleculeNode("Uniprot:" + prot1)
prot_node_2 = CMoleculeNode("Uniprot:" + prot2)

stv_node = None
if not str(score) in ["-", "nan"]:
stv_node = CStv(1.0, round(float(score), 3))

if (gene1, gene2) not in gene_pairs:

if not gene1 in entrez:
f.write(evaLink(gene1, "GeneNode", entrez1,"ConceptNode", "has_entrez_id",prefix2="entrez:"))
entrez_ln_1 = CEvaluationLink(CPredicateNode("has_entrez_id"),
CListLink(gene_node_1, CConceptNode("entrez:" + entrez1)))
f.write(entrez_ln_1.recursive_print() + "\n")
entrez.append(gene1)

if not gene2 in entrez:
f.write(evaLink(gene2, "GeneNode", entrez2,"ConceptNode", "has_entrez_id",prefix2="entrez:"))
eval_ln_2 = CEvaluationLink(CPredicateNode("has_entrez_id"),
CListLink(gene_node_2, CConceptNode("entrez:" + entrez2)))
f.write(eval_ln_2.recursive_print() + "\n")
entrez.append(gene2)

f.write(evaLink(gene1, "GeneNode",gene2,"GeneNode", "interacts_with", symmetric=True, stv=stv))
interacts_ln = CEvaluationLink(CPredicateNode("interacts_with"),
CSetLink(gene_node_1, gene_node_2), stv=stv_node)
f.write(interacts_ln.recursive_print() + "\n")

if gene_level:
g.write(evaLink(gene1, "GeneNode",gene2,"GeneNode", "interacts_with", symmetric=True, stv=stv))
g.write(interacts_ln.recursive_print() + "\n")

if taxonomy_id_1 == 2697049:
covid_genes.append(gene1)
f.write(
evaLink(gene1, "GeneNode", taxonomy_id_1, "ConceptNode", "from_organism", prefix2="ncbi:"))
f.write(evaLink(prot1, "MoleculeNode", taxonomy_id_1, "ConceptNode", "from_organism",
prefix1="Uniprot:", prefix2="ncbi:"))
add_taxonomy(taxonomy_id_1, gene_node_1, f)
add_taxonomy(taxonomy_id_1, prot_node_1, f)
if gene_level:
g.write(evaLink(gene1, "GeneNode", taxonomy_id_1, "ConceptNode", "from_organism",
prefix2="ncbi:"))
g.write(evaLink(prot1, "MoleculeNode", taxonomy_id_1, "ConceptNode", "from_organism",
prefix1="Uniprot:", prefix2="ncbi:"))
add_taxonomy(taxonomy_id_1, gene_node_1, g)
if taxonomy_id_2 == 2697049:
covid_genes.append(gene2)
f.write(
evaLink(gene2, "GeneNode", taxonomy_id_2, "ConceptNode", "from_organism", prefix2="ncbi:"))
f.write(evaLink(prot2, "MoleculeNode", taxonomy_id_2, "ConceptNode", "from_organism",
prefix1="Uniprot:", prefix2="ncbi:"))

add_taxonomy(taxonomy_id_2, gene_node_2, f)
add_taxonomy(taxonomy_id_2, prot_node_2, f)

if gene_level:
g.write(evaLink(gene2, "GeneNode", taxonomy_id_2, "ConceptNode", "from_organism",
prefix2="ncbi:"))
g.write(evaLink(prot2, "MoleculeNode", taxonomy_id_2, "ConceptNode", "from_organism",
prefix1="Uniprot:", prefix2="ncbi:"))
add_taxonomy(taxonomy_id_2, gene_node_2, g)

gene_pairs.append((gene1, gene2))

if (prot1, prot2) not in protein_pairs:
interacts_ln = CEvaluationLink(CPredicateNode("interacts_with"),
CSetLink(prot_node_1, prot_node_2), stv=stv_node)

f.write(evaLink(prot1, "MoleculeNode", prot2, "MoleculeNode", "interacts_with", symmetric=True, stv=stv,
prefix1="Uniprot:", prefix2="Uniprot:"))

if not prot1 in proteins:
bio = str(data.iloc[i]['BioGRID ID Interactor A']).strip()
f.write(evaLink(gene1, "GeneNode", prot1, "MoleculeNode", "expresses", prefix2="Uniprot:"))
f.write(evaLink(gene1, "GeneNode", bio,"ConceptNode", "has_biogridID", prefix2="Bio:"))
f.write(evaLink(prot1, "MoleculeNode", bio,"ConceptNode", "has_biogridID", prefix1="Uniprot:",prefix2="Bio:"))
proteins.append(prot1)
f.write(interacts_ln.recursive_print() + "\n")

if not prot2 in proteins:
bio = str(data.iloc[i]['BioGRID ID Interactor B']).strip()
f.write(evaLink(gene2, "GeneNode", prot2,"MoleculeNode", "expresses", prefix2="Uniprot:"))
f.write(evaLink(gene2, "GeneNode", bio,"ConceptNode", "has_biogridID", prefix2="Bio:"))
f.write(evaLink(prot2, "MoleculeNode", bio,"ConceptNode", "has_biogridID", prefix1="Uniprot:",prefix2="Bio:"))
proteins.append(prot2)
bio_1 = str(data.iloc[i]['BioGRID ID Interactor A']).strip()
bio_2 = str(data.iloc[i]['BioGRID ID Interactor B']).strip()
add_protein_interaction(proteins, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_1, bio_2,
f)


protein_pairs.append((prot1, prot2))

f.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode","has_name",prefix1="ncbi:"))
g.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode","has_name",prefix1="ncbi:"))
gene_pairs = set((a,b) if a<=b else (b,a) for a,b in gene_pairs)
f.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode", "has_name", prefix1="ncbi:"))
g.write(evaLink("2697049", "ConceptNode", "SARS-CoV-2", "ConceptNode", "has_name", prefix1="ncbi:"))
gene_pairs = set((a, b) if a <= b else (b, a) for a, b in gene_pairs)
number_of_interactions = len(gene_pairs)
script = "https://github.com/MOZI-AI/knowledge-import/coronavirus_biogrid.py"
metadata.update_meta("Coronavirus Biogrid:"+version, source,script,genes=str(len(set(genes))),prot=len(set(proteins)), interactions=str(number_of_interactions))
print("Done, check "+'dataset/COVID-19-biogrid_'+version+"_"+str(date.today())+'.scm')
with open("Covid19-genes","w") as co:
metadata.update_meta("Coronavirus Biogrid:" + version, source, script, genes=str(len(set(entrez))),
prot=len(set(proteins)), interactions=str(number_of_interactions))
print("Done, check " + 'dataset/COVID-19-biogrid_' + version + "_" + str(date.today()) + '.scm')
with open("Covid19-genes", "w") as co:
co.write("\n".join(list(set(covid_genes))))


Expand All @@ -186,12 +224,11 @@ def parse_args():


if __name__ == "__main__":
"""
"""
usage:
run the script with the path to the source data (if downloaded)
python coronavirus_biogrid.py --path=path/to/the/source_data
Or run the script and specify a version number you wanted or just hit enter (to get the latest)
"""
arguments = parse_args()
process_data(arguments.version, arguments.path)

arguments = parse_args()
process_data(arguments.version, arguments.path)

0 comments on commit d1e43f6

Please sign in to comment.