Skip to content

Commit

Permalink
Refactor biogrid and coronavirus_biogrid scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Habush committed May 18, 2020
1 parent b07b920 commit 74fc6b7
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 36 deletions.
12 changes: 3 additions & 9 deletions biogrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,12 @@
from zipfile import ZipFile
from io import BytesIO
import os
import sys
import collections
import metadata
from datetime import date
from atomwrappers import *


def checkdisc(diction, key, value):
    """Append ``value`` to the list stored under ``key`` in ``diction``.

    If ``key`` is not present yet, an empty list is created for it first,
    so repeated calls with the same key accumulate values in call order.

    Args:
        diction: Mutable mapping from keys to lists; modified in place.
        key: Hashable key identifying the list to extend.
        value: Item appended to the list for ``key``.

    Returns:
        None. The mapping is updated in place.
    """
    # setdefault() inserts the missing key itself and never raises KeyError
    # on a dict, so no error handling (or error-string return) is needed.
    diction.setdefault(key, []).append(value)


def import_data_from_web(version, form='tab2'):
if form not in ('tab2', 'tab3'):
Expand Down Expand Up @@ -87,7 +81,7 @@ def import_data(data, source, version, gene_level=False, form='tab2'):

biogrid_path = os.path.join(dataset_path, 'biogrid_gene_gene_' + version + '_' + str(date.today()) + '.scm')
with open(biogrid_path, 'w') as f:
pairs = {}
pairs = collections.defaultdict(list)
entrez = []
for i in range(len(data)):
if not (pd.isnull(data.iloc[i]['Official Symbol Interactor A']) or pd.isnull(
Expand All @@ -103,7 +97,7 @@ def import_data(data, source, version, gene_level=False, form='tab2'):
interactors = node2 + ':' + node1

pubmed_link = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=' + str(pubmed)
checkdisc(pairs, interactors, CConceptNode(pubmed_link))
pairs[interactors].append(CConceptNode(pubmed_link))

if not node1 in entrez:
entrez_1 = CConceptNode("entrez:" + str(data.iloc[i]['Entrez Gene Interactor A']))
Expand Down
48 changes: 21 additions & 27 deletions coronavirus_biogrid.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
__author__ = "Hedra"
__email__ = "[email protected]"
__author__ = "Hedra & Abdulrahman"
__email__ = "[email protected] & [email protected]"

# The following script imports SARS-CoV-2 (COVID-19) and Coronavirus-Related Interactions from thebiogrid.com

Expand All @@ -10,15 +10,15 @@
# Version 183 is the first release (March 25, 2020); any later version with the same format can also be used.

import argparse
import os
import zipfile
from datetime import date

import pandas as pd
import wget
import os
import sys

import metadata
from datetime import date
import zipfile
from atomwrappers import *
from biogrid import checkdisc


def evaLink(node1, node1_type, node2, node2_type, predicate, prefix1="", prefix2="", symmetric=False, stv=""):
Expand Down Expand Up @@ -50,31 +50,25 @@ def add_taxonomy(taxonomy_id, node, fp):
fp.write(member_ln.recursive_print() + "\n")


def add_protein_interaction(proteins_lst, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_id_1, bio_id_2, fp):
if not prot_node_1.name in proteins_lst:
express_ln = CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node_1, prot_node_1))
gene_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
CListLink(gene_node_1, CConceptNode("Bio:" + bio_id_1)))
prot_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
CListLink(prot_node_1, CConceptNode("Bio:" + bio_id_1)))
def define_protein_lns(gene_node, prot_node, bio_id, fp):
express_ln = CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node, prot_node))
gene_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
CListLink(gene_node, CConceptNode("Bio:" + bio_id)))
prot_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
CListLink(prot_node, CConceptNode("Bio:" + bio_id)))

fp.write(express_ln.recursive_print() + "\n")
fp.write(gene_bio.recursive_print() + "\n")
fp.write(prot_bio.recursive_print() + "\n")
fp.write(express_ln.recursive_print() + "\n")
fp.write(gene_bio.recursive_print() + "\n")
fp.write(prot_bio.recursive_print() + "\n")


def add_protein_interaction(proteins_lst, prot_node_1, gene_node_1, prot_node_2, gene_node_2, bio_id_1, bio_id_2, fp):
if not prot_node_1.name in proteins_lst:
define_protein_lns(gene_node_1, prot_node_1, bio_id_1, fp)
proteins_lst.append(prot_node_1.name)

if not prot_node_2.name in proteins_lst:
express_ln = CEvaluationLink(CPredicateNode("expresses"), CListLink(gene_node_2, prot_node_2))
gene_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
CListLink(gene_node_2, CConceptNode("Bio:" + bio_id_2)))
prot_bio = CEvaluationLink(CPredicateNode("has_biogridID"),
CListLink(prot_node_2, CConceptNode("Bio:" + bio_id_2)))

fp.write(express_ln.recursive_print() + "\n")
fp.write(gene_bio.recursive_print() + "\n")
fp.write(prot_bio.recursive_print() + "\n")

define_protein_lns(gene_node_2, prot_node_2, bio_id_2, fp)
proteins_lst.append(prot_node_2.name)


Expand Down

0 comments on commit 74fc6b7

Please sign in to comment.