Skip to content

Commit

Permalink
Merge pull request MOZI-AI#12 from leungmanhin/tcmid
Browse files Browse the repository at this point in the history
Generate Atomese for the TCMID ingredient-targets-disease-drug network
  • Loading branch information
tanksha authored Apr 27, 2020
2 parents 471381f + e9aeeb3 commit 0c3e222
Showing 1 changed file with 70 additions and 2 deletions.
72 changes: 70 additions & 2 deletions tcmid.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import os
import rarfile
import re
import wget
from datetime import date

Expand All @@ -27,9 +28,11 @@
tcmid_prescription,
tcmid_gnsp,
tcmid_spectrum,
# tcmid_network
tcmid_network
]
tcmid_base_url = "http://119.3.41.228:8000/static/download/"
chebi_obo_file = "raw_data/chebi.obo"
chebi_url = "ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.obo"

if os.path.exists(os.path.join(os.getcwd(), output_file)):
os.remove(output_file)
Expand All @@ -52,7 +55,7 @@ def memblink(node1, node2):
out_fp.write(")\n")

def is_available(entry):
return False if entry.strip() == "" or entry.strip().lower() == "na" or entry.strip().lower() == "n/a" else True
return False if entry == None or entry.strip() == "" or entry.strip().lower() == "na" or entry.strip().lower() == "n/a" else True

# ----------
# Keep a record of which part of a herb would be used in a formula
Expand All @@ -63,6 +66,13 @@ def is_available(entry):
for rar_name in tcmid_source_rars:
rar_path = "raw_data/{}".format(rar_name)

if os.path.exists(rar_path):
print("Removing file: {}".format(rar_path))
os.remove(rar_path)

rar_file = wget.download(tcmid_base_url + rar_name, "raw_data")
print("\nFile downloaded: {}".format(rar_file))

with rarfile.RarFile(rar_file) as rf:
# There should only be one file per RAR file
# Decode using UTF-8 for the Chinese characters
Expand Down Expand Up @@ -131,4 +141,62 @@ def is_available(entry):
if is_available(ingredient) and is_available(gnsp_id):
evalink("has_gnsp_id", "MoleculeNode", "ConceptNode", ingredient, gnsp_id)

elif rar_file.endswith(tcmid_network):
##### Get ChEBI info #####
if os.path.exists(chebi_obo_file):
print("Removing file: {}".format(chebi_obo_file))
os.remove(chebi_obo_file)

chebi_file = wget.download(chebi_url, "raw_data")
print("\nFile downloaded: {}".format(chebi_file))

chebi_fp = open(chebi_file, "r", errors="ignore")
chebi_dict = {}
chebi_name = []
chebi_id = None

for line in chebi_fp:
line = line.replace("\n", "")
if line == "[Term]":
if len(chebi_name) > 0 and chebi_id != None:
for name in chebi_name:
chebi_dict[name.lower()] = chebi_id
# print("ChEBI ID: {}\nName: {}\n".format(chebi_id, chebi_name))
chebi_name = []
chebi_id = None
elif line.startswith("id: "):
chebi_id = line.replace("id: CHEBI:", "")
elif line.startswith("name: "):
chebi_name.append(line.replace("name: ", ""))
elif line.startswith("synonym: ") and "EXACT" in line:
name = re.match(".+\"(.+)\".+", line).group(1)
if name not in chebi_name:
chebi_name.append(name)

chebi_fp.close()

for line in lines:
print("--- Reading line: " + line)
if is_available(line):
contents = line.split("\t")
ingredient = contents[0].lower().strip()
chebi_id = chebi_dict.get(ingredient)
uniprot_id = contents[2]
gene = contents[3]
omim_ids = contents[4].split(";")
drug_ids = contents[5].split(";")

full_name = "ChEBI:" + chebi_id if is_available(chebi_id) else "TCM:" + ingredient

if is_available(uniprot_id):
evalink("interacts_with", "MoleculeNode", "MoleculeNode", full_name, "Uniprot:" + uniprot_id)
if is_available(gene):
evalink("interacts_with", "MoleculeNode", "GeneNode", full_name, gene)
for omim in omim_ids:
if is_available(omim):
evalink("treats", "MoleculeNode", "ConceptNode", full_name, "OMIM:" + omim)
for drug in drug_ids:
if is_available(drug) and is_available(uniprot_id):
evalink("targets", "MoleculeNode", "MoleculeNode", "DrugBank:" + drug, "Uniprot:" + uniprot_id)

out_fp.close()

0 comments on commit 0c3e222

Please sign in to comment.