Merge pull request MOZI-AI#12 from leungmanhin/tcmid

Generate Atomese for the TCMID ingredient-targets-disease-drug network
DaddyWesker · Apr 27, 2020 · 0c3e222 · 0c3e222
2 parents 471381f + e9aeeb3
commit 0c3e222
Showing 1 changed file with 70 additions and 2 deletions.
diff --git a/tcmid.py b/tcmid.py
@@ -9,6 +9,7 @@
 
 import os
 import rarfile
+import re
 import wget
 from datetime import date
 
@@ -27,9 +28,11 @@
   tcmid_prescription,
   tcmid_gnsp,
   tcmid_spectrum,
-#  tcmid_network
+  tcmid_network
 ]
 tcmid_base_url = "http://119.3.41.228:8000/static/download/"
+chebi_obo_file = "raw_data/chebi.obo"
+chebi_url = "ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.obo"
 
 if os.path.exists(os.path.join(os.getcwd(), output_file)):
   os.remove(output_file)
@@ -52,7 +55,7 @@ def memblink(node1, node2):
   out_fp.write(")\n")
 
 def is_available(entry):
-  return False if entry.strip() == "" or entry.strip().lower() == "na" or entry.strip().lower() == "n/a" else True
+  return False if entry == None or entry.strip() == "" or entry.strip().lower() == "na" or entry.strip().lower() == "n/a" else True
 
 # ----------
 # Keep a record of which part of a herb would be used in a formula
@@ -63,6 +66,13 @@ def is_available(entry):
 for rar_name in tcmid_source_rars:
   rar_path = "raw_data/{}".format(rar_name)
 
+  if os.path.exists(rar_path):
+    print("Removing file: {}".format(rar_path))
+    os.remove(rar_path)
+
+  rar_file = wget.download(tcmid_base_url + rar_name, "raw_data")
+  print("\nFile downloaded: {}".format(rar_file))
+
   with rarfile.RarFile(rar_file) as rf:
     # There should only be one file per RAR file
     # Decode using UTF-8 for the Chinese characters
@@ -131,4 +141,62 @@ def is_available(entry):
           if is_available(ingredient) and is_available(gnsp_id):
             evalink("has_gnsp_id", "MoleculeNode", "ConceptNode", ingredient, gnsp_id)
 
+    elif rar_file.endswith(tcmid_network):
+      ##### Get ChEBI info #####
+      if os.path.exists(chebi_obo_file):
+        print("Removing file: {}".format(chebi_obo_file))
+        os.remove(chebi_obo_file)
+
+      chebi_file = wget.download(chebi_url, "raw_data")
+      print("\nFile downloaded: {}".format(chebi_file))
+
+      chebi_fp = open(chebi_file, "r", errors="ignore")
+      chebi_dict = {}
+      chebi_name = []
+      chebi_id = None
+
+      for line in chebi_fp:
+        line = line.replace("\n", "")
+        if line == "[Term]":
+          if len(chebi_name) > 0 and chebi_id != None:
+            for name in chebi_name:
+              chebi_dict[name.lower()] = chebi_id
+            # print("ChEBI ID: {}\nName: {}\n".format(chebi_id, chebi_name))
+          chebi_name = []
+          chebi_id = None
+        elif line.startswith("id: "):
+          chebi_id = line.replace("id: CHEBI:", "")
+        elif line.startswith("name: "):
+          chebi_name.append(line.replace("name: ", ""))
+        elif line.startswith("synonym: ") and "EXACT" in line:
+          name = re.match(".+\"(.+)\".+", line).group(1)
+          if name not in chebi_name:
+            chebi_name.append(name)
+
+      chebi_fp.close()
+
+      for line in lines:
+        print("--- Reading line: " + line)
+        if is_available(line):
+          contents = line.split("\t")
+          ingredient = contents[0].lower().strip()
+          chebi_id = chebi_dict.get(ingredient)
+          uniprot_id = contents[2]
+          gene = contents[3]
+          omim_ids = contents[4].split(";")
+          drug_ids = contents[5].split(";")
+
+          full_name = "ChEBI:" + chebi_id if is_available(chebi_id) else "TCM:" + ingredient
+
+          if is_available(uniprot_id):
+            evalink("interacts_with", "MoleculeNode", "MoleculeNode", full_name, "Uniprot:" + uniprot_id)
+          if is_available(gene):
+            evalink("interacts_with", "MoleculeNode", "GeneNode", full_name, gene)
+          for omim in omim_ids:
+            if is_available(omim):
+              evalink("treats", "MoleculeNode", "ConceptNode", full_name, "OMIM:" + omim)
+          for drug in drug_ids:
+            if is_available(drug) and is_available(uniprot_id):
+              evalink("targets", "MoleculeNode", "MoleculeNode", "DrugBank:" + drug, "Uniprot:" + uniprot_id)
+
 out_fp.close()