Skip to content

Commit

Permalink
Merge pull request MOZI-AI#11 from leungmanhin/tcmid
Browse files Browse the repository at this point in the history
TCMID conversion script
  • Loading branch information
tanksha authored Apr 20, 2020
2 parents c1dccaa + 455a375 commit 471381f
Showing 1 changed file with 134 additions and 0 deletions.
134 changes: 134 additions & 0 deletions tcmid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# A script to convert TCMID to AtomSpace representation in Scheme
#
# Some additional dependencies may be required:
# apt install unrar
# python3 -m pip install rarfile wget
#
# To run the script:
# PYTHONIOENCODING=utf-8 python3 tcmid.py

import os
import rarfile
import wget
from datetime import date

# Tell rarfile which unrar executable to invoke
rarfile.UNRAR_TOOL = "unrar"

# The generated Scheme file carries today's date in its name
output_file = "dataset/tcmid_{}.scm".format(date.today())

# Names of the TCMID archives
tcmid_herb = "herb-TCMID.v2.01.rar"
tcmid_prescription = "prescription-TCMID.v2.01.rar"
tcmid_gnsp = "Ingredient_MS-TCMID.v2.01.rar"
tcmid_spectrum = "Herb_MS-TCMID.v2.01.rar"
tcmid_network = "ingredient_targets_disease_drug-TCMID.v2.03.rar"

# Archives to convert, in processing order
tcmid_source_rars = [
    # Need to know the properties of the herb first, so tcmid_herb
    # should be processed before tcmid_prescription
    tcmid_herb,
    tcmid_prescription,
    tcmid_gnsp,
    tcmid_spectrum,
    # tcmid_network
]
tcmid_base_url = "http://119.3.41.228:8000/static/download/"

# The output file is later opened in append mode, so drop any file left
# over from an earlier run on the same day
if os.path.exists(output_file):
    os.remove(output_file)

def evalink(pred, node_type1, node_type2, node1, node2):
    """Write an EvaluationLink relating two nodes to the output file.

    The link is logged to stdout and then appended to the module-level
    ``out_fp`` file object as a Scheme expression::

        (EvaluationLink
            (PredicateNode "<pred>")
            (ListLink
                (<node_type1> "<node1>")
                (<node_type2> "<node2>")
            )
        )

    Args:
        pred: Name of the predicate.
        node_type1: Atom type of the first node, written verbatim.
        node_type2: Atom type of the second node, written verbatim.
        node1: Name of the first node.
        node2: Name of the second node.
    """
    def quote(name):
        # A raw backslash or double quote inside the name would corrupt
        # or prematurely terminate the Scheme string literal
        return '"' + name.replace("\\", "\\\\").replace('"', '\\"') + '"'

    print("--- Creating EvaluationLink with:\npredicate = {}\nnode1 = {}\nnode2 = {}\n".format(pred, node1, node2))
    out_fp.writelines([
        "(EvaluationLink\n",
        "\t(PredicateNode {})\n".format(quote(pred)),
        "\t(ListLink\n",
        "\t\t({} {})\n".format(node_type1, quote(node1)),
        "\t\t({} {})\n".format(node_type2, quote(node2)),
        "\t)\n",
        ")\n",
    ])

def memblink(node1, node2):
    """Write a MemberLink between two ConceptNodes to the output file.

    The link is logged to stdout and then appended to the module-level
    ``out_fp`` file object as a Scheme expression::

        (MemberLink
            (ConceptNode "<node1>")
            (ConceptNode "<node2>")
        )

    Args:
        node1: Name of the first ConceptNode.
        node2: Name of the second ConceptNode.
    """
    def quote(name):
        # A raw backslash or double quote inside the name would corrupt
        # or prematurely terminate the Scheme string literal
        return '"' + name.replace("\\", "\\\\").replace('"', '\\"') + '"'

    print("--- Creating MemberLink with:\nnode1 = {}\nnode2 = {}\n".format(node1, node2))
    out_fp.writelines([
        "(MemberLink\n",
        "\t(ConceptNode {})\n".format(quote(node1)),
        "\t(ConceptNode {})\n".format(quote(node2)),
        ")\n",
    ])

def is_available(entry):
    """Tell whether a field carries real data.

    Returns False when ``entry``, ignoring surrounding whitespace and
    letter case, is empty, "na" or "n/a"; returns True otherwise.
    """
    normalized = entry.strip().lower()
    return normalized not in ("", "na", "n/a")

# ----------
# Keep a record of which part of a herb would be used in a formula
herb_part_dict = {}


def _data_rows(lines):
    """Yield the tab-separated fields of each usable data line.

    The first line (column headers) is skipped, every remaining line is
    echoed to stdout, and lines rejected by is_available() are dropped.
    """
    for line in lines[1:]:
        print("--- Reading line: " + line)
        if is_available(line):
            yield line.split("\t")


def _convert_herbs(lines):
    """Emit name, part, property and meridian links for each herb row."""
    for contents in _data_rows(lines):
        pinyin_name = contents[0]
        english_name = contents[2]
        properties = [x.lower().strip() for x in contents[4].split(",")]
        meridians = [x.lower().strip() for x in contents[5].split(",")]
        use_part = contents[6]

        # Every link below hangs off the pinyin name
        if not is_available(pinyin_name):
            continue

        if is_available(english_name):
            evalink("has_name", "ConceptNode", "ConceptNode", pinyin_name, english_name)
        if is_available(use_part):
            use_part_full_name = "{} {}".format(pinyin_name, use_part)
            # Remembered so prescriptions can refer to the specific part
            herb_part_dict[pinyin_name] = use_part_full_name
            evalink("has_part", "ConceptNode", "ConceptNode", pinyin_name, use_part_full_name)
        for prop in properties:
            if is_available(prop):
                evalink("has_property", "ConceptNode", "ConceptNode", pinyin_name, "TCM:" + prop)
        for meri in meridians:
            if is_available(meri):
                evalink("meridian_affinity", "ConceptNode", "ConceptNode", pinyin_name, "TCM:" + meri)


def _convert_prescriptions(lines):
    """Emit composition and membership links for each prescription row."""
    for contents in _data_rows(lines):
        prescription = contents[0]
        composition = contents[3].split(",")

        if not is_available(prescription):
            continue

        # NOTE(review): components are used exactly as split on ","; if the
        # data has spaces after the commas the herb_part_dict lookup will
        # miss -- verify against the raw file
        for compo in composition:
            if is_available(compo):
                # Prefer the recorded herb part, fall back to the bare name
                compo_part = herb_part_dict.get(compo, compo)
                evalink("composition", "ConceptNode", "ConceptNode", compo_part, prescription)
                memblink(compo, "herb")
        memblink(prescription, "prescription")


def _convert_spectra(lines):
    """Emit one HPLC description link per ";"-separated description."""
    for contents in _data_rows(lines):
        pinyin_name = contents[1]
        spectrum_description = [x.lower().strip() for x in contents[6].split(";")]
        if not is_available(pinyin_name):
            continue
        for sd in spectrum_description:
            if is_available(sd):
                evalink("has_hplc_description", "ConceptNode", "ConceptNode", pinyin_name, sd)


def _convert_gnsp(lines):
    """Emit a link from each ingredient to its GNSP id."""
    for contents in _data_rows(lines):
        # Double quotes are removed from both fields before use
        ingredient = contents[0].replace("\"", "").lower().strip()
        gnsp_id = contents[1].replace("\"", "").strip()
        if is_available(ingredient) and is_available(gnsp_id):
            evalink("has_gnsp_id", "MoleculeNode", "ConceptNode", ingredient, gnsp_id)


# Archive name -> converter; archives without an entry are read but ignored
_converters = {
    tcmid_herb: _convert_herbs,
    tcmid_prescription: _convert_prescriptions,
    tcmid_spectrum: _convert_spectra,
    tcmid_gnsp: _convert_gnsp,
}

# evalink() and memblink() write through the module-level name out_fp; the
# with-block guarantees the file is closed even if a row raises
with open(output_file, "a", encoding='utf8') as out_fp:
    for rar_name in tcmid_source_rars:
        rar_path = "raw_data/{}".format(rar_name)

        # Fetch the archive only when it is not already present locally
        if not os.path.exists(rar_path):
            os.makedirs("raw_data", exist_ok=True)
            wget.download(tcmid_base_url + rar_name, rar_path)

        with rarfile.RarFile(rar_path) as rf:
            # There should only be one file per RAR file
            # Decode using UTF-8 for the Chinese characters
            lines = rf.read(rf.infolist()[0]).decode("utf-8", "ignore").split("\n")

        convert = _converters.get(rar_name)
        if convert is not None:
            convert(lines)

0 comments on commit 471381f

Please sign in to comment.