diff --git a/atomwrappers.py b/atomwrappers.py index 41f7608..e751c08 100644 --- a/atomwrappers.py +++ b/atomwrappers.py @@ -6,14 +6,14 @@ class CAtom: - pass + def __hash__(self): + return hash(str(self)) class CNode(CAtom): + atom_type = None def __init__(self, name): self.name = name - if name == 'HLA-B *57:01:01': - import pdb;pdb.set_trace() def __str__(self): return '({0} "{1}")'.format(self.atom_type, self.name.replace('"', '\\"')) @@ -22,6 +22,7 @@ def recursive_print(self, result='', indent=''): return result + indent + str(self) + class CLink(CAtom): def __init__(self, *atoms): self.outgoing = atoms @@ -60,4 +61,14 @@ class CListLink(CLink): class CGeneNode(CNode): atom_type = 'GeneNode' +class CContextLink(CLink): + atom_type = 'ContextLink' + + +class CSetLink(CLink): + atom_type = 'SetLink' + def __str__(self): + # str is used for hash computation, so need to sort outgoing set for all unordered links + outgoing = '\n'.join(sorted([str(x) for x in self.outgoing])) + return '({0} {1})'.format(self.atom_type, outgoing) diff --git a/pharma2chebi.tsv b/pharma2chebi.tsv new file mode 100644 index 0000000..93ffc5a --- /dev/null +++ b/pharma2chebi.tsv @@ -0,0 +1,256 @@ +pharma_id Name PubChem DrugBank ChEBI +PA166152901 serotonin 5202 28790 +PA161925594 tropisetron 11699 32269 +PA450267 lorazepam 46508468 186 6539 +PA451608 temazepam 46506604 231 9435 +PA164781320 flunitrazepam 46504553 1544 31622 +PA450496 midazolam 46507611 683 6931 +PA449681 flurazepam 7847395 690 5128 +PA10035 bromazepam 46505694 1558 31302 +PA166131305 nordazepam 2997 111762 +PA166178653 angiotensin 1-7 123805 55438 +PA166109554 angiotensin ii 65143 58506 +PA166109574 bradykinin 439201 3165 +PA165111696 artemisinin 68827 223316 +PA165111697 artesunate 5464098 63918 +PA450163 lamivudine 46507855 709 63577 +PA451735 tramadol 46506256 193 75725 +PA450616 nevirapine 46506789 238 63613 + leukotriene c4 16978 + leukotriene b4 15647 +PA134698661 pranlukast 692058 1411 94810 + leukotriene d4 28666 +PA166114483 lumacaftor 16678941 90951 +PA165950341 ivacaftor 16220172 8820 66901 +PA165946873 vemurafenib 42611257 63637 +PA166178276 6-phosphogluconate 91493 48928 +PA166178280 dihydronicotinamide adenine dinucleotide phosphate 5884 2338 16474 +PA166160613 phosphoribosyl pyrophosphate 7339 17111 +PA166178277 ribulose-5-phosphate 439184 16332 +PA166178279 nicotinamide adenine dinucleotide phosphate 5886 44409 +PA166178274 glucose-6-phosphate 5958 17733 +PA166178278 fructose-6-phosphate 69507 15946 +PA166160614 ribose phosphate 77982 17797 +PA450457 methylene blue 6872 +PA451752 triamterene 46507623 384 9671 + atp 15422 +PA449917 hydrogen peroxide 784 16240 + nadh 16908 +PA450566 mycophenolate mofetil 7847817 688 8764 +PA166160619 inosine monophosphate 8582 17202 +PA166178641 androstenedione 6128 1536 16422 +PA451089 pravastatin 46504851 175 63618 +PA451258 ritodrine 9448 867 8872 +PA451616 terbutaline 9339 871 9449 +PA134687907 formoterol 46507099 983 63082 +PA450431 methoxamine 9716 723 6839 +PA451300 salmeterol 46508024 938 64064 + tyrosine 58315 +PA450390 orciprenaline 9353 816 83329 +PA165110775 sn-38 104842 8988 +PA450085 irinotecan 46505871 762 80630 + acetate 30089 +PA450487 metyrosine 46506079 765 6912 +PA166178639 cholesterol 5997 16113 +PA166178666 squalene 638072 11460 15440 +PA166160055 cholic acid 221493 16359 +PA164924493 axitinib 66910 +PA166118340 motesanib 11667893 51098 +PA164924492 brivanib 94562 +PA166118341 vandetanib 3081361 49960 +PA449646 flecainide 46508078 1195 75984 +PA451457 sotalol 9517 489 63622 +PA449839 halofantrine 603092 1218 94392 +PA449958 ibutilide 731244 308 5856 +PA451131 propafenone 46504529 1182 63619 +PA166131350 acrylic acid 6581 18308 +PA166160613 prpp 7339 17111 +PA451663 thioguanine 46508170 352 9555 +PA166160618 s-adenosylhomocysteine 439155 16680 +PA166178659 arachidonic acid 444899 4557 15843 +PA452620 tegafur 5386 9256 32188 +PA166131299 5'-deoxy-5-fluorouridine 18343 31521 + histamine 18295 +PA166178990 somatostatin 16129681 9099 64628 +PA451234 repaglinide 46508150 912 8805 + acetoacetyl-coa 15345 +PA154410481 prasugrel 99443238 6209 87723 +PA166179131 phosphatidylethanolamine 9546747 44887 +PA166179130 phosphatidylcholine 5287971 2306 73002 +PA10204 tenofovir 699321 300 63717 +PA10005 adefovir dipivoxil 7848718 718 31175 + tryptophan 16828 +PA165291492 pazopanib 11525740 71219 +PA166163680 amitriptylinoxide 20313 135224 +PA166114906 bosutinib 160645775 39112 +PA165946122 crizotinib 11626560 64310 +PA166163464 inositol 1,4,5-trisphosphate 439456 16595 +PA165980594 ponatinib 24826799 78543 +PA131890628 tranilast 77572 +PA166178378 uric acid 1175 17775 +PA451106 probenecid 9576 1032 8426 +PA134852920 benzbromarone 3023 +PA166182234 candesartan cilexetil 2540 3348 +PA165958521 febuxostat 31596 +PA166178628 xanthine 1188 17712 +PA166178630 hypoxanthine 790 17368 +PA166131330 oxypurinol 4644 28315 +PA166178623 allantoin 204 15676 +PA450748 oxymorphone 46505296 1192 7865 +PA166163694 5-hydroxydiclofenac 3052566 59612 +PA166163693 4'-hydroxydiclofenac 116545 59613 +PA166163692 diclofenac acyl glucuronide 16084218 59609 +PA166163701 4',5-dihydroxydiclofenac 3052567 223401 +PA166163695 5-hydroxy diclofenac benzoquinone 15480180 59611 +PA166163700 3'-hydroxydiclofenac 112230 223792 +PA166163659 n-desmethylclozapine 2820 64050 +PA166165069 (s)-norverapamil 15593908 134081 + cl- 17996 + gaba 16865 +PA166109575 angiotensin i 3081372 2718 + fe(ii) 29033 +PA165291922 doxorubicinol 83970 133817 +PA166131278 tetrahydrothiophene 1127 48458 + nh4+ 28938 +PA166131317 sulfolane 31347 74794 +PA165260403 arteether 72416 135335 + m2 34827 +PA166131352 1-methyluric acid 69726 68441 +PA166131274 5-acetylamino-6-formylamino-3-methyluracil 108214 32643 +PA166131368 1-methylxanthine 80220 68443 +PA165820585 1,7-dimethylxanthine 4687 25858 +PA166131373 1,7-dimethyluric acid 91611 68449 +PA166131367 1,3,7-trimethyluric acid 79437 691622 +PA166131287 2-hydroxyiminostilbene 13103864 80599 +PA165817200 2-hydroxycarbamazepine 129274 80596 +PA165817197 3-hydroxycarbamazepine 135290 80597 +PA166131247 12-hydroxynevirapine 453338 145206 + leukotriene a4 15651 + leukotriene e4 15650 +PA166131575 chloride 312 17996 + glutamate 14321 + succinate 26806 +PA166178988 succinate semialdehyde 9543238 26805 +PA166178654 alpha-ketoglutarate 164533 16810 +PA451368 sodium 923 26708 +PA166131354 3-methylxanthine 70639 62205 +PA166131337 desmethyldoxepin 5353833 141547 +PA449773 glucose 5793 17234 +PA166178281 adenosine diphosphate 6022 22252 + m1 34826 + m3 51083 + n-acetyl-p-benzoquinone imine 29132 + h2o 15377 + nadph 16474 + gsh 16856 + nadp 25523 + gssg 17858 + h2o2 16240 + o2 15379 + na+ 29101 + fad 16238 + nad 13389 + fmn 17621 +PA166124478 endoxifen 10090750 80555 +PA164712732 enzyme inhibitors 23924 +PA165823907 hydroxyphenytoin 17732 63804 + nad+ 15846 + ip3 16595 + gtp 15996 + pip2 18348 + dopa 49168 + gdp 17552 + camp 17489 +PA166131329 etoposide glucuronide 46173784 5427 +PA166131563 glutathione conjugate 24335 +PA166131331 quinone 76320713 36141 + acth 3892 +PA452347 glucocorticoids 24261 + sn-38g 8990 + ldl 39026 + idl 132933 + vldl 39027 + phospholipid 16247 +PA451776 triglycerides 17855 + mevalonate 25350 + hdl 39025 + ca2+ 29108 +PA448486 arsenic trioxide 46506448 1169 30621 +PA166131270 aldophosphamide 107744 2560 +PA166131319 chloroacetaldehyde 33 27871 +PA166131388 4-hydroxycyclophosphamide 99735 1864 + ndps 7087 +PA166131349 gemcitabine diphosphate 6420157 42852 +PA166131301 aldoifosfamide 189733 80563 +PA166131315 4-hydroxyifosfamide 308171 80560 +PA166131366 acrolein 7847 15368 +PA166131285 carboxyifosfamide 124143 80564 +PA166131264 alcoifosfamide 10265589 80565 +PA166131387 4-ketoifosfamide 99303 80561 +PA166131371 aziridine 9033 30969 +PA166131307 chloroacetic acid 300 27869 + 1,3-oxazolidin-2-one 1237 +PA166160610 6-methylmercaptopurine riboside 237358 44081 +PA166163264 thioxanthine 1268107 51055 +PA166131558 npc 11756356 80552 +PA166131559 apc 10077584 80551 +PA166131242 3-hydroxycotinine glucuronide 53477725 133205 +PA166131378 cotinine n-oxide 9815514 89087 +PA166131324 norcotinine 413 89406 +PA166114414 cotinine 854019 68641 +PA166131561 cotinine glucuronide 178540 145221 +PA166131323 nornicotine 412 28313 +PA166131310 nicotine glucuronide 3035848 63860 +PA166131297 nicotine n-oxide 17785 132572 +PA166131340 morphine-3-glucuronide 5484731 80631 +PA166131341 codeine-6-glucuronide 5489029 80580 +PA166131414 normorphine 430245 7633 +PA452633 morphine-6-glucuronide 5360621 80581 +PA166131386 norcodeine 9925873 80579 + pgh2 15554 + txa2 15627 + pgd2 15555 + pge2 15551 + pgi2 15552 +PA164713176 platinum compounds 33749 +PA166131258 5'-deoxy-5-fluorocytidine 10037499 80627 + h+ 15378 + gastrin 75436 + pip3 16618 +PA164713207 proton pump inhibitors 49200 + amp 456215 + k+ 29103 + co2 16526 +PA166182027 clopidogrel carboxylic acid 9861403 83504 + farnesyl pyrophosphate 17407 + hmg-coa 15467 + geranyl pyrophosphate 17211 + acetyl-coa 15351 + adp 16761 + cyclic amp 17489 +PA449483 eptifibatide 700331 63 291902 + aa 15843 + pgg2 27647 + metabolite 25212 +PA166178987 diacylglycerol 6026790 18035 + 5-hydroxyindoleacetic acid 27823 + p1 60949 +PA166131450 demethylcitalopram 162180 80603 +PA166131265 citalopram propionic acid 10403174 80605 +PA166131333 abacavir 5'-monophosphate 469588 64112 + (-)-carbovir 5'-monophosphate 64116 + (-)-carbovir 5'-triphosphate 64174 + (-)-carbovir 5'-diphosphate 64172 +PA166177866 metoprolol acid 62936 83478 +PA166182233 o-desethyl candesartan 10047287 145224 +PA166182228 glycinexylidide 87833 357241 +PA166182229 monoethylglycinexylidide 24415 222828 +PA166131268 carboxyibuprofen 10444113 133199 + glucuronide 15341 +PA166178985 anandamide 5281969 2700 + pgf2a 15553 + am1 140154 + am9 140152 +PA166178656 5-hydroxyisourate 250388 18072 +PA164784024 peginterferon alfa-2b 22 63615 +PA166178629 5-hydroxyindole-3-acetic acid 1826 27823 diff --git a/pharmagkb.py b/pharmagkb.py index 242158d..4ffb66e 100644 --- a/pharmagkb.py +++ b/pharmagkb.py @@ -11,6 +11,7 @@ import argparse import xml.etree.ElementTree as ET from zipfile import ZipFile +from gzip import GzipFile from io import BytesIO import pandas from atomwrappers import * @@ -64,39 +65,8 @@ def pharma_to_id(chem_table, name): return chemical_id -def gen_chemical_members(mol_id_map, pathway_id): - """ - Generate member links for the substance pointed mol_id_map - - Parameters - ---------- - mol_id_map: dict - Database name, id pairs - pathway_id: str - pharma gkb id for pathway - - Returns - ------- - tuple[list, list] - member links connecting substance to pathway - MoleculeNodes for the substance - """ - tmp = list() - molecules = list() - for (id_type, mol_id) in mol_id_map.items(): - molecule = CMoleculeNode('{0}:{1}'.format(id_type, mol_id)) - member = CMemberLink(molecule, - CConceptNode(pathway_id)) - tmp.append(member) - molecules.append(molecule) - return tmp, molecules - -gene_re = re.compile('([A-Z0-9-]*).*') def gen_gene_member(gene, pathway_id, organism=None): - match = gene_re.match(gene) - assert match is not None - gene = match.group(1) result = [] member = CMemberLink(CGeneNode(gene), CConceptNode(pathway_id)) result.append(member) @@ -125,14 +95,15 @@ def process_genes(genes_str, pathway_id, organism=None): tmp += gen_gene_member(gene.strip(), pathway_id, organism=organism) return tmp + uniprot_re = re.compile('.*UniProtKB:([A-Za-z0-9-]+).*') -def gen_proteins(pharma_id, pathway_id, genes_data): +def gen_proteins(pharma_id, name, pathway_id, pharma2uniprot): """ Generate member links for the protein pointed by pharmagkb id Parameters ---------- - pharma_id: str + pharma_id: List[str] pharma gkb id for protein pathway_id: str pharma gkb id for pathway @@ -147,20 +118,18 @@ def gen_proteins(pharma_id, pathway_id, genes_data): """ tmp = [] proteins = [] - gene = genes_data[genes_data['PharmGKB Accession Id'] == pharma_id] - if len(gene) == 0: - print("no gene for {0} in genes table".format(pharma_id)) - return tmp, proteins - assert len(gene) == 1 - for ref in gene['Cross-references'].tolist()[0].split(','): - match = uniprot_re.match(ref) - if match: - for prot_id in match.groups(): - members, molecules = gen_chemical_members({'Uniprot': prot_id}, pathway_id) - tmp += members - proteins += molecules - else: - assert 'uniprot' not in ref.lower() + for prot in pharma_id: + entry = pharma2uniprot[pharma2uniprot.pharma_id == prot + ';'] + if not len(entry): + print("not found uniprot id for {0}".format(name)) + continue + for i in range(len(entry)): + prot_id = entry.iloc[i].Entry + molecule = CMoleculeNode('Uniprot:{0}'.format(prot_id)) + member = CMemberLink(molecule, + CConceptNode(pathway_id)) + tmp.append(member) + proteins.append(molecule) return tmp, proteins @@ -187,76 +156,432 @@ def generate_locations(elem, ns, chemical_nodes, pathway_id): return result -def process_small_molecules(pathway, ns, pathway_id, chem_data): +def find_chebi(pubchem, drugbank, name, pharma_id, pharma2chebi): + if pharma_id: + frame = pharma2chebi[pharma2chebi.pharma_id == pharma_id] + if len(frame): + assert len(set(frame['ChEBI'].to_list())) == 1 + return frame.iloc[0]['ChEBI'] + if name: + frame = pharma2chebi[pharma2chebi.Name == name] + if len(frame): + assert len(frame) == 1 + return frame.iloc[0]['ChEBI'] + + +def parse_molecule(smallmolecule, ns, chem_data, pharma2chebi=None): + """ + Parse SmallMolecule element + + Parameters + ---------- + smallmolecule: xml.etree.ElementTree.Element + SmallMolecule + ns: dict + namespaces from the owl file + chem_data: pandas.DataFrame + table chemicals from pharagkb + + Returns + ------- + dict, str + external db name: id pairs + human readable name + """ + reference = smallmolecule.find('bp:entityReference', ns) + molecule_drug = dict() + assert reference is not None + name = smallmolecule.find('./bp:standardName', ns).text.lower().strip() + for value in reference.attrib.values(): + pharma_pkg_id = re.match('.*\.(PA\d+)\.?.*', value) + if pharma_pkg_id is None: + # try by standard name + + row = chem_data[chem_data.Name == name] + if len(row): + assert len(row) == 1 + pharma_pkg_id = row.iloc[0]['PharmGKB Accession Id'] + else: + print("no pharmapkg id for {0}".format(value)) + continue + else: + pharma_pkg_id = pharma_pkg_id.group(1) + molecule_drug = pharma_to_id(chem_data, pharma_pkg_id) + + # try pharma2chebi + if 'ChEBI' not in molecule_drug: + pubchem = molecule_drug.get('PubChem', '') + drugbank = molecule_drug.get('DrugBank', '') + if pharma2chebi is not None: + chebi = find_chebi(pubchem, drugbank, name, pharma_pkg_id, pharma2chebi) + if chebi: + molecule_drug['ChEBI'] = chebi + if pharma_pkg_id is None: + pharma_pkg_id = '' + #name = name.replace('"', '\\"') + #with open('not_parsed.csv', 'at') as f: + # f.write(pharma_pkg_id + '\t') + # f.write(name + '\t') + # f.write(pubchem + '\t') + # f.write(drugbank + '\t') + # f.write('\n') + return molecule_drug, name + + +def process_small_molecules(pathway, ns, pathway_id, chem_data, id_map, pharma2chebi=None): tmp = list() for smallmolecule in pathway.findall('./bp:SmallMolecule', ns): - reference = smallmolecule.find('bp:entityReference', ns) - assert reference is not None - for value in reference.attrib.values(): - pharma_pkg_id = re.match('.*\.(PA\d+)\.?.*', value) - if pharma_pkg_id is None: - # try by standard name - name = smallmolecule.find('./bp:standardName', ns).text.lower() - row = chem_data[chem_data.Name == name] - if len(row): - assert len(row) == 1 - pharma_pkg_id = row.iloc[0]['PharmGKB Accession Id'] - else: - print("no pharmapkg id for {0}".format(value)) - continue - else: - pharma_pkg_id = pharma_pkg_id.group(1) - molecule_drug = pharma_to_id(chem_data, pharma_pkg_id) - members, chemical_nodes = gen_chemical_members(molecule_drug, pathway_id) - tmp += generate_locations(smallmolecule, ns, chemical_nodes, pathway_id) - tmp += members + molecule_drug, name = parse_molecule(smallmolecule, ns, chem_data, pharma2chebi) + molecule = CMoleculeNode(name) + member = CMemberLink(molecule, + CConceptNode(pathway_id)) + for db_name in ('ChEBI', 'PubChem', 'DrugBank'): + if db_name in molecule_drug: + name_node = CConceptNode("{0}:{1}".format(db_name, molecule_drug[db_name])) + ctx = CContextLink( + CConceptNode(pathway_id), + CEvaluationLink( + CPredicateNode("has_{0}_id".format(db_name.lower())), + CListLink(molecule, + name_node))) + tmp.append(ctx) + tmp.append(member) + id_map[about(smallmolecule, ns)] = [molecule] + tmp += generate_locations(smallmolecule, ns, [molecule], pathway_id) + return tmp - -def process_proteins(pathway, ns, pathway_id, genes_data): + +def match_protein_id(value): + match = protein_ref_re.match(value) + if match: + return match.group(2) + else: + assert 'PA' not in value + return None + + +def parse_protein(protein, pathway, ns, pathway_id, pharma2uniprot, elem_chemical_map, tmp=None): + name = protein.find('./bp:standardName', ns).text + protein_elem_id = about(protein, ns) + if protein_elem_id in elem_chemical_map: + return elem_chemical_map[protein_elem_id] + + organism = None + if name.strip().startswith('HIV'): + organism = '12721' + reference_elem = protein.find('./bp:entityReference', ns) + if reference_elem is None: + print("failed to find reference for {0}".format(protein_elem_id)) + elem_chemical_map[protein_elem_id] = [] + return [] + ent_ref_id = resource(reference_elem, ns) + ent_ref = pathway.find('./*[@rdf:about="{0}"]'.format(ent_ref_id), ns) + protein_ref_id = [] + if ent_ref.tag.endswith('ProteinReference'): + # it is ether protein group or a protein + xref = ent_ref.find('./bp:xref', ns) + if xref is not None: + value = resource(xref, ns) + prot_id = match_protein_id(value) + if prot_id: + protein_ref_id.append(prot_id) + else: + # protein group + for ent_mem in ent_ref.findall('./bp:memberEntityReference', ns): + value = resource(ent_mem, ns) + prot_id = match_protein_id(value) + if prot_id: + protein_ref_id.append(prot_id) + else: + import pdb;pdb.set_trace() + if protein_ref_id is not None: + members, protein_nodes = gen_proteins(protein_ref_id, name, pathway_id, pharma2uniprot) + elem_chemical_map[protein_elem_id] = protein_nodes + if tmp is not None: + tmp += generate_locations(protein, ns, protein_nodes, pathway_id) + tmp += members + else: + # give up + name = protein.find('./bp:standardName', ns).text + print("can't map protein to uniprot for {0}".format(name)) + return None + return elem_chemical_map.get(protein_elem_id, None) + + +def process_proteins(pathway, ns, pathway_id, genes_data, pharma2uniprot, elem_chemical_map): tmp = list() # properties often don't have valid attributes for protein in pathway.findall('./bp:Protein', ns): - name = protein.find('./bp:standardName', ns).text - organism = None - if name.strip().startswith('HIV'): - organism = '12721' - tmp += process_genes(name, pathway_id, organism) - protein_ref_id = None - for key, value in protein.attrib.items(): - match = protein_ref_re.match(value) - if match: - protein_ref_id = match.group(1) - if protein_ref_id is None: - for ent_ref in protein.findall('./bp:entityReference', ns): - for value in ent_ref.attrib.values(): - match = protein_ref_re.match(value) - if match: - protein_ref_id = match.group(1) - else: - assert 'PA' not in value - if protein_ref_id is None: - # give up - name = protein.find('./bp:standardName', ns).text - print("can't map protein to uniprot for {0}".format(name)) - continue - members, protein_nodes = gen_proteins(protein_ref_id, pathway_id, genes_data) - tmp += generate_locations(protein, ns, protein_nodes, pathway_id) - tmp += members + parse_protein(protein, pathway, ns, pathway_id, pharma2uniprot, elem_chemical_map, tmp=tmp) return tmp -protein_ref_re = re.compile('pgkb.[a-z0-9]+.[A-Za-z0-9]+.*(PA\d+).*') + +protein_ref_re = re.compile('.*(\.|/)(PA\d+)') go_location_re = re.compile('.*(GO:\d+).*') -def convert_pathway(pathway, chem_data, genes_data, pathway_id, pathway_name, ns): +drug_re = re.compile('pgkb.drug.*(PA\d+)') + + + +def wrap_set(molecules): + if len(molecules) == 1: + result = molecules[0] + else: + result = CSetLink(*molecules) + return result + + +def wrap_list(molecules): + if len(molecules) == 1: + result = molecules[0] + else: + result = CListLink(*molecules) + return result + + +def gen_interaction(interaction, pathway, pathway_id, ns, id_map, interaction_name): + result = list() + left_elem = interaction.find('./bp:left', ns) + right_elem = interaction.find('./bp:right', ns) + if left_elem is None or right_elem is None: + print("din't find \"from\" participant in interaction {0}".format(about(interaction, ns))) + id_map[about(interaction, ns)] = result + return result + left = resource(left_elem, ns) + right = resource(right_elem, ns) + parse_elem(pathway.find('./*[@rdf:about="{0}"]'.format(left), ns), pathway, ns, pathway_id, id_map) + parse_elem(pathway.find('./*[@rdf:about="{0}"]'.format(right), ns), pathway, ns, pathway_id, id_map) + left_mol = id_map.get(left, ()) + right_mol = id_map.get(right, ()) + ev = None + if left_mol and right_mol: + ev = CEvaluationLink( + CPredicateNode(interaction_name), + CListLink(wrap_list(left_mol), + wrap_list(right_mol))) + result.append(CContextLink( + CConceptNode(pathway_id), + ev)) + if not result: + print("failed to parse {0}".format(about(interaction, ns))) + id_map[about(interaction, ns)] = [] if ev is None else [ev] + return result + + +def parse_subelements(interaction, xpath, pathway, pathway_id, ns, id_map): + left_items = list() + for left in interaction.findall(xpath, ns): + left_id = resource(left, ns) + parse_elem(pathway.find('./*[@rdf:about="{0}"]'.format(left_id), ns), pathway, ns, pathway_id, id_map) + left_items += id_map.get(left_id, []) + return left_items + + +def gen_conversion(interaction, pathway, pathway_id, ns, id_map): + result = list() + left = parse_subelements(interaction, './bp:left', pathway, pathway_id, ns, id_map) + right = parse_subelements(interaction, './bp:right', pathway, pathway_id, ns, id_map) + if left and right: + ev = CEvaluationLink( + CPredicateNode('conversion_of'), + CListLink(wrap_list(left), + wrap_list(right))) + result.append(ev) + id_map[about(interaction, ns)] = [ev] + else: + id_map[about(interaction, ns)] = [] + return result + + +control_name_map = dict() +control_name_map['ACTIVATION'] = 'activation_of' +control_name_map['INHIBITION'] = 'inhibition_of' +control_name_map['leads_to'] = 'leads_to' + + +def parse_control(control, pathway, ns, pathway_id, id_map): + result = list() + controller_el = control.find('./bp:controller', ns) + if controller_el is None: + print("no controller in {0}".format(about(control, ns))) + id_map[about(control, ns)] = result + return result + controller_id = resource(controller_el, ns) + controller = process_component(pathway.find('./*[@rdf:about="{0}"]'.format(controller_id), ns), + pathway, ns, pathway_id, id_map, []) + # controlled is a some interaction or control + controlled_id = control.find('./bp:controlled[@rdf:resource]', ns).attrib['{{{0}}}resource'.format(ns['rdf'])] + + controlled = process_component(pathway.find('./*[@rdf:about="{0}"]'.format(controlled_id), ns), + pathway, ns, pathway_id, id_map, []) + control_type = None + if about(control, ns).startswith('pgkb.leadsTo'): + print("leadsTo control is not implemented") + elif about(control, ns).startswith('pgkb.control.transport'): + print("transport control is not implemented") + id_map[about(control, ns)] = result + return result + else: + control_type_elem = control.find('./bp:controlType', ns) + if control_type_elem is not None: + control_type = control_type_elem.text + else: + print("no control type in {0}".format(about(control, ns))) + if controlled and controller and control_type: + res = CEvaluationLink( + CPredicateNode(control_name_map[control_type]), + CListLink( + wrap_list(controller), + wrap_list(controlled))) + result.append(res) + id_map[about(control, ns)] = result + return result + + +def parse_catalysis(element, pathway, ns, pathway_id, id_map): + result = list() + controller_el = element.find('./bp:controller', ns) + if controller_el is None: + print("failed to parse - no controller in Catalysis {0}".format(about(element, ns))) + id_map[about(element, ns)] = [] + return result + controller_id = resource(controller_el, ns) + controller = process_component(pathway.find('./*[@rdf:about="{0}"]'.format(controller_id), ns), + pathway, ns, pathway_id, id_map, result=result) + # controled is a some interaction + controlled_id = resource(element.find('./bp:controlled[@rdf:resource]', ns), ns) + controlled_elem = find_about_element(pathway, ns, controlled_id) + controlled = process_component(controlled_elem, pathway, ns, pathway_id, id_map, result=result) + if controlled and controller: + for cont in controller: + res = CEvaluationLink( + CPredicateNode("catalysys_of"), + CListLink(cont, + wrap_list(controlled))) + result.append( + CContextLink(CConceptNode(pathway_id), + res)) + id_map[about(element, ns)] = [res] + else: + id_map[about(element, ns)] = [] + return result + + +def find_about_element(pathway, ns, elem_id): + return pathway.find('./*[@rdf:about="{0}"]'.format(elem_id), ns) + + +def parse_elem(elem, pathway, ns, pathway_id, id_map): + result = list() + if about(elem, ns) in id_map: + return id_map[about(elem, ns)] + if elem.tag.endswith('Complex'): + name = elem.find('./bp:standardName', ns).text + node = CConceptNode(name) + member = CMemberLink(node, + CConceptNode(pathway_id)) + id_map[about(elem, ns)] = [node] + result.append(member) + elif elem.tag.endswith('PhysicalEntity'): + print('PhysicalEntity as part of interaction is not supported {0}'.format(elem.find('./bp:standardName', ns).text)) + id_map[about(elem, ns)] = [] + elif elem.tag.endswith('Pathway'): + name = elem.find('./bp:standardName', ns).text + print("Pathway as component of interaction is not supported: {0}".format(name)) + id_map[about(elem, ns)] = [] + return result + elif elem.tag.endswith('Dna') or elem.tag.endswith('Rna'): + print("Dna and Rna as component of interaction is not supported: {0}".format(about(elem, ns))) + id_map[about(elem, ns)] = [] + return result + else: + import pdb;pdb.set_trace() + return result + + + +def about(elem, ns): + ab = '{{{0}}}about'.format(ns['rdf']) + return elem.attrib[ab] + +def resource(elem, ns): + res = '{{{0}}}resource'.format(ns['rdf']) + return elem.attrib[res] + + +def parse_interaction(interaction, pathway, ns, pathway_id, id_map): + result = [] + perticipant_el = interaction.findall('./bp:participant', ns) + if not perticipant_el: + print("no participant in interaction: {0}".format(about(interaction, ns))) + id_map[about(interaction, ns)] = [] + else: + participant_id = [resource(p, ns) for p in perticipant_el] + for par_id in participant_id: + participant = find_about_element(pathway, ns, par_id) + parse_elem(participant, pathway, ns, pathway_id, id_map) + result += id_map.get(par_id, []) + # it is subiteraction, replace it with it's participant + id_map[about(interaction, ns)] = result + return result + + +def process_component(interaction, pathway, ns, pathway_id, id_map, result): + interaction_name = interaction.tag.split('}')[-1] + if about(interaction, ns) in id_map: + return id_map[about(interaction, ns)] + if interaction_name == 'BiochemicalReaction': + result += gen_interaction(interaction, pathway, pathway_id, ns, id_map, 'reaction') + elif interaction_name == 'Transport': + result += gen_interaction(interaction, pathway, pathway_id, ns, id_map, 'transport_of') + elif interaction_name == 'Catalysis': + result += parse_catalysis(interaction, pathway, ns, pathway_id, id_map) + elif interaction_name == 'Interaction': + result += parse_interaction(interaction, pathway, ns, pathway_id, id_map) + elif interaction_name == 'Control': + result += parse_control(interaction, pathway, ns, pathway_id, id_map) + elif interaction_name == 'Conversion': + result += gen_conversion(interaction, pathway, pathway_id, ns, id_map) + elif interaction_name == 'Pathway': + result += parse_elem(interaction, pathway, ns, pathway_id, id_map) + elif interaction_name == 'ComplexAssembly': + print("ComplexAssembly parsing is not yet implemented") + id_map[about(interaction, ns)] = [] + elif interaction_name == 'Complex': + print("Complex parsing is not yet implemented") + id_map[about(interaction, ns)] = [] + elif interaction_name == 'Degradation': + print("Degradation parsing is not yet implemented") + id_map[about(interaction, ns)] = [] + elif interaction_name == 'TemplateReactionRegulation': + print("TemplateReactionRegulation parsing is not yet implemented") + id_map[about(interaction, ns)] = [] + else: + import pdb;pdb.set_trace() + return id_map[about(interaction, ns)] + + +def process_components(pathway, ns, pathway_id, id_map): + result = list() + for component in pathway.findall('bp:Pathway/bp:pathwayComponent', ns): + for comp in component.attrib.values(): + interaction = pathway.find('./*[@rdf:about="{0}"]'.format(comp), ns) + process_component(interaction, pathway, ns, pathway_id, id_map, result=result) + return result + + +def convert_pathway(pathway, chem_data, genes_data, pharma2uniprot, pathway_id, pathway_name, ns, pharma2chebi): print("processing pathway {0} {1}".format(pathway_id, pathway_name)) ev_name = CEvaluationLink( CPredicateNode("has_name"), CListLink(CConceptNode(pathway_id), CConceptNode(pathway_name))) tmp = [ev_name] - tmp += process_proteins(pathway, ns, pathway_id, genes_data) - tmp += process_small_molecules(pathway, ns, pathway_id, chem_data) + id_map = dict() + tmp += process_proteins(pathway, ns, pathway_id, genes_data, pharma2uniprot, id_map) + tmp += process_small_molecules(pathway, ns, pathway_id, chem_data, id_map, pharma2chebi) + tmp += process_components(pathway, ns, pathway_id, id_map) return '\n'.join([x.recursive_print() for x in tmp]) @@ -316,14 +641,20 @@ def build_request(url): return response +pharma2uniprot_url = 'https://github.com/noskill/knowledge-import/raw/master/uniprot2pharmagkb.tab.gz' +pharma2chebi_url = 'https://github.com/noskill/knowledge-import/raw/master/pharma2chebi.tsv' def download(): pathway = 'https://s3.pgkb.org/data/pathways-biopax.zip' genes = 'https://s3.pgkb.org/data/genes.zip' chemicals = 'https://s3.pgkb.org/data/chemicals.zip' + pathway_zip = ZipFile(BytesIO(build_request(pathway).read())) genes_zip = ZipFile(BytesIO(build_request(genes).read())) chem_zip = ZipFile(BytesIO(build_request(chemicals).read())) - return pathway_zip, genes_zip, chem_zip + pharma2uniprot = GzipFile(fileobj=BytesIO(build_request(pharma2uniprot_url).read())) + + pharma2chebi = BytesIO(build_request(pharma2chebi_url).read()) + return pathway_zip, genes_zip, chem_zip, pharma2uniprot, pharma2chebi def parse_args(): @@ -336,23 +667,59 @@ def parse_args(): help='zip archive with genes data in tsv format') parser.add_argument('--output', type=str, default='/tmp/pharmagkb.scm', help='path to output file') + parser.add_argument('--pharma2uniprot', type=str, default='', + help='path to pharma2uniprot file') + parser.add_argument('--pharma2chebi', type=str, default='', + help='pharma2chebi mapping file') return parser.parse_args() +def remove_duplicates(pharma2uniprot): + loc_entry = ([(i,pharma2uniprot.iloc[i]) for (i,x) in enumerate(pharma2uniprot['Cross-reference (PharmGKB)'].tolist()) if x.count('PA') > 1]) + table = pharma2uniprot.iloc[0:0] + prev_iloc = 0 + for (iloc, entry) in loc_entry: + table = table.append(pharma2uniprot.iloc[prev_iloc: iloc]) + for pharma_gkb_id in entry['Cross-reference (PharmGKB)'].split(';'): + if pharma_gkb_id: + new_entry = entry.copy() + new_entry['Cross-reference (PharmGKB)'] = pharma_gkb_id + ';' + table = table.append(new_entry) + prev_iloc = iloc + 1 + table = table.append(pharma2uniprot.iloc[prev_iloc:]) + table = table.rename(columns={'Cross-reference (PharmGKB)': 'pharma_id'}) + return table + + def main(): args = parse_args() if (args.pathways and args.genes and args): pathway_file = ZipFile(BytesIO(open(args.pathways, 'rb').read())) chemicals_file = ZipFile(BytesIO(open(args.chemicals, 'rb').read())) genes_file = ZipFile(BytesIO(open(args.genes, 'rb').read())) + # file is small, handle it separately + if args.pharma2uniprot: + pharma2uniprot_file = GzipFile(args.pharma2uniprot) + else: + pharma2uniprot_file = GzipFile(fileobj=BytesIO(urllib.request.urlopen(pharma2uniprot_url).read())) + # file is small, handle it separately + if args.pharma2chebi: + pharma2chebi_file = open(args.pharma2chebi) + else: + pharma2chebi_file = BytesIO(urllib.request.urlopen(pharma2chebi_url).read()) else: - pathway_file, genes_file, chemicals_file = download() + pathway_file, genes_file, chemicals_file, pharma2uniprot_file, pharma2chebi_file = download() + + # ensure that chibi id are strings + pharma2chebi = pandas.read_csv(pharma2chebi_file, converters={i: str for i in range(0, 30000)}, sep='\t') + chem_tsv = chemicals_file.open('chemicals.tsv') genes_tsv = genes_file.open('genes.tsv') genes_data = pandas.read_csv(genes_tsv, sep="\t") chem_data = pandas.read_csv(chem_tsv, sep="\t") - + pharma2uniprot = remove_duplicates(pandas.read_csv(pharma2uniprot_file, sep='\t')) + pathway_files = [x for x in pathway_file.namelist() if x.endswith('.owl')] out_path = args.output @@ -364,7 +731,9 @@ def main(): pathway_id, pathway_name = get_pathway_id_name(tree, ns) if pathway_id is None: pathway_id = filename.split('-')[0] - res = convert_pathway(tree, chem_data, genes_data, pathway_id, pathway_name, ns) + res = convert_pathway(tree, chem_data, genes_data, + pharma2uniprot, pathway_id, pathway_name, ns, + pharma2chebi) output.write(res) output.write('\n' * 3) output.close() diff --git a/uniprot2pharmagkb.tab.gz b/uniprot2pharmagkb.tab.gz new file mode 100644 index 0000000..f607718 Binary files /dev/null and b/uniprot2pharmagkb.tab.gz differ