diff --git a/atomwrappers.py b/atomwrappers.py
index 41f7608..e751c08 100644
--- a/atomwrappers.py
+++ b/atomwrappers.py
@@ -6,14 +6,14 @@
 
 
 class CAtom:
-    pass
+    def __hash__(self):
+        return hash(str(self))
 
 
 class CNode(CAtom):
+    atom_type = None
     def __init__(self, name):
         self.name = name
-        if name == 'HLA-B *57:01:01':
-            import pdb;pdb.set_trace()
 
     def __str__(self):
         return '({0} "{1}")'.format(self.atom_type, self.name.replace('"', '\\"')) 
@@ -22,6 +22,7 @@ def recursive_print(self, result='', indent=''):
         return result + indent + str(self)
 
 
+
 class CLink(CAtom):
     def __init__(self, *atoms):
         self.outgoing = atoms
@@ -60,4 +61,14 @@ class CListLink(CLink):
 class CGeneNode(CNode):
     atom_type = 'GeneNode'
 
+class CContextLink(CLink):
+    atom_type = 'ContextLink'
+
+
+class CSetLink(CLink):
+    atom_type = 'SetLink'
 
+    def __str__(self):
+        # str is used for hash computation, so need to sort outgoing set for all unordered links
+        outgoing = '\n'.join(sorted([str(x) for x in self.outgoing]))
+        return '({0} {1})'.format(self.atom_type, outgoing)
diff --git a/pharma2chebi.tsv b/pharma2chebi.tsv
new file mode 100644
index 0000000..93ffc5a
--- /dev/null
+++ b/pharma2chebi.tsv
@@ -0,0 +1,256 @@
+pharma_id	Name	PubChem	DrugBank	ChEBI
+PA166152901	serotonin	5202		28790
+PA161925594	tropisetron		11699	32269
+PA450267	lorazepam	46508468	186	6539
+PA451608	temazepam	46506604	231	9435
+PA164781320	flunitrazepam	46504553	1544	31622
+PA450496	midazolam	46507611	683	6931
+PA449681	flurazepam	7847395	690	5128
+PA10035	bromazepam	46505694	1558	31302
+PA166131305	nordazepam	2997		111762
+PA166178653	angiotensin 1-7	123805		55438
+PA166109554	angiotensin ii	65143		58506
+PA166109574	bradykinin	439201		3165
+PA165111696	artemisinin	68827		223316
+PA165111697	artesunate	5464098		63918
+PA450163	lamivudine	46507855	709	63577
+PA451735	tramadol	46506256	193	75725
+PA450616	nevirapine	46506789	238	63613
+	leukotriene c4			16978
+	leukotriene b4			15647
+PA134698661	pranlukast	692058	1411	94810
+	leukotriene d4			28666
+PA166114483	lumacaftor	16678941		90951
+PA165950341	ivacaftor	16220172	8820	66901
+PA165946873	vemurafenib	42611257		63637
+PA166178276	6-phosphogluconate	91493		48928
+PA166178280	dihydronicotinamide adenine dinucleotide phosphate	5884	2338	16474
+PA166160613	phosphoribosyl pyrophosphate	7339		17111
+PA166178277	ribulose-5-phosphate	439184		16332
+PA166178279	nicotinamide adenine dinucleotide phosphate	5886		44409
+PA166178274	glucose-6-phosphate	5958		17733
+PA166178278	fructose-6-phosphate	69507		15946
+PA166160614	ribose phosphate	77982		17797
+PA450457	methylene blue			6872
+PA451752	triamterene	46507623	384	9671
+	atp			15422
+PA449917	hydrogen peroxide	784		16240
+	nadh			16908
+PA450566	mycophenolate mofetil	7847817	688	8764
+PA166160619	inosine monophosphate	8582		17202
+PA166178641	androstenedione	6128	1536	16422
+PA451089	pravastatin	46504851	175	63618
+PA451258	ritodrine	9448	867	8872
+PA451616	terbutaline	9339	871	9449
+PA134687907	formoterol	46507099	983	63082
+PA450431	methoxamine	9716	723	6839
+PA451300	salmeterol	46508024	938	64064
+	tyrosine			58315
+PA450390	orciprenaline	9353	816	83329
+PA165110775	sn-38	104842		8988
+PA450085	irinotecan	46505871	762	80630
+	acetate			30089
+PA450487	metyrosine	46506079	765	6912
+PA166178639	cholesterol	5997		16113
+PA166178666	squalene	638072	11460	15440
+PA166160055	cholic acid	221493		16359
+PA164924493	axitinib			66910
+PA166118340	motesanib	11667893		51098
+PA164924492	brivanib			94562
+PA166118341	vandetanib	3081361		49960
+PA449646	flecainide	46508078	1195	75984
+PA451457	sotalol	9517	489	63622
+PA449839	halofantrine	603092	1218	94392
+PA449958	ibutilide	731244	308	5856
+PA451131	propafenone	46504529	1182	63619
+PA166131350	acrylic acid	6581		18308
+PA166160613	prpp	7339		17111
+PA451663	thioguanine	46508170	352	9555
+PA166160618	s-adenosylhomocysteine	439155		16680
+PA166178659	arachidonic acid	444899	4557	15843
+PA452620	tegafur	5386	9256	32188
+PA166131299	5'-deoxy-5-fluorouridine	18343		31521
+	histamine			18295
+PA166178990	somatostatin	16129681	9099	64628
+PA451234	repaglinide	46508150	912	8805
+	acetoacetyl-coa			15345
+PA154410481	prasugrel	99443238	6209	87723
+PA166179131	phosphatidylethanolamine	9546747		44887
+PA166179130	phosphatidylcholine	5287971	2306	73002
+PA10204	tenofovir	699321	300	63717
+PA10005	adefovir dipivoxil	7848718	718	31175
+	tryptophan			16828
+PA165291492	pazopanib	11525740		71219
+PA166163680	amitriptylinoxide	20313		135224
+PA166114906	bosutinib	160645775		39112
+PA165946122	crizotinib	11626560		64310
+PA166163464	inositol 1,4,5-trisphosphate	439456		16595
+PA165980594	ponatinib	24826799		78543
+PA131890628	tranilast			77572
+PA166178378	uric acid	1175		17775
+PA451106	probenecid	9576	1032	8426
+PA134852920	benzbromarone			3023
+PA166182234	candesartan cilexetil	2540		3348
+PA165958521	febuxostat			31596
+PA166178628	xanthine	1188		17712
+PA166178630	hypoxanthine	790		17368
+PA166131330	oxypurinol	4644		28315
+PA166178623	allantoin	204		15676
+PA450748	oxymorphone	46505296	1192	7865
+PA166163694	5-hydroxydiclofenac	3052566		59612
+PA166163693	4'-hydroxydiclofenac	116545		59613
+PA166163692	diclofenac acyl glucuronide	16084218		59609
+PA166163701	4',5-dihydroxydiclofenac	3052567		223401
+PA166163695	5-hydroxy diclofenac benzoquinone	15480180		59611
+PA166163700	3'-hydroxydiclofenac	112230		223792
+PA166163659	n-desmethylclozapine	2820		64050
+PA166165069	(s)-norverapamil	15593908		134081
+	cl-			17996
+	gaba			16865
+PA166109575	angiotensin i	3081372		2718
+	fe(ii)			29033
+PA165291922	doxorubicinol	83970		133817
+PA166131278	tetrahydrothiophene	1127		48458
+	nh4+			28938
+PA166131317	sulfolane	31347		74794
+PA165260403	arteether	72416		135335
+	m2			34827
+PA166131352	1-methyluric acid	69726		68441
+PA166131274	5-acetylamino-6-formylamino-3-methyluracil	108214		32643
+PA166131368	1-methylxanthine	80220		68443
+PA165820585	1,7-dimethylxanthine	4687		25858
+PA166131373	1,7-dimethyluric acid	91611		68449
+PA166131367	1,3,7-trimethyluric acid	79437		691622
+PA166131287	2-hydroxyiminostilbene	13103864		80599
+PA165817200	2-hydroxycarbamazepine	129274		80596
+PA165817197	3-hydroxycarbamazepine	135290		80597
+PA166131247	12-hydroxynevirapine	453338		145206
+	leukotriene a4			15651
+	leukotriene e4			15650
+PA166131575	chloride	312		17996
+	glutamate			14321
+	succinate			26806
+PA166178988	succinate semialdehyde	9543238		26805
+PA166178654	alpha-ketoglutarate	164533		16810
+PA451368	sodium	923		26708
+PA166131354	3-methylxanthine	70639		62205
+PA166131337	desmethyldoxepin	5353833		141547
+PA449773	glucose	5793		17234
+PA166178281	adenosine diphosphate	6022		22252
+	m1			34826
+	m3			51083
+	n-acetyl-p-benzoquinone imine			29132
+	h2o			15377
+	nadph			16474
+	gsh			16856
+	nadp			25523
+	gssg			17858
+	h2o2			16240
+	o2			15379
+	na+			29101
+	fad			16238
+	nad			13389
+	fmn			17621
+PA166124478	endoxifen	10090750		80555
+PA164712732	enzyme inhibitors			23924
+PA165823907	hydroxyphenytoin	17732		63804
+	nad+			15846
+	ip3			16595
+	gtp			15996
+	pip2			18348
+	dopa			49168
+	gdp			17552
+	camp			17489
+PA166131329	etoposide glucuronide	46173784		5427
+PA166131563	glutathione conjugate			24335
+PA166131331	quinone	76320713		36141
+	acth			3892
+PA452347	glucocorticoids			24261
+	sn-38g			8990
+	ldl			39026
+	idl			132933
+	vldl			39027
+	phospholipid			16247
+PA451776	triglycerides			17855
+	mevalonate			25350
+	hdl			39025
+	ca2+			29108
+PA448486	arsenic trioxide	46506448	1169	30621
+PA166131270	aldophosphamide	107744		2560
+PA166131319	chloroacetaldehyde	33		27871
+PA166131388	4-hydroxycyclophosphamide	99735		1864
+	ndps			7087
+PA166131349	gemcitabine diphosphate	6420157		42852
+PA166131301	aldoifosfamide	189733		80563
+PA166131315	4-hydroxyifosfamide	308171		80560
+PA166131366	acrolein	7847		15368
+PA166131285	carboxyifosfamide	124143		80564
+PA166131264	alcoifosfamide	10265589		80565
+PA166131387	4-ketoifosfamide	99303		80561
+PA166131371	aziridine	9033		30969
+PA166131307	chloroacetic acid	300		27869
+	1,3-oxazolidin-2-one			1237
+PA166160610	6-methylmercaptopurine riboside	237358		44081
+PA166163264	thioxanthine	1268107		51055
+PA166131558	npc	11756356		80552
+PA166131559	apc	10077584		80551
+PA166131242	3-hydroxycotinine glucuronide	53477725		133205
+PA166131378	cotinine n-oxide	9815514		89087
+PA166131324	norcotinine	413		89406
+PA166114414	cotinine	854019		68641
+PA166131561	cotinine glucuronide	178540		145221
+PA166131323	nornicotine	412		28313
+PA166131310	nicotine glucuronide	3035848		63860
+PA166131297	nicotine n-oxide	17785		132572
+PA166131340	morphine-3-glucuronide	5484731		80631
+PA166131341	codeine-6-glucuronide	5489029		80580
+PA166131414	normorphine	430245		7633
+PA452633	morphine-6-glucuronide	5360621		80581
+PA166131386	norcodeine	9925873		80579
+	pgh2			15554
+	txa2			15627
+	pgd2			15555
+	pge2			15551
+	pgi2			15552
+PA164713176	platinum compounds			33749
+PA166131258	5'-deoxy-5-fluorocytidine	10037499		80627
+	h+			15378
+	gastrin			75436
+	pip3			16618
+PA164713207	proton pump inhibitors			49200
+	amp			456215
+	k+			29103
+	co2			16526
+PA166182027	clopidogrel carboxylic acid	9861403		83504
+	farnesyl pyrophosphate			17407
+	hmg-coa			15467
+	geranyl pyrophosphate			17211
+	acetyl-coa			15351
+	adp			16761
+	cyclic amp			17489
+PA449483	eptifibatide	700331	63	291902
+	aa			15843
+	pgg2			27647
+	metabolite			25212
+PA166178987	diacylglycerol	6026790		18035
+	5-hydroxyindoleacetic acid			27823
+	p1			60949
+PA166131450	demethylcitalopram	162180		80603
+PA166131265	citalopram propionic acid	10403174		80605
+PA166131333	abacavir 5'-monophosphate	469588		64112
+	(-)-carbovir 5'-monophosphate			64116
+	(-)-carbovir 5'-triphosphate			64174
+	(-)-carbovir 5'-diphosphate			64172
+PA166177866	metoprolol acid	62936		83478
+PA166182233	o-desethyl candesartan	10047287		145224
+PA166182228	glycinexylidide	87833		357241
+PA166182229	monoethylglycinexylidide	24415		222828
+PA166131268	carboxyibuprofen	10444113		133199
+	glucuronide			15341
+PA166178985	anandamide	5281969		2700
+	pgf2a			15553
+	am1			140154
+	am9			140152
+PA166178656	5-hydroxyisourate	250388		18072
+PA164784024	peginterferon alfa-2b		22	63615
+PA166178629	5-hydroxyindole-3-acetic acid	1826		27823
diff --git a/pharmagkb.py b/pharmagkb.py
index 242158d..4ffb66e 100644
--- a/pharmagkb.py
+++ b/pharmagkb.py
@@ -11,6 +11,7 @@
 import argparse
 import xml.etree.ElementTree as ET
 from zipfile import ZipFile
+from gzip import GzipFile
 from io import BytesIO
 import pandas
 from atomwrappers import *
@@ -64,39 +65,8 @@ def pharma_to_id(chem_table, name):
     return chemical_id
 
 
-def gen_chemical_members(mol_id_map, pathway_id):
-    """
-    Generate member links for the substance pointed mol_id_map
-    
-    Parameters
-    ----------
-    mol_id_map: dict
-        Database name, id pairs
-    pathway_id: str
-        pharma gkb id for pathway
-        
-    Returns
-    -------
-    tuple[list, list]
-        member links connecting substance to pathway
-        MoleculeNodes for the substance
-    """
-    tmp = list()
-    molecules = list()
-    for (id_type, mol_id) in mol_id_map.items():
-        molecule = CMoleculeNode('{0}:{1}'.format(id_type, mol_id))
-        member = CMemberLink(molecule,
-                             CConceptNode(pathway_id))
-        tmp.append(member)
-        molecules.append(molecule)
-    return tmp, molecules
-
 
-gene_re = re.compile('([A-Z0-9-]*).*')
 def gen_gene_member(gene, pathway_id, organism=None):
-    match = gene_re.match(gene)
-    assert match is not None
-    gene = match.group(1)
     result = []
     member = CMemberLink(CGeneNode(gene), CConceptNode(pathway_id))
     result.append(member)
@@ -125,14 +95,15 @@ def process_genes(genes_str, pathway_id, organism=None):
             tmp += gen_gene_member(gene.strip(), pathway_id, organism=organism)
     return tmp
 
+
 uniprot_re = re.compile('.*UniProtKB:([A-Za-z0-9-]+).*')
-def gen_proteins(pharma_id, pathway_id, genes_data):
+def gen_proteins(pharma_id, name, pathway_id, pharma2uniprot):
     """
     Generate member links for the protein pointed by pharmagkb id
     
     Parameters
     ----------
-    pharma_id: str
+    pharma_id: List[str]
         pharma gkb id for protein
     pathway_id: str
         pharma gkb id for pathway
@@ -147,20 +118,18 @@ def gen_proteins(pharma_id, pathway_id, genes_data):
     """
     tmp = []
     proteins = []
-    gene = genes_data[genes_data['PharmGKB Accession Id'] == pharma_id]
-    if len(gene) == 0:
-        print("no gene for {0} in genes table".format(pharma_id))
-        return tmp, proteins
-    assert len(gene) == 1
-    for ref in gene['Cross-references'].tolist()[0].split(','):
-        match = uniprot_re.match(ref)
-        if match:
-            for prot_id in match.groups():
-                members, molecules = gen_chemical_members({'Uniprot': prot_id}, pathway_id)
-                tmp += members
-                proteins += molecules
-        else:
-            assert 'uniprot' not in ref.lower()
+    for prot in pharma_id:
+        entry = pharma2uniprot[pharma2uniprot.pharma_id == prot + ';']
+        if not len(entry):
+            print("not found uniprot id for {0}".format(name))
+            continue
+        for i in range(len(entry)):
+            prot_id = entry.iloc[i].Entry
+            molecule = CMoleculeNode('Uniprot:{0}'.format(prot_id))
+            member = CMemberLink(molecule,
+                                CConceptNode(pathway_id))
+            tmp.append(member)
+            proteins.append(molecule)
     return tmp, proteins
 
 
@@ -187,76 +156,432 @@ def generate_locations(elem, ns, chemical_nodes, pathway_id):
     return result
 
 
-def process_small_molecules(pathway, ns, pathway_id, chem_data):
+def find_chebi(pubchem, drugbank, name, pharma_id, pharma2chebi):
+    if pharma_id:
+        frame = pharma2chebi[pharma2chebi.pharma_id == pharma_id]
+        if len(frame):
+            assert len(set(frame['ChEBI'].to_list())) == 1
+            return frame.iloc[0]['ChEBI']
+    if name:
+        frame = pharma2chebi[pharma2chebi.Name == name]
+        if len(frame):
+            assert len(frame) == 1
+            return frame.iloc[0]['ChEBI']
+
+
+def parse_molecule(smallmolecule, ns, chem_data, pharma2chebi=None):
+    """
+    Parse SmallMolecule element
+    
+    Parameters
+    ----------
+    smallmolecule: xml.etree.ElementTree.Element
+        SmallMolecule
+    ns: dict
+        namespaces from the owl file
+    chem_data: pandas.DataFrame
+        table chemicals from pharagkb
+    
+    Returns
+    -------
+    dict, str
+        external db name: id pairs
+        human readable name
+    """
+    reference = smallmolecule.find('bp:entityReference', ns)
+    molecule_drug = dict()
+    assert reference is not None
+    name = smallmolecule.find('./bp:standardName', ns).text.lower().strip()
+    for value in reference.attrib.values():
+        pharma_pkg_id = re.match('.*\.(PA\d+)\.?.*', value)
+        if pharma_pkg_id is None:
+            # try by standard name
+
+            row = chem_data[chem_data.Name == name]
+            if len(row):
+                assert len(row) == 1
+                pharma_pkg_id = row.iloc[0]['PharmGKB Accession Id']
+            else:
+                print("no pharmapkg id for {0}".format(value))
+                continue
+        else:
+            pharma_pkg_id = pharma_pkg_id.group(1)
+        molecule_drug = pharma_to_id(chem_data, pharma_pkg_id)
+
+    # try pharma2chebi
+    if 'ChEBI' not in molecule_drug:
+        pubchem = molecule_drug.get('PubChem', '')
+        drugbank = molecule_drug.get('DrugBank', '')
+        if  pharma2chebi is not None:
+            chebi = find_chebi(pubchem, drugbank, name, pharma_pkg_id, pharma2chebi)
+            if chebi:
+                molecule_drug['ChEBI'] = chebi
+        if pharma_pkg_id is None:
+            pharma_pkg_id = ''
+        #name = name.replace('"', '\\"')
+        #with open('not_parsed.csv', 'at') as f:
+        #    f.write(pharma_pkg_id + '\t')
+        #    f.write(name + '\t')
+        #    f.write(pubchem + '\t')
+        #    f.write(drugbank + '\t')
+        #    f.write('\n')
+    return molecule_drug, name
+
+
+def process_small_molecules(pathway, ns, pathway_id, chem_data, id_map, pharma2chebi=None):
     tmp = list()
     for smallmolecule in pathway.findall('./bp:SmallMolecule', ns):
-        reference = smallmolecule.find('bp:entityReference', ns)
-        assert reference is not None
-        for value in reference.attrib.values():
-            pharma_pkg_id = re.match('.*\.(PA\d+)\.?.*', value)
-            if pharma_pkg_id is None:
-                # try by standard name
-                name = smallmolecule.find('./bp:standardName', ns).text.lower()
-                row = chem_data[chem_data.Name == name]
-                if len(row):
-                    assert len(row) == 1
-                    pharma_pkg_id = row.iloc[0]['PharmGKB Accession Id']
-                else:
-                    print("no pharmapkg id for {0}".format(value))
-                    continue
-            else:
-                pharma_pkg_id = pharma_pkg_id.group(1)
-            molecule_drug = pharma_to_id(chem_data, pharma_pkg_id)
-            members, chemical_nodes = gen_chemical_members(molecule_drug, pathway_id)
-            tmp += generate_locations(smallmolecule, ns, chemical_nodes, pathway_id)
-            tmp += members
+        molecule_drug, name = parse_molecule(smallmolecule, ns, chem_data, pharma2chebi)
+        molecule = CMoleculeNode(name)
+        member = CMemberLink(molecule,
+                    CConceptNode(pathway_id))
+        for db_name in ('ChEBI', 'PubChem', 'DrugBank'):
+            if db_name in molecule_drug:
+                name_node = CConceptNode("{0}:{1}".format(db_name, molecule_drug[db_name]))
+                ctx = CContextLink(
+                        CConceptNode(pathway_id),
+                        CEvaluationLink(
+                            CPredicateNode("has_{0}_id".format(db_name.lower())),
+                            CListLink(molecule,
+                                    name_node)))
+                tmp.append(ctx)
+        tmp.append(member)
+        id_map[about(smallmolecule, ns)] = [molecule]
+        tmp += generate_locations(smallmolecule, ns, [molecule], pathway_id)
+            
     return tmp
  
- 
-def process_proteins(pathway, ns, pathway_id, genes_data):
+
+def match_protein_id(value):
+    match = protein_ref_re.match(value)
+    if match:
+        return match.group(2)
+    else:
+        assert 'PA' not in value
+    return None
+
+
+def parse_protein(protein, pathway, ns, pathway_id, pharma2uniprot, elem_chemical_map, tmp=None):
+    name = protein.find('./bp:standardName', ns).text
+    protein_elem_id = about(protein, ns)
+    if protein_elem_id in elem_chemical_map:
+        return elem_chemical_map[protein_elem_id]
+
+    organism = None
+    if name.strip().startswith('HIV'):
+        organism = '12721'
+    reference_elem = protein.find('./bp:entityReference', ns)
+    if reference_elem is None:
+        print("failed to find reference for {0}".format(protein_elem_id))
+        elem_chemical_map[protein_elem_id] = []
+        return []
+    ent_ref_id = resource(reference_elem, ns)
+    ent_ref = pathway.find('./*[@rdf:about="{0}"]'.format(ent_ref_id), ns)
+    protein_ref_id = []
+    if ent_ref.tag.endswith('ProteinReference'):
+        # it is ether protein group or a protein
+        xref = ent_ref.find('./bp:xref', ns)
+        if xref is not None:
+            value = resource(xref, ns)
+            prot_id = match_protein_id(value)
+            if prot_id:
+                protein_ref_id.append(prot_id)
+        else:
+            # protein group
+            for ent_mem in ent_ref.findall('./bp:memberEntityReference', ns):
+                value = resource(ent_mem, ns)
+                prot_id = match_protein_id(value)
+                if prot_id:
+                    protein_ref_id.append(prot_id)
+    else:
+        import pdb;pdb.set_trace()
+    if protein_ref_id is not None:
+        members, protein_nodes = gen_proteins(protein_ref_id, name, pathway_id, pharma2uniprot)
+        elem_chemical_map[protein_elem_id] = protein_nodes
+        if tmp is not None:
+            tmp += generate_locations(protein, ns, protein_nodes, pathway_id)
+            tmp += members
+    else:
+        # give up
+        name = protein.find('./bp:standardName', ns).text
+        print("can't map protein to uniprot for {0}".format(name))
+        return None
+    return elem_chemical_map.get(protein_elem_id, None)
+
+
+def process_proteins(pathway, ns, pathway_id, genes_data, pharma2uniprot, elem_chemical_map):
     tmp = list()
     # properties often don't have valid attributes 
     for protein in pathway.findall('./bp:Protein', ns):
-        name = protein.find('./bp:standardName', ns).text
-        organism = None
-        if name.strip().startswith('HIV'):
-            organism = '12721'
-        tmp += process_genes(name, pathway_id, organism) 
-        protein_ref_id = None
-        for key, value in protein.attrib.items():
-            match = protein_ref_re.match(value)
-            if match:
-                protein_ref_id = match.group(1)
-        if protein_ref_id is None:
-            for ent_ref in protein.findall('./bp:entityReference', ns):
-                for value in ent_ref.attrib.values():
-                    match = protein_ref_re.match(value)
-                    if match:
-                        protein_ref_id = match.group(1)
-                    else:
-                        assert 'PA' not in value
-        if protein_ref_id is None:
-           # give up
-           name = protein.find('./bp:standardName', ns).text
-           print("can't map protein to uniprot for {0}".format(name))
-           continue
-        members, protein_nodes = gen_proteins(protein_ref_id, pathway_id, genes_data)
-        tmp += generate_locations(protein, ns, protein_nodes, pathway_id)
-        tmp += members
+        parse_protein(protein, pathway, ns, pathway_id, pharma2uniprot, elem_chemical_map, tmp=tmp)
     return tmp
 
 
-protein_ref_re = re.compile('pgkb.[a-z0-9]+.[A-Za-z0-9]+.*(PA\d+).*')
+
+protein_ref_re = re.compile('.*(\.|/)(PA\d+)')
 go_location_re = re.compile('.*(GO:\d+).*')
-def convert_pathway(pathway, chem_data, genes_data, pathway_id, pathway_name, ns):
+drug_re = re.compile('pgkb.drug.*(PA\d+)')
+
+
+
+def wrap_set(molecules):
+    if len(molecules) == 1:
+        result = molecules[0]
+    else:
+        result = CSetLink(*molecules)
+    return result
+
+
+def wrap_list(molecules):
+    if len(molecules) == 1:
+        result = molecules[0]
+    else:
+        result = CListLink(*molecules)
+    return result
+
+
+def gen_interaction(interaction, pathway, pathway_id, ns, id_map, interaction_name):
+    result = list()
+    left_elem = interaction.find('./bp:left', ns)
+    right_elem = interaction.find('./bp:right', ns)
+    if left_elem is None or right_elem is None:
+        print("din't find \"from\" participant in interaction {0}".format(about(interaction, ns)))
+        id_map[about(interaction, ns)] = result
+        return result
+    left = resource(left_elem, ns)
+    right = resource(right_elem, ns)
+    parse_elem(pathway.find('./*[@rdf:about="{0}"]'.format(left), ns), pathway, ns, pathway_id, id_map)
+    parse_elem(pathway.find('./*[@rdf:about="{0}"]'.format(right), ns), pathway, ns, pathway_id, id_map)      
+    left_mol = id_map.get(left, ())
+    right_mol = id_map.get(right, ())
+    ev = None
+    if left_mol and right_mol:
+        ev = CEvaluationLink(
+            CPredicateNode(interaction_name),
+            CListLink(wrap_list(left_mol),
+                        wrap_list(right_mol)))
+        result.append(CContextLink(
+                        CConceptNode(pathway_id),
+                        ev))
+    if not result:
+        print("failed to parse {0}".format(about(interaction, ns)))
+    id_map[about(interaction, ns)] = [] if ev is None else [ev]
+    return result
+
+
+def parse_subelements(interaction, xpath, pathway, pathway_id, ns, id_map):
+    left_items = list()
+    for left in interaction.findall(xpath, ns):
+        left_id = resource(left, ns)
+        parse_elem(pathway.find('./*[@rdf:about="{0}"]'.format(left_id), ns), pathway, ns, pathway_id, id_map)
+        left_items += id_map.get(left_id, [])
+    return left_items
+
+
+def gen_conversion(interaction, pathway, pathway_id, ns, id_map):
+    result = list()
+    left = parse_subelements(interaction, './bp:left', pathway, pathway_id, ns, id_map)
+    right = parse_subelements(interaction, './bp:right', pathway, pathway_id, ns, id_map)
+    if left and right:
+        ev = CEvaluationLink(
+                CPredicateNode('conversion_of'),
+                CListLink(wrap_list(left),
+                        wrap_list(right)))
+        result.append(ev)
+        id_map[about(interaction, ns)] = [ev]
+    else:
+        id_map[about(interaction, ns)] = []
+    return result
+
+
+control_name_map = dict()
+control_name_map['ACTIVATION'] = 'activation_of'
+control_name_map['INHIBITION'] = 'inhibition_of'
+control_name_map['leads_to'] = 'leads_to'
+
+
+def parse_control(control, pathway, ns, pathway_id, id_map):
+    result = list()
+    controller_el = control.find('./bp:controller', ns)
+    if controller_el is None:
+        print("no controller in {0}".format(about(control, ns)))
+        id_map[about(control, ns)] = result
+        return result
+    controller_id = resource(controller_el, ns)
+    controller = process_component(pathway.find('./*[@rdf:about="{0}"]'.format(controller_id), ns), 
+                                   pathway, ns, pathway_id, id_map, [])
+    # controlled is a some interaction or control
+    controlled_id = control.find('./bp:controlled[@rdf:resource]', ns).attrib['{{{0}}}resource'.format(ns['rdf'])]
+
+    controlled = process_component(pathway.find('./*[@rdf:about="{0}"]'.format(controlled_id), ns), 
+                                   pathway, ns, pathway_id, id_map, [])
+    control_type = None
+    if about(control, ns).startswith('pgkb.leadsTo'):
+        print("leadsTo control is not implemented")
+    elif about(control, ns).startswith('pgkb.control.transport'):
+        print("transport control is not implemented")
+        id_map[about(control, ns)] = result
+        return result
+    else:
+        control_type_elem = control.find('./bp:controlType', ns)
+        if control_type_elem is not None:
+            control_type = control_type_elem.text
+        else:
+            print("no control type in {0}".format(about(control, ns)))
+    if controlled and controller and control_type:
+        res = CEvaluationLink(
+                    CPredicateNode(control_name_map[control_type]),
+                    CListLink(
+                        wrap_list(controller),
+                        wrap_list(controlled)))
+        result.append(res)
+    id_map[about(control, ns)] = result
+    return result
+    
+
+def parse_catalysis(element, pathway, ns, pathway_id, id_map):
+    result = list()
+    controller_el = element.find('./bp:controller', ns)
+    if controller_el is None:
+        print("failed to parse - no controller in Catalysis {0}".format(about(element, ns)))
+        id_map[about(element, ns)] = []
+        return result
+    controller_id = resource(controller_el, ns)
+    controller = process_component(pathway.find('./*[@rdf:about="{0}"]'.format(controller_id), ns), 
+                                   pathway, ns, pathway_id, id_map, result=result)
+    # controled is a some interaction
+    controlled_id = resource(element.find('./bp:controlled[@rdf:resource]', ns), ns)
+    controlled_elem = find_about_element(pathway, ns, controlled_id)
+    controlled = process_component(controlled_elem, pathway, ns, pathway_id, id_map, result=result)
+    if controlled and controller:
+        for cont in controller:
+            res = CEvaluationLink(
+                    CPredicateNode("catalysys_of"),
+                    CListLink(cont,
+                        wrap_list(controlled)))
+        result.append(
+            CContextLink(CConceptNode(pathway_id),
+                         res))
+        id_map[about(element, ns)] = [res]
+    else:
+        id_map[about(element, ns)] = []
+    return result
+
+
+def find_about_element(pathway, ns, elem_id):
+    return pathway.find('./*[@rdf:about="{0}"]'.format(elem_id), ns)
+
+
+def parse_elem(elem, pathway, ns, pathway_id, id_map):
+    result = list()
+    if about(elem, ns) in id_map:
+        return id_map[about(elem, ns)]
+    if elem.tag.endswith('Complex'):
+        name = elem.find('./bp:standardName', ns).text
+        node = CConceptNode(name)
+        member = CMemberLink(node,
+                        CConceptNode(pathway_id))
+        id_map[about(elem, ns)] = [node]
+        result.append(member)
+    elif elem.tag.endswith('PhysicalEntity'):
+        print('PhysicalEntity as part of interaction is not supported {0}'.format(elem.find('./bp:standardName', ns).text))
+        id_map[about(elem, ns)] = []
+    elif elem.tag.endswith('Pathway'):
+        name = elem.find('./bp:standardName', ns).text
+        print("Pathway as component of interaction is not supported: {0}".format(name))
+        id_map[about(elem, ns)] = []
+        return result
+    elif elem.tag.endswith('Dna') or elem.tag.endswith('Rna'):
+        print("Dna and Rna as component of interaction is not supported: {0}".format(about(elem, ns)))
+        id_map[about(elem, ns)] = []
+        return result
+    else:
+        import pdb;pdb.set_trace()
+    return result
+        
+        
+        
+def about(elem, ns):
+    ab = '{{{0}}}about'.format(ns['rdf'])
+    return elem.attrib[ab]
+
+def resource(elem, ns):
+    res = '{{{0}}}resource'.format(ns['rdf'])
+    return elem.attrib[res]
+
+
+def parse_interaction(interaction, pathway, ns, pathway_id, id_map):
+    result = []
+    perticipant_el = interaction.findall('./bp:participant', ns)
+    if not perticipant_el:
+        print("no participant in interaction: {0}".format(about(interaction, ns)))
+        id_map[about(interaction, ns)] = []
+    else:
+        participant_id = [resource(p, ns) for p in perticipant_el]
+        for par_id in participant_id:
+            participant = find_about_element(pathway, ns, par_id)
+            parse_elem(participant, pathway, ns, pathway_id, id_map)
+            result += id_map.get(par_id, [])
+        # it is subiteraction, replace it with it's participant
+        id_map[about(interaction, ns)] = result
+    return result
+
+
+def process_component(interaction, pathway, ns, pathway_id, id_map, result):
+    interaction_name = interaction.tag.split('}')[-1]
+    if about(interaction, ns) in id_map:
+        return id_map[about(interaction, ns)]
+    if interaction_name == 'BiochemicalReaction':
+        result += gen_interaction(interaction, pathway, pathway_id, ns, id_map, 'reaction')
+    elif interaction_name == 'Transport':
+        result += gen_interaction(interaction, pathway, pathway_id, ns, id_map, 'transport_of')
+    elif interaction_name == 'Catalysis':
+        result += parse_catalysis(interaction, pathway, ns, pathway_id, id_map)
+    elif interaction_name == 'Interaction':
+        result += parse_interaction(interaction, pathway, ns, pathway_id, id_map)
+    elif interaction_name == 'Control':
+        result += parse_control(interaction, pathway, ns, pathway_id, id_map)
+    elif interaction_name == 'Conversion':
+        result += gen_conversion(interaction, pathway, pathway_id, ns, id_map)
+    elif interaction_name == 'Pathway':
+        result += parse_elem(interaction, pathway, ns, pathway_id, id_map)
+    elif interaction_name == 'ComplexAssembly':
+        print("ComplexAssembly parsing is not yet implemented")
+        id_map[about(interaction, ns)] = []
+    elif interaction_name == 'Complex':
+        print("Complex parsing is not yet implemented")
+        id_map[about(interaction, ns)] = []
+    elif interaction_name == 'Degradation':
+        print("Degradation parsing is not yet implemented")
+        id_map[about(interaction, ns)] = []
+    elif interaction_name == 'TemplateReactionRegulation':
+        print("TemplateReactionRegulation parsing is not yet implemented")
+        id_map[about(interaction, ns)] = []
+    else:
+        import pdb;pdb.set_trace()
+    return id_map[about(interaction, ns)]
+
+
+def process_components(pathway, ns, pathway_id, id_map):
+    result = list()
+    for component in pathway.findall('bp:Pathway/bp:pathwayComponent', ns):
+        for comp in component.attrib.values():
+            interaction = pathway.find('./*[@rdf:about="{0}"]'.format(comp), ns)
+            process_component(interaction, pathway, ns, pathway_id, id_map, result=result)
+    return result
+        
+        
+def convert_pathway(pathway, chem_data, genes_data, pharma2uniprot, pathway_id, pathway_name, ns, pharma2chebi):
     print("processing pathway {0} {1}".format(pathway_id, pathway_name))
     ev_name = CEvaluationLink(
                  CPredicateNode("has_name"),
                  CListLink(CConceptNode(pathway_id),
                            CConceptNode(pathway_name)))
     tmp = [ev_name]
-    tmp += process_proteins(pathway, ns, pathway_id, genes_data)
-    tmp += process_small_molecules(pathway, ns, pathway_id, chem_data)
+    id_map = dict()
+    tmp += process_proteins(pathway, ns, pathway_id, genes_data, pharma2uniprot, id_map)
+    tmp += process_small_molecules(pathway, ns, pathway_id, chem_data, id_map, pharma2chebi)
+    tmp += process_components(pathway, ns, pathway_id, id_map)
     return '\n'.join([x.recursive_print() for x in tmp])
 
 
@@ -316,14 +641,20 @@ def build_request(url):
     return response
 
 
+pharma2uniprot_url = 'https://github.com/noskill/knowledge-import/raw/master/uniprot2pharmagkb.tab.gz'
+pharma2chebi_url = 'https://github.com/noskill/knowledge-import/raw/master/pharma2chebi.tsv'
 def download():
     pathway = 'https://s3.pgkb.org/data/pathways-biopax.zip' 
     genes = 'https://s3.pgkb.org/data/genes.zip'
     chemicals = 'https://s3.pgkb.org/data/chemicals.zip'
+
     pathway_zip = ZipFile(BytesIO(build_request(pathway).read()))
     genes_zip = ZipFile(BytesIO(build_request(genes).read()))
     chem_zip = ZipFile(BytesIO(build_request(chemicals).read()))
-    return pathway_zip, genes_zip, chem_zip
+    pharma2uniprot = GzipFile(fileobj=BytesIO(build_request(pharma2uniprot_url).read()))
+    
+    pharma2chebi = BytesIO(build_request(pharma2chebi_url).read())
+    return pathway_zip, genes_zip, chem_zip, pharma2uniprot, pharma2chebi
 
 
 def parse_args():
@@ -336,23 +667,59 @@ def parse_args():
                         help='zip archive with genes data in tsv format')
     parser.add_argument('--output', type=str, default='/tmp/pharmagkb.scm',
                         help='path to output file')
+    parser.add_argument('--pharma2uniprot', type=str, default='',
+                        help='path to pharma2uniprot file')
+    parser.add_argument('--pharma2chebi', type=str, default='',
+                        help='pharma2chebi mapping file')
     return parser.parse_args()
 
 
+def remove_duplicates(pharma2uniprot):
+    loc_entry = ([(i,pharma2uniprot.iloc[i]) for (i,x) in enumerate(pharma2uniprot['Cross-reference (PharmGKB)'].tolist()) if x.count('PA') > 1])
+    table = pharma2uniprot.iloc[0:0]
+    prev_iloc = 0
+    for (iloc, entry) in loc_entry:
+        table = table.append(pharma2uniprot.iloc[prev_iloc: iloc])
+        for pharma_gkb_id in entry['Cross-reference (PharmGKB)'].split(';'):
+            if pharma_gkb_id:
+                new_entry = entry.copy()
+                new_entry['Cross-reference (PharmGKB)'] = pharma_gkb_id + ';'
+                table = table.append(new_entry)
+        prev_iloc = iloc + 1
+    table = table.append(pharma2uniprot.iloc[prev_iloc:])
+    table = table.rename(columns={'Cross-reference (PharmGKB)': 'pharma_id'})
+    return table
+
+
 def main():
     args = parse_args()
     if (args.pathways and args.genes and args):
         pathway_file = ZipFile(BytesIO(open(args.pathways, 'rb').read()))
         chemicals_file = ZipFile(BytesIO(open(args.chemicals, 'rb').read()))
         genes_file = ZipFile(BytesIO(open(args.genes, 'rb').read()))
+        # file is small, handle it separately
+        if args.pharma2uniprot:
+            pharma2uniprot_file = GzipFile(args.pharma2uniprot)
+        else:
+            pharma2uniprot_file = GzipFile(fileobj=BytesIO(urllib.request.urlopen(pharma2uniprot_url).read()))
+        # file is small, handle it separately
+        if args.pharma2chebi:
+            pharma2chebi_file = open(args.pharma2chebi)
+        else:
+            pharma2chebi_file = BytesIO(urllib.request.urlopen(pharma2chebi_url).read())
     else:
-        pathway_file, genes_file, chemicals_file = download()
+        pathway_file, genes_file, chemicals_file, pharma2uniprot_file, pharma2chebi_file = download()
+    
+    # ensure that chibi id are strings
+    pharma2chebi = pandas.read_csv(pharma2chebi_file, converters={i: str for i in range(0, 30000)}, sep='\t')
+
     chem_tsv = chemicals_file.open('chemicals.tsv')
     genes_tsv = genes_file.open('genes.tsv')
 
     genes_data = pandas.read_csv(genes_tsv, sep="\t")
     chem_data = pandas.read_csv(chem_tsv, sep="\t")
-
+    pharma2uniprot = remove_duplicates(pandas.read_csv(pharma2uniprot_file, sep='\t'))
+    
     pathway_files = [x for x in pathway_file.namelist() if x.endswith('.owl')]
 
     out_path = args.output
@@ -364,7 +731,9 @@ def main():
         pathway_id, pathway_name = get_pathway_id_name(tree, ns)
         if pathway_id is None:
             pathway_id = filename.split('-')[0]
-        res = convert_pathway(tree, chem_data, genes_data, pathway_id, pathway_name, ns)
+        res = convert_pathway(tree, chem_data, genes_data, 
+                              pharma2uniprot, pathway_id, pathway_name, ns,
+                              pharma2chebi)
         output.write(res)
         output.write('\n' * 3)
     output.close()
diff --git a/uniprot2pharmagkb.tab.gz b/uniprot2pharmagkb.tab.gz
new file mode 100644
index 0000000..f607718
Binary files /dev/null and b/uniprot2pharmagkb.tab.gz differ