Skip to content

Commit

Permalink
Clean up DrugBank and TWOSIDES importers (#57)
Browse files Browse the repository at this point in the history
* Clean up importers

* Update tox.ini

* Additional cleanup
  • Loading branch information
cthoyt authored Jan 21, 2022
1 parent 29e7f5f commit f5f0ffb
Show file tree
Hide file tree
Showing 9 changed files with 703,165 additions and 894,819 deletions.
166 changes: 66 additions & 100 deletions data_cleaning/drugbank_ddi_cleaner.py
Original file line number Diff line number Diff line change
@@ -1,102 +1,68 @@
import json
import random
import rdkit
import numpy as np
import pandas as pd
from tqdm import tqdm

from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

from tdc.multi_pred import DDI

DDI(name="DrugBank")

positive_samples = pd.read_csv("./data/drugbank.tab", sep="\t")

print(positive_samples.shape)

drugs_raw = {}
big_map = {}

for sample in positive_samples.values.tolist():
drugs_raw[sample[0]] = sample[-2]
drugs_raw[sample[1]] = sample[-1]
big_map[(sample[0], sample[1], sample[2])] = 1
big_map[(sample[1], sample[0], sample[2])] = 1

drugs = list(drugs_raw.keys())
contexts = list(set(positive_samples["Y"].values.tolist()))

print(len(drugs))
print(len(contexts))

negative_samples = []

labeled_triples = positive_samples[["ID1", "ID2", "Y"]]
labeled_triples.columns = ["drug_1", "drug_2", "context"]
labeled_triples["label"] = 1.0

for _ in tqdm(range(len(positive_samples.values.tolist()))):
drug_1, drug_2 = random.sample(drugs, 2)
context = random.choice(contexts)
if (drug_1, drug_2, context) in big_map:
while (drug_1, drug_2, context) in big_map:
drug_1, drug_2 = random.sample(drugs, 2)
context = random.choice(contexts)
negative_sample = [drug_1, drug_2, context, 0.0]
negative_samples.append(negative_sample)
"""Download and pre-process the DrugBank drug-drug interaction dataset."""

negative_samples = pd.DataFrame(negative_samples, columns=["drug_1", "drug_2", "context", "label"])
import math
from random import Random

labeled_triples = pd.concat([labeled_triples, negative_samples])
labeled_triples["context"] = labeled_triples["context"].map(lambda x: "context_" + str(x))

print(labeled_triples.shape)

labeled_triples.to_csv("labeled_triples.csv", index=None)

drugs_raw[
"DB09323"
] = "O.O.O.O.C(CNCC1=CC=CC=C1)NCC1=CC=CC=C1.[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)CC1=CC=CC=C1)C(O)=O.[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)CC1=CC=CC=C1)C(O)=O"
drugs_raw[
"DB13450"
] = "[O-]S(=O)(=O)C1=CC=CC=C1.[O-]S(=O)(=O)C1=CC=CC=C1.COC1=CC2=C(C=C1OC)[C@@H](CC1=CC(OC)=C(OC)C=C1)[N@@+](C)(CCC(=O)OCCCCCOC(=O)CC[N@@+]1(C)CCC3=C(C=C(OC)C(OC)=C3)[C@H]1CC1=CC(OC)=C(OC)C=C1)CC2"
drugs_raw["DB09396"] = "O.OS(=O)(=O)C1=CC2=CC=CC=C2C=C1.CCC(=O)O[C@@](CC1=CC=CC=C1)([C@H](C)CN(C)C)C1=CC=CC=C1"
drugs_raw["DB09162"] = "[Fe+3].OC(CC([O-])=O)(CC([O-])=O)C([O-])=O"
drugs_raw["DB11106"] = "CC(C)(N)CO.CN1C2=C(NC(Br)=N2)C(=O)N(C)C1=O"
drugs_raw[
"DB11630"
] = "C1CC2=NC1=C(C3=CC=C(N3)C(=C4C=CC(=N4)C(=C5C=CC(=C2C6=CC(=CC=C6)O)N5)C7=CC(=CC=C7)O)C8=CC(=CC=C8)O)C9=CC(=CC=C9)O"
drugs_raw["DB00958"] = "C1CC(C1)(C(=O)O)C(=O)O.[NH2-].[NH2-].[Pt+2]"
drugs_raw["DB00526"] = "C1CCC(C(C1)[NH-])[NH-].C(=O)(C(=O)O)O.[Pt+2]"
drugs_raw["DB13145"] = "C(C(=O)O)O.[NH2-].[NH2-].[Pt+2]"
drugs_raw["DB00515"] = "N.N.Cl[Pt]Cl"

drug_set = {}
for drug, smiles in drugs_raw.items():
drug_set[drug] = {}
drug_set[drug]["smiles"] = smiles
molecule = rdkit.Chem.MolFromSmiles(smiles)
features = AllChem.GetHashedMorganFingerprint(molecule, 2, nBits=256)
array = np.zeros((0,), dtype=np.int8)
DataStructs.ConvertToNumpyArray(features, array)
drug_features = array.tolist()
drug_set[drug]["features"] = drug_features

with open("drug_set.json", "w") as f:
json.dump(drug_set, f)

context_count = len(contexts)


def map_context(index, countext_count):
    """Return a one-hot list with a 1 at ``index`` and 0 everywhere else.

    :param index: Position in the vector to set to 1.
    :param countext_count: Length of the returned vector (the parameter name
        is kept as-is so existing callers keep working).
    :returns: A list of ``countext_count`` integers.
    :raises IndexError: If ``index`` is not a valid list index for that length.
    """
    # List repetition is safe here because the elements are immutable ints.
    context_vector = [0] * countext_count
    context_vector[index] = 1
    return context_vector


context_set = {context: map_context(i, context_count) for i, context in enumerate(contexts)}

with open("context_set.json", "w") as f:
json.dump(context_set, f)
import click
import pandas as pd
from utils import get_index, get_samples, get_tdc, write_artifacts


@click.command()
@click.option("--seed", type=int, default=42, show_default=True, help="Random seed")
@click.option("--ratio", type=float, default=1.0, show_default=True, help="Negative sampling ratio")
def main(seed: int, ratio: float):
    """Download and pre-process the DrugBank DDI dataset."""
    # NOTE: click uses the docstring above as the command's ``--help`` text,
    # so longer explanations are kept in comments.
    #
    # seed:  seeds the ``random.Random`` instance handed to ``get_samples``.
    # ratio: number of negative samples requested per positive sample
    #        (the product is rounded up to a whole number).
    rng = Random(seed)
    input_directory, output_directory = get_tdc("drugbank", "drugbankddi")

    # Read columns 0, 1, 2, 4 and 5 of the tab-separated file and rename them;
    # ``header=0`` discards the header row of the file in favour of ``names``.
    positive_samples = pd.read_csv(
        input_directory.joinpath("drugbank.tab"),
        sep="\t",
        usecols=[0, 1, 2, 4, 5],
        header=0,
        names=["drug_1", "drug_2", "context", "drug_1_smiles", "drug_2_smiles"],
    )
    # Turn each context value into a zero-padded label such as "context_07".
    positive_samples["context"] = positive_samples["context"].map(lambda x: f"context_{x:02}")
    print("Number of positive samples:", positive_samples.shape[0])
    print("Columns:", positive_samples.columns)

    # Sorted so the context order does not depend on set iteration order.
    contexts = sorted(set(positive_samples["context"].tolist()))
    print("Number of contexts:", len(contexts))

    # Index drugs' SMILES and drug-drug-context triples
    drugs_raw, big_map = get_index(positive_samples)
    # Hard-coded SMILES for ten DrugBank identifiers.
    # NOTE(review): presumably these replace entries that are missing or
    # unusable in the downloaded file - confirm against the data.
    drugs_raw.update(
        {
            "DB09323": "O.O.O.O.C(CNCC1=CC=CC=C1)NCC1=CC=CC=C1.[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)CC1=CC=CC=C1)C(O)=O.[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)CC1=CC=CC=C1)C(O)=O",  # noqa:E501
            "DB13450": "[O-]S(=O)(=O)C1=CC=CC=C1.[O-]S(=O)(=O)C1=CC=CC=C1.COC1=CC2=C(C=C1OC)[C@@H](CC1=CC(OC)=C(OC)C=C1)[N@@+](C)(CCC(=O)OCCCCCOC(=O)CC[N@@+]1(C)CCC3=C(C=C(OC)C(OC)=C3)[C@H]1CC1=CC(OC)=C(OC)C=C1)CC2",  # noqa:E501
            "DB09396": "O.OS(=O)(=O)C1=CC2=CC=CC=C2C=C1.CCC(=O)O[C@@](CC1=CC=CC=C1)([C@H](C)CN(C)C)C1=CC=CC=C1",
            "DB09162": "[Fe+3].OC(CC([O-])=O)(CC([O-])=O)C([O-])=O",
            "DB11106": "CC(C)(N)CO.CN1C2=C(NC(Br)=N2)C(=O)N(C)C1=O",
            "DB11630": "C1CC2=NC1=C(C3=CC=C(N3)C(=C4C=CC(=N4)C(=C5C=CC(=C2C6=CC(=CC=C6)O)N5)C7=CC(=CC=C7)O)C8=CC(=CC=C8)O)C9=CC(=CC=C9)O",  # noqa:E501
            "DB00958": "C1CC(C1)(C(=O)O)C(=O)O.[NH2-].[NH2-].[Pt+2]",
            "DB00526": "C1CCC(C(C1)[NH-])[NH-].C(=O)(C(=O)O)O.[Pt+2]",
            "DB13145": "C(C(=O)O)O.[NH2-].[NH2-].[Pt+2]",
            "DB00515": "N.N.Cl[Pt]Cl",
        }
    )

    drugs = list(drugs_raw)
    print("Number of drugs:", len(drugs))

    # Generate negative samples
    negative_samples = get_samples(
        rng=rng,
        n=math.ceil(ratio * positive_samples.shape[0]),
        drugs=drugs,
        contexts=contexts,
        big_map=big_map,
    )

    # ``assign`` returns a new frame, so the label column is never written
    # into a selection of ``positive_samples`` (which would trigger pandas'
    # SettingWithCopyWarning).
    labeled_triples = positive_samples[["drug_1", "drug_2", "context"]].assign(label=1.0)
    labeled_triples = pd.concat([labeled_triples, negative_samples])
    print("Number of total triples:", labeled_triples.shape)
    labeled_triples.to_csv(output_directory.joinpath("labeled_triples.csv"), index=False)

    write_artifacts(output_directory=output_directory, drugs_raw=drugs_raw, contexts=contexts)


if __name__ == "__main__":
    main()
127 changes: 44 additions & 83 deletions data_cleaning/twosides_cleaner.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,59 @@
import json
import random
import rdkit
import heapq
import numpy as np
import pandas as pd
from tqdm import tqdm

from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

from tdc.multi_pred import DDI
"""Download and pre-process the TWOSIDES drug-drug interaction dataset."""

import math
import typing
from collections import Counter
from random import Random

DDI(name="TWOSIDES")

positive_samples = pd.read_csv("./data/twosides.csv", sep=",")

print(positive_samples)

context_counts = Counter(positive_samples["Side Effect Name"].values.tolist())

print(context_counts)

contexts = heapq.nlargest(10, context_counts, key=context_counts.get)

print(contexts)

positive_samples = positive_samples[positive_samples["Side Effect Name"].isin(contexts)]

print(positive_samples.shape)

drugs_raw = {}
big_map = {}

for sample in positive_samples.values.tolist():
drugs_raw[sample[0]] = sample[-2]
drugs_raw[sample[1]] = sample[-1]
big_map[(sample[0], sample[1], sample[3])] = 1
big_map[(sample[1], sample[0], sample[3])] = 1

drugs = list(drugs_raw.keys())

print(len(drugs))
print(len(contexts))

negative_samples = []

labeled_triples = positive_samples[["ID1", "ID2", "Y"]]
labeled_triples.columns = ["drug_1", "drug_2", "context"]
labeled_triples["label"] = 1.0

for _ in tqdm(range(len(positive_samples.values.tolist()))):
drug_1, drug_2 = random.sample(drugs, 2)
context = random.choice(contexts)
if (drug_1, drug_2, context) in big_map:
while (drug_1, drug_2, context) in big_map:
drug_1, drug_2 = random.sample(drugs, 2)
context = random.choice(contexts)
negative_sample = [drug_1, drug_2, context, 0.0]
negative_samples.append(negative_sample)
import click
import pandas as pd
from tabulate import tabulate
from utils import get_index, get_samples, get_tdc, write_artifacts

negative_samples = pd.DataFrame(negative_samples, columns=["drug_1", "drug_2", "context", "label"])

labeled_triples = pd.concat([labeled_triples, negative_samples])
@click.command()
@click.option("--seed", type=int, default=42, show_default=True, help="Random seed")
@click.option("--ratio", type=float, default=1.0, show_default=True, help="Negative sampling ratio")
@click.option("--top", type=int, default=10, show_default=True, help="Keep top most common side effects")
def main(seed: int, ratio: float, top: int):
"""Download and pre-process the TWOSIDES dataset."""
rng = Random(seed)
input_directory, output_directory = get_tdc("TWOSIDES", "twosides")

print(labeled_triples.shape)
positive_samples = pd.read_csv(
input_directory.joinpath("twosides.csv"),
sep=",",
header=0,
usecols=[0, 1, 3, 4, 5],
names=["drug_1", "drug_2", "context", "drug_1_smiles", "drug_2_smiles"],
)
print("Number of positive samples:", positive_samples.shape[0])

labeled_triples.to_csv("labeled_triples.csv", index=None)
context_counts: typing.Counter[str] = Counter(positive_samples["context"].values.tolist())
contexts = sorted(key for key, _ in context_counts.most_common(top))
print(tabulate(context_counts.most_common(top), headers=["context", "count"]))

drug_set = {}
for drug, smiles in drugs_raw.items():
drug_set[drug] = {}
drug_set[drug]["smiles"] = smiles
molecule = rdkit.Chem.MolFromSmiles(smiles)
features = AllChem.GetHashedMorganFingerprint(molecule, 2, nBits=256)
array = np.zeros((0,), dtype=np.int8)
DataStructs.ConvertToNumpyArray(features, array)
drug_features = array.tolist()
drug_set[drug]["features"] = drug_features
positive_samples = positive_samples[positive_samples["context"].isin(set(contexts))]
print(positive_samples.shape)

with open("drug_set.json", "w") as f:
json.dump(drug_set, f)
drugs_raw, big_map = get_index(positive_samples)

context_count = len(contexts)
drugs = list(drugs_raw.keys())
print("Number of drugs:", len(drugs))

# Generate negative samples
negative_samples = get_samples(
rng=rng, n=int(math.ceil(ratio * positive_samples.shape[0])), drugs=drugs, contexts=contexts, big_map=big_map
)

def map_context(index, countext_count):
    """Build the one-hot encoding of a context as a plain list.

    :param index: Position of the context; this entry is set to 1.
    :param countext_count: Number of entries in the returned list (the
        parameter name is kept as-is so existing callers keep working).
    :returns: A list of ``countext_count`` integers, 1 at ``index`` and 0 elsewhere.
    :raises IndexError: If ``index`` is not a valid list index for that length.
    """
    # Repeating a one-element list is safe because ints are immutable.
    context_vector = [0] * countext_count
    context_vector[index] = 1
    return context_vector
labeled_triples = positive_samples[["drug_1", "drug_2", "context"]]
labeled_triples["label"] = 1.0
labeled_triples = pd.concat([labeled_triples, negative_samples])
print("Number of total triples:", labeled_triples.shape)
labeled_triples.to_csv(output_directory.joinpath("labeled_triples.csv"), index=False)

write_artifacts(output_directory=output_directory, drugs_raw=drugs_raw, contexts=contexts)

context_set = {context: map_context(i, context_count) for i, context in enumerate(contexts)}

with open("context_set.json", "w") as f:
json.dump(context_set, f)
if __name__ == "__main__":
main()
Loading

0 comments on commit f5f0ffb

Please sign in to comment.