-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Clean up DrugBank and TWOSIDES importers (#57)
* Clean up importers * Update tox.ini * Additional cleanup
- Loading branch information
Showing
9 changed files
with
703,165 additions
and
894,819 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,102 +1,68 @@ | ||
import json | ||
import random | ||
import rdkit | ||
import numpy as np | ||
import pandas as pd | ||
from tqdm import tqdm | ||
|
||
from rdkit.Chem import AllChem | ||
from rdkit.Chem import DataStructs | ||
|
||
from tdc.multi_pred import DDI | ||
|
||
DDI(name="DrugBank") | ||
|
||
positive_samples = pd.read_csv("./data/drugbank.tab", sep="\t") | ||
|
||
print(positive_samples.shape) | ||
|
||
drugs_raw = {} | ||
big_map = {} | ||
|
||
for sample in positive_samples.values.tolist(): | ||
drugs_raw[sample[0]] = sample[-2] | ||
drugs_raw[sample[1]] = sample[-1] | ||
big_map[(sample[0], sample[1], sample[2])] = 1 | ||
big_map[(sample[1], sample[0], sample[2])] = 1 | ||
|
||
drugs = list(drugs_raw.keys()) | ||
contexts = list(set(positive_samples["Y"].values.tolist())) | ||
|
||
print(len(drugs)) | ||
print(len(contexts)) | ||
|
||
negative_samples = [] | ||
|
||
labeled_triples = positive_samples[["ID1", "ID2", "Y"]] | ||
labeled_triples.columns = ["drug_1", "drug_2", "context"] | ||
labeled_triples["label"] = 1.0 | ||
|
||
for _ in tqdm(range(len(positive_samples.values.tolist()))): | ||
drug_1, drug_2 = random.sample(drugs, 2) | ||
context = random.choice(contexts) | ||
if (drug_1, drug_2, context) in big_map: | ||
while (drug_1, drug_2, context) in big_map: | ||
drug_1, drug_2 = random.sample(drugs, 2) | ||
context = random.choice(contexts) | ||
negative_sample = [drug_1, drug_2, context, 0.0] | ||
negative_samples.append(negative_sample) | ||
"""Download and pre-process the DrugBank drug-drug interaction dataset.""" | ||
|
||
negative_samples = pd.DataFrame(negative_samples, columns=["drug_1", "drug_2", "context", "label"]) | ||
import math | ||
from random import Random | ||
|
||
labeled_triples = pd.concat([labeled_triples, negative_samples]) | ||
labeled_triples["context"] = labeled_triples["context"].map(lambda x: "context_" + str(x)) | ||
|
||
print(labeled_triples.shape) | ||
|
||
labeled_triples.to_csv("labeled_triples.csv", index=None) | ||
|
||
drugs_raw[ | ||
"DB09323" | ||
] = "O.O.O.O.C(CNCC1=CC=CC=C1)NCC1=CC=CC=C1.[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)CC1=CC=CC=C1)C(O)=O.[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)CC1=CC=CC=C1)C(O)=O" | ||
drugs_raw[ | ||
"DB13450" | ||
] = "[O-]S(=O)(=O)C1=CC=CC=C1.[O-]S(=O)(=O)C1=CC=CC=C1.COC1=CC2=C(C=C1OC)[C@@H](CC1=CC(OC)=C(OC)C=C1)[N@@+](C)(CCC(=O)OCCCCCOC(=O)CC[N@@+]1(C)CCC3=C(C=C(OC)C(OC)=C3)[C@H]1CC1=CC(OC)=C(OC)C=C1)CC2" | ||
drugs_raw["DB09396"] = "O.OS(=O)(=O)C1=CC2=CC=CC=C2C=C1.CCC(=O)O[C@@](CC1=CC=CC=C1)([C@H](C)CN(C)C)C1=CC=CC=C1" | ||
drugs_raw["DB09162"] = "[Fe+3].OC(CC([O-])=O)(CC([O-])=O)C([O-])=O" | ||
drugs_raw["DB11106"] = "CC(C)(N)CO.CN1C2=C(NC(Br)=N2)C(=O)N(C)C1=O" | ||
drugs_raw[ | ||
"DB11630" | ||
] = "C1CC2=NC1=C(C3=CC=C(N3)C(=C4C=CC(=N4)C(=C5C=CC(=C2C6=CC(=CC=C6)O)N5)C7=CC(=CC=C7)O)C8=CC(=CC=C8)O)C9=CC(=CC=C9)O" | ||
drugs_raw["DB00958"] = "C1CC(C1)(C(=O)O)C(=O)O.[NH2-].[NH2-].[Pt+2]" | ||
drugs_raw["DB00526"] = "C1CCC(C(C1)[NH-])[NH-].C(=O)(C(=O)O)O.[Pt+2]" | ||
drugs_raw["DB13145"] = "C(C(=O)O)O.[NH2-].[NH2-].[Pt+2]" | ||
drugs_raw["DB00515"] = "N.N.Cl[Pt]Cl" | ||
|
||
drug_set = {} | ||
for drug, smiles in drugs_raw.items(): | ||
drug_set[drug] = {} | ||
drug_set[drug]["smiles"] = smiles | ||
molecule = rdkit.Chem.MolFromSmiles(smiles) | ||
features = AllChem.GetHashedMorganFingerprint(molecule, 2, nBits=256) | ||
array = np.zeros((0,), dtype=np.int8) | ||
DataStructs.ConvertToNumpyArray(features, array) | ||
drug_features = array.tolist() | ||
drug_set[drug]["features"] = drug_features | ||
|
||
with open("drug_set.json", "w") as f: | ||
json.dump(drug_set, f) | ||
|
||
context_count = len(contexts) | ||
|
||
|
||
def map_context(index, countext_count): | ||
context_vector = [0 for i in range(countext_count)] | ||
context_vector[index] = 1 | ||
return context_vector | ||
|
||
|
||
context_set = {context: map_context(i, context_count) for i, context in enumerate(contexts)} | ||
|
||
with open("context_set.json", "w") as f: | ||
json.dump(context_set, f) | ||
import click | ||
import pandas as pd | ||
from utils import get_index, get_samples, get_tdc, write_artifacts | ||
|
||
|
||
@click.command()
@click.option("--seed", type=int, default=42, show_default=True, help="Random seed")
@click.option("--ratio", type=float, default=1.0, show_default=True, help="Negative sampling ratio")
def main(seed: int, ratio: float):
    """Download and pre-process the DrugBank DDI dataset.

    Reads ``drugbank.tab`` from the input directory returned by ``get_tdc``,
    builds the positive drug-drug-context triples, asks ``get_samples`` for
    ``ceil(ratio * number_of_positives)`` negative samples, and writes
    ``labeled_triples.csv`` plus the artifacts produced by ``write_artifacts``
    into the output directory.

    :param seed: Seed for the ``random.Random`` instance handed to ``get_samples``.
    :param ratio: Number of negative samples to request per positive sample.
    """
    rng = Random(seed)
    input_directory, output_directory = get_tdc("drugbank", "drugbankddi")

    positive_samples = pd.read_csv(
        input_directory.joinpath("drugbank.tab"),
        sep="\t",
        usecols=[0, 1, 2, 4, 5],
        header=0,
        names=["drug_1", "drug_2", "context", "drug_1_smiles", "drug_2_smiles"],
    )
    # Zero-pad the context identifier to two digits.
    # NOTE(review): presumably so that lexical order matches numeric order - confirm.
    positive_samples["context"] = positive_samples["context"].map(lambda x: f"context_{x:02}")
    print("Number of positive samples:", positive_samples.shape[0])
    print("Columns:", positive_samples.columns)

    contexts = sorted(set(positive_samples["context"]))
    print("Number of contexts:", len(contexts))

    # Index drugs' SMILES and drug-drug-context triples
    drugs_raw, big_map = get_index(positive_samples)
    # Hard-coded SMILES that add to or replace the entries indexed from the file.
    # NOTE(review): presumably the upstream strings for these IDs are unusable - confirm.
    drugs_raw.update(
        {
            "DB09323": "O.O.O.O.C(CNCC1=CC=CC=C1)NCC1=CC=CC=C1.[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)CC1=CC=CC=C1)C(O)=O.[H][C@]12SC(C)(C)[C@@H](N1C(=O)[C@H]2NC(=O)CC1=CC=CC=C1)C(O)=O",  # noqa:E501
            "DB13450": "[O-]S(=O)(=O)C1=CC=CC=C1.[O-]S(=O)(=O)C1=CC=CC=C1.COC1=CC2=C(C=C1OC)[C@@H](CC1=CC(OC)=C(OC)C=C1)[N@@+](C)(CCC(=O)OCCCCCOC(=O)CC[N@@+]1(C)CCC3=C(C=C(OC)C(OC)=C3)[C@H]1CC1=CC(OC)=C(OC)C=C1)CC2",  # noqa:E501
            "DB09396": "O.OS(=O)(=O)C1=CC2=CC=CC=C2C=C1.CCC(=O)O[C@@](CC1=CC=CC=C1)([C@H](C)CN(C)C)C1=CC=CC=C1",
            "DB09162": "[Fe+3].OC(CC([O-])=O)(CC([O-])=O)C([O-])=O",
            "DB11106": "CC(C)(N)CO.CN1C2=C(NC(Br)=N2)C(=O)N(C)C1=O",
            "DB11630": "C1CC2=NC1=C(C3=CC=C(N3)C(=C4C=CC(=N4)C(=C5C=CC(=C2C6=CC(=CC=C6)O)N5)C7=CC(=CC=C7)O)C8=CC(=CC=C8)O)C9=CC(=CC=C9)O",  # noqa:E501
            "DB00958": "C1CC(C1)(C(=O)O)C(=O)O.[NH2-].[NH2-].[Pt+2]",
            "DB00526": "C1CCC(C(C1)[NH-])[NH-].C(=O)(C(=O)O)O.[Pt+2]",
            "DB13145": "C(C(=O)O)O.[NH2-].[NH2-].[Pt+2]",
            "DB00515": "N.N.Cl[Pt]Cl",
        }
    )

    drugs = list(drugs_raw)
    print("Number of drugs:", len(drugs))

    # Generate negative samples
    negative_samples = get_samples(
        rng=rng,
        n=math.ceil(ratio * positive_samples.shape[0]),
        drugs=drugs,
        contexts=contexts,
        big_map=big_map,
    )

    # ``assign`` returns a fresh frame, so the label column is never written
    # into a slice of ``positive_samples`` (avoids SettingWithCopyWarning).
    labeled_triples = positive_samples[["drug_1", "drug_2", "context"]].assign(label=1.0)
    labeled_triples = pd.concat([labeled_triples, negative_samples])
    print("Number of total triples:", labeled_triples.shape)
    labeled_triples.to_csv(output_directory.joinpath("labeled_triples.csv"), index=False)

    write_artifacts(output_directory=output_directory, drugs_raw=drugs_raw, contexts=contexts)
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,98 +1,59 @@ | ||
import json | ||
import random | ||
import rdkit | ||
import heapq | ||
import numpy as np | ||
import pandas as pd | ||
from tqdm import tqdm | ||
|
||
from rdkit.Chem import AllChem | ||
from rdkit.Chem import DataStructs | ||
|
||
from tdc.multi_pred import DDI | ||
"""Download and pre-process the TWOSIDES drug-drug interaction dataset.""" | ||
|
||
import math | ||
import typing | ||
from collections import Counter | ||
from random import Random | ||
|
||
DDI(name="TWOSIDES") | ||
|
||
positive_samples = pd.read_csv("./data/twosides.csv", sep=",") | ||
|
||
print(positive_samples) | ||
|
||
context_counts = Counter(positive_samples["Side Effect Name"].values.tolist()) | ||
|
||
print(context_counts) | ||
|
||
contexts = heapq.nlargest(10, context_counts, key=context_counts.get) | ||
|
||
print(contexts) | ||
|
||
positive_samples = positive_samples[positive_samples["Side Effect Name"].isin(contexts)] | ||
|
||
print(positive_samples.shape) | ||
|
||
drugs_raw = {} | ||
big_map = {} | ||
|
||
for sample in positive_samples.values.tolist(): | ||
drugs_raw[sample[0]] = sample[-2] | ||
drugs_raw[sample[1]] = sample[-1] | ||
big_map[(sample[0], sample[1], sample[3])] = 1 | ||
big_map[(sample[1], sample[0], sample[3])] = 1 | ||
|
||
drugs = list(drugs_raw.keys()) | ||
|
||
print(len(drugs)) | ||
print(len(contexts)) | ||
|
||
negative_samples = [] | ||
|
||
labeled_triples = positive_samples[["ID1", "ID2", "Y"]] | ||
labeled_triples.columns = ["drug_1", "drug_2", "context"] | ||
labeled_triples["label"] = 1.0 | ||
|
||
for _ in tqdm(range(len(positive_samples.values.tolist()))): | ||
drug_1, drug_2 = random.sample(drugs, 2) | ||
context = random.choice(contexts) | ||
if (drug_1, drug_2, context) in big_map: | ||
while (drug_1, drug_2, context) in big_map: | ||
drug_1, drug_2 = random.sample(drugs, 2) | ||
context = random.choice(contexts) | ||
negative_sample = [drug_1, drug_2, context, 0.0] | ||
negative_samples.append(negative_sample) | ||
import click | ||
import pandas as pd | ||
from tabulate import tabulate | ||
from utils import get_index, get_samples, get_tdc, write_artifacts | ||
|
||
negative_samples = pd.DataFrame(negative_samples, columns=["drug_1", "drug_2", "context", "label"]) | ||
|
||
labeled_triples = pd.concat([labeled_triples, negative_samples]) | ||
@click.command() | ||
@click.option("--seed", type=int, default=42, show_default=True, help="Random seed") | ||
@click.option("--ratio", type=float, default=1.0, show_default=True, help="Negative sampling ratio") | ||
@click.option("--top", type=int, default=10, show_default=True, help="Keep top most common side effects") | ||
def main(seed: int, ratio: float, top: int): | ||
"""Download and pre-process the TWOSIDES dataset.""" | ||
rng = Random(seed) | ||
input_directory, output_directory = get_tdc("TWOSIDES", "twosides") | ||
|
||
print(labeled_triples.shape) | ||
positive_samples = pd.read_csv( | ||
input_directory.joinpath("twosides.csv"), | ||
sep=",", | ||
header=0, | ||
usecols=[0, 1, 3, 4, 5], | ||
names=["drug_1", "drug_2", "context", "drug_1_smiles", "drug_2_smiles"], | ||
) | ||
print("Number of positive samples:", positive_samples.shape[0]) | ||
|
||
labeled_triples.to_csv("labeled_triples.csv", index=None) | ||
context_counts: typing.Counter[str] = Counter(positive_samples["context"].values.tolist()) | ||
contexts = sorted(key for key, _ in context_counts.most_common(top)) | ||
print(tabulate(context_counts.most_common(top), headers=["context", "count"])) | ||
|
||
drug_set = {} | ||
for drug, smiles in drugs_raw.items(): | ||
drug_set[drug] = {} | ||
drug_set[drug]["smiles"] = smiles | ||
molecule = rdkit.Chem.MolFromSmiles(smiles) | ||
features = AllChem.GetHashedMorganFingerprint(molecule, 2, nBits=256) | ||
array = np.zeros((0,), dtype=np.int8) | ||
DataStructs.ConvertToNumpyArray(features, array) | ||
drug_features = array.tolist() | ||
drug_set[drug]["features"] = drug_features | ||
positive_samples = positive_samples[positive_samples["context"].isin(set(contexts))] | ||
print(positive_samples.shape) | ||
|
||
with open("drug_set.json", "w") as f: | ||
json.dump(drug_set, f) | ||
drugs_raw, big_map = get_index(positive_samples) | ||
|
||
context_count = len(contexts) | ||
drugs = list(drugs_raw.keys()) | ||
print("Number of drugs:", len(drugs)) | ||
|
||
# Generate negative samples | ||
negative_samples = get_samples( | ||
rng=rng, n=int(math.ceil(ratio * positive_samples.shape[0])), drugs=drugs, contexts=contexts, big_map=big_map | ||
) | ||
|
||
def map_context(index, countext_count): | ||
context_vector = [0 for i in range(countext_count)] | ||
context_vector[index] = 1 | ||
return context_vector | ||
labeled_triples = positive_samples[["drug_1", "drug_2", "context"]] | ||
labeled_triples["label"] = 1.0 | ||
labeled_triples = pd.concat([labeled_triples, negative_samples]) | ||
print("Number of total triples:", labeled_triples.shape) | ||
labeled_triples.to_csv(output_directory.joinpath("labeled_triples.csv"), index=False) | ||
|
||
write_artifacts(output_directory=output_directory, drugs_raw=drugs_raw, contexts=contexts) | ||
|
||
context_set = {context: map_context(i, context_count) for i, context in enumerate(contexts)} | ||
|
||
with open("context_set.json", "w") as f: | ||
json.dump(context_set, f) | ||
if __name__ == "__main__": | ||
main() |
Oops, something went wrong.