This repo demonstrates how to integrate SQLModel with the RDKit database cartridge in the simplest way possible.
You may want to check out an existing solution like razi, but it requires some modifications to work with SQLModel at the time of writing, such as updating similarity search operators and result value processing.
$ git clone https://github.com/savvan0h/sqlmodel-rdkit-example.git
$ cd sqlmodel-rdkit-example
$ python -m venv venv
$ . ./venv/bin/activate
$ pip install .
If you don’t have your own database, you can use the rvianello/postgres-rdkit:16.2-2024.03.1 image to set up a PostgreSQL database with the RDKit extension.
Warning
PostgreSQL with RDKit must use the same RDKit version as your Python environment.
$ docker run \
--name postgres-rdkit \
-p 5432:5432 \
-e POSTGRES_PASSWORD=mysecretpassword \
-d rvianello/postgres-rdkit:16.2-2024.03.1
$ docker exec postgres-rdkit psql -U postgres postgres -c 'CREATE EXTENSION rdkit'
from sqlmodel import create_engine
from models import Compound
# Change the connection string as needed
engine = create_engine("postgresql+psycopg://postgres:mysecretpassword@localhost:5432/postgres")
Compound.__table__.create(engine)
from sqlmodel import Session
from models import morganbv_fp
from molecules import SMILES_SAMPLE
session = Session(engine)
for i, smiles in enumerate(SMILES_SAMPLE):
compound = Compound(
name=f"Compound {i}",
molecule=smiles,
mfp2=morganbv_fp(smiles)
)
session.add(compound)
session.commit()
- Python
from sqlmodel import select
statement = select(Compound).where(Compound.molecule == "COC(c1ccccc1)c1ccccc1") # value can be a mol object
session.exec(statement).all()
- SQL
SELECT compound.id,
compound.name,
mol_to_pkl(compound.molecule) AS molecule,
bfp_to_binary_text(compound.mfp2) AS mfp2
FROM compound
WHERE compound.molecule @= mol_from_pkl(%(molecule_1)s)
- Python
statement = select(Compound).where(Compound.molecule.hassubstruct("C1=C(C)C=CC=C1"))
session.exec(statement).all()
- SQL
SELECT compound.id,
compound.name,
mol_to_pkl(compound.molecule) AS molecule,
bfp_to_binary_text(compound.mfp2) AS mfp2
FROM compound
WHERE compound.molecule @> mol_from_pkl(%(molecule_1)s)
- Python
from sqlmodel import cast
from models import QMol
qmol = cast("c1[c,n]cccc1", QMol)
statement = select(Compound).where(Compound.molecule.hassubstruct(qmol))
session.exec(statement).all()
- SQL
SELECT compound.id,
compound.name,
mol_to_pkl(compound.molecule) AS molecule,
bfp_to_binary_text(compound.mfp2) AS mfp2
FROM compound
WHERE compound.molecule @> CAST(%(param_1)s AS qmol)
- Python
statement = select(Compound).where(Compound.molecule.issubstruct("CCN1c2ccccc2Sc2ccccc21"))
session.exec(statement).all()
- SQL
SELECT compound.id,
compound.name,
mol_to_pkl(compound.molecule) AS molecule,
bfp_to_binary_text(compound.mfp2) AS mfp2
FROM compound
WHERE compound.molecule <@ mol_from_pkl(%(molecule_1)s)
- Python
smiles = "CCN1c2ccccc2Sc2ccccc21"
statement = select(Compound).where(Compound.mfp2.tanimoto_sml(morganbv_fp(smiles)))
session.exec(statement).all()
- SQL
SELECT compound.id,
compound.name,
mol_to_pkl(compound.molecule) AS molecule,
bfp_to_binary_text(compound.mfp2) AS mfp2
FROM compound
WHERE compound.mfp2 %% morganbv_fp(%(morganbv_fp_1)s::VARCHAR)
- Python
from sqlmodel import text
session.exec(text("SET rdkit.tanimoto_threshold=0.6")) # default is 0.5
- SQL
SET rdkit.tanimoto_threshold=0.6