Skip to content

Commit

Permalink
support generalized substructure search in the SubstructLibrary (rdki…
Browse files Browse the repository at this point in the history
…t#6835)

* support generalized substructure search in the SubstructLibrary

* simplify namespaces

* support the new functionality in the swig wrappers

* update mac swig version in CI

* ensure swig4

* switch mac_java ci builds to conda-forge

* change in response to review

* add copy ctor to extendedquerymol

* Back to the way it was
  • Loading branch information
greglandrum authored Nov 1, 2023
1 parent 81e9cb2 commit 908e47c
Show file tree
Hide file tree
Showing 15 changed files with 435 additions and 96 deletions.
2 changes: 1 addition & 1 deletion .azure-pipelines/linux_build_java.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
steps:
- bash: |
sudo apt-get update
sudo apt-get install -y software-properties-common zlib1g zlib1g-dev swig3.0
sudo apt-get install -y software-properties-common zlib1g zlib1g-dev swig
sudo apt-get install -y libboost-all-dev libfreetype-dev libeigen3-dev
displayName: Setup build environment
- bash: |
Expand Down
23 changes: 23 additions & 0 deletions Code/GraphMol/GeneralizedSubstruct/XQMol.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,29 @@ ExtendedQueryMol::ExtendedQueryMol(const std::string &text, bool isJSON) {
}
}

void ExtendedQueryMol::initFromOther(const ExtendedQueryMol &other) {
if (std::holds_alternative<ExtendedQueryMol::RWMol_T>(other.xqmol)) {
xqmol = std::make_unique<RWMol>(
*std::get<ExtendedQueryMol::RWMol_T>(other.xqmol));
} else if (std::holds_alternative<ExtendedQueryMol::MolBundle_T>(
other.xqmol)) {
xqmol = std::make_unique<MolBundle>(
*std::get<ExtendedQueryMol::MolBundle_T>(other.xqmol));
} else if (std::holds_alternative<ExtendedQueryMol::TautomerQuery_T>(
other.xqmol)) {
xqmol = std::make_unique<TautomerQuery>(
*std::get<ExtendedQueryMol::TautomerQuery_T>(other.xqmol));
} else if (std::holds_alternative<ExtendedQueryMol::TautomerBundle_T>(
other.xqmol)) {
auto tb = std::make_unique<std::vector<std::unique_ptr<TautomerQuery>>>();
for (const auto &tqp :
*std::get<ExtendedQueryMol::TautomerBundle_T>(other.xqmol)) {
tb->emplace_back(std::make_unique<TautomerQuery>(*tqp));
}
xqmol = std::move(tb);
}
}

std::vector<MatchVectType> SubstructMatch(
const ROMol &mol, const ExtendedQueryMol &query,
const SubstructMatchParameters &params) {
Expand Down
12 changes: 10 additions & 2 deletions Code/GraphMol/GeneralizedSubstruct/XQMol.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@

namespace RDKit {
namespace GeneralizedSubstruct {
struct RDKIT_GENERALIZEDSUBSTRUCT_EXPORT ExtendedQueryMol
: private boost::noncopyable {
struct RDKIT_GENERALIZEDSUBSTRUCT_EXPORT ExtendedQueryMol {
enum ExtendedQueryMolTypes : unsigned char {
XQM_MOL = 1,
XQM_MOLBUNDLE = 2,
Expand All @@ -50,12 +49,21 @@ struct RDKIT_GENERALIZEDSUBSTRUCT_EXPORT ExtendedQueryMol
ExtendedQueryMol(
std::unique_ptr<std::vector<std::unique_ptr<TautomerQuery>>> tqs)
: xqmol(std::move(tqs)) {}
ExtendedQueryMol(const ExtendedQueryMol &other) { initFromOther(other); }
ExtendedQueryMol &operator=(const ExtendedQueryMol &other) {
if (this == &other) {
return *this;
}
initFromOther(other);
return *this;
}

ExtendedQueryMol(ExtendedQueryMol &&o) noexcept : xqmol(std::move(o.xqmol)) {}
ExtendedQueryMol(const std::string &text, bool isJSON = false);

void initFromBinary(const std::string &pkl);
void initFromJSON(const std::string &text);
void initFromOther(const ExtendedQueryMol &other);

ContainedType xqmol;
std::string toBinary() const;
Expand Down
73 changes: 49 additions & 24 deletions Code/GraphMol/GeneralizedSubstruct/catch_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,45 +123,70 @@ TEST_CASE("tautomer bundle basics") {
}
}

TEST_CASE("createExtendedQueryMol") {
TEST_CASE("createExtendedQueryMol and copy ctors") {
SECTION("RWMol") {
auto mol = "COCC"_smiles;
REQUIRE(mol);
auto xqm = createExtendedQueryMol(*mol);
CHECK(std::holds_alternative<ExtendedQueryMol::RWMol_T>(xqm.xqmol));
CHECK(SubstructMatch(*"COCC"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOCC"_smiles, xqm).empty());
auto txqm = createExtendedQueryMol(*mol);
ExtendedQueryMol xqm1(txqm);
ExtendedQueryMol xqm2(std::make_unique<RWMol>(*mol));
xqm2 = txqm;

for (const auto &xqm : {txqm, xqm1, xqm2}) {
CHECK(std::holds_alternative<ExtendedQueryMol::RWMol_T>(xqm.xqmol));
CHECK(SubstructMatch(*"COCC"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOCC"_smiles, xqm).empty());
}
}
SECTION("MolBundle") {
auto mol = "COCC |LN:1:1.3|"_smiles;
REQUIRE(mol);
auto xqm = createExtendedQueryMol(*mol);
CHECK(std::holds_alternative<ExtendedQueryMol::MolBundle_T>(xqm.xqmol));
CHECK(SubstructMatch(*"COCC"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOCC"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOOCC"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOOOCC"_smiles, xqm).empty());
auto txqm = createExtendedQueryMol(*mol);
ExtendedQueryMol xqm1(txqm);
ExtendedQueryMol xqm2(std::make_unique<RWMol>(*mol));
xqm2 = txqm;

for (const auto &xqm : {txqm, xqm1, xqm2}) {
CHECK(std::holds_alternative<ExtendedQueryMol::MolBundle_T>(xqm.xqmol));
CHECK(SubstructMatch(*"COCC"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOCC"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOOCC"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOOOCC"_smiles, xqm).empty());
}
}
SECTION("TautomerQuery") {
auto mol1 = "CC1OC(N)=N1"_smiles;
REQUIRE(mol1);
auto xqm = createExtendedQueryMol(*mol1);
CHECK(std::holds_alternative<ExtendedQueryMol::TautomerQuery_T>(xqm.xqmol));
CHECK(SubstructMatch(*"CCC1OC(N)=N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"CCC1OC(=N)N1"_smiles, *mol1).empty());
CHECK(SubstructMatch(*"CCC1OC(=N)N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"c1[nH]ncc1"_smiles, xqm).empty());
auto txqm = createExtendedQueryMol(*mol1);
ExtendedQueryMol xqm1(txqm);
ExtendedQueryMol xqm2(std::make_unique<RWMol>(*mol1));
xqm2 = txqm;

for (const auto &xqm : {txqm, xqm1, xqm2}) {
CHECK(
std::holds_alternative<ExtendedQueryMol::TautomerQuery_T>(xqm.xqmol));
CHECK(SubstructMatch(*"CCC1OC(N)=N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"CCC1OC(=N)N1"_smiles, *mol1).empty());
CHECK(SubstructMatch(*"CCC1OC(=N)N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"c1[nH]ncc1"_smiles, xqm).empty());
}
}
SECTION("TautomerBundle") {
auto mol1 = "COCC1OC(N)=N1 |LN:1:1.3|"_smiles;
REQUIRE(mol1);
auto xqm = createExtendedQueryMol(*mol1);
CHECK(
std::holds_alternative<ExtendedQueryMol::TautomerBundle_T>(xqm.xqmol));
CHECK(SubstructMatch(*"COCC1(F)OC(N)=N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOCC1(F)OC(=N)N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COCC1OC(N)=N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOOOCC1OC(=N)N1"_smiles, xqm).empty());
auto txqm = createExtendedQueryMol(*mol1);
ExtendedQueryMol xqm1(txqm);
ExtendedQueryMol xqm2(std::make_unique<RWMol>(*mol1));
xqm2 = txqm;

for (const auto &xqm : {txqm, xqm1, xqm2}) {
CHECK(std::holds_alternative<ExtendedQueryMol::TautomerBundle_T>(
xqm.xqmol));
CHECK(SubstructMatch(*"COCC1(F)OC(N)=N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOCC1(F)OC(=N)N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COCC1OC(N)=N1"_smiles, xqm).size() == 1);
CHECK(SubstructMatch(*"COOOOCC1OC(=N)N1"_smiles, xqm).empty());
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion Code/GraphMol/SubstructLibrary/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ endif()
rdkit_library(SubstructLibrary
SubstructLibrary.cpp
PatternFactory.cpp
LINK_LIBRARIES TautomerQuery MolStandardize Fingerprints SubstructMatch SmilesParse
LINK_LIBRARIES GeneralizedSubstruct TautomerQuery MolStandardize Fingerprints SubstructMatch SmilesParse
GraphMol Catalogs DataStructs RDGeneral ${RDKit_SERIALIZATION_LIBS})
target_compile_definitions(SubstructLibrary PRIVATE RDKIT_SUBSTRUCTLIBRARY_BUILD)

Expand Down
140 changes: 130 additions & 10 deletions Code/GraphMol/SubstructLibrary/SubstructLibrary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,13 @@
#endif

#include <GraphMol/Substruct/SubstructMatch.h>
#include <GraphMol/GeneralizedSubstruct/XQMol.h>
#include <boost/dynamic_bitset.hpp>

namespace RDKit {

using namespace GeneralizedSubstruct;

bool SubstructLibraryCanSerialize() {
#ifdef RDK_USE_BOOST_SERIALIZATION
return true;
Expand All @@ -55,9 +58,9 @@ struct Bits {
const FPHolderBase *fps;
SubstructMatchParameters params;

Bits(const FPHolderBase *fps, const ROMol &m,
Bits(const FPHolderBase *fingerprints, const ROMol &m,
const SubstructMatchParameters &ssparams)
: fps(fps), params(ssparams) {
: fps(fingerprints), params(ssparams) {
if (fps) {
queryBits = fps->makeFingerprint(m);
} else {
Expand Down Expand Up @@ -86,6 +89,64 @@ struct Bits {
}
}

// FIX complete this
Bits(const FPHolderBase *fingerprints, const ExtendedQueryMol &xqm,
const SubstructMatchParameters &ssparams)
: fps(fingerprints), params(ssparams) {
if (fps) {
const auto *tph = dynamic_cast<const TautomerPatternHolder *>(fps);
const auto *ph = dynamic_cast<const PatternHolder *>(fps);
if (std::holds_alternative<ExtendedQueryMol::RWMol_T>(xqm.xqmol)) {
queryBits = fps->makeFingerprint(
*std::get<ExtendedQueryMol::RWMol_T>(xqm.xqmol));
} else if (std::holds_alternative<ExtendedQueryMol::MolBundle_T>(
xqm.xqmol)) {
auto &bndl = std::get<ExtendedQueryMol::MolBundle_T>(xqm.xqmol);
auto tqb = new ExplicitBitVect(ph->getNumBits());
queryBits = tqb;
for (auto mol : bndl->getMols()) {
auto tfp = fps->makeFingerprint(*mol);
*tqb &= *tfp;
delete tfp;
}
} else if (std::holds_alternative<ExtendedQueryMol::TautomerQuery_T>(
xqm.xqmol)) {
auto &tq = std::get<ExtendedQueryMol::TautomerQuery_T>(xqm.xqmol);
if (!tph) {
BOOST_LOG(rdWarningLog) << "Pattern fingerprints for tautomersearch "
"aren't tautomer fingerprints, ignoring..."
<< std::endl;
queryBits = nullptr;
fps = nullptr;
} else {
queryBits = tq->patternFingerprintTemplate(tph->getNumBits());
}
} else if (std::holds_alternative<ExtendedQueryMol::TautomerBundle_T>(
xqm.xqmol)) {
if (!tph) {
BOOST_LOG(rdWarningLog) << "Pattern fingerprints for tautomersearch "
"aren't tautomer fingerprints, ignoring..."
<< std::endl;
queryBits = nullptr;
fps = nullptr;
} else {
auto &bndl = std::get<ExtendedQueryMol::TautomerBundle_T>(xqm.xqmol);
auto tqb = new ExplicitBitVect(ph->getNumBits());
queryBits = tqb;
for (auto &tq : *bndl) {
auto tfp = tq->patternFingerprintTemplate(tph->getNumBits());
*tqb &= *tfp;
delete tfp;
}
}
} else {
queryBits = nullptr;
}
} else {
queryBits = nullptr;
}
}

bool check(unsigned int idx) const {
if (fps) {
return fps->passesFilter(idx, *queryBits);
Expand Down Expand Up @@ -139,6 +200,36 @@ bool query_needs_rings(const TautomerQuery &in_query) {
return query_needs_rings(in_query.getTemplateMolecule());
}

// Ring requirement check for a generalized query.
//
// Dispatches on the alternative held by xqm.xqmol and forwards to the
// molecule overload of query_needs_rings():
//   - a single molecule is tested directly
//   - a tautomer query is tested via its template molecule
//   - a bundle (of molecules or tautomer queries) needs rings as soon as any
//     one of its members does
// Returns true if none of the known alternatives is held.
bool query_needs_rings(const ExtendedQueryMol &xqm) {
  const auto &contents = xqm.xqmol;

  if (const auto *single = std::get_if<ExtendedQueryMol::RWMol_T>(&contents)) {
    return query_needs_rings(**single);
  }

  if (const auto *tautomer =
          std::get_if<ExtendedQueryMol::TautomerQuery_T>(&contents)) {
    return query_needs_rings((*tautomer)->getTemplateMolecule());
  }

  if (const auto *bundle =
          std::get_if<ExtendedQueryMol::MolBundle_T>(&contents)) {
    for (const auto &member : (*bundle)->getMols()) {
      if (query_needs_rings(*member)) {
        return true;
      }
    }
    return false;
  }

  if (const auto *tautomers =
          std::get_if<ExtendedQueryMol::TautomerBundle_T>(&contents)) {
    for (const auto &member : **tautomers) {
      if (query_needs_rings(member->getTemplateMolecule())) {
        return true;
      }
    }
    return false;
  }

  // not expected to be reachable; be conservative and ask for ring info
  return true;
}

template <class Query>
void SubSearcher(const Query &in_query, const Bits &bits,
const MolHolderBase &mols, unsigned int start,
Expand All @@ -149,6 +240,8 @@ void SubSearcher(const Query &in_query, const Bits &bits,
std::vector<unsigned int> *idxs) {
PRECONDITION(searchOrder.empty() || searchOrder.size() >= end,
"bad searchOrder data");
// we copy the query so that we don't end up with lock contention for
// recursive matchers when using multiple threads
Query query(in_query);
for (unsigned int idx = start; idx < end; idx += numThreads) {
unsigned int sidx = idx;
Expand Down Expand Up @@ -176,8 +269,8 @@ void SubSearcher(const Query &in_query, const Bits &bits,
if (idxs) {
idxs->push_back(sidx);
if (maxResults > 0 && counter == maxResults) {
// if we reached maxResults, record the last idx we processed and bail
// out
// if we reached maxResults, record the last idx we processed and
// bail out
end = idx;
break;
}
Expand Down Expand Up @@ -258,18 +351,18 @@ int internalGetMatches(const Query &query, MolHolderBase &mols,
// If maxResults was close to the theoretical maximum, some threads
// might have even run out of molecules to screen without reaching
// maxResults so we need to make sure that all threads have screened as
// many molecules as the most productive thread if we want multi-threaded
// runs to yield the same results independently from the number of
// threads.
// many molecules as the most productive thread if we want
// multi-threaded runs to yield the same results independently from the
// number of threads.
thread_group_idx = 0;
for (auto &fut : thread_group) {
fut.get();
counter += counterVect[thread_group_idx++];
}
thread_group.clear();
// Find out the max number of molecules that was screened by the most
// productive thread and do the same in all other threads, unless the
// max number of molecules was reached
// Find out the max number of molecules that was screened by the
// most productive thread and do the same in all other threads, unless
// the max number of molecules was reached
maxEndIdx = *std::max_element(endIdxVect.begin(), endIdxVect.end());
for (thread_group_idx = 0; thread_group_idx < numThreads;
++thread_group_idx) {
Expand Down Expand Up @@ -395,6 +488,17 @@ std::vector<unsigned int> SubstructLibrary::getMatches(
return idxs;
}

std::vector<unsigned int> SubstructLibrary::getMatches(
const ExtendedQueryMol &query, unsigned int startIdx, unsigned int endIdx,
const SubstructMatchParameters &params, int numThreads,
int maxResults) const {
std::vector<unsigned int> idxs;
boost::dynamic_bitset<> found(mols->size());
internalGetMatches(query, *mols, fps, startIdx, endIdx, params, numThreads,
maxResults, found, searchOrder, &idxs);
return idxs;
}

unsigned int SubstructLibrary::countMatches(
const ROMol &query, unsigned int startIdx, unsigned int endIdx,
const SubstructMatchParameters &params, int numThreads) const {
Expand All @@ -417,6 +521,14 @@ unsigned int SubstructLibrary::countMatches(
numThreads, -1, searchOrder, nullptr);
}

unsigned int SubstructLibrary::countMatches(
const ExtendedQueryMol &query, unsigned int startIdx, unsigned int endIdx,
const SubstructMatchParameters &params, int numThreads) const {
boost::dynamic_bitset<> found(mols->size());
return internalGetMatches(query, *mols, fps, startIdx, endIdx, params,
numThreads, -1, found, searchOrder, nullptr);
}

bool SubstructLibrary::hasMatch(const ROMol &query, unsigned int startIdx,
unsigned int endIdx,
const SubstructMatchParameters &params,
Expand All @@ -442,6 +554,14 @@ bool SubstructLibrary::hasMatch(const MolBundle &query, unsigned int startIdx,
return getMatches(query, startIdx, endIdx, params, numThreads, maxResults)
.size() > 0;
}
// Generalized-substructure flavour of hasMatch().
//
// Asks getMatches() for at most one result and reports whether it returned
// anything.
//
// \param query       the generalized query
// \param startIdx    passed through to getMatches()
// \param endIdx      passed through to getMatches()
// \param params      substructure matching parameters
// \param numThreads  passed through to getMatches()
// \return true if getMatches() returned at least one index
bool SubstructLibrary::hasMatch(const ExtendedQueryMol &query,
                                unsigned int startIdx, unsigned int endIdx,
                                const SubstructMatchParameters &params,
                                int numThreads) const {
  // a single hit is enough to answer the question
  const int maxResults = 1;
  const auto firstHit =
      getMatches(query, startIdx, endIdx, params, numThreads, maxResults);
  return !firstHit.empty();
}

void SubstructLibrary::toStream(std::ostream &ss) const {
#ifndef RDK_USE_BOOST_SERIALIZATION
Expand Down
Loading

0 comments on commit 908e47c

Please sign in to comment.