From 0c9b5ec55f8f411438444d9f3b6ffecfa6c9e23f Mon Sep 17 00:00:00 2001 From: Geoff Hutchison Date: Thu, 7 Nov 2024 21:05:37 -0500 Subject: [PATCH] Add basic support for v3000 molfiles, including for large molecules More work to do, including support for RDKit extensions Signed-off-by: Geoff Hutchison --- avogadro/io/mdlformat.cpp | 213 ++++++++++++++++++++++++++++++++++++-- avogadro/io/mdlformat.h | 2 + avogadro/io/sdfformat.cpp | 1 + 3 files changed, 209 insertions(+), 7 deletions(-) diff --git a/avogadro/io/mdlformat.cpp b/avogadro/io/mdlformat.cpp index fe41c111d9..66fc36924f 100644 --- a/avogadro/io/mdlformat.cpp +++ b/avogadro/io/mdlformat.cpp @@ -23,6 +23,7 @@ using Avogadro::Core::Bond; using Avogadro::Core::Elements; using Avogadro::Core::lexicalCast; using Avogadro::Core::Molecule; +using Avogadro::Core::split; using Avogadro::Core::startsWith; using Avogadro::Core::trimmed; @@ -92,8 +93,10 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol) return false; } string mdlVersion(trimmed(buffer.substr(33))); - if (mdlVersion != "V2000") { - appendError("Unsupported file format version encountered: " + mdlVersion); + if (mdlVersion == "V3000") + return readV3000(in, mol); + else if (mdlVersion != "V2000") { + appendError("Unsupported MDL version: " + mdlVersion); return false; } @@ -241,8 +244,8 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol) dataValue += buffer; } } else if (startsWith(buffer, "> <")) { - // This is a data header, read the name of the entry, and the value on the - // following lines. + // This is a data header, read the name of the entry, and the value on + // the following lines. dataName = trimmed(buffer).substr(3, buffer.length() - 4); inValue = true; } @@ -251,11 +254,207 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol) return true; } +bool MdlFormat::readV3000(std::istream& in, Core::Molecule& mol) +{ + string buffer; + // we should have M V30 BEGIN CTAB + getline(in, buffer); + if (trimmed(buffer) != "M V30 BEGIN CTAB") { + appendError("Error parsing V3000 file, expected 'M V30 BEGIN CTAB'."); + return false; + } + // now we should get the counts line + // e.g. 'M V30 COUNTS 23694 24297 0 0 1' + getline(in, buffer); + // split by whitespace + std::vector counts = split(trimmed(buffer), ' '); + if (counts.size() < 5) { + appendError("Error parsing V3000 counts line."); + return false; + } + bool ok(false); + int numAtoms(lexicalCast(counts[3], ok)); + if (!ok) { + appendError("Error parsing number of atoms."); + return false; + } + int numBonds(lexicalCast(counts[4], ok)); + if (!ok) { + appendError("Error parsing number of bonds."); + return false; + } + + // Parse the atom block. + // 'M V30 BEGIN ATOM' + // 'M V30 1 N 171.646 251.874 224.877 0' + getline(in, buffer); + if (trimmed(buffer) != "M V30 BEGIN ATOM") { + appendError("Error parsing V3000 atom block."); + return false; + } + for (int i = 0; i < numAtoms; ++i) { + getline(in, buffer); + std::vector atomData = split(trimmed(buffer), ' '); + if (atomData.size() < 7) { + appendError("Error parsing V3000 atom line."); + return false; + } + + string element(trimmed(atomData[3])); + unsigned char atomicNum = Elements::atomicNumberFromSymbol(element); + Atom newAtom = mol.addAtom(atomicNum); + + Vector3 pos; + pos.x() = lexicalCast(atomData[4], ok); + if (!ok) { + appendError("Failed to parse x coordinate: " + atomData[3]); + return false; + } + pos.y() = lexicalCast(atomData[5], ok); + if (!ok) { + appendError("Failed to parse y coordinate: " + atomData[4]); + return false; + } + pos.z() = lexicalCast(atomData[6], ok); + if (!ok) { + appendError("Failed to parse z coordinate: " + atomData[5]); + return false; + } + newAtom.setPosition3d(pos); + // check for formal charge in the atom block + // CHG=1 for example + if (atomData.size() > 7) { + string chargeData = atomData[7]; + if (startsWith(chargeData, "CHG=")) { + int charge = lexicalCast(chargeData.substr(4), ok); + if (!ok) { + appendError("Failed to parse atom charge: " + chargeData); + return false; + } + mol.setFormalCharge(newAtom.index(), charge); + } + } + } // end of atom block + getline(in, buffer); + // check for END ATOM + if (trimmed(buffer) != "M V30 END ATOM") { + appendError("Error parsing V3000 atom block."); + return false; + } + + // bond block + // 'M V30 BEGIN BOND' + // 'M V30 1 1 1 2' + getline(in, buffer); + if (trimmed(buffer) != "M V30 BEGIN BOND") { + appendError("Error parsing V3000 bond block."); + return false; + } + for (int i = 0; i < numBonds; ++i) { + getline(in, buffer); + std::vector bondData = split(trimmed(buffer), ' '); + if (bondData.size() < 5) { + appendError("Error parsing V3000 bond line."); + return false; + } + int order = lexicalCast(bondData[3], ok); + if (!ok) { + appendError("Failed to parse bond order: " + bondData[3]); + return false; + } + int atom1 = lexicalCast(bondData[4], ok) - 1; + if (!ok) { + appendError("Failed to parse bond atom1: " + bondData[4]); + return false; + } + int atom2 = lexicalCast(bondData[5], ok) - 1; + if (!ok) { + appendError("Failed to parse bond atom2: " + bondData[5]); + return false; + } + mol.addBond(mol.atom(atom1), mol.atom(atom2), + static_cast(order)); + } // end of bond block + + // look for M END + while (getline(in, buffer)) { + if (trimmed(buffer) == "M END") + break; + } + // read in any properties + while (getline(in, buffer)) { + if (startsWith(buffer, "> <")) { + string key = trimmed(buffer.substr(3, buffer.length() - 4)); + string value; + while (getline(in, buffer)) { + if (trimmed(buffer) == "") + break; + value += buffer + "\n"; + } + mol.setData(key, value); + } + } + + return true; +} + +bool MdlFormat::writeV3000(std::ostream& out, const Core::Molecule& mol) +{ + // write the "fake" counts line + out << " 0 0 0 0 0 999 V3000\n"; + out << "M V30 BEGIN CTAB\n"; + out << "M V30 COUNTS " << mol.atomCount() << ' ' << mol.bondCount() + << " 0 0 0\n"; + // atom block + out << "M V30 BEGIN ATOM\n"; + for (size_t i = 0; i < mol.atomCount(); ++i) { + Atom atom = mol.atom(i); + out << "M V30 " << i + 1 << ' ' << Elements::symbol(atom.atomicNumber()) + << ' ' << atom.position3d().x() << ' ' << atom.position3d().y() << ' ' + << atom.position3d().z() << " 0"; + if (atom.formalCharge()) + out << " CHG=" << atom.formalCharge(); + out << "\n"; + } + out << "M V30 END ATOM\n"; + // bond block + out << "M V30 BEGIN BOND\n"; + for (size_t i = 0; i < mol.bondCount(); ++i) { + Bond bond = mol.bond(i); + out << "M V30 " << i + 1 << ' ' << static_cast(bond.order()) << ' ' + << (bond.atom1().index() + 1) << ' ' << (bond.atom2().index() + 1) + << " \n"; + } + out << "M V30 END BOND\n"; + out << "M V30 END CTAB\n"; + out << "M END\n"; + + // TODO: isotopes, radicals, etc. + if (m_writeProperties) { + const auto dataMap = mol.dataMap(); + for (const auto& key : dataMap.names()) { + out << "> <" << key << ">\n"; + out << dataMap.value(key).toString() << "\n"; + out << "\n"; // empty line between data blocks + } + } + + if (m_writeProperties || isMode(FileFormat::MultiMolecule)) + out << "$$$$\n"; + + return true; +} + bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol) { // Header lines. out << mol.data("name").toString() << "\n Avogadro\n\n"; // Counts line. + if (mol.atomCount() > 999 || mol.bondCount() > 999) { + // we need V3000 support for big molecules + return writeV3000(out, mol); + } + out << setw(3) << std::right << mol.atomCount() << setw(3) << mol.bondCount() << " 0 0 0 0 0 0 0 0999 V2000\n"; // Atom block. @@ -269,7 +468,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol) : ((charge <= 3) ? charge : 0); out << setw(10) << std::right << std::fixed << setprecision(4) << atom.position3d().x() << setw(10) << atom.position3d().y() - << setw(10) << atom.position3d().z() << " " << setw(3) << std::left + << setw(10) << atom.position3d().z() << ' ' << setw(3) << std::left << Elements::symbol(atom.atomicNumber()) << " 0" << setw(3) << std::right << chargeField /* for compatibility */ << " 0 0 0 0 0 0 0 0 0 0\n"; @@ -286,7 +485,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol) for (auto& i : chargeList) { Index atomIndex = i.first; signed int atomCharge = i.second; - out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << " " + out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << ' ' << setw(3) << atomCharge << "\n"; } // TODO: isotopes, etc. @@ -301,7 +500,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol) } } - if (isMode(FileFormat::MultiMolecule)) + if (m_writeProperties || isMode(FileFormat::MultiMolecule)) out << "$$$$\n"; return true; diff --git a/avogadro/io/mdlformat.h b/avogadro/io/mdlformat.h index c887221f61..1ae68d2173 100644 --- a/avogadro/io/mdlformat.h +++ b/avogadro/io/mdlformat.h @@ -52,7 +52,9 @@ class AVOGADROIO_EXPORT MdlFormat : public FileFormat std::vector mimeTypes() const override; bool read(std::istream& in, Core::Molecule& molecule) override; + bool readV3000(std::istream& in, Core::Molecule& molecule); bool write(std::ostream& out, const Core::Molecule& molecule) override; + bool writeV3000(std::ostream& out, const Core::Molecule& molecule); protected: bool m_writeProperties = false; diff --git a/avogadro/io/sdfformat.cpp b/avogadro/io/sdfformat.cpp index 4238a13d9c..ce8761aa91 100644 --- a/avogadro/io/sdfformat.cpp +++ b/avogadro/io/sdfformat.cpp @@ -28,6 +28,7 @@ std::vector SdfFormat::fileExtensions() const { std::vector ext; ext.emplace_back("sdf"); + ext.emplace_back("sd3"); return ext; }