Skip to content

Commit

Permalink
Add basic support for v3000 molfiles, including for large molecules
Browse files Browse the repository at this point in the history
More work to do, including support for RDKit extensions

Signed-off-by: Geoff Hutchison <[email protected]>
  • Loading branch information
ghutchis committed Nov 8, 2024
1 parent 5ffb84b commit 0c9b5ec
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 7 deletions.
213 changes: 206 additions & 7 deletions avogadro/io/mdlformat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ using Avogadro::Core::Bond;
using Avogadro::Core::Elements;
using Avogadro::Core::lexicalCast;
using Avogadro::Core::Molecule;
using Avogadro::Core::split;
using Avogadro::Core::startsWith;
using Avogadro::Core::trimmed;

Expand Down Expand Up @@ -92,8 +93,10 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
return false;
}
string mdlVersion(trimmed(buffer.substr(33)));
if (mdlVersion != "V2000") {
appendError("Unsupported file format version encountered: " + mdlVersion);
if (mdlVersion == "V3000")
return readV3000(in, mol);
else if (mdlVersion != "V2000") {
appendError("Unsupported MDL version: " + mdlVersion);
return false;
}

Expand Down Expand Up @@ -241,8 +244,8 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
dataValue += buffer;
}
} else if (startsWith(buffer, "> <")) {
// This is a data header, read the name of the entry, and the value on the
// following lines.
// This is a data header, read the name of the entry, and the value on
// the following lines.
dataName = trimmed(buffer).substr(3, buffer.length() - 4);
inValue = true;
}
Expand All @@ -251,11 +254,207 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
return true;
}

bool MdlFormat::readV3000(std::istream& in, Core::Molecule& mol)
{
string buffer;
// we should have M V30 BEGIN CTAB
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN CTAB") {
appendError("Error parsing V3000 file, expected 'M V30 BEGIN CTAB'.");
return false;
}
// now we should get the counts line
// e.g. 'M V30 COUNTS 23694 24297 0 0 1'
getline(in, buffer);
// split by whitespace
std::vector<string> counts = split(trimmed(buffer), ' ');
if (counts.size() < 5) {
appendError("Error parsing V3000 counts line.");
return false;
}
bool ok(false);
int numAtoms(lexicalCast<int>(counts[3], ok));
if (!ok) {
appendError("Error parsing number of atoms.");
return false;
}
int numBonds(lexicalCast<int>(counts[4], ok));
if (!ok) {
appendError("Error parsing number of bonds.");
return false;
}

// Parse the atom block.
// 'M V30 BEGIN ATOM'
// 'M V30 1 N 171.646 251.874 224.877 0'
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN ATOM") {
appendError("Error parsing V3000 atom block.");
return false;
}
for (int i = 0; i < numAtoms; ++i) {
getline(in, buffer);
std::vector<string> atomData = split(trimmed(buffer), ' ');
if (atomData.size() < 7) {
appendError("Error parsing V3000 atom line.");
return false;
}

string element(trimmed(atomData[3]));
unsigned char atomicNum = Elements::atomicNumberFromSymbol(element);
Atom newAtom = mol.addAtom(atomicNum);

Vector3 pos;
pos.x() = lexicalCast<Real>(atomData[4], ok);
if (!ok) {
appendError("Failed to parse x coordinate: " + atomData[3]);
return false;
}
pos.y() = lexicalCast<Real>(atomData[5], ok);
if (!ok) {
appendError("Failed to parse y coordinate: " + atomData[4]);
return false;
}
pos.z() = lexicalCast<Real>(atomData[6], ok);
if (!ok) {
appendError("Failed to parse z coordinate: " + atomData[5]);
return false;
}
newAtom.setPosition3d(pos);
// check for formal charge in the atom block
// CHG=1 for example
if (atomData.size() > 7) {
string chargeData = atomData[7];
if (startsWith(chargeData, "CHG=")) {
int charge = lexicalCast<int>(chargeData.substr(4), ok);
if (!ok) {
appendError("Failed to parse atom charge: " + chargeData);
return false;
}
mol.setFormalCharge(newAtom.index(), charge);
}
}
} // end of atom block
getline(in, buffer);
// check for END ATOM
if (trimmed(buffer) != "M V30 END ATOM") {
appendError("Error parsing V3000 atom block.");
return false;
}

// bond block
// 'M V30 BEGIN BOND'
// 'M V30 1 1 1 2'
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN BOND") {
appendError("Error parsing V3000 bond block.");
return false;
}
for (int i = 0; i < numBonds; ++i) {
getline(in, buffer);
std::vector<string> bondData = split(trimmed(buffer), ' ');
if (bondData.size() < 5) {
appendError("Error parsing V3000 bond line.");
return false;
}
int order = lexicalCast<int>(bondData[3], ok);
if (!ok) {
appendError("Failed to parse bond order: " + bondData[3]);
return false;
}
int atom1 = lexicalCast<int>(bondData[4], ok) - 1;
if (!ok) {
appendError("Failed to parse bond atom1: " + bondData[4]);
return false;
}
int atom2 = lexicalCast<int>(bondData[5], ok) - 1;
if (!ok) {
appendError("Failed to parse bond atom2: " + bondData[5]);
return false;
}
mol.addBond(mol.atom(atom1), mol.atom(atom2),
static_cast<unsigned char>(order));
} // end of bond block

// look for M END
while (getline(in, buffer)) {
if (trimmed(buffer) == "M END")
break;
}
// read in any properties
while (getline(in, buffer)) {
if (startsWith(buffer, "> <")) {
string key = trimmed(buffer.substr(3, buffer.length() - 4));
string value;
while (getline(in, buffer)) {
if (trimmed(buffer) == "")
break;
value += buffer + "\n";
}
mol.setData(key, value);
}
}

return true;
}

bool MdlFormat::writeV3000(std::ostream& out, const Core::Molecule& mol)
{
// write the "fake" counts line
out << " 0 0 0 0 0 999 V3000\n";
out << "M V30 BEGIN CTAB\n";
out << "M V30 COUNTS " << mol.atomCount() << ' ' << mol.bondCount()
<< " 0 0 0\n";
// atom block
out << "M V30 BEGIN ATOM\n";
for (size_t i = 0; i < mol.atomCount(); ++i) {
Atom atom = mol.atom(i);
out << "M V30 " << i + 1 << ' ' << Elements::symbol(atom.atomicNumber())
<< ' ' << atom.position3d().x() << ' ' << atom.position3d().y() << ' '
<< atom.position3d().z() << " 0";
if (atom.formalCharge())
out << " CHG=" << atom.formalCharge();
out << "\n";
}
out << "M V30 END ATOM\n";
// bond block
out << "M V30 BEGIN BOND\n";
for (size_t i = 0; i < mol.bondCount(); ++i) {
Bond bond = mol.bond(i);
out << "M V30 " << i + 1 << ' ' << static_cast<int>(bond.order()) << ' '
<< (bond.atom1().index() + 1) << ' ' << (bond.atom2().index() + 1)
<< " \n";
}
out << "M V30 END BOND\n";
out << "M V30 END CTAB\n";
out << "M END\n";

// TODO: isotopes, radicals, etc.
if (m_writeProperties) {
const auto dataMap = mol.dataMap();
for (const auto& key : dataMap.names()) {
out << "> <" << key << ">\n";
out << dataMap.value(key).toString() << "\n";
out << "\n"; // empty line between data blocks
}
}

if (m_writeProperties || isMode(FileFormat::MultiMolecule))
out << "$$$$\n";

return true;
}

bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
{
// Header lines.
out << mol.data("name").toString() << "\n Avogadro\n\n";
// Counts line.
if (mol.atomCount() > 999 || mol.bondCount() > 999) {
// we need V3000 support for big molecules
return writeV3000(out, mol);
}

out << setw(3) << std::right << mol.atomCount() << setw(3) << mol.bondCount()
<< " 0 0 0 0 0 0 0 0999 V2000\n";
// Atom block.
Expand All @@ -269,7 +468,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
: ((charge <= 3) ? charge : 0);
out << setw(10) << std::right << std::fixed << setprecision(4)
<< atom.position3d().x() << setw(10) << atom.position3d().y()
<< setw(10) << atom.position3d().z() << " " << setw(3) << std::left
<< setw(10) << atom.position3d().z() << ' ' << setw(3) << std::left
<< Elements::symbol(atom.atomicNumber()) << " 0" << setw(3)
<< std::right << chargeField /* for compatibility */
<< " 0 0 0 0 0 0 0 0 0 0\n";
Expand All @@ -286,7 +485,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
for (auto& i : chargeList) {
Index atomIndex = i.first;
signed int atomCharge = i.second;
out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << " "
out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << ' '
<< setw(3) << atomCharge << "\n";
}
// TODO: isotopes, etc.
Expand All @@ -301,7 +500,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
}
}

if (isMode(FileFormat::MultiMolecule))
if (m_writeProperties || isMode(FileFormat::MultiMolecule))
out << "$$$$\n";

return true;
Expand Down
2 changes: 2 additions & 0 deletions avogadro/io/mdlformat.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ class AVOGADROIO_EXPORT MdlFormat : public FileFormat
std::vector<std::string> mimeTypes() const override;

bool read(std::istream& in, Core::Molecule& molecule) override;
bool readV3000(std::istream& in, Core::Molecule& molecule);
bool write(std::ostream& out, const Core::Molecule& molecule) override;
bool writeV3000(std::ostream& out, const Core::Molecule& molecule);

protected:
bool m_writeProperties = false;
Expand Down
1 change: 1 addition & 0 deletions avogadro/io/sdfformat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ std::vector<std::string> SdfFormat::fileExtensions() const
{
std::vector<std::string> ext;
ext.emplace_back("sdf");
ext.emplace_back("sd3");
return ext;
}

Expand Down

0 comments on commit 0c9b5ec

Please sign in to comment.