Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add basic support for v3000 molfiles, including for large molecules #1765

Merged
merged 2 commits into from
Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 206 additions & 7 deletions avogadro/io/mdlformat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ using Avogadro::Core::Bond;
using Avogadro::Core::Elements;
using Avogadro::Core::lexicalCast;
using Avogadro::Core::Molecule;
using Avogadro::Core::split;
using Avogadro::Core::startsWith;
using Avogadro::Core::trimmed;

Expand Down Expand Up @@ -92,8 +93,10 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
return false;
}
string mdlVersion(trimmed(buffer.substr(33)));
if (mdlVersion != "V2000") {
appendError("Unsupported file format version encountered: " + mdlVersion);
if (mdlVersion == "V3000")
return readV3000(in, mol);
else if (mdlVersion != "V2000") {
appendError("Unsupported MDL version: " + mdlVersion);
return false;
}

Expand Down Expand Up @@ -241,8 +244,8 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
dataValue += buffer;
}
} else if (startsWith(buffer, "> <")) {
// This is a data header, read the name of the entry, and the value on the
// following lines.
// This is a data header, read the name of the entry, and the value on
// the following lines.
dataName = trimmed(buffer).substr(3, buffer.length() - 4);
inValue = true;
}
Expand All @@ -251,11 +254,207 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
return true;
}

bool MdlFormat::readV3000(std::istream& in, Core::Molecule& mol)
{
string buffer;
// we should have M V30 BEGIN CTAB
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN CTAB") {
appendError("Error parsing V3000 file, expected 'M V30 BEGIN CTAB'.");
return false;
}
// now we should get the counts line
// e.g. 'M V30 COUNTS 23694 24297 0 0 1'
getline(in, buffer);
// split by whitespace
std::vector<string> counts = split(trimmed(buffer), ' ');
if (counts.size() < 5) {
appendError("Error parsing V3000 counts line.");
return false;
}
bool ok(false);
int numAtoms(lexicalCast<int>(counts[3], ok));
if (!ok) {
appendError("Error parsing number of atoms.");
return false;
}
int numBonds(lexicalCast<int>(counts[4], ok));
if (!ok) {
appendError("Error parsing number of bonds.");
return false;
}

// Parse the atom block.
// 'M V30 BEGIN ATOM'
// 'M V30 1 N 171.646 251.874 224.877 0'
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN ATOM") {
appendError("Error parsing V3000 atom block.");
return false;
}
for (int i = 0; i < numAtoms; ++i) {
getline(in, buffer);
std::vector<string> atomData = split(trimmed(buffer), ' ');
if (atomData.size() < 7) {
appendError("Error parsing V3000 atom line.");
return false;
}

string element(trimmed(atomData[3]));
unsigned char atomicNum = Elements::atomicNumberFromSymbol(element);
Atom newAtom = mol.addAtom(atomicNum);

Vector3 pos;
pos.x() = lexicalCast<Real>(atomData[4], ok);
if (!ok) {
appendError("Failed to parse x coordinate: " + atomData[3]);
return false;
}
pos.y() = lexicalCast<Real>(atomData[5], ok);
if (!ok) {
appendError("Failed to parse y coordinate: " + atomData[4]);
return false;
}
pos.z() = lexicalCast<Real>(atomData[6], ok);
if (!ok) {
appendError("Failed to parse z coordinate: " + atomData[5]);
return false;
}
newAtom.setPosition3d(pos);
// check for formal charge in the atom block
// CHG=1 for example
if (atomData.size() > 8) {
string chargeData = atomData[8];
if (startsWith(chargeData, "CHG=")) {
int charge = lexicalCast<int>(chargeData.substr(4), ok);
if (!ok) {
appendError("Failed to parse atom charge: " + chargeData);
return false;
}
newAtom.setFormalCharge(charge);
}
}
} // end of atom block
getline(in, buffer);
// check for END ATOM
if (trimmed(buffer) != "M V30 END ATOM") {
appendError("Error parsing V3000 atom block.");
return false;
}

// bond block
// 'M V30 BEGIN BOND'
// 'M V30 1 1 1 2'
getline(in, buffer);
if (trimmed(buffer) != "M V30 BEGIN BOND") {
appendError("Error parsing V3000 bond block.");
return false;
}
for (int i = 0; i < numBonds; ++i) {
getline(in, buffer);
std::vector<string> bondData = split(trimmed(buffer), ' ');
if (bondData.size() < 5) {
appendError("Error parsing V3000 bond line.");
return false;
}
int order = lexicalCast<int>(bondData[3], ok);
if (!ok) {
appendError("Failed to parse bond order: " + bondData[3]);
return false;
}
int atom1 = lexicalCast<int>(bondData[4], ok) - 1;
if (!ok) {
appendError("Failed to parse bond atom1: " + bondData[4]);
return false;
}
int atom2 = lexicalCast<int>(bondData[5], ok) - 1;
if (!ok) {
appendError("Failed to parse bond atom2: " + bondData[5]);
return false;
}
mol.addBond(mol.atom(atom1), mol.atom(atom2),
static_cast<unsigned char>(order));
} // end of bond block

// look for M END
while (getline(in, buffer)) {
if (trimmed(buffer) == "M END")
break;
}
// read in any properties
while (getline(in, buffer)) {
if (startsWith(buffer, "> <")) {
string key = trimmed(buffer.substr(3, buffer.length() - 4));
string value;
while (getline(in, buffer)) {
if (trimmed(buffer) == "")
break;
value += buffer + "\n";
}
mol.setData(key, value);
}
}

return true;
}

bool MdlFormat::writeV3000(std::ostream& out, const Core::Molecule& mol)
{
// write the "fake" counts line
out << " 0 0 0 0 0 999 V3000\n";
out << "M V30 BEGIN CTAB\n";
out << "M V30 COUNTS " << mol.atomCount() << ' ' << mol.bondCount()
<< " 0 0 0\n";
// atom block
out << "M V30 BEGIN ATOM\n";
for (size_t i = 0; i < mol.atomCount(); ++i) {
Atom atom = mol.atom(i);
out << "M V30 " << i + 1 << ' ' << Elements::symbol(atom.atomicNumber())
<< ' ' << atom.position3d().x() << ' ' << atom.position3d().y() << ' '
<< atom.position3d().z() << " 0";
if (atom.formalCharge())
out << " CHG=" << atom.formalCharge();
out << "\n";
}
out << "M V30 END ATOM\n";
// bond block
out << "M V30 BEGIN BOND\n";
for (size_t i = 0; i < mol.bondCount(); ++i) {
Bond bond = mol.bond(i);
out << "M V30 " << i + 1 << ' ' << static_cast<int>(bond.order()) << ' '
<< (bond.atom1().index() + 1) << ' ' << (bond.atom2().index() + 1)
<< " \n";
}
out << "M V30 END BOND\n";
out << "M V30 END CTAB\n";
out << "M END\n";

// TODO: isotopes, radicals, etc.
if (m_writeProperties) {
const auto dataMap = mol.dataMap();
for (const auto& key : dataMap.names()) {
out << "> <" << key << ">\n";
out << dataMap.value(key).toString() << "\n";
out << "\n"; // empty line between data blocks
}
}

if (m_writeProperties || isMode(FileFormat::MultiMolecule))
out << "$$$$\n";

return true;
}

bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
{
// Header lines.
out << mol.data("name").toString() << "\n Avogadro\n\n";
// Counts line.
if (mol.atomCount() > 999 || mol.bondCount() > 999) {
// we need V3000 support for big molecules
return writeV3000(out, mol);
}

out << setw(3) << std::right << mol.atomCount() << setw(3) << mol.bondCount()
<< " 0 0 0 0 0 0 0 0999 V2000\n";
// Atom block.
Expand All @@ -269,7 +468,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
: ((charge <= 3) ? charge : 0);
out << setw(10) << std::right << std::fixed << setprecision(4)
<< atom.position3d().x() << setw(10) << atom.position3d().y()
<< setw(10) << atom.position3d().z() << " " << setw(3) << std::left
<< setw(10) << atom.position3d().z() << ' ' << setw(3) << std::left
<< Elements::symbol(atom.atomicNumber()) << " 0" << setw(3)
<< std::right << chargeField /* for compatibility */
<< " 0 0 0 0 0 0 0 0 0 0\n";
Expand All @@ -286,7 +485,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
for (auto& i : chargeList) {
Index atomIndex = i.first;
signed int atomCharge = i.second;
out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << " "
out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << ' '
<< setw(3) << atomCharge << "\n";
}
// TODO: isotopes, etc.
Expand All @@ -301,7 +500,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
}
}

if (isMode(FileFormat::MultiMolecule))
if (m_writeProperties || isMode(FileFormat::MultiMolecule))
out << "$$$$\n";

return true;
Expand Down
2 changes: 2 additions & 0 deletions avogadro/io/mdlformat.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ class AVOGADROIO_EXPORT MdlFormat : public FileFormat
std::vector<std::string> mimeTypes() const override;

bool read(std::istream& in, Core::Molecule& molecule) override;
bool readV3000(std::istream& in, Core::Molecule& molecule);
bool write(std::ostream& out, const Core::Molecule& molecule) override;
bool writeV3000(std::ostream& out, const Core::Molecule& molecule);

protected:
bool m_writeProperties = false;
Expand Down
1 change: 1 addition & 0 deletions avogadro/io/sdfformat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ std::vector<std::string> SdfFormat::fileExtensions() const
{
std::vector<std::string> ext;
ext.emplace_back("sdf");
ext.emplace_back("sd3");
return ext;
}

Expand Down
Loading