Skip to content

Commit a4831c0

Browse files
authored
Merge pull request #1765 from ghutchis/v3000-mol-support
Add basic support for v3000 molfiles, including for large molecules
2 parents 687e701 + e815396 commit a4831c0

File tree

3 files changed

+209
-7
lines changed

3 files changed

+209
-7
lines changed

avogadro/io/mdlformat.cpp

+206-7
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ using Avogadro::Core::Bond;
2323
using Avogadro::Core::Elements;
2424
using Avogadro::Core::lexicalCast;
2525
using Avogadro::Core::Molecule;
26+
using Avogadro::Core::split;
2627
using Avogadro::Core::startsWith;
2728
using Avogadro::Core::trimmed;
2829

@@ -92,8 +93,10 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
9293
return false;
9394
}
9495
string mdlVersion(trimmed(buffer.substr(33)));
95-
if (mdlVersion != "V2000") {
96-
appendError("Unsupported file format version encountered: " + mdlVersion);
96+
if (mdlVersion == "V3000")
97+
return readV3000(in, mol);
98+
else if (mdlVersion != "V2000") {
99+
appendError("Unsupported MDL version: " + mdlVersion);
97100
return false;
98101
}
99102

@@ -241,8 +244,8 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
241244
dataValue += buffer;
242245
}
243246
} else if (startsWith(buffer, "> <")) {
244-
// This is a data header, read the name of the entry, and the value on the
245-
// following lines.
247+
// This is a data header, read the name of the entry, and the value on
248+
// the following lines.
246249
dataName = trimmed(buffer).substr(3, buffer.length() - 4);
247250
inValue = true;
248251
}
@@ -251,11 +254,207 @@ bool MdlFormat::read(std::istream& in, Core::Molecule& mol)
251254
return true;
252255
}
253256

257+
bool MdlFormat::readV3000(std::istream& in, Core::Molecule& mol)
258+
{
259+
string buffer;
260+
// we should have M V30 BEGIN CTAB
261+
getline(in, buffer);
262+
if (trimmed(buffer) != "M V30 BEGIN CTAB") {
263+
appendError("Error parsing V3000 file, expected 'M V30 BEGIN CTAB'.");
264+
return false;
265+
}
266+
// now we should get the counts line
267+
// e.g. 'M V30 COUNTS 23694 24297 0 0 1'
268+
getline(in, buffer);
269+
// split by whitespace
270+
std::vector<string> counts = split(trimmed(buffer), ' ');
271+
if (counts.size() < 5) {
272+
appendError("Error parsing V3000 counts line.");
273+
return false;
274+
}
275+
bool ok(false);
276+
int numAtoms(lexicalCast<int>(counts[3], ok));
277+
if (!ok) {
278+
appendError("Error parsing number of atoms.");
279+
return false;
280+
}
281+
int numBonds(lexicalCast<int>(counts[4], ok));
282+
if (!ok) {
283+
appendError("Error parsing number of bonds.");
284+
return false;
285+
}
286+
287+
// Parse the atom block.
288+
// 'M V30 BEGIN ATOM'
289+
// 'M V30 1 N 171.646 251.874 224.877 0'
290+
getline(in, buffer);
291+
if (trimmed(buffer) != "M V30 BEGIN ATOM") {
292+
appendError("Error parsing V3000 atom block.");
293+
return false;
294+
}
295+
for (int i = 0; i < numAtoms; ++i) {
296+
getline(in, buffer);
297+
std::vector<string> atomData = split(trimmed(buffer), ' ');
298+
if (atomData.size() < 7) {
299+
appendError("Error parsing V3000 atom line.");
300+
return false;
301+
}
302+
303+
string element(trimmed(atomData[3]));
304+
unsigned char atomicNum = Elements::atomicNumberFromSymbol(element);
305+
Atom newAtom = mol.addAtom(atomicNum);
306+
307+
Vector3 pos;
308+
pos.x() = lexicalCast<Real>(atomData[4], ok);
309+
if (!ok) {
310+
appendError("Failed to parse x coordinate: " + atomData[3]);
311+
return false;
312+
}
313+
pos.y() = lexicalCast<Real>(atomData[5], ok);
314+
if (!ok) {
315+
appendError("Failed to parse y coordinate: " + atomData[4]);
316+
return false;
317+
}
318+
pos.z() = lexicalCast<Real>(atomData[6], ok);
319+
if (!ok) {
320+
appendError("Failed to parse z coordinate: " + atomData[5]);
321+
return false;
322+
}
323+
newAtom.setPosition3d(pos);
324+
// check for formal charge in the atom block
325+
// CHG=1 for example
326+
if (atomData.size() > 8) {
327+
string chargeData = atomData[8];
328+
if (startsWith(chargeData, "CHG=")) {
329+
int charge = lexicalCast<int>(chargeData.substr(4), ok);
330+
if (!ok) {
331+
appendError("Failed to parse atom charge: " + chargeData);
332+
return false;
333+
}
334+
newAtom.setFormalCharge(charge);
335+
}
336+
}
337+
} // end of atom block
338+
getline(in, buffer);
339+
// check for END ATOM
340+
if (trimmed(buffer) != "M V30 END ATOM") {
341+
appendError("Error parsing V3000 atom block.");
342+
return false;
343+
}
344+
345+
// bond block
346+
// 'M V30 BEGIN BOND'
347+
// 'M V30 1 1 1 2'
348+
getline(in, buffer);
349+
if (trimmed(buffer) != "M V30 BEGIN BOND") {
350+
appendError("Error parsing V3000 bond block.");
351+
return false;
352+
}
353+
for (int i = 0; i < numBonds; ++i) {
354+
getline(in, buffer);
355+
std::vector<string> bondData = split(trimmed(buffer), ' ');
356+
if (bondData.size() < 5) {
357+
appendError("Error parsing V3000 bond line.");
358+
return false;
359+
}
360+
int order = lexicalCast<int>(bondData[3], ok);
361+
if (!ok) {
362+
appendError("Failed to parse bond order: " + bondData[3]);
363+
return false;
364+
}
365+
int atom1 = lexicalCast<int>(bondData[4], ok) - 1;
366+
if (!ok) {
367+
appendError("Failed to parse bond atom1: " + bondData[4]);
368+
return false;
369+
}
370+
int atom2 = lexicalCast<int>(bondData[5], ok) - 1;
371+
if (!ok) {
372+
appendError("Failed to parse bond atom2: " + bondData[5]);
373+
return false;
374+
}
375+
mol.addBond(mol.atom(atom1), mol.atom(atom2),
376+
static_cast<unsigned char>(order));
377+
} // end of bond block
378+
379+
// look for M END
380+
while (getline(in, buffer)) {
381+
if (trimmed(buffer) == "M END")
382+
break;
383+
}
384+
// read in any properties
385+
while (getline(in, buffer)) {
386+
if (startsWith(buffer, "> <")) {
387+
string key = trimmed(buffer.substr(3, buffer.length() - 4));
388+
string value;
389+
while (getline(in, buffer)) {
390+
if (trimmed(buffer) == "")
391+
break;
392+
value += buffer + "\n";
393+
}
394+
mol.setData(key, value);
395+
}
396+
}
397+
398+
return true;
399+
}
400+
401+
// Write mol to out as a V3000 connection table.
//
// Emits the placeholder V2000-style counts line tagged V3000, then the CTAB
// with its COUNTS line, atom block (index, element symbol, x, y, z and an
// optional CHG= formal charge) and bond block (index, order, two 1-based atom
// indices), followed by 'M END'. When m_writeProperties is set, every entry
// of the molecule's data map is written as a "> <key>" block; a "$$$$"
// record separator is appended when properties are written or the format is
// in MultiMolecule mode. Always returns true.
//
// Coordinates are streamed with whatever floating-point formatting out
// currently carries; no width or precision is set here.
//
// NOTE(review): the literals below are reproduced exactly as they appear in
// this listing. The CTfile spec writes the prefix as 'M', two spaces, 'V30',
// and the counts line is column-formatted; confirm the whitespace against
// the upstream file.
bool MdlFormat::writeV3000(std::ostream& out, const Core::Molecule& mol)
{
  // write the "fake" counts line
  out << " 0 0 0 0 0 999 V3000\n";
  out << "M V30 BEGIN CTAB\n";
  out << "M V30 COUNTS " << mol.atomCount() << ' ' << mol.bondCount()
      << " 0 0 0\n";
  // atom block
  out << "M V30 BEGIN ATOM\n";
  for (size_t i = 0; i < mol.atomCount(); ++i) {
    Atom atom = mol.atom(i);
    out << "M V30 " << i + 1 << ' ' << Elements::symbol(atom.atomicNumber())
        << ' ' << atom.position3d().x() << ' ' << atom.position3d().y() << ' '
        << atom.position3d().z() << " 0";
    // Widen to int before streaming so the charge is always printed as a
    // number, even if formalCharge() returns a character-sized integer type
    // (the bond order below needs the same cast).
    const int charge = static_cast<int>(atom.formalCharge());
    if (charge != 0)
      out << " CHG=" << charge;
    out << "\n";
  }
  out << "M V30 END ATOM\n";
  // bond block
  out << "M V30 BEGIN BOND\n";
  for (size_t i = 0; i < mol.bondCount(); ++i) {
    Bond bond = mol.bond(i);
    out << "M V30 " << i + 1 << ' ' << static_cast<int>(bond.order()) << ' '
        << (bond.atom1().index() + 1) << ' ' << (bond.atom2().index() + 1)
        << " \n";
  }
  out << "M V30 END BOND\n";
  out << "M V30 END CTAB\n";
  out << "M END\n";

  // TODO: isotopes, radicals, etc.
  if (m_writeProperties) {
    // bind by reference to avoid copying the whole map
    const auto& dataMap = mol.dataMap();
    for (const auto& key : dataMap.names()) {
      out << "> <" << key << ">\n";
      out << dataMap.value(key).toString() << "\n";
      out << "\n"; // empty line between data blocks
    }
  }

  if (m_writeProperties || isMode(FileFormat::MultiMolecule))
    out << "$$$$\n";

  return true;
}
447+
254448
bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
255449
{
256450
// Header lines.
257451
out << mol.data("name").toString() << "\n Avogadro\n\n";
258452
// Counts line.
453+
if (mol.atomCount() > 999 || mol.bondCount() > 999) {
454+
// we need V3000 support for big molecules
455+
return writeV3000(out, mol);
456+
}
457+
259458
out << setw(3) << std::right << mol.atomCount() << setw(3) << mol.bondCount()
260459
<< " 0 0 0 0 0 0 0 0999 V2000\n";
261460
// Atom block.
@@ -269,7 +468,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
269468
: ((charge <= 3) ? charge : 0);
270469
out << setw(10) << std::right << std::fixed << setprecision(4)
271470
<< atom.position3d().x() << setw(10) << atom.position3d().y()
272-
<< setw(10) << atom.position3d().z() << " " << setw(3) << std::left
471+
<< setw(10) << atom.position3d().z() << ' ' << setw(3) << std::left
273472
<< Elements::symbol(atom.atomicNumber()) << " 0" << setw(3)
274473
<< std::right << chargeField /* for compatibility */
275474
<< " 0 0 0 0 0 0 0 0 0 0\n";
@@ -286,7 +485,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
286485
for (auto& i : chargeList) {
287486
Index atomIndex = i.first;
288487
signed int atomCharge = i.second;
289-
out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << " "
488+
out << "M CHG 1 " << setw(3) << std::right << atomIndex + 1 << ' '
290489
<< setw(3) << atomCharge << "\n";
291490
}
292491
// TODO: isotopes, etc.
@@ -301,7 +500,7 @@ bool MdlFormat::write(std::ostream& out, const Core::Molecule& mol)
301500
}
302501
}
303502

304-
if (isMode(FileFormat::MultiMolecule))
503+
if (m_writeProperties || isMode(FileFormat::MultiMolecule))
305504
out << "$$$$\n";
306505

307506
return true;

avogadro/io/mdlformat.h

+2
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ class AVOGADROIO_EXPORT MdlFormat : public FileFormat
5252
std::vector<std::string> mimeTypes() const override;
5353

5454
bool read(std::istream& in, Core::Molecule& molecule) override;
55+
bool readV3000(std::istream& in, Core::Molecule& molecule);
5556
bool write(std::ostream& out, const Core::Molecule& molecule) override;
57+
bool writeV3000(std::ostream& out, const Core::Molecule& molecule);
5658

5759
protected:
5860
bool m_writeProperties = false;

avogadro/io/sdfformat.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ std::vector<std::string> SdfFormat::fileExtensions() const
2828
{
2929
std::vector<std::string> ext;
3030
ext.emplace_back("sdf");
31+
ext.emplace_back("sd3");
3132
return ext;
3233
}
3334

0 commit comments

Comments
 (0)