From 01b698a18fce88f3499c647ac00427443984fb6e Mon Sep 17 00:00:00 2001 From: Ryan Kingsbury Date: Sat, 20 Jul 2024 10:34:11 -0400 Subject: [PATCH 1/4] Utils: parse super and subscripts in ion formulae --- CHANGELOG.md | 15 +++++++++++++++ docs/chemistry.md | 8 +++++++- src/pyEQL/utils.py | 39 ++++++++++++++++++++++++++++++++++++++- tests/test_utils.py | 5 +++++ 4 files changed, 65 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca752a40..ecd9cdf4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,21 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.0.3] - 2024-07-20 + +### Fixed + +- `standardize_formula`: Fix incorrect display of additional formulas, including methane, which + was shown as "H4C(aq)", other tri-anions (N3-, P3-), and a variety of haloacetic acids. For + example, trichloroacetic acid was previously shown as `'C2Cl3O2[-1]'` but will now display + as `'CCl3COO[-1]'`. + +### Added + +- `standardize_formula`: `pyEQL` can now parse ion formulas that contain unicode superscript + or subscript characters, which makes input even more flexible. For example, `"PO₄³⁻"` and `"Co²⁺"` + will now standardize correctly to `"PO4[-3]"` and `"Co[+2]"`, respectively. 
+ ## [1.0.2] - 2024-07-09 ### Fixed diff --git a/docs/chemistry.md b/docs/chemistry.md index f51d9146..c209227d 100644 --- a/docs/chemistry.md +++ b/docs/chemistry.md @@ -23,7 +23,8 @@ Here are some examples: | Sodium Sulfate | "Na2(SO4)" or "Na2SO4" | "Na(SO4)(aq)" | | Sodium Ion | "Na+", "Na+1", "Na1+", or "Na[+]" | "Na[+1]" | | Magnesium Ion | "Mg+2", "Mg++", or "Mg[++]" | "Mg[+2]" | -| Methanol | "CH3OH", "CH4O" | "'CH3OH(aq)'" | +| Methanol | "CH3OH", "CH4O" | "CH3OH(aq)" | +| Phosphate Ion | "PO4-3", "PO₄³⁻" | "PO4[-3]" | Specifically, `standardize_formula` uses `Ion.from_formula().reduced_formla` (shown in the right hand column of the table) to identify solutes. Notice that for charged species, the charges are always placed inside square brackets @@ -33,6 +34,11 @@ by `(aq)` to disambiguate them from solids. ```{important} **When writing multivalent ion formulas, it is strongly recommended that you put the charge number AFTER the + or - sign** (e.g., type "Mg+2" NOT "Mg2+"). The latter formula is ambiguous - it could mean $Mg_2^+$ or $Mg^{+2}$ and it will be processed incorrectly into `Mg[+0.5]` + +There is **one exception** to the rule above. If you really want to list the charge number +first, you can use unicode superscript characters (e.g., "Co²⁺"), and `pyEQL` will understand +these regardless of the order of the `+` and the `2`. So you can write "Co²⁺" and it will be +correctly standardized to `Co[+2]`. ``` (manual-testing)= diff --git a/src/pyEQL/utils.py b/src/pyEQL/utils.py index 69b5f555..c8d4cd22 100644 --- a/src/pyEQL/utils.py +++ b/src/pyEQL/utils.py @@ -60,6 +60,18 @@ def standardize_formula(formula: str): be enclosed in square brackets to remove any ambiguity in the meaning of the formula. For example, 'Na+', 'Na+1', and 'Na[+]' will all standardize to "Na[+1]" """ + # fix permuted sign and charge number (e.g. 
Co2+) + for str, rep in zip(["²⁺", "³⁺", "⁴⁺", "²⁻", "³⁻", "⁴⁻"], ["+2", "+3", "+4", "-2", "-3", "-4"]): + formula = formula.replace(str, rep) + + # replace superscripts with non superscripts + for char, rep in zip("⁻⁺⁰¹²³⁴⁵⁶⁷⁸⁹", "-+0123456789"): + formula = formula.replace(char, rep) + + # replace subscripts with non subscripts + for char, rep in zip("₀₁₂₃₄₅₆₇₈₉", "0123456789"): + formula = formula.replace(char, rep) + sform = Ion.from_formula(formula).reduced_formula # TODO - manual formula adjustments. May be implemented upstream in pymatgen in the future @@ -81,15 +93,40 @@ def standardize_formula(formula: str): # thiocyanate elif sform == "CSN[-1]": sform = "SCN[-1]" - # triiodide + # triiodide, nitride, an phosphide elif sform == "I[-0.33333333]": sform = "I3[-1]" + elif sform == "N[-0.33333333]": + sform = "N3[-1]" + elif sform == "P[-0.33333333]": + sform = "P3[-1]" # formate elif sform == "HCOO[-1]": sform = "HCO2[-1]" # oxalate elif sform == "CO2[-1]": sform = "C2O4[-2]" + # haloacetic acids of F, Cl, Br, I + elif sform == "C2Cl3O2[-1]": + sform = "CCl3COO[-1]" + elif sform == "C2F3O2[-1]": + sform = "CF3COO[-1]" + elif sform == "C2I3O2[-1]": + sform = "CI3COO[-1]" + elif sform == "C2Br3O2[-1]": + sform = "CBr3COO[-1]" + + # F+Cl + elif sform == "C2Cl2O2F[-1]": + sform = "CFCl2COO[-1]" + elif sform == "C2Cl(OF)2[-1]": + sform = "CF2ClCOO[-1]" + + # Cl+Br + elif sform == "C2Cl2O2Br[-1]": + sform = "CFCl2COO[-1]" + elif sform == "C2Cl(OBr)2[-1]": + sform = "CF2ClCOO[-1]" # TODO - consider adding recognition of special formulas like MeOH for methanol or Cit for citrate return sform diff --git a/tests/test_utils.py b/tests/test_utils.py index 58c3295b..d4f163d4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -26,12 +26,17 @@ def test_standardize_formula(): assert standardize_formula("H2PO4-") == "H2PO4[-1]" assert standardize_formula("SCN-") == "SCN[-1]" assert standardize_formula("I3-") == "I3[-1]" + assert standardize_formula("N3-") == 
"N3[-1]" + assert standardize_formula("P3-") == "P3[-1]" assert standardize_formula("HCOO-") == "HCO2[-1]" assert standardize_formula("CO2-1") == "C2O4[-2]" assert standardize_formula("C2O4--") == "C2O4[-2]" assert standardize_formula("H3PO4") == "H3PO4(aq)" assert standardize_formula("H2SO4") == "H2SO4(aq)" assert standardize_formula("HClO4") == "HClO4(aq)" + # superscripts, subscripts, and permuted sign/charge number + assert standardize_formula("PO₄³⁻") == "PO4[-3]" + assert standardize_formula("Co²⁺") == "Co[+2]" def test_formula_dict(): From 67e27e322e597f9149a187e149f61edc428ad64e Mon Sep 17 00:00:00 2001 From: Ryan Kingsbury Date: Sat, 20 Jul 2024 10:54:40 -0400 Subject: [PATCH 2/4] additional tests --- src/pyEQL/utils.py | 16 +++++++++++----- tests/test_utils.py | 8 ++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/pyEQL/utils.py b/src/pyEQL/utils.py index c8d4cd22..f0263ca8 100644 --- a/src/pyEQL/utils.py +++ b/src/pyEQL/utils.py @@ -109,24 +109,30 @@ def standardize_formula(formula: str): # haloacetic acids of F, Cl, Br, I elif sform == "C2Cl3O2[-1]": sform = "CCl3COO[-1]" - elif sform == "C2F3O2[-1]": + elif sform == "C2O2F3[-1]": sform = "CF3COO[-1]" elif sform == "C2I3O2[-1]": sform = "CI3COO[-1]" elif sform == "C2Br3O2[-1]": sform = "CBr3COO[-1]" - # F+Cl + # Cl+F elif sform == "C2Cl2O2F[-1]": sform = "CFCl2COO[-1]" elif sform == "C2Cl(OF)2[-1]": sform = "CF2ClCOO[-1]" # Cl+Br - elif sform == "C2Cl2O2Br[-1]": - sform = "CFCl2COO[-1]" + elif sform == "C2Br(ClO)2[-1]": + sform = "CCl2BrCOO[-1]" elif sform == "C2Cl(OBr)2[-1]": - sform = "CF2ClCOO[-1]" + sform = "CClBr2ClCOO[-1]" + + # Cl+I + elif sform == "C2I(ClO)2[-1]": + sform = "CCl2ICOO[-1]" + elif sform == "C2Cl(OI)2[-1]": + sform = "CClI2COO[-1]" # TODO - consider adding recognition of special formulas like MeOH for methanol or Cit for citrate return sform diff --git a/tests/test_utils.py b/tests/test_utils.py index d4f163d4..3456fdf7 100644 --- a/tests/test_utils.py +++ 
b/tests/test_utils.py @@ -37,6 +37,14 @@ def test_standardize_formula(): # superscripts, subscripts, and permuted sign/charge number assert standardize_formula("PO₄³⁻") == "PO4[-3]" assert standardize_formula("Co²⁺") == "Co[+2]" + # haloacetic acids + assert standardize_formula("CCl3COO-") == "CCl3COO[-1]" + assert standardize_formula("CF3COO-") == "CF3COO[-1]" + assert standardize_formula("CI3COO-") == "CI3COO[-1]" + assert standardize_formula("CBr3COO-") == "CBr3COO[-1]" + assert standardize_formula("CCl2ICOO-") == "CCl2ICOO[-1]" + assert standardize_formula("CCl2BrCOO-") == "CCl2BrCOO[-1]" + assert standardize_formula("CCl2FCOO-") == "CFCl2COO[-1]" def test_formula_dict(): From e93b398452dc7af0d2cd3b6671ec4a397b94d85d Mon Sep 17 00:00:00 2001 From: Ryan Kingsbury Date: Sat, 20 Jul 2024 11:13:51 -0400 Subject: [PATCH 3/4] complete HAA handling --- src/pyEQL/utils.py | 12 ++++++------ tests/test_utils.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/pyEQL/utils.py b/src/pyEQL/utils.py index f0263ca8..e0135287 100644 --- a/src/pyEQL/utils.py +++ b/src/pyEQL/utils.py @@ -124,15 +124,15 @@ def standardize_formula(formula: str): # Cl+Br elif sform == "C2Br(ClO)2[-1]": - sform = "CCl2BrCOO[-1]" - elif sform == "C2Cl(OBr)2[-1]": - sform = "CClBr2ClCOO[-1]" + sform = "CBrCl2COO[-1]" + elif sform == "C2Br2ClO2[-1]": + sform = "CBr2ClCOO[-1]" # Cl+I elif sform == "C2I(ClO)2[-1]": - sform = "CCl2ICOO[-1]" - elif sform == "C2Cl(OI)2[-1]": - sform = "CClI2COO[-1]" + sform = "CICl2COO[-1]" + elif sform == "C2I2ClO2[-1]": + sform = "CI2ClCOO[-1]" # TODO - consider adding recognition of special formulas like MeOH for methanol or Cit for citrate return sform diff --git a/tests/test_utils.py b/tests/test_utils.py index 3456fdf7..7e991315 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -42,9 +42,15 @@ def test_standardize_formula(): assert standardize_formula("CF3COO-") == "CF3COO[-1]" assert standardize_formula("CI3COO-") == 
"CI3COO[-1]" assert standardize_formula("CBr3COO-") == "CBr3COO[-1]" - assert standardize_formula("CCl2ICOO-") == "CCl2ICOO[-1]" - assert standardize_formula("CCl2BrCOO-") == "CCl2BrCOO[-1]" + # Cl+F assert standardize_formula("CCl2FCOO-") == "CFCl2COO[-1]" + assert standardize_formula("CClF2COO-") == "CF2ClCOO[-1]" + # Cl+I + assert standardize_formula("CCl2ICOO-") == "CICl2COO[-1]" + assert standardize_formula("CClI2COO-") == "CI2ClCOO[-1]" + # Cl+Br + assert standardize_formula("CBrCl2COO-") == "CBrCl2COO[-1]" + assert standardize_formula("CBr2ClCOO-") == "CBr2ClCOO[-1]" def test_formula_dict(): From 4f9b41f01e6068857efbc21aea2749fb918e2dbe Mon Sep 17 00:00:00 2001 From: Ryan Kingsbury Date: Sat, 20 Jul 2024 11:16:53 -0400 Subject: [PATCH 4/4] add triflate anion --- src/pyEQL/utils.py | 3 +++ tests/test_utils.py | 1 + 2 files changed, 4 insertions(+) diff --git a/src/pyEQL/utils.py b/src/pyEQL/utils.py index e0135287..7b1b9737 100644 --- a/src/pyEQL/utils.py +++ b/src/pyEQL/utils.py @@ -106,6 +106,9 @@ def standardize_formula(formula: str): # oxalate elif sform == "CO2[-1]": sform = "C2O4[-2]" + # triflate + elif sform == "CS(OF)3[-1]": + sform = "CF3SO3[-1]" # haloacetic acids of F, Cl, Br, I elif sform == "C2Cl3O2[-1]": sform = "CCl3COO[-1]" diff --git a/tests/test_utils.py b/tests/test_utils.py index 7e991315..2226923b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -34,6 +34,7 @@ def test_standardize_formula(): assert standardize_formula("H3PO4") == "H3PO4(aq)" assert standardize_formula("H2SO4") == "H2SO4(aq)" assert standardize_formula("HClO4") == "HClO4(aq)" + assert standardize_formula("CF3SO3-") == "CF3SO3[-1]" # superscripts, subscripts, and permuted sign/charge number assert standardize_formula("PO₄³⁻") == "PO4[-3]" assert standardize_formula("Co²⁺") == "Co[+2]"