From 1fae2c5057c8f319aa5006b6567783766efe90fc Mon Sep 17 00:00:00 2001 From: Julia Schumann Date: Sat, 21 Sep 2024 00:53:13 +0200 Subject: [PATCH] add function update_chemical_info() and chemical_data dict --- .../schema_packages/catalysis.py | 61 ++-- .../schema_packages/chemical_data.py | 325 ++++++++++++++++++ 2 files changed, 358 insertions(+), 28 deletions(-) create mode 100644 src/nomad_catalysis/schema_packages/chemical_data.py diff --git a/src/nomad_catalysis/schema_packages/catalysis.py b/src/nomad_catalysis/schema_packages/catalysis.py index 8342646..b7bb086 100644 --- a/src/nomad_catalysis/schema_packages/catalysis.py +++ b/src/nomad_catalysis/schema_packages/catalysis.py @@ -38,6 +38,8 @@ from nomad.metainfo.metainfo import Category from nomad.units import ureg +from .chemical_data import chemical_data + if TYPE_CHECKING: from nomad.datamodel.datamodel import ( EntryArchive, @@ -664,6 +666,33 @@ class Reagent(ArchiveSection): pure_component = SubSection(section_def=PubChemPureSubstanceSection) + def update_chemical_info(self): + """ + This function maps the chemical information of the reagent from a local + dictionary chemical_data and returns a pure_component object. 
+ """ + + # Resolve aliases to primary keys if necessary + chemical_key = chemical_data.get(self.name) + # If the value is a string, it refers to another key, so resolve it + if isinstance(chemical_key, str): + chemical_key = chemical_data[chemical_key] + else: + chemical_key = chemical_key + + pure_component = PubChemPureSubstanceSection() + if chemical_key: + pure_component.name = self.name + pure_component.pub_chem_id = chemical_key.get('pub_chem_id') + pure_component.iupac_name = chemical_key.get('iupac_name') + pure_component.molecular_formula = chemical_key.get('molecular_formula') + pure_component.molecular_mass = chemical_key.get('molecular_mass') + pure_component.inchi = chemical_key.get('inchi', None) # Optional + pure_component.inchi_key = chemical_key.get('inchi_key', None) # Optional + pure_component.cas_number = chemical_key.get('cas_number', None) # Optional + + return pure_component + def normalize(self, archive, logger): """ The normalizer will run for the subsection `PureSubstanceComponent` class. 
@@ -686,40 +715,16 @@ def normalize(self, archive, logger): return if self.name in ['C5-1', 'C6-1', 'nC5', 'nC6', 'Unknown', 'inert', 'P>=5C']: return - elif self.name == 'n-Butene': - self.name = '1-butene' - elif self.name == 'MAN': - self.name = 'maleic anhydride' elif '_' in self.name: self.name = self.name.replace('_', ' ') if self.name and self.pure_component is None: import time - self.pure_component = PubChemPureSubstanceSection(name=self.name) - if self.name == 'propionic acid': - self.pub_chem_id = 1032 - self.pure_component.iupac_name = 'propanoic acid' - self.pure_component.molecular_formula = 'C3H6O2' - self.pure_component.molecular_mass = 74.08 - return - elif self.name in ['CO', 'carbon monoxide']: - self.pub_chem_id = 281 - self.pure_component.iupac_name = 'carbon monoxide' - self.pure_component.molecular_formula = 'CO' - self.pure_component.molecular_mass = 28.01 - self.pure_component.inchi = 'InChI=1S/CO/c1-2' - self.pure_component.inchi_key = 'UGFAIRIUMAVXCW-UHFFFAOYSA-N' - self.pure_component.cas_number = '630-08-0' - return - elif self.name in ['CO2', 'carbon dioxide']: - self.pub_chem_id = 280 - self.pure_component.iupac_name = 'carbon dioxide' - self.pure_component.molecular_formula = 'CO2' - self.pure_component.molecular_mass = 44.01 - self.pure_component.inchi = 'InChI=1S/CO2/c2-1-3' - self.pure_component.inchi_key = 'CURLTUGMZLYLDI-UHFFFAOYSA-N' - self.pure_component.cas_number = '124-38-9' + pure_component = self.update_chemical_info() + self.pure_component = pure_component + + if self.pure_component.iupac_name is not None: return else: time.sleep(1) diff --git a/src/nomad_catalysis/schema_packages/chemical_data.py b/src/nomad_catalysis/schema_packages/chemical_data.py new file mode 100644 index 0000000..3a4baab --- /dev/null +++ b/src/nomad_catalysis/schema_packages/chemical_data.py @@ -0,0 +1,325 @@ +chemical_data = { + 'carbon monoxide': { + 'pub_chem_id': 281, + 'iupac_name': 'carbon monoxide', + 'molecular_formula': 'CO', + 
'molecular_mass': 28.01, + 'inchi': 'InChI=1S/CO/c1-2', + 'inchi_key': 'UGFAIRIUMAVXCW-UHFFFAOYSA-N', + 'cas_number': '630-08-0', + }, + 'CO': 'carbon monoxide', # Reference to 'carbon monoxide' + 'carbon dioxide': { + 'pub_chem_id': 280, + 'iupac_name': 'carbon dioxide', + 'molecular_formula': 'CO2', + 'molecular_mass': 44.01, + 'inchi': 'InChI=1S/CO2/c2-1-3', + 'inchi_key': 'CURLTUGMZLYLDI-UHFFFAOYSA-N', + 'cas_number': '124-38-9', + }, + 'CO2': 'carbon dioxide', # Reference to 'carbon dioxide' + 'propionic acid': { + 'pub_chem_id': 1032, + 'iupac_name': 'propanoic acid', + 'molecular_formula': 'C3H6O2', + 'molecular_mass': 74.08, + }, + 'ammonia': { + 'pub_chem_id': 222, + 'iupac_name': 'ammonia', # actually 'azane' + 'molecular_formula': 'H3N', + 'molecular_mass': 17.03, + 'inchi': 'InChI=1S/H3N/h1H3', + 'inchi_key': 'QGZKDVFQNNGYKY-UHFFFAOYSA-N', + 'cas_number': '7664-41-7', + }, + 'NH3': 'ammonia', # Reference to 'ammonia' + 'molecular hydrogen': { + 'pub_chem_id': 783, + 'iupac_name': 'molecular hydrogen', + 'molecular_formula': 'H2', + 'molecular_mass': 2.016, + 'inchi': 'InChI=1S/H2/h1H', + 'inchi_key': 'UFHFLCQGNIYNRP-UHFFFAOYSA-N', + 'cas_number': '1333-74-0', + }, + 'H2': 'molecular hydrogen', # Reference to 'molecular hydrogen' + 'hydrogen': 'molecular hydrogen', # Reference to 'molecular hydrogen' + 'water': { + 'pub_chem_id': 962, + 'iupac_name': 'water', # actually 'oxidane' + 'molecular_formula': 'H2O', + 'molecular_mass': 18.015, + 'inchi': 'InChI=1S/H2O/h1H2', + }, + 'H2O': 'water', # Reference to 'water' + 'argon': { + 'pub_chem_id': 23968, + 'iupac_name': 'argon', + 'molecular_formula': 'Ar', + 'molecular_mass': 39.9, + 'inchi': 'InChI=1S/Ar', + 'inchi_key': 'XKRFYHLGVUSROY-UHFFFAOYSA-N', + 'cas_number': '7440-37-1', + }, + 'Ar': 'argon', # Reference to 'argon' + 'molecular nitrogen': { + 'pub_chem_id': 947, + 'iupac_name': 'molecular nitrogen', + 'molecular_formula': 'N2', + 'molecular_mass': 28.014, + 'inchi': 'InChI=1S/N2/c1-2', + 'inchi_key': 
'IJGRMHOSHXDMSA-UHFFFAOYSA-N', + 'cas_number': '7727-37-9', + }, + 'nitrogen': 'molecular nitrogen', # Reference to 'molecular nitrogen' + 'N2': 'molecular nitrogen', # Reference to 'molecular nitrogen' + 'molecular oxygen': { + 'pub_chem_id': 977, + 'iupac_name': 'molecular oxygen', + 'molecular_formula': 'O2', + 'molecular_mass': 32.00, + 'inchi': 'InChI=1S/O2/c1-2', + 'inchi_key': 'MYMOFIZGZYHOMD-UHFFFAOYSA-N', + 'cas_number': '7782-44-7', + }, + 'oxygen': 'molecular oxygen', # Reference to 'molecular oxygen' + 'O2': 'molecular oxygen', # Reference to 'molecular oxygen' + 'methane': { + 'pub_chem_id': 297, + 'iupac_name': 'methane', + 'molecular_formula': 'CH4', + 'molecular_mass': 16.043, + 'inchi': 'InChI=1S/CH4/h1H4', + 'inchi_key': 'VNWKTOKETHGBQD-UHFFFAOYSA-N', + 'cas_number': '74-82-8', + }, + 'CH4': 'methane', # Reference to 'methane' + 'ethane': { + 'pub_chem_id': 6324, + 'iupac_name': 'ethane', + 'molecular_formula': 'C2H6', + 'molecular_mass': 30.07, + 'inchi': 'InChI=1S/C2H6/c1-2/h1-2H3', + 'inchi_key': 'OTMSDBZUPAUEDD-UHFFFAOYSA-N', + 'cas_number': '74-84-0', + }, + 'C2H6': 'ethane', # Reference to 'ethane' + 'ethene': { + 'pub_chem_id': 6325, + 'iupac_name': 'ethene', + 'molecular_formula': 'C2H4', + 'molecular_mass': 28.05, + 'inchi': 'InChI=1S/C2H4/c1-2/h1-2H2', + 'inchi_key': 'VGGSQFUCUMXWEO-UHFFFAOYSA-N', + 'cas_number': '74-85-1', + }, + 'ethylene': 'ethene', # Reference to 'ethene' + 'C2H4': 'ethene', # Reference to 'ethene' + 'ethyne': { + 'pub_chem_id': 6326, + 'iupac_name': 'ethyne', + 'molecular_formula': 'C2H2', + 'molecular_mass': 26.04, + 'inchi': 'InChI=1S/C2H2/c1-2/h1-2H', + 'inchi_key': 'HSFWRNGVRCDJHI-UHFFFAOYSA-N', + 'cas_number': '74-86-2', + }, + 'acetylene': 'ethyne', # Reference to 'ethyne' + 'C2H2': 'ethyne', # Reference to 'ethyne' + 'ethin': 'ethyne', # Reference to 'ethyne' + 'acetic acid': { + 'pub_chem_id': 176, + 'iupac_name': 'acetic acid', + 'molecular_formula': 'C2H4O2', + 'molecular_mass': 60.05, + 'inchi': 
'InChI=1S/C2H4O2/c1-2(3)4/h1H3,(H,3,4)', + 'inchi_key': 'QTBSBXVTEAMEQO-UHFFFAOYSA-N', + 'cas_number': '64-19-7', + }, + 'propane': { + 'pub_chem_id': 6334, + 'iupac_name': 'propane', + 'molecular_formula': 'C3H8', + 'molecular_mass': 44.10, + 'inchi': 'InChI=1S/C3H8/c1-3-2/h3H2,1-2H3', + 'inchi_key': 'ATUOYWHBWRKTHZ-UHFFFAOYSA-N', + 'cas_number': '74-98-6', + }, + 'C3H8': 'propane', # Reference to 'propane' + 'propene': { + 'pub_chem_id': 8252, + 'iupac_name': 'prop-1-ene', + 'molecular_formula': 'C3H6', + 'molecular_mass': 42.08, + 'inchi': 'InChI=1S/C3H6/c1-3-2/h3H,1H2,2H3', + 'cas_number': '115-07-1', + }, + 'propylene': 'propene', # Reference to 'propene' + 'C3H6': 'propene', # Reference to 'propene' + 'propyne': { + 'pub_chem_id': 6335, + 'iupac_name': 'propyne', + 'molecular_formula': 'C3H4', + 'molecular_mass': 40.06, + 'inchi': 'InChI=1S/C3H4/c1-3-2/h1H,3H2', + }, + 'propine': 'propyne', # Reference to 'propyne' + 'propionic acid': { + 'pub_chem_id': 1032, + 'iupac_name': 'propanoic acid', + 'molecular_formula': 'C3H6O2', + 'molecular_mass': 74.08, + }, + 'prop-2-enoic acid': { + 'pub_chem_id': 6581, + 'iupac_name': 'prop-2-enoic acid', + 'molecular_formula': 'C3H4O2', + 'molecular_mass': 72.06, + 'inchi': 'InChI=1S/C3H4O2/c1-2-3(4)5/h2H,1H2,(H,4,5)', + 'inchi_key': 'NIXOWILDQLNWCW-UHFFFAOYSA-N', + 'cas_number': '79-10-7', + }, + 'acrylic acid': 'prop-2-enoic acid', + 'acetaldehyde': { + 'pub_chem_id': 177, + 'iupac_name': 'acetaldehyde', + 'molecular_formula': 'C2H4O', + 'molecular_mass': 44.05, + 'inchi': 'InChI=1S/C2H4O/c1-2-3/h2H,1H3', + 'inchi_key': 'IKHGUXGNUITLKF-UHFFFAOYSA-N', + 'cas_number': '75-07-0', + }, + 'butane': { + 'pub_chem_id': 7843, + 'iupac_name': 'butane', + 'molecular_formula': 'C4H10', + 'molecular_mass': 58.12, + 'inchi': 'InChI=1S/C4H10/c1-3-4-2/h3-4H2,1-2H3', + 'inchi_key': 'KDXKERNSBIXSRK-UHFFFAOYSA-N', + 'cas_number': '106-97-8', + }, + 'n-butane': 'butane', # Reference to 'butane' + 'ethanol': { + 'pub_chem_id': 702, + 
'iupac_name': 'ethanol', + 'molecular_formula': 'C2H6O', + 'molecular_mass': 46.07, + 'inchi': 'InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', + 'inchi_key': 'LFQSCWFLJHTTHZ-UHFFFAOYSA-N', + 'cas_number': '64-17-5', + }, + 'EtOH': 'ethanol', # Reference to 'ethanol' + 'CH3CH2OH': 'ethanol', # Reference to 'ethanol' + 'methanol': { + 'pub_chem_id': 887, + 'iupac_name': 'methanol', + 'molecular_formula': 'CH4O', + 'molecular_mass': 32.04, + 'inchi': 'InChI=1S/CH4O/c1-2/h2H,1H3', + 'inchi_key': 'OKKJLVBELUTLKV-UHFFFAOYSA-N', + 'cas_number': '67-56-1', + }, + 'methyl alcohol': 'methanol', # Reference to 'methanol' + 'MeOH': 'methanol', # Reference to 'methanol' + 'CH3OH': 'methanol', # Reference to 'methanol' + 'formic acid': { + 'pub_chem_id': 284, + 'iupac_name': 'formic acid', + 'molecular_formula': 'CH2O2', + 'molecular_mass': 46.03, + 'inchi': 'InChI=1S/CH2O2/c2-1-3/h1H,(H,2,3)', + 'inchi_key': 'BDAGIHXWWSANSR-UHFFFAOYSA-N', + 'cas_number': '64-18-6', + }, + 'HCOOH': 'formic acid', # Reference to 'formic acid' + 'methanoic acid': 'formic acid', # Reference to 'formic acid' + 'prop-2-enal': { + 'pub_chem_id': 7847, + 'iupac_name': 'prop-2-enal', + 'molecular_formula': 'C3H4O', + 'molecular_mass': 56.06, + 'inchi': 'InChI=1S/C3H4O/c1-2-3-4/h2-3H,1H2', + }, + 'acrolein': 'prop-2-enal', # Reference to 'prop-2-enal' + 'C3H4O': 'prop-2-enal', # Reference to 'prop-2-enal' + 'propanal': { + 'pub_chem_id': 527, + 'iupac_name': 'propanal', + 'molecular_formula': 'C3H6O', + 'molecular_mass': 58.08, + 'inchi': 'InChI=1S/C3H6O/c1-2-3-4/h3H,2H,1H3', + }, + 'propionaldehyde': 'propanal', # Reference to 'propanal' + 'propan-2-one': { + 'pub_chem_id': 180, + 'iupac_name': 'propan-2-one', + 'molecular_formula': 'C3H6O', + 'molecular_mass': 58.08, + 'inchi': 'InChI=1S/C3H6O/c1-3(2)2/h1-2H3', + 'inchi_key': 'CSCPPACGZOOCGX-UHFFFAOYSA-N', + 'cas_number': '67-64-1', + }, + 'acetone': 'propan-2-one', # Reference to 'propan-2-one' + 'propan-2-ol': { + 'pub_chem_id': 3776, + 'iupac_name': 
'propan-2-ol', + 'molecular_formula': 'C3H8O', + 'molecular_mass': 60.10, + 'inchi': 'InChI=1S/C3H8O/c1-3(2)4/h3-4H,1-2H3', + 'inchi_key': 'KZBMWSRVAYFASW-UHFFFAOYSA-N', + 'cas_number': '67-63-0', + }, + '2-propanol': 'propan-2-ol', # Reference to 'propan-2-ol' + 'isopropanol': 'propan-2-ol', # Reference to 'propan-2-ol' + 'prop-2-en-1-ol': { + 'pub_chem_id': 7858, + 'iupac_name': 'prop-2-en-1-ol', + 'molecular_formula': 'C3H6O', + 'molecular_mass': 58.08, + 'inchi': 'InChI=1S/C3H6O/c1-2-3-4/h2,4H,1,3H2', + 'inchi_key': 'QWVGKYWNOKOFNN-UHFFFAOYSA-N', + 'cas_number': '107-18-6', + }, + 'allyl alcohol': 'prop-2-en-1-ol', # Reference to 'prop-2-en-1-ol' + 'propan-1-ol': { + 'pub_chem_id': 1031, + 'iupac_name': 'propan-1-ol', + 'molecular_formula': 'C3H8O', + 'molecular_mass': 60.10, + 'inchi': 'InChI=1S/C3H8O/c1-2-3-4/h4H,2-3H2,1H3', + 'inchi_key': 'XNWBBONJTDVZCF-UHFFFAOYSA-N', + 'cas_number': '71-23-8', + }, + '1-propanol': 'propan-1-ol', # Reference to 'propan-1-ol' + 'n-propanol': 'propan-1-ol', # Reference to 'propan-1-ol' + '1-butene': { + 'pub_chem_id': 7844, + 'iupac_name': 'but-1-ene', + 'molecular_formula': 'C4H8', + 'molecular_mass': 56.11, + 'inchi': 'InChI=1S/C4H8/c1-3-4-2/h3H,1,4H2,2H3', + }, + 'butylene': '1-butene', # Reference to '1-butene' + 'n-Butene': '1-butene', # Reference to '1-butene' + '' 'furan': { + 'pub_chem_id': 8029, + 'iupac_name': 'furan', + 'molecular_formula': 'C4H4O', + 'molecular_mass': 68.07, + 'inchi': 'InChI=1S/C4H4O/c1-2-4-5-3-1/h1-4H', + 'inchi_key': 'NNTHXFGICWXRPZ-UHFFFAOYSA-N', + 'cas_number': '110-00-9', + }, + 'furfuran': 'furan', # Reference to 'furan' + 'furan-2,5-dione': { + 'pub_chem_id': 7923, + 'iupac_name': 'furan-2,5-dione', + 'molecular_formula': 'C4H2O3', + 'molecular_mass': 98.06, + 'inchi': 'InChI=1S/C4H2O3/c5-3-1-2-4(6)7-3/h1-2H', + }, + 'maleic anhydride': 'furan-2,5-dione', # Reference to 'furan-2,5-dione' + 'MAN': 'furan-2,5-dione', # Reference to 'furan-2,5-dione' +}