Skip to content

Commit

Permalink
- added documentation to methods
Browse files Browse the repository at this point in the history
- added chemimg method
- added cas validation with check digit verification
- added inchikey validation
- improved query to check for '*' and added exact option
- improved key2cas to get casrn of smallest molecule (assuming other hits are polymers)
  • Loading branch information
stuchalk committed Mar 6, 2022
1 parent 645b711 commit 2590188
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 21 deletions.
141 changes: 126 additions & 15 deletions comchem/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import json
import re
import cairosvg
import os
from bs4 import BeautifulSoup
from urllib.request import urlopen

Expand All @@ -7,13 +10,19 @@
'properties', 'synonyms', 'replaceRns', 'hasMolefile']


# get data from a page
def detail(casrn, field="all"):
""" access the common chemistry detail api """
"""
Access the Common Chemistry detail API at
https://commonchemistry.cas.org/api/detail?cas_rn=<casrn>
:param casrn: CAS Registry Number
:param field: field to return or all fields (default)
:return mixed
"""
if not _validcas(casrn):
return '' # false
url = ccpath + 'detail?cas_rn=' + casrn
respnse = urlopen(url)
jsn = json.loads(respnse.read())

if field == "all":
return jsn
elif field in fields:
Expand All @@ -25,23 +34,125 @@ def detail(casrn, field="all"):
else:
return jsn[field]
else:
return "Field not available..."
return '' # false


# run a search
def query(term):
url = ccpath + 'search?q=' + term
def query(term='', exact=False):
"""
Search the CommonChemistry database API at
https://commonchemistry.cas.org/api/search?q=<term>
:param term: string to be searched
:param exact: boolean to indicate an exact match
:return: string
"""
url = ''
if exact is False:
url = ccpath + 'search?q=' + term + '*'
elif term[-1:] == '*' or exact is True:
url = ccpath + 'search?q=' + term
respnse = urlopen(url)
jsn = json.loads(respnse.read())
out = []
for hit in jsn['results']:
textname = BeautifulSoup(hit["name"], "lxml").text
out.append({"textname": textname, "htmlname": hit["name"].lower(), "rn": hit["rn"]})
out = [] # false
if jsn['results']:
for hit in jsn['results']:
textname = BeautifulSoup(hit["name"], "lxml").text
out.append({"textname": textname, "htmlname": hit["name"].lower(), "rn": hit["rn"]})
return out


# search for a compound using an InChIKey
def key2cas(key):
""" search the api for an InChKey"""
hits = query('InChIKey=' + key)
return hits[0]['rn'] # only returns the casne of the first hit
"""
Find the CAS Registry Number of a chemical substance using an IUPAC InChIKey
:param key - a valid InChIKey
"""
if _validkey(key):
hits = query('InChIKey=' + key, True)
if hits:
if len(hits) == 1:
return hits[0]['rn']
else:
# check hits for smallest molar mass compound, i.e., not polymer
minmm = 100000
minrn = ''
for i, hit in enumerate(hits):
mm = detail(hit['rn'], 'molecularMass')
if mm != '':
if float(mm) < minmm:
minmm = float(mm)
minrn = hit['rn']
return minrn
else:
return ''
else:
return ''


def _validkey(key):
"""
Validate and IUPAC InChIKey
:param key: a string to be validated as an IUPAC InChIKey
:return: bool
"""
test = re.search(r'^[A-Z]{14}-[A-Z]{8}[SN][A]-[A-Z]$', key)
if test is None:
return False
return True


def _validcas(cas):
"""
Validate a CAS Registry Number
See: https://en.wikipedia.org/wiki/CAS_Registry_Number#Format
:param cas: a string to be validated as a CAS Registry Number
:return: bool
"""
test = re.search(r'^\d{2,8}-\d{2}-\d$', cas)
# if format of string does not match then it's not CAS RN
if test is None:
return False
# verify check digit
reverse = cas[::-1] # reverse the CAS Registry Number (needed for checksum math and split out checksum)
digits = reverse.replace('-', '') # remove the dashes
nochk = digits[1:] # all but first digit
chksum = int(digits[:1]) # first digit
total = 0
for i, digit in enumerate(nochk):
total += (i + 1) * int(digit) # index of chars starts at 0
newsum = total % 10
if newsum == chksum:
return True
else:
return False


def chemimg(chemid='', imgtype='svg'):
"""
Get an image for a compound from either a CAS Registry Number, InChIKey, SMILES, or name
:param chemid: the CAS Registry Number, InChIKey, SMILES, or name
:param imgtype: the type of image file to produce - svg, png, or ps
:return:
"""
# check identifier for type so checking can be done
if chemid == '':
return False
if _validkey(chemid):
casrn = key2cas(chemid)
elif not _validcas(chemid):
casrn = query(chemid, True)
else:
casrn = chemid
if not casrn:
return casrn
# get svg data and save
svg = detail(casrn, "image")
f = open(casrn + ".svg", "w")
f.write(svg)
f.close()
if imgtype == 'png':
cairosvg.svg2png(url=casrn + ".svg", write_to=casrn + ".png")
elif imgtype == 'ps':
cairosvg.svg2ps(url=casrn + ".svg", write_to=casrn + ".ps")
if imgtype == 'png' or imgtype == 'ps':
if os.path.exists(casrn + ".svg"):
os.remove(casrn + ".svg")
return True
14 changes: 8 additions & 6 deletions example.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from comchem import *

name = detail("9003-07-0", "properties")
print(name)
# name = detail("9003-07-0", "properties")
# print(name)
#
# find = query("trifluoro*")
# print(find)

find = query("trifluoro*")
print(find)
# find = key2cas('UHOVQNZJYSORNB-UHFFFAOYSA-N')
# print(find)

find = key2cas('UHOVQNZJYSORNB-UHFFFAOYSA-N')
print(find)
chemimg("71-43-2", 'png')
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ packages = [
[tool.poetry.dependencies]
python = "^3.7"
bs4 = "0.0.1"
cairosvg = "^2.5.2"

[tool.poetry.dev-dependencies]
pytest = "^5.2"
Expand Down

0 comments on commit 2590188

Please sign in to comment.