From 63e8a6db8d5a4096d03a7e59dc0ca6040f48a149 Mon Sep 17 00:00:00 2001 From: Eric Scott Date: Tue, 6 Aug 2024 14:39:33 -0700 Subject: [PATCH 1/4] add dataset documenting SMARTS strings and other functions used --- .Rbuildignore | 1 + DESCRIPTION | 5 ++++- R/data.R | 14 +++++++++++++ R/get_fx_groups.R | 17 +++++++++------- data-raw/smarts_simpol1.R | 3 +++ data-raw/smarts_simpol1.csv | 39 ++++++++++++++++++++++++++++++++++++ data/smarts_simpol1.rda | Bin 0 -> 1586 bytes man/get_fx_groups.Rd | 17 +++++++++------- man/smarts_simpol1.Rd | 23 +++++++++++++++++++++ 9 files changed, 104 insertions(+), 15 deletions(-) create mode 100644 R/data.R create mode 100644 data-raw/smarts_simpol1.R create mode 100644 data-raw/smarts_simpol1.csv create mode 100644 data/smarts_simpol1.rda create mode 100644 man/smarts_simpol1.Rd diff --git a/.Rbuildignore b/.Rbuildignore index f5aa41f..9a2c1d3 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -15,3 +15,4 @@ README.Rmd ^Meta$ ^cran-comments\.md$ ^CRAN-SUBMISSION$ +^data-raw$ diff --git a/DESCRIPTION b/DESCRIPTION index 763a016..6b8a755 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,4 +46,7 @@ biocViews: Config/testthat/edition: 3 Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 +Depends: + R (>= 2.10) +LazyData: true diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..6a3c1d2 --- /dev/null +++ b/R/data.R @@ -0,0 +1,14 @@ +#' Search patterns used for SIMPOL.1 functional groups +#' +#' This dataframe documents how functional groups for the SIMPOL.1 and Meredith +#' et al. method are defined using SMARTS strings or `ChemmineR` functions. +#' +#' @format +#' \describe{ +#' \item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.} +#' \item{functional_groups}{These correspond to matching column names in the results of [get_fx_groups()].} +#' \item{smarts}{SMARTS strings used to capture groups, when applicable} +#' \item{fun}{The function used to capture the functional group. When `smarts` is not `NA`, this is always "ChemmineR::smartsSearchOB". Other groups are captured with other `ChemmineR` functions or as calculations using other functional groups.} +#' \item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.} +#' } +"smarts_simpol1" \ No newline at end of file diff --git a/R/get_fx_groups.R b/R/get_fx_groups.R index e5f1161..71191b4 100644 --- a/R/get_fx_groups.R +++ b/R/get_fx_groups.R @@ -17,13 +17,16 @@ #' risk! Validation is not available on Windows. See **Details** for more #' information. #' -#' @details It is unfortunately difficult to capture errors and warnings -#' produced by the command line tool OpenBabel used by `ChemmineOB`, a -#' dependency of `volcalc`. These errors and warnings are printed to the R -#' console, but they are *not* R errors and do not stop code from running and -#' producing potentially incorrect data. `validate = TRUE` checks the output -#' of certain OpenBabel procedures for the *symptoms* of these errors, namely -#' missing values for InChI and molecular formula. Unfortunately, since InChI +#' @details For more details on how functional groups are defined, see the +#' [smarts_simpol1] data set. +#' +#' It is unfortunately difficult to capture errors and warnings produced by +#' the command line tool OpenBabel used by `ChemmineOB`, a dependency of +#' `volcalc`. These errors and warnings are printed to the R console, but they +#' are *not* R errors and do not stop code from running and producing +#' potentially incorrect data. `validate = TRUE` checks the output of certain +#' OpenBabel procedures for the *symptoms* of these errors, namely missing +#' values for InChI and molecular formula. Unfortunately, since InChI #' generation is not available with the Windows version of `ChemmineOB`, this #' validation step cannot be performed on Windows and `validate = TRUE` will #' simply print a warning that can be silenced by setting `validate = FALSE`. diff --git a/data-raw/smarts_simpol1.R b/data-raw/smarts_simpol1.R new file mode 100644 index 0000000..3b68961 --- /dev/null +++ b/data-raw/smarts_simpol1.R @@ -0,0 +1,3 @@ +## code to prepare `smarts` dataset goes here +smarts_simpol1 <- readr::read_csv("data-raw/smarts_simpol1.csv") +usethis::use_data(smarts_simpol1, overwrite = TRUE) diff --git a/data-raw/smarts_simpol1.csv b/data-raw/smarts_simpol1.csv new file mode 100644 index 0000000..09f6abb --- /dev/null +++ b/data-raw/smarts_simpol1.csv @@ -0,0 +1,39 @@ +method,functional_group,smarts,fun,notes +simpol1,carbons_asa,NA,NA,Number of carbons on the acid side of an amide—not possible to capture with SMARTS +simpol1,rings_aromatic,NA,ChemmineR::rings, +simpol1,rings_total,NA,ChemmineR::rings, +simpol1,rings_aliphatic,NA,rings_total - rings_aromatic, +simpol1,carbon_dbl_bonds_aliphatic,C=C,ChemmineR::smartsSearchOB, +simpol1,CCCO_aliphatic_ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB, +simpol1,hydroxyl_total,NA,ChemmineR::groups, +simpol1,hydroxyl_aromatic,[OX2H]c,ChemmineR::smartsSearchOB,"This pattern also captures nitrophenols, so the number of nitrophenols is subtracted" +simpol1,hydroxyl_aliphatic,NA,hydroxyl_total - hydroxyl_aromatic, +simpol1,aldehydes,NA,ChemmineR::groups, +simpol1,ketones,NA,ChemmineR::groups, +simpol1,carbox_acids,NA,ChemmineR::groups, +simpol1,ester,NA,ChemmineR::groups,"This also captures carbonylperoxynitrates and nitroesters, so the number of carbonylperoxynitrates and nitroesters are subtracted" +simpol1,ether_total,NA,ChemmineR::groups, +simpol1,ether_alkyl,NA,ether_total - ether_alicyclic - ether_aromatic, +simpol1,ether_alicyclic,[OD2]([C!R0])[C!R0],ChemmineR::smartsSearchOB, +simpol1,ether_aromatic,"O(c)[C,c]",ChemmineR::smartsSearchOB,Only one of the carbons has to be aromatic +simpol1,nitrate,"[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]",ChemmineR::smartsSearchOB,"This pattern also captures carbonylperoxynitrates, so the number of carbonylperoxynitrates is subtracted" +simpol1,nitro,"[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]",ChemmineR::smartsSearchOB, +simpol1,amine_primary,[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6X4],ChemmineR::smartsSearchOB, +simpol1,amine_secondary,[NX3H1!$(NC=[!#6])!$(NC#[!#6])]([#6X4])[#6X4],ChemmineR::smartsSearchOB, +simpol1,amine_tertiary,[NX3H0!$(NC=[!#6])!$(NC#[!#6])]([#6X4])([#6X4])[#6X4],ChemmineR::smartsSearchOB, +simpol1,amine_aromatic,[NX3;!$(NO)]c,ChemmineR::smartsSearchOB, +simpol1,amide_primary,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H2]",ChemmineR::smartsSearchOB, +simpol1,amide_secondary,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]",ChemmineR::smartsSearchOB, +simpol1,amide_tertiary,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]",ChemmineR::smartsSearchOB, +simpol1,carbonylperoxynitrate,*C(=O)OO[N+1](=O)[O-1],ChemmineR::smartsSearchOB, +simpol1,peroxide,[OX2D2][OX2D2],ChemmineR::smartsSearchOB,"This pattern also captures carbonylperoxynitrates, so the number of carbonylperoxinitrates is subtracted" +simpol1,hydroperoxide,"[OX2][OX2H,OX1-]",ChemmineR::smartsSearchOB,"This pattern also captures peroxyacids, so the number of carbonylperoxyacids is subtracted" +simpol1,carbonylperoxyacid,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][$([OX2H]),$([OX1-])]",ChemmineR::smartsSearchOB, +simpol1,nitrophenol,"[OX2H][$(c1ccccc1[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1cccc(c1)[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1ccc(cc1)[$([NX3](=O)=O),$([NX3+](=O)[O-])])]",ChemmineR::smartsSearchOB, +simpol1,nitroester,"C(=O)(OC)C~[NX3](-,=[OX1])-,=[OX1]",ChemmineR::smartsSearchOB,"This pattern captures OH groups on a ring that also has a nitro group (para, ortho, or meta)" +meredith,phosphoric_acids,"[$(P(=[OX1])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)]),$([P+]([OX1-])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)])]",ChemmineR::smartsSearchOB,"This pattern also captures phosphoric esthers, so the number of phosphoric esters is subtracted" +meredith,phosphoric_esters,"[$(P(=[OX1])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)]),$([P+]([OX1-])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)])]",ChemmineR::smartsSearchOB, +meredith,sulfates,"[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]",ChemmineR::smartsSearchOB, +meredith,sulfonates,"[#16X4](=[OX1])(=[OX1])([#6])[*$([O-1]),*$([OH1]),*$([OX2H0])]",ChemmineR::smartsSearchOB,This pattern captures sulfonate ions and their conjugate acids (sulfonic acids) +meredith,thiols,[#16X2H],ChemmineR::smartsSearchOB, +meredith,carbothioesters,S([#6])[CX3](=O)[#6],ChemmineR::smartsSearchOB, diff --git a/data/smarts_simpol1.rda b/data/smarts_simpol1.rda new file mode 100644 index 0000000000000000000000000000000000000000..45f46d11ad242ae377f2c9a03ebbaf9cd1aec564 GIT binary patch literal 1586 zcmV-22F>|GT4*^jL0KkKS-@B$dH@EO|KtDv|1^F7f7?7m{9eER-r-;X1^@&QPyhe{ z;0E7%C>2yHQAkJuKybuqrh`Bl0000Q41-MrKn9HfVj2U6BTY0K0MGyc0MKL_Xc_=C zXaf+?95EVcpwI?@000JqAk#q50i!?|hJZ*(0-Ac7H8h^4jR&cr>H+Dgri_M-03N1~ zQR;e(38vH@P;kU)rh`Bl0000Q41-MrKn9HfVj2Mwf+m47o~8{5Gy(&9w2ZlPXnE7lRr0NNy?#9RCr`0NsD-cEE ziZ4XWdUe?KTARMZY3eg!lscL^ioDu4rF7RD<4!cU+E*%DqYg|nSJj@ps@@+4b}`({ z+N`rSG+A!E9(FOw9nEbIC3^%sTdIaOrJ`0%6SXgDMj@0z4X7$a0WAedMQ8z70Z5{y zBE*O)_983vdFU$i0eS$N^zF|Ia?X9l>s08R^$YJU22#Ok4m?kY$K{uIn$|*tuLJ8@$9)B^^<;=Lj2$5C>dJ(49(9s?a zA&In&aA}P(lRblfK290JnI7i;4THH*>l)sdOS`HVe+&(LARNS~S5N!U|AyNGc>qq4 z?I1qzWd_~)t@u<*u8BMpj1%Xi!l)F82ArK_VH!jngb(Z>e9rG5F!7vL2n`&F^{TQB zdgsx|Bk8Hm{Jrl+7xL%DkHx6EXZ}OxK0zJ3w9YC~>hcnkI9i@5VSHEPb}be9em#k* z12te$lGpFL66fzh7%{Q+;P2}0znk`3-uQ0%?tZ)Ox?}JRuxxGSm9$wcSu-CPg z(@$k72DjqVD4qi0ExzRn-Fm;3AF zdJH-(X3l`EO^3W;%oQ78Spcx1L@(DdHDVhq1yh)Qy(k^bdV1aN>`EXMYSgVwwY9;W z0b<6en1hNCD7VYIxfT}!%YO*NgV<0wL_4&5JsPbLFofa`_|1mc{$G%_)#B)$|YpFm+Fmdm3LHz=dVucHgEN`{Ap z(cdtd!WFVu)G_LsHq*2KcMc&Q9ZP`WTKT za5&dcnb!;D$6^?RoW!*O$a^E40~B0Wl??;O_s}i!Zyv~O6KWG=5YRikF4MTaN~Uxm k`9J5o?ViYcX7~*id_&PC1ya5A6aS03BAh5lU@Q^60KU5H%>V!Z literal 0 HcmV?d00001 diff --git a/man/get_fx_groups.Rd b/man/get_fx_groups.Rd index 41eadf6..dbd2c08 100644 --- a/man/get_fx_groups.Rd +++ b/man/get_fx_groups.Rd @@ -26,13 +26,16 @@ for specified compounds. Users will not typically interact with this function directly, but rather by using \code{\link[=calc_vol]{calc_vol()}}. } \details{ -It is unfortunately difficult to capture errors and warnings -produced by the command line tool OpenBabel used by \code{ChemmineOB}, a -dependency of \code{volcalc}. These errors and warnings are printed to the R -console, but they are \emph{not} R errors and do not stop code from running and -producing potentially incorrect data. \code{validate = TRUE} checks the output -of certain OpenBabel procedures for the \emph{symptoms} of these errors, namely -missing values for InChI and molecular formula. Unfortunately, since InChI +For more details on how functional groups are defined, see the +\link{smarts_simpol1} data set. + +It is unfortunately difficult to capture errors and warnings produced by +the command line tool OpenBabel used by \code{ChemmineOB}, a dependency of +\code{volcalc}. These errors and warnings are printed to the R console, but they +are \emph{not} R errors and do not stop code from running and producing +potentially incorrect data. \code{validate = TRUE} checks the output of certain +OpenBabel procedures for the \emph{symptoms} of these errors, namely missing +values for InChI and molecular formula. Unfortunately, since InChI generation is not available with the Windows version of \code{ChemmineOB}, this validation step cannot be performed on Windows and \code{validate = TRUE} will simply print a warning that can be silenced by setting \code{validate = FALSE}. diff --git a/man/smarts_simpol1.Rd b/man/smarts_simpol1.Rd new file mode 100644 index 0000000..2ebfc7e --- /dev/null +++ b/man/smarts_simpol1.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{smarts_simpol1} +\alias{smarts_simpol1} +\title{Search patterns used for SIMPOL.1 functional groups} +\format{ +\describe{ +\item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.} +\item{functional_groups}{These correspond to matching column names in the results of \code{\link[=get_fx_groups]{get_fx_groups()}}.} +\item{smarts}{SMARTS strings used to capture groups, when applicable} +\item{fun}{The function used to capture the functional group. When \code{smarts} is not \code{NA}, this is always "ChemmineR::smartsSearchOB". Other groups are captured with other \code{ChemmineR} functions or as calculations using other functional groups.} +\item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.} +} +} +\usage{ +smarts_simpol1 +} +\description{ +This dataframe documents how functional groups for the SIMPOL.1 and Meredith +et al. method are defined using SMARTS strings or \code{ChemmineR} functions. +} +\keyword{datasets} From 2d4e2d4d3ba9aafb41c6d152e2df8b6b08d690b9 Mon Sep 17 00:00:00 2001 From: Eric Scott Date: Tue, 6 Aug 2024 15:07:13 -0700 Subject: [PATCH 2/4] define SMARTS patterns in internal dataset rather than in get_fx_groups() code --- R/data.R | 6 +-- R/get_fx_groups.R | 98 +++++++++++------------------------- R/sysdata.rda | Bin 0 -> 822 bytes data-raw/README.md | 4 ++ data-raw/make_data.R | 14 ++++++ data-raw/smarts_simpol1.R | 3 -- data-raw/smarts_simpol1.csv | 2 +- data/smarts_simpol1.rda | Bin 1586 -> 1609 bytes man/smarts_simpol1.Rd | 2 +- 9 files changed, 51 insertions(+), 78 deletions(-) create mode 100644 R/sysdata.rda create mode 100644 data-raw/README.md create mode 100644 data-raw/make_data.R delete mode 100644 data-raw/smarts_simpol1.R diff --git a/R/data.R b/R/data.R index 6a3c1d2..9ee54a4 100644 --- a/R/data.R +++ b/R/data.R @@ -1,14 +1,14 @@ #' Search patterns used for SIMPOL.1 functional groups -#' +#' #' This dataframe documents how functional groups for the SIMPOL.1 and Meredith #' et al. method are defined using SMARTS strings or `ChemmineR` functions. -#' +#' #' @format #' \describe{ #' \item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.} #' \item{functional_groups}{These correspond to matching column names in the results of [get_fx_groups()].} #' \item{smarts}{SMARTS strings used to capture groups, when applicable} -#' \item{fun}{The function used to capture the functional group. When `smarts` is not `NA`, this is always "ChemmineR::smartsSearchOB". Other groups are captured with other `ChemmineR` functions or as calculations using other functional groups.} +#' \item{fun}{The function used to capture the functional group. When `smarts` is not `NA`, this is always "[ChemmineR::smartsSearchOB]". Other groups are captured with other `ChemmineR` functions or as calculations using other functional groups.} #' \item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.} #' } "smarts_simpol1" \ No newline at end of file diff --git a/R/get_fx_groups.R b/R/get_fx_groups.R index 71191b4..822b4e1 100644 --- a/R/get_fx_groups.R +++ b/R/get_fx_groups.R @@ -100,48 +100,7 @@ Set `validate = FALSE` to silence this warning.") carbon_dbl_count <- tibble::add_row(carbon_dbl_count, n = 0) } - # *_pattern are SMARTS strings: https://www.daylight.com/dayhtml_tutorials/languages/smarts/smarts_examples.html - carbon_dbl_bonds_pattern <- "C=C" #non-aromatic carbon double bonds - CCCO_pattern <- "C(C=C[AR1])(=O)[AR1]" #C=C-C=O in a non-aromatic ring - # ether_alkyl_pattern <- "[OD2]([C!R1])[C!R1]" #currently unused--ether_alkly calculated as total - other ethers - ether_alicyclic_pattern <- "[OD2]([C!R0])[C!R0]" - ether_aromatic_pattern <- "O(c)[C,c]" #only one of the carbons has to be aromatic - nitro_pattern <- "[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]" - hydroxyl_aromatic_pattern <- "[OX2H]c" - nitrate_pattern <- "[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]" - - #TODO need patterns for amines that don't pick up amides - amine_primary_pattern <- "[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6X4]" - amine_secondary_pattern <- "[NX3H1!$(NC=[!#6])!$(NC#[!#6])]([#6X4])[#6X4]" - amine_tertiary_pattern <- "[NX3H0!$(NC=[!#6])!$(NC#[!#6])]([#6X4])([#6X4])[#6X4]" - amine_aromatic_pattern <- "[NX3;!$(NO)]c" - - amide_primary_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H2]" - amide_secondary_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]" - amide_tertiary_pattern <- - "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]" - - # amide_total_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3;$([H2]),$([H1][#6;!$(C=[O,N,S])]),$([#7]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])])]" - - carbonylperoxynitrate_pattern <- "*C(=O)OO[N+1](=O)[O-1]" - peroxide_pattern <- "[OX2D2][OX2D2]" #this captures carbonylperoxynitrates too - hydroperoxide_pattern <- "[OX2][OX2H,OX1-]" #this captures peroxyacids too - carbonylperoxyacid_pattern <- "[CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][$([OX2H]),$([OX1-])]" - nitroester_pattern <- "C(=O)(OC)C~[NX3](-,=[OX1])-,=[OX1]" - # This captures OH groups on a ring that also has a nitro group (para, ortho, or meta). Need to correct aromatic hydroxyl count later. - nitrophenol_pattern <- - "[OX2H][$(c1ccccc1[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1cccc(c1)[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1ccc(cc1)[$([NX3](=O)=O),$([NX3+](=O)[O-])])]" - phosphoric_acid_pattern <- - "[$(P(=[OX1])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)]),$([P+]([OX1-])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)])]" - phosphoric_ester_pattern <- - "[$(P(=[OX1])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)]),$([P+]([OX1-])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)])]" - sulfate_pattern <- - "[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]" - #sulfonate groups; sulfonate ions, and conjugate acid, sulfonic acids - sulfonate_pattern <- - "[#16X4](=[OX1])(=[OX1])([#6])[*$([O-1]),*$([OH1]),*$([OX2H0])]" - thiol_pattern <- "[#16X2H]" - carbothioester_pattern <- "S([#6])[CX3](=O)[#6]" + smarts <- smarts_patterns_simpol1 fx_groups_df <- dplyr::tibble( @@ -157,43 +116,42 @@ Set `validate = FALSE` to silence this warning.") rings_aromatic = as.integer(rings$AROMATIC), rings_total = as.integer(rings$RINGS), rings_aliphatic = NA_integer_, #calculated below - carbon_dbl_bonds_aliphatic = ChemmineR::smartsSearchOB(compound_sdf, carbon_dbl_bonds_pattern), - CCCO_aliphatic_ring = ChemmineR::smartsSearchOB(compound_sdf, CCCO_pattern), # C=C-C=O in a non-aromatic ring + carbon_dbl_bonds_aliphatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbon_dbl_bonds_aliphatic), + CCCO_aliphatic_ring = ChemmineR::smartsSearchOB(compound_sdf, smarts$CCCO_aliphatic_ring), hydroxyl_total = groups$ROH, - hydroxyl_aromatic = ChemmineR::smartsSearchOB(compound_sdf, hydroxyl_aromatic_pattern, uniqueMatches = FALSE), + hydroxyl_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$hydroxyl_aromatic, uniqueMatches = FALSE), hydroxyl_aliphatic = NA_integer_, #calculated below aldehydes = groups$RCHO, ketones = groups$RCOR, carbox_acids = groups$RCOOH, ester = groups$RCOOR, ether_total = groups$ROR, - # ether_alkyl = ChemmineR::smartsSearchOB(compound_sdf, ether_alkyl_pattern), - ether_alkyl = NA_integer_, - ether_alicyclic = ChemmineR::smartsSearchOB(compound_sdf, ether_alicyclic_pattern), - ether_aromatic = ChemmineR::smartsSearchOB(compound_sdf, ether_aromatic_pattern), - nitrate = ChemmineR::smartsSearchOB(compound_sdf, nitrate_pattern), - nitro = ChemmineR::smartsSearchOB(compound_sdf, nitro_pattern), - amine_primary = ChemmineR::smartsSearchOB(compound_sdf, amine_primary_pattern), - amine_secondary = ChemmineR::smartsSearchOB(compound_sdf, amine_secondary_pattern), - amine_tertiary = ChemmineR::smartsSearchOB(compound_sdf, amine_tertiary_pattern), - amine_aromatic = ChemmineR::smartsSearchOB(compound_sdf, amine_aromatic_pattern), - amide_primary = ChemmineR::smartsSearchOB(compound_sdf, amide_primary_pattern), - amide_secondary = ChemmineR::smartsSearchOB(compound_sdf, amide_secondary_pattern), - amide_tertiary = ChemmineR::smartsSearchOB(compound_sdf, amide_tertiary_pattern), - carbonylperoxynitrate = ChemmineR::smartsSearchOB(compound_sdf, carbonylperoxynitrate_pattern), - peroxide = ChemmineR::smartsSearchOB(compound_sdf, peroxide_pattern), - hydroperoxide = ChemmineR::smartsSearchOB(compound_sdf, hydroperoxide_pattern), - carbonylperoxyacid = ChemmineR::smartsSearchOB(compound_sdf, carbonylperoxyacid_pattern), - nitrophenol = ChemmineR::smartsSearchOB(compound_sdf, nitrophenol_pattern), - nitroester = ChemmineR::smartsSearchOB(compound_sdf, nitroester_pattern), + ether_alkyl = NA_integer_, #calculated below + ether_alicyclic = ChemmineR::smartsSearchOB(compound_sdf, smarts$ether_alicyclic), + ether_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$ether_aromatic), + nitrate = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitrate), + nitro = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitro), + amine_primary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_primary), + amine_secondary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_secondary), + amine_tertiary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_tertiary), + amine_aromatic = ChemmineR::smartsSearchOB(compound_sdf, smarts$amine_aromatic), + amide_primary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_primary), + amide_secondary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_secondary), + amide_tertiary = ChemmineR::smartsSearchOB(compound_sdf, smarts$amide_tertiary), + carbonylperoxynitrate = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbonylperoxynitrate), + peroxide = ChemmineR::smartsSearchOB(compound_sdf, smarts$peroxide), + hydroperoxide = ChemmineR::smartsSearchOB(compound_sdf, smarts$hydroperoxide), + carbonylperoxyacid = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbonylperoxyacid), + nitrophenol = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitrophenol), + nitroester = ChemmineR::smartsSearchOB(compound_sdf, smarts$nitroester), # Additional groups from Meredith et al. 2023 - phosphoric_acids = ChemmineR::smartsSearchOB(compound_sdf, phosphoric_acid_pattern), - phosphoric_esters = ChemmineR::smartsSearchOB(compound_sdf, phosphoric_ester_pattern), - sulfates = ChemmineR::smartsSearchOB(compound_sdf, sulfate_pattern), - sulfonates = ChemmineR::smartsSearchOB(compound_sdf, sulfonate_pattern), - thiols = ChemmineR::smartsSearchOB(compound_sdf, thiol_pattern), - carbothioesters = ChemmineR::smartsSearchOB(compound_sdf, carbothioester_pattern), + phosphoric_acids = ChemmineR::smartsSearchOB(compound_sdf, smarts$phosphoric_acids), + phosphoric_esters = ChemmineR::smartsSearchOB(compound_sdf, smarts$phosphoric_esters), + sulfates = ChemmineR::smartsSearchOB(compound_sdf, smarts$sulfates), + sulfonates = ChemmineR::smartsSearchOB(compound_sdf, smarts$sulfonates), + thiols = ChemmineR::smartsSearchOB(compound_sdf, smarts$thiols), + carbothioesters = ChemmineR::smartsSearchOB(compound_sdf, smarts$carbothioesters), oxygens = atoms[["O"]] %||% 0L, chlorines = atoms[["Cl"]] %||% 0L, nitrogens = atoms[["N"]] %||% 0L, diff --git a/R/sysdata.rda b/R/sysdata.rda new file mode 100644 index 0000000000000000000000000000000000000000..bb23864d899bb5fc187a0365b20a2c9b6ecb157a GIT binary patch literal 822 zcmV-61IheCT4*^jL0KkKS-5muQ~(1of205V|ByWXf6FXE_)5R$-eCd&1OPw)00B?~ zUeFCWbvoLFk|dv`LF#&F$*9QC00*c5&;S4cB?!`L5ujvgpa1|G00w|)1JnVglu?ra z002!e0006s$Y2uz#zvVjG#N5sG{_kZ0iZO=p@0)WVG>EAnWSx0eyOsWX^3xQ~9_&+Nc8EyAcw;vrSMz69Dd^t1KYbo4yb0 zC{mG9O%IPa$=xKTBv&(*0klzqt}sNDQ)y5Idb%h9!U@*9-<+REa*trg5q>fuBM3;# zFiJeKArqjbB`HGU5UhK-K$Bboeo{h$-a#z_#Yxu&tcEdZw52G3kb-c+o!49-gh;Cc zDk?HOgi4r7NHQs?BM8D3uQSKPe$DBq`w*_Qf$YK){@!GPv}ag!)dlC|vjOQ(CI< zOsd2#n)vqWo!IHPl`;$0y)YLKRoJwzk2KB$IWR)-wlo0<2rq00F6X~flDX@0HC6$j~rNpb~tJnAfXLHhed<@ z>)^1+4~h?&`gpr6nK@JrdO)(fI7aJCiwrG59!kj|Q6{3owcfjJ z*T_>TL}eLEzQFmHvPNc@I#c7OtTp*l>M2S0M5d{pR6?rD8J6qUhrh(IP-5!(EzVdi z5;+kJ|H4ts%)?wPMMba*7F72t;M%rUnotTB>AMh;nF6(+;<_`MSSv#lVrXvc?KSf_ z7tfTZ04l%O6rVOYboO_&jAA@$9Q7u2gOkf3=OZTti$!tM6=YEvj;SaJ$xGhl-vwEy zmRZRp.csv. +To turn this into a user-facing dataset, edit `make_data.R` to add another `usethis::use_data()` and document it by adding a new entry to `R/data.R`. To also use this in internal data, it needs to be added as an argument to `usethis::use_data(..., internal = TRUE)` since only one sysdata.rda can exist for holding internal data. E.g. `usethis::use_data(smarts_patterns_simpol1, smarts_patterns_newmethod, internal = TRUE, overwrite = TRUE)` + +Be sure to run the code in `make_data.R` and to run `devtools::document()` to update data and documentation. \ No newline at end of file diff --git a/data-raw/make_data.R b/data-raw/make_data.R new file mode 100644 index 0000000..65dcab0 --- /dev/null +++ b/data-raw/make_data.R @@ -0,0 +1,14 @@ +## code to prepare `smarts` dataset goes here +smarts_simpol1 <- readr::read_csv("data-raw/smarts_simpol1.csv") + +#create user-facing data.frame +usethis::use_data(smarts_simpol1, overwrite = TRUE) + +#create internal named list with just SMARTS strings +just_smarts_simpol1 <- + smarts_simpol1 %>% + dplyr::filter(!is.na(smarts)) +smarts_patterns_simpol1 <- as.list(just_smarts_simpol1$smarts) +names(smarts_patterns_simpol1) <- just_smarts_simpol1$functional_group + +usethis::use_data(smarts_patterns_simpol1, internal = TRUE, overwrite = TRUE) diff --git a/data-raw/smarts_simpol1.R b/data-raw/smarts_simpol1.R deleted file mode 100644 index 3b68961..0000000 --- a/data-raw/smarts_simpol1.R +++ /dev/null @@ -1,3 +0,0 @@ -## code to prepare `smarts` dataset goes here -smarts_simpol1 <- readr::read_csv("data-raw/smarts_simpol1.csv") -usethis::use_data(smarts_simpol1, overwrite = TRUE) diff --git a/data-raw/smarts_simpol1.csv b/data-raw/smarts_simpol1.csv index 09f6abb..def1ccc 100644 --- a/data-raw/smarts_simpol1.csv +++ b/data-raw/smarts_simpol1.csv @@ -4,7 +4,7 @@ simpol1,rings_aromatic,NA,ChemmineR::rings, simpol1,rings_total,NA,ChemmineR::rings, simpol1,rings_aliphatic,NA,rings_total - rings_aromatic, simpol1,carbon_dbl_bonds_aliphatic,C=C,ChemmineR::smartsSearchOB, -simpol1,CCCO_aliphatic_ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB, +simpol1,CCCO_aliphatic_ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB,Matches C=C-C=O in a non-aromatic ring simpol1,hydroxyl_total,NA,ChemmineR::groups, simpol1,hydroxyl_aromatic,[OX2H]c,ChemmineR::smartsSearchOB,"This pattern also captures nitrophenols, so the number of nitrophenols is subtracted" simpol1,hydroxyl_aliphatic,NA,hydroxyl_total - hydroxyl_aromatic, diff --git a/data/smarts_simpol1.rda b/data/smarts_simpol1.rda index 45f46d11ad242ae377f2c9a03ebbaf9cd1aec564..c0ccf250f06162ff74b9fe67e7448c33f2023a47 100644 GIT binary patch delta 1584 zcmV-02G9Aj49N^yLRx4!F+o`-Q(1=%pY;F+pnv24|Nk_7|9{&&L;PO9|K8zX00saA z5KsUB0pJGzdip5MVco8G4({R504EHRk5fssjT=d(5DgfHLjoBzJrHQqKxo8d(dv;- zEq_U=B~NKjQ(}gI8Vv(L00000XaLhd13-yLjVa|Cc!>2jQ$PScA)o*N00x87GzNn~ zA|QkUYIgoctp|9@mG$d~o@KC6gTX(g!)Li#&Don$zkbL!Fo z!HbQp6-E5i3}nKk@9mYG`fEy&v0@0gqKmwj4~BJT*0|gBn-5uqv1b-;B$DOL!4sZu zTgLQL`;&}QqTH6keR)sRso?rFP{V$MWaFUInv1F3=%I!v{uyR?t1v^sxtqZ06n|)z z^FGH~Lrn%P0u6!^LqG_w3X+P@0;mB<3Z){%f-3KzB)vbP1d`_humU(|*^?mezGG=w zlykmm3h*ez;(8tU<9lZXWoM1Kz*u|7VeO;7AbitD$p&( zqg0}lAS57}E2FyZm~?I}kU}I`5XD6?X1HNZ?Li{QW&;t3m5(9sX!*W4_|b{v;j37- zq=TT+y4H2KycYe|DW`A;DCiwD(f+fqQ*D9bKoP+5AU}9ALv^;(&oGqM5`TA53=#cd z+E5VC27FebrZo)$#6Wza1JJj3a|?_&&=@FKjdaRQihKvBz9D?lG!JyXw~Kj8QJC+F zi*N(d9@+-D>;x9Ga*`sHVa5ogd~5 zry6babF|&v)l}4K1q`XFsrR<=)5nCeF));6Z2hux+gur}J#W{U{IoBZ(j3}=T1lCt z*pv49feau-2~9x8gwpwX9n#}jabdNzvKZh6A^^a2>=`ok8_ZlV!hZ;{LM68d%#jKh z+9(uK$f*hfaCIp^ezl0Jpiq*KK=A^$aI6eiG0u-}k3ssoj(H>RP>FE}{jgb+QdwF| zl%*lT@Gp{&(LKY4=#fEUemf;(t3$Wf)mJkT8W4NBNkw+zHQiaGnHSrZP^H4&eJyz} znz#*GhV76PQOhOR?thbEi81P@$os z;I4s$C0u+vP=Avuf{9n%6E%AX4Z;D2VF7{wcOwnRt05hpC@fV#SV$WX__YMG+5Bwr zGXD;RKV8a%iJ;TK+9+vohCK|j4c%x1c$H+T3X#b}%vhS`N>-q*6&0AV2w{#i6D?$h z&>NR_31?h1@~Sat2~`RnKF(wTKVZi)T*xsN^s__)Vt-`p6GrF)1R~TE0-?0aC*wSx z8RqunmU9-fpzVe5i1OG}Lj|RlNC(FI`=G?Uayr@x%P?8PiWu&Lml#m+n@M`vDHIFPNljX~@1V<}WL%^I#9czj0>oPvMGM1={XnnPjCbLDObAQxLh$GETn7zxDodzA i)Y#?h&*U~{I1LeUj+B%|OK1TP_`8xR!i0o4Z2zc~IP1Ou delta 1561 zcmV+!2Il$646+PPLRx4!F+o`-Q(3@RBYFS^m;dAc|Nk_7|9{&&L;PO9|K8zX00saA z5KsUB0pJGTdMFiCDp5#C0YGrXX{Lif8UO$Q8VrL?13(6i0Ai6^Eq_Q!0-Ac7H8h^4 zjR&cr>H+Dgri_M-03N1~QR;e(38vH@P;kU)rh`Bl0000Q41-MrKn9HfVj2Mwf+m47 zo~8{5GyVU#+WI*Pp7H>Gsf8{e9rG5F!7vL2n`&F^{TQBdgsx|Bk8Hm{Jrl+7xL%DkHx6EXZ}OxK0zJ3w9YC~ z>hcnkI9i@5VSHEPb}be9em#k*12IzGwRwIp#kf(=E_ z&*ZJw;m;D7(=j2X)Bbin=T;7bM;r0>f7(V12*a%q6q48Px)SH_K^QTy_2BR7?!TM% zTi*C?`tE+a?z&^}4CD+-*fYZ$l4vmu*uq{|lz-cZ(2*2lXs}sRCB#rG7jbIrU38Z_ z1&S>QJb=3Q72w5-9cRs-5%lmLbrayRDaelGFJ)X)F11l>S~zvj%Fl|p2OP4N9FKg?WDw5V!B`u;?syaG1PlkXuf&)aaA7iyPm9C}VV7FV!rZRVA`$Bz^VmO6WPGWo?c8u=r(CE! zBuUCrrDNyUQS$b-Y_-h8QK;tY^^UpIPh}|vx8l<%pjJpv7>{l5m;;z`91PM37LAuB zqURc}U}9DkBZSYIgkb_5y_=1*hD^66?tk>3<;OO{#Rjd$ujD%>qi0ExzRn-Fm;3AFdJH-(X3l`EO^3W;%oQ78Spcx1L@(DdHDVhq z1yh)Qy(k^bdV1aN>`EXMYSgVwwY9;W0b<6en1hNCD7VYIxfT}!%YO*NgV<0wM1Kex z1PsUo#LdE&g#)$Tp>pj55kf>KQ-CWBH25LjhCcqM@5HcNVjB0GhLLyD6I3umhou1H z7WFO#&PyKofm>&_uW)b+ju3JRK+7u(Fu_R>IA0!R((l*I+76(raIxU+bD$OPnP^^g zG9ES2N&^NJ+A%n21q4hiFbjs`EPwt3-0s((?#{YQqofYqAAnCL17v{fh6)7YoV+wL zF{dQH6f2)VVI!8yqYpPIqsFhJ3$IFshlSDKFq*;@vRKqH>Y6swv;cPwAs!t|fZ^lf z>V}Q9fM7f4Z>^skais^{*QUYJrqgo7v59av*HD?)3+2aR7=xU|wE@U`BSM@56kJ!8 z4Fkvb&@J+B9>{DHY7=A-&^x>?)40A$rgR|rKj*vcp2&M<_ze|&L(wG#QoZyO|BJaI LoG3_OED^l`ZmH<0 diff --git a/man/smarts_simpol1.Rd b/man/smarts_simpol1.Rd index 2ebfc7e..9b79d90 100644 --- a/man/smarts_simpol1.Rd +++ b/man/smarts_simpol1.Rd @@ -9,7 +9,7 @@ \item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.} \item{functional_groups}{These correspond to matching column names in the results of \code{\link[=get_fx_groups]{get_fx_groups()}}.} \item{smarts}{SMARTS strings used to capture groups, when applicable} -\item{fun}{The function used to capture the functional group. When \code{smarts} is not \code{NA}, this is always "ChemmineR::smartsSearchOB". Other groups are captured with other \code{ChemmineR} functions or as calculations using other functional groups.} +\item{fun}{The function used to capture the functional group. When \code{smarts} is not \code{NA}, this is always "\link[ChemmineR:smartsSearchOB]{ChemmineR::smartsSearchOB}". Other groups are captured with other \code{ChemmineR} functions or as calculations using other functional groups.} \item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.} } } From 1342f270d56a9bab792e41d68e5105122c0a9bbe Mon Sep 17 00:00:00 2001 From: Eric Scott Date: Tue, 6 Aug 2024 15:08:20 -0700 Subject: [PATCH 3/4] update NEWS --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index d5fce7b..fb6fbaf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # volcalc (development version) * adds a `validate = TRUE` option to `calc_vol()` and `get_fx_groups()` that returns `NA`s when there are suspected errors in parsing SMILES or .mol files. This is unfortunately not available on Windows due to differences in the windows version of `ChemmineOB` +* adds a dataset, `smarts_simpol1`, describing how functional groups are defined for the SIMPOL.1 and Meredith et al. methods # volcalc 2.1.2 From d6976c7e66c5fd7276642de4fc75ad1b08afdf6e Mon Sep 17 00:00:00 2001 From: Eric Scott Date: Tue, 13 Aug 2024 14:38:00 -0700 Subject: [PATCH 4/4] add functional group descriptions to data --- R/data.R | 12 ++++++ data-raw/smarts_simpol1.csv | 78 ++++++++++++++++++------------------ data/smarts_simpol1.rda | Bin 1609 -> 1699 bytes man/smarts_simpol1.Rd | 11 +++++ 4 files changed, 62 insertions(+), 39 deletions(-) diff --git a/R/data.R b/R/data.R index 9ee54a4..a056632 100644 --- a/R/data.R +++ b/R/data.R @@ -7,8 +7,20 @@ #' \describe{ #' \item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.} #' \item{functional_groups}{These correspond to matching column names in the results of [get_fx_groups()].} +#' \item{description}{Functional group description from Table 5 of Pankow & Asher (2008)} #' \item{smarts}{SMARTS strings used to capture groups, when applicable} #' \item{fun}{The function used to capture the functional group. When `smarts` is not `NA`, this is always "[ChemmineR::smartsSearchOB]". Other groups are captured with other `ChemmineR` functions or as calculations using other functional groups.} #' \item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.} #' } +#' +#' @references +#' Meredith L, Ledford S, Riemer K, Geffre P, Graves K, Honeker L, LeBauer D, +#' Tfaily M, Krechmer J. 2023. Automating methods for estimating metabolite +#' volatility. Frontiers in Microbiology. \doi{10.3389/fmicb.2023.1267234} +#' +#' Pankow, J.F., Asher, W.E. 2008. SIMPOL.1: a simple group +#' contribution method for predicting vapor pressures and enthalpies of +#' vaporization of multifunctional organic compounds. Atmos. Chem. Phys. +#' \doi{10.5194/acp-8-2773-2008} +#' "smarts_simpol1" \ No newline at end of file diff --git a/data-raw/smarts_simpol1.csv b/data-raw/smarts_simpol1.csv index def1ccc..464f218 100644 --- a/data-raw/smarts_simpol1.csv +++ b/data-raw/smarts_simpol1.csv @@ -1,39 +1,39 @@ -method,functional_group,smarts,fun,notes -simpol1,carbons_asa,NA,NA,Number of carbons on the acid side of an amide—not possible to capture with SMARTS -simpol1,rings_aromatic,NA,ChemmineR::rings, -simpol1,rings_total,NA,ChemmineR::rings, -simpol1,rings_aliphatic,NA,rings_total - rings_aromatic, -simpol1,carbon_dbl_bonds_aliphatic,C=C,ChemmineR::smartsSearchOB, -simpol1,CCCO_aliphatic_ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB,Matches C=C-C=O in a non-aromatic ring -simpol1,hydroxyl_total,NA,ChemmineR::groups, -simpol1,hydroxyl_aromatic,[OX2H]c,ChemmineR::smartsSearchOB,"This pattern also captures nitrophenols, so the number of nitrophenols is subtracted" -simpol1,hydroxyl_aliphatic,NA,hydroxyl_total - hydroxyl_aromatic, -simpol1,aldehydes,NA,ChemmineR::groups, -simpol1,ketones,NA,ChemmineR::groups, -simpol1,carbox_acids,NA,ChemmineR::groups, -simpol1,ester,NA,ChemmineR::groups,"This also captures carbonylperoxynitrates and nitroesters, so the number of carbonylperoxynitrates and nitroesters are subtracted" -simpol1,ether_total,NA,ChemmineR::groups, -simpol1,ether_alkyl,NA,ether_total - ether_alicyclic - ether_aromatic, -simpol1,ether_alicyclic,[OD2]([C!R0])[C!R0],ChemmineR::smartsSearchOB, -simpol1,ether_aromatic,"O(c)[C,c]",ChemmineR::smartsSearchOB,Only one of the carbons has to be aromatic -simpol1,nitrate,"[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]",ChemmineR::smartsSearchOB,"This pattern also captures carbonylperoxynitrates, so the number of carbonylperoxynitrates is subtracted" -simpol1,nitro,"[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]",ChemmineR::smartsSearchOB, -simpol1,amine_primary,[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6X4],ChemmineR::smartsSearchOB, -simpol1,amine_secondary,[NX3H1!$(NC=[!#6])!$(NC#[!#6])]([#6X4])[#6X4],ChemmineR::smartsSearchOB, -simpol1,amine_tertiary,[NX3H0!$(NC=[!#6])!$(NC#[!#6])]([#6X4])([#6X4])[#6X4],ChemmineR::smartsSearchOB, -simpol1,amine_aromatic,[NX3;!$(NO)]c,ChemmineR::smartsSearchOB, -simpol1,amide_primary,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H2]",ChemmineR::smartsSearchOB, -simpol1,amide_secondary,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]",ChemmineR::smartsSearchOB, -simpol1,amide_tertiary,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]",ChemmineR::smartsSearchOB, -simpol1,carbonylperoxynitrate,*C(=O)OO[N+1](=O)[O-1],ChemmineR::smartsSearchOB, -simpol1,peroxide,[OX2D2][OX2D2],ChemmineR::smartsSearchOB,"This pattern also captures carbonylperoxynitrates, so the number of carbonylperoxinitrates is subtracted" -simpol1,hydroperoxide,"[OX2][OX2H,OX1-]",ChemmineR::smartsSearchOB,"This pattern also captures peroxyacids, so the number of carbonylperoxyacids is subtracted" -simpol1,carbonylperoxyacid,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][$([OX2H]),$([OX1-])]",ChemmineR::smartsSearchOB, -simpol1,nitrophenol,"[OX2H][$(c1ccccc1[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1cccc(c1)[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1ccc(cc1)[$([NX3](=O)=O),$([NX3+](=O)[O-])])]",ChemmineR::smartsSearchOB, -simpol1,nitroester,"C(=O)(OC)C~[NX3](-,=[OX1])-,=[OX1]",ChemmineR::smartsSearchOB,"This pattern captures OH groups on a ring that also has a nitro group (para, ortho, or meta)" -meredith,phosphoric_acids,"[$(P(=[OX1])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)]),$([P+]([OX1-])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)])]",ChemmineR::smartsSearchOB,"This pattern also captures phosphoric esthers, so the number of phosphoric esters is subtracted" -meredith,phosphoric_esters,"[$(P(=[OX1])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)]),$([P+]([OX1-])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)])]",ChemmineR::smartsSearchOB, -meredith,sulfates,"[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]",ChemmineR::smartsSearchOB, -meredith,sulfonates,"[#16X4](=[OX1])(=[OX1])([#6])[*$([O-1]),*$([OH1]),*$([OX2H0])]",ChemmineR::smartsSearchOB,This pattern captures sulfonate ions and their conjugate acids (sulfonic acids) -meredith,thiols,[#16X2H],ChemmineR::smartsSearchOB, -meredith,carbothioesters,S([#6])[CX3](=O)[#6],ChemmineR::smartsSearchOB, +method,functional_group,description,smarts,fun,notes +simpol1,carbons_asa,carbon number on the acid-side of an amide,NA,NA,Not possible to capture with SMARTS +simpol1,rings_aromatic,aromatic ring,NA,ChemmineR::rings, +simpol1,rings_total,,NA,ChemmineR::rings, +simpol1,rings_aliphatic,non-aromatic ring,NA,rings_total - rings_aromatic, +simpol1,carbon_dbl_bonds_aliphatic,C=C (non-aromatic),C=C,ChemmineR::smartsSearchOB, +simpol1,CCCO_aliphatic_ring,C=C-C=O in non-aromatic ring,C(C=C[AR1])(=O)[AR1],ChemmineR::smartsSearchOB, +simpol1,hydroxyl_total,,NA,ChemmineR::groups, +simpol1,hydroxyl_aromatic,"aromatic hydroxyl (e.g., phenol)",[OX2H]c,ChemmineR::smartsSearchOB,"This pattern also captures nitrophenols, so the number of nitrophenols is subtracted" +simpol1,hydroxyl_aliphatic,hydroxyl (alkyl),NA,hydroxyl_total - hydroxyl_aromatic, +simpol1,aldehydes,aldehyde,NA,ChemmineR::groups, +simpol1,ketones,ketone,NA,ChemmineR::groups, +simpol1,carbox_acids,carboxylic acid,NA,ChemmineR::groups, +simpol1,ester,ester,NA,ChemmineR::groups,"This also captures carbonylperoxynitrates and nitroesters, so the number of carbonylperoxynitrates and nitroesters are subtracted" +simpol1,ether_total,,NA,ChemmineR::groups, +simpol1,ether_alkyl,ether,NA,ether_total - ether_alicyclic - ether_aromatic, +simpol1,ether_alicyclic,ether (alicyclic),[OD2]([C!R0])[C!R0],ChemmineR::smartsSearchOB, +simpol1,ether_aromatic,"ether, aromatic","O(c)[C,c]",ChemmineR::smartsSearchOB,Only one of the carbons has to be aromatic +simpol1,nitrate,nitrate,"[$([NX3](=[OX1])(=[OX1])O),$([NX3+]([OX1-])(=[OX1])O)]",ChemmineR::smartsSearchOB,"This pattern also captures carbonylperoxynitrates, so the number of carbonylperoxynitrates is subtracted" +simpol1,nitro,nitro,"[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]",ChemmineR::smartsSearchOB, +simpol1,amine_primary,"amine, primary",[NX3;H2;!$(NC=[!#6]);!$(NC#[!#6])][#6X4],ChemmineR::smartsSearchOB, +simpol1,amine_secondary,"amine, secondary",[NX3H1!$(NC=[!#6])!$(NC#[!#6])]([#6X4])[#6X4],ChemmineR::smartsSearchOB, +simpol1,amine_tertiary,"amine, tertiary",[NX3H0!$(NC=[!#6])!$(NC#[!#6])]([#6X4])([#6X4])[#6X4],ChemmineR::smartsSearchOB, +simpol1,amine_aromatic,"amine, aromatic",[NX3;!$(NO)]c,ChemmineR::smartsSearchOB, +simpol1,amide_primary,"amide, primary","[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H2]",ChemmineR::smartsSearchOB, +simpol1,amide_secondary,"amide, secondary","[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H1][#6;!$(C=[O,N,S])]",ChemmineR::smartsSearchOB, +simpol1,amide_tertiary,"amide, tertiary","[CX3;$([R0][#6]),$([H1R0])](=[OX1])[#7X3H0]([#6;!$(C=[O,N,S])])[#6;!$(C=[O,N,S])]",ChemmineR::smartsSearchOB, +simpol1,carbonylperoxynitrate,carbonylperoxynitrate,*C(=O)OO[N+1](=O)[O-1],ChemmineR::smartsSearchOB, +simpol1,peroxide,peroxide,[OX2D2][OX2D2],ChemmineR::smartsSearchOB,"This pattern also captures carbonylperoxynitrates, so the number of carbonylperoxinitrates is subtracted" +simpol1,hydroperoxide,hydroperoxide,"[OX2][OX2H,OX1-]",ChemmineR::smartsSearchOB,"This pattern also captures peroxyacids, so the number of carbonylperoxyacids is subtracted" +simpol1,carbonylperoxyacid,carbonylperoxyacid,"[CX3;$([R0][#6]),$([H1R0])](=[OX1])[OX2][$([OX2H]),$([OX1-])]",ChemmineR::smartsSearchOB, +simpol1,nitrophenol,nitrophenol,"[OX2H][$(c1ccccc1[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1cccc(c1)[$([NX3](=O)=O),$([NX3+](=O)[O-])]),$(c1ccc(cc1)[$([NX3](=O)=O),$([NX3+](=O)[O-])])]",ChemmineR::smartsSearchOB, +simpol1,nitroester,nitroester,"C(=O)(OC)C~[NX3](-,=[OX1])-,=[OX1]",ChemmineR::smartsSearchOB,"This pattern captures OH groups on a ring that also has a nitro group (para, ortho, or meta)" +meredith,phosphoric_acids,,"[$(P(=[OX1])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)]),$([P+]([OX1-])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)])]",ChemmineR::smartsSearchOB,"This pattern also captures phosphoric esthers, so the number of phosphoric esters is subtracted" +meredith,phosphoric_esters,,"[$(P(=[OX1])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)]),$([P+]([OX1-])([OX2][#6])([$([OX2H]),$([OX1-]),$([OX2][#6])])[$([OX2H]),$([OX1-]),$([OX2][#6]),$([OX2]P)])]",ChemmineR::smartsSearchOB, +meredith,sulfates,,"[$([#16X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2][#6]),$([#16X4+2]([OX1-])([OX1-])([OX2H,OX1H0-])[OX2][#6])]",ChemmineR::smartsSearchOB, +meredith,sulfonates,,"[#16X4](=[OX1])(=[OX1])([#6])[*$([O-1]),*$([OH1]),*$([OX2H0])]",ChemmineR::smartsSearchOB,This pattern captures sulfonate ions and their conjugate acids (sulfonic acids) +meredith,thiols,,[#16X2H],ChemmineR::smartsSearchOB, +meredith,carbothioesters,,S([#6])[CX3](=O)[#6],ChemmineR::smartsSearchOB, diff --git a/data/smarts_simpol1.rda b/data/smarts_simpol1.rda index c0ccf250f06162ff74b9fe67e7448c33f2023a47..b6ff863b37f1081306c9c3049b1b0f8ca6c4ea03 100644 GIT binary patch literal 1699 zcmV;U23+|9AS2AX65 z8Z^iN(USxQAOH>k8UsMk0Mkr>14fwu8Zuyj!~g)13LzuZ^gv@%)b!K<14BKjv$ec*=cvO1``@Lnu!aG2*%Y*?azdGYhlT=}5)V|d6O2+~NG{rB`*LCjyK zDEs!6fJHPP}K#)M<7=x?e`Y%-7HJH6`p_i6OXgNP7O z6SFZrTI_pz?}fP!sGMaC@WLEt4a&WJjXV-ZuK0==PPr5sywe*W0D7eb$9L ze$AFlvfsO5$$eOx3^wb*?n>vKc4{##sD6EV$)&;qOy;U(d-71NFs*TPSb<22jUvHhfg5kFByB^02z1Sx7+E--H~fc9Ht1o%Wt2_e3V6iN$P zXx33m5E2kil;L{4HVxgX$<>rii5p z^Cw8A=t>h8=^(}=C=_Kc03vlnD?2;`80a`_ClcK38(0^`w!p@KBN@&>YlA8sSaaLw ztroYU%vw?cdZ{p~1tI~jH*pw7kq0pY(jb2~e~-33Db-+r(aeu_s%8_p1&P8Hanlu6 zjUn)NF`I_WsMBau-SsJTwj3wyu7^dc7n2;t@hMx&uEOEgw$P?E)iS)%&{`7Vdz z5yqb!zh^~2Apsq{+%i0%T*u6Eg+jqVKykU}K0IYaLYn+Ci;r72Wz}H;pu(WZB{W(f z&zoaJIzfLD#<1*S`n@WMhY7+68ilvFwK*T!{nA*|E|6ey|2wR%Xg7@>_xS05r56>* zVbgH{>Ph=7=$H4if-Tr)8(G0jl_l6t%OWgv6iHA(?*V156`;j+j&JSX^cvb`DI+ZrE+AU) zTFJ1MSyLrdLxWJ*rykMijvJsR5UX<7 z;6pACu@}+t!jz>N8JP3Z{6XmBJyjgcB1lLxYTI3xpEzpt=4*f+@$l|2Mw5_;NOTls zN!0b)4rpAo)dqxQPKBROv6`cZQUzc{Feo5jDCzd{hTj389@$}f7?JBKR?4nDz9tf46JhgXy_pww-ESpd~> z5}@JQG7I7tIPL|LtbVORpYy?)@OF=j;e;b7AV#f9)ZJ3o7IuQg<=v2R7Ro@P-$%pB zivsG|Z}x^9TM8-6AVAU}WPnUeo`<08&a^q)%Hku2xI2fmF)O( zFl@0+r&-E0i`Pg)WH1Ar_z(@?7pZkm;1-LpDpnT?9o|j|1%474>0HhTbQ)F(T2UCVuNcASpp->0u)r=Hp_!r| z5#sOHhtt++U2cdP)$WuJz$cFcEU_K1FhM6o>gS=EjXGuUp-A zF~Mj=nA(!`!kPqao1JK5iOg;_ej4{c4*9%CqiWxT6AS0Q#0*qAO(sRk3!r<0r4uAvmn1Zf|9t}8OWO^iaUnLu t9RTj|hZEmZ$5KJiq+cJqNK(t=EC52>1AG7g literal 1609 zcmV-P2DbS^T4*^jL0KkKS%(du^#BH-f8+oE|1^F7f7?7m{9eER-r-;X1^@&QPyhe{ z;0FJC`Y6p|-L7{I?%~h?Ck&F0Q%SUq8%d@R4H$+)0vR+t5NOjtXvAdE>KX%vBTY0K z0MGyc0MKL_Xc_=CXaf+?95EVcpwI?@000JqAk#q50i!?|hJZ<_B~NKjQ(}gI8Vv(L z00000XaLhd13-yLjVa|Cc!>2jQ$PScA)o*N00x87GzNn~A|QkUYIgoctp|70x4m-Y5OtB6%;C8-QT`a3_JWH_F4>e2zhi;b-nMf}tZWWuHI?UkJR zYf6!^VhFgRi@cW)hIMDwxZCub4_Sq=XBKWGlI6|86P|Bd#`IJBlZ;cM+?K+9c~8`- z;QBOB!+wKgck{p@t~_8D@B^Fhjw)o51N5XqNLn$67;81}y>&f)YbO2(Ai} ziqHb60Z0m^BE*6!@1P{TKcWPZ=K`<-IA__DAn(3oX<3wWzG({ZD8%A{M$Pv>F@KTP zeq1FN=HxP@)heXq;n+YHdURo%D~iS}IyGgaSFMkoFXFk!`#T`&JLQDGAi!9Nv9w50 zm0&RM6(H1w5?}losxgvKA?MDJsw{#iLZBlprJ^nJc5Z?wE9LEs#Pa zSrElVF=n`7P3=J<$Yui(h?S2a?`ZkHIQY?t;x9Ga*`sHVa5og zdaSJ^BJ0bEuvY(P<}C5lj% zLX@xs(JYaxJgJ0@F@rmG?KVBl<-R(+aURN^-Ae=P@$os;I4s$C0u+vP?IWxiC5hdHG2pR!U2Y1 z0fGQ`BMr!_AswD5ELA{QNE;CNwFI)+{A}?u{|<#eUCM-spwqzGC~0tpJq)r9-Dm@N zm1L?4k;y~MSeoTZR-mpG6_~LIVU9EtEo6q!8<%zoXIwP$sxfE@RSF(H&SU{UV8=3C z$T1f5vqS=7Wb6}0=mG>H)Dr@sw96;sJf0cm_T-jx7PFx3h46^-*i=IWrItts#{2uA z#JqAk+6l`rS;LAL?t_;YQ1F{cdf6!&9qSI!c*sJgD+z-A<2K0b06oKuLfxS&3f(H&_U4XHFxxllKyphihrxbL9Lp=4a70>oWH$O6P$7ex!h zi~T^a)r@!Hd`t*S@IvtC@>~ZEbt+4!LDbmg?9b#jW;hKIa*mXgMN4P_5BR&1DZ+$= HIBfr@X65hK diff --git a/man/smarts_simpol1.Rd b/man/smarts_simpol1.Rd index 9b79d90..f9e5534 100644 --- a/man/smarts_simpol1.Rd +++ b/man/smarts_simpol1.Rd @@ -8,6 +8,7 @@ \describe{ \item{method}{Either "simpol1" for functional groups only used with the SIMPOL.1 method, or "meredith" for additional groups used in the Meredith et al. method.} \item{functional_groups}{These correspond to matching column names in the results of \code{\link[=get_fx_groups]{get_fx_groups()}}.} +\item{description}{Functional group description from Table 5 of Pankow & Asher (2008)} \item{smarts}{SMARTS strings used to capture groups, when applicable} \item{fun}{The function used to capture the functional group. When \code{smarts} is not \code{NA}, this is always "\link[ChemmineR:smartsSearchOB]{ChemmineR::smartsSearchOB}". Other groups are captured with other \code{ChemmineR} functions or as calculations using other functional groups.} \item{notes}{Notes including how any functional group counts are corrected when there is overlap. E.g. when one SMARTS pattern is a subset of another pattern, but the two groups are counted separately without overlap in the SIMPOL.1 method.} @@ -20,4 +21,14 @@ smarts_simpol1 This dataframe documents how functional groups for the SIMPOL.1 and Meredith et al. method are defined using SMARTS strings or \code{ChemmineR} functions. } +\references{ +Meredith L, Ledford S, Riemer K, Geffre P, Graves K, Honeker L, LeBauer D, +Tfaily M, Krechmer J. 2023. Automating methods for estimating metabolite +volatility. Frontiers in Microbiology. \doi{10.3389/fmicb.2023.1267234} + +Pankow, J.F., Asher, W.E. 2008. SIMPOL.1: a simple group +contribution method for predicting vapor pressures and enthalpies of +vaporization of multifunctional organic compounds. Atmos. Chem. Phys. +\doi{10.5194/acp-8-2773-2008} +} \keyword{datasets}