diff --git a/poetry.lock b/poetry.lock index c12b9554..c4c59f0e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1366,6 +1366,99 @@ files = [ [package.dependencies] rapidfuzz = ">=3.1.0,<4.0.0" +[[package]] +name = "lxml" +version = "5.1.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = false +python-versions = ">=3.6" +files = [ + {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"}, + {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9d3c0f8567ffe7502d969c2c1b809892dc793b5d0665f602aad19895f8d508da"}, + {file = "lxml-5.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5fcfbebdb0c5d8d18b84118842f31965d59ee3e66996ac842e21f957eb76138c"}, + {file = "lxml-5.1.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2f37c6d7106a9d6f0708d4e164b707037b7380fcd0b04c5bd9cae1fb46a856fb"}, + {file = "lxml-5.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2befa20a13f1a75c751f47e00929fb3433d67eb9923c2c0b364de449121f447c"}, + {file = "lxml-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22b7ee4c35f374e2c20337a95502057964d7e35b996b1c667b5c65c567d2252a"}, + {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf8443781533b8d37b295016a4b53c1494fa9a03573c09ca5104550c138d5c05"}, + {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:82bddf0e72cb2af3cbba7cec1d2fd11fda0de6be8f4492223d4a268713ef2147"}, + {file = "lxml-5.1.0-cp310-cp310-win32.whl", hash = "sha256:b66aa6357b265670bb574f050ffceefb98549c721cf28351b748be1ef9577d93"}, + {file = "lxml-5.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:4946e7f59b7b6a9e27bef34422f645e9a368cb2be11bf1ef3cafc39a1f6ba68d"}, + {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:14deca1460b4b0f6b01f1ddc9557704e8b365f55c63070463f6c18619ebf964f"}, + {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed8c3d2cd329bf779b7ed38db176738f3f8be637bb395ce9629fc76f78afe3d4"}, + {file = "lxml-5.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:436a943c2900bb98123b06437cdd30580a61340fbdb7b28aaf345a459c19046a"}, + {file = "lxml-5.1.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:acb6b2f96f60f70e7f34efe0c3ea34ca63f19ca63ce90019c6cbca6b676e81fa"}, + {file = "lxml-5.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af8920ce4a55ff41167ddbc20077f5698c2e710ad3353d32a07d3264f3a2021e"}, + {file = "lxml-5.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cfced4a069003d8913408e10ca8ed092c49a7f6cefee9bb74b6b3e860683b45"}, + {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9e5ac3437746189a9b4121db2a7b86056ac8786b12e88838696899328fc44bb2"}, + {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f4c9bda132ad108b387c33fabfea47866af87f4ea6ffb79418004f0521e63204"}, + {file = "lxml-5.1.0-cp311-cp311-win32.whl", hash = "sha256:bc64d1b1dab08f679fb89c368f4c05693f58a9faf744c4d390d7ed1d8223869b"}, + {file = "lxml-5.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:a5ab722ae5a873d8dcee1f5f45ddd93c34210aed44ff2dc643b5025981908cda"}, + {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9aa543980ab1fbf1720969af1d99095a548ea42e00361e727c58a40832439114"}, + {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6f11b77ec0979f7e4dc5ae081325a2946f1fe424148d3945f943ceaede98adb8"}, + {file = "lxml-5.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a36c506e5f8aeb40680491d39ed94670487ce6614b9d27cabe45d94cd5d63e1e"}, + {file = "lxml-5.1.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f643ffd2669ffd4b5a3e9b41c909b72b2a1d5e4915da90a77e119b8d48ce867a"}, + {file = "lxml-5.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16dd953fb719f0ffc5bc067428fc9e88f599e15723a85618c45847c96f11f431"}, + {file = "lxml-5.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16018f7099245157564d7148165132c70adb272fb5a17c048ba70d9cc542a1a1"}, + {file = "lxml-5.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:82cd34f1081ae4ea2ede3d52f71b7be313756e99b4b5f829f89b12da552d3aa3"}, + {file = "lxml-5.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:19a1bc898ae9f06bccb7c3e1dfd73897ecbbd2c96afe9095a6026016e5ca97b8"}, + {file = "lxml-5.1.0-cp312-cp312-win32.whl", hash = "sha256:13521a321a25c641b9ea127ef478b580b5ec82aa2e9fc076c86169d161798b01"}, + {file = "lxml-5.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:1ad17c20e3666c035db502c78b86e58ff6b5991906e55bdbef94977700c72623"}, + {file = "lxml-5.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:24ef5a4631c0b6cceaf2dbca21687e29725b7c4e171f33a8f8ce23c12558ded1"}, + {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d2900b7f5318bc7ad8631d3d40190b95ef2aa8cc59473b73b294e4a55e9f30f"}, + {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:601f4a75797d7a770daed8b42b97cd1bb1ba18bd51a9382077a6a247a12aa38d"}, + {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4b68c961b5cc402cbd99cca5eb2547e46ce77260eb705f4d117fd9c3f932b95"}, + {file = "lxml-5.1.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:afd825e30f8d1f521713a5669b63657bcfe5980a916c95855060048b88e1adb7"}, + {file = "lxml-5.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:262bc5f512a66b527d026518507e78c2f9c2bd9eb5c8aeeb9f0eb43fcb69dc67"}, + {file = "lxml-5.1.0-cp36-cp36m-win32.whl", hash = "sha256:e856c1c7255c739434489ec9c8aa9cdf5179785d10ff20add308b5d673bed5cd"}, + {file = "lxml-5.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:c7257171bb8d4432fe9d6fdde4d55fdbe663a63636a17f7f9aaba9bcb3153ad7"}, + {file = "lxml-5.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b9e240ae0ba96477682aa87899d94ddec1cc7926f9df29b1dd57b39e797d5ab5"}, + {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a96f02ba1bcd330807fc060ed91d1f7a20853da6dd449e5da4b09bfcc08fdcf5"}, + {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3898ae2b58eeafedfe99e542a17859017d72d7f6a63de0f04f99c2cb125936"}, + {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61c5a7edbd7c695e54fca029ceb351fc45cd8860119a0f83e48be44e1c464862"}, + {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3aeca824b38ca78d9ee2ab82bd9883083d0492d9d17df065ba3b94e88e4d7ee6"}, + {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8f52fe6859b9db71ee609b0c0a70fea5f1e71c3462ecf144ca800d3f434f0764"}, + {file = "lxml-5.1.0-cp37-cp37m-win32.whl", hash = "sha256:d42e3a3fc18acc88b838efded0e6ec3edf3e328a58c68fbd36a7263a874906c8"}, + {file = "lxml-5.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:eac68f96539b32fce2c9b47eb7c25bb2582bdaf1bbb360d25f564ee9e04c542b"}, + {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ae15347a88cf8af0949a9872b57a320d2605ae069bcdf047677318bc0bba45b1"}, + {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c26aab6ea9c54d3bed716b8851c8bfc40cb249b8e9880e250d1eddde9f709bf5"}, + {file = "lxml-5.1.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:342e95bddec3a698ac24378d61996b3ee5ba9acfeb253986002ac53c9a5f6f84"}, + {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:725e171e0b99a66ec8605ac77fa12239dbe061482ac854d25720e2294652eeaa"}, + {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d184e0d5c918cff04cdde9dbdf9600e960161d773666958c9d7b565ccc60c45"}, + {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:98f3f020a2b736566c707c8e034945c02aa94e124c24f77ca097c446f81b01f1"}, + {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6d48fc57e7c1e3df57be5ae8614bab6d4e7b60f65c5457915c26892c41afc59e"}, + {file = "lxml-5.1.0-cp38-cp38-win32.whl", hash = "sha256:7ec465e6549ed97e9f1e5ed51c657c9ede767bc1c11552f7f4d022c4df4a977a"}, + {file = "lxml-5.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:b21b4031b53d25b0858d4e124f2f9131ffc1530431c6d1321805c90da78388d1"}, + {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:52427a7eadc98f9e62cb1368a5079ae826f94f05755d2d567d93ee1bc3ceb354"}, + {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6a2a2c724d97c1eb8cf966b16ca2915566a4904b9aad2ed9a09c748ffe14f969"}, + {file = "lxml-5.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843b9c835580d52828d8f69ea4302537337a21e6b4f1ec711a52241ba4a824f3"}, + {file = "lxml-5.1.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b99f564659cfa704a2dd82d0684207b1aadf7d02d33e54845f9fc78e06b7581"}, + {file = "lxml-5.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f8b0c78e7aac24979ef09b7f50da871c2de2def043d468c4b41f512d831e912"}, + {file = "lxml-5.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bcf86dfc8ff3e992fed847c077bd875d9e0ba2fa25d859c3a0f0f76f07f0c8d"}, + {file = "lxml-5.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:49a9b4af45e8b925e1cd6f3b15bbba2c81e7dba6dce170c677c9cda547411e14"}, + {file = "lxml-5.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:280f3edf15c2a967d923bcfb1f8f15337ad36f93525828b40a0f9d6c2ad24890"}, + {file = "lxml-5.1.0-cp39-cp39-win32.whl", hash = "sha256:ed7326563024b6e91fef6b6c7a1a2ff0a71b97793ac33dbbcf38f6005e51ff6e"}, + {file = "lxml-5.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:8d7b4beebb178e9183138f552238f7e6613162a42164233e2bda00cb3afac58f"}, + {file = "lxml-5.1.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9bd0ae7cc2b85320abd5e0abad5ccee5564ed5f0cc90245d2f9a8ef330a8deae"}, + {file = "lxml-5.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8c1d679df4361408b628f42b26a5d62bd3e9ba7f0c0e7969f925021554755aa"}, + {file = "lxml-5.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2ad3a8ce9e8a767131061a22cd28fdffa3cd2dc193f399ff7b81777f3520e372"}, + {file = "lxml-5.1.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:304128394c9c22b6569eba2a6d98392b56fbdfbad58f83ea702530be80d0f9df"}, + {file = "lxml-5.1.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d74fcaf87132ffc0447b3c685a9f862ffb5b43e70ea6beec2fb8057d5d2a1fea"}, + {file = "lxml-5.1.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:8cf5877f7ed384dabfdcc37922c3191bf27e55b498fecece9fd5c2c7aaa34c33"}, + {file = "lxml-5.1.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:877efb968c3d7eb2dad540b6cabf2f1d3c0fbf4b2d309a3c141f79c7e0061324"}, + {file = "lxml-5.1.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f14a4fb1c1c402a22e6a341a24c1341b4a3def81b41cd354386dcb795f83897"}, + {file = "lxml-5.1.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:25663d6e99659544ee8fe1b89b1a8c0aaa5e34b103fab124b17fa958c4a324a6"}, + {file = "lxml-5.1.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8b9f19df998761babaa7f09e6bc169294eefafd6149aaa272081cbddc7ba4ca3"}, + {file = "lxml-5.1.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e53d7e6a98b64fe54775d23a7c669763451340c3d44ad5e3a3b48a1efbdc96f"}, + {file = "lxml-5.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c3cd1fc1dc7c376c54440aeaaa0dcc803d2126732ff5c6b68ccd619f2e64be4f"}, + {file = "lxml-5.1.0.tar.gz", hash = "sha256:3eea6ed6e6c918e468e693c41ef07f3c3acc310b70ddd9cc72d9ef84bc9564ca"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.7)"] + [[package]] name = "makim" version = "1.12.0" @@ -3253,6 +3346,49 @@ dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2 doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +[[package]] +name = "types-beautifulsoup4" +version = "4.12.0.20240106" +description = "Typing stubs for beautifulsoup4" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-beautifulsoup4-4.12.0.20240106.tar.gz", hash = "sha256:98d628985b71b140bd3bc22a8cb0ab603c2f2d08f20d37925965eb4a21739be8"}, + {file = "types_beautifulsoup4-4.12.0.20240106-py3-none-any.whl", hash = "sha256:cbdd60ab8aeac737ac014431b6e921b43e84279c0405fdd25a6900bb0e71da5b"}, +] + +[package.dependencies] +types-html5lib = "*" + +[[package]] +name = "types-html5lib" +version = "1.1.11.20240106" +description = "Typing stubs for html5lib" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-html5lib-1.1.11.20240106.tar.gz", hash = "sha256:fc3a1b18eb601b3eeaf92c900bd67675c0a4fa1dd1d2a2893ebdb46923547ee9"}, + {file = "types_html5lib-1.1.11.20240106-py3-none-any.whl", hash = "sha256:61993cb89220107481e0f1da65c388ff8cf3d8c5f6e8483c97559639a596b697"}, +] + +[[package]] +name = "types-lxml" +version = "2023.10.21" +description = "Complete lxml external type annotation" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-lxml-2023.10.21.tar.gz", hash = "sha256:daf1458b7d9b2fb421354137b97c6029c4fe75cef2cbab0d8e144d8c548d9b98"}, + {file = "types_lxml-2023.10.21-py3-none-any.whl", hash = "sha256:545097ca5f69d568827416d671285cae203a5b3d99937294e155c4cc0e46e712"}, +] + +[package.dependencies] +types-beautifulsoup4 = "*" +typing-extensions = ">=4.5,<5.0" + +[package.extras] +dev = ["black", "isort (>=5)", "lxml (==4.9.*)", "mypy (>=1.1,<1.4)", "pyright (>=1.1.289,<1.1.332)", "pytest (>=7)", "pytest-mypy-plugins (>=1.10.1,<2.0)", "typeguard (==3.0.*)"] + [[package]] name = "types-python-dateutil" version = "2.8.19.20240106" @@ -3490,4 +3626,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">3.8.1,<4" -content-hash = "a06f4b25beb33f335b7a59a9aa6618146ec5bb55afc79c8cc1483cca88a892d6" +content-hash = "85d692f0f6af3f63b985b08ab1f0e94fa002a8ca48552f1bc4e93c9c319ababc" diff --git a/pyproject.toml b/pyproject.toml index b502a44e..5dee61b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ exclude = [ [tool.poetry.dependencies] python = ">3.8.1,<4" requests = ">=2.20.0" +lxml = "^5.1.0" [tool.poetry.group.dev.dependencies] pytest = ">=7.3.2" @@ -41,6 +42,7 @@ mkdocstrings-python = ">=1.1.2" jupyterlab = ">=4.0.5" makim = "1.12.0" requests-cache = ">=1" +types-lxml = "^2023.10.21" [tool.pytest.ini_options] testpaths = [ diff --git a/src/pymedx/api.py b/src/pymedx/api.py index 4132b559..0f58515b 100644 --- a/src/pymedx/api.py +++ b/src/pymedx/api.py @@ -1,12 +1,13 @@ """API module for PubMed.""" import datetime import itertools -import xml.etree.ElementTree as xml from typing import Any, Dict, Iterable, List, Union, cast import requests +from lxml import etree as xml + from .article import PubMedArticle from .book import PubMedBookArticle from .helpers import batches diff --git a/src/pymedx/article.py b/src/pymedx/article.py index 12552520..3a66f750 100644 --- a/src/pymedx/article.py +++ b/src/pymedx/article.py @@ -3,12 +3,13 @@ import json from typing import Any, Dict, List, Optional, Union -from xml.etree.ElementTree import Element -from .helpers import getContent +from lxml.etree import _Element +from .helpers import getAllContent, getContent, getContentUnique -class PubMedArticle(object): + +class PubMedArticle: """Data class that contains a PubMed article.""" __slots__ = ( @@ -29,7 +30,7 @@ class PubMedArticle(object): def __init__( self, - xml_element: Optional[Element] = None, + xml_element: Optional[_Element] = None, *args: List[Any], **kwargs: Dict[Any, Any], ) -> None: @@ -46,15 +47,15 @@ def __init__( for field in self.__slots__: self.__setattr__(field, kwargs.get(field, None)) - def _extractPubMedId(self, xml_element: Element) -> Union[str, None, int]: - path = ".//ArticleId[@IdType='pubmed']" - return getContent(element=xml_element, path=path) + def _extractPubMedId(self, xml_element: _Element) -> Union[str, None, int]: + path = ".//PMID" + return getContentUnique(element=xml_element, path=path) - def _extractTitle(self, xml_element: Element) -> Union[str, None, int]: + def _extractTitle(self, xml_element: _Element) -> Union[str, None, int]: path = ".//ArticleTitle" - return getContent(element=xml_element, path=path) + return getAllContent(element=xml_element, path=path) - def _extractKeywords(self, xml_element: Element) -> List[Any]: + def _extractKeywords(self, xml_element: _Element) -> List[Any]: path = ".//Keyword" return [ keyword.text @@ -62,40 +63,40 @@ def _extractKeywords(self, xml_element: Element) -> List[Any]: if keyword is not None ] - def _extractJournal(self, xml_element: Element) -> Union[str, None, int]: + def _extractJournal(self, xml_element: _Element) -> Union[str, None, int]: path = ".//Journal/Title" return getContent(element=xml_element, path=path) - def _extractAbstract(self, xml_element: Element) -> Union[str, None, int]: + def _extractAbstract(self, xml_element: _Element) -> Union[str, None, int]: path = ".//AbstractText" - return getContent(element=xml_element, path=path) + return getAllContent(element=xml_element, path=path) def _extractConclusions( - self: object, xml_element: Element + self, xml_element: _Element ) -> Union[str, None, int]: path = ".//AbstractText[@Label='CONCLUSION']" return getContent(element=xml_element, path=path) - def _extractMethods(self, xml_element: Element) -> Union[str, None, int]: + def _extractMethods(self, xml_element: _Element) -> Union[str, None, int]: path = ".//AbstractText[@Label='METHOD']" return getContent(element=xml_element, path=path) - def _extractResults(self, xml_element: Element) -> Union[str, None, int]: + def _extractResults(self, xml_element: _Element) -> Union[str, None, int]: path = ".//AbstractText[@Label='RESULTS']" return getContent(element=xml_element, path=path) def _extractCopyrights( - self, xml_element: Element + self, xml_element: _Element ) -> Union[str, None, int]: path = ".//CopyrightInformation" return getContent(element=xml_element, path=path) - def _extractDoi(self, xml_element: Element) -> Union[str, None, int]: + def _extractDoi(self, xml_element: _Element) -> Union[str, None, int]: path = ".//ArticleId[@IdType='doi']" - return getContent(element=xml_element, path=path) + return getContentUnique(element=xml_element, path=path) def _extractPublicationDate( - self, xml_element: Element + self, xml_element: _Element ) -> Optional[datetime.date]: # Get the publication date @@ -121,7 +122,7 @@ def _extractPublicationDate( return None def _extractAuthors( - self, xml_element: Element + self, xml_element: _Element ) -> List[dict[str, Union[str, None, int]]]: return [ { @@ -135,7 +136,7 @@ def _extractAuthors( for author in xml_element.findall(".//Author") ] - def _initializeFromXML(self, xml_element: Element) -> None: + def _initializeFromXML(self, xml_element: _Element) -> None: """Parse an XML element into an article object.""" # Parse the different fields of the article self.pubmed_id = self._extractPubMedId(xml_element) @@ -162,7 +163,7 @@ def toJSON(self) -> str: { key: ( value - if not isinstance(value, (datetime.date, Element)) + if not isinstance(value, (datetime.date, _Element)) else str(value) ) for key, value in self.toDict().items() diff --git a/src/pymedx/book.py b/src/pymedx/book.py index a8f94de2..46eb5cfc 100644 --- a/src/pymedx/book.py +++ b/src/pymedx/book.py @@ -3,7 +3,8 @@ import json from typing import Any, Dict, List, Optional, Union -from xml.etree.ElementTree import Element + +from lxml.etree import _Element from .helpers import getContent @@ -29,7 +30,7 @@ class PubMedBookArticle: def __init__( self, - xml_element: Optional[Element] = None, + xml_element: Optional[_Element] = None, *args: List[str], **kwargs: Dict[Any, Any], ) -> None: @@ -47,71 +48,73 @@ def __init__( self.__setattr__(field, kwargs.get(field, None)) def _extractPubMedId( - self: object, xml_element: Element + self: object, xml_element: _Element ) -> Union[str, None, int]: path = ".//ArticleId[@IdType='pubmed']" return getContent(element=xml_element, path=path) def _extractTitle( - self: object, xml_element: Element + self: object, xml_element: _Element ) -> Union[str, None, int]: path = ".//BookTitle" return getContent(element=xml_element, path=path) def _extractAbstract( - self: object, xml_element: Element + self: object, xml_element: _Element ) -> Union[str, None, int]: path = ".//AbstractText" return getContent(element=xml_element, path=path) def _extractCopyrights( - self: object, xml_element: Element + self: object, xml_element: _Element ) -> Union[str, None, int]: path = ".//CopyrightInformation" return getContent(element=xml_element, path=path) def _extractDoi( - self: object, xml_element: Element + self: object, xml_element: _Element ) -> Union[str, None, int]: path = ".//ArticleId[@IdType='doi']" return getContent(element=xml_element, path=path) def _extractIsbn( - self: object, xml_element: Element + self: object, xml_element: _Element ) -> Union[str, None, int]: path = ".//Isbn" return getContent(element=xml_element, path=path) def _extractLanguage( - self: object, xml_element: Element + self: object, xml_element: _Element ) -> Union[str, None, int]: path = ".//Language" return getContent(element=xml_element, path=path) def _extractPublicationType( - self, xml_element: Element + self, xml_element: _Element ) -> Union[str, None, int]: path = ".//PublicationType" return getContent(element=xml_element, path=path) def _extractPublicationDate( - self, xml_element: Element + self, xml_element: _Element ) -> Union[str, None, int]: path = ".//PubDate/Year" return getContent(element=xml_element, path=path) - def _extractPublisher(self, xml_element: Element) -> Union[str, None, int]: + def _extractPublisher( + self, xml_element: _Element + ) -> Union[str, None, int]: path = ".//Publisher/PublisherName" return getContent(element=xml_element, path=path) def _extractPublisherLocation( - self, xml_element: Element + self, xml_element: _Element ) -> Union[str, None, int]: path = ".//Publisher/PublisherLocation" return getContent(element=xml_element, path=path) def _extractAuthors( - self: object, xml_element: Element + self: object, xml_element: _Element ) -> List[dict[str, Union[str, None, int]]]: return [ { @@ -124,7 +127,7 @@ def _extractAuthors( ] def _extractSections( - self, xml_element: Element + self, xml_element: _Element ) -> List[dict[str, Union[str, None, int]]]: return [ { @@ -136,7 +139,7 @@ def _extractSections( for section in xml_element.findall(".//Section") ] - def _initializeFromXML(self, xml_element: Element) -> None: + def _initializeFromXML(self, xml_element: _Element) -> None: """Parse an XML element into an article object.""" # Parse the different fields of the article self.pubmed_id = self._extractPubMedId(xml_element) diff --git a/src/pymedx/helpers.py b/src/pymedx/helpers.py index 55c5576b..60b050d5 100644 --- a/src/pymedx/helpers.py +++ b/src/pymedx/helpers.py @@ -1,6 +1,9 @@ """Module for helper functions.""" -from typing import Generator, List, Optional, Union -from xml.etree.ElementTree import Element +from typing import Generator, List, Optional, Union, cast + +import lxml.etree + +from lxml.etree import _Element def batches( @@ -31,7 +34,7 @@ def batches( def getContent( - element: Element, + element: _Element, path: str, default: Optional[str] = None, separator: str = "\n", @@ -62,3 +65,79 @@ def getContent( # Extract the text and return it return separator.join([sub.text for sub in result if sub.text is not None]) + + +def getContentUnique( + element: _Element, + path: str, + default: Optional[str] = None, +) -> Optional[Union[str, int]]: + """ + Retrieve text content of an XML element. Returns a unique value. + + Parameters + ---------- + element: Element + the XML element to parse. + path: Str + Nested path in the XML element. + default: Str + default value to return when no text is found. + + Returns + ------- + text: Str + text in the XML node. + """ + # Find the path in the element + result = cast(List[_Element], element.findall(path)) + + # Return the default if there is no such element + if not result: + return default + + # Extract the text and return it + return cast(str, result[0].text) + + +def getAllContent( + element: _Element, + path: str, + default: Optional[str] = None, +) -> Optional[Union[str, int]]: + """ + Retrieve text content of an XML element. + + Return all the text inside the path and omit XML tags inside. + + Parameters + ---------- + element: Element + the XML element to parse. + path: Str + Nested path in the XML element. + default: Str + default value to return when no text is found. + + Returns + ------- + text: str + text in the XML node. + """ + # Find the path in the element + raw_result = element.findall(path) + + # Return the default if there is no such element + if not raw_result: + return default + + # Get all text avoiding the tags + result = cast( + str, + lxml.etree.tostring( + raw_result[0], method="text", encoding="utf-8" + ).decode("utf-8"), + ) + + # Extract the text and return it + return result