From ed0a145a8cacef5867b784d57605ba41f40105ea Mon Sep 17 00:00:00 2001 From: John Stewart Date: Fri, 10 Jan 2020 21:43:14 -0500 Subject: [PATCH] Doctests for Dependency Tree and more properties on Doc (#33) * adapted the dependency tree class to CLTKv1 * working on not reloading the full stanford pipeline on every process call * cache stanford NLP objects in wrapper class * fixed doctests for Stanford process * automatically reformatted files * implemented the pipeline pattern, and extraction of sentential structure of input text in the Stanford process * implemented true pipelines and sentence extraction for stanfordNLP * moved code out of __init__.py * added doctests and properties of Doc * fixed trailing whitespace * repaired governor and parent references in word; created a core package for essential classes; normalized POS and morpho features attributes of words * interrupted infinite recursion in parent token --- poetry.lock | 124 +++++++++---------- src/cltkv1/core/__init__.py | 2 + src/cltkv1/{utils => core}/data_types.py | 90 +++++++++----- src/cltkv1/{utils => core}/exceptions.py | 18 +-- src/cltkv1/dependency/tree.py | 102 ++++++++------- src/cltkv1/languages/glottolog.py | 2 +- src/cltkv1/{utils => languages}/pipelines.py | 12 +- src/cltkv1/languages/utils.py | 8 +- src/cltkv1/nlp.py | 38 +++--- src/cltkv1/tokenizers/word.py | 11 +- src/cltkv1/utils/__init__.py | 1 - src/cltkv1/utils/example_texts.py | 6 +- src/cltkv1/wrappers/stanford.py | 54 ++++---- 13 files changed, 259 insertions(+), 209 deletions(-) create mode 100644 src/cltkv1/core/__init__.py rename src/cltkv1/{utils => core}/data_types.py (65%) rename src/cltkv1/{utils => core}/exceptions.py (59%) rename src/cltkv1/{utils => languages}/pipelines.py (92%) diff --git a/poetry.lock b/poetry.lock index 503c851..fb1dd74 100644 --- a/poetry.lock +++ b/poetry.lock @@ -266,7 +266,7 @@ marker = "python_version < \"3.8\"" name = "importlib-metadata" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" -version = "1.3.0" +version = "1.4.0" [package.dependencies] zipp = ">=0.5" @@ -452,7 +452,7 @@ description = "Node.js virtual environment builder" name = "nodeenv" optional = false python-versions = "*" -version = "1.3.3" +version = "1.3.4" [[package]] category = "dev" @@ -468,7 +468,7 @@ description = "NumPy is the fundamental package for array computing with Python." name = "numpy" optional = false python-versions = ">=3.5" -version = "1.18.0" +version = "1.18.1" [[package]] category = "dev" @@ -695,8 +695,8 @@ category = "main" description = "YAML parser and emitter for Python" name = "pyyaml" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "5.2" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +version = "5.3" [[package]] category = "main" description = "Alternative regular expression module, to replace re." 
name = "regex" optional = false python-versions = "*" -version = "2019.12.20" +version = "2020.1.8" [[package]] category = "main" @@ -1191,8 +1191,8 @@ imagesize = [ {file = "imagesize-1.2.0.tar.gz", hash = "sha256:b1f6b5a4eab1f73479a50fb79fcf729514a900c341d8503d62a62dbc4127a2b1"}, ] importlib-metadata = [ - {file = "importlib_metadata-1.3.0-py2.py3-none-any.whl", hash = "sha256:d95141fbfa7ef2ec65cfd945e2af7e5a6ddbd7c8d9a25e66ff3be8e3daf9f60f"}, - {file = "importlib_metadata-1.3.0.tar.gz", hash = "sha256:073a852570f92da5f744a3472af1b61e28e9f78ccf0c9117658dc32b15de7b45"}, + {file = "importlib_metadata-1.4.0-py2.py3-none-any.whl", hash = "sha256:bdd9b7c397c273bcc9a11d6629a38487cd07154fa255a467bf704cd2c258e359"}, + {file = "importlib_metadata-1.4.0.tar.gz", hash = "sha256:f17c015735e1a88296994c0697ecea7e11db24290941983b08c9feb30921e6d8"}, ] ipython = [ {file = "ipython-7.11.1-py3-none-any.whl", hash = "sha256:387686dd7fc9caf29d2fddcf3116c4b07a11d9025701d220c589a430b0171d8a"}, @@ -1325,7 +1325,7 @@ nltk = [ {file = "nltk-3.4.5.zip", hash = "sha256:bed45551259aa2101381bbdd5df37d44ca2669c5c3dad72439fa459b29137d94"}, ] nodeenv = [ - {file = "nodeenv-1.3.3.tar.gz", hash = "sha256:ad8259494cf1c9034539f6cced78a1da4840a4b157e23640bc4a0c0546b0cb7a"}, + {file = "nodeenv-1.3.4-py2.py3-none-any.whl", hash = "sha256:561057acd4ae3809e665a9aaaf214afff110bbb6a6d5c8a96121aea6878408b3"}, ] nose = [ {file = "nose-1.3.7-py2-none-any.whl", hash = "sha256:dadcddc0aefbf99eea214e0f1232b94f2fa9bd98fa8353711dacb112bfcbbb2a"}, @@ -1333,27 +1333,27 @@ nose = [ {file = "nose-1.3.7.tar.gz", hash = "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98"}, ] numpy = [ - {file = "numpy-1.18.0-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:b091e5d4cbbe79f0e8b6b6b522346e54a282eadb06e3fd761e9b6fafc2ca91ad"}, - {file = "numpy-1.18.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:443ab93fc35b31f01db8704681eb2fd82f3a1b2fa08eed2dd0e71f1f57423d4a"}, - {file = "numpy-1.18.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:88c5ccbc4cadf39f32193a5ef22e3f84674418a9fd877c63322917ae8f295a56"}, - {file = "numpy-1.18.0-cp35-cp35m-win32.whl", hash = "sha256:e1080e37c090534adb2dd7ae1c59ee883e5d8c3e63d2a4d43c20ee348d0459c5"}, - {file = "numpy-1.18.0-cp35-cp35m-win_amd64.whl", hash = "sha256:f084d513de729ff10cd72a1f80db468cff464fedb1ef2fea030221a0f62d7ff4"}, - {file = "numpy-1.18.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:1baefd1fb4695e7f2e305467dbd876d765e6edd30c522894df76f8301efaee36"}, - {file = "numpy-1.18.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:cc070fc43a494e42732d6ae2f6621db040611c1dde64762a40c8418023af56d7"}, - {file = "numpy-1.18.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:6f8113c8dbfc192b58996ee77333696469ea121d1c44ea429d8fd266e4c6be51"}, - {file = "numpy-1.18.0-cp36-cp36m-win32.whl", hash = "sha256:a30f5c3e1b1b5d16ec1f03f4df28e08b8a7529d8c920bbed657f4fde61f1fbcd"}, - {file = "numpy-1.18.0-cp36-cp36m-win_amd64.whl", hash = "sha256:3c68c827689ca0ca713dba598335073ce0966850ec0b30715527dce4ecd84055"}, - {file = "numpy-1.18.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:f6a7421da632fc01e8a3ecd19c3f7350258d82501a646747664bae9c6a87c731"}, - {file = "numpy-1.18.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:905cd6fa6ac14654a6a32b21fad34670e97881d832e24a3ca32e19b455edb4a8"}, - {file = "numpy-1.18.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:854f6ed4fa91fa6da5d764558804ba5b0f43a51e5fe9fc4fdc93270b052f188a"}, - {file = "numpy-1.18.0-cp37-cp37m-win32.whl", hash = 
"sha256:ac3cf835c334fcc6b74dc4e630f9b5ff7b4c43f7fb2a7813208d95d4e10b5623"}, - {file = "numpy-1.18.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62506e9e4d2a39c87984f081a2651d4282a1d706b1a82fe9d50a559bb58e705a"}, - {file = "numpy-1.18.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9d6de2ad782aae68f7ed0e0e616477fbf693d6d7cc5f0f1505833ff12f84a673"}, - {file = "numpy-1.18.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1c35fb1131362e6090d30286cfda52ddd42e69d3e2bf1fea190a0fad83ea3a18"}, - {file = "numpy-1.18.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:56710a756c5009af9f35b91a22790701420406d9ac24cf6b652b0e22cfbbb7ff"}, - {file = "numpy-1.18.0-cp38-cp38-win32.whl", hash = "sha256:03bbde29ac8fba860bb2c53a1525b3604a9b60417855ac3119d89868ec6041c3"}, - {file = "numpy-1.18.0-cp38-cp38-win_amd64.whl", hash = "sha256:712f0c32555132f4b641b918bdb1fd3c692909ae916a233ce7f50eac2de87e37"}, - {file = "numpy-1.18.0.zip", hash = "sha256:a9d72d9abaf65628f0f31bbb573b7d9304e43b1e6bbae43149c17737a42764c4"}, + {file = "numpy-1.18.1-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:20b26aaa5b3da029942cdcce719b363dbe58696ad182aff0e5dcb1687ec946dc"}, + {file = "numpy-1.18.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:70a840a26f4e61defa7bdf811d7498a284ced303dfbc35acb7be12a39b2aa121"}, + {file = "numpy-1.18.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:17aa7a81fe7599a10f2b7d95856dc5cf84a4eefa45bc96123cbbc3ebc568994e"}, + {file = "numpy-1.18.1-cp35-cp35m-win32.whl", hash = "sha256:f3d0a94ad151870978fb93538e95411c83899c9dc63e6fb65542f769568ecfa5"}, + {file = "numpy-1.18.1-cp35-cp35m-win_amd64.whl", hash = "sha256:1786a08236f2c92ae0e70423c45e1e62788ed33028f94ca99c4df03f5be6b3c6"}, + {file = "numpy-1.18.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ae0975f42ab1f28364dcda3dde3cf6c1ddab3e1d4b2909da0cb0191fa9ca0480"}, + {file = "numpy-1.18.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:cf7eb6b1025d3e169989416b1adcd676624c2dbed9e3bcb7137f51bfc8cc2572"}, + {file = "numpy-1.18.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:b765ed3930b92812aa698a455847141869ef755a87e099fddd4ccf9d81fffb57"}, + {file = "numpy-1.18.1-cp36-cp36m-win32.whl", hash = "sha256:2d75908ab3ced4223ccba595b48e538afa5ecc37405923d1fea6906d7c3a50bc"}, + {file = "numpy-1.18.1-cp36-cp36m-win_amd64.whl", hash = "sha256:9acdf933c1fd263c513a2df3dceecea6f3ff4419d80bf238510976bf9bcb26cd"}, + {file = "numpy-1.18.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:56bc8ded6fcd9adea90f65377438f9fea8c05fcf7c5ba766bef258d0da1554aa"}, + {file = "numpy-1.18.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e422c3152921cece8b6a2fb6b0b4d73b6579bd20ae075e7d15143e711f3ca2ca"}, + {file = "numpy-1.18.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:b3af02ecc999c8003e538e60c89a2b37646b39b688d4e44d7373e11c2debabec"}, + {file = "numpy-1.18.1-cp37-cp37m-win32.whl", hash = "sha256:d92350c22b150c1cae7ebb0ee8b5670cc84848f6359cf6b5d8f86617098a9b73"}, + {file = "numpy-1.18.1-cp37-cp37m-win_amd64.whl", hash = "sha256:77c3bfe65d8560487052ad55c6998a04b654c2fbc36d546aef2b2e511e760971"}, + {file = "numpy-1.18.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c98c5ffd7d41611407a1103ae11c8b634ad6a43606eca3e2a5a269e5d6e8eb07"}, + {file = "numpy-1.18.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:9537eecf179f566fd1c160a2e912ca0b8e02d773af0a7a1120ad4f7507cd0d26"}, + {file = "numpy-1.18.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:e840f552a509e3380b0f0ec977e8124d0dc34dc0e68289ca28f4d7c1d0d79474"}, + {file = "numpy-1.18.1-cp38-cp38-win32.whl", hash = 
"sha256:590355aeade1a2eaba17617c19edccb7db8d78760175256e3cf94590a1a964f3"}, + {file = "numpy-1.18.1-cp38-cp38-win_amd64.whl", hash = "sha256:39d2c685af15d3ce682c99ce5925cc66efc824652e10990d2462dfe9b8918c6a"}, + {file = "numpy-1.18.1.zip", hash = "sha256:b6ff59cee96b454516e47e7721098e6ceebef435e3e21ac2d6c3b8b02628eb77"}, ] packaging = [ {file = "packaging-20.0-py2.py3-none-any.whl", hash = "sha256:aec3fdbb8bc9e4bb65f0634b9f551ced63983a529d6a8931817d52fdd0816ddb"}, @@ -1485,40 +1485,40 @@ pyuca = [ {file = "pyuca-1.2.tar.gz", hash = "sha256:8a382fe74627f08c0d18908c0713ca4a20aad5385f077579e56208beea2893b2"}, ] pyyaml = [ - {file = "PyYAML-5.2-cp27-cp27m-win32.whl", hash = "sha256:35ace9b4147848cafac3db142795ee42deebe9d0dad885ce643928e88daebdcc"}, - {file = "PyYAML-5.2-cp27-cp27m-win_amd64.whl", hash = "sha256:ebc4ed52dcc93eeebeae5cf5deb2ae4347b3a81c3fa12b0b8c976544829396a4"}, - {file = "PyYAML-5.2-cp35-cp35m-win32.whl", hash = "sha256:38a4f0d114101c58c0f3a88aeaa44d63efd588845c5a2df5290b73db8f246d15"}, - {file = "PyYAML-5.2-cp35-cp35m-win_amd64.whl", hash = "sha256:483eb6a33b671408c8529106df3707270bfacb2447bf8ad856a4b4f57f6e3075"}, - {file = "PyYAML-5.2-cp36-cp36m-win32.whl", hash = "sha256:7f38e35c00e160db592091751d385cd7b3046d6d51f578b29943225178257b31"}, - {file = "PyYAML-5.2-cp36-cp36m-win_amd64.whl", hash = "sha256:0e7f69397d53155e55d10ff68fdfb2cf630a35e6daf65cf0bdeaf04f127c09dc"}, - {file = "PyYAML-5.2-cp37-cp37m-win32.whl", hash = "sha256:e4c015484ff0ff197564917b4b4246ca03f411b9bd7f16e02a2f586eb48b6d04"}, - {file = "PyYAML-5.2-cp37-cp37m-win_amd64.whl", hash = "sha256:4b6be5edb9f6bb73680f5bf4ee08ff25416d1400fbd4535fe0069b2994da07cd"}, - {file = "PyYAML-5.2-cp38-cp38-win32.whl", hash = "sha256:8100c896ecb361794d8bfdb9c11fce618c7cf83d624d73d5ab38aef3bc82d43f"}, - {file = "PyYAML-5.2-cp38-cp38-win_amd64.whl", hash = "sha256:2e9f0b7c5914367b0916c3c104a024bb68f269a486b9d04a2e8ac6f6597b7803"}, - {file = "PyYAML-5.2.tar.gz", hash = "sha256:c0ee8eca2c582d29c3c2ec6e2c4f703d1b7f1fb10bc72317355a746057e7346c"}, + {file = "PyYAML-5.3-cp27-cp27m-win32.whl", hash = "sha256:940532b111b1952befd7db542c370887a8611660d2b9becff75d39355303d82d"}, + {file = "PyYAML-5.3-cp27-cp27m-win_amd64.whl", hash = "sha256:059b2ee3194d718896c0ad077dd8c043e5e909d9180f387ce42012662a4946d6"}, + {file = "PyYAML-5.3-cp35-cp35m-win32.whl", hash = "sha256:4fee71aa5bc6ed9d5f116327c04273e25ae31a3020386916905767ec4fc5317e"}, + {file = "PyYAML-5.3-cp35-cp35m-win_amd64.whl", hash = "sha256:dbbb2379c19ed6042e8f11f2a2c66d39cceb8aeace421bfc29d085d93eda3689"}, + {file = "PyYAML-5.3-cp36-cp36m-win32.whl", hash = "sha256:e3a057b7a64f1222b56e47bcff5e4b94c4f61faac04c7c4ecb1985e18caa3994"}, + {file = "PyYAML-5.3-cp36-cp36m-win_amd64.whl", hash = "sha256:74782fbd4d4f87ff04159e986886931456a1894c61229be9eaf4de6f6e44b99e"}, + {file = "PyYAML-5.3-cp37-cp37m-win32.whl", hash = "sha256:24521fa2890642614558b492b473bee0ac1f8057a7263156b02e8b14c88ce6f5"}, + {file = "PyYAML-5.3-cp37-cp37m-win_amd64.whl", hash = "sha256:1cf708e2ac57f3aabc87405f04b86354f66799c8e62c28c5fc5f88b5521b2dbf"}, + {file = "PyYAML-5.3-cp38-cp38-win32.whl", hash = "sha256:70024e02197337533eef7b85b068212420f950319cc8c580261963aefc75f811"}, + {file = "PyYAML-5.3-cp38-cp38-win_amd64.whl", hash = "sha256:cb1f2f5e426dc9f07a7681419fe39cee823bb74f723f36f70399123f439e9b20"}, + {file = "PyYAML-5.3.tar.gz", hash = "sha256:e9f45bd5b92c7974e59bcd2dcc8631a6b6cc380a904725fce7bc08872e691615"}, ] regex = [ - {file = "regex-2019.12.20-cp27-cp27m-win32.whl", hash = 
"sha256:7bbbdbada3078dc360d4692a9b28479f569db7fc7f304b668787afc9feb38ec8"}, - {file = "regex-2019.12.20-cp27-cp27m-win_amd64.whl", hash = "sha256:a83049eb717ae828ced9cf607845929efcb086a001fc8af93ff15c50012a5716"}, - {file = "regex-2019.12.20-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:27d1bd20d334f50b7ef078eba0f0756a640fd25f5f1708d3b5bed18a5d6bced9"}, - {file = "regex-2019.12.20-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:1768cf42a78a11dae63152685e7a1d90af7a8d71d2d4f6d2387edea53a9e0588"}, - {file = "regex-2019.12.20-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:4850c78b53acf664a6578bba0e9ebeaf2807bb476c14ec7e0f936f2015133cae"}, - {file = "regex-2019.12.20-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:78b3712ec529b2a71731fbb10b907b54d9c53a17ca589b42a578bc1e9a2c82ea"}, - {file = "regex-2019.12.20-cp36-cp36m-win32.whl", hash = "sha256:8d9ef7f6c403e35e73b7fc3cde9f6decdc43b1cb2ff8d058c53b9084bfcb553e"}, - {file = "regex-2019.12.20-cp36-cp36m-win_amd64.whl", hash = "sha256:faad39fdbe2c2ccda9846cd21581063086330efafa47d87afea4073a08128656"}, - {file = "regex-2019.12.20-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:adc35d38952e688535980ae2109cad3a109520033642e759f987cf47fe278aa1"}, - {file = "regex-2019.12.20-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:ef0b828a7e22e58e06a1cceddba7b4665c6af8afeb22a0d8083001330572c147"}, - {file = "regex-2019.12.20-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:0e6cf1e747f383f52a0964452658c04300a9a01e8a89c55ea22813931b580aa8"}, - {file = "regex-2019.12.20-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:032fdcc03406e1a6485ec09b826eac78732943840c4b29e503b789716f051d8d"}, - {file = "regex-2019.12.20-cp37-cp37m-win32.whl", hash = "sha256:77ae8d926f38700432807ba293d768ba9e7652df0cbe76df2843b12f80f68885"}, - {file = "regex-2019.12.20-cp37-cp37m-win_amd64.whl", hash = "sha256:c29a77ad4463f71a506515d9ec3a899ed026b4b015bf43245c919ff36275444b"}, - {file = "regex-2019.12.20-cp38-cp38-manylinux1_i686.whl", hash = "sha256:57eacd38a5ec40ed7b19a968a9d01c0d977bda55664210be713e750dd7b33540"}, - {file = "regex-2019.12.20-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:724eb24b92fc5fdc1501a1b4df44a68b9c1dda171c8ef8736799e903fb100f63"}, - {file = "regex-2019.12.20-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:d508875793efdf6bab3d47850df8f40d4040ae9928d9d80864c1768d6aeaf8e3"}, - {file = "regex-2019.12.20-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:cfd31b3300fefa5eecb2fe596c6dee1b91b3a05ece9d5cfd2631afebf6c6fadd"}, - {file = "regex-2019.12.20-cp38-cp38-win32.whl", hash = "sha256:29b20f66f2e044aafba86ecf10a84e611b4667643c42baa004247f5dfef4f90b"}, - {file = "regex-2019.12.20-cp38-cp38-win_amd64.whl", hash = "sha256:d3ee0b035816e0520fac928de31b6572106f0d75597f6fa3206969a02baba06f"}, - {file = "regex-2019.12.20.tar.gz", hash = "sha256:106e25a841921d8259dcef2a42786caae35bc750fb996f830065b3dfaa67b77e"}, + {file = "regex-2020.1.8-cp27-cp27m-win32.whl", hash = "sha256:4e8f02d3d72ca94efc8396f8036c0d3bcc812aefc28ec70f35bb888c74a25161"}, + {file = "regex-2020.1.8-cp27-cp27m-win_amd64.whl", hash = "sha256:e6c02171d62ed6972ca8631f6f34fa3281d51db8b326ee397b9c83093a6b7242"}, + {file = "regex-2020.1.8-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:4eae742636aec40cf7ab98171ab9400393360b97e8f9da67b1867a9ee0889b26"}, + {file = "regex-2020.1.8-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:bd25bb7980917e4e70ccccd7e3b5740614f1c408a642c245019cff9d7d1b6149"}, + {file = "regex-2020.1.8-cp36-cp36m-manylinux2010_i686.whl", hash = 
"sha256:3e77409b678b21a056415da3a56abfd7c3ad03da71f3051bbcdb68cf44d3c34d"}, + {file = "regex-2020.1.8-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:07b39bf943d3d2fe63d46281d8504f8df0ff3fe4c57e13d1656737950e53e525"}, + {file = "regex-2020.1.8-cp36-cp36m-win32.whl", hash = "sha256:23e2c2c0ff50f44877f64780b815b8fd2e003cda9ce817a7fd00dea5600c84a0"}, + {file = "regex-2020.1.8-cp36-cp36m-win_amd64.whl", hash = "sha256:27429b8d74ba683484a06b260b7bb00f312e7c757792628ea251afdbf1434003"}, + {file = "regex-2020.1.8-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:0e182d2f097ea8549a249040922fa2b92ae28be4be4895933e369a525ba36576"}, + {file = "regex-2020.1.8-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e3cd21cc2840ca67de0bbe4071f79f031c81418deb544ceda93ad75ca1ee9f7b"}, + {file = "regex-2020.1.8-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:ecc6de77df3ef68fee966bb8cb4e067e84d4d1f397d0ef6fce46913663540d77"}, + {file = "regex-2020.1.8-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:26ff99c980f53b3191d8931b199b29d6787c059f2e029b2b0c694343b1708c35"}, + {file = "regex-2020.1.8-cp37-cp37m-win32.whl", hash = "sha256:7bcd322935377abcc79bfe5b63c44abd0b29387f267791d566bbb566edfdd146"}, + {file = "regex-2020.1.8-cp37-cp37m-win_amd64.whl", hash = "sha256:10671601ee06cf4dc1bc0b4805309040bb34c9af423c12c379c83d7895622bb5"}, + {file = "regex-2020.1.8-cp38-cp38-manylinux1_i686.whl", hash = "sha256:98b8ed7bb2155e2cbb8b76f627b2fd12cf4b22ab6e14873e8641f266e0fb6d8f"}, + {file = "regex-2020.1.8-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:6a6ba91b94427cd49cd27764679024b14a96874e0dc638ae6bdd4b1a3ce97be1"}, + {file = "regex-2020.1.8-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:6a6ae17bf8f2d82d1e8858a47757ce389b880083c4ff2498dba17c56e6c103b9"}, + {file = "regex-2020.1.8-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:0932941cdfb3afcbc26cc3bcf7c3f3d73d5a9b9c56955d432dbf8bbc147d4c5b"}, + {file = "regex-2020.1.8-cp38-cp38-win32.whl", hash = "sha256:d58e4606da2a41659c84baeb3cfa2e4c87a74cec89a1e7c56bee4b956f9d7461"}, + {file = "regex-2020.1.8-cp38-cp38-win_amd64.whl", hash = "sha256:e7c7661f7276507bce416eaae22040fd91ca471b5b33c13f8ff21137ed6f248c"}, + {file = "regex-2020.1.8.tar.gz", hash = "sha256:d0f424328f9822b0323b3b6f2e4b9c90960b24743d220763c7f07071e0778351"}, ] requests = [ {file = "requests-2.22.0-py2.py3-none-any.whl", hash = "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31"}, diff --git a/src/cltkv1/core/__init__.py b/src/cltkv1/core/__init__.py new file mode 100644 index 0000000..4f02e41 --- /dev/null +++ b/src/cltkv1/core/__init__.py @@ -0,0 +1,2 @@ +from .data_types import * +from .exceptions import * diff --git a/src/cltkv1/utils/data_types.py b/src/cltkv1/core/data_types.py similarity index 65% rename from src/cltkv1/utils/data_types.py rename to src/cltkv1/core/data_types.py index e7b4741..9821121 100644 --- a/src/cltkv1/utils/data_types.py +++ b/src/cltkv1/core/data_types.py @@ -2,15 +2,15 @@ of the NLP pipeline. 
->>> from cltkv1.utils.data_types import Language ->>> from cltkv1.utils.data_types import Word ->>> from cltkv1.utils.data_types import Process ->>> from cltkv1.utils.data_types import Doc ->>> from cltkv1.utils.data_types import Pipeline +>>> from cltkv1.core.data_types import Language +>>> from cltkv1.core.data_types import Word +>>> from cltkv1.core.data_types import Process +>>> from cltkv1.core.data_types import Doc +>>> from cltkv1.core.data_types import Pipeline """ from dataclasses import dataclass -from typing import Any, Callable, List, Type, Union +from typing import Any, Callable, Dict, List, Type, Union @dataclass @@ -20,7 +20,7 @@ class Language: ``cltkv1.lagnuages.glottolog.LANGUAGES`` May be extended by user for dialects or languages not documented by ISO 639-3. - >>> from cltkv1.utils.data_types import Language + >>> from cltkv1.core.data_types import Language >>> from cltkv1.languages.utils import get_lang >>> latin = get_lang("lat") >>> isinstance(latin, Language) @@ -46,14 +46,14 @@ class Word: """Contains attributes of each processed word in a list of words. Designed to be used in the ``Doc.words`` dataclass. - >>> from cltkv1.utils.data_types import Word + >>> from cltkv1.core.data_types import Word >>> from cltkv1.utils.example_texts import get_example_text >>> get_example_text("lat")[:25] 'Gallia est omnis divisa i' >>> from cltkv1.languages.utils import get_lang >>> latin = get_lang("lat") >>> Word(index_char_start=0, index_char_stop=6, index_token=0, string=get_example_text("lat")[0:6], pos="nom") - Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Gallia', pos='nom', lemma=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, parent_token=None, feats=None) + Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Gallia', pos='nom', lemma=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, parent=None, features=None) """ index_char_start: int = None @@ -67,9 +67,9 @@ class Word: xpos: str = None # treebank-specific POS tag (from stanfordnlp) upos: str = None # universal POS tag (from stanfordnlp) dependency_relation: str = None # (from stanfordnlp) - governor: str = None # (from stanfordnlp) - parent_token: str = None # (from stanfordnlp) - feats: str = None # morphological features (from stanfordnlp) + governor: "Word" = None + parent: "Word" = None + features: Dict[str, str] = None # morphological features (from stanfordnlp) @dataclass @@ -89,38 +89,66 @@ class Doc: True """ - indices_sentences: List[List[int]] = None - indices_tokens: List[List[int]] = None language: str = None words: List[Word] = None - pipeline: List["Process"] = None + pipeline: "Pipeline" = None raw: str = None @property - def sentences(self): - return [ - [self.words[token_index] for token_index in sentence] - for sentence in self.indices_tokens - ] + def sentences(self) -> List[List[Word]]: + sentences = {} + for word in self.words: + sentence = sentences.get(word.index_sentence, {}) + sentence[word.index_token] = word + sentences[word.index_sentence] = sentence - @property - def tokens_list(self) -> List[str]: - """Returns a list of string word tokens. + sorted_values = lambda dict: [x[1] for x in sorted(dict.items())] + + return [sorted_values(sentence) for sentence in sorted_values(sentences)] - TODO: Why does ``Doc.tokens`` fail? 
+ def _get_words_attribute(self, attribute): + return [getattr(word, attribute) for word in self.words] + + @property + def tokens(self) -> List[str]: + """Returns a list of string word tokens of all words in the doc. >>> from cltkv1 import NLP >>> from cltkv1.utils.example_texts import get_example_text >>> cltk_nlp = NLP(language="lat") - >>> cltk_nlp.language.name - 'Latin' - >>> isinstance(cltk_nlp.language, Language) - True >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat")) - >>> cltk_doc.tokens_list[:10] + >>> cltk_doc.tokens[:10] ['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres', ',', 'quarum', 'unam'] """ - return [word_obj.string for word_obj in self.words] + return self._get_words_attribute("string") + + @property + def pos(self) -> List[str]: + """Returns a list of the POS tags of all words in the doc. + + >>> from cltkv1 import NLP + >>> from cltkv1.utils.example_texts import get_example_text + >>> cltk_nlp = NLP(language="lat") + >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat")) + >>> cltk_doc.pos[:3] + ['NOUN', 'AUX', 'DET'] + """ + return self._get_words_attribute("upos") + + @property + def morphosyntactic_features(self) -> List[Dict[str, str]]: + """Returns a list of dictionaries containing the morphosyntactic features + of each word (when available). + Each dictionary specifies feature names as keys and feature values as values. + + >>> from cltkv1 import NLP + >>> from cltkv1.utils.example_texts import get_example_text + >>> cltk_nlp = NLP(language="lat") + >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat")) + >>> cltk_doc.morphosyntactic_features[:3] + [{'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, {'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing', 'PronType': 'Ind'}] + """ + return self._get_words_attribute("features") @dataclass @@ -164,7 +192,7 @@ class Pipeline: # TODO: Consider adding a Unicode normalization as a default first Process - >>> from cltkv1.utils.data_types import Process, Pipeline + >>> from cltkv1.core.data_types import Process, Pipeline >>> from cltkv1.languages.utils import get_lang >>> from cltkv1.tokenizers import LatinTokenizationProcess >>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat")) diff --git a/src/cltkv1/utils/exceptions.py b/src/cltkv1/core/exceptions.py similarity index 59% rename from src/cltkv1/utils/exceptions.py rename to src/cltkv1/core/exceptions.py index 28f4895..f305734 100644 --- a/src/cltkv1/utils/exceptions.py +++ b/src/cltkv1/core/exceptions.py @@ -4,13 +4,13 @@ class CLTKException(Exception): """Exception class for the ``cltkv1`` library. - >>> from cltkv1.utils.exceptions import CLTKException + >>> from cltkv1.core.exceptions import CLTKException >>> raise CLTKException Traceback (most recent call last): ... - File "", line 1, in + File "", line 1, in raise CLTKException - cltkv1.utils.exceptions.CLTKException + cltkv1.core.exceptions.CLTKException """ @@ -18,13 +18,13 @@ class UnimplementedLanguageError(CLTKException): """Exception for when a language is supported by the CLTK however a particular process is not available for that language. - >>> from cltkv1.utils.exceptions import UnimplementedLanguageError + >>> from cltkv1.core.exceptions import UnimplementedLanguageError >>> raise UnimplementedLanguageError Traceback (most recent call last): ... 
- File "", line 1, in + File "", line 1, in raise UnimplementedLanguageError - cltkv1.utils.exceptions.UnimplementedLanguageError + cltkv1.core.exceptions.UnimplementedLanguageError """ @@ -34,11 +34,11 @@ class UnknownLanguageError(CLTKException): TODO: Mk separate exceptions for unknown lang vs unimplemented process for a known lang - >>> from cltkv1.utils.exceptions import UnknownLanguageError + >>> from cltkv1.core.exceptions import UnknownLanguageError >>> raise UnknownLanguageError Traceback (most recent call last): ... - File "", line 1, in + File "", line 1, in raise UnknownLanguageError - cltkv1.utils.exceptions.UnknownLanguageError + cltkv1.core.exceptions.UnknownLanguageError """ diff --git a/src/cltkv1/dependency/tree.py b/src/cltkv1/dependency/tree.py index 019e275..0c72b68 100644 --- a/src/cltkv1/dependency/tree.py +++ b/src/cltkv1/dependency/tree.py @@ -5,7 +5,7 @@ from typing import List, Union from xml.etree.ElementTree import Element, ElementTree -from cltkv1.utils.data_types import Doc, Process, Word +from cltkv1.core.data_types import Doc, Process, Word class Form(Element): @@ -69,17 +69,17 @@ def get_dependencies(self, relation: str) -> List["Dependency"]: """Extract dependents of this form for the specified dependency relation. - TODO: Add doctest for ``Form.get_dependencies()`` - >>> john = Form('John', 1) / 'NNP' - >>> john - John_1/NNP >>> loves = Form('loves', 2) / 'VRB' - >>> loves - loves_2/VRB >>> mary = Form('Mary', 3) / 'NNP' - >>> mary - Mary_3/NNP + >>> loves >> john | 'subj' + subj(loves_2/VRB, John_1/NNP) + >>> loves >> mary | 'obj' + obj(loves_2/VRB, Mary_3/NNP) + >>> loves.get_dependencies('subj') + [subj(loves_2/VRB, John_1/NNP)] + >>> loves.get_dependencies('obj') + [obj(loves_2/VRB, Mary_3/NNP)] """ deps = self.findall('*[@relation="{}"]'.format(relation)) return [Dependency(self, dep, relation) for dep in deps] @@ -99,12 +99,15 @@ def full_str(self, include_relation=True) -> str: The ID is attached to the text, and the relation is optionally suppressed. - TODO: Make this test more meaningful. KJ couldn't get the ``desc_form.full_str()`` to equal the target. + >>> loves = Form('loves', 2) / 'VRB' + >>> loves.full_str() + 'loves_2 [pos=VRB]' + >>> john = Form('John', 1) / 'NNP' + >>> loves >> john | 'subj' + subj(loves_2/VRB, John_1/NNP) + >>> john.full_str(True) + 'John_1 [pos=NNP,relation=subj]' - >>> f = Form - >>> desc_form = f('described') - >>> type(desc_form.full_str()) - """ excluded = ["form_id", "relation"] if not include_relation else ["form_id"] return "{0}_{1} [{2}]".format( @@ -141,19 +144,24 @@ def to_form(word: Word) -> "Form": form.set("upos", word.upos) form.set("xpos", word.xpos) - if word.feats != "_": - for f in word.feats.split("|"): - feature = f.split("=") - form.set(feature[0], feature[1]) + for (feature_name, feature_value) in word.features.items(): + form.set(feature_name, feature_value) return form class Dependency: - """The relationship (or edge) between a hierarchical - and subordinate Node. + """The asymmetric binary relationship (or edge) between a governing + Form (the "head") and a subordinate Form (the "dependent"). + + In principle the relationship could capture any form-to-form relation + that the systems deems of interest, be it syntactic, semantic, or discursive. - TODO: Explain this better. + If the `relation` attribute is not speficied, then the dependency simply states + that there's some asymmetric relationship between the head and the dependenent. + This is an *untyped* dependency. 
+ + For a *typed* dependency, a string value is supplied for the `relation` attribute. """ def __init__(self, head: Form, dep: Form, relation: str = None) -> None: @@ -182,43 +190,43 @@ def __init__(self, root: Form) -> None: ElementTree.__init__(self, root) - def _get_deps(self, node: Form, deps: List[Dependency]) -> List[Dependency]: - """ - TODO: Add docstring and doctests - TODO: What is difference btw this and ``DependencyTree.get_dependencies()``? - """ - for child_node in list(node): - deps = self._get_deps(child_node, deps) - deps.extend(node.get_dependencies(child_node("relation"))) - return deps - def get_dependencies(self) -> List[Dependency]: """Returns a list of all the dependency relations in the tree, generated by depth-first search. - TODO: Add doctests - """ - deps = self._get_deps(self.getroot(), []) - deps.append(Dependency(None, self.getroot(), "root")) - return deps - - def _print_treelet(self, node: Form, indent: int, all_features: bool): + >>> from cltkv1 import NLP + >>> from cltkv1.utils.example_texts import get_example_text + >>> cltk_nlp = NLP(language="lat") + >>> doc = cltk_nlp.analyze(text=get_example_text("lat")) + >>> t = DependencyTree.to_tree(doc.sentences[0]) + >>> len(t.get_dependencies()) + 30 """ - TODO: Add docstring and doctest - """ - edge = "└─ " if indent > 0 else "" - node_str = node.full_str(False) if all_features else str(node) - print(" " * indent + edge + node("relation") + " | " + node_str) + def _get_deps(node: Form, deps: List[Dependency]) -> List[Dependency]: + for child_node in list(node): + deps = _get_deps(child_node, deps) + deps.extend(node.get_dependencies(child_node("relation"))) + return deps - for child_node in list(node): - self._print_treelet(child_node, indent + 4, all_features) + deps = _get_deps(self.getroot(), []) + deps.append(Dependency(None, self.getroot(), "root")) + return deps def print_tree(self, all_features: bool = True): """Prints a pretty-printed (indented) representation of the dependency tree. If all_features is True, then - each node is printed with its complete feature bundle. + each node is printed with its complete feature bundles. 
""" + + def _print_treelet(node: Form, indent: int, all_features: bool): + edge = "└─ " if indent > 0 else "" + node_str = node.full_str(False) if all_features else str(node) + print(" " * indent + edge + node("relation") + " | " + node_str) + + for child_node in list(node): + _print_treelet(child_node, indent + 4, all_features) + self._print_treelet(self.getroot(), indent=0, all_features=all_features) @staticmethod @@ -241,7 +249,7 @@ def to_tree(sentence: List[Word]) -> "DependencyTree": if word.dependency_relation == "root": root = forms[word.index_token] else: - gov = forms[word.governor] + gov = forms[word.governor.index_token] dep = forms[word.index_token] gov >> dep | word.dependency_relation diff --git a/src/cltkv1/languages/glottolog.py b/src/cltkv1/languages/glottolog.py index 2e98a8c..85a3ccc 100644 --- a/src/cltkv1/languages/glottolog.py +++ b/src/cltkv1/languages/glottolog.py @@ -253,7 +253,7 @@ from collections import OrderedDict from typing import List -from cltkv1.utils.data_types import Language +from cltkv1.core.data_types import Language LANGUAGES = OrderedDict( [ diff --git a/src/cltkv1/utils/pipelines.py b/src/cltkv1/languages/pipelines.py similarity index 92% rename from src/cltkv1/utils/pipelines.py rename to src/cltkv1/languages/pipelines.py index 96cc343..c2f9aa6 100644 --- a/src/cltkv1/utils/pipelines.py +++ b/src/cltkv1/languages/pipelines.py @@ -9,9 +9,9 @@ from dataclasses import dataclass, field from typing import Callable, List, Type +from cltkv1.core.data_types import Language, Pipeline, Process from cltkv1.languages.utils import get_lang from cltkv1.tokenizers import DefaultTokenizationProcess, LatinTokenizationProcess -from cltkv1.utils.data_types import Language, Pipeline, Process from cltkv1.wrappers.stanford import StanfordNLPProcess @@ -19,7 +19,7 @@ class LatinPipeline(Pipeline): """Default ``Pipeline`` for Latin. - >>> from cltkv1.utils.pipelines import LatinPipeline + >>> from cltkv1.languages.pipelines import LatinPipeline >>> a_pipeline = LatinPipeline() >>> a_pipeline.description 'Pipeline for the Latin language' @@ -40,7 +40,7 @@ class LatinPipeline(Pipeline): class GreekPipeline(Pipeline): """Default ``Pipeline`` for Ancient Greek. - >>> from cltkv1.utils.pipelines import GreekPipeline + >>> from cltkv1.languages.pipelines import GreekPipeline >>> a_pipeline = GreekPipeline() >>> a_pipeline.description 'Pipeline for the Greek language' @@ -61,7 +61,7 @@ class GreekPipeline(Pipeline): class OCSPipeline(Pipeline): """Default ``Pipeline`` for Old Church Slavonic. - >>> from cltkv1.utils.pipelines import OCSPipeline + >>> from cltkv1.languages.pipelines import OCSPipeline >>> a_pipeline = OCSPipeline() >>> a_pipeline.description 'Pipeline for the Old Church Slavonic language' @@ -82,7 +82,7 @@ class OCSPipeline(Pipeline): class OldFrenchPipeline(Pipeline): """Default ``Pipeline`` for Old French. - >>> from cltkv1.utils.pipelines import OldFrenchPipeline + >>> from cltkv1.languages.pipelines import OldFrenchPipeline >>> a_pipeline = OldFrenchPipeline() >>> a_pipeline.description 'Pipeline for the Old French language' @@ -103,7 +103,7 @@ class OldFrenchPipeline(Pipeline): class GothicPipeline(Pipeline): """Default ``Pipeline`` for Gothic. 
- >>> from cltkv1.utils.pipelines import GothicPipeline + >>> from cltkv1.languages.pipelines import GothicPipeline >>> a_pipeline = GothicPipeline() >>> a_pipeline.description 'Pipeline for the Gothic language' diff --git a/src/cltkv1/languages/utils.py b/src/cltkv1/languages/utils.py index 6c0b7fe..84f9207 100644 --- a/src/cltkv1/languages/utils.py +++ b/src/cltkv1/languages/utils.py @@ -1,8 +1,8 @@ from typing import List +from cltkv1.core.data_types import Language +from cltkv1.core.exceptions import UnknownLanguageError from cltkv1.languages.glottolog import LANGUAGES -from cltkv1.utils.data_types import Language -from cltkv1.utils.exceptions import UnknownLanguageError def get_lang(iso_code: str) -> Language: @@ -12,11 +12,11 @@ def get_lang(iso_code: str) -> Language: >>> from cltkv1.languages.utils import get_lang >>> get_lang("akk") Language(name='Akkadian', glottolog_id='akka1240', latitude=33.1, longitude=44.1, dates=[], family_id='afro1255', parent_id='east2678', level='language', iso_639_3_code='akk', type='a') - >>> from cltkv1.utils.exceptions import UnknownLanguageError + >>> from cltkv1.core.exceptions import UnknownLanguageError >>> get_lang("xxx") Traceback (most recent call last): ... - cltkv1.utils.exceptions.UnknownLanguageError + cltkv1.core.exceptions.UnknownLanguageError """ try: return LANGUAGES[iso_code] diff --git a/src/cltkv1/nlp.py b/src/cltkv1/nlp.py index fc011b7..7da9207 100644 --- a/src/cltkv1/nlp.py +++ b/src/cltkv1/nlp.py @@ -2,16 +2,16 @@ from typing import List -from cltkv1.languages.utils import get_lang -from cltkv1.utils.data_types import Doc, Language, Pipeline, Type -from cltkv1.utils.exceptions import UnimplementedLanguageError, UnknownLanguageError -from cltkv1.utils.pipelines import ( +from cltkv1.core.data_types import Doc, Language, Pipeline, Type +from cltkv1.core.exceptions import UnimplementedLanguageError, UnknownLanguageError +from cltkv1.languages.pipelines import ( GothicPipeline, GreekPipeline, LatinPipeline, OCSPipeline, OldFrenchPipeline, ) +from cltkv1.languages.utils import get_lang pipelines = { "lat": LatinPipeline, @@ -35,8 +35,8 @@ def __init__(self, language: str, custom_pipeline: Pipeline = None) -> None: >>> NLP(language="xxx") Traceback (most recent call last): ... - cltkv1.utils.exceptions.UnknownLanguageError: Unknown language 'xxx'. Use ISO 639-3 languages. - >>> from cltkv1.utils.data_types import Pipeline + cltkv1.core.exceptions.UnknownLanguageError: Unknown language 'xxx'. Use ISO 639-3 languages. + >>> from cltkv1.core.data_types import Pipeline >>> from cltkv1.tokenizers import LatinTokenizationProcess >>> from cltkv1.languages.utils import get_lang >>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat")) @@ -58,7 +58,7 @@ def _get_pipeline(self) -> Pipeline: are valid, both in themselves and in unison. >>> from cltkv1 import NLP - >>> from cltkv1.utils.data_types import Pipeline + >>> from cltkv1.core.data_types import Pipeline >>> cltk_nlp = NLP(language="lat") >>> lat_pipeline = cltk_nlp._get_pipeline() >>> isinstance(cltk_nlp.pipeline, Pipeline) @@ -68,7 +68,7 @@ def _get_pipeline(self) -> Pipeline: >>> cltk_nlp = NLP(language="axm") Traceback (most recent call last): ... 
- cltkv1.utils.exceptions.UnimplementedLanguageError: axm + cltkv1.core.exceptions.UnimplementedLanguageError: axm """ try: return pipelines[self.language.iso_639_3_code]() @@ -83,34 +83,34 @@ def analyze(self, text: str) -> Doc: >>> from cltkv1 import NLP >>> from cltkv1.utils.example_texts import get_example_text - >>> from cltkv1.utils.data_types import Doc + >>> from cltkv1.core.data_types import Doc >>> cltk_nlp = NLP(language="lat") >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat")) >>> isinstance(cltk_doc, Doc) True - >>> cltk_doc.words[0] - Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='Gallia', pos='A1|grn1|casA|gen2|stAM', lemma='aallius', scansion=None, xpos='A1|grn1|casA|gen2|stAM', upos='NOUN', dependency_relation='nsubj', governor=4, parent_token=]>, feats='Case=Nom|Degree=Pos|Gender=Fem|Number=Sing') + >>> cltk_doc.words[0] # doctest: +ELLIPSIS + Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='Gallia', pos='A1|grn1|casA|gen2|stAM', lemma='aallius', scansion=None, xpos='A1|grn1|casA|gen2|stAM', upos='NOUN', dependency_relation='nsubj', governor=..., parent=..., features={'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}) >>> from cltkv1.utils.example_texts import get_example_text >>> cltk_nlp = NLP(language="grc") >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("grc")) - >>> cltk_doc.words[0] - Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='ὅτι', pos='Df', lemma='ὅτι#1', scansion=None, xpos='Df', upos='ADV', dependency_relation='advmod', governor=13, parent_token=]>, feats='_') + >>> cltk_doc.words[0] # doctest: +ELLIPSIS + Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='ὅτι', pos='Df', lemma='ὅτι#1', scansion=None, xpos='Df', upos='ADV', dependency_relation='advmod', governor=..., parent=..., features={}) >>> cltk_nlp = NLP(language="chu") >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("chu")) - >>> cltk_doc.words[0] - Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='отьчє', pos='Nb', lemma='отьць', scansion=None, xpos='Nb', upos='NOUN', dependency_relation='nsubj', governor=6, parent_token=]>, feats='Case=Nom|Gender=Masc|Number=Sing') + >>> cltk_doc.words[0] # doctest: +ELLIPSIS + Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='отьчє', pos='Nb', lemma='отьць', scansion=None, xpos='Nb', upos='NOUN', dependency_relation='nsubj', governor=..., parent=..., features={'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}) >>> cltk_nlp = NLP(language="fro") >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("fro")) - >>> cltk_doc.words[0] - Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='Une', pos='DETndf', lemma='Une', scansion=None, xpos='DETndf', upos='DET', dependency_relation='det', governor=2, parent_token=]>, feats='Definite=Ind|PronType=Art') + >>> cltk_doc.words[0] # doctest: +ELLIPSIS + Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='Une', pos='DETndf', lemma='Une', scansion=None, xpos='DETndf', upos='DET', dependency_relation='det', governor=..., parent=..., features={'Definite': 'Ind', 'PronType': 'Art'}) >>> cltk_nlp = NLP(language="got") >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("got")) - >>> cltk_doc.words[0] - Word(index_char_start=None, index_char_stop=None, index_token=1, 
index_sentence=0, string='swa', pos='Df', lemma='swa', scansion=None, xpos='Df', upos='ADV', dependency_relation='advmod', governor=2, parent_token=]>, feats='_') + >>> cltk_doc.words[0] # doctest: +ELLIPSIS + Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='swa', pos='Df', lemma='swa', scansion=None, xpos='Df', upos='ADV', dependency_relation='advmod', governor=..., parent=..., features={}) >>> len(cltk_doc.sentences) 4 """ diff --git a/src/cltkv1/tokenizers/word.py b/src/cltkv1/tokenizers/word.py index 64949d6..514e490 100644 --- a/src/cltkv1/tokenizers/word.py +++ b/src/cltkv1/tokenizers/word.py @@ -8,7 +8,7 @@ from cltk.tokenize.word import WordTokenizer -from cltkv1.utils.data_types import Doc, Process +from cltkv1.core.data_types import Doc, Process, Word # a closure for marshalling Docs to CLTK tokenizers @@ -16,7 +16,12 @@ def make_tokenizer_algorithm(language: str) -> Callable[[Doc], Doc]: tokenizer = WordTokenizer(language=language) def algorithm(self, doc: Doc) -> Doc: - doc.tokens = tokenizer.tokenize(doc.raw) + doc.words = [] + + for i, token in enumerate(tokenizer.tokenize(doc.raw)): + word = Word(string=token, index_token=i) + doc.words.append(word) + return doc return algorithm @@ -42,7 +47,7 @@ class TokenizationProcess(Process): Example: ``TokenizationProcess`` -> ``LatinTokenizationProcess`` >>> from cltkv1.tokenizers.word import TokenizationProcess - >>> from cltkv1.utils.data_types import Process + >>> from cltkv1.core.data_types import Process >>> issubclass(TokenizationProcess, Process) True >>> tok = TokenizationProcess(input_doc=Doc(raw="some input data")) diff --git a/src/cltkv1/utils/__init__.py b/src/cltkv1/utils/__init__.py index d953c86..85f9d33 100644 --- a/src/cltkv1/utils/__init__.py +++ b/src/cltkv1/utils/__init__.py @@ -1,4 +1,3 @@ """Init for `cltkv1.utils`.""" -from .exceptions import * from .utils import * diff --git a/src/cltkv1/utils/example_texts.py b/src/cltkv1/utils/example_texts.py index 7e1b230..7f569e7 100644 --- a/src/cltkv1/utils/example_texts.py +++ b/src/cltkv1/utils/example_texts.py @@ -13,8 +13,8 @@ # pylint: disable=line-too-long +from cltkv1.core.exceptions import UnimplementedLanguageError from cltkv1.languages.utils import get_lang -from cltkv1.utils.exceptions import UnimplementedLanguageError EXAMPLE_TEXTS = dict( # Akkadian @@ -134,11 +134,11 @@ def get_example_text(iso_code: str) -> str: >>> get_example_text("zkz") Traceback (most recent call last): ... - cltkv1.utils.exceptions.UnimplementedLanguageError: Example text unavailable for ISO 639-3 code 'zkz'. + cltkv1.core.exceptions.UnimplementedLanguageError: Example text unavailable for ISO 639-3 code 'zkz'. >>> get_example_text("xxx") Traceback (most recent call last): ... 
- cltkv1.utils.exceptions.UnknownLanguageError + cltkv1.core.exceptions.UnknownLanguageError """ get_lang(iso_code=iso_code) try: diff --git a/src/cltkv1/wrappers/stanford.py b/src/cltkv1/wrappers/stanford.py index 4e9c7f5..849b5c9 100644 --- a/src/cltkv1/wrappers/stanford.py +++ b/src/cltkv1/wrappers/stanford.py @@ -8,14 +8,9 @@ import stanfordnlp # type: ignore -from cltkv1.utils import ( - UnimplementedLanguageError, - UnknownLanguageError, - example_texts, - file_exists, - suppress_stdout, -) -from cltkv1.utils.data_types import Doc, Process, Word +from cltkv1.core.data_types import Doc, Process, Word +from cltkv1.core.exceptions import UnimplementedLanguageError, UnknownLanguageError +from cltkv1.utils import example_texts, file_exists, suppress_stdout from cltkv1.utils.example_texts import EXAMPLE_TEXTS LOG = logging.getLogger(__name__) @@ -53,7 +48,7 @@ def __init__(self, language: str, treebank: Optional[str] = None) -> None: >>> StanfordNLPWrapper(language="xxx") Traceback (most recent call last): ... - cltkv1.utils.exceptions.UnknownLanguageError: Language 'xxx' either not in scope for CLTK or not supported by StanfordNLP. + cltkv1.core.exceptions.UnknownLanguageError: Language 'xxx' either not in scope for CLTK or not supported by StanfordNLP. >>> stanford_wrapper = StanfordNLPWrapper(language="grc", treebank="grc_proiel") >>> snlp_doc = stanford_wrapper.parse(get_example_text("grc")) @@ -70,7 +65,7 @@ def __init__(self, language: str, treebank: Optional[str] = None) -> None: >>> stanford_wrapper = StanfordNLPWrapper(language="lat", treebank="xxx") Traceback (most recent call last): ... - cltkv1.utils.exceptions.UnimplementedLanguageError: Invalid treebank 'xxx' for language 'lat'. + cltkv1.core.exceptions.UnimplementedLanguageError: Invalid treebank 'xxx' for language 'lat'. """ self.language = language self.treebank = treebank @@ -346,17 +341,15 @@ def __init__(self, input_doc, language): def algorithm(self, doc): stanfordnlp_doc = self.stanfordnlp_wrapper.parse(doc.raw) - (cltk_words, indices_tokens) = StanfordNLPProcess.stanfordnlp_to_cltk_word_type( - stanfordnlp_doc - ) + cltk_words = StanfordNLPProcess.stanfordnlp_to_cltk_word_type(stanfordnlp_doc) doc.words = cltk_words - doc.indices_tokens = indices_tokens doc.stanfordnlp_doc = stanfordnlp_doc return doc @staticmethod def stanfordnlp_to_cltk_word_type(stanfordnlp_doc): + """Take an entire ``stanfordnlp`` document, extract each word, and encode it in the way expected by the CLTK's ``Word`` type. 
@@ -371,13 +364,13 @@ def stanfordnlp_to_cltk_word_type(stanfordnlp_doc): >>> isinstance(cltk_words[0], Word) True >>> cltk_words[0] - Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='Gallia', pos='A1|grn1|casA|gen2|stAM', lemma='aallius', scansion=None, xpos='A1|grn1|casA|gen2|stAM', upos='NOUN', dependency_relation='nsubj', governor=4, parent_token=]>, feats='Case=Nom|Degree=Pos|Gender=Fem|Number=Sing') + Word(index_char_start=None, index_char_stop=None, index_token=1, index_sentence=0, string='Gallia', pos='A1|grn1|casA|gen2|stAM', lemma='aallius', scansion=None, xpos='A1|grn1|casA|gen2|stAM', upos='NOUN', dependency_relation='nsubj', governor=Word(index_char_start=None, index_char_stop=None, index_token=4, index_sentence=0, string='divisa', pos='L2', lemma='divido', scansion=None, xpos='L2', upos='VERB', dependency_relation='root', governor=None, parent=None, features={'Aspect': 'Perf', 'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Part', 'Voice': 'Pass'}), parent=None, features={'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}) """ words_list = list() - sentence_list = list() for sentence_index, sentence in enumerate(stanfordnlp_doc.sentences): - token_indices = list() + sent_words = dict() + indices = list() for token_index, token in enumerate(sentence.tokens): stanfordnlp_word = token.words[0] @@ -390,12 +383,27 @@ def stanfordnlp_to_cltk_word_type(stanfordnlp_doc): upos=stanfordnlp_word.upos, lemma=stanfordnlp_word.lemma, dependency_relation=stanfordnlp_word.dependency_relation, - governor=stanfordnlp_word.governor, - parent_token=stanfordnlp_word.parent_token, - feats=stanfordnlp_word.feats, + features={} + if stanfordnlp_word.feats == "_" + else dict( + [f.split("=") for f in stanfordnlp_word.feats.split("|")] + ), + ) + sent_words[cltk_word.index_token] = cltk_word + indices.append( + ( + int(stanfordnlp_word.governor), + int(stanfordnlp_word.parent_token.index), + ) ) words_list.append(cltk_word) - token_indices.append(token_index) - sentence_list.append(token_indices) - return (words_list, sentence_list) + for i, cltk_word in enumerate(sent_words.values()): + (governor_index, parent_index) = indices[i] + cltk_word.governor = ( + sent_words[governor_index] if governor_index > 0 else None + ) + if cltk_word.index_token != sent_words[parent_index].index_token: + cltk_word.parent = sent_words[parent_index] + + return words_list
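
A quick usage sketch of the new ``Doc`` properties introduced by this patch. The printed values follow the patch's own doctests; the sketch assumes the stanfordnlp Latin models have already been downloaded.

# Illustrative only: exercises Doc.tokens, Doc.pos, Doc.morphosyntactic_features,
# and Doc.sentences as added in src/cltkv1/core/data_types.py above.
from cltkv1 import NLP
from cltkv1.utils.example_texts import get_example_text

cltk_nlp = NLP(language="lat")
cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))

print(cltk_doc.tokens[:5])  # ['Gallia', 'est', 'omnis', 'divisa', 'in']
print(cltk_doc.pos[:3])  # ['NOUN', 'AUX', 'DET']
print(cltk_doc.morphosyntactic_features[0])
# {'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}

# Word.governor and Word.parent are now Word references rather than bare
# stanfordnlp indices, so a token's head can be read off directly.
gallia = cltk_doc.words[0]
print(gallia.governor.string)  # 'divisa', the root of the first sentence
print(len(cltk_doc.sentences))  # sentences are regrouped via Word.index_sentence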
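
The reworked ``Form``/``Dependency`` API in ``src/cltkv1/dependency/tree.py`` can likewise be exercised by hand. This sketch restates the doctests above; the John/loves/Mary forms are purely illustrative.

# Form overloads /, >>, and | to set a POS tag, attach a dependent, and
# name (type) the resulting Dependency, respectively.
from cltkv1 import NLP
from cltkv1.dependency.tree import DependencyTree, Form
from cltkv1.utils.example_texts import get_example_text

john = Form("John", 1) / "NNP"  # Form / tag sets the POS attribute
loves = Form("loves", 2) / "VRB"
mary = Form("Mary", 3) / "NNP"
loves >> john | "subj"  # head >> dependent | relation
loves >> mary | "obj"
print(loves.get_dependencies("subj"))  # [subj(loves_2/VRB, John_1/NNP)]
print(john.full_str())  # 'John_1 [pos=NNP,relation=subj]'

# An analyzed sentence (a list of Words) converts directly into a tree.
doc = NLP(language="lat").analyze(text=get_example_text("lat"))
tree = DependencyTree.to_tree(doc.sentences[0])
print(len(tree.get_dependencies()))  # 30, counting the root pseudo-dependency
tree.print_tree()  # indented rendering of the whole sentence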