Skip to content

Commit

Permalink
Doctests for Dependency Tree and more properties on Doc (cltk#33)
Browse files Browse the repository at this point in the history
* adapted the dependency tree class to CLTKv1

* working on not reloading the full stanford pipeline on every process call

* cache stanford NLP objects in wrapper class

* fixed doctests for Stanford process

* automatically reformatted files

* implemented the pipeline pattern, and extraction of sentential structure of input text in the Stanford process

* implemented true pipelines and sentence extraction for stanfordNLP

* :

* moved code out of __init__.py

* added doctests and properties of Doc

* fixed trailing whitespace

* repaired governor and parent references in word; created a core package for essential classes; normalized POS and morpho features attributes of words

* interrupted infinite recursion in parent token
  • Loading branch information
free-variation authored and kylepjohnson committed Jan 11, 2020
1 parent 5a5b1ef commit ed0a145
Show file tree
Hide file tree
Showing 13 changed files with 259 additions and 209 deletions.
124 changes: 62 additions & 62 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/cltkv1/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .data_types import *
from .exceptions import *
90 changes: 59 additions & 31 deletions src/cltkv1/utils/data_types.py → src/cltkv1/core/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
of the NLP pipeline.
>>> from cltkv1.utils.data_types import Language
>>> from cltkv1.utils.data_types import Word
>>> from cltkv1.utils.data_types import Process
>>> from cltkv1.utils.data_types import Doc
>>> from cltkv1.utils.data_types import Pipeline
>>> from cltkv1.core.data_types import Language
>>> from cltkv1.core.data_types import Word
>>> from cltkv1.core.data_types import Process
>>> from cltkv1.core.data_types import Doc
>>> from cltkv1.core.data_types import Pipeline
"""

from dataclasses import dataclass
from typing import Any, Callable, List, Type, Union
from typing import Any, Callable, Dict, List, Type, Union


@dataclass
Expand All @@ -20,7 +20,7 @@ class Language:
``cltkv1.languages.glottolog.LANGUAGES``. May be extended by
user for dialects or languages not documented by ISO 639-3.
>>> from cltkv1.utils.data_types import Language
>>> from cltkv1.core.data_types import Language
>>> from cltkv1.languages.utils import get_lang
>>> latin = get_lang("lat")
>>> isinstance(latin, Language)
Expand All @@ -46,14 +46,14 @@ class Word:
"""Contains attributes of each processed word in a list of
words. Designed to be used in the ``Doc.words`` dataclass.
>>> from cltkv1.utils.data_types import Word
>>> from cltkv1.core.data_types import Word
>>> from cltkv1.utils.example_texts import get_example_text
>>> get_example_text("lat")[:25]
'Gallia est omnis divisa i'
>>> from cltkv1.languages.utils import get_lang
>>> latin = get_lang("lat")
>>> Word(index_char_start=0, index_char_stop=6, index_token=0, string=get_example_text("lat")[0:6], pos="nom")
Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Gallia', pos='nom', lemma=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, parent_token=None, feats=None)
Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Gallia', pos='nom', lemma=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, parent=None, features=None)
"""

index_char_start: int = None
Expand All @@ -67,9 +67,9 @@ class Word:
xpos: str = None # treebank-specific POS tag (from stanfordnlp)
upos: str = None # universal POS tag (from stanfordnlp)
dependency_relation: str = None # (from stanfordnlp)
governor: str = None # (from stanfordnlp)
parent_token: str = None # (from stanfordnlp)
feats: str = None # morphological features (from stanfordnlp)
governor: "Word" = None
parent: "Word" = None
features: Dict[str, str] = None # morphological features (from stanfordnlp)


@dataclass
Expand All @@ -89,38 +89,66 @@ class Doc:
True
"""

indices_sentences: List[List[int]] = None
indices_tokens: List[List[int]] = None
language: str = None
words: List[Word] = None
pipeline: List["Process"] = None
pipeline: "Pipeline" = None
raw: str = None

@property
def sentences(self):
return [
[self.words[token_index] for token_index in sentence]
for sentence in self.indices_tokens
]
def sentences(self) -> List[List[Word]]:
sentences = {}
for word in self.words:
sentence = sentences.get(word.index_sentence, {})
sentence[word.index_token] = word
sentences[word.index_sentence] = sentence

@property
def tokens_list(self) -> List[str]:
"""Returns a list of string word tokens.
sorted_values = lambda dict: [x[1] for x in sorted(dict.items())]

return [sorted_values(sentence) for sentence in sorted_values(sentences)]

TODO: Why does ``Doc.tokens`` fail?
def _get_words_attribute(self, attribute):
    """Return a list with the value of ``attribute`` for each word in
    ``self.words``, in document order.

    Shared helper behind the ``tokens``, ``pos``, and
    ``morphosyntactic_features`` properties.
    """
    return [getattr(word, attribute) for word in self.words]

@property
def tokens(self) -> List[str]:
"""Returns a list of string word tokens of all words in the doc.
>>> from cltkv1 import NLP
>>> from cltkv1.utils.example_texts import get_example_text
>>> cltk_nlp = NLP(language="lat")
>>> cltk_nlp.language.name
'Latin'
>>> isinstance(cltk_nlp.language, Language)
True
>>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))
>>> cltk_doc.tokens_list[:10]
>>> cltk_doc.tokens[:10]
['Gallia', 'est', 'omnis', 'divisa', 'in', 'partes', 'tres', ',', 'quarum', 'unam']
"""
return [word_obj.string for word_obj in self.words]
return self._get_words_attribute("string")

@property
def pos(self) -> List[str]:
    """Returns a list of the POS tags of all words in the doc.

    >>> from cltkv1 import NLP
    >>> from cltkv1.utils.example_texts import get_example_text
    >>> cltk_nlp = NLP(language="lat")
    >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))
    >>> cltk_doc.pos[:3]
    ['NOUN', 'AUX', 'DET']
    """
    # "upos" is the universal POS tag set by the StanfordNLP process
    # (see the ``Word.upos`` field), not the treebank-specific ``xpos``.
    return self._get_words_attribute("upos")

@property
def morphosyntactic_features(self) -> List[Dict[str, str]]:
    """Returns a list of dictionaries containing the morphosyntactic features
    of each word (when available).
    Each dictionary specifies feature names as keys and feature values as values.

    >>> from cltkv1 import NLP
    >>> from cltkv1.utils.example_texts import get_example_text
    >>> cltk_nlp = NLP(language="lat")
    >>> cltk_doc = cltk_nlp.analyze(text=get_example_text("lat"))
    >>> cltk_doc.morphosyntactic_features[:3]
    [{'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing'}, {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}, {'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Fem', 'Number': 'Sing', 'PronType': 'Ind'}]
    """
    # Returns one dict per word (List[Dict[str, str]]); entries may be None
    # when a word carries no features — TODO confirm against ``Word.features``.
    return self._get_words_attribute("features")


@dataclass
Expand Down Expand Up @@ -164,7 +192,7 @@ class Pipeline:
# TODO: Consider adding a Unicode normalization as a default first Process
>>> from cltkv1.utils.data_types import Process, Pipeline
>>> from cltkv1.core.data_types import Process, Pipeline
>>> from cltkv1.languages.utils import get_lang
>>> from cltkv1.tokenizers import LatinTokenizationProcess
>>> a_pipeline = Pipeline(description="A custom Latin pipeline", processes=[LatinTokenizationProcess], language=get_lang("lat"))
Expand Down
18 changes: 9 additions & 9 deletions src/cltkv1/utils/exceptions.py → src/cltkv1/core/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,27 @@
class CLTKException(Exception):
"""Exception class for the ``cltkv1`` library.
>>> from cltkv1.utils.exceptions import CLTKException
>>> from cltkv1.core.exceptions import CLTKException
>>> raise CLTKException
Traceback (most recent call last):
...
File "<doctest cltkv1.utils.exceptions.CLTKException[1]>", line 1, in <module>
File "<doctest cltkv1.core.exceptions.CLTKException[1]>", line 1, in <module>
raise CLTKException
cltkv1.utils.exceptions.CLTKException
cltkv1.core.exceptions.CLTKException
"""


class UnimplementedLanguageError(CLTKException):
"""Exception for when a language is supported by the CLTK however
a particular process is not available for that language.
>>> from cltkv1.utils.exceptions import UnimplementedLanguageError
>>> from cltkv1.core.exceptions import UnimplementedLanguageError
>>> raise UnimplementedLanguageError
Traceback (most recent call last):
...
File "<doctest cltkv1.utils.exceptions.UnimplementedLanguageError[1]>", line 1, in <module>
File "<doctest cltkv1.core.exceptions.UnimplementedLanguageError[1]>", line 1, in <module>
raise UnimplementedLanguageError
cltkv1.utils.exceptions.UnimplementedLanguageError
cltkv1.core.exceptions.UnimplementedLanguageError
"""


Expand All @@ -34,11 +34,11 @@ class UnknownLanguageError(CLTKException):
TODO: Make separate exceptions for an unknown language vs. an unimplemented process for a known language
>>> from cltkv1.utils.exceptions import UnknownLanguageError
>>> from cltkv1.core.exceptions import UnknownLanguageError
>>> raise UnknownLanguageError
Traceback (most recent call last):
...
File "<doctest cltkv1.utils.exceptions.UnknownLanguageError[1]>", line 1, in <module>
File "<doctest cltkv1.core.exceptions.UnknownLanguageError[1]>", line 1, in <module>
raise UnknownLanguageError
cltkv1.utils.exceptions.UnknownLanguageError
cltkv1.core.exceptions.UnknownLanguageError
"""
102 changes: 55 additions & 47 deletions src/cltkv1/dependency/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List, Union
from xml.etree.ElementTree import Element, ElementTree

from cltkv1.utils.data_types import Doc, Process, Word
from cltkv1.core.data_types import Doc, Process, Word


class Form(Element):
Expand Down Expand Up @@ -69,17 +69,17 @@ def get_dependencies(self, relation: str) -> List["Dependency"]:
"""Extract dependents of this form for the specified
dependency relation.
TODO: Add doctest for ``Form.get_dependencies()``
>>> john = Form('John', 1) / 'NNP'
>>> john
John_1/NNP
>>> loves = Form('loves', 2) / 'VRB'
>>> loves
loves_2/VRB
>>> mary = Form('Mary', 3) / 'NNP'
>>> mary
Mary_3/NNP
>>> loves >> john | 'subj'
subj(loves_2/VRB, John_1/NNP)
>>> loves >> mary | 'obj'
obj(loves_2/VRB, Mary_3/NNP)
>>> loves.get_dependencies('subj')
[subj(loves_2/VRB, John_1/NNP)]
>>> loves.get_dependencies('obj')
[obj(loves_2/VRB, Mary_3/NNP)]
"""
deps = self.findall('*[@relation="{}"]'.format(relation))
return [Dependency(self, dep, relation) for dep in deps]
Expand All @@ -99,12 +99,15 @@ def full_str(self, include_relation=True) -> str:
The ID is attached to the text, and the relation is
optionally suppressed.
TODO: Make this test more meaningful. KJ couldn't get the ``desc_form.full_str()`` to equal the target.
>>> loves = Form('loves', 2) / 'VRB'
>>> loves.full_str()
'loves_2 [pos=VRB]'
>>> john = Form('John', 1) / 'NNP'
>>> loves >> john | 'subj'
subj(loves_2/VRB, John_1/NNP)
>>> john.full_str(True)
'John_1 [pos=NNP,relation=subj]'
>>> f = Form
>>> desc_form = f('described')
>>> type(desc_form.full_str())
<class 'str'>
"""
excluded = ["form_id", "relation"] if not include_relation else ["form_id"]
return "{0}_{1} [{2}]".format(
Expand Down Expand Up @@ -141,19 +144,24 @@ def to_form(word: Word) -> "Form":
form.set("upos", word.upos)
form.set("xpos", word.xpos)

if word.feats != "_":
for f in word.feats.split("|"):
feature = f.split("=")
form.set(feature[0], feature[1])
for (feature_name, feature_value) in word.features.items():
form.set(feature_name, feature_value)

return form


class Dependency:
"""The relationship (or edge) between a hierarchical
and subordinate Node.
"""The asymmetric binary relationship (or edge) between a governing
Form (the "head") and a subordinate Form (the "dependent").
In principle the relationship could capture any form-to-form relation
that the systems deems of interest, be it syntactic, semantic, or discursive.
TODO: Explain this better.
If the `relation` attribute is not specified, then the dependency simply states
that there's some asymmetric relationship between the head and the dependent.
This is an *untyped* dependency.
For a *typed* dependency, a string value is supplied for the `relation` attribute.
"""

def __init__(self, head: Form, dep: Form, relation: str = None) -> None:
Expand Down Expand Up @@ -182,43 +190,43 @@ def __init__(self, root: Form) -> None:

ElementTree.__init__(self, root)

def _get_deps(self, node: Form, deps: List[Dependency]) -> List[Dependency]:
"""
TODO: Add docstring and doctests
TODO: What is difference btw this and ``DependencyTree.get_dependencies()``?
"""
for child_node in list(node):
deps = self._get_deps(child_node, deps)
deps.extend(node.get_dependencies(child_node("relation")))
return deps

def get_dependencies(self) -> List[Dependency]:
"""Returns a list of all the dependency relations in the tree,
generated by depth-first search.
TODO: Add doctests
"""
deps = self._get_deps(self.getroot(), [])
deps.append(Dependency(None, self.getroot(), "root"))
return deps

def _print_treelet(self, node: Form, indent: int, all_features: bool):
>>> from cltkv1 import NLP
>>> from cltkv1.utils.example_texts import get_example_text
>>> cltk_nlp = NLP(language="lat")
>>> doc = cltk_nlp.analyze(text=get_example_text("lat"))
>>> t = DependencyTree.to_tree(doc.sentences[0])
>>> len(t.get_dependencies())
30
"""

TODO: Add docstring and doctest
"""
edge = "└─ " if indent > 0 else ""
node_str = node.full_str(False) if all_features else str(node)
print(" " * indent + edge + node("relation") + " | " + node_str)
def _get_deps(node: Form, deps: List[Dependency]) -> List[Dependency]:
for child_node in list(node):
deps = _get_deps(child_node, deps)
deps.extend(node.get_dependencies(child_node("relation")))
return deps

for child_node in list(node):
self._print_treelet(child_node, indent + 4, all_features)
deps = _get_deps(self.getroot(), [])
deps.append(Dependency(None, self.getroot(), "root"))
return deps

def print_tree(self, all_features: bool = True):
"""Prints a pretty-printed (indented) representation
of the dependency tree. If all_features is True, then
each node is printed with its complete feature bundle.
each node is printed with its complete feature bundles.
"""

def _print_treelet(node: Form, indent: int, all_features: bool):
edge = "└─ " if indent > 0 else ""
node_str = node.full_str(False) if all_features else str(node)
print(" " * indent + edge + node("relation") + " | " + node_str)

for child_node in list(node):
_print_treelet(child_node, indent + 4, all_features)

self._print_treelet(self.getroot(), indent=0, all_features=all_features)

@staticmethod
Expand All @@ -241,7 +249,7 @@ def to_tree(sentence: List[Word]) -> "DependencyTree":
if word.dependency_relation == "root":
root = forms[word.index_token]
else:
gov = forms[word.governor]
gov = forms[word.governor.index_token]
dep = forms[word.index_token]
gov >> dep | word.dependency_relation

Expand Down
2 changes: 1 addition & 1 deletion src/cltkv1/languages/glottolog.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@
from collections import OrderedDict
from typing import List

from cltkv1.utils.data_types import Language
from cltkv1.core.data_types import Language

LANGUAGES = OrderedDict(
[
Expand Down
Loading

0 comments on commit ed0a145

Please sign in to comment.