From eb035013c3a851055573ecf225df240e28c812d2 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 7 Dec 2020 12:04:06 +0000 Subject: [PATCH] [python][docs] more detailed docs for trees_to_dataframe(), create_tree_digraph(), plot_tree() (#3618) * [python] more detailed docs for trees_to_dataframe(), create_tree_digraph(), plot_tree() * fixing warnings * fix warnings * undo unnecessary space * Apply suggestions from code review Co-authored-by: Nikita Titov * single line, better weight descriptions * Apply suggestions from code review Co-authored-by: Nikita Titov * column names * Update python-package/lightgbm/plotting.py Co-authored-by: Nikita Titov Co-authored-by: Nikita Titov --- python-package/lightgbm/basic.py | 18 +++++++++++++ python-package/lightgbm/plotting.py | 42 ++++++++++++++++++++++++----- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 0e3f065a4520..60f6b70f0881 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2202,6 +2202,24 @@ def free_network(self): def trees_to_dataframe(self): """Parse the fitted model and return in an easy-to-read pandas DataFrame. + The returned DataFrame has the following columns. + + - ``tree_index`` : int64, which tree a node belongs to. 0-based, so a value of ``6``, for example, means "this node is in the 7th tree". + - ``node_depth`` : int64, how far a node is from the root of the tree. The root node has a value of ``1``, its direct children are ``2``, etc. + - ``node_index`` : string, unique identifier for a node. + - ``left_child`` : string, ``node_index`` of the child node to the left of a split. ``None`` for leaf nodes. + - ``right_child`` : string, ``node_index`` of the child node to the right of a split. ``None`` for leaf nodes. + - ``parent_index`` : string, ``node_index`` of this node's parent. ``None`` for the root node. 
+ - ``split_feature`` : string, name of the feature used for splitting. ``None`` for leaf nodes. + - ``split_gain`` : float64, gain from adding this split to the tree. ``NaN`` for leaf nodes. + - ``threshold`` : float64, value of the feature used to decide which side of the split a record will go down. ``NaN`` for leaf nodes. + - ``decision_type`` : string, logical operator describing how to compare a value to ``threshold``. For example, ``split_feature = "Column_10", threshold = 15, decision_type = "<="`` means that records where ``Column_10 <= 15`` follow the left side of the split, and all other records follow the right side of the split. ``None`` for leaf nodes. + - ``missing_direction`` : string, split direction that missing values should go to. ``None`` for leaf nodes. + - ``missing_type`` : string, describes what types of values are treated as missing. + - ``value`` : float64, predicted value for this leaf node, multiplied by the learning rate. + - ``weight`` : float64 or int64, sum of hessian (second-order derivative of objective), summed over observations that fall in this node. + - ``count`` : int64, number of records in the training data that fall into this node. + Returns ------- result : pandas DataFrame diff --git a/python-package/lightgbm/plotting.py b/python-package/lightgbm/plotting.py index e08214e9da30..ae57a449cb74 100644 --- a/python-package/lightgbm/plotting.py +++ b/python-package/lightgbm/plotting.py @@ -474,6 +474,16 @@ def create_tree_digraph(booster, tree_index=0, show_info=None, precision=3, orientation='horizontal', **kwargs): """Create a digraph representation of specified tree. + Each node in the graph represents a node in the tree. + + Non-leaf nodes have labels like ``Column_10 <= 875.9``, which means + "this node splits on the feature named "Column_10", with threshold 875.9". + + Leaf nodes have labels like ``leaf 2: 0.422``, which means "this node is a + leaf node, and the predicted value for records that fall into this node + is 0.422". 
The number (``2``) is an internal unique identifier and doesn't + have any special meaning. + .. note:: For more information please visit @@ -487,9 +497,14 @@ def create_tree_digraph(booster, tree_index=0, show_info=None, precision=3, The index of a target tree to convert. show_info : list of strings or None, optional (default=None) What information should be shown in nodes. - Possible values of list items: - 'split_gain', 'internal_value', 'internal_count', 'internal_weight', - 'leaf_count', 'leaf_weight', 'data_percentage'. + + - ``'split_gain'`` : gain from adding this split to the model + - ``'internal_value'`` : raw predicted value that would be produced by this node if it was a leaf node + - ``'internal_count'`` : number of records from the training data that fall into this non-leaf node + - ``'internal_weight'`` : total weight (sum of hessian) of all observations that fall into this non-leaf node + - ``'leaf_count'`` : number of records from the training data that fall into this leaf node + - ``'leaf_weight'`` : total weight (sum of hessian) of all observations that fall into this leaf node + - ``'data_percentage'`` : percentage of training data that fall into this node precision : int or None, optional (default=3) Used to restrict the display of floating point values to a certain precision. orientation : string, optional (default='horizontal') @@ -536,6 +551,16 @@ def plot_tree(booster, ax=None, tree_index=0, figsize=None, dpi=None, show_info=None, precision=3, orientation='horizontal', **kwargs): """Plot specified tree. + Each node in the graph represents a node in the tree. + + Non-leaf nodes have labels like ``Column_10 <= 875.9``, which means + "this node splits on the feature named "Column_10", with threshold 875.9". + + Leaf nodes have labels like ``leaf 2: 0.422``, which means "this node is a + leaf node, and the predicted value for records that fall into this node + is 0.422". The number (``2``) is an internal unique identifier and doesn't + have any special meaning. + .. 
note:: It is preferable to use ``create_tree_digraph()`` because of its lossless quality @@ -556,9 +581,14 @@ def plot_tree(booster, ax=None, tree_index=0, figsize=None, dpi=None, Resolution of the figure. show_info : list of strings or None, optional (default=None) What information should be shown in nodes. - Possible values of list items: - 'split_gain', 'internal_value', 'internal_count', 'internal_weight', - 'leaf_count', 'leaf_weight', 'data_percentage'. + + - ``'split_gain'`` : gain from adding this split to the model + - ``'internal_value'`` : raw predicted value that would be produced by this node if it was a leaf node + - ``'internal_count'`` : number of records from the training data that fall into this non-leaf node + - ``'internal_weight'`` : total weight (sum of hessian) of all observations that fall into this non-leaf node + - ``'leaf_count'`` : number of records from the training data that fall into this leaf node + - ``'leaf_weight'`` : total weight (sum of hessian) of all observations that fall into this leaf node + - ``'data_percentage'`` : percentage of training data that fall into this node precision : int or None, optional (default=3) Used to restrict the display of floating point values to a certain precision. orientation : string, optional (default='horizontal')