# utils.py

import math
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import _tree

def get_scaler(name):
    """Create the scaler object selected by *name*.

    Args:
        name: ``'minmax'`` for a ``MinMaxScaler`` with feature range
            ``(-1, 1)``, or ``'std'`` for a ``StandardScaler``.

    Returns:
        A new, unfitted scaler instance.

    Raises:
        ValueError: If *name* is neither ``'minmax'`` nor ``'std'``.
    """
    if name == 'minmax':
        return MinMaxScaler(feature_range=(-1, 1))
    if name == 'std':
        return StandardScaler()
    raise ValueError('Unknown scaler type.')

def parse_count(count, ref_count):
    """Turn a textual count specification into an integer.

    Accepted forms of *count*:

    * ``'auto'`` -- returns the sentinel ``-1``.
    * a string of decimal digits (e.g. ``'25'``) -- returns that integer.
    * ``'<number>x'`` (e.g. ``'2x'``, ``'0.5x'``) -- returns
      ``int(ref_count * number)``, i.e. a multiple of *ref_count*
      truncated toward zero.

    Args:
        count: The specification string.
        ref_count: Reference count that the multiplier form scales.

    Returns:
        The resolved integer count, or ``-1`` for ``'auto'``.

    Raises:
        ValueError: If *count* matches none of the accepted forms.
    """
    if count == 'auto':
        return -1

    # isdecimal() only accepts characters that int() can actually convert
    # (isnumeric() also admits e.g. superscripts and vulgar fractions).
    if count.isdecimal():
        return int(count)

    # Only treat the value as a multiplier when the suffix really is there;
    # otherwise inputs such as '15%' or '1.5' would silently lose their last
    # character and be misread as a multiplier.
    if count[-1:] in ('x', 'X'):
        try:
            multiplier = float(count[:-1])
        except ValueError:
            raise ValueError(
                f"Invalid multiplier in count specification: {count!r}"
            ) from None
        return int(ref_count * multiplier)

    raise ValueError(
        f"Invalid count specification: {count!r} "
        "(expected 'auto', '<int>' or '<number>x')"
    )

def parse_max_depth(md, ref):
    """Resolve a maximum-depth setting to an integer.

    Args:
        md: Either ``'auto'`` or a value convertible with ``int()``.
        ref: Reference quantity used to derive the depth when *md* is
            ``'auto'``; must be positive in that case.

    Returns:
        ``int(md)`` when *md* is not ``'auto'``. Otherwise
        ``ceil(log2(ref)) - 1``, but never less than 1.

    Raises:
        ValueError: If *md* is ``'auto'`` and *ref* is not positive, or if
            *md* cannot be converted to an integer.
    """
    if md != 'auto':
        return int(md)

    if ref <= 0:
        raise ValueError(
            f"Cannot derive max depth from non-positive reference: {ref!r}"
        )

    # This proved to be a good default.
    depth = math.ceil(math.log2(ref)) - 1
    # For ref <= 2 the formula yields 0 or -1, which is not a usable depth;
    # fall back to the smallest meaningful value.
    return max(1, depth)

def get_rules(tree, feature_names, class_names):
    """Render every root-to-leaf path of a decision tree as a text rule.

    Args:
        tree: Object exposing a ``tree_`` attribute with ``feature``,
            ``threshold``, ``children_left``, ``children_right``, ``value``
            and ``n_node_samples`` arrays (presumably a fitted scikit-learn
            decision tree -- confirm against callers).
        feature_names: Sequence indexed by the feature ids stored in
            ``tree_.feature``.
        class_names: Sequence of class labels, or ``None`` to report the
            leaf's first value as a numeric response instead of a class.

    Returns:
        A list of strings of the form
        ``"if (<cond>) and (<cond>) then <outcome> | based on N samples"``,
        ordered from the leaf with the most samples to the one with the
        fewest.
    """
    fitted = tree.tree_
    node_labels = [
        "undefined!" if idx == _tree.TREE_UNDEFINED else feature_names[idx]
        for idx in fitted.feature
    ]

    # Each entry: the list of condition strings leading to a leaf, followed
    # by a final (value, sample_count) tuple describing that leaf.
    leaf_paths = []

    def walk(node, conditions):
        if fitted.feature[node] == _tree.TREE_UNDEFINED:
            # Leaf node: close the path with the leaf's statistics.
            leaf_stats = (fitted.value[node], fitted.n_node_samples[node])
            leaf_paths.append(conditions + [leaf_stats])
            return

        label = node_labels[node]
        cut = np.round(fitted.threshold[node], 3)
        walk(fitted.children_left[node], conditions + [f"({label} <= {cut})"])
        walk(fitted.children_right[node], conditions + [f"({label} > {cut})"])

    walk(0, [])

    # Largest leaves first: ascending argsort, then walk it backwards.
    ascending = np.argsort([entry[-1][1] for entry in leaf_paths])
    ranked = [leaf_paths[pos] for pos in ascending[::-1]]

    rules = []
    for entry in ranked:
        leaf_value, leaf_samples = entry[-1]
        text = "if " + " and ".join(str(cond) for cond in entry[:-1]) + " then "
        if class_names is None:
            text += "response: " + str(np.round(leaf_value[0][0], 3))
        else:
            distribution = leaf_value[0]
            best = np.argmax(distribution)
            text += f"class: {class_names[best]} (proba: {np.round(100.0 * distribution[best] / np.sum(distribution), 2)}%)"
        text += f" | based on {leaf_samples:,} samples"
        rules.append(text)

    return rules