From 3b85b2b07e2f329b2514864302748546c5592b2e Mon Sep 17 00:00:00 2001 From: Sefik Ilkin Serengil Date: Sat, 23 Dec 2023 11:01:23 +0000 Subject: [PATCH 1/3] improvements --- .github/workflows/pythonpublish.yml | 26 - .github/workflows/tests.yml | 67 ++ .gitignore | 2 +- .pylintrc | 640 ++++++++++++++ .vscode/settings.json | 20 + Makefile | 5 + README.md | 5 +- chefboost/Chefboost.py | 1015 ++++++++++----------- chefboost/commons/evaluate.py | 250 +++--- chefboost/commons/functions.py | 251 +++--- chefboost/commons/logger.py | 40 + chefboost/training/Preprocess.py | 296 ++++--- chefboost/training/Training.py | 1261 ++++++++++++++------------- chefboost/tuning/adaboost.py | 277 +++--- chefboost/tuning/gbm.py | 627 ++++++------- chefboost/tuning/randomforest.py | 199 +++-- requirements.txt | 8 +- scripts/push-release.sh | 11 + setup.py | 5 +- tests/global-unit-test.py | 438 +++++----- 20 files changed, 3259 insertions(+), 2184 deletions(-) delete mode 100644 .github/workflows/pythonpublish.yml create mode 100644 .github/workflows/tests.yml create mode 100644 .pylintrc create mode 100644 .vscode/settings.json create mode 100644 Makefile create mode 100644 chefboost/commons/logger.py create mode 100644 scripts/push-release.sh diff --git a/.github/workflows/pythonpublish.yml b/.github/workflows/pythonpublish.yml deleted file mode 100644 index 21f2f01..0000000 --- a/.github/workflows/pythonpublish.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Upload Python Package - -on: - release: - types: [created] - -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: Set up Python - uses: actions/setup-python@v1 - with: - python-version: '3.x' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: | - python setup.py sdist bdist_wheel - twine upload dist/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..65d670b --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,67 @@ +name: Tests and Linting + +on: + push: + paths: + - '.github/workflows/tests.yml' + - 'chefboost/**' + - 'tests/**' + - 'requirements.txt' + - '.gitignore' + - 'setup.py' + pull_request: + paths: + - '.github/workflows/tests.yml' + - 'chefboost/**' + - 'tests/**' + - 'requirements.txt' + - '.gitignore' + - 'setup.py' + +jobs: + unit-tests: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + pip install . + + - name: Test with pytest + run: | + cd tests + python global-unit-test.py + linting: + needs: unit-tests + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pylint + pip install black + pip install . 
+ + - name: Lint with pylint + run: | + python -m pylint chefboost/ --fail-under=10 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 34eeb1a..feaa71a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,6 @@ dist/ Pipfile Pipfile.lock .mypy_cache/ -.vscode/ .idea/ chefboost.egg-info/ tests/outputs/ @@ -19,3 +18,4 @@ chefboost/tuning/__pycache__/* .DS_Store chefboost/.DS_Store tests/.DS_Store +.pytest_cache \ No newline at end of file diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..694ae4c --- /dev/null +++ b/.pylintrc @@ -0,0 +1,640 @@ +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + +# Specify a score threshold under which the program will exit with error. +fail-under=10 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\' represents the directory delimiter on Windows systems, it +# can't be used as an escape character. +ignore-paths= + +# Files or directories matching the regular expression patterns are skipped. +# The regex matches against base names, not paths. The default value ignores +# Emacs file locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. 
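For contributors who want the same gate locally before pushing, the lint step above (and the Makefile `lint` target added later in this patch) can also be driven from Python. A minimal sketch, assuming `pylint` is installed in the active environment; the `scripts/check.py` name is hypothetical:

```python
# scripts/check.py (hypothetical helper): runs the same lint gate as the
# workflow step and Makefile target, then propagates pylint's exit code,
# which is non-zero whenever the score falls below --fail-under.
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "-m", "pylint", "chefboost/", "--fail-under=10"],
    check=False,
)
sys.exit(result.returncode)
```

Note that `--fail-under=10` demands a perfect score, which is presumably why the `.pylintrc` below disables the checks the codebase deliberately does not satisfy.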
+jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.9 + +# Discover python modules and packages in the file system subtree. +recursive=no + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. If left empty, argument names will be checked with the set +# naming style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. If left empty, class names will be checked with the set naming style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. If left empty, function names will be checked with the set +# naming style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. 
+good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. If left empty, variable names will be checked with the set +# naming style. +#variable-rgx= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). 
+max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=BaseException, + Exception + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=100 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". 
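Besides the global `disable` list that follows, pylint also honors narrow inline pragmas, and this patch uses them in several modules (e.g. `# pylint: disable=broad-except` in `chefboost/commons/evaluate.py`). A hedged illustration with a hypothetical helper, showing how a pragma scopes the suppression to one function instead of widening the global list:

```python
# Hypothetical helper, for illustration only.
def safe_ratio(numerator: float, denominator: float) -> float:
    # pylint: disable=broad-except
    try:
        return numerator / denominator
    except Exception:  # tolerated here: the caller treats 0.0 as "undefined"
        return 0.0
```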
+disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + import-error, + invalid-name, + missing-module-docstring, + missing-function-docstring, + missing-class-docstring, + too-many-arguments, + too-many-locals, + too-many-branches, + too-many-statements, + global-variable-undefined, + import-outside-toplevel, + singleton-comparison, + too-many-lines, + duplicate-code, + bare-except, + cyclic-import, + global-statement, + no-member, + no-name-in-module, + unrecognized-option, + consider-using-dict-items, + consider-iterating-dictionary, + unexpected-keyword-arg + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[METHOD_ARGS] + +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX + +# Regular expression of note tags to take in consideration. +notes-rgx= + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +#output-format= + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. 
To make it work, +# install the 'python-enchant' package. +spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. 
+allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io \ No newline at end of file diff --git a/.vscode/settings.json new file mode 100644 index 0000000..38fd700 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,20 @@ +{ + "python.linting.pylintEnabled": true, + "python.linting.enabled": true, + "python.linting.pylintUseMinimalCheckers": false, + "editor.formatOnSave": true, + "editor.renderWhitespace": "all", + "files.autoSave": "afterDelay", + "python.analysis.typeCheckingMode": "basic", + "python.formatting.provider": "black", + "python.formatting.blackArgs": [ + "--line-length=100" + ], + "editor.fontWeight": "normal", + "python.analysis.extraPaths": [ + "./chefboost" + ], + "black-formatter.args": [ + "--line-length=100" + ] +} \ No newline at end of file diff --git a/Makefile new file mode 100644 index 0000000..ab7f41a --- /dev/null +++ b/Makefile @@ -0,0 +1,5 @@ +test: + cd tests && python global-unit-test.py + +lint: + python -m pylint chefboost/ --fail-under=10 \ No newline at end of file diff --git a/README.md index 0296244..8a8a569 100644 --- a/README.md +++ b/README.md @@ -187,9 +187,10 @@ config = {'algorithm': 'C4.5', 'enableParallelism': False} model = chef.fit(df, config) ``` -### Contributing +## Contribution [![Tests](https://github.com/serengil/chefboost/actions/workflows/tests.yml/badge.svg)](https://github.com/serengil/chefboost/actions/workflows/tests.yml) + +Pull requests are more than welcome! You should run the unit tests and linting locally by running the `make test` and `make lint` commands before creating a PR. Once a PR is created, the GitHub test workflow will run automatically, and unit test results will be available in [GitHub Actions](https://github.com/serengil/chefboost/actions) before approval. -Pull requests are welcome. You should run the unit tests locally by running [`test/global-unit-test.py`](https://github.com/serengil/chefboost/blob/master/tests/global-unit-test.py). Please share the unit test result logs in the PR. ### Support diff --git a/chefboost/Chefboost.py index f7798f7..f45c6ac 100644 --- a/chefboost/Chefboost.py +++ b/chefboost/Chefboost.py @@ -1,556 +1,601 @@ -import pandas as pd -import math -import numpy as np import time -import imp import pickle import os -from os import path import json +from typing import Optional, Dict, Any, Union -from chefboost.commons import functions, evaluate as eval -from chefboost.training import Preprocess, Training -from chefboost.tuning import gbm, adaboost, randomforest - -#------------------------ - -def fit(df, config = {}, target_label = 'Decision', validation_df = None): - - """ - Parameters: - df (pandas data frame): Training data frame.
The target column must be named as 'Decision' and it has to be in the last column - - config (dictionary): - - config = { - 'algorithm' (string): ID3, 'C4.5, CART, CHAID or Regression - 'enableParallelism' (boolean): False - - 'enableGBM' (boolean): True, - 'epochs' (int): 7, - 'learning_rate' (int): 1, - - 'enableRandomForest' (boolean): True, - 'num_of_trees' (int): 5, - - 'enableAdaboost' (boolean): True, - 'num_of_weak_classifier' (int): 4 - } - - validation_df (pandas data frame): if nothing is passed to validation data frame, then the function validates built trees for training data frame - - Returns: - chefboost model - - """ - - #------------------------ - - process_id = os.getpid() - - #------------------------ - #rename target column name - if target_label != 'Decision': - df = df.rename(columns = {target_label: 'Decision'}) - - #if target is not the last column - if df.columns[-1] != 'Decision': - if 'Decision' in df.columns: - new_column_order = df.columns.drop('Decision').tolist() + ['Decision'] - #print(new_column_order) - df = df[new_column_order] - else: - raise ValueError('Please set the target_label') - - #------------------------ - - base_df = df.copy() - - #------------------------ - - target_label = df.columns[len(df.columns)-1] - if target_label != 'Decision': - print("Expected: Decision, Existing: ",target_label) - raise ValueError('Please confirm that name of the target column is "Decision" and it is put to the right in pandas data frame') - - #------------------------ - #handle NaN values - - nan_values = [] - - for column in df.columns: - if df[column].dtypes != 'object': - min_value = df[column].min() - idx = df[df[column].isna()].index - - nan_value = [] - nan_value.append(column) - - if idx.shape[0] > 0: - df.loc[idx, column] = min_value - 1 - nan_value.append(min_value - 1) - min_value - 1 - #print("NaN values are replaced to ", min_value - 1, " in column ", column) - else: - nan_value.append(None) - - nan_values.append(nan_value) - - #------------------------ - - #initialize params and folders - config = functions.initializeParams(config) - functions.initializeFolders() - - #------------------------ - - algorithm = config['algorithm'] - - valid_algorithms = ['ID3', 'C4.5', 'CART', 'CHAID', 'Regression'] - - if algorithm not in valid_algorithms: - raise ValueError('Invalid algorithm passed. You passed ', algorithm," but valid algorithms are ",valid_algorithms) - - #------------------------ - - enableRandomForest = config['enableRandomForest'] - num_of_trees = config['num_of_trees'] - enableMultitasking = config['enableMultitasking'] #no longer used. check to remove this variable. - - enableGBM = config['enableGBM'] - epochs = config['epochs'] - learning_rate = config['learning_rate'] - - enableAdaboost = config['enableAdaboost'] - enableParallelism = config['enableParallelism'] - - #------------------------ - - if enableParallelism == True: - print("[INFO]: ",config["num_cores"],"CPU cores will be allocated in parallel running") - - from multiprocessing import set_start_method, freeze_support - set_start_method("spawn", force=True) - freeze_support() - #------------------------ - raw_df = df.copy() - num_of_rows = df.shape[0]; num_of_columns = df.shape[1] - - if algorithm == 'Regression': - if df['Decision'].dtypes == 'object': - raise ValueError('Regression trees cannot be applied for nominal target values! 
You can either change the algorithm or data set.') - - if df['Decision'].dtypes != 'object': #this must be regression tree even if it is not mentioned in algorithm - - if algorithm != 'Regression': - print("WARNING: You set the algorithm to ", algorithm," but the Decision column of your data set has non-object type.") - print("That's why, the algorithm is set to Regression to handle the data set.") - - algorithm = 'Regression' - config['algorithm'] = 'Regression' - global_stdev = df['Decision'].std(ddof=0) - - if enableGBM == True: - print("Gradient Boosting Machines...") - algorithm = 'Regression' - config['algorithm'] = 'Regression' - - if enableAdaboost == True: - #enableParallelism = False - for j in range(0, num_of_columns): - column_name = df.columns[j] - if df[column_name].dtypes == 'object': - raise ValueError('Adaboost must be run on numeric data set for both features and target') - - #------------------------- - - print(algorithm," tree is going to be built...") - - dataset_features = dict() #initialize a dictionary. this is going to be used to check features numeric or nominal. numeric features should be transformed to nominal values based on scales. - - header = "def findDecision(obj): #" - - num_of_columns = df.shape[1]-1 - for i in range(0, num_of_columns): - column_name = df.columns[i] - dataset_features[column_name] = df[column_name].dtypes - header = header + "obj[" + str(i) +"]: "+column_name - if i != num_of_columns - 1: - header = header + ", " - - header = header + "\n" - - #------------------------ - - begin = time.time() - - trees = []; alphas = [] - - if enableAdaboost == True: - trees, alphas = adaboost.apply(df, config, header, dataset_features, validation_df = validation_df, process_id = process_id) - - elif enableGBM == True: - - if df['Decision'].dtypes == 'object': #transform classification problem to regression - trees, alphas = gbm.classifier(df, config, header, dataset_features, validation_df = validation_df, process_id = process_id) - classification = True - - else: #regression - trees = gbm.regressor(df, config, header, dataset_features, validation_df = validation_df, process_id = process_id) - classification = False - - elif enableRandomForest == True: - trees = randomforest.apply(df, config, header, dataset_features, validation_df = validation_df, process_id = process_id) - else: #regular decision tree building - - root = 1; file = "outputs/rules/rules.py" - functions.createFile(file, header) - - if enableParallelism == True: - json_file = "outputs/rules/rules.json" - functions.createFile(json_file, "[\n") - - trees = Training.buildDecisionTree(df, root = root, file = file, config = config - , dataset_features = dataset_features - , parent_level = 0, leaf_id = 0, parents = 'root', validation_df = validation_df, main_process_id = process_id) - - print("-------------------------") - print("finished in ",time.time() - begin," seconds") - - obj = { - "trees": trees, - "alphas": alphas, - "config": config, - "nan_values": nan_values - } - - #----------------------------------------- - - #train set accuracy - df = base_df.copy() - evaluate(obj, df, task = 'train') - - #validation set accuracy - if isinstance(validation_df, pd.DataFrame): - evaluate(obj, validation_df, task = 'validation') - - #----------------------------------------- - - return obj - - #----------------------------------------- - -def predict(model, param): - - """ - Parameters: - model (built chefboost model): you should pass model argument to the return of fit function - param (list): pass 
input features as python list - - e.g. chef.predict(model, param = ['Sunny', 'Hot', 'High', 'Weak']) - Returns: - prediction - """ - - trees = model["trees"] - config = model["config"] - - alphas = [] - if "alphas" in model: - alphas = model["alphas"] - - nan_values = [] - if "nan_values" in model: - nan_values = model["nan_values"] - - #----------------------- - #handle missing values - - column_index = 0 - for column in nan_values: - column_name = column[0] - missing_value = column[1] - - if pd.isna(missing_value) != True: - #print("missing values will be replaced with ",missing_value," in ",column_name," column") - - if pd.isna(param[column_index]): - param[column_index] = missing_value - - column_index = column_index + 1 - - #print("instance: ", param) - #----------------------- - - enableGBM = config['enableGBM'] - adaboost = config['enableAdaboost'] - enableRandomForest = config['enableRandomForest'] - - #----------------------- - - classification = False - prediction = 0 - prediction_classes = [] - - #----------------------- - - if enableGBM == True: - - if len(trees) == config['epochs']: - classification = False - else: - classification = True - prediction_classes = [0 for i in alphas] - - #----------------------- - - if len(trees) > 1: #bagging or boosting - index = 0 - for tree in trees: - if adaboost != True: - - custom_prediction = tree.findDecision(param) +import numpy as np +import pandas as pd - if custom_prediction != None: - if type(custom_prediction) != str: #regression +from chefboost.commons import functions, evaluate as cb_eval +from chefboost.training import Training +from chefboost.tuning import gbm, adaboost as adaboost_clf, randomforest +from chefboost.commons.logger import Logger - if enableGBM == True and classification == True: - prediction_classes[index % len(alphas)] += custom_prediction - else: - prediction += custom_prediction - else: - classification = True - prediction_classes.append(custom_prediction) - else: #adaboost - prediction += alphas[index] * tree.findDecision(param) - index = index + 1 +# pylint: disable=too-many-nested-blocks, no-else-return, inconsistent-return-statements - if enableRandomForest == True: - #notice that gbm requires cumilative sum but random forest requires mean of each tree - prediction = prediction / len(trees) +logger = Logger(module="chefboost/Chefboost.py") - if adaboost == True: - prediction = functions.sign(prediction) - else: #regular decision tree - tree = trees[0] - prediction = tree.findDecision(param) +# ------------------------ - if classification == False: - return prediction - else: - if enableGBM == True and classification == True: - return alphas[np.argmax(prediction_classes)] - else: #classification - #e.g. random forest - #get predictions made by different trees - predictions = np.array(prediction_classes) - #find the most frequent prediction - (values, counts) = np.unique(predictions, return_counts=True) - idx = np.argmax(counts) - prediction = values[idx] +def fit( + df: pd.DataFrame, + config: Optional[dict] = None, + target_label: str = "Decision", + validation_df: Optional[pd.DataFrame] = None, +) -> Dict[str, Any]: + """ + Build (a) decision tree model(s) - return prediction + Args: + df (pandas data frame): Training data frame. -def save_model(base_model, file_name="model.pkl"): + config (dictionary): training configuration. e.g. 
- """ - Parameters: - base_model (built chefboost model): you should pass this to the return of fit function - file_name (string): you should pass target file name as exact path. - """ + config = { + 'algorithm' (string): ID3, 'C4.5, CART, CHAID or Regression + 'enableParallelism' (boolean): False - model = base_model.copy() + 'enableGBM' (boolean): True, + 'epochs' (int): 7, + 'learning_rate' (int): 1, - #modules cannot be saved. Save its reference instead. - module_names = [] - for tree in model["trees"]: - module_names.append(tree.__name__) + 'enableRandomForest' (boolean): True, + 'num_of_trees' (int): 5, - model["trees"] = module_names + 'enableAdaboost' (boolean): True, + 'num_of_weak_classifier' (int): 4 + } - f = open("outputs/rules/"+file_name, "wb") - pickle.dump(model,f) - f.close() + target_label (str): target label for supervised learning. + Default is Decision at the end of dataframe. -def load_model(file_name="model.pkl"): + validation_df (pandas data frame): validation data frame + if nothing is passed to validation data frame, then the function validates + built trees for training data frame - """ - Parameters: - file_name (string): exact path of the target saved model - Returns: - built chefboost model - """ + Returns: + chefboost model + """ - f = open('outputs/rules/'+file_name, 'rb') - model = pickle.load(f) + # ------------------------ - #restore modules from its references - modules = [] - for model_name in model["trees"]: - module = functions.restoreTree(model_name) - modules.append(module) + process_id = os.getpid() - model["trees"] = modules + # ------------------------ + # rename target column name + if target_label != "Decision": + # TODO: what if another column name is Decision? + df = df.rename(columns={target_label: "Decision"}) - return model + # if target is not the last column + if df.columns[-1] != "Decision": + if "Decision" in df.columns: + new_column_order = df.columns.drop("Decision").tolist() + ["Decision"] + logger.debug(new_column_order) + df = df[new_column_order] + else: + raise ValueError("Please set the target_label") -def restoreTree(moduleName): + # ------------------------ - """ - If you have decision rules, then this function enables you to load a built chefboost model. You can then call prediction. - Parameters: - moduleName (string): you should pass outputs/rules/rules if you want to restore outputs/rules/rules.py + base_df = df.copy() - Returns: - built chefboost model - """ + # ------------------------ - return functions.restoreTree(moduleName) + target_label = df.columns[len(df.columns) - 1] -def feature_importance(rules): + # ------------------------ + # handle NaN values - """ - Parameters: - rules (string or list): + nan_values = [] - e.g. decision_rules = "outputs/rules/rules.py" - or this could be retrieved from built model as shown below. 
+ for column in df.columns: + if df[column].dtypes != "object": + min_value = df[column].min() + idx = df[df[column].isna()].index - decision_rules = [] - for tree in model["trees"]: - rule = .__dict__["__spec__"].origin - decision_rules.append(rule) + nan_value = [] + nan_value.append(column) - Returns: - pandas data frame - """ + if idx.shape[0] > 0: + df.loc[idx, column] = min_value - 1 + nan_value.append(min_value - 1) + logger.debug("NaN values are replaced to {min_value - 1} in column {column}") + else: + nan_value.append(None) - if type(rules) != list: - rules = [rules] - else: - print("rules: ",rules) + nan_values.append(nan_value) - #----------------------------- + # ------------------------ - dfs = [] + # initialize params and folders + config = functions.initializeParams(config) + functions.initializeFolders() - for rule in rules: - print("Decision rule: ",rule) + # ------------------------ - file = open(rule, 'r') - lines = file.readlines() + algorithm = config["algorithm"] - pivot = {} - rules = [] + valid_algorithms = ["ID3", "C4.5", "CART", "CHAID", "Regression"] - #initialize feature importances - line_idx = 0 - for line in lines: - if line_idx == 0: - feature_explainer_list = line.split("#")[1].split(", ") - for feature_explainer in feature_explainer_list: - feature = feature_explainer.split(": ")[1].replace("\n", "") - pivot[feature] = 0 - else: - if "# " in line: - rule = line.strip().split("# ")[1] - rules.append(json.loads(rule)) + if algorithm not in valid_algorithms: + raise ValueError( + "Invalid algorithm passed. You passed ", + algorithm, + " but valid algorithms are ", + valid_algorithms, + ) - line_idx = line_idx + 1 + # ------------------------ - feature_names = list(pivot.keys()) + enableRandomForest = config["enableRandomForest"] + enableGBM = config["enableGBM"] + enableAdaboost = config["enableAdaboost"] + enableParallelism = config["enableParallelism"] - for feature in feature_names: - for rule in rules: - if rule["feature"] == feature: + # ------------------------ + if enableParallelism == True: + num_cores = config["num_cores"] + logger.info(f"[INFO]: {num_cores} CPU cores will be allocated in parallel running") - score = rule["metric_value"] * rule["instances"] - current_depth = rule["depth"] + from multiprocessing import set_start_method, freeze_support + + set_start_method("spawn", force=True) + freeze_support() + # ------------------------ + num_of_columns = df.shape[1] + + if algorithm == "Regression": + if df["Decision"].dtypes == "object": + raise ValueError( + "Regression trees cannot be applied for nominal target values!" + "You can either change the algorithm or data set." + ) + + if ( + df["Decision"].dtypes != "object" + ): # this must be regression tree even if it is not mentioned in algorithm + if algorithm != "Regression": + logger.warn( + f"You set the algorithm to {algorithm} but the Decision column of your" + " data set has non-object type." + "That's why, the algorithm is set to Regression to handle the data set." 
+ ) + + algorithm = "Regression" + config["algorithm"] = "Regression" + + if enableGBM == True: + logger.info("Gradient Boosting Machines...") + algorithm = "Regression" + config["algorithm"] = "Regression" + + if enableAdaboost == True: + # enableParallelism = False + for j in range(0, num_of_columns): + column_name = df.columns[j] + if df[column_name].dtypes == "object": + raise ValueError( + "Adaboost must be run on numeric data set for both features and target" + ) + + # ------------------------- + + logger.info(f"{algorithm} tree is going to be built...") + + # initialize a dictionary. this is going to be used to check features numeric or nominal. + # numeric features should be transformed to nominal values based on scales. + dataset_features = {} + + header = "def findDecision(obj): #" + + num_of_columns = df.shape[1] - 1 + for i in range(0, num_of_columns): + column_name = df.columns[i] + dataset_features[column_name] = df[column_name].dtypes + header += f"obj[{str(i)}]: {column_name}" + + if i != num_of_columns - 1: + header = header + ", " + + header = header + "\n" + + # ------------------------ + + begin = time.time() + + trees = [] + alphas = [] + + if enableAdaboost == True: + trees, alphas = adaboost_clf.apply( + df, config, header, dataset_features, validation_df=validation_df, process_id=process_id + ) + + elif enableGBM == True: + if df["Decision"].dtypes == "object": # transform classification problem to regression + trees, alphas = gbm.classifier( + df, + config, + header, + dataset_features, + validation_df=validation_df, + process_id=process_id, + ) + # classification = True + + else: # regression + trees = gbm.regressor( + df, + config, + header, + dataset_features, + validation_df=validation_df, + process_id=process_id, + ) + # classification = False + + elif enableRandomForest == True: + trees = randomforest.apply( + df, config, header, dataset_features, validation_df=validation_df, process_id=process_id + ) + else: # regular decision tree building + root = 1 + file = "outputs/rules/rules.py" + functions.createFile(file, header) + + if enableParallelism == True: + json_file = "outputs/rules/rules.json" + functions.createFile(json_file, "[\n") + + trees = Training.buildDecisionTree( + df, + root=root, + file=file, + config=config, + dataset_features=dataset_features, + parent_level=0, + leaf_id=0, + parents="root", + validation_df=validation_df, + main_process_id=process_id, + ) + + logger.info("-------------------------") + logger.info(f"finished in {time.time() - begin} seconds") + + obj = {"trees": trees, "alphas": alphas, "config": config, "nan_values": nan_values} + + # ----------------------------------------- + + # train set accuracy + df = base_df.copy() + evaluate(obj, df, task="train") + + # validation set accuracy + if isinstance(validation_df, pd.DataFrame): + evaluate(obj, validation_df, task="validation") + + # ----------------------------------------- + + return obj + + # ----------------------------------------- + + +def predict(model: dict, param: list) -> Union[str, int, float]: + """ + Predict the target label of given features from a pre-trained model + Args: + model (built chefboost model): pre-trained model which is the output + of fit function + param (list): pass input features as python list + e.g. 
chef.predict(model, param = ['Sunny', 'Hot', 'High', 'Weak']) + Returns: + prediction + """ + + trees = model["trees"] + config = model["config"] + + alphas = [] + if "alphas" in model: + alphas = model["alphas"] + + nan_values = [] + if "nan_values" in model: + nan_values = model["nan_values"] + + # ----------------------- + # handle missing values + + column_index = 0 + for column in nan_values: + column_name = column[0] + missing_value = column[1] + + if pd.isna(missing_value) != True: + logger.debug( + f"missing values will be replaced with {missing_value} in {column_name} column" + ) + + if pd.isna(param[column_index]): + param[column_index] = missing_value + + column_index = column_index + 1 + + logger.debug(f"instance: {param}") + # ----------------------- + + enableGBM = config["enableGBM"] + adaboost = config["enableAdaboost"] + enableRandomForest = config["enableRandomForest"] + + # ----------------------- + + classification = False + prediction = 0 + prediction_classes = [] + + # ----------------------- + + if enableGBM == True: + if len(trees) == config["epochs"]: + classification = False + else: + classification = True + prediction_classes = [0 for i in alphas] + + # ----------------------- + + if len(trees) > 1: # bagging or boosting + index = 0 + for tree in trees: + if adaboost != True: + custom_prediction = tree.findDecision(param) + + if custom_prediction != None: + if not isinstance(custom_prediction, str): # regression + if enableGBM == True and classification == True: + prediction_classes[index % len(alphas)] += custom_prediction + else: + prediction += custom_prediction + else: + classification = True + prediction_classes.append(custom_prediction) + else: # adaboost + prediction += alphas[index] * tree.findDecision(param) + index = index + 1 + + if enableRandomForest == True: + # notice that gbm requires cumulative sum but random forest requires mean of each tree + prediction = prediction / len(trees) + + if adaboost == True: + prediction = functions.sign(prediction) + else: # regular decision tree + tree = trees[0] + prediction = tree.findDecision(param) + + if classification == False: + return prediction + else: + if enableGBM == True and classification == True: + return alphas[np.argmax(prediction_classes)] + else: # classification + # e.g. random forest + # get predictions made by different trees + predictions = np.array(prediction_classes) + + # find the most frequent prediction + (values, counts) = np.unique(predictions, return_counts=True) + idx = np.argmax(counts) + prediction = values[idx] + + return prediction + + +def save_model(base_model: dict, file_name: str = "model.pkl") -> None: + """ + Save the pre-trained model to the file system + Args: + base_model (dict): pre-trained model which is the output + of the fit function + file_name (string): target file name as exact path. + """ + + model = base_model.copy() + + # modules cannot be saved. Save its reference instead.
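Since only module names survive pickling here, a saved model is only loadable where the generated rule modules still exist under `outputs/rules/`. A minimal round-trip sketch; the CSV path and feature values are placeholders, while the config mirrors the README example above:

```python
import pandas as pd
from chefboost import Chefboost as chef

# placeholder path; any frame whose last column is the 'Decision' target works
df = pd.read_csv("dataset/golf.txt")

model = chef.fit(df, config={"algorithm": "C4.5", "enableParallelism": False})
chef.save_model(model, "model.pkl")      # pickles the model with tree module names

restored = chef.load_model("model.pkl")  # re-imports the rule modules by name
prediction = chef.predict(restored, param=["Sunny", "Hot", "High", "Weak"])
```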
+ module_names = [] + for tree in model["trees"]: + module_names.append(tree.__name__) + + model["trees"] = module_names + + with open(f"outputs/rules/{file_name}", "wb") as f: + pickle.dump(model, f) + + +def load_model(file_name: str = "model.pkl") -> dict: + """ + Load a saved pre-trained model from the file system + Args: + file_name (str): exact path of the target saved model + Returns: + built model (dict) + """ + + with open("outputs/rules/" + file_name, "rb") as f: + model = pickle.load(f) + + # restore modules from its references + modules = [] + for model_name in model["trees"]: + module = functions.restoreTree(model_name) + modules.append(module) + + model["trees"] = modules + + return model + + +def restoreTree(moduleName) -> dict: + """ + Load a built model from a set of decision rules + Args: + moduleName (str): e.g. outputs/rules/rules to restore outputs/rules/rules.py + Returns: + built model (dict) + """ + + return functions.restoreTree(moduleName) + + +def feature_importance(rules: Union[str, list]) -> pd.DataFrame: + """ + Show the feature importance values of a built model + Args: + rules (str or list): e.g. decision_rules = "outputs/rules/rules.py" + or this could be retrieved from a built model as shown below. + + decision_rules = [] + for tree in model["trees"]: + rule = tree.__dict__["__spec__"].origin + decision_rules.append(rule) + Returns: + feature importance (pd.DataFrame) + """ + + if not isinstance(rules, list): + rules = [rules] + logger.info(f"rules: {rules}") + + # ----------------------------- + + dfs = [] + + for rule in rules: + logger.info(f"Decision rule: {rule}") + + with open(rule, "r", encoding="UTF-8") as file: + lines = file.readlines() + + pivot = {} + rules = [] + + # initialize feature importances + line_idx = 0 + for line in lines: + if line_idx == 0: + feature_explainer_list = line.split("#")[1].split(", ") + for feature_explainer in feature_explainer_list: + feature = feature_explainer.split(": ")[1].replace("\n", "") + pivot[feature] = 0 + else: + if "# " in line: + rule = line.strip().split("# ")[1] + rules.append(json.loads(rule)) - child_scores = 0 - #find child node importances - for child_rule in rules: - if child_rule["depth"] == current_depth + 1: - child_score = child_rule["metric_value"] * child_rule["instances"] - child_scores = child_scores + child_score + line_idx = line_idx + 1 - score = score - child_scores + feature_names = list(pivot.keys()) - pivot[feature] = pivot[feature] + score + for feature in feature_names: + for rule in rules: + if rule["feature"] == feature: + score = rule["metric_value"] * rule["instances"] + current_depth = rule["depth"] - #normalize feature importance + child_scores = 0 + # find child node importances + for child_rule in rules: + if child_rule["depth"] == current_depth + 1: + child_score = child_rule["metric_value"] * child_rule["instances"] - total_score = 0 - for feature, score in pivot.items(): - total_score = total_score + score + child_scores = child_scores + child_score - for feature, score in pivot.items(): - pivot[feature] = round(pivot[feature] / total_score, 4) + score = score - child_scores - instances = [] - for feature, score in pivot.items(): - instance = [] - instance.append(feature) - instance.append(score) - instances.append(instance) + pivot[feature] = pivot[feature] + score - df = pd.DataFrame(instances, columns = ["feature", "final_importance"]) - df = 
df.sort_values(by = ["final_importance"], ascending = False) + for feature, score in pivot.items(): + pivot[feature] = round(pivot[feature] / total_score, 4) - if len(rules) == 1: - return df - else: - dfs.append(df) + instances = [] + for feature, score in pivot.items(): + instance = [] + instance.append(feature) + instance.append(score) + instances.append(instance) - if len(rules) > 1: + df = pd.DataFrame(instances, columns=["feature", "final_importance"]) + df = df.sort_values(by=["final_importance"], ascending=False) - hf = pd.DataFrame(feature_names, columns = ["feature"]) - hf["importance"] = 0 + if len(rules) == 1: + return df + else: + dfs.append(df) - for df in dfs: - hf = hf.merge(df, on = ["feature"], how = "left") - hf["importance"] = hf["importance"] + hf["final_importance"] - hf = hf.drop(columns = ["final_importance"]) + if len(rules) > 1: + hf = pd.DataFrame(feature_names, columns=["feature"]) + hf["importance"] = 0 - #------------------------ - #normalize - hf["importance"] = hf["importance"] / hf["importance"].sum() - hf = hf.sort_values(by = ["importance"], ascending = False) + for df in dfs: + hf = hf.merge(df, on=["feature"], how="left") + hf["importance"] = hf["importance"] + hf["final_importance"] + hf = hf.drop(columns=["final_importance"]) - return hf + # ------------------------ + # normalize + hf["importance"] = hf["importance"] / hf["importance"].sum() + hf = hf.sort_values(by=["importance"], ascending=False) -def evaluate(model, df, target_label = 'Decision', task = 'test'): + return hf - """ - Parameters: - model (built chefboost model): you should pass the return of fit function - df (pandas data frame): data frame you would like to evaluate - task (string): optionally you can pass this train, validation or test - """ - #-------------------------- +def evaluate( + model: dict, df: pd.DataFrame, target_label: str = "Decision", task: str = "test" +) -> None: + """ + Evaluate the performance of a built model on a data set + Args: + model (dict): built model which is the output of fit function + df (pandas data frame): data frame you would like to evaluate + target_label (str): target label + task (string): set this to train, validation or test + Returns: + None + """ - if target_label != 'Decision': - df = df.rename(columns = {target_label: 'Decision'}) + # -------------------------- - #if target is not the last column - if df.columns[-1] != 'Decision': - new_column_order = df.columns.drop('Decision').tolist() + ['Decision'] - print(new_column_order) - df = df[new_column_order] + if target_label != "Decision": + df = df.rename(columns={target_label: "Decision"}) - #-------------------------- + # if target is not the last column + if df.columns[-1] != "Decision": + new_column_order = df.columns.drop("Decision").tolist() + ["Decision"] + logger.debug(new_column_order) + df = df[new_column_order] + + # -------------------------- - functions.bulk_prediction(df, model) + functions.bulk_prediction(df, model) - enableAdaboost = model["config"]["enableAdaboost"] + enableAdaboost = model["config"]["enableAdaboost"] - if enableAdaboost == True: - df['Decision'] = df['Decision'].astype(str) - df['Prediction'] = df['Prediction'].astype(str) + if enableAdaboost == True: + df["Decision"] = df["Decision"].astype(str) + df["Prediction"] = df["Prediction"].astype(str) - eval.evaluate(df, task = task) + cb_eval.evaluate(df, task=task) diff --git a/chefboost/commons/evaluate.py b/chefboost/commons/evaluate.py index 62a4061..44eba39 100644 --- a/chefboost/commons/evaluate.py +++ 
diff --git a/chefboost/commons/evaluate.py b/chefboost/commons/evaluate.py
index 62a4061..44eba39 100644
--- a/chefboost/commons/evaluate.py
+++ b/chefboost/commons/evaluate.py
@@ -1,121 +1,133 @@
 import math
+from chefboost.commons.logger import Logger
-def evaluate(df, task = 'train'):
-
-    if df['Decision'].dtypes == 'object':
-        problem_type = 'classification'
-    else:
-        problem_type = 'regression'
-
-    #-------------------------------------
-
-    instances = df.shape[0]
-
-    print("-------------------------")
-    print("Evaluate ",task,"set")
-    print("-------------------------")
-
-    if problem_type == 'classification':
-
-        idx = df[df['Prediction'] == df['Decision']].index
-        accuracy = 100*len(idx)/df.shape[0]
-        print("Accuracy: ", accuracy,"% on ",instances," instances")
-
-        #-----------------------------
-
-        predictions = df.Prediction.values
-        actuals = df.Decision.values
-
-        #-----------------------------
-        #confusion matrix
-
-        #labels = df.Prediction.unique()
-        labels = df.Decision.unique()
-
-        confusion_matrix = []
-        for prediction_label in labels:
-            confusion_row = []
-            for actual_label in labels:
-                item = len(df[(df['Prediction'] == prediction_label)
-                    & (df['Decision'] == actual_label)]['Decision'].values)
-                confusion_row.append(item)
-            confusion_matrix.append(confusion_row)
-
-        print("Labels: ", labels)
-        print("Confusion matrix: ",confusion_matrix)
-
-        #-----------------------------
-        #precision and recall
-
-        for decision_class in labels:
-
-            fp = 0; fn = 0; tp = 0; tn = 0
-            for i in range(0, len(predictions)):
-                prediction = predictions[i]
-                actual = actuals[i]
-
-                if actual == decision_class and prediction == decision_class:
-                    tp = tp + 1
-                elif actual != decision_class and prediction != decision_class:
-                    tn = tn + 1
-                elif actual != decision_class and prediction == decision_class:
-                    fp = fp + 1
-                elif actual == decision_class and prediction != decision_class:
-                    fn = fn + 1
-
-            epsilon = 0.0000001 #to avoid divison by zero exception
-            precision = round(100*tp / (tp + fp + epsilon), 4)
-            recall = round(100*tp / (tp + fn + epsilon), 4) #tpr
-            f1_score = round((2 * precision * recall) / (precision + recall + epsilon), 4)
-            accuracy = round(100 * (tp + tn) / (tp + tn + fp + fn + epsilon), 4)
-
-            if len(labels) >= 3:
-                print("Decision ", decision_class, " => ",end = '')
-                print("Accuray: ", accuracy,"%, ", end = '')
-
-            print("Precision: ", precision,"%, Recall: ", recall,"%, F1: ", f1_score,"%")
-            #print("TP: ",tp,", TN: ",tn,", FP: ", fp,", FN: ",fn)
-
-            if len(labels) < 3:
-                break
-
-    #-------------------------------------
-    else:
-
-        df['Absolute_Error'] = abs(df['Prediction'] - df['Decision'])
-        df['Absolute_Error_Squared'] = df['Absolute_Error'] * df['Absolute_Error']
-        df['Decision_Squared'] = df['Decision'] * df['Decision']
-        df['Decision_Mean'] = df['Decision'].mean()
-
-        #print(df)
-
-        if instances > 0:
-
-            mae = df['Absolute_Error'].sum()/instances
-            print("MAE: ",mae)
-
-            mse = df['Absolute_Error_Squared'].sum()/instances
-            print("MSE: ", mse)
-
-            rmse = math.sqrt(mse)
-            print("RMSE: ",rmse)
-
-            rae = 0; rrse = 0
-            try: #divisor might be equal to 0.
-
-                rae = math.sqrt(df['Absolute_Error_Squared'].sum())/math.sqrt(df['Decision_Squared'].sum())
-
-                rrse = math.sqrt((df['Absolute_Error_Squared'].sum()) / ((df['Decision_Mean'] - df['Decision']) ** 2).sum())
-
-            except Exception as err:
-                print(str(err))
-
-            print("RAE: ", rae)
-            print("RRSE: ",rrse)
-
-            mean = df['Decision'].mean()
-            print("Mean: ", mean)
-
-            if mean > 0:
-                print("MAE / Mean: ",100*mae/mean,"%")
-                print("RMSE / Mean: ",100*rmse/mean,"%")
\ No newline at end of file
+# pylint: disable=broad-except
+
+logger = Logger(module="chefboost/commons/evaluate.py")
+
+
+def evaluate(df, task="train"):
+    if df["Decision"].dtypes == "object":
+        problem_type = "classification"
+    else:
+        problem_type = "regression"
+
+    # -------------------------------------
+
+    instances = df.shape[0]
+
+    logger.info("-------------------------")
+    logger.info(f"Evaluate {task} set")
+    logger.info("-------------------------")
+
+    if problem_type == "classification":
+        idx = df[df["Prediction"] == df["Decision"]].index
+        accuracy = 100 * len(idx) / df.shape[0]
+        logger.info(f"Accuracy: {accuracy}% on {instances} instances")
+
+        # -----------------------------
+
+        predictions = df.Prediction.values
+        actuals = df.Decision.values
+
+        # -----------------------------
+        # confusion matrix
+
+        # labels = df.Prediction.unique()
+        labels = df.Decision.unique()
+
+        confusion_matrix = []
+        for prediction_label in labels:
+            confusion_row = []
+            for actual_label in labels:
+                item = len(
+                    df[(df["Prediction"] == prediction_label) & (df["Decision"] == actual_label)][
+                        "Decision"
+                    ].values
+                )
+                confusion_row.append(item)
+            confusion_matrix.append(confusion_row)
+
+        logger.info(f"Labels: {labels}")
+        logger.info(f"Confusion matrix: {confusion_matrix}")
+
+        # -----------------------------
+        # precision and recall
+
+        for decision_class in labels:
+            fp = 0
+            fn = 0
+            tp = 0
+            tn = 0
+            for i, prediction in enumerate(predictions):
+                actual = actuals[i]
+
+                if actual == decision_class and prediction == decision_class:
+                    tp = tp + 1
+                # pylint: disable=consider-using-in
+                elif actual != decision_class and prediction != decision_class:
+                    tn = tn + 1
+                elif actual != decision_class and prediction == decision_class:
+                    fp = fp + 1
+                elif actual == decision_class and prediction != decision_class:
+                    fn = fn + 1
+
+            epsilon = 0.0000001  # to avoid division by zero exception
+            precision = round(100 * tp / (tp + fp + epsilon), 4)
+            recall = round(100 * tp / (tp + fn + epsilon), 4)  # tpr
+            f1_score = round((2 * precision * recall) / (precision + recall + epsilon), 4)
+            accuracy = round(100 * (tp + tn) / (tp + tn + fp + fn + epsilon), 4)
+
+            if len(labels) >= 3:
+                logger.info(f"Decision {decision_class}")
+                logger.info(f"Accuracy: {accuracy}%")
+
+            logger.info(f"Precision: {precision}%, Recall: {recall}%, F1: {f1_score}%")
+            logger.debug(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
+
+            if len(labels) < 3:
+                break
+
+    # -------------------------------------
+    else:
+        df["Absolute_Error"] = abs(df["Prediction"] - df["Decision"])
+        df["Absolute_Error_Squared"] = df["Absolute_Error"] * df["Absolute_Error"]
+        df["Decision_Squared"] = df["Decision"] * df["Decision"]
+        df["Decision_Mean"] = df["Decision"].mean()
+
+        logger.debug(df)
+
+        if instances > 0:
+            mae = df["Absolute_Error"].sum() / instances
+            logger.info(f"MAE: {mae}")
+
+            mse = df["Absolute_Error_Squared"].sum() / instances
+            logger.info(f"MSE: {mse}")
+
+            rmse = math.sqrt(mse)
+            logger.info(f"RMSE: {rmse}")
+
+            rae = 0
+            rrse = 0
+            try:  # divisor might be equal to 0.
+                rae = math.sqrt(df["Absolute_Error_Squared"].sum()) / math.sqrt(
+                    df["Decision_Squared"].sum()
+                )
+
+                rrse = math.sqrt(
+                    (df["Absolute_Error_Squared"].sum())
+                    / ((df["Decision_Mean"] - df["Decision"]) ** 2).sum()
+                )
+
+            except Exception as err:
+                logger.error(str(err))
+
+            logger.info(f"RAE: {rae}")
+            logger.info(f"RRSE: {rrse}")
+
+            mean = df["Decision"].mean()
+            logger.info(f"Mean: {mean}")
+
+            if mean > 0:
+                logger.info(f"MAE / Mean: {100 * mae / mean}%")
+                logger.info(f"RMSE / Mean: {100 * rmse / mean}%")
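# To sanity-check the regression metrics above on toy numbers, the same formulas can be
# traced by hand; the data here is illustrative only:

import math
import pandas as pd

df = pd.DataFrame({"Decision": [10.0, 12.0, 14.0], "Prediction": [11.0, 12.0, 13.0]})
df["Absolute_Error"] = abs(df["Prediction"] - df["Decision"])
df["Absolute_Error_Squared"] = df["Absolute_Error"] * df["Absolute_Error"]
df["Decision_Squared"] = df["Decision"] * df["Decision"]
df["Decision_Mean"] = df["Decision"].mean()

instances = df.shape[0]
mae = df["Absolute_Error"].sum() / instances  # (1 + 0 + 1) / 3 = 0.667
mse = df["Absolute_Error_Squared"].sum() / instances  # (1 + 0 + 1) / 3 = 0.667
rmse = math.sqrt(mse)  # 0.816
# error norm over target norm: sqrt(2) / sqrt(100 + 144 + 196) = 0.067
rae = math.sqrt(df["Absolute_Error_Squared"].sum()) / math.sqrt(df["Decision_Squared"].sum())
# error norm over deviation-from-mean norm: sqrt(2 / 8) = 0.5
rrse = math.sqrt(df["Absolute_Error_Squared"].sum() / ((df["Decision_Mean"] - df["Decision"]) ** 2).sum())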
diff --git a/chefboost/commons/functions.py b/chefboost/commons/functions.py
index f4d5b0a..3df4138 100644
--- a/chefboost/commons/functions.py
+++ b/chefboost/commons/functions.py
@@ -1,133 +1,164 @@
-import numpy as np
 import pathlib
-import imp
+import imp  # pylint: disable=deprecated-module
 import os
 from os import path
 import multiprocessing
+from typing import Optional
+import numpy as np
 from chefboost import Chefboost as cb
+from chefboost.commons.logger import Logger
+
+# pylint: disable=no-else-return, broad-except
+
+logger = Logger(module="chefboost/commons/functions.py")
+
 def bulk_prediction(df, model):
+    predictions = []
+    for _, instance in df.iterrows():
+        features = instance.values[0:-1]
+        prediction = cb.predict(model, features)
+        predictions.append(prediction)
-    predictions = []
-    for index, instance in df.iterrows():
-        features = instance.values[0:-1]
-        prediction = cb.predict(model, features)
-        predictions.append(prediction)
+    df["Prediction"] = predictions
-    df['Prediction'] = predictions
 def restoreTree(moduleName):
-    fp, pathname, description = imp.find_module(moduleName)
-    return imp.load_module(moduleName, fp, pathname, description)
+    fp, pathname, description = imp.find_module(moduleName)
+    return imp.load_module(moduleName, fp, pathname, description)
+
 def softmax(w):
-    e = np.exp(np.array(w, dtype=np.float32))
-    dist = e / np.sum(e)
-    return dist
+    e = np.exp(np.array(w, dtype=np.float32))
+    dist = e / np.sum(e)
+    return dist
+
 def sign(x):
-    if x > 0:
-        return 1
-    elif x < 0:
-        return -1
-    else:
-        return 0
+    if x > 0:
+        return 1
+    elif x < 0:
+        return -1
+    else:
+        return 0
+
 def formatRule(root):
-    resp = ''
+    resp = ""
+
+    for _ in range(0, root):
+        resp = resp + "   "
+
+    return resp
+
-    for i in range(0, root):
-        resp = resp + '   '
+def storeRule(file, content):
+    with open(file, "a+", encoding="UTF-8") as f:
+        f.writelines(content)
+        f.writelines("\n")
-    return resp
-def storeRule(file,content):
-    f = open(file, "a+")
-    f.writelines(content)
-    f.writelines("\n")
+def createFile(file, content):
+    with open(file, "w", encoding="UTF-8") as f:
+        f.write(content)
-def createFile(file,content):
-    f = open(file, "w")
-    f.write(content)
 def initializeFolders():
-    import sys
-    sys.path.append("..")
-    pathlib.Path("outputs").mkdir(parents=True, exist_ok=True)
-    pathlib.Path("outputs/data").mkdir(parents=True, exist_ok=True)
-    pathlib.Path("outputs/rules").mkdir(parents=True, exist_ok=True)
-
-    #-----------------------------------
-
-    #clear existing rules in outputs/
-
-    outputs_path = os.getcwd()+os.path.sep+"outputs"+os.path.sep
-
-    try:
-        if path.exists(outputs_path+"data"):
-            for file in os.listdir(outputs_path+"data"):
-                os.remove(outputs_path+"data"+os.path.sep+file)
-
-        if path.exists(outputs_path+"rules"):
-            for file in os.listdir(outputs_path+"rules"):
-                if ".py" in file or ".json" in file or ".txt" in file or ".pkl" in file or ".csv" in file:
-                    os.remove(outputs_path+"rules"+os.path.sep+file)
-    except Exception as err:
-        print("WARNING: ", str(err))
-
-    #------------------------------------
-
-def initializeParams(config):
-    algorithm = 'ID3'
-    enableRandomForest = False; num_of_trees = 5; enableMultitasking = False
-    enableGBM = False; epochs = 10; learning_rate = 1; max_depth = 5
-    enableAdaboost = False; num_of_weak_classifier = 4
-    enableParallelism = True
-    num_cores = int(multiprocessing.cpu_count()/2) #allocate half of your total cores
-    #num_cores = int((3*multiprocessing.cpu_count())/4) #allocate 3/4 of your total cores
-    #num_cores = multiprocessing.cpu_count()
-
-    for key, value in config.items():
-        if key == 'algorithm':
-            algorithm = value
-        #---------------------------------
-        elif key == 'enableRandomForest':
-            enableRandomForest = value
-        elif key == 'num_of_trees':
-            num_of_trees = value
-        elif key == 'enableMultitasking':
-            enableMultitasking = value
-        #---------------------------------
-        elif key == 'enableGBM':
-            enableGBM = value
-        elif key == 'epochs':
-            epochs = value
-        elif key == 'learning_rate':
-            learning_rate = value
-        elif key == 'max_depth':
-            max_depth = value
-        #---------------------------------
-        elif key == 'enableAdaboost':
-            enableAdaboost = value
-        elif key == 'num_of_weak_classifier':
-            num_of_weak_classifier = value
-        #---------------------------------
-        elif key == 'enableParallelism':
-            enableParallelism = value
-        elif key == 'num_cores':
-            num_cores = value
-
-    config['algorithm'] = algorithm
-    config['enableRandomForest'] = enableRandomForest
-    config['num_of_trees'] = num_of_trees
-    config['enableMultitasking'] = enableMultitasking
-    config['enableGBM'] = enableGBM
-    config['epochs'] = epochs
-    config['learning_rate'] = learning_rate
-    config['max_depth'] = max_depth
-    config['enableAdaboost'] = enableAdaboost
-    config['num_of_weak_classifier'] = num_of_weak_classifier
-    config['enableParallelism'] = enableParallelism
-    config['num_cores'] = num_cores
-
-    return config
+    import sys
+
+    sys.path.append("..")
+    pathlib.Path("outputs").mkdir(parents=True, exist_ok=True)
+    pathlib.Path("outputs/data").mkdir(parents=True, exist_ok=True)
+    pathlib.Path("outputs/rules").mkdir(parents=True, exist_ok=True)
+
+    # -----------------------------------
+
+    # clear existing rules in outputs/
+
+    outputs_path = os.getcwd() + os.path.sep + "outputs" + os.path.sep
+
+    try:
+        if path.exists(outputs_path + "data"):
+            for file in os.listdir(outputs_path + "data"):
+                os.remove(outputs_path + "data" + os.path.sep + file)
+
+        if path.exists(outputs_path + "rules"):
+            for file in os.listdir(outputs_path + "rules"):
+                if (
+                    ".py" in file
+                    or ".json" in file
+                    or ".txt" in file
+                    or ".pkl" in file
+                    or ".csv" in file
+                ):
+                    os.remove(outputs_path + "rules" + os.path.sep + file)
+    except Exception as err:
+        logger.warn(str(err))
+
+    # ------------------------------------
+
+
+def initializeParams(config: Optional[dict] = None):
+
+    if config is None:
+        config = {}
+
+    algorithm = "ID3"
+    enableRandomForest = False
+    num_of_trees = 5
+    enableMultitasking = False
+    enableGBM = False
+    epochs = 10
+    learning_rate = 1
+    max_depth = 5
+    enableAdaboost = False
+    num_of_weak_classifier = 4
+    enableParallelism = True
+    num_cores = int(multiprocessing.cpu_count() / 2)  # allocate half of your total cores
+    # num_cores = int((3*multiprocessing.cpu_count())/4) #allocate 3/4 of your total cores
+    # num_cores = multiprocessing.cpu_count()
+
+    for key, value in config.items():
+        if key == "algorithm":
+            algorithm = value
+        # ---------------------------------
+        elif key == "enableRandomForest":
+            enableRandomForest = value
+        elif key == "num_of_trees":
+            num_of_trees = value
+        elif key == "enableMultitasking":
+            enableMultitasking = value
+        # ---------------------------------
+        elif key == "enableGBM":
+            enableGBM = value
+        elif key == "epochs":
+            epochs = value
+        elif key == "learning_rate":
+            learning_rate = value
+        elif key == "max_depth":
+            max_depth = value
+        # ---------------------------------
+        elif key == "enableAdaboost":
+            enableAdaboost = value
+        elif key == "num_of_weak_classifier":
+            num_of_weak_classifier = value
+        # ---------------------------------
+        elif key == "enableParallelism":
+            enableParallelism = value
+        elif key == "num_cores":
+            num_cores = value
+
+    config["algorithm"] = algorithm
+    config["enableRandomForest"] = enableRandomForest
+    config["num_of_trees"] = num_of_trees
+    config["enableMultitasking"] = enableMultitasking
+    config["enableGBM"] = enableGBM
+    config["epochs"] = epochs
+    config["learning_rate"] = learning_rate
+    config["max_depth"] = max_depth
+    config["enableAdaboost"] = enableAdaboost
+    config["num_of_weak_classifier"] = num_of_weak_classifier
+    config["enableParallelism"] = enableParallelism
+    config["num_cores"] = num_cores
+
+    return config
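# For reference, a sketch of a config dict covering the keys initializeParams recognizes;
# any key left out falls back to the defaults above (the values here are illustrative,
# not recommendations):

from chefboost.commons import functions

config = {
    "algorithm": "C4.5",  # ID3, C4.5, CART, CHAID or Regression
    "enableParallelism": True,
    "num_cores": 2,
    "enableRandomForest": False,
    "num_of_trees": 5,
    "enableMultitasking": False,
    "enableGBM": False,
    "epochs": 10,
    "learning_rate": 1,
    "max_depth": 5,
    "enableAdaboost": False,
    "num_of_weak_classifier": 4,
}
config = functions.initializeParams(config)  # missing keys are filled with defaults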
diff --git a/chefboost/commons/logger.py b/chefboost/commons/logger.py
new file mode 100644
index 0000000..4a8dc6d
--- /dev/null
+++ b/chefboost/commons/logger.py
@@ -0,0 +1,40 @@
+import os
+import logging
+from datetime import datetime
+
+# pylint: disable=broad-except
+class Logger:
+    def __init__(self, module=None):
+        self.module = module
+        log_level = os.environ.get("CHEFBOOST_LOG_LEVEL", str(logging.INFO))
+        try:
+            self.log_level = int(log_level)
+        except Exception as err:
+            self.dump_log(
+                f"Exception while parsing $CHEFBOOST_LOG_LEVEL. "
+                f"Expected int but it is {log_level} ({str(err)})"
+            )
+            self.log_level = logging.INFO
+
+    def info(self, message):
+        if self.log_level <= logging.INFO:
+            self.dump_log(f"{message}")
+
+    def debug(self, message):
+        if self.log_level <= logging.DEBUG:
+            self.dump_log(f"🕷️ {message}")
+
+    def warn(self, message):
+        if self.log_level <= logging.WARNING:
+            self.dump_log(f"⚠️ {message}")
+
+    def error(self, message):
+        if self.log_level <= logging.ERROR:
+            self.dump_log(f"🔴 {message}")
+
+    def critical(self, message):
+        if self.log_level <= logging.CRITICAL:
+            self.dump_log(f"💥 {message}")
+
+    def dump_log(self, message):
+        print(f"{str(datetime.now())[2:-7]} - {message}")
diff --git a/chefboost/training/Preprocess.py b/chefboost/training/Preprocess.py
index 7900ae3..127efa9 100644
--- a/chefboost/training/Preprocess.py
+++ b/chefboost/training/Preprocess.py
@@ -1,132 +1,174 @@
+import math
 import numpy as np
+from chefboost.training import Training
+from chefboost.commons.logger import Logger
-import math
+logger = Logger(module="chefboost/training/Preprocess.py")
-from chefboost.training import Training
-#from training import Training
 def processContinuousFeatures(algorithm, df, column_name, entropy, config):
-
-    #if True:
-    if df[column_name].nunique() <= 20:
-        unique_values = sorted(df[column_name].unique())
-    else:
-        unique_values = []
-
-        df_mean = df[column_name].mean()
-        df_std = df[column_name].std(ddof=0)
-        df_min = df[column_name].min()
-        df_max = df[column_name].max()
-
-        unique_values.append(df[column_name].min())
-        unique_values.append(df[column_name].max())
-        unique_values.append(df[column_name].mean())
-
-        scales = list(range(-3,+4, 1))
-        for scale in scales:
-            if df_mean + scale * df_std > df_min and df_mean + scale * df_std < df_max:
-                unique_values.append(df_mean + 
scale * df_std) - - unique_values.sort() - - #print(column_name,"->",unique_values) - - subset_gainratios = []; subset_gains = []; subset_ginis = []; subset_red_stdevs = []; subset_chi_squares = [] - - if len(unique_values) == 1: - winner_threshold = unique_values[0] - df[column_name] = np.where(df[column_name] <= winner_threshold, "<="+str(winner_threshold), ">"+str(winner_threshold)) - return df - - for i in range(0, len(unique_values)-1): - threshold = unique_values[i] - - subset1 = df[df[column_name] <= threshold] - subset2 = df[df[column_name] > threshold] - - subset1_rows = subset1.shape[0]; subset2_rows = subset2.shape[0] - total_instances = df.shape[0] #subset1_rows+subset2_rows - - subset1_probability = subset1_rows / total_instances - subset2_probability = subset2_rows / total_instances - - if algorithm == 'ID3' or algorithm == 'C4.5': - threshold_gain = entropy - subset1_probability*Training.calculateEntropy(subset1, config) - subset2_probability*Training.calculateEntropy(subset2, config) - subset_gains.append(threshold_gain) - - if algorithm == 'C4.5': #C4.5 also need gain in the block above. That's why, instead of else if we used direct if condition here - - threshold_splitinfo = -subset1_probability * math.log(subset1_probability, 2)-subset2_probability*math.log(subset2_probability, 2) - gainratio = threshold_gain / threshold_splitinfo - subset_gainratios.append(gainratio) - - elif algorithm == 'CART': - decision_for_subset1 = subset1['Decision'].value_counts().tolist() - decision_for_subset2 = subset2['Decision'].value_counts().tolist() - - gini_subset1 = 1; gini_subset2 = 1 - - for j in range(0, len(decision_for_subset1)): - gini_subset1 = gini_subset1 - math.pow((decision_for_subset1[j]/subset1_rows),2) - - for j in range(0, len(decision_for_subset2)): - gini_subset2 = gini_subset2 - math.pow((decision_for_subset2[j]/subset2_rows),2) - - gini = (subset1_rows/total_instances)*gini_subset1 + (subset2_rows/total_instances) * gini_subset2 - - subset_ginis.append(gini) - - elif algorithm == "CHAID": - #subset1 = high, subset2 = normal - - unique_decisions = df['Decision'].unique() #Yes, No - num_of_decisions = len(unique_decisions) #2 - - subset1_expected = subset1.shape[0] / num_of_decisions - subset2_expected = subset2.shape[0] / num_of_decisions - - chi_square = 0 - for d in unique_decisions: #Yes, No - - #decision = Yes - subset1_d = subset1[subset1["Decision"] == d] #high, yes - subset2_d = subset2[subset2["Decision"] == d] #normal, yes - - subset1_d_chi_square = math.sqrt(((subset1_d.shape[0] - subset1_expected) * (subset1_d.shape[0] - subset1_expected))/subset1_expected) - - subset2_d_chi_square = math.sqrt(((subset2_d.shape[0] - subset2_expected) * (subset2_d.shape[0] - subset2_expected))/subset2_expected) - - chi_square = chi_square + subset1_d_chi_square + subset2_d_chi_square - - subset_chi_squares.append(chi_square) - - #---------------------------------- - elif algorithm == 'Regression': - superset_stdev = df['Decision'].std(ddof=0) - subset1_stdev = subset1['Decision'].std(ddof=0) - subset2_stdev = subset2['Decision'].std(ddof=0) - - threshold_weighted_stdev = (subset1_rows/total_instances)*subset1_stdev + (subset2_rows/total_instances)*subset2_stdev - threshold_reducted_stdev = superset_stdev - threshold_weighted_stdev - subset_red_stdevs.append(threshold_reducted_stdev) - - #---------------------------------- - - if algorithm == "C4.5": - winner_one = subset_gainratios.index(max(subset_gainratios)) - elif algorithm == "ID3": #actually, ID3 does not support for 
continuous features but we can still do it
-        winner_one = subset_gains.index(max(subset_gains))
-    elif algorithm == "CART":
-        winner_one = subset_ginis.index(min(subset_ginis))
-    elif algorithm == "CHAID":
-        winner_one = subset_chi_squares.index(max(subset_chi_squares))
-    elif algorithm == "Regression":
-        winner_one = subset_red_stdevs.index(max(subset_red_stdevs))
-
-    winner_threshold = unique_values[winner_one]
-    #print(column_name,": ", winner_threshold," in ", unique_values)
-
-    #print("theshold is ",winner_threshold," for ",column_name)
-    df[column_name] = np.where(df[column_name] <= winner_threshold, "<="+str(winner_threshold), ">"+str(winner_threshold))
-
-    return df
+    # if True:
+    if df[column_name].nunique() <= 20:
+        unique_values = sorted(df[column_name].unique())
+    else:
+        unique_values = []
+
+        df_mean = df[column_name].mean()
+        df_std = df[column_name].std(ddof=0)
+        df_min = df[column_name].min()
+        df_max = df[column_name].max()
+
+        unique_values.append(df[column_name].min())
+        unique_values.append(df[column_name].max())
+        unique_values.append(df[column_name].mean())
+
+        scales = list(range(-3, +4, 1))
+        for scale in scales:
+            if df_mean + scale * df_std > df_min and df_mean + scale * df_std < df_max:
+                unique_values.append(df_mean + scale * df_std)
+
+    unique_values.sort()
+
+    logger.debug(f"{column_name} -> {unique_values}")
+
+    subset_gainratios = []
+    subset_gains = []
+    subset_ginis = []
+    subset_red_stdevs = []
+    subset_chi_squares = []
+
+    if len(unique_values) == 1:
+        winner_threshold = unique_values[0]
+        df[column_name] = np.where(
+            df[column_name] <= winner_threshold,
+            "<=" + str(winner_threshold),
+            ">" + str(winner_threshold),
+        )
+        return df
+
+    for i in range(0, len(unique_values) - 1):
+        threshold = unique_values[i]
+
+        subset1 = df[df[column_name] <= threshold]
+        subset2 = df[df[column_name] > threshold]
+
+        subset1_rows = subset1.shape[0]
+        subset2_rows = subset2.shape[0]
+        total_instances = df.shape[0]  # subset1_rows+subset2_rows
+
+        subset1_probability = subset1_rows / total_instances
+        subset2_probability = subset2_rows / total_instances
+
+        if algorithm in ["ID3", "C4.5"]:
+            threshold_gain = (
+                entropy
+                - subset1_probability * Training.calculateEntropy(subset1, config)
+                - subset2_probability * Training.calculateEntropy(subset2, config)
+            )
+            subset_gains.append(threshold_gain)
+
+        # C4.5 also needs gain in the block above.
+        # That's why, instead of else if we used direct if condition here
+        if algorithm == "C4.5":
+            threshold_splitinfo = -subset1_probability * math.log(
+                subset1_probability, 2
+            ) - subset2_probability * math.log(subset2_probability, 2)
+            gainratio = threshold_gain / threshold_splitinfo
+            subset_gainratios.append(gainratio)
+
+        elif algorithm == "CART":
+            decision_for_subset1 = subset1["Decision"].value_counts().tolist()
+            decision_for_subset2 = subset2["Decision"].value_counts().tolist()
+
+            gini_subset1 = 1
+            gini_subset2 = 1
+
+            for current_decision_for_subset1 in decision_for_subset1:
+                gini_subset1 = gini_subset1 - math.pow(
+                    (current_decision_for_subset1 / subset1_rows), 2
+                )
+
+            for current_decision_for_subset2 in decision_for_subset2:
+                gini_subset2 = gini_subset2 - math.pow(
+                    (current_decision_for_subset2 / subset2_rows), 2
+                )
+
+            gini = (subset1_rows / total_instances) * gini_subset1 + (
+                subset2_rows / total_instances
+            ) * gini_subset2
+
+            subset_ginis.append(gini)
+
+        elif algorithm == "CHAID":
+            # subset1 = high, subset2 = normal
+
+            unique_decisions = df["Decision"].unique()  # Yes, No
+            num_of_decisions = len(unique_decisions)  # 2
+
+            subset1_expected = subset1.shape[0] / num_of_decisions
+            subset2_expected = subset2.shape[0] / num_of_decisions
+
+            chi_square = 0
+            for d in unique_decisions:  # Yes, No
+                # decision = Yes
+                subset1_d = subset1[subset1["Decision"] == d]  # high, yes
+                subset2_d = subset2[subset2["Decision"] == d]  # normal, yes
+
+                subset1_d_chi_square = math.sqrt(
+                    (
+                        (subset1_d.shape[0] - subset1_expected)
+                        * (subset1_d.shape[0] - subset1_expected)
+                    )
+                    / subset1_expected
+                )
+
+                subset2_d_chi_square = math.sqrt(
+                    (
+                        (subset2_d.shape[0] - subset2_expected)
+                        * (subset2_d.shape[0] - subset2_expected)
+                    )
+                    / subset2_expected
+                )
+
+                chi_square = chi_square + subset1_d_chi_square + subset2_d_chi_square
+
+            subset_chi_squares.append(chi_square)
+
+        # ----------------------------------
+        elif algorithm == "Regression":
+            superset_stdev = df["Decision"].std(ddof=0)
+            subset1_stdev = subset1["Decision"].std(ddof=0)
+            subset2_stdev = subset2["Decision"].std(ddof=0)
+
+            threshold_weighted_stdev = (subset1_rows / total_instances) * subset1_stdev + (
+                subset2_rows / total_instances
+            ) * subset2_stdev
+            threshold_reducted_stdev = superset_stdev - threshold_weighted_stdev
+            subset_red_stdevs.append(threshold_reducted_stdev)
+
+    # ----------------------------------
+
+    if algorithm == "C4.5":
+        winner_one = subset_gainratios.index(max(subset_gainratios))
+    elif (
+        algorithm == "ID3"
+    ):  # actually, ID3 does not support continuous features but we can still do it
+        winner_one = subset_gains.index(max(subset_gains))
+    elif algorithm == "CART":
+        winner_one = subset_ginis.index(min(subset_ginis))
+    elif algorithm == "CHAID":
+        winner_one = subset_chi_squares.index(max(subset_chi_squares))
+    elif algorithm == "Regression":
+        winner_one = subset_red_stdevs.index(max(subset_red_stdevs))
+
+    winner_threshold = unique_values[winner_one]
+    logger.debug(f"{column_name}: {winner_threshold} in {unique_values}")
+
+    logger.debug(f"threshold is {winner_threshold} for {column_name}")
+    df[column_name] = np.where(
+        df[column_name] <= winner_threshold,
+        "<=" + str(winner_threshold),
+        ">" + str(winner_threshold),
+    )
+
+    return df
diff --git a/chefboost/training/Training.py b/chefboost/training/Training.py
index 7b151d5..c1b34c9 100644
--- a/chefboost/training/Training.py
+++ b/chefboost/training/Training.py
@@ -1,729 +1,808 @@
 import math
-import imp
+import imp  # 
pylint:disable=deprecated-module import uuid import json -import numpy as np import copy -import os import multiprocessing import multiprocessing.pool from contextlib import closing -import pandas as pd -import psutil import gc -import sys -import tqdm - -from chefboost.training import Preprocess -from chefboost.commons import functions, evaluate - -#---------------------------------------- - -global decision_rules - -class NoDaemonProcess(multiprocessing.Process): - # make 'daemon' attribute always return False - def _get_daemon(self): - return False - def _set_daemon(self, value): - pass - daemon = property(_get_daemon, _set_daemon) - -class NoDaemonContext(type(multiprocessing.get_context())): - Process = NoDaemonProcess - -class MyPool(multiprocessing.pool.Pool): - - def __init__(self, *args, **kwargs): - kwargs['context'] = NoDaemonContext() - super(MyPool, self).__init__(*args, **kwargs) - -#---------------------------------------- -def calculateEntropy(df, config): - - algorithm = config['algorithm'] - - #-------------------------- - - if algorithm == 'Regression': - return 0 - - #print(df) - - instances = df.shape[0]; columns = df.shape[1] - #print(instances," rows, ",columns," columns") - - decisions = df['Decision'].value_counts().keys().tolist() - - entropy = 0 - - for i in range(0, len(decisions)): - decision = decisions[i] - num_of_decisions = df['Decision'].value_counts().tolist()[i] - #print(decision,"->",num_of_decisions) - - class_probability = num_of_decisions/instances - - entropy = entropy - class_probability*math.log(class_probability, 2) - - return entropy - -def findDecision(df, config): - #information gain for id3, gain ratio for c4.5, gini for cart, chi square for chaid and std for regression - algorithm = config['algorithm'] - - resp_obj = findGains(df, config) - gains = list(resp_obj["gains"].values()) - entropy = resp_obj["entropy"] - - if algorithm == "ID3": - winner_index = gains.index(max(gains)) - metric_value = entropy - metric_name = "Entropy" - elif algorithm == "C4.5": - winner_index = gains.index(max(gains)) - metric_value = entropy - metric_name = "Entropy" - elif algorithm == "CART": - winner_index = gains.index(min(gains)) - metric_value = min(gains) - metric_name = "Gini" - elif algorithm == "CHAID": - winner_index = gains.index(max(gains)) - metric_value = max(gains) - metric_name = "ChiSquared" - elif algorithm == "Regression": - winner_index = gains.index(max(gains)) - metric_value = max(gains) - metric_name = "Std" - - winner_name = df.columns[winner_index] - - return winner_name, df.shape[0], metric_value, metric_name - -def findGains(df, config): - - algorithm = config['algorithm'] - decision_classes = df["Decision"].unique() - - #----------------------------- - - entropy = 0 - - if algorithm == "ID3" or algorithm == "C4.5": - entropy = calculateEntropy(df, config) - - columns = df.shape[1]; instances = df.shape[0] - - gains = [] - - for i in range(0, columns-1): - column_name = df.columns[i] - column_type = df[column_name].dtypes - - #print(column_name,"->",column_type) - - if column_type != 'object': - df = Preprocess.processContinuousFeatures(algorithm, df, column_name, entropy, config) - - classes = df[column_name].value_counts() - - splitinfo = 0 - if algorithm == 'ID3' or algorithm == 'C4.5': - gain = entropy * 1 - else: - gain = 0 - - for j in range(0, len(classes)): - current_class = classes.keys().tolist()[j] - #print(column_name,"->",current_class) - - subdataset = df[df[column_name] == current_class] - #print(subdataset) - - 
subset_instances = subdataset.shape[0] - class_probability = subset_instances/instances - - if algorithm == 'ID3' or algorithm == 'C4.5': - subset_entropy = calculateEntropy(subdataset, config) - gain = gain - class_probability * subset_entropy - - if algorithm == 'C4.5': - splitinfo = splitinfo - class_probability*math.log(class_probability, 2) - - elif algorithm == 'CART': #GINI index - decision_list = subdataset['Decision'].value_counts().tolist() - - subgini = 1 - - for k in range(0, len(decision_list)): - subgini = subgini - math.pow((decision_list[k]/subset_instances), 2) - - gain = gain + (subset_instances / instances) * subgini - - elif algorithm == 'CHAID': - num_of_decisions = len(decision_classes) - - expected = subset_instances / num_of_decisions - - for d in decision_classes: - num_of_d = subdataset[subdataset["Decision"] == d].shape[0] - - chi_square_of_d = math.sqrt(((num_of_d - expected) * (num_of_d - expected)) / expected) - - gain += chi_square_of_d - - elif algorithm == 'Regression': - subset_stdev = subdataset['Decision'].std(ddof=0) - gain = gain + (subset_instances/instances)*subset_stdev - - #iterating over classes for loop end - #------------------------------- - - if algorithm == 'Regression': - stdev = df['Decision'].std(ddof=0) - gain = stdev - gain - if algorithm == 'C4.5': - if splitinfo == 0: - splitinfo = 100 #this can be if data set consists of 2 rows and current column consists of 1 class. still decision can be made (decisions for these 2 rows same). set splitinfo to very large value to make gain ratio very small. in this way, we won't find this column as the most dominant one. - gain = gain / splitinfo - - #---------------------------------- - - gains.append(gain) - - #------------------------------------------------- +import psutil - resp_obj = {} - resp_obj["gains"] = {} +import numpy as np +import pandas as pd - for idx, feature in enumerate(df.columns[0:-1]): #Decision is always the last column - #print(idx, feature) - resp_obj["gains"][feature] = gains[idx] +from chefboost.training import Preprocess +from chefboost.commons import functions +from chefboost.commons.logger import Logger - resp_obj["entropy"] = entropy +# pylint: disable=too-many-function-args, unused-argument - return resp_obj +logger = Logger(module="chefboost/training/Training.py") -def createBranchWrapper(func, args): - return func(*args) - -def createBranch(config, current_class, subdataset, numericColumn, branch_index, winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric, tree_id = 0, main_process_id = None): - - custom_rules = [] - - algorithm = config['algorithm'] - enableAdaboost = config['enableAdaboost'] - enableGBM = config['enableGBM'] - max_depth = config['max_depth'] - enableParallelism = config['enableParallelism'] - - charForResp = "'" - if algorithm == 'Regression': - charForResp = "" - - #--------------------------- - - tmp_root = root * 1 - parents_raw = copy.copy(parents) - - #--------------------------- - - if numericColumn == True: - compareTo = current_class #current class might be <=x or >x in this case - else: - compareTo = " == '"+str(current_class)+"'" - - terminateBuilding = False - - #----------------------------------------------- - #can decision be made? 
- - if enableGBM == True and root >= max_depth: #max depth - final_decision = subdataset['Decision'].mean() - terminateBuilding = True - elif enableAdaboost == True: - #final_decision = subdataset['Decision'].value_counts().idxmax() - final_decision = functions.sign(subdataset['Decision'].mean()) #get average - terminateBuilding = True - enableParallelism = False - elif len(subdataset['Decision'].value_counts().tolist()) == 1: - final_decision = subdataset['Decision'].value_counts().keys().tolist()[0] #all items are equal in this case - terminateBuilding = True - elif subdataset.shape[1] == 1: #if decision cannot be made even though all columns dropped - final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one - terminateBuilding = True - elif algorithm == 'Regression' and (subdataset.shape[0] < 5 or root >= max_depth): #pruning condition - #elif algorithm == 'Regression' and subdataset['Decision'].std(ddof=0)/global_stdev < 0.4: #pruning condition - final_decision = subdataset['Decision'].mean() #get average - terminateBuilding = True - elif algorithm in ['ID3', 'C4.5', 'CART', 'CHAID'] and root >= max_depth: - final_decision = subdataset['Decision'].value_counts().idxmax() #get the most frequent one - terminateBuilding = True +# ---------------------------------------- - #----------------------------------------------- +global decision_rules # pylint: disable=global-at-module-level - if enableParallelism == True: - check_condition = "if" #TODO: elif checks might be above than if statements in parallel - else: - if branch_index == 0: - check_condition = "if" - else: - check_condition = "elif" - check_rule = check_condition+" obj["+str(winner_index)+"]"+compareTo+":" +class NoDaemonProcess(multiprocessing.Process): + # make 'daemon' attribute always return False + def _get_daemon(self): + return False - leaf_id = str(uuid.uuid1()) + def _set_daemon(self, value): + pass - if enableParallelism != True: - functions.storeRule(file,(functions.formatRule(root),"",check_rule)) - else: - sample_rule = {} - sample_rule["current_level"] = root - sample_rule["leaf_id"] = leaf_id - sample_rule["parents"] = parents - sample_rule["rule"] = check_rule - sample_rule["feature_idx"] = winner_index - sample_rule["feature_name"] = winner_name - sample_rule["instances"] = num_of_instances - sample_rule["metric"] = metric - sample_rule["return_statement"] = 0 - sample_rule["tree_id"] = tree_id + daemon = property(_get_daemon, _set_daemon) - #json to string - sample_rule = json.dumps(sample_rule) - custom_rules.append(sample_rule) +class NoDaemonContext(type(multiprocessing.get_context())): + # pylint: disable=too-few-public-methods + Process = NoDaemonProcess - #----------------------------------------------- - if terminateBuilding == True: #check decision is made +class MyPool(multiprocessing.pool.Pool): + # pylint: disable=too-few-public-methods, abstract-method, super-with-arguments + def __init__(self, *args, **kwargs): + kwargs["context"] = NoDaemonContext() + super(MyPool, self).__init__(*args, **kwargs) - parents = copy.copy(leaf_id) - leaf_id = str(uuid.uuid1()) - decision_rule = "return "+charForResp+str(final_decision)+charForResp +# ---------------------------------------- +def calculateEntropy(df, config): + algorithm = config["algorithm"] - if enableParallelism != True: - #serial - functions.storeRule(file,(functions.formatRule(root+1),decision_rule)) - else: - #parallel - sample_rule = {} - sample_rule["current_level"] = root+1 - sample_rule["leaf_id"] = leaf_id - 
sample_rule["parents"] = parents - sample_rule["rule"] = decision_rule - sample_rule["feature_idx"] = winner_index - sample_rule["feature_name"] = winner_name - sample_rule["instances"] = num_of_instances - sample_rule["metric"] = 0 - sample_rule["return_statement"] = 1 - sample_rule["tree_id"] = tree_id + # -------------------------- - #json to string - sample_rule = json.dumps(sample_rule) + if algorithm == "Regression": + return 0 - custom_rules.append(sample_rule) + logger.debug(df) - else: #decision is not made, continue to create branch and leafs - root = root + 1 #the following rule will be included by this rule. increase root - parents = copy.copy(leaf_id) - - results = buildDecisionTree(subdataset, root, file, config, dataset_features - , root-1, leaf_id, parents, tree_id = tree_id, main_process_id = main_process_id) + instances = df.shape[0] - custom_rules = custom_rules + results - - root = tmp_root * 1 - parents = copy.copy(parents_raw) + decisions = df["Decision"].value_counts().keys().tolist() - gc.collect() - - return custom_rules - -def buildDecisionTree(df, root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', tree_id = 0, validation_df = None, main_process_id = None): + entropy = 0 - models = [] + for i, decision in enumerate(decisions): + num_of_decisions = df["Decision"].value_counts().tolist()[i] + logger.debug(f"{decision} -> {num_of_decisions}") - decision_rules = [] + class_probability = num_of_decisions / instances - feature_names = df.columns[0:-1] + entropy = entropy - class_probability * math.log(class_probability, 2) - enableParallelism = config['enableParallelism'] - algorithm = config['algorithm'] + return entropy - json_file = file.split(".")[0]+".json" - random_forest_enabled = config['enableRandomForest'] - enableGBM = config['enableGBM'] - enableAdaboost = config['enableAdaboost'] +def findDecision(df, config): + # information gain for id3, gain ratio for c4.5, gini for cart, + # chi square for chaid and std for regression + algorithm = config["algorithm"] + + resp_obj = findGains(df, config) + gains = list(resp_obj["gains"].values()) + entropy = resp_obj["entropy"] + + if algorithm == "ID3": + winner_index = gains.index(max(gains)) + metric_value = entropy + metric_name = "Entropy" + elif algorithm == "C4.5": + winner_index = gains.index(max(gains)) + metric_value = entropy + metric_name = "Entropy" + elif algorithm == "CART": + winner_index = gains.index(min(gains)) + metric_value = min(gains) + metric_name = "Gini" + elif algorithm == "CHAID": + winner_index = gains.index(max(gains)) + metric_value = max(gains) + metric_name = "ChiSquared" + elif algorithm == "Regression": + winner_index = gains.index(max(gains)) + metric_value = max(gains) + metric_name = "Std" + + winner_name = df.columns[winner_index] + + return winner_name, df.shape[0], metric_value, metric_name - if root == 1: - if random_forest_enabled != True and enableGBM != True and enableAdaboost != True: - raw_df = df.copy() - #-------------------------------------- +def findGains(df, config): + algorithm = config["algorithm"] + decision_classes = df["Decision"].unique() - df_copy = df.copy() + # ----------------------------- - winner_name, num_of_instances, metric, metric_name = findDecision(df, config) + entropy = 0 - #find winner index, this cannot be returned by find decision because columns dropped in previous steps - j = 0 - for i in dataset_features: - if i == winner_name: - winner_index = j - j = j + 1 + if algorithm in ["ID3", "C4.5"]: + entropy = 
calculateEntropy(df, config) - numericColumn = False - if dataset_features[winner_name] != 'object': - numericColumn = True + columns = df.shape[1] + instances = df.shape[0] - #restoration - columns = df.shape[1] - for i in range(0, columns-1): - #column_name = df.columns[i]; column_type = df[column_name].dtypes #numeric field already transformed to object. you cannot check it with df itself, you should check df_copy - column_name = df_copy.columns[i]; column_type = df_copy[column_name].dtypes - if column_type != 'object' and column_name != winner_name: - df[column_name] = df_copy[column_name] + gains = [] - classes = df[winner_name].value_counts().keys().tolist() - #print("classes: ",classes," in ", winner_name) - #----------------------------------------------------- + for i in range(0, columns - 1): + column_name = df.columns[i] + column_type = df[column_name].dtypes - num_cores = config["num_cores"] + logger.debug(f"{column_name} -> {column_type}") - input_params = [] + if column_type != "object": + df = Preprocess.processContinuousFeatures(algorithm, df, column_name, entropy, config) - #serial approach - for i in range(0,len(classes)): - current_class = classes[i] - subdataset = df[df[winner_name] == current_class] - subdataset = subdataset.drop(columns=[winner_name]) - branch_index = i * 1 + classes = df[column_name].value_counts() - #create branches serially - if enableParallelism != True: + splitinfo = 0 + if algorithm in ["ID3", "C4.5"]: + gain = entropy * 1 + else: + gain = 0 - if i == 0: + for j in range(0, len(classes)): + current_class = classes.keys().tolist()[j] + logger.debug(f"{column_name} -> {current_class}") - descriptor = { - "feature": winner_name, - "instances": num_of_instances, - #"metric_name": metric_name, - "metric_value": round(metric, 4), - "depth": parent_level + 1 - } - descriptor = "# "+json.dumps(descriptor) + subdataset = df[df[column_name] == current_class] + logger.debug(subdataset) - functions.storeRule(file, (functions.formatRule(root), "", descriptor)) + subset_instances = subdataset.shape[0] + class_probability = subset_instances / instances - results = createBranch(config, current_class, subdataset, numericColumn, branch_index - , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric, tree_id = tree_id, main_process_id = main_process_id) + if algorithm in ["ID3", "C4.5"]: + subset_entropy = calculateEntropy(subdataset, config) + gain = gain - class_probability * subset_entropy - decision_rules = decision_rules + results + if algorithm == "C4.5": + splitinfo = splitinfo - class_probability * math.log(class_probability, 2) - else: - input_params.append((config, current_class, subdataset, numericColumn, branch_index - , winner_name, winner_index, root, parents, file, dataset_features, num_of_instances, metric, tree_id, main_process_id)) + elif algorithm == "CART": # GINI index + decision_list = subdataset["Decision"].value_counts().tolist() - #--------------------------- - #add else condition in the decision tree + subgini = 1 - if df.Decision.dtypes == 'object': #classification - pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index() - pivot = pivot.rename(columns = {"Decision": "Instances","index": "Decision"}) - pivot = pivot.sort_values(by = ["Instances"], ascending = False).reset_index() + for current_decision in decision_list: + subgini = subgini - math.pow((current_decision / subset_instances), 2) - else_decision = "return '%s'" % (pivot.iloc[0].Decision) + gain = gain + (subset_instances / 
instances) * subgini - if enableParallelism != True: - functions.storeRule(file,(functions.formatRule(root), "else:")) - functions.storeRule(file,(functions.formatRule(root+1), else_decision)) - else: #parallelism - leaf_id = str(uuid.uuid1()) + elif algorithm == "CHAID": + num_of_decisions = len(decision_classes) - check_rule = "else: "+else_decision + expected = subset_instances / num_of_decisions - sample_rule = {} - sample_rule["current_level"] = root - sample_rule["leaf_id"] = leaf_id - sample_rule["parents"] = parents - sample_rule["rule"] = check_rule - sample_rule["feature_idx"] = -1 - sample_rule["feature_name"] = "" - sample_rule["instances"] = df.shape[0] - sample_rule["metric"] = 0 - sample_rule["return_statement"] = 0 - sample_rule["tree_id"] = tree_id + for d in decision_classes: + num_of_d = subdataset[subdataset["Decision"] == d].shape[0] - #json to string - sample_rule = json.dumps(sample_rule) - decision_rules.append(sample_rule) + chi_square_of_d = math.sqrt( + ((num_of_d - expected) * (num_of_d - expected)) / expected + ) - else: #regression - else_decision = "return %s" % (subdataset.Decision.mean()) + gain += chi_square_of_d - if enableParallelism != True: - functions.storeRule(file,(functions.formatRule(root), "else:")) - functions.storeRule(file,(functions.formatRule(root+1), else_decision)) - else: - leaf_id = str(uuid.uuid1()) + elif algorithm == "Regression": + subset_stdev = subdataset["Decision"].std(ddof=0) + gain = gain + (subset_instances / instances) * subset_stdev - check_rule = "else: "+else_decision + # iterating over classes for loop end + # ------------------------------- - sample_rule = {} - sample_rule["current_level"] = root - sample_rule["leaf_id"] = leaf_id - sample_rule["parents"] = parents - sample_rule["rule"] = check_rule - sample_rule["tree_id"] = tree_id - sample_rule["feature_name"] = "" - sample_rule["instances"] = 0 - sample_rule["metric"] = 0 - sample_rule["return_statement"] = 1 + if algorithm == "Regression": + stdev = df["Decision"].std(ddof=0) + gain = stdev - gain + if algorithm == "C4.5": + if splitinfo == 0: + splitinfo = 100 + # this can be if data set consists of 2 rows and current column consists + # of 1 class. still decision can be made (decisions for these 2 rows same). + # set splitinfo to very large value to make gain ratio very small. + # in this way, we won't find this column as the most dominant one. 
+ gain = gain / splitinfo - #json to string - sample_rule = json.dumps(sample_rule) - decision_rules.append(sample_rule) + # ---------------------------------- - #--------------------------- + gains.append(gain) - try: - main_process = psutil.Process(main_process_id) - children = main_process.children(recursive=True) - active_processes = len(children) + 1 #plus parent - #active_processes = len(children) - except: - active_processes = 100 #set a large initial value + # ------------------------------------------------- - results = [] - #create branches in parallel - if enableParallelism == True: + resp_obj = {} + resp_obj["gains"] = {} - required_threads = active_processes + len(classes) + for idx, feature in enumerate(df.columns[0:-1]): # Decision is always the last column + logger.debug(f"{idx}, {feature}") + resp_obj["gains"][feature] = gains[idx] - #if parent_level == 0 and random_forest_enabled != True: - if main_process_id != None and num_cores >= required_threads: #len(classes) branches will be run in parallel + resp_obj["entropy"] = entropy - #POOL_SIZE = num_cores - POOL_SIZE = len(classes) + return resp_obj - #with closing(multiprocessing.Pool(POOL_SIZE)) as pool: - with closing(MyPool(POOL_SIZE)) as pool: - funclist = [] - for input_param in input_params: - f = pool.apply_async(createBranchWrapper, [createBranch, input_param]) - funclist.append(f) +def createBranchWrapper(func, args): + return func(*args) + + +def createBranch( + config, + current_class, + subdataset, + numericColumn, + branch_index, + winner_name, + winner_index, + root, + parents, + file, + dataset_features, + num_of_instances, + metric, + tree_id=0, + main_process_id=None, +): + custom_rules = [] + + algorithm = config["algorithm"] + enableAdaboost = config["enableAdaboost"] + enableGBM = config["enableGBM"] + max_depth = config["max_depth"] + enableParallelism = config["enableParallelism"] + + charForResp = "'" + if algorithm == "Regression": + charForResp = "" + + # --------------------------- + + tmp_root = root * 1 + parents_raw = copy.copy(parents) + + # --------------------------- + + if numericColumn == True: + compareTo = current_class # current class might be <=x or >x in this case + else: + compareTo = " == '" + str(current_class) + "'" + + terminateBuilding = False + + # ----------------------------------------------- + # can decision be made? 
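# As a rough standalone restatement of the termination checks that follow (the plain
# decision tree case, ignoring the GBM and Adaboost branches), with a hypothetical
# helper name:

def is_leaf(subdataset, root, max_depth, algorithm):
    if len(subdataset["Decision"].value_counts()) == 1:
        return True  # pure node: every remaining row carries the same decision
    if subdataset.shape[1] == 1:
        return True  # all feature columns consumed; fall back to the most frequent class
    if algorithm == "Regression" and subdataset.shape[0] < 5:
        return True  # too few instances left to split meaningfully
    return root >= max_depth  # depth-based pruning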
+ + if enableGBM == True and root >= max_depth: # max depth + final_decision = subdataset["Decision"].mean() + terminateBuilding = True + elif enableAdaboost == True: + # final_decision = subdataset['Decision'].value_counts().idxmax() + final_decision = functions.sign(subdataset["Decision"].mean()) # get average + terminateBuilding = True + enableParallelism = False + elif len(subdataset["Decision"].value_counts().tolist()) == 1: + final_decision = ( + subdataset["Decision"].value_counts().keys().tolist()[0] + ) # all items are equal in this case + terminateBuilding = True + elif subdataset.shape[1] == 1: # if decision cannot be made even though all columns dropped + final_decision = subdataset["Decision"].value_counts().idxmax() # get the most frequent one + terminateBuilding = True + elif algorithm == "Regression" and ( + subdataset.shape[0] < 5 or root >= max_depth + ): # pruning condition + final_decision = subdataset["Decision"].mean() # get average + terminateBuilding = True + elif algorithm in ["ID3", "C4.5", "CART", "CHAID"] and root >= max_depth: + final_decision = subdataset["Decision"].value_counts().idxmax() # get the most frequent one + terminateBuilding = True + + # ----------------------------------------------- + + if enableParallelism == True: + check_condition = "if" # TODO: elif checks might be above than if statements in parallel + else: + if branch_index == 0: + check_condition = "if" + else: + check_condition = "elif" + + check_rule = check_condition + " obj[" + str(winner_index) + "]" + compareTo + ":" + + leaf_id = str(uuid.uuid1()) + + if enableParallelism != True: + functions.storeRule(file, (functions.formatRule(root), "", check_rule)) + else: + sample_rule = {} + sample_rule["current_level"] = root + sample_rule["leaf_id"] = leaf_id + sample_rule["parents"] = parents + sample_rule["rule"] = check_rule + sample_rule["feature_idx"] = winner_index + sample_rule["feature_name"] = winner_name + sample_rule["instances"] = num_of_instances + sample_rule["metric"] = metric + sample_rule["return_statement"] = 0 + sample_rule["tree_id"] = tree_id + + # json to string + sample_rule = json.dumps(sample_rule) + + custom_rules.append(sample_rule) + + # ----------------------------------------------- + + if terminateBuilding == True: # check decision is made + parents = copy.copy(leaf_id) + leaf_id = str(uuid.uuid1()) + + decision_rule = "return " + charForResp + str(final_decision) + charForResp + + if enableParallelism != True: + # serial + functions.storeRule(file, (functions.formatRule(root + 1), decision_rule)) + else: + # parallel + sample_rule = {} + sample_rule["current_level"] = root + 1 + sample_rule["leaf_id"] = leaf_id + sample_rule["parents"] = parents + sample_rule["rule"] = decision_rule + sample_rule["feature_idx"] = winner_index + sample_rule["feature_name"] = winner_name + sample_rule["instances"] = num_of_instances + sample_rule["metric"] = 0 + sample_rule["return_statement"] = 1 + sample_rule["tree_id"] = tree_id + + # json to string + sample_rule = json.dumps(sample_rule) + + custom_rules.append(sample_rule) + + else: # decision is not made, continue to create branch and leafs + root = root + 1 # the following rule will be included by this rule. 
increase root + parents = copy.copy(leaf_id) + + results = buildDecisionTree( + subdataset, + root, + file, + config, + dataset_features, + root - 1, + leaf_id, + parents, + tree_id=tree_id, + main_process_id=main_process_id, + ) + + custom_rules = custom_rules + results + + root = tmp_root * 1 + parents = copy.copy(parents_raw) + + gc.collect() + + return custom_rules + + +def buildDecisionTree( + df, + root, + file, + config, + dataset_features, + parent_level=0, + leaf_id=0, + parents="root", + tree_id=0, + validation_df=None, + main_process_id=None, +): + models = [] + + decision_rules = [] + + feature_names = df.columns[0:-1] + + enableParallelism = config["enableParallelism"] + + json_file = file.split(".")[0] + ".json" + + # -------------------------------------- + + df_copy = df.copy() + + winner_name, num_of_instances, metric, _ = findDecision(df, config) + + # find winner index, this cannot be returned by find decision + # because columns dropped in previous steps + j = 0 + for i in dataset_features: + if i == winner_name: + winner_index = j + j = j + 1 + + numericColumn = False + if dataset_features[winner_name] != "object": + numericColumn = True + + # restoration + columns = df.shape[1] + for i in range(0, columns - 1): + column_name = df_copy.columns[i] + column_type = df_copy[column_name].dtypes + if column_type != "object" and column_name != winner_name: + df[column_name] = df_copy[column_name] + + classes = df[winner_name].value_counts().keys().tolist() + logger.debug(f"classes: {classes} in {winner_name}") + # ----------------------------------------------------- + + num_cores = config["num_cores"] + + input_params = [] + + # serial approach + for i, current_class in enumerate(classes): + subdataset = df[df[winner_name] == current_class] + subdataset = subdataset.drop(columns=[winner_name]) + branch_index = i * 1 + + # create branches serially + if enableParallelism != True: + if i == 0: + descriptor = { + "feature": winner_name, + "instances": num_of_instances, + # "metric_name": metric_name, + "metric_value": round(metric, 4), + "depth": parent_level + 1, + } + descriptor = "# " + json.dumps(descriptor) + + functions.storeRule(file, (functions.formatRule(root), "", descriptor)) + + results = createBranch( + config, + current_class, + subdataset, + numericColumn, + branch_index, + winner_name, + winner_index, + root, + parents, + file, + dataset_features, + num_of_instances, + metric, + tree_id=tree_id, + main_process_id=main_process_id, + ) + + decision_rules = decision_rules + results + + else: + input_params.append( + ( + config, + current_class, + subdataset, + numericColumn, + branch_index, + winner_name, + winner_index, + root, + parents, + file, + dataset_features, + num_of_instances, + metric, + tree_id, + main_process_id, + ) + ) + + # --------------------------- + # add else condition in the decision tree + + if df.Decision.dtypes == "object": # classification + pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index() + pivot = pivot.rename(columns={"Decision": "Instances", "index": "Decision"}) + pivot = pivot.sort_values(by=["Instances"], ascending=False).reset_index() + + else_decision = f"return '{pivot.iloc[0].Decision}'" + + if enableParallelism != True: + functions.storeRule(file, (functions.formatRule(root), "else:")) + functions.storeRule(file, (functions.formatRule(root + 1), else_decision)) + else: # parallelism + leaf_id = str(uuid.uuid1()) + + check_rule = "else: " + else_decision + + sample_rule = {} + sample_rule["current_level"] = 
root + sample_rule["leaf_id"] = leaf_id + sample_rule["parents"] = parents + sample_rule["rule"] = check_rule + sample_rule["feature_idx"] = -1 + sample_rule["feature_name"] = "" + sample_rule["instances"] = df.shape[0] + sample_rule["metric"] = 0 + sample_rule["return_statement"] = 0 + sample_rule["tree_id"] = tree_id + + # json to string + sample_rule = json.dumps(sample_rule) + decision_rules.append(sample_rule) + + else: # regression + else_decision = f"return {subdataset.Decision.mean()}" + + if enableParallelism != True: + functions.storeRule(file, (functions.formatRule(root), "else:")) + functions.storeRule(file, (functions.formatRule(root + 1), else_decision)) + else: + leaf_id = str(uuid.uuid1()) + + check_rule = "else: " + else_decision + + sample_rule = {} + sample_rule["current_level"] = root + sample_rule["leaf_id"] = leaf_id + sample_rule["parents"] = parents + sample_rule["rule"] = check_rule + sample_rule["tree_id"] = tree_id + sample_rule["feature_name"] = "" + sample_rule["instances"] = 0 + sample_rule["metric"] = 0 + sample_rule["return_statement"] = 1 + + # json to string + sample_rule = json.dumps(sample_rule) + decision_rules.append(sample_rule) + + # --------------------------- + + try: + main_process = psutil.Process(main_process_id) + children = main_process.children(recursive=True) + active_processes = len(children) + 1 # plus parent + # active_processes = len(children) + except: + active_processes = 100 # set a large initial value + + results = [] + # create branches in parallel + if enableParallelism == True: + required_threads = active_processes + len(classes) - #all functions registered here + # if parent_level == 0 and random_forest_enabled != True: + if ( + main_process_id != None and num_cores >= required_threads + ): # len(classes) branches will be run in parallel + # POOL_SIZE = num_cores + POOL_SIZE = len(classes) + + # with closing(multiprocessing.Pool(POOL_SIZE)) as pool: + with closing(MyPool(POOL_SIZE)) as pool: + funclist = [] - for f in funclist: - branch_results = f.get(timeout = 100000) + for input_param in input_params: + f = pool.apply_async(createBranchWrapper, [createBranch, input_param]) + funclist.append(f) - for branch_result in branch_results: - results.append(branch_result) + # all functions registered here - pool.close() - pool.terminate() + for f in funclist: + branch_results = f.get(timeout=100000) - #-------------------------------- + for branch_result in branch_results: + results.append(branch_result) - else: #serial - for input_param in input_params: - sub_results = createBranchWrapper(createBranch, input_param) - for sub_result in sub_results: - results.append(sub_result) + pool.close() + pool.terminate() - #-------------------------------- + # -------------------------------- - decision_rules = decision_rules + results + else: # serial + for input_param in input_params: + sub_results = createBranchWrapper(createBranch, input_param) + for sub_result in sub_results: + results.append(sub_result) - #-------------------------------- + # -------------------------------- - if root != 1: #return children results until the root node - return decision_rules + decision_rules = decision_rules + results - #--------------------------------------------- + # -------------------------------- - if root == 1: + if root != 1: # return children results until the root node + return decision_rules - if enableParallelism == True: + # --------------------------------------------- - #custom rules are stored in decision_rules. 
merge them all in a json file first
+    if root == 1:
+        if enableParallelism == True:
+            # custom rules are stored in decision_rules. merge them all in a json file first
-        json_rules = "[\n" #initialize
+            json_rules = "[\n"  # initialize
-        file_index = 0
-        for custom_rule in decision_rules:
+            file_index = 0
+            for custom_rule in decision_rules:
+                json_rules += custom_rule
-            json_rules += custom_rule
+                if file_index < len(decision_rules) - 1:
+                    json_rules += ", "
-            if file_index < len(decision_rules) - 1:
-                json_rules += ", "
+                json_rules += "\n"
-            json_rules += "\n"
+                file_index = file_index + 1
-            file_index = file_index + 1
+            # -----------------------------------
-        #-----------------------------------
+            json_rules += "]"
+            functions.createFile(json_file, json_rules)
-        json_rules += "]"
-        functions.createFile(json_file, json_rules)
+            # -----------------------------------
+            # reconstruct rules from json to py
-        #-----------------------------------
-        #reconstruct rules from json to py
+            reconstructRules(json_file, feature_names)
-        reconstructRules(json_file, feature_names)
+            # -----------------------------------
-        #-----------------------------------
+        # is regular decision tree
+        if (
+            config["enableRandomForest"] != True
+            and config["enableGBM"] != True
+            and config["enableAdaboost"] != True
+        ):
+            # this is a regular decision tree. find accuracy here.
-        #is regular decision tree
-        if config['enableRandomForest'] != True and config['enableGBM'] != True and config['enableAdaboost'] != True:
-            #this is reguler decision tree. find accuracy here.
+            moduleName = "outputs/rules/rules"
+            fp, pathname, description = imp.find_module(moduleName)
+            myrules = imp.load_module(moduleName, fp, pathname, description)  # rules0
+            models.append(myrules)
-            moduleName = "outputs/rules/rules"
-            fp, pathname, description = imp.find_module(moduleName)
-            myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
-            models.append(myrules)
+        return models
-        return models
 
 def findPrediction(row):
-    params = []
-    num_of_features = row.shape[0] - 1
-    for j in range(0, num_of_features):
-        params.append(row[j])
-
-    moduleName = "outputs/rules/rules"
-    fp, pathname, description = imp.find_module(moduleName)
-    myrules = imp.load_module(moduleName, fp, pathname, description) #rules0
-
-    prediction = myrules.findDecision(params)
-    return prediction
-
-"""
-If you set parelellisim True, then branches will be created parallel. Rules are stored in a json file randomly. This program reconstructs built rules in a tree form. In this way, we can build decision trees faster.
-"""
-
-def reconstructRules(source, feature_names, tree_id = 0):
+    params = []
+    num_of_features = row.shape[0] - 1
+    for j in range(0, num_of_features):
+        params.append(row[j])
-    #print("Reconstructing ",source)
+    moduleName = "outputs/rules/rules"
+    fp, pathname, description = imp.find_module(moduleName)
+    myrules = imp.load_module(moduleName, fp, pathname, description)  # rules0
+
+    prediction = myrules.findDecision(params)
+    return prediction
-    file_name = source.split(".json")[0]
-    file_name = file_name+".py"
-    #-----------------------------------
+# If you set parallelism True, then branches will be created in parallel. Rules are stored in a
+# json file randomly. This program reconstructs built rules in a tree form.
+# In this way, we can build decision trees faster.
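To make the reconstruction step above concrete: the sample_rule records built earlier land in the JSON file in arbitrary order, and reconstructRules (rewritten below) walks them by parent pointer to regenerate a nested findDecision function. A minimal standalone sketch of that idea, with hypothetical records (field names mirror sample_rule, values are illustrative):

    flat_rules = [
        {"current_level": 1, "leaf_id": "a", "parents": "root", "rule": "if obj[0] == 'Sunny':"},
        {"current_level": 2, "leaf_id": "b", "parents": "a", "rule": "return 'Yes'"},
        {"current_level": 1, "leaf_id": "c", "parents": "root", "rule": "else: return 'No'"},
    ]

    def rebuild(parent="root", level=1):
        # emit every rule whose parent matches, then recurse into its children
        lines = []
        for r in flat_rules:
            if r["parents"] == parent:
                lines.append("\t" * level + r["rule"])
                lines.extend(rebuild(r["leaf_id"], level + 1))
        return lines

    print("\n".join(["def findDecision(obj):"] + rebuild()))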
- constructor = "def findDecision(obj): #" - idx = 0 - for feature in feature_names: - constructor = constructor + "obj["+str(idx)+"]: "+feature - if idx < len(feature_names) - 1: - constructor = constructor+", " - idx = idx + 1 +def reconstructRules(source, feature_names, tree_id=0): + logger.debug(f"Reconstructing {source}") - functions.createFile(file_name, constructor+"\n") + file_name = source.split(".json")[0] + file_name = file_name + ".py" - #----------------------------------- + # ----------------------------------- - with open(source, 'r') as f: - rules = json.load(f) + constructor = "def findDecision(obj): #" + idx = 0 + for feature in feature_names: + constructor = constructor + "obj[" + str(idx) + "]: " + feature - #print(rules) + if idx < len(feature_names) - 1: + constructor = constructor + ", " + idx = idx + 1 - def padleft(rule, level): - for i in range(0, level): - rule = "\t"+rule - return rule + functions.createFile(file_name, constructor + "\n") - #print("def findDecision(obj):") + # ----------------------------------- - max_level = 0 + with open(source, "r", encoding="UTF-8") as f: + rules = json.load(f) - rule_set = [] - #json file might not store rules respectively - for instance in rules: - if len(instance) > 0: - rule = [] - rule.append(instance["current_level"]) - rule.append(instance["leaf_id"]) - rule.append(instance["parents"]) - rule.append(instance["rule"]) - rule.append(instance["feature_name"]) - rule.append(instance["instances"]) - rule.append(instance["metric"]) - rule.append(instance["return_statement"]) - rule_set.append(rule) - #print(padleft(instance["rule"], instance["current_level"])) + logger.debug(rules) - df = np.array(rule_set) + def padleft(rule, level): + for _ in range(0, level): + rule = "\t" + rule + return rule - def extractRules(df, parent = 'root', level=1): + logger.debug("def findDecision(obj):") - level_raw = level * 1; parent_raw = copy.copy(parent) + rule_set = [] + # json file might not store rules respectively + for instance in rules: + if len(instance) > 0: + rule = [] + rule.append(instance["current_level"]) + rule.append(instance["leaf_id"]) + rule.append(instance["parents"]) + rule.append(instance["rule"]) + rule.append(instance["feature_name"]) + rule.append(instance["instances"]) + rule.append(instance["metric"]) + rule.append(instance["return_statement"]) + rule_set.append(rule) + logger.debug(padleft(instance["rule"], instance["current_level"])) - else_rule = "" + df = np.array(rule_set) - leaf_idx = 0 - for i in range(0 ,df.shape[0]): - current_level = int(df[i][0]) - leaf_id = df[i][1] - parent_id = df[i][2] - rule = df[i][3] - feature_name = df[i][4] - instances = int(df[i][5]) - metric = float(df[i][6]) - return_statement = int(df[i][7]) + def extractRules(df, parent="root", level=1): + level_raw = level * 1 + parent_raw = copy.copy(parent) - if parent_id == parent: + else_rule = "" - if_statement = False - if rule[0:2] == "if": - if_statement = True + leaf_idx = 0 + for i in range(0, df.shape[0]): + current_level = int(df[i][0]) + leaf_id = df[i][1] + parent_id = df[i][2] + rule = df[i][3] + feature_name = df[i][4] + instances = int(df[i][5]) + metric = float(df[i][6]) + return_statement = int(df[i][7]) - else_statement = False - if rule[0:5] == "else:": - else_statement = True - else_rule = rule + if parent_id == parent: + if_statement = False + if rule[0:2] == "if": + if_statement = True - #------------------------ + else_statement = False + if rule[0:5] == "else:": + else_statement = True + else_rule = rule - if 
else_statement != True: + # ------------------------ - if if_statement == True and leaf_idx > 0: - rule = "el"+rule + if else_statement != True: + if if_statement == True and leaf_idx > 0: + rule = "el" + rule - #print(padleft(rule, level), "(", leaf_idx,")") + logger.debug(f"{padleft(rule, level)} ({leaf_idx})") - if leaf_idx == 0 and return_statement == 0: - explainer = {} - explainer["feature"] = feature_name - explainer["instances"] = instances - explainer["metric_value"] = round(metric, 4) - explainer["depth"] = current_level - explainer = "# "+json.dumps(explainer) - functions.storeRule(file_name, padleft(explainer, level)) + if leaf_idx == 0 and return_statement == 0: + explainer = {} + explainer["feature"] = feature_name + explainer["instances"] = instances + explainer["metric_value"] = round(metric, 4) + explainer["depth"] = current_level + explainer = "# " + json.dumps(explainer) + functions.storeRule(file_name, padleft(explainer, level)) - functions.storeRule(file_name, padleft(rule, level)) + functions.storeRule(file_name, padleft(rule, level)) - level = level + 1; parent = copy.copy(leaf_id) - extractRules(df, parent, level) - level = level_raw * 1; parent = copy.copy(parent_raw) #restore + level = level + 1 + parent = copy.copy(leaf_id) + extractRules(df, parent, level) + level = level_raw * 1 + parent = copy.copy(parent_raw) # restore - leaf_idx = leaf_idx + 1 + leaf_idx = leaf_idx + 1 - #add else statement + # add else statement - if else_rule != "": - #print(padleft(else_rule, level)) - functions.storeRule(file_name, padleft(else_rule, level)) + if else_rule != "": + logger.debug(padleft(else_rule, level)) + functions.storeRule(file_name, padleft(else_rule, level)) - #------------------------------------ + # ------------------------------------ - extractRules(df) + extractRules(df) - #------------------------------------ + # ------------------------------------ diff --git a/chefboost/tuning/adaboost.py b/chefboost/tuning/adaboost.py index 297fa7c..87b0dd3 100644 --- a/chefboost/tuning/adaboost.py +++ b/chefboost/tuning/adaboost.py @@ -1,136 +1,165 @@ +import imp # pylint: disable=deprecated-module +import math + import pandas as pd import numpy as np +from tqdm import tqdm -from chefboost.commons import functions, evaluate +from chefboost.commons import functions from chefboost.training import Training -from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger -import imp -import math +# pylint: disable=unused-argument + +logger = Logger(module="chefboost/tuning/adaboost.py") -from tqdm import tqdm def findPrediction(row): - epoch = row['Epoch'] - row = row.drop(labels=['Epoch']) - columns = row.shape[0] - - params = [] - for j in range(0, columns-1): - params.append(row[j]) - - moduleName = "outputs/rules/rules_%d" % (epoch) - fp, pathname, description = imp.find_module(moduleName) - myrules = imp.load_module(moduleName, fp, pathname, description) - - prediction = functions.sign(myrules.findDecision(params)) - - return prediction - -def apply(df, config, header, dataset_features, validation_df = None, process_id = None): - - models = []; alphas = [] - - initializeAlphaFile() - - num_of_weak_classifier = config['num_of_weak_classifier'] - - #------------------------ - - rows = df.shape[0]; columns = df.shape[1] - final_predictions = pd.DataFrame(np.zeros([rows, 1]), columns=['prediction']) - - worksheet = df.copy() - worksheet['Weight'] = 1 / rows #uniform distribution initially - - final_predictions = pd.DataFrame(np.zeros((df.shape[0], 2)), 
columns = ['Prediction', 'Actual']) - final_predictions['Actual'] = df['Decision'] - - best_epoch_idx = 0; best_epoch_value = 1000000 - - #for i in range(0, num_of_weak_classifier): - pbar = tqdm(range(0, num_of_weak_classifier), desc='Adaboosting') - for i in pbar: - worksheet['Decision'] = worksheet['Weight'] * worksheet['Decision'] - - root = 1 - file = "outputs/rules/rules_"+str(i)+".py" - - functions.createFile(file, header) - - #print(worksheet) - Training.buildDecisionTree(worksheet.drop(columns=['Weight']) - , root, file, config, dataset_features - , parent_level = 0, leaf_id = 0, parents = 'root', main_process_id = process_id) - - #--------------------------------------- - - moduleName = "outputs/rules/rules_"+str(i) - fp, pathname, description = imp.find_module(moduleName) - myrules = imp.load_module(moduleName, fp, pathname, description) - models.append(myrules) - - #--------------------------------------- - - df['Epoch'] = i - worksheet['Prediction'] = df.apply(findPrediction, axis=1) - df = df.drop(columns = ['Epoch']) - - #--------------------------------------- - worksheet['Actual'] = df['Decision'] - worksheet['Loss'] = abs(worksheet['Actual'] - worksheet['Prediction'])/2 - worksheet['Weight_Times_Loss'] = worksheet['Loss'] * worksheet['Weight'] - - epsilon = worksheet['Weight_Times_Loss'].sum() - alpha = math.log((1 - epsilon)/epsilon)/2 #use alpha to update weights in the next round - alphas.append(alpha) - - #----------------------------- - - #store alpha - addEpochAlpha(i, alpha) - - #----------------------------- - - worksheet['Alpha'] = alpha - worksheet['New_Weights'] = worksheet['Weight'] * (-alpha * worksheet['Actual'] * worksheet['Prediction']).apply(math.exp) - - #normalize - worksheet['New_Weights'] = worksheet['New_Weights'] / worksheet['New_Weights'].sum() - worksheet['Weight'] = worksheet['New_Weights'] - worksheet['Decision'] = df['Decision'] - - final_predictions['Prediction'] = final_predictions['Prediction'] + worksheet['Alpha'] * worksheet['Prediction'] - #print(final_predictions) - worksheet = worksheet.drop(columns = ['New_Weights', 'Prediction', 'Actual', 'Loss', 'Weight_Times_Loss', 'Alpha']) - - mae = (np.abs(final_predictions['Prediction'].apply(functions.sign) - final_predictions['Actual'])/2).sum()/final_predictions.shape[0] - #print(mae) - - if mae < best_epoch_value: - best_epoch_value = mae * 1 - best_epoch_idx = i * 1 - - pbar.set_description("Epoch %d. Loss: %d. 
Process: " % (i+1, mae)) - - #------------------------------ - - print("The best epoch is ",best_epoch_idx," with the ",best_epoch_value," MAE score") - - models = models[0: best_epoch_idx+1] - alphas = alphas[0: best_epoch_idx+1] - - #------------------------------ - - return models, alphas + epoch = row["Epoch"] + row = row.drop(labels=["Epoch"]) + columns = row.shape[0] + + params = [] + for j in range(0, columns - 1): + params.append(row[j]) + + moduleName = f"outputs/rules/rules_{int(epoch)}" + fp, pathname, description = imp.find_module(moduleName) + myrules = imp.load_module(moduleName, fp, pathname, description) + + prediction = functions.sign(myrules.findDecision(params)) + + return prediction + + +def apply(df, config, header, dataset_features, validation_df=None, process_id=None): + models = [] + alphas = [] + + initializeAlphaFile() + + num_of_weak_classifier = config["num_of_weak_classifier"] + + # ------------------------ + + rows = df.shape[0] + final_predictions = pd.DataFrame(np.zeros([rows, 1]), columns=["prediction"]) + + worksheet = df.copy() + worksheet["Weight"] = 1 / rows # uniform distribution initially + + final_predictions = pd.DataFrame(np.zeros((df.shape[0], 2)), columns=["Prediction", "Actual"]) + final_predictions["Actual"] = df["Decision"] + + best_epoch_idx = 0 + best_epoch_value = 1000000 + + # for i in range(0, num_of_weak_classifier): + pbar = tqdm(range(0, num_of_weak_classifier), desc="Adaboosting") + for i in pbar: + worksheet["Decision"] = worksheet["Weight"] * worksheet["Decision"] + + root = 1 + file = "outputs/rules/rules_" + str(i) + ".py" + + functions.createFile(file, header) + + logger.debug(worksheet) + Training.buildDecisionTree( + worksheet.drop(columns=["Weight"]), + root, + file, + config, + dataset_features, + parent_level=0, + leaf_id=0, + parents="root", + main_process_id=process_id, + ) + + # --------------------------------------- + + moduleName = "outputs/rules/rules_" + str(i) + fp, pathname, description = imp.find_module(moduleName) + myrules = imp.load_module(moduleName, fp, pathname, description) + models.append(myrules) + + # --------------------------------------- + + df["Epoch"] = i + worksheet["Prediction"] = df.apply(findPrediction, axis=1) + df = df.drop(columns=["Epoch"]) + + # --------------------------------------- + worksheet["Actual"] = df["Decision"] + worksheet["Loss"] = abs(worksheet["Actual"] - worksheet["Prediction"]) / 2 + worksheet["Weight_Times_Loss"] = worksheet["Loss"] * worksheet["Weight"] + + epsilon = worksheet["Weight_Times_Loss"].sum() + alpha = ( + math.log((1 - epsilon) / epsilon) / 2 + ) # use alpha to update weights in the next round + alphas.append(alpha) + + # ----------------------------- + + # store alpha + addEpochAlpha(i, alpha) + + # ----------------------------- + + worksheet["Alpha"] = alpha + worksheet["New_Weights"] = worksheet["Weight"] * ( + -alpha * worksheet["Actual"] * worksheet["Prediction"] + ).apply(math.exp) + + # normalize + worksheet["New_Weights"] = worksheet["New_Weights"] / worksheet["New_Weights"].sum() + worksheet["Weight"] = worksheet["New_Weights"] + worksheet["Decision"] = df["Decision"] + + final_predictions["Prediction"] = ( + final_predictions["Prediction"] + worksheet["Alpha"] * worksheet["Prediction"] + ) + logger.debug(final_predictions) + worksheet = worksheet.drop( + columns=["New_Weights", "Prediction", "Actual", "Loss", "Weight_Times_Loss", "Alpha"] + ) + + mae = ( + np.abs( + final_predictions["Prediction"].apply(functions.sign) - final_predictions["Actual"] + 
) + / 2 + ).sum() / final_predictions.shape[0] + logger.debug(mae) + + if mae < best_epoch_value: + best_epoch_value = mae * 1 + best_epoch_idx = i * 1 + + pbar.set_description(f"Epoch {i + 1}. Loss: {mae}. Process: ") + + # ------------------------------ + + logger.info(f"The best epoch is {best_epoch_idx} with the {best_epoch_value} MAE score") + + models = models[0 : best_epoch_idx + 1] + alphas = alphas[0 : best_epoch_idx + 1] + + # ------------------------------ + + return models, alphas + def initializeAlphaFile(): - file = "outputs/rules/alphas.py" - header = "def findAlpha(epoch):\n" - functions.createFile(file, header) + file = "outputs/rules/alphas.py" + header = "def findAlpha(epoch):\n" + functions.createFile(file, header) + def addEpochAlpha(epoch, alpha): - file = "outputs/rules/alphas.py" - content = " if epoch == "+str(epoch)+":\n" - content += " return "+str(alpha) - functions.storeRule(file, content) + file = "outputs/rules/alphas.py" + content = " if epoch == " + str(epoch) + ":\n" + content += " return " + str(alpha) + functions.storeRule(file, content) diff --git a/chefboost/tuning/gbm.py b/chefboost/tuning/gbm.py index 0c99477..53a6098 100644 --- a/chefboost/tuning/gbm.py +++ b/chefboost/tuning/gbm.py @@ -1,314 +1,329 @@ +import imp # pylint: disable=deprecated-module +import gc + import pandas as pd import numpy as np +from tqdm import tqdm -import imp +from chefboost.commons import functions +from chefboost.training import Training +from chefboost.commons.logger import Logger -from chefboost.commons import functions, evaluate -from chefboost.training import Preprocess, Training -from chefboost import Chefboost as cb +# pylint: disable=unused-argument -from tqdm import tqdm +logger = Logger(module="chefboost/tuning/gbm.py") -import gc def findPrediction(row): - epoch = row['Epoch'] - row = row.drop(labels=['Epoch']) - columns = row.shape[0] - - params = [] - for j in range(0, columns-1): - params.append(row[j]) - - moduleName = "outputs/rules/rules%s" % (epoch-1) - fp, pathname, description = imp.find_module(moduleName) - myrules = imp.load_module(moduleName, fp, pathname, description) - - #prediction = int(myrules.findDecision(params)) - prediction = myrules.findDecision(params) - - return prediction - -def regressor(df, config, header, dataset_features, validation_df = None, process_id = None): - models = [] - - #we will update decisions in every epoch, this will be used to restore - base_actuals = df.Decision.values - - algorithm = config['algorithm'] - - enableRandomForest = config['enableRandomForest'] - num_of_trees = config['num_of_trees'] - enableMultitasking = config['enableMultitasking'] - - enableGBM = config['enableGBM'] - epochs = config['epochs'] - learning_rate = config['learning_rate'] - - enableAdaboost = config['enableAdaboost'] - - #------------------------------ - - boosted_from = 0; boosted_to = 0 - - #------------------------------ - - base_df = df.copy() - - #gbm will manipulate actuals. store its raw version. 
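Stepping back to the AdaBoost helpers just above: the alphas.py module written by initializeAlphaFile and addEpochAlpha is itself generated Python. After two boosting rounds it would look like this (alpha values illustrative):

    def findAlpha(epoch):
       if epoch == 0:
          return 0.8958797346140275
       if epoch == 1:
          return 0.6931471805599453

Each addEpochAlpha call appends one more if block, so findAlpha returns None for epochs that were never stored.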
- target_values = base_df['Decision'].values - num_of_instances = target_values.shape[0] - - root = 1 - file = "outputs/rules/rules0.py"; json_file = "outputs/rules/rules0.json" - functions.createFile(file, header) - functions.createFile(json_file, "[\n") - - Training.buildDecisionTree(df,root,file, config, dataset_features - , parent_level = 0, leaf_id = 0, parents = 'root') #generate rules0 - - #functions.storeRule(json_file," {}]") - - df = base_df.copy() - - base_df['Boosted_Prediction'] = 0 - - #------------------------------ - - best_epoch_idx = 0; best_epoch_loss = 1000000 - - pbar = tqdm(range(1, epochs+1), desc='Boosting') - - #for index in range(1,epochs+1): - #for index in tqdm(range(1,epochs+1), desc='Boosting'): - for index in pbar: - #print("epoch ",index," - ",end='') - loss = 0 - - #run data(i-1) and rules(i-1), save data1 - - #dynamic import - moduleName = "outputs/rules/rules%s" % (index-1) - fp, pathname, description = imp.find_module(moduleName) - myrules = imp.load_module(moduleName, fp, pathname, description) #rules0 - - models.append(myrules) - - new_data_set = "outputs/data/data%s.csv" % (index) - f = open(new_data_set, "w") - - #put header in the following file - columns = df.shape[1] - - mae = 0 - - #---------------------------------------- - - df['Epoch'] = index - df['Prediction'] = df.apply(findPrediction, axis=1) - - base_df['Boosted_Prediction'] += df['Prediction'] - - loss = (base_df['Boosted_Prediction'] - base_df['Decision']).pow(2).sum() - current_loss = loss / num_of_instances #mse - - if index == 1: - boosted_from = current_loss * 1 - elif index == epochs: - boosted_to = current_loss * 1 - - if current_loss < best_epoch_loss: - best_epoch_loss = current_loss * 1 - best_epoch_idx = index * 1 - - df['Decision'] = int(learning_rate)*(df['Decision'] - df['Prediction']) - df = df.drop(columns = ['Epoch', 'Prediction']) - - #--------------------------------- - - df.to_csv(new_data_set, index=False) - #data(i) created - - #--------------------------------- - - file = "outputs/rules/rules"+str(index)+".py" - json_file = "outputs/rules/rules"+str(index)+".json" - - functions.createFile(file, header) - functions.createFile(json_file, "[\n") - - current_df = df.copy() - Training.buildDecisionTree(df,root,file, config, dataset_features - , parent_level = 0, leaf_id = 0, parents = 'root', main_process_id = process_id) - - #functions.storeRule(json_file," {}]") - - df = current_df.copy() #numeric features require this restoration to apply findDecision function - - #rules(i) created - - loss = loss / num_of_instances - #print("epoch ",index," - loss: ",loss) - #print("loss: ",loss) - pbar.set_description("Epoch %d. Loss: %d. 
Process: " % (index, loss)) - - gc.collect() - - #--------------------------------- - - print("The best epoch is ", best_epoch_idx," with ", best_epoch_loss," loss value") - models = models[0:best_epoch_idx] - config["epochs"] = best_epoch_idx - - print("MSE of ",num_of_instances," instances are boosted from ",boosted_from," to ",best_epoch_loss," in ",epochs," epochs") - - return models - -def classifier(df, config, header, dataset_features, validation_df = None, process_id = None): - - models = [] - - print("gradient boosting for classification") - - epochs = config['epochs'] - enableParallelism = config['enableParallelism'] - - temp_df = df.copy() - original_dataset = df.copy() - worksheet = df.copy() - - classes = df['Decision'].unique() - - boosted_predictions = np.zeros([df.shape[0], len(classes)]) - - pbar = tqdm(range(0, epochs), desc='Boosting') - - #store actual set, we will use this to calculate loss - actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]), columns=classes) - for i in range(0, len(classes)): - current_class = classes[i] - actual_set[current_class] = np.where(df['Decision'] == current_class, 1, 0) - actual_set = actual_set.values #transform it to numpy array - - best_accuracy_idx = 0; best_accuracy_value = 0 - accuracies = [] - - #for epoch in range(0, epochs): - for epoch in pbar: - for i in range(0, len(classes)): - current_class = classes[i] - - if epoch == 0: - temp_df['Decision'] = np.where(df['Decision'] == current_class, 1, 0) - worksheet['Y_'+str(i)] = temp_df['Decision'] - else: - temp_df['Decision'] = worksheet['Y-P_'+str(i)] - - predictions = [] - - #change data type for decision column - temp_df[['Decision']].astype('int64') - - root = 1 - file_base = "outputs/rules/rules-for-"+current_class+"-round-"+str(epoch) - - file = file_base+".py" - functions.createFile(file, header) - - if enableParallelism == True: - json_file = file_base+".json" - functions.createFile(json_file, "[\n") - - Training.buildDecisionTree(temp_df, root, file, config, dataset_features - , parent_level = 0, leaf_id = 0, parents = 'root', main_process_id = process_id) - - #decision rules created - #---------------------------- - - #dynamic import - moduleName = "outputs/rules/rules-for-"+current_class+"-round-"+str(epoch) - fp, pathname, description = imp.find_module(moduleName) - myrules = imp.load_module(moduleName, fp, pathname, description) #rules0 - - models.append(myrules) - - num_of_columns = df.shape[1] - - for row, instance in df.iterrows(): - features = [] - for j in range(0, num_of_columns-1): #iterate on features - features.append(instance[j]) - - actual = temp_df.loc[row]['Decision'] - prediction = myrules.findDecision(features) - - predictions.append(prediction) - - #---------------------------- - if epoch == 0: - worksheet['F_'+str(i)] = 0 - else: - worksheet['F_'+str(i)] = pd.Series(predictions).values - - boosted_predictions[:,i] = boosted_predictions[:,i] + worksheet['F_'+str(i)].values.astype(np.float32) - - #print(boosted_predictions[0:5,:]) - - worksheet['P_'+str(i)] = 0 - - #---------------------------- - temp_df = df.copy() #restoration - - for row, instance in worksheet.iterrows(): - f_scores = [] - for i in range(0, len(classes)): - f_scores.append(instance['F_'+str(i)]) - - probabilities = functions.softmax(f_scores) - - for j in range(0, len(probabilities)): - instance['P_'+str(j)] = probabilities[j] - - worksheet.loc[row] = instance - - for i in range(0, len(classes)): - worksheet['Y-P_'+str(i)] = worksheet['Y_'+str(i)] - worksheet['P_'+str(i)] - 
- prediction_set = np.zeros([df.shape[0], len(classes)]) - for i in range(0, boosted_predictions.shape[0]): - predicted_index = np.argmax(boosted_predictions[i]) - prediction_set[i][predicted_index] = 1 - - #---------------------------- - #find loss for this epoch: prediction_set vs actual_set - classified = 0 - for i in range(0, actual_set.shape[0]): - actual = np.argmax(actual_set[i]) - prediction = np.argmax(prediction_set[i]) - #print("actual: ",actual," - prediction: ",prediction) - - if actual == prediction: - classified = classified + 1 - - accuracy = 100 * classified / actual_set.shape[0] - accuracies.append(accuracy) - - if accuracy > best_accuracy_value: - best_accuracy_value = accuracy * 1 - best_accuracy_idx = epoch * 1 - - #---------------------------- - - #print(worksheet.head()) - #print("round ",epoch+1) - pbar.set_description("Epoch %d. Accuracy: %d. Process: " % (epoch+1, accuracy)) - - gc.collect() - - #-------------------------------- - - print("The best accuracy got in ",best_accuracy_idx," epoch with the score ", best_accuracy_value) - - models = models[0: best_accuracy_idx * len(classes) + len(classes)] - - return models, classes + epoch = row["Epoch"] + row = row.drop(labels=["Epoch"]) + columns = row.shape[0] + + params = [] + for j in range(0, columns - 1): + params.append(row[j]) + + moduleName = f"outputs/rules/rules{epoch - 1}" + fp, pathname, description = imp.find_module(moduleName) + myrules = imp.load_module(moduleName, fp, pathname, description) + + # prediction = int(myrules.findDecision(params)) + prediction = myrules.findDecision(params) + + return prediction + + +def regressor(df, config, header, dataset_features, validation_df=None, process_id=None): + models = [] + + # we will update decisions in every epoch, this will be used to restore + epochs = config["epochs"] + learning_rate = config["learning_rate"] + + boosted_from = 0 + boosted_to = 0 + + base_df = df.copy() + + # gbm will manipulate actuals. store its raw version. 
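The comment above is the crux of the regressor: every epoch overwrites the Decision column with the current residuals, so the raw targets have to be stored once up front to measure loss. A toy sketch of that loop, with a stand-in predictor instead of a fitted tree and learning_rate assumed to be 1:

    import numpy as np

    actuals = np.array([10.0, 14.0, 20.0])  # raw targets, stored once
    residuals = actuals.copy()  # plays the role of the Decision column
    boosted = np.zeros_like(actuals)

    for epoch in range(3):
        prediction = 0.5 * residuals  # a "tree" that captures half of the residual
        boosted += prediction  # accumulates like Boosted_Prediction
        residuals = residuals - prediction  # next epoch fits these residuals
        print(epoch, ((boosted - actuals) ** 2).mean())  # MSE shrinks: 58.0, 14.5, 3.625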
+ target_values = base_df["Decision"].values + num_of_instances = target_values.shape[0] + + root = 1 + file = "outputs/rules/rules0.py" + json_file = "outputs/rules/rules0.json" + functions.createFile(file, header) + functions.createFile(json_file, "[\n") + + Training.buildDecisionTree( + df, root, file, config, dataset_features, parent_level=0, leaf_id=0, parents="root" + ) # generate rules0 + + # functions.storeRule(json_file," {}]") + + df = base_df.copy() + + base_df["Boosted_Prediction"] = 0 + + # ------------------------------ + + best_epoch_idx = 0 + best_epoch_loss = 1000000 + + pbar = tqdm(range(1, epochs + 1), desc="Boosting") + + # for index in range(1,epochs+1): + # for index in tqdm(range(1,epochs+1), desc='Boosting'): + for index in pbar: + logger.debug(f"epoch {index} - ") + loss = 0 + + # run data(i-1) and rules(i-1), save data1 + + # dynamic import + moduleName = f"outputs/rules/rules{index - 1}" + fp, pathname, description = imp.find_module(moduleName) + myrules = imp.load_module(moduleName, fp, pathname, description) # rules0 + + models.append(myrules) + + new_data_set = f"outputs/data/data{index}.csv" + with open(new_data_set, "w", encoding="UTF-8"): + pass + + # ---------------------------------------- + + df["Epoch"] = index + df["Prediction"] = df.apply(findPrediction, axis=1) + + base_df["Boosted_Prediction"] += df["Prediction"] + + loss = (base_df["Boosted_Prediction"] - base_df["Decision"]).pow(2).sum() + current_loss = loss / num_of_instances # mse + + if index == 1: + boosted_from = current_loss * 1 + elif index == epochs: + boosted_to = current_loss * 1 + logger.debug(f"Boosted to {boosted_to}") + + if current_loss < best_epoch_loss: + best_epoch_loss = current_loss * 1 + best_epoch_idx = index * 1 + + df["Decision"] = int(learning_rate) * (df["Decision"] - df["Prediction"]) + df = df.drop(columns=["Epoch", "Prediction"]) + + # --------------------------------- + + df.to_csv(new_data_set, index=False) + # data(i) created + + # --------------------------------- + + file = "outputs/rules/rules" + str(index) + ".py" + json_file = "outputs/rules/rules" + str(index) + ".json" + + functions.createFile(file, header) + functions.createFile(json_file, "[\n") + + current_df = df.copy() + Training.buildDecisionTree( + df, + root, + file, + config, + dataset_features, + parent_level=0, + leaf_id=0, + parents="root", + main_process_id=process_id, + ) + + # functions.storeRule(json_file," {}]") + + df = ( + current_df.copy() + ) # numeric features require this restoration to apply findDecision function + + # rules(i) created + + loss = loss / num_of_instances + logger.debug(f"epoch {index} - loss: {loss}") + logger.debug(f"loss: {loss}") + pbar.set_description(f"Epoch {index}. Loss: {loss}. 
Process: ") + + gc.collect() + + # --------------------------------- + + logger.info(f"The best epoch is {best_epoch_idx} with {best_epoch_loss} loss value") + models = models[0:best_epoch_idx] + config["epochs"] = best_epoch_idx + + logger.info( + f"MSE of {num_of_instances} instances are boosted from {boosted_from}" + f"to {best_epoch_loss} in {epochs} epochs" + ) + + return models + + +def classifier(df, config, header, dataset_features, validation_df=None, process_id=None): + models = [] + + logger.info("gradient boosting for classification") + + epochs = config["epochs"] + enableParallelism = config["enableParallelism"] + + temp_df = df.copy() + worksheet = df.copy() + + classes = df["Decision"].unique() + + boosted_predictions = np.zeros([df.shape[0], len(classes)]) + + pbar = tqdm(range(0, epochs), desc="Boosting") + + # store actual set, we will use this to calculate loss + actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]), columns=classes) + for current_class in classes: + actual_set[current_class] = np.where(df["Decision"] == current_class, 1, 0) + actual_set = actual_set.values # transform it to numpy array + + best_accuracy_idx = 0 + best_accuracy_value = 0 + accuracies = [] + + # for epoch in range(0, epochs): + for epoch in pbar: + for i, current_class in enumerate(classes): + + if epoch == 0: + temp_df["Decision"] = np.where(df["Decision"] == current_class, 1, 0) + worksheet["Y_" + str(i)] = temp_df["Decision"] + else: + temp_df["Decision"] = worksheet["Y-P_" + str(i)] + + predictions = [] + + # change data type for decision column + temp_df[["Decision"]].astype("int64") + + root = 1 + file_base = "outputs/rules/rules-for-" + current_class + "-round-" + str(epoch) + + file = file_base + ".py" + functions.createFile(file, header) + + if enableParallelism == True: + json_file = file_base + ".json" + functions.createFile(json_file, "[\n") + + Training.buildDecisionTree( + temp_df, + root, + file, + config, + dataset_features, + parent_level=0, + leaf_id=0, + parents="root", + main_process_id=process_id, + ) + + # decision rules created + # ---------------------------- + + # dynamic import + moduleName = "outputs/rules/rules-for-" + current_class + "-round-" + str(epoch) + fp, pathname, description = imp.find_module(moduleName) + myrules = imp.load_module(moduleName, fp, pathname, description) # rules0 + + models.append(myrules) + + num_of_columns = df.shape[1] + + for row, instance in df.iterrows(): + features = [] + for j in range(0, num_of_columns - 1): # iterate on features + features.append(instance[j]) + + actual = temp_df.loc[row]["Decision"] + prediction = myrules.findDecision(features) + + predictions.append(prediction) + + # ---------------------------- + if epoch == 0: + worksheet["F_" + str(i)] = 0 + else: + worksheet["F_" + str(i)] = pd.Series(predictions).values + + boosted_predictions[:, i] = boosted_predictions[:, i] + worksheet[ + "F_" + str(i) + ].values.astype(np.float32) + + logger.debug(boosted_predictions[0:5, :]) + + worksheet["P_" + str(i)] = 0 + + # ---------------------------- + temp_df = df.copy() # restoration + + for row, instance in worksheet.iterrows(): + f_scores = [] + for i in range(0, len(classes)): + f_scores.append(instance["F_" + str(i)]) + + probabilities = functions.softmax(f_scores) + + for j, current_prob in enumerate(probabilities): + instance["P_" + str(j)] = current_prob + + worksheet.loc[row] = instance + + for i in range(0, len(classes)): + worksheet["Y-P_" + str(i)] = worksheet["Y_" + str(i)] - worksheet["P_" + 
+
+        prediction_set = np.zeros([df.shape[0], len(classes)])
+        for i in range(0, boosted_predictions.shape[0]):
+            predicted_index = np.argmax(boosted_predictions[i])
+            prediction_set[i][predicted_index] = 1
+
+        # ----------------------------
+        # find loss for this epoch: prediction_set vs actual_set
+        classified = 0
+        for i in range(0, actual_set.shape[0]):
+            actual = np.argmax(actual_set[i])
+            prediction = np.argmax(prediction_set[i])
+            logger.debug(f"actual: {actual} - prediction: {prediction}")
+
+            if actual == prediction:
+                classified = classified + 1
+
+        accuracy = 100 * classified / actual_set.shape[0]
+        accuracies.append(accuracy)
+
+        if accuracy > best_accuracy_value:
+            best_accuracy_value = accuracy * 1
+            best_accuracy_idx = epoch * 1
+
+        # ----------------------------
+
+        logger.debug(worksheet.head())
+        logger.debug(f"round {epoch + 1}")
+        pbar.set_description(f"Epoch {epoch + 1}. Accuracy: {accuracy}. Process: ")
+
+        gc.collect()
+
+    # --------------------------------
+
+    logger.info(
+        f"The best accuracy is achieved in epoch {best_accuracy_idx} with the score {best_accuracy_value}"
+    )
+
+    models = models[0 : best_accuracy_idx * len(classes) + len(classes)]
+
+    return models, classes
diff --git a/chefboost/tuning/randomforest.py b/chefboost/tuning/randomforest.py
index 163c8a5..743a8f0 100644
--- a/chefboost/tuning/randomforest.py
+++ b/chefboost/tuning/randomforest.py
@@ -1,92 +1,123 @@
-import pandas as pd
-import numpy as np
-from multiprocessing import Pool
 import multiprocessing
 from contextlib import closing
-from chefboost.commons import functions, evaluate
-from chefboost.training import Training
-from chefboost import Chefboost as cb
-from tqdm import tqdm
-import imp
-import os
-
-def apply(df, config, header, dataset_features, validation_df = None, process_id = None):
-
-    models = []
-
-    num_of_trees = config['num_of_trees']
-
-    parallelism_on = config["enableParallelism"]
-
-    #TODO: is this logical for 48x2 cores?
-    #config["enableParallelism"] = False #run each tree in parallel but each branch in serial
-
-    #TODO: reconstruct for parallel run is problematic. you should reconstruct based on tree id.
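One detail worth noting before the rewritten apply below: both the old and new versions draw each sub-tree's training set with df.sample(frac=1 / num_of_trees), that is an independent draw of roughly 1/num_of_trees of the rows without replacement per tree, rather than classical bootstrap bagging. A minimal sketch of that sampling:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": np.arange(12), "Decision": np.arange(12) % 2})
    num_of_trees = 3

    for i in range(num_of_trees):
        subset = df.sample(frac=1 / num_of_trees)  # ~4 of 12 rows, drawn independently per tree
        print(f"tree {i} trains on rows {sorted(subset.index.tolist())}")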
- - input_params = [] - - pbar = tqdm(range(0, num_of_trees), desc='Bagging') - for i in pbar: - pbar.set_description("Sub decision tree %d is processing" % (i+1)) - subset = df.sample(frac=1/num_of_trees) - - root = 1 - - moduleName = "outputs/rules/rule_"+str(i) - file = moduleName+".py" +import imp # pylint: disable=deprecated-module - functions.createFile(file, header) - - if parallelism_on: #parallel run - input_params.append((subset, root, file, config, dataset_features, 0, 0, 'root', i, None, process_id)) - - else: #serial run - Training.buildDecisionTree(subset,root, file, config, dataset_features, parent_level = 0, leaf_id = 0, parents = 'root', tree_id = i, main_process_id = process_id) - - #------------------------------- - - if parallelism_on: - num_cores = config["num_cores"] - - #--------------------------------- - - if num_of_trees <= num_cores: - POOL_SIZE = num_of_trees - else: - POOL_SIZE = num_cores - - with closing(multiprocessing.Pool(POOL_SIZE)) as pool: - funclist = [] - for input_param in input_params: - f = pool.apply_async(buildDecisionTree, [*input_param]) - funclist.append(f) - - #all functions registered here - #results = [] - for f in tqdm(funclist): - branch_results = f.get(timeout = 100000) - #results.append(branch_results) - - pool.close() - pool.terminate() +from tqdm import tqdm - #------------------------------- - #collect models for both serial and parallel here - for i in range(0, num_of_trees): - moduleName = "outputs/rules/rule_"+str(i) - fp, pathname, description = imp.find_module(moduleName) - myrules = imp.load_module(moduleName, fp, pathname, description) - models.append(myrules) +from chefboost.commons import functions +from chefboost.training import Training - #------------------------------- +# pylint: disable=unused-argument - return models -#wrapper for parallel run -def buildDecisionTree(df, root, file, config, dataset_features, parent_level, leaf_id, parents, tree_id, validation_df = None, process_id = None): - Training.buildDecisionTree(df, root, file, config, dataset_features, parent_level = parent_level, leaf_id =leaf_id, parents = parents, tree_id = tree_id, main_process_id = process_id) +def apply(df, config, header, dataset_features, validation_df=None, process_id=None): + models = [] -""" -def buildDecisionTreeWrapper(func, args): - return func(*args) -""" + num_of_trees = config["num_of_trees"] + + parallelism_on = config["enableParallelism"] + + # TODO: is this logical for 48x2 cores? + # config["enableParallelism"] = False #run each tree in parallel but each branch in serial + + # TODO: reconstruct for parallel run is problematic. you should reconstruct based on tree id. 
+ + input_params = [] + + pbar = tqdm(range(0, num_of_trees), desc="Bagging") + for i in pbar: + pbar.set_description(f"Sub decision tree {i + 1} is processing") + subset = df.sample(frac=1 / num_of_trees) + + root = 1 + + moduleName = "outputs/rules/rule_" + str(i) + file = moduleName + ".py" + + functions.createFile(file, header) + + if parallelism_on: # parallel run + input_params.append( + (subset, root, file, config, dataset_features, 0, 0, "root", i, None, process_id) + ) + + else: # serial run + Training.buildDecisionTree( + subset, + root, + file, + config, + dataset_features, + parent_level=0, + leaf_id=0, + parents="root", + tree_id=i, + main_process_id=process_id, + ) + + # ------------------------------- + + if parallelism_on: + num_cores = config["num_cores"] + + # --------------------------------- + + if num_of_trees <= num_cores: + POOL_SIZE = num_of_trees + else: + POOL_SIZE = num_cores + + with closing(multiprocessing.Pool(POOL_SIZE)) as pool: + funclist = [] + for input_param in input_params: + f = pool.apply_async(buildDecisionTree, [*input_param]) + funclist.append(f) + + # all functions registered here + # results = [] + for f in tqdm(funclist): + _ = f.get(timeout=100000) # this was branch_results + # results.append(branch_results) + + pool.close() + pool.terminate() + + # ------------------------------- + # collect models for both serial and parallel here + for i in range(0, num_of_trees): + moduleName = "outputs/rules/rule_" + str(i) + fp, pathname, description = imp.find_module(moduleName) + myrules = imp.load_module(moduleName, fp, pathname, description) + models.append(myrules) + + # ------------------------------- + + return models + + +# wrapper for parallel run +def buildDecisionTree( + df, + root, + file, + config, + dataset_features, + parent_level, + leaf_id, + parents, + tree_id, + validation_df=None, + process_id=None, +): + Training.buildDecisionTree( + df, + root, + file, + config, + dataset_features, + parent_level=parent_level, + leaf_id=leaf_id, + parents=parents, + tree_id=tree_id, + main_process_id=process_id, + ) diff --git a/requirements.txt b/requirements.txt index 845b072..ccd2cff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pip install pandas==0.22.0 -pip install numpy==1.14.0 -pip install tqdm==4.30.0 -pip install psutil==5.4.3 \ No newline at end of file +pandas>=0.22.0 +numpy>=1.14.0 +tqdm>=4.30.0 +psutil>=5.4.3 \ No newline at end of file diff --git a/scripts/push-release.sh b/scripts/push-release.sh new file mode 100644 index 0000000..5b3e6fa --- /dev/null +++ b/scripts/push-release.sh @@ -0,0 +1,11 @@ +cd .. 
+ +echo "deleting existing release related files" +rm -rf dist/* +rm -rf build/* + +echo "creating a package for current release - pypi compatible" +python setup.py sdist bdist_wheel + +echo "pushing the release to pypi" +python -m twine upload dist/* \ No newline at end of file diff --git a/setup.py b/setup.py index b2594b9..d157070 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,9 @@ with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() +with open("requirements.txt", "r", encoding="utf-8") as f: + requirements = f.read().split("\n") + setuptools.setup( name="chefboost", version="0.0.18", @@ -19,5 +22,5 @@ "Operating System :: OS Independent", ], python_requires='>=3.6', - install_requires=["pandas>=0.22.0", "numpy>=1.14.0", "tqdm>=4.30.0", "psutil>=5.4.3"] + install_requires=requirements ) diff --git a/tests/global-unit-test.py b/tests/global-unit-test.py index ba513ff..a519828 100644 --- a/tests/global-unit-test.py +++ b/tests/global-unit-test.py @@ -1,309 +1,339 @@ +import gc import pandas as pd -import sys from chefboost import Chefboost as cb -import gc +from chefboost.commons.logger import Logger -pd.set_option('display.max_rows', 500) -pd.set_option('display.max_columns', 500) -pd.set_option('display.width', 1000) +pd.set_option("display.max_rows", 500) +pd.set_option("display.max_columns", 500) +pd.set_option("display.width", 1000) -#---------------------------------------------- +logger = Logger(module="tests/global-unit-test.py") + +# ---------------------------------------------- parallelism_cases = [True] -#parallelism_cases = [False] -#parallelism_cases = [False, True] +# parallelism_cases = [False] +# parallelism_cases = [False, True] + +if __name__ == "__main__": -if __name__ == '__main__': + for enableParallelism in parallelism_cases: - for enableParallelism in parallelism_cases: + logger.info("*************************") + logger.info(f"enableParallelism is set to {enableParallelism}") + logger.info("*************************") - print("*************************") - print("enableParallelism is set to ",enableParallelism) - print("*************************") + logger.info("no config passed") + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df) - print("no config passed ") - df = pd.read_csv("dataset/golf.txt") - model = cb.fit(df) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("Validation set case") - print("Validation set case") + df = pd.read_csv("dataset/golf.txt") + validation_df = pd.read_csv("dataset/golf.txt") + config = {"algorithm": "ID3", "enableParallelism": enableParallelism} + model = cb.fit(df, config, validation_df=validation_df) - df = pd.read_csv("dataset/golf.txt") - validation_df = pd.read_csv("dataset/golf.txt") - config = {'algorithm': 'ID3', 'enableParallelism': enableParallelism} - model = cb.fit(df, config, validation_df = validation_df) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("Feature importance") + # decision_rules = model["trees"][0].__dict__["__name__"]+".py" + decision_rules = model["trees"][0].__dict__["__spec__"].origin + logger.info(cb.feature_importance(decision_rules)) - print("Feature importance") - #decision_rules = model["trees"][0].__dict__["__name__"]+".py" - decision_rules = model["trees"][0].__dict__["__spec__"].origin - print(cb.feature_importance(decision_rules)) + logger.info("-------------------------") - 
print("-------------------------") + logger.info("ID3 for nominal features and nominal target:") + df = pd.read_csv("dataset/golf.txt") - print("ID3 for nominal features and nominal target:") - df = pd.read_csv("dataset/golf.txt") + config = {"algorithm": "ID3", "enableParallelism": enableParallelism} + model = cb.fit(df, config) - config = {'algorithm': 'ID3', 'enableParallelism': enableParallelism} - model = cb.fit(df, config) + validation_df = pd.read_csv("dataset/golf.txt") - validation_df = pd.read_csv("dataset/golf.txt") + logger.info("External validation") + cb.evaluate(model, validation_df) - print("External validation") - cb.evaluate(model, validation_df) + cb.save_model(model) + logger.info("built model is saved to model.pkl") - cb.save_model(model) - print("built model is saved to model.pkl") + restored_model = cb.load_model("model.pkl") + logger.info("built model is restored from model.pkl") - restored_model = cb.load_model("model.pkl") - print("built model is restored from model.pkl") + instance = ["Sunny", "Hot", "High", "Weak"] + prediction = cb.predict(restored_model, instance) - instance = ['Sunny', 'Hot', 'High', 'Weak'] - prediction = cb.predict(restored_model, instance) + logger.info(f"prediction for {instance} is {prediction}") - print("prediction for ", instance, "is ", prediction) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("ID3 for nominal/numeric features and nominal target:") + config = {"algorithm": "ID3", "enableParallelism": enableParallelism} + model = cb.fit(pd.read_csv("dataset/golf2.txt"), config) - print("ID3 for nominal/numeric features and nominal target:") - config = {'algorithm': 'ID3', 'enableParallelism': enableParallelism} - model = cb.fit(pd.read_csv("dataset/golf2.txt"), config) + instance = ["Sunny", 85, 85, "Weak"] + prediction = cb.predict(model, instance) + logger.info(f"prediction for {instance} is {prediction}") - instance = ['Sunny', 85, 85, 'Weak'] - prediction = cb.predict(model, instance) - print("prediction for ", instance, "is ", prediction) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("C4.5 for nominal/numeric features and nominal target:") + config = {"algorithm": "C4.5", "enableParallelism": enableParallelism} + cb.fit(pd.read_csv("dataset/golf2.txt"), config) - print("C4.5 for nominal/numeric features and nominal target:") - config = {'algorithm': 'C4.5', 'enableParallelism': enableParallelism} - cb.fit(pd.read_csv("dataset/golf2.txt"), config) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("CART for nominal/numeric features and nominal target:") + config = {"algorithm": "CART", "enableParallelism": enableParallelism} + cb.fit(pd.read_csv("dataset/golf2.txt"), config) - print("CART for nominal/numeric features and nominal target:") - config = {'algorithm': 'CART', 'enableParallelism': enableParallelism} - cb.fit(pd.read_csv("dataset/golf2.txt"), config) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("CHAID for nominal features and nominal target:") + config = {"algorithm": "CHAID", "enableParallelism": enableParallelism} + cb.fit(pd.read_csv("dataset/golf.txt"), config) - print("CHAID for nominal features and nominal target:") - config = {'algorithm': 'CHAID', 'enableParallelism': enableParallelism} - 
cb.fit(pd.read_csv("dataset/golf.txt"), config) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("CHAID for nominal/numeric features and nominal target:") + config = {"algorithm": "CHAID", "enableParallelism": enableParallelism} + cb.fit(pd.read_csv("dataset/golf2.txt"), config) - print("CHAID for nominal/numeric features and nominal target:") - config = {'algorithm': 'CHAID', 'enableParallelism': enableParallelism} - cb.fit(pd.read_csv("dataset/golf2.txt"), config) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("regression tree for nominal features, numeric target") + config = {"algorithm": "Regression", "enableParallelism": enableParallelism} + cb.fit(pd.read_csv("dataset/golf3.txt"), config) - print("regression tree for nominal features, numeric target") - config = {'algorithm': 'Regression', 'enableParallelism': enableParallelism} - cb.fit(pd.read_csv("dataset/golf3.txt"), config) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info("regression tree for nominal/numeric features, numeric target") + config = {"algorithm": "Regression", "enableParallelism": enableParallelism} + cb.fit(pd.read_csv("dataset/golf4.txt"), config) - print("regression tree for nominal/numeric features, numeric target") - config = {'algorithm': 'Regression', 'enableParallelism': enableParallelism} - cb.fit(pd.read_csv("dataset/golf4.txt"), config) + gc.collect() - gc.collect() + logger.info("-------------------------") - print("-------------------------") + logger.info( + "algorithm must be regression tree for numetic target. set any other algorithm." + ) + config = {"algorithm": "ID3", "enableParallelism": enableParallelism} + cb.fit(pd.read_csv("dataset/golf4.txt"), config) - print("algorithm must be regression tree for numetic target. 
set any other algorithm.")
-    config = {'algorithm': 'ID3', 'enableParallelism': enableParallelism}
-    cb.fit(pd.read_csv("dataset/golf4.txt"), config)
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("ID3 for nominal features and target (large data set)")
-    config = {'algorithm': 'ID3', 'enableParallelism': enableParallelism}
-    model = cb.fit(pd.read_csv("dataset/car.data"), config)
+    logger.info("ID3 for nominal features and target (large data set)")
+    config = {"algorithm": "ID3", "enableParallelism": enableParallelism}
+    model = cb.fit(pd.read_csv("dataset/car.data"), config)
-    instance = ['vhigh','vhigh',2,'2','small','low']
-    prediction = cb.predict(model, instance)
-    print(prediction)
+    instance = ["vhigh", "vhigh", 2, "2", "small", "low"]
+    prediction = cb.predict(model, instance)
+    logger.info(prediction)
-    instance = ['high','high','4','more','big','high']
-    prediction = cb.predict(model, instance)
-    print(prediction)
+    instance = ["high", "high", "4", "more", "big", "high"]
+    prediction = cb.predict(model, instance)
+    logger.info(prediction)
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("C4.5 for nominal features and target (large data set)")
-    config = {'algorithm': 'C4.5', 'enableParallelism': enableParallelism}
-    cb.fit(pd.read_csv("dataset/car.data"), config)
+    logger.info("C4.5 for nominal features and target (large data set)")
+    config = {"algorithm": "C4.5", "enableParallelism": enableParallelism}
+    cb.fit(pd.read_csv("dataset/car.data"), config)
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("CART for nominal features and target (large data set)")
-    config = {'algorithm': 'CART', 'enableParallelism': enableParallelism}
-    cb.fit(pd.read_csv("dataset/car.data"), config)
+    logger.info("CART for nominal features and target (large data set)")
+    config = {"algorithm": "CART", "enableParallelism": enableParallelism}
+    cb.fit(pd.read_csv("dataset/car.data"), config)
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("CHAID for nominal features and target (large data set)")
-    config = {'algorithm': 'CHAID', 'enableParallelism': enableParallelism}
-    df = pd.read_csv("dataset/car.data")
-    cb.fit(df, config)
+    logger.info("CHAID for nominal features and target (large data set)")
+    config = {"algorithm": "CHAID", "enableParallelism": enableParallelism}
+    df = pd.read_csv("dataset/car.data")
+    cb.fit(df, config)
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("Iris with regular decision tree")
-    config = {'algorithm': 'ID3'}
-    df = pd.read_csv("dataset/iris.data", names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"])
-    model = cb.fit(df, config)
+    logger.info("Iris with regular decision tree")
+    config = {"algorithm": "ID3"}
+    df = pd.read_csv(
+        "dataset/iris.data",
+        names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"],
+    )
+    model = cb.fit(df, config)
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("Adaboost")
-    config = {'algorithm': 'ID3', 'enableAdaboost': True, 'num_of_weak_classifier': 10, 'enableParallelism': False}
-    df = pd.read_csv("dataset/adaboost.txt")
-    validation_df = df.copy()
+    logger.info("Adaboost")
+    config = {
+        "algorithm": "ID3",
+        "enableAdaboost": True,
+        "num_of_weak_classifier": 10,
+        "enableParallelism": False,
+    }
+    df = pd.read_csv("dataset/adaboost.txt")
+    validation_df = df.copy()
-    model = cb.fit(df, config
-        , validation_df = validation_df
-    )
+    model = cb.fit(df, config, validation_df=validation_df)
-    instance = [4, 3.5]
-    #prediction = cb.predict(model, instance)
-    #print("prediction for ",instance," is ",prediction)
+    instance = [4, 3.5]
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("Regular GBM")
-    config = {'algorithm': 'CART', 'enableGBM': True, 'epochs': 10, 'learning_rate': 1, 'enableParallelism': enableParallelism}
-    df = pd.read_csv("dataset/golf4.txt")
-    validation_df = pd.read_csv("dataset/golf4.txt")
-    model = cb.fit(df, config
-        , validation_df = validation_df
-    )
+    logger.info("Regular GBM")
+    config = {
+        "algorithm": "CART",
+        "enableGBM": True,
+        "epochs": 10,
+        "learning_rate": 1,
+        "enableParallelism": enableParallelism,
+    }
+    df = pd.read_csv("dataset/golf4.txt")
+    validation_df = pd.read_csv("dataset/golf4.txt")
+    model = cb.fit(df, config, validation_df=validation_df)
-    instance = ['Sunny',85,85,'Weak']
-    prediction = cb.predict(model, instance)
-    print("prediction for ",instance," is ",prediction)
+    instance = ["Sunny", 85, 85, "Weak"]
+    prediction = cb.predict(model, instance)
+    logger.info(f"prediction for {instance} is {prediction}")
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("GBM for classification")
-    config = {'algorithm': 'ID3', 'enableGBM': True, 'epochs': 10, 'learning_rate': 1, 'enableParallelism': enableParallelism}
+    logger.info("GBM for classification")
+    config = {
+        "algorithm": "ID3",
+        "enableGBM": True,
+        "epochs": 10,
+        "learning_rate": 1,
+        "enableParallelism": enableParallelism,
+    }
-    df = pd.read_csv("dataset/iris.data", names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"])
-    validation_df = df.copy()
+    df = pd.read_csv(
+        "dataset/iris.data",
+        names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"],
+    )
+    validation_df = df.copy()
-    model = cb.fit(df, config
-        , validation_df = validation_df
-    )
+    model = cb.fit(df, config, validation_df=validation_df)
-    instance = [7.0,3.2,4.7,1.4]
-    prediction = cb.predict(model, instance)
-    print("prediction for ",instance," is ",prediction)
+    instance = [7.0, 3.2, 4.7, 1.4]
+    prediction = cb.predict(model, instance)
+    logger.info(f"prediction for {instance} is {prediction}")
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("Random forest")
-    config = {'algorithm': 'ID3', 'enableRandomForest': True, 'num_of_trees': 3
-        , 'enableParallelism': enableParallelism
-    }
-    df = pd.read_csv("dataset/car.data")
-    validation_df = pd.read_csv("dataset/car.data")
-    model = cb.fit(pd.read_csv("dataset/car.data"), config
-        #, validation_df = validation_df
-    )
+    logger.info("Random forest")
+    config = {
+        "algorithm": "ID3",
+        "enableRandomForest": True,
+        "num_of_trees": 3,
+        "enableParallelism": enableParallelism,
+    }
+    df = pd.read_csv("dataset/car.data")
+    validation_df = pd.read_csv("dataset/car.data")
+    model = cb.fit(
+        pd.read_csv("dataset/car.data"),
+        config
+        # , validation_df = validation_df
+    )
-    print("Feature importance of random forest")
-    decision_rules = []
-    for tree in model["trees"]:
-        decision_rule = tree.__dict__["__spec__"].origin
-        decision_rules.append(decision_rule)
+    logger.info("Feature importance of random forest")
+    decision_rules = []
+    for tree in model["trees"]:
+        decision_rule = tree.__dict__["__spec__"].origin
+        decision_rules.append(decision_rule)
-    df = cb.feature_importance(decision_rules)
-    print(df)
+    df = cb.feature_importance(decision_rules)
+    logger.info(df)
-    instance = ['vhigh','vhigh',2,'2','small','low']
-    prediction = cb.predict(model, instance)
-    print("prediction for ",instance," is ",prediction)
+    instance = ["vhigh", "vhigh", 2, "2", "small", "low"]
+    prediction = cb.predict(model, instance)
+    logger.info(f"prediction for {instance} is {prediction}")
-    instance = ['high','high',4,'more','big','high']
-    prediction = cb.predict(model, instance)
-    print("prediction for ",instance," is ",prediction)
+    instance = ["high", "high", 4, "more", "big", "high"]
+    prediction = cb.predict(model, instance)
+    logger.info(f"prediction for {instance} is {prediction}")
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("Random forest for regression")
-    config = {'algorithm': 'ID3', 'enableRandomForest': True, 'num_of_trees': 5, 'enableMultitasking': False, 'enableParallelism': enableParallelism}
-    df = pd.read_csv("dataset/car_reg.data")
-    model = cb.fit(pd.read_csv("dataset/car_reg.data"), config)
-    validation_df = pd.read_csv("dataset/car_reg.data")
-    cb.evaluate(model, validation_df)
-    instance = ['high','high',4,'more','big','high']
-    prediction = cb.predict(model, instance)
-    print("prediction for ",instance," is ",prediction)
+    logger.info("Random forest for regression")
+    config = {
+        "algorithm": "ID3",
+        "enableRandomForest": True,
+        "num_of_trees": 5,
+        "enableMultitasking": False,
+        "enableParallelism": enableParallelism,
+    }
+    df = pd.read_csv("dataset/car_reg.data")
+    model = cb.fit(pd.read_csv("dataset/car_reg.data"), config)
+    validation_df = pd.read_csv("dataset/car_reg.data")
+    cb.evaluate(model, validation_df)
+    instance = ["high", "high", 4, "more", "big", "high"]
+    prediction = cb.predict(model, instance)
+    logger.info(f"prediction for {instance} is {prediction}")
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-
-    print("Is there any none predictions?")
-    config = {'algorithm': 'C4.5', 'enableParallelism': enableParallelism}
-    model = cb.fit(pd.read_csv("dataset/none_train.txt"), config)
-    test_set = pd.read_csv("dataset/none_test.txt")
-    instance = test_set.iloc[3]
-    print(instance.values, "->", cb.predict(model, instance))
+    logger.info("Is there any none predictions?")
+    config = {"algorithm": "C4.5", "enableParallelism": enableParallelism}
+    model = cb.fit(pd.read_csv("dataset/none_train.txt"), config)
+    test_set = pd.read_csv("dataset/none_test.txt")
+    instance = test_set.iloc[3]
+    logger.info(f"{instance.values} -> {cb.predict(model, instance)}")
-    gc.collect()
+    gc.collect()
-    print("-------------------------")
+    logger.info("-------------------------")
-    print("-------------------------")
-    print("unit tests completed successfully...")
+    logger.info("-------------------------")
+    logger.info("unit tests completed successfully...")
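The first patch above routes all test output through the module-level logger that the new chefboost/commons/logger.py provides, instead of bare print calls. Below is a minimal sketch of that pattern for reference; it assumes the new helper is a thin wrapper over Python's stdlib logging module, and build_logger is a hypothetical stand-in since the helper's actual interface is not shown in this series.

    # Minimal sketch only -- not the real chefboost/commons/logger.py.
    # Assumes a thin wrapper over the stdlib logging module; build_logger is hypothetical.
    import logging

    def build_logger(name: str = "chefboost") -> logging.Logger:
        logger = logging.getLogger(name)
        if not logger.handlers:  # avoid attaching duplicate handlers on repeated imports
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
            logger.addHandler(handler)
        logger.setLevel(logging.INFO)
        return logger

    logger = build_logger()
    logger.info("unit tests completed successfully...")  # replaces the old print call
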
From 1531ff73f8663df69190916296b13f087701c415 Mon Sep 17 00:00:00 2001
From: Sefik Ilkin Serengil
Date: Sat, 23 Dec 2023 11:07:32 +0000
Subject: [PATCH 2/3] print df in actions

---
 tests/global-unit-test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/global-unit-test.py b/tests/global-unit-test.py
index a519828..defa3a0 100644
--- a/tests/global-unit-test.py
+++ b/tests/global-unit-test.py
@@ -25,6 +25,7 @@
     logger.info("no config passed")
     df = pd.read_csv("dataset/golf.txt")
+    logger.info(df.head())
     model = cb.fit(df)
     gc.collect()

From b0c06a97fd426c69d238e670d368daa148019632 Mon Sep 17 00:00:00 2001
From: Sefik Ilkin Serengil
Date: Sat, 23 Dec 2023 11:12:08 +0000
Subject: [PATCH 3/3] enforce dependencies

---
 .github/workflows/tests.yml | 2 +-
 tests/global-unit-test.py   | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 65d670b..5a5350b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -34,7 +34,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install pytest
+          pip install pandas==1.3.5 numpy==1.22.3 tqdm==4.62.3 psutil==5.9.0
           pip install .

       - name: Test with pytest

diff --git a/tests/global-unit-test.py b/tests/global-unit-test.py
index defa3a0..a519828 100644
--- a/tests/global-unit-test.py
+++ b/tests/global-unit-test.py
@@ -25,7 +25,6 @@
     logger.info("no config passed")
     df = pd.read_csv("dataset/golf.txt")
-    logger.info(df.head())
     model = cb.fit(df)
     gc.collect()
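The third patch pins the CI test environment to exact dependency versions (pandas 1.3.5, numpy 1.22.3, tqdm 4.62.3, psutil 5.9.0) instead of whatever pip would resolve, and drops the temporary df.head() logging again. The flow the test script exercises is unchanged; condensed below from the ID3/car.data section of the first patch, assuming the package is installed and the script runs from the tests/ directory so the dataset path resolves.

    import logging

    import pandas as pd
    from chefboost import Chefboost as cb

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Train a plain ID3 decision tree on the car evaluation data set.
    config = {"algorithm": "ID3", "enableParallelism": False}
    model = cb.fit(pd.read_csv("dataset/car.data"), config)

    # Predict one instance; feature values follow the training column order.
    instance = ["vhigh", "vhigh", 2, "2", "small", "low"]
    prediction = cb.predict(model, instance)
    logger.info(f"prediction for {instance} is {prediction}")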