From bb3eaadb9a70c6eafc472b606225833dc4f8d8c9 Mon Sep 17 00:00:00 2001
From: Ch0p1k3 <ch00p.228@gmail.com>
Date: Sat, 1 Jul 2023 14:34:14 +0000
Subject: [PATCH] Tutorial to apply CatBoost model from C++

---
 apply_model/cpp/.clang-format                 |  12 +
 apply_model/cpp/.clang-tidy                   |  49 +++
 apply_model/cpp/.gitignore                    |  98 +++++
 apply_model/cpp/CMakeLists.txt                |  10 +
 apply_model/cpp/README.md                     |  21 +
 apply_model/cpp/bin/CMakeLists.txt            |  10 +
 apply_model/cpp/bin/main.cpp                  | 153 +++++++
 apply_model/cpp/build/.gitignore              |   2 +
 apply_model/cpp/cmake/Ccache.cmake            |   6 +
 apply_model/cpp/cmake/CheckCompiler.cmake     |  16 +
 apply_model/cpp/cmake/CompileOptions.cmake    |  20 +
 apply_model/cpp/cmake/Development.cmake       |   2 +
 apply_model/cpp/model/.gitignore              |   2 +
 apply_model/cpp/model/train_model.ipynb       | 389 ++++++++++++++++++
 apply_model/cpp/third_party/CMakeLists.txt    |   2 +
 .../cpp/third_party/argparse/CMakeLists.txt   |   9 +
 .../cpp/third_party/catboost/CMakeLists.txt   |  33 ++
 17 files changed, 834 insertions(+)
 create mode 100644 apply_model/cpp/.clang-format
 create mode 100644 apply_model/cpp/.clang-tidy
 create mode 100644 apply_model/cpp/.gitignore
 create mode 100644 apply_model/cpp/CMakeLists.txt
 create mode 100644 apply_model/cpp/README.md
 create mode 100644 apply_model/cpp/bin/CMakeLists.txt
 create mode 100644 apply_model/cpp/bin/main.cpp
 create mode 100644 apply_model/cpp/build/.gitignore
 create mode 100644 apply_model/cpp/cmake/Ccache.cmake
 create mode 100644 apply_model/cpp/cmake/CheckCompiler.cmake
 create mode 100644 apply_model/cpp/cmake/CompileOptions.cmake
 create mode 100644 apply_model/cpp/cmake/Development.cmake
 create mode 100644 apply_model/cpp/model/.gitignore
 create mode 100644 apply_model/cpp/model/train_model.ipynb
 create mode 100644 apply_model/cpp/third_party/CMakeLists.txt
 create mode 100644 apply_model/cpp/third_party/argparse/CMakeLists.txt
 create mode 100644 apply_model/cpp/third_party/catboost/CMakeLists.txt

diff --git a/apply_model/cpp/.clang-format b/apply_model/cpp/.clang-format
new file mode 100644
index 0000000..589a571
--- /dev/null
+++ b/apply_model/cpp/.clang-format
@@ -0,0 +1,12 @@
+BasedOnStyle: Google
+---
+Language: Cpp
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLambdasOnASingleLine: Empty
+DerivePointerAlignment: false
+PointerAlignment: Left
+SortIncludes: false
+PackConstructorInitializers: Never
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: AfterComma
diff --git a/apply_model/cpp/.clang-tidy b/apply_model/cpp/.clang-tidy
new file mode 100644
index 0000000..f264464
--- /dev/null
+++ b/apply_model/cpp/.clang-tidy
@@ -0,0 +1,49 @@
+---
+
+Checks: '-*,cppcoreguidelines-avoid-goto,cppcoreguidelines-pro-type-const-cast, google-runtime-int, modernize-use-nullptr, readability-braces-around-statements, readability-container-size-empty, readability-redundant-control-flow, readability-identifier-naming, readability-simplify-boolean-expr, google-build-using-namespace, readability-implicit-bool-conversion, google-explicit-constructor'
+
+HeaderFilterRegex: '\.hpp$'
+
+WarningsAsErrors: '*'
+
+CheckOptions:
+  - key: readability-identifier-naming.NamespaceCase
+    value: lower_case
+  - key: readability-identifier-naming.ClassCase
+    value: CamelCase
+  - key: readability-identifier-naming.StructCase
+    value: CamelCase
+  - key: readability-identifier-naming.TypedefCase
+    value: CamelCase
+  - key: readability-identifier-naming.TypeAliasCase
+    value: CamelCase
+  - key: readability-identifier-naming.FunctionCase
+    value: CamelCase
+  - key: readability-identifier-naming.ParameterCase
+    value: lower_case
+  - key: readability-identifier-naming.VariableCase
+    value: lower_case
+  - key: readability-identifier-naming.PrivateMemberCase
+    value: lower_case
+  - key: readability-identifier-naming.PrivateMemberSuffix
+    value: '_'
+  - key: readability-identifier-naming.GlobalConstantCase
+    value: CamelCase
+  - key: readability-identifier-naming.GlobalConstantPrefix
+    value: k
+  - key: readability-identifier-naming.StaticConstantCase
+    value: CamelCase
+  - key: readability-identifier-naming.StaticConstantPrefix
+    value: k
+  - key: readability-identifier-naming.ConstexprVariableCase
+    value: CamelCase
+  - key: readability-identifier-naming.ConstexprVariablePrefix
+    value: k
+  - key: readability-identifier-naming.TypeTemplateParameterCase
+    value: CamelCase
+  - key: readability-simplify-boolean-expr.ChainedConditionalReturn
+    value: '1'
+  - key: readability-simplify-boolean-expr.ChainedConditionalAssignment
+    value: '1'
+  - key: readability-identifier-naming.TypeTemplateParameterIgnoredRegexp
+    value: expr-type
diff --git a/apply_model/cpp/.gitignore b/apply_model/cpp/.gitignore
new file mode 100644
index 0000000..6aa8435
--- /dev/null
+++ b/apply_model/cpp/.gitignore
@@ -0,0 +1,98 @@
+# Created by .ignore support plugin (hsz.mobi)
+### C++ template
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.app
+### CMake template
+CMakeCache.txt
+CMakeFiles
+CMakeScripts
+Makefile
+cmake_install.cmake
+install_manifest.txt
+CTestTestfile.cmake
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+.idea/vcs.xml
+.idea/jsLibraryMappings.xml
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/dataSources.local.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+## File-based project format:
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Idea
+.idea/
+cmake-build-debug/
+cmake-build-release/
+cmake-*
+
+# VS code
+.vscode/
+
+# YouCompleteMe VIM plugin
+**/.ycm_extra_conf.py
+
+# Python
+__pycache__
+
+# Clangd
+.cache/
+
+# CTest
+Testing/
diff --git a/apply_model/cpp/CMakeLists.txt b/apply_model/cpp/CMakeLists.txt
new file mode 100644
index 0000000..b839e24
--- /dev/null
+++ b/apply_model/cpp/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.14)
+project(apply-model)
+
+# Shared build configuration, loaded in this order before the subdirectories are added.
+foreach(build_module Ccache CheckCompiler CompileOptions Development)
+    include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/${build_module}.cmake")
+endforeach()
+
+add_subdirectory(bin)
+add_subdirectory(third_party)
diff --git a/apply_model/cpp/README.md b/apply_model/cpp/README.md
new file mode 100644
index 0000000..94af647
--- /dev/null
+++ b/apply_model/cpp/README.md
@@ -0,0 +1,21 @@
+# Apply CatBoost model from C++
+This tutorial consists of two parts:
+- The first part, where we preprocess the dataset and train the classifier model.
+  This part can be found in [train_model.ipynb](model/train_model.ipynb).
+- The second part, where we load the model into a C++ application and then apply it.
+  This part is presented as a small CMake project.
+
+  To configure CMake, execute:
+  ```bash
+  cmake -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_C_COMPILER:FILEPATH=<path_to_clang> -DCMAKE_CXX_COMPILER:FILEPATH=<path_to_clang++> -Bbuild -G "Unix Makefiles"
+  ```
+
+  Build target `apply_model`:
+  ```bash
+  cmake --build build --config Release --target apply_model
+  ```
+
+  Run binary:
+  ```bash
+  build/bin/apply_model -m model/adult.cbm
+  ```
diff --git a/apply_model/cpp/bin/CMakeLists.txt b/apply_model/cpp/bin/CMakeLists.txt
new file mode 100644
index 0000000..f969882
--- /dev/null
+++ b/apply_model/cpp/bin/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Tutorial executable: loads a trained CatBoost model and applies it to sample rows.
+add_executable(apply_model main.cpp)
+
+# PRIVATE: an executable has no consumers, so nothing needs to propagate further.
+target_link_libraries(
+    apply_model
+    PRIVATE
+        argparse
+        catboost
+)
diff --git a/apply_model/cpp/bin/main.cpp b/apply_model/cpp/bin/main.cpp
new file mode 100644
index 0000000..9ee75bc
--- /dev/null
+++ b/apply_model/cpp/bin/main.cpp
@@ -0,0 +1,153 @@
+#include <cmath>
+#include <filesystem>
+#include <iostream>
+#include <string>
+
+#include <argparse/argparse.hpp>
+#include <catboost/c_api.h>
+#include <catboost/wrapped_calcer.h>
+
+// Logistic function: maps a raw model score onto the [0, 1] interval.
+float Sigmoid(const float x) {
+  return 1. / (std::exp(-x) + 1.);
+}
+
+// Renders a binary classification outcome as a human-readable phrase.
+std::string Answer(const bool makes_over_50k_a_year) {
+  return makes_over_50k_a_year ? "makes over 50K a year"
+                               : "doesn't make over 50K a year";
+}
+
+int main(const int argc, const char* argv[]) {
+  argparse::ArgumentParser program("apply-model", "",
+                                   argparse::default_arguments::help);
+  program.add_argument("-m", "--model")
+      .help("path to model")
+      .metavar("model")
+      .action([](const auto& path) {
+        return std::filesystem::path(path);
+      });
+  program.parse_args(argc, argv);
+  const auto model_path = program.get<std::filesystem::path>("-m");
+
+  // Load the model that we trained in the Jupyter Notebook
+  ModelCalcerWrapper model(model_path);
+  std::cout << "Adult dataset model metainformation" << std::endl;
+  std::cout << "Tree count: " << model.GetTreeCount() << std::endl;
+
+  // In our case we were solving a binary classification problem (whether a person makes over 50K a year), so the
+  // dimension of the prediction will be 1: a single raw score for the object belonging to the positive class
+  // (it is turned into a probability below); in our case we had two classes encoded as "<=50K" and ">50K", during
+  // data preprocessing (see `get_fixed_adult()` in Notebook) we encoded "<=50K" as 0 and ">50K" as 1, so that
+  // ">50K" became a positive class. Probability of the negative class ("<=50K") can be easily deduced as (1-p)
+  // where p is a probability of positive class.
+  //
+  // For most of cases prediction dimension will be 1 (for regression and for ranking), it can be N for cases of
+  // multiclassification, where N is a number of classes.
+  std::cout << "numeric feature count: " << model.GetFloatFeaturesCount()
+            << std::endl;
+  std::cout << "categoric feature count: " << model.GetCatFeaturesCount()
+            << std::endl;
+
+  std::cout << std::endl;
+
+  // When we were training CatBoost we used a default classification threshold for AUC which is equal to 0.5,
+  // this means that our formula is optimized for this threshold, though we may change threshold to optimize some
+  // other metric on a different dataset, but we won't do it in this tutorial.
+  static constexpr auto kClassificationThreshold = 0.5;
+
+  // Ok now lets try to use our model for prediction. We'll look at the test part of Adult dataset. You will need
+  // to download it [1] from UCI repository. Look for "adult.test", "adult.names" will also be useful because it
+  // contains human-readable description of the dataset.
+  //
+  // So the first line of test part of the dataset is:
+  //
+  // "25, Private, 226802, 11th, 7, Never-married, Machine-op-inspct, Own-child, Black, Male, 0, 0, 40, United-States, <=50K."
+  //
+  // Based on "adult.names" we can recover its vectors of numeric and categoric features (in our case all
+  // "continuous" features are numeric and all other features are categoric):
+  //
+  // numericFeatures: {25, 226802, 7, 0, 0, 40}
+  // categoricFeatures: {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child", "Black", "Male", "United-States"}
+  //
+  // And he doesn't make 50K per year. Also note that order of numeric and categoric features in source data and
+  // in `numericFeatures` and `categoricFeatures` is kept the same. Otherwise we can't apply the model (well, we
+  // can, but result of prediction will be garbage).
+  //
+  // Now lets run it! And let's call this person "person A", to make variable names unique.
+  //
+  // [1]: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/
+  const std::vector<float> person_a_numeric_features(
+      {25., 226'802., 7., 0., 0., 40.});
+  const std::vector<std::string> person_a_categoric_features(
+      {"Private", "11th", "Never-married", "Machine-op-inspct", "Own-child",
+       "Black", "Male", "United-States"});
+  const auto person_a_prediction =
+      model.Calc(person_a_numeric_features, person_a_categoric_features);
+  const auto person_a_makes_over_50k_probability = Sigmoid(person_a_prediction);
+
+  // Since we made a prediction for only one person and the prediction dimension is 1, `person_a_prediction` is
+  // a single raw value that we pass straight to `Sigmoid`.
+  //
+  // CatBoost doesn't compute "probability" here; to turn the raw CatBoost prediction into a probability we need
+  // to apply the sigmoid function, which is what we did above.
+  const auto person_a_makes_over_50k =
+      person_a_makes_over_50k_probability > kClassificationThreshold;
+  std::cout << "Person A make over 50K a year with probability "
+            << person_a_makes_over_50k_probability << std::endl;
+  std::cout << "Person A " << Answer(person_a_makes_over_50k) << std::endl;
+  std::cout << std::endl;
+
+  // Now lets find an example with missing features and income greater than 50K a year. At line 40 of "adult.test"
+  // we can find following line:
+  //
+  // "40, Private, 85019, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 45, ?, >50K."
+  //
+  // Lets call this person "Person B", dataset missing (missing features are marked with "?") "native-country"
+  // feature for Person B. When we were doing preprocessing in `get_fixed_adult` we replaced missing categoric
+  // features with string "nan", now, when we apply trained model we must also use "nan" for missing features.
+  // Lets write out feature vectors for Person B:
+  //
+  // numericFeatures = {40, 85019, 16, 0, 0, 45};
+  // categoricFeatures = {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty", "Husband", "Asian-Pac-Islander", "Male", "nan"};
+  //
+  // And according to the dataset Person B makes more than 50K a year. Ok, lets try to apply the model to this
+  // example. Categoric values must be spelled exactly as in the dataset line quoted above.
+  const std::vector<float> person_b_numeric_features(
+      {40., 85019., 16., 0., 0., 45.});
+  const std::vector<std::string> person_b_categoric_features(
+      {"Private", "Doctorate", "Married-civ-spouse", "Prof-specialty",
+       "Husband", "Asian-Pac-Islander", "Male", "nan"});
+  const auto person_b_prediction =
+      model.Calc(person_b_numeric_features, person_b_categoric_features);
+  const auto person_b_makes_over_50k_probability = Sigmoid(person_b_prediction);
+
+  const auto person_b_makes_over_50k =
+      person_b_makes_over_50k_probability > kClassificationThreshold;
+  std::cout << "Person B make over 50K a year with probability "
+            << person_b_makes_over_50k_probability << std::endl;
+  std::cout << "Person B " << Answer(person_b_makes_over_50k) << std::endl;
+  std::cout << std::endl;
+
+  // Let's try to apply the model to Person A and Person B in one call.
+  const std::vector<std::vector<float>> persons_ab_numeric_features =
+      {person_a_numeric_features, person_b_numeric_features};
+  const std::vector<std::vector<std::string>> persons_ab_categoric_features =
+      {person_a_categoric_features, person_b_categoric_features};
+  const auto persons_ab_predictions = model.Calc(
+      persons_ab_numeric_features, persons_ab_categoric_features);
+  const std::vector<float> persons_ab_make_over_50k_probabilities = {
+      Sigmoid(persons_ab_predictions[0]), Sigmoid(persons_ab_predictions[1])};
+  const std::vector<bool> persons_ab_make_over_50k = {
+      persons_ab_make_over_50k_probabilities[0] > kClassificationThreshold,
+      persons_ab_make_over_50k_probabilities[1] > kClassificationThreshold};
+
+  // Predictions should be same as above
+  std::cout << "Using batch interface" << std::endl;
+  std::cout << "Person A make over 50K a year with probability "
+            << persons_ab_make_over_50k_probabilities[0] << std::endl;
+  std::cout << "Person A " << Answer(persons_ab_make_over_50k[0]) << std::endl;
+  std::cout << "Person B make over 50K a year with probability "
+            << persons_ab_make_over_50k_probabilities[1] << std::endl;
+  std::cout << "Person B " << Answer(persons_ab_make_over_50k[1]) << std::endl;
+}
diff --git a/apply_model/cpp/build/.gitignore b/apply_model/cpp/build/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/apply_model/cpp/build/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/apply_model/cpp/cmake/Ccache.cmake b/apply_model/cpp/cmake/Ccache.cmake
new file mode 100644
index 0000000..9eee24d
--- /dev/null
+++ b/apply_model/cpp/cmake/Ccache.cmake
@@ -0,0 +1,6 @@
+# Compile through ccache when it is installed (only compilation is cached, not linking).
+find_program(CCACHE_FOUND ccache)
+if(CCACHE_FOUND AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
+    message(STATUS "Using ccache")
+    set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_FOUND}")
+endif()
diff --git a/apply_model/cpp/cmake/CheckCompiler.cmake b/apply_model/cpp/cmake/CheckCompiler.cmake
new file mode 100644
index 0000000..96cbfb0
--- /dev/null
+++ b/apply_model/cpp/cmake/CheckCompiler.cmake
@@ -0,0 +1,16 @@
+# Toolchain gate: stop configuration unless the C++ compiler is a recent enough Clang.
+set(REQUIRED_CXX_COMPILER "Clang")
+set(CXX_COMPILER_MIN_VERSION 14.0)
+
+message(STATUS "C++ compiler: ${CMAKE_CXX_COMPILER}")
+
+if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "${REQUIRED_CXX_COMPILER}")
+    message(FATAL_ERROR
+        "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}. Use ${REQUIRED_CXX_COMPILER}, version >= ${CXX_COMPILER_MIN_VERSION}."
+        )
+elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "${CXX_COMPILER_MIN_VERSION}")
+    message(FATAL_ERROR
+        "Old version of ${REQUIRED_CXX_COMPILER} compiler: ${CMAKE_CXX_COMPILER_VERSION}, required ${CXX_COMPILER_MIN_VERSION}."
+        )
+endif()
diff --git a/apply_model/cpp/cmake/CompileOptions.cmake b/apply_model/cpp/cmake/CompileOptions.cmake
new file mode 100644
index 0000000..ae5d757
--- /dev/null
+++ b/apply_model/cpp/cmake/CompileOptions.cmake
@@ -0,0 +1,20 @@
+# Common compile options for C++ (directory scope: inherited by every target added later).
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+include_directories(
+    "${CMAKE_CURRENT_SOURCE_DIR}/src"
+    "${CMAKE_CURRENT_BINARY_DIR}/src"
+    "${CMAKE_CURRENT_BINARY_DIR}/third_party"  # downloaded <catboost/...> headers live under here
+)
+
+# https://clang.llvm.org/docs/DiagnosticsReference.html -- warnings are errors, minus the -Wno-error= ones
+add_compile_options(
+    -Wall -Wextra -Wpedantic -fno-omit-frame-pointer
+    -Werror -Wno-language-extension-token
+    -Wno-error=unused-command-line-argument -Wno-error=unused-but-set-variable
+)
+
+message(STATUS "C++ standard: ${CMAKE_CXX_STANDARD}")
diff --git a/apply_model/cpp/cmake/Development.cmake b/apply_model/cpp/cmake/Development.cmake
new file mode 100644
index 0000000..8916b56
--- /dev/null
+++ b/apply_model/cpp/cmake/Development.cmake
@@ -0,0 +1,2 @@
+# Emit compile_commands.json for tooling: https://cmake.org/cmake/help/v3.14/variable/CMAKE_EXPORT_COMPILE_COMMANDS.html
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
diff --git a/apply_model/cpp/model/.gitignore b/apply_model/cpp/model/.gitignore
new file mode 100644
index 0000000..8d0f665
--- /dev/null
+++ b/apply_model/cpp/model/.gitignore
@@ -0,0 +1,2 @@
+adult.cbm
+catboost_info/
diff --git a/apply_model/cpp/model/train_model.ipynb b/apply_model/cpp/model/train_model.ipynb
new file mode 100644
index 0000000..285a873
--- /dev/null
+++ b/apply_model/cpp/model/train_model.ipynb
@@ -0,0 +1,389 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# catboost for cpp tutorial"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install -q numpy==1.23.4 pandas catboost"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from __future__ import absolute_import, division, print_function, unicode_literals"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CatBoost version 1.2\n",
+      "NumPy version 1.23.4\n",
+      "Pandas version 2.0.3\n"
+     ]
+    }
+   ],
+   "source": [
+    "import catboost as cb\n",
+    "import catboost.datasets as cbd\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "# print module versions for reproducibility\n",
+    "print('CatBoost version {}'.format(cb.__version__))\n",
+    "print('NumPy version {}'.format(np.__version__))\n",
+    "print('Pandas version {}'.format(pd.__version__))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "    Download \"Adult Data Set\" [1] from UCI Machine Learning Repository.\n",
+      "\n",
+      "    Will return two pandas.DataFrame-s, first with train part (adult.data) and second with test part\n",
+      "    (adult.test) of the dataset.\n",
+      "\n",
+      "    [1]: https://archive.ics.uci.edu/ml/datasets/Adult\n",
+      "    \n"
+     ]
+    }
+   ],
+   "source": [
+    "# We are going to use UCI Adult Data Set because it has both numerical and categorical \n",
+    "# features and also has missing features.\n",
+    "print(cbd.adult.__doc__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_fixed_adult():\n",
+    "    train, test = cbd.adult()\n",
+    "    \n",
+    "    # CatBoost doesn't support pandas.DataFrame missing values for categorical features out \n",
+    "    # of the box (seed issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker). So \n",
+    "    # we have to replace them with some designated string manually. \n",
+    "    for dataset in (train, test, ):\n",
+    "        for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n",
+    "            dataset[name].fillna('nan', inplace=True)\n",
+    "    \n",
+    "    X_train, y_train = train.drop('income', axis=1), train.income\n",
+    "    X_test, y_test = test.drop('income', axis=1), test.income\n",
+    "    return X_train, y_train, X_test, y_test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_2431430/2445291177.py:8: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. \n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n",
+      "/tmp/ipykernel_2431430/2445291177.py:8: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. \n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  for name in (name for name, dtype in dict(dataset.dtypes).items() if dtype == np.object):\n"
+     ]
+    }
+   ],
+   "source": [
+    "X_train, y_train, _, _ = get_fixed_adult()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>age</th>\n",
+       "      <th>workclass</th>\n",
+       "      <th>fnlwgt</th>\n",
+       "      <th>education</th>\n",
+       "      <th>education-num</th>\n",
+       "      <th>marital-status</th>\n",
+       "      <th>occupation</th>\n",
+       "      <th>relationship</th>\n",
+       "      <th>race</th>\n",
+       "      <th>sex</th>\n",
+       "      <th>capital-gain</th>\n",
+       "      <th>capital-loss</th>\n",
+       "      <th>hours-per-week</th>\n",
+       "      <th>native-country</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>39.0</td>\n",
+       "      <td>State-gov</td>\n",
+       "      <td>77516.0</td>\n",
+       "      <td>Bachelors</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>Never-married</td>\n",
+       "      <td>Adm-clerical</td>\n",
+       "      <td>Not-in-family</td>\n",
+       "      <td>White</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>2174.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>40.0</td>\n",
+       "      <td>United-States</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>50.0</td>\n",
+       "      <td>Self-emp-not-inc</td>\n",
+       "      <td>83311.0</td>\n",
+       "      <td>Bachelors</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>Married-civ-spouse</td>\n",
+       "      <td>Exec-managerial</td>\n",
+       "      <td>Husband</td>\n",
+       "      <td>White</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>United-States</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>38.0</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>215646.0</td>\n",
+       "      <td>HS-grad</td>\n",
+       "      <td>9.0</td>\n",
+       "      <td>Divorced</td>\n",
+       "      <td>Handlers-cleaners</td>\n",
+       "      <td>Not-in-family</td>\n",
+       "      <td>White</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>40.0</td>\n",
+       "      <td>United-States</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>53.0</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>234721.0</td>\n",
+       "      <td>11th</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>Married-civ-spouse</td>\n",
+       "      <td>Handlers-cleaners</td>\n",
+       "      <td>Husband</td>\n",
+       "      <td>Black</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>40.0</td>\n",
+       "      <td>United-States</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>28.0</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>338409.0</td>\n",
+       "      <td>Bachelors</td>\n",
+       "      <td>13.0</td>\n",
+       "      <td>Married-civ-spouse</td>\n",
+       "      <td>Prof-specialty</td>\n",
+       "      <td>Wife</td>\n",
+       "      <td>Black</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>40.0</td>\n",
+       "      <td>Cuba</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    age         workclass    fnlwgt  education  education-num  \\\n",
+       "0  39.0         State-gov   77516.0  Bachelors           13.0   \n",
+       "1  50.0  Self-emp-not-inc   83311.0  Bachelors           13.0   \n",
+       "2  38.0           Private  215646.0    HS-grad            9.0   \n",
+       "3  53.0           Private  234721.0       11th            7.0   \n",
+       "4  28.0           Private  338409.0  Bachelors           13.0   \n",
+       "\n",
+       "       marital-status         occupation   relationship   race     sex  \\\n",
+       "0       Never-married       Adm-clerical  Not-in-family  White    Male   \n",
+       "1  Married-civ-spouse    Exec-managerial        Husband  White    Male   \n",
+       "2            Divorced  Handlers-cleaners  Not-in-family  White    Male   \n",
+       "3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   \n",
+       "4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   \n",
+       "\n",
+       "   capital-gain  capital-loss  hours-per-week native-country  \n",
+       "0        2174.0           0.0            40.0  United-States  \n",
+       "1           0.0           0.0            13.0  United-States  \n",
+       "2           0.0           0.0            40.0  United-States  \n",
+       "3           0.0           0.0            40.0  United-States  \n",
+       "4           0.0           0.0            40.0           Cuba  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X_train.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_2431430/982828455.py:15: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<catboost.core.CatBoostClassifier at 0x7f79e6088520>"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# If you want to find out how we found these parameters check \"Simple classification \n",
+    "# example with missing feature handling and parameter tuning\" tutorial in `classification`\n",
+    "# subdirectory of tutorials\n",
+    "model = cb.CatBoostClassifier(\n",
+    "    class_names=('<=50K', '>50K'),\n",
+    "    loss_function='Logloss',\n",
+    "    eval_metric='AUC', \n",
+    "    custom_metric=['AUC'],\n",
+    "    iterations=100,\n",
+    "    random_seed=20181224,\n",
+    "    learning_rate=0.4234185321620083, \n",
+    "    depth=5, \n",
+    "    l2_leaf_reg=9.464266235679002)\n",
+    "model.fit(\n",
+    "    cb.Pool(X_train, y_train, cat_features=np.where(X_train.dtypes != np.float)[0]),\n",
+    "    verbose=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save_model('adult.cbm')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "72K\tadult.cbm\n"
+     ]
+    }
+   ],
+   "source": [
+    "!du -sh adult.cbm"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/apply_model/cpp/third_party/CMakeLists.txt b/apply_model/cpp/third_party/CMakeLists.txt
new file mode 100644
index 0000000..2330eae
--- /dev/null
+++ b/apply_model/cpp/third_party/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(argparse)  # command-line parser, pulled in with FetchContent
+add_subdirectory(catboost)  # downloads libcatboostmodel.so plus its API headers
diff --git a/apply_model/cpp/third_party/argparse/CMakeLists.txt b/apply_model/cpp/third_party/argparse/CMakeLists.txt
new file mode 100644
index 0000000..9153f1b
--- /dev/null
+++ b/apply_model/cpp/third_party/argparse/CMakeLists.txt
@@ -0,0 +1,9 @@
+include(FetchContent)
+
+FetchContent_Declare(
+    argparse
+    GIT_REPOSITORY https://github.com/p-ranav/argparse.git
+    GIT_TAG v2.9  # pinned tag
+    GIT_PROGRESS TRUE
+)
+FetchContent_MakeAvailable(argparse)  # makes the declared content part of this build
diff --git a/apply_model/cpp/third_party/catboost/CMakeLists.txt b/apply_model/cpp/third_party/catboost/CMakeLists.txt
new file mode 100644
index 0000000..22e6c1e
--- /dev/null
+++ b/apply_model/cpp/third_party/catboost/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Downloads the libcatboostmodel.so release binary and the matching model_interface
+# sources/headers at configure time, then exposes them as the imported target "catboost".
+set(CATBOOST_TAG v1.2)
+set(CATBOOST_BASE_PATH ${CMAKE_BINARY_DIR}/third_party/catboost)
+set(CATBOOST_RELEASE_URL https://github.com/catboost/catboost/releases/download/${CATBOOST_TAG})
+set(CATBOOST_SOURCE_URL
+    https://raw.githubusercontent.com/catboost/catboost/${CATBOOST_TAG}/catboost/libs/model_interface
+)
+
+# catboost_download(<url> <destination>)
+# Downloads <url> to <destination> and aborts configuration when the transfer fails,
+# instead of letting the build break later on a missing or incomplete file.
+function(catboost_download url destination)
+    file(DOWNLOAD "${url}" "${destination}" SHOW_PROGRESS STATUS download_status)
+    list(GET download_status 0 download_code)
+    if(NOT download_code EQUAL 0)
+        list(GET download_status 1 download_reason)
+        file(REMOVE "${destination}")
+        message(FATAL_ERROR "Failed to download ${url}: ${download_reason}")
+    endif()
+endfunction()
+
+catboost_download("${CATBOOST_RELEASE_URL}/libcatboostmodel.so" "${CATBOOST_BASE_PATH}/libcatboostmodel.so")
+foreach(catboost_file c_api.cpp c_api.h wrapped_calcer.h)
+    catboost_download("${CATBOOST_SOURCE_URL}/${catboost_file}" "${CATBOOST_BASE_PATH}/${catboost_file}")
+endforeach()
+
+# Headers are included as <catboost/...>, so the parent directory is the include root.
+add_library(catboost SHARED IMPORTED GLOBAL)
+set_target_properties(catboost PROPERTIES
+    IMPORTED_LOCATION "${CATBOOST_BASE_PATH}/libcatboostmodel.so"
+    INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_BINARY_DIR}/third_party"
+)