-
Notifications
You must be signed in to change notification settings - Fork 0
/
fastai_gbms
1 lines (1 loc) · 61.9 KB
/
fastai_gbms
1
{"metadata":{"colab":{"provenance":[]},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":73290,"databundleVersionId":8710574,"sourceType":"competition"},{"sourceId":8574851,"sourceType":"datasetVersion","datasetId":5127429}],"dockerImageVersionId":30715,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false},"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"}},"nbformat_minor":4,"nbformat":4,"cells":[{"source":"<a href=\"https://www.kaggle.com/code/rubanzasilva/fastai-gradient-boosting?scriptVersionId=184360814\" target=\"_blank\"><img align=\"left\" alt=\"Kaggle\" title=\"Open in Kaggle\" src=\"https://kaggle.com/static/images/open-in-kaggle.svg\"></a>","metadata":{},"cell_type":"markdown"},{"cell_type":"markdown","source":"# **STUDENT ACADEMIC SUCCESS**\n\nHere we build a classification model to predict students' dropout and academic success.\n\nThe dataset contains information collected from a higher education institution related to students undertaking different degree programs.\nThe original dataset contains information known at the time of student enrollment and the student's academic performance at the end of the 1st and 2nd semester.\n\nThe target is split into three distinct categories, namely Dropout, Enrolled and Graduate.\n\nIn this spirit, I shall build classification models using various architectures to predict the student's dropout and academic success. This can then be used to predict which students are most likely to drop out at an early stage so that strategies can be put in place to counter this.\n\nThat would help reduce the rate of academic dropout and failure.","metadata":{}},{"cell_type":"markdown","source":"## Import Libraries and 
Datasets","metadata":{"id":"9OxKc8QxLWir"}},{"cell_type":"markdown","source":"### Libraries","metadata":{"id":"fYKulgvoLlxF"}},{"cell_type":"markdown","source":"## Imports\n\nBelow, i import all the libraries and datasets needed for this competition.","metadata":{"id":"YRiKSLsk_8Ia"}},{"cell_type":"code","source":"!pip install catboost\n!pip install optuna\n!pip install optuna_distributed\n!pip install openfe","metadata":{"id":"MVFLmLWg_8Ih","outputId":"0f41f00f-df50-4173-a4f0-b30c536499ed","scrolled":true,"_kg_hide-output":true,"execution":{"iopub.status.busy":"2024-06-19T03:25:16.761928Z","iopub.execute_input":"2024-06-19T03:25:16.762435Z","iopub.status.idle":"2024-06-19T03:26:22.622895Z","shell.execute_reply.started":"2024-06-19T03:25:16.762396Z","shell.execute_reply":"2024-06-19T03:26:22.621638Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#hide\n#! [ -e /content ]\n\n#hide\n#This imports and sets up everything you will need for this notebook\n#\n#!pip install -Uqq fastbook\n#import fastbook\n#fastbook.setup_book()\n\n#from fastbook import *\n\n\nfrom fastai.tabular.all import *\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport numpy as np\nfrom numpy import random\nfrom tqdm import tqdm\nfrom ipywidgets import interact\n\nfrom fastai.imports import *\nnp.set_printoptions(linewidth=130)\n\n\nfrom pathlib import Path\nimport os\n\n\nfrom sklearn.ensemble import RandomForestRegressor,RandomForestClassifier\nfrom sklearn.metrics import roc_auc_score,accuracy_score,mean_squared_error\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import VotingClassifier\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import KFold, cross_val_score\n\n\n\n#transformers and pipeline\nfrom sklearn.compose import ColumnTransformer, make_column_transformer\nfrom sklearn.pipeline import Pipeline, make_pipeline\nfrom sklearn import set_config\n\nimport 
xgboost as xgb\nfrom xgboost import plot_importance\nfrom xgboost import XGBClassifier\n\nimport lightgbm as lgb\nfrom lightgbm import LGBMClassifier\n\nfrom catboost import CatBoostClassifier,CatBoostRegressor,Pool, metrics, cv\n\n\n\n\n\nimport optuna\nfrom optuna.samplers import TPESampler\nfrom optuna.visualization import plot_contour\nfrom optuna.visualization import plot_edf\nfrom optuna.visualization import plot_intermediate_values\nfrom optuna.visualization import plot_optimization_history\nfrom optuna.visualization import plot_parallel_coordinate\nfrom optuna.visualization import plot_param_importances\nfrom optuna.visualization import plot_slice\nfrom optuna.samplers import TPESampler\nimport warnings\n\n\nmatplotlib.rc('image', cmap='Greys')\n\n#from fastkaggle import setup_comp\n\n\n\nfrom openfe import OpenFE, transform\n\n#from IPython.display import FileLink\n\n\n","metadata":{"id":"iKvCdJ05_8Im","execution":{"iopub.status.busy":"2024-06-19T03:26:22.625746Z","iopub.execute_input":"2024-06-19T03:26:22.626199Z","iopub.status.idle":"2024-06-19T03:26:25.652541Z","shell.execute_reply.started":"2024-06-19T03:26:22.626136Z","shell.execute_reply":"2024-06-19T03:26:25.651346Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"id":"h4tHhDlX_8Iv","outputId":"9ddbe59f-a542-4564-836c-d02d7a757dd5","execution":{"iopub.status.busy":"2024-06-19T03:26:25.654058Z","iopub.execute_input":"2024-06-19T03:26:25.654418Z","iopub.status.idle":"2024-06-19T03:26:25.6681Z","shell.execute_reply.started":"2024-06-19T03:26:25.654389Z","shell.execute_reply":"2024-06-19T03:26:25.666605Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"path = Path('/kaggle/input/playground-series-s4e6/')\npath","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:25.671321Z","iopub.execute_input":"2024-06-19T03:26:25.671793Z","iopub.status.idle":"2024-06-19T03:26:25.690485Z","shell.execute_reply.started":"2024-06-19T03:26:25.671752Z","shell.execute_reply":"2024-06-19T03:26:25.689128Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Import Datasets\n\nI use id as my index_col as it gives slightly better results.","metadata":{}},{"cell_type":"code","source":"train_df = pd.read_csv(path/'train.csv',index_col='id')\ntest_df = pd.read_csv(path/'test.csv',index_col='id')\nsub_df = pd.read_csv(path/'sample_submission.csv')\noriginal_df = 
pd.read_csv('/kaggle/input/academic-success-dataset/data.csv',delimiter=';')","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:25.691566Z","iopub.execute_input":"2024-06-19T03:26:25.691937Z","iopub.status.idle":"2024-06-19T03:26:26.470517Z","shell.execute_reply.started":"2024-06-19T03:26:25.691906Z","shell.execute_reply":"2024-06-19T03:26:26.469267Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# EDA","metadata":{}},{"cell_type":"code","source":"train_df.head()","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:26.471965Z","iopub.execute_input":"2024-06-19T03:26:26.472434Z","iopub.status.idle":"2024-06-19T03:26:26.507787Z","shell.execute_reply.started":"2024-06-19T03:26:26.472385Z","shell.execute_reply":"2024-06-19T03:26:26.506664Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_df.hist(figsize=(20,15));","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:26.509042Z","iopub.execute_input":"2024-06-19T03:26:26.50941Z","iopub.status.idle":"2024-06-19T03:26:33.920733Z","shell.execute_reply.started":"2024-06-19T03:26:26.509381Z","shell.execute_reply":"2024-06-19T03:26:33.919373Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_df['Target'].hist(figsize=(6,4));","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:33.922438Z","iopub.execute_input":"2024-06-19T03:26:33.922834Z","iopub.status.idle":"2024-06-19T03:26:34.237518Z","shell.execute_reply.started":"2024-06-19T03:26:33.922795Z","shell.execute_reply":"2024-06-19T03:26:34.236116Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Summary 
statistics","metadata":{}},{"cell_type":"code","source":"train_df.describe()","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:34.239206Z","iopub.execute_input":"2024-06-19T03:26:34.239678Z","iopub.status.idle":"2024-06-19T03:26:34.421065Z","shell.execute_reply.started":"2024-06-19T03:26:34.239642Z","shell.execute_reply":"2024-06-19T03:26:34.419873Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_df.describe(include=[object])","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:34.426192Z","iopub.execute_input":"2024-06-19T03:26:34.426706Z","iopub.status.idle":"2024-06-19T03:26:34.469868Z","shell.execute_reply.started":"2024-06-19T03:26:34.426658Z","shell.execute_reply":"2024-06-19T03:26:34.46847Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### Feature Encoding","metadata":{}},{"cell_type":"markdown","source":"Per the original feature explanations, I expect all the features to be integers with the only exceptions being Previous qualification (grade) and Admission grade which are grading scores between 0 and 200.\n\nUnemployment rate and Inflation rate are marked as percentages hence they can also be continous and floats.\n\nLet us check the feature encoding for our variables and see if it matches.\n\nWhen i run the cell below, i notice that the Curricular units 1st sem (grade) and Curricular units 2nd sem (grade) are also floats. 
Hence I believe these might warrant further investigation.\n\nThe curricular units (grade) features for the 1st and 2nd semester are the grade averages for the respective semester, which lie between 0 and 20.","metadata":{}},{"cell_type":"code","source":"train_df.info();","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:34.471414Z","iopub.execute_input":"2024-06-19T03:26:34.471833Z","iopub.status.idle":"2024-06-19T03:26:34.505767Z","shell.execute_reply.started":"2024-06-19T03:26:34.471801Z","shell.execute_reply":"2024-06-19T03:26:34.504482Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_df['Curricular units 2nd sem (grade)'].hist(bins=100,figsize=(20,15));","metadata":{"_kg_hide-output":true,"_kg_hide-input":true,"execution":{"iopub.status.busy":"2024-06-19T03:26:34.507075Z","iopub.execute_input":"2024-06-19T03:26:34.507428Z","iopub.status.idle":"2024-06-19T03:26:35.135596Z","shell.execute_reply.started":"2024-06-19T03:26:34.507399Z","shell.execute_reply":"2024-06-19T03:26:35.134402Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sns.boxplot(y='Curricular units 2nd sem (grade)', x='Target',data=train_df)\nplt.show()\nplt.close()\n","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:35.13707Z","iopub.execute_input":"2024-06-19T03:26:35.137468Z","iopub.status.idle":"2024-06-19T03:26:35.79939Z","shell.execute_reply.started":"2024-06-19T03:26:35.137437Z","shell.execute_reply":"2024-06-19T03:26:35.797606Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"I will try to explore the features with large deviations to check for possible outliers etc.","metadata":{}},{"cell_type":"markdown","source":"#### Course\n\n33 - Biofuel Production Technologies \n\n171 - Animation and Multimedia Design \n\n8014 - Social Service (evening attendance) \n\n9003 - Agronomy \n\n9070 - Communication Design \n\n9085 - Veterinary Nursing \n\n9119 - 
Informatics Engineering \n\n9130 - Equinculture \n\n9147 - Management \n\n9238 - Social Service \n\n9254 - Tourism \n\n9500 - Nursing \n\n9556 - Oral Hygiene \n\n9670 - Advertising and Marketing Management \n\n9773 - Journalism and Communication \n\n9853 - Basic Education \n\n9991 - Management (evening attendance)","metadata":{}},{"cell_type":"code","source":"train_df['Course'].hist(bins=100,figsize=(20,15));","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2024-06-19T03:26:35.80154Z","iopub.execute_input":"2024-06-19T03:26:35.802079Z","iopub.status.idle":"2024-06-19T03:26:36.38874Z","shell.execute_reply.started":"2024-06-19T03:26:35.802031Z","shell.execute_reply":"2024-06-19T03:26:36.387545Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"The training dataset consists of students enrolled in 19 different courses, including Nursing and Journalism.\n\nI also notice that there are 2 course codes, 979 and 39, that appear in the synthetic training dataset but not in the original dataset (the cells below check this). Each of them happens to have only 1 student.\n","metadata":{}},{"cell_type":"code","source":"train_df['Course'].nunique()","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:36.390449Z","iopub.execute_input":"2024-06-19T03:26:36.390984Z","iopub.status.idle":"2024-06-19T03:26:36.402239Z","shell.execute_reply.started":"2024-06-19T03:26:36.390943Z","shell.execute_reply":"2024-06-19T03:26:36.400876Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"original_df['Course'].nunique()","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:36.404023Z","iopub.execute_input":"2024-06-19T03:26:36.404563Z","iopub.status.idle":"2024-06-19T03:26:36.417322Z","shell.execute_reply.started":"2024-06-19T03:26:36.404467Z","shell.execute_reply":"2024-06-19T03:26:36.416129Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Find values in 
'Course' of train_df that are not in 'Course' of original_df\nmissing_courses = train_df[~train_df['Course'].isin(original_df['Course'])]\n\n# Get unique course names in train_df that are not in original_df\nunique_missing_courses = missing_courses['Course'].unique()\n\nprint(unique_missing_courses)\n","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:36.418738Z","iopub.execute_input":"2024-06-19T03:26:36.419104Z","iopub.status.idle":"2024-06-19T03:26:36.436097Z","shell.execute_reply.started":"2024-06-19T03:26:36.419075Z","shell.execute_reply":"2024-06-19T03:26:36.434453Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_df['Course'].value_counts()","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2024-06-19T03:26:36.437819Z","iopub.execute_input":"2024-06-19T03:26:36.438299Z","iopub.status.idle":"2024-06-19T03:26:36.451429Z","shell.execute_reply.started":"2024-06-19T03:26:36.438258Z","shell.execute_reply":"2024-06-19T03:26:36.450097Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Value count proportions","metadata":{}},{"cell_type":"code","source":"train_df['Course'].value_counts(normalize=True)","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2024-06-19T03:26:36.453003Z","iopub.execute_input":"2024-06-19T03:26:36.453384Z","iopub.status.idle":"2024-06-19T03:26:36.469421Z","shell.execute_reply.started":"2024-06-19T03:26:36.453353Z","shell.execute_reply":"2024-06-19T03:26:36.467969Z"},"trusted":true},"execution_count":null,"outputs":[]},
{"cell_type":"code","source":"#barplot\n# Create the figure before plotting so figsize applies to the countplot.\nplt.figure(figsize=(20, 15))\nsns.countplot(x='Course', data=train_df, palette='winter')\nplt.show();\n","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:36.470913Z","iopub.execute_input":"2024-06-19T03:26:36.471335Z","iopub.status.idle":"2024-06-19T03:26:36.837298Z","shell.execute_reply.started":"2024-06-19T03:26:36.471302Z","shell.execute_reply":"2024-06-19T03:26:36.836086Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"For better visualization, i try flipping the values of the axes in the countplot.","metadata":{}},{"cell_type":"code","source":"# Create the figure before plotting so figsize applies to the countplot.\nplt.figure(figsize=(15, 20))\nsns.countplot(y='Course', data=train_df, palette='winter')\nplt.show()\n","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:36.838794Z","iopub.execute_input":"2024-06-19T03:26:36.839184Z","iopub.status.idle":"2024-06-19T03:26:37.206564Z","shell.execute_reply.started":"2024-06-19T03:26:36.839128Z","shell.execute_reply":"2024-06-19T03:26:37.205123Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"I check to see if there is any relationship between the Course taken and the Target.","metadata":{}},{"cell_type":"code","source":"sns.countplot(x='Course', hue='Target', data=train_df)\nplt.show();","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:37.208461Z","iopub.execute_input":"2024-06-19T03:26:37.208846Z","iopub.status.idle":"2024-06-19T03:26:37.8216Z","shell.execute_reply.started":"2024-06-19T03:26:37.208816Z","shell.execute_reply":"2024-06-19T03:26:37.82035Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Create the figure before plotting so figsize applies to the countplot.\nplt.figure(figsize=(15, 20))\nsns.countplot(y='Course', hue='Target', data=train_df)
\nplt.show();","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:37.823315Z","iopub.execute_input":"2024-06-19T03:26:37.823767Z","iopub.status.idle":"2024-06-19T03:26:38.420898Z","shell.execute_reply.started":"2024-06-19T03:26:37.823728Z","shell.execute_reply":"2024-06-19T03:26:38.419544Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sns.boxplot(x = \"Target\", y = \"Course\", palette = \"pastel\", data = train_df)\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:38.422744Z","iopub.execute_input":"2024-06-19T03:26:38.423257Z","iopub.status.idle":"2024-06-19T03:26:38.74922Z","shell.execute_reply.started":"2024-06-19T03:26:38.423212Z","shell.execute_reply":"2024-06-19T03:26:38.747968Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Baseline\n\nPreviously, i had built a baseline model using AutoML solution AutoGluon without presets, this gave me an initial submission score of 0.83434.Find the notebook [here](https://www.kaggle.com/code/rubanzasilva/autogluon-starter).\n\nIn this notebook, i test out different model architectures and data transformation to try to improve on the baseline score.","metadata":{}},{"cell_type":"markdown","source":"# Without original dataset\n\nFirst i shall try out the models using only the data initially provided to us, without the original dataset.\n\nBelow i use the fastai cont_cat_split function to separate my dataset variables into categorical and continous variables.\n\ncont_cat_split is a fastai helper function which returns the values of a dataframe as either categorical ot continuos based on the cardinality of its values. The function takes an argument of max_card whose default is 20. 
If the number of unique values is above the max_card, then that variables is considered to be continuos and vice versa.\n\nThe column specified as the dep_var is skipped during this cont_cat_split.\n\n\n","metadata":{}},{"cell_type":"code","source":"cont_names,cat_names = cont_cat_split(train_df, dep_var='Target')","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:38.750526Z","iopub.execute_input":"2024-06-19T03:26:38.750942Z","iopub.status.idle":"2024-06-19T03:26:38.774936Z","shell.execute_reply.started":"2024-06-19T03:26:38.750908Z","shell.execute_reply":"2024-06-19T03:26:38.773605Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"len(cat_names),len(cont_names)","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:38.776626Z","iopub.execute_input":"2024-06-19T03:26:38.776995Z","iopub.status.idle":"2024-06-19T03:26:38.784409Z","shell.execute_reply.started":"2024-06-19T03:26:38.776964Z","shell.execute_reply":"2024-06-19T03:26:38.78326Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"I then use the fastai data transformation function *randomsplitter* which randomly splits my dataset into a training and validation set based of the value of *valid_pct*.","metadata":{}},{"cell_type":"code","source":"splits = RandomSplitter(valid_pct=0.2)(range_of(train_df))","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:38.785913Z","iopub.execute_input":"2024-06-19T03:26:38.786299Z","iopub.status.idle":"2024-06-19T03:26:38.845832Z","shell.execute_reply.started":"2024-06-19T03:26:38.786268Z","shell.execute_reply":"2024-06-19T03:26:38.844723Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"I use fastai's TabularPandas class to create a TabularPandas object that applies preprocessing steps to our data, creating a data frame wrapper that takes in different arguments and knows which columns are categorical and continuous. 
I also define the target variable, y_name, the type of target and problem we are dealing with, and the way to split our data which previously defined in splits above.\n\nI then define a list of preprocessing steps, Procs, to be taken on our data. \n\nCategorify deals with the categorical variables and converts each category into a list of indexable numerical integers, creating numerical input which is required by our model.\nEach category corresponds to a different number.\n\nFillMissing as its name suggests, fills in the missing values in columns with continuous values. This can be filled in with the median, mode of that column, or a constant, with the default being the median value for that particular column.\nAs said above, FillMissing supports using the mode and a constant as strategies for dealing with missing values. We can do this by changing the FillMissing argument fill_strategy to mode or constant.\n\nNormalize puts the continuous variables between a standardized scale without losing important information by subtracting the mean and dividing by the standard deviation.\n\n","metadata":{}},{"cell_type":"code","source":"to = TabularPandas(train_df, procs=[Categorify, FillMissing,Normalize],\n cat_names = cat_names,\n cont_names = cont_names,\n y_names='Target',\n y_block=CategoryBlock(),\n splits=splits)","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:38.852555Z","iopub.execute_input":"2024-06-19T03:26:38.852964Z","iopub.status.idle":"2024-06-19T03:26:39.029038Z","shell.execute_reply.started":"2024-06-19T03:26:38.852932Z","shell.execute_reply":"2024-06-19T03:26:39.027784Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"We can now take a look at our TabularPandas object with the above transforms applied 
below","metadata":{}},{"cell_type":"code","source":"to.xs.iloc[:2]","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:39.030566Z","iopub.execute_input":"2024-06-19T03:26:39.030992Z","iopub.status.idle":"2024-06-19T03:26:39.069037Z","shell.execute_reply.started":"2024-06-19T03:26:39.030958Z","shell.execute_reply":"2024-06-19T03:26:39.067755Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"I then define a DataLoader which is an extension of PyTorch's DataLoaders class albeit with more functionality.\nThis takes in our data above from the TabularPandas object and prepares it as input for our model passing it in batches which we defined by our our batch size set by the bs argument.\n\nThe DataLoaders and TabularPandas Object allow us build data objects we can use for training without specifically changing the raw input data.\n\nThe dataloader then acts as input for our models.","metadata":{}},{"cell_type":"code","source":"dls = to.dataloaders(bs=64)\ntest_dl = dls.test_dl(test_df)","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:39.070554Z","iopub.execute_input":"2024-06-19T03:26:39.070996Z","iopub.status.idle":"2024-06-19T03:26:39.211317Z","shell.execute_reply.started":"2024-06-19T03:26:39.070956Z","shell.execute_reply":"2024-06-19T03:26:39.210013Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"In order to use other libraries with fastai, I extract the x's and y's from my TabularPandas object which i used to preprocess the 
data.\n","metadata":{}},{"cell_type":"code","source":"to.xs[:3]","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:26:39.21255Z","iopub.execute_input":"2024-06-19T03:26:39.212892Z","iopub.status.idle":"2024-06-19T03:26:39.251118Z","shell.execute_reply.started":"2024-06-19T03:26:39.212862Z","shell.execute_reply":"2024-06-19T03:26:39.249876Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"X_train, y_train = to.train.xs, to.train.ys.values.ravel()\nX_test, y_test = to.valid.xs, to.valid.ys.values.ravel()\n","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:40:07.205927Z","iopub.execute_input":"2024-06-19T03:40:07.206418Z","iopub.status.idle":"2024-06-19T03:40:07.262468Z","shell.execute_reply.started":"2024-06-19T03:40:07.206383Z","shell.execute_reply":"2024-06-19T03:40:07.261219Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"I can now directly use the training and validation set values i extracted above as direct input for decision tress and gradient boosting models as we shall see below.","metadata":{}},{"cell_type":"markdown","source":"# Parameter Optimization With Optuna\n\nI improve my model performance by optimizing my model hyperparameters further. 
To do this, i employ optuna which is a hyperparameter optimization framework that automates hyperparameter search.","metadata":{}},{"cell_type":"markdown","source":"#### LGBM","metadata":{}},{"cell_type":"code","source":"%%time\ndef objective(trial):\n params = {\n 'num_leaves': trial.suggest_int('num_leaves', 100, 500),\n 'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),\n 'n_estimators': trial.suggest_int('n_estimators', 300, 1200),\n 'subsample_for_bin': trial.suggest_int('subsample_for_bin', 20000, 300000),\n 'min_child_samples': trial.suggest_int('min_child_samples', 20, 500),\n 'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),\n 'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),\n 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),\n 'subsample': trial.suggest_float('subsample', 0.25, 1.0),\n 'max_depth': trial.suggest_int('max_depth', 1, 15)\n }\n \n model = LGBMClassifier(**params, objective='multiclass', random_state=0, device='cpu', verbosity=-1)\n \n # Cross-validation with 5 folds using KFold\n kf = KFold(n_splits=5, shuffle=True, random_state=42)\n cv_results = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')\n \n # We maximize accuracy, so we return the mean accuracy of the cross-validation\n return np.mean(cv_results)\n\nstudy = optuna.create_study(sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=0), direction=\"maximize\")\nstudy.optimize(objective, n_trials=100)\n\nprint('Best value:', study.best_value)\nprint('Best trial:', 
study.best_trial.params)\n","metadata":{"execution":{"iopub.status.busy":"2024-06-18T09:32:25.691097Z","iopub.execute_input":"2024-06-18T09:32:25.691524Z","iopub.status.idle":"2024-06-18T13:09:51.450502Z","shell.execute_reply.started":"2024-06-18T09:32:25.691492Z","shell.execute_reply":"2024-06-18T13:09:51.448557Z"},"scrolled":true,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print('Best value:', study.best_value)\nprint('Best trial:', study.best_trial.params)","metadata":{"execution":{"iopub.status.busy":"2024-06-18T13:09:51.453291Z","iopub.execute_input":"2024-06-18T13:09:51.453698Z","iopub.status.idle":"2024-06-18T13:09:51.460886Z","shell.execute_reply.started":"2024-06-18T13:09:51.453653Z","shell.execute_reply":"2024-06-18T13:09:51.459437Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### XGBoost","metadata":{}},
{"cell_type":"code","source":"%%time\ndef objective(trial):\n    \"\"\"Optuna objective for XGBoost: returns the mean 5-fold CV accuracy on X_train/y_train.\n\n    The search space uses XGBClassifier's own parameter names; the LightGBM-only\n    names sampled before (num_leaves, subsample_for_bin, min_child_samples) are\n    replaced with XGBoost's min_child_weight and gamma.\n    \"\"\"\n    params = {\n        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),\n        'n_estimators': trial.suggest_int('n_estimators', 300, 1200),\n        'max_depth': trial.suggest_int('max_depth', 1, 15),\n        # Minimum sum of instance weight needed in a child (XGBoost's leaf-size control)\n        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 100.0, log=True),\n        # Minimum loss reduction required to make a further split\n        'gamma': trial.suggest_float('gamma', 1e-9, 10.0, log=True),\n        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 10.0, log=True),\n        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 10.0, log=True),\n        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),\n        'subsample': trial.suggest_float('subsample', 0.25, 1.0)\n    }\n\n    model = xgb.XGBClassifier(**params, random_state=0)\n\n    # Cross-validation with 5 folds using KFold\n    kf = KFold(n_splits=5, shuffle=True, random_state=42)\n    cv_results = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy')\n\n    # We maximize accuracy, so we return the mean accuracy of the cross-validation\n    return np.mean(cv_results)\n\nstudy = optuna.create_study(sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=0), direction=\"maximize\")\nstudy.optimize(objective, n_trials=100)\n\nprint('Best value:', study.best_value)\nprint('Best trial:', study.best_trial.params)\n","metadata":{"scrolled":true,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"raw","source":"print('Best value:', study.best_value)\nprint('Best trial:', study.best_trial.params)","metadata":{"execution":{"iopub.status.busy":"2024-06-18T20:00:13.687698Z","iopub.status.idle":"2024-06-18T20:00:13.688119Z","shell.execute_reply.started":"2024-06-18T20:00:13.687921Z","shell.execute_reply":"2024-06-18T20:00:13.687939Z"}}},{"cell_type":"markdown","source":"#### CatBoost","metadata":{}},{"cell_type":"markdown","source":"# Trying out different model architectures.\n\nHere i start with decision trees specifically random forests, then i try out gradient boosting models catboost, xgboost and light GBMs.\n\nLater on i try out neural networks and an ensemble of various neural networks using the fastai library.","metadata":{}},{"cell_type":"markdown","source":"# Random Forests","metadata":{}},{"cell_type":"code","source":"%%time\nrf = RandomForestClassifier(100, min_samples_leaf=3)\nrf_model = rf.fit(X_train, y_train);\n\nrf_preds = tensor(rf_model.predict(test_dl.xs))\n\nrf_preds_x = tensor(rf_model.predict(X_test))\n\n#mse = mean_absolute_error(y_test, rf_preds_x)\n#rmse = np.sqrt(mse)\n\naccuracy_score(y_test,rf_preds_x)","metadata":{"execution":{"iopub.status.busy":"2024-06-17T20:54:26.68459Z","iopub.execute_input":"2024-06-17T20:54:26.685199Z","iopub.status.idle":"2024-06-17T20:54:45.41037Z","shell.execute_reply.started":"2024-06-17T20:54:26.68515Z","shell.execute_reply":"2024-06-17T20:54:45.408737Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Feature Importance\n\nI Looking into sklearns feature_importances_ attribute to see the rate 
at which particular columns influenced the model's predictions.\n\nThis gives us a picture into the model's decision-making process.","metadata":{}},{"cell_type":"code","source":"def rf_feat_importance(m, df):\n return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}\n ).sort_values('imp', ascending=False)","metadata":{"execution":{"iopub.status.busy":"2024-06-17T20:54:45.41351Z","iopub.execute_input":"2024-06-17T20:54:45.414024Z","iopub.status.idle":"2024-06-17T20:54:45.421828Z","shell.execute_reply.started":"2024-06-17T20:54:45.413979Z","shell.execute_reply":"2024-06-17T20:54:45.420084Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fi = rf_feat_importance(rf_model, X_train)\n#fi[:10]\n\nfi","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2024-06-17T20:54:45.424171Z","iopub.execute_input":"2024-06-17T20:54:45.424883Z","iopub.status.idle":"2024-06-17T20:54:45.475427Z","shell.execute_reply.started":"2024-06-17T20:54:45.424834Z","shell.execute_reply":"2024-06-17T20:54:45.474056Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"This shows us that the first couple of features influence the model's predictions far more than any other features.\n\nBelow I visualize the same feature importances in a bar chart.\n","metadata":{}},{"cell_type":"code","source":"def plot_fi(fi):\n return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)\n\n#plot_fi(fi[:30]);\nplot_fi(fi);\n ","metadata":{"execution":{"iopub.status.busy":"2024-06-17T20:54:45.477956Z","iopub.execute_input":"2024-06-17T20:54:45.478343Z","iopub.status.idle":"2024-06-17T20:54:46.126682Z","shell.execute_reply.started":"2024-06-17T20:54:45.47831Z","shell.execute_reply":"2024-06-17T20:54:46.125221Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Cat Boost","metadata":{}},{"cell_type":"code","source":"cat_optuna_params = {\n \n 'colsample_bylevel': 
0.6383474716497279,\n 'learning_rate': 0.09475494290429642,\n 'random_strength': 0.07771221926568195,\n 'max_bin': 490,\n 'depth': 5,\n 'l2_leaf_reg': 5,\n 'boosting_type': 'Plain',\n 'bootstrap_type': 'Bernoulli',\n 'subsample': 0.8429457747642737\n \n}","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:47:47.037629Z","iopub.execute_input":"2024-06-19T03:47:47.038102Z","iopub.status.idle":"2024-06-19T03:47:47.04869Z","shell.execute_reply.started":"2024-06-19T03:47:47.038068Z","shell.execute_reply":"2024-06-19T03:47:47.047406Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%time\ncat_model = CatBoostClassifier(**cat_optuna_params)\ncat_model = cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)\n\n#test set preds\ncat_preds = tensor(cat_model.predict(test_dl.xs))\n\n\ncat_preds_final = cat_preds.squeeze(1)\n\n#validation set preds\ncat_preds_x = tensor(cat_model.predict(X_test))\n\ncat_preds_x_final = cat_preds_x.squeeze(1)\n\naccuracy_score(y_test,cat_preds_x)","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:48:03.695686Z","iopub.execute_input":"2024-06-19T03:48:03.696857Z","iopub.status.idle":"2024-06-19T03:48:41.860303Z","shell.execute_reply.started":"2024-06-19T03:48:03.69679Z","shell.execute_reply":"2024-06-19T03:48:41.85901Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#cat_preds_a = tensor(cat_model.predict(test_df))\n#cat_preds_a.shape,cat_preds_final.shape,cat_preds_x_final.shape\n#cat_preds_a_final = 
cat_preds_a.squeeze(1)\n#cat_preds_a_final","metadata":{"execution":{"iopub.status.busy":"2024-06-17T21:01:02.414791Z","iopub.execute_input":"2024-06-17T21:01:02.415255Z","iopub.status.idle":"2024-06-17T21:01:02.521864Z","shell.execute_reply.started":"2024-06-17T21:01:02.415217Z","shell.execute_reply":"2024-06-17T21:01:02.520738Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"cat_preds_final","metadata":{"execution":{"iopub.status.busy":"2024-06-17T21:01:02.561777Z","iopub.execute_input":"2024-06-17T21:01:02.562201Z","iopub.status.idle":"2024-06-17T21:01:02.570697Z","shell.execute_reply.started":"2024-06-17T21:01:02.562168Z","shell.execute_reply":"2024-06-17T21:01:02.569344Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# XGBoost","metadata":{}},{"cell_type":"code","source":"xgb_optuna_params = {\n 'n_estimators': 929,\n 'alpha': 2.287466581490129e-05,\n 'subsample': 0.8766675651018592,\n 'colsample_bytree': 0.288332829334817,\n 'max_depth': 8,\n 'min_child_weight': 6,\n 'learning_rate': 0.024083411832750343,\n 'gamma': 0.001816649055813574\n}","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:40:21.697171Z","iopub.execute_input":"2024-06-19T03:40:21.697712Z","iopub.status.idle":"2024-06-19T03:40:21.705481Z","shell.execute_reply.started":"2024-06-19T03:40:21.697671Z","shell.execute_reply":"2024-06-19T03:40:21.703981Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"xgb_model = xgb.XGBClassifier(**xgb_optuna_params)\nxgb_model = xgb_model.fit(X_train, y_train)\n\nxgb_preds = tensor(xgb_model.predict(test_dl.xs))\n\nxgb_preds_x = 
tensor(xgb_model.predict(X_test))\n\naccuracy_score(y_test,xgb_preds_x)","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:40:29.288261Z","iopub.execute_input":"2024-06-19T03:40:29.28879Z","iopub.status.idle":"2024-06-19T03:41:29.152994Z","shell.execute_reply.started":"2024-06-19T03:40:29.288746Z","shell.execute_reply":"2024-06-19T03:41:29.152105Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# LGBM","metadata":{}},{"cell_type":"code","source":"lgb_optuna_params = {\n 'num_leaves': 485, \n 'learning_rate': 0.016388605840878773, \n 'n_estimators': 1064, \n 'subsample_for_bin': 106545, \n 'min_child_samples': 382, \n 'reg_alpha': 4.355786683676367e-05, \n 'reg_lambda': 0.12174022484031638, \n 'colsample_bytree': 0.44894475300776, \n 'subsample': 0.735046656897411, \n 'max_depth': 8\n}","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:44:56.15439Z","iopub.execute_input":"2024-06-19T03:44:56.154837Z","iopub.status.idle":"2024-06-19T03:44:56.162552Z","shell.execute_reply.started":"2024-06-19T03:44:56.154804Z","shell.execute_reply":"2024-06-19T03:44:56.160888Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"lgb_model = lgb.LGBMClassifier(**lgb_optuna_params)\nlgb_model = lgb_model.fit(X_train, y_train)\n\n#test set preds\nlgb_preds = tensor(lgb_model.predict(test_dl.xs))\n\n#validation set preds\nlgb_preds_x = tensor(lgb_model.predict(X_test))\n\nlgb_score = 
accuracy_score(y_test,lgb_preds_x)\nlgb_score","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2024-06-19T03:45:04.163831Z","iopub.execute_input":"2024-06-19T03:45:04.164355Z","iopub.status.idle":"2024-06-19T03:46:01.150513Z","shell.execute_reply.started":"2024-06-19T03:45:04.164315Z","shell.execute_reply":"2024-06-19T03:46:01.149217Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"lgb_score","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:46:01.152769Z","iopub.execute_input":"2024-06-19T03:46:01.153183Z","iopub.status.idle":"2024-06-19T03:46:01.161739Z","shell.execute_reply.started":"2024-06-19T03:46:01.153134Z","shell.execute_reply":"2024-06-19T03:46:01.16012Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model_preds = {\n \"random forests\":accuracy_score(y_test,rf_preds_x),\n \"cat boost\":accuracy_score(y_test,cat_preds_x),\n \"lgbm\":lgb_score,\n \"xgboost\":accuracy_score(y_test,xgb_preds_x), \n}\n\n#model_preds_a = model_preds.sort()\nprint(model_preds)","metadata":{"execution":{"iopub.status.busy":"2024-06-17T21:02:57.922567Z","iopub.execute_input":"2024-06-17T21:02:57.923381Z","iopub.status.idle":"2024-06-17T21:02:57.936934Z","shell.execute_reply.started":"2024-06-17T21:02:57.923341Z","shell.execute_reply":"2024-06-17T21:02:57.935774Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"{'random forests': 0.826439260275763, 'cat boost': 0.8316016467359342, 'lgbm': 0.8306214467751422, 'xgboost': 0.8299679801346141}","metadata":{}},{"cell_type":"code","source":"mapping = 
dict(enumerate(dls.vocab))\nmapping","metadata":{"execution":{"iopub.status.busy":"2024-06-13T04:04:39.102712Z","iopub.execute_input":"2024-06-13T04:04:39.103477Z","iopub.status.idle":"2024-06-13T04:04:39.116736Z","shell.execute_reply.started":"2024-06-13T04:04:39.103449Z","shell.execute_reply":"2024-06-13T04:04:39.115779Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Neural Network","metadata":{"id":"dYNkzV6z_8JO"}},{"cell_type":"code","source":"learn = tabular_learner(dls, metrics=accuracy)\nlearn.lr_find(suggest_funcs=(slide,valley))","metadata":{"id":"X0LRKpxq_8JO","execution":{"iopub.status.busy":"2024-06-10T16:31:41.166644Z","iopub.execute_input":"2024-06-10T16:31:41.167152Z","iopub.status.idle":"2024-06-10T16:31:43.768408Z","shell.execute_reply.started":"2024-06-10T16:31:41.167108Z","shell.execute_reply":"2024-06-10T16:31:43.766548Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%time\nlearn.fit_one_cycle(20,0.02)","metadata":{"id":"igxsaxjb_8JU","execution":{"iopub.status.busy":"2024-06-10T16:31:43.771644Z","iopub.execute_input":"2024-06-10T16:31:43.77216Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"dl = learn.dls.test_dl(test_df)","metadata":{"id":"xpiUqiLF_8JV","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%time\nnn_preds = learn.get_preds(dl=dl)\nnn_preds_x = learn.get_preds()[0]\na_preds, _ = learn.get_preds(dl=dl)\nnn_preds_y = a_preds.squeeze(1)","metadata":{"id":"ASXsRveD_8JW","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"cat_preds_final = 
cat_preds.squeeze(1)","metadata":{"execution":{"iopub.status.busy":"2024-06-10T09:36:44.35509Z","iopub.execute_input":"2024-06-10T09:36:44.355624Z","iopub.status.idle":"2024-06-10T09:36:44.362535Z","shell.execute_reply.started":"2024-06-10T09:36:44.355582Z","shell.execute_reply":"2024-06-10T09:36:44.360721Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# get_preds returns a (preds, targets) tuple, so index the predictions before asking for a shape\nnn_preds_x.shape,nn_preds[0].shape,nn_preds_y.shape","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"!ls","metadata":{"execution":{"iopub.status.busy":"2024-06-10T16:17:03.466047Z","iopub.execute_input":"2024-06-10T16:17:03.467314Z","iopub.status.idle":"2024-06-10T16:17:04.594003Z","shell.execute_reply.started":"2024-06-10T16:17:03.467255Z","shell.execute_reply":"2024-06-10T16:17:04.592423Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Individual Model Submission\n\nBelow I try submitting any one of the models trained above. To prepare a file for submission, I take the predictions from the chosen model (for example lgb_preds, cat_preds_final or xgb_preds) and pass them into the predicted_labels variable below.\n","metadata":{}},{"cell_type":"code","source":"!rm submission.csv","metadata":{"execution":{"iopub.status.busy":"2024-06-10T09:41:09.743546Z","iopub.execute_input":"2024-06-10T09:41:09.745021Z","iopub.status.idle":"2024-06-10T09:41:10.803569Z","shell.execute_reply.started":"2024-06-10T09:41:09.74496Z","shell.execute_reply":"2024-06-10T09:41:10.801056Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"mapping = dict(enumerate(dls.vocab))\npredicted_labels = [mapping[value.item()] for value in cat_preds_final]\nsubmit = pd.read_csv(path/'sample_submission.csv')\nsubmit.Target = 
predicted_labels\nsubmit.to_csv('submission.csv',index=False)\nsubmit","metadata":{"id":"KB9WcsyA_8Jk","execution":{"iopub.status.busy":"2024-06-10T09:46:44.413471Z","iopub.execute_input":"2024-06-10T09:46:44.414025Z","iopub.status.idle":"2024-06-10T09:46:44.641955Z","shell.execute_reply.started":"2024-06-10T09:46:44.41398Z","shell.execute_reply":"2024-06-10T09:46:44.640504Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Model Ensemble\n\nFor testing with our accuracy_score metric, we use the validation-set (`_x`) predictions: lgb_preds_x, xgb_preds_x, rf_preds_x, and cat_preds_x_final.\n\nFor submission, we use lgb_preds, xgb_preds, rf_preds, and cat_preds_final, as these are the results of running the models on the initial test set and give us the same shape as our expected submission.","metadata":{}},{"cell_type":"markdown","source":"#### For testing\n","metadata":{}},{"cell_type":"code","source":"cat_preds_x_final = cat_preds_x.squeeze(1)","metadata":{"execution":{"iopub.status.busy":"2024-06-10T16:15:19.950821Z","iopub.execute_input":"2024-06-10T16:15:19.951403Z","iopub.status.idle":"2024-06-10T16:15:19.960013Z","shell.execute_reply.started":"2024-06-10T16:15:19.951357Z","shell.execute_reply":"2024-06-10T16:15:19.957736Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"rf_preds_x.shape,cat_preds_x_final.shape,lgb_preds_x.shape,xgb_preds_x.shape","metadata":{"execution":{"iopub.status.busy":"2024-06-10T16:15:26.089087Z","iopub.execute_input":"2024-06-10T16:15:26.089732Z","iopub.status.idle":"2024-06-10T16:15:26.10064Z","shell.execute_reply.started":"2024-06-10T16:15:26.089683Z","shell.execute_reply":"2024-06-10T16:15:26.098748Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"general_preds = (rf_preds_x + cat_preds_x_final + lgb_preds_x + 
xgb_preds_x)/4\ngeneral_preds","metadata":{"execution":{"iopub.status.busy":"2024-06-10T16:25:42.124608Z","iopub.execute_input":"2024-06-10T16:25:42.125171Z","iopub.status.idle":"2024-06-10T16:25:42.178091Z","shell.execute_reply.started":"2024-06-10T16:25:42.125115Z","shell.execute_reply":"2024-06-10T16:25:42.176645Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"rf_preds_x.shape,cat_preds_x_final.shape,lgb_preds_x.shape,xgb_preds_x.shape,general_preds.shape","metadata":{"execution":{"iopub.status.busy":"2024-06-10T16:16:57.604151Z","iopub.execute_input":"2024-06-10T16:16:57.605367Z","iopub.status.idle":"2024-06-10T16:16:57.614658Z","shell.execute_reply.started":"2024-06-10T16:16:57.605315Z","shell.execute_reply":"2024-06-10T16:16:57.613223Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# general_preds holds averaged (fractional) class indices; accuracy_score needs discrete labels,\n# so round to the nearest class index first (same rule the submission cell uses)\naccuracy_score(y_test,torch.round(general_preds).long())","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"#### For Submission","metadata":{}},{"cell_type":"code","source":"rf_preds.shape,cat_preds_final.shape,lgb_preds.shape,xgb_preds.shape","metadata":{"execution":{"iopub.status.busy":"2024-06-10T16:25:07.551957Z","iopub.execute_input":"2024-06-10T16:25:07.552656Z","iopub.status.idle":"2024-06-10T16:25:07.563321Z","shell.execute_reply.started":"2024-06-10T16:25:07.552612Z","shell.execute_reply":"2024-06-10T16:25:07.561836Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"general_preds_sub = ( cat_preds_final + lgb_preds + 
xgb_preds)/3 # three models are summed here, so average over 3 (dividing by 4 skews the rounded class index downward)\ngeneral_preds_sub","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:51:08.422209Z","iopub.execute_input":"2024-06-19T03:51:08.422699Z","iopub.status.idle":"2024-06-19T03:51:08.491013Z","shell.execute_reply.started":"2024-06-19T03:51:08.422667Z","shell.execute_reply":"2024-06-19T03:51:08.489759Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"rf_preds.shape,cat_preds_final.shape,lgb_preds.shape,xgb_preds.shape,general_preds_sub.shape","metadata":{"execution":{"iopub.status.busy":"2024-06-13T04:04:39.173315Z","iopub.execute_input":"2024-06-13T04:04:39.173668Z","iopub.status.idle":"2024-06-13T04:04:39.179995Z","shell.execute_reply.started":"2024-06-13T04:04:39.173635Z","shell.execute_reply":"2024-06-13T04:04:39.179009Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"mapping = dict(enumerate(dls.vocab))\n# averaged predictions are fractional, so round to a valid class index before looking up the label\npredicted_labels = [mapping[value.item()] for value in torch.round(general_preds_sub).long()]","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"mapping = dict(enumerate(dls.vocab))\n#predicted_labels = [mapping[value.item()] for value in general_preds_sub]\n# Assuming general_preds_sub contains the averaged predictions from multiple models\nrounded_preds = torch.round(general_preds_sub).long() # Round to nearest whole number and convert to long type\n\n# Now map the rounded predictions to labels\npredicted_labels = [mapping[value.item()] for value in rounded_preds]\nsubmit = pd.read_csv(path/'sample_submission.csv')\nsubmit.Target = 
predicted_labels\nsubmit.to_csv('submission.csv',index=False)\nsubmit","metadata":{"execution":{"iopub.status.busy":"2024-06-19T03:51:21.961869Z","iopub.execute_input":"2024-06-19T03:51:21.962328Z","iopub.status.idle":"2024-06-19T03:51:22.221033Z","shell.execute_reply.started":"2024-06-19T03:51:21.962294Z","shell.execute_reply":"2024-06-19T03:51:22.21714Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"!ls","metadata":{"execution":{"iopub.status.busy":"2024-06-19T04:23:33.772595Z","iopub.execute_input":"2024-06-19T04:23:33.773035Z","iopub.status.idle":"2024-06-19T04:23:34.936097Z","shell.execute_reply.started":"2024-06-19T04:23:33.772994Z","shell.execute_reply":"2024-06-19T04:23:34.934646Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Voting","metadata":{}},{"cell_type":"code","source":"from sklearn.ensemble import VotingClassifier\nfrom sklearn.metrics import accuracy_score\n","metadata":{"execution":{"iopub.status.busy":"2024-06-14T04:14:57.856015Z","iopub.execute_input":"2024-06-14T04:14:57.856395Z","iopub.status.idle":"2024-06-14T04:14:57.862298Z","shell.execute_reply.started":"2024-06-14T04:14:57.856365Z","shell.execute_reply":"2024-06-14T04:14:57.861306Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Assuming rf_model and xgb_model are already fitted models\n# Wrap them in a VotingClassifier\nvoting_clf = VotingClassifier(estimators=[\n ('rf', rf_model), # Replace rf_model with the actual RandomForestClassifier instance\n ('xgb', xgb_model), # Replace xgb_model with the actual XGBoostClassifier instance\n # ('cat', cat_model), # Replace rf_model with the actual RandomForestClassifier instance\n ('lgb', lgb_model)\n], voting='hard')\n\n# Now, you can use the VotingClassifier to make predictions\n# Note: The VotingClassifier expects scikit-learn compatible inputs\n# So, you'll need to convert your PyTorch tensors to NumPy arrays or Pandas 
DataFrames\n# Here's an example assuming X_test is your test data in a PyTorch tensor format\n#X_test_np = X_test.numpy() # Convert PyTorch tensor to NumPy array\n\nvoting_clf.fit(X_train, y_train)\n\npredictions = voting_clf.predict(X_test)\n\n# Calculate the accuracy score\naccuracy = accuracy_score(y_test, predictions)\nprint(f\"Accuracy: {accuracy:.6f}\")\n","metadata":{"scrolled":true,"execution":{"iopub.status.busy":"2024-06-14T04:28:24.534681Z","iopub.execute_input":"2024-06-14T04:28:24.53568Z","iopub.status.idle":"2024-06-14T04:32:30.148141Z","shell.execute_reply.started":"2024-06-14T04:28:24.535636Z","shell.execute_reply":"2024-06-14T04:32:30.147187Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Calculate the accuracy score\naccuracy = accuracy_score(y_test, predictions)\nprint(f\"Accuracy: {accuracy:.6f}\")","metadata":{"execution":{"iopub.status.busy":"2024-06-14T04:32:30.149957Z","iopub.execute_input":"2024-06-14T04:32:30.150238Z","iopub.status.idle":"2024-06-14T04:32:30.156331Z","shell.execute_reply.started":"2024-06-14T04:32:30.150214Z","shell.execute_reply":"2024-06-14T04:32:30.155293Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Adding original dataset","metadata":{}},{"cell_type":"code","source":"original_df = pd.read_csv('/kaggle/input/academic-success-dataset/data.csv', delimiter=';')","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:07:03.081469Z","iopub.execute_input":"2024-06-11T10:07:03.081854Z","iopub.status.idle":"2024-06-11T10:07:03.107928Z","shell.execute_reply.started":"2024-06-11T10:07:03.08182Z","shell.execute_reply":"2024-06-11T10:07:03.106503Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"original_df.rename(columns={'Daytime/evening attendance\\t':'Daytime/evening attendance'}, 
inplace=True)","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:07:03.948777Z","iopub.execute_input":"2024-06-11T10:07:03.949216Z","iopub.status.idle":"2024-06-11T10:07:03.955727Z","shell.execute_reply.started":"2024-06-11T10:07:03.949179Z","shell.execute_reply":"2024-06-11T10:07:03.954382Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_final = pd.concat([train_df,original_df], axis=0)","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:08:36.944802Z","iopub.execute_input":"2024-06-11T10:08:36.945481Z","iopub.status.idle":"2024-06-11T10:08:36.967207Z","shell.execute_reply.started":"2024-06-11T10:08:36.945432Z","shell.execute_reply":"2024-06-11T10:08:36.966023Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"cont_names,cat_names = cont_cat_split(train_final, dep_var='Target')\nsplits = RandomSplitter(valid_pct=0.2)(range_of(train_final))\nto = TabularPandas(train_final, procs=[Categorify, FillMissing,Normalize],\n cat_names = cat_names,\n cont_names = cont_names,\n y_names='Target',\n y_block=CategoryBlock(),\n splits=splits)\n\nX_train, y_train = to.train.xs, to.train.ys.values.ravel()\nX_test, y_test = to.valid.xs, to.valid.ys.values.ravel()\n\ndls = to.dataloaders(bs=64)\n#test_dl = dls.test_dl(test_df)","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:08:37.29312Z","iopub.execute_input":"2024-06-11T10:08:37.293538Z","iopub.status.idle":"2024-06-11T10:08:37.651777Z","shell.execute_reply.started":"2024-06-11T10:08:37.293503Z","shell.execute_reply":"2024-06-11T10:08:37.650156Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_dl = 
dls.test_dl(test_df)","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:08:37.653845Z","iopub.execute_input":"2024-06-11T10:08:37.654232Z","iopub.status.idle":"2024-06-11T10:08:37.721786Z","shell.execute_reply.started":"2024-06-11T10:08:37.654199Z","shell.execute_reply":"2024-06-11T10:08:37.720574Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%time\nrf = RandomForestClassifier(100, min_samples_leaf=3)\nrf_model = rf.fit(X_train, y_train);\n\nrf_preds = tensor(rf_model.predict(test_dl.xs))\n\nrf_preds_x = tensor(rf_model.predict(X_test))\n\nmse = mean_absolute_error(y_test, rf_preds_x)\nrmse = np.sqrt(mse)\n\naccuracy_score(y_test,rf_preds_x)","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:08:51.760974Z","iopub.execute_input":"2024-06-11T10:08:51.761442Z","iopub.status.idle":"2024-06-11T10:09:11.432661Z","shell.execute_reply.started":"2024-06-11T10:08:51.761406Z","shell.execute_reply":"2024-06-11T10:09:11.431259Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"{'random forests': 0.8236293537214925, 'cat boost': 0.8265699536038685, 'lgbm': 0.8253283669868653, 'xgboost': 0.8268313402600798}","metadata":{}},{"cell_type":"code","source":"%%time\ncat_model = CatBoostClassifier(iterations=2000, depth=8, learning_rate= 0.08, random_strength=10)\ncat_model = cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)\n\n#test set preds\ncat_preds = tensor(cat_model.predict(test_dl.xs))\n\n\ncat_preds_final = cat_preds.squeeze(1)\n\n#validation set preds\ncat_preds_x = tensor(cat_model.predict(X_test))\n\ncat_preds_x_final = 
cat_preds_x.squeeze(1)\n\naccuracy_score(y_test,cat_preds_x)","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:09:11.434521Z","iopub.execute_input":"2024-06-11T10:09:11.434885Z","iopub.status.idle":"2024-06-11T10:12:21.601471Z","shell.execute_reply.started":"2024-06-11T10:09:11.434852Z","shell.execute_reply":"2024-06-11T10:12:21.600095Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%time\nxgb_model = xgb.XGBClassifier(n_estimators = 197, max_depth=4, learning_rate=0.1818695751227044, subsample= 0.39774994666482544)\nxgb_model = xgb_model.fit(X_train, y_train)\nxgb_preds = tensor(xgb_model.predict(test_dl.xs))\n\nxgb_preds_x = tensor(xgb_model.predict(X_test))\n\naccuracy_score(y_test,xgb_preds_x)","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:13:02.66558Z","iopub.execute_input":"2024-06-11T10:13:02.666971Z","iopub.status.idle":"2024-06-11T10:13:10.180281Z","shell.execute_reply.started":"2024-06-11T10:13:02.666926Z","shell.execute_reply":"2024-06-11T10:13:10.179054Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"lgb_model = lgb.LGBMClassifier(num_leaves=251, learning_rate=0.02956613668999794, n_estimators=483, max_depth=82, boosting_type='gbdt',min_child_samples=90, random_state=27)\nlgb_model = lgb_model.fit(X_train, y_train)\n\n#test set preds\nlgb_preds = tensor(lgb_model.predict(test_dl.xs))\n\n#validation set preds\nlgb_preds_x = tensor(lgb_model.predict(X_test))\n\nlgb_score = accuracy_score(y_test,lgb_preds_x)\nlgb_score","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:13:10.182004Z","iopub.execute_input":"2024-06-11T10:13:10.182347Z","iopub.status.idle":"2024-06-11T10:14:01.532117Z","shell.execute_reply.started":"2024-06-11T10:13:10.182318Z","shell.execute_reply":"2024-06-11T10:14:01.530865Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"!rm 
submission.csv","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:14:33.073981Z","iopub.execute_input":"2024-06-11T10:14:33.074447Z","iopub.status.idle":"2024-06-11T10:14:34.160635Z","shell.execute_reply.started":"2024-06-11T10:14:33.074405Z","shell.execute_reply":"2024-06-11T10:14:34.159067Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"mapping = dict(enumerate(dls.vocab))\npredicted_labels = [mapping[value.item()] for value in xgb_preds]\nsubmit = pd.read_csv(path/'sample_submission.csv')\nsubmit.Target = predicted_labels\nsubmit.to_csv('submission.csv',index=False)\nsubmit","metadata":{"execution":{"iopub.status.busy":"2024-06-11T10:15:10.237822Z","iopub.execute_input":"2024-06-11T10:15:10.238322Z","iopub.status.idle":"2024-06-11T10:15:10.739706Z","shell.execute_reply.started":"2024-06-11T10:15:10.238268Z","shell.execute_reply":"2024-06-11T10:15:10.738328Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Neural Network Ensemble","metadata":{"id":"75E_pjgO_8LW"}},{"cell_type":"code","source":"def ensemble():\n learn = tabular_learner(dls, metrics=accuracy)\n with learn.no_bar(),learn.no_logging(): learn.fit(6, 0.02)\n return learn.get_preds(dl=dl)[0]","metadata":{"id":"Gsi88cA5_8LX","execution":{"iopub.status.busy":"2024-06-10T10:27:37.349831Z","iopub.execute_input":"2024-06-10T10:27:37.350397Z","iopub.status.idle":"2024-06-10T10:27:37.357499Z","shell.execute_reply.started":"2024-06-10T10:27:37.350357Z","shell.execute_reply":"2024-06-10T10:27:37.356107Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"learns = [ensemble() for _ in 
range(5)]","metadata":{"id":"wK8O6o3L_8LX","execution":{"iopub.status.busy":"2024-06-10T10:27:37.519049Z","iopub.execute_input":"2024-06-10T10:27:37.519559Z","iopub.status.idle":"2024-06-10T10:34:07.203058Z","shell.execute_reply.started":"2024-06-10T10:27:37.519518Z","shell.execute_reply":"2024-06-10T10:34:07.20171Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"ens_preds = torch.stack(learns).mean(0)","metadata":{"id":"xUlI9CSG_8LY","execution":{"iopub.status.busy":"2024-06-10T10:34:07.205893Z","iopub.execute_input":"2024-06-10T10:34:07.206424Z","iopub.status.idle":"2024-06-10T10:34:07.218108Z","shell.execute_reply.started":"2024-06-10T10:34:07.20638Z","shell.execute_reply":"2024-06-10T10:34:07.216762Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"nn_preds_x.shape,ens_preds.shape","metadata":{"id":"bbGl6BUp_8LZ","execution":{"iopub.status.busy":"2024-06-10T10:38:47.213699Z","iopub.execute_input":"2024-06-10T10:38:47.214931Z","iopub.status.idle":"2024-06-10T10:38:47.222872Z","shell.execute_reply.started":"2024-06-10T10:38:47.214883Z","shell.execute_reply":"2024-06-10T10:38:47.221549Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Assuming ens_preds is a PyTorch tensor with shape [51012, 3]\n# Select predictions for the first class (index 0)\nselected_class_preds = ens_preds[:, 0]\n\n# Now selected_class_preds has a shape of torch.Size([51012])\nprint(selected_class_preds.shape)\n","metadata":{"execution":{"iopub.status.busy":"2024-06-10T10:44:58.684787Z","iopub.execute_input":"2024-06-10T10:44:58.685309Z","iopub.status.idle":"2024-06-10T10:44:58.692391Z","shell.execute_reply.started":"2024-06-10T10:44:58.685279Z","shell.execute_reply":"2024-06-10T10:44:58.691132Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"ens_preds_final = 
ens_preds.squeeze(1)\nens_preds_final.shape","metadata":{"execution":{"iopub.status.busy":"2024-06-10T10:39:58.129053Z","iopub.execute_input":"2024-06-10T10:39:58.129582Z","iopub.status.idle":"2024-06-10T10:39:58.138111Z","shell.execute_reply.started":"2024-06-10T10:39:58.129546Z","shell.execute_reply":"2024-06-10T10:39:58.136725Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"r2_score(y_test,nn_preds_x)","metadata":{"id":"akcVt1Od_8LZ"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"target_preds = nn_preds[0]","metadata":{"id":"U_6Z8qnF_8La"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### **If you found my notebook interesting, please kindly upvote it.**\n\n### Happy Modelling","metadata":{}},{"cell_type":"markdown","source":"#### Further reading and references\n\nhttps://docs.fast.ai/tutorial.tabular.html\n\nhttps://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader\n\nhttps://dirk-kalmbach.medium.com/datablock-and-dataloaders-in-fastai-d5aa7ae560e5","metadata":{}},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}