From 56cfe7fc5a91d514315a6a0bd7efc5040b85cb30 Mon Sep 17 00:00:00 2001 From: Susan Li Date: Sun, 26 Aug 2018 22:01:43 -0400 Subject: [PATCH] Delete Hotel recommendation.ipynb --- Hotel recommendation.ipynb | 1412 ------------------------------------ 1 file changed, 1412 deletions(-) delete mode 100644 Hotel recommendation.ipynb diff --git a/Hotel recommendation.ipynb b/Hotel recommendation.ipynb deleted file mode 100644 index 40ba226..0000000 --- a/Hotel recommendation.ipynb +++ /dev/null @@ -1,1412 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import datetime\n", - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "%matplotlib inline\n", - "\n", - "from sklearn.model_selection import cross_val_score\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.pipeline import make_pipeline\n", - "from sklearn import preprocessing\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn import svm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Specifying dtypes helps reduce memory requirements for reading in csv file later." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# data_type={'is_booking':bool,'srch_ci' : np.str_, 'srch_co' : np.str_,\n", - " 'srch_adults_cnt' : np.int32, 'srch_children_cnt' : np.int32,\n", - " 'srch_rm_cnt' : np.int32, 'srch_destination_id':np.int32,\n", - " 'user_location_country' : np.int32, 'user_location_region' : np.int32,\n", - " 'user_location_city' : np.int32, 'hotel_cluster' : np.int32,\n", - " 'orig_destination_distance':np.float64, 'date_time':np.str_,\n", - " 'hotel_market':np.int32}\n", - "# d_type={'is_booking':bool, 'cnt':np.int32, 'hotel_cluster' : np.int32,'srch_destination_id':np.int32}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To be able to process locally, we randomly sample 1% of the records. After that, we still have a large number of records at 241,179." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(241179, 24)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv('train.csv.gz', sep=',').dropna()\n", - "dest = pd.read_csv('destinations.csv.gz')\n", - "df = df.sample(frac=0.01, random_state=99)\n", - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
date_timesite_nameposa_continentuser_location_countryuser_location_regionuser_location_cityorig_destination_distanceuser_idis_mobileis_package...srch_children_cntsrch_rm_cntsrch_destination_idsrch_destination_type_idis_bookingcnthotel_continenthotel_countryhotel_markethotel_cluster
323521342014-05-22 11:40:072366174241032323.523280249901...011442301412517744
297960212013-06-29 12:24:372366311255382288.61218522900...11827210125065959
151851562014-10-30 13:58:32236629440046587.697075521701...011132110125064222
33019482014-08-22 20:14:342366332551212234.439416073301...011152111447150265
254291192014-03-25 18:47:43236631447869839.0087107849300...0182841042506856
\n", - "

5 rows × 24 columns

\n", - "
" - ], - "text/plain": [ - " date_time site_name posa_continent \\\n", - "32352134 2014-05-22 11:40:07 2 3 \n", - "29796021 2013-06-29 12:24:37 2 3 \n", - "15185156 2014-10-30 13:58:32 2 3 \n", - "3301948 2014-08-22 20:14:34 2 3 \n", - "25429119 2014-03-25 18:47:43 2 3 \n", - "\n", - " user_location_country user_location_region user_location_city \\\n", - "32352134 66 174 24103 \n", - "29796021 66 311 25538 \n", - "15185156 66 294 40046 \n", - "3301948 66 332 55121 \n", - "25429119 66 314 47869 \n", - "\n", - " orig_destination_distance user_id is_mobile is_package \\\n", - "32352134 2323.5232 802499 0 1 \n", - "29796021 2288.6121 85229 0 0 \n", - "15185156 587.6970 755217 0 1 \n", - "3301948 2234.4394 160733 0 1 \n", - "25429119 839.0087 1078493 0 0 \n", - "\n", - " ... srch_children_cnt srch_rm_cnt srch_destination_id \\\n", - "32352134 ... 0 1 1442 \n", - "29796021 ... 1 1 8272 \n", - "15185156 ... 0 1 11321 \n", - "3301948 ... 0 1 1152 \n", - "25429119 ... 0 1 8284 \n", - "\n", - " srch_destination_type_id is_booking cnt hotel_continent \\\n", - "32352134 3 0 1 4 \n", - "29796021 1 0 1 2 \n", - "15185156 1 0 1 2 \n", - "3301948 1 1 1 4 \n", - "25429119 1 0 4 2 \n", - "\n", - " hotel_country hotel_market hotel_cluster \n", - "32352134 125 177 44 \n", - "29796021 50 659 59 \n", - "15185156 50 642 22 \n", - "3301948 47 1502 65 \n", - "25429119 50 685 6 \n", - "\n", - "[5 rows x 24 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### EDA\n", - "\n", - "#### What are we predicting?\n", - "The objective is to predict which hotel_cluster a user will book given the information in their search. There are 100 clusters in total. In other words, we are dealing with a 100-class classification problem." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(12, 6))\n", - "sns.distplot(df['hotel_cluster'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The data is fairly well distributed over all 100 clusters, although there is some skewness." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Feature Engineering" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import datetime\n", - "def get_year(x):\n", - " if x is not None and type(x) is not float:\n", - " try:\n", - " return datetime.strptime(x, '%Y-%m-%d').year\n", - " except ValueError:\n", - " return datetime.strptime(x, '%Y-%m-%d %H:%M:%S').year\n", - " else:\n", - " return 2013\n", - " pass\n", - "\n", - "def get_month(x):\n", - " if x is not None and type(x) is not float:\n", - " try:\n", - " return datetime.strptime(x, '%Y-%m-%d').month\n", - " except:\n", - " return datetime.strptime(x, '%Y-%m-%d %H:%M:%S').month\n", - " else:\n", - " return 1\n", - " pass\n", - " \n", - "def left_merge_dataset(left_dframe, right_dframe, merge_column):\n", - " return pd.merge(left_dframe, right_dframe, on=merge_column, how='left')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# msk = np.random.rand(len(df)) < 0.8\n", - "# train = df[msk]\n", - "# test = df[~msk]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# dealing with date_time column\n", - "\n", - "df['date_time_year'] = pd.Series(df.date_time, index = df.index)\n", - 
"df['date_time_month'] = pd.Series(df.date_time, index = df.index)\n", - "\n", - "from datetime import datetime\n", - "df.date_time_year = df.date_time_year.apply(lambda x: get_year(x))\n", - "df.date_time_month = df.date_time_month.apply(lambda x: get_month(x))\n", - "\n", - "del df['date_time']" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# work on srch_ci column\n", - "\n", - "df['srch_ci_year'] = pd.Series(df.srch_ci, index=df.index)\n", - "df['srch_ci_month'] = pd.Series(df.srch_ci, index=df.index)\n", - "\n", - "# convert year & months to int\n", - "df.srch_ci_year = df.srch_ci_year.apply(lambda x: get_year(x))\n", - "df.srch_ci_month = df.srch_ci_month.apply(lambda x: get_month(x))\n", - "\n", - "# remove the srch_ci column\n", - "del df['srch_ci']" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# work on srch_co column\n", - "\n", - "df['srch_co_year'] = pd.Series(df.srch_co, index=df.index)\n", - "df['srch_co_month'] = pd.Series(df.srch_co, index=df.index)\n", - "\n", - "# convert year & months to int\n", - "df.srch_co_year = df.srch_co_year.apply(lambda x: get_year(x))\n", - "df.srch_co_month = df.srch_co_month.apply(lambda x: get_month(x))\n", - "\n", - "# remove the srch_co column\n", - "del df['srch_co']" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
site_nameposa_continentuser_location_countryuser_location_regionuser_location_cityorig_destination_distanceuser_idis_mobileis_packagechannel...hotel_continenthotel_countryhotel_markethotel_clusterdate_time_yeardate_time_monthsrch_ci_yearsrch_ci_monthsrch_co_yearsrch_co_month
323521342366174241032323.5232802499019...412517744201452014720147
297960212366311255382288.612185229009...25065959201362013720137
15185156236629440046587.6970755217019...25064222201410201412201412
33019482366332551212234.4394160733019...447150265201482015120151
25429119236631447869839.00871078493009...2506856201432014420144
\n", - "

5 rows × 27 columns

\n", - "
" - ], - "text/plain": [ - " site_name posa_continent user_location_country \\\n", - "32352134 2 3 66 \n", - "29796021 2 3 66 \n", - "15185156 2 3 66 \n", - "3301948 2 3 66 \n", - "25429119 2 3 66 \n", - "\n", - " user_location_region user_location_city orig_destination_distance \\\n", - "32352134 174 24103 2323.5232 \n", - "29796021 311 25538 2288.6121 \n", - "15185156 294 40046 587.6970 \n", - "3301948 332 55121 2234.4394 \n", - "25429119 314 47869 839.0087 \n", - "\n", - " user_id is_mobile is_package channel ... \\\n", - "32352134 802499 0 1 9 ... \n", - "29796021 85229 0 0 9 ... \n", - "15185156 755217 0 1 9 ... \n", - "3301948 160733 0 1 9 ... \n", - "25429119 1078493 0 0 9 ... \n", - "\n", - " hotel_continent hotel_country hotel_market hotel_cluster \\\n", - "32352134 4 125 177 44 \n", - "29796021 2 50 659 59 \n", - "15185156 2 50 642 22 \n", - "3301948 4 47 1502 65 \n", - "25429119 2 50 685 6 \n", - "\n", - " date_time_year date_time_month srch_ci_year srch_ci_month \\\n", - "32352134 2014 5 2014 7 \n", - "29796021 2013 6 2013 7 \n", - "15185156 2014 10 2014 12 \n", - "3301948 2014 8 2015 1 \n", - "25429119 2014 3 2014 4 \n", - "\n", - " srch_co_year srch_co_month \n", - "32352134 2014 7 \n", - "29796021 2013 7 \n", - "15185156 2014 12 \n", - "3301948 2015 1 \n", - "25429119 2014 4 \n", - "\n", - "[5 rows x 27 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Any correlations?\n", - "We want to know if anything correlates well with hotel_cluster. This will tell us whether we should pay more attention to any particular columns." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "srch_destination_type_id -0.036120\n", - "site_name -0.027497\n", - "hotel_country -0.023837\n", - "is_booking -0.022898\n", - "user_location_country -0.020239\n", - "srch_destination_id -0.016736\n", - "srch_co_month -0.005874\n", - "srch_rm_cnt -0.005570\n", - "srch_ci_month -0.005015\n", - "date_time_month -0.002142\n", - "channel -0.001386\n", - "date_time_year -0.000435\n", - "cnt 0.000378\n", - "hotel_continent 0.000422\n", - "user_location_city 0.001241\n", - "user_id 0.003891\n", - "orig_destination_distance 0.006084\n", - "user_location_region 0.006927\n", - "srch_ci_year 0.008562\n", - "is_mobile 0.008788\n", - "srch_co_year 0.009287\n", - "posa_continent 0.012180\n", - "srch_adults_cnt 0.012407\n", - "srch_children_cnt 0.014901\n", - "hotel_market 0.022149\n", - "is_package 0.047598\n", - "hotel_cluster 1.000000\n", - "Name: hotel_cluster, dtype: float64" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.corr()[\"hotel_cluster\"].sort_values()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "No column correlates linearly with hotel_cluster; this means that linear regression and logistic regression won't work well on our data." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(241179, 27)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Known combinations of user location cities, origin-destination distances and search destinations will definitely help in finding the hotel cluster." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "pieces = [df.groupby(['srch_destination_id','hotel_country','hotel_market','hotel_cluster'])['is_booking'].agg(['sum','count'])]\n", - "agg = pd.concat(pieces).groupby(level=[0,1,2,3]).sum()\n", - "agg.dropna(inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sumcount
srch_destination_idhotel_countryhotel_markethotel_cluster
472462201
2901
3001
3212
4301
\n", - "
" - ], - "text/plain": [ - " sum count\n", - "srch_destination_id hotel_country hotel_market hotel_cluster \n", - "4 7 246 22 0 1\n", - " 29 0 1\n", - " 30 0 1\n", - " 32 1 2\n", - " 43 0 1" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agg.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "agg['sum_and_cnt'] = 0.85*agg['sum'] + 0.15*agg['count']\n", - "agg = agg.groupby(level=[0,1,2]).apply(lambda x: x.astype(float)/x.sum())\n", - "agg.reset_index(inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
srch_destination_idhotel_countryhotel_markethotel_clustersumcountsum_and_cnt
047246220.00.1250.073171
147246290.00.1250.073171
247246300.00.1250.073171
347246321.00.2500.560976
447246430.00.1250.073171
\n", - "
" - ], - "text/plain": [ - " srch_destination_id hotel_country hotel_market hotel_cluster sum \\\n", - "0 4 7 246 22 0.0 \n", - "1 4 7 246 29 0.0 \n", - "2 4 7 246 30 0.0 \n", - "3 4 7 246 32 1.0 \n", - "4 4 7 246 43 0.0 \n", - "\n", - " count sum_and_cnt \n", - "0 0.125 0.073171 \n", - "1 0.125 0.073171 \n", - "2 0.125 0.073171 \n", - "3 0.250 0.560976 \n", - "4 0.125 0.073171 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agg.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "agg_pivot = agg.pivot_table(index=['srch_destination_id','hotel_country','hotel_market'], columns='hotel_cluster', values='sum_and_cnt').reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
hotel_clustersrch_destination_idhotel_countryhotel_market0123456...90919293949596979899
047246NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1850416NaNNaNNaNNaNNaNNaNNaN...NaN0.025210NaNNaNNaNNaNNaNNaNNaNNaN
21150824NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
314271434NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
41650419NaNNaNNaNNaNNaNNaNNaN...NaN0.344828NaNNaNNaNNaNNaNNaNNaNNaN
\n", - "

5 rows × 103 columns

\n", - "
" - ], - "text/plain": [ - "hotel_cluster srch_destination_id hotel_country hotel_market 0 1 2 \\\n", - "0 4 7 246 NaN NaN NaN \n", - "1 8 50 416 NaN NaN NaN \n", - "2 11 50 824 NaN NaN NaN \n", - "3 14 27 1434 NaN NaN NaN \n", - "4 16 50 419 NaN NaN NaN \n", - "\n", - "hotel_cluster 3 4 5 6 ... 90 91 92 93 94 95 96 97 98 \\\n", - "0 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN ... NaN 0.025210 NaN NaN NaN NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN ... NaN 0.344828 NaN NaN NaN NaN NaN NaN NaN \n", - "\n", - "hotel_cluster 99 \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - "\n", - "[5 rows x 103 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "agg_pivot.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.merge(df, dest, how='left', on='srch_destination_id')\n", - "df = pd.merge(df, agg_pivot, how='left', on=['srch_destination_id','hotel_country','hotel_market'])" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "df.fillna(0, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(241179, 276)" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are only interested in booking events." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.loc[df['is_booking'] == 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "X = df.drop(['user_id', 'hotel_cluster', 'is_booking'], axis=1)\n", - "y = df.hotel_cluster" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((20032, 273), (20032,))" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X.shape, y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "100" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y.nunique()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Random Forest" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.24865023372782996" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "clf = make_pipeline(preprocessing.StandardScaler(), RandomForestClassifier(n_estimators=273,max_depth=10,random_state=0))\n", - "np.mean(cross_val_score(clf, X, y, cv=10))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SVM" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.3228727137315005" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn import svm\n", - "\n", - "clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(decision_function_shape='ovo'))\n", - "np.mean(cross_val_score(clf, X, y, cv=10))" - ] - } - ], - "metadata": { - 
"kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}