# analysis.py
#%%
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import os
#%%
import plotly.offline as py
import plotly.graph_objects as go
py.init_notebook_mode(connected=True)
#%%
%%time
folder = 'input/'
train_df = pd.read_csv(folder + 'train.csv')
weather_train_df = pd.read_csv(folder + 'weather_train.csv')
test_df = pd.read_csv(folder + 'test.csv')
weather_test_df = pd.read_csv(folder + 'weather_test.csv')
building_meta_df = pd.read_csv(folder + 'building_metadata.csv')
sample_submission = pd.read_csv(folder + 'sample_submission.csv')
#%%
# check size of data
print('Size of train_df data', train_df.shape)
print('Size of weather_train_df data', weather_train_df.shape)
print('Size of weather_test_df data', weather_test_df.shape)
print('Size of building_meta_df data', building_meta_df.shape)
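#%%
# Merge sketch (not part of the original script): assuming the usual layout of this
# dataset, train_df joins building_meta_df on 'building_id' and weather_train_df on
# ('site_id', 'timestamp').
merged_df = train_df.merge(building_meta_df, on='building_id', how='left')
merged_df = merged_df.merge(weather_train_df, on=['site_id', 'timestamp'], how='left')
print('Size of merged data', merged_df.shape)
del merged_df
gc.collect()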
#%%
train_df.head()
#%%
train_df.columns.values
#%%
weather_train_df.head()
#%%
weather_train_df.columns.values
#%%
weather_test_df.head()
#%%
weather_test_df.columns.values
#%%
building_meta_df.head()
#%%
building_meta_df.columns.values
#%%
# for key, d in train_df.groupby('meter_reading'):
# break
# d.head()
plt.figure(figsize = (10, 5))
train_df['meter_reading'].plot()
#%%
plt.hist(train_df['meter_reading'], bins=77)
plt.title('Distribution of meter_reading')
plt.show()
#%%
train_df['meter_reading'].plot(kind='hist', bins=25, figsize=(15, 5), title= 'Distribution of Target Variable (meter_reading)')
plt.show()
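#%%
# Optional sketch (not part of the original script): meter_reading is heavily
# right-skewed, so a log1p-transformed histogram is usually easier to read.
np.log1p(train_df['meter_reading']).plot(kind='hist', bins=25, figsize=(15, 5),
                                         title='Distribution of log1p(meter_reading)')
plt.show()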
#%%
# examine missing values for train data
total = train_df.isnull().sum().sort_values(ascending=False)
percent = (train_df.isnull().sum()/train_df.isnull().count()*100).sort_values(ascending=False)
missing_train_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_train_data
#%%
# examine missing values for weather_train data
total = weather_train_df.isnull().sum().sort_values(ascending=False)
percent = (weather_train_df.isnull().sum()/weather_train_df.isnull().count()*100).sort_values(ascending=False)
missing_weather_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_weather_data
#%%
# examine missing values for weather_test data
total = weather_test_df.isnull().sum().sort_values(ascending=False)
percent = (weather_test_df.isnull().sum()/weather_test_df.isnull().count()*100).sort_values(ascending=False)
missing_weather_test_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_weather_test_data
#%%
# examine missing values for building_meta data
total = building_meta_df.isnull().sum().sort_values(ascending=False)
percent = (building_meta_df.isnull().sum()/building_meta_df.isnull().count()*100).sort_values(ascending=False)
missing_building_meta_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_building_meta_data
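#%%
# Optional refactor sketch (not part of the original script): the four missing-value
# summaries above follow the same pattern, so a small helper keeps them consistent.
def missing_summary(df):
    """Count and percentage of missing values per column, sorted descending."""
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().mean() * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

missing_summary(building_meta_df)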
#%%
# Number of each type of column
train_df.dtypes.value_counts()
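#%%
# Optional sketch (not part of the original script): the int64/float64 columns counted
# above can be downcast to smaller numeric types to reduce memory on a table this size.
for col in train_df.select_dtypes('int64').columns:
    train_df[col] = pd.to_numeric(train_df[col], downcast='integer')
for col in train_df.select_dtypes('float64').columns:
    train_df[col] = pd.to_numeric(train_df[col], downcast='float')
train_df.dtypes.value_counts()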
#%%
# Number of unique classes in each object column
train_df.select_dtypes('object').apply(pd.Series.nunique, axis=0)
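#%%
# Optional sketch (not part of the original script): 'timestamp' is the only object
# column in train_df, and parsing it to datetime is the usual next step for
# time-based EDA.
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
train_df.dtypes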
#%%
# Let's compute correlations with the target to see which features look most relevant.
# numeric_only=True keeps non-numeric columns (e.g. the timestamp column) out of the
# calculation, which newer pandas versions require.
correlations = train_df.corr(numeric_only=True)['meter_reading'].sort_values()
# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))
#%%
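# Optional sketch (not part of the original script): the same numeric correlations as a
# heatmap, using the already-imported seaborn.
plt.figure(figsize=(8, 6))
sns.heatmap(train_df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Correlation matrix of numeric columns in train_df')
plt.show()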