-
Notifications
You must be signed in to change notification settings - Fork 0
/
profiler.py
81 lines (56 loc) · 1.82 KB
/
profiler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:light
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.4'
# jupytext_version: 1.1.1
# kernel_info:
# name: python3
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# %load_ext autoreload
# %autoreload 2
import sys
# Add the parent directory to the import path before importing optimus
# (presumably so a local checkout of the package is picked up — confirm).
sys.path.append("..")
# ### You can get extra information from the profiler if you pass verbose=True to Optimus
# Create the Optimus instance
from optimus import Optimus
op = Optimus(master="local[*]", app_name = "optimus" , checkpoint= True, verbose=True)
# Load the meteorite landings CSV and repartition the resulting dataframe
df = op.load.csv("data/Meteorite_Landings.csv").h_repartition()
# Display the dataframe as a table (10 is presumably the row limit — confirm)
df.table(10)
# ### Profiler dump mode (faster). It just handles the column data types as they are present in the dataframe.
# Profile the "name" column with inference disabled and approximate counts requested
op.profiler.run(df, "name", infer=False, approx_count= True)
# ### Profiler smart mode (slower). It tries to infer the column data type and presents extra data accordingly. For example, date-type columns get extra histograms for minutes, day, week and month. It can also detect array types in the data.
# Profile the "GeoLocation" column with inference enabled
op.profiler.run(df, "GeoLocation",infer=True)
# ### Plot the profile for a specific column
# Profile the "reclat" column using the profiler's default settings
op.profiler.run(df, "reclat")
# ### Output a json file
# ### Plot histograms for multiple columns
# Histogram of "id" and "reclong" (20 is presumably the number of bins — confirm)
df.plot.hist(["id", "reclong"], 20)
# Frequency plot of the same columns (10 is presumably the number of values shown — confirm)
df.plot.frequency(["id", "reclong"], 10)
# Display the dataframe as a table using the default arguments
df.table()
# Count missing values ("*" presumably selects every column — confirm)
df.cols.count_na("*")
# NOTE(review): `a` is not used anywhere in the visible code; it looks like a
# pasted snapshot of per-column null counts (e.g. the count_na output above) —
# confirm and remove if it is no longer needed.
a = {'name': 0,
'id': 0,
'nametype': 0,
'recclass': 0,
'mass (g)': 131,
'fall': 0,
'year': 288,
'reclat': 7315,
'reclong': 7315,
'GeoLocation': 7315}
# Show the data type of each column
df.cols.dtypes()
# +
# Columns used for both the null filtering and the correlation calls below
cols = ["id","mass (g)","reclat"]
# Drop rows with nulls in these columns because correlation cannot handle them
df_not_nulls = df.rows.drop_na(cols)
# Plot the correlation between the selected columns
df_not_nulls.plot.correlation(cols)
# -
# Reuse `cols` (instead of repeating the literal list) so the array output is
# always computed over exactly the columns that were null-filtered above.
df_not_nulls.cols.correlation(cols, output="array")