-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4_newPage.py
120 lines (106 loc) · 4.78 KB
/
4_newPage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import streamlit as st
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from PIL import Image
import plotly.express as px
from IPython.display import HTML
import matplotlib.pyplot as plt
import joblib
import warnings
# Silence Python warnings so they do not clutter the app output.
warnings.filterwarnings("ignore")
# Never truncate columns when pandas renders a frame as text.
pd.set_option("display.max_columns", None)

# Page chrome: tab title/icon and wide layout first, then the page heading,
# the sidebar header and a short note about the intended audience.
st.set_page_config(
    page_title="Prediction",
    page_icon=":chopsticks:",
    layout="wide",
)
st.markdown("# LA Restaurant Health Risk Prediction")
st.sidebar.header("Restaurant Prediction")
st.markdown("\n#### This page is designed for the government.\n")
# --- Upload -----------------------------------------------------------------
uploaded_file = st.file_uploader("Choose a file")
if uploaded_file is None:
    # Nothing to score yet: end this script run here so that everything
    # below (which needs the uploaded data) is skipped until a file arrives.
    st.stop()

# The uploader hands back a file-like object, which pandas reads directly.
dff = pd.read_csv(uploaded_file)
st.write(dff)

# load model
model = joblib.load('rf_model.joblib')  # ml prediction model

# preprocessing
from textblob import TextBlob  # used further down for comment sentiment

dff_m = dff[['FACILITY_NAME', 'name', 'FACILITY_ZIP', 'review_counts', 'price', 'category', 'type',
             'size', 'open_hours_week', 'SCORE', 'score', 'num_photos', 'is_bus_web', 'is_phone_number',
             'is_message_bus', 'num_attributes', 'num_questions', 'comments_list']]
# Keep restaurants only (type includes restaurant, food market, other).
# Take an explicit copy so the column edits that follow modify an independent
# frame instead of a filtered slice of `dff`.
dff_m = dff_m[dff_m['type'] == 'restaurant'].copy()
if dff_m.empty:
    # Later steps (mode-based imputation, model.predict) cannot run on zero rows.
    st.warning("The uploaded file contains no rows with type 'restaurant'.")
    st.stop()
# 'score' is renamed before upper-casing so it does not collide with the
# separate 'SCORE' column.
dff_m = dff_m.rename(columns={'score': 'rating'})
dff_m.columns = [str(col).upper() for col in dff_m.columns]
#zip
# Keep the first five characters of the ZIP. Going through str makes this work
# when pandas parsed the column as numbers; missing ZIPs stay missing.
zip_codes = dff_m['FACILITY_ZIP']
dff_m['FACILITY_ZIP'] = zip_codes.astype(str).str[:5].where(zip_codes.notna())
#review_counts
def func0(x):
    """Convert a review-count token into a whole number.

    Parameters
    ----------
    x : str
        Count token such as ``'87'`` or ``'1.2k'``; a trailing ``k``
        (either case) means thousands. Thousands separators are ignored.

    Returns
    -------
    int
        The count as an integer.

    Raises
    ------
    ValueError
        If the token is not a number, optionally followed by ``k``.
    """
    token = x.strip().lower().replace(',', '')
    if token.endswith('k'):
        # Round before truncating: 1.1 * 1000 is 1100.0000000000002 in
        # binary floating point, and the result should be a clean count.
        return int(round(float(token[:-1]) * 1000))
    return int(token)
# Every fill below assigns the result back to the column. Calling
# `fillna(..., inplace=True)` on `dff_m.COLUMN` is chained assignment and is
# not guaranteed to update `dff_m` itself.

# Rows without review text count as zero reviews.
dff_m['REVIEW_COUNTS'] = dff_m['REVIEW_COUNTS'].fillna('0 reviews')
# Drop surrounding parentheses and keep the leading count token
# (presumably values look like '(1.2k reviews)' -- confirm against the data).
dff_m['REVIEW_COUNTS'] = dff_m['REVIEW_COUNTS'].apply(lambda x: x.strip('(').strip(')').split()[0])
dff_m['REVIEW_COUNTS'] = dff_m['REVIEW_COUNTS'].apply(func0)
#price
# Dollar-sign tiers become ordinal codes; unknown tiers get the most common one.
dff_m['PRICE'] = dff_m['PRICE'].map({'$': '1', '$$': '2', '$$$': '3', '$$$$': '4'})
dff_m['PRICE'] = dff_m['PRICE'].fillna(dff_m['PRICE'].mode()[0])
#size
# Size ranges become ordinal codes; unknown ranges get the most common one.
dff_m['SIZE'] = dff_m['SIZE'].map({'0-30': '1', '31-60': '2', '61-150': '3', '151 +': '4'})
dff_m['SIZE'] = dff_m['SIZE'].fillna(dff_m['SIZE'].mode()[0])
#open_hours_week
dff_m['OPEN_HOURS_WEEK'] = dff_m['OPEN_HOURS_WEEK'].fillna(dff_m['OPEN_HOURS_WEEK'].mean())
#rating
dff_m['RATING'] = dff_m['RATING'].fillna(dff_m['RATING'].mode()[0])
#num_photos
def func(x):
    """Extract the photo count from a photo-summary string.

    The count is read from the third whitespace-separated token of ``x``;
    a ``k`` in that token means thousands.

    Parameters
    ----------
    x : str or missing value
        The raw cell value; ``None``/NaN means no photo information.

    Returns
    -------
    int or None
        The parsed count, or ``None`` when ``x`` is missing, has fewer than
        three tokens, or its third token is not a number.
    """
    if pd.isnull(x):
        return None
    tokens = x.split()
    if len(tokens) < 3:
        # Too short to contain a count token; treat as "no information".
        return None
    num = tokens[2].lower().replace(',', '')
    try:
        return int(num)
    except ValueError:
        pass  # not a plain integer; try the thousands form below
    if 'k' in num:
        try:
            # Round so that e.g. '1.1k' yields 1100, not 1100.0000000000002.
            return int(round(float(num.replace('k', '')) * 1000))
        except ValueError:
            return None
    return None
# Fills assign the result back to the column; `fillna(..., inplace=True)` on
# `dff_m.COLUMN` is chained assignment and may leave `dff_m` unchanged.
dff_m['NUM_PHOTOS'] = dff_m['NUM_PHOTOS'].apply(func)
dff_m['NUM_PHOTOS'] = dff_m['NUM_PHOTOS'].fillna(0)
#is_bus_web, is_phone_number, is_message_bus
# Boolean flags become 1/0.
flag_to_int = {True: 1, False: 0}
for flag_column in ('IS_BUS_WEB', 'IS_PHONE_NUMBER', 'IS_MESSAGE_BUS'):
    dff_m[flag_column] = dff_m[flag_column].map(flag_to_int)
#num_attributes
dff_m['NUM_ATTRIBUTES'] = dff_m['NUM_ATTRIBUTES'].fillna(0)
#num_questions
# The count is the third whitespace-separated token; missing cells count as 0.
dff_m['NUM_QUESTIONS'] = dff_m['NUM_QUESTIONS'].apply(lambda x: int(x.split()[2]) if pd.notnull(x) else 0)
#comments_list
# TextBlob only accepts strings, so a missing comment cell is scored as
# neutral (0.0) instead of raising.
dff_m['SENTIMENT_POLARITY'] = dff_m['COMMENTS_LIST'].apply(
    lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0.0
)
#label:risk_level
# Split SCORE into three equal-width bins; the lowest bin is labelled
# 'high risk' and the highest 'low risk'. The labels are then encoded as
# low=0, medium=1, high=2.
dff_m['RISK_LEVEL'] = pd.cut(dff_m['SCORE'], 3, labels=['high risk', 'medium risk', 'low risk']).astype(str)
dff_m['RISK_LEVEL'] = dff_m['RISK_LEVEL'].map({'low risk': 0, 'medium risk': 1, 'high risk': 2})
# prediction
# Columns fed to the model, in the order listed here.
feature_columns = [
    'FACILITY_ZIP', 'REVIEW_COUNTS', 'PRICE', 'SIZE', 'OPEN_HOURS_WEEK', 'RATING',
    'NUM_PHOTOS', 'IS_BUS_WEB', 'IS_PHONE_NUMBER', 'IS_MESSAGE_BUS',
    'NUM_ATTRIBUTES', 'NUM_QUESTIONS', 'SENTIMENT_POLARITY',
]
dff_rf = dff_m.copy()
all_x = dff_rf[feature_columns]
all_y = dff_rf['RISK_LEVEL']
y_pred = model.predict(all_x)

# Attach the predictions and translate the numeric classes into labels.
risk_names = {0: 'low risk', 1: 'medium risk', 2: 'high risk'}
dff_m['prediction_result'] = y_pred
dff_m['prediction_result'] = dff_m['prediction_result'].map(risk_names)

# show prediction results in dataframe (without the RISK_LEVEL column)
st.markdown("## Prediction Results")
st.dataframe(dff_m.drop(columns=['RISK_LEVEL']), width=1000, height=300)

# show high/medium risk
st.markdown("## Show medium/high risk")
needs_attention = dff_m['prediction_result'].isin(['high risk', 'medium risk'])
st.dataframe(dff_m[needs_attention], width=1000, height=300)

#visualize
st.markdown("## Prediction Distribution")
fig = px.histogram(dff_m, x='prediction_result')
st.plotly_chart(fig)