-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
271 lines (200 loc) · 13.2 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
# Libraries used to build the dashboard
import streamlit as st
import pandas as pd
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import defaultdict
from datetime import datetime
import io
import plotly.express as px
import pickle
# CSS injected into the page: hides the elements matched by #MainMenu and
# footer, hides `.row_heading.level0` and `.blank` cells, and left-aligns
# `.dataframe` content.
hide_streamlit_style = "\n".join(
    [
        "",
        "<style>",
        "#MainMenu {visibility: hidden;}",
        "footer {visibility: hidden;}",
        ".row_heading.level0 {display:none}",
        ".blank {display:none}",
        ".dataframe {text-align: left !important}",
        "</style>",
        "",
    ]
)
# unsafe_allow_html is required so the <style> tag is emitted as HTML.
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
def data_visualization() -> None:
    """Render the "Data Visualization" page.

    Shows the static project images, loads and joins the zipped departure
    and arrival CSV exports, offers a date / train filter, and draws the
    delay charts. When the "Filtering?" box is ticked the charts cover the
    chosen date (plus two charts for its month); otherwise they cover every
    loaded row.
    """
    st.image("https://techlabs.org/static/tl-logo-cf3f70e8f5222649e6b06468adfae64c.png")
    st.header("Our Team")
    st.image("teamphoto.PNG")
    st.header("Technologies Used:")
    st.image("logos.png")
    st.header("Data Sources:")
    st.image("deutsche bahn.png")
    st.header("Deutsche Bahn Data Analysis")

    datatotal = _load_journeys()

    filter_date = st.date_input('Choose a date to analyse')
    day_key = str(filter_date)
    trip_id = datatotal[datatotal['date'] == day_key]['Zugnr.'].unique().tolist()
    trip_id_filter = st.multiselect('Pick one', trip_id)
    checked = st.checkbox('Filtering?')

    filter_date_month = None
    datatotal_month = None
    if checked:
        if trip_id_filter:
            datatotal = datatotal[datatotal['Zugnr.'].isin(trip_id_filter)]
        filter_date_month = day_key[:7]
        # .copy() so the columns added further down are written to frames
        # this function owns, not to views of the merged frame.
        datatotal_month = datatotal[datatotal['date'].str.contains(filter_date_month)].copy()
        datatotal = datatotal[datatotal['date'] == day_key].copy()
    st.write(datatotal)

    st.header("Summary of a DataFrame")
    # The English labels are applied to a separate copy: they are assigned
    # by position and contain "Departure Station" twice, so the working
    # frame keeps its unique column names for the plotting code below.
    # NOTE(review): the duplicated label looks unintended - confirm.
    summary = datatotal.copy()
    summary.columns = ["Arrival time", "Train Number", "Arrival Station", "date",
                       "Departure Station", "Expected Departure", "Departure Delay",
                       "Departure", "Departure Station", "Final Station",
                       "Expected Arrival", "Arrival Delay"]
    buffer = io.StringIO()
    summary.info(buf=buffer)
    st.text(buffer.getvalue())

    st.header("Histogram of Delay for Train Journeys in Germany (" + day_key + ")")
    fig1, ax1 = plt.subplots(figsize=(11, 7))
    ax1.hist(datatotal["Arrival Delay"], bins=50)
    ax1.set_xlabel("Arrival Delay (minutes)")
    ax1.set_ylabel("Frequency")
    _show_figure(fig1)

    datatotal['date'] = pd.to_datetime(datatotal['date'])
    datatotal['day_of_week'] = datatotal['date'].dt.dayofweek
    # The box plot is a plotly figure, so no matplotlib figure is created.
    st.plotly_chart(px.box(datatotal, y="Arrival Delay", x="day_of_week"))

    if checked:
        _plot_delay_share(datatotal['Arrival Delay'], "Delay Percentage (" + day_key + ")")
    else:
        _plot_delay_share(datatotal['Arrival Delay'], "Delay Percentage (all dataset)")

    if checked:
        _plot_delay_share(datatotal_month['Arrival Delay'],
                          "Delay Percentage (" + filter_date_month + ")")
        datatotal_month['day'] = datatotal_month['date'].apply(lambda x: int(x.split('-')[2]))
        daily_mean = datatotal_month.groupby(['day'])['Arrival Delay'].mean()
        st.header("Average Delay per day (" + filter_date_month + ")")
        fig3, ax3 = plt.subplots(figsize=(11, 7))
        ax3.plot(daily_mean.index, daily_mean)
        _show_figure(fig3)

    st.header("Heatmap about average delay per hour and station")
    if datatotal.empty:
        # seaborn cannot draw a heatmap from an empty table.
        st.info("No journeys available for this selection.")
    else:
        datatotal['hour'] = datatotal['Expected Arrival'].apply(lambda x: int(x.split(':')[0]))
        hourly = datatotal[['Arrival Delay', 'Final Station', 'hour']].groupby(
            ['Final Station', 'hour'], as_index=False).mean()
        # Only the first 23 (station, hour) rows are pivoted, as before.
        pivot = pd.pivot_table(hourly.head(23), values='Arrival Delay',
                               index='Final Station', columns='hour')
        fig, ax = plt.subplots(figsize=(10, 4))
        sns.heatmap(pivot, ax=ax)
        _show_figure(fig)

    st.header("# of trains from each station")
    st.image("number of trains from each departure station.JPG")
    st.header("# of trains departing from each station per week day")
    st.image("heatmap visual.JPG", width=1100)


def _read_zipped_csv(archive_path, member, columns):
    """Read the CSV *member* of the zip at *archive_path*, keeping *columns*.

    Both the archive and the member handle are closed before returning.
    """
    with zipfile.ZipFile(archive_path) as archive, archive.open(member) as handle:
        return pd.read_csv(handle, usecols=columns)


def _delay_minutes(cell):
    """Return the integer between the first '(' and the last character of *cell*.

    Returns 0 when *cell* contains no '('.
    """
    return int(cell.split('(')[1][:-1]) if '(' in cell else 0


def _load_journeys():
    """Load the departure and arrival exports and join them into one frame.

    Rows are matched on train number and date, with the arrival station of
    the arrivals file equal to the departure station of the departures
    file. The two raw "station (time)" columns are dropped after their
    parts have been split into separate columns.
    """
    data = _read_zipped_csv(
        'Mai-August_Departures-2.csv (1).zip', 'Mai-August_Departures-2.csv',
        ['Abfahrt', 'nach (Ankunft)', 'Abfahrtsbhf.', 'date', 'Zugnr.'])
    data['Final Station'] = data['nach (Ankunft)'].apply(lambda x: x.split(' (')[0])
    data['Expected Arrival'] = data['nach (Ankunft)'].apply(
        lambda x: x.split(' (an')[1].split(')')[0])
    # NOTE(review): "Arrival Delay" is parsed from the 'Abfahrt' column and
    # "Departure Delay" from 'Ankunft' - kept as in the original; confirm
    # the naming is intended.
    data['Arrival Delay'] = data['Abfahrt'].apply(_delay_minutes)

    data2 = _read_zipped_csv(
        'Mai-August_Arrivals-2.csv (1).zip', 'Mai-August_Arrivals-2.csv',
        ['Ankunft', 'Zugnr.', 'von (Abfahrt)', 'Ankunftsbhf.', 'date'])
    data2['Departure Station'] = data2['von (Abfahrt)'].apply(lambda x: x.split(' (')[0])
    data2['Expected Departure'] = data2['von (Abfahrt)'].apply(
        lambda x: x.split(' (ab')[1].split(')')[0])
    data2['Departure Delay'] = data2['Ankunft'].apply(_delay_minutes)

    merged = data2.merge(data, left_on=['Zugnr.', 'date', 'Ankunftsbhf.'],
                         right_on=['Zugnr.', 'date', 'Abfahrtsbhf.'])
    return merged.drop(['von (Abfahrt)', 'nach (Ankunft)'], axis=1)


def _show_figure(fig):
    """Send a matplotlib figure to Streamlit, then close it to free it."""
    st.pyplot(fig)
    plt.close(fig)


def _plot_delay_share(delays, title):
    """Write the header *title* and a pie chart of delayed vs. on-time rows.

    A row counts as delayed when its value in *delays* is greater than 0.
    """
    st.header(title)
    if delays.empty:
        st.info("No journeys available for this selection.")
        return
    # value_counts() orders by frequency, so pin the order to
    # (on time, delayed) before pairing the counts with the fixed labels.
    counts = (delays > 0).value_counts().reindex([False, True], fill_value=0)
    slices = [(count, label)
              for count, label in zip(counts, ['No Delay', 'Delay'])
              if count > 0]
    fig, ax = plt.subplots(figsize=(11, 7))
    ax.pie([count for count, _ in slices], labels=[label for _, label in slices],
           autopct='%1.1f%%', shadow=True, startangle=90)
    _show_figure(fig)
def prediction() -> None:
    """Render the "Prediction" page.

    Collects train type, day, weekday, departure / arrival time and the two
    stations from the user, assembles them into one feature row, and shows
    the output of the two pickled models ('model/LR2.pkl' with two outcomes,
    'model/LR3.pkl' with three).

    The feature order is: train-type one-hot, day, weekday, departure hour,
    departure minute, arrival hour, arrival minute, departure-station
    one-hot, final-station one-hot.
    NOTE(review): this order presumably mirrors the training columns -
    confirm against the training code before changing it.
    """
    st.header("Prediction")
    st.subheader("EX: Logistic Regression")

    train_types = ['EC', 'FLX', 'IC', 'ICE', 'NJ', 'OTHERS', 'TGV', 'THA']
    train = st.selectbox('train type', train_types)
    predict_data = _one_hot(train_types, train)

    # NOTE(review): 'day' accepts 0, which is not a calendar day - confirm.
    predict_data.append(st.number_input('day', 0, 31))
    predict_data.append(st.number_input('weekday', 0, 6))

    departure_time = st.time_input('Departure time')
    predict_data.append(departure_time.hour)
    predict_data.append(departure_time.minute)

    arrival_time = st.time_input('Arrival time')
    predict_data.append(arrival_time.hour)
    predict_data.append(arrival_time.minute)

    departure_stations = [
        'Augsburg Hbf', 'Berlin Gesundbrunnen', 'Berlin Hbf', 'Berlin Hbf (tief)',
        'Bremen Hbf', 'Dortmund Hbf', 'Dresden Hbf', 'Duisburg Hbf', 'Düsseldorf Hbf',
        'Erfurt Hbf', 'Essen Hbf', 'Frankfurt(Main)Hbf', 'Freiburg(Breisgau) Hbf',
        'Fulda', 'Göttingen', 'Hamburg Hbf', 'Hamburg-Altona', 'Hannover Hbf',
        'Heidelberg Hbf', 'Karlsruhe Hbf', 'Köln Hbf', 'Köln Messe/Deutz Gl.11-12',
        'Leipzig Hbf', 'Mainz Hbf', 'Mannheim Hbf', 'München Hbf', 'München Ost',
        'Münster(Westf)Hbf', 'Nürnberg Hbf', 'Stuttgart Hbf', 'Würzburg Hbf',
    ]
    departure = st.selectbox('departure station', departure_stations)
    predict_data += _one_hot(departure_stations, departure)

    final_stations = [
        'Aachen Hbf', 'Amsterdam Centraal', 'Basel SBB', 'Berchtesgaden Hbf',
        'Berlin Gesundbrunnen', 'Berlin Hbf', 'Berlin Ostbahnhof', 'Berlin Südkreuz',
        'Berlin-Spandau', 'Bochum Hbf', 'Bologna Centrale', 'Bonn Hbf',
        'Bonn-Bad Godesberg', 'Braunschweig Hbf', 'Bregenz', 'Bremen Hbf',
        'Bremerhaven-Lehe', 'Bruxelles Midi', 'Budapest-Keleti', 'Chemnitz Hbf',
        'Chur', 'Cottbus Hbf', 'Dortmund Hbf', 'Dresden Hbf', 'Duisburg Hbf',
        'Düsseldorf Hbf', 'Emden Außenhafen', 'Emden Hbf', 'Erfurt Hbf', 'Essen Hbf',
        'Flensburg', 'Frankfurt(M) Flughafen Fernbf', 'Frankfurt(Main)Hbf',
        'Frankfurt(Main)Süd', 'Frankfurt(Main)West', 'Freiburg(Breisgau) Hbf',
        'Friedberg(Hess)', 'Gera Hbf', 'Graz Hbf', 'Greifswald', 'Hamburg Dammtor',
        'Hamburg Hbf', 'Hamburg-Altona', 'Hamburg-Harburg', 'Hannover Hbf',
        'Heidelberg Hbf', 'Innsbruck Hbf', 'Interlaken Ost', 'Jena Paradies',
        'Karlsruhe Hbf', 'Kassel-Wilhelmshöhe', 'Kiel Hbf', 'Klagenfurt Hbf',
        'Koblenz Hbf', 'Konstanz', 'Köln Hbf', 'Köln Messe/Deutz Gl.11-12',
        'Leipzig Hbf', 'Lübeck Hbf', 'Magdeburg Hbf', 'Mannheim Hbf',
        'Marseille-St-Charles', 'Milano Porta Garibaldi', 'Mönchengladbach Hbf',
        'München Hbf', 'Münster(Westf)Hbf', 'Norddeich', 'Norddeich Mole',
        'Nürnberg Hbf', 'Oberstdorf', 'Offenburg', 'Oldenburg(Oldb)',
        'Oldenburg(Oldb)Hbf', 'Ostseebad Binz', 'Paris Est', 'Paris Nord',
        'Passau Hbf', 'Praha hl.n.', 'Rostock Hbf', 'Saarbrücken Hbf', 'Salzburg Hbf',
        'Siegen', 'Singen(Hohentwiel)', 'Stralsund Hbf', 'Stuttgart Hbf',
        'Tübingen Hbf', 'Ulm Hbf', 'Verona Porta Nuova', 'Warnemünde',
        'Westerland(Sylt)', 'Wien Hbf', 'Wiesbaden Hbf', 'Wiesloch-Walldorf',
        'Zürich HB',
    ]
    final = st.selectbox('final station', final_stations)
    predict_data += _one_hot(final_stations, final)

    # First model: the code treats class 1 as "late", anything else as on time.
    binary_model = _load_pickled_model('model/LR2.pkl')
    binary_class = binary_model.predict([predict_data])[0]
    binary_proba = binary_model.predict_proba([predict_data])[0]
    if binary_class == 1:
        st.success("The train will arrive late %" + _percent_text(binary_proba[1]))
    else:
        st.success("The train will arrive in time %" + _percent_text(binary_proba[0]))

    # Second model: class 0 = on time, 1 = short delay, otherwise long delay.
    multi_model = _load_pickled_model('model/LR3.pkl')
    multi_class = multi_model.predict([predict_data])[0]
    multi_proba = multi_model.predict_proba([predict_data])[0]
    if multi_class == 0:
        # This branch previously referenced an undefined name and raised
        # NameError; it now reads the probabilities computed just above.
        st.success("The train will arrive in time %" + _percent_text(multi_proba[0]))
    elif multi_class == 1:
        st.success("The train delay will be short (between 0 and 5 minutes) %"
                   + _percent_text(multi_proba[1]))
    else:
        st.success("The train delay will be long (more than 5 minutes) %"
                   + _percent_text(multi_proba[2]))


def _one_hot(options, choice):
    """Return a 0/1 list as long as *options* with a 1 at the index of *choice*."""
    encoded = [0] * len(options)
    encoded[options.index(choice)] = 1
    return encoded


def _load_pickled_model(path):
    """Unpickle and return the object stored at *path*, closing the file.

    SECURITY: unpickling runs code embedded in the file. The paths used here
    are fixed project files; never point this at user-supplied data.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


def _percent_text(probability):
    """Return *probability* times 100 as text cut to its first five characters."""
    return str(probability * 100)[:5]
def performance():
    """Render the "Model Performance" page.

    Writes the workflow description followed by the hard-coded accuracy and
    F1 figures for each listed model, grouped by classification task.
    """
    st.header("Data Preprocessing")
    st.header("ML Model Workflow")

    workflow_description = """To begin our analysis, we first extracted data from the Zugfinder website and saved it as a CSV file. Next, we loaded this file into a Pandas dataframe for further processing.
For the train types, we transformed the categorical data into dummy variables. We also converted the departure and arrival station columns into dummy variables, which allowed us to include this information in our analysis.
To make the date column more useful, we converted it into a datetime format. We then extracted the hour and minute of departure and arrival times from the "Abfahrt" column, and calculated the total delay based on the "Ankunft" column.
To prepare our data for modeling, we split the dataset into a training set (70%) and a testing set (30%).
Using the training data, we trained several classification models to predict whether there would be a delay or not, as well as whether the delay would be short or long. The models we used included gradient boosting, random forest, K-nearest neighbors, logistic regression, and support vector classification.
To evaluate our models, we used a variety of metrics, including accuracy, F1 score, and classification reports. We also generated confusion matrices to visualize the performance of our models.
Overall, these data preprocessing steps allowed us to clean and transform our raw data into a format that was suitable for machine learning analysis."""
    st.write(workflow_description)

    st.subheader("Model Performance")

    # (task heading, ((model label, accuracy line, F1 line), ...)) in display order.
    # NOTE(review): "Gradent Boosting:" is misspelled in the displayed label;
    # it is kept verbatim here because it is user-facing text.
    scoreboards = (
        ("Binary Classification", (
            ("Logistic Regression:", "Accuracy: 77%", "F1 Score: 69%"),
            ("Random Forest:", "Accuracy: 74%", "F1 Score: 73%"),
            ("Gradent Boosting:", "Accuracy: 77%", "F1 Score: 67%"),
            ("Support Vector Machine:", "Accuracy: 77%", "F1 Score: 67%"),
        )),
        ("Multiclass Classification", (
            ("Logistic Regression:", "Accuracy: 49%", "F1 Score: 42%"),
            ("Random Forest:", "Accuracy: 51%", "F1 Score: 50%"),
            ("Gradent Boosting:", "Accuracy: 50%", "F1 Score: 40%"),
            ("Support Vector Machine:", "Accuracy: 47%", "F1 Score: 31%"),
        )),
    )
    for task_heading, model_rows in scoreboards:
        st.subheader(task_heading)
        for model_label, accuracy_line, f1_line in model_rows:
            st.success(model_label)
            st.write(accuracy_line)
            st.write(f1_line)
# Sidebar navigation: page title -> function that renders that page.
page_names_to_funcs = {
    "Data Visualization": data_visualization,
    "Prediction": prediction,
    "Model Performance": performance,
}

# Let the user pick a page in the sidebar, then call the matching function.
demo_name = st.sidebar.selectbox("Choose the App", page_names_to_funcs.keys())
render_selected_page = page_names_to_funcs[demo_name]
render_selected_page()