-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv_funct.py
79 lines (72 loc) · 3.56 KB
/
csv_funct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import pandas as pd
import numpy as np
class csv():
    """Helpers for cleaning weather / rail-service dataframes and merging CSV files.

    Every helper is a static method; call it on the class, e.g.
    ``csv.format_services(df)``. None of the helpers modify the dataframe
    that is passed in -- each returns a new dataframe.
    """

    # Weather columns that are discarded by format_weather.
    _UNUSED_WEATHER_COLUMNS = [
        'ELEVATION', 'MDSF', 'AWND', 'SNWD',
        'WESD', 'WESF', 'WT01', 'WT02', 'WT03',
        'WT04', 'WT05', 'WT06', 'WT07', 'WT08',
        'WT09', 'WT11',
    ]

    # Service columns that are discarded by format_services when present.
    _UNUSED_SERVICE_COLUMNS = ['status', 'type', 'stop_sequence']

    # Target dtypes for the service columns that are kept.
    _SERVICE_DTYPES = {
        'date': 'datetime64[ns]',
        'train_id': 'category',
        'from': 'category',
        'to': 'category',
        'scheduled_time': 'datetime64[ns]',
        'actual_time': 'datetime64[ns]',
        'delay_minutes': 'float16',
        'line': 'category',
    }

    @staticmethod
    def format_weather(current_df, month=""):
        """Remove unused columns from a weather dataframe and convert dtypes.

        Args:
            current_df: Weather dataframe. Must contain 'STATION' and 'DATE'
                columns; 'DATE' must be convertible to ``datetime64[ns]``.
            month: Optional full month name as produced by
                ``Series.dt.month_name()`` (e.g. ``"February"``). When given,
                only rows whose date falls in that month are returned.

        Returns:
            A new dataframe with 'STATION' as category, the 'DATE' column
            converted to datetime and renamed to 'date', and the unused
            weather columns removed.

        Raises:
            KeyError: If 'STATION' or 'DATE' is missing from ``current_df``.
        """
        print("Formatting weather dataframe...")
        print("Changing datatypes...")
        new_df = current_df.astype({'STATION': 'category',
                                    'DATE': 'datetime64[ns]'})
        print("Dropping columns...")
        # errors='ignore': not every weather export carries every one of these
        # columns, and a missing unused column is not a reason to fail.
        new_df = new_df.drop(columns=csv._UNUSED_WEATHER_COLUMNS,
                             errors='ignore')
        new_df = new_df.rename(columns={'DATE': 'date'})
        print("Done formatting weather dataframe")
        if month == "":
            return new_df
        # The column has been renamed above, so filter on 'date', not 'DATE'.
        return new_df.loc[new_df['date'].dt.month_name() == month]

    @staticmethod
    def format_services(current_df):
        """Clean up a services dataframe.

        The 'status' column is dropped because it is not used and lateness
        for Amtrak is not tracked in the CSV file. 'Meadowlands Rail' is a
        special service, so its rows are dropped as well. The 'type' and
        'stop_sequence' columns are dropped when present; 'from_id' and
        'to_id' are dropped only when both are present.

        Args:
            current_df: Services dataframe. All handled columns are optional.

        Returns:
            A new dataframe with the unused columns removed, the remaining
            known columns converted to their target dtypes, and every row
            containing a missing value removed.
        """
        print("Dropping columns...")
        if 'line' in current_df:
            current_df = current_df[current_df['line'] != 'Meadowlands Rail']
        to_drop = [col for col in csv._UNUSED_SERVICE_COLUMNS
                   if col in current_df]
        if 'from_id' in current_df and 'to_id' in current_df:
            to_drop += ['from_id', 'to_id']
        # Non-inplace drop: leaves the caller's dataframe untouched and avoids
        # modifying a filtered slice in place.
        trimmed = current_df.drop(columns=to_drop)
        print("Changing datatypes...")
        # astype() raises KeyError for mapping keys that are not columns (even
        # with errors='ignore'), so only convert the columns actually present.
        dtypes = {col: dtype for col, dtype in csv._SERVICE_DTYPES.items()
                  if col in trimmed}
        new_df = trimmed.astype(dtypes, errors='ignore')
        new_df = new_df.dropna(how='any')
        print("Done formatting dataframe")
        return new_df

    @staticmethod
    def combine_csvs(directory, output_path='out.zip',
                     archive_name='all_services.csv'):
        """Combine all CSV files in a directory into a single dataframe.

        The combined dataframe is cleaned with ``format_services`` and also
        exported as a zip-compressed CSV file.

        Args:
            directory: Directory whose ``*.csv`` files are combined.
            output_path: Path of the zip archive to write.
            archive_name: Name of the CSV file inside the zip archive.

        Returns:
            The combined, formatted dataframe.

        Raises:
            ValueError: If ``directory`` contains no ``.csv`` files.
        """
        # Sorted so that the row order of the result does not depend on the
        # arbitrary order in which os.listdir returns entries.
        csv_files = sorted(f for f in os.listdir(directory)
                           if f.endswith('.csv'))
        if not csv_files:
            raise ValueError(f"No CSV files found in {directory!r}")
        print("Combining CSV files....")
        dfs = [pd.read_csv(os.path.join(directory, name))
               for name in csv_files]
        combined_df = pd.concat(dfs, ignore_index=True)
        print("Formatting new CSV file...")
        combined_df = csv.format_services(combined_df)
        # make a new CSV for the dataframe
        print("Exporting....")
        compression_opts = dict(method='zip', archive_name=archive_name)
        combined_df.to_csv(output_path, index=False,
                           compression=compression_opts)
        print("CSV files successfully combined and exported.")
        return combined_df