forked from kolasniwash/short-term-energy-demand-forecasting
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_weather_data.py
98 lines (67 loc) · 2.82 KB
/
make_weather_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# coding: utf-8
# # Data Cleaning: Weather
#
#
# #### Data Source:
# Weather data for the five largest cities in Spain was obtained from OpenWeatherMap. The data contains hourly information on temperature, pressure, rainfall, cloud index, and weather description.
#
# #### Summary of cleaning actions:
# - Add names to the cities
# - Drop columns that contain no data
# - Convert timestamps to datetimes and set a datetime index
# - In columns with partial data, assume NaNs are zero values.
# - Set elements to lower case and remove special characters in categorical columns
#
#
# #### Function list:
# 1. get_weather_data - reads the weather CSV export and returns a cleaned set for the Spain top 5 cities data
import pandas as pd
#import data
def clean_weather_data(data):
    """Clean an hourly bulk weather export from OpenWeatherMap.

    The input frame is not modified; every change is made on a new frame.

    Args:
        data: DataFrame holding the raw export. It must contain ``city_id``,
            ``dt`` (Unix timestamps in seconds), every column named in
            ``unused_cols`` below, and ``rain_1h``, ``rain_3h``, ``snow_3h``.

    Returns:
        A new DataFrame with the unused columns removed, a ``city_name``
        column added, ``dt`` rewritten as Europe/Madrid local-time strings,
        a DatetimeIndex built from those strings, and missing values in the
        precipitation columns replaced by zero.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # NOTE(review): ' Barcelona' carries a leading space in the original
    # mapping. It is kept byte-for-byte because downstream code may match on
    # it -- confirm before normalising.
    city_codes = {
        3128760: ' Barcelona',
        3117735: 'Madrid',
        3128026: 'Bilbao',
        2509954: 'Valencia',
        6361046: 'Seville',
    }

    # Columns removed from the export (described in the file header as
    # columns that contain no data).
    unused_cols = [
        'lat',
        'lon',
        'sea_level',
        'grnd_level',
        'rain_24h',
        'snow_today',
        'rain_today',
        'snow_1h',
        'snow_24h',
    ]

    # drop() returns a new frame, so doing it first means none of the
    # assignments below leak back into the caller's DataFrame.
    cleaned = data.drop(columns=unused_cols)

    # add city names; ids that are not in the mapping are left unchanged
    cleaned['city_name'] = cleaned['city_id'].replace(city_codes)

    # convert the Unix timestamps to datetimes, then to the local time zone
    times = pd.to_datetime(cleaned['dt'], unit='s', origin='unix')
    local_times = times.dt.tz_localize('UTC').dt.tz_convert('Europe/Madrid')

    # The strings carry no UTC offset, so the resulting index is naive local
    # wall-clock time.
    # NOTE(review): the repeated hour at the autumn DST change will therefore
    # yield duplicate index labels -- confirm this is acceptable downstream.
    cleaned['dt'] = local_times.dt.strftime('%Y-%m-%d %H:%M:%S')
    cleaned = cleaned.set_index(pd.DatetimeIndex(cleaned['dt']))

    # replace null values with zeros in columns with relevant information
    nul_cols = ['rain_1h', 'rain_3h', 'snow_3h']
    cleaned[nul_cols] = cleaned[nul_cols].fillna(0)

    return cleaned
def clean_descrption_cols(data):
    """Normalise the two weather description columns of ``data`` in place.

    ``weather_main`` and ``weather_description`` are converted to lower case.
    In ``weather_description`` each of the characters ``, / : ; -`` is then
    replaced by a single space.

    Args:
        data: DataFrame containing string columns ``weather_main`` and
            ``weather_description``.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    # Compute both lowered columns before assigning either one, so a missing
    # column raises before anything has been modified.
    text_cols = ('weather_main', 'weather_description')
    lowered = {col: data[col].str.lower() for col in text_cols}
    for col, values in lowered.items():
        data[col] = values

    # One character class covers every separator; the trailing '-' inside
    # the brackets is a literal hyphen, not a range.
    data['weather_description'] = data['weather_description'].str.replace(
        r'[,/:;-]', ' ', regex=True
    )
    return data
def get_weather_data(path='./data/weather/spain-weather-2013-2019.csv'):
    """Load the weather CSV at ``path`` and return it cleaned.

    The file is read with ``pd.read_csv`` and passed through
    ``clean_weather_data`` followed by ``clean_descrption_cols``.

    Args:
        path: location of the CSV export; defaults to the Spain 2013-2019
            file under ``./data/weather``.

    Returns:
        The cleaned weather DataFrame.
    """
    raw = pd.read_csv(path)
    return clean_descrption_cols(clean_weather_data(raw))