-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathGet_htmls.py
99 lines (70 loc) · 2.69 KB
/
Get_htmls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# coding: utf-8
# In[24]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import timedelta
import pandas as pd
import urllib3
import pickle
# In[5]:
def simple_get(url, timeout=10):
    """
    Attempt an HTTP GET request for `url`.

    Returns the raw response body (bytes) when the response looks like
    HTML/XML, otherwise None.  Network errors are logged and swallowed
    so one failed URL does not abort a whole scraping run.

    :param url: URL to fetch.
    :param timeout: seconds to wait for the server.  New parameter with
        a default, so existing callers are unaffected; previously the
        request had no timeout and could hang indefinitely.
    """
    try:
        # `stream=True` defers the body download; `closing` guarantees
        # the connection is released on every exit path.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Return True when `resp` looks like a successful HTML response.

    :param resp: a requests.Response-like object exposing `status_code`
        and a `headers` mapping.
    """
    # .get() with a default avoids a KeyError when the server omits the
    # Content-Type header.  (The original indexed the header directly and
    # then tested the result for None — a check that could never fire,
    # since .lower() would already have raised on a missing header.)
    content_type = resp.headers.get('Content-Type', '').lower()
    return resp.status_code == 200 and 'html' in content_type
def log_error(e):
    """
    Report a scraping error.

    For now "logging" just means printing to stdout; replace the body
    with a real logging call if persistent error records are needed.

    :param e: the error message (or object) to report.
    """
    print(e)
# In[6]:
# Cities to scrape and their wetter.de forecast base URLs.  The two
# lists are parallel: index i of `cities` matches index i of `base_url`.
cities = [
    'Berlin',
    'Hamburg',
    'Munich',
    'Cologne',
    'Frankfurt_am_Main',
]
base_url = [
    'https://www.wetter.de/deutschland/wetter-berlin-18228265/',
    'https://www.wetter.de/deutschland/wetter-hamburg-18219464/',
    'https://www.wetter.de/deutschland/wetter-muenchen-18225562/',
    'https://www.wetter.de/deutschland/wetter-koeln-18220679/',
    'https://www.wetter.de/deutschland/wetter-frankfurt-18221009/',
]
# In[18]:
def collect_htmls(city_base_url):
    """
    Download every forecast page for one city from wetter.de.

    The site exposes the first eight forecast pages under named slugs
    ('wetterbericht-aktuell' ... 'wetter-vorschau') and the remaining
    days under generic 'tag-9' ... 'tag-15' slugs.

    :param city_base_url: the city's forecast base URL (ends with '/').
    :return: list of raw HTML bytes — or None for each page that failed
        to download — one entry per page slug, in slug order.
    """
    days_to_predict = 15
    page_tags = [
        'wetterbericht-aktuell', 'wetterbericht-morgen',
        'wetterbericht-uebermorgen', 'wetter-bericht',
        'wettervorhersage', 'wetter-vorhersage',
        'wettervorschau', 'wetter-vorschau',
    ]
    # Days 9..days_to_predict live under numbered 'tag-N' slugs.
    page_tags.extend('tag-' + str(day) for day in range(9, days_to_predict + 1))
    # NOTE(review): dropped the unused urllib3.PoolManager() the original
    # constructed here — all fetching goes through simple_get.
    return [simple_get(city_base_url + tag + '.html') for tag in page_tags]
# In[26]:
# Scrape every configured city and persist the raw HTML plus metadata
# as one pickle per city per day:
#   ./wetter_de/wetter_de_<city>_<YYYY-MM-DD>.pkl
for i, city in enumerate(cities):
    current_time = pd.Timestamp(datetime.now())
    html_dict = {
        'website': 'www.wetter.de',
        'city': city,
        # key spelling ('aquisition') kept as-is: existing pickles and
        # downstream readers depend on it.
        'date_of_aquisition': current_time,
        'htmls': collect_htmls(base_url[i]),
    }
    # str(Timestamp)[:10] yields the 'YYYY-MM-DD' date prefix.
    pkl_name = './wetter_de/wetter_de_' + city + '_' + str(current_time)[:10] + '.pkl'
    # Context manager closes the file even if pickling raises
    # (the original left the handle open on error).
    with open(pkl_name, 'wb') as f:
        pickle.dump(html_dict, f)