-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathberlin.py
165 lines (132 loc) · 4.7 KB
/
berlin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import time
from .base import *
# German month names mapped to their 1-based month number.
# Used to translate the textual month found in calendar headings
# (e.g. "Januar 2021") into a numeric month.
MONTH_MAPPING = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "Januar",
            "Februar",
            "März",
            "April",
            "Mai",
            "Juni",
            "Juli",
            "August",
            "September",
            "Oktober",
            "November",
            "Dezember",
        ),
        start=1,
    )
}
class Berlin(SourceBase):
    """
    NOT IMPLEMENTED

    Unfinished scraper for the appointment pages of service.berlin.de.

    The Berlin scraper was started just to find that the site has
    a couple of protections including captchas.
    So this is not finished and not imported by __init__.py
    """
    ID = "berlin"
    NAME = "Stadt Berlin"
    BASE_URL = "https://service.berlin.de"

    @classmethod
    def index_url(cls) -> str:
        """Return the URL of the appointment ("terminvereinbarung") entry page."""
        return f"{cls.BASE_URL}/terminvereinbarung/"

    def make_snapshot(self):
        """
        Collect the calendar days of every location.

        :return: list of dicts ``{"loc": <location dict>, "dates": <list of days>}``,
            one per location returned by ``get_locations()``.
        """
        locations = self.get_locations()

        ret_data = []
        for loc in locations:
            # pause between locations; presumably to avoid the site's
            # "Zu viele Zugriffe" throttling page
            time.sleep(1)
            ret_data.append({
                "loc": loc,
                "dates": self.get_free_dates(loc["url"]),
            })
        return ret_data

    def get_locations(self) -> List:
        """
        Scrape the list of locations from the "buergeraemter" overview page.

        :return: list of dicts with the keys ``district``, ``name`` and
            ``url`` (absolute URL of the location page).
        """
        soup = self.get_html_soup(f"{self.BASE_URL}/standorte/buergeraemter/")

        locations = []
        for div in soup.find_all("div", {"class": "ort-group"}):
            district = div.find("h2").text.strip()
            # the heading text may carry a trailing "nach oben" link label
            if district.endswith("nach oben"):
                district = district[:-len("nach oben")].strip()

            for a in div.find_all("a"):
                href = a.get("href")
                # only links to location pages are of interest
                if href and href.startswith("/standort/"):
                    locations.append({
                        "district": district,
                        "name": a.text.strip(),
                        "url": f"{self.BASE_URL}{href}",
                    })
        return locations

    def get_free_dates(self, location_url: str) -> List:
        """
        Scrape the calendar days offered for one location.

        The first service checkbox of the location's ``termin_form`` is
        selected and the form is submitted to reach the calendar page.

        :param location_url: absolute URL of a location page, as returned
            in the ``url`` field by ``get_locations()``.
        :return: list of dicts with the keys ``date`` (``datetime.date``),
            ``class`` (CSS classes of the calendar cell), and, when the
            cell contains a link, ``url`` and ``times``.
            An empty list is returned if the location is closed, the page
            does not have the expected form, or the request was throttled.
        """
        soup = self.get_html_soup(location_url)

        alert = soup.find("div", {"class": "alert"})
        if alert and "bis auf Weiteres geschlossen" in alert.text:
            print("closed")
            return []

        # -- pick one service --

        form = soup.find("form", {"id": "termin_form"})
        if not form:
            print("no form")
            return []

        cb = form.find("input", {"type": "checkbox", "name": "anliegen[]"})
        if not cb:
            print("no checkbox")
            return []

        action = form.get("action")
        if not action:
            # without a target there is nothing to submit the form to
            print("no form action")
            return []
        # drop any query string, the parameters are sent separately
        action_url = action.split("?")[0]

        # carry over all hidden fields and tick the first service checkbox
        query = {
            i.get("name"): i.get("value")
            for i in form.find_all("input", {"type": "hidden"})
        }
        query.update({
            "termin": 1,
            cb.get("name"): cb.get("value"),
        })

        days = []

        time.sleep(1)
        soup = self.get_html_soup(action_url, data=query)
        soup_str = str(soup)

        # NOTE: the message is matched literally against the page text,
        # so its spelling must not be "corrected" here
        if "Ihre Auswahl von Standort und Diensteistung hat sich geändert." in soup_str:
            restart_url = None
            for a in soup.find_all("a"):
                href = a.get("href")
                if href and href.startswith("/terminvereinbarung/termin/restart/?"):
                    restart_url = href

            if not restart_url:
                print("NO RESTART LINK FOUND")
                return []

            time.sleep(1)
            soup = self.get_html_soup(f"{self.BASE_URL}{restart_url}")
            soup_str = str(soup)

        if "<h1>Zu viele Zugriffe</h1>" in soup_str:
            print("THROTTLED")
            return []

        for div in soup.find_all("div", {"class": "calendar-month-table"}):
            # heading is expected to look like "<month name> <year>"
            month_title = div.find("th", {"class": "month"}).text.strip().split()
            month, year = MONTH_MAPPING[month_title[0]], int(month_title[1])

            for td in div.find_all("td"):
                css_classes = td.get("class") or []
                if "buchbar" in css_classes or "nichtbuchbar" in css_classes:
                    days.append({
                        # int() accepts leading zeros like "05"
                        "date": datetime.date(year, month, int(td.text.strip())),
                        "class": css_classes,
                    })
                    a = td.find("a")
                    if a:
                        days[-1]["url"] = a.get("href")

        if not soup.find("div", {"class": "calendar-month-table"}):
            # unexpected page: dump it for inspection
            print(soup)

        for day in days:
            if day.get("url"):
                day["times"] = self.get_free_day_times(
                    f"{self.BASE_URL}{day['url']}"
                )

        return days

    def get_free_day_times(self, url: str) -> List:
        """
        Fetch the page of a single calendar day.

        Parsing is not implemented: the page is only printed for
        inspection and an empty list is returned.

        :param url: absolute URL of the day's page.
        :return: always an empty list.
        """
        time.sleep(1)
        soup = self.get_html_soup(url)
        print(soup)
        return []