un_press_releases.py
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup


def check_un_link(url):
    # Returns (title, content) if the page is an English press release whose
    # title or body mentions "crisis"; otherwise returns None.
    try:
        response = urllib.request.urlopen(url)
    except (urllib.error.URLError, ValueError):
        # Skip unreachable or malformed URLs instead of crashing the crawl.
        return None
    soup = BeautifulSoup(response, features="lxml")
    press_release_link = soup.find(
        'a', href=True, hreflang='en', string="Press Release")
    if press_release_link:
        title_div = soup.find(
            'div', class_="field field--name-field-display-title field--type-string field--label-hidden field__item")
        content_div = soup.find(
            'div', class_="field field--name-body field--type-text-with-summary field--label-hidden field__item")
        # Guard against pages that carry the tag link but not the expected markup.
        if title_div and content_div:
            title = title_div.text.strip()
            content = content_div.text.strip()
            if "crisis" in title or "crisis" in content:
                return title, content
    return None


def generate_txt(title, content, num):
    # Writes one press release to a numbered file such as 1_3.txt.
    with open(f"1_{num}.txt", "w") as f:
        f.write(title + "\n" + content)


def get_un_press_releases():
    # Follows up to three layers of links from the seed URL, collecting at
    # most ten matching press releases keyed by title.
    seed_url = "https://press.un.org/en"
    press_releases = {}
    response = urllib.request.urlopen(seed_url)
    soup = BeautifulSoup(response, features="lxml")
    # Layer 1: internal English-language links on the home page.
    home_page_links = {link['href'] for link in soup.find_all(
        'a', href=True) if link['href'].startswith('/en')}
    for home_link in home_page_links:
        page_url = urllib.parse.urljoin(seed_url, home_link)
        res = check_un_link(page_url)
        if res and res[0] not in press_releases:
            press_releases[res[0]] = res[1]
            generate_txt(res[0], res[1], len(press_releases))
            if len(press_releases) == 10:
                return press_releases
        try:
            response = urllib.request.urlopen(page_url)
        except urllib.error.URLError:
            continue
        soup = BeautifulSoup(response, features="lxml")
        # Layer 2: internal links found on each first-layer page.
        page_links = {link['href'] for link in soup.find_all(
            'a', href=True) if link['href'].startswith('/en')}
        for page_link in page_links:
            subpage_url = urllib.parse.urljoin(seed_url, page_link)
            res = check_un_link(subpage_url)
            if res and res[0] not in press_releases:
                press_releases[res[0]] = res[1]
                generate_txt(res[0], res[1], len(press_releases))
                if len(press_releases) == 10:
                    return press_releases
            try:
                response = urllib.request.urlopen(subpage_url)
            except urllib.error.URLError:
                continue
            soup = BeautifulSoup(response, features="lxml")
            # Layer 3: internal links found on each second-layer page.
            subpage_links = {link['href'] for link in soup.find_all(
                'a', href=True) if link['href'].startswith('/en')}
            for subpage_link in subpage_links:
                subsubpage_url = urllib.parse.urljoin(seed_url, subpage_link)
                res = check_un_link(subsubpage_url)
                if res and res[0] not in press_releases:
                    press_releases[res[0]] = res[1]
                    generate_txt(res[0], res[1], len(press_releases))
                    if len(press_releases) == 10:
                        return press_releases
    return press_releases


if __name__ == "__main__":
    result = get_un_press_releases()
    print(result)
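
To spot-check the matcher against a single page without running the full crawl, check_un_link can be called directly. This is a minimal sketch; the article URL below is illustrative only, not taken from the source.

# Hypothetical spot-check of one article page (the URL is a placeholder).
from un_press_releases import check_un_link

res = check_un_link("https://press.un.org/en/content/example-press-release")
if res:
    title, content = res
    print(title)
else:
    print("No matching press release at that URL")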