scrape_mars.py
def scrape():
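    """Scrape Mars-related data and return it in a single dict.

    The returned dict contains: news_date, news_title, news_p,
    featured_image_url, mars_weather, facts_table and
    hemisphere_image_urls.
    """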
    # Dependencies
    from splinter import Browser
    from selenium.webdriver.chrome.options import Options
    import pandas as pd
    import os
    import bs4 as bs

    # set up the resulting dict
    results = {}

    CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH')
    print(f'\n######## CHROMEDRIVER_PATH: {CHROMEDRIVER_PATH}\n')
    GOOGLE_CHROME_BIN = os.environ.get('GOOGLE_CHROME_BIN')
    print(f'\n######## GOOGLE_CHROME_BIN: {GOOGLE_CHROME_BIN}\n')
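    # (assumption: these environment variables are supplied by the deployment
    #  environment, e.g. Heroku's chromedriver and google-chrome buildpacks;
    #  locally you can export them to point at your own chromedriver binary
    #  and Chrome install)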
    #--------------------------------------------
    # SCRAPE LATEST NEWS
    #--------------------------------------------
    # set options for the chrome driver
    chrome_options = Options()
    chrome_options.binary_location = GOOGLE_CHROME_BIN
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')

    # set path to the driver
    executable_path = {'executable_path': CHROMEDRIVER_PATH}

    # launch splinter browser object
    browser = Browser('chrome', **executable_path, chrome_options=chrome_options)

    # URL of NASA's Mars news page
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

    # visit the page
    browser.visit(url)
    # wait until the page has loaded
    browser.is_element_present_by_name('list_date', wait_time=10)

    # create BeautifulSoup object
    soup = bs.BeautifulSoup(browser.html, 'lxml')

    # find the latest news date, title and body (first appearance of each tag in the html)
    news_date = soup.find('div', class_='list_date').text
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text

    # save to results
    results['news_date'] = news_date
    results['news_title'] = news_title
    results['news_p'] = news_p

    # print to console
    print(f'Scraped: {news_title} as of {news_date}')
    print(f'Results length: {len(results)}')
    #--------------------------------------------
    # SCRAPE FEATURED IMAGE
    #--------------------------------------------
    # URL of the JPL Mars images page
    url = 'https://www.jpl.nasa.gov/'
    url1 = 'spaceimages/?search=&category=Mars'

    # visit the page
    browser.visit(url + url1)
    # wait until the page has loaded
    browser.is_element_present_by_tag('article', wait_time=3)

    # create BeautifulSoup object
    soup = bs.BeautifulSoup(browser.html, 'html.parser')

    # remove the trailing '/' from the base url,
    # locate the 'style' attribute of the 'article' tag,
    # split it by single quotes and take the relative image path,
    # then combine the two parts to get the full image url
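    # (assumption: the style attribute looks roughly like
    #  "background-image: url('/spaceimages/images/wallpaper/....jpg');",
    #  so splitting on single quotes yields the relative path at index 1)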
    featured_image_url = url[:-1] + soup.find('article', class_='carousel_item')['style'].split("'")[1]
    results['featured_image_url'] = featured_image_url

    # print to console
    print(f'Scraped: {featured_image_url}')
    print(f'Results length: {len(results)}')
    #--------------------------------------------
    # SCRAPE MARS'S WEATHER
    #--------------------------------------------
    # twitter url to visit
    url = 'https://twitter.com/marswxreport?lang=en'

    # visit the page
    browser.visit(url)
    # wait until the page has loaded
    browser.is_element_present_by_tag('div', wait_time=3)

    # create BeautifulSoup object
    soup = bs.BeautifulSoup(browser.html, 'html.parser')

    # locate the first tweet and extract its text
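    # (assumption: with the old Twitter markup, the container text starts with a
    #  newline, so element [1] of the split is the tweet body itself)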
    mars_weather = soup.find('div', class_='js-tweet-text-container').text.split('\n')[1]
    results['mars_weather'] = mars_weather

    # print to console
    print(f'Scraped: {mars_weather}')
    print(f'Results length: {len(results)}')
    #--------------------------------------------
    # SCRAPE MARS'S FACTS
    #--------------------------------------------
    # url to visit
    url = 'http://space-facts.com/mars/'

    # use pandas to extract the tables from the page
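    # (pd.read_html returns a list of DataFrames, one per html table found on the
    #  page; the Mars facts table is assumed to be the first one)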
    facts_table = pd.read_html(url)

    # create a df from the first table and name its columns
    df = facts_table[0]
    df.columns = ['Parameter', 'Value']

    # convert to an html table
    facts_table = df.to_html()
    results['facts_table'] = facts_table

    # print to console
    print(f'Scraped: {facts_table[:20]}..')
    print(f'Results length: {len(results)}')
    #--------------------------------------------
    # SCRAPE HEMISPHERES' IMAGES
    #--------------------------------------------
    # url to visit
    url = 'https://astrogeology.usgs.gov'
    url1 = '/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

    # visit the page
    browser.visit(url + url1)
    # wait until the page has loaded
    browser.is_element_present_by_tag('div', wait_time=3)

    # create BeautifulSoup object
    soup = bs.BeautifulSoup(browser.html, 'html.parser')

    # find the description sections with links to the hemisphere pages
    div = soup.find('div', class_='results').findAll('div', class_='description')

    # extract the relative urls
    urls = []
    for item in div:
        links = item.findAll('a')
        for a in links:
            urls.append(a.get('href'))

    # visit each url to extract image links and titles
    hemisphere_image_urls = []
    for u in urls:
        result_dict = {}
        browser.visit(url + u)
        # wait until the page has loaded
        browser.is_element_present_by_tag('div', wait_time=3)
        soup = bs.BeautifulSoup(browser.html, 'html.parser')
        # find and extract links to the images
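        # (assumption: the 'downloads' div holds two anchors, a jpeg sample
        #  followed by the full-size original image)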
        links = soup.find('div', class_='downloads').findAll('a')
        result_dict['img_url_jpeg'] = links[0].get('href')
        result_dict['img_url'] = links[1].get('href')
        # extract the title
        result_dict['title'] = soup.find('h2', class_='title').text
        # append the dict to the results list
        hemisphere_image_urls.append(result_dict)

    results['hemisphere_image_urls'] = hemisphere_image_urls

    # print to console
    print('Scraped: hemisphere_image_urls')
    print(f'Results length: {len(results)}')
    browser.quit()

    return results
# print(scrape())
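

# a minimal sketch for running the scraper as a standalone script
# (assumes the chromedriver environment variables above are set)
if __name__ == '__main__':
    scraped = scrape()
    for key, value in scraped.items():
        print(key, str(value)[:60])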