-
Notifications
You must be signed in to change notification settings - Fork 0
/
webscraper.py
39 lines (32 loc) · 1.31 KB
/
webscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#==========================================================================================
#Name : webscraper.py
#Author : Cole Dapprich
#Version : 1.0
#Course : CSCE 4430.001
#Description : This Python script uses the BeautifulSoup4 and requests libraries to print
# the 10 most recent top stories from http://www.technewsworld.com/ in the
# following format: rank - date - title - URL.
#Copyright : Copyright 2016 CDSoftworks ( AMDG )
#==========================================================================================
import requests
import bs4
#open webpage for scraping
#the timeout keeps the script from hanging forever on an unresponsive server, and
#raise_for_status() stops an HTTP error page from being parsed as if it were the front page
response = requests.get('http://www.technewsworld.com/', timeout=10)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, "html.parser")

#find top 10 stories; each 'title' element supplies both the headline text and the link,
#so the document is searched for them only once
stories = soup.find_all(class_='title')[:10]

#containers
titles = [story.text.strip() for story in stories]
#assumes every 'title' element wraps an <a> whose href is a site-relative path,
#which is why the site root is prepended -- TODO confirm against the live markup
urls = ['http://www.technewsworld.com' + story.find('a')['href'] for story in stories]
#dates come from the first 10 'date' elements and are paired with the titles by position
dates = [stamp.text.strip() for stamp in soup.find_all(class_='date')[:10]]

#print one numbered line per story: rank  -  date  -  title  -  URL
#a single pre-formatted string inside print(...) gives identical output under both the
#Python 2 print statement and the Python 3 print function
for count, (date, title, url) in enumerate(zip(dates, titles, urls), 1):
    print("%d  -  %s  -  %s  -  %s" % (count, date, title, url))