# NewsScraper.py (forked from holwech/NewsScraper)
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
# Limit on the number of articles to download per news site
LIMIT = 4
data = {}
data['newspapers'] = {}
# Load the JSON file that lists the news sites to scrape
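# Example NewsPapers.json (illustrative only; the site name and URLs are
# placeholders, but the keys match what this script reads below):
# {
#     "examplenews": {
#         "rss": "https://www.example.com/rss",
#         "link": "https://www.example.com/"
#     }
# }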
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)
count = 1
# Iterate through each news company
for company, value in companies.items():
    # If an RSS link is provided in the JSON file, it is used first, because
    # RSS feeds usually give more consistent and correct data.
    # If you do not want to scrape from the RSS feed, omit the 'rss' attribute
    # from the JSON file.
    if 'rss' in value:
        d = fp.parse(value['rss'])
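        # fp.parse returns a FeedParserDict; d.entries is a list with one item
        # per article in the feed.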
print("Downloading articles from ", company)
newsPaper = {
"rss": value['rss'],
"link": value['link'],
"articles": []
}
        for entry in d.entries:
            # Skip entries without a publish date. This keeps the data
            # consistent and keeps the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
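                # entry.published_parsed is a time.struct_time; mktime() turns it
                # into epoch seconds and datetime.fromtimestamp() into a datetime,
                # which is stored as an ISO-8601 string.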
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
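                # newspaper's Article needs an explicit download() (fetch the HTML)
                # followed by parse() (extract title and text).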
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download fails for some reason (e.g. a 404), move on
                    # to the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company, ", url:", entry.link)
                count += 1
    else:
        # Fallback method when no RSS feed link is provided:
        # use the Python newspaper library to extract the articles.
        print("Building site for", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
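        # newspaper.build returns a Source whose .articles list holds lazily
        # created Article objects for the links found on the site;
        # memoize_articles=False disables newspaper's cache so previously seen
        # articles are not skipped on later runs.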
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, skip articles without a publish date.
            # After more than 10 consecutive articles from the same newspaper
            # without a publish date, the company is skipped entirely.
            if content.publish_date is None:
                print(count, "Article has date of type None...")
                noneTypeCount += 1
                if noneTypeCount > 10:
                    print("Too many noneType dates, aborting...")
                    noneTypeCount = 0
                    break
                count += 1
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, "using newspaper, url:", content.url)
            count += 1
            noneTypeCount = 0
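    # Reset the per-company article counter before moving on to the next
    # company, then store this company's results.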
    count = 1
    data['newspapers'][company] = newsPaper
# Finally, save all scraped articles to a JSON file.
try:
    with open('scraped_articles.json', 'w') as outfile:
        json.dump(data, outfile)
except Exception as e:
    print(e)
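# For reference, scraped_articles.json ends up with roughly this shape
# (illustrative placeholders; the "rss" key is only present for companies
# scraped via their RSS feed):
# {
#     "newspapers": {
#         "examplenews": {
#             "rss": "https://www.example.com/rss",
#             "link": "https://www.example.com/",
#             "articles": [
#                 {"link": "...", "published": "...", "title": "...", "text": "..."}
#             ]
#         }
#     }
# }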