# newsScraper.py
import requests
from bs4 import BeautifulSoup
import datetime
import re
from openai import OpenAI
from dotenv import load_dotenv
import os

OUTPUT_DIRECTORY = './output/'  # Default output directory for files


def is_today(date_input, current_date):
    """Return True if date_input falls on current_date.

    Accepts either a datetime or a string containing a YYYY/M/D segment
    (e.g. a dated article URL).
    """
    if isinstance(date_input, datetime.datetime):
        return date_input.date() == current_date
    elif isinstance(date_input, str):
        try:
            match = re.search(r'(\d{4}/\d{1,2}/\d{1,2})', date_input)
            if match:
                date_part = match.group(1)
                parsed_date = datetime.datetime.strptime(
                    date_part, "%Y/%m/%d").date()
                return parsed_date == current_date
        except ValueError as e:
            print(f"Error parsing date: {e}")
    return False
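
# Example (hypothetical URL): a dated article path matches the target day.
#   is_today('https://techcrunch.com/2024/4/17/some-story/',
#            datetime.date(2024, 4, 17))  # -> True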


def scrape_verge(current_date):
    """Scrape The Verge's tech page for articles whose URLs carry today's date."""
    url = 'https://www.theverge.com/tech'
    base_url = 'https://www.theverge.com'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    items = soup.find_all('a', {'class': 'group hover:text-white'})
    articles = [[item.get('aria-label'), base_url + item['href']]
                for item in items if is_today(item['href'], current_date)]
    return articles
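
# Note: these sites may reject requests that lack a browser-like User-Agent,
# and the scrapers here do not set one. A hedged workaround (the exact UA
# string below is an arbitrary example):
#   headers = {'User-Agent': 'Mozilla/5.0'}
#   response = requests.get(url, headers=headers)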


def scrape_cnbctech(current_date):
    """Scrape CNBC's technology page for article cards published today."""
    url = 'https://www.cnbc.com/technology/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    article_cards = soup.find_all('div', class_='Card-standardBreakerCard')
    articles = []
    for card in article_cards:
        title_tag = card.find('a', class_='Card-title')
        time_tag = card.find('span', class_='Card-time')
        if title_tag and time_tag:
            title = title_tag.text.strip()
            link = title_tag['href']
            publication_time = time_tag.text.strip()
            # Strip the ordinal suffix: "Sat, Apr 13th 2024" -> "Sat, Apr 13 2024"
            date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', publication_time)
            try:
                date_object = datetime.datetime.strptime(
                    date_str, '%a, %b %d %Y')
            except ValueError as e:
                print(f"Error parsing date: {e}, treating as today")
                date_object = datetime.datetime.today()
            if is_today(date_object, current_date):
                articles.append([title, link])
    return articles


def scrape_techcrunch(current_date):
    """Scrape TechCrunch's front page for dated article links."""
    url = 'https://techcrunch.com/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    data_links = soup.find_all('a', attrs={'data-destinationlink': True})
    # Keep links whose URL carries today's date and whose text is non-empty.
    articles = [[link.text.strip(), link['href']]
                for link in data_links
                if is_today(link['href'], current_date)
                and len(link.text.strip()) > 0]
    return articles
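

# The scrapers above let network errors propagate. A minimal hedged wrapper
# (this helper is our addition, not part of the original script):
def safe_scrape(scraper, current_date):
    """Run a scraper function, returning [] instead of raising on network failure."""
    try:
        return scraper(current_date)
    except requests.RequestException as e:
        print(f"{scraper.__name__} failed: {e}")
        return []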


def classify_titles(titles):
    """Group news titles by event via the chat model.

    Uses the module-level `client` and `TEXT_MODEL` set in __main__.
    """
    prompt_text = "You are classifying news titles together into specific events. Be as accurate and specific as possible. Group the following news titles by the particular event they discuss:\n\n" + \
        "\n".join(f"{i+1}. {title}" for i, title in enumerate(titles))
    try:
        response = client.chat.completions.create(
            model=TEXT_MODEL,
            messages=[
                {"role": "system", "content": "Output the response as a dictionary of specific event names and the titles that belong to them."},
                {"role": "user", "content": prompt_text}
            ]
        )
        output = response.choices[0].message.content
        return output
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
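
# The model is asked for a dictionary-shaped answer, but the format is not
# enforced; a representative (hypothetical) response string:
#   {"OpenAI model launch": ["title 1", "title 4"],
#    "Apple antitrust ruling": ["title 2"]}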


def select_events(titles):
    """Ask the model to pick the five most important events from the titles."""
    prompt_text = "Suppose you are the chief editor at CNBC-TechCheck-Briefing. You need to select the 5 most important news events to put into today's briefing (you might see a hint in how many times an event is reported, but also consider what the audience of CNBC-TechCheck-Briefing is interested in). Return the titles of these unique events in order of importance. Here are today's news titles:\n\n" + \
        "\n".join(f"{i+1}. {title}" for i, title in enumerate(titles))
    try:
        response = client.chat.completions.create(
            model=TEXT_MODEL,
            messages=[
                {"role": "system", "content": "Output the response as string titles separated by newlines."},
                {"role": "user", "content": prompt_text}
            ]
        )
        output = response.choices[0].message.content
        return output
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
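
# For more repeatable selections, one could pass temperature=0 to
# client.chat.completions.create(...); the calls here use the default.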


# The functions below support the group-by-source mode.
def scrape_and_group_by_source(current_date):
    """Scrape each outlet and return a {source_name: articles} mapping."""
    sources = {
        'TechCrunch': scrape_techcrunch(current_date),
        'The Verge': scrape_verge(current_date),
        'CNBC Tech': scrape_cnbctech(current_date),
    }
    return sources
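
# Returned shape (values are [title, url] pairs; contents hypothetical):
#   {'TechCrunch': [['Some headline', 'https://techcrunch.com/...']], ...}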


def format_grouped_titles_by_source(grouped_sources):
    """Render each source as a block: the source name, then its titles one per line."""
    formatted_text = ""
    for source, articles in grouped_sources.items():
        formatted_text += f"{source}\n" + \
            "\n".join(title for title, _ in articles) + "\n\n"
    return formatted_text.strip()
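
# Example output (hypothetical titles):
#   TechCrunch
#   Headline A
#   Headline B
#
#   The Verge
#   Headline C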


def select_events_by_source(titles):
    """Ask the model to pick the top 5 events from source-grouped titles."""
    prompt_text = "You are creating a newsletter for an audience. From the list of sources and their news, select the top 5 news events that you would include in the newsletter:\n\n" + \
        "\nGrouped By Source:\n" + titles
    try:
        response = client.chat.completions.create(
            model=TEXT_MODEL,
            messages=[
                {"role": "system", "content": "Output the response as the most important string titles separated by newlines."},
                {"role": "user", "content": prompt_text}
            ]
        )
        output = response.choices[0].message.content
        return output
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


if __name__ == '__main__':
    load_dotenv()
    TEXT_MODEL = "gpt-4o"
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
    group_by_source = True  # Set to False to group titles by event instead
    # today = datetime.date.today()
    today = datetime.date(2024, 4, 17)  # pinned for testing; use the line above for live runs
    # Titles are collected up front for the output file; note that
    # group-by-source mode re-scrapes the same pages below.
    all_news = scrape_verge(
        today) + scrape_cnbctech(today) + scrape_techcrunch(today)
    titles = [str(news[0]) for news in all_news]
    news_to_URL = {news[0]: news[1] for news in all_news}
    output_data = None
    if group_by_source:
        grouped_sources = scrape_and_group_by_source(today)
        formatted_text = format_grouped_titles_by_source(grouped_sources)
        news_titles = select_events_by_source(formatted_text)
        print(news_titles)
        output_data = f"Titles:\n{chr(10).join(titles)}\n\nselected_news:\n{news_titles}\n"
        output_file_path = f"{OUTPUT_DIRECTORY}grouped_by_source{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.txt"
    else:
        grouped_titles = classify_titles(titles)
        if grouped_titles:
            # select_events numbers its input, so pass one title per line.
            selected_events = select_events(grouped_titles.split("\n"))
            output_data = f"Titles:\n{chr(10).join(titles)}\n\ngrouped_titles:\n{grouped_titles}\n\nselected_events:\n{selected_events}\n"
            output_file_path = f"{OUTPUT_DIRECTORY}grouped_by_event{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}.txt"
        else:
            print("Failed to group the titles.")
    # Write output data to file (skipped if classification failed).
    if output_data:
        os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)
        with open(output_file_path, 'w') as file:
            file.write(output_data)
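
# Example run (assuming OPENAI_API_KEY is set in .env): `python newsScraper.py`
# writes a timestamped file such as ./output/grouped_by_source20240417093000.txt.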