-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreddit_collector.py
29 lines (27 loc) · 1.28 KB
/
reddit_collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from datetime import datetime, timedelta, timezone

import pandas as pd
class RedditDataCollector:
    """Collect recently created posts from several subreddits into a DataFrame.

    The collector only relies on the following shape of the ``reddit`` client:
    ``await reddit.subreddit(name)`` returns an object whose ``new(limit=...)``
    method is an async iterable of posts exposing ``title``, ``selftext``,
    ``url``, ``score``, ``num_comments``, ``created_utc`` and ``author``.
    NOTE(review): this looks like an ``asyncpraw.Reddit`` instance -- confirm
    against the caller.
    """

    # Column order of the DataFrame returned by ``collect_posts``. Passing it
    # explicitly keeps the schema stable even when no post matches.
    COLUMNS = (
        'subreddit',
        'title',
        'body',
        'url',
        'score',
        'num_comments',
        'created_utc',
        'author',
    )

    def __init__(self, reddit, subreddits, time_filter='hour', limit=100):
        """Store the client and the collection settings.

        Args:
            reddit: Async Reddit client (see the class docstring).
            subreddits: Iterable of subreddit names to scan.
            time_filter: Stored on the instance; not used by ``collect_posts``.
            limit: Number of newest posts fetched per subreddit when
                ``collect_posts`` is called without an explicit count.
        """
        self.reddit = reddit
        self.subreddits = subreddits
        self.time_filter = time_filter
        self.limit = limit

    async def collect_posts(self, top_n_per_subreddit=None, *,
                            max_age=timedelta(minutes=30)):
        """Fetch the newest posts of every subreddit and keep the recent ones.

        Args:
            top_n_per_subreddit: How many of the newest posts to request from
                each subreddit. ``None`` falls back to ``self.limit``.
            max_age: Maximum age of a post (as a ``timedelta``) for it to be
                kept. Defaults to 30 minutes.

        Returns:
            A ``pandas.DataFrame`` with the columns listed in ``COLUMNS``, one
            row per kept post. The frame is empty (but still has all columns)
            when nothing matches.
        """
        fetch_limit = self.limit if top_n_per_subreddit is None else top_n_per_subreddit

        # ``created_utc`` is compared as epoch seconds against a UTC-aware
        # cutoff, so the result does not depend on the local timezone or on
        # DST transitions.
        cutoff = (datetime.now(timezone.utc) - max_age).timestamp()

        data = []
        for subreddit_name in self.subreddits:
            subreddit = await self.reddit.subreddit(subreddit_name)
            async for post in subreddit.new(limit=fetch_limit):
                if post.created_utc < cutoff:
                    continue  # older than the requested window

                # ``author`` is None when the account was deleted; record a
                # missing value instead of crashing the whole collection.
                author = post.author
                data.append({
                    'subreddit': subreddit_name,
                    'title': post.title,
                    'body': (post.selftext or '')[:500],
                    'url': post.url,
                    'score': post.score,
                    'num_comments': post.num_comments,
                    'created_utc': post.created_utc,
                    'author': author.name if author is not None else None,
                })
                print(f"Collected post from r/{subreddit_name}: {post.title}")

        return pd.DataFrame(data, columns=list(self.COLUMNS))