diff --git a/example/example.py b/example/example.py
index d519653..bb6f550 100644
--- a/example/example.py
+++ b/example/example.py
@@ -1,6 +1,6 @@
 import pprint
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 
 import pytz as pytz
 
 from reddit_rss_reader.reader import RedditRSSReader
@@ -12,8 +12,7 @@
 )
 
 # To consider comments entered in past 5 days only
-since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(days=-5)
-
+since_time = datetime.now(timezone.utc).astimezone(pytz.utc) + timedelta(days=-5)
 # fetch_content will fetch all contents if no parameters are passed.
 # If `after` is passed then it will fetch contents after this date
 # If `since_id` is passed then it will fetch contents after this id
diff --git a/reddit_rss_reader/reader.py b/reddit_rss_reader/reader.py
index 7117b4f..85a54b1 100644
--- a/reddit_rss_reader/reader.py
+++ b/reddit_rss_reader/reader.py
@@ -57,13 +57,17 @@ def fetch_content(self, after: Optional[datetime] = None, since_id: Optional[int
                 image_alt_texts = [x['alt'] for x in soup.find_all('img', alt=True)]
                 image_alt_texts = image_alt_texts if image_alt_texts else []
 
+                # Strip HTML tags from content and extracted_text
+                content = BeautifulSoup(entry.summary, "html.parser").get_text(strip=True)
+                extracted_text = soup.get_text(strip=True)
+
                 contents.append(
                     RedditContent(
                         link=entry.link,
                         id=entry.id,
                         title=entry.title,
-                        content=entry.summary,
-                        extracted_text=soup.get_text(),
+                        content=content,
+                        extracted_text=extracted_text,
                         image_alt_text=". ".join(image_alt_texts),
                         updated=datetime.fromtimestamp(mktime(entry.updated_parsed)),
                         author_name=entry.author_detail.name,