From 114e7dee5620ef0889a52dd5fc7784d587c63a86 Mon Sep 17 00:00:00 2001 From: strueman <36525480+strueman@users.noreply.github.com> Date: Fri, 13 Sep 2024 21:56:45 +1000 Subject: [PATCH 1/2] Update reader.py: strip HTML tags from content The changes made are: 1. Added a new line to strip HTML tags from the `content` field: content = BeautifulSoup(entry.summary, "html.parser").get_text(strip=True) 2. Modified the `extracted_text` assignment to use `strip=True`: extracted_text = soup.get_text(strip=True) 3. Updated the `RedditContent` creation to use these new stripped values. These changes will remove HTML tags and extra whitespace from both the `content` and `extracted_text` fields. The `get_text(strip=True)` method removes all HTML tags and strips leading and trailing whitespace from each piece of text. With these modifications, the output should no longer contain HTML tags in the `content` and `extracted_text` fields. --- reddit_rss_reader/reader.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/reddit_rss_reader/reader.py b/reddit_rss_reader/reader.py index 7117b4f..85a54b1 100644 --- a/reddit_rss_reader/reader.py +++ b/reddit_rss_reader/reader.py @@ -57,13 +57,17 @@ def fetch_content(self, after: Optional[datetime] = None, since_id: Optional[int image_alt_texts = [x['alt'] for x in soup.find_all('img', alt=True)] image_alt_texts = image_alt_texts if image_alt_texts else [] + # Strip HTML tags from content and extracted_text + content = BeautifulSoup(entry.summary, "html.parser").get_text(strip=True) + extracted_text = soup.get_text(strip=True) + contents.append( RedditContent( link=entry.link, id=entry.id, title=entry.title, - content=entry.summary, - extracted_text=soup.get_text(), + content=content, + extracted_text=extracted_text, image_alt_text=". 
".join(image_alt_texts), updated=datetime.fromtimestamp(mktime(entry.updated_parsed)), author_name=entry.author_detail.name, From 864c996155817019725b9eed802d16d1542f9c03 Mon Sep 17 00:00:00 2001 From: strueman <36525480+strueman@users.noreply.github.com> Date: Fri, 13 Sep 2024 23:36:36 +1000 Subject: [PATCH 2/2] Update example.py - Replaced datetime.utcnow() as it is deprecated Replaced datetime.utcnow() with datetime.now(timezone.utc), as datetime.utcnow() is deprecated and scheduled for removal in a future Python version. Changes: added import of timezone OLD: since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(days=-5) NEW: since_time = datetime.now(timezone.utc).astimezone(pytz.utc) + timedelta(days=-5) --- example/example.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/example/example.py b/example/example.py index d519653..bb6f550 100644 --- a/example/example.py +++ b/example/example.py @@ -1,6 +1,6 @@ import pprint -from datetime import datetime, timedelta - +from datetime import datetime, timedelta, timezone +import import pytz as pytz from reddit_rss_reader.reader import RedditRSSReader @@ -12,8 +12,7 @@ ) # To consider comments entered in past 5 days only -since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(days=-5) - +since_time = datetime.now(timezone.utc).astimezone(pytz.utc) + timedelta(days=-5) # fetch_content will fetch all contents if no parameters are passed. # If `after` is passed then it will fetch contents after this date # If `since_id` is passed then it will fetch contents after this id