From 114e7dee5620ef0889a52dd5fc7784d587c63a86 Mon Sep 17 00:00:00 2001
From: strueman <36525480+strueman@users.noreply.github.com>
Date: Fri, 13 Sep 2024 21:56:45 +1000
Subject: [PATCH 1/2] Update reader.py strip html tags from content

The changes made are:

1. Added a new line to strip HTML tags from the `content` field:

   content = BeautifulSoup(entry.summary, "html.parser").get_text(strip=True)

2. Modified the `extracted_text` assignment to use `strip=True`:

   extracted_text = soup.get_text(strip=True)

3. Updated the `RedditContent` creation to use these new stripped values.
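These changes remove HTML tags and surrounding whitespace from both the `content` and `extracted_text` fields. `get_text(strip=True)` drops all markup and strips leading and trailing whitespace from each text fragment before joining the fragments (with no separator by default, so text from adjacent elements is concatenated directly).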
With these modifications, the output should no longer contain HTML tags in the `content` and `extracted_text` fields.
---
 reddit_rss_reader/reader.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/reddit_rss_reader/reader.py b/reddit_rss_reader/reader.py
index 7117b4f..85a54b1 100644
--- a/reddit_rss_reader/reader.py
+++ b/reddit_rss_reader/reader.py
@@ -57,13 +57,17 @@ def fetch_content(self, after: Optional[datetime] = None, since_id: Optional[int
                 image_alt_texts = [x['alt'] for x in soup.find_all('img', alt=True)]
                 image_alt_texts = image_alt_texts if image_alt_texts else []
 
+                # Strip HTML tags from content and extracted_text
+                content = BeautifulSoup(entry.summary, "html.parser").get_text(strip=True)
+                extracted_text = soup.get_text(strip=True)
+
                 contents.append(
                     RedditContent(
                         link=entry.link,
                         id=entry.id,
                         title=entry.title,
-                        content=entry.summary,
-                        extracted_text=soup.get_text(),
+                        content=content,
+                        extracted_text=extracted_text,
                         image_alt_text=". ".join(image_alt_texts),
                         updated=datetime.fromtimestamp(mktime(entry.updated_parsed)),
                         author_name=entry.author_detail.name,

From 864c996155817019725b9eed802d16d1542f9c03 Mon Sep 17 00:00:00 2001
From: strueman <36525480+strueman@users.noreply.github.com>
Date: Fri, 13 Sep 2024 23:36:36 +1000
Subject: [PATCH 2/2] Update example.py - Replaced datetime.utcnow() as it's
 deprecated

Replaced datetime.utcnow() with datetime.now(timezone.utc), as datetime.utcnow() is deprecated and scheduled for removal from the datetime module.
Changes:
Added import of timezone from datetime.
OLD: since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(days=-5)

NEW: since_time = datetime.now(timezone.utc).astimezone(pytz.utc) + timedelta(days=-5)
---
 example/example.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/example/example.py b/example/example.py
index d519653..bb6f550 100644
--- a/example/example.py
+++ b/example/example.py
@@ -1,6 +1,6 @@
 import pprint
-from datetime import datetime, timedelta
-
+from datetime import datetime, timedelta, timezone
+import 
 import pytz as pytz
 
 from reddit_rss_reader.reader import RedditRSSReader
@@ -12,8 +12,7 @@
 )
 
 # To consider comments entered in past 5 days only
-since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(days=-5)
-
+since_time = datetime.now(timezone.utc).astimezone(pytz.utc) + timedelta(days=-5)
 # fetch_content will fetch all contents if no parameters are passed.
 # If `after` is passed then it will fetch contents after this date
 # If `since_id` is passed then it will fetch contents after this id