resolved conflicts with latest code

edsu · May 22, 2024 · dccc90a · dccc90a
2 parents 84f1365 + 6ee7333
commit dccc90a
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -40,6 +40,16 @@ separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{l
 stripped). Please be aware that this might easily exceed Mastodon's
 limit of 512 characters.
 
+
+## De-duping
+
+If you are attempting to use the RSS feed of a major news site, you may find
+that they change / update (or just re-post) the same items multiple times which
+will lead to duplicate toots. To enable de-duplication, use the `{--dedupe}`
+option to check for duplicates based on a tag before tooting, e.g.
+
+    feediverse --dedupe url
+
 ## Multiple Feeds
 
 Since *feeds* is a list you can add additional feeds to watch if you want.

diff --git a/feediverse.py b/feediverse.py
@@ -28,9 +28,13 @@ def main():
                         default=os.path.expanduser(DEFAULT_CONFIG_FILE))
     parser.add_argument("-d", "--delay", action="store_true",
                         help="delay randomly from 10 to 30 seconds between each post")
+    parser.add_argument("-p", "--dedupe",
+                        help="dedupe against the given tag",
+                        default="", metavar="TAG")
 
     args = parser.parse_args()
     config_file = args.config
+    dedupe_field = args.dedupe
 
     if args.verbose:
         print("using config file", config_file)
@@ -48,18 +52,28 @@ def main():
     )
 
     newest_post = config['updated']
+    dupes = config['dupecheck']
     for feed in config['feeds']:
         if args.verbose:
             print(f"fetching {feed['url']} entries since {config['updated']}")
         for entry in get_feed(feed['url'], config['updated']):
             newest_post = max(newest_post, entry['updated'])
-            if args.verbose:
-                print(entry)
+            entry_text = feed['template'].format(**entry)[:499]
 
             if args.dry_run:
-                print("trial run, not tooting ", entry["title"][:50])
+                print(entry_text)
                 continue
-
+
+            if args.verbose:
+                print(entry_text)
+
+            if dedupe_field:
+                if entry[dedupe_field] in dupes:
+                    if args.verbose:
+                        print(f"Skipping dupe post: {entry_text} based on dedupe field {dedupe_field}")
+                    continue
+                update_dupes(dupes, entry[dedupe_field])
+
             image_medias = []
             if feed['include_images'] and entry['images']:
                 for image in entry['images'][:4]:
@@ -69,7 +83,7 @@ def main():
 
             if not args.dry_run:
                 masto.status_post(
-                    feed['template'].format(**entry)[:499],
+                    entry_text,
                     media_ids=image_medias
                 )
 
@@ -80,6 +94,7 @@ def main():
 
     if not args.dry_run:
         config['updated'] = newest_post.isoformat()
+        config['dupecheck'] = dupes
         save_config(config, config_file)
 
 def get_feed(feed_url, last_update):
@@ -98,21 +113,27 @@ def get_feed(feed_url, last_update):
     for entry in entries:
         yield get_entry(entry)
 
+def update_dupes(dupes, new):
+   if len(dupes) > 10:
+     del dupes[0]
+   dupes.append(new)
+
 def get_entry(entry):
     hashtags = []
     for tag in entry.get('tags', []):
         t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
         hashtags.append('#{}'.format(t))
     summary = entry.get('summary', '')
-    content = entry.get('content', '') or ''
+    content = entry.get('content', '')
+    comments = entry.get('comments', '')
     if content:
         content = cleanup(content[0].get('value', ''))
     url = entry.id
     return {
         'url': url,
         'link': entry.link,
         'links': entry.links,
-        'comments': entry.comments,
+        'comments': comments,
         'title': cleanup(entry.title),
         'summary': cleanup(summary),
         'content': content,
@@ -167,7 +188,8 @@ def save_config(config, config_file):
 
 def read_config(config_file):
     config = {
-        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc)
+        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc),
+        'dupecheck': [],
     }
     with open(config_file) as fh:
         cfg = yaml.load(fh, yaml.SafeLoader)