From e9bf055b3b28c9cac5dbca3b55bf8a0dede531a6 Mon Sep 17 00:00:00 2001 From: Chris Allegretta Date: Tue, 6 Dec 2022 15:39:21 -0500 Subject: [PATCH 1/4] Add de-duping to feediverse. Command line option "dedupe", by default disabled, will check the tag in question (e.g. "url") and not re-toot if it has already been seen. Adds a "dupecheck" entry to the .feediverse state file to preserve state across multiple runs --- feediverse.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/feediverse.py b/feediverse.py index cee0078..03bb9ad 100755 --- a/feediverse.py +++ b/feediverse.py @@ -24,9 +24,13 @@ def main(): parser.add_argument("-c", "--config", help="config file to use", default=os.path.expanduser(DEFAULT_CONFIG_FILE)) + parser.add_argument("-d", "--dedupe", + help="dedupe against the given field", + default="") args = parser.parse_args() config_file = args.config + dedupe_field = args.dedupe if args.verbose: print("using config file", config_file) @@ -44,6 +48,7 @@ def main(): ) newest_post = config['updated'] + dupes = config['dupecheck'] for feed in config['feeds']: if args.verbose: print(f"fetching {feed['url']} entries since {config['updated']}") @@ -51,13 +56,22 @@ def main(): newest_post = max(newest_post, entry['updated']) if args.verbose: print(entry) + if dedupe_field: + if entry[dedupe_field] in dupes: + if args.verbose: + print("Skipping dupe post: ", entry["title"][:50], + "based on dedupe field (", dedupe_field, ")") + continue + update_dupes(dupes, entry[dedupe_field]) if args.dry_run: print("trial run, not tooting ", entry["title"][:50]) continue + masto.status_post(feed['template'].format(**entry)[:499]) if not args.dry_run: config['updated'] = newest_post.isoformat() + config['dupecheck'] = dupes save_config(config, config_file) def get_feed(feed_url, last_update): @@ -71,6 +85,11 @@ def get_feed(feed_url, last_update): for entry in entries: yield get_entry(entry) +def update_dupes(dupes, new): + if len(dupes) > 
10: + del dupes[0] + dupes.append(new) + def get_entry(entry): hashtags = [] for tag in entry.get('tags', []): @@ -125,7 +144,8 @@ def save_config(config, config_file): def read_config(config_file): config = { - 'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc) + 'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc), + 'dupecheck': [], } with open(config_file) as fh: cfg = yaml.load(fh, yaml.SafeLoader) From ac04a402ea443df5b496838f89c2866e327a3d80 Mon Sep 17 00:00:00 2001 From: Chris Allegretta Date: Tue, 6 Dec 2022 15:46:39 -0500 Subject: [PATCH 2/4] Add details to the README about how de-duplication works. --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 4ff2224..f351f09 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,15 @@ separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{l stripped). Please be aware that this might easily exceed Mastodon's limit of 512 characters. + +## De-duping + +If you are attempting to use the RSS feed of a major news site, you may find +that they change / update (or just re-post) the same items multiple times which +will lead to duplicate toots. To enable de-duplication, use the `{--dedupe}` +option to check for duplicates based on a tag before tooting, e.g. + feediverse --dedupe url + ## Multiple Feeds Since *feeds* is a list you can add additional feeds to watch if you want. From ff1ce223cc6d756456e8ecfff3c729669d9fddd3 Mon Sep 17 00:00:00 2001 From: Chris Allegretta Date: Tue, 6 Dec 2022 15:53:10 -0500 Subject: [PATCH 3/4] Just spacing fix for README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f351f09..c9c1d79 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ If you are attempting to use the RSS feed of a major news site, you may find that they change / update (or just re-post) the same items multiple times which will lead to duplicate toots. 
To enable de-duplication, use the `{--dedupe}` option to check for duplicates based on a tag before tooting, e.g. + feediverse --dedupe url ## Multiple Feeds From 6ee7333ebf1d556733117439d5b2c2f9fb608b4f Mon Sep 17 00:00:00 2001 From: Chris Allegretta Date: Tue, 6 Dec 2022 16:02:15 -0500 Subject: [PATCH 4/4] Add "TAG" metavar for argparse option instead of the default. --- feediverse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/feediverse.py b/feediverse.py index 03bb9ad..7deb28e 100755 --- a/feediverse.py +++ b/feediverse.py @@ -25,8 +25,8 @@ def main(): help="config file to use", default=os.path.expanduser(DEFAULT_CONFIG_FILE)) parser.add_argument("-d", "--dedupe", - help="dedupe against the given field", - default="") + help="dedupe against the given tag", + default="", metavar="TAG") args = parser.parse_args() config_file = args.config