From 76f7c22d73eb6ed615343cbfdd09f0e6923a1314 Mon Sep 17 00:00:00 2001 From: nanos Date: Mon, 20 Mar 2023 09:12:16 +0000 Subject: [PATCH] backfill follow requests' profiles (addresses #7) --- .github/workflows/get_context.yml | 2 +- README.md | 38 +++++++++++----- find_posts.py | 73 ++++++++++++++++++++++++------- 3 files changed, 87 insertions(+), 26 deletions(-) diff --git a/.github/workflows/get_context.yml b/.github/workflows/get_context.yml index 154e0f26..b0206fb4 100644 --- a/.github/workflows/get_context.yml +++ b/.github/workflows/get_context.yml @@ -32,7 +32,7 @@ jobs: path: artifacts - name: Get Directory structure run: ls -lR - - run: python find_posts.py --access-token=${{ secrets.ACCESS_TOKEN }} --server=${{ vars.MASTODON_SERVER }} --reply-interval-in-hours=${{ vars.REPLY_INTERVAL_IN_HOURS || 0 }} --home-timeline-length=${{ vars.HOME_TIMELINE_LENGTH || 0 }} --max-followings=${{ vars.MAX_FOLLOWINGS || 0 }} --user=${{ vars.USER }} --max-followers=${{ vars.MAX_FOLLOWERS || 0 }} --http-timeout=${{ vars.HTTP_TIMEOUT || 5 }} + - run: python find_posts.py --access-token=${{ secrets.ACCESS_TOKEN }} --server=${{ vars.MASTODON_SERVER }} --reply-interval-in-hours=${{ vars.REPLY_INTERVAL_IN_HOURS || 0 }} --home-timeline-length=${{ vars.HOME_TIMELINE_LENGTH || 0 }} --max-followings=${{ vars.MAX_FOLLOWINGS || 0 }} --user=${{ vars.USER }} --max-followers=${{ vars.MAX_FOLLOWERS || 0 }} --http-timeout=${{ vars.HTTP_TIMEOUT || 5 }} --max-follow-requests=${{ vars.MAX_FOLLOW_REQUESTS || 0 }} - name: Upload artifacts uses: actions/upload-artifact@v3 with: diff --git a/README.md b/README.md index 8d95c9bd..5c318c4d 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,12 @@ This GitHub repository provides a GitHub action that runs every 10 mins, doing t 1. It can [pull remote replies into your instance](https://blog.thms.uk/2023/03/pull-missing-responses-into-mastodon?utm_source=github), using the Mastodon API. That part itself has two parts: 1. 
It gets remote replies to posts that users on your instance have already replied to during the last `REPLY_INTERVAL_IN_HOURS` hours, and adds them to your own server. 2. It gets remote replies to the last `HOME_TIMELINE_LENGTH` posts from your home timeline, and adds them to your own server. -2. It can also [backfill posts](https://blog.thms.uk/2023/03/backfill-recently-followed-accounts?utm_source=github) from the last `MAX_FOLLOWINGS` users that you have followed. -3. In the same way, it can also backfill posts form the last `MAX_FOLLOWERS` users that have followed you. +2. It can also [backfill posts](https://blog.thms.uk/2023/03/backfill-recently-followed-accounts?utm_source=github): + 1. from the last `MAX_FOLLOWINGS` users that you have followed. + 2. from the last `MAX_FOLLOWERS` users that have followed you. + 3. from the last `MAX_FOLLOW_REQUESTS` users that have sent you a follow request. -Each part can be disabled completely, and all of the values are configurable. +Each part can be disabled completely, and all of the parameters are configurable. **Be aware, that this script may run for a long time, if these values are too high.** Experiment a bit with what works for you, by starting with fairly small numbers (maybe `HOME_TIMELINE_LENGTH = 200`, `REPLY_INTERVAL_IN_HOURS = 12`) and increase the numbers as you see fit. @@ -19,15 +21,20 @@ For full context and discussion on why this is needed, read the following two bl ## Setup +You can run this script either as a GitHub Action, as a scheduled cron job on your local machine, or from a pre-packed container. ### 1) Get the required access token: +Regardless of how you want to run this script, you must first get an access token: + 1. In Mastodon go to Preferences > Development > New Application 1. give it a nice name - 2. enable `read:search`, `read:statuses` and `admin:read:accounts ` + 2. enable `read:search`, `read:statuses`, `read:follows`, and `admin:read:accounts` 3. Save 4. 
Copy the value of `Your access token` -### 2) Configure and run the GitHub action +### 2.1) Configure and run the GitHub Action + +To run this script as a GitHub Action: 1. Fork this repository 2. Add your access token: @@ -41,7 +48,7 @@ For full context and discussion on why this is needed, read the following two bl 4. Add environment variables to configure your action as described below. 4. Finally go to the Actions tab and enable the action. The action should now automatically run approximately once every 10 min. -### 3) Run this script locally as a cron job +### 2.2) Run this script locally as a cron job If you want to, you can of course also run this script locally as a cron job: @@ -52,7 +59,7 @@ When setting up your cronjob, do make sure you are setting the interval long eno If you are running this script locally, my recommendation is to run it manually once, before turning on the cron job: The first run will be significantly slower than subsequent runs, and that will help you prevent overlapping during that first run. -### 4) Run this script from a container +### 2.3) Run this script from a container This script is also available in a pre-packaged container, [mastodon_get_replies](https://github.com/nanos/mastodon_get_replies/pkgs/container/mastodon_get_replies). @@ -63,7 +70,7 @@ The same rules for running this as a cron job apply to running the container, do An example Kubernetes CronJob for running the container is included in the [`examples`](https://github.com/nanos/mastodon_get_replies/tree/main/examples) folder. -### 5) Configuration options +### Configuration options Please see below for a list of configuration options. @@ -72,12 +79,23 @@ Please see below for a list of configuration options. | -- | `--access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN` | |`MASTODON_SERVER`|`--server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. 
| | `HOME_TIMELINE_LENGTH` | `--home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. (An integer number, e.g. `200`) -| `REPLY_INTERVAL_IN_HOURS` | `--reply-interval-in-hours` | No | Provide to fetch remote replies to posts that have received replies from users on your own instance. Determines how far back in time we'll go to find posts that have received replies. (An integer number, e.g. `24`) +| `REPLY_INTERVAL_IN_HOURS` | `--reply-interval-in-hours` | No | Provide to fetch remote replies to posts that have received replies from users on your own instance. Determines how far back in time we'll go to find posts that have received replies. (An integer number, e.g. `24`.) Requires an access token with `admin:read:accounts` +| `USER` | `--user` | See Notes | Required together with `MAX_FOLLOWERS` or `MAX_FOLLOWINGS`: The username of the user whose followers or followings you want to backfill (e.g. `michael` for the user `@michael@thms.uk`). | `MAX_FOLLOWINGS` | `--max-followings` | No | Provide to backfill profiles for your most recent followings. Determines how many of your last followings you want to backfill. (An integer number, e.g. `80`. Ensure you also provide `USER`). | `MAX_FOLLOWERS` | `--max-followers` | No | Provide to backfill profiles for your most recent followers. Determines how many of your last followers you want to backfill. (An integer number, e.g. `80`. Ensure you also provide `USER`). -| `USER` | `--user` | See Notes | Required together with `MAX_FOLLOWERS` or `MAX_FOLLOWINGS`: The username of the user whose followers or followings you want to backfill (e.g. `michael` for the user `@michael@thms.uk`). +| `MAX_FOLLOW_REQUESTS` | `--max-follow-requests` | No | Provide to backfill profiles for the API key owner's most recent pending follow requests. Determines how many of your last follow requests you want to backfill. (An integer number, e.g. 
`80`.) Requires an access token with `read:follows` scope. | `HTTP_TIMEOUT` | `--http-timeout` | No | The timeout for any HTTP requests to the Mastodon API in seconds. Defaults to `5`. +#### Required Access Token Scopes + + - For all actions, your access token must include these scopes: + - `read:search` + - `read:statuses` + - If you are supplying `REPLY_INTERVAL_IN_HOURS` / `--reply-interval-in-hours` you must additionally enable this scope: + - `admin:read:accounts` + - If you are supplying `MAX_FOLLOW_REQUESTS` / `--max-follow-requests` you must additionally enable this scope: + - `read:follows` + ## Acknowledgments This script is mostly taken from [Abhinav Sarkar](https://notes.abhinavsarkar.net/2023/mastodon-context), with just some additions and alterations. Thank you Abhinav! diff --git a/find_posts.py b/find_posts.py index bcda62d2..0fdd1717 100644 --- a/find_posts.py +++ b/find_posts.py @@ -20,6 +20,7 @@ argparser.add_argument('--user', required = False, default='', help="Use together with --max-followings or --max-followers to tell us which user's followings/followers we should backfill") argparser.add_argument('--max-followings', required = False, type=int, default=0, help="Backfill posts for new accounts followed by --user. We'll backfill at most this many followings' posts") argparser.add_argument('--max-followers', required = False, type=int, default=0, help="Backfill posts for new accounts following --user. We'll backfill at most this many followers' posts") +argparser.add_argument('--max-follow-requests', required = False, type=int, default=5, help="Backfill posts of the API key owners pending follow requests. 
We'll backfill at most this many requester's posts") argparser.add_argument('--http-timeout', required = False, type=int, default=5, help="The timeout for any HTTP requests to your own, or other instances.") def pull_context( @@ -32,7 +33,8 @@ def pull_context( max_followings, backfill_followings_for_user, known_followings, - max_followers + max_followers, + max_follow_requests ): parsed_urls = {} @@ -71,6 +73,12 @@ def pull_context( followers = get_new_followers(server, user_id, max_followers, known_followings) add_following_posts(server, access_token, followers, known_followings, seen_urls) + if max_follow_requests > 0: + log(f"Getting posts from {backfill_followings_for_user}'s last {max_follow_requests} follow requests") + user_id = get_user_id(server, backfill_followings_for_user) + follow_requests = get_new_follow_requests(server, access_token, max_follow_requests, known_followings) + add_following_posts(server, access_token, follow_requests, known_followings, seen_urls) + def add_following_posts(server, access_token, followings, know_followings, seen_urls): for user in followings: posts = get_user_posts(user, know_followings, server) @@ -126,16 +134,27 @@ def get_user_posts(user, know_followings, server): except Exception as ex: log(f"Error getting posts for user {user['acct']}: {ex}") return None + +def get_new_follow_requests(server, access_token, max, known_followings): + """Get any new follow requests for the specified user, up to the max number provided""" -def get_new_followers(server, user_id, max, known_followers): - """Get any new followings for the specified user, up to the max number provided""" - response = get(f"https://{server}/api/v1/accounts/{user_id}/followers?limit={max}") + follow_requests = get_paginated_mastodon(f"https://{server}/api/v1/follow_requests", max, { + "Authorization": f"Bearer {access_token}", + }) - followers = response.json() + # Remove any we already know about + new_follow_requests = list(filter( + lambda user: 
user['acct'] not in known_followings, + follow_requests + )) + + log(f"Got {len(follow_requests)} follow_requests, {len(new_follow_requests)} of which are new") + + return new_follow_requests - while len(followers) < max and 'next' in response.links: - response = get(response.links['next']['url']) - followers = followers + response.json() +def get_new_followers(server, user_id, max, known_followers): + """Get any new followings for the specified user, up to the max number provided""" + followers = get_paginated_mastodon(f"https://{server}/api/v1/accounts/{user_id}/followers", max) # Remove any we already know about new_followers = list(filter( @@ -149,13 +168,7 @@ def get_new_followers(server, user_id, max, known_followers): def get_new_followings(server, user_id, max, known_followings): """Get any new followings for the specified user, up to the max number provided""" - - response = get(f"https://{server}/api/v1/accounts/{user_id}/following?limit={max}") - following = response.json() - - while len(following) < max and 'next' in response.links: - response = get(response.links['next']['url']) - following = following + response.json() + following = get_paginated_mastodon(f"https://{server}/api/v1/accounts/{user_id}/following", max) # Remove any we already know about new_followings = list(filter( @@ -588,6 +601,35 @@ def add_context_url(url, server, access_token): ) return False +def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5): + """Make a paginated request to mastodon""" + response = get(f"{url}?limit={max}", headers, timeout, max_tries) + + if response.status_code != 200: + if response.status_code == 401: + raise Exception( + f"Error getting URL {url}. Status code: {response.status_code}. " + "Ensure your access token is correct" + ) + elif response.status_code == 403: + raise Exception( + f"Error getting URL {url}. Status code: {response.status_code}. " + "Make sure you have the correct scopes enabled for your access token." 
+ ) + else: + raise Exception( + f"Error getting URL {url}. Status code: {response.status_code}" + ) + + result = response.json() + + while len(result) < max and 'next' in response.links: + response = get(response.links['next']['url'], headers, timeout, max_tries) + result = result + response.json() + + return result + + def get(url, headers = {}, timeout = 0, max_tries = 5): """A simple wrapper to make a get request while providing our user agent, and respecting rate limits""" h = headers.copy() @@ -674,6 +716,7 @@ def __len__(self): arguments.user, KNOWN_FOLLOWINGS, arguments.max_followers, + arguments.max_follow_requests ) with open(KNOWN_FOLLOWINGS_FILE, "w", encoding="utf-8") as f: