From 8d2756724600959a6bd003204dc9dffc544b3dbd Mon Sep 17 00:00:00 2001
From: Jack Green
Date: Mon, 9 Dec 2024 16:24:09 +0000
Subject: [PATCH] Introduce GitHub Action to test resolvability of external links [DOC-253] (#1382)

This PR adds an action that will, when a PR is opened, check the resolvability of any external links.

External links are defined as those using the `http` scheme - internal links _between_ documentation are already covered by our existing dead links check.

Action is deliberately modular to be easily applied to other docs repos.

This runs whenever a PR is raised, which is sub-optimal because:
- links can break at any time as they reference external content
- there are no guarantees the breakage is anything to do with the new PR

But the other alternative - a scheduled check - would have difficulty in knowing _who_ to target a failure communication at to resolve the issues (other projects notify Slack channels, with varying degrees of response).

An example of the output produced by this action can be found [here](https://github.com/hazelcast/hz-docs/actions/runs/11901935786).

Note that the external link check *fails* because of dead links, and won't pass until the following are addressed:
- https://github.com/hazelcast/hz-docs/pull/1355
- https://github.com/hazelcast/hz-docs/pull/1389
- `URL 'https://raw.github.com/olivernn/lunr.js/master/lunr.min.js' had status 404 (found in node_modules/lunr/index.html)`

We should consider whether adding a check that we know will fail is a good idea - comparing the annoyance of a failing (non-blocking) test against the coverage that at least we can ensure things don't get _worse_.

Fixes: [DOC-253](https://hazelcast.atlassian.net/browse/DOC-253)

[DOC-253]: https://hazelcast.atlassian.net/browse/DOC-253?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
---
 .../actions/test-external-links/action.yml    | 105 ++++++++++++++++++
 .github/workflows/test-external-links.yml     |  16 +++
 2 files changed, 121 insertions(+)
 create mode 100644 .github/actions/test-external-links/action.yml
 create mode 100644 .github/workflows/test-external-links.yml

diff --git a/.github/actions/test-external-links/action.yml b/.github/actions/test-external-links/action.yml
new file mode 100644
index 000000000..e2d3de3eb
--- /dev/null
+++ b/.github/actions/test-external-links/action.yml
@@ -0,0 +1,105 @@
+name: Test external links
+description: Builds the documentation and reports external links that return HTTP 404
+
+inputs:
+  SLACK_WEBHOOK:
+    description: Slack incoming webhook URL that is notified when the check fails
+    required: true
+
+env:
+  # Not possible to set this as a default
+  # https://github.com/orgs/community/discussions/46670
+  shell: bash
+
+runs:
+  using: composite
+
+  steps:
+    - uses: actions/setup-node@v4
+      with:
+        node-version: 20
+        cache: 'npm'
+
+    - name: Install Lynx
+      shell: ${{ env.shell }}
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y lynx
+
+    - name: Build documentation
+      shell: ${{ env.shell }}
+      run: |
+        npm install
+        npm run-script build-local
+
+    - name: Create temporary file
+      shell: ${{ env.shell }}
+      run: |
+        # Shared with the following steps through the job environment
+        echo "temp_file=$(mktemp)" >> "${GITHUB_ENV}"
+
+    - name: Extract links
+      shell: ${{ env.shell }}
+      run: |
+        ${RUNNER_DEBUG:+set -o xtrace}
+
+        # Collect every external URL from the HTML files under `test`
+        # Duplicates are removed before checking, which is faster than
+        # potentially checking the same link on multiple pages
+        find test -name "*.html" | while IFS= read -r file; do
+          # grep exits with 1 when a page has no external links, which is not an error
+          lynx -dump -listonly -nonumbers "${file}" | { grep --extended-regexp "^http" || test $? = 1; } >> "${temp_file}"
+        done
+
+    - name: Check links
+      shell: ${{ env.shell }}
+      run: |
+        ${RUNNER_DEBUG:+set -o xtrace}
+
+        distinct_urls=$(sort -u "${temp_file}")
+        found_error=0
+
+        # Options shared by both requests
+        # The time limits stop an unresponsive server from stalling the job
+        curl_options=(--globoff --silent --output /dev/null --location --connect-timeout 10 --max-time 60 --write-out "%{http_code}")
+
+        while read -r url; do
+          if [[ -n "${url}" ]]; then
+            echo "::debug::Checking URL '${url}'..."
+
+            # Some links will probably still fail to resolve, e.g. `localhost`, "some.dummy.url" etc, so don't treat CURL exit codes as fact
+            # We want to identify when a real server responds to the request
+
+            # First try a HEAD request to avoid downloading the whole response
+            status=$(curl "${curl_options[@]}" --head "${url}" || true)
+
+            if [[ "${status}" -eq 404 ]]; then
+              # But not all servers support "HEAD" (e.g. azure.microsoft.com), so try again
+              status=$(curl "${curl_options[@]}" "${url}" || true)
+            fi
+
+            if [[ "${status}" -eq 404 ]]; then
+              # Match the URL as a literal string rather than a regular expression
+              # Join the matches onto one line, as a workflow command cannot span lines
+              # `|| true` stops a URL that cannot be found from aborting the step
+              locations=$(grep --recursive --files-with-matches --fixed-strings -- "${url}" | paste --serial --delimiters=',' - || true)
+              echo "::error::❌ URL '${url}' had status ${status} (found in ${locations})" 1>&2
+              found_error=1
+            else
+              echo "::debug::✅ URL '${url}' had status ${status}"
+            fi
+          fi
+        done <<< "${distinct_urls}"
+
+        # A non-zero exit fails the step, which triggers the Slack notification below
+        exit "${found_error}"
+
+    - name: Slack notification
+      uses: 8398a7/action-slack@v3
+      if: failure()
+      with:
+        fields: repo,message,action,workflow
+        status: ${{ job.status }}
+        channel: "#docs"
+      env:
+        SLACK_WEBHOOK_URL: ${{ inputs.SLACK_WEBHOOK }}
diff --git a/.github/workflows/test-external-links.yml b/.github/workflows/test-external-links.yml
new file mode 100644
index 000000000..a01396b73
--- /dev/null
+++ b/.github/workflows/test-external-links.yml
@@ -0,0 +1,16 @@
+name: Test external links
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 12 * * 1"  # Runs at 12:00 UTC, only on Monday
+
+jobs:
+  test-external-links:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/test-external-links
+        with:
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_DOCS }}