forked from github/docs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck-english-links.js
executable file
·143 lines (120 loc) · 5 KB
/
check-english-links.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env node
// Node built-ins: used below to build the log file path and to append results to it.
const path = require('path')
const fs = require('fs')
// Third-party helpers: the link crawler, the CLI option parser, and array utilities.
const linkinator = require('linkinator')
const program = require('commander')
const { pull, uniq } = require('lodash')
// Single crawler instance; `main` subscribes to its `link` event and starts the scan.
const checker = new linkinator.LinkChecker()
// Synchronous variants, used to reset the log directory before a scan.
const rimraf = require('rimraf').sync
const mkdirp = require('mkdirp').sync
// Base URL of the docs site and of its English content (the default scan target).
const root = 'https://docs.github.com'
const englishRoot = `${root}/en`
// Deprecated Enterprise Server releases; joined into a skip pattern further down.
const { deprecated } = require('../lib/enterprise-server-releases')
// HTTP client used to re-check links reported with one of the retry status codes.
const got = require('got')
// Links with these codes may or may not really be broken.
// `Invalid` is the placeholder `main` assigns to broken links that have no status code.
const retryStatusCodes = [429, 503, 'Invalid']
// [start-readme]
//
// This script runs once per day via a scheduled GitHub Action to check all links in
// English content, not including deprecated Enterprise Server content. It opens an issue
// if it finds broken links. To exclude a link path, add it to `lib/excluded-links.js`.
// Note that linkinator sometimes returns 429 and 503 errors for links that are not actually
// broken, so this script double-checks those using `got`.
//
// [end-readme]
// Declare the command-line interface one statement at a time. Every call below
// configures the same commander instance, and argv is parsed only after all
// options have been registered.
program.description('Check all links in the English docs.')
program.option('-d, --dry-run', 'Turn off recursion to get a fast minimal report (useful for previewing output).')
program.option('-r, --do-not-retry', `Do not retry broken links with status codes ${retryStatusCodes.join(', ')}.`)
program.option('-p, --path <PATH>', `Provide an optional path to check. Best used with --dry-run. Default: ${englishRoot}`)
program.parse(process.argv)
// Link paths that are explicitly excluded live in their own module.
const excludedLinks = require('../lib/excluded-links')

// Build one URL prefix per non-English language code so translated content is skipped.
const languagesToSkip = Object.keys(require('../lib/languages')).reduce(
  (prefixes, code) => (code === 'en' ? prefixes : prefixes.concat(`${root}/${code}`)),
  []
)

// Deprecated Enterprise content is skipped in both URL shapes:
//   old: https://docs.github.com/enterprise/2.1/
//   new: https://docs.github.com/enterprise-server@2.1/
const deprecatedReleasePattern = deprecated.join('|')
const enterpriseReleasesToSkip = new RegExp(`${root}.+?[/@](${deprecatedReleasePattern})(/|$)`)

// Read the parsed CLI options once; `path` is renamed so it does not shadow the `path` module.
const { path: pathToCheck, dryRun } = program.opts()

const config = {
  // Scan the requested path, falling back to the English root.
  path: pathToCheck || englishRoot,
  concurrency: 300,
  // A dry run checks only the starting page, so recursion is switched off.
  recurse: !dryRun,
  silent: true,
  // The values in this array are treated as regexes.
  linksToSkip: [
    enterpriseReleasesToSkip,
    ...languagesToSkip,
    ...excludedLinks
  ]
}
// Kick off the scan. `main` exits the process itself on the normal paths, so this
// handler only runs for unexpected failures (the crawler rejecting, the log directory
// not being writable, ...). Without it the rejection would be unhandled, and some Node
// versions only print a warning and still exit 0, hiding the failure from the caller.
main().catch(err => {
  console.error(err)
  process.exit(1)
})

/**
 * Crawl the configured path, log every checked link, optionally re-check links whose
 * status is listed in `retryStatusCodes`, and report what is still broken.
 *
 * Exits the process with code 0 when no broken links remain and with code 1 otherwise.
 *
 * @returns {Promise<void>} Settles only if the process has not exited; rejects when the
 *   scan itself fails.
 */
async function main () {
  // Clear and recreate a directory for logs.
  const logFile = path.join(__dirname, '../.linkinator/full.log')
  const logDir = path.dirname(logFile)
  rimraf(logDir)
  mkdirp(logDir)

  // Append to the logfile after each checked link.
  checker.on('link', result => {
    // We don't need to dump all of the HTTP and HTML details.
    delete result.failureDetails
    fs.appendFileSync(logFile, JSON.stringify(result) + '\n')
  })

  // Start the scan; events will be logged as they occur.
  const result = (await checker.check(config)).links

  // Scan is complete! Keep only the broken links.
  const brokenLinks = result.filter(link => link.state === 'BROKEN')

  // Replace missing status codes with the string `Invalid` so they can be displayed
  // and matched against `retryStatusCodes`.
  brokenLinks.forEach(link => {
    link.status = link.status || 'Invalid'
  })

  if (!program.opts().doNotRetry) {
    // Links to retry individually.
    const linksToRetry = brokenLinks.filter(link => retryStatusCodes.includes(link.status))

    await Promise.all(linksToRetry.map(async link => {
      try {
        // got throws an HTTPError if response code is not 2xx or 3xx.
        // If got succeeds, the link works and is removed from the broken list.
        await got(link.url)
        pull(brokenLinks, link)
      } catch (err) {
        // Deliberately ignored: a failed retry confirms the link is broken, and it is
        // already in the broken list.
      }
    }))
  }

  // Exit successfully if no broken links!
  if (!brokenLinks.length) {
    console.log('All links are good!')
    process.exit(0)
  }

  // Format and display the results.
  console.log(`${brokenLinks.length} broken links found on docs.github.com\n`)
  displayBrokenLinks(brokenLinks)

  // Exit unsuccessfully if broken links are found.
  process.exit(1)
}
/**
 * Print broken links to stdout, grouped by status code, as Markdown sections that each
 * contain a fenced block of pretty-printed JSON objects.
 *
 * Groups appear in the order their status code is first seen. A link with a missing
 * (falsy) status is grouped and printed under the status `Invalid`. The objects passed
 * in are not modified.
 *
 * @param {Array<Object>} brokenLinks - Link results; each may carry `status` and
 *   `failureDetails` properties.
 * @returns {void}
 */
function displayBrokenLinks (brokenLinks) {
  // Group in a single pass, keyed by the same coerced status that is shown in the
  // heading, so links without a status are listed instead of producing an empty
  // "Invalid" section.
  const linksByStatus = new Map()
  brokenLinks.forEach(link => {
    const statusCode = link.status || 'Invalid'
    if (!linksByStatus.has(statusCode)) {
      linksByStatus.set(statusCode, [])
    }
    linksByStatus.get(statusCode).push(link)
  })

  linksByStatus.forEach((linksForStatus, statusCode) => {
    console.log(`## Status ${statusCode}: Found ${linksForStatus.length} broken links`)
    console.log('```')
    linksForStatus.forEach(link => {
      // Print a shallow copy so the caller's object stays untouched.
      const printable = Object.assign({}, link, { status: statusCode })
      // We don't need to dump all of the HTTP and HTML details.
      delete printable.failureDetails
      console.log(JSON.stringify(printable, null, 2))
    })
    console.log('```')
  })
}