-
Notifications
You must be signed in to change notification settings - Fork 0
/
philosophy.groovy
executable file
·66 lines (58 loc) · 2.13 KB
/
philosophy.groovy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env groovy
// https://en.wikipedia.org/wiki/Wikipedia:Getting_to_Philosophy
@Grab(group='org.ccil.cowan.tagsoup', module='tagsoup', version='1.2' )
history = new HashSet()
slurper = new XmlSlurper( new org.ccil.cowan.tagsoup.Parser() )
// more reliable than a regex :s
cleanParens = { doc ->
def reader = new StringReader( doc )
int depth = 0
def inquote = false
def c
def open = '(' as char
def close = ')' as char
def clean = new StringBuilder()
while( (c = reader.read()) != -1)
{
c = c as char
if( !inquote && c == open) depth++
else if(!inquote && c == close && depth) depth--
else if(!inquote && c == close && !depth) clean.append(c) // avoid hits on '1) item one <b> 2) item two'
else if( c == '"' as char && !depth)
{
clean.append(c)
inquote = !inquote
}
else if( !depth ) clean.append(c)
}
clean.toString()
}
doIt = { pageName ->
def raw = new URL("https://en.wikipedia.org/wiki/${pageName}").text
def html = slurper.parseText( cleanParens(raw) )
def content = html."**".find{ it.@id == "mw-content-text" }
def links = content."**".findAll{
it.name() == 'a' &&
it.@href?.text()?.length() != 0 &&
it.@href?.text()?.indexOf('#') == -1 &&
[email protected]() == 0 &&
[email protected]().indexOf("action=edit") == -1
}
// see if we can find a link with a 'p' parent, or return the first (this weeds out a lot of junk)
// links on disambiguation pages are sometimes not in a 'p'
// check for parent->parent == mw-content-text to avoid sidebar links..
def link = links.find{
it.parent().name() == 'p' &&
it.parent().parent().@id == "mw-content-text" &&
!history.contains( it.@href?.text() )
} ?: links.find{ !history.contains( it.@href?.text() ) }
def linkToken = link.@href?.text()?.split("/")?.getAt(-1)
history.add( link.@href?.text() )
if( linkToken != "Philosophy" )
{
println linkToken
doIt( linkToken )
}
}
doIt(args[0] )
println "\nTotal links to get to Philosophy: ${history.size()}"