┏━━━━━━━━━━━━━━━━━━━┓
┃ SIMPLECRAWLER ┃
┗━━━━━━━━━━━━━━━━━━━┛
SIMPLECRAWLER ==> #Uses HTTP[S].request()
#Does not respect robots.txt
#Automatic link detection, which can be customized
SIMPLECRAWLER.crawl #Returns CRAWLER then runs it.
(URL[, FUNC[, FUNC2]]) #FUNC and FUNC2 are fetchcomplete and fetcherror event handlers.
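
A minimal sketch of the one-liner form above (the example.com URL and handler bodies are placeholders):

  var Crawler = require("simplecrawler");

  // FUNC and FUNC2 become the fetchcomplete and fetcherror handlers.
  var crawler = Crawler.crawl("http://example.com/",
      function(queueItem, responseBuffer, response) {
          console.log("fetched", queueItem.url, "-", responseBuffer.length, "bytes");
      },
      function(queueItem, response) {
          console.error("fetch error on", queueItem.url);
      });
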
CRAWLER.initialPath #First page fetched (def: "/"). Must be specified before start()
CRAWLER.initialPort #Def: 80. Must be specified before start()
CRAWLER.initialProtocol #Def: "http". Must be specified before start()
CRAWLER.useProxy #BOOL
CRAWLER.proxyHostname|Port #
CRAWLER.proxyUser|Pass #
CRAWLER.needsAuth #If true, must then use CRAWLER.authUser|Pass
CRAWLER.customHeaders #REQ headers to add, as an OBJ. Use the fetchstart event handler to set them on a per-request basis.
#By default only host, connection, user-agent.
CRAWLER.userAgent #Def: "Node/SimpleCrawler <version> (http://www.github.com/cgiffard/node-simplecrawler)"
CRAWLER.acceptCookies #Def: true. Use CRAWLER.cookies
CRAWLER.cookies.on
("addcookie", FUNC(COOKIE)) #
CRAWLER.cookies.on
("removecookie",
FUNC(COOKIE_ARR)) #
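
Manual setup sketch for the members above (host, path, header and credential values are illustrative only):

  var Crawler = require("simplecrawler");

  var crawler = new Crawler("example.com");      // constructor sets CRAWLER.host
  crawler.initialProtocol = "https";
  crawler.initialPort     = 443;
  crawler.initialPath     = "/blog/";
  crawler.userAgent       = "MyBot/1.0";
  crawler.customHeaders   = {"Accept-Language": "en"};
  //crawler.useProxy = true;  crawler.proxyHostname = "proxy.local"; crawler.proxyPort = 3128;
  //crawler.needsAuth = true; crawler.authUser = "user"; crawler.authPass = "secret";

  crawler.cookies.on("addcookie", function(cookie) {
      console.log("new cookie:", cookie);
  });

  crawler.start();
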
CRAWLER.discoverResources #If left as is, uses the automatic discovery process, meaning PENDINGREQs are added automatically.
#Can be replaced with a FUNC(STR, PENDINGREQ)->URL_ARR
#All the following members apply only if discoverResources is the default one.
CRAWLER.host #Domain to restrict discovery process (set up by constructor), if CRAWLER.filterByDomain is
#true (def)
CRAWLER.domainWhitelist #Same for additional domains (DOMAIN_ARR), on top of CRAWLER.host
CRAWLER.scanSubdomains #If true (def: false), discovery applies to subdomains
CRAWLER.ignoreWWWDomain #If true (def), discovery also applies to the www subdomain
CRAWLER.addFetchCondition #Filters URLs for discovery. Returns an ID so you can later call CRAWLER.removeFetchCondition(ID).
(FUNC(URL_OBJ)->BOOL) #URL_OBJ: protocol, host, port, path (includes query|hash), uriPath (doesn't include them)
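
Fetch-condition sketch (assumes the crawler instance from the setup sketch above; the PDF rule is only an example):

  // Skip URLs whose path ends in .pdf; everything else stays eligible for discovery.
  var conditionID = crawler.addFetchCondition(function(parsedURL) {
      return !/\.pdf$/i.test(parsedURL.uriPath);
  });

  // Remove the rule again later if it is no longer needed.
  crawler.removeFetchCondition(conditionID);
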
CRAWLER.discoverRegex #REGEXP_ARR of URL to match for discovery (\1 is the link)
#Def:
# - HTML href or src attributes
# - CSS url()
# - any http[s]://* or url() in the text, including text nodes
# - javascript: URIs; will also pick up pure JavaScript code, which results in error requests
CRAWLER.supportedMimeTypes #REGEXP_ARR of MIME types to match for discovery (def: "text/*", "application/javascript",
#"xml/*", "application/rss|html|xhtml")
CRAWLER.downloadUnsupported #If true (def), downloads (but does not follow) unsupported MIME types (I don't think it does anything)
CRAWLER.allowedProtocols #REGEXP_ARR of HTTP-based protocols to match for discovery (def: "http|https|rss|atom|feed|xml")
CRAWLER.stripWWWDomain #If true (def: false), strips the www subdomain from requests. Only if default discoverResources
CRAWLER.stripQueryString #If true (def: false), strips the query string from requests. Only if default discoverResources
CRAWLER.maxResourceSize #Response max size. Def: 16MB
CRAWLER.maxConcurrency #Max. number of requests at once (def: 5)
CRAWLER.interval #Interval between requests in ms (def: 250)
CRAWLER.timeout #Request timeout (def: 5 mins)
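
Tuning sketch for the members above (numbers are arbitrary; assumes the same crawler instance):

  crawler.maxConcurrency   = 2;               // at most 2 requests in flight
  crawler.interval         = 1000;            // 1 s between request starts
  crawler.timeout          = 30 * 1000;       // per-request timeout
  crawler.maxResourceSize  = 2 * 1024 * 1024; // skip bodies over 2 MB
  crawler.scanSubdomains   = true;            // also discover links on subdomains
  crawler.stripQueryString = true;            // treat /page?a=1 as /page
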
CRAWLER.cache #Needs to assign CRAWLER.cache = new SIMPLECRAWLER.cache([VAL[_ARR][, FUNC()]]).
#Usually only SIMPLECRAWLER.cache(TEMP_DIR), but can customize FUNC():
# - FUNC():
# - takes VAL_ARR as arguments
# - new FUNC(VAL_ARR) -> OBJ:
# - setItem(PENDINGREQ, VAL, CALLBACK())
# - getItem(PENDINGREQ, CALLBACK())->VAL
# - saveCache()
# - load(): init function. Might restore after saveCache()
# - Def. OBJ is SIMPLECRAWLER.FilesystemBackend:
# - writes to disk, keys are PENDINGREQ URLs (so it creates a mirror of the site)
# - VAL_ARR is PATH_STR (def: "./cache/"), where to create cache
# - saveCache() saves to PATH/cacheindex.json, and load() reloads it
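
Cache sketch using the default FilesystemBackend ("./cache/" is the documented default path, passed explicitly here; saveCache() is assumed to be reachable on the cache object):

  var Crawler = require("simplecrawler");

  var crawler = new Crawler("example.com");
  crawler.cache = new Crawler.cache("./cache/"); // FilesystemBackend, mirror keyed by URL

  crawler.on("complete", function() {
      crawler.cache.saveCache(); // writes ./cache/cacheindex.json; load() restores it
  });

  crawler.start();
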
CRAWLR.on("crawlstart",FUNC())#
CRAWLER.on("complete", FUNC())#
CRAWLER.on("queueadd",
FUNC(PENDINGREQ)) #When a new PENDINGREQ is added by crawler discovery (not through CRAWLER.queue.add())
CRAWLER.on("queueduplicate",
FUNC(URL_OBJ)) #Fired when a new PENDINGREQ is a duplicate, so it won't be added.
CRAWLER.on("queueerror",
FUNC(ERROR, URL_OBJ)) #Fired when a new PENDINGREQ can't be added because of an error
CRAWLR.on("discoverycomplete",
FUNC(PENDINGREQ, URL_ARR)) #When crawler discovery on one page has been done
CRAWLER.on("fetchstart",
FUNC(PENDINGREQ, OBJ)) #When the crawler starts making the request. Can modify OBJ, which is passed to HTTP[S].get(OBJ)
CRAWLER.on("fetchheaders",
FUNC(PENDINGREQ, RES)) #When headers have been received.
CRAWLER.on("fetchcomplete",
FUNC(PENDINGREQ, BUFFER, RES))#When headers+body have been received. BUFFER is the body
CRAWLER.on("fetchclienterror",
FUNC(PENDINGREQ, ERROR)) #Client-side error
CRAWLER.on
("fetch404|dataerror|error",
FUNC(PENDINGREQ, RES)) #Error 404, other 4**|5** or over max file size (CRAWLER.maxResourceSize)
CRAWLER.on("notmodified", #When 304. OBJ is notmodified is result of CRAWLER.cache
FUNC(PENDINGREQ, RES[, OBJ])) #I don't think it will ever fire since crawler don't use If-Modified, etc.
CRAWLER.on("fetchredirect",
FUNC(PENDINGREQ, URL_OBJ,RES))#When another 3** response is received
CRAWLER.on("fetchtimeout",
FUNC(PENDINGREQ, NUM)) #When request time exceeds threshold NUM
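
Event-wiring sketch for a few of the events above (handler bodies are placeholders; assumes the same crawler instance):

  crawler.on("crawlstart", function() {
      console.log("crawl started");
  });
  crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
      console.log(queueItem.url, "->", responseBuffer.length, "bytes,",
          response.headers["content-type"]);
  });
  crawler.on("fetcherror", function(queueItem, response) {
      console.error("server error", response.statusCode, "on", queueItem.url);
  });
  crawler.on("fetchtimeout", function(queueItem, timeout) {
      console.error("timed out after", timeout, "ms:", queueItem.url);
  });
  crawler.on("complete", function() {
      console.log("done");
  });
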
this.wait() #Returns FUNC(). To be used inside a CRAWLER event handler to pause the crawler until FUNC() is called,
#or until the CRAWLER.listenerTTL (NUM, def: 10000) timeout expires (for async operations).
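
this.wait() sketch for asynchronous work inside a handler (the output path is illustrative):

  var fs = require("fs");

  crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
      // Pause the crawler until continueCrawl() is called (or listenerTTL expires).
      var continueCrawl = this.wait();
      fs.writeFile("/tmp/last-fetched.html", responseBuffer, function(err) {
          if (err) console.error(err);
          continueCrawl();
      });
  });
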
CRAWLER.queue #Can be replaced with anything that implements the FETCHQUEUE interface, which is a PENDINGREQ_ARR also
#implementing some methods (see the source code).
FETCHQUEUE.get(NUM) #Same as FETCHQUEUE[NUM]
FETCHQUEUE.add
(PROTOCOL, HOSTNAME,PORT,PATH,
CALLBACK([ERROR, ]PENDINGREQ))#
CRAWLER.queueURL(URL) #Both methods add a URL to the queue
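
Queueing sketch following the two signatures above (URLs are placeholders):

  // Explicit form: protocol, hostname, port, path, callback.
  crawler.queue.add("http", "example.com", 80, "/extra-page.html",
      function(error, queueItem) {
          if (error) return console.error(error);
          console.log("queued", queueItem.url);
      });

  // Shorthand:
  crawler.queueURL("http://example.com/another-page.html");
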
FETCHQUEUE.freeze|defrost
(JSON_FILE) #Saves|restores current progress. Don't call it often; it is CPU intensive.
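
Freeze/defrost sketch ("queue.json" is an arbitrary filename; the completion callbacks are an assumption, adjust to your version):

  // Save progress on Ctrl-C...
  process.on("SIGINT", function() {
      crawler.queue.freeze("queue.json", function() {
          process.exit();
      });
  });

  // ...and restore it before starting the next run.
  crawler.queue.defrost("queue.json", function() {
      crawler.start();
  });
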
FETCHQUEUE.min|max|avg(VAR_STR)#Computed over all PENDINGREQ.VAR, e.g. "requestTime"
FETCHQUEUE.getWithStatus(STR) #Returns PENDINGREQ_ARR having PENDINGREQ.status STR
FETCHQUEUE.countWithStatus(STR)#Same but returns the count
FETCHQUEUE.complete #Returns number of requests with PENDINGREQ.fetched true
FETCHQUEUE.errors #Returns number of requests with PENDINGREQ.code 4**, 5** or client errors
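
Reporting sketch using the queue statistics above (assumes the same crawler instance):

  crawler.on("complete", function() {
      console.log("downloaded:", crawler.queue.countWithStatus("downloaded"));
      console.log("not found:", crawler.queue.countWithStatus("notfound"));
      console.log("failed:", crawler.queue.countWithStatus("failed"));
      console.log("avg request time:", crawler.queue.avg("requestTime"), "ms");
  });
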
PENDINGREQ.url|protocol|host|
port|path #
PENDINGREQ.fetched #True if finished fetching
PENDINGREQ.status #Can be:
# - "queued": no request made
# - "spooled": request made but no reply
# - "headers": request made and only headers
# - "downloaded": request made and answered
# - "redirected": request made and answered 3**
# - "notfound": request made and answered 404
# - "failed": request made and error while fetching
PENDINGREQ.stateData.code #Status code
PENDINGREQ.stateData.headers #RES.headers
PENDINGREQ.stateData.
contentLength|Type #
PENDINGREQ.stateData. #Length of actual body, not based on header Content-Length.
actualDataSize #If different, PENDINGREQ.sentIncorrectSize will be true
PENDINGREQ.stateData.
requestLatency|downloadTime|
requestTime #Time between fetchstart-fetchheaders, fetchheaders-fetchcomplete, fetchstart-fetchcomplete
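
Sketch of reading per-request state from a finished PENDINGREQ:

  crawler.on("fetchcomplete", function(queueItem) {
      var s = queueItem.stateData;
      console.log(queueItem.url, s.code, s.contentType,
          s.actualDataSize, "bytes in", s.requestTime, "ms");
      if (queueItem.sentIncorrectSize) {
          console.log("Content-Length header did not match the actual body size");
      }
  });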