┏━━━━━━━━━━━━━━━━━━━┓
┃ SIMPLECRAWLER ┃
┗━━━━━━━━━━━━━━━━━━━┛
SIMPLECRAWLER ==> #Uses HTTP[S].request()
#Does not respect robots.txt
#Automatic link detection, which can be customized
SIMPLECRAWLER.crawl #Returns CRAWLER then runs it.
(URL[, FUNC[, FUNC2]]) #FUNC and FUNC2 are fetchcomplete and fetcherror event handlers.
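
A minimal sketch of the one-liner form above (the example.com URL and handler bodies are placeholders):

  var Crawler = require("simplecrawler");

  // FUNC and FUNC2 become the fetchcomplete and fetcherror handlers.
  var crawler = Crawler.crawl("http://example.com/",
      function(queueItem, responseBuffer, response) {
          console.log("fetched", queueItem.url, "-", responseBuffer.length, "bytes");
      },
      function(queueItem, response) {
          console.error("fetch error on", queueItem.url);
      });
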
CRAWLER.initialPath #First page fetched (def: "/"). Must be specified before start()
CRAWLER.initialPort #Def: 80. Must be specified before start()
CRAWLER.initialProtocol #Def: "http". Must be specified before start()
CRAWLER.useProxy #BOOL
CRAWLER.proxyHostname|Port #
CRAWLER.proxyUser|Pass #
CRAWLER.needsAuth #If true, must then use CRAWLER.authUser|Pass
CRAWLER.customHeaders #REQ headers to add, as an OBJ. Use the fetchstart event handler to set them on a per-request basis.
#By default only host, connection, user-agent.
CRAWLER.userAgent #Def: "Node/SimpleCrawler <version> (http://www.github.com/cgiffard/node-simplecrawler)"
CRAWLER.acceptCookies #Def: true. Use CRAWLER.cookies
CRAWLER.cookies.on
("addcookie", FUNC(COOKIE)) #
CRAWLER.cookies.on
("removecookie",
FUNC(COOKIE_ARR)) #
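
Manual setup sketch for the members above (host, path, header and credential values are illustrative only):

  var Crawler = require("simplecrawler");

  var crawler = new Crawler("example.com");      // constructor sets CRAWLER.host
  crawler.initialProtocol = "https";
  crawler.initialPort     = 443;
  crawler.initialPath     = "/blog/";
  crawler.userAgent       = "MyBot/1.0";
  crawler.customHeaders   = {"Accept-Language": "en"};
  //crawler.useProxy = true;  crawler.proxyHostname = "proxy.local"; crawler.proxyPort = 3128;
  //crawler.needsAuth = true; crawler.authUser = "user"; crawler.authPass = "secret";

  crawler.cookies.on("addcookie", function(cookie) {
      console.log("new cookie:", cookie);
  });

  crawler.start();
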
CRAWLER.discoverResources #If left as is, uses the automatic discovery process, meaning PENDINGREQs are added automatically.
#Can be replaced with a FUNC(STR, PENDINGREQ)->URL_ARR
#All the following members apply only if discoverResources is the default one.
CRAWLER.host #Domain to restrict discovery process (set up by constructor), if CRAWLER.filterByDomain is
#true (def)
CRAWLER.domainWhitelist #Same for additional domains (DOMAIN_ARR), on top of CRAWLER.host
CRAWLER.scanSubdomains #If true (def: false), discovery applies to subdomains
CRAWLER.ignoreWWWDomain #If true (def), discovery also applies to the www subdomain
CRAWLER.addFetchCondition #Filters URLs for discovery. Returns an ID so you can later call CRAWLER.removeFetchCondition(ID).
(FUNC(URL_OBJ)->BOOL) #URL_OBJ: protocol, host, port, path (includes query|hash), uriPath (doesn't include them)
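
Fetch-condition sketch (assumes the crawler instance from the setup sketch above; the PDF rule is only an example):

  // Skip URLs whose path ends in .pdf; everything else stays eligible for discovery.
  var conditionID = crawler.addFetchCondition(function(parsedURL) {
      return !/\.pdf$/i.test(parsedURL.uriPath);
  });

  // Remove the rule again later if it is no longer needed.
  crawler.removeFetchCondition(conditionID);
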
CRAWLER.discoverRegex #REGEXP_ARR of URL to match for discovery (\1 is the link)
#Def:
# - HTML href or src attributes
# - CSS url()
# - any http[s]://* or url() in the text, including text nodes
# - javascript: URIs; will also pick up pure JavaScript code, which results in error requests
CRAWLER.supportedMimeTypes #REGEXP_ARR of MIME types to match for discovery (def: "text/*", "application/javascript",
#"xml/*", "application/rss|html|xhtml")
CRAWLER.downloadUnsupported #If true (def), downloads (but does not follow) unsupported MIME types (I don't think it does anything)
CRAWLER.allowedProtocols #REGEXP_ARR of HTTP-based protocols to match for discovery (def: "http|https|rss|atom|feed|xml")
CRAWLER.stripWWWDomain #If true (def: false), strips the www subdomain from requests. Only if default discoverResources
CRAWLER.stripQueryString #If true (def: false), strips the query string from requests. Only if default discoverResources
CRAWLER.maxResourceSize #Response max size. Def: 16MB
CRAWLER.maxConcurrency #Max. number of requests at once (def: 5)
CRAWLER.interval #Interval between requests in ms (def: 250)
CRAWLER.timeout #Request timeout (def: 5 mins)
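
Tuning sketch for the members above (numbers are arbitrary; assumes the same crawler instance):

  crawler.maxConcurrency   = 2;               // at most 2 requests in flight
  crawler.interval         = 1000;            // 1 s between request starts
  crawler.timeout          = 30 * 1000;       // per-request timeout
  crawler.maxResourceSize  = 2 * 1024 * 1024; // skip bodies over 2 MB
  crawler.scanSubdomains   = true;            // also discover links on subdomains
  crawler.stripQueryString = true;            // treat /page?a=1 as /page
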
CRAWLER.cache #Needs to assign CRAWLER.cache = new SIMPLECRAWLER.cache([VAL[_ARR][, FUNC()]]).
#Usually only SIMPLECRAWLER.cache(TEMP_DIR), but can customize FUNC():
# - FUNC():
# - takes VAL_ARR as arguments
# - new FUNC(VAL_ARR) -> OBJ:
# - setItem(PENDINGREQ, VAL, CALLBACK())
# - getItem(PENDINGREQ, CALLBACK())->VAL
# - saveCache()
# - load(): init function. Might restore after saveCache()
# - Def. OBJ is SIMPLECRAWLER.FilesystemBackend:
# - writes to disk, keys are PENDINGREQ URLs (so it creates a mirror of the site)
# - VAL_ARR is PATH_STR (def: "./cache/"), where to create cache
# - saveCache() saves to PATH/cacheindex.json, and load() reloads it
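
Cache sketch using the default FilesystemBackend ("./cache/" is the documented default path, passed explicitly here; saveCache() is assumed to be reachable on the cache object):

  var Crawler = require("simplecrawler");

  var crawler = new Crawler("example.com");
  crawler.cache = new Crawler.cache("./cache/"); // FilesystemBackend, mirror keyed by URL

  crawler.on("complete", function() {
      crawler.cache.saveCache(); // writes ./cache/cacheindex.json; load() restores it
  });

  crawler.start();
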
CRAWLR.on("crawlstart",FUNC())#
CRAWLER.on("complete", FUNC())#
CRAWLER.on("queueadd",
FUNC(PENDINGREQ)) #When a new PENDINGREQ is added by crawler discovery (not through CRAWLER.queue.add())
CRAWLER.on("queueduplicate",
FUNC(URL_OBJ)) #Fired when a new PENDINGREQ is a duplicate, so it won't be added.
CRAWLER.on("queueerror",
FUNC(ERROR, URL_OBJ)) #Fired when a new PENDINGREQ can't be added because of an error
CRAWLR.on("discoverycomplete",
FUNC(PENDINGREQ, URL_ARR)) #When crawler discovery on one page has been done
CRAWLER.on("fetchstart",
FUNC(PENDINGREQ, OBJ)) #When the crawler starts making the request. Can modify OBJ, which is passed to HTTP[S].get(OBJ)
CRAWLER.on("fetchheaders",
FUNC(PENDINGREQ, RES)) #When headers have been received.
CRAWLER.on("fetchcomplete",
FUNC(PENDINGREQ, BUFFER, RES))#When headers+body have been received. BUFFER is the body
CRAWLER.on("fetchclienterror",
FUNC(PENDINGREQ, ERROR)) #Client-side error
CRAWLER.on
("fetch404|dataerror|error",
FUNC(PENDINGREQ, RES)) #Error 404, other 4**|5** or over max file size (CRAWLER.maxResourceSize)
CRAWLER.on("notmodified", #When 304. OBJ is notmodified is result of CRAWLER.cache
FUNC(PENDINGREQ, RES[, OBJ])) #I don't think it will ever fire since crawler don't use If-Modified, etc.
CRAWLER.on("fetchredirect",
FUNC(PENDINGREQ, URL_OBJ,RES))#When another 3** response is received
CRAWLER.on("fetchtimeout",
FUNC(PENDINGREQ, NUM)) #When request time exceeds threshold NUM
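
Event-wiring sketch for a few of the events above (handler bodies are placeholders; assumes the same crawler instance):

  crawler.on("crawlstart", function() {
      console.log("crawl started");
  });
  crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
      console.log(queueItem.url, "->", responseBuffer.length, "bytes,",
          response.headers["content-type"]);
  });
  crawler.on("fetcherror", function(queueItem, response) {
      console.error("server error", response.statusCode, "on", queueItem.url);
  });
  crawler.on("fetchtimeout", function(queueItem, timeout) {
      console.error("timed out after", timeout, "ms:", queueItem.url);
  });
  crawler.on("complete", function() {
      console.log("done");
  });
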
this.wait() #Returns FUNC(). To be used inside a CRAWLER event handler to pause the crawler until FUNC() is called,
#or until the CRAWLER.listenerTTL (NUM, def: 10000) timeout expires (for async operations).
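
this.wait() sketch for asynchronous work inside a handler (the output path is illustrative):

  var fs = require("fs");

  crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
      // Pause the crawler until continueCrawl() is called (or listenerTTL expires).
      var continueCrawl = this.wait();
      fs.writeFile("/tmp/last-fetched.html", responseBuffer, function(err) {
          if (err) console.error(err);
          continueCrawl();
      });
  });
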
CRAWLER.queue #Can be replaced with anything that implements the FETCHQUEUE interface, which is a PENDINGREQ_ARR also
#implementing some methods (see the source code).
FETCHQUEUE.get(NUM) #Same as FETCHQUEUE[NUM]
FETCHQUEUE.add
(PROTOCOL, HOSTNAME,PORT,PATH,
CALLBACK([ERROR, ]PENDINGREQ))#
CRAWLER.queueURL(URL) #Both methods add a URL to the queue
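
Queueing sketch following the two signatures above (URLs are placeholders):

  // Explicit form: protocol, hostname, port, path, callback.
  crawler.queue.add("http", "example.com", 80, "/extra-page.html",
      function(error, queueItem) {
          if (error) return console.error(error);
          console.log("queued", queueItem.url);
      });

  // Shorthand:
  crawler.queueURL("http://example.com/another-page.html");
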
FETCHQUEUE.freeze|defrost
(JSON_FILE) #Saves|restores current progress. Don't call it often; it is CPU intensive.
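
Freeze/defrost sketch ("queue.json" is an arbitrary filename; the completion callbacks are an assumption, adjust to your version):

  // Save progress on Ctrl-C...
  process.on("SIGINT", function() {
      crawler.queue.freeze("queue.json", function() {
          process.exit();
      });
  });

  // ...and restore it before starting the next run.
  crawler.queue.defrost("queue.json", function() {
      crawler.start();
  });
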
FETCHQUEUE.min|max|avg(VAR_STR)#Computed over all PENDINGREQ.VAR, e.g. "requestTime"
FETCHQUEUE.getWithStatus(STR) #Returns PENDINGREQ_ARR having PENDINGREQ.status STR
FETCHQUEUE.countWithStatus(STR)#Same but returns the count
FETCHQUEUE.complete #Returns number of requests with PENDINGREQ.fetched true
FETCHQUEUE.errors #Returns number of requests with PENDINGREQ.code 4**, 5** or client errors
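
Reporting sketch using the queue statistics above (assumes the same crawler instance):

  crawler.on("complete", function() {
      console.log("downloaded:", crawler.queue.countWithStatus("downloaded"));
      console.log("not found:", crawler.queue.countWithStatus("notfound"));
      console.log("failed:", crawler.queue.countWithStatus("failed"));
      console.log("avg request time:", crawler.queue.avg("requestTime"), "ms");
  });
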
PENDINGREQ.url|protocol|host|
port|path #
PENDINGREQ.fetched #True if finished fetching
PENDINGREQ.status #Can be:
# - "queued": no request made
# - "spooled": request made but no reply
# - "headers": request made and only headers
# - "downloaded": request made and answered
# - "redirected": request made and answered 3**
# - "notfound": request made and answered 404
# - "failed": request made and error while fetching
PENDINGREQ.stateData.code #Status code
PENDINGREQ.stateData.headers #RES.headers
PENDINGREQ.stateData.
contentLength|Type #
PENDINGREQ.stateData. #Length of actual body, not based on header Content-Length.
actualDataSize #If different, PENDINGREQ.sentIncorrectSize will be true
PENDINGREQ.stateData.
requestLatency|downloadTime|
requestTime #Time between fetchstart-fetchheaders, fetchheaders-fetchcomplete, fetchstart-fetchcomplete
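
Sketch of reading per-request state from a finished PENDINGREQ:

  crawler.on("fetchcomplete", function(queueItem) {
      var s = queueItem.stateData;
      console.log(queueItem.url, s.code, s.contentType,
          s.actualDataSize, "bytes in", s.requestTime, "ms");
      if (queueItem.sentIncorrectSize) {
          console.log("Content-Length header did not match the actual body size");
      }
  });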