-
Notifications
You must be signed in to change notification settings - Fork 140
Scraping
Node.io includes a robust framework for scraping data from the web. The primary methods for scraping data are `get` and `getHtml`, although there are methods for making any type of request, modifying headers, etc. See the API for a full list of methods.
A note before you start scraping
The `--debug` switch is your friend - use it to see the request and response headers, and whether there was an error with the request. If your scraping job is behaving unexpectedly, `--debug` will show you what's going on under the hood.
node.io --debug my_scraping_job
Example 1: Save a web page to disk
save.js
var nodeio = require('node.io');
exports.job = new nodeio.Job({
input: false,
run: function () {
var url = this.options.args[0];
this.get(url, function(err, data) {
if (err) {
this.exit(err);
} else {
this.emit(data);
}
});
}
});
save.coffee
nodeio = require 'node.io'

# Requests the URL given as the first command-line argument and emits
# whatever the request returns.
class SavePage extends nodeio.JobClass
    input: false

    run: ->
        target = @options.args[0]
        # Fat arrow keeps `@` bound to the job inside the callback.
        @get target, (err, data) =>
            if err?
                @exit err
            else
                @emit data

@class = SavePage
@job = new SavePage()
To save a page to disk, run
$ node.io -s save "http://www.google.com/" > google.html
Which is equivalent to
$ curl "http://www.google.com/" > google.html
Example 2: Get the number of Google results for a list of keywords
To use node.io effectively, try to encapsulate common scraping code in `run()` so that the resulting job is as generic and versatile as possible. Each thread should contain only one request where possible.
keywords.js
var nodeio = require('node.io'), options = {timeout: 10};
exports.job = new nodeio.Job(options, {
input: ['hello', 'foobar', 'weather'],
run: function (keyword) {
this.getHtml('http://www.google.com/search?q=' + encodeURIComponent(keyword), function (err, $) {
var results = $('#resultStats').text.toLowerCase();
this.emit(keyword + ' has ' + results);
});
}
});
Note: you can also override the input at the command line using the `-i` switch, e.g.
$ node.io -i list_of_words.txt keywords
Example 3: Scraping a page using CSS selector / traversal methods
When using `getHtml(url, callback)`, the second argument of `callback` is `$`, an object similar to jQuery's. For advanced usage of `$`, see the API.
reddit.js - scrape the front page stories from reddit.com
var nodeio = require('node.io');

var methods = {
    input: false,

    /**
     * Requests the reddit front page, pairs each story title with its score
     * and emits an array of "[score] title" strings.
     */
    run: function() {
        this.getHtml('http://www.reddit.com/', function(err, $) {
            //Handle any request / parsing errors. Return explicitly so the
            //rest of the callback never runs with an unusable `$`.
            if (err) {
                this.exit(err);
                return;
            }

            var titles = [], scores = [], output = [];

            //Select all titles on the page
            $('div#siteTable a.title').each(function(a) {
                titles.push(a.text);
            });

            //Select all scores on the page
            $('div#siteTable div.score.unvoted').each(function(div) {
                scores.push(div.rawtext); //rawtext doesn't decode entities or trim the text
            });

            //Mismatch? page probably didn't load properly
            if (scores.length !== titles.length) {
                this.exit('Title / score mismatch');
                return; //don't pair up misaligned titles and scores
            }

            for (var i = 0, len = scores.length; i < len; i++) {
                //Ignore upcoming stories
                if (scores[i] === '•') continue;

                //Check the data is ok
                this.assert(scores[i]).isInt();

                //Output = [score] title
                output.push('[' + scores[i] + '] ' + titles[i]);
            }

            this.emit(output);
        });
    }
};

exports.job = new nodeio.Job({timeout:10}, methods);
reddit.coffee
nodeio = require 'node.io'

# Requests the reddit front page, pairs each story title with its score and
# emits an array of "[score] title" strings.
class Reddit extends nodeio.JobClass
    input: false

    run: ->
        @getHtml 'http://www.reddit.com/', (err, $, data) =>
            # Handle any request / parsing errors. Return explicitly so the
            # rest of the callback never runs with an unusable `$`.
            if err
                @exit err
                return

            # Keep the accumulators local to this callback so repeated runs
            # never share or re-append to earlier results.
            titles = []
            scores = []
            output = []

            # Select all titles on the page
            $('div#siteTable a.title').each (a) -> titles.push a.text

            # Select all scores on the page (rawtext is the untrimmed text)
            $('div#siteTable div.score.unvoted').each (div) -> scores.push div.rawtext

            # Mismatch? page probably didn't load properly
            if scores.length isnt titles.length
                @exit 'Title / score mismatch'
                return

            for score, i in scores
                # Ignore upcoming stories
                continue if score is '•'
                # Check the data is ok
                @assert(score).isInt()
                output.push '[' + score + '] ' + titles[i]

            @emit output

@class = Reddit
@job = new Reddit({timeout:10})