-
Notifications
You must be signed in to change notification settings - Fork 39
/
index.js
114 lines (98 loc) · 3.25 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
var utils = require('./lib/utils.js');
var req = require('fetch').fetchUrl;
var url = require('url');
var cheerio = require('cheerio');
/**
 * Wraps a parsed document and exposes lazily computed article data.
 *
 * Sets `$`, `originalDOM`, `cache`, `base` and `options` on the instance and
 * defines four read-only accessors (`content`, `title`, `html`, `dom`) that
 * delegate to `getContent`, `getTitle`, `getHTML` and `getDOM`.
 *
 * @param {*} dom - Document handle; stored as both `$` and `originalDOM`.
 * @param {*} options - Extraction options, stored as-is on `this.options`.
 * @param {?Object} uri - Object with `protocol`, `hostname`, `pathname` and
 *   optionally `port`; when falsy, `base` is set to `false`.
 */
function Article(dom, options, uri) {
  this.$ = dom; // Will be modified in-place after analyzing
  // NOTE(review): this is the same reference as `this.$`, not an independent
  // copy, so in-place changes made through `$` are visible here as well.
  this.originalDOM = dom;
  this.cache = {};

  if (uri) {
    var origin = uri.protocol + '//' + uri.hostname;
    // The port belongs to the authority part of the URL, so it has to be
    // appended before the path. Port 80 is left out, as before.
    if (uri.port && Number(uri.port) !== 80) {
      origin += ':' + uri.port;
    }
    this.base = origin + uri.pathname;
  } else {
    this.base = false;
  }

  this.options = options;

  // Accessors stay enumerable and configurable, matching what the legacy
  // __defineGetter__ call produced.
  Object.defineProperties(this, {
    content: {
      enumerable: true,
      configurable: true,
      get: function() {
        return this.getContent(true);
      }
    },
    title: {
      enumerable: true,
      configurable: true,
      get: function() {
        return this.getTitle(true);
      }
    },
    html: {
      enumerable: true,
      configurable: true,
      get: function() {
        return this.getHTML(true);
      }
    },
    dom: {
      enumerable: true,
      configurable: true,
      get: function() {
        return this.getDOM(true);
      }
    }
  });
}
/**
 * Returns the extracted article markup, computing it at most once.
 *
 * On the first call the result of `utils.extract(...).html()` is stored in
 * `this.cache['article-content']`; later calls return the stored value.
 *
 * @returns {*} Whatever `.html()` produced for the extracted content.
 */
Article.prototype.getContent = function() {
  var store = this.cache;
  if (typeof store['article-content'] === 'undefined') {
    // Nothing cached yet: run the extraction and remember its markup.
    var extracted = utils.extract(this.$, this.base, this.options);
    store['article-content'] = extracted.html();
  }
  return store['article-content'];
};
// Better Article Title Extraction.
// Author Zihua Li https://github.com/luin/node-readability
/**
 * Returns the article title, computing it at most once.
 *
 * Resolution order:
 *   1. a value already stored in `this.cache['article-title']`;
 *   2. the trimmed text of the first `.entry-title` / `.instapaper_title`
 *      element;
 *   3. the part of `<title>` before a common separator, when exactly one
 *      kind of separator yields a non-empty first part and that part is
 *      longer than 10 characters;
 *   4. otherwise the whole trimmed `<title>` text.
 *
 * @returns {string} The chosen title (also stored in the cache).
 */
Article.prototype.getTitle = function() {
  if (typeof this.cache['article-title'] !== 'undefined') {
    return this.cache['article-title'];
  }

  // Prefer to pull the title from one of the class names known to hold
  // the article title (Instapaper conventions and
  // https://www.readability.com/developers/guidelines#publisher).
  var preferredTitle = this.$('.entry-title, .instapaper_title');
  if (preferredTitle.length > 0) {
    return this.cache['article-title'] = preferredTitle.first().text().trim();
  }

  var title = this.$('title').text();
  var commonSeparatingCharacters = [' | ', ' _ ', ' - ', '«', '»', '—'];
  var betterTitle;

  // A plain loop is used so that `return` really leaves getTitle; inside a
  // forEach callback it would only end the current iteration.
  for (var i = 0; i < commonSeparatingCharacters.length; i++) {
    var parts = title.split(commonSeparatingCharacters[i]);
    if (parts.length > 1) {
      if (betterTitle) {
        // A second kind of separator also matches, so it is ambiguous which
        // part is the headline: keep the complete title.
        return this.cache['article-title'] = title.trim();
      }
      betterTitle = parts[0].trim();
    }
  }

  // Very short leading parts are more likely a site name than a headline.
  if (betterTitle && betterTitle.length > 10) {
    return this.cache['article-title'] = betterTitle;
  }
  return this.cache['article-title'] = title.trim();
};
/**
 * Returns the document handle stored as `originalDOM` on this article.
 *
 * @returns {*} The value of `this.originalDOM`.
 */
Article.prototype.getDOM = function() {
  var storedDOM = this.originalDOM;
  return storedDOM;
};
/**
 * Serializes the article's current document by calling `html()` on `this.$`.
 *
 * @returns {*} Whatever `this.$.html()` returns.
 */
Article.prototype.getHTML = function() {
  var $ = this.$;
  var markup = $.html();
  return markup;
};
/**
 * Builds an Article from markup or from a URL.
 *
 * When `html` starts with `<` (after optional whitespace) it is parsed
 * directly; otherwise it is treated as a URL, fetched, and the response body
 * is parsed.
 *
 * @param {string} html - Markup, or the URL of a page to fetch.
 * @param {Object|Function} [options] - Options forwarded to the fetcher and
 *   to Article. May be omitted, in which case the callback takes its place
 *   and the built-in defaults are used.
 * @param {Function} callback - Called as `(err)` on failure, or as
 *   `(null, article, uri)` on success, where `uri` is the parsed URL for
 *   fetched pages and `null` for markup passed in directly.
 */
var read = module.exports = function(html, options, callback) {
  if (typeof options === 'function') {
    callback = options;
    options = null;
  }
  // Defaults apply whenever no options object was supplied, whether the
  // argument was skipped or passed as null/undefined.
  if (options == null) {
    options = {
      considerDIVs: true,
      nodesToRemove: 'meta,iframe,noscript,style,aside,object,script'
    };
  }

  // Parses the markup and hands the resulting Article to the callback.
  // The parameter is named `uri` so it does not shadow the `url` module.
  function parseDOM(markup, uri) {
    if (!markup) return callback(new Error('Empty html'));
    var $ = cheerio.load(markup, {
      normalizeWhitespace: true,
      decodeEntities: false
    });
    if ($('body').length < 1) return callback(new Error("No body tag was found"));
    return callback(null, new Article($, options, uri), uri);
  }

  // Report bad input through the callback rather than throwing from
  // `html.match` on a non-string value.
  if (typeof html !== 'string') {
    callback(new TypeError('html must be a string containing markup or a URL'));
    return;
  }

  if (!html.match(/^\s*</)) {
    // Not markup: treat the string as a URL and fetch it first.
    req(html, options, function(err, res, body) {
      if (err) {
        return callback(err);
      }
      parseDOM(body.toString(), url.parse(html));
    });
  } else {
    parseDOM(html, null);
  }
};