This repository has been archived by the owner on Jan 27, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.js
127 lines (111 loc) · 4.26 KB
/
app.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"use strict";
/////Includes
const Q = require("q");
const Nightmare = require("nightmare");
Nightmare.Promise = Q.Promise;
/////
/**
 * Opens the first results page of the hard-coded search and reads the
 * pagination bar to find the highest page number listed there.
 *
 * @returns {Object} The Nightmare chain, consumed by callers through `.then()`.
 *   It yields the integer parsed from the last pagination cell that is neither
 *   the current page nor a prev/next arrow, or NaN when that text is empty or
 *   not numeric (e.g. no pagination bar was found).
 */
function getItemsListPageNumber(){
    const url = "https://www.google.fr/search?q=plop";
    console.log("Visiting url : "+url);
    return new Nightmare()
        .goto(url)
        .wait()
        .inject("js", "node_modules/jquery/dist/jquery.js")
        .evaluate(function() {
            // This function is executed client side, in the page opened in the
            // Electron browser; `$` is available there because of the inject() call.
            var lastPageLabel = $("table#nav td:not(.navend):not(.cur)").last().text();
            // Always parse as decimal: without an explicit radix a label such as
            // "0x10" would be read as hexadecimal.
            return parseInt(lastPageLabel, 10);
        })
        .end(); // stops the navigation
}
/**
 * Crawls result pages 1..pageNumber of the hard-coded search and collects the
 * items extracted from each of them.
 *
 * One getItemsListForPage() call is started per page before any of them is
 * awaited, so all pages are crawled concurrently.
 *
 * @param {number} pageNumber Highest page to crawl (pages are numbered from 1).
 *   When it is NaN or lower than 1 no page is requested and the result is [].
 * @returns {Object} A Q promise fulfilled with an array holding, for each
 *   page in order, the value produced by getItemsListForPage(). It is rejected
 *   when at least one page crawl was rejected.
 */
function getItemsListsForPages(pageNumber){
    console.log("let's crawl pages from pageNumber 1 to "+pageNumber);
    const baseUrl = "https://www.google.com/search?q=plop&start=";
    const resultsPerPage = 10;
    const promises = [];
    for(let page=1; page<=pageNumber; page++){
        // `start` is a zero-based result offset: page 1 -> start=0, page 2 -> start=10, ...
        promises.push(getItemsListForPage(baseUrl+((page-1)*resultsPerPage)));
    }
    return Q.allSettled(promises)
        .then(function (results) {
            console.log("allSettled results : "+JSON.stringify(results, null, 4));
            const allResults = [];
            results.forEach(function (result) {
                if (result.state === "fulfilled") {
                    console.log("allSettled one promise fulfilled");
                    console.log("fulfilled promise data : "+JSON.stringify(result.value, null, 4));
                    allResults.push(result.value);
                } else {
                    // A rejected snapshot carries `reason`, not `value`.
                    console.log("allSettled one promise rejected");
                    console.log("allSettled error : "+result.reason);
                    throw new Error("Let's crash dirty, one promise among others rejected");
                }
            });
            console.log("allSettled returning array of results from all crawled pages");
            return allResults;
        }).fail(function(error){
            console.log("allSettled error : "+error);
            // Rethrow after logging: otherwise the returned promise would be
            // fulfilled with undefined and callers would take their success path.
            throw error;
        });
}
/**
 * Crawls one results page and extracts, for every `div.g` element, the link
 * and the title found in its `h3 > a` anchor.
 *
 * @param {string} url Address of the results page to open.
 * @returns {Object} A promise fulfilled with an array of
 *   `{ itemPageLink, itemTitle }` objects (one per `div.g` element), or
 *   rejected with the original error when the crawl fails.
 */
function getItemsListForPage(url){
    console.log("crawling url : "+url);
    return new Nightmare()
        .goto(url)
        .wait()
        .inject("js", "node_modules/jquery/dist/jquery.js")
        .evaluate(function() {
            // This function is executed client side, in the page opened in the
            // Electron browser; `$` is available there because of the inject() call.
            var itemsList = [];
            $("div.g").each(function() {
                // Look the anchor up once and read both fields from it.
                var anchor = $(this).find("h3 > a");
                itemsList.push({
                    itemPageLink: anchor.attr("href"),
                    itemTitle: anchor.text()
                });
            });
            return itemsList;
        })
        .end() // stops the navigation
        .then(function(results){
            console.log("page crawled with success : "+url);
            console.log("extracted data : "+JSON.stringify(results, null, 4));
            return results;
        }, function(error){
            console.log("page crawl failed : "+url+" : "+error);
            // Keep the promise rejected. Returning the error here would fulfil
            // the promise with an Error object, which callers would then treat
            // as page data.
            throw error;
        });
}
/**
 * Final objective of the script: obtain, in a single array, the content
 * crawled from every results page.
 *
 * First resolves the page count with getItemsListPageNumber(), then hands that
 * count to getItemsListsForPages(). This is the compact form of a nested
 * then/then chain that would log each intermediate outcome; no rejection
 * handler is attached here, so failures are left for the caller to deal with.
 *
 * @returns {Object} The promise produced by chaining both steps; it settles
 *   with whatever getItemsListsForPages() settles with.
 */
function workingAsExpected(){
    const pageCountLookup = getItemsListPageNumber();
    // The resolved page count is passed straight through as the only argument.
    const allPagesCrawl = pageCountLookup.then(getItemsListsForPages);
    return allPagesCrawl;
}
// Script entry point: start the whole crawl and report its outcome on the console.
// Q.try() turns a synchronous throw from the start-up code into a rejection,
// which the trailing catch handler logs.
Q.try(function startCrawl(){
    const crawlPromise = workingAsExpected();
    console.log("workingAsExpected return : " +crawlPromise);

    // Fulfilled: prints the array containing the results from all the crawled pages.
    function reportSuccess(crawledPages){
        console.log("workingAsExpected main success : "+JSON.stringify(crawledPages, null, 4));
    }

    // Rejected: prints the reason the crawl failed.
    function reportFailure(reason){
        console.log("workingAsExpected main error : "+reason);
    }

    // done() terminates the chain so an error thrown by a handler is not silently lost.
    crawlPromise
        .then(reportSuccess, reportFailure)
        .done();
}).catch(function reportStartupError(e){
    console.log("catch error : "+e);
});