-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapSciRep.js
130 lines (113 loc) · 4.39 KB
/
scrapSciRep.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
// TODO: async library and promises?

// Cookie jar shared by every request in this script so the login session
// established by the first requests is reused by the later ones.
var j = request.jar();

// One entry per scraped manuscript; appended to by appendOutput.
var output = [];

// Base URL that relative hrefs scraped from the site are appended to.
var coreURL = 'http://mts-srep.nature.com/';
// Entry page; also the target of the login POST.
var mainURL = 'http://mts-srep.nature.com/cgi-bin/main.plex';

// Options for the login POST.
// SECURITY: account credentials should not live in source control. They can
// now be supplied through the SCIREP_LOGIN / SCIREP_PASSWORD environment
// variables; the literals below remain only as a backward-compatible fallback
// and should be rotated and removed.
var loginDetails = {
  url: mainURL,
  form: {
    'form_type': 'login_results',
    'j_id': 110,
    'ms_id_key': '15ftdRK61w0uYJQjudLsliMaIQ',
    'login': process.env.SCIREP_LOGIN || 'yaskoike',
    'password': process.env.SCIREP_PASSWORD || '50klab54'
  },
  jar: j,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/601.5.17 (KHTML, like Gecko) Version/9.1 Safari/601.5.17'
  }
};
// Entry point of the scraper:
//   1. GET the main page (populates the cookie jar),
//   2. POST the login form,
//   3. follow the single "Post Decision" link,
//   4. fetch every "Check Status" manuscript page and report it.
// Every failure after login logs out before bailing.
request({url: mainURL, jar: j}, function (error, response, body) {
  if (error) {
    return console.log(error);
  }
  console.log("logging in to Scientific Reports");
  request.post(loginDetails, function (error, response, body) {
    if (error) {
      sciRepLogout(j);
      return console.error(error);
    }
    console.log("logged in to Scientific Reports");
    console.log(breakLine);

    var $ = cheerio.load(body);
    var postDecisionLink = $('a:contains("Post Decision")');
    // Exactly one such link is expected; anything else means the page layout
    // is not what this scraper was written against.
    if (postDecisionLink.length != 1) {
      sciRepLogout(j);
      return console.log('HTML structure mismatched , Author Tasks not found');
    }

    var pdPage = postDecisionLink.attr('href');
    request.get({url: coreURL + pdPage, jar: j}, function (error, response, body) {
      if (error) {
        sciRepLogout(j);
        return console.log('Link access error: Post decision');
      }

      var $ = cheerio.load(body);
      var statusLinks = $('a:contains("Check Status")');
      var pdCount = statusLinks.length;
      if (pdCount === 0) {
        console.log('cries');
        // Nothing to scrape: still close the session instead of leaking it.
        sciRepLogout(j);
        return;
      }

      console.log('Found ' + pdCount + ' Post Decision manuscript(s)');
      console.log(breakLine);

      // The manuscript requests run concurrently and may finish in any order,
      // so "final" must mean "last one to complete", not "last one started".
      // Count completions here and tell appendOutput when none are left.
      // If a manuscript request fails, handleManuscriptPage logs out itself
      // and never invokes this callback, so the counter simply never hits 0.
      var pending = pdCount;
      statusLinks.each(function (index, element) {
        var manuscriptPage = $(element).attr('href');
        handleManuscriptPage(manuscriptPage, false, function (ignoredFinal, manuscriptInfo) {
          pending -= 1;
          appendOutput(pending === 0, manuscriptInfo);
        });
      }); // <-- each manuscript
    });
  });
});
/**
 * Fetches one manuscript page and extracts its summary fields.
 *
 * The resulting object may contain the keys 'Manuscript #',
 * 'Current Revision', 'Current Stage' and 'Title' (taken from the table that
 * holds the "Manuscript #" header) plus 'Status' and 'Date' (second and third
 * cells of the row following the "Approximate Duration" row). Keys whose
 * cells are not found are simply absent.
 *
 * @param {string} manuscriptPagePath - href of the page, appended to coreURL.
 * @param {boolean} final - passed through untouched to the callback.
 * @param {function(boolean, Object)} callback - called with (final, data) on
 *   success. On a request error it is NOT called; the error is logged and
 *   sciRepLogout is invoked instead.
 */
function handleManuscriptPage(manuscriptPagePath, final, callback) {
  request.get({url: coreURL + manuscriptPagePath, jar: j}, function (error, response, body) {
    if (error) {
      sciRepLogout(j);
      return console.error('Link acess error: Manuscript ' + error);
    }

    var $ = cheerio.load(body);
    var rowHeaders = ['Manuscript #', 'Current Revision', 'Current Stage', 'Title'];
    var stageTableHeaders = ['Status', 'Date'];
    var manuscriptData = {};

    // String#replace with a string pattern only removes the FIRST match, which
    // left stray newlines in multi-line cells; a global regex removes them all.
    function stripNewlines(text) {
      return text.replace(/\n/g, '');
    }

    // Conduct the heavy search once, then do the detailed per-row lookups.
    $('table tr th:contains("Manuscript #")').closest('table').children('tr').each(function (rowIndex, element) {
      var row = $(element);
      rowHeaders.forEach(function (header) {
        var headerCell = row.children('th:contains("' + header + '")');
        if (headerCell.length > 0) {
          manuscriptData[header] = stripNewlines(headerCell.next('td').text());
        }
      });
    });

    // Second heavy search: the status table. Only the row directly after the
    // header row is read; slice(1, 3) yields at most two cells, matching the
    // two entries of stageTableHeaders.
    $('table tr:contains("Approximate Duration")').next('tr').children('td').slice(1, 3).each(function (cellIndex, element) {
      manuscriptData[stageTableHeaders[cellIndex]] = stripNewlines($(element).text());
    });

    callback(final, manuscriptData);
  });
} //<--- end handleManuscriptPage
/**
 * Stores one manuscript record in the shared `output` list and prints each of
 * its fields as "name : value", followed by a separator line.
 *
 * @param {boolean} final - when truthy, the session is logged out afterwards.
 * @param {Object} manuscriptInfo - field name -> text scraped for a manuscript.
 */
function appendOutput(final, manuscriptInfo) {
  var fieldName;

  output.push(manuscriptInfo);

  for (fieldName in manuscriptInfo) {
    console.log(fieldName + " : " + manuscriptInfo[fieldName]);
  }
  console.log(breakLine);

  if (!final) {
    return;
  }

  //JSON output here?
  sciRepLogout(j);
}
/**
 * Ends the session by requesting the site's logout URL and reports the
 * outcome on the console.
 *
 * @param {Object} [cookieObject] - cookie jar holding the session to close.
 *   Previously this argument was ignored and the module-level jar `j` was
 *   always used; it is now honoured, with `j` as the fallback when omitted.
 */
function sciRepLogout(cookieObject) {
  var sessionJar = cookieObject || j;

  request.get({url: 'http://mts-srep.nature.com/cgi-bin/main.plex?form_type=logout', jar: sessionJar}, function (error) {
    if (error) {
      console.log(breakLine);
      console.error('Error logging out: ' + error);
      return;
    }
    console.log('Logout completed');
    console.log(breakLine);
  });
}
// Separator line printed between sections of the console output.
// NOTE(review): this is assigned at the very end of the file although it is
// referenced earlier. The `var` declaration is hoisted, and every use sits
// inside a request callback (or a function called from one), so presumably
// this assignment has run before any of them fire — confirm before making any
// part of the flow synchronous.
var breakLine = '-------------------------------------------------';