-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.js
111 lines (103 loc) · 3.12 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
var request = require('request');
var cheerio = require('cheerio');
var fs = require("fs");
var util = require("util");
// Example profile page URL. Not referenced anywhere else in the visible code.
var profileUrl = "https://www.ssmatri.com/ssnmlprofile.php?id=19909";
// Not referenced anywhere else in the visible code.
var totalPages = 0;
// Shared accumulator for the text of the profile currently being parsed:
// searchForTrTags appends fragments to it, searchForElements resets and reads it.
var finalString = "";
// Entry point: start reading the input file as soon as the module loads.
// (Function declarations are hoisted, so calling before the definition is fine.)
readAndProcessProfiles();
/**
 * Read `inputfile.txt`, take the first space-separated token of every
 * non-blank row as a profile URL, and schedule `processFiles` for each URL,
 * one every two minutes (the first one immediately).
 *
 * The number of scheduled requests follows the number of URLs actually found
 * in the file instead of a fixed count, so short files no longer schedule
 * requests for `undefined` and long files are no longer truncated.
 *
 * Read errors are logged and nothing is scheduled.
 */
function readAndProcessProfiles() {
    // Delay between two consecutive profile requests, in milliseconds.
    var REQUEST_INTERVAL_MS = 120000;

    fs.readFile('inputfile.txt', 'utf8', function (err, data) {
        if (err) {
            return console.log(err);
        }
        console.log(data);

        // Accept both LF and CRLF line endings; blank rows carry no URL.
        var profileLinks = data.split(/\r?\n/)
            .map(function (row) {
                return row.trim().split(" ")[0];
            })
            .filter(function (link) {
                return link !== "";
            });

        profileLinks.forEach(function (link, index) {
            setTimeout(function () {
                processFiles(link);
            }, index * REQUEST_INTERVAL_MS);
        });
    });
}
/**
 * Download one profile page and hand its `.proview` element(s) to
 * `searchForElements`.
 *
 * @param {string} urlName - URL of the profile page to fetch.
 *
 * A failed request is logged and then skipped: there is no response body to
 * parse in that case, so parsing must not be attempted.
 */
function processFiles(urlName) {
    console.log('url ' + urlName + new Date());
    request(urlName, function (error, response, body) {
        if (error) {
            console.log("error: " + error);
            return;
        }
        var $ = cheerio.load(body);
        var proviewtab = $(".proview");
        searchForElements(proviewtab);
    });
}
/**
 * Find where the value that starts at `fromIndex` ends.
 *
 * Walks the labels from `nextKeyIndex` onwards (in label order) and returns
 * the position of the first one that occurs at or after `fromIndex`; when
 * none of them occurs, the value runs to the end of `text`.
 */
function findValueEnd(text, keys, nextKeyIndex, fromIndex) {
    for (var n = nextKeyIndex; n < keys.length; n++) {
        var position = text.indexOf(keys[n], fromIndex);
        if (position !== -1) {
            return position;
        }
    }
    return text.length;
}

/**
 * Flatten the profile table into the shared `finalString` and print the
 * value that follows each known label as one comma-separated line.
 *
 * Two lines are logged: the flattened text, then the values (with every ':'
 * removed and the first two characters dropped, as before).
 *
 * @param {Object} proviewTab - array-like whose first entry holds the table
 *   at `children[1].children[1].children` (presumably the cheerio selection
 *   for `.proview`; verify against the caller). Nothing is logged when that
 *   path does not exist.
 *
 * Values are cut between consecutive labels. A label that is absent yields
 * an empty value, and the last label present runs to the end of the text.
 */
function searchForElements(proviewTab) {
    var keys = [' DOB -Time-Place', 'Sect-Gothram-Star',
        'Qual-Job-Place', 'Income(p.m)',
        'Ht -Wt-Complexion',
        'MotherTongue',
        "OtherLang.Known(Speaking)",
        'Nativity',
        'ParentsAlive?',
        'Father',
        'Mother ',
        'Brothers',
        'Sisters',
        'Status--Property',
        'VisaStatus',
        'Talents/Achievements',
        'Likes/Hobbies',
        'VehicleDrivingknown',
        'AnyotherDetails'
    ];

    // Walk the expected nesting one level at a time so an unexpected page
    // layout is skipped instead of throwing.
    var wrapper = proviewTab && proviewTab[0] && proviewTab[0].children && proviewTab[0].children[1];
    var table = wrapper && wrapper.children && wrapper.children[1];
    if (!table || !table.children) {
        return;
    }

    var htmlElement = table.children;
    finalString = "";
    for (var j = 0; j < htmlElement.length; j++) {
        searchForTrTags(htmlElement[j], 14);
    }
    console.log(finalString);

    var finalValueString = "";
    for (var k = 0; k < keys.length; k++) {
        var valueString = "";
        var startStringIndex = finalString.indexOf(keys[k]);
        if (startStringIndex !== -1) {
            // +1 skips the separator that follows the label.
            var valueStart = Math.min(startStringIndex + keys[k].length + 1, finalString.length);
            var endStringIndex = findValueEnd(finalString, keys, k + 1, valueStart);
            valueString = finalString.substring(valueStart, endStringIndex);
        }
        finalValueString = finalValueString + "," + valueString;
    }
    console.log(finalValueString.replace(/:/g, "").substring(2, finalValueString.length));
}
/**
 * Recursively collect the text found below a DOM node.
 *
 * A node without a `name` or without `children` is treated as a leaf: its
 * `data` is returned with all spaces and line breaks removed, or `null` when
 * it is a comment, has no data, or nothing remains after stripping.
 *
 * Any other node is descended into; each non-empty string returned by a
 * child is appended to the shared `finalString`, preceded by one space.
 * Nothing is returned for such nodes.
 *
 * @param {Object} htmlElement - node to inspect.
 * @param {*} appendText - breadcrumb of child counts, extended and passed
 *   down on each level; it does not influence the result.
 * @returns {(string|null|undefined)} stripped text for leaves, otherwise nothing.
 */
function searchForTrTags(htmlElement, appendText) {
    var isLeaf = htmlElement.name === undefined || htmlElement.children === undefined;
    if (isLeaf) {
        if (!htmlElement.data || htmlElement.type == "comment") {
            return null;
        }
        var compactText = htmlElement.data.replace(/ /g, '').replace(/(\r\n|\n|\r)/gm, "");
        return compactText != "" ? compactText : null;
    }

    var childNodes = htmlElement.children;
    if (htmlElement.name != "tr" && !(childNodes.length > 0)) {
        return;
    }

    var breadcrumb = appendText + " -> " + childNodes.length;
    for (var index = 0; index < childNodes.length; index += 1) {
        var childText = searchForTrTags(childNodes[index], breadcrumb);
        if (childText) {
            finalString = finalString + " " + childText;
        }
    }
}