-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSpider.cpp
104 lines (92 loc) · 1.97 KB
/
Spider.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include "Spider.h"
// Builds a spider for one batch of URLs.
//   master_url_list - URLs this spider will crawl; used to initialise url_list.
//   url_pool        - pointer later handed to Parser::return_data (see parse_stream).
//   data_pool       - pointer later handed to Parser::return_data (see parse_stream).
//   gather_urls     - flag forwarded to every Parser this spider creates.
// The constructor only stores its arguments; no network activity happens here.
Spider::Spider(const string_deque& master_url_list, string_list* url_pool, string_set* data_pool, bool gather_urls)
	: url_list(master_url_list)
	, url_pool(url_pool)
	, data_pool(data_pool)
	, gather_urls(gather_urls)
{
}
// Default constructor, used among others as the implicit base constructor of
// Robot_Spider (whose initializer list names no Spider constructor).
// The pool pointers and the gather flag are given explicit values so that a
// default-constructed Spider never carries indeterminate members; url_list is
// default-constructed.
Spider::Spider()
	: url_pool(nullptr)
	, data_pool(nullptr)
	, gather_urls(false)
{
}
// Processes url_list from the back until it is empty. For each URL:
//   1. AmIStuck is set to the current high_resolution_clock time,
//   2. CurrentUrl is set to the URL being processed,
//   3. the URL is removed from url_list,
//   4. open_connection() is called and, when it reports data in `stream`,
//      the stream is handed to parse_stream(),
//   5. `stream` is emptied and its state flags are reset for the next URL.
void Spider::crawl_list()
{
	// NOTE(review): this mutex lives on the stack of this call, so no other
	// thread can ever lock it and it does not protect CurrentUrl from
	// concurrent readers. If CurrentUrl is read from another thread, the
	// mutex needs to be a member shared with that reader - confirm in Spider.h.
	std::mutex mutex;
	while (!url_list.empty())
	{
		AmIStuck = std::chrono::high_resolution_clock::now();
		auto target_url = url_list.back();
		{
			std::lock_guard<std::mutex> guard(mutex);
			CurrentUrl = target_url;
		}
		url_list.pop_back();
		if (open_connection(target_url))
		{
			if (Manager::instance().Config->debug)
			{
				Logger::log << "Fetched stream from: " << target_url;
			}
			parse_stream(target_url);
		}
		// Clear the stream. str("") only replaces the buffer contents; the
		// eof/fail bits left behind by a parse that read to the end would
		// otherwise make later reads and writes on this stream no-ops.
		stream.str("");
		stream.clear();
	}
}
// Constructs a Connection for target_url bound to the member `stream` and
// reports whether `stream` has characters available afterwards.
// Presumably the Connection constructor performs the fetch and writes the
// response into `stream` - confirm in Connection. The Connection object is
// destroyed when this function returns.
// Returns true when the stream buffer reports a non-zero in_avail().
bool Spider::open_connection(std::string target_url)
{
	Connection socket(target_url, stream);
	return stream.rdbuf()->in_avail() != 0; // non-zero means the stream is not empty
}
// Runs a Parser over the content currently held in `stream`.
// The parser is created with the host part of target_url (as returned by
// utility_tools::get_host) and this spider's gather_urls flag. If the parser
// fails to load the stream nothing else happens; otherwise the tree is
// traversed and the results are handed over via return_data(url_pool, data_pool).
void Spider::parse_stream(std::string target_url)
{
	Parser parser(utility_tools::get_host(target_url), gather_urls);
	if (!parser.load_stream(stream))
	{
		return; // nothing usable was loaded
	}
	parser.traverse_tree();
	parser.return_data(url_pool, data_pool);
}
// Fetches "<host>/robots.txt" and, when the fetch yields data, parses it and
// merges the resulting exclusion_set into (*master_exclusion_list)[host].
//   host                  - prefix to which "/robots.txt" is appended; also
//                           the key used in the master list.
//   master_exclusion_list - map that receives the exclusions; it is
//                           dereferenced without a null check, so it must be
//                           non-null whenever the fetch succeeds.
// When the fetch yields no data the map is left untouched (no entry is created).
Robot_Spider::Robot_Spider(std::string host, std::map<std::string, string_set>* master_exclusion_list)
	: host(host)
{
	const std::string robots_url = host + "/robots.txt";
	if (!Spider::open_connection(robots_url))
	{
		return;
	}
	parse_stream();
	auto& host_exclusions = (*master_exclusion_list)[host];
	host_exclusions.insert(exclusion_set.begin(), exclusion_set.end());
}
// Reads `stream` line by line and collects disallowed paths into exclusion_set.
//
// A line equal to `user_agent` opens a group. While a group is open:
//   - a line beginning with `disallow` contributes the remainder of the line
//     (everything after the `disallow` prefix) to exclusion_set,
//   - any other non-empty line is skipped and the group stays open,
//   - an empty line closes the group.
// Lines outside a group are ignored.
void Robot_Spider::parse_stream()
{
	std::string line;
	bool get_paths = false;
	while (std::getline(stream, line))
	{
		// std::getline strips only '\n'. Drop the '\r' left over from CRLF line
		// endings so the comparisons below match and stored paths stay clean.
		if (!line.empty() && line.back() == '\r')
		{
			line.pop_back();
		}

		if (line == user_agent)
		{
			get_paths = true;
		}
		else if (!get_paths || line.empty())
		{
			// Either we are outside a matching group or a blank line ended it.
			get_paths = false;
		}
		else if (line.compare(0, disallow.length(), disallow) == 0)
		{
			// Prefix match without building a temporary substring.
			exclusion_set.insert(line.substr(disallow.length()));
		}
	}
}