forked from peter-ha/qt-ssl-crawl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: qt-ssl-crawler.cpp
275 lines (241 loc) · 11.9 KB
/
qt-ssl-crawler.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/*************************************************************************************
**
** QtSslCrawl
** Copyright (C) 2012 Peter Hartmann <[email protected]>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public
** License as published by the Free Software Foundation; either
** version 2.1 of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free Software
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
**
*************************************************************************************/
#include "qt-ssl-crawler.h"
#include <QFile>
#include <QUrl>
#include <QDebug>
#include <QNetworkReply>
#include <QSslCertificate>
#include <QSslConfiguration>
#include <QCoreApplication>
#include <QStringList>
#include <QThreadPool>
#include <QTimer>
// Cap on how many replies may be in flight at once (checked in
// checkForSendingMoreRequests()).
// in reality the number of open connections is higher than the value below
int QtSslCrawler::s_concurrentRequests = 100;
// Custom request attribute used by timeout() to count retries
// (absent/0 = first try, 1 = already retried once).
QNetworkRequest::Attribute QtSslCrawler::s_tryCountAttribute =
static_cast<QNetworkRequest::Attribute>(QNetworkRequest::User + 1);
// Builds the crawler and pre-fills the request queue from the Alexa
// top-1m.csv file (format: "rank,domain" per line). Only lines in the
// [from, to] range are queued; from == 0 or to == 0 means "all lines".
QtSslCrawler::QtSslCrawler(QObject *parent, int from, int to) :
    QObject(parent),
    m_manager(new QNetworkAccessManager(this)),
    m_crawlFrom(from),
    m_crawlTo(to)
{
    QFile domainFile(QStringLiteral("top-1m.csv"));
    if (!domainFile.open(QIODevice::ReadOnly)) {
        qFatal("could not open file 'top-1m.csv', download it from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip");
    }
    int lineNumber = 0;
    while (!domainFile.atEnd()) {
        ++lineNumber;
        const QByteArray line = domainFile.readLine();
        const bool crawlEverything = (m_crawlFrom == 0 || m_crawlTo == 0);
        if (!crawlEverything && !(m_crawlFrom <= lineNumber && lineNumber <= m_crawlTo))
            continue; // outside the requested range
        // take everything after the rank's comma and try the https www host
        QByteArray domain = line.right(line.count() - line.indexOf(',') - 1).prepend("https://www.");
        const QUrl url = QUrl::fromEncoded(domain.trimmed());
        QNetworkRequest request(url);
        // setting the attribute to trace the originating URL,
        // because we might try different URLs or get redirects
        request.setAttribute(QNetworkRequest::User, url);
        queueRequestIfNew(request); // all requests should be new here
        if (lineNumber == m_crawlTo)
            break; // no need to crawl the rest of the file
    }
}
// Kicks off the crawl. The queued invocation defers the first batch of
// requests until the event loop is running.
void QtSslCrawler::start() {
QMetaObject::invokeMethod(this, "checkForSendingMoreRequests", Qt::QueuedConnection);
}
void QtSslCrawler::foundUrl(const QUrl &foundUrl, const QUrl &originalUrl) {
QNetworkRequest request(foundUrl);
request.setAttribute(QNetworkRequest::User, originalUrl);
queueRequestIfNew(request);
QMetaObject::invokeMethod(this, "checkForSendingMoreRequests", Qt::QueuedConnection);
}
// Fired by the per-reply watchdog timer (the timer is parented to the
// reply it watches, see sendRequest()) when a request saw neither error
// nor success within 5 minutes. The first timeout re-queues the request
// once; a second timeout gives up on the URL.
void QtSslCrawler::timeout() {
    QNetworkReply *reply = qobject_cast<QNetworkReply*>(sender()->parent());
    if (!reply)
        return; // defensive: the timer should always be parented to a reply
    // copy the request BEFORE finishRequest() closes/aborts the reply and
    // schedules it for deletion — don't touch the reply afterwards
    QNetworkRequest request = reply->request();
    finishRequest(reply);
    int tryCount = request.attribute(QtSslCrawler::s_tryCountAttribute).toInt();
    if (tryCount == 0) {
        qDebug() << "timeout, re-scheduling request for" << request.url();
        tryCount++;
        QNetworkRequest newRequest(request);
        newRequest.setAttribute(QtSslCrawler::s_tryCountAttribute, QVariant(tryCount));
        m_visitedUrls.remove(request.url()); // hack (has just been inserted by finishRequest)
        queueRequestIfNew(newRequest);
    } else {
        qDebug() << "timeout, tried" << request.url() << "twice, giving up.";
    }
    // we called checkForSendingMoreRequests() implicitly with finishRequest()
}
// Drains the pending queue while the number of in-flight requests stays
// below the concurrency cap.
void QtSslCrawler::checkForSendingMoreRequests() {
    while (!m_requestsToSend.isEmpty()
           && m_urlsWaitForFinished.count() < s_concurrentRequests) {
        sendRequest(m_requestsToSend.dequeue());
    }
}
void QtSslCrawler::queueRequestIfNew(const QNetworkRequest &request) {
if (!m_visitedUrls.contains(request.url())
&& !m_urlsWaitForFinished.contains(request.url())
&& !m_requestsToSend.contains(request)) {
m_requestsToSend.enqueue(request);
} else {
qDebug() << "visited" << request.url() << "already or visiting it currently";
}
}
// Issues the actual GET for a queued request and wires the reply up:
// a single-shot 5-minute watchdog timer plus the metaDataChanged /
// error / finished slots. SSL errors are deliberately ignored — the
// crawler only wants to see the peer certificate chain.
void QtSslCrawler::sendRequest(const QNetworkRequest &request) {
qDebug() << "sending request for" << request.url();
QNetworkRequest newRequest(request);
// do not keep connections open, we will not issue
// more than one request to the same host
newRequest.setRawHeader("Connection", "close");
QNetworkReply *reply = m_manager->get(newRequest);
// if there is neither error nor success after 5 minutes,
// try again one more time and then skip the URL.
// (The timer will be destroyed if the reply finished after 5 minutes)
// parenting the timer to the reply lets timeout() recover the reply via
// sender()->parent(), and destroys the timer together with the reply
QTimer *timer = new QTimer(reply);
connect(timer, SIGNAL(timeout()), this, SLOT(timeout()));
timer->setSingleShot(true);
timer->start(300000); // 5 minutes
reply->ignoreSslErrors(); // we don't care, we just want the certificate
connect(reply, SIGNAL(metaDataChanged()), this, SLOT(replyMetaDataChanged()));
connect(reply, SIGNAL(error(QNetworkReply::NetworkError)),
this, SLOT(replyError(QNetworkReply::NetworkError)));
connect(reply, SIGNAL(finished()), this, SLOT(replyFinished()));
// mark the URL as in flight so it is not queued again
m_urlsWaitForFinished.insert(request.url());
}
// Tears down a reply that is done (success, error, or timeout):
// disconnects our slots, aborts the transfer, updates the bookkeeping
// sets, and re-triggers the dispatcher. Emits crawlFinished() once
// nothing is queued or in flight any more.
void QtSslCrawler::finishRequest(QNetworkReply *reply) {
// detach our slots first so close()/abort() below cannot re-enter them
reply->disconnect(SIGNAL(metaDataChanged()));
reply->disconnect(SIGNAL(error(QNetworkReply::NetworkError)));
reply->disconnect(SIGNAL(finished()));
reply->close();
reply->abort();
// deferred deletion: the reply object stays valid for the rest of this
// event-loop iteration (timeout() relies on that)
reply->deleteLater();
m_visitedUrls.insert(reply->request().url());
m_urlsWaitForFinished.remove(reply->request().url());
qDebug() << "finishRequest pending requests:" << m_requestsToSend.count() + m_urlsWaitForFinished.count();
QMetaObject::invokeMethod(this, "checkForSendingMoreRequests", Qt::QueuedConnection);
if (m_urlsWaitForFinished.count() + m_requestsToSend.count() == 0) {
emit crawlFinished();
}
}
// Called as soon as response headers are available. For https replies
// this is where the certificate chain is harvested (crawlResult is
// emitted); for http replies only 3xx redirects are followed, in the
// hope of being redirected to an SSL site.
void QtSslCrawler::replyMetaDataChanged() {
QNetworkReply *reply = qobject_cast<QNetworkReply*>(sender());
QUrl currentUrl = reply->url();
// the User attribute carries the URL this crawl originally started from
QUrl originalUrl = reply->request().attribute(QNetworkRequest::User).toUrl();
qDebug() << "replyMetaDataChanged" << currentUrl << "original url:" << originalUrl;
if (reply->error() == QNetworkReply::NoError) {
if (currentUrl.scheme() == QLatin1String("https")) {
// success, https://[domain] exists and serves meaningful content
QList<QSslCertificate> chain = reply->sslConfiguration().peerCertificateChain();
if (!chain.empty()) {
// chain.last() is the root-most certificate of the presented chain
QStringList organizations = chain.last().issuerInfo(QSslCertificate::Organization);
emit crawlResult(originalUrl, currentUrl, chain);
qDebug() << "found ssl cert at" << currentUrl
<< "organizations:" << organizations << ", coming from" << originalUrl;
} else {
// never saw that happen
qWarning() << "weird: no errors but certificate chain is empty for " << reply->url();
}
} else if (currentUrl.scheme() == QLatin1String("http")) {
// check for redirections, we might end up at an SSL site
int statusCode = reply->attribute(QNetworkRequest::HttpStatusCodeAttribute).toInt();
if (statusCode >= 300 && statusCode < 400) {
QByteArray locationHeader = reply->header(QNetworkRequest::LocationHeader).toByteArray();
if (locationHeader.isEmpty()) // this seems to be a bug in QtNetwork
locationHeader = reply->rawHeader("Location");
QUrl newUrl = QUrl::fromEncoded(locationHeader);
if (!newUrl.isEmpty()) {
qDebug() << "found redirect header at" << currentUrl << "to" << newUrl;
// follow the redirect, keeping the original URL as the trace tag
QNetworkRequest request(newUrl);
request.setAttribute(QNetworkRequest::User, originalUrl);
queueRequestIfNew(request);
QMetaObject::invokeMethod(this, "checkForSendingMoreRequests", Qt::QueuedConnection);
}
} else {
qDebug() << "meta data changed for" << currentUrl << "do nothing I guess, wait for finished";
}
} else {
// never saw that happen
qWarning() << "scheme for" << currentUrl << "is neither https nor http";
}
} else { // there was an error
// does not happen
qDebug() << "error with" << currentUrl << reply->errorString();
}
}
// Error slot: when the blind https://[domain] probe fails, fall back to
// fetching http://[domain] and (later, in replyFinished) grep its HTML
// for https:// links.
// Fix: the debug line used to print newUrl twice; the second value is
// now the actual original URL.
void QtSslCrawler::replyError(QNetworkReply::NetworkError error) {
    QNetworkReply *reply = qobject_cast<QNetworkReply*>(sender());
    QUrl currentUrl = reply->url();
    QUrl originalUrl = reply->request().attribute(QNetworkRequest::User).toUrl();
    qDebug() << "replyError" << error << currentUrl << reply->errorString() << "original url:" << originalUrl;
    // 2nd try: if https://[domain] does not work, fetch
    // http://[domain] and parse the HTML for https:// URLs
    // ### check which error we got
    // our blind check for https://[domain] was not succesful, try http://[domain] now
    if (originalUrl.host() == currentUrl.host() && currentUrl.scheme() == QLatin1String("https")) {
        QUrl newUrl = currentUrl;
        newUrl.setScheme(QStringLiteral("http"));
        QNetworkRequest newRequest(newUrl); // ### probably we can just copy it
        // NOTE(review): this tags the retry with the http fallback URL, so it
        // becomes the "original" for later processing; tagging with originalUrl
        // may have been intended — confirm before changing (kept as-is).
        newRequest.setAttribute(QNetworkRequest::User, newUrl);
        qDebug() << "queueing new request" << newUrl << "original url:" << originalUrl;
        queueRequestIfNew(newRequest);
    } else {
        qWarning() << "could not fetch" << currentUrl << "original url:" << originalUrl; // ### try again?
    }
    finishRequest(reply);
}
void QtSslCrawler::replyFinished() {
QNetworkReply *reply = qobject_cast<QNetworkReply*>(sender());
QUrl currentUrl = reply->url();
QUrl originalUrl = reply->request().attribute(QNetworkRequest::User).toUrl();
if (reply->error() == QNetworkReply::NoError) {
qDebug() << "reply finished:" << currentUrl << "original url:" << originalUrl << ", now grep for urls";
QByteArray replyData = reply->readAll();
// now start the job to find URLs in a new thread
UrlFinderRunnable *runnable = new UrlFinderRunnable(replyData, originalUrl, currentUrl);
connect(runnable, SIGNAL(foundUrl(QUrl,QUrl)), this, SLOT(foundUrl(QUrl,QUrl)), Qt::QueuedConnection);
QThreadPool::globalInstance()->start(runnable);
} else {
qWarning() << "got error while parsing" << currentUrl << "for" << originalUrl << reply->errorString();
}
finishRequest(reply);
}
// Worker-thread job that scans a page body for https:// links.
// Fix: the third parameter was garbled to "¤tUrl" (HTML-entity
// mojibake — "&curre" of "&currentUrl" became "&curren;" = ¤); the
// original "&currentUrl" is restored so the file compiles again.
UrlFinderRunnable::UrlFinderRunnable(const QByteArray &data, const QUrl &originalUrl, const QUrl &currentUrl) :
    QObject(), QRunnable(), m_data(data), m_originalUrl(originalUrl), m_currentUrl(currentUrl),
    // case-insensitive match for https:// URLs; host charset kept deliberately narrow
    m_regExp("(https://[a-z0-9.@:]+)", Qt::CaseInsensitive) {
}
void UrlFinderRunnable::run() {
int pos = 0;
while ((pos = m_regExp.indexIn(m_data, pos)) != -1) {
QUrl newUrl(m_regExp.cap(1));
if (newUrl.isValid()
&& newUrl.host().contains('.') // filter out 'https://ssl'
&& newUrl.host() != QLatin1String("ssl.")
&& newUrl.host() != m_originalUrl.host()
&& newUrl != m_currentUrl) { // prevent endless loops
qDebug() << "runnable: found valid url" << newUrl << "at original url" << m_originalUrl;
emit foundUrl(newUrl, m_originalUrl);
}
pos += m_regExp.matchedLength();
}
}