     USER_AGENTS,
 )
 
-from utils.web_scraper import parse_google_search_results, render_js_and_get_text
+from scraping.web_scraper import parse_google_search_results, render_js_and_get_text
 
-from utils.proxies_manager import prepare_proxies, round_robin_proxies
-from utils.request_manager import param_converter, start_request
-from utils.results_manager import get_processed_dorks, safe_add_result
+from vpn_proxies.proxies_manager import prepare_proxies, round_robin_proxies
+from requester.request_manager import param_converter, start_request
+from reporting.results_manager import get_processed_dorks, safe_add_result
 
 dork_id_lock = threading.Lock()
 
@@ -32,6 +32,7 @@ def google_search_with_proxy(
     category,
     config,
     domain,
+    processed_dorks,
     retries=1,
     advanced=False,
     dork_id=0,
@@ -46,7 +47,7 @@ def google_search_with_proxy(
 
     params = prepare_params(config)
 
-    dork_id = perform_searches(
+    return perform_searches(
         full_query,
         proxies,
         category,
@@ -55,11 +56,10 @@ def google_search_with_proxy(
         config,
         advanced,
         dork_id,
+        processed_dorks,
         use_session=not (proxy == None),
     )
 
-    return dork_id
-
 
 def prepare_params(config):
     return {
@@ -79,6 +79,7 @@ def perform_searches(
     config,
     advanced,
     dork_id,
+    processed_dorks,
     use_session,
 ):
 
@@ -92,6 +93,7 @@ def perform_searches(
             config,
             advanced,
             dork_id,
+            processed_dorks,
             use_session=use_session,
         )
 
@@ -107,10 +109,30 @@ def execute_search_with_retries(
     config,
     advanced,
     dork_id,
+    processed_dorks,
     use_session=False,
 ):
     base_url = "https://www.google.com/search"
-    headers = {"User-Agent": random.choice(USER_AGENTS)}
+    headers = {
+        "User-Agent": random.choice(USER_AGENTS),
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip,deflate",
+        "Connection": "close",
+        "DNT": "1",
+        "accept-language": "en-US,en;q=0.9",
+        "cache-control": "max-age=0",
+        "Upgrade-Insecure-Requests": "1",
+    }
+
+    if query in processed_dorks:
+        cprint(
+            f"Skipping already processed dork: {query}",
+            "yellow",
+            file=sys.stderr,
+        )
+        return dork_id
+
     for retry_no in range(retries):
         if use_session:
             cprint(
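The hunk above does two things: it widens the request headers to a more browser-like set, and it short-circuits any dork already recorded by a previous run before a single request is sent. A minimal sketch of that skip pattern, with hypothetical names (`run_dork` stands in for the real per-dork search, the `processed_dorks` argument plays the role of the list returned by `get_processed_dorks`):

```python
# Sketch only, hypothetical names: skip dorks that a previous run already handled,
# mirroring the new `if query in processed_dorks` check in execute_search_with_retries.
def run_pending_dorks(queries, processed_dorks, run_dork):
    seen = set(processed_dorks)  # O(1) membership tests even for large result files
    results = []
    for query in queries:
        if query in seen:
            continue  # already recorded, no network round-trip needed
        results.append(run_dork(query))
        seen.add(query)
    return results
```

Converting the list to a set is optional; the in-function check works on the raw list too, it just scales worse as the results file grows.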
@@ -127,7 +149,14 @@ def execute_search_with_retries(
                 headers=headers,
                 params=params,
                 is_json=False,
-                secured=True if "socks" in proxies["https"] else False,
+                secured=(
+                    True
+                    if proxies
+                    and "https" in proxies
+                    and proxies["https"]
+                    and "socks" in proxies["https"]
+                    else False
+                ),
                 session=session,
                 cookies={
                     "CONSENT": "PENDING+987",
@@ -148,15 +177,24 @@ def execute_search_with_retries(
                 headers=headers,
                 params=params,
                 is_json=False,
-                secured=True if "socks" in proxies["https"] else False,
+                secured=(
+                    True
+                    if proxies
+                    and "https" in proxies
+                    and proxies["https"]
+                    and "socks" in proxies["https"]
+                    else False
+                ),
                 cookies={
                     "CONSENT": "PENDING+987",
                     "SOCS": "CAESHAgBEhJnd3NfMjAyMzA4MTAtMF9SQzIaAmRlIAEaBgiAo_CmBg",
                 },
             )
+
+        urls = []
         if response:
             urls = parse_google_search_results(proxies, advanced, query, response.text)
-        if not urls or len(urls) == 0:
+        if (not urls or len(urls) == 0) and config["use_selenium"]:
             cprint(
                 f"Parsing for google search failed for {query} - retrying with selenium...",
                 "red",
@@ -168,10 +206,10 @@ def execute_search_with_retries(
             urls = parse_google_search_results(
                 proxies, advanced, query, html_content
             )
-            result = dork_id, category, urls, query
-            safe_add_result(result, config)
-            with dork_id_lock:
-                dork_id += 1
+        result = dork_id, category, urls, query
+        safe_add_result(result, config)
+        # with dork_id_lock:
+        #     dork_id += 1
 
     # TODO to be faster also record non functionnal dork
     return dork_id
@@ -282,12 +320,6 @@ def load_google_dorks_and_search(config, categories):
         file=sys.stderr,
     )
     processed_dorks = get_processed_dorks(config)
-    search_tasks = filter_search_tasks(search_tasks, processed_dorks)
-    cprint(
-        f"Number of dorks to process: {sum([len(search_tasks[task]) for task in search_tasks])}",
-        "yellow",
-        file=sys.stderr,
-    )
 
     if not search_tasks:
         cprint(f"No dorks to process.", "red", file=sys.stderr)
@@ -328,6 +360,7 @@ def load_google_dorks_and_search(config, categories):
                 task["category"],
                 config,
                 task["domain"],
+                processed_dorks,
             ): task
             for task in search_tasks_with_proxy
         }
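With the up-front `filter_search_tasks` call removed in the earlier hunk, deduplication now relies entirely on the `processed_dorks` list that this final hunk threads into every submitted worker. A compressed sketch of that flow, assuming a `ThreadPoolExecutor` as in the surrounding code; `load_seen` and `worker` are hypothetical stand-ins for `get_processed_dorks` and `google_search_with_proxy`:

```python
# Sketch only, hypothetical names: read the processed-dork list once, then hand the
# same list to every worker so each can skip queries it has already seen.
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_searches(tasks, config, load_seen, worker):
    processed_dorks = load_seen(config)
    with ThreadPoolExecutor() as pool:
        futures = {pool.submit(worker, task, config, processed_dorks): task
                   for task in tasks}
        return [future.result() for future in as_completed(futures)]
```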