-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbrowser.py
executable file
·148 lines (116 loc) · 4.02 KB
/
browser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python3
# Browser subprocess
# https://www.seleniumhq.org/
import sys
import random
import pickle
from time import sleep
import time
from random import randint
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from slugify import slugify
# --- Runtime configuration:
# When True, Firefox is started headless (no visible window); read by webdriver_init().
hide_firefox = False
# When True, a screenshot is taken of every visited result page; read by webdriver_subpage().
save_screenshots = True
# When True, pickled login cookies are loaded before searching; read by search().
use_cookies = False
# --- Functions begin:
# Take a screenshot of the webpage:
def screenshot(driver):
    """Save a screenshot of the page currently loaded in ``driver``.

    The file is written under ``/home/img/`` and named after the slugified
    page title with a ``.png`` extension. The resulting path is echoed to
    stdout with a ``[SCREENSHOT]`` tag.
    """
    target = "".join(("/home/img/", slugify(driver.title), '.png'))
    driver.save_screenshot(target)
    print('[SCREENSHOT] ' + target)
# Create webdriver object and start Firefox/Gecko
def webdriver_init():
    """Create and return a Firefox (Gecko) webdriver instance.

    Firefox runs headless when the module-level ``hide_firefox`` flag is
    true. The ``-headless`` argument is passed explicitly instead of
    assigning ``Options.headless``: that property was deprecated and later
    removed in Selenium 4, whereas ``add_argument`` is available on both
    old and new releases.

    Returns:
        The started ``webdriver.Firefox`` object.
    """
    firefox_options = Options()
    if hide_firefox:
        firefox_options.add_argument("-headless")
    return webdriver.Firefox(options=firefox_options)
# Import cookies previously exported
# Used to retain a session
def load_cookies(driver):
    """Restore a previously exported session by loading pickled cookies.

    Navigates ``driver`` to https://www.google.com, then adds every cookie
    stored in ``private/glogin.pkl`` and logs the URL with a
    ``[LOADCOOKIE]`` tag.

    Args:
        driver: webdriver object exposing ``get`` and ``add_cookie``.

    Raises:
        OSError: if the cookie file cannot be opened.
    """
    cookie_url = "https://www.google.com"
    # WebDriver only accepts cookies for the domain that is currently
    # loaded, so the page has to be opened before the cookies are added.
    driver.get(cookie_url)
    # SECURITY: unpickling runs arbitrary code embedded in the file. Only
    # load a cookie file that was exported locally by a trusted run.
    # The context manager closes the file handle, which the earlier bare
    # open() call never did.
    with open("private/glogin.pkl", "rb") as cookie_file:
        cookies = pickle.load(cookie_file)
    for cookie in cookies:
        driver.add_cookie(cookie)
    print('[LOADCOOKIE] ' + cookie_url)
# Open google search page and type search
def webdriver_rootpage(driver, searchterm):
    """Open the Google start page, type ``searchterm`` and submit it.

    The term is typed one character at a time with short random pauses,
    the DOWN key is pressed one to three times, and RETURN submits the
    query. The search term is logged with a ``[SEARCHTERM]`` tag.

    Args:
        driver: webdriver object used for navigation and element lookup.
        searchterm: text to type into the search field.
    """
    # Local import keeps this function self-contained. The generic
    # find_element(By, value) form exists in Selenium 3 and 4, whereas
    # find_element_by_css_selector was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    rootpage = 'https://www.google.com'
    driver.get(rootpage)
    # Sets the timeout used when looking up elements; it does not pause.
    driver.implicitly_wait(5)
    # Locate the search field and enter the search term
    searchfield = driver.find_element(By.CSS_SELECTOR, '.gLFyf')
    for character in searchterm:
        searchfield.send_keys(character)
        sleep(random.uniform(0.01, 0.3))
    # NOTE(review): the DOWN presses presumably step through the
    # autocomplete suggestions, so the submitted query may differ from
    # ``searchterm`` - confirm this is intended.
    for _ in range(randint(1, 3)):
        searchfield.send_keys(Keys.DOWN)
        sleep(0.3)
    searchfield.send_keys(Keys.RETURN)
    # Output search activity to log:
    print('[SEARCHTERM] ' + searchterm)
    # Wait for results page to load (the sleep does the actual waiting):
    driver.implicitly_wait(5)
    sleep(random.uniform(5, 10))
# Exclude pages that include 'google' in them
def filter_webpages(href):
    """Tell whether a search-result link should be visited.

    Args:
        href: the link target as a string, or ``None`` when the element
            carries no ``href`` attribute.

    Returns:
        ``False`` when ``href`` is missing/empty or contains ``google.com``
        or ``googleusercontent``; ``True`` otherwise.
    """
    # A missing href used to raise TypeError on the ``in`` test below;
    # treat it (and an empty string) as "nothing to visit" instead.
    if not href:
        return False
    excluded_markers = ('google.com', 'googleusercontent')
    return not any(marker in href for marker in excluded_markers)
# Go to subpages and 'explore' the results
def webdriver_subpage(driver, subpages):
    """Visit random search results, logging (and optionally capturing) each.

    Repeats until ``subpages`` results have been attempted: pick a random
    result link that passes ``filter_webpages``, click it, log the page
    title and URL, take a screenshot when ``save_screenshots`` is set, then
    go back to the results page. A failed click also counts as an attempt.

    Args:
        driver: webdriver object currently showing a results page.
        subpages: number of results to attempt; a falsy value means 5.

    Raises:
        IndexError: when the current page offers no result link that
            passes the filter. An empty result list already raised this;
            a list holding only filtered links used to retry forever.
    """
    # Local import keeps this function self-contained. The generic
    # find_elements(By, value) form exists in Selenium 3 and 4, whereas
    # find_elements_by_css_selector was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    if not subpages:
        subpages = 5
    # https://stackoverflow.com/questions/20315000/select-href-with-id-and-class-using-xpath
    result_selector = "div.tF2Cxc:nth-child(2) > div:nth-child(1) > a:nth-child(1)"
    links_clicked = 0
    while links_clicked < subpages:
        links = driver.find_elements(By.CSS_SELECTOR, result_selector)
        # Exclude irrelevant pages up front. Choosing uniformly among the
        # remaining links matches the earlier pick-and-retry behaviour but
        # cannot spin forever when nothing passes the filter.
        candidates = []
        for link in links:
            href = link.get_attribute("href")
            if href and filter_webpages(href):
                candidates.append(link)
        if not candidates:
            raise IndexError('no non-Google search result links to click')
        randomlink = random.choice(candidates)
        # Click a result from the search page:
        try:
            randomlink.click()
            driver.implicitly_wait(5)
            sleep(random.uniform(5, 10))
        # Go back on failed click:
        except Exception:
            # NOTE(review): if the click failed before any navigation took
            # place, going back leaves the results page - confirm intent.
            sleep(random.uniform(5, 10))
            driver.execute_script("window.history.go(-1)")
            links_clicked += 1
            continue
        pagetitle = driver.title
        pageurl = driver.current_url
        print('[PAGE_TITLE] ' + pagetitle)
        print('[NAVIGATION] ' + pageurl)
        # if saving screenshots is enabled, take a picture of current page:
        if save_screenshots:
            screenshot(driver)
        # When done on page, go back and prepare to click another:
        driver.execute_script("window.history.go(-1)")
        sleep(random.uniform(2, 5))
        driver.implicitly_wait(5)
        links_clicked += 1
def search(searchterm):
    """Run one complete browsing session for ``searchterm``.

    Starts Firefox, optionally restores login cookies (``use_cookies``),
    submits the search and then explores up to four result pages. Errors
    while exploring results are reported on stderr and end the session
    without propagating; errors during cookie loading or the search
    itself propagate to the caller.

    Args:
        searchterm: text to search for.
    """
    # Initialize webdriver
    driver = webdriver_init()
    try:
        # Load login cookies from file
        if use_cookies:
            load_cookies(driver)
        webdriver_rootpage(driver, searchterm)
        # Try browsing subpages of the search result:
        try:
            webdriver_subpage(driver, 4)
        except Exception as err:
            # Browsing is best-effort: report on stderr so the tagged
            # stdout log stays unchanged, then fall through to cleanup.
            print('[BROWSE_ERR] ' + repr(err), file=sys.stderr)
    finally:
        # Always shut the browser down. Previously it was closed only when
        # browsing raised, so a successful run left Firefox running.
        driver.quit()