import handling, connections, executor, modparser
import PySimpleGUI as sg
from threading import Thread
from queue import Queue, Empty
# TODO (project-wide):
# Add some form of .txt logging for ease of debugging once the meat and potatoes are done.
# There's a lot of different structures being passed around: a mix of lists,
# dicts, and tuples. See if you can consolidate all of those into tuples.
# You know where you are? You're in the jungle, baby! You're gonna die!
# @Todo This is incomprehensible spaghetti, add some comments PLEASE GOD -past me
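# A possible direction for the structure-unification TODO above (a sketch only,
# not wired into anything below): a single NamedTuple would give every hand-off
# between modules one shape with named fields instead of anonymous tuples/dicts.
# The field names here are illustrative assumptions, not the project's real schema.
from typing import NamedTuple

class ScrapedPage(NamedTuple):
    url: str      # the page that was fetched (assumed field)
    status: int   # HTTP status code (assumed field)
    body: str     # raw response text (assumed field)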
# This is painful, but here we are. This function contains everything required
# to create the GUI and wires together the modules imported above. It lives in
# a function so the program can be multithreaded, which is the only way to get
# print output to show up inside the GUI. Threading also keeps the window from
# going "Not Responding" while a blocking activity, like scraping, is underway.
def gui():
    # Variables and what they do. @Todo settle on one naming convention (camelCase vs snake_case).
# Theme
sg.theme('DarkAmber')
# Reference to the active executor since it needs to be used multiple times.
activeExecutor = None
# URL input list.
inputList = None
# The queue that the executor sessions use to send data back to the GUI thread
guiqueue = Queue()
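    # Note on threading (assumed from how the queue is used below): the executor
    # sessions are expected to call guiqueue.put(...) from their worker threads;
    # queue.Queue is thread-safe, so the GUI thread can drain it later without
    # any extra locking.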
# Yarr, page layouts be here
# @Todo reformat these so they're easier to look through
# Page to construct the URL list.
inputlayout = [
[sg.T('Use this tab to create or input your URL list.')],
[
sg.Frame(element_justification='left', title='Input List Creation',
layout=[
[sg.InputText(size=(35, 1), key='csv'), sg.FileBrowse('Import CSV', size=(6, 2)),
sg.T('Column'),
sg.Spin([i for i in range(101)], initial_value=0, text_color='black',
background_color='white', key='inputrow',
tooltip='Which column the pertinent data is in')],
[sg.Radio('Extension from CSV', 'input', True, key='extension'),
sg.T('Base URL'),
sg.InputText('https://www.google.com/', key='baseurl',
size=(25, 1),
tooltip='The base URL to append the extensions in your csv to (with /)')
],
                    [sg.Radio('Full URLs from CSV', 'input', False, size=(37, 2), key='full'),
                     sg.Button('Stage URLs', key='stageurl', size=(9, 2),
                               tooltip='Generates the list of URLs to scrape.')],
[sg.Frame(element_justification='center',
title='Staged URLs', layout=[
[sg.Multiline(key='urloutput', do_not_clear=False, size=(54, 13))]]
)]
]),
]
]
# Page to construct the executor class (# of threads, universal delay, timeout etc)
connectionlayout = [
[sg.T('Initialize and manage your connections here.')],
[sg.Frame(element_justification='left',
title='Proxy Configuration', layout=[
[sg.T('Select the number of sessions/threads to create.', size=(20, 2))],
[sg.Spin([i for i in range(1,17)], initial_value=1, key='number',
text_color='black', background_color='white')],
[sg.T('Universal Timeout'),
sg.T('Universal Delay')],
[sg.Spin([i for i in range(121)], initial_value=5, text_color='black',
background_color='white', key='timeout'), sg.Text('Seconds'),
sg.Spin([i for i in range(121)], initial_value=5, text_color='black',
background_color='white', key='delay'), sg.Text('Seconds')],
                   [sg.Button('Stage Executor', key='executor',
                              tooltip='Generates the proxies and browser headers '
                                      'that will be used for connections')]
]),
sg.Frame(element_justification='right',
title='Staged Proxies', layout=[
[sg.Multiline(key='proxyoutput', do_not_clear=False, size=(23, 9))]]
)
],
[sg.Frame(element_justification='center',
title='Staged Headers', layout=[
[sg.Multiline(key='headeroutput', do_not_clear=False, size=(56, 11))]])]
]
# Where to execute the scraping based on the constructed executor class & list of URLs.
# Also has the ability to import pickled list data
scrapelayout = [
[sg.T('Scrape the data and export as a pickled python object')],
[sg.Frame(element_justification='right',
title='Scraping Progress', layout=[
[sg.Button('Scrape', key='scrape',
tooltip='Begins scraping!'),
sg.Button('Extract', key='extract',
tooltip='Extract and reformat the raw scraped data.')
],
                  [sg.Output(key='scrapeout', size=(55, 18))],
                  # Import row: the pickleimport handler below reads 'picklein',
                  # so the layout needs these widgets for it to ever fire.
                  [sg.InputText(size=(24, 1), key='picklein'),
                   sg.FileBrowse('Browse', target='picklein'),
                   sg.Button('Import', key='pickleimport')],
                  [sg.InputText(size=(24, 1), key='pickleout'),
                   sg.InputText(size=(15, 1), key='picklename'),
                   sg.FolderBrowse('Browse', target='pickleout'),
                   sg.Button('Export', key='pickleexport')]]
)
]
]
# Aggregate all above layouts into one master page.
masterlayout = [
[sg.Button('Exit', tooltip='Bye! :)')],
[sg.TabGroup([[sg.Tab('Input', inputlayout), sg.Tab('Connections', connectionlayout),
sg.Tab('Scraping', scrapelayout)]])],
]
# End of page layouts
# Initialize window
# @Todo toggleable grab_anywhere
window = sg.Window('Scraper GUI', masterlayout,
no_titlebar=True, grab_anywhere=True, keep_on_top=True)
# Event loop
while True:
event, values = window.read()
        # When the Stage URLs button is hit, build the URL list from the CSV path,
        # base URL, and column number entered on the GUI.
        if event == 'stageurl':
            if len(values['csv']) == 0:
                window['urloutput'].print('You need to import a CSV first.')
            elif values['extension'] and len(values['baseurl']) == 0:
                window['urloutput'].print('You need to fill in the base URL first.')
            else:
                try:
                    if values['extension']:
                        inputList = handling.formatcsv(values['baseurl'], values['csv'],
                                                       values['inputrow'])
                    else:
                        inputList = handling.rawcsv(values['csv'], values['inputrow'])
                    for url in inputList:
                        window['urloutput'].print(url)
                except FileNotFoundError:
                    window['urloutput'].print('File not found.')
        # The block above generates the list of URLs to iterate through, either by
        # pulling full URLs directly from a CSV or by appending page extensions
        # listed in a CSV onto a base URL.
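        # For reference, handling.formatcsv is assumed to do roughly this
        # (a sketch under that assumption, not the real implementation;
        # requires `import csv`):
        #     with open(csvpath, newline='') as f:
        #         return [baseurl + row[col] for row in csv.reader(f)]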
        # When the Stage Executor button is hit, build the executor from the inputs on this tab.
if event == 'executor':
activeExecutor = executor.Executor(int(values['number']),
values['timeout'], values['delay'])
for session in activeExecutor.sessions:
proxyinfo = list(session.proxies.items())[0]
agentinfo = list(session.headers.values())[0]
                # proxyinfo is a (scheme, address) pair from the session's proxies dict.
                window['proxyoutput'].print('Proxy ({}): {}\n'.format(proxyinfo[0],
                                                                      proxyinfo[1]))
                window['headeroutput'].print('User-Agent: {}\n'.format(agentinfo))
        # The block above generates the sessions (see executor.py) and prints out
        # that information so it's nice and purdy.
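        # Assumed shape of executor.Executor, inferred from the attribute access
        # above (a sketch, not the real class): it creates `number` requests.Session
        # objects, each given a proxies dict like {'http': 'ip:port'} and a
        # User-Agent header, plus a run(urls, queue) method used below.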
        # This one is a doozy. When scrape is hit, create a daemon thread to run
        # the executor and begin scraping. This keeps the GUI window responsive
        # and lets print output from the worker show up in the Output element.
        if event == 'scrape':
            if activeExecutor is None or inputList is None:
                print('Stage your URLs and your executor before scraping.')
            else:
                thread = Thread(target=activeExecutor.run, args=(inputList, guiqueue),
                                daemon=True)
                thread.start()
        # The executor puts all of its output into guiqueue, where the Extract
        # and Export handlers below pick it up.
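        # If live progress updates in the GUI were wanted instead, a typical
        # PySimpleGUI pattern (a sketch only, not wired in here) is to poll the
        # queue from the event loop using a read timeout:
        #     event, values = window.read(timeout=100)
        #     while not guiqueue.empty():
        #         print(guiqueue.get_nowait())  # lands in the sg.Output element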
        # When Extract is hit, reformat the raw scraped data sitting in guiqueue.
        if event == 'extract':
            parselist = handling.extract(guiqueue)
        # When the Import (pickleimport) button is hit.
if event == 'pickleimport':
try:
parselist = handling.unpickle(values['picklein'])
print('Previously scraped data imported from {}, total'
' number of pages imported is {}.'.format(values['picklein'], len(parselist)))
except FileNotFoundError:
print('No path was entered or the file could not be found.')
        # The block above unpickles the file at that path via the handling module,
        # turning it into a list, and sets the internal parse list to that data.
# When the pickleexport button is hit.
        if event == 'pickleexport':
            try:
                outputloc = values['pickleout'] + '/' + values['picklename']
                handling.topickle(outputloc, guiqueue)
                print('Saved pickled list of parsable information to {}.'.format(outputloc))
            except PermissionError:
                print('Operation failed due to a permission error. This could mean'
                      ' that you do not have access to the file you are attempting to'
                      ' export into, or that you failed to browse for a location entirely.')
        # The block above joins the folder and file-name fields with a '/' and uses
        # handling.topickle to drain guiqueue into a list and pickle it for later use.
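        # For reference, the handling pickle helpers are assumed to be thin
        # wrappers over the pickle module, roughly like this (sketch only):
        #     def topickle(path, q):
        #         items = []
        #         while not q.empty():
        #             items.append(q.get_nowait())
        #         with open(path, 'wb') as f:
        #             pickle.dump(items, f)
        #     def unpickle(path):
        #         with open(path, 'rb') as f:
        #             return pickle.load(f)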
# End program when exit is hit.
if event in (None, 'Exit'):
break
# To ensure it all returns to nothing (and comes tumbling down, tumbling down, tumbling down)
window.close()
del window
if __name__ == '__main__':
gui()