-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
91 lines (78 loc) · 3.08 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from selenium import webdriver
from PIL import Image, ImageOps
from io import BytesIO
import glob
import os
import numpy as np
def setup_web():
    """Open the local HTML page in the browser and normalise its styling.

    Uses the module-level ``driver``. The window is sized to 800x600, the
    first entry of the ``html`` directory whose name ends with ``html`` is
    loaded through a ``file:///`` URL, and a script is injected that:

    * walks the rules of the first stylesheet (starting at index 1) and,
      for every rule with a non-empty ``lineHeight``, sets it to ``'2'``;
      for every rule with a non-empty ``textIndent``, clears it;
    * resets the inline ``display`` style of every ``span`` element.

    Raises:
        IndexError: if the ``html`` directory holds no matching file.
    """
    driver.set_window_size(800, 600)

    working_dir = os.getcwd()
    html_names = [name for name in os.listdir('html') if name.endswith("html")]
    page_path = working_dir + "//html//" + html_names[0]
    driver.get("file:///" + page_path)

    # NOTE(review): the rule loop starts at index 1, so the first stylesheet
    # rule is never modified -- confirm this is intentional.
    restyle_script = '''var rules = document.styleSheets[0].cssRules;
for(var i=1; i < rules .length; i++){
if(rules[i].style.lineHeight != '')
rules[i].style.lineHeight = '2';
if(rules[i].style.textIndent != '')
rules[i].style.textIndent = '';
}
var hiddenElements = document.querySelectorAll('span');
var i = hiddenElements.length;
while(i--) {
hiddenElements[i].style.display = '';
}
'''
    driver.execute_script(restyle_script)
def remove_img():
    """Delete every file matching ``img/*.png`` relative to the working directory."""
    for png_path in glob.glob("img/*.png"):
        os.remove(png_path)
def img_file_name(a):
    """Return the PNG path under ``img/`` for *a*.

    *a* is converted with ``str`` and left-padded with zeros to at least
    two characters, e.g. ``3`` -> ``'img/03.png'``.
    """
    padded = str(a).zfill(2)
    return 'img/' + padded + '.png'
def get_crop_list(elements=None):
    """Compute the vertical crop range of each question on the page.

    Each element contributes a ``[top, bottom]`` pair taken from its
    ``location['y']`` and ``size['height']``. An element whose first
    ``span`` child has a non-empty ``style`` attribute is merged into the
    previous range instead of starting a new one.
    NOTE(review): presumably a styled first span marks a continuation of
    the preceding question -- confirm against the source documents.

    Args:
        elements: iterable of WebElement-like objects exposing ``size``,
            ``location`` and ``find_elements_by_tag_name``. When omitted,
            the module-level ``questions`` list is used, which keeps the
            original zero-argument call working.

    Returns:
        A list of ``[top, bottom]`` lists in page pixel coordinates, in
        document order.
    """
    if elements is None:
        elements = questions

    crop_list = []
    for question in elements:
        size = question.size
        location = question.location
        print(location)

        top = location['y']
        bottom = top + size['height']

        # An element without any span cannot be flagged as a continuation;
        # indexing [0] unconditionally used to raise IndexError here.
        spans = question.find_elements_by_tag_name("span")
        is_continuation = bool(spans) and spans[0].get_attribute("style") != ''

        # Only merge when there is a previous range to extend; a flagged
        # first element simply opens the first range.
        if is_continuation and crop_list:
            # Extend to this element's real bottom edge so that any gap
            # (margin) between the two elements does not truncate the crop.
            crop_list[-1][1] = max(crop_list[-1][1], bottom)
            print('merge')
        else:
            crop_list.append([top, bottom])
        print(crop_list[-1])
    return crop_list
def crop_margin(img):
    """Return *img* cropped to the bounding box of its non-white content.

    The bounding box is computed on an inverted RGB copy, where pure white
    becomes zero and therefore falls outside ``getbbox()``; the crop itself
    is taken from the original image.
    """
    inverted = ImageOps.invert(img.convert('RGB'))
    content_box = inverted.getbbox()
    return img.crop(content_box)
def crop_save(crop_list, min_black=500):
    """Crop each range out of the base screenshot and save the useful ones.

    For every ``[top, bottom]`` entry the module-level ``img_base`` is
    cropped horizontally to the body (``body_left`` / ``body_width``) and
    vertically to the entry, then trimmed with ``crop_margin``. The crop is
    written to ``img_file_name(index)`` only when it holds more than
    *min_black* zero-valued array entries; otherwise it is dropped and the
    index is skipped, so saved file numbers may have gaps.

    Args:
        crop_list: sequence of ``[top, bottom]`` pixel ranges.
        min_black: minimum count (exclusive) of zero-valued entries in the
            image array for a crop to be kept. Defaults to 500, the
            previously hard-coded threshold.
    """
    for index, box in enumerate(crop_list):
        region = img_base.crop((body_left, box[0], body_left + body_width, box[1]))
        region = crop_margin(region)

        # Counts array entries equal to 0 across the whole array.
        # NOTE(review): for multi-channel images this counts per channel
        # sample, not per pixel -- confirm the threshold was tuned for that.
        count_black = np.count_nonzero(np.array(region) == 0)
        print(count_black)

        if count_black > min_black:
            out_path = img_file_name(index)
            out_dir = os.path.dirname(out_path)
            if out_dir:
                # Saving into a missing directory raises FileNotFoundError,
                # so make sure the output directory exists first.
                os.makedirs(out_dir, exist_ok=True)
            region.save(out_path)
        else:
            print('drop', index)
if __name__ == '__main__':
    # The names assigned below (driver, body_left, body_width, questions,
    # img_base) are module-level globals that the helper functions read, so
    # this block intentionally stays at module scope.
    driver = webdriver.PhantomJS(executable_path="phantomjs.exe")
    try:
        setup_web()
        remove_img()

        # get body boundary
        body = driver.find_elements_by_tag_name("body")[0]
        body_left = body.location['x']
        body_width = body.size['width']

        # Use the first tag in the list that yields any elements.
        tag_list = ['p', 'ol']
        questions = []
        for tag in tag_list:
            questions = driver.find_elements_by_tag_name(tag)  # find each question
            if questions:
                break

        png = driver.get_screenshot_as_png()  # screenshot of the page as PNG bytes
        img_base = Image.open(BytesIO(png))  # open the screenshot in memory with PIL
        crop_list = get_crop_list()  # get each question location
        print(len(crop_list))
        crop_save(crop_list)  # crop from base image and save png
    finally:
        # Always shut the browser down, even when a step above fails, so
        # the PhantomJS process is not left running.
        driver.quit()
    print("\nFinish !!!")