-
Notifications
You must be signed in to change notification settings - Fork 1
/
zhua.py
85 lines (73 loc) · 2.36 KB
/
zhua.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf8 -*-
##################################
# Universal scraping tool
#v1.0
##################################
import urllib
import Queue
import time
import re
from threading import Thread
class config:
    """Global settings read by the crawler threads as ``config.<name>``."""

    # URL whose content is scanned by the first crawl level.
    mainUrl = "http://www.baidu.com"
    # Number of contentGet threads started by the start-up code.
    contentThreadNum = 5
    # Queue that the last crawl level writes (data, content) tuples to;
    # a maxsize of 0 means the queue is unbounded.
    contentQue = Queue.Queue(0)
# Per-level crawl rules, per-level queues and per-level worker threads.
reList = []
queList = []
spiderList = []

# Rule set for one crawl level:
#   'list'     -- compiled regex applied to the page content
#   'nextList' -- compiled regex (not referenced by the code in this file)
#   'toGet'    -- field name -> position of that field inside one regex match
reCell = {
    'list': re.compile('''xxx.php'''),
    'nextList': re.compile('''xxx.php'''),
    'toGet': {
        'url': 0,
        'other': 1,
    },
}
# Register the rule set itself; appending the builtin ``object`` type would
# leave nothing subscriptable for the workers to read.
reList.append(reCell)
class spiderGo(Thread):
    """Worker thread handling one level of the multi-level crawl.

    Level ``index`` reads work items from ``queList[index - 1]`` and puts its
    results on ``queList[index]``; the first level starts from
    ``config.mainUrl`` and the last level puts ``(data, content)`` tuples on
    ``config.contentQue``.

    A work item is a tuple with one packaged dict per completed level, so
    ``item[n]`` is the dict produced at level ``n``.  The integer ``0`` on a
    queue is the stop sentinel for the worker reading that queue.
    """

    def __init__(self, queList, reList, index):
        """Remember the shared queues, the per-level rules and this level.

        queList -- list of queues, one per level.
        reList  -- list of per-level rule dicts; each holds a compiled regex
                   under 'list' and a field-name -> match-position mapping
                   under 'toGet'.
        index   -- zero-based level handled by this worker.
        """
        Thread.__init__(self)
        self.queList = queList
        self.reList = reList
        self.reLen = len(self.reList)
        self.index = index

    def packageData(self, cell):
        """Return a dict mapping each 'toGet' field name to ``cell[position]``.

        cell -- one regex match, indexable by the positions listed in this
                level's 'toGet' mapping.
        """
        data = {}
        for k, i in self.reList[self.index]['toGet'].items():
            data[k] = cell[i]
        return data

    def getContent(self, url):
        """Return the content for ``url``.

        Placeholder: no fetch is performed, the URL is returned unchanged.
        """
        return url

    def _matches(self, content):
        """Return every match of this level's 'list' regex in ``content``."""
        return self.reList[self.index]['list'].findall(content)

    def run(self):
        """Process this worker's level until the stop sentinel is read."""
        if self.index == 0:  # first level
            content = self.getContent(config.mainUrl)
            for match in self._matches(content):
                # One-element tuple so later levels can index items by level.
                self.queList[self.index].put((self.packageData(match),))
        elif self.index > 0 and self.index < self.reLen - 1:  # intermediate level
            source = self.queList[self.index - 1]
            while True:
                data = source.get()
                if data == 0:  # stop sentinel
                    break
                content = self.getContent(data[self.index - 1]['url'])
                for match in self._matches(content):
                    # Extend the item with this level's packaged dict.
                    self.queList[self.index].put(
                        data + (self.packageData(match),))
        elif self.index == self.reLen - 1:  # last level
            source = self.queList[self.index - 1]
            while True:
                data = source.get()
                if data == 0:  # stop sentinel
                    break
                content = self.getContent(data[self.index - 1]['url'])
                config.contentQue.put((data, content))
class contentGet(Thread):
    """Content worker thread; ``run`` is currently a stub that does no work.

    NOTE(review): presumably meant to consume ``config.contentQue`` -- confirm.
    """

    def __init__(self):
        """Initialise the underlying Thread."""
        # The base initialiser must receive the instance explicitly.
        Thread.__init__(self)

    def run(self):
        """Do nothing and return False."""
        return False
# Create one unbounded queue and one spiderGo worker per crawl level.  Every
# worker receives the same queList object, which keeps growing in this loop.
# NOTE(review): the spiderGo workers are only collected in spiderList here;
# nothing in this file calls start() on them -- confirm whether intended.
for i in range(len(reList)):
    queList.append(Queue.Queue(0))
    spiderList.append(spiderGo(queList, reList, i))

# Start the configured number of content worker threads.
for i in range(config.contentThreadNum):
    ct = contentGet()
    ct.start()
# Regex for fetching the final content