forked from SpiderClub/haipproxy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathajax_spider.py
33 lines (24 loc) · 942 Bytes
/
ajax_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
"""
Ajax proxy IP crawler built on scrapy-splash.
"""
from config.settings import SPIDER_AJAX_TASK
from ..redis_spiders import RedisAjaxSpider
from ..items import ProxyUrlItem
from .base import BaseSpider
class AjaxSpider(BaseSpider, RedisAjaxSpider):
    """Proxy spider named ``ajax`` with ``task_queue`` set to ``SPIDER_AJAX_TASK``.

    On construction it registers :meth:`parse_goubanjia` in
    ``self.parser_maps`` under the key ``'goubanjia'``.
    """

    name = 'ajax'
    task_queue = SPIDER_AJAX_TASK

    def __init__(self):
        super().__init__()
        # setdefault leaves an already-registered 'goubanjia' parser untouched.
        self.parser_maps.setdefault('goubanjia', self.parse_goubanjia)

    def parse_goubanjia(self, response):
        """Build proxy URL items from the table rows of a goubanjia response.

        For every ``<tr>`` after the first, the text nodes found under the
        first cell (ignoring ``<p>`` elements) are collected: the last one is
        used as the port and all preceding ones are concatenated into the IP.
        One ``ProxyUrlItem`` is produced per protocol reported for the row.

        Rows that yield fewer than two text fragments cannot supply both an
        IP and a port; they are skipped so that a single malformed row does
        not raise ``IndexError`` and discard the rest of the page.

        :param response: response object exposing ``xpath()``.
        :return: list of ``ProxyUrlItem`` instances (possibly empty).
        """
        items = []
        # The first <tr> is dropped -- presumably the header row; confirm
        # against the live page layout.
        for row in response.xpath('//tr')[1:]:
            fragments = row.xpath('td[1]//*[name(.)!="p"]/text()').extract()
            if len(fragments) < 2:
                # No usable ip/port pair in this row.
                continue

            *ip_parts, port = fragments
            ip = "".join(ip_parts)

            # NOTE: `procotol_extractor` is not defined in this class, so it
            # must come from a base class; the spelling is kept as-is.
            protocols = self.procotol_extractor(row.extract())
            items.extend(
                ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port))
                for protocol in protocols
            )
        return items