forked from SpiderClub/haipproxy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathajax_gfw_spider.py
50 lines (40 loc) · 1.63 KB
/
ajax_gfw_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""
Ajax gfw proxy ip crawler with scrapy-splash
"""
from config.settings import SPIDER_AJAX_GFW_TASK
from ..redis_spiders import RedisAjaxSpider
from ..items import ProxyUrlItem
from .base import BaseSpider
class AjaxGFWSpider(BaseSpider, RedisAjaxSpider):
    """Crawl GFW-blocked proxy-list sites whose pages need JS rendering.

    Pages are fetched via scrapy-splash (ajax-capable spider base); crawl
    tasks are pulled from the Redis queue named by SPIDER_AJAX_GFW_TASK.
    """
    name = 'ajax_gfw'
    # proxy_mode 2: the target sites are themselves blocked, so requests to
    # them must be routed through a proxy — NOTE(review): inferred from the
    # sibling spiders' convention; confirm against BaseSpider.
    proxy_mode = 2
    task_queue = SPIDER_AJAX_GFW_TASK

    def __init__(self):
        super().__init__()
        # Register site-specific parsers; setdefault keeps any mapping the
        # base class may already have installed for these keys.
        self.parser_maps.setdefault('cnproxy', self.parse_cnproxy)
        self.parser_maps.setdefault('free-proxy', self.parse_free_proxy)

    def parse_cnproxy(self, response):
        """Parse a cnproxy-style table page into ProxyUrlItem objects.

        :param response: rendered page response (scrapy/splash)
        :return: list of ProxyUrlItem, one per (protocol, ip, port) found
        """
        items = list()
        # The first two <tr> rows are table headers.
        infos = response.xpath('//tr')[2:]
        for info in infos:
            info_str = info.extract()
            proxy_detail = info.css('td::text').extract()
            # Fix: guard malformed rows (ads/separators). The original
            # indexed [0] and [1] unconditionally, so one short row raised
            # IndexError and aborted the whole page. Mirrors the guard
            # already present in parse_free_proxy.
            if len(proxy_detail) < 2:
                continue
            ip = proxy_detail[0].strip()
            # Drop the first char of the port cell — the site prefixes the
            # port with a separator character (per the original slicing).
            port = proxy_detail[1][1:].strip()
            if not ip or not port:
                continue
            cur_protocols = self.procotol_extractor(info_str)
            for protocol in cur_protocols:
                items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))
        return items

    def parse_free_proxy(self, response):
        """Parse a free-proxy-style table page into ProxyUrlItem objects.

        :param response: rendered page response (scrapy/splash)
        :return: list of ProxyUrlItem, one per (protocol, ip, port) found
        """
        items = list()
        # Skip the header row of the proxy table.
        infos = response.xpath('//table[@id="proxy_list"]').css('tr')[1:]
        for info in infos:
            info_str = info.extract()
            ip = info.css('abbr::text').extract_first()
            port = info.css('.fport::text').extract_first()
            # Rows where ip/port could not be extracted (obfuscated or ad
            # rows) are skipped rather than crashing the parse.
            if not ip or not port:
                continue
            cur_protocols = self.procotol_extractor(info_str)
            for protocol in cur_protocols:
                items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))
        return items