Add dynamic signaling #38

Open

wants to merge 1 commit into base: master
41 changes: 39 additions & 2 deletions rotating_proxies/expire.py
@@ -23,7 +23,7 @@ class Proxies(object):
* unchecked.

Initially, all proxies are in 'unchecked' state.
When a request using a proxy is successful, this proxy moves to 'good'
state. When a request using a proxy fails, the proxy moves to 'dead' state.

For crawling only 'good' and 'unchecked' proxies are used.
@@ -32,7 +32,7 @@ class Proxies(object):
'reanimated'). This timeout increases exponentially after each
unsuccessful attempt to use a proxy.
"""
def __init__(self, proxy_list, backoff=None):
def __init__(self, proxy_list, backoff=None, crawler=None):
self.proxies = {url: ProxyState() for url in proxy_list}
self.proxies_by_hostport = {
extract_proxy_hostport(proxy): proxy
@@ -46,6 +46,8 @@ def __init__(self, proxy_list, backoff=None):
backoff = exp_backoff_full_jitter
self.backoff = backoff

self.crawler = crawler

def get_random(self):
""" Return a random available proxy (either good or unchecked) """
available = list(self.unchecked | self.good)
@@ -72,9 +74,13 @@ def mark_dead(self, proxy, _time=None):

if proxy in self.good:
logger.debug("GOOD proxy became DEAD: <%s>" % proxy)

else:
logger.debug("Proxy <%s> is DEAD" % proxy)

if self.crawler:
self.crawler.signals.send_catch_log("DEAD_PROXY", proxy=proxy)

self.unchecked.discard(proxy)
self.good.discard(proxy)
self.dead.add(proxy)
@@ -94,6 +100,9 @@ def mark_good(self, proxy):
if proxy not in self.good:
logger.debug("Proxy <%s> is GOOD" % proxy)

if self.crawler:
self.crawler.signals.send_catch_log("GOOD_PROXY", proxy=proxy)

self.unchecked.discard(proxy)
self.dead.discard(proxy)
self.good.add(proxy)
@@ -118,6 +127,34 @@ def reset(self):
self.dead.remove(proxy)
self.unchecked.add(proxy)

def add(self, proxy):
""" Add a proxy to the proxy list """
if proxy in self.proxies:
logger.warn("Proxy <%s> is already in proxies list" % proxy)
return

hostport = extract_proxy_hostport(proxy)
self.proxies[proxy] = ProxyState()
self.proxies_by_hostport[hostport] = proxy
self.unchecked.add(proxy)

def remove(self, proxy):
"""
Permanently remove a proxy. The proxy cannot be recovered unless
'add()' is called again.
"""
if proxy not in self.proxies:
logger.warn("Proxy <%s> was not found in proxies list" % proxy)
return

logger.debug("Removing proxy <%s> from proxies list" % proxy)
hostport = extract_proxy_hostport(proxy)
self.unchecked.discard(proxy)
self.good.discard(proxy)
self.dead.discard(proxy)
del self.proxies[proxy]
del self.proxies_by_hostport[hostport]

@property
def mean_backoff_time(self):
if not self.dead:
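With the signals wired up in from_crawler() below, a spider can grow or shrink the proxy pool while the crawl is running by sending "ADD_PROXY" / "REMOVE_PROXY". The following is a minimal sketch, not part of this patch: the spider name, start URL and proxy addresses are placeholders; only the signal names and the proxy= keyword come from the diff.

import scrapy


class DynamicProxySpider(scrapy.Spider):
    # Hypothetical spider; name, start URL and proxy addresses are placeholders.
    name = "dynamic_proxy_example"
    start_urls = ["https://example.com"]

    def parse(self, response):
        # Feed a freshly discovered proxy into the rotation; Proxies.add()
        # puts it in the 'unchecked' state.
        self.crawler.signals.send_catch_log(
            "ADD_PROXY", proxy="http://10.0.0.1:8888")

        # Permanently drop a proxy we no longer trust; Proxies.remove()
        # forgets it until it is added again.
        self.crawler.signals.send_catch_log(
            "REMOVE_PROXY", proxy="http://10.0.0.2:8888")

        yield {"url": response.url}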
27 changes: 16 additions & 11 deletions rotating_proxies/middlewares.py
@@ -68,6 +68,7 @@ def __init__(self, proxy_list, logstats_interval, stop_if_no_proxies,

backoff = partial(exp_backoff_full_jitter, base=backoff_base, cap=backoff_cap)
self.proxies = Proxies(self.cleanup_proxy_list(proxy_list),
crawler=crawler,
backoff=backoff)
self.logstats_interval = logstats_interval
self.reanimate_interval = 5
@@ -95,6 +96,10 @@ def from_crawler(cls, crawler):
backoff_cap=s.getfloat('ROTATING_PROXY_BACKOFF_CAP', 3600),
crawler=crawler,
)
crawler.signals.connect(mw.proxies.add,
signal="ADD_PROXY")
crawler.signals.connect(mw.proxies.remove,
signal="REMOVE_PROXY")
crawler.signals.connect(mw.engine_started,
signal=signals.engine_started)
crawler.signals.connect(mw.engine_stopped,
@@ -220,32 +225,32 @@ class BanDetectionMiddleware(object):

By default, a client is considered banned if a request failed, and alive
if a response was received. You can override the ban detection method by
passing a path to a custom BanDetectionPolicy in
``ROTATING_PROXY_BAN_POLICY``, e.g.::

ROTATING_PROXY_BAN_POLICY = 'myproject.policy.MyBanPolicy'

The policy must be a class with ``response_is_ban``
and ``exception_is_ban`` methods. These methods can return True
(ban detected), False (not a ban) or None (unknown). It can be convenient
to subclass and modify default BanDetectionPolicy::

# myproject/policy.py
from rotating_proxies.policy import BanDetectionPolicy

class MyPolicy(BanDetectionPolicy):
def response_is_ban(self, request, response):
# use default rules, but also consider HTTP 200 responses
# a ban if there is a 'captcha' word in the response body.
ban = super(MyPolicy, self).response_is_ban(request, response)
ban = ban or b'captcha' in response.body
return ban

def exception_is_ban(self, request, exception):
# override the method completely: don't take exceptions into account
return None

Instead of creating a policy you can also implement ``response_is_ban``
and ``exception_is_ban`` methods as spider methods, for example::

class MySpider(scrapy.Spider):
@@ -256,7 +261,7 @@ def response_is_ban(self, request, response):

def exception_is_ban(self, request, exception):
return None

"""
def __init__(self, stats, policy):
self.stats = stats
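In the other direction, any component can react to proxy health changes by connecting to the "GOOD_PROXY" and "DEAD_PROXY" signals sent from mark_good() and mark_dead() in expire.py. Below is a minimal sketch of such a listener written as a Scrapy extension; the class name, module and logging behaviour are assumptions, not part of this patch.

import logging

logger = logging.getLogger(__name__)


class ProxyHealthLogger(object):
    """Hypothetical extension that logs proxy state changes."""

    def __init__(self, crawler):
        crawler.signals.connect(self.on_dead_proxy, signal="DEAD_PROXY")
        crawler.signals.connect(self.on_good_proxy, signal="GOOD_PROXY")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def on_dead_proxy(self, proxy):
        # Invoked when Proxies.mark_dead() sends "DEAD_PROXY"; a real
        # extension might fetch a replacement and emit "ADD_PROXY" here.
        logger.info("Proxy reported dead: %s", proxy)

    def on_good_proxy(self, proxy):
        # Invoked when Proxies.mark_good() sends "GOOD_PROXY".
        logger.info("Proxy reported good: %s", proxy)

Such an extension would be enabled through the EXTENSIONS setting as usual; the module path used there is up to the project.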
12 changes: 12 additions & 0 deletions tests/test_expire.py
@@ -24,6 +24,18 @@ def test_proxies():
p.mark_good('bar')
assert p.get_random() == 'bar'

p.remove('foo')
assert p.get_random() == 'bar'

p.add('baz')
assert p.get_random() == 'bar'

p.remove('bar')
p.add('qux')
p.mark_good('qux')
assert p.get_random() == 'qux'



def test_auth_proxies():
proxy_list = ['http://foo:bar@baz:1234', 'http://egg:1234']
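For completeness, the middlewares touched by this patch are enabled in a project's settings in the usual way; a minimal sketch follows, where the middleware priorities and proxy addresses are illustrative.

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}

# Initial proxy list; with this patch applied, entries can also be added or
# removed at runtime via the "ADD_PROXY" / "REMOVE_PROXY" signals.
ROTATING_PROXY_LIST = [
    'http://10.0.0.1:8888',
    'http://10.0.0.2:8888',
]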