From d79e7eddd3f283d3d7e8a358fe403660205d4026 Mon Sep 17 00:00:00 2001
From: conlin <995018884@qq.com>
Date: Wed, 27 Sep 2023 16:51:28 +0800
Subject: [PATCH] feat: metric & logger
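
Replace the per-module logging.getLogger() loggers with the single shared
logger exported from aioscrapy.utils.log, switch log calls to pre-formatted
messages and logger.exception(), and make LogFormatter return keyword
arguments that are passed directly to logger.log(). Logging is now configured
per crawler via configure_logging(spidercls, settings), and LOG_ROTATION /
LOG_RETENTION replace LOG_MAX_BYTES / LOG_BACKUP_COUNT.

Add a Metric extension (aioscrapy.libs.extensions.metric.Metric) that
periodically collects the stats counters and emits their deltas as InfluxDB
line protocol, either POSTed to METRIC_INFLUXDB_URL (authenticated with
METRIC_INFLUXDB_TOKEN) or, when no URL is configured, appended to a rotating
"<spider_name>.metric" log file.

Also: bump the version to 2.0.0, export Settings from the aioscrapy package,
route task creation through the create_task helper in aioscrapy.utils.tools,
raise the default REDIRECT_MAX_TIMES to 20 in the aiohttp/httpx handlers,
disable the AutoThrottle extension by default, and add
example/singlespider/demo_metric.py.

A minimal configuration sketch for the Metric extension (the extension path
and setting names come from this patch; the URL, token and location values
below are placeholders, and example/singlespider/demo_metric.py contains the
complete example):

    EXTENSIONS = {
        'aioscrapy.libs.extensions.metric.Metric': 0,
        # keep any other extensions you already use
    }
    # With METRIC_INFLUXDB_URL set, counter deltas are POSTed as InfluxDB line
    # protocol; otherwise they are written to a rotating "<spider>.metric" file.
    METRIC_INFLUXDB_URL = 'http://127.0.0.1:8086/api/v2/write?org=my-org&bucket=metrics'  # placeholder
    METRIC_INFLUXDB_TOKEN = '<influxdb-token>'  # placeholder
    METRIC_LOCATION = 'node-1'                  # defaults to "<hostname>_<pid>"
    METRIC_INTERVAL = 10.0                      # seconds between flushes
    # Optional: limit which stats keys are reported (only the keys are used;
    # defaults to every collected stat).
    # METRICS = {'downloader/request_count': 1, 'downloader/response_count': 1}

Each flush reports only the delta accumulated since the previous flush.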
---
aioscrapy/VERSION | 2 +-
aioscrapy/__init__.py | 2 +-
aioscrapy/core/downloader/__init__.py | 12 +-
.../core/downloader/handlers/__init__.py | 8 +-
aioscrapy/core/downloader/handlers/aiohttp.py | 6 +-
aioscrapy/core/downloader/handlers/httpx.py | 6 +-
.../handlers/playwright/__init__.py | 4 -
aioscrapy/core/downloader/handlers/pyhttpx.py | 4 +-
.../core/downloader/handlers/requests.py | 4 +-
aioscrapy/core/engine.py | 35 ++-
aioscrapy/core/scraper.py | 53 ++---
aioscrapy/crawler.py | 67 +++---
aioscrapy/db/__init__.py | 4 +-
aioscrapy/db/aiomongo.py | 4 -
aioscrapy/db/aiomysql.py | 3 -
aioscrapy/db/aiopg.py | 4 +-
aioscrapy/db/aiorabbitmq.py | 3 -
aioscrapy/dupefilters/disk.py | 13 +-
aioscrapy/dupefilters/redis.py | 11 +-
aioscrapy/http/request/__init__.py | 4 +-
aioscrapy/libs/downloader/retry.py | 25 +--
aioscrapy/libs/downloader/stats.py | 6 +-
aioscrapy/libs/extensions/closespider.py | 11 +-
aioscrapy/libs/extensions/corestats.py | 8 +-
aioscrapy/libs/extensions/logstats.py | 17 +-
aioscrapy/libs/extensions/metric.py | 138 ++++++++++++
aioscrapy/libs/extensions/throttle.py | 13 +-
aioscrapy/libs/pipelines/__init__.py | 10 +-
aioscrapy/libs/pipelines/sink/_csv.py | 5 +-
aioscrapy/libs/pipelines/sink/execl.py | 26 +--
aioscrapy/libs/pipelines/sink/mongo.py | 4 +-
aioscrapy/libs/pipelines/sink/mysql.py | 4 +-
aioscrapy/libs/pipelines/sink/pg.py | 4 +-
aioscrapy/libs/spider/depth.py | 12 +-
aioscrapy/libs/spider/httperror.py | 10 +-
aioscrapy/libs/spider/offsite.py | 8 +-
aioscrapy/libs/spider/urllength.py | 13 +-
aioscrapy/logformatter.py | 31 ++-
aioscrapy/middleware/absmanager.py | 15 +-
aioscrapy/middleware/downloader.py | 5 -
aioscrapy/proxy/__init__.py | 4 +-
aioscrapy/proxy/redis.py | 4 +-
aioscrapy/queue/redis.py | 7 +-
aioscrapy/settings/default_settings.py | 16 +-
aioscrapy/spiders/__init__.py | 45 +---
aioscrapy/statscollectors.py | 5 +-
.../project/module/middlewares.py.tmpl | 6 +-
aioscrapy/utils/log.py | 201 +++---------------
aioscrapy/utils/signal.py | 22 +-
aioscrapy/utils/spider.py | 12 +-
aioscrapy/utils/tools.py | 8 +
example/singlespider/demo_duplicate.py | 18 +-
example/singlespider/demo_metric.py | 66 ++++++
example/singlespider/demo_queue_memory.py | 7 +-
example/singlespider/demo_request_httpx.py | 4 +-
example/singlespider/start.py | 6 +-
setup.py | 1 +
57 files changed, 474 insertions(+), 572 deletions(-)
create mode 100644 aioscrapy/libs/extensions/metric.py
create mode 100644 example/singlespider/demo_metric.py
diff --git a/aioscrapy/VERSION b/aioscrapy/VERSION
index 6261a05..359a5b9 100644
--- a/aioscrapy/VERSION
+++ b/aioscrapy/VERSION
@@ -1 +1 @@
-1.3.1
\ No newline at end of file
+2.0.0
\ No newline at end of file
diff --git a/aioscrapy/__init__.py b/aioscrapy/__init__.py
index ac097ab..6f2da6a 100644
--- a/aioscrapy/__init__.py
+++ b/aioscrapy/__init__.py
@@ -13,7 +13,7 @@
__all__ = [
- '__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Crawler'
+ '__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Crawler', 'Settings'
]
diff --git a/aioscrapy/core/downloader/__init__.py b/aioscrapy/core/downloader/__init__.py
index 58f2c98..fdf4c24 100644
--- a/aioscrapy/core/downloader/__init__.py
+++ b/aioscrapy/core/downloader/__init__.py
@@ -1,5 +1,4 @@
import asyncio
-import logging
import random
from abc import abstractmethod
from collections import deque
@@ -16,10 +15,9 @@
from aioscrapy.settings import Settings
from aioscrapy.signalmanager import SignalManager
from aioscrapy.utils.httpobj import urlparse_cached
+from aioscrapy.utils.log import logger
from aioscrapy.utils.misc import load_instance
-from aioscrapy.utils.tools import call_helper
-
-logger = logging.getLogger('aioscrapy.downloader')
+from aioscrapy.utils.tools import call_helper, create_task
class BaseDownloaderMeta(type):
@@ -135,7 +133,7 @@ def __init__(
self.active: Set[Request] = set()
self.slots: dict = {}
self.running: bool = True
- asyncio.create_task(self._slot_gc(60))
+ create_task(self._slot_gc(60))
@classmethod
async def from_crawler(cls, crawler) -> "Downloader":
@@ -170,13 +168,13 @@ async def _process_queue(self, slot: Slot) -> None:
slot.delay_lock = True
await asyncio.sleep(penalty)
slot.delay_lock = False
- asyncio.create_task(self._process_queue(slot))
+ create_task(self._process_queue(slot))
return
while slot.queue and slot.free_transfer_slots() > 0:
request = slot.queue.popleft()
slot.transferring.add(request)
- asyncio.create_task(self._download(slot, request))
+ create_task(self._download(slot, request))
if delay:
break
diff --git a/aioscrapy/core/downloader/handlers/__init__.py b/aioscrapy/core/downloader/handlers/__init__.py
index 75dd718..a135bd9 100644
--- a/aioscrapy/core/downloader/handlers/__init__.py
+++ b/aioscrapy/core/downloader/handlers/__init__.py
@@ -1,6 +1,5 @@
"""Download handlers for different schemes"""
-import logging
from abc import abstractmethod
from typing import Optional
@@ -8,11 +7,10 @@
from aioscrapy.exceptions import NotConfigured, NotSupported
from aioscrapy.http import HtmlResponse
from aioscrapy.utils.httpobj import urlparse_cached
+from aioscrapy.utils.log import logger
from aioscrapy.utils.misc import load_instance
from aioscrapy.utils.python import without_none_values
-logger = logging.getLogger(__name__)
-
class BaseDownloadHandler:
@abstractmethod
@@ -67,9 +65,7 @@ async def _load_handler(self, scheme: str) -> Optional[BaseDownloadHandler]:
self._notconfigured[scheme] = str(ex)
return None
except Exception as ex:
- logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
- {"clspath": path, "scheme": scheme},
- exc_info=True, extra={'crawler': self._crawler})
+ logger.exception(f'Loading "{path}" for scheme "{scheme}"')
self._notconfigured[scheme] = str(ex)
return None
else:
diff --git a/aioscrapy/core/downloader/handlers/aiohttp.py b/aioscrapy/core/downloader/handlers/aiohttp.py
index c7ba890..353ca5c 100644
--- a/aioscrapy/core/downloader/handlers/aiohttp.py
+++ b/aioscrapy/core/downloader/handlers/aiohttp.py
@@ -1,5 +1,4 @@
import asyncio
-import logging
import re
import ssl
from typing import Optional
@@ -10,8 +9,7 @@
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import HtmlResponse
from aioscrapy.settings import Settings
-
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class AioHttpDownloadHandler(BaseDownloadHandler):
@@ -41,7 +39,7 @@ async def download_request(self, request: Request, _) -> HtmlResponse:
'data': request.body or None,
'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
'dont_redirect') is None else request.meta.get('dont_redirect'),
- 'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 10),
+ 'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20),
}
headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
diff --git a/aioscrapy/core/downloader/handlers/httpx.py b/aioscrapy/core/downloader/handlers/httpx.py
index 6febb40..dee3ea1 100644
--- a/aioscrapy/core/downloader/handlers/httpx.py
+++ b/aioscrapy/core/downloader/handlers/httpx.py
@@ -1,4 +1,3 @@
-import logging
import ssl
import httpx
@@ -7,8 +6,7 @@
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import HtmlResponse
from aioscrapy.settings import Settings
-
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class HttpxDownloadHandler(BaseDownloadHandler):
@@ -43,7 +41,7 @@ async def download_request(self, request: Request, _) -> HtmlResponse:
'verify': request.meta.get('verify_ssl', self.verify_ssl),
'follow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
'dont_redirect') is None else request.meta.get('dont_redirect'),
- 'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 10),
+ 'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20),
})
ssl_ciphers = request.meta.get('TLS_CIPHERS')
ssl_protocol = request.meta.get('ssl_protocol', self.ssl_protocol)
diff --git a/aioscrapy/core/downloader/handlers/playwright/__init__.py b/aioscrapy/core/downloader/handlers/playwright/__init__.py
index bd17f42..e468580 100644
--- a/aioscrapy/core/downloader/handlers/playwright/__init__.py
+++ b/aioscrapy/core/downloader/handlers/playwright/__init__.py
@@ -1,5 +1,3 @@
-import logging
-
from aioscrapy import Request
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import PlaywrightResponse
@@ -8,8 +6,6 @@
from .driverpool import WebDriverPool
from .webdriver import PlaywrightDriver
-logger = logging.getLogger(__name__)
-
class PlaywrightHandler(BaseDownloadHandler):
def __init__(self, settings: Settings):
diff --git a/aioscrapy/core/downloader/handlers/pyhttpx.py b/aioscrapy/core/downloader/handlers/pyhttpx.py
index a555ebc..324a156 100644
--- a/aioscrapy/core/downloader/handlers/pyhttpx.py
+++ b/aioscrapy/core/downloader/handlers/pyhttpx.py
@@ -1,5 +1,4 @@
import asyncio
-import logging
import pyhttpx
@@ -7,8 +6,7 @@
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import HtmlResponse
from aioscrapy.settings import Settings
-
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class PyhttpxDownloadHandler(BaseDownloadHandler):
diff --git a/aioscrapy/core/downloader/handlers/requests.py b/aioscrapy/core/downloader/handlers/requests.py
index 12fad88..844bda8 100644
--- a/aioscrapy/core/downloader/handlers/requests.py
+++ b/aioscrapy/core/downloader/handlers/requests.py
@@ -1,5 +1,4 @@
import asyncio
-import logging
import requests
@@ -7,8 +6,7 @@
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import HtmlResponse
from aioscrapy.settings import Settings
-
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class RequestsDownloadHandler(BaseDownloadHandler):
diff --git a/aioscrapy/core/engine.py b/aioscrapy/core/engine.py
index 8f3c7d1..de704e6 100644
--- a/aioscrapy/core/engine.py
+++ b/aioscrapy/core/engine.py
@@ -1,10 +1,9 @@
# _*_ coding: utf-8 _*_
import asyncio
-import logging
-from typing import Optional, AsyncGenerator, Union, Callable
from asyncio import Queue
from asyncio.queues import QueueEmpty
+from typing import Optional, AsyncGenerator, Union, Callable
import aioscrapy
from aioscrapy import Spider
@@ -17,9 +16,9 @@
from aioscrapy.http.request import Request
from aioscrapy.utils.log import logformatter_adapter
from aioscrapy.utils.misc import load_instance
-from aioscrapy.utils.tools import call_helper
+from aioscrapy.utils.tools import call_helper, create_task
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class Slot:
@@ -72,7 +71,7 @@ async def start(
while not self.finish:
self.running and await self._next_request()
await asyncio.sleep(1)
- self.enqueue_cache_num != 1 and asyncio.create_task(self._crawl())
+ self.enqueue_cache_num != 1 and create_task(self._crawl())
self.running and await self._spider_idle(self.spider)
async def stop(self, reason: str = 'shutdown') -> None:
@@ -83,7 +82,7 @@ async def stop(self, reason: str = 'shutdown') -> None:
while not self.is_idle():
await asyncio.sleep(0.2)
- self.enqueue_cache_num != 1 and asyncio.create_task(self._crawl())
+ self.enqueue_cache_num != 1 and create_task(self._crawl())
await self.close_spider(self.spider, reason=reason)
await self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
self.finish = True
@@ -93,7 +92,7 @@ async def open(
spider: Spider,
start_requests: Optional[AsyncGenerator] = None
) -> None:
- logger.info("Spider opened", extra={'spider': spider})
+ logger.info("Spider opened")
self.spider = spider
await call_helper(self.crawler.stats.open_spider, spider)
@@ -145,7 +144,7 @@ async def _next_request(self) -> None:
self.slot.start_requests = None
except Exception as e:
self.slot.start_requests = None
- logger.error('Error while obtaining start requests', exc_info=e, extra={'spider': self.spider})
+ logger.exception('Error while obtaining start requests')
else:
request and await self.crawl(request)
finally:
@@ -177,9 +176,7 @@ async def handle_downloader_output(
result.request = request
if isinstance(result, Response):
- logkws = self.logformatter.crawled(request, result, self.spider)
- if logkws is not None:
- logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider})
+ logger.log(** self.logformatter.crawled(request, result, self.spider))
await self.signals.send_catch_log(signals.response_received,
response=result, request=request, spider=self.spider)
await self.scraper.enqueue_scrape(result, request)
@@ -207,7 +204,7 @@ def is_idle(self) -> bool:
async def crawl(self, request: Request) -> None:
if self.enqueue_cache_num == 1:
await self.scheduler.enqueue_request(request)
- asyncio.create_task(self._next_request())
+ create_task(self._next_request())
else:
await self.enqueue_cache.put(request)
@@ -224,15 +221,13 @@ async def _crawl(self) -> None:
break
if requests:
await call_helper(self.scheduler.enqueue_request_batch, requests)
- asyncio.create_task(self._next_request())
+ create_task(self._next_request())
self.enqueue_unlock = True
async def close_spider(self, spider: Spider, reason: str = 'cancelled') -> None:
"""Close (cancel) spider and clear all its outstanding requests"""
- logger.info("Closing spider (%(reason)s)",
- {'reason': reason},
- extra={'spider': spider})
+ logger.info(f"Closing spider ({reason})")
async def close_handler(
callback: Callable,
@@ -243,11 +238,7 @@ async def close_handler(
try:
await call_helper(callback, *args, **kwargs)
except (Exception, BaseException) as e:
- logger.error(
- errmsg,
- exc_info=e,
- extra={'spider': spider}
- )
+ logger.exception(errmsg)
await close_handler(self.downloader.close, errmsg='Downloader close failure')
@@ -260,7 +251,7 @@ async def close_handler(
await close_handler(self.crawler.stats.close_spider, spider, reason=reason, errmsg='Stats close failure')
- logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider})
+ logger.info(f"Spider closed ({reason})")
await close_handler(setattr, self, 'slot', None, errmsg='Error while unassigning slot')
diff --git a/aioscrapy/core/scraper.py b/aioscrapy/core/scraper.py
index 7aa1f45..06cf888 100644
--- a/aioscrapy/core/scraper.py
+++ b/aioscrapy/core/scraper.py
@@ -1,7 +1,6 @@
"""This module implements the Scraper component which parses responses and
extracts information from them"""
import asyncio
-import logging
from typing import Any, AsyncGenerator, Set, Union, Optional
import aioscrapy
@@ -12,11 +11,9 @@
from aioscrapy.logformatter import LogFormatter
from aioscrapy.middleware import ItemPipelineManager, SpiderMiddlewareManager
from aioscrapy.signalmanager import SignalManager
-from aioscrapy.utils.log import logformatter_adapter
+from aioscrapy.utils.log import logger
from aioscrapy.utils.misc import load_object
-from aioscrapy.utils.tools import call_helper
-
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.tools import call_helper, create_task
class Slot:
@@ -113,11 +110,8 @@ async def _scrape(self, result: Union[Response, BaseException], request: Request
await self.handle_spider_error(e, request, result)
else:
await self.handle_spider_output(output, request, result)
- except BaseException as e:
- logger.error('Scraper bug processing %(request)s',
- {'request': request},
- exc_info=e,
- extra={'spider': self.spider})
+ except BaseException:
+ logger.exception('Scraper bug processing %(request)s' % {'request': request})
finally:
if isinstance(result, PlaywrightResponse):
await result.release()
@@ -152,23 +146,16 @@ async def call_spider(self, result: Union[Response, BaseException], request: Req
async def handle_spider_error(self, exc: BaseException, request: Request, response: Response) -> None:
if isinstance(exc, CloseSpider):
- asyncio.create_task(self.crawler.engine.close_spider(self.spider, exc.reason or 'cancelled'))
+ create_task(self.crawler.engine.close_spider(self.spider, exc.reason or 'cancelled'))
return
- logkws = self.logformatter.spider_error(exc, request, response, self.spider)
- logger.log(
- *logformatter_adapter(logkws),
- exc_info=exc,
- extra={'spider': self.spider}
- )
+ logger.log(**self.logformatter.spider_error(exc, request, response, self.spider))
await self.signals.send_catch_log(
signal=signals.spider_error,
failure=exc, response=response,
spider=self.spider
)
- self.crawler.stats.inc_value(
- "spider_exceptions/%s" % exc.__class__.__name__,
- spider=self.spider
- )
+ self.crawler.stats.inc_value("spider_exceptions/%s" % exc.__class__.__name__, spider=self.spider)
+ self.crawler.stats.inc_value("spider_exceptions", spider=self.spider)
async def handle_spider_output(self, result: AsyncGenerator, request: Request, response: Response) -> None:
"""Iter each Request/Item (given in the output parameter) returned from the given spider"""
@@ -202,9 +189,8 @@ async def _process_spidermw_output(self, output: Any, request: Request, response
else:
typename = type(output).__name__
logger.error(
- 'Spider must return request, item, or None, got %(typename)r in %(request)s',
- {'request': request, 'typename': typename},
- extra={'spider': self.spider},
+ 'Spider must return request, item, or None, got %(typename)r in %(request)s' % {'request': request,
+ 'typename': typename},
)
async def _log_download_errors(
@@ -215,12 +201,7 @@ async def _log_download_errors(
) -> None:
"""Process and record errors"""
if isinstance(download_exception, BaseException) and not isinstance(download_exception, IgnoreRequest):
- logkws = self.logformatter.download_error(download_exception, request, self.spider)
- logger.log(
- *logformatter_adapter(logkws),
- extra={'spider': self.spider},
- exc_info=download_exception,
- )
+ logger.log(**self.logformatter.download_error(download_exception, request, self.spider))
if spider_exception is not download_exception:
raise spider_exception
@@ -230,23 +211,17 @@ async def _itemproc_finished(self, output: Any, item: Any, response: Response) -
self.slot.itemproc_size -= 1
if isinstance(output, BaseException):
if isinstance(output, DropItem):
- logkws = self.logformatter.dropped(item, output, response, self.spider)
- if logkws is not None:
- logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider})
+ logger.log(**self.logformatter.dropped(item, output, response, self.spider))
return await self.signals.send_catch_log_deferred(
signal=signals.item_dropped, item=item, response=response,
spider=self.spider, exception=output)
else:
- logkws = self.logformatter.item_error(item, output, response, self.spider)
- logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider},
- exc_info=output)
+ logger.log(**self.logformatter.item_error(item, output, response, self.spider))
return await self.signals.send_catch_log_deferred(
signal=signals.item_error, item=item, response=response,
spider=self.spider, failure=output)
else:
- logkws = self.logformatter.scraped(output, response, self.spider)
- if logkws is not None:
- logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider})
+ logger.log(**self.logformatter.scraped(output, response, self.spider))
return await self.signals.send_catch_log_deferred(
signal=signals.item_scraped, item=output, response=response,
spider=self.spider)
diff --git a/aioscrapy/crawler.py b/aioscrapy/crawler.py
index 2414b98..3ecdaec 100644
--- a/aioscrapy/crawler.py
+++ b/aioscrapy/crawler.py
@@ -1,5 +1,4 @@
import asyncio
-import logging
import pprint
import signal
import sys
@@ -16,14 +15,9 @@
from zope.interface.verify import verifyClass
from aioscrapy.logformatter import LogFormatter
-from aioscrapy import signals, Spider
+from aioscrapy import Spider
from aioscrapy.settings import overridden_settings, Settings
-from aioscrapy.utils.log import (
- get_scrapy_root_handler,
- install_scrapy_root_handler,
- LogCounterHandler,
- configure_logging,
-)
+from aioscrapy.utils.log import configure_logging, logger
from aioscrapy.utils.misc import load_object, load_instance
from aioscrapy.spiderloader import ISpiderLoader
from aioscrapy.exceptions import AioScrapyDeprecationWarning
@@ -36,8 +30,6 @@
from aioscrapy.utils.ossignal import install_shutdown_handlers, signal_names
from aioscrapy.statscollectors import StatsCollector
-logger = logging.getLogger(__name__)
-
class Crawler:
@@ -64,6 +56,8 @@ def __init__(self, spidercls: Type[Spider], settings: Union[Settings, dict, None
async def crawl(self, *args, **kwargs) -> None:
try:
+ configure_logging(self.spidercls, self.settings)
+
if self.crawling:
raise RuntimeError("Crawling already taking place")
@@ -71,15 +65,7 @@ async def crawl(self, *args, **kwargs) -> None:
self.signals = SignalManager(self)
self.stats = load_object(self.settings['STATS_CLASS'])(self)
- handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
- logging.root.addHandler(handler)
-
- d = dict(overridden_settings(self.settings))
- logger.info("Overridden settings:\n%(settings)s", {'settings': pprint.pformat(d)})
-
- if get_scrapy_root_handler() is not None:
- install_scrapy_root_handler(self.settings)
- self.signals.connect(lambda: logging.root.removeHandler(handler), signals.engine_stopped)
+ logger.info(f"Overridden settings:\n{pprint.pformat(dict(overridden_settings(self.settings)))}")
self.spider = await self.spidercls.from_crawler(self, *args, **kwargs)
self.logformatter = await load_instance(self.settings['LOG_FORMATTER'], crawler=self)
@@ -96,13 +82,23 @@ async def crawl(self, *args, **kwargs) -> None:
await self.engine.close()
raise e
- async def stop(self) -> None:
+ async def stop(self, signum=None) -> None:
"""Starts a graceful stop of the crawler and returns a deferred that is
fired when the crawler is stopped."""
+ if signum is not None:
+ asyncio.current_task().set_name(self.spidercls.name)
+ logger.info(
+ "Received %(signame)s, shutting down gracefully. Send again to force" % {
+ 'signame': signal_names[signum]
+ }
+ )
if self.crawling:
self.crawling = False
await self.engine.stop()
+ def _signal_shutdown(self, signum: Any, _) -> None:
+ asyncio.create_task(self.stop(signum))
+
class CrawlerRunner:
crawlers = property(
@@ -156,7 +152,7 @@ def crawl_soon(
self.active_crawler(crawler, *args, **kwargs)
def active_crawler(self, crawler: Crawler, *args, **kwargs) -> None:
- task = asyncio.create_task(crawler.crawl(*args, **kwargs))
+ task = asyncio.create_task(crawler.crawl(*args, **kwargs), name=crawler.spidercls.name)
self._active.add(task)
def _done(result):
@@ -204,8 +200,8 @@ def _create_crawler(
spidercls = self.spider_loader.load(spidercls)
return Crawler(spidercls, settings=settings)
- async def stop(self) -> None:
- await asyncio.gather(*[c.stop() for c in self.crawlers])
+ async def stop(self, signum=None) -> None:
+ await asyncio.gather(*[c.stop(signum) for c in self.crawlers])
class CrawlerProcess(CrawlerRunner):
@@ -217,28 +213,25 @@ def __init__(
) -> None:
super().__init__(settings)
install_shutdown_handlers(self._signal_shutdown)
- configure_logging(self.settings, install_root_handler)
def _signal_shutdown(self, signum: Any, _) -> None:
install_shutdown_handlers(self._signal_kill)
- signame = signal_names[signum]
- logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
- {'signame': signame})
- asyncio.create_task(self._graceful_stop_reactor())
+ asyncio.create_task(self.stop(signum))
def _signal_kill(self, signum: Any, _) -> None:
install_shutdown_handlers(signal.SIG_IGN)
signame = signal_names[signum]
- logger.info('Received %(signame)s twice, forcing unclean shutdown',
- {'signame': signame})
+ logger.info('Received %(signame)s twice, forcing unclean shutdown' % {'signame': signame})
asyncio.create_task(self._stop_reactor())
async def run(self) -> None:
- for crawler, (args, kwargs) in self.crawlers.items():
- self.active_crawler(crawler, *args, **kwargs)
- while self._active:
- await asyncio.gather(*self._active)
- await self.recycle_db_connect()
+ try:
+ for crawler, (args, kwargs) in self.crawlers.items():
+ self.active_crawler(crawler, *args, **kwargs)
+ while self._active:
+ await asyncio.gather(*self._active)
+ finally:
+ await self.recycle_db_connect()
def start(self) -> None:
if sys.platform.startswith('win'):
@@ -251,10 +244,6 @@ def start(self) -> None:
pass
asyncio.run(self.run())
- async def _graceful_stop_reactor(self) -> None:
- await self.stop()
- await self.recycle_db_connect()
-
async def _stop_reactor(self) -> None:
try:
await self.recycle_db_connect()
diff --git a/aioscrapy/db/__init__.py b/aioscrapy/db/__init__.py
index 0f537fe..be9d08a 100644
--- a/aioscrapy/db/__init__.py
+++ b/aioscrapy/db/__init__.py
@@ -1,14 +1,12 @@
-import logging
from importlib import import_module
from typing import Any
import aioscrapy
from aioscrapy.db.absmanager import AbsDBPoolManager
from aioscrapy.db.aioredis import redis_manager
+from aioscrapy.utils.log import logger
from aioscrapy.utils.misc import load_object
-logger = logging.getLogger(__name__)
-
__all__ = ['db_manager', 'get_pool', 'get_manager']
DB_MODULE_MAP = {
diff --git a/aioscrapy/db/aiomongo.py b/aioscrapy/db/aiomongo.py
index 58385e1..48edf8e 100644
--- a/aioscrapy/db/aiomongo.py
+++ b/aioscrapy/db/aiomongo.py
@@ -1,12 +1,8 @@
-import logging
-
from motor.motor_asyncio import AsyncIOMotorClient
import aioscrapy
from aioscrapy.db.absmanager import AbsDBPoolManager
-logger = logging.getLogger(__name__)
-
class MongoExecutor:
def __init__(self, alias: str, pool_manager: "AioMongoManager"):
diff --git a/aioscrapy/db/aiomysql.py b/aioscrapy/db/aiomysql.py
index 62e813b..a538973 100644
--- a/aioscrapy/db/aiomysql.py
+++ b/aioscrapy/db/aiomysql.py
@@ -1,4 +1,3 @@
-import logging
import socket
from contextlib import asynccontextmanager
@@ -7,8 +6,6 @@
import aioscrapy
from aioscrapy.db.absmanager import AbsDBPoolManager
-logger = logging.getLogger(__name__)
-
class MysqlExecutor:
def __init__(self, alias: str, pool_manager: "AioMysqlPoolManager"):
diff --git a/aioscrapy/db/aiopg.py b/aioscrapy/db/aiopg.py
index e0bb86a..a9821aa 100644
--- a/aioscrapy/db/aiopg.py
+++ b/aioscrapy/db/aiopg.py
@@ -1,4 +1,3 @@
-import logging
from contextlib import asynccontextmanager
from asyncpg.pool import create_pool
@@ -6,8 +5,6 @@
import aioscrapy
from aioscrapy.db.absmanager import AbsDBPoolManager
-logger = logging.getLogger(__name__)
-
class PGExecutor:
def __init__(self, alias: str, pool_manager: "AioPGPoolManager"):
@@ -109,4 +106,5 @@ async def test():
result = await conn.fetch('select 1 ')
print(tuple(result[0]))
+
asyncio.run(test())
diff --git a/aioscrapy/db/aiorabbitmq.py b/aioscrapy/db/aiorabbitmq.py
index d3ccf57..60f71b6 100644
--- a/aioscrapy/db/aiorabbitmq.py
+++ b/aioscrapy/db/aiorabbitmq.py
@@ -1,4 +1,3 @@
-import logging
from contextlib import asynccontextmanager
import aio_pika
@@ -8,8 +7,6 @@
import aioscrapy
from aioscrapy.db.absmanager import AbsDBPoolManager
-logger = logging.getLogger(__name__)
-
class RabbitmqExecutor:
def __init__(self, alias: str, pool_manager: "AioRabbitmqManager"):
diff --git a/aioscrapy/dupefilters/disk.py b/aioscrapy/dupefilters/disk.py
index e668fa4..df5e7fa 100644
--- a/aioscrapy/dupefilters/disk.py
+++ b/aioscrapy/dupefilters/disk.py
@@ -1,9 +1,9 @@
-import logging
import os
from typing import Optional, Set
from aioscrapy import Request, Spider
from aioscrapy.dupefilters import DupeFilterBase
+from aioscrapy.utils.log import logger
from aioscrapy.utils.request import referer_str
@@ -15,7 +15,6 @@ def __init__(self, path: Optional[str] = None, debug: bool = False):
self.debug = debug
self.fingerprints: Set = set()
self.logdupes: bool = True
- self.logger = logging.getLogger(__name__)
if path:
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
self.file.seek(0)
@@ -37,20 +36,20 @@ async def request_seen(self, request: Request) -> bool:
self.file.write(request.fingerprint + '\n')
return False
- def close(self, reason: str = '') -> None:
+ async def close(self, reason: str = '') -> None:
if self.file:
self.file.close()
def log(self, request: Request, spider: Spider):
if self.debug:
- msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
- args = {'request': request, 'referer': referer_str(request)}
- self.logger.debug(msg, args, extra={'spider': spider})
+ logger.debug("Filtered duplicate request: %(request)s (referer: %(referer)s)" % {
+ 'request': request, 'referer': referer_str(request)
+ })
elif self.logdupes:
msg = ("Filtered duplicate request: %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)")
- self.logger.debug(msg, {'request': request}, extra={'spider': spider})
+ logger.debug(msg % {'request': request})
self.logdupes = False
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
diff --git a/aioscrapy/dupefilters/redis.py b/aioscrapy/dupefilters/redis.py
index 28bd258..3fbecca 100644
--- a/aioscrapy/dupefilters/redis.py
+++ b/aioscrapy/dupefilters/redis.py
@@ -1,17 +1,13 @@
-import logging
-
from aioscrapy import Request
from aioscrapy.db import db_manager
from aioscrapy.dupefilters import DupeFilterBase
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class RedisRFPDupeFilter(DupeFilterBase):
"""Request Fingerprint duplicates filter built with Set of Redis"""
- logger = logger
-
def __init__(
self,
server: "redis.asyncio.Redis",
@@ -47,13 +43,12 @@ async def clear(self):
def log(self, request, spider):
if self.debug:
- msg = "Filtered duplicate request: %(request)s"
- self.logger.debug(msg, {'request': request}, extra={'spider': spider})
+ logger.debug("Filtered duplicate request: %(request)s" % {'request': request})
elif self.logdupes:
msg = ("Filtered duplicate request %(request)s"
" - no more duplicates will be shown"
" (see DUPEFILTER_DEBUG to show all duplicates)")
- self.logger.debug(msg, {'request': request}, extra={'spider': spider})
+ logger.debug(msg % {'request': request})
self.logdupes = False
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
diff --git a/aioscrapy/http/request/__init__.py b/aioscrapy/http/request/__init__.py
index 1d6a71f..8ec2f78 100644
--- a/aioscrapy/http/request/__init__.py
+++ b/aioscrapy/http/request/__init__.py
@@ -63,8 +63,8 @@ def __init__(
self.dont_filter = dont_filter
self.use_proxy = use_proxy
- self._meta = dict(meta) if meta else None
- if fingerprint:
+ self._meta = dict(meta) if meta else {}
+ if fingerprint is not None:
self._set_fingerprint(fingerprint)
self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
self.flags = [] if flags is None else list(flags)
diff --git a/aioscrapy/libs/downloader/retry.py b/aioscrapy/libs/downloader/retry.py
index ff15597..e8c02ae 100644
--- a/aioscrapy/libs/downloader/retry.py
+++ b/aioscrapy/libs/downloader/retry.py
@@ -9,7 +9,6 @@
Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non failed) pages.
"""
-from logging import getLogger, Logger
from typing import Optional, Union
try:
@@ -21,30 +20,35 @@
try:
from aiohttp.client_exceptions import ClientError
- NEED_RETRY_ERROR += (ClientError, )
+
+ NEED_RETRY_ERROR += (ClientError,)
except ImportError:
pass
try:
from httpx import HTTPError as HttpxError
- NEED_RETRY_ERROR += (HttpxError, )
+
+ NEED_RETRY_ERROR += (HttpxError,)
except ImportError:
pass
try:
from pyhttpx.exception import BaseExpetion as PyHttpxError
+
NEED_RETRY_ERROR += (PyHttpxError,)
except ImportError:
pass
try:
from requests.exceptions import RequestException as RequestsError
+
NEED_RETRY_ERROR += (RequestsError,)
except ImportError:
pass
try:
from playwright._impl._api_types import Error as PlaywrightError
+
NEED_RETRY_ERROR += (PlaywrightError,)
except ImportError:
pass
@@ -53,8 +57,7 @@
from aioscrapy.http.request import Request
from aioscrapy.spiders import Spider
from aioscrapy.utils.python import global_object_name
-
-retry_logger = getLogger(__name__)
+from aioscrapy.utils.log import logger as retry_logger
def get_retry_request(
@@ -64,7 +67,7 @@ def get_retry_request(
reason: Union[str, Exception] = 'unspecified',
max_retry_times: Optional[int] = None,
priority_adjust: Optional[int] = None,
- logger: Logger = retry_logger,
+ logger=retry_logger,
stats_base_key: str = 'retry',
):
"""
@@ -84,9 +87,9 @@ def get_retry_request(
reason = global_object_name(reason.__class__)
logger.info(
- "Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
- {'request': request, 'retry_times': retry_times, 'reason': reason},
- extra={'spider': spider}
+ "Retrying %(request)s (failed %(retry_times)d times): %(reason)s" % {
+ 'request': request, 'retry_times': retry_times, 'reason': reason
+ },
)
new_request = request.copy()
new_request.meta['retry_times'] = retry_times
@@ -102,9 +105,7 @@ def get_retry_request(
stats.inc_value(f'{stats_base_key}/max_reached')
logger.error(
"Gave up retrying %(request)s (failed %(retry_times)d times): "
- "%(reason)s",
- {'request': request, 'retry_times': retry_times, 'reason': reason},
- extra={'spider': spider},
+ "%(reason)s" % {'request': request, 'retry_times': retry_times, 'reason': reason}
)
return None
diff --git a/aioscrapy/libs/downloader/stats.py b/aioscrapy/libs/downloader/stats.py
index d9134c4..136850f 100644
--- a/aioscrapy/libs/downloader/stats.py
+++ b/aioscrapy/libs/downloader/stats.py
@@ -17,14 +17,12 @@ def from_crawler(cls, crawler):
def process_request(self, request, spider):
self.stats.inc_value('downloader/request_count', spider=spider)
self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider)
- reqlen = len(request_httprepr(request))
- self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
+ self.stats.inc_value('downloader/request_bytes', len(request_httprepr(request)), spider=spider)
def process_response(self, request, response, spider):
self.stats.inc_value('downloader/response_count', spider=spider)
self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider)
- reslen = len(response.body)
- self.stats.inc_value('downloader/response_bytes', reslen, spider=spider)
+ self.stats.inc_value('downloader/response_bytes', len(response.body), spider=spider)
return response
def process_exception(self, request, exception, spider):
diff --git a/aioscrapy/libs/extensions/closespider.py b/aioscrapy/libs/extensions/closespider.py
index 3cf223a..b0071a0 100644
--- a/aioscrapy/libs/extensions/closespider.py
+++ b/aioscrapy/libs/extensions/closespider.py
@@ -9,6 +9,7 @@
from aioscrapy import signals
from aioscrapy.exceptions import NotConfigured
+from aioscrapy.utils.tools import create_task
class CloseSpider:
@@ -46,24 +47,24 @@ def from_crawler(cls, crawler):
async def error_count(self, failure, response, spider):
self.counter['errorcount'] += 1
if self.counter['errorcount'] == self.close_on['errorcount']:
- asyncio.create_task(self.crawler.engine.stop(reason='closespider_errorcount'))
+ create_task(self.crawler.engine.stop(reason='closespider_errorcount'))
async def page_count(self, response, request, spider):
self.counter['pagecount'] += 1
if self.counter['pagecount'] == self.close_on['pagecount']:
- asyncio.create_task(self.crawler.engine.stop(reason='closespider_pagecount'))
+ create_task(self.crawler.engine.stop(reason='closespider_pagecount'))
async def timeout_close(self, spider):
async def close():
await asyncio.sleep(self.close_on['timeout'])
- asyncio.create_task(self.crawler.engine.stop(reason='closespider_timeout'))
+ create_task(self.crawler.engine.stop(reason='closespider_timeout'))
- self.task = asyncio.create_task(close())
+ self.task = create_task(close())
async def item_scraped(self, item, spider):
self.counter['itemcount'] += 1
if self.counter['itemcount'] == self.close_on['itemcount']:
- asyncio.create_task(self.crawler.engine.stop(reason='closespider_itemcount'))
+ create_task(self.crawler.engine.stop(reason='closespider_itemcount'))
def spider_closed(self, spider):
if self.task and not self.task.done():
diff --git a/aioscrapy/libs/extensions/corestats.py b/aioscrapy/libs/extensions/corestats.py
index 206b408..bcbef20 100644
--- a/aioscrapy/libs/extensions/corestats.py
+++ b/aioscrapy/libs/extensions/corestats.py
@@ -23,15 +23,15 @@ def from_crawler(cls, crawler):
return o
def spider_opened(self, spider):
- self.start_time = datetime.utcnow()
- self.stats.set_value('start_time', self.start_time, spider=spider)
+ self.start_time = datetime.now()
+ self.stats.set_value('start_time', str(self.start_time), spider=spider)
def spider_closed(self, spider, reason):
- finish_time = datetime.utcnow()
+ finish_time = datetime.now()
elapsed_time = finish_time - self.start_time
elapsed_time_seconds = elapsed_time.total_seconds()
self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider)
- self.stats.set_value('finish_time', finish_time, spider=spider)
+ self.stats.set_value('finish_time', str(finish_time), spider=spider)
self.stats.set_value('finish_reason', reason, spider=spider)
def item_scraped(self, item, spider):
diff --git a/aioscrapy/libs/extensions/logstats.py b/aioscrapy/libs/extensions/logstats.py
index 2c57bfd..837f805 100644
--- a/aioscrapy/libs/extensions/logstats.py
+++ b/aioscrapy/libs/extensions/logstats.py
@@ -1,10 +1,9 @@
-import logging
import asyncio
-from aioscrapy.exceptions import NotConfigured
from aioscrapy import signals
-
-logger = logging.getLogger(__name__)
+from aioscrapy.exceptions import NotConfigured
+from aioscrapy.utils.log import logger
+from aioscrapy.utils.tools import create_task
class LogStats:
@@ -15,6 +14,8 @@ def __init__(self, stats, interval=60.0):
self.interval = interval
self.multiplier = 60.0 / self.interval
self.task = None
+ self.pagesprev = 0
+ self.itemsprev = 0
@classmethod
def from_crawler(cls, crawler):
@@ -27,9 +28,7 @@ def from_crawler(cls, crawler):
return o
def spider_opened(self, spider):
- self.pagesprev = 0
- self.itemsprev = 0
- self.task = asyncio.create_task(self.log(spider))
+ self.task = create_task(self.log(spider))
async def log(self, spider):
await asyncio.sleep(self.interval)
@@ -43,8 +42,8 @@ async def log(self, spider):
"scraped %(items)d items (at %(itemrate)d items/min)")
log_args = {'pages': pages, 'pagerate': prate, 'spider_name': spider.name,
'items': items, 'itemrate': irate}
- logger.info(msg, log_args, extra={'spider': spider})
- self.task = asyncio.create_task(self.log(spider))
+ logger.info(msg % log_args)
+ self.task = create_task(self.log(spider))
def spider_closed(self, spider, reason):
if self.task and not self.task.done():
diff --git a/aioscrapy/libs/extensions/metric.py b/aioscrapy/libs/extensions/metric.py
new file mode 100644
index 0000000..b72d80a
--- /dev/null
+++ b/aioscrapy/libs/extensions/metric.py
@@ -0,0 +1,138 @@
+import asyncio
+import os
+import platform
+import random
+import time
+
+from aiohttp import ClientSession
+
+from aioscrapy import Settings
+from aioscrapy import signals
+from aioscrapy.utils.log import _logger, logger
+from aioscrapy.utils.tools import create_task
+
+
+class InfluxBase:
+ @staticmethod
+ def format_metric(metric_name, value, spider_name, location, measurement=None):
+ measurement = measurement or metric_name
+ return f"{measurement},spider_name={spider_name},location={location} {metric_name}={value} {time.time_ns() + int(random.random() * 100000)}"
+
+ async def record(self, obj: "Metric"):
+ raise NotImplementedError
+
+ async def close(self):
+ pass
+
+
+class InfluxHttp(InfluxBase):
+ def __init__(self, spider_name: str, settings: Settings):
+ influxdb_url = settings.get('METRIC_INFLUXDB_URL')
+ token = settings.get('METRIC_INFLUXDB_TOKEN')
+ location = settings.get('METRIC_LOCATION')
+ self.location = location or f"{platform.node()}_{os.getpid()}"
+ self.spider_name = spider_name
+ self.session = ClientSession(headers={
+ "Authorization": f"Token {token}",
+ "Content-Type": "text/plain; charset=utf-8",
+ "Accept": "application/json",
+ })
+ self.url = influxdb_url
+ self.lock = asyncio.Lock()
+
+ async def emit(self, data):
+ async with self.session.post(self.url, data=data) as response:
+ await response.read()
+ logger.debug(f"emit metric success<{response.status}>: \n{data}")
+
+ async def record(self, obj: "Metric"):
+ async with self.lock:
+ data = ''
+ for metric_name in obj.metrics.keys():
+ current_cnt = obj.stats.get_value(metric_name, 0)
+ if not isinstance(current_cnt, (int, float)):
+ continue
+ cnt = current_cnt - obj.prev.get(metric_name, 0)
+ if cnt:
+ data += self.format_metric(metric_name.replace('/', '-'), cnt, self.spider_name,
+ self.location) + '\n'
+ obj.prev[metric_name] = current_cnt
+ if data:
+ await self.emit(data)
+
+ async def close(self):
+ if self.session is not None:
+ await self.session.close()
+ await asyncio.sleep(0.250)
+
+
+class InfluxLog(InfluxBase):
+ def __init__(self, spider_name: str, settings: Settings):
+ location = settings.get('METRIC_LOCATION')
+ self.location = location or f"{platform.node()}_{os.getpid()}"
+ self.spider_name = spider_name
+
+ log_args = settings.getdict('METRIC_LOG_ARGS')
+ log_args.update(dict(
+ filter=lambda record: record["extra"].get("metric") is not None,
+ format="{time:YYYY-MM-DD HH:mm:ss.SSS} {message}",
+ encoding="utf-8"
+ ))
+ for k, v in dict(
+ sink=f'{spider_name}.metric', level="INFO", rotation='20MB',
+ retention=3
+ ).items():
+ log_args.setdefault(k, v)
+
+ _logger.add(**log_args)
+ self.log = _logger.bind(metric="metric")
+
+ async def record(self, obj: "Metric"):
+ for metric_name in obj.metrics.keys():
+ current_cnt = obj.stats.get_value(metric_name, 0)
+ if not isinstance(current_cnt, (int, float)):
+ continue
+ prev_cnt = obj.prev.get(metric_name, 0)
+ cnt = current_cnt - prev_cnt
+ if cnt:
+ msg = self.format_metric(metric_name.replace('/', '-'), cnt, self.spider_name, self.location)
+ self.log.info(msg)
+ logger.debug(msg)
+ obj.prev[metric_name] = current_cnt
+
+
+class Metric:
+ """Log Metric scraping stats periodically"""
+
+ def __init__(self, stats, spider_name, settings, interval=10.0):
+ if settings.get('METRIC_INFLUXDB_URL'):
+ self.influx = InfluxHttp(spider_name, settings)
+ else:
+ self.influx = InfluxLog(spider_name, settings)
+ self.stats = stats
+ self.metrics = settings.getdict('METRICS') or self.stats._stats
+ self.interval = interval
+ self.task = None
+ self.prev = {}
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ interval = crawler.settings.getfloat('METRIC_INTERVAL', 10.0)
+ o = cls(crawler.stats, crawler.spider.name, crawler.settings, interval)
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+ return o
+
+ def spider_opened(self, spider):
+ self.task = create_task(self.run(spider))
+
+ async def run(self, spider):
+ await asyncio.sleep(self.interval)
+ await self.influx.record(self)
+ self.task = create_task(self.run(spider))
+
+ async def spider_closed(self, spider, reason):
+ if self.task and not self.task.done():
+ self.task.cancel()
+ await self.influx.record(self)
+ await self.influx.close()
diff --git a/aioscrapy/libs/extensions/throttle.py b/aioscrapy/libs/extensions/throttle.py
index f485f78..b614952 100644
--- a/aioscrapy/libs/extensions/throttle.py
+++ b/aioscrapy/libs/extensions/throttle.py
@@ -1,9 +1,6 @@
-import logging
-
-from aioscrapy.exceptions import NotConfigured
from aioscrapy import signals
-
-logger = logging.getLogger(__name__)
+from aioscrapy.exceptions import NotConfigured
+from aioscrapy.utils.log import logger
class AutoThrottle:
@@ -52,13 +49,11 @@ def _response_downloaded(self, response, request, spider):
logger.info(
"slot: %(slot)s | conc:%(concurrency)2d | "
"delay:%(delay)5d ms (%(delaydiff)+d) | "
- "latency:%(latency)5d ms | size:%(size)6d bytes",
- {
+ "latency:%(latency)5d ms | size:%(size)6d bytes" % {
'slot': key, 'concurrency': conc,
'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
'latency': latency * 1000, 'size': size
- },
- extra={'spider': spider}
+ }
)
def _get_slot(self, request, spider):
diff --git a/aioscrapy/libs/pipelines/__init__.py b/aioscrapy/libs/pipelines/__init__.py
index 90dcf44..b50e956 100644
--- a/aioscrapy/libs/pipelines/__init__.py
+++ b/aioscrapy/libs/pipelines/__init__.py
@@ -1,9 +1,7 @@
import asyncio
-import logging
-from typing import Optional
-
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
+from aioscrapy.utils.tools import create_task
class SqlFormat:
@@ -107,12 +105,12 @@ def __init__(self, settings, db_type: str):
self.item_save_key: str = f'__{db_type}__'
async def open_spider(self, spider):
- asyncio.create_task(self.save_heartbeat())
+ create_task(self.save_heartbeat())
async def save_heartbeat(self):
while self.running:
await asyncio.sleep(self.save_cache_interval)
- asyncio.create_task(self.save_all())
+ create_task(self.save_all())
async def process_item(self, item, spider):
save_info = item.pop(self.item_save_key, None)
diff --git a/aioscrapy/libs/pipelines/sink/_csv.py b/aioscrapy/libs/pipelines/sink/_csv.py
index 478cebc..75738be 100644
--- a/aioscrapy/libs/pipelines/sink/_csv.py
+++ b/aioscrapy/libs/pipelines/sink/_csv.py
@@ -1,9 +1,8 @@
import asyncio
import csv
-import logging
from typing import Optional
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class CsvSinkMixin:
@@ -27,7 +26,7 @@ async def save_item(
def _get_writer(self, filename, item):
writer, *_ = self.csv_writer.get(filename, (None, None))
if writer is None:
- file = open(filename, 'w', encoding="UTF8", newline='')
+ file = open(filename, 'w', encoding="UTF8", newline='')
writer = csv.writer(file)
writer.writerow(item.keys())
self.csv_writer[filename] = (writer, file)
diff --git a/aioscrapy/libs/pipelines/sink/execl.py b/aioscrapy/libs/pipelines/sink/execl.py
index 6a31593..9d07e8c 100644
--- a/aioscrapy/libs/pipelines/sink/execl.py
+++ b/aioscrapy/libs/pipelines/sink/execl.py
@@ -1,5 +1,4 @@
import asyncio
-import logging
import math
from io import BytesIO
from typing import Tuple, Optional
@@ -8,7 +7,8 @@
import xlsxwriter
from PIL import Image, ImageFile
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
+
try:
resample = Image.LANCZOS
except:
@@ -106,15 +106,18 @@ def _get_write_class(self, filename, item, sheet='sheet1', **options):
self.y_cache[filename_sheet]
def close_execl(self, filename=None):
- if filename in self.wb_cache.keys():
- logger.info(f'Closing Execl: {filename}')
- self.wb_cache.pop(filename, None).close()
- for filename_sheet in list(self.ws_cache.keys()):
- if not filename_sheet.startswith(filename):
- continue
- self.ws_cache.pop(filename_sheet, None)
- self.y_cache.pop(filename_sheet, None)
- self.fields_cache.pop(filename_sheet, None)
+ if filename not in self.wb_cache:
+ return
+
+ logger.info(f'Closing Execl: {filename}')
+ if wb := self.wb_cache.pop(filename):
+ wb.close()
+ for filename_sheet in list(self.ws_cache.keys()):
+ if not filename_sheet.startswith(filename):
+ continue
+ self.ws_cache.pop(filename_sheet, None)
+ self.y_cache.pop(filename_sheet, None)
+ self.fields_cache.pop(filename_sheet, None)
def close(self):
for filename in list(self.wb_cache.keys()):
@@ -144,7 +147,6 @@ async def close_spider(self, spider):
if __name__ == '__main__':
-
class TestSpider:
name = 'TestSpider'
diff --git a/aioscrapy/libs/pipelines/sink/mongo.py b/aioscrapy/libs/pipelines/sink/mongo.py
index 9f42f0a..a53a3dd 100644
--- a/aioscrapy/libs/pipelines/sink/mongo.py
+++ b/aioscrapy/libs/pipelines/sink/mongo.py
@@ -1,9 +1,7 @@
-import logging
-
from aioscrapy.db import db_manager
from aioscrapy.libs.pipelines import DBPipelineBase
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class MongoPipeline(DBPipelineBase):
diff --git a/aioscrapy/libs/pipelines/sink/mysql.py b/aioscrapy/libs/pipelines/sink/mysql.py
index 37334b9..c4f2955 100644
--- a/aioscrapy/libs/pipelines/sink/mysql.py
+++ b/aioscrapy/libs/pipelines/sink/mysql.py
@@ -1,9 +1,7 @@
-import logging
-
from aioscrapy.db import db_manager
from aioscrapy.libs.pipelines import DBPipelineBase
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class MysqlPipeline(DBPipelineBase):
diff --git a/aioscrapy/libs/pipelines/sink/pg.py b/aioscrapy/libs/pipelines/sink/pg.py
index 5395743..be81919 100644
--- a/aioscrapy/libs/pipelines/sink/pg.py
+++ b/aioscrapy/libs/pipelines/sink/pg.py
@@ -1,9 +1,7 @@
-import logging
-
from aioscrapy.db import db_manager
from aioscrapy.libs.pipelines import DBPipelineBase
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class PGPipeline(DBPipelineBase):
diff --git a/aioscrapy/libs/spider/depth.py b/aioscrapy/libs/spider/depth.py
index ffbd9d6..a94b346 100644
--- a/aioscrapy/libs/spider/depth.py
+++ b/aioscrapy/libs/spider/depth.py
@@ -4,11 +4,9 @@
See documentation in docs/topics/spider-middleware.rst
"""
-import logging
-
from aioscrapy.http import Request
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class DepthMiddleware:
@@ -35,11 +33,9 @@ def _filter(request):
if self.prio:
request.priority -= depth * self.prio
if self.maxdepth and depth > self.maxdepth:
- logger.debug(
- "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
- {'maxdepth': self.maxdepth, 'requrl': request.url},
- extra={'spider': spider}
- )
+ logger.debug("Ignoring link (depth > %(maxdepth)d): %(requrl)s " % {
+ 'maxdepth': self.maxdepth, 'requrl': request.url
+ })
return False
else:
if self.verbose_stats:
diff --git a/aioscrapy/libs/spider/httperror.py b/aioscrapy/libs/spider/httperror.py
index c6f2869..afe791f 100644
--- a/aioscrapy/libs/spider/httperror.py
+++ b/aioscrapy/libs/spider/httperror.py
@@ -3,11 +3,10 @@
See documentation in docs/topics/spider-middleware.rst
"""
-import logging
from aioscrapy.exceptions import IgnoreRequest
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class HttpError(IgnoreRequest):
@@ -49,8 +48,7 @@ async def process_spider_exception(self, response, exception, spider):
spider.crawler.stats.inc_value(
f'httperror/response_ignored_status_count/{response.status}'
)
- logger.info(
- "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
- {'response': response}, extra={'spider': spider},
- )
+ logger.info("Ignoring response %(response)r: HTTP status code is not handled or not allowed" % {
+ 'response': response
+ })
return []
diff --git a/aioscrapy/libs/spider/offsite.py b/aioscrapy/libs/spider/offsite.py
index 172c806..67666e7 100644
--- a/aioscrapy/libs/spider/offsite.py
+++ b/aioscrapy/libs/spider/offsite.py
@@ -4,14 +4,12 @@
See documentation in docs/topics/spider-middleware.rst
"""
import re
-import logging
import warnings
from aioscrapy import signals
from aioscrapy.http import Request
from aioscrapy.utils.httpobj import urlparse_cached
-
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class OffsiteMiddleware:
@@ -35,8 +33,8 @@ async def process_spider_output(self, response, result, spider):
if domain and domain not in self.domains_seen:
self.domains_seen.add(domain)
logger.debug(
- "Filtered offsite request to %(domain)r: %(request)s",
- {'domain': domain, 'request': x}, extra={'spider': spider})
+ "Filtered offsite request to %(domain)r: %(request)s" % {'domain': domain, 'request': x}
+ )
self.stats.inc_value('offsite/domains', spider=spider)
self.stats.inc_value('offsite/filtered', spider=spider)
else:
diff --git a/aioscrapy/libs/spider/urllength.py b/aioscrapy/libs/spider/urllength.py
index b6b8bc3..c1649da 100644
--- a/aioscrapy/libs/spider/urllength.py
+++ b/aioscrapy/libs/spider/urllength.py
@@ -4,12 +4,9 @@
See documentation in docs/topics/spider-middleware.rst
"""
-import logging
-
-from aioscrapy.http import Request
from aioscrapy.exceptions import NotConfigured
-
-logger = logging.getLogger(__name__)
+from aioscrapy.http import Request
+from aioscrapy.utils.log import logger
class UrlLengthMiddleware:
@@ -28,9 +25,9 @@ async def process_spider_output(self, response, result, spider):
def _filter(request):
if isinstance(request, Request) and len(request.url) > self.maxlength:
logger.info(
- "Ignoring link (url length > %(maxlength)d): %(url)s ",
- {'maxlength': self.maxlength, 'url': request.url},
- extra={'spider': spider}
+ "Ignoring link (url length > %(maxlength)d): %(url)s " % {
+ 'maxlength': self.maxlength, 'url': request.url
+ }
)
spider.crawler.stats.inc_value('urllength/request_ignored_count', spider=spider)
return False
diff --git a/aioscrapy/logformatter.py b/aioscrapy/logformatter.py
index 673d1c5..2534863 100644
--- a/aioscrapy/logformatter.py
+++ b/aioscrapy/logformatter.py
@@ -1,5 +1,4 @@
import os
-import logging
from aioscrapy.utils.request import referer_str
@@ -19,9 +18,8 @@ def crawled(request, response, spider):
request_flags = f' {str(request.flags)}' if request.flags else ''
response_flags = f' {str(response.flags)}' if response.flags else ''
return {
- 'level': logging.DEBUG,
- 'msg': CRAWLEDMSG,
- 'args': {
+ '_Logger__level': "DEBUG",
+ '_Logger__message': CRAWLEDMSG % {
'status': response.status,
'request': request,
'request_flags': request_flags,
@@ -36,9 +34,8 @@ def scraped(item, response, spider):
"""Logs a message when an item is scraped by a spider."""
src = response
return {
- 'level': logging.DEBUG,
- 'msg': SCRAPEDMSG,
- 'args': {
+ '_Logger__level': "DEBUG",
+ '_Logger__message': SCRAPEDMSG % {
'src': src,
'item': item,
}
@@ -48,9 +45,8 @@ def scraped(item, response, spider):
def dropped(item, exception, response, spider):
"""Logs a message when an item is dropped while it is passing through the item pipeline."""
return {
- 'level': logging.WARNING,
- 'msg': DROPPEDMSG,
- 'args': {
+ '_Logger__level': "WARNING",
+ '_Logger__message': DROPPEDMSG % {
'exception': exception,
'item': item,
}
@@ -64,9 +60,8 @@ def item_error(item, exception, response, spider):
.. versionadded:: 2.0
"""
return {
- 'level': logging.ERROR,
- 'msg': ITEMERRORMSG,
- 'args': {
+ '_Logger__level': "ERROR",
+ '_Logger__message': ITEMERRORMSG % {
'item': item,
}
}
@@ -78,9 +73,8 @@ def spider_error(failure, request, response, spider):
.. versionadded:: 2.0
"""
return {
- 'level': logging.ERROR,
- 'msg': SPIDERERRORMSG,
- 'args': {
+ '_Logger__level': "ERROR",
+ '_Logger__message': SPIDERERRORMSG % {
'request': request,
'referer': referer_str(request),
}
@@ -100,9 +94,8 @@ def download_error(failure, request, spider, errmsg=None):
else:
msg = DOWNLOADERRORMSG_SHORT
return {
- 'level': logging.ERROR,
- 'msg': msg,
- 'args': args,
+ '_Logger__level': "ERROR",
+ '_Logger__message': msg % args,
}
@classmethod
diff --git a/aioscrapy/middleware/absmanager.py b/aioscrapy/middleware/absmanager.py
index 5c996c2..89cb68b 100644
--- a/aioscrapy/middleware/absmanager.py
+++ b/aioscrapy/middleware/absmanager.py
@@ -1,14 +1,12 @@
-import logging
import pprint
from abc import ABCMeta, abstractmethod
from collections import defaultdict, deque
from aioscrapy.exceptions import NotConfigured
+from aioscrapy.utils.log import logger
from aioscrapy.utils.misc import load_instance
from aioscrapy.utils.tools import call_helper
-logger = logging.getLogger(__name__)
-
class AbsMiddlewareManager(object, metaclass=ABCMeta):
"""Base class for implementing middleware managers"""
@@ -38,14 +36,9 @@ async def from_settings(cls, settings, crawler=None):
except NotConfigured as e:
if e.args:
clsname = clspath.split('.')[-1]
- logger.warning("Disabled %(clsname)s: %(eargs)s",
- {'clsname': clsname, 'eargs': e.args[0]},
- extra={'crawler': crawler})
-
- logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
- {'componentname': cls.component_name,
- 'enabledlist': pprint.pformat(enabled)},
- extra={'crawler': crawler})
+ logger.warning("Disabled %(clsname)s: %(eargs)s" % {'clsname': clsname, 'eargs': e.args[0]})
+
+ logger.info(f"Enabled {cls.component_name}s:\n{pprint.pformat(enabled)}")
return cls(*middlewares)
@classmethod
diff --git a/aioscrapy/middleware/downloader.py b/aioscrapy/middleware/downloader.py
index e532d51..188dfdd 100644
--- a/aioscrapy/middleware/downloader.py
+++ b/aioscrapy/middleware/downloader.py
@@ -1,13 +1,9 @@
-import logging
-
from aioscrapy.exceptions import _InvalidOutput
from aioscrapy.http import Request, Response
from aioscrapy.middleware.absmanager import AbsMiddlewareManager
from aioscrapy.utils.conf import build_component_list
from aioscrapy.utils.tools import call_helper
-logger = logging.getLogger(__name__)
-
class DownloaderMiddlewareManager(AbsMiddlewareManager):
component_name = 'downloader middleware'
@@ -71,4 +67,3 @@ async def process_exception(self, spider, request, exception):
if response:
return response
return exception
-
diff --git a/aioscrapy/proxy/__init__.py b/aioscrapy/proxy/__init__.py
index 66b14bd..48e49b8 100644
--- a/aioscrapy/proxy/__init__.py
+++ b/aioscrapy/proxy/__init__.py
@@ -1,10 +1,8 @@
-import logging
from abc import ABCMeta, abstractmethod
+from aioscrapy.utils.log import logger
from aioscrapy.utils.python import global_object_name
-logger = logging.getLogger('aioscrapy.proxy')
-
class AbsProxy(metaclass=ABCMeta):
def __init__(self, settings):
diff --git a/aioscrapy/proxy/redis.py b/aioscrapy/proxy/redis.py
index ece930a..07572e5 100644
--- a/aioscrapy/proxy/redis.py
+++ b/aioscrapy/proxy/redis.py
@@ -1,12 +1,10 @@
import asyncio
import time
from typing import Optional
-import logging
from aioscrapy.db import db_manager
from aioscrapy.proxy import AbsProxy
-
-logger = logging.getLogger('aioscrapy.proxy')
+from aioscrapy.utils.log import logger
class RedisProxy(AbsProxy):
diff --git a/aioscrapy/queue/redis.py b/aioscrapy/queue/redis.py
index d79f20c..eb482fc 100644
--- a/aioscrapy/queue/redis.py
+++ b/aioscrapy/queue/redis.py
@@ -1,4 +1,3 @@
-import logging
from abc import ABC
from typing import Optional
@@ -8,8 +7,6 @@
from aioscrapy.serializer import AbsSerializer
from aioscrapy.utils.misc import load_object
-logger = logging.getLogger(__name__)
-
class RedisQueueBase(AbsQueue, ABC):
inc_key = 'scheduler/enqueued/redis'
@@ -96,8 +93,8 @@ async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]:
stop = count - 1 if count - 1 > 0 else 0
results, _ = await (
pipe.zrange(self.key, 0, stop)
- .zremrangebyrank(self.key, 0, stop)
- .execute()
+ .zremrangebyrank(self.key, 0, stop)
+ .execute()
)
for result in results:
yield self._decode_request(result)
diff --git a/aioscrapy/settings/default_settings.py b/aioscrapy/settings/default_settings.py
index 0ea4a52..97a24e1 100644
--- a/aioscrapy/settings/default_settings.py
+++ b/aioscrapy/settings/default_settings.py
@@ -106,7 +106,7 @@
'aioscrapy.libs.extensions.corestats.CoreStats': 0,
'aioscrapy.libs.extensions.closespider.CloseSpider': 0,
'aioscrapy.libs.extensions.logstats.LogStats': 0,
- 'aioscrapy.libs.extensions.throttle.AutoThrottle': 0,
+ # 'aioscrapy.libs.extensions.throttle.AutoThrottle': 0,
}
FILES_STORE_S3_ACL = 'private'
@@ -122,24 +122,18 @@
LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
-LOG_MAX_BYTES = 50 * 1024 * 1024
-LOG_BACKUP_COUNT = 10
+LOG_ROTATION = '20MB'
+LOG_RETENTION = 10
LOG_FORMATTER = 'aioscrapy.logformatter.LogFormatter'
-LOG_FORMAT = '%(asctime)s %(process)d [%(name)s] %(levelname)s: %(message)s'
-LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
-LOG_STDOUT = False
+LOG_FORMAT = "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {process} | {extra[spidername]} | {name}:{function}:{line} | {message}"
+LOG_STDOUT = True
LOG_LEVEL = 'DEBUG'
LOG_FILE = None
-LOG_SHORT_NAMES = False
SCHEDULER_DEBUG = False
LOGSTATS_INTERVAL = 60.0
-METAREFRESH_ENABLED = True
-METAREFRESH_IGNORE_TAGS = []
-METAREFRESH_MAXDELAY = 100
-
NEWSPIDER_MODULE = ''
RANDOMIZE_DOWNLOAD_DELAY = True
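For reference, a hedged sketch of overriding the new log settings per spider via `custom_settings`. The keys are the ones introduced above and consumed by `configure_logging()` in `aioscrapy/utils/log.py`; the spider name, file name, and values are illustrative only.

```python
from aioscrapy import Spider


class LogDemoSpider(Spider):
    # Hypothetical spider: only the setting keys come from this patch,
    # the values are examples.
    name = "LogDemoSpider"
    custom_settings = dict(
        LOG_LEVEL="INFO",
        LOG_FILE="log_demo.log",  # enables the rotating file sink
        LOG_ROTATION="20MB",      # forwarded to loguru's rotation=
        LOG_RETENTION=10,         # forwarded to loguru's retention=
        LOG_STDOUT=True,          # keep the console sink as well
    )
```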
diff --git a/aioscrapy/spiders/__init__.py b/aioscrapy/spiders/__init__.py
index 48bac67..7646b84 100644
--- a/aioscrapy/spiders/__init__.py
+++ b/aioscrapy/spiders/__init__.py
@@ -3,17 +3,16 @@
See documentation in docs/topics/spiders.rst
"""
-import logging
import warnings
from typing import Optional
from aioscrapy import signals
+from aioscrapy.exceptions import DontCloseSpider
from aioscrapy.http.request import Request
from aioscrapy.http.response import Response
-from aioscrapy.utils.url import url_is_from_spider
from aioscrapy.utils.deprecate import method_is_overridden
-from aioscrapy.exceptions import DontCloseSpider
from aioscrapy.utils.tools import call_helper
+from aioscrapy.utils.url import url_is_from_spider
class Spider(object):
@@ -34,20 +33,6 @@ def __init__(self, name=None, **kwargs):
if not hasattr(self, 'start_urls'):
self.start_urls = []
- @property
- def logger(self):
- logger = logging.getLogger(self.name)
- return logging.LoggerAdapter(logger, {'spider': self})
-
- def log(self, message, level=logging.DEBUG, **kw):
- """Log the given message at the given log level
-
- This helper wraps a log call to the logger within the spider, but you
- can use it directly (e.g. Spider.logger.info('msg')) or use any other
- Python logger too.
- """
- self.logger.log(level, message, **kw)
-
@classmethod
async def from_crawler(cls, crawler, *args, **kwargs):
spider = cls(*args, **kwargs)
@@ -62,34 +47,14 @@ def _set_crawler(self, crawler):
crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
async def start_requests(self):
- cls = self.__class__
if not self.start_urls and hasattr(self, 'start_url'):
raise AttributeError(
"Crawling could not start: 'start_urls' not found "
"or empty (but found 'start_url' attribute instead, "
"did you miss an 's'?)")
- if method_is_overridden(cls, Spider, 'make_requests_from_url'):
- warnings.warn(
- "Spider.make_requests_from_url method is deprecated; it "
- "won't be called in future Scrapy releases. Please "
- "override Spider.start_requests method instead "
- f"(see {cls.__module__}.{cls.__name__}).",
- )
- for url in self.start_urls:
- yield self.make_requests_from_url(url)
- else:
- for url in self.start_urls:
- yield Request(url)
-
- def make_requests_from_url(self, url):
- """ This method is deprecated. """
- warnings.warn(
- "Spider.make_requests_from_url method is deprecated: "
- "it will be removed and not be called by the default "
- "Spider.start_requests method in future Scrapy releases. "
- "Please override Spider.start_requests method instead."
- )
- return Request(url, dont_filter=True)
+
+ for url in self.start_urls:
+ yield Request(url)
async def _parse(self, response: Response, **kwargs):
return await call_helper(self.parse, response)
diff --git a/aioscrapy/statscollectors.py b/aioscrapy/statscollectors.py
index a4b3e9f..6b25fbc 100644
--- a/aioscrapy/statscollectors.py
+++ b/aioscrapy/statscollectors.py
@@ -4,7 +4,7 @@
import pprint
import logging
-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger
class StatsCollector:
@@ -43,8 +43,7 @@ def open_spider(self, spider):
def close_spider(self, spider, reason):
if self._dump:
- logger.info("Dumping aioscrapy stats:\n" + pprint.pformat(self._stats),
- extra={'spider': spider})
+ logger.info("Dumping aioscrapy stats:\n" + pprint.pformat(self._stats))
self._persist_stats(self._stats, spider)
def _persist_stats(self, stats, spider):
diff --git a/aioscrapy/templates/project/module/middlewares.py.tmpl b/aioscrapy/templates/project/module/middlewares.py.tmpl
index 643e96d..075545f 100644
--- a/aioscrapy/templates/project/module/middlewares.py.tmpl
+++ b/aioscrapy/templates/project/module/middlewares.py.tmpl
@@ -1,6 +1,6 @@
from aioscrapy import signals
-
+from aioscrapy.utils.log import logger
class ${ProjectName}SpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
@@ -46,7 +46,7 @@ class ${ProjectName}SpiderMiddleware:
yield r
async def spider_opened(self, spider):
- spider.logger.info('Spider opened: %s' % spider.name)
+ logger.info('Spider opened: %s' % spider.name)
class ${ProjectName}DownloaderMiddleware:
@@ -93,4 +93,4 @@ class ${ProjectName}DownloaderMiddleware:
pass
async def spider_opened(self, spider):
- spider.logger.info('Spider opened: %s' % spider.name)
+ logger.info('Spider opened: %s' % spider.name)
diff --git a/aioscrapy/utils/log.py b/aioscrapy/utils/log.py
index 5b649bc..bf25e20 100644
--- a/aioscrapy/utils/log.py
+++ b/aioscrapy/utils/log.py
@@ -1,175 +1,35 @@
-import logging
+import asyncio
import sys
import warnings
-from logging.config import dictConfig
-from logging.handlers import RotatingFileHandler
+from typing import Type
-import aioscrapy
-from aioscrapy.exceptions import AioScrapyDeprecationWarning
-from aioscrapy.settings import Settings
-
-
-logger = logging.getLogger(__name__)
-
-
-class TopLevelFormatter(logging.Filter):
- """Keep only top level loggers's name (direct children from root) from
- records.
-
- This filter will replace aioscrapy loggers' names with 'aioscrapy'. This mimics
- the old Scrapy log behaviour and helps shortening long names.
-
- Since it can't be set for just one logger (it won't propagate for its
- children), it's going to be set in the root handler, with a parametrized
- ``loggers`` list where it should act.
- """
-
- def __init__(self, loggers=None):
- self.loggers = loggers or []
-
- def filter(self, record):
- if any(record.name.startswith(logger + '.') for logger in self.loggers):
- record.name = record.name.split('.', 1)[0]
- return True
-
-
-DEFAULT_LOGGING = {
- 'version': 1,
- 'disable_existing_loggers': False,
- 'loggers': {
- 'hpack': {
- 'level': 'ERROR',
- },
- 'aio_pika': {
- 'level': 'ERROR',
- },
- 'aiormq': {
- 'level': 'ERROR',
- },
- 'aioscrapy': {
- 'level': 'DEBUG',
- },
- }
-}
-
-
-def configure_logging(settings=None, install_root_handler=True):
- """
- Initialize logging defaults for Scrapy.
-
- :param settings: settings used to create and configure a handler for the
- root logger (default: None).
- :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``
-
- :param install_root_handler: whether to install root logging handler
- (default: True)
- :type install_root_handler: bool
-
- This function does:
-
- - Route warnings and twisted logging through Python standard logging
- - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
- - Route stdout to log if LOG_STDOUT setting is True
-
- When ``install_root_handler`` is True (default), this function also
- creates a handler for the root logger according to given settings
- (see :ref:`topics-logging-settings`). You can override default options
- using ``settings`` argument. When ``settings`` is empty or None, defaults
- are used.
- """
- if not sys.warnoptions:
- # Route warnings through python logging
- logging.captureWarnings(True)
-
- dictConfig(DEFAULT_LOGGING)
-
- if isinstance(settings, dict) or settings is None:
- settings = Settings(settings)
-
- if settings.getbool('LOG_STDOUT'):
- sys.stdout = StreamLogger(logging.getLogger('stdout'))
+from loguru import logger as _logger
- if install_root_handler:
- install_scrapy_root_handler(settings)
-
-
-def install_scrapy_root_handler(settings):
- global _scrapy_root_handler
-
- if (_scrapy_root_handler is not None
- and _scrapy_root_handler in logging.root.handlers):
- logging.root.removeHandler(_scrapy_root_handler)
- logging.root.setLevel(logging.NOTSET)
- _scrapy_root_handler = _get_handler(settings)
- logging.root.addHandler(_scrapy_root_handler)
-
-
-def get_scrapy_root_handler():
- return _scrapy_root_handler
-
-
-_scrapy_root_handler = None
-
-
-def _get_handler(settings):
- """ Return a log handler object according to settings """
- filename = settings.get('LOG_FILE')
- if filename:
- encoding = settings.get('LOG_ENCODING')
- max_bytes = settings.get('LOG_MAX_BYTES', 50*1024*1024)
- backup_count = settings.get('LOG_BACKUP_COUNT', 10)
- handler = RotatingFileHandler(filename, encoding=encoding, maxBytes=max_bytes, backupCount=backup_count)
- elif settings.getbool('LOG_ENABLED'):
- handler = logging.StreamHandler()
- else:
- handler = logging.NullHandler()
-
- formatter = logging.Formatter(
- fmt=settings.get('LOG_FORMAT'),
- datefmt=settings.get('LOG_DATEFORMAT')
- )
- handler.setFormatter(formatter)
- handler.setLevel(settings.get('LOG_LEVEL'))
- if settings.getbool('LOG_SHORT_NAMES'):
- handler.addFilter(TopLevelFormatter(['aioscrapy']))
- return handler
-
-
-def log_scrapy_info(settings):
- logger.info("Aioscrapy %(version)s started (bot: %(bot)s)",
- {'version': aioscrapy.__version__, 'bot': settings['BOT_NAME']})
-
-
-class StreamLogger:
- """Fake file-like stream object that redirects writes to a logger instance
-
- Taken from:
- https://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
- """
- def __init__(self, logger, log_level=logging.INFO):
- self.logger = logger
- self.log_level = log_level
- self.linebuf = ''
-
- def write(self, buf):
- for line in buf.rstrip().splitlines():
- self.logger.log(self.log_level, line.rstrip())
-
- def flush(self):
- for h in self.logger.handlers:
- h.flush()
+from aioscrapy import Settings, Spider
+from aioscrapy.exceptions import AioScrapyDeprecationWarning
+_logger.remove(0)
-class LogCounterHandler(logging.Handler):
- """Record log levels count into a crawler stats"""
- def __init__(self, crawler, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.crawler = crawler
+def configure_logging(spider: Type[Spider], settings: Settings):
+ formatter = settings.get('LOG_FORMAT')
+ level = settings.get('LOG_LEVEL', 'INFO')
+ enqueue = settings.get('ENQUEUE', True)
+ if settings.get('LOG_STDOUT', True):
+ _logger.add(
+ sys.stderr, format=formatter, level=level, enqueue=enqueue,
+ filter=lambda record: record["extra"].get("spidername") == spider.name,
+ )
- def emit(self, record):
- sname = f'log_count/{record.levelname}'
- self.crawler.stats.inc_value(sname)
+ if filename := settings.get('LOG_FILE'):
+ rotation = settings.get('LOG_ROTATION', '20MB')
+ retention = settings.get('LOG_RETENTION', 10)
+ encoding = settings.get('LOG_ENCODING', 'utf-8')
+ _logger.add(
+ sink=filename, format=formatter, encoding=encoding, level=level,
+ enqueue=enqueue, rotation=rotation, retention=retention,
+ filter=lambda record: record["extra"].get("spidername") == spider.name,
+ )
def logformatter_adapter(logkws):
@@ -187,10 +47,19 @@ def logformatter_adapter(logkws):
'deprecated, use `msg` instead',
AioScrapyDeprecationWarning)
- level = logkws.get('level', logging.INFO)
+ level = logkws.get('level', "INFO")
message = logkws.get('format', logkws.get('msg'))
# NOTE: This also handles 'args' being an empty dict, that case doesn't
# play well in logger.log calls
args = logkws if not logkws.get('args') else logkws['args']
+ return level, message, args
+
+
+class AioScrapyLogger:
+
+ def __getattr__(self, item):
+ spider_name = asyncio.current_task().get_name()
+ return getattr(_logger.bind(spidername=spider_name), item)
+
- return (level, message, args)
+logger = AioScrapyLogger()  # attribute access is proxied to a loguru logger bound with the current spider's name
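For reference, a standalone sketch (loguru and asyncio only, no aioscrapy) of the routing idea behind `AioScrapyLogger`: each record is bound with the current task's name, and each sink filters on it. The spider name "demo" and the shortened format string are illustrative.

```python
import asyncio
import sys

from loguru import logger

logger.remove()  # drop loguru's default handler (the patch removes handler 0)
logger.add(
    sys.stderr,
    format="{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {extra[spidername]} | {message}",
    # Only records bound with this spider's name reach this sink.
    filter=lambda record: record["extra"].get("spidername") == "demo",
)


async def parse_page():
    # What AioScrapyLogger.__getattr__ does per call: bind the current
    # task's name as 'spidername' before emitting the record.
    logger.bind(spidername=asyncio.current_task().get_name()).info("parsed one page")


async def main():
    await asyncio.create_task(parse_page(), name="demo")


asyncio.run(main())
```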
diff --git a/aioscrapy/utils/signal.py b/aioscrapy/utils/signal.py
index 6f1110d..6382ab8 100644
--- a/aioscrapy/utils/signal.py
+++ b/aioscrapy/utils/signal.py
@@ -1,12 +1,12 @@
"""Helper functions for working with signals"""
import asyncio
-import logging
from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers
from pydispatch.robustapply import robustApply
-from aioscrapy.exceptions import StopDownload
-logger = logging.getLogger(__name__)
+from aioscrapy.exceptions import StopDownload
+from aioscrapy.utils.log import logger
+from aioscrapy.utils.tools import create_task
class _IgnoredException(Exception):
@@ -22,10 +22,7 @@ async def robustApplyWrap(f, recv, *args, **kw):
return await result
except (Exception, BaseException) as exc: # noqa: E722
if dont_log is None or not isinstance(exc, dont_log):
- logger.error("Error caught on signal handler: %(receiver)s",
- {'receiver': recv},
- exc_info=exc,
- extra={'spider': spider})
+ logger.exception(f"Error caught on signal handler: {recv}")
return exc
@@ -48,10 +45,12 @@ async def send_catch_log_deferred(signal=Any, sender=Anonymous, *arguments, **na
"""
dfds = []
for receiver in liveReceivers(getAllReceivers(sender, signal)):
- dfds.append(asyncio.create_task(
- robustApplyWrap(robustApply, receiver, signal=signal, sender=sender, *arguments, **named)))
- res = await asyncio.gather(*dfds)
- return res
+ dfds.append(
+ create_task(
+ robustApplyWrap(robustApply, receiver, signal=signal, sender=sender, *arguments, **named)
+ )
+ )
+ return await asyncio.gather(*dfds)
def disconnect_all(signal=Any, sender=Any):
@@ -60,4 +59,3 @@ def disconnect_all(signal=Any, sender=Any):
"""
for receiver in liveReceivers(getAllReceivers(sender, signal)):
disconnect(receiver, signal=signal, sender=sender)
-
diff --git a/aioscrapy/utils/spider.py b/aioscrapy/utils/spider.py
index 7459699..156d8c2 100644
--- a/aioscrapy/utils/spider.py
+++ b/aioscrapy/utils/spider.py
@@ -1,12 +1,8 @@
import inspect
-import logging
from aioscrapy.spiders import Spider
-logger = logging.getLogger(__name__)
-
-
def iter_spider_classes(module):
"""Return an iterator over all spider classes defined in the given module
that can be instantiated (i.e. which have name)
@@ -16,9 +12,9 @@ def iter_spider_classes(module):
for obj in vars(module).values():
if (
- inspect.isclass(obj)
- and issubclass(obj, Spider)
- and obj.__module__ == module.__name__
- and getattr(obj, 'name', None)
+ inspect.isclass(obj)
+ and issubclass(obj, Spider)
+ and obj.__module__ == module.__name__
+ and getattr(obj, 'name', None)
):
yield obj
diff --git a/aioscrapy/utils/tools.py b/aioscrapy/utils/tools.py
index 9364211..0bd856d 100644
--- a/aioscrapy/utils/tools.py
+++ b/aioscrapy/utils/tools.py
@@ -68,3 +68,11 @@ def exec_js_func(js_file_path, func_name, func_params=None, cwd_path=None, cmd_p
js = ''.join(lines)
js_context = execjs.get(name).compile(js, cwd=cwd_path)
return js_context.call(func_name, *func_params)
+
+
+def create_task(coros, name=None):
+    """ Propagate the current task's name (the spider name) to every coroutine scheduled through this helper. """
+    return asyncio.create_task(
+        coros,
+        name=name or asyncio.current_task().get_name()
+    )
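For reference, a standalone sketch of what this helper buys: child tasks created through it inherit the parent task's name, so the spider name set on the root task reaches every coroutine that logs. The spider name "DemoSpider" is illustrative.

```python
import asyncio


def create_task(coro):
    # Same idea as the helper above: name the child after the current task.
    return asyncio.create_task(coro, name=asyncio.current_task().get_name())


async def child():
    print("child runs inside task:", asyncio.current_task().get_name())


async def parent():
    await create_task(child())


async def main():
    await asyncio.create_task(parent(), name="DemoSpider")


asyncio.run(main())  # prints: child runs inside task: DemoSpider
```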
diff --git a/example/singlespider/demo_duplicate.py b/example/singlespider/demo_duplicate.py
index 22e9eef..2fba3f3 100644
--- a/example/singlespider/demo_duplicate.py
+++ b/example/singlespider/demo_duplicate.py
@@ -16,18 +16,18 @@ class DemoDuplicateSpider(Spider):
"CLOSE_SPIDER_ON_IDLE": True,
# 'LOG_FILE': 'test.log',
-        # 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.RFPDupeFilter',  # dedupe via fingerprints stored on local disk
+        'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.RFPDupeFilter',  # dedupe via fingerprints stored on local disk
         # 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.RFPDupeFilter',  # dedupe via a Redis set
-        'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.BloomDupeFilter',  # dedupe via a Bloom filter
+        # 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.BloomDupeFilter',  # dedupe via a Bloom filter
- 'SCHEDULER_QUEUE_CLASS': 'aioscrapy.queue.redis.SpiderPriorityQueue',
+ # 'SCHEDULER_QUEUE_CLASS': 'aioscrapy.queue.redis.SpiderPriorityQueue',
'SCHEDULER_SERIALIZER': 'aioscrapy.serializer.JsonSerializer',
- 'REDIS_ARGS': {
- 'queue': {
- 'url': 'redis://:@192.168.43.165:6379/10',
- 'max_connections': 2,
- }
- }
+ # 'REDIS_ARGS': {
+ # 'queue': {
+ # 'url': 'redis://:@192.168.43.165:6379/10',
+ # 'max_connections': 2,
+ # }
+ # }
}
async def start_requests(self):
diff --git a/example/singlespider/demo_metric.py b/example/singlespider/demo_metric.py
new file mode 100644
index 0000000..0a7705a
--- /dev/null
+++ b/example/singlespider/demo_metric.py
@@ -0,0 +1,66 @@
+import logging
+
+from aioscrapy import Request
+from aioscrapy.spiders import Spider
+
+logger = logging.getLogger(__name__)
+
+
+class DemoMetricSpider(Spider):
+ name = 'DemoMetricSpider'
+ custom_settings = dict(
+ USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
+ # DOWNLOAD_DELAY=3,
+ # RANDOMIZE_DOWNLOAD_DELAY=True,
+ # CONCURRENT_REQUESTS=1,
+ LOG_LEVEL='DEBUG',
+ CLOSE_SPIDER_ON_IDLE=True,
+
+
+ EXTENSIONS={
+ 'aioscrapy.libs.extensions.metric.Metric': 0,
+ },
+
+        # http (intrusive): write metrics straight to InfluxDB 2 over its HTTP write API
+ # METRIC_INFLUXDB_URL="http://127.0.0.1:8086/api/v2/write?org=spiderman&bucket=spider-metric&precision=ns",
+ # METRIC_INFLUXDB_TOKEN="YequFPGDEuukHUG9l8l2nlaatufGQK_UOD7UBpo3KvB8jIg5-cFa89GLXYfgk76M2sHvEtERpAXK7_fMNsBjAA==",
+
+        # log + vector (non-intrusive): write metrics to a separate log file and let Vector collect it and ship to InfluxDB 2
+ METRIC_LOG_ARGS=dict(sink='DemoMetricSpider.metric', rotation='20MB', retention=3)
+ )
+
+ start_urls = ['https://quotes.toscrape.com']
+
+ @staticmethod
+ async def process_request(request, spider):
+ """ request middleware """
+ pass
+
+ @staticmethod
+ async def process_response(request, response, spider):
+ """ response middleware """
+ return response
+
+ @staticmethod
+ async def process_exception(request, exception, spider):
+ """ exception middleware """
+ pass
+
+ async def parse(self, response):
+ for quote in response.css('div.quote'):
+ yield {
+ 'author': quote.xpath('span/small/text()').get(),
+ 'text': quote.css('span.text::text').get(),
+ }
+
+ next_page = response.css('li.next a::attr("href")').get()
+ if next_page is not None:
+ # yield response.follow(next_page, self.parse)
+ yield Request(f"https://quotes.toscrape.com{next_page}", callback=self.parse)
+
+ # async def process_item(self, item):
+ # print(item)
+
+
+if __name__ == '__main__':
+ DemoMetricSpider.start()
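For context, a hedged sketch (not the Metric extension itself) of what the commented-out "http (intrusive)" option boils down to: POSTing InfluxDB line protocol to the `/api/v2/write` endpoint named by `METRIC_INFLUXDB_URL`, authenticated with `METRIC_INFLUXDB_TOKEN`. The URL, token, and metric names below are placeholders.

```python
import asyncio

import aiohttp

# Placeholders mirroring the commented settings in the spider above.
URL = "http://127.0.0.1:8086/api/v2/write?org=spiderman&bucket=spider-metric&precision=ns"
TOKEN = "<influxdb-token>"


async def push_metric(measurement: str, field: str, value: int) -> None:
    # InfluxDB line protocol: measurement,tag=value field=<int>i
    line = f"{measurement},spider=DemoMetricSpider {field}={value}i"
    headers = {"Authorization": f"Token {TOKEN}"}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.post(URL, data=line) as resp:
            resp.raise_for_status()


if __name__ == '__main__':
    asyncio.run(push_metric("spider_stats", "item_scraped_count", 1))
```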
diff --git a/example/singlespider/demo_queue_memory.py b/example/singlespider/demo_queue_memory.py
index 6d48837..2a06d9d 100644
--- a/example/singlespider/demo_queue_memory.py
+++ b/example/singlespider/demo_queue_memory.py
@@ -13,7 +13,7 @@ class DemoMemorySpider(Spider):
# 'DOWNLOAD_DELAY': 3,
# 'RANDOMIZE_DOWNLOAD_DELAY': True,
# 'CONCURRENT_REQUESTS': 1,
- # 'LOG_LEVEL': 'INFO',
+ 'LOG_LEVEL': 'INFO',
"CLOSE_SPIDER_ON_IDLE": True,
}
@@ -40,14 +40,15 @@ async def parse(self, response):
'author': quote.xpath('span/small/text()').get(),
'text': quote.css('span.text::text').get(),
}
+ raise Exception(111)
next_page = response.css('li.next a::attr("href")').get()
if next_page is not None:
# yield response.follow(next_page, self.parse)
yield Request(f"https://quotes.toscrape.com{next_page}", callback=self.parse)
- async def process_item(self, item):
- print(item)
+ # async def process_item(self, item):
+ # print(item)
if __name__ == '__main__':
diff --git a/example/singlespider/demo_request_httpx.py b/example/singlespider/demo_request_httpx.py
index e72008d..31606a4 100644
--- a/example/singlespider/demo_request_httpx.py
+++ b/example/singlespider/demo_request_httpx.py
@@ -53,8 +53,8 @@ async def parse(self, response):
# yield response.follow(next_page, self.parse)
yield Request(f"https://quotes.toscrape.com/{next_page}", callback=self.parse)
- async def process_item(self, item):
- print(item)
+ # async def process_item(self, item):
+ # print(item)
if __name__ == '__main__':
diff --git a/example/singlespider/start.py b/example/singlespider/start.py
index 5118a21..5440aa9 100644
--- a/example/singlespider/start.py
+++ b/example/singlespider/start.py
@@ -4,9 +4,9 @@
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from aioscrapy.process import multi_process_run, single_process_run
-from demo_memory import DemoMemorySpider
-from demo_httpx import DemoHttpxSpider
-from demo_playwright import DemoPlaywrightSpider
+from demo_queue_memory import DemoMemorySpider
+from demo_request_httpx import DemoHttpxSpider
+from demo_request_playwright import DemoPlaywrightSpider
if __name__ == '__main__':
diff --git a/setup.py b/setup.py
index a607ece..44c6b43 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@
'zope.interface>=5.1.0',
"redis>=4.3.1",
"aiomultiprocess>=0.9.0",
+ "loguru>=0.7.0",
]
extras_require = {
"all": [