From d79e7eddd3f283d3d7e8a358fe403660205d4026 Mon Sep 17 00:00:00 2001 From: conlin <995018884@qq.com> Date: Wed, 27 Sep 2023 16:51:28 +0800 Subject: [PATCH] feat: metric & logger --- aioscrapy/VERSION | 2 +- aioscrapy/__init__.py | 2 +- aioscrapy/core/downloader/__init__.py | 12 +- .../core/downloader/handlers/__init__.py | 8 +- aioscrapy/core/downloader/handlers/aiohttp.py | 6 +- aioscrapy/core/downloader/handlers/httpx.py | 6 +- .../handlers/playwright/__init__.py | 4 - aioscrapy/core/downloader/handlers/pyhttpx.py | 4 +- .../core/downloader/handlers/requests.py | 4 +- aioscrapy/core/engine.py | 35 ++- aioscrapy/core/scraper.py | 53 ++--- aioscrapy/crawler.py | 67 +++--- aioscrapy/db/__init__.py | 4 +- aioscrapy/db/aiomongo.py | 4 - aioscrapy/db/aiomysql.py | 3 - aioscrapy/db/aiopg.py | 4 +- aioscrapy/db/aiorabbitmq.py | 3 - aioscrapy/dupefilters/disk.py | 13 +- aioscrapy/dupefilters/redis.py | 11 +- aioscrapy/http/request/__init__.py | 4 +- aioscrapy/libs/downloader/retry.py | 25 +-- aioscrapy/libs/downloader/stats.py | 6 +- aioscrapy/libs/extensions/closespider.py | 11 +- aioscrapy/libs/extensions/corestats.py | 8 +- aioscrapy/libs/extensions/logstats.py | 17 +- aioscrapy/libs/extensions/metric.py | 138 ++++++++++++ aioscrapy/libs/extensions/throttle.py | 13 +- aioscrapy/libs/pipelines/__init__.py | 10 +- aioscrapy/libs/pipelines/sink/_csv.py | 5 +- aioscrapy/libs/pipelines/sink/execl.py | 26 +-- aioscrapy/libs/pipelines/sink/mongo.py | 4 +- aioscrapy/libs/pipelines/sink/mysql.py | 4 +- aioscrapy/libs/pipelines/sink/pg.py | 4 +- aioscrapy/libs/spider/depth.py | 12 +- aioscrapy/libs/spider/httperror.py | 10 +- aioscrapy/libs/spider/offsite.py | 8 +- aioscrapy/libs/spider/urllength.py | 13 +- aioscrapy/logformatter.py | 31 ++- aioscrapy/middleware/absmanager.py | 15 +- aioscrapy/middleware/downloader.py | 5 - aioscrapy/proxy/__init__.py | 4 +- aioscrapy/proxy/redis.py | 4 +- aioscrapy/queue/redis.py | 7 +- aioscrapy/settings/default_settings.py | 16 +- aioscrapy/spiders/__init__.py | 45 +--- aioscrapy/statscollectors.py | 5 +- .../project/module/middlewares.py.tmpl | 6 +- aioscrapy/utils/log.py | 201 +++--------------- aioscrapy/utils/signal.py | 22 +- aioscrapy/utils/spider.py | 12 +- aioscrapy/utils/tools.py | 8 + example/singlespider/demo_duplicate.py | 18 +- example/singlespider/demo_metric.py | 66 ++++++ example/singlespider/demo_queue_memory.py | 7 +- example/singlespider/demo_request_httpx.py | 4 +- example/singlespider/start.py | 6 +- setup.py | 1 + 57 files changed, 474 insertions(+), 572 deletions(-) create mode 100644 aioscrapy/libs/extensions/metric.py create mode 100644 example/singlespider/demo_metric.py diff --git a/aioscrapy/VERSION b/aioscrapy/VERSION index 6261a05..359a5b9 100644 --- a/aioscrapy/VERSION +++ b/aioscrapy/VERSION @@ -1 +1 @@ -1.3.1 \ No newline at end of file +2.0.0 \ No newline at end of file diff --git a/aioscrapy/__init__.py b/aioscrapy/__init__.py index ac097ab..6f2da6a 100644 --- a/aioscrapy/__init__.py +++ b/aioscrapy/__init__.py @@ -13,7 +13,7 @@ __all__ = [ - '__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Crawler' + '__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Crawler', 'Settings' ] diff --git a/aioscrapy/core/downloader/__init__.py b/aioscrapy/core/downloader/__init__.py index 58f2c98..fdf4c24 100644 --- a/aioscrapy/core/downloader/__init__.py +++ b/aioscrapy/core/downloader/__init__.py @@ -1,5 +1,4 @@ import asyncio -import logging import random from abc import abstractmethod from collections import 
deque @@ -16,10 +15,9 @@ from aioscrapy.settings import Settings from aioscrapy.signalmanager import SignalManager from aioscrapy.utils.httpobj import urlparse_cached +from aioscrapy.utils.log import logger from aioscrapy.utils.misc import load_instance -from aioscrapy.utils.tools import call_helper - -logger = logging.getLogger('aioscrapy.downloader') +from aioscrapy.utils.tools import call_helper, create_task class BaseDownloaderMeta(type): @@ -135,7 +133,7 @@ def __init__( self.active: Set[Request] = set() self.slots: dict = {} self.running: bool = True - asyncio.create_task(self._slot_gc(60)) + create_task(self._slot_gc(60)) @classmethod async def from_crawler(cls, crawler) -> "Downloader": @@ -170,13 +168,13 @@ async def _process_queue(self, slot: Slot) -> None: slot.delay_lock = True await asyncio.sleep(penalty) slot.delay_lock = False - asyncio.create_task(self._process_queue(slot)) + create_task(self._process_queue(slot)) return while slot.queue and slot.free_transfer_slots() > 0: request = slot.queue.popleft() slot.transferring.add(request) - asyncio.create_task(self._download(slot, request)) + create_task(self._download(slot, request)) if delay: break diff --git a/aioscrapy/core/downloader/handlers/__init__.py b/aioscrapy/core/downloader/handlers/__init__.py index 75dd718..a135bd9 100644 --- a/aioscrapy/core/downloader/handlers/__init__.py +++ b/aioscrapy/core/downloader/handlers/__init__.py @@ -1,6 +1,5 @@ """Download handlers for different schemes""" -import logging from abc import abstractmethod from typing import Optional @@ -8,11 +7,10 @@ from aioscrapy.exceptions import NotConfigured, NotSupported from aioscrapy.http import HtmlResponse from aioscrapy.utils.httpobj import urlparse_cached +from aioscrapy.utils.log import logger from aioscrapy.utils.misc import load_instance from aioscrapy.utils.python import without_none_values -logger = logging.getLogger(__name__) - class BaseDownloadHandler: @abstractmethod @@ -67,9 +65,7 @@ async def _load_handler(self, scheme: str) -> Optional[BaseDownloadHandler]: self._notconfigured[scheme] = str(ex) return None except Exception as ex: - logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"', - {"clspath": path, "scheme": scheme}, - exc_info=True, extra={'crawler': self._crawler}) + logger.exception(f'Loading "{path}" for scheme "{scheme}"') self._notconfigured[scheme] = str(ex) return None else: diff --git a/aioscrapy/core/downloader/handlers/aiohttp.py b/aioscrapy/core/downloader/handlers/aiohttp.py index c7ba890..353ca5c 100644 --- a/aioscrapy/core/downloader/handlers/aiohttp.py +++ b/aioscrapy/core/downloader/handlers/aiohttp.py @@ -1,5 +1,4 @@ import asyncio -import logging import re import ssl from typing import Optional @@ -10,8 +9,7 @@ from aioscrapy.core.downloader.handlers import BaseDownloadHandler from aioscrapy.http import HtmlResponse from aioscrapy.settings import Settings - -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class AioHttpDownloadHandler(BaseDownloadHandler): @@ -41,7 +39,7 @@ async def download_request(self, request: Request, _) -> HtmlResponse: 'data': request.body or None, 'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get( 'dont_redirect') is None else request.meta.get('dont_redirect'), - 'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 10), + 'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20), } headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS') diff --git 
a/aioscrapy/core/downloader/handlers/httpx.py b/aioscrapy/core/downloader/handlers/httpx.py index 6febb40..dee3ea1 100644 --- a/aioscrapy/core/downloader/handlers/httpx.py +++ b/aioscrapy/core/downloader/handlers/httpx.py @@ -1,4 +1,3 @@ -import logging import ssl import httpx @@ -7,8 +6,7 @@ from aioscrapy.core.downloader.handlers import BaseDownloadHandler from aioscrapy.http import HtmlResponse from aioscrapy.settings import Settings - -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class HttpxDownloadHandler(BaseDownloadHandler): @@ -43,7 +41,7 @@ async def download_request(self, request: Request, _) -> HtmlResponse: 'verify': request.meta.get('verify_ssl', self.verify_ssl), 'follow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get( 'dont_redirect') is None else request.meta.get('dont_redirect'), - 'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 10), + 'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20), }) ssl_ciphers = request.meta.get('TLS_CIPHERS') ssl_protocol = request.meta.get('ssl_protocol', self.ssl_protocol) diff --git a/aioscrapy/core/downloader/handlers/playwright/__init__.py b/aioscrapy/core/downloader/handlers/playwright/__init__.py index bd17f42..e468580 100644 --- a/aioscrapy/core/downloader/handlers/playwright/__init__.py +++ b/aioscrapy/core/downloader/handlers/playwright/__init__.py @@ -1,5 +1,3 @@ -import logging - from aioscrapy import Request from aioscrapy.core.downloader.handlers import BaseDownloadHandler from aioscrapy.http import PlaywrightResponse @@ -8,8 +6,6 @@ from .driverpool import WebDriverPool from .webdriver import PlaywrightDriver -logger = logging.getLogger(__name__) - class PlaywrightHandler(BaseDownloadHandler): def __init__(self, settings: Settings): diff --git a/aioscrapy/core/downloader/handlers/pyhttpx.py b/aioscrapy/core/downloader/handlers/pyhttpx.py index a555ebc..324a156 100644 --- a/aioscrapy/core/downloader/handlers/pyhttpx.py +++ b/aioscrapy/core/downloader/handlers/pyhttpx.py @@ -1,5 +1,4 @@ import asyncio -import logging import pyhttpx @@ -7,8 +6,7 @@ from aioscrapy.core.downloader.handlers import BaseDownloadHandler from aioscrapy.http import HtmlResponse from aioscrapy.settings import Settings - -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class PyhttpxDownloadHandler(BaseDownloadHandler): diff --git a/aioscrapy/core/downloader/handlers/requests.py b/aioscrapy/core/downloader/handlers/requests.py index 12fad88..844bda8 100644 --- a/aioscrapy/core/downloader/handlers/requests.py +++ b/aioscrapy/core/downloader/handlers/requests.py @@ -1,5 +1,4 @@ import asyncio -import logging import requests @@ -7,8 +6,7 @@ from aioscrapy.core.downloader.handlers import BaseDownloadHandler from aioscrapy.http import HtmlResponse from aioscrapy.settings import Settings - -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class RequestsDownloadHandler(BaseDownloadHandler): diff --git a/aioscrapy/core/engine.py b/aioscrapy/core/engine.py index 8f3c7d1..de704e6 100644 --- a/aioscrapy/core/engine.py +++ b/aioscrapy/core/engine.py @@ -1,10 +1,9 @@ # _*_ coding: utf-8 _*_ import asyncio -import logging -from typing import Optional, AsyncGenerator, Union, Callable from asyncio import Queue from asyncio.queues import QueueEmpty +from typing import Optional, AsyncGenerator, Union, Callable import aioscrapy from aioscrapy import Spider @@ -17,9 +16,9 @@ from aioscrapy.http.request import Request from 
aioscrapy.utils.log import logformatter_adapter from aioscrapy.utils.misc import load_instance -from aioscrapy.utils.tools import call_helper +from aioscrapy.utils.tools import call_helper, create_task -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class Slot: @@ -72,7 +71,7 @@ async def start( while not self.finish: self.running and await self._next_request() await asyncio.sleep(1) - self.enqueue_cache_num != 1 and asyncio.create_task(self._crawl()) + self.enqueue_cache_num != 1 and create_task(self._crawl()) self.running and await self._spider_idle(self.spider) async def stop(self, reason: str = 'shutdown') -> None: @@ -83,7 +82,7 @@ async def stop(self, reason: str = 'shutdown') -> None: while not self.is_idle(): await asyncio.sleep(0.2) - self.enqueue_cache_num != 1 and asyncio.create_task(self._crawl()) + self.enqueue_cache_num != 1 and create_task(self._crawl()) await self.close_spider(self.spider, reason=reason) await self.signals.send_catch_log_deferred(signal=signals.engine_stopped) self.finish = True @@ -93,7 +92,7 @@ async def open( spider: Spider, start_requests: Optional[AsyncGenerator] = None ) -> None: - logger.info("Spider opened", extra={'spider': spider}) + logger.info("Spider opened") self.spider = spider await call_helper(self.crawler.stats.open_spider, spider) @@ -145,7 +144,7 @@ async def _next_request(self) -> None: self.slot.start_requests = None except Exception as e: self.slot.start_requests = None - logger.error('Error while obtaining start requests', exc_info=e, extra={'spider': self.spider}) + logger.exception('Error while obtaining start requests') else: request and await self.crawl(request) finally: @@ -177,9 +176,7 @@ async def handle_downloader_output( result.request = request if isinstance(result, Response): - logkws = self.logformatter.crawled(request, result, self.spider) - if logkws is not None: - logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider}) + logger.log(** self.logformatter.crawled(request, result, self.spider)) await self.signals.send_catch_log(signals.response_received, response=result, request=request, spider=self.spider) await self.scraper.enqueue_scrape(result, request) @@ -207,7 +204,7 @@ def is_idle(self) -> bool: async def crawl(self, request: Request) -> None: if self.enqueue_cache_num == 1: await self.scheduler.enqueue_request(request) - asyncio.create_task(self._next_request()) + create_task(self._next_request()) else: await self.enqueue_cache.put(request) @@ -224,15 +221,13 @@ async def _crawl(self) -> None: break if requests: await call_helper(self.scheduler.enqueue_request_batch, requests) - asyncio.create_task(self._next_request()) + create_task(self._next_request()) self.enqueue_unlock = True async def close_spider(self, spider: Spider, reason: str = 'cancelled') -> None: """Close (cancel) spider and clear all its outstanding requests""" - logger.info("Closing spider (%(reason)s)", - {'reason': reason}, - extra={'spider': spider}) + logger.info(f"Closing spider ({reason})") async def close_handler( callback: Callable, @@ -243,11 +238,7 @@ async def close_handler( try: await call_helper(callback, *args, **kwargs) except (Exception, BaseException) as e: - logger.error( - errmsg, - exc_info=e, - extra={'spider': spider} - ) + logger.exception(errmsg) await close_handler(self.downloader.close, errmsg='Downloader close failure') @@ -260,7 +251,7 @@ async def close_handler( await close_handler(self.crawler.stats.close_spider, spider, reason=reason, errmsg='Stats close failure') - 
logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider}) + logger.info(f"Spider closed ({reason})") await close_handler(setattr, self, 'slot', None, errmsg='Error while unassigning slot') diff --git a/aioscrapy/core/scraper.py b/aioscrapy/core/scraper.py index 7aa1f45..06cf888 100644 --- a/aioscrapy/core/scraper.py +++ b/aioscrapy/core/scraper.py @@ -1,7 +1,6 @@ """This module implements the Scraper component which parses responses and extracts information from them""" import asyncio -import logging from typing import Any, AsyncGenerator, Set, Union, Optional import aioscrapy @@ -12,11 +11,9 @@ from aioscrapy.logformatter import LogFormatter from aioscrapy.middleware import ItemPipelineManager, SpiderMiddlewareManager from aioscrapy.signalmanager import SignalManager -from aioscrapy.utils.log import logformatter_adapter +from aioscrapy.utils.log import logger from aioscrapy.utils.misc import load_object -from aioscrapy.utils.tools import call_helper - -logger = logging.getLogger(__name__) +from aioscrapy.utils.tools import call_helper, create_task class Slot: @@ -113,11 +110,8 @@ async def _scrape(self, result: Union[Response, BaseException], request: Request await self.handle_spider_error(e, request, result) else: await self.handle_spider_output(output, request, result) - except BaseException as e: - logger.error('Scraper bug processing %(request)s', - {'request': request}, - exc_info=e, - extra={'spider': self.spider}) + except BaseException: + logger.exception('Scraper bug processing %(request)s' % {'request': request}) finally: if isinstance(result, PlaywrightResponse): await result.release() @@ -152,23 +146,16 @@ async def call_spider(self, result: Union[Response, BaseException], request: Req async def handle_spider_error(self, exc: BaseException, request: Request, response: Response) -> None: if isinstance(exc, CloseSpider): - asyncio.create_task(self.crawler.engine.close_spider(self.spider, exc.reason or 'cancelled')) + create_task(self.crawler.engine.close_spider(self.spider, exc.reason or 'cancelled')) return - logkws = self.logformatter.spider_error(exc, request, response, self.spider) - logger.log( - *logformatter_adapter(logkws), - exc_info=exc, - extra={'spider': self.spider} - ) + logger.log(**self.logformatter.spider_error(exc, request, response, self.spider)) await self.signals.send_catch_log( signal=signals.spider_error, failure=exc, response=response, spider=self.spider ) - self.crawler.stats.inc_value( - "spider_exceptions/%s" % exc.__class__.__name__, - spider=self.spider - ) + self.crawler.stats.inc_value("spider_exceptions/%s" % exc.__class__.__name__, spider=self.spider) + self.crawler.stats.inc_value("spider_exceptions", spider=self.spider) async def handle_spider_output(self, result: AsyncGenerator, request: Request, response: Response) -> None: """Iter each Request/Item (given in the output parameter) returned from the given spider""" @@ -202,9 +189,8 @@ async def _process_spidermw_output(self, output: Any, request: Request, response else: typename = type(output).__name__ logger.error( - 'Spider must return request, item, or None, got %(typename)r in %(request)s', - {'request': request, 'typename': typename}, - extra={'spider': self.spider}, + 'Spider must return request, item, or None, got %(typename)r in %(request)s' % {'request': request, + 'typename': typename}, ) async def _log_download_errors( @@ -215,12 +201,7 @@ async def _log_download_errors( ) -> None: """Process and record errors""" if isinstance(download_exception, 
BaseException) and not isinstance(download_exception, IgnoreRequest): - logkws = self.logformatter.download_error(download_exception, request, self.spider) - logger.log( - *logformatter_adapter(logkws), - extra={'spider': self.spider}, - exc_info=download_exception, - ) + logger.log(**self.logformatter.download_error(download_exception, request, self.spider)) if spider_exception is not download_exception: raise spider_exception @@ -230,23 +211,17 @@ async def _itemproc_finished(self, output: Any, item: Any, response: Response) - self.slot.itemproc_size -= 1 if isinstance(output, BaseException): if isinstance(output, DropItem): - logkws = self.logformatter.dropped(item, output, response, self.spider) - if logkws is not None: - logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider}) + logger.log(**self.logformatter.dropped(item, output, response, self.spider)) return await self.signals.send_catch_log_deferred( signal=signals.item_dropped, item=item, response=response, spider=self.spider, exception=output) else: - logkws = self.logformatter.item_error(item, output, response, self.spider) - logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider}, - exc_info=output) + logger.log(**self.logformatter.item_error(item, output, response, self.spider)) return await self.signals.send_catch_log_deferred( signal=signals.item_error, item=item, response=response, spider=self.spider, failure=output) else: - logkws = self.logformatter.scraped(output, response, self.spider) - if logkws is not None: - logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider}) + logger.log(**self.logformatter.scraped(output, response, self.spider)) return await self.signals.send_catch_log_deferred( signal=signals.item_scraped, item=output, response=response, spider=self.spider) diff --git a/aioscrapy/crawler.py b/aioscrapy/crawler.py index 2414b98..3ecdaec 100644 --- a/aioscrapy/crawler.py +++ b/aioscrapy/crawler.py @@ -1,5 +1,4 @@ import asyncio -import logging import pprint import signal import sys @@ -16,14 +15,9 @@ from zope.interface.verify import verifyClass from aioscrapy.logformatter import LogFormatter -from aioscrapy import signals, Spider +from aioscrapy import Spider from aioscrapy.settings import overridden_settings, Settings -from aioscrapy.utils.log import ( - get_scrapy_root_handler, - install_scrapy_root_handler, - LogCounterHandler, - configure_logging, -) +from aioscrapy.utils.log import configure_logging, logger from aioscrapy.utils.misc import load_object, load_instance from aioscrapy.spiderloader import ISpiderLoader from aioscrapy.exceptions import AioScrapyDeprecationWarning @@ -36,8 +30,6 @@ from aioscrapy.utils.ossignal import install_shutdown_handlers, signal_names from aioscrapy.statscollectors import StatsCollector -logger = logging.getLogger(__name__) - class Crawler: @@ -64,6 +56,8 @@ def __init__(self, spidercls: Type[Spider], settings: Union[Settings, dict, None async def crawl(self, *args, **kwargs) -> None: try: + configure_logging(self.spidercls, self.settings) + if self.crawling: raise RuntimeError("Crawling already taking place") @@ -71,15 +65,7 @@ async def crawl(self, *args, **kwargs) -> None: self.signals = SignalManager(self) self.stats = load_object(self.settings['STATS_CLASS'])(self) - handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL')) - logging.root.addHandler(handler) - - d = dict(overridden_settings(self.settings)) - logger.info("Overridden settings:\n%(settings)s", {'settings': pprint.pformat(d)}) - - if 
get_scrapy_root_handler() is not None: - install_scrapy_root_handler(self.settings) - self.signals.connect(lambda: logging.root.removeHandler(handler), signals.engine_stopped) + logger.info(f"Overridden settings:\n{pprint.pformat(dict(overridden_settings(self.settings)))}") self.spider = await self.spidercls.from_crawler(self, *args, **kwargs) self.logformatter = await load_instance(self.settings['LOG_FORMATTER'], crawler=self) @@ -96,13 +82,23 @@ async def crawl(self, *args, **kwargs) -> None: await self.engine.close() raise e - async def stop(self) -> None: + async def stop(self, signum=None) -> None: """Starts a graceful stop of the crawler and returns a deferred that is fired when the crawler is stopped.""" + if signum is not None: + asyncio.current_task().set_name(self.spidercls.name) + logger.info( + "Received %(signame)s, shutting down gracefully. Send again to force" % { + 'signame': signal_names[signum] + } + ) if self.crawling: self.crawling = False await self.engine.stop() + def _signal_shutdown(self, signum: Any, _) -> None: + asyncio.create_task(self.stop(signum)) + class CrawlerRunner: crawlers = property( @@ -156,7 +152,7 @@ def crawl_soon( self.active_crawler(crawler, *args, **kwargs) def active_crawler(self, crawler: Crawler, *args, **kwargs) -> None: - task = asyncio.create_task(crawler.crawl(*args, **kwargs)) + task = asyncio.create_task(crawler.crawl(*args, **kwargs), name=crawler.spidercls.name) self._active.add(task) def _done(result): @@ -204,8 +200,8 @@ def _create_crawler( spidercls = self.spider_loader.load(spidercls) return Crawler(spidercls, settings=settings) - async def stop(self) -> None: - await asyncio.gather(*[c.stop() for c in self.crawlers]) + async def stop(self, signum=None) -> None: + await asyncio.gather(*[c.stop(signum) for c in self.crawlers]) class CrawlerProcess(CrawlerRunner): @@ -217,28 +213,25 @@ def __init__( ) -> None: super().__init__(settings) install_shutdown_handlers(self._signal_shutdown) - configure_logging(self.settings, install_root_handler) def _signal_shutdown(self, signum: Any, _) -> None: install_shutdown_handlers(self._signal_kill) - signame = signal_names[signum] - logger.info("Received %(signame)s, shutting down gracefully. 
Send again to force ", - {'signame': signame}) - asyncio.create_task(self._graceful_stop_reactor()) + asyncio.create_task(self.stop(signum)) def _signal_kill(self, signum: Any, _) -> None: install_shutdown_handlers(signal.SIG_IGN) signame = signal_names[signum] - logger.info('Received %(signame)s twice, forcing unclean shutdown', - {'signame': signame}) + logger.info('Received %(signame)s twice, forcing unclean shutdown' % {'signame': signame}) asyncio.create_task(self._stop_reactor()) async def run(self) -> None: - for crawler, (args, kwargs) in self.crawlers.items(): - self.active_crawler(crawler, *args, **kwargs) - while self._active: - await asyncio.gather(*self._active) - await self.recycle_db_connect() + try: + for crawler, (args, kwargs) in self.crawlers.items(): + self.active_crawler(crawler, *args, **kwargs) + while self._active: + await asyncio.gather(*self._active) + finally: + await self.recycle_db_connect() def start(self) -> None: if sys.platform.startswith('win'): @@ -251,10 +244,6 @@ def start(self) -> None: pass asyncio.run(self.run()) - async def _graceful_stop_reactor(self) -> None: - await self.stop() - await self.recycle_db_connect() - async def _stop_reactor(self) -> None: try: await self.recycle_db_connect() diff --git a/aioscrapy/db/__init__.py b/aioscrapy/db/__init__.py index 0f537fe..be9d08a 100644 --- a/aioscrapy/db/__init__.py +++ b/aioscrapy/db/__init__.py @@ -1,14 +1,12 @@ -import logging from importlib import import_module from typing import Any import aioscrapy from aioscrapy.db.absmanager import AbsDBPoolManager from aioscrapy.db.aioredis import redis_manager +from aioscrapy.utils.log import logger from aioscrapy.utils.misc import load_object -logger = logging.getLogger(__name__) - __all__ = ['db_manager', 'get_pool', 'get_manager'] DB_MODULE_MAP = { diff --git a/aioscrapy/db/aiomongo.py b/aioscrapy/db/aiomongo.py index 58385e1..48edf8e 100644 --- a/aioscrapy/db/aiomongo.py +++ b/aioscrapy/db/aiomongo.py @@ -1,12 +1,8 @@ -import logging - from motor.motor_asyncio import AsyncIOMotorClient import aioscrapy from aioscrapy.db.absmanager import AbsDBPoolManager -logger = logging.getLogger(__name__) - class MongoExecutor: def __init__(self, alias: str, pool_manager: "AioMongoManager"): diff --git a/aioscrapy/db/aiomysql.py b/aioscrapy/db/aiomysql.py index 62e813b..a538973 100644 --- a/aioscrapy/db/aiomysql.py +++ b/aioscrapy/db/aiomysql.py @@ -1,4 +1,3 @@ -import logging import socket from contextlib import asynccontextmanager @@ -7,8 +6,6 @@ import aioscrapy from aioscrapy.db.absmanager import AbsDBPoolManager -logger = logging.getLogger(__name__) - class MysqlExecutor: def __init__(self, alias: str, pool_manager: "AioMysqlPoolManager"): diff --git a/aioscrapy/db/aiopg.py b/aioscrapy/db/aiopg.py index e0bb86a..a9821aa 100644 --- a/aioscrapy/db/aiopg.py +++ b/aioscrapy/db/aiopg.py @@ -1,4 +1,3 @@ -import logging from contextlib import asynccontextmanager from asyncpg.pool import create_pool @@ -6,8 +5,6 @@ import aioscrapy from aioscrapy.db.absmanager import AbsDBPoolManager -logger = logging.getLogger(__name__) - class PGExecutor: def __init__(self, alias: str, pool_manager: "AioPGPoolManager"): @@ -109,4 +106,5 @@ async def test(): result = await conn.fetch('select 1 ') print(tuple(result[0])) + asyncio.run(test()) diff --git a/aioscrapy/db/aiorabbitmq.py b/aioscrapy/db/aiorabbitmq.py index d3ccf57..60f71b6 100644 --- a/aioscrapy/db/aiorabbitmq.py +++ b/aioscrapy/db/aiorabbitmq.py @@ -1,4 +1,3 @@ -import logging from contextlib import asynccontextmanager 
import aio_pika @@ -8,8 +7,6 @@ import aioscrapy from aioscrapy.db.absmanager import AbsDBPoolManager -logger = logging.getLogger(__name__) - class RabbitmqExecutor: def __init__(self, alias: str, pool_manager: "AioRabbitmqManager"): diff --git a/aioscrapy/dupefilters/disk.py b/aioscrapy/dupefilters/disk.py index e668fa4..df5e7fa 100644 --- a/aioscrapy/dupefilters/disk.py +++ b/aioscrapy/dupefilters/disk.py @@ -1,9 +1,9 @@ -import logging import os from typing import Optional, Set from aioscrapy import Request, Spider from aioscrapy.dupefilters import DupeFilterBase +from aioscrapy.utils.log import logger from aioscrapy.utils.request import referer_str @@ -15,7 +15,6 @@ def __init__(self, path: Optional[str] = None, debug: bool = False): self.debug = debug self.fingerprints: Set = set() self.logdupes: bool = True - self.logger = logging.getLogger(__name__) if path: self.file = open(os.path.join(path, 'requests.seen'), 'a+') self.file.seek(0) @@ -37,20 +36,20 @@ async def request_seen(self, request: Request) -> bool: self.file.write(request.fingerprint + '\n') return False - def close(self, reason: str = '') -> None: + async def close(self, reason: str = '') -> None: if self.file: self.file.close() def log(self, request: Request, spider: Spider): if self.debug: - msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)" - args = {'request': request, 'referer': referer_str(request)} - self.logger.debug(msg, args, extra={'spider': spider}) + logger.debug("Filtered duplicate request: %(request)s (referer: %(referer)s)" % { + 'request': request, 'referer': referer_str(request) + }) elif self.logdupes: msg = ("Filtered duplicate request: %(request)s" " - no more duplicates will be shown" " (see DUPEFILTER_DEBUG to show all duplicates)") - self.logger.debug(msg, {'request': request}, extra={'spider': spider}) + logger.debug(msg % {'request': request}) self.logdupes = False spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider) diff --git a/aioscrapy/dupefilters/redis.py b/aioscrapy/dupefilters/redis.py index 28bd258..3fbecca 100644 --- a/aioscrapy/dupefilters/redis.py +++ b/aioscrapy/dupefilters/redis.py @@ -1,17 +1,13 @@ -import logging - from aioscrapy import Request from aioscrapy.db import db_manager from aioscrapy.dupefilters import DupeFilterBase -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class RedisRFPDupeFilter(DupeFilterBase): """Request Fingerprint duplicates filter built with Set of Redis""" - logger = logger - def __init__( self, server: "redis.asyncio.Redis", @@ -47,13 +43,12 @@ async def clear(self): def log(self, request, spider): if self.debug: - msg = "Filtered duplicate request: %(request)s" - self.logger.debug(msg, {'request': request}, extra={'spider': spider}) + logger.debug("Filtered duplicate request: %(request)s" % {'request': request}) elif self.logdupes: msg = ("Filtered duplicate request %(request)s" " - no more duplicates will be shown" " (see DUPEFILTER_DEBUG to show all duplicates)") - self.logger.debug(msg, {'request': request}, extra={'spider': spider}) + logger.debug(msg % {'request': request}) self.logdupes = False spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider) diff --git a/aioscrapy/http/request/__init__.py b/aioscrapy/http/request/__init__.py index 1d6a71f..8ec2f78 100644 --- a/aioscrapy/http/request/__init__.py +++ b/aioscrapy/http/request/__init__.py @@ -63,8 +63,8 @@ def __init__( self.dont_filter = dont_filter self.use_proxy = use_proxy - self._meta = dict(meta) if meta else 
None - if fingerprint: + self._meta = dict(meta) if meta else {} + if fingerprint is not None: self._set_fingerprint(fingerprint) self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None self.flags = [] if flags is None else list(flags) diff --git a/aioscrapy/libs/downloader/retry.py b/aioscrapy/libs/downloader/retry.py index ff15597..e8c02ae 100644 --- a/aioscrapy/libs/downloader/retry.py +++ b/aioscrapy/libs/downloader/retry.py @@ -9,7 +9,6 @@ Failed pages are collected on the scraping process and rescheduled at the end, once the spider has finished crawling all regular (non failed) pages. """ -from logging import getLogger, Logger from typing import Optional, Union try: @@ -21,30 +20,35 @@ try: from aiohttp.client_exceptions import ClientError - NEED_RETRY_ERROR += (ClientError, ) + + NEED_RETRY_ERROR += (ClientError,) except ImportError: pass try: from httpx import HTTPError as HttpxError - NEED_RETRY_ERROR += (HttpxError, ) + + NEED_RETRY_ERROR += (HttpxError,) except ImportError: pass try: from pyhttpx.exception import BaseExpetion as PyHttpxError + NEED_RETRY_ERROR += (PyHttpxError,) except ImportError: pass try: from requests.exceptions import RequestException as RequestsError + NEED_RETRY_ERROR += (RequestsError,) except ImportError: pass try: from playwright._impl._api_types import Error as PlaywrightError + NEED_RETRY_ERROR += (PlaywrightError,) except ImportError: pass @@ -53,8 +57,7 @@ from aioscrapy.http.request import Request from aioscrapy.spiders import Spider from aioscrapy.utils.python import global_object_name - -retry_logger = getLogger(__name__) +from aioscrapy.utils.log import logger as retry_logger def get_retry_request( @@ -64,7 +67,7 @@ def get_retry_request( reason: Union[str, Exception] = 'unspecified', max_retry_times: Optional[int] = None, priority_adjust: Optional[int] = None, - logger: Logger = retry_logger, + logger=retry_logger, stats_base_key: str = 'retry', ): """ @@ -84,9 +87,9 @@ def get_retry_request( reason = global_object_name(reason.__class__) logger.info( - "Retrying %(request)s (failed %(retry_times)d times): %(reason)s", - {'request': request, 'retry_times': retry_times, 'reason': reason}, - extra={'spider': spider} + "Retrying %(request)s (failed %(retry_times)d times): %(reason)s" % { + 'request': request, 'retry_times': retry_times, 'reason': reason + }, ) new_request = request.copy() new_request.meta['retry_times'] = retry_times @@ -102,9 +105,7 @@ def get_retry_request( stats.inc_value(f'{stats_base_key}/max_reached') logger.error( "Gave up retrying %(request)s (failed %(retry_times)d times): " - "%(reason)s", - {'request': request, 'retry_times': retry_times, 'reason': reason}, - extra={'spider': spider}, + "%(reason)s" % {'request': request, 'retry_times': retry_times, 'reason': reason} ) return None diff --git a/aioscrapy/libs/downloader/stats.py b/aioscrapy/libs/downloader/stats.py index d9134c4..136850f 100644 --- a/aioscrapy/libs/downloader/stats.py +++ b/aioscrapy/libs/downloader/stats.py @@ -17,14 +17,12 @@ def from_crawler(cls, crawler): def process_request(self, request, spider): self.stats.inc_value('downloader/request_count', spider=spider) self.stats.inc_value(f'downloader/request_method_count/{request.method}', spider=spider) - reqlen = len(request_httprepr(request)) - self.stats.inc_value('downloader/request_bytes', reqlen, spider=spider) + self.stats.inc_value('downloader/request_bytes', len(request_httprepr(request)), spider=spider) def process_response(self, request, response, spider): 
self.stats.inc_value('downloader/response_count', spider=spider) self.stats.inc_value(f'downloader/response_status_count/{response.status}', spider=spider) - reslen = len(response.body) - self.stats.inc_value('downloader/response_bytes', reslen, spider=spider) + self.stats.inc_value('downloader/response_bytes', len(response.body), spider=spider) return response def process_exception(self, request, exception, spider): diff --git a/aioscrapy/libs/extensions/closespider.py b/aioscrapy/libs/extensions/closespider.py index 3cf223a..b0071a0 100644 --- a/aioscrapy/libs/extensions/closespider.py +++ b/aioscrapy/libs/extensions/closespider.py @@ -9,6 +9,7 @@ from aioscrapy import signals from aioscrapy.exceptions import NotConfigured +from aioscrapy.utils.tools import create_task class CloseSpider: @@ -46,24 +47,24 @@ def from_crawler(cls, crawler): async def error_count(self, failure, response, spider): self.counter['errorcount'] += 1 if self.counter['errorcount'] == self.close_on['errorcount']: - asyncio.create_task(self.crawler.engine.stop(reason='closespider_errorcount')) + create_task(self.crawler.engine.stop(reason='closespider_errorcount')) async def page_count(self, response, request, spider): self.counter['pagecount'] += 1 if self.counter['pagecount'] == self.close_on['pagecount']: - asyncio.create_task(self.crawler.engine.stop(reason='closespider_pagecount')) + create_task(self.crawler.engine.stop(reason='closespider_pagecount')) async def timeout_close(self, spider): async def close(): await asyncio.sleep(self.close_on['timeout']) - asyncio.create_task(self.crawler.engine.stop(reason='closespider_timeout')) + create_task(self.crawler.engine.stop(reason='closespider_timeout')) - self.task = asyncio.create_task(close()) + self.task = create_task(close()) async def item_scraped(self, item, spider): self.counter['itemcount'] += 1 if self.counter['itemcount'] == self.close_on['itemcount']: - asyncio.create_task(self.crawler.engine.stop(reason='closespider_itemcount')) + create_task(self.crawler.engine.stop(reason='closespider_itemcount')) def spider_closed(self, spider): if self.task and not self.task.done(): diff --git a/aioscrapy/libs/extensions/corestats.py b/aioscrapy/libs/extensions/corestats.py index 206b408..bcbef20 100644 --- a/aioscrapy/libs/extensions/corestats.py +++ b/aioscrapy/libs/extensions/corestats.py @@ -23,15 +23,15 @@ def from_crawler(cls, crawler): return o def spider_opened(self, spider): - self.start_time = datetime.utcnow() - self.stats.set_value('start_time', self.start_time, spider=spider) + self.start_time = datetime.now() + self.stats.set_value('start_time', str(self.start_time), spider=spider) def spider_closed(self, spider, reason): - finish_time = datetime.utcnow() + finish_time = datetime.now() elapsed_time = finish_time - self.start_time elapsed_time_seconds = elapsed_time.total_seconds() self.stats.set_value('elapsed_time_seconds', elapsed_time_seconds, spider=spider) - self.stats.set_value('finish_time', finish_time, spider=spider) + self.stats.set_value('finish_time', str(finish_time), spider=spider) self.stats.set_value('finish_reason', reason, spider=spider) def item_scraped(self, item, spider): diff --git a/aioscrapy/libs/extensions/logstats.py b/aioscrapy/libs/extensions/logstats.py index 2c57bfd..837f805 100644 --- a/aioscrapy/libs/extensions/logstats.py +++ b/aioscrapy/libs/extensions/logstats.py @@ -1,10 +1,9 @@ -import logging import asyncio -from aioscrapy.exceptions import NotConfigured from aioscrapy import signals - -logger = 
logging.getLogger(__name__) +from aioscrapy.exceptions import NotConfigured +from aioscrapy.utils.log import logger +from aioscrapy.utils.tools import create_task class LogStats: @@ -15,6 +14,8 @@ def __init__(self, stats, interval=60.0): self.interval = interval self.multiplier = 60.0 / self.interval self.task = None + self.pagesprev = 0 + self.itemsprev = 0 @classmethod def from_crawler(cls, crawler): @@ -27,9 +28,7 @@ def from_crawler(cls, crawler): return o def spider_opened(self, spider): - self.pagesprev = 0 - self.itemsprev = 0 - self.task = asyncio.create_task(self.log(spider)) + self.task = create_task(self.log(spider)) async def log(self, spider): await asyncio.sleep(self.interval) @@ -43,8 +42,8 @@ async def log(self, spider): "scraped %(items)d items (at %(itemrate)d items/min)") log_args = {'pages': pages, 'pagerate': prate, 'spider_name': spider.name, 'items': items, 'itemrate': irate} - logger.info(msg, log_args, extra={'spider': spider}) - self.task = asyncio.create_task(self.log(spider)) + logger.info(msg % log_args) + self.task = create_task(self.log(spider)) def spider_closed(self, spider, reason): if self.task and not self.task.done(): diff --git a/aioscrapy/libs/extensions/metric.py b/aioscrapy/libs/extensions/metric.py new file mode 100644 index 0000000..b72d80a --- /dev/null +++ b/aioscrapy/libs/extensions/metric.py @@ -0,0 +1,138 @@ +import asyncio +import os +import platform +import random +import time + +from aiohttp import ClientSession + +from aioscrapy import Settings +from aioscrapy import signals +from aioscrapy.utils.log import _logger, logger +from aioscrapy.utils.tools import create_task + + +class InfluxBase: + @staticmethod + def format_metric(metric_name, value, spider_name, location, measurement=None): + measurement = measurement or metric_name + return f"{measurement},spider_name={spider_name},location={location} {metric_name}={value} {time.time_ns() + int(random.random() * 100000)}" + + async def record(self, obj: "Metric"): + raise NotImplementedError + + async def close(self): + pass + + +class InfluxHttp(InfluxBase): + def __init__(self, spider_name: str, settings: Settings): + influxdb_url = settings.get('METRIC_INFLUXDB_URL') + token = settings.get('METRIC_INFLUXDB_TOKEN') + location = settings.get('METRIC_LOCATION') + self.location = location or f"{platform.node()}_{os.getpid()}" + self.spider_name = spider_name + self.session = ClientSession(headers={ + "Authorization": f"Token {token}", + "Content-Type": "text/plain; charset=utf-8", + "Accept": "application/json", + }) + self.url = influxdb_url + self.lock = asyncio.Lock() + + async def emit(self, data): + async with self.session.post(self.url, data=data) as response: + await response.read() + logger.debug(f"emit metric success<{response.status}>: \n{data}") + + async def record(self, obj: "Metric"): + async with self.lock: + data = '' + for metric_name in obj.metrics.keys(): + current_cnt = obj.stats.get_value(metric_name, 0) + if not isinstance(current_cnt, (int, float)): + continue + cnt = current_cnt - obj.prev.get(metric_name, 0) + if cnt: + data += self.format_metric(metric_name.replace('/', '-'), cnt, self.spider_name, + self.location) + '\n' + obj.prev[metric_name] = current_cnt + if data: + await self.emit(data) + + async def close(self): + if self.session is not None: + await self.session.close() + await asyncio.sleep(0.250) + + +class InfluxLog(InfluxBase): + def __init__(self, spider_name: str, settings: Settings): + location = settings.get('METRIC_LOCATION') + self.location = 
location or f"{platform.node()}_{os.getpid()}" + self.spider_name = spider_name + + log_args = settings.getdict('METRIC_LOG_ARGS') + log_args.update(dict( + filter=lambda record: record["extra"].get("metric") is not None, + format="{time:YYYY-MM-DD HH:mm:ss.SSS} {message}", + encoding="utf-8" + )) + for k, v in dict( + sink=f'{spider_name}.metric', level="INFO", rotation='20MB', + retention=3 + ).items(): + log_args.setdefault(k, v) + + _logger.add(**log_args) + self.log = _logger.bind(metric="metric") + + async def record(self, obj: "Metric"): + for metric_name in obj.metrics.keys(): + current_cnt = obj.stats.get_value(metric_name, 0) + if not isinstance(current_cnt, (int, float)): + continue + prev_cnt = obj.prev.get(metric_name, 0) + cnt = current_cnt - prev_cnt + if cnt: + msg = self.format_metric(metric_name.replace('/', '-'), cnt, self.spider_name, self.location) + self.log.info(msg) + logger.debug(msg) + obj.prev[metric_name] = current_cnt + + +class Metric: + """Log Metric scraping stats periodically""" + + def __init__(self, stats, spider_name, settings, interval=10.0): + if settings.get('METRIC_INFLUXDB_URL'): + self.influx = InfluxHttp(spider_name, settings) + else: + self.influx = InfluxLog(spider_name, settings) + self.stats = stats + self.metrics = settings.getdict('METRICS') or self.stats._stats + self.interval = interval + self.task = None + self.prev = {} + + @classmethod + def from_crawler(cls, crawler): + interval = crawler.settings.getfloat('METRIC_INTERVAL', 10.0) + o = cls(crawler.stats, crawler.spider.name, crawler.settings, interval) + crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) + crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) + return o + + def spider_opened(self, spider): + self.task = create_task(self.run(spider)) + + async def run(self, spider): + await asyncio.sleep(self.interval) + await self.influx.record(self) + self.task = create_task(self.run(spider)) + + async def spider_closed(self, spider, reason): + if self.task and not self.task.done(): + self.task.cancel() + await self.influx.record(self) + await self.influx.close() diff --git a/aioscrapy/libs/extensions/throttle.py b/aioscrapy/libs/extensions/throttle.py index f485f78..b614952 100644 --- a/aioscrapy/libs/extensions/throttle.py +++ b/aioscrapy/libs/extensions/throttle.py @@ -1,9 +1,6 @@ -import logging - -from aioscrapy.exceptions import NotConfigured from aioscrapy import signals - -logger = logging.getLogger(__name__) +from aioscrapy.exceptions import NotConfigured +from aioscrapy.utils.log import logger class AutoThrottle: @@ -52,13 +49,11 @@ def _response_downloaded(self, response, request, spider): logger.info( "slot: %(slot)s | conc:%(concurrency)2d | " "delay:%(delay)5d ms (%(delaydiff)+d) | " - "latency:%(latency)5d ms | size:%(size)6d bytes", - { + "latency:%(latency)5d ms | size:%(size)6d bytes" % { 'slot': key, 'concurrency': conc, 'delay': slot.delay * 1000, 'delaydiff': diff * 1000, 'latency': latency * 1000, 'size': size - }, - extra={'spider': spider} + } ) def _get_slot(self, request, spider): diff --git a/aioscrapy/libs/pipelines/__init__.py b/aioscrapy/libs/pipelines/__init__.py index 90dcf44..b50e956 100644 --- a/aioscrapy/libs/pipelines/__init__.py +++ b/aioscrapy/libs/pipelines/__init__.py @@ -1,9 +1,7 @@ import asyncio -import logging -from typing import Optional - -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger +from aioscrapy.utils.tools import create_task class SqlFormat: @@ -107,12 +105,12 @@ 
def __init__(self, settings, db_type: str): self.item_save_key: str = f'__{db_type}__' async def open_spider(self, spider): - asyncio.create_task(self.save_heartbeat()) + create_task(self.save_heartbeat()) async def save_heartbeat(self): while self.running: await asyncio.sleep(self.save_cache_interval) - asyncio.create_task(self.save_all()) + create_task(self.save_all()) async def process_item(self, item, spider): save_info = item.pop(self.item_save_key, None) diff --git a/aioscrapy/libs/pipelines/sink/_csv.py b/aioscrapy/libs/pipelines/sink/_csv.py index 478cebc..75738be 100644 --- a/aioscrapy/libs/pipelines/sink/_csv.py +++ b/aioscrapy/libs/pipelines/sink/_csv.py @@ -1,9 +1,8 @@ import asyncio import csv -import logging from typing import Optional -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class CsvSinkMixin: @@ -27,7 +26,7 @@ async def save_item( def _get_writer(self, filename, item): writer, *_ = self.csv_writer.get(filename, (None, None)) if writer is None: - file = open(filename, 'w', encoding="UTF8", newline='') + file = open(filename, 'w', encoding="UTF8", newline='') writer = csv.writer(file) writer.writerow(item.keys()) self.csv_writer[filename] = (writer, file) diff --git a/aioscrapy/libs/pipelines/sink/execl.py b/aioscrapy/libs/pipelines/sink/execl.py index 6a31593..9d07e8c 100644 --- a/aioscrapy/libs/pipelines/sink/execl.py +++ b/aioscrapy/libs/pipelines/sink/execl.py @@ -1,5 +1,4 @@ import asyncio -import logging import math from io import BytesIO from typing import Tuple, Optional @@ -8,7 +7,8 @@ import xlsxwriter from PIL import Image, ImageFile -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger + try: resample = Image.LANCZOS except: @@ -106,15 +106,18 @@ def _get_write_class(self, filename, item, sheet='sheet1', **options): self.y_cache[filename_sheet] def close_execl(self, filename=None): - if filename in self.wb_cache.keys(): - logger.info(f'Closing Execl: {filename}') - self.wb_cache.pop(filename, None).close() - for filename_sheet in list(self.ws_cache.keys()): - if not filename_sheet.startswith(filename): - continue - self.ws_cache.pop(filename_sheet, None) - self.y_cache.pop(filename_sheet, None) - self.fields_cache.pop(filename_sheet, None) + if filename not in self.wb_cache: + return + + logger.info(f'Closing Execl: {filename}') + if wb := self.wb_cache.pop(filename): + wb.close() + for filename_sheet in list(self.ws_cache.keys()): + if not filename_sheet.startswith(filename): + continue + self.ws_cache.pop(filename_sheet, None) + self.y_cache.pop(filename_sheet, None) + self.fields_cache.pop(filename_sheet, None) def close(self): for filename in list(self.wb_cache.keys()): @@ -144,7 +147,6 @@ async def close_spider(self, spider): if __name__ == '__main__': - class TestSpider: name = 'TestSpider' diff --git a/aioscrapy/libs/pipelines/sink/mongo.py b/aioscrapy/libs/pipelines/sink/mongo.py index 9f42f0a..a53a3dd 100644 --- a/aioscrapy/libs/pipelines/sink/mongo.py +++ b/aioscrapy/libs/pipelines/sink/mongo.py @@ -1,9 +1,7 @@ -import logging - from aioscrapy.db import db_manager from aioscrapy.libs.pipelines import DBPipelineBase -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class MongoPipeline(DBPipelineBase): diff --git a/aioscrapy/libs/pipelines/sink/mysql.py b/aioscrapy/libs/pipelines/sink/mysql.py index 37334b9..c4f2955 100644 --- a/aioscrapy/libs/pipelines/sink/mysql.py +++ b/aioscrapy/libs/pipelines/sink/mysql.py @@ -1,9 +1,7 @@ -import logging - from aioscrapy.db 
import db_manager from aioscrapy.libs.pipelines import DBPipelineBase -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class MysqlPipeline(DBPipelineBase): diff --git a/aioscrapy/libs/pipelines/sink/pg.py b/aioscrapy/libs/pipelines/sink/pg.py index 5395743..be81919 100644 --- a/aioscrapy/libs/pipelines/sink/pg.py +++ b/aioscrapy/libs/pipelines/sink/pg.py @@ -1,9 +1,7 @@ -import logging - from aioscrapy.db import db_manager from aioscrapy.libs.pipelines import DBPipelineBase -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class PGPipeline(DBPipelineBase): diff --git a/aioscrapy/libs/spider/depth.py b/aioscrapy/libs/spider/depth.py index ffbd9d6..a94b346 100644 --- a/aioscrapy/libs/spider/depth.py +++ b/aioscrapy/libs/spider/depth.py @@ -4,11 +4,9 @@ See documentation in docs/topics/spider-middleware.rst """ -import logging - from aioscrapy.http import Request -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class DepthMiddleware: @@ -35,11 +33,9 @@ def _filter(request): if self.prio: request.priority -= depth * self.prio if self.maxdepth and depth > self.maxdepth: - logger.debug( - "Ignoring link (depth > %(maxdepth)d): %(requrl)s ", - {'maxdepth': self.maxdepth, 'requrl': request.url}, - extra={'spider': spider} - ) + logger.debug("Ignoring link (depth > %(maxdepth)d): %(requrl)s " % { + 'maxdepth': self.maxdepth, 'requrl': request.url + }) return False else: if self.verbose_stats: diff --git a/aioscrapy/libs/spider/httperror.py b/aioscrapy/libs/spider/httperror.py index c6f2869..afe791f 100644 --- a/aioscrapy/libs/spider/httperror.py +++ b/aioscrapy/libs/spider/httperror.py @@ -3,11 +3,10 @@ See documentation in docs/topics/spider-middleware.rst """ -import logging from aioscrapy.exceptions import IgnoreRequest -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class HttpError(IgnoreRequest): @@ -49,8 +48,7 @@ async def process_spider_exception(self, response, exception, spider): spider.crawler.stats.inc_value( f'httperror/response_ignored_status_count/{response.status}' ) - logger.info( - "Ignoring response %(response)r: HTTP status code is not handled or not allowed", - {'response': response}, extra={'spider': spider}, - ) + logger.info("Ignoring response %(response)r: HTTP status code is not handled or not allowed" % { + 'response': response + }) return [] diff --git a/aioscrapy/libs/spider/offsite.py b/aioscrapy/libs/spider/offsite.py index 172c806..67666e7 100644 --- a/aioscrapy/libs/spider/offsite.py +++ b/aioscrapy/libs/spider/offsite.py @@ -4,14 +4,12 @@ See documentation in docs/topics/spider-middleware.rst """ import re -import logging import warnings from aioscrapy import signals from aioscrapy.http import Request from aioscrapy.utils.httpobj import urlparse_cached - -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class OffsiteMiddleware: @@ -35,8 +33,8 @@ async def process_spider_output(self, response, result, spider): if domain and domain not in self.domains_seen: self.domains_seen.add(domain) logger.debug( - "Filtered offsite request to %(domain)r: %(request)s", - {'domain': domain, 'request': x}, extra={'spider': spider}) + "Filtered offsite request to %(domain)r: %(request)s" % {'domain': domain, 'request': x} + ) self.stats.inc_value('offsite/domains', spider=spider) self.stats.inc_value('offsite/filtered', spider=spider) else: diff --git a/aioscrapy/libs/spider/urllength.py b/aioscrapy/libs/spider/urllength.py index 
b6b8bc3..c1649da 100644 --- a/aioscrapy/libs/spider/urllength.py +++ b/aioscrapy/libs/spider/urllength.py @@ -4,12 +4,9 @@ See documentation in docs/topics/spider-middleware.rst """ -import logging - -from aioscrapy.http import Request from aioscrapy.exceptions import NotConfigured - -logger = logging.getLogger(__name__) +from aioscrapy.http import Request +from aioscrapy.utils.log import logger class UrlLengthMiddleware: @@ -28,9 +25,9 @@ async def process_spider_output(self, response, result, spider): def _filter(request): if isinstance(request, Request) and len(request.url) > self.maxlength: logger.info( - "Ignoring link (url length > %(maxlength)d): %(url)s ", - {'maxlength': self.maxlength, 'url': request.url}, - extra={'spider': spider} + "Ignoring link (url length > %(maxlength)d): %(url)s " % { + 'maxlength': self.maxlength, 'url': request.url + } ) spider.crawler.stats.inc_value('urllength/request_ignored_count', spider=spider) return False diff --git a/aioscrapy/logformatter.py b/aioscrapy/logformatter.py index 673d1c5..2534863 100644 --- a/aioscrapy/logformatter.py +++ b/aioscrapy/logformatter.py @@ -1,5 +1,4 @@ import os -import logging from aioscrapy.utils.request import referer_str @@ -19,9 +18,8 @@ def crawled(request, response, spider): request_flags = f' {str(request.flags)}' if request.flags else '' response_flags = f' {str(response.flags)}' if response.flags else '' return { - 'level': logging.DEBUG, - 'msg': CRAWLEDMSG, - 'args': { + '_Logger__level': "DEBUG", + '_Logger__message': CRAWLEDMSG % { 'status': response.status, 'request': request, 'request_flags': request_flags, @@ -36,9 +34,8 @@ def scraped(item, response, spider): """Logs a message when an item is scraped by a spider.""" src = response return { - 'level': logging.DEBUG, - 'msg': SCRAPEDMSG, - 'args': { + '_Logger__level': "DEBUG", + '_Logger__message': SCRAPEDMSG % { 'src': src, 'item': item, } @@ -48,9 +45,8 @@ def scraped(item, response, spider): def dropped(item, exception, response, spider): """Logs a message when an item is dropped while it is passing through the item pipeline.""" return { - 'level': logging.WARNING, - 'msg': DROPPEDMSG, - 'args': { + '_Logger__level': "WARNING", + '_Logger__message': DROPPEDMSG % { 'exception': exception, 'item': item, } @@ -64,9 +60,8 @@ def item_error(item, exception, response, spider): .. versionadded:: 2.0 """ return { - 'level': logging.ERROR, - 'msg': ITEMERRORMSG, - 'args': { + '_Logger__level': "ERROR", + '_Logger__message': ITEMERRORMSG % { 'item': item, } } @@ -78,9 +73,8 @@ def spider_error(failure, request, response, spider): .. 
versionadded:: 2.0 """ return { - 'level': logging.ERROR, - 'msg': SPIDERERRORMSG, - 'args': { + '_Logger__level': "ERROR", + '_Logger__message': SPIDERERRORMSG % { 'request': request, 'referer': referer_str(request), } @@ -100,9 +94,8 @@ def download_error(failure, request, spider, errmsg=None): else: msg = DOWNLOADERRORMSG_SHORT return { - 'level': logging.ERROR, - 'msg': msg, - 'args': args, + '_Logger__level': "ERROR", + '_Logger__message': msg % args, } @classmethod diff --git a/aioscrapy/middleware/absmanager.py b/aioscrapy/middleware/absmanager.py index 5c996c2..89cb68b 100644 --- a/aioscrapy/middleware/absmanager.py +++ b/aioscrapy/middleware/absmanager.py @@ -1,14 +1,12 @@ -import logging import pprint from abc import ABCMeta, abstractmethod from collections import defaultdict, deque from aioscrapy.exceptions import NotConfigured +from aioscrapy.utils.log import logger from aioscrapy.utils.misc import load_instance from aioscrapy.utils.tools import call_helper -logger = logging.getLogger(__name__) - class AbsMiddlewareManager(object, metaclass=ABCMeta): """Base class for implementing middleware managers""" @@ -38,14 +36,9 @@ async def from_settings(cls, settings, crawler=None): except NotConfigured as e: if e.args: clsname = clspath.split('.')[-1] - logger.warning("Disabled %(clsname)s: %(eargs)s", - {'clsname': clsname, 'eargs': e.args[0]}, - extra={'crawler': crawler}) - - logger.info("Enabled %(componentname)ss:\n%(enabledlist)s", - {'componentname': cls.component_name, - 'enabledlist': pprint.pformat(enabled)}, - extra={'crawler': crawler}) + logger.warning("Disabled %(clsname)s: %(eargs)s" % {'clsname': clsname, 'eargs': e.args[0]}) + + logger.info(f"Enabled {cls.component_name}s:\n{pprint.pformat(enabled)}") return cls(*middlewares) @classmethod diff --git a/aioscrapy/middleware/downloader.py b/aioscrapy/middleware/downloader.py index e532d51..188dfdd 100644 --- a/aioscrapy/middleware/downloader.py +++ b/aioscrapy/middleware/downloader.py @@ -1,13 +1,9 @@ -import logging - from aioscrapy.exceptions import _InvalidOutput from aioscrapy.http import Request, Response from aioscrapy.middleware.absmanager import AbsMiddlewareManager from aioscrapy.utils.conf import build_component_list from aioscrapy.utils.tools import call_helper -logger = logging.getLogger(__name__) - class DownloaderMiddlewareManager(AbsMiddlewareManager): component_name = 'downloader middleware' @@ -71,4 +67,3 @@ async def process_exception(self, spider, request, exception): if response: return response return exception - diff --git a/aioscrapy/proxy/__init__.py b/aioscrapy/proxy/__init__.py index 66b14bd..48e49b8 100644 --- a/aioscrapy/proxy/__init__.py +++ b/aioscrapy/proxy/__init__.py @@ -1,10 +1,8 @@ -import logging from abc import ABCMeta, abstractmethod +from aioscrapy.utils.log import logger from aioscrapy.utils.python import global_object_name -logger = logging.getLogger('aioscrapy.proxy') - class AbsProxy(metaclass=ABCMeta): def __init__(self, settings): diff --git a/aioscrapy/proxy/redis.py b/aioscrapy/proxy/redis.py index ece930a..07572e5 100644 --- a/aioscrapy/proxy/redis.py +++ b/aioscrapy/proxy/redis.py @@ -1,12 +1,10 @@ import asyncio import time from typing import Optional -import logging from aioscrapy.db import db_manager from aioscrapy.proxy import AbsProxy - -logger = logging.getLogger('aioscrapy.proxy') +from aioscrapy.utils.log import logger class RedisProxy(AbsProxy): diff --git a/aioscrapy/queue/redis.py b/aioscrapy/queue/redis.py index d79f20c..eb482fc 100644 --- 
a/aioscrapy/queue/redis.py +++ b/aioscrapy/queue/redis.py @@ -1,4 +1,3 @@ -import logging from abc import ABC from typing import Optional @@ -8,8 +7,6 @@ from aioscrapy.serializer import AbsSerializer from aioscrapy.utils.misc import load_object -logger = logging.getLogger(__name__) - class RedisQueueBase(AbsQueue, ABC): inc_key = 'scheduler/enqueued/redis' @@ -96,8 +93,8 @@ async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]: stop = count - 1 if count - 1 > 0 else 0 results, _ = await ( pipe.zrange(self.key, 0, stop) - .zremrangebyrank(self.key, 0, stop) - .execute() + .zremrangebyrank(self.key, 0, stop) + .execute() ) for result in results: yield self._decode_request(result) diff --git a/aioscrapy/settings/default_settings.py b/aioscrapy/settings/default_settings.py index 0ea4a52..97a24e1 100644 --- a/aioscrapy/settings/default_settings.py +++ b/aioscrapy/settings/default_settings.py @@ -106,7 +106,7 @@ 'aioscrapy.libs.extensions.corestats.CoreStats': 0, 'aioscrapy.libs.extensions.closespider.CloseSpider': 0, 'aioscrapy.libs.extensions.logstats.LogStats': 0, - 'aioscrapy.libs.extensions.throttle.AutoThrottle': 0, + # 'aioscrapy.libs.extensions.throttle.AutoThrottle': 0, } FILES_STORE_S3_ACL = 'private' @@ -122,24 +122,18 @@ LOG_ENABLED = True LOG_ENCODING = 'utf-8' -LOG_MAX_BYTES = 50 * 1024 * 1024 -LOG_BACKUP_COUNT = 10 +LOG_ROTATION = '20MB' +LOG_RETENTION = 10 LOG_FORMATTER = 'aioscrapy.logformatter.LogFormatter' -LOG_FORMAT = '%(asctime)s %(process)d [%(name)s] %(levelname)s: %(message)s' -LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S' -LOG_STDOUT = False +LOG_FORMAT = "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {process} | {extra[spidername]} | {name}:{function}:{line} | {message}" +LOG_STDOUT = True LOG_LEVEL = 'DEBUG' LOG_FILE = None -LOG_SHORT_NAMES = False SCHEDULER_DEBUG = False LOGSTATS_INTERVAL = 60.0 -METAREFRESH_ENABLED = True -METAREFRESH_IGNORE_TAGS = [] -METAREFRESH_MAXDELAY = 100 - NEWSPIDER_MODULE = '' RANDOMIZE_DOWNLOAD_DELAY = True diff --git a/aioscrapy/spiders/__init__.py b/aioscrapy/spiders/__init__.py index 48bac67..7646b84 100644 --- a/aioscrapy/spiders/__init__.py +++ b/aioscrapy/spiders/__init__.py @@ -3,17 +3,16 @@ See documentation in docs/topics/spiders.rst """ -import logging import warnings from typing import Optional from aioscrapy import signals +from aioscrapy.exceptions import DontCloseSpider from aioscrapy.http.request import Request from aioscrapy.http.response import Response -from aioscrapy.utils.url import url_is_from_spider from aioscrapy.utils.deprecate import method_is_overridden -from aioscrapy.exceptions import DontCloseSpider from aioscrapy.utils.tools import call_helper +from aioscrapy.utils.url import url_is_from_spider class Spider(object): @@ -34,20 +33,6 @@ def __init__(self, name=None, **kwargs): if not hasattr(self, 'start_urls'): self.start_urls = [] - @property - def logger(self): - logger = logging.getLogger(self.name) - return logging.LoggerAdapter(logger, {'spider': self}) - - def log(self, message, level=logging.DEBUG, **kw): - """Log the given message at the given log level - - This helper wraps a log call to the logger within the spider, but you - can use it directly (e.g. Spider.logger.info('msg')) or use any other - Python logger too. 
- """ - self.logger.log(level, message, **kw) - @classmethod async def from_crawler(cls, crawler, *args, **kwargs): spider = cls(*args, **kwargs) @@ -62,34 +47,14 @@ def _set_crawler(self, crawler): crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) async def start_requests(self): - cls = self.__class__ if not self.start_urls and hasattr(self, 'start_url'): raise AttributeError( "Crawling could not start: 'start_urls' not found " "or empty (but found 'start_url' attribute instead, " "did you miss an 's'?)") - if method_is_overridden(cls, Spider, 'make_requests_from_url'): - warnings.warn( - "Spider.make_requests_from_url method is deprecated; it " - "won't be called in future Scrapy releases. Please " - "override Spider.start_requests method instead " - f"(see {cls.__module__}.{cls.__name__}).", - ) - for url in self.start_urls: - yield self.make_requests_from_url(url) - else: - for url in self.start_urls: - yield Request(url) - - def make_requests_from_url(self, url): - """ This method is deprecated. """ - warnings.warn( - "Spider.make_requests_from_url method is deprecated: " - "it will be removed and not be called by the default " - "Spider.start_requests method in future Scrapy releases. " - "Please override Spider.start_requests method instead." - ) - return Request(url, dont_filter=True) + + for url in self.start_urls: + yield Request(url) async def _parse(self, response: Response, **kwargs): return await call_helper(self.parse, response) diff --git a/aioscrapy/statscollectors.py b/aioscrapy/statscollectors.py index a4b3e9f..6b25fbc 100644 --- a/aioscrapy/statscollectors.py +++ b/aioscrapy/statscollectors.py @@ -4,7 +4,7 @@ import pprint import logging -logger = logging.getLogger(__name__) +from aioscrapy.utils.log import logger class StatsCollector: @@ -43,8 +43,7 @@ def open_spider(self, spider): def close_spider(self, spider, reason): if self._dump: - logger.info("Dumping aioscrapy stats:\n" + pprint.pformat(self._stats), - extra={'spider': spider}) + logger.info("Dumping aioscrapy stats:\n" + pprint.pformat(self._stats)) self._persist_stats(self._stats, spider) def _persist_stats(self, stats, spider): diff --git a/aioscrapy/templates/project/module/middlewares.py.tmpl b/aioscrapy/templates/project/module/middlewares.py.tmpl index 643e96d..075545f 100644 --- a/aioscrapy/templates/project/module/middlewares.py.tmpl +++ b/aioscrapy/templates/project/module/middlewares.py.tmpl @@ -1,6 +1,6 @@ from aioscrapy import signals - +from aioscrapy.utils.log import logger class ${ProjectName}SpiderMiddleware: # Not all methods need to be defined. 
If a method is not defined, @@ -46,7 +46,7 @@ class ${ProjectName}SpiderMiddleware: yield r async def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) + logger.info('Spider opened: %s' % spider.name) class ${ProjectName}DownloaderMiddleware: @@ -93,4 +93,4 @@ class ${ProjectName}DownloaderMiddleware: pass async def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) + logger.info('Spider opened: %s' % spider.name) diff --git a/aioscrapy/utils/log.py b/aioscrapy/utils/log.py index 5b649bc..bf25e20 100644 --- a/aioscrapy/utils/log.py +++ b/aioscrapy/utils/log.py @@ -1,175 +1,35 @@ -import logging +import asyncio import sys import warnings -from logging.config import dictConfig -from logging.handlers import RotatingFileHandler +from typing import Type -import aioscrapy -from aioscrapy.exceptions import AioScrapyDeprecationWarning -from aioscrapy.settings import Settings - - -logger = logging.getLogger(__name__) - - -class TopLevelFormatter(logging.Filter): - """Keep only top level loggers's name (direct children from root) from - records. - - This filter will replace aioscrapy loggers' names with 'aioscrapy'. This mimics - the old Scrapy log behaviour and helps shortening long names. - - Since it can't be set for just one logger (it won't propagate for its - children), it's going to be set in the root handler, with a parametrized - ``loggers`` list where it should act. - """ - - def __init__(self, loggers=None): - self.loggers = loggers or [] - - def filter(self, record): - if any(record.name.startswith(logger + '.') for logger in self.loggers): - record.name = record.name.split('.', 1)[0] - return True - - -DEFAULT_LOGGING = { - 'version': 1, - 'disable_existing_loggers': False, - 'loggers': { - 'hpack': { - 'level': 'ERROR', - }, - 'aio_pika': { - 'level': 'ERROR', - }, - 'aiormq': { - 'level': 'ERROR', - }, - 'aioscrapy': { - 'level': 'DEBUG', - }, - } -} - - -def configure_logging(settings=None, install_root_handler=True): - """ - Initialize logging defaults for Scrapy. - - :param settings: settings used to create and configure a handler for the - root logger (default: None). - :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None`` - - :param install_root_handler: whether to install root logging handler - (default: True) - :type install_root_handler: bool - - This function does: - - - Route warnings and twisted logging through Python standard logging - - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively - - Route stdout to log if LOG_STDOUT setting is True - - When ``install_root_handler`` is True (default), this function also - creates a handler for the root logger according to given settings - (see :ref:`topics-logging-settings`). You can override default options - using ``settings`` argument. When ``settings`` is empty or None, defaults - are used. 
- """ - if not sys.warnoptions: - # Route warnings through python logging - logging.captureWarnings(True) - - dictConfig(DEFAULT_LOGGING) - - if isinstance(settings, dict) or settings is None: - settings = Settings(settings) - - if settings.getbool('LOG_STDOUT'): - sys.stdout = StreamLogger(logging.getLogger('stdout')) +from loguru import logger as _logger - if install_root_handler: - install_scrapy_root_handler(settings) - - -def install_scrapy_root_handler(settings): - global _scrapy_root_handler - - if (_scrapy_root_handler is not None - and _scrapy_root_handler in logging.root.handlers): - logging.root.removeHandler(_scrapy_root_handler) - logging.root.setLevel(logging.NOTSET) - _scrapy_root_handler = _get_handler(settings) - logging.root.addHandler(_scrapy_root_handler) - - -def get_scrapy_root_handler(): - return _scrapy_root_handler - - -_scrapy_root_handler = None - - -def _get_handler(settings): - """ Return a log handler object according to settings """ - filename = settings.get('LOG_FILE') - if filename: - encoding = settings.get('LOG_ENCODING') - max_bytes = settings.get('LOG_MAX_BYTES', 50*1024*1024) - backup_count = settings.get('LOG_BACKUP_COUNT', 10) - handler = RotatingFileHandler(filename, encoding=encoding, maxBytes=max_bytes, backupCount=backup_count) - elif settings.getbool('LOG_ENABLED'): - handler = logging.StreamHandler() - else: - handler = logging.NullHandler() - - formatter = logging.Formatter( - fmt=settings.get('LOG_FORMAT'), - datefmt=settings.get('LOG_DATEFORMAT') - ) - handler.setFormatter(formatter) - handler.setLevel(settings.get('LOG_LEVEL')) - if settings.getbool('LOG_SHORT_NAMES'): - handler.addFilter(TopLevelFormatter(['aioscrapy'])) - return handler - - -def log_scrapy_info(settings): - logger.info("Aioscrapy %(version)s started (bot: %(bot)s)", - {'version': aioscrapy.__version__, 'bot': settings['BOT_NAME']}) - - -class StreamLogger: - """Fake file-like stream object that redirects writes to a logger instance - - Taken from: - https://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/ - """ - def __init__(self, logger, log_level=logging.INFO): - self.logger = logger - self.log_level = log_level - self.linebuf = '' - - def write(self, buf): - for line in buf.rstrip().splitlines(): - self.logger.log(self.log_level, line.rstrip()) - - def flush(self): - for h in self.logger.handlers: - h.flush() +from aioscrapy import Settings, Spider +from aioscrapy.exceptions import AioScrapyDeprecationWarning +_logger.remove(0) -class LogCounterHandler(logging.Handler): - """Record log levels count into a crawler stats""" - def __init__(self, crawler, *args, **kwargs): - super().__init__(*args, **kwargs) - self.crawler = crawler +def configure_logging(spider: Type[Spider], settings: Settings): + formatter = settings.get('LOG_FORMAT') + level = settings.get('LOG_LEVEL', 'INFO') + enqueue = settings.get('ENQUEUE', True) + if settings.get('LOG_STDOUT', True): + _logger.add( + sys.stderr, format=formatter, level=level, enqueue=enqueue, + filter=lambda record: record["extra"].get("spidername") == spider.name, + ) - def emit(self, record): - sname = f'log_count/{record.levelname}' - self.crawler.stats.inc_value(sname) + if filename := settings.get('LOG_FILE'): + rotation = settings.get('LOG_ROTATION', '20MB') + retention = settings.get('LOG_RETENTION', 10) + encoding = settings.get('LOG_ENCODING', 'utf-8') + _logger.add( + sink=filename, format=formatter, encoding=encoding, level=level, + enqueue=enqueue, rotation=rotation, 
retention=retention, + filter=lambda record: record["extra"].get("spidername") == spider.name, + ) def logformatter_adapter(logkws): @@ -187,10 +47,19 @@ def logformatter_adapter(logkws): 'deprecated, use `msg` instead', AioScrapyDeprecationWarning) - level = logkws.get('level', logging.INFO) + level = logkws.get('level', "INFO") message = logkws.get('format', logkws.get('msg')) # NOTE: This also handles 'args' being an empty dict, that case doesn't # play well in logger.log calls args = logkws if not logkws.get('args') else logkws['args'] + return level, message, args + + +class AioScrapyLogger: + + def __getattr__(self, item): + spider_name = asyncio.current_task().get_name() + return getattr(_logger.bind(spidername=spider_name), item) + - return (level, message, args) +logger: Type[_logger] = AioScrapyLogger() diff --git a/aioscrapy/utils/signal.py b/aioscrapy/utils/signal.py index 6f1110d..6382ab8 100644 --- a/aioscrapy/utils/signal.py +++ b/aioscrapy/utils/signal.py @@ -1,12 +1,12 @@ """Helper functions for working with signals""" import asyncio -import logging from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers from pydispatch.robustapply import robustApply -from aioscrapy.exceptions import StopDownload -logger = logging.getLogger(__name__) +from aioscrapy.exceptions import StopDownload +from aioscrapy.utils.log import logger +from aioscrapy.utils.tools import create_task class _IgnoredException(Exception): @@ -22,10 +22,7 @@ async def robustApplyWrap(f, recv, *args, **kw): return await result except (Exception, BaseException) as exc: # noqa: E722 if dont_log is None or not isinstance(exc, dont_log): - logger.error("Error caught on signal handler: %(receiver)s", - {'receiver': recv}, - exc_info=exc, - extra={'spider': spider}) + logger.exception(f"Error caught on signal handler: {recv}") return exc @@ -48,10 +45,12 @@ async def send_catch_log_deferred(signal=Any, sender=Anonymous, *arguments, **na """ dfds = [] for receiver in liveReceivers(getAllReceivers(sender, signal)): - dfds.append(asyncio.create_task( - robustApplyWrap(robustApply, receiver, signal=signal, sender=sender, *arguments, **named))) - res = await asyncio.gather(*dfds) - return res + dfds.append( + create_task( + robustApplyWrap(robustApply, receiver, signal=signal, sender=sender, *arguments, **named) + ) + ) + return await asyncio.gather(*dfds) def disconnect_all(signal=Any, sender=Any): @@ -60,4 +59,3 @@ def disconnect_all(signal=Any, sender=Any): """ for receiver in liveReceivers(getAllReceivers(sender, signal)): disconnect(receiver, signal=signal, sender=sender) - diff --git a/aioscrapy/utils/spider.py b/aioscrapy/utils/spider.py index 7459699..156d8c2 100644 --- a/aioscrapy/utils/spider.py +++ b/aioscrapy/utils/spider.py @@ -1,12 +1,8 @@ import inspect -import logging from aioscrapy.spiders import Spider -logger = logging.getLogger(__name__) - - def iter_spider_classes(module): """Return an iterator over all spider classes defined in the given module that can be instantiated (i.e. 
which have name) @@ -16,9 +12,9 @@ def iter_spider_classes(module): for obj in vars(module).values(): if ( - inspect.isclass(obj) - and issubclass(obj, Spider) - and obj.__module__ == module.__name__ - and getattr(obj, 'name', None) + inspect.isclass(obj) + and issubclass(obj, Spider) + and obj.__module__ == module.__name__ + and getattr(obj, 'name', None) ): yield obj diff --git a/aioscrapy/utils/tools.py b/aioscrapy/utils/tools.py index 9364211..0bd856d 100644 --- a/aioscrapy/utils/tools.py +++ b/aioscrapy/utils/tools.py @@ -68,3 +68,11 @@ def exec_js_func(js_file_path, func_name, func_params=None, cwd_path=None, cmd_p js = ''.join(lines) js_context = execjs.get(name).compile(js, cwd=cwd_path) return js_context.call(func_name, *func_params) + + +def create_task(coros, name=None): + """ Propagate the parent task's name to every spawned coroutine task """ + return asyncio.create_task( + coros, + name=asyncio.current_task().get_name() + ) diff --git a/example/singlespider/demo_duplicate.py b/example/singlespider/demo_duplicate.py index 22e9eef..2fba3f3 100644 --- a/example/singlespider/demo_duplicate.py +++ b/example/singlespider/demo_duplicate.py @@ -16,18 +16,18 @@ class DemoDuplicateSpider(Spider): "CLOSE_SPIDER_ON_IDLE": True, # 'LOG_FILE': 'test.log', - # 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.RFPDupeFilter', # fingerprint dedup stored in local files + 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.RFPDupeFilter', # fingerprint dedup stored in local files # 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.RFPDupeFilter', # dedup via a Redis set - 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.BloomDupeFilter', # dedup via a Bloom filter + # 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.BloomDupeFilter', # dedup via a Bloom filter - 'SCHEDULER_QUEUE_CLASS': 'aioscrapy.queue.redis.SpiderPriorityQueue', + # 'SCHEDULER_QUEUE_CLASS': 'aioscrapy.queue.redis.SpiderPriorityQueue', 'SCHEDULER_SERIALIZER': 'aioscrapy.serializer.JsonSerializer', - 'REDIS_ARGS': { - 'queue': { - 'url': 'redis://:@192.168.43.165:6379/10', - 'max_connections': 2, - } - } + # 'REDIS_ARGS': { + # 'queue': { + # 'url': 'redis://:@192.168.43.165:6379/10', + # 'max_connections': 2, + # } + # } } async def start_requests(self): diff --git a/example/singlespider/demo_metric.py b/example/singlespider/demo_metric.py new file mode 100644 index 0000000..0a7705a --- /dev/null +++ b/example/singlespider/demo_metric.py @@ -0,0 +1,66 @@ +import logging + +from aioscrapy import Request +from aioscrapy.spiders import Spider + +logger = logging.getLogger(__name__) + + +class DemoMetricSpider(Spider): + name = 'DemoMetricSpider' + custom_settings = dict( + USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36", + # DOWNLOAD_DELAY=3, + # RANDOMIZE_DOWNLOAD_DELAY=True, + # CONCURRENT_REQUESTS=1, + LOG_LEVEL='DEBUG', + CLOSE_SPIDER_ON_IDLE=True, + + + EXTENSIONS={ + 'aioscrapy.libs.extensions.metric.Metric': 0, + }, + + # http (intrusive): push metrics to influxdb2 over its HTTP write API + # METRIC_INFLUXDB_URL="http://127.0.0.1:8086/api/v2/write?org=spiderman&bucket=spider-metric&precision=ns", + # METRIC_INFLUXDB_TOKEN="YequFPGDEuukHUG9l8l2nlaatufGQK_UOD7UBpo3KvB8jIg5-cFa89GLXYfgk76M2sHvEtERpAXK7_fMNsBjAA==", + + # log + vector (non-intrusive): write metrics to a separate log file and let vector ship them to influxdb2 + METRIC_LOG_ARGS=dict(sink='DemoMetricSpider.metric', rotation='20MB', retention=3) + ) + + start_urls = ['https://quotes.toscrape.com'] + + @staticmethod + async def process_request(request, spider): + """ request middleware """ + pass + + @staticmethod + async def process_response(request, response, spider): + """ response middleware """ + return response +
@staticmethod + async def process_exception(request, exception, spider): + """ exception middleware """ + pass + + async def parse(self, response): + for quote in response.css('div.quote'): + yield { + 'author': quote.xpath('span/small/text()').get(), + 'text': quote.css('span.text::text').get(), + } + + next_page = response.css('li.next a::attr("href")').get() + if next_page is not None: + # yield response.follow(next_page, self.parse) + yield Request(f"https://quotes.toscrape.com{next_page}", callback=self.parse) + + # async def process_item(self, item): + # print(item) + + +if __name__ == '__main__': + DemoMetricSpider.start() diff --git a/example/singlespider/demo_queue_memory.py b/example/singlespider/demo_queue_memory.py index 6d48837..2a06d9d 100644 --- a/example/singlespider/demo_queue_memory.py +++ b/example/singlespider/demo_queue_memory.py @@ -13,7 +13,7 @@ class DemoMemorySpider(Spider): # 'DOWNLOAD_DELAY': 3, # 'RANDOMIZE_DOWNLOAD_DELAY': True, # 'CONCURRENT_REQUESTS': 1, - # 'LOG_LEVEL': 'INFO', + 'LOG_LEVEL': 'INFO', "CLOSE_SPIDER_ON_IDLE": True, } @@ -40,14 +40,15 @@ async def parse(self, response): 'author': quote.xpath('span/small/text()').get(), 'text': quote.css('span.text::text').get(), } + raise Exception(111) next_page = response.css('li.next a::attr("href")').get() if next_page is not None: # yield response.follow(next_page, self.parse) yield Request(f"https://quotes.toscrape.com{next_page}", callback=self.parse) - async def process_item(self, item): - print(item) + # async def process_item(self, item): + # print(item) if __name__ == '__main__': diff --git a/example/singlespider/demo_request_httpx.py b/example/singlespider/demo_request_httpx.py index e72008d..31606a4 100644 --- a/example/singlespider/demo_request_httpx.py +++ b/example/singlespider/demo_request_httpx.py @@ -53,8 +53,8 @@ async def parse(self, response): # yield response.follow(next_page, self.parse) yield Request(f"https://quotes.toscrape.com/{next_page}", callback=self.parse) - async def process_item(self, item): - print(item) + # async def process_item(self, item): + # print(item) if __name__ == '__main__': diff --git a/example/singlespider/start.py b/example/singlespider/start.py index 5118a21..5440aa9 100644 --- a/example/singlespider/start.py +++ b/example/singlespider/start.py @@ -4,9 +4,9 @@ sys.path.append(os.path.dirname(os.path.dirname(os.getcwd()))) from aioscrapy.process import multi_process_run, single_process_run -from demo_memory import DemoMemorySpider -from demo_httpx import DemoHttpxSpider -from demo_playwright import DemoPlaywrightSpider +from demo_queue_memory import DemoMemorySpider +from demo_request_httpx import DemoHttpxSpider +from demo_request_playwright import DemoPlaywrightSpider if __name__ == '__main__': diff --git a/setup.py b/setup.py index a607ece..44c6b43 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ 'zope.interface>=5.1.0', "redis>=4.3.1", "aiomultiprocess>=0.9.0", + "loguru>=0.7.0", ] extras_require = { "all": [
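The logging pieces above fit together as follows: configure_logging() installs a loguru sink filtered on extra['spidername'], AioScrapyLogger binds that name from the current asyncio task, and create_task() in aioscrapy/utils/tools.py carries the task name over to child coroutines. The standalone sketch below is illustrative only (it is not aioscrapy code) and assumes the crawler names each spider's root task after the spider, which happens outside the hunks shown here:

# Standalone sketch of the per-spider log routing (illustrative, not aioscrapy code).
# Assumption: the spider's root task is named after the spider, and child tasks
# inherit that name, as create_task() above arranges.
import asyncio
import sys

from loguru import logger as _logger

_logger.remove()  # drop loguru's default stderr handler


def add_spider_sink(spider_name: str) -> None:
    # One sink per spider: only records bound with this spider's name get through.
    _logger.add(
        sys.stderr,
        level="DEBUG",
        filter=lambda record: record["extra"].get("spidername") == spider_name,
    )


async def crawl() -> None:
    # Mirrors AioScrapyLogger: the spider name is read from the running task.
    spider_name = asyncio.current_task().get_name()
    _logger.bind(spidername=spider_name).info("routed to this spider's sink only")


async def main() -> None:
    add_spider_sink("demo_spider")
    # The task is named after the spider; create_task() preserves such names
    # for any coroutines the spider spawns.
    await asyncio.create_task(crawl(), name="demo_spider")


if __name__ == "__main__":
    asyncio.run(main())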
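On the metric side, the keys of METRIC_LOG_ARGS in demo_metric.py (sink, rotation, retention) mirror the parameters of loguru's logger.add(). Assuming the Metric extension simply forwards them to a dedicated sink (an assumption; the extension is added earlier in this patch and not shown here), the "log + vector" option would reduce to something like the following, with the 'metric' tag being a purely hypothetical detail:

# Hypothetical shape of the "log + vector" option: a dedicated loguru sink that
# writes metric records to their own rotating file for vector to collect.
# Parameter values come from demo_metric.py; the 'metric' tag is invented here.
from loguru import logger

metric_sink_id = logger.add(
    sink="DemoMetricSpider.metric",   # file that vector would tail
    rotation="20MB",                  # rotate after roughly 20 MB
    retention=3,                      # keep the last 3 rotated files
    filter=lambda record: record["extra"].get("metric", False),
)

# A metric record would then be emitted roughly like this:
logger.bind(metric=True).info("response_received=1")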
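The "intrusive" alternative, METRIC_INFLUXDB_URL, points at the standard InfluxDB 2.x write endpoint. Purely as an illustration of what that option implies, and not of the extension's actual schema (the measurement, tag, and field names below are invented), a single line-protocol point can be written like this:

# Illustration of writing one line-protocol point to the InfluxDB 2.x write API
# referenced by METRIC_INFLUXDB_URL. Replace the token with your own.
import asyncio

import aiohttp

INFLUX_URL = "http://127.0.0.1:8086/api/v2/write?org=spiderman&bucket=spider-metric&precision=ns"
INFLUX_TOKEN = "<your influxdb2 token>"


async def write_metric_point() -> None:
    # Line protocol: <measurement>,<tag>=<value> <field>=<value>
    line = "spider_metric,spider=DemoMetricSpider response_received=1i"
    async with aiohttp.ClientSession() as session:
        async with session.post(
            INFLUX_URL,
            data=line,
            headers={"Authorization": f"Token {INFLUX_TOKEN}"},
        ) as resp:
            resp.raise_for_status()  # InfluxDB answers 204 No Content on success


if __name__ == "__main__":
    asyncio.run(write_metric_point())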