feat: metric & logger
ConlinH committed Sep 27, 2023
1 parent 4325f34 commit d79e7ed
Showing 57 changed files with 474 additions and 572 deletions.
2 changes: 1 addition & 1 deletion aioscrapy/VERSION
@@ -1 +1 @@
-1.3.1
+2.0.0
2 changes: 1 addition & 1 deletion aioscrapy/__init__.py
@@ -13,7 +13,7 @@


__all__ = [
-'__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Crawler'
+'__version__', 'version_info', 'Spider', 'Request', 'FormRequest', 'Crawler', 'Settings'
]


12 changes: 5 additions & 7 deletions aioscrapy/core/downloader/__init__.py
@@ -1,5 +1,4 @@
import asyncio
-import logging
import random
from abc import abstractmethod
from collections import deque
@@ -16,10 +15,9 @@
from aioscrapy.settings import Settings
from aioscrapy.signalmanager import SignalManager
from aioscrapy.utils.httpobj import urlparse_cached
+from aioscrapy.utils.log import logger
from aioscrapy.utils.misc import load_instance
-from aioscrapy.utils.tools import call_helper
-
-logger = logging.getLogger('aioscrapy.downloader')
+from aioscrapy.utils.tools import call_helper, create_task


class BaseDownloaderMeta(type):
@@ -135,7 +133,7 @@ def __init__(
self.active: Set[Request] = set()
self.slots: dict = {}
self.running: bool = True
-asyncio.create_task(self._slot_gc(60))
+create_task(self._slot_gc(60))

@classmethod
async def from_crawler(cls, crawler) -> "Downloader":
@@ -170,13 +168,13 @@ async def _process_queue(self, slot: Slot) -> None:
slot.delay_lock = True
await asyncio.sleep(penalty)
slot.delay_lock = False
-asyncio.create_task(self._process_queue(slot))
+create_task(self._process_queue(slot))
return

while slot.queue and slot.free_transfer_slots() > 0:
request = slot.queue.popleft()
slot.transferring.add(request)
-asyncio.create_task(self._download(slot, request))
+create_task(self._download(slot, request))
if delay:
break

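Note: this commit swaps direct asyncio.create_task calls for a create_task helper imported from aioscrapy.utils.tools. The helper's body is not part of the hunks shown; a minimal sketch of what such a wrapper typically does (hold a strong reference so a pending fire-and-forget task cannot be garbage-collected) might look like:

import asyncio

_background_tasks = set()

def create_task(coro):
    # Hypothetical sketch, not the actual aioscrapy implementation.
    # Keeping a strong reference prevents the event loop from dropping
    # a still-running task that nothing else references.
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return task
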
8 changes: 2 additions & 6 deletions aioscrapy/core/downloader/handlers/__init__.py
@@ -1,18 +1,16 @@
"""Download handlers for different schemes"""

-import logging
from abc import abstractmethod
from typing import Optional

from aioscrapy import signals, Request, Spider
from aioscrapy.exceptions import NotConfigured, NotSupported
from aioscrapy.http import HtmlResponse
from aioscrapy.utils.httpobj import urlparse_cached
+from aioscrapy.utils.log import logger
from aioscrapy.utils.misc import load_instance
from aioscrapy.utils.python import without_none_values

-logger = logging.getLogger(__name__)


class BaseDownloadHandler:
@abstractmethod
@@ -67,9 +65,7 @@ async def _load_handler(self, scheme: str) -> Optional[BaseDownloadHandler]:
self._notconfigured[scheme] = str(ex)
return None
except Exception as ex:
-logger.error('Loading "%(clspath)s" for scheme "%(scheme)s"',
-{"clspath": path, "scheme": scheme},
-exc_info=True, extra={'crawler': self._crawler})
+logger.exception(f'Loading "{path}" for scheme "{scheme}"')
self._notconfigured[scheme] = str(ex)
return None
else:
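The logger.exception(f'...') call style introduced throughout this commit, with no exc_info or extra plumbing, matches loguru's API rather than the stdlib logging module it replaces. Assuming the new aioscrapy/utils/log.py re-exports a loguru logger (the module itself is not among the hunks shown), the import every handler now uses could be as simple as:

# Hypothetical sketch of aioscrapy/utils/log.py; loguru is an assumption
# inferred from the call style, not confirmed by this diff.
from loguru import logger

__all__ = ['logger']
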
6 changes: 2 additions & 4 deletions aioscrapy/core/downloader/handlers/aiohttp.py
@@ -1,5 +1,4 @@
import asyncio
-import logging
import re
import ssl
from typing import Optional
@@ -10,8 +9,7 @@
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import HtmlResponse
from aioscrapy.settings import Settings

-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger


class AioHttpDownloadHandler(BaseDownloadHandler):
@@ -41,7 +39,7 @@ async def download_request(self, request: Request, _) -> HtmlResponse:
'data': request.body or None,
'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
'dont_redirect') is None else request.meta.get('dont_redirect'),
-'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 10),
+'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20),
}

headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
6 changes: 2 additions & 4 deletions aioscrapy/core/downloader/handlers/httpx.py
@@ -1,4 +1,3 @@
-import logging
import ssl

import httpx
@@ -7,8 +6,7 @@
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import HtmlResponse
from aioscrapy.settings import Settings

-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger


class HttpxDownloadHandler(BaseDownloadHandler):
@@ -43,7 +41,7 @@ async def download_request(self, request: Request, _) -> HtmlResponse:
'verify': request.meta.get('verify_ssl', self.verify_ssl),
'follow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
'dont_redirect') is None else request.meta.get('dont_redirect'),
-'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 10),
+'max_redirects': self.settings.getint('REDIRECT_MAX_TIMES', 20),
})
ssl_ciphers = request.meta.get('TLS_CIPHERS')
ssl_protocol = request.meta.get('ssl_protocol', self.ssl_protocol)
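Both the aiohttp and httpx handlers above raise the fallback for REDIRECT_MAX_TIMES from 10 to 20. The fallback only applies when the setting is absent, so an explicit project-level value still takes precedence, e.g.:

# settings.py (project settings): an explicit value overrides the
# handlers' new fallback of 20
REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 5
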
4 changes: 0 additions & 4 deletions aioscrapy/core/downloader/handlers/playwright/__init__.py
@@ -1,5 +1,3 @@
-import logging

from aioscrapy import Request
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import PlaywrightResponse
@@ -8,8 +6,6 @@
from .driverpool import WebDriverPool
from .webdriver import PlaywrightDriver

-logger = logging.getLogger(__name__)


class PlaywrightHandler(BaseDownloadHandler):
def __init__(self, settings: Settings):
4 changes: 1 addition & 3 deletions aioscrapy/core/downloader/handlers/pyhttpx.py
@@ -1,14 +1,12 @@
import asyncio
-import logging

import pyhttpx

from aioscrapy import Request
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import HtmlResponse
from aioscrapy.settings import Settings

-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger


class PyhttpxDownloadHandler(BaseDownloadHandler):
4 changes: 1 addition & 3 deletions aioscrapy/core/downloader/handlers/requests.py
@@ -1,14 +1,12 @@
import asyncio
-import logging

import requests

from aioscrapy import Request
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
from aioscrapy.http import HtmlResponse
from aioscrapy.settings import Settings

-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger


class RequestsDownloadHandler(BaseDownloadHandler):
35 changes: 13 additions & 22 deletions aioscrapy/core/engine.py
@@ -1,10 +1,9 @@
# _*_ coding: utf-8 _*_

import asyncio
-import logging
-from typing import Optional, AsyncGenerator, Union, Callable
from asyncio import Queue
from asyncio.queues import QueueEmpty
+from typing import Optional, AsyncGenerator, Union, Callable

import aioscrapy
from aioscrapy import Spider
@@ -17,9 +16,9 @@
from aioscrapy.http.request import Request
from aioscrapy.utils.log import logformatter_adapter
from aioscrapy.utils.misc import load_instance
-from aioscrapy.utils.tools import call_helper
+from aioscrapy.utils.tools import call_helper, create_task

-logger = logging.getLogger(__name__)
+from aioscrapy.utils.log import logger


class Slot:
@@ -72,7 +71,7 @@ async def start(
while not self.finish:
self.running and await self._next_request()
await asyncio.sleep(1)
-self.enqueue_cache_num != 1 and asyncio.create_task(self._crawl())
+self.enqueue_cache_num != 1 and create_task(self._crawl())
self.running and await self._spider_idle(self.spider)

async def stop(self, reason: str = 'shutdown') -> None:
@@ -83,7 +82,7 @@ async def stop(self, reason: str = 'shutdown') -> None:

while not self.is_idle():
await asyncio.sleep(0.2)
-self.enqueue_cache_num != 1 and asyncio.create_task(self._crawl())
+self.enqueue_cache_num != 1 and create_task(self._crawl())
await self.close_spider(self.spider, reason=reason)
await self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
self.finish = True
@@ -93,7 +92,7 @@ async def open(
spider: Spider,
start_requests: Optional[AsyncGenerator] = None
) -> None:
logger.info("Spider opened", extra={'spider': spider})
logger.info("Spider opened")

self.spider = spider
await call_helper(self.crawler.stats.open_spider, spider)
@@ -145,7 +144,7 @@ async def _next_request(self) -> None:
self.slot.start_requests = None
except Exception as e:
self.slot.start_requests = None
-logger.error('Error while obtaining start requests', exc_info=e, extra={'spider': self.spider})
+logger.exception('Error while obtaining start requests')
else:
request and await self.crawl(request)
finally:
@@ -177,9 +176,7 @@ async def handle_downloader_output(

result.request = request
if isinstance(result, Response):
-logkws = self.logformatter.crawled(request, result, self.spider)
-if logkws is not None:
-logger.log(*logformatter_adapter(logkws), extra={'spider': self.spider})
+logger.log(** self.logformatter.crawled(request, result, self.spider))
await self.signals.send_catch_log(signals.response_received,
response=result, request=request, spider=self.spider)
await self.scraper.enqueue_scrape(result, request)
@@ -207,7 +204,7 @@ def is_idle(self) -> bool:
async def crawl(self, request: Request) -> None:
if self.enqueue_cache_num == 1:
await self.scheduler.enqueue_request(request)
-asyncio.create_task(self._next_request())
+create_task(self._next_request())
else:
await self.enqueue_cache.put(request)

@@ -224,15 +221,13 @@ async def _crawl(self) -> None:
break
if requests:
await call_helper(self.scheduler.enqueue_request_batch, requests)
-asyncio.create_task(self._next_request())
+create_task(self._next_request())
self.enqueue_unlock = True

async def close_spider(self, spider: Spider, reason: str = 'cancelled') -> None:
"""Close (cancel) spider and clear all its outstanding requests"""

logger.info("Closing spider (%(reason)s)",
{'reason': reason},
extra={'spider': spider})
logger.info(f"Closing spider ({reason})")

async def close_handler(
callback: Callable,
Expand All @@ -243,11 +238,7 @@ async def close_handler(
try:
await call_helper(callback, *args, **kwargs)
except (Exception, BaseException) as e:
-logger.error(
-errmsg,
-exc_info=e,
-extra={'spider': spider}
-)
+logger.exception(errmsg)

await close_handler(self.downloader.close, errmsg='Downloader close failure')

Expand All @@ -260,7 +251,7 @@ async def close_handler(

await close_handler(self.crawler.stats.close_spider, spider, reason=reason, errmsg='Stats close failure')

logger.info("Spider closed (%(reason)s)", {'reason': reason}, extra={'spider': spider})
logger.info(f"Spider closed ({reason})")

await close_handler(setattr, self, 'slot', None, errmsg='Error while unassigning slot')
