From 96e9b8e0c62364f8ab8b07938495275d3b1ee1b5 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Sun, 12 Jan 2025 04:03:21 +0200 Subject: [PATCH] improve output --- md_dead_link_check/__main__.py | 2 +- md_dead_link_check/config.py | 8 ++++ md_dead_link_check/helpers.py | 27 +++++++++-- md_dead_link_check/link_checker.py | 70 ++++++++++++++++++--------- tests/test_link_cheker.py | 76 ++++++++++++++++-------------- 5 files changed, 118 insertions(+), 65 deletions(-) diff --git a/md_dead_link_check/__main__.py b/md_dead_link_check/__main__.py index c739e01..855459e 100644 --- a/md_dead_link_check/__main__.py +++ b/md_dead_link_check/__main__.py @@ -53,7 +53,7 @@ def main() -> int: files = list(md_data) status_list = check_all_links(md_data, config, repo_dir, files, files_in_repo) - err_num = summary(status_list, args.warn, args.all, args.no_color) + err_num = summary(status_list, args.warn, args.all, args.no_color, config) return min(err_num, 1) diff --git a/md_dead_link_check/config.py b/md_dead_link_check/config.py index a198aee..8439867 100644 --- a/md_dead_link_check/config.py +++ b/md_dead_link_check/config.py @@ -46,4 +46,12 @@ def get_config(root_dir: Path, config_path: Optional[Path]) -> Config: f"Unexpected config key `{key}` in {config_path.name}. " f"Available keys: [{', '.join(config.__annotations__)}]" ) + if not isinstance(config.timeout, int) or config.timeout < 1: + raise ValueError("`timeout` must be an integer greater than or equal to 1.") + if not isinstance(config.throttle_groups, int) or config.throttle_groups < 1: + raise ValueError("`throttle_groups` must be an integer greater than or equal to 1.") + if not isinstance(config.throttle_delay, int) or config.throttle_delay < 0: + raise ValueError("`throttle_delay` must be a non-negative float or integer.") + if not isinstance(config.throttle_max_delay, int) or config.throttle_max_delay < 0: + raise ValueError("`throttle_max_delay` must be a non-negative integer.") return config diff --git a/md_dead_link_check/helpers.py b/md_dead_link_check/helpers.py index aad2500..fdb281c 100644 --- a/md_dead_link_check/helpers.py +++ b/md_dead_link_check/helpers.py @@ -3,6 +3,8 @@ from pathlib import Path from typing import List +from md_dead_link_check.config import Config +from md_dead_link_check.link_checker import Status from md_dead_link_check.link_checker import StatusInfo @@ -38,7 +40,7 @@ def disable_colors(self) -> None: setattr(self, key, "") -def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_color: bool) -> int: +def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_color: bool, config: Config) -> int: """ Print summary. Returns 0 if not found any error, otherwise 1. @@ -47,19 +49,34 @@ def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_colo if no_color: specs.disable_colors() err_nums = 0 + count_429 = 0 + for x in status: link_msg = ( f"{specs.blue}File:{specs.clean} {x.link_info.get_location()}" f" {specs.split} {specs.blue}Link:{specs.clean} {x.link_info.link}" ) - if x.err_msg: - print(f"{link_msg} {specs.split} {specs.red}Error{specs.clean}: {x.err_msg}") + if x.msg is not None and x.msg == "429: To Many Request": + count_429 += 1 + + if x.status == Status.ERROR: + print(f"{link_msg} {specs.split} {specs.red}Error{specs.clean}: {x.msg}") err_nums += 1 - elif x.warn_msg and (print_warn or print_all): - print(f"{link_msg} {specs.split} {specs.yellow}Warn{specs.clean}: {x.warn_msg}") + elif x.status == Status.WARNING and (print_warn or print_all): + print(f"{link_msg} {specs.split} {specs.yellow}Warn{specs.clean}: {x.msg}") elif print_all: print(f"{link_msg} {specs.split} {specs.green}OK{specs.clean}") + if count_429: + print( + f"{specs.yellow}WARNING:{specs.clean} " + f"Found {count_429} link{'s' if count_429 > 1 else ''} with \"429: To Many Request\" respond code.\n" + f"Wait some time and try again or update configuration file to increase delay, for example:\n" + f"throttle_groups = {max(1, config.throttle_groups // 2)}\n" + f"throttle_delay = {max(1, config.throttle_delay) // 2}\n" + f"throttle_max_delay = {config.throttle_max_delay * 4}\n" + ) + if err_nums: cat_repeat = 0 if no_color else max(min(err_nums // 10, 5), 1) print(f"{specs.fail}Found {err_nums} dead link{'s' if err_nums >1 else ''}" + specs.cat_fail * cat_repeat) diff --git a/md_dead_link_check/link_checker.py b/md_dead_link_check/link_checker.py index 387ab9b..96b2827 100644 --- a/md_dead_link_check/link_checker.py +++ b/md_dead_link_check/link_checker.py @@ -3,6 +3,7 @@ import asyncio from collections import defaultdict from dataclasses import dataclass +from enum import Enum from fnmatch import fnmatch from pathlib import Path from typing import Dict, List, Optional @@ -27,21 +28,27 @@ IGNORED_PROTOCOLS = ("ftp", "sftp") +class Status(int, Enum): + OK = 0 + WARNING = 1 + ERROR = 2 + + @dataclass class StatusInfo: link_info: LinkInfo - err_msg: Optional[str] = None - warn_msg: Optional[str] = None + status: Status + msg: Optional[str] = None def __lt__(self, other: StatusInfo) -> bool: - return self.link_info < other.link_info + return self.status < other.status or (self.status == other.status and self.link_info < other.link_info) @dataclass class LinkStatus: link: str - err_msg: Optional[str] = None - warn_msg: Optional[str] = None + status: Status + msg: Optional[str] = None @dataclass @@ -81,22 +88,22 @@ async def process_link(data: LinkWithDelay, session: ClientSession, config: Conf response.raise_for_status() except ClientResponseError as e: if not config.catch_response_codes or e.status in config.catch_response_codes: - return LinkStatus(link, f"{e.status}: {e.message}") - return LinkStatus(link, warn_msg=f"{e.status}: {e.message}") + return LinkStatus(link, Status.ERROR, f"{e.status}: {e.message}") + return LinkStatus(link, Status.WARNING, f"{e.status}: {e.message}") except asyncio.CancelledError as e: - return LinkStatus(link, str(e)) + return LinkStatus(link, Status.ERROR, str(e)) except ClientConnectorError as e: - return LinkStatus(link, str(e)) + return LinkStatus(link, Status.ERROR, str(e)) except asyncio.TimeoutError: if TIMEOUT_RESPONSE_CODE in config.catch_response_codes: - return LinkStatus(link, err_msg=MSG_TIMEOUT) - return LinkStatus(link, warn_msg=MSG_TIMEOUT) + return LinkStatus(link, Status.ERROR, MSG_TIMEOUT) + return LinkStatus(link, Status.WARNING, MSG_TIMEOUT) except Exception as e: msg = str(e) if not msg: msg = MSG_UNKNOWN_ERROR - return LinkStatus(link, err_msg=msg) - return LinkStatus(link) + return LinkStatus(link, Status.ERROR, msg) + return LinkStatus(link, Status.OK) async def async_check_links(links: List[LinkWithDelay], config: Config) -> List[LinkStatus]: @@ -105,16 +112,33 @@ async def async_check_links(links: List[LinkWithDelay], config: Config) -> List[ return ret +def calculate_delay(counter: int, config: Config) -> int: + return min(counter // config.throttle_groups * config.throttle_delay, config.throttle_max_delay) + + def generate_delays_for_one_domain_links(links: List[str], config: Config) -> List[LinkWithDelay]: domain_count: Dict[str, int] = defaultdict(int) ret: List[LinkWithDelay] = [] for link in links: domain = urlsplit(link).netloc - delay = min(domain_count[domain] // config.throttle_groups * config.throttle_delay, config.throttle_max_delay) + delay = calculate_delay(domain_count[domain], config) ret.append(LinkWithDelay(link, delay)) domain_count[domain] += 1 + enabled_throttling = {domain for domain, count in domain_count.items() if count - 1 > config.throttle_groups} + if enabled_throttling: + print("Throttling applied to limit request frequency:") + for domain, count in domain_count.items(): + if count > config.throttle_groups: + max_delay = calculate_delay(domain_count[domain] - 1, config) + print(f" - Domain: {domain}") + print(f" Requests count: {count}") + if max_delay == config.throttle_max_delay: + print(f" Maximum delay: {max_delay} seconds (reached throttle_max_delay)") + else: + print(f" Maximum delay: {max_delay} seconds") + return ret @@ -150,7 +174,7 @@ def check_web_links(md_data: Dict[str, MarkdownInfo], config: Config, files: Lis for wl in web_links: li_status = links_status_dict[wl.link] - ret.append(StatusInfo(wl, err_msg=li_status.err_msg, warn_msg=li_status.warn_msg)) + ret.append(StatusInfo(wl, li_status.status, li_status.msg)) return ret @@ -173,7 +197,7 @@ def check_path_links( try: split_result = urlsplit(md_link.link) except ValueError: - ret.append(StatusInfo(md_link, MSG_PARSING_ERROR)) + ret.append(StatusInfo(md_link, Status.ERROR, MSG_PARSING_ERROR)) continue if split_result.scheme or split_result.netloc: @@ -182,7 +206,7 @@ def check_path_links( if not split_result.path: if fragment not in md_file_info.fragments: - ret.append(StatusInfo(md_link, MSG_FRAGMENT_NOT_FOUND)) + ret.append(StatusInfo(md_link, Status.ERROR, MSG_FRAGMENT_NOT_FOUND)) continue else: try: @@ -194,28 +218,28 @@ def check_path_links( abs_path = (md_abs_path.parent / split_result.path).resolve() rel_path = abs_path.relative_to(root_dir) except ValueError: - ret.append(StatusInfo(md_link, MSG_PATH_NOT_FOUND)) + ret.append(StatusInfo(md_link, Status.ERROR, MSG_PATH_NOT_FOUND)) continue if abs_path.as_posix() != abs_path.resolve().as_posix(): - ret.append(StatusInfo(md_link, MSG_PATH_NOT_FOUND)) + ret.append(StatusInfo(md_link, Status.ERROR, MSG_PATH_NOT_FOUND)) continue if rel_path.as_posix() in md_data: # Markdowns in repository if fragment and fragment not in md_data[rel_path.as_posix()].fragments: - ret.append(StatusInfo(md_link, MSG_FRAGMENT_NOT_FOUND)) + ret.append(StatusInfo(md_link, Status.ERROR, MSG_FRAGMENT_NOT_FOUND)) continue else: # Not markdown file if not any(f.as_posix().startswith(rel_path.as_posix()) for f in files_in_repo): if rel_path.exists(): - ret.append(StatusInfo(md_link, MSG_PATH_NOT_ADDED)) + ret.append(StatusInfo(md_link, Status.ERROR, MSG_PATH_NOT_ADDED)) else: - ret.append(StatusInfo(md_link, MSG_PATH_NOT_FOUND)) + ret.append(StatusInfo(md_link, Status.ERROR, MSG_PATH_NOT_FOUND)) continue - ret.append(StatusInfo(md_link)) + ret.append(StatusInfo(md_link, Status.OK)) return ret diff --git a/tests/test_link_cheker.py b/tests/test_link_cheker.py index 85cfb88..4b7c78e 100644 --- a/tests/test_link_cheker.py +++ b/tests/test_link_cheker.py @@ -5,6 +5,7 @@ from md_dead_link_check.config import Config from md_dead_link_check.link_checker import LinkWithDelay from md_dead_link_check.link_checker import MarkdownInfo +from md_dead_link_check.link_checker import Status from md_dead_link_check.link_checker import StatusInfo from md_dead_link_check.link_checker import check_all_links from md_dead_link_check.link_checker import check_web_links @@ -24,7 +25,7 @@ def test_check_link(url, msg): config = Config() data = {"test.md": MarkdownInfo("test.md", links=[LinkInfo(url, Path("test.md"), 0)])} [r] = check_web_links(data, config, ["test.md"]) - assert r.err_msg == msg + assert r.msg == msg TEST_FILES = [Path("tests/test_md_files/fail.md"), Path("tests/test_md_files/a.md")] @@ -37,10 +38,10 @@ def test_fails(): ret = check_all_links(md_data, Config(), root_dir, list(md_data.keys()), TEST_FILES) # Output message depends on proxy settings - ret[1].err_msg = None - ret[1].warn_msg = None - ret[7].err_msg = None - ret[7].warn_msg = None + ret[1].msg = None + ret[1].status = None + ret[7].msg = None + ret[7].status = None ref = [ StatusInfo( link_info=LinkInfo( @@ -48,7 +49,8 @@ def test_fails(): location=Path("tests/test_md_files/fail.md"), line_num=3, ), - err_msg="404: Not Found", + status=Status.ERROR, + msg="404: Not Found", ), StatusInfo( link_info=LinkInfo( @@ -56,16 +58,18 @@ def test_fails(): location=Path("tests/test_md_files/fail.md"), line_num=4, ), - err_msg=None, - warn_msg=None, + status=None, + msg=None, ), StatusInfo( link_info=LinkInfo(link="/test/fail.md1", location=Path("tests/test_md_files/fail.md"), line_num=8), - err_msg="Path not found", + status=Status.ERROR, + msg="Path not found", ), StatusInfo( link_info=LinkInfo(link="fail.md1", location=Path("tests/test_md_files/fail.md"), line_num=9), - err_msg="Path not found", + status=Status.ERROR, + msg="Path not found", ), StatusInfo( link_info=LinkInfo( @@ -73,8 +77,8 @@ def test_fails(): location=Path("tests/test_md_files/fail.md"), line_num=13, ), - err_msg="Fragment not found", - warn_msg=None, + status=Status.ERROR, + msg="Fragment not found", ), StatusInfo( link_info=LinkInfo( @@ -82,8 +86,8 @@ def test_fails(): location=Path("tests/test_md_files/fail.md"), line_num=15, ), - err_msg="Path not found", - warn_msg=None, + status=Status.ERROR, + msg="Path not found", ), StatusInfo( link_info=LinkInfo( @@ -91,8 +95,8 @@ def test_fails(): location=Path("tests/test_md_files/fail.md"), line_num=17, ), - err_msg="error://urls/", - warn_msg=None, + status=Status.ERROR, + msg="error://urls/", ), StatusInfo( link_info=LinkInfo( @@ -100,8 +104,8 @@ def test_fails(): location=Path("tests/test_md_files/fail.md"), line_num=19, ), - err_msg=None, - warn_msg=None, + msg=None, + status=None, ), StatusInfo( link_info=LinkInfo( @@ -109,8 +113,8 @@ def test_fails(): location=Path("tests/test_md_files/fail.md"), line_num=21, ), - err_msg="Error parsing link", - warn_msg=None, + status=Status.ERROR, + msg="Error parsing link", ), ] assert ret == ref @@ -145,18 +149,18 @@ def test_exclude_links(exclude_links): ) # Output message depends on proxy settings - ret[0].err_msg = None - ret[0].warn_msg = None - ret[4].err_msg = None - ret[4].warn_msg = None + ret[0].status = None + ret[0].msg = None + ret[4].status = None + ret[4].msg = None ref = [ StatusInfo( link_info=LinkInfo( link="https://not_exist_github.githubcom/", location=Path("tests/test_md_files/fail.md"), line_num=4 ), - err_msg=None, - warn_msg=None, + status=None, + msg=None, ), StatusInfo( link_info=LinkInfo( @@ -164,8 +168,8 @@ def test_exclude_links(exclude_links): location=Path("tests/test_md_files/fail.md"), line_num=13, ), - err_msg="Fragment not found", - warn_msg=None, + status=Status.ERROR, + msg="Fragment not found", ), StatusInfo( link_info=LinkInfo( @@ -173,8 +177,8 @@ def test_exclude_links(exclude_links): location=Path("tests/test_md_files/fail.md"), line_num=15, ), - err_msg="Path not found", - warn_msg=None, + status=Status.ERROR, + msg="Path not found", ), StatusInfo( link_info=LinkInfo( @@ -182,8 +186,8 @@ def test_exclude_links(exclude_links): location=Path("tests/test_md_files/fail.md"), line_num=17, ), - err_msg="error://urls/", - warn_msg=None, + status=Status.ERROR, + msg="error://urls/", ), StatusInfo( link_info=LinkInfo( @@ -191,8 +195,8 @@ def test_exclude_links(exclude_links): location=Path("tests/test_md_files/fail.md"), line_num=19, ), - err_msg=None, - warn_msg=None, + status=None, + msg=None, ), StatusInfo( link_info=LinkInfo( @@ -200,8 +204,8 @@ def test_exclude_links(exclude_links): location=Path("tests/test_md_files/fail.md"), line_num=21, ), - err_msg="Error parsing link", - warn_msg=None, + status=Status.ERROR, + msg="Error parsing link", ), ] assert ret == ref