diff --git a/README.md b/README.md index daca9c7..cb8f3e7 100644 --- a/README.md +++ b/README.md @@ -157,3 +157,66 @@ throttle_groups = 100 throttle_delay = 20 throttle_max_delay = 100 ``` + +## Rate Limiting and Request Throttling + +Websites often have limits on how many requests you can make within a certain period. +If these limits are exceeded, the server will return a 429 Too Many Requests status code. + +### Failure Handling + +By default, the 429 status code is treated as a warning. +You can modify this behavior and configure how the tool handles different status codes. + +```toml +catch_response_codes = [404, 410, 429, 500] +``` + +### Throttling Mechanism + +To prevent your requests from overwhelming a website and potentially getting you blocked, this tool implements +a throttling mechanism. This mechanism limits the number of requests that can be made in a given period. + +You can control the following parameters to fine-tune request throttling: + +```toml +throttle_groups = 40 # default: 100 +throttle_delay = 30 # default: 20 +throttle_max_delay = 240 # default: 100 +``` + +### Filter Links to Check + +By filtering out non-critical links and files, you can stay within rate limits while throttling requests. + +#### Exclude Links by Pattern + +Exclude specific URLs that match patterns: + +```toml +exclude_links = ["https://github.com/AlexanderDokuchaev/md-dead-link-check/pull/*"] +``` + +#### Exclude Specific Files + +Prevent specific files (e.g., changelogs) from being checked: + +```toml +exclude_files = ["CHANGELOG.md"] +``` + +#### Exclude Parts of Files Using Comments + +Ignore sections of files using a special comment `<!-- md-dead-link-check: off -->`. + +```md +... + +<!-- md-dead-link-check: off --> + +All links will be ignored in this part of the file. + +<!-- md-dead-link-check: on --> + +... 
+``` diff --git a/md_dead_link_check/__main__.py b/md_dead_link_check/__main__.py index 855459e..c739e01 100644 --- a/md_dead_link_check/__main__.py +++ b/md_dead_link_check/__main__.py @@ -53,7 +53,7 @@ def main() -> int: files = list(md_data) status_list = check_all_links(md_data, config, repo_dir, files, files_in_repo) - err_num = summary(status_list, args.warn, args.all, args.no_color, config) + err_num = summary(status_list, args.warn, args.all, args.no_color) return min(err_num, 1) diff --git a/md_dead_link_check/helpers.py b/md_dead_link_check/helpers.py index c1bad3e..5f1d37d 100644 --- a/md_dead_link_check/helpers.py +++ b/md_dead_link_check/helpers.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import List -from md_dead_link_check.config import Config from md_dead_link_check.link_checker import Status from md_dead_link_check.link_checker import StatusInfo @@ -40,7 +39,7 @@ def disable_colors(self) -> None: setattr(self, key, "") -def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_color: bool, config: Config) -> int: +def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_color: bool) -> int: """ Print summary. Returns 0 if not found any error, otherwise 1. @@ -72,10 +71,8 @@ def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_colo f"\n{specs.yellow}WARNING:{specs.clean} " f"{count_429} link{'s' if count_429 > 1 else ''} returned \"429: Too Many Request\" respond code. 
" f"This indicates that one of the servers is being accessed too frequently.\n" - f"Wait and try again later, or adjust the configuration:\n" - f"throttle_groups = {max(1, config.throttle_groups // 2)}\n" - f"throttle_delay = {config.throttle_delay}\n" - f"throttle_max_delay = {config.throttle_max_delay * 2}\n" + f"To more information visit " + "https://github.com/AlexanderDokuchaev/md-dead-link-check/#rate-limiting-and-request-throttling" ) if err_nums: diff --git a/md_dead_link_check/preprocess.py b/md_dead_link_check/preprocess.py index f377b44..e043dad 100644 --- a/md_dead_link_check/preprocess.py +++ b/md_dead_link_check/preprocess.py @@ -16,6 +16,9 @@ RE_HTML_TAG_HREF = r"<\w+\s+(?:[^>]*?\s+)?href=([\"'])(.*?)\1" RE_SUB = r"[$`][^`]+?[$`]" +MD_TAG_DISABLE = "" +MD_TAG_ENABLE = "" + @dataclass class LinkInfo: @@ -83,6 +86,7 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo: links: List[LinkInfo] = [] with (root_dir / path).open(encoding="utf8") as stream: in_code_block = "" + disable_detection_links = False for line_num, line in enumerate(stream.readlines(), 1): striped_line = line.strip() # Skip code blocks that can be start ``` or ```` @@ -112,6 +116,22 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo: # Skip $ and ` tags line = re.sub(RE_SUB, "", line) + # Detect id under a tag + matches = re.findall(RE_HTML_TAG_ID, line) + for _, id in matches: + fragments.append(id.lower()) + + if MD_TAG_DISABLE in line: + disable_detection_links = True + continue + + if MD_TAG_ENABLE in line: + disable_detection_links = False + continue + + if disable_detection_links: + continue + # Detect links copy_line = line # Used to detect bare links matches = re.findall(RE_LINK, line) @@ -127,11 +147,6 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo: links.append(LinkInfo(link, path, line_num)) copy_line = copy_line.replace(link, "") - # Detect id under a tag - matches = re.findall(RE_HTML_TAG_ID, line) - for _, id in matches: 
- fragments.append(id.lower()) - # Detect links under a tag matches = re.findall(RE_HTML_TAG_HREF, line) for _, link in matches: @@ -144,6 +159,7 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo: if url.endswith("."): url = url[:-1] links.append(LinkInfo(url, path, line_num)) + return MarkdownInfo(path=path, fragments=fragments, links=links) diff --git a/tests/test_md_files/a.md b/tests/test_md_files/a.md index 3b1f11e..3d46a59 100644 --- a/tests/test_md_files/a.md +++ b/tests/test_md_files/a.md @@ -60,3 +60,11 @@ Some text [ftp](ftp://example.example/example) https://github.com. + +<!-- md-dead-link-check: off --> + +https://github.com + +<!-- md-dead-link-check: on --> + +https://github.com diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index a10010e..33476be 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -168,5 +168,10 @@ def test_process_md_file(): location=Path("tests/test_md_files/a.md"), line_num=62, ), + LinkInfo( + link="https://github.com", + location=Path("tests/test_md_files/a.md"), + line_num=70, + ), ] assert md_info.links == ref_links