Skip to content

Commit

Permalink
Rate Limiting and Request Throttling (#16)
Browse files Browse the repository at this point in the history
- Add `Rate Limiting and Request Throttling` block in readme 
- Add comment `<!-- md-dead-link-check: off -->` to ignore links
  • Loading branch information
AlexanderDokuchaev authored Jan 12, 2025
1 parent fa46d12 commit 05af041
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 12 deletions.
63 changes: 63 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,66 @@ throttle_groups = 100
throttle_delay = 20
throttle_max_delay = 100
```

## Rate Limiting and Request Throttling

Websites often have limits on how many requests you can make within a certain period.
If you exceed these limits, the server typically responds with a `429 Too Many Requests` status code.

### Failure Handling

By default, the 429 status code is treated as a warning.
You can modify this behavior and configure how the tool handles different status codes.

```toml
catch_response_codes = [404, 410, 429, 500]
```

### Throttling Mechanism

To prevent your requests from overwhelming a website and potentially getting you blocked, this tool implements
a throttling mechanism. This mechanism limits the number of requests that can be made in a given period.

You can control the following parameters to fine-tune request throttling:

```toml
throttle_groups = 40 # default: 100
throttle_delay = 30 # default: 20
throttle_max_delay = 240 # default: 100
```

### Filter Links to Check

Filtering out non-critical links and files reduces the number of requests the tool has to make, which helps you stay within rate limits.

#### Exclude Links by Pattern

Exclude specific URLs that match patterns:

```toml
exclude_links = ["https://github.com/AlexanderDokuchaev/md-dead-link-check/pull/*"]
```

#### Exclude Specific Files

Prevent specific files (e.g., changelogs) from being checked:

```toml
exclude_files = ["CHANGELOG.md"]
```

#### Exclude Parts of Files Using Comments

Ignore a section of a file by placing the special comment `<!-- md-dead-link-check: off -->` before it and `<!-- md-dead-link-check: on -->` after it.

```md
...
<!-- md-dead-link-check: off -->
All links will be ignored in this part of the file.
<!-- md-dead-link-check: on -->
...
```
2 changes: 1 addition & 1 deletion md_dead_link_check/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def main() -> int:
files = list(md_data)

status_list = check_all_links(md_data, config, repo_dir, files, files_in_repo)
err_num = summary(status_list, args.warn, args.all, args.no_color, config)
err_num = summary(status_list, args.warn, args.all, args.no_color)

return min(err_num, 1)

Expand Down
9 changes: 3 additions & 6 deletions md_dead_link_check/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from pathlib import Path
from typing import List

from md_dead_link_check.config import Config
from md_dead_link_check.link_checker import Status
from md_dead_link_check.link_checker import StatusInfo

Expand Down Expand Up @@ -40,7 +39,7 @@ def disable_colors(self) -> None:
setattr(self, key, "")


def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_color: bool, config: Config) -> int:
def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_color: bool) -> int:
"""
Print summary.
Returns 0 if not found any error, otherwise 1.
Expand Down Expand Up @@ -72,10 +71,8 @@ def summary(status: List[StatusInfo], print_warn: bool, print_all: bool, no_colo
f"\n{specs.yellow}WARNING:{specs.clean} "
f"{count_429} link{'s' if count_429 > 1 else ''} returned \"429: Too Many Request\" respond code. "
f"This indicates that one of the servers is being accessed too frequently.\n"
f"Wait and try again later, or adjust the configuration:\n"
f"throttle_groups = {max(1, config.throttle_groups // 2)}\n"
f"throttle_delay = {config.throttle_delay}\n"
f"throttle_max_delay = {config.throttle_max_delay * 2}\n"
f"To more information visit "
"https://github.com/AlexanderDokuchaev/md-dead-link-check/#rate-limiting-and-request-throttling"
)

if err_nums:
Expand Down
26 changes: 21 additions & 5 deletions md_dead_link_check/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
RE_HTML_TAG_HREF = r"<\w+\s+(?:[^>]*?\s+)?href=([\"'])(.*?)\1"
RE_SUB = r"[$`][^`]+?[$`]"

MD_TAG_DISABLE = "<!-- md-dead-link-check: off -->"
MD_TAG_ENABLE = "<!-- md-dead-link-check: on -->"


@dataclass
class LinkInfo:
Expand Down Expand Up @@ -83,6 +86,7 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:
links: List[LinkInfo] = []
with (root_dir / path).open(encoding="utf8") as stream:
in_code_block = ""
disable_detection_links = False
for line_num, line in enumerate(stream.readlines(), 1):
striped_line = line.strip()
# Skip code blocks, which can start with ``` or ````
Expand Down Expand Up @@ -112,6 +116,22 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:
# Skip $ and ` tags
line = re.sub(RE_SUB, "", line)

# Detect id under a tag <a id="introduction"></a>
matches = re.findall(RE_HTML_TAG_ID, line)
for _, id in matches:
fragments.append(id.lower())

if MD_TAG_DISABLE in line:
disable_detection_links = True
continue

if MD_TAG_ENABLE in line:
disable_detection_links = False
continue

if disable_detection_links:
continue

# Detect links
copy_line = line # Used to detect bare links
matches = re.findall(RE_LINK, line)
Expand All @@ -127,11 +147,6 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:
links.append(LinkInfo(link, path, line_num))
copy_line = copy_line.replace(link, "")

# Detect id under a tag <a id="introduction"></a>
matches = re.findall(RE_HTML_TAG_ID, line)
for _, id in matches:
fragments.append(id.lower())

# Detect links under a tag <a href="introduction"></a>
matches = re.findall(RE_HTML_TAG_HREF, line)
for _, link in matches:
Expand All @@ -144,6 +159,7 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:
if url.endswith("."):
url = url[:-1]
links.append(LinkInfo(url, path, line_num))

return MarkdownInfo(path=path, fragments=fragments, links=links)


Expand Down
8 changes: 8 additions & 0 deletions tests/test_md_files/a.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,11 @@ Some text

[ftp](ftp://example.example/example)
https://github.com.

<!-- md-dead-link-check: off -->

https://github.com

<!-- md-dead-link-check: on -->

https://github.com
5 changes: 5 additions & 0 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,5 +168,10 @@ def test_process_md_file():
location=Path("tests/test_md_files/a.md"),
line_num=62,
),
LinkInfo(
link="https://github.com",
location=Path("tests/test_md_files/a.md"),
line_num=70,
),
]
assert md_info.links == ref_links

0 comments on commit 05af041

Please sign in to comment.