Skip to content

Commit

Permalink
fnmatch
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexanderDokuchaev committed Mar 3, 2024
1 parent e8f4be4 commit 401727b
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 49 deletions.
70 changes: 40 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Markdown dead link checker
# Markdown Dead Link Checker

This is a lightweight and fast tool to help you keep your Markdown files free of broken links!
It scans your Markdown files and verifies whether each link is still active, letting you know if any need attention.
This handy tool helps you maintain the integrity of your Markdown files by identifying broken links.
It scans your files and detects:

## Detection issues
Here's what it does:

- Unavailable web links.
- Incorrect links to file.
- Not existed fragments in markdown, like `[no-fragment](README.md#no-fragment)`.
- Missing webpages: Links that no longer exist on the internet.
- Incorrect file links: Links that point to the wrong file in your project.
- Non-existent fragments (anchors): Links to specific sections that don't exist, e.g. `README.md#no-fragment`.

Example of output for [fail.md](tests/test_md_files/fail.md)

Expand All @@ -20,59 +20,69 @@ File: tests/test_md_files/fail.md:13 • Link: a.md#fail • Error: Not found fr
❌ Found 5 dead links 🙀
```

## Usage
## Performance

### Github action
This tool utilizes asynchronous API calls and avoids downloading full web pages,
enabling it to process thousands of links in several seconds.

## Proxy

This tool works in accordance with your system's proxy settings for HTTP and HTTPS requests.
These settings are typically found in your environment variables and are identified
by the names `HTTP_PROXY` and `HTTPS_PROXY`.

## How to Use It

### Option 1: GitHub Actions

Add a GitHub Actions workflow config to `.github/workflows/`:

```yaml
jobs:
md-dead-link-check:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- uses: AlexanderDokuchaev/md-dead-link-check@main
- uses: AlexanderDokuchaev/md-dead-link-check@latest
```
### Pre-commit hook
### Option 2: Pre-Commit
Adding to your .pre-commit-config.yaml
Add the following to your `.pre-commit-config.yaml` to integrate with the [pre-commit](https://pre-commit.com/) tool:

```yaml
- repo: https://github.com/AlexanderDokuchaev/md-dead-link-check
rev: main
rev: latest
hooks:
- id: md-dead-link-check
```

### Install rom pip
```bash
pip install md-dead-link-check
md-dead-link-check
```
### Option 3: Install from pip

### Install github repo
For direct use, install with pip and run:

```bash
git clone https://github.com/AlexanderDokuchaev/md-dead-link-check
cd md-dead-link-check
pip install .
pip install md-dead-link-check
md-dead-link-check
```

## Configuration

By default use `pyproject.toml`, to use another config toml file use `--config`.
This tool seamlessly integrates with your project's `pyproject.toml` file for configuration.
To leverage a different file, invoke the `--config` option during execution.

- timeout: Specifies the maximum time (in seconds) to wait for web link responses. Default: `10` seconds.
- exclude_links: Accepts a list of links to exclude from checks. Default: `[]`.
- exclude_files: Accepts a list of files to exclude from checks. Default: `[]`.
- check_web_links: Toggle web link checks on or off. Set to `false` to focus solely on file-based links. Default: `true`.

- timeout - timeout to response web link, defaults `10`.
- exclude_links - disable fails for links, defaults `[]`.
- exclude_files - disable check for file, defaults `[]`.
- check_web_links - to disable check web links, defaults `true`.
> [!TIP]
> Leverage wildcard patterns ([fnmatch](https://docs.python.org/3/library/fnmatch.html) syntax) for flexible exclusions in both `exclude_links` and `exclude_files` lists.

```toml
[tool.md_dead_link_check]
timeout = 10
exclude_links = ["https://github.com/"]
exclude_files = ["tests/test_md_files/fail.md"]
exclude_links = ["https://github.com/", "*localhost*"]
exclude_files = ["tests/test_md_files/fail.md", "tests/*"]
check_web_links = true
```
15 changes: 9 additions & 6 deletions md_dead_link_check/link_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import asyncio
import os
from dataclasses import dataclass
from fnmatch import fnmatch
from pathlib import Path
from typing import Any, Dict, List, Mapping, Optional
from urllib.parse import urlsplit
Expand Down Expand Up @@ -35,8 +36,8 @@ def get_proxies(env: Mapping[str, Any]) -> Dict[str, Optional[str]]:
Find proxies in environment.
"""
return {
"http": env.get("http_proxy", env.get("HTTP_PROXY")),
"https": env.get("https_proxy", env.get("HTTPS_PROXY")),
"http": env.get("HTTP_PROXY", env.get("http_proxy")),
"https": env.get("HTTPS_PROXY", env.get("https_proxy")),
}


Expand Down Expand Up @@ -89,10 +90,10 @@ def check_web_links(md_data: Dict[str, MarkdownInfo], config: Config, files: Lis
if md_file not in md_data:
continue
md_file_info = md_data[md_file]
if md_file in config.exclude_files:
if any(fnmatch(md_file, p) for p in config.exclude_files):
continue
for li in md_file_info.links:
if li.link in config.exclude_links:
if any(fnmatch(li.link, p) for p in config.exclude_links):
continue
if urlsplit(li.link).netloc:
web_links.append(li)
Expand All @@ -103,11 +104,13 @@ def check_web_links(md_data: Dict[str, MarkdownInfo], config: Config, files: Lis
def check_path_links(md_data: Dict[str, MarkdownInfo], root_dir: Path, config: Config) -> List[StatusInfo]:
ret = []
for md_file, md_file_info in md_data.items():
if md_file in config.exclude_files:
if any(fnmatch(md_file, p) for p in config.exclude_files):
continue
md_abs_path = root_dir / md_file_info.path
for md_link in md_file_info.links:
if md_link.link in config.exclude_links:
for p in config.exclude_links:
print(fnmatch(md_link.link, p), md_link.link, p)
if any(fnmatch(md_link.link, p) for p in config.exclude_links):
continue
split_result = urlsplit(md_link.link)
if split_result.netloc:
Expand Down
6 changes: 3 additions & 3 deletions md_dead_link_check/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

RE_HEADER = r"^[#]{1,6}\s*(.*)"
RE_LINK = r"([!]{0,1})\[[^\]]*\]\((([^\s)]+)(?:\s*(.*?))?)\)"
RE_ID = r"<a\s+id=[\"'](.*)[\"']>.*?<\/a>"
RE_HTML_A_TAG_ID = r"<a\s+id=[\"'](.*)[\"']>.*?<\/a>"
RE_SUB = r"[$`][^`]+?[$`]"


Expand Down Expand Up @@ -99,8 +99,8 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:
# for link [link](img_link)
links.append(LinkInfo(full_link, path, line_num))

# Detect id
matches = re.findall(RE_ID, line)
# Detect id under a tag <a id="introduction"></a>
matches = re.findall(RE_HTML_A_TAG_ID, line)
if matches:
for id in matches:
fragments.append(id)
Expand Down
20 changes: 10 additions & 10 deletions tests/test_link_cheker.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,21 @@ def test_exclude_files():
assert ret == []


def test_exclude_links():
@pytest.mark.parametrize(
"exclude_links",
(
["https://github.com/AlexanderDokuchaev/FAILED", "fail.md1", "/test/fail.md1"],
["https://github.com/AlexanderDokuchaev/*", "*.md1"],
),
ids=["no_re", "re"],
)
def test_exclude_links(exclude_links):
path = "tests/test_md_files/fail.md"
root_dir = Path(__file__).parent.parent
md_data = {path: process_md_file(Path(path), root_dir)}
ret = check_all_links(
md_data,
Config(exclude_links=["https://github.com/AlexanderDokuchaev/FAILED", "fail.md1"]),
Config(exclude_links=exclude_links),
root_dir,
list(md_data.keys()),
)
Expand All @@ -110,13 +118,5 @@ def test_exclude_links():
),
err_msg="",
),
StatusInfo(
link_info=LinkInfo(
link="/test/fail.md1",
location=Path("tests/test_md_files/fail.md"),
line_num=8,
),
err_msg="Path does not exist",
),
]
assert ret == ref

0 comments on commit 401727b

Please sign in to comment.