From 401727b7ab7810331c21b7e65cd682927a8ae293 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Sun, 3 Mar 2024 17:36:16 +0200 Subject: [PATCH] fnmatch --- README.md | 70 +++++++++++++++++------------- md_dead_link_check/link_checker.py | 15 ++++--- md_dead_link_check/preprocess.py | 6 +-- tests/test_link_cheker.py | 20 ++++----- 4 files changed, 62 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 15f35b7..6500564 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -# Markdown dead link checker +# Markdown Dead Link Checker -This is a lightweight and fast tool to help you keep your Markdown files free of broken links! -It scans your Markdown files and verifies whether each link is still active, letting you know if any need attention. +This handy tool helps you maintain the integrity of your Markdown files by identifying broken links. +It scans your files and detects: -## Detection issues +Here's what it does: -- Unavailable web links. -- Incorrect links to file. -- Not existed fragments in markdown, like `[no-fragment](README.md#no-fragment)`. +- Missing webpages: Links that no longer exist on the internet. +- Incorrect file links: Links that point to the wrong file in your project. +- Non-existent fragments (anchors): Links to specific sections that don't exist, e.g. `README.md#no-fragment`. Example of output for [fail.md](tests/test_md_files/fail.md) @@ -20,9 +20,22 @@ File: tests/test_md_files/fail.md:13 • Link: a.md#fail • Error: Not found fr ❌ Found 5 dead links 🙀 ``` -## Usage +## Performance -### Github action +This tool utilizes asynchronous API calls and avoids downloading full web pages, +enabling it to process thousands links in several seconds. + +## Proxy + +This tool works in accordance with your system's proxy settings for HTTP and HTTPS requests. +These settings are typically found in your environment variables and are identified +by the names `HTTP_PROXY` and `HTTPS_PROXY`. 
+
+## How to Use It
+
+### Option 1: GitHub Actions
+
+Add a GitHub Action config to `.github/workflows/`
 
 ```yaml
 jobs:
   markdown-link-check:
     runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
-      - uses: AlexanderDokuchaev/md-dead-link-check@main
+      - uses: AlexanderDokuchaev/md-dead-link-check@latest
 ```
 
-### Pre-commit hook
+### Option 2: Pre-Commit
 
-Adding to your .pre-commit-config.yaml
+Add to your `.pre-commit-config.yaml` to integrate with the [pre-commit](https://pre-commit.com/) tool
 
 ```yaml
   - repo: https://github.com/AlexanderDokuchaev/md-dead-link-check
-    rev: main
+    rev: latest
     hooks:
       - id: md-dead-link-check
 ```
 
-### Install rom pip
-
-```bash
-pip install md-dead-link-check
-md-dead-link-check
-```
+### Option 3: Install from pip
 
-### Install github repo
+For direct use, install with pip and run:
 
 ```bash
-git clone https://github.com/AlexanderDokuchaev/md-dead-link-check
-cd md-dead-link-check
-pip install .
+pip install md-dead-link-check
+md-dead-link-check
 ```
 
 ## Configuration
 
-By default use `pyproject.toml`, to use another config toml file use `--config`.
+This tool seamlessly integrates with your project's `pyproject.toml` file for configuration.
+To leverage a different file, invoke the `--config` option during execution.
+
+- timeout: Specifies the maximum time (in seconds) to wait for web link responses. Default: `10` seconds.
+- exclude_links: Accepts a list of links to exclude from checks. Default: `[]`.
+- exclude_files: Accepts a list of files to exclude from checks. Default: `[]`.
+- check_web_links: Toggle web link checks on or off. Set to `false` to focus solely on file-based links. Default: `true`.
 
-- timeout - timeout to response web link, defaults `10`.
-- exclude_links - disable fails for links, defaults `[]`.
-- exclude_files - disable check for file, defaults `[]`.
-- check_web_links - to disable check web links, defaults `true`.
+> [!TIP]
+> Leverage wildcard patterns ([fnmatch](https://docs.python.org/3/library/fnmatch.html) syntax) for flexible exclusions in both `exclude_links` and `exclude_files` lists.
 
 ```toml
 [tool.md_dead_link_check]
 timeout = 10
-exclude_links = ["https://github.com/"]
-exclude_files = ["tests/test_md_files/fail.md"]
+exclude_links = ["https://github.com/", "*localhost*"]
+exclude_files = ["tests/test_md_files/fail.md", "tests/*"]
 check_web_links = true
 ```
diff --git a/md_dead_link_check/link_checker.py b/md_dead_link_check/link_checker.py
index d280dd8..866ec5a 100644
--- a/md_dead_link_check/link_checker.py
+++ b/md_dead_link_check/link_checker.py
@@ -3,6 +3,7 @@
 import asyncio
 import os
 from dataclasses import dataclass
+from fnmatch import fnmatch
 from pathlib import Path
 from typing import Any, Dict, List, Mapping, Optional
 from urllib.parse import urlsplit
@@ -35,8 +36,8 @@ def get_proxies(env: Mapping[str, Any]) -> Dict[str, Optional[str]]:
     Find proxies in environment.
     """
     return {
-        "http": env.get("http_proxy", env.get("HTTP_PROXY")),
-        "https": env.get("https_proxy", env.get("HTTPS_PROXY")),
+        "http": env.get("HTTP_PROXY", env.get("http_proxy")),
+        "https": env.get("HTTPS_PROXY", env.get("https_proxy")),
     }
@@ -89,10 +90,10 @@ def check_web_links(md_data: Dict[str, MarkdownInfo], config: Config, files: Lis
         if md_file not in md_data:
             continue
         md_file_info = md_data[md_file]
-        if md_file in config.exclude_files:
+        if any(fnmatch(md_file, p) for p in config.exclude_files):
             continue
         for li in md_file_info.links:
-            if li.link in config.exclude_links:
+            if any(fnmatch(li.link, p) for p in config.exclude_links):
                 continue
             if urlsplit(li.link).netloc:
                 web_links.append(li)
@@ -103,11 +104,11 @@ def check_path_links(md_data: Dict[str, MarkdownInfo], root_dir: Path, config: Config) -> List[StatusInfo]:
     ret = []
     for md_file, md_file_info in md_data.items():
-        if md_file in config.exclude_files:
+        if any(fnmatch(md_file, p) for p in config.exclude_files):
             continue
         md_abs_path = root_dir / md_file_info.path
         for md_link in md_file_info.links:
-            if md_link.link in config.exclude_links:
+            if any(fnmatch(md_link.link, p) for p in config.exclude_links):
                 continue
             split_result = urlsplit(md_link.link)
             if split_result.netloc:
diff --git a/md_dead_link_check/preprocess.py b/md_dead_link_check/preprocess.py
index 0b171a9..9b9138b 100644
--- a/md_dead_link_check/preprocess.py
+++ b/md_dead_link_check/preprocess.py
@@ -10,7 +10,7 @@
 RE_HEADER = r"^[#]{1,6}\s*(.*)"
 RE_LINK = r"([!]{0,1})\[[^\]]*\]\((([^\s)]+)(?:\s*(.*?))?)\)"
-RE_ID = r".*?<\/a>"
+RE_HTML_A_TAG_ID = r".*?<\/a>"
 RE_SUB = r"[$`][^`]+?[$`]"
@@ -99,8 +99,8 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:
                     # for link [link](img_link)
                     links.append(LinkInfo(full_link, path, line_num))
-            # Detect id
-            matches = re.findall(RE_ID, line)
+            # Detect id under a tag
+            matches = re.findall(RE_HTML_A_TAG_ID, line)
             if matches:
                 for id in matches:
                     fragments.append(id)
diff --git a/tests/test_link_cheker.py b/tests/test_link_cheker.py
index b6e3203..134978f 100644
--- a/tests/test_link_cheker.py
+++ b/tests/test_link_cheker.py
@@ -87,13 +87,21 @@ def test_exclude_files():
     assert ret == []
 
-def test_exclude_links():
+@pytest.mark.parametrize(
+    "exclude_links",
+    (
+        ["https://github.com/AlexanderDokuchaev/FAILED", "fail.md1", "/test/fail.md1"],
+        ["https://github.com/AlexanderDokuchaev/*", "*.md1"],
+    ),
+    ids=["no_re", "re"],
+)
+def test_exclude_links(exclude_links):
     path = "tests/test_md_files/fail.md"
     root_dir = Path(__file__).parent.parent
     md_data = {path: process_md_file(Path(path), root_dir)}
     ret = check_all_links(
         md_data,
-        Config(exclude_links=["https://github.com/AlexanderDokuchaev/FAILED", "fail.md1"]),
+        Config(exclude_links=exclude_links),
         root_dir,
         list(md_data.keys()),
     )
@@ -110,13
+118,5 @@ def test_exclude_links(): ), err_msg="", ), - StatusInfo( - link_info=LinkInfo( - link="/test/fail.md1", - location=Path("tests/test_md_files/fail.md"), - line_num=8, - ), - err_msg="Path does not exist", - ), ] assert ret == ref