From 401727b7ab7810331c21b7e65cd682927a8ae293 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Sun, 3 Mar 2024 17:36:16 +0200 Subject: [PATCH] fnmatch --- README.md | 70 +++++++++++++++++------------- md_dead_link_check/link_checker.py | 15 ++++--- md_dead_link_check/preprocess.py | 6 +-- tests/test_link_cheker.py | 20 ++++----- 4 files changed, 62 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 15f35b7..6500564 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ -# Markdown dead link checker +# Markdown Dead Link Checker -This is a lightweight and fast tool to help you keep your Markdown files free of broken links! -It scans your Markdown files and verifies whether each link is still active, letting you know if any need attention. +This handy tool helps you maintain the integrity of your Markdown files by identifying broken links. +It scans your files and detects: -## Detection issues +Here's what it does: -- Unavailable web links. -- Incorrect links to file. -- Not existed fragments in markdown, like `[no-fragment](README.md#no-fragment)`. +- Missing webpages: Links that no longer exist on the internet. +- Incorrect file links: Links that point to the wrong file in your project. +- Non-existent fragments (anchors): Links to specific sections that don't exist, e.g. `README.md#no-fragment`. Example of output for [fail.md](tests/test_md_files/fail.md) @@ -20,9 +20,22 @@ File: tests/test_md_files/fail.md:13 • Link: a.md#fail • Error: Not found fr ❌ Found 5 dead links 🙀 ``` -## Usage +## Performance -### Github action +This tool utilizes asynchronous API calls and avoids downloading full web pages, +enabling it to process thousands links in several seconds. + +## Proxy + +This tool works in accordance with your system's proxy settings for HTTP and HTTPS requests. +These settings are typically found in your environment variables and are identified +by the names `HTTP_PROXY` and `HTTPS_PROXY`. 
+
+## How to Use It
+
+### Option 1: GitHub Actions
+
+Add a GitHub Action config to `.github/workflows/`
 
 ```yaml
 jobs:
   markdown-link-check:
     runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v4
-      - uses: AlexanderDokuchaev/md-dead-link-check@main
+      - uses: AlexanderDokuchaev/md-dead-link-check@latest
 ```
 
-### Pre-commit hook
+### Option 2: Pre-Commit
 
-Adding to your .pre-commit-config.yaml
+Add to your `.pre-commit-config.yaml` to integrate with the [pre-commit](https://pre-commit.com/) tool
 
 ```yaml
   - repo: https://github.com/AlexanderDokuchaev/md-dead-link-check
-    rev: main
+    rev: latest
     hooks:
       - id: md-dead-link-check
 ```
 
-### Install rom pip
-
-```bash
-pip install md-dead-link-check
-md-dead-link-check
-```
+### Option 3: Install from pip
 
-### Install github repo
+For direct use, install with pip and run:
 
 ```bash
-git clone https://github.com/AlexanderDokuchaev/md-dead-link-check
-cd md-dead-link-check
-pip install .
+pip install md-dead-link-check
+md-dead-link-check
 ```
 
 ## Configuration
 
-By default use `pyproject.toml`, to use another config toml file use `--config`.
+This tool seamlessly integrates with your project's `pyproject.toml` file for configuration.
+To leverage a different file, invoke the `--config` option during execution.
+
+- timeout: Specifies the maximum time (in seconds) to wait for web link responses. Default: `10` seconds.
+- exclude_links: Accepts a list of links to exclude from checks. Default: `[]`.
+- exclude_files: Accepts a list of files to exclude from checks. Default: `[]`.
+- check_web_links: Toggle web link checks on or off. Set to `false` to focus solely on file-based links. Default: `true`.
 
-- timeout - timeout to response web link, defaults `10`.
-- exclude_links - disable fails for links, defaults `[]`.
-- exclude_files - disable check for file, defaults `[]`.
-- check_web_links - to disable check web links, defaults `true`.
+> [!TIP]
+> Leverage wildcard patterns ([fnmatch](https://docs.python.org/3/library/fnmatch.html) syntax) for flexible exclusions in both `exclude_links` and `exclude_files` lists.
 
 ```toml
 [tool.md_dead_link_check]
 timeout = 10
-exclude_links = ["https://github.com/"]
-exclude_files = ["tests/test_md_files/fail.md"]
+exclude_links = ["https://github.com/", "*localhost*"]
+exclude_files = ["tests/test_md_files/fail.md", "tests/*"]
 check_web_links = true
 ```
diff --git a/md_dead_link_check/link_checker.py b/md_dead_link_check/link_checker.py
index d280dd8..866ec5a 100644
--- a/md_dead_link_check/link_checker.py
+++ b/md_dead_link_check/link_checker.py
@@ -3,6 +3,7 @@
 import asyncio
 import os
 from dataclasses import dataclass
+from fnmatch import fnmatch
 from pathlib import Path
 from typing import Any, Dict, List, Mapping, Optional
 from urllib.parse import urlsplit
@@ -35,8 +36,8 @@ def get_proxies(env: Mapping[str, Any]) -> Dict[str, Optional[str]]:
     Find proxies in environment.
     """
     return {
-        "http": env.get("http_proxy", env.get("HTTP_PROXY")),
-        "https": env.get("https_proxy", env.get("HTTPS_PROXY")),
+        "http": env.get("HTTP_PROXY", env.get("http_proxy")),
+        "https": env.get("HTTPS_PROXY", env.get("https_proxy")),
     }
@@ -89,10 +90,10 @@ def check_web_links(md_data: Dict[str, MarkdownInfo], config: Config, files: Lis
         if md_file not in md_data:
             continue
         md_file_info = md_data[md_file]
-        if md_file in config.exclude_files:
+        if any(fnmatch(md_file, p) for p in config.exclude_files):
             continue
         for li in md_file_info.links:
-            if li.link in config.exclude_links:
+            if any(fnmatch(li.link, p) for p in config.exclude_links):
                 continue
             if urlsplit(li.link).netloc:
                 web_links.append(li)
@@ -103,11 +104,11 @@ def check_path_links(md_data: Dict[str, MarkdownInfo], root_dir: Path, config: Config) -> List[StatusInfo]:
     ret = []
     for md_file, md_file_info in md_data.items():
-        if md_file in config.exclude_files:
+        if any(fnmatch(md_file, p) for p in config.exclude_files):
             continue
         md_abs_path = root_dir / md_file_info.path
         for md_link in md_file_info.links:
-            if md_link.link in config.exclude_links:
+            if any(fnmatch(md_link.link, p) for p in config.exclude_links):
                 continue
             split_result = urlsplit(md_link.link)
             if split_result.netloc:
diff --git a/md_dead_link_check/preprocess.py b/md_dead_link_check/preprocess.py
index 0b171a9..9b9138b 100644
--- a/md_dead_link_check/preprocess.py
+++ b/md_dead_link_check/preprocess.py
@@ -10,7 +10,7 @@
 RE_HEADER = r"^[#]{1,6}\s*(.*)"
 RE_LINK = r"([!]{0,1})\[[^\]]*\]\((([^\s)]+)(?:\s*(.*?))?)\)"
-RE_ID = r".*?<\/a>"
+RE_HTML_A_TAG_ID = r".*?<\/a>"
 RE_SUB = r"[$`][^`]+?[$`]"
@@ -99,8 +99,8 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo:
                     # for link [link](img_link)
                     links.append(LinkInfo(full_link, path, line_num))
-            # Detect id
-            matches = re.findall(RE_ID, line)
+            # Detect id under a tag
+            matches = re.findall(RE_HTML_A_TAG_ID, line)
             if matches:
                 for id in matches:
                     fragments.append(id)
diff --git a/tests/test_link_cheker.py b/tests/test_link_cheker.py
index b6e3203..134978f 100644
--- a/tests/test_link_cheker.py
+++ b/tests/test_link_cheker.py
@@ -87,13 +87,21 @@ def test_exclude_files():
     assert ret == []
 
-def test_exclude_links():
+@pytest.mark.parametrize(
+    "exclude_links",
+    (
+        ["https://github.com/AlexanderDokuchaev/FAILED", "fail.md1", "/test/fail.md1"],
+        ["https://github.com/AlexanderDokuchaev/*", "*.md1"],
+    ),
+    ids=["no_re", "re"],
+)
+def test_exclude_links(exclude_links):
     path = "tests/test_md_files/fail.md"
     root_dir = Path(__file__).parent.parent
     md_data = {path: process_md_file(Path(path), root_dir)}
     ret = check_all_links(
         md_data,
-        Config(exclude_links=["https://github.com/AlexanderDokuchaev/FAILED", "fail.md1"]),
+        Config(exclude_links=exclude_links),
         root_dir,
         list(md_data.keys()),
     )
@@ -110,13
+118,5 @@ def test_exclude_links(): ), err_msg="", ), - StatusInfo( - link_info=LinkInfo( - link="/test/fail.md1", - location=Path("tests/test_md_files/fail.md"), - line_num=8, - ), - err_msg="Path does not exist", - ), ] assert ret == ref