From 9844c060f13b7766a4870823bae0d3496b3771a5 Mon Sep 17 00:00:00 2001 From: Alexander Dokuchaev Date: Sat, 11 Jan 2025 23:43:52 +0200 Subject: [PATCH] Detect bare links (#13) Issue: #11 Passing links without any tags ``` https://github.com ``` --- .markdownlint.yaml | 3 ++- md_dead_link_check/preprocess.py | 12 ++++++++++++ tests/test_md_files/a.md | 6 +++--- tests/test_preprocess.py | 12 +++++++++++- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.markdownlint.yaml b/.markdownlint.yaml index 38c3390..d844474 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -1,7 +1,8 @@ # Default state for all rules default: true +MD003: false # heading-style MD013: false # Line length MD033: false # Inline HTML +MD034: false # no-bare-urls MD041: false # First line -MD003: false # heading-style diff --git a/md_dead_link_check/preprocess.py b/md_dead_link_check/preprocess.py index 3a1ca73..9f785c4 100644 --- a/md_dead_link_check/preprocess.py +++ b/md_dead_link_check/preprocess.py @@ -9,6 +9,7 @@ from git import Repo RE_HEADER = r"^(?:\s*[-+*]\s+|)[#]{1,6}\s*(.*?)\s*[#]*$" +RE_URL = r"(http[s]?://[^>)\]\s\"]+)" RE_LINK = r"([!]{0,1})\[([^\]!]*)\]\(([^()\s]+(?:\([^()\s]*\))*)\s*(.*?)\)" RE_HTML_TAG = r"]*>" RE_HTML_TAG_ID = r"<\w+\s+(?:[^>]*?\s+)?(?:id|name)=([\"'])(.*?)\1" @@ -113,9 +114,11 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo: line = re.sub(RE_SUB, "", line) # Detect links + copy_line = line # Used to detect bare links matches = re.findall(RE_LINK, line) for img_tag, text, link, title in matches: links.append(LinkInfo(link, path, line_num)) + copy_line = copy_line.replace(link, "") if matches: # For case [![text](img_link)](link) @@ -123,6 +126,7 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo: matches2 = re.findall(RE_LINK, sub_line) for img_tag, text, link, title in matches2: links.append(LinkInfo(link, path, line_num)) + copy_line = copy_line.replace(link, "") # Detect id under a tag matches = re.findall(RE_HTML_TAG_ID, line) @@ -133,6 +137,14 @@ def process_md_file(path: Path, root_dir: Path) -> MarkdownInfo: matches = re.findall(RE_HTML_TAG_HREF, line) for _, link in matches: links.append(LinkInfo(link, path, line_num)) + copy_line = copy_line.replace(link, "") + + # Detect simple urls without any tags + matches = re.findall(RE_URL, copy_line) + for url in matches: + if url.endswith("."): + url = url[:-1] + links.append(LinkInfo(url, path, line_num)) return MarkdownInfo(path=path, fragments=fragments, links=links) diff --git a/tests/test_md_files/a.md b/tests/test_md_files/a.md index 6937d35..3b1f11e 100644 --- a/tests/test_md_files/a.md +++ b/tests/test_md_files/a.md @@ -24,9 +24,9 @@ Some text ## links ## -[github](https://github.com/AlexanderDokuchaev) +[github](https://github.com/AlexanderDokuchaev) https://github.com [b](./b.md) [d.a](b.md) `[A+B](A)` -d.ad.a +d.ad.a [d.a](/tests/test_md_files/d/a.md "tag") ### Header with `quotes` and $math$ @@ -59,4 +59,4 @@ Some text mail [ftp](ftp://example.example/example) - +https://github.com. diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 6cf3e3f..aae8a53 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -72,6 +72,11 @@ def test_process_md_file(): location=Path("tests/test_md_files/a.md"), line_num=27, ), + LinkInfo( + link="https://github.com", + location=Path("tests/test_md_files/a.md"), + line_num=27, + ), LinkInfo( link="./b.md", location=Path("tests/test_md_files/a.md"), @@ -88,7 +93,7 @@ def test_process_md_file(): line_num=29, ), LinkInfo( - link="./d/a.md", + link="https://github.com", location=Path("tests/test_md_files/a.md"), line_num=29, ), @@ -157,5 +162,10 @@ def test_process_md_file(): location=Path("tests/test_md_files/a.md"), line_num=61, ), + LinkInfo( + link="https://github.com", + location=Path("tests/test_md_files/a.md"), + line_num=62, + ), ] assert md_info.links == ref_links