diff --git a/wiki/wiki/report/wiki_broken_links/test_broken_link_checker.py b/wiki/wiki/report/wiki_broken_links/test_broken_link_checker.py index 8b8c83fc..a011e061 100644 --- a/wiki/wiki/report/wiki_broken_links/test_broken_link_checker.py +++ b/wiki/wiki/report/wiki_broken_links/test_broken_link_checker.py @@ -1,22 +1,37 @@ # Copyright (c) 2024, Frappe and Contributors # See license.txt +from unittest.mock import patch + import frappe from frappe.tests.utils import FrappeTestCase from wiki.wiki.report.wiki_broken_links.wiki_broken_links import execute, get_broken_links -BROKEN_LINK = "https://frappewiki.notavalidtld" -BROKEN_IMG_LINK = "https://img.notavalidtld/failed.jpeg" +WORKING_EXTERNAL_URL = "https://frappe.io" +BROKEN_EXTERNAL_URL = "https://frappewiki.notavalidtld" +BROKEN_IMG_URL = "https://img.notavalidtld/failed.jpeg" +WORKING_INTERNAL_URL = "/api/method/ping" +BROKEN_INTERNAL_URL = "/api/method/ring" + + +def internal_to_external_urls(internal_url: str) -> str: + if internal_url == WORKING_INTERNAL_URL: + return WORKING_EXTERNAL_URL + else: + return BROKEN_EXTERNAL_URL + TEST_MD_WITH_BROKEN_LINK = f""" ## Hello -This is a test for a [broken link]({BROKEN_LINK}). +This is a test for a [broken link]({BROKEN_EXTERNAL_URL}). -This is a [valid link](https://frappe.io). +This is a [valid link]({WORKING_EXTERNAL_URL}). +And [this is a correct relative link]({WORKING_INTERNAL_URL}). +And [this is an incorrect relative link]({BROKEN_INTERNAL_URL}). -![Broken Image]({BROKEN_IMG_LINK}) +![Broken Image]({BROKEN_IMG_URL}) """ @@ -41,7 +56,7 @@ def test_returns_correct_broken_links(self): def test_wiki_broken_link_report(self): _, data = execute() self.assertEqual(len(data), 1) - self.assertEqual(data[0]["broken_link"], BROKEN_LINK) + self.assertEqual(data[0]["broken_link"], BROKEN_EXTERNAL_URL) def test_wiki_broken_link_report_with_wiki_space_filter(self): _, data = execute({"wiki_space": self.test_wiki_space.name}) @@ -55,16 +70,29 @@ def test_wiki_broken_link_report_with_wiki_space_filter(self): _, data = execute({"wiki_space": self.test_wiki_space.name}) self.assertEqual(len(data), 1) self.assertEqual(data[0]["wiki_page"], self.test_wiki_page.name) - self.assertEqual(data[0]["broken_link"], BROKEN_LINK) + self.assertEqual(data[0]["broken_link"], BROKEN_EXTERNAL_URL) def test_wiki_broken_link_report_with_image_filter(self): _, data = execute({"check_images": 1}) self.assertEqual(len(data), 2) self.assertEqual(data[0]["wiki_page"], self.test_wiki_page.name) - self.assertEqual(data[0]["broken_link"], BROKEN_LINK) + self.assertEqual(data[0]["broken_link"], BROKEN_EXTERNAL_URL) + + self.assertEqual(data[1]["wiki_page"], self.test_wiki_page.name) + self.assertEqual(data[1]["broken_link"], BROKEN_IMG_URL) + + @patch.object(frappe.utils.data, "get_url", side_effect=internal_to_external_urls) + def test_wiki_broken_link_report_with_internal_links(self, _get_url): + # patch the get_url to return valid/invalid external links instead + # of internal links in test + _, data = execute({"check_internal_links": 1}) + + self.assertEqual(len(data), 2) + self.assertEqual(data[0]["wiki_page"], self.test_wiki_page.name) + self.assertEqual(data[0]["broken_link"], BROKEN_EXTERNAL_URL) self.assertEqual(data[1]["wiki_page"], self.test_wiki_page.name) - self.assertEqual(data[1]["broken_link"], BROKEN_IMG_LINK) + self.assertEqual(data[1]["broken_link"], BROKEN_INTERNAL_URL) def tearDown(self): frappe.db.rollback() diff --git a/wiki/wiki/report/wiki_broken_links/wiki_broken_links.js b/wiki/wiki/report/wiki_broken_links/wiki_broken_links.js index 61db8dd1..bfe4ab40 100644 --- a/wiki/wiki/report/wiki_broken_links/wiki_broken_links.js +++ b/wiki/wiki/report/wiki_broken_links/wiki_broken_links.js @@ -11,9 +11,15 @@ frappe.query_reports["Wiki Broken Links"] = { }, { fieldname: "check_images", - label: __("Check Images?"), + label: __("Include images?"), fieldtype: "Check", default: 1, }, + { + fieldname: "check_internal_links", + label: __("Include internal links?"), + fieldtype: "Check", + default: 0, + }, ], }; diff --git a/wiki/wiki/report/wiki_broken_links/wiki_broken_links.py b/wiki/wiki/report/wiki_broken_links/wiki_broken_links.py index 3f8f8339..2c4daca6 100644 --- a/wiki/wiki/report/wiki_broken_links/wiki_broken_links.py +++ b/wiki/wiki/report/wiki_broken_links/wiki_broken_links.py @@ -61,15 +61,19 @@ def get_data(filters: dict | None = None) -> list[list]: ) include_images = filters and bool(filters.get("check_images")) + check_internal_links = filters and bool(filters.get("check_internal_links")) + for page in wiki_pages: - broken_links_for_page = get_broken_links(page.content, include_images) + broken_links_for_page = get_broken_links(page.content, include_images, check_internal_links) rows = [{"broken_link": link, "wiki_page": page["name"]} for link in broken_links_for_page] data.extend(rows) return data -def get_broken_links(md_content: str, include_images: bool = True): +def get_broken_links( + md_content: str, include_images: bool = True, include_relative_urls: bool = False +) -> list[str]: html = frappe.utils.md_to_html(md_content) soup = BeautifulSoup(html, "html.parser") @@ -80,11 +84,41 @@ def get_broken_links(md_content: str, include_images: bool = True): broken_links = [] for el in links: url = el.attrs.get("href") or el.attrs.get("src") - try: - response = requests.head(url, verify=False, timeout=5) - if response.status_code >= 400: + is_relative = is_relative_url(url) + relative_url = None + + if is_relative and not include_relative_urls: + continue + + if is_relative: + relative_url = url + url = frappe.utils.data.get_url(url) # absolute URL + + is_broken = is_broken_link(url) + if is_broken: + if is_relative: + broken_links.append(relative_url) # original URL + else: broken_links.append(url) - except Exception: - broken_links.append(url) return broken_links + + +def is_relative_url(url: str) -> bool: + return url.startswith("/") + + +def is_broken_link(url: str) -> bool: + try: + status_code = get_request_status_code(url) + if status_code >= 400: + return True + except Exception: + return True + + return False + + +def get_request_status_code(url: str) -> int: + response = requests.head(url, verify=False, timeout=5) + return response.status_code