-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgen_backlinks.py
51 lines (39 loc) · 1.52 KB
/
gen_backlinks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from collections.abc import Iterable
import json
from pathlib import Path
from bs4 import BeautifulSoup
def extract_links_from_html(file_path: Path) -> set[str]:
    """
    Extract all internal link targets from the given HTML file.

    :param file_path: Path to the HTML file to parse.
    :returns: Set of ``href`` values with any leading ``"./"`` removed;
        external links (``http://`` or ``https://``) are excluded.
    """
    with file_path.open("r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
    link_results: Iterable[dict[str, str]] = soup.find_all("a", href=True)
    return {
        # removeprefix strips only a literal leading "./". The previous
        # lstrip("./") treated its argument as a character SET and would
        # mangle hrefs such as "../page.html" (-> "page.html").
        a["href"].removeprefix("./")
        for a in link_results
        # Exclude external URLs; the original check missed plain http://.
        if not a["href"].startswith(("http://", "https://"))
    }
def build_backlinks(directory: Path) -> dict[str, list[str]]:
    """
    Build a mapping from each link target to the HTML files linking to it.

    :param directory: Root directory searched recursively for ``*.html``.
    :returns: Dict whose keys are link targets (plus the name of every
        scanned file, even when nothing links to it) and whose values are
        the names of the HTML files that contain a link to that key.
    """
    backlinks: dict[str, list[str]] = {}
    for html_file in directory.rglob("*.html"):
        # Register every scanned file so it appears even with zero backlinks.
        backlinks.setdefault(html_file.name, [])
        for link in extract_links_from_html(html_file):
            # setdefault replaces the manual "if key not in dict" dance.
            backlinks.setdefault(link, []).append(html_file.name)
    return backlinks
if __name__ == "__main__":
    # Scan the notes directory and persist the backlink index as JSON.
    notes_dir = Path("./app/src/lib/notes")
    target_path = Path("app/src/lib/backlinks.json")
    index = build_backlinks(notes_dir)
    with target_path.open("w", encoding="utf-8") as out:
        json.dump(index, out, indent=4)
    print(f"Backlinks saved to {target_path}")