Skip to content

Commit

Permalink
new option to merge schema and necessary refactorings
Browse files Browse the repository at this point in the history
  • Loading branch information
Oliver Kandler committed Mar 4, 2024
1 parent 3d8e1f5 commit 9911221
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 19 deletions.
5 changes: 3 additions & 2 deletions blurry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ async def write_html_file(
extra_context["sibling_pages"] = sibling_pages
folder_in_build = convert_content_path_to_directory_in_build(file_data.path)

schema_type = file_data.front_matter.get("@type")
schema_type = file_data.top_level_type
if not schema_type:
raise ValueError(
f"Required @type value missing in file or TOML front matter invalid: "
Expand Down Expand Up @@ -207,9 +207,10 @@ async def build(release=True):
file_data_by_directory[directory] = []

# Convert Markdown file to HTML
body, front_matter = convert_markdown_file_to_html(filepath)
body, front_matter, top_level_type = convert_markdown_file_to_html(filepath)
file_data = MarkdownFileData(
body=body,
top_level_type=top_level_type,
front_matter=front_matter,
path=relative_filepath,
)
Expand Down
111 changes: 94 additions & 17 deletions blurry/markdown/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from typing import TypeAlias
from typing import TypeGuard

import json
from pyld import jsonld

import mistune
from mistune import BlockState
from mistune.plugins.abbr import abbr
Expand Down Expand Up @@ -149,9 +152,96 @@ def is_blurry_renderer(
+ [plugin.load() for plugin in discovered_markdown_plugins],
)

# JSON-LD context that maps bare terms into the schema.org vocabulary.
SCHEMA_ORG = {"@vocab": "https://schema.org/"}


def jsonld_document_loader(secure=False, fragments=None, **kwargs):
    """Create a pyld document loader that serves side-loaded schema fragments.

    Instead of fetching anything remote, the returned loader always answers
    with the JSON-LD fragments supplied here, serialized as a single JSON
    array. Any fragment without an ``@context`` gets the schema.org vocabulary.

    :param secure: accepted for pyld loader-factory compatibility; unused here.
    :param fragments: the schema fragments (dicts) to side-load.
    :param **kwargs: accepted for pyld loader-factory compatibility; unused.
    :return: the RemoteDocument loader function.
    """
    # Bug fix: the previous default `fragments=[]` was a shared mutable
    # default argument.
    if fragments is None:
        fragments = []

    def loader(ignored, options=None):
        """Return the side-loaded fragments as a RemoteDocument dict.

        :param ignored: the requested URL; ignored because the TOML
            fragments are side-loaded.
        :param options: accepted for pyld loader compatibility; unused.
        :return: the RemoteDocument.
        """
        serialized_fragments = []
        for fragment in fragments:
            # Default to the schema.org context without mutating the
            # caller's dict (the previous version wrote into it in place).
            if not fragment.get("@context"):
                fragment = {**fragment, "@context": SCHEMA_ORG}
            serialized_fragments.append(json.dumps(fragment))

        return {
            "contentType": "application/ld+json",
            "contextUrl": None,
            "documentUrl": None,
            "document": "[" + ",".join(serialized_fragments) + "]",
        }

    return loader

def add_inferred_schema(local_front_matter: dict, filepath: Path) -> dict:
    """Augment front matter with values inferred from the file's location.

    Adds the page URL derived from the file path and, when an ``image``
    property is present, resolves it to an absolute URL and derives a
    thumbnail URL from it.

    :param local_front_matter: the front matter to augment (mutated in place).
    :param filepath: path of the Markdown file inside the content directory.
    :return: the augmented front matter.
    """
    content_dir = get_content_directory()

    # https://schema.org/url
    local_front_matter["url"] = content_path_to_url(filepath.relative_to(content_dir))

    # https://schema.org/image
    # https://schema.org/thumbnailUrl
    if image := local_front_matter.get("image"):
        # Bug fix: this branch previously referenced the undefined name
        # `front_matter` instead of the `local_front_matter` parameter,
        # raising NameError for any page with an image property.
        image_copy = deepcopy(image)
        relative_image_path = get_relative_image_path_from_image_property(image_copy)
        image_path = resolve_relative_path_in_markdown(relative_image_path, filepath)
        local_front_matter["image"] = update_image_with_url(image_copy, image_path)
        local_front_matter["thumbnailUrl"] = image_path_to_thumbnailUrl(image_path)

    return local_front_matter

def resolve_front_matter(state: "BlockState", filepath: Path) -> tuple[dict[str, Any], str | None]:
    """Combine the global SCHEMA_DATA config with the file's front matter.

    When the FRONT_MATTER_RESOLUTION setting is "merge", the global and
    local schema fragments are merged by a JSON-LD compaction against the
    schema.org context; otherwise the local front matter simply overrides
    the global schema data.

    :param state: Mistune block state whose env carries the parsed front matter.
    :param filepath: path of the Markdown file being converted.
    :return: a (front_matter, top_level_type) tuple; top_level_type is the
        top-level schema.org @type from the file's front matter, or None
        when the file declares none.
    """
    if SETTINGS.get("FRONT_MATTER_RESOLUTION") == "merge":
        try:
            global_schema = dict(SETTINGS.get("SCHEMA_DATA", {}))
            if not global_schema.get("@context"):
                global_schema["@context"] = SCHEMA_ORG

            local_schema = state.env.get("front_matter", {})
            top_level_type = local_schema.get("@type")
            if not local_schema.get("@context"):
                local_schema["@context"] = SCHEMA_ORG
            local_schema = add_inferred_schema(local_schema, filepath)

            # Side-load both fragments; the document URL passed to
            # jsonld.compact is ignored by this loader.
            jsonld.set_document_loader(
                jsonld_document_loader(fragments=[global_schema, local_schema])
            )
            front_matter: dict[str, Any] = jsonld.compact("ignore", SCHEMA_ORG)
        except Exception as e:
            print("merging front matter failed:", e)
            # Bare `raise` preserves the original traceback.
            raise
    else:
        # Seed front_matter with schema_data from the config file and let
        # the file's own front matter override it.
        front_matter = dict(SETTINGS.get("SCHEMA_DATA", {}))
        front_matter.update(state.env.get("front_matter", {}))
        front_matter = add_inferred_schema(front_matter, filepath)
        # Bug fix: this was previously hard-coded to None, which made the
        # caller's required-@type check (write_html_file) raise ValueError
        # for every page in the default, non-merge mode.
        top_level_type = front_matter.get("@type")

    return front_matter, top_level_type

def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any], str]:
if not markdown.renderer:
raise Exception("Blurry markdown renderer not set on Mistune Markdown instance")

Expand All @@ -167,26 +257,13 @@ def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]:
html, state = markdown.parse(markdown_text, state=state)

if not is_str(html):
raise Exception(f"Expected html to be a string but got: {type(html)}")
raise Exception(f"Expected html to be a string but got: {top_level_type(html)}")

# Post-process HTML
html = remove_lazy_loading_from_first_image(html)

# Seed front_matter with schema_data from config file
front_matter: dict[str, Any] = dict(SETTINGS.get("SCHEMA_DATA", {}))
front_matter.update(state.env.get("front_matter", {}))

# Add inferred/computed/relative values
# https://schema.org/image
# https://schema.org/thumbnailUrl
front_matter.update({"url": content_path_to_url(filepath.relative_to(CONTENT_DIR))})
if image := front_matter.get("image"):
image_copy = deepcopy(image)
relative_image_path = get_relative_image_path_from_image_property(image_copy)
image_path = resolve_relative_path_in_markdown(relative_image_path, filepath)
front_matter["image"] = update_image_with_url(image_copy, image_path)
front_matter["thumbnailUrl"] = image_path_to_thumbnailUrl(image_path)
return html, front_matter
front_matter, top_level_type = resolve_front_matter(state, filepath)
return html, front_matter, top_level_type


def image_path_to_thumbnailUrl(image_path: Path):
Expand Down
1 change: 1 addition & 0 deletions blurry/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
@dataclass
class MarkdownFileData:
body: str
top_level_type: str
front_matter: dict[str, Any]
path: Path

Expand Down
4 changes: 4 additions & 0 deletions tests/test_sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,19 @@
directory_file_data = [
MarkdownFileData(
front_matter=dict(datePublished=date(2021, 1, 1), url="/blog/a-post-1/"),
top_level_type = "WebPage",
body="",
path=blog_path / "a-post-1",
),
MarkdownFileData(
front_matter=dict(datePublished=date(2021, 3, 1), url="/blog/b-post-3/"),
top_level_type = "WebPage",
body="",
path=blog_path / "b-post-3",
),
MarkdownFileData(
front_matter=dict(dateCreated=date(2021, 2, 1), url="/blog/c-post-2/"),
top_level_type = "WebPage",
body="",
path=blog_path / "c-post-2",
),
Expand All @@ -27,6 +30,7 @@
dateModified=date(2022, 1, 13),
url="/blog/c-post-4/",
),
top_level_type = "WebPage",
body="",
path=blog_path / "c-post-4",
),
Expand Down
4 changes: 4 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,25 @@ def test_sort_directory_file_data_by_date():
blog_path: [
MarkdownFileData(
front_matter=dict(datePublished=date(2021, 1, 1)),
top_level_type = "WebPage",
body="",
path=Path("a-post-1"),
),
MarkdownFileData(
front_matter=dict(datePublished=date(2021, 3, 1)),
top_level_type = "WebPage",
body="",
path=Path("b-post-3"),
),
MarkdownFileData(
front_matter=dict(dateCreated=date(2021, 2, 1)),
top_level_type = "WebPage",
body="",
path=Path("c-post-2"),
),
MarkdownFileData(
front_matter=dict(),
top_level_type = "WebPage",
body="",
path=Path("c-post-4"),
),
Expand Down

0 comments on commit 9911221

Please sign in to comment.