Skip to content

Commit

Permalink
new option to merge schema and necessary refactorings
Browse files Browse the repository at this point in the history
  • Loading branch information
Oliver Kandler committed Feb 12, 2024
1 parent 5a10100 commit 5f00111
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 24 deletions.
5 changes: 3 additions & 2 deletions blurry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ async def write_html_file(
extra_context["sibling_pages"] = sibling_pages
folder_in_build = convert_content_path_to_directory_in_build(file_data.path)

schema_type = file_data.front_matter.get("@type")
schema_type = file_data.top_level_type
if not schema_type:
raise ValueError(
f"Required @type value missing in file or TOML front matter invalid: "
Expand Down Expand Up @@ -193,9 +193,10 @@ async def build(release=True):
file_data_by_directory[directory] = []

# Convert Markdown file to HTML
body, front_matter = convert_markdown_file_to_html(filepath)
body, front_matter, top_level_type = convert_markdown_file_to_html(filepath)
file_data = MarkdownFileData(
body=body,
top_level_type=top_level_type,
front_matter=front_matter,
path=relative_filepath,
)
Expand Down
125 changes: 103 additions & 22 deletions blurry/markdown/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from typing import Any
from typing import TypeGuard

import json
from pyld import jsonld

import mistune
from mistune import BlockState
from mistune.plugins.abbr import abbr
Expand Down Expand Up @@ -144,14 +147,109 @@ def is_blurry_renderer(
+ [plugin.load() for plugin in discovered_markdown_plugins],
)

# Default JSON-LD context: resolve unprefixed terms against the schema.org vocabulary.
SCHEMA_ORG = {"@vocab": "https://schema.org/"}
def jsonld_document_loader(secure=False, fragments=None, **kwargs):
    """
    Create a JSON-LD document loader that serves in-memory schema fragments.

    The returned loader never performs a network request: whatever URL it is
    asked for, it answers with a JSON array containing every fragment.

    :param secure: accepted for signature compatibility; not used.
    :param fragments: the schema fragments, as dicts, to serve
        (default: no fragments).
    :param **kwargs: accepted for signature compatibility; not used.
    :return: the loader function.
    """
    # A fresh list per call avoids sharing one mutable default between loaders.
    fragment_list = [] if fragments is None else fragments

    def loader(ignored, options=None):
        """
        Build the remote-document dict from the side-loaded fragments.

        :param ignored: the requested URL; ignored because the fragments are
            side loaded.
        :param options: loader options; not used.
        :return: a dict with 'contentType', 'contextUrl', 'documentUrl' and
            'document' (the fragments serialized as a JSON array string).
        """
        serialized = []
        for fragment in fragment_list:
            if not fragment.get('@context'):
                # Add the default context on a shallow copy so the caller's
                # dict is left untouched.
                fragment = {**fragment, '@context': SCHEMA_ORG}
            serialized.append(json.dumps(fragment))

        return {
            'contentType': 'application/ld+json',
            'contextUrl': None,
            'documentUrl': None,
            'document': '[' + ','.join(serialized) + ']',
        }

    return loader

def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]:
def add_inferred_schema(local_front_matter: dict, filepath: Path) -> dict:
    """
    Fill in the computed front matter values for one content file.

    Sets "url" from the file's path relative to the content directory. When
    an "image" entry is present, it is replaced by the image's URL and
    "thumbnailUrl" is set as well. The dict is updated in place and returned.

    :param local_front_matter: the front matter dict to update.
    :param filepath: path of the content file the front matter belongs to.
    :return: the same dict that was passed in.
    """
    content_root = get_content_directory()
    thumb_width = SETTINGS.get("THUMBNAIL_WIDTH")
    build_root = get_build_directory()

    local_front_matter["url"] = content_path_to_url(
        filepath.relative_to(content_root)
    )

    image = local_front_matter.get("image")
    if not image:
        return local_front_matter

    # The image path is relative to the content file's own directory
    image_path = filepath.parent / Path(image)
    image_url = content_path_to_url(image_path)
    local_front_matter["image"] = image_url

    # Prefer the generated thumbnail; use the full image if none was built
    thumb_content_path = add_image_width_to_path(image_path, thumb_width)
    thumb_build_path = build_root / thumb_content_path.relative_to(content_root)
    local_front_matter["thumbnailUrl"] = (
        build_path_to_url(thumb_build_path)
        if thumb_build_path.exists()
        else image_url
    )
    return local_front_matter

def resolve_front_matter(
    state: "BlockState", filepath: Path
) -> tuple[dict[str, Any], str | None]:
    """
    Combine the SCHEMA_DATA setting with a file's own front matter.

    With the FRONT_MATTER_RESOLUTION setting equal to "merge", both schemas
    are handed to pyld as JSON-LD fragments and compacted against the
    schema.org context. Otherwise the file's front matter is laid over a copy
    of SCHEMA_DATA with a plain dict update.

    :param state: parser state whose ``env["front_matter"]`` holds the file's
        front matter.
    :param filepath: path of the content file being converted.
    :return: ``(front_matter, top_level_type)``; ``top_level_type`` is the
        "@type" value, or None when no "@type" is set.
    :raises Exception: whatever the merge raises, re-raised after logging.
    """
    front_matter: dict[str, Any]
    top_level_type: str | None

    if SETTINGS.get("FRONT_MATTER_RESOLUTION") == "merge":
        try:
            global_schema = dict(SETTINGS.get("SCHEMA_DATA", {}))
            if not global_schema.get('@context'):
                global_schema['@context'] = SCHEMA_ORG

            # Copy so the parser state's own front matter is not modified
            local_schema = dict(state.env.get("front_matter", {}))
            # Read the type from the file's own front matter before compaction
            top_level_type = local_schema.get("@type")
            if not local_schema.get('@context'):
                local_schema['@context'] = SCHEMA_ORG
            local_schema = add_inferred_schema(local_schema, filepath)

            # The loader serves the fragments whatever URL is requested, so
            # the "ignore" input below is only a placeholder.
            jsonld.set_document_loader(
                jsonld_document_loader(fragments=[global_schema, local_schema])
            )
            front_matter = jsonld.compact("ignore", SCHEMA_ORG)
        except Exception as e:
            print("merging front matter failed:", e)
            raise
    else:
        # Seed front_matter with schema_data from config file
        front_matter = dict(SETTINGS.get("SCHEMA_DATA", {}))
        front_matter.update(state.env.get("front_matter", {}))
        front_matter = add_inferred_schema(front_matter, filepath)
        # Callers require a type in this mode too; take it from the combined
        # front matter rather than returning None.
        top_level_type = front_matter.get("@type")

    return front_matter, top_level_type


def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any], str]:
if not markdown.renderer:
raise Exception("Blurry markdown renderer not set on Mistune Markdown instance")

BUILD_DIR = get_build_directory()
# Add filepath to the renderer to resolve relative paths
if not is_blurry_renderer(markdown.renderer):
raise Exception(
Expand All @@ -164,27 +262,10 @@ def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]:
html, state = markdown.parse(markdown_text, state=state)

if not is_str(html):
raise Exception(f"Expected html to be a string but got: {type(html)}")
raise Exception(f"Expected html to be a string but got: {top_level_type(html)}")

# Post-process HTML
html = remove_lazy_loading_from_first_image(html)

# Seed front_matter with schema_data from config file
front_matter: dict[str, Any] = dict(SETTINGS.get("SCHEMA_DATA", {}))
front_matter.update(state.env.get("front_matter", {}))

# Add inferred/computed/relative values
front_matter.update({"url": content_path_to_url(filepath.relative_to(CONTENT_DIR))})
if image := front_matter.get("image"):
image_path = filepath.parent / Path(image)
front_matter["image"] = content_path_to_url(image_path)
# Add thumbnail URL, using the full image if the thumbnail doesn't exist
thumbnail_image_path = add_image_width_to_path(image_path, THUMBNAIL_WIDTH)
thumbnail_image_build_path = BUILD_DIR / thumbnail_image_path.relative_to(
CONTENT_DIR
)
if thumbnail_image_build_path.exists():
front_matter["thumbnailUrl"] = build_path_to_url(thumbnail_image_build_path)
else:
front_matter["thumbnailUrl"] = front_matter["image"]
return html, front_matter
front_matter, top_level_type = resolve_front_matter(state, filepath)
return html, front_matter, top_level_type
1 change: 1 addition & 0 deletions blurry/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
@dataclass
class MarkdownFileData:
body: str
top_level_type: str
front_matter: dict[str, Any]
path: Path

Expand Down

0 comments on commit 5f00111

Please sign in to comment.