From 5f001118b53f5f7f0faa61afbd77566298aaeb75 Mon Sep 17 00:00:00 2001
From: Oliver Kandler
Date: Mon, 12 Feb 2024 22:01:20 +0100
Subject: [PATCH] new option to merge schema and necessary refactorings

---
Review notes (this section is ignored by git am):
 * NOTE(review): line breaks and indentation were reconstructed from a
   whitespace-collapsed copy of this patch; confirm the context lines
   against the target tree before applying.
 * The "index" blob hashes below predate the review fixes.
 * convert_markdown_file_to_html: the error message uses type(html) again;
   it had been changed to top_level_type(html), which is an unbound local
   at that point.
 * resolve_front_matter: the non-merge branch returns the front matter's
   "@type" instead of None, because write_html_file() raises ValueError
   when the top-level type is missing.
 * jsonld_document_loader: no mutable default arguments, unused import
   removed, docstring describes what the loader actually does.
 * Commented-out debug prints removed; hunk sizes and diffstat updated.

 blurry/__init__.py          |   5 +-
 blurry/markdown/__init__.py | 135 +++++++++++++++++++++++++++++++------
 blurry/types.py             |   1 +
 3 files changed, 118 insertions(+), 23 deletions(-)

diff --git a/blurry/__init__.py b/blurry/__init__.py
index 3194bff..52c32ab 100644
--- a/blurry/__init__.py
+++ b/blurry/__init__.py
@@ -105,7 +105,7 @@ async def write_html_file(
         extra_context["sibling_pages"] = sibling_pages
     folder_in_build = convert_content_path_to_directory_in_build(file_data.path)
 
-    schema_type = file_data.front_matter.get("@type")
+    schema_type = file_data.top_level_type
     if not schema_type:
         raise ValueError(
             f"Required @type value missing in file or TOML front matter invalid: "
@@ -193,9 +193,10 @@ async def build(release=True):
             file_data_by_directory[directory] = []
 
         # Convert Markdown file to HTML
-        body, front_matter = convert_markdown_file_to_html(filepath)
+        body, front_matter, top_level_type = convert_markdown_file_to_html(filepath)
         file_data = MarkdownFileData(
             body=body,
+            top_level_type=top_level_type,
             front_matter=front_matter,
             path=relative_filepath,
         )
diff --git a/blurry/markdown/__init__.py b/blurry/markdown/__init__.py
index 76d0583..d6fc44c 100644
--- a/blurry/markdown/__init__.py
+++ b/blurry/markdown/__init__.py
@@ -2,6 +2,9 @@
 from typing import Any
 from typing import TypeGuard
 
+import json
+from pyld import jsonld
+
 import mistune
 from mistune import BlockState
 from mistune.plugins.abbr import abbr
@@ -144,14 +147,121 @@ def is_blurry_renderer(
     + [plugin.load() for plugin in discovered_markdown_plugins],
 )
 
+# Default JSON-LD context for schema fragments that do not declare their own.
+SCHEMA_ORG = {"@vocab": "https://schema.org/"}
+
+
+def jsonld_document_loader(secure=False, fragments=None, **kwargs):
+    """
+    Create a pyld document loader that serves in-memory schema fragments.
+
+    The returned loader performs no request: whatever it is asked to load,
+    it answers with a JSON array built from ``fragments``.
+
+    :param secure: accepted but not used.
+    :param fragments: the fragments of schema loaded as dicts
+    :param **kwargs: accepted but not used.
+
+    :return: the RemoteDocument loader function.
+    """
+    # A mutable default argument would be shared between calls.
+    fragments = [] if fragments is None else fragments
+
+    def loader(ignored, options=None):
+        """
+        Retrieves JSON-LD from the dicts provided as fragments.
+
+        :param ignored: this positional parameter is ignored, because the
+            TOML fragments are side loaded
+        :param options: ignored as well
+
+        :return: the RemoteDocument.
+        """
+        fragments_str = []
+        for fragment in fragments:
+            if not fragment.get('@context'):
+                fragment['@context'] = SCHEMA_ORG
+            fragments_str.append(json.dumps(fragment))
+
+        result = '[' + ','.join(fragments_str) + ']'
+
+        doc = {
+            'contentType': 'application/ld+json',
+            'contextUrl': None,
+            'documentUrl': None,
+            'document': result
+        }
+        return doc
+
+    return loader
+
 
-def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]:
+def add_inferred_schema(local_front_matter: dict, filepath: Path) -> dict:
+    """Add the page URL and image/thumbnail URLs to ``local_front_matter``.
+
+    The dict is updated in place and also returned.
+    """
     CONTENT_DIR = get_content_directory()
     THUMBNAIL_WIDTH = SETTINGS.get("THUMBNAIL_WIDTH")
+    BUILD_DIR = get_build_directory()
+
+    # Add inferred/computed/relative values
+    local_front_matter.update({"url": content_path_to_url(filepath.relative_to(CONTENT_DIR))})
+    if image := local_front_matter.get("image"):
+        image_path = filepath.parent / Path(image)
+        local_front_matter["image"] = content_path_to_url(image_path)
+        # Add thumbnail URL, using the full image if the thumbnail doesn't exist
+        thumbnail_image_path = add_image_width_to_path(image_path, THUMBNAIL_WIDTH)
+        thumbnail_image_build_path = BUILD_DIR / thumbnail_image_path.relative_to(
+            CONTENT_DIR
+        )
+        if thumbnail_image_build_path.exists():
+            local_front_matter["thumbnailUrl"] = build_path_to_url(thumbnail_image_build_path)
+        else:
+            local_front_matter["thumbnailUrl"] = local_front_matter["image"]
+    return local_front_matter
+
+
+def resolve_front_matter(state: BlockState, filepath: Path) -> tuple[dict[str, Any], str | None]:
+    """Build a page's front matter and report its top-level ``@type``.
+
+    When the FRONT_MATTER_RESOLUTION setting is "merge", the SCHEMA_DATA
+    setting and the file's front matter are passed through JSON-LD
+    compaction; otherwise the file's front matter overrides SCHEMA_DATA.
+    """
+    if SETTINGS.get("FRONT_MATTER_RESOLUTION") == "merge":
+        try:
+            global_schema = dict(SETTINGS.get("SCHEMA_DATA", {}))
+            if not global_schema.get('@context'):
+                global_schema['@context'] = SCHEMA_ORG
+
+            local_schema = state.env.get("front_matter", {})
+            top_level_type = local_schema.get("@type", None)
+            if not local_schema.get('@context'):
+                local_schema['@context'] = SCHEMA_ORG
+            local_schema = add_inferred_schema(local_schema, filepath)
+            jsonld.set_document_loader(
+                jsonld_document_loader(fragments=[global_schema, local_schema])
+            )
+            front_matter: dict[str, Any] = jsonld.compact("ignore", SCHEMA_ORG)
+        except Exception as e:
+            print("merging front matter failed:", e)
+            raise
+    else:
+        # Seed front_matter with schema_data from config file
+        front_matter = dict(SETTINGS.get("SCHEMA_DATA", {}))
+        front_matter.update(state.env.get("front_matter", {}))
+        front_matter = add_inferred_schema(front_matter, filepath)
+        # write_html_file() raises when the type is missing, so report the
+        # resolved "@type" here rather than None.
+        top_level_type = front_matter.get("@type")
+    return front_matter, top_level_type
+
+
+def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any], str | None]:
     if not markdown.renderer:
         raise Exception("Blurry markdown renderer not set on Mistune Markdown instance")
 
-    BUILD_DIR = get_build_directory()
     # Add filepath to the renderer to resolve relative paths
     if not is_blurry_renderer(markdown.renderer):
         raise Exception(
@@ -164,27 +274,10 @@ def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]:
     html, state = markdown.parse(markdown_text, state=state)
 
     if not is_str(html):
         raise Exception(f"Expected html to be a string but got: {type(html)}")
 
     # Post-process HTML
     html = remove_lazy_loading_from_first_image(html)
 
-    # Seed front_matter with schema_data from config file
-    front_matter: dict[str, Any] = dict(SETTINGS.get("SCHEMA_DATA", {}))
-    front_matter.update(state.env.get("front_matter", {}))
-
-    # Add inferred/computed/relative values
-    front_matter.update({"url": content_path_to_url(filepath.relative_to(CONTENT_DIR))})
-    if image := front_matter.get("image"):
-        image_path = filepath.parent / Path(image)
-        front_matter["image"] = content_path_to_url(image_path)
-        # Add thumbnail URL, using the full image if the thumbnail doesn't exist
-        thumbnail_image_path = add_image_width_to_path(image_path, THUMBNAIL_WIDTH)
-        thumbnail_image_build_path = BUILD_DIR / thumbnail_image_path.relative_to(
-            CONTENT_DIR
-        )
-        if thumbnail_image_build_path.exists():
-            front_matter["thumbnailUrl"] = build_path_to_url(thumbnail_image_build_path)
-        else:
-            front_matter["thumbnailUrl"] = front_matter["image"]
-    return html, front_matter
+    front_matter, top_level_type = resolve_front_matter(state, filepath)
+    return html, front_matter, top_level_type
diff --git a/blurry/types.py b/blurry/types.py
index 93338c1..1fe8c8b 100644
--- a/blurry/types.py
+++ b/blurry/types.py
@@ -8,5 +8,6 @@
 @dataclass
 class MarkdownFileData:
     body: str
+    top_level_type: str | None
     front_matter: dict[str, Any]
     path: Path