From 5f001118b53f5f7f0faa61afbd77566298aaeb75 Mon Sep 17 00:00:00 2001
From: Oliver Kandler <oliver@martpersonalfinance.info>
Date: Mon, 12 Feb 2024 22:01:20 +0100
Subject: [PATCH] new option to merge schema and necessary refactorings

---
 blurry/__init__.py          |   5 +-
 blurry/markdown/__init__.py | 125 +++++++++++++++++++++++++++++-------
 blurry/types.py             |   1 +
 3 files changed, 107 insertions(+), 24 deletions(-)

diff --git a/blurry/__init__.py b/blurry/__init__.py
index 3194bff..52c32ab 100644
--- a/blurry/__init__.py
+++ b/blurry/__init__.py
@@ -105,7 +105,7 @@ async def write_html_file(
         extra_context["sibling_pages"] = sibling_pages
     folder_in_build = convert_content_path_to_directory_in_build(file_data.path)
 
-    schema_type = file_data.front_matter.get("@type")
+    schema_type = file_data.top_level_type
     if not schema_type:
         raise ValueError(
             f"Required @type value missing in file or TOML front matter invalid: "
@@ -193,9 +193,10 @@ async def build(release=True):
             file_data_by_directory[directory] = []
 
         # Convert Markdown file to HTML
-        body, front_matter = convert_markdown_file_to_html(filepath)
+        body, front_matter, top_level_type = convert_markdown_file_to_html(filepath)
         file_data = MarkdownFileData(
             body=body,
+            top_level_type=top_level_type,
             front_matter=front_matter,
             path=relative_filepath,
         )
diff --git a/blurry/markdown/__init__.py b/blurry/markdown/__init__.py
index 76d0583..d6fc44c 100644
--- a/blurry/markdown/__init__.py
+++ b/blurry/markdown/__init__.py
@@ -2,6 +2,9 @@
 from typing import Any
 from typing import TypeGuard
 
+import json
+from pyld import jsonld
+
 import mistune
 from mistune import BlockState
 from mistune.plugins.abbr import abbr
@@ -144,14 +147,109 @@ def is_blurry_renderer(
     + [plugin.load() for plugin in discovered_markdown_plugins],
 )
 
+# Default JSON-LD context mapping bare terms to the schema.org vocabulary.
+SCHEMA_ORG = {"@vocab": "https://schema.org/"}
+
+
+def jsonld_document_loader(secure=False, fragments=None, **kwargs):
+    """
+    Create a pyld document loader that serves in-memory schema fragments.
+
+    No network request is made: the returned loader ignores the requested
+    URL and always answers with the given fragments as one JSON array.
+
+    :param secure: unused; kept for signature compatibility.
+    :param fragments: the fragments of schema loaded as dicts (not modified).
+    :param **kwargs: unused; kept for signature compatibility.
+
+    :return: the RemoteDocument loader function.
+    """
+    fragments = [] if fragments is None else fragments
+
+    def loader(ignored, options=None):
+        """
+        Retrieves JSON-LD from the dicts provided as fragments.
+
+        :param ignored: ignored, because the TOML fragments are side-loaded.
+        :param options: ignored; accepted to match the loader call signature.
+
+        :return: the RemoteDocument.
+        """
+        fragments_str = []
+        for fragment in fragments:
+            # Work on a copy so the caller's dict is left untouched.
+            fragment = dict(fragment)
+            if not fragment.get('@context'):
+                fragment['@context'] = SCHEMA_ORG
+            fragments_str.append(json.dumps(fragment))
+
+        return {
+            'contentType': 'application/ld+json',
+            'contextUrl': None,
+            'documentUrl': None,
+            'document': '[' + ','.join(fragments_str) + ']',
+        }
+
+    return loader
 
-def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]:
+def add_inferred_schema(local_front_matter: dict, filepath: Path) -> dict:
+    """Add computed url, image and thumbnailUrl values in place; return the dict."""
     CONTENT_DIR = get_content_directory()
     THUMBNAIL_WIDTH = SETTINGS.get("THUMBNAIL_WIDTH")
+    BUILD_DIR = get_build_directory()
+
+    local_front_matter["url"] = content_path_to_url(filepath.relative_to(CONTENT_DIR))
+    image = local_front_matter.get("image")
+    if not image:
+        return local_front_matter
+
+    source_image = filepath.parent / Path(image)
+    image_url = content_path_to_url(source_image)
+    local_front_matter["image"] = image_url
+    # Prefer the built thumbnail; fall back to the full-size image URL
+    sized_image = add_image_width_to_path(source_image, THUMBNAIL_WIDTH)
+    built_thumbnail = BUILD_DIR / sized_image.relative_to(CONTENT_DIR)
+    thumbnail_url = build_path_to_url(built_thumbnail) if built_thumbnail.exists() else image_url
+    local_front_matter["thumbnailUrl"] = thumbnail_url
+    return local_front_matter
+
+def resolve_front_matter(state: BlockState, filepath: Path) -> tuple[dict[str, Any], str | None]:
+    """Build a page's front matter from SCHEMA_DATA and the parsed Markdown state.
+
+    With FRONT_MATTER_RESOLUTION set to "merge" both are compacted together as
+    JSON-LD; otherwise page values override SCHEMA_DATA. Also returns "@type".
+    """
+    if SETTINGS.get("FRONT_MATTER_RESOLUTION") == "merge":
+        try:
+            global_schema = dict(SETTINGS.get("SCHEMA_DATA", {}))
+            if not global_schema.get('@context'):
+                global_schema['@context'] = SCHEMA_ORG
+
+            local_schema = state.env.get("front_matter", {})
+            # Captured from the page's own front matter, before compaction
+            top_level_type = local_schema.get("@type")
+            if not local_schema.get('@context'):
+                local_schema['@context'] = SCHEMA_ORG
+            local_schema = add_inferred_schema(local_schema, filepath)
+            jsonld.set_document_loader(jsonld_document_loader(fragments=[global_schema, local_schema]))
+            front_matter: dict[str, Any] = jsonld.compact("ignore", SCHEMA_ORG)
+        except Exception as e:
+            print("merging front matter failed:", e)
+            raise
+    else:
+        # Seed front_matter with schema_data from config file
+        front_matter = dict(SETTINGS.get("SCHEMA_DATA", {}))
+        front_matter.update(state.env.get("front_matter", {}))
+        front_matter = add_inferred_schema(front_matter, filepath)
+        # Flat dict: "@type" (page value, else SCHEMA_DATA's) is read directly
+        top_level_type = front_matter.get("@type")
+    return front_matter, top_level_type
+
+
+def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any], str]:
     if not markdown.renderer:
         raise Exception("Blurry markdown renderer not set on Mistune Markdown instance")
 
-    BUILD_DIR = get_build_directory()
     # Add filepath to the renderer to resolve relative paths
     if not is_blurry_renderer(markdown.renderer):
         raise Exception(
@@ -164,27 +262,10 @@ def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]:
     html, state = markdown.parse(markdown_text, state=state)
 
     if not is_str(html):
         raise Exception(f"Expected html to be a string but got: {type(html)}")
 
     # Post-process HTML
     html = remove_lazy_loading_from_first_image(html)
 
-    # Seed front_matter with schema_data from config file
-    front_matter: dict[str, Any] = dict(SETTINGS.get("SCHEMA_DATA", {}))
-    front_matter.update(state.env.get("front_matter", {}))
-
-    # Add inferred/computed/relative values
-    front_matter.update({"url": content_path_to_url(filepath.relative_to(CONTENT_DIR))})
-    if image := front_matter.get("image"):
-        image_path = filepath.parent / Path(image)
-        front_matter["image"] = content_path_to_url(image_path)
-        # Add thumbnail URL, using the full image if the thumbnail doesn't exist
-        thumbnail_image_path = add_image_width_to_path(image_path, THUMBNAIL_WIDTH)
-        thumbnail_image_build_path = BUILD_DIR / thumbnail_image_path.relative_to(
-            CONTENT_DIR
-        )
-        if thumbnail_image_build_path.exists():
-            front_matter["thumbnailUrl"] = build_path_to_url(thumbnail_image_build_path)
-        else:
-            front_matter["thumbnailUrl"] = front_matter["image"]
-    return html, front_matter
+    front_matter, top_level_type = resolve_front_matter(state, filepath)
+    return html, front_matter, top_level_type
diff --git a/blurry/types.py b/blurry/types.py
index 93338c1..1fe8c8b 100644
--- a/blurry/types.py
+++ b/blurry/types.py
@@ -8,6 +8,7 @@
 @dataclass
 class MarkdownFileData:
     body: str
+    top_level_type: str
     front_matter: dict[str, Any]
     path: Path