Skip to content

Commit

Permalink
new option to merge schema and necessary refactorings
Browse files Browse the repository at this point in the history
  • Loading branch information
Oliver Kandler committed Mar 4, 2024
1 parent 3d8e1f5 commit 9911221
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 19 deletions.
5 changes: 3 additions & 2 deletions blurry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ async def write_html_file(
extra_context["sibling_pages"] = sibling_pages
folder_in_build = convert_content_path_to_directory_in_build(file_data.path)

schema_type = file_data.front_matter.get("@type")
schema_type = file_data.top_level_type
if not schema_type:
raise ValueError(
f"Required @type value missing in file or TOML front matter invalid: "
Expand Down Expand Up @@ -207,9 +207,10 @@ async def build(release=True):
file_data_by_directory[directory] = []

# Convert Markdown file to HTML
body, front_matter = convert_markdown_file_to_html(filepath)
body, front_matter, top_level_type = convert_markdown_file_to_html(filepath)
file_data = MarkdownFileData(
body=body,
top_level_type=top_level_type,
front_matter=front_matter,
path=relative_filepath,
)
Expand Down
111 changes: 94 additions & 17 deletions blurry/markdown/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from typing import TypeAlias
from typing import TypeGuard

import json
from pyld import jsonld

import mistune
from mistune import BlockState
from mistune.plugins.abbr import abbr
Expand Down Expand Up @@ -149,9 +152,96 @@ def is_blurry_renderer(
+ [plugin.load() for plugin in discovered_markdown_plugins],
)

# JSON-LD context that maps bare terms into the schema.org vocabulary.
SCHEMA_ORG = {"@vocab": "https://schema.org/"}


def jsonld_document_loader(secure=False, fragments=None, **kwargs):
    """Create a pyld document loader that serves side-loaded schema fragments.

    Instead of fetching anything remote, the returned loader always answers
    with the JSON-LD fragments supplied here, serialized as a single JSON
    array. Any fragment without an ``@context`` gets the schema.org vocabulary.

    :param secure: accepted for pyld loader-factory compatibility; unused here.
    :param fragments: the schema fragments (dicts) to side-load.
    :param **kwargs: accepted for pyld loader-factory compatibility; unused.
    :return: the RemoteDocument loader function.
    """
    # Bug fix: the previous default `fragments=[]` was a shared mutable
    # default argument.
    if fragments is None:
        fragments = []

    def loader(ignored, options=None):
        """Return the side-loaded fragments as a RemoteDocument dict.

        :param ignored: the requested URL; ignored because the TOML
            fragments are side-loaded.
        :param options: accepted for pyld loader compatibility; unused.
        :return: the RemoteDocument.
        """
        serialized_fragments = []
        for fragment in fragments:
            # Default to the schema.org context without mutating the
            # caller's dict (the previous version wrote into it in place).
            if not fragment.get("@context"):
                fragment = {**fragment, "@context": SCHEMA_ORG}
            serialized_fragments.append(json.dumps(fragment))

        return {
            "contentType": "application/ld+json",
            "contextUrl": None,
            "documentUrl": None,
            "document": "[" + ",".join(serialized_fragments) + "]",
        }

    return loader

def add_inferred_schema(local_front_matter: dict, filepath: Path) -> dict:
    """Augment front matter with values inferred from the file's location.

    Adds the page URL derived from the file path and, when an ``image``
    property is present, resolves it to an absolute URL and derives a
    thumbnail URL from it.

    :param local_front_matter: the front matter to augment (mutated in place).
    :param filepath: path of the Markdown file inside the content directory.
    :return: the augmented front matter.
    """
    content_dir = get_content_directory()

    # https://schema.org/url
    local_front_matter["url"] = content_path_to_url(filepath.relative_to(content_dir))

    # https://schema.org/image
    # https://schema.org/thumbnailUrl
    if image := local_front_matter.get("image"):
        # Bug fix: this branch previously referenced the undefined name
        # `front_matter` instead of the `local_front_matter` parameter,
        # raising NameError for any page with an image property.
        image_copy = deepcopy(image)
        relative_image_path = get_relative_image_path_from_image_property(image_copy)
        image_path = resolve_relative_path_in_markdown(relative_image_path, filepath)
        local_front_matter["image"] = update_image_with_url(image_copy, image_path)
        local_front_matter["thumbnailUrl"] = image_path_to_thumbnailUrl(image_path)

    return local_front_matter

def resolve_front_matter(state: "BlockState", filepath: Path) -> tuple[dict[str, Any], str | None]:
    """Combine the global SCHEMA_DATA config with the file's front matter.

    When the FRONT_MATTER_RESOLUTION setting is "merge", the global and
    local schema fragments are merged by a JSON-LD compaction against the
    schema.org context; otherwise the local front matter simply overrides
    the global schema data.

    :param state: Mistune block state whose env carries the parsed front matter.
    :param filepath: path of the Markdown file being converted.
    :return: a (front_matter, top_level_type) tuple; top_level_type is the
        top-level schema.org @type from the file's front matter, or None
        when the file declares none.
    """
    if SETTINGS.get("FRONT_MATTER_RESOLUTION") == "merge":
        try:
            global_schema = dict(SETTINGS.get("SCHEMA_DATA", {}))
            if not global_schema.get("@context"):
                global_schema["@context"] = SCHEMA_ORG

            local_schema = state.env.get("front_matter", {})
            top_level_type = local_schema.get("@type")
            if not local_schema.get("@context"):
                local_schema["@context"] = SCHEMA_ORG
            local_schema = add_inferred_schema(local_schema, filepath)

            # Side-load both fragments; the document URL passed to
            # jsonld.compact is ignored by this loader.
            jsonld.set_document_loader(
                jsonld_document_loader(fragments=[global_schema, local_schema])
            )
            front_matter: dict[str, Any] = jsonld.compact("ignore", SCHEMA_ORG)
        except Exception as e:
            print("merging front matter failed:", e)
            # Bare `raise` preserves the original traceback.
            raise
    else:
        # Seed front_matter with schema_data from the config file and let
        # the file's own front matter override it.
        front_matter = dict(SETTINGS.get("SCHEMA_DATA", {}))
        front_matter.update(state.env.get("front_matter", {}))
        front_matter = add_inferred_schema(front_matter, filepath)
        # Bug fix: this was previously hard-coded to None, which made the
        # caller's required-@type check (write_html_file) raise ValueError
        # for every page in the default, non-merge mode.
        top_level_type = front_matter.get("@type")

    return front_matter, top_level_type

def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any], str]:
if not markdown.renderer:
raise Exception("Blurry markdown renderer not set on Mistune Markdown instance")

Expand All @@ -167,26 +257,13 @@ def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]:
html, state = markdown.parse(markdown_text, state=state)

if not is_str(html):
raise Exception(f"Expected html to be a string but got: {type(html)}")
raise Exception(f"Expected html to be a string but got: {top_level_type(html)}")

# Post-process HTML
html = remove_lazy_loading_from_first_image(html)

# Seed front_matter with schema_data from config file
front_matter: dict[str, Any] = dict(SETTINGS.get("SCHEMA_DATA", {}))
front_matter.update(state.env.get("front_matter", {}))

# Add inferred/computed/relative values
# https://schema.org/image
# https://schema.org/thumbnailUrl
front_matter.update({"url": content_path_to_url(filepath.relative_to(CONTENT_DIR))})
if image := front_matter.get("image"):
image_copy = deepcopy(image)
relative_image_path = get_relative_image_path_from_image_property(image_copy)
image_path = resolve_relative_path_in_markdown(relative_image_path, filepath)
front_matter["image"] = update_image_with_url(image_copy, image_path)
front_matter["thumbnailUrl"] = image_path_to_thumbnailUrl(image_path)
return html, front_matter
front_matter, top_level_type = resolve_front_matter(state, filepath)
return html, front_matter, top_level_type


def image_path_to_thumbnailUrl(image_path: Path):
Expand Down
1 change: 1 addition & 0 deletions blurry/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
@dataclass
class MarkdownFileData:
body: str
top_level_type: str
front_matter: dict[str, Any]
path: Path

Expand Down
4 changes: 4 additions & 0 deletions tests/test_sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,19 @@
directory_file_data = [
MarkdownFileData(
front_matter=dict(datePublished=date(2021, 1, 1), url="/blog/a-post-1/"),
top_level_type = "WebPage",
body="",
path=blog_path / "a-post-1",
),
MarkdownFileData(
front_matter=dict(datePublished=date(2021, 3, 1), url="/blog/b-post-3/"),
top_level_type = "WebPage",
body="",
path=blog_path / "b-post-3",
),
MarkdownFileData(
front_matter=dict(dateCreated=date(2021, 2, 1), url="/blog/c-post-2/"),
top_level_type = "WebPage",
body="",
path=blog_path / "c-post-2",
),
Expand All @@ -27,6 +30,7 @@
dateModified=date(2022, 1, 13),
url="/blog/c-post-4/",
),
top_level_type = "WebPage",
body="",
path=blog_path / "c-post-4",
),
Expand Down
4 changes: 4 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,25 @@ def test_sort_directory_file_data_by_date():
blog_path: [
MarkdownFileData(
front_matter=dict(datePublished=date(2021, 1, 1)),
top_level_type = "WebPage",
body="",
path=Path("a-post-1"),
),
MarkdownFileData(
front_matter=dict(datePublished=date(2021, 3, 1)),
top_level_type = "WebPage",
body="",
path=Path("b-post-3"),
),
MarkdownFileData(
front_matter=dict(dateCreated=date(2021, 2, 1)),
top_level_type = "WebPage",
body="",
path=Path("c-post-2"),
),
MarkdownFileData(
front_matter=dict(),
top_level_type = "WebPage",
body="",
path=Path("c-post-4"),
),
Expand Down

0 comments on commit 9911221

Please sign in to comment.