From 533b0f984854f45e399518a92881d4d5018a80fd Mon Sep 17 00:00:00 2001 From: Oliver Kandler Date: Mon, 12 Feb 2024 17:31:43 +0100 Subject: [PATCH 1/3] bump to private version 0.6.2.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 60e2f2f..151a3be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "blurry-cli" -version = "0.6.2" +version = "0.6.2.1" description = "A Mistune-based static site generator for Python" authors = ["John Franey "] license = "MIT" From 3d8e1f5e4ffea4ff909cb4c1e8c056ce1c6ae491 Mon Sep 17 00:00:00 2001 From: Oliver Kandler Date: Mon, 12 Feb 2024 21:57:33 +0100 Subject: [PATCH 2/3] new setting for global/local schema merge --- blurry/settings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/blurry/settings.py b/blurry/settings.py index fa812fb..843b9e4 100644 --- a/blurry/settings.py +++ b/blurry/settings.py @@ -26,6 +26,7 @@ class Settings(TypedDict): USE_HTTP: bool RUNSERVER: bool FRONTMATTER_NON_SCHEMA_VARIABLE_PREFIX: str + FRONT_MATTER_RESOLUTION: str SETTINGS: Settings = { @@ -46,6 +47,7 @@ class Settings(TypedDict): "RUNSERVER": False, "FRONTMATTER_NON_SCHEMA_VARIABLE_PREFIX": "~", "TEMPLATE_SCHEMA_TYPES": {}, + "FRONT_MATTER_RESOLUTION": "overwrite", # or "merge" } From 9911221a2cb2aec8aa90799724933903676cafc8 Mon Sep 17 00:00:00 2001 From: Oliver Kandler Date: Mon, 12 Feb 2024 22:01:20 +0100 Subject: [PATCH 3/3] new option to merge schema and necessary refactorings --- blurry/__init__.py | 5 +- blurry/markdown/__init__.py | 111 ++++++++++++++++++++++++++++++------ blurry/types.py | 1 + tests/test_sitemap.py | 4 ++ tests/test_utils.py | 4 ++ 5 files changed, 106 insertions(+), 19 deletions(-) diff --git a/blurry/__init__.py b/blurry/__init__.py index 2bebb0f..aba261e 100644 --- a/blurry/__init__.py +++ b/blurry/__init__.py @@ -118,7 +118,7 @@ async def write_html_file( extra_context["sibling_pages"] = sibling_pages 
folder_in_build = convert_content_path_to_directory_in_build(file_data.path) - schema_type = file_data.front_matter.get("@type") + schema_type = file_data.top_level_type if not schema_type: raise ValueError( f"Required @type value missing in file or TOML front matter invalid: " @@ -207,9 +207,10 @@ async def build(release=True): file_data_by_directory[directory] = [] # Convert Markdown file to HTML - body, front_matter = convert_markdown_file_to_html(filepath) + body, front_matter, top_level_type = convert_markdown_file_to_html(filepath) file_data = MarkdownFileData( body=body, + top_level_type=top_level_type, front_matter=front_matter, path=relative_filepath, ) diff --git a/blurry/markdown/__init__.py b/blurry/markdown/__init__.py index 0e75656..d06426d 100644 --- a/blurry/markdown/__init__.py +++ b/blurry/markdown/__init__.py @@ -4,6 +4,9 @@ from typing import TypeAlias from typing import TypeGuard +import json +from pyld import jsonld + import mistune from mistune import BlockState from mistune.plugins.abbr import abbr @@ -149,9 +152,96 @@ def is_blurry_renderer( + [plugin.load() for plugin in discovered_markdown_plugins], ) +SCHEMA_ORG = json.loads('{ "@vocab": "https://schema.org/" }') +def jsonld_document_loader(secure=False, fragments=(), **kwargs): + """ + Create a pyld document loader that serves in-memory schema fragments. + + The returned loader performs no request: it ignores the URL it is given + and returns the fragments serialized as one JSON array. + + :param secure: unused by this loader. + :param fragments: the fragments of schema loaded as dicts + :param **kwargs: unused by this loader. + + :return: the RemoteDocument loader function. + """ + from pyld.jsonld import JsonLdError + + def loader(ignored, options=None): + """ + Retrieves JSON-LD from the dicts provided as fragments. + + :param ignored: this positional parameter is ignored, because the TOML fragments are side loaded + + :return: the RemoteDocument. 
+ """ + fragments_str = [] + for fragment in fragments: + if not fragment.get('@context'): + fragment['@context'] = SCHEMA_ORG + fragments_str.append(json.dumps(fragment)) + # print("==========================") + # print(json.dumps(fragment, indent=2)) + + result = '[' + ','.join(fragments_str) + ']' + # print(">>>>>>>>> ",result) + + doc = { + 'contentType': 'application/ld+json', + 'contextUrl': None, + 'documentUrl': None, + 'document': result + } + return doc -def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]: + return loader + +def add_inferred_schema(local_front_matter: dict, filepath: Path) -> dict: CONTENT_DIR = get_content_directory() + + # Add the URL computed from the file's path relative to the content directory + local_front_matter.update({"url": content_path_to_url(filepath.relative_to(CONTENT_DIR))}) + + # Resolve a relative image path and derive the thumbnail URL from it + # https://schema.org/image + # https://schema.org/thumbnailUrl + if image := local_front_matter.get("image"): + image_copy = deepcopy(image) + relative_image_path = get_relative_image_path_from_image_property(image_copy) + image_path = resolve_relative_path_in_markdown(relative_image_path, filepath) + local_front_matter["image"] = update_image_with_url(image_copy, image_path) + local_front_matter["thumbnailUrl"] = image_path_to_thumbnailUrl(image_path) + + return local_front_matter + +def resolve_front_matter(state: BlockState, filepath: Path) -> tuple[dict[str, Any], str]: + if SETTINGS.get("FRONT_MATTER_RESOLUTION") == "merge": + try: + global_schema = dict(SETTINGS.get("SCHEMA_DATA", {})) + if not global_schema.get('@context'): + global_schema['@context'] = SCHEMA_ORG + + local_schema = state.env.get("front_matter", {}) + top_level_type = local_schema.get("@type", None) + if not local_schema.get('@context'): + local_schema['@context'] = SCHEMA_ORG + local_schema = add_inferred_schema(local_schema, filepath) + jsonld.set_document_loader(jsonld_document_loader(fragments=[global_schema, local_schema])) + front_matter: dict[str, Any] = 
jsonld.compact("ignore", SCHEMA_ORG) + except Exception as e: + print("merging front matter failed:", e) + raise + else: + # Seed front_matter with schema_data from config file + front_matter: dict[str, Any] = dict(SETTINGS.get("SCHEMA_DATA", {})) + front_matter.update(state.env.get("front_matter", {})) + front_matter = add_inferred_schema(front_matter, filepath) + + top_level_type = front_matter.get("@type") + return front_matter, top_level_type + +def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any], str]: if not markdown.renderer: raise Exception("Blurry markdown renderer not set on Mistune Markdown instance") @@ -167,26 +257,13 @@ def convert_markdown_file_to_html(filepath: Path) -> tuple[str, dict[str, Any]]: html, state = markdown.parse(markdown_text, state=state) if not is_str(html): raise Exception(f"Expected html to be a string but got: {type(html)}") # Post-process HTML html = remove_lazy_loading_from_first_image(html) - # Seed front_matter with schema_data from config file - front_matter: dict[str, Any] = dict(SETTINGS.get("SCHEMA_DATA", {})) - front_matter.update(state.env.get("front_matter", {})) - - # Add inferred/computed/relative values - # https://schema.org/image - # https://schema.org/thumbnailUrl - front_matter.update({"url": content_path_to_url(filepath.relative_to(CONTENT_DIR))}) - if image := front_matter.get("image"): - image_copy = deepcopy(image) - relative_image_path = get_relative_image_path_from_image_property(image_copy) - image_path = resolve_relative_path_in_markdown(relative_image_path, filepath) - front_matter["image"] = update_image_with_url(image_copy, image_path) - front_matter["thumbnailUrl"] = image_path_to_thumbnailUrl(image_path) - return html, front_matter + front_matter, top_level_type = resolve_front_matter(state, filepath) + return html, front_matter, top_level_type def image_path_to_thumbnailUrl(image_path: Path): diff --git 
a/blurry/types.py b/blurry/types.py index 93338c1..1fe8c8b 100644 --- a/blurry/types.py +++ b/blurry/types.py @@ -8,6 +8,7 @@ @dataclass class MarkdownFileData: body: str + top_level_type: str front_matter: dict[str, Any] path: Path diff --git a/tests/test_sitemap.py b/tests/test_sitemap.py index f51cb9d..7576905 100644 --- a/tests/test_sitemap.py +++ b/tests/test_sitemap.py @@ -8,16 +8,19 @@ directory_file_data = [ MarkdownFileData( front_matter=dict(datePublished=date(2021, 1, 1), url="/blog/a-post-1/"), + top_level_type="WebPage", body="", path=blog_path / "a-post-1", ), MarkdownFileData( front_matter=dict(datePublished=date(2021, 3, 1), url="/blog/b-post-3/"), + top_level_type="WebPage", body="", path=blog_path / "b-post-3", ), MarkdownFileData( front_matter=dict(dateCreated=date(2021, 2, 1), url="/blog/c-post-2/"), + top_level_type="WebPage", body="", path=blog_path / "c-post-2", ), @@ -27,6 +30,7 @@ dateModified=date(2022, 1, 13), url="/blog/c-post-4/", ), + top_level_type="WebPage", body="", path=blog_path / "c-post-4", ), diff --git a/tests/test_utils.py b/tests/test_utils.py index 2737351..02c2d02 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -67,21 +67,25 @@ def test_sort_directory_file_data_by_date(): blog_path: [ MarkdownFileData( front_matter=dict(datePublished=date(2021, 1, 1)), + top_level_type="WebPage", body="", path=Path("a-post-1"), ), MarkdownFileData( front_matter=dict(datePublished=date(2021, 3, 1)), + top_level_type="WebPage", body="", path=Path("b-post-3"), ), MarkdownFileData( front_matter=dict(dateCreated=date(2021, 2, 1)), + top_level_type="WebPage", body="", path=Path("c-post-2"), ), MarkdownFileData( front_matter=dict(), + top_level_type="WebPage", body="", path=Path("c-post-4"), ),