Use pathlibfs for scheme-agnostic source access

tech-writing · amotl · Oct 9, 2023 · Oct 9, 2023 · May 28, 2024 · 2f79fc9aaf0c6166948cf783ffd84968e5615d76
commit 2f79fc9aaf0c6166948cf783ffd84968e5615d76
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,6 +2,7 @@
 
 
 ## Unreleased
+- Use `pathlibfs` for scheme-agnostic source access
 
 ## 2023-10-07 0.1.0
 - Add example data files in different formats

diff --git a/hubspot_tech_writing/core.py b/hubspot_tech_writing/core.py
@@ -14,7 +14,7 @@
 from hubspot_tech_writing.hubspot_api import HubSpotAdapter, HubSpotBlogPost, HubSpotFile
 from hubspot_tech_writing.util.common import ContentTypeResolver
 from hubspot_tech_writing.util.html import HTMLImageTranslator
-from hubspot_tech_writing.util.io import to_io
+from hubspot_tech_writing.util.io import path_from_url, to_io
 
 logger = logging.getLogger(__name__)
 
@@ -75,9 +75,14 @@ def upload(
     folder_id: t.Optional[str] = None,
     folder_path: t.Optional[str] = None,
 ):
-    source_path = Path(source)
+    source_path: Path
+    if isinstance(source, str):
+        source_path = path_from_url(source)
+    else:
+        source_path = source
+    logger.info(f"Source: {source_path}")
 
-    ctr = ContentTypeResolver(name=source_path)
+    ctr = ContentTypeResolver(filepath=source_path)
 
     logger.info(f"Uploading file: {source}")
     hsa = HubSpotAdapter(access_token=access_token)
@@ -101,6 +106,7 @@ def upload(
             )
             hit = HTMLImageTranslator(html=html, source_path=source_path, uploader=uploader)
             hit.discover().process()
+            logger.debug(hit)
             html = hit.html_out
 
         # Upload blog post.

diff --git a/hubspot_tech_writing/hubspot_api.py b/hubspot_tech_writing/hubspot_api.py
@@ -3,13 +3,14 @@
 import os
 import typing as t
 from copy import deepcopy
-from pathlib import Path
+from tempfile import NamedTemporaryFile
 
 import hubspot
 from click import confirm
 from hubspot import HubSpot
 from hubspot.cms.blogs.blog_posts import BlogPost
 from hubspot.files.files import File
+from pathlibfs import Path
 
 logger = logging.getLogger(__name__)
 
@@ -130,12 +131,14 @@ def get_file_by_name(self, file: "HubSpotFile") -> File:
         logger.info(f"Found file: id={result.id}, path={result.path}, url={result.url}")
         return result
 
-    def save_file(self, file_id: str, source: str):
+    def save_file(self, file_id: str, source: Path):
         """
         Save / overwrite existing file.
         """
+        tmpfile = NamedTemporaryFile()
+        tmpfile.write(source.read_bytes())
         return self.hs.files.files.files_api.replace(
-            file_id=file_id, file=source, options=json.dumps(self.FILE_OPTIONS)
+            file_id=file_id, file=tmpfile.name, options=json.dumps(self.FILE_OPTIONS)
         )
 
     def delete_file_by_id(self, identifier: str) -> t.Optional[File]:
@@ -254,7 +257,7 @@ class HubSpotFile:
     def __init__(
         self,
         hubspot_adapter: HubSpotAdapter,
-        source: t.Union[str, Path],
+        source: Path,
         identifier: t.Optional[str] = None,
         name: t.Optional[str] = None,
         folder_id: t.Optional[str] = None,
@@ -286,7 +289,7 @@ def __init__(
     def __str__(self):
         return (
             f"{self.__class__.__name__} identifier={self.identifier}, "
-            f"name={self.name}, folder={self.folder_id or self.folder_path}"
+            f"name={self.name}, folder={self.folder_id or self.folder_path}, source={self.source}"
         )
 
     def load(self):
@@ -310,7 +313,7 @@ def save(self):
         if not self.source:
             raise ValueError(f"Unable to save file without source: {self}")
         logger.info(f"Saving file: {self}")
-        return self.hsa.save_file(file_id=self.identifier, source=str(self.source))
+        return self.hsa.save_file(file_id=self.identifier, source=self.source)
 
     def delete(self):
         """

diff --git a/hubspot_tech_writing/util/common.py b/hubspot_tech_writing/util/common.py
@@ -1,9 +1,10 @@
 import logging
 import typing as t
-from pathlib import Path
 
 import colorlog
 from colorlog.escape_codes import escape_codes
+from pathlibfs import Path
+from yarl import URL
 
 
 def setup_logging(level=logging.INFO, verbose: bool = False):
@@ -23,9 +24,12 @@ class ContentTypeResolver:
     HTML_SUFFIXES = [".html", ".html5", ".htm"]
     TEXT_SUFFIXES = MARKUP_SUFFIXES + HTML_SUFFIXES + [".txt"]
 
-    def __init__(self, name: t.Union[str, Path]):
-        self.name = name
-        self.suffix = Path(name).suffix
+    def __init__(self, filepath: t.Union[str, Path]):
+        self.url = URL(str(filepath))
+        if self.url.is_absolute():
+            self.url = self.url.with_scheme("")
+        self.path = Path(str(self.url))
+        self.suffix = self.path.suffix
 
     def is_markup(self):
         return self.suffix in self.MARKUP_SUFFIXES
@@ -38,3 +42,8 @@ def is_text(self):
 
     def is_file(self):
         return not self.is_text()
+
+
+def url_to_path(filepath: str):
+    url = URL(str(filepath)).with_scheme("")
+    return Path(str(url))
diff --git a/hubspot_tech_writing/util/html.py b/hubspot_tech_writing/util/html.py
@@ -2,17 +2,18 @@
 import logging
 import typing as t
 from copy import deepcopy
-from pathlib import Path
+from pprint import pformat
 
 from bs4 import BeautifulSoup
+from pathlibfs import Path
 
 logger = logging.getLogger(__name__)
 
 
 @dataclasses.dataclass
 class HTMLImage:
     alt: str
-    src: str
+    src: Path
 
 
 class HTMLImageTranslator:
@@ -21,19 +22,18 @@ class HTMLImageTranslator:
     After that, replace URLs in HTML document.
     """
 
-    def __init__(self, html: str, source_path: t.Union[str, Path], uploader: t.Optional[t.Callable] = None):
+    def __init__(self, html: str, source_path: Path, uploader: t.Optional[t.Callable] = None):
         self.html_in: str = html
         self.html_out: t.Optional[str] = None
-        self.source_path = source_path
+        self.source = source_path
         self.uploader = uploader
         self.images_in: t.List[HTMLImage] = []
         self.images_local: t.List[HTMLImage] = []
         self.images_remote: t.List[HTMLImage] = []
 
     def __str__(self):
-        return (
-            f"HTMLImageTranslator:\nin:     {self.images_in}\nlocal:  {self.images_local}\nremote: {self.images_remote}"
-        )
+        info = {"source": self.source, "in": self.images_in, "local": self.images_local, "remote": self.images_remote}
+        return f"HTMLImageTranslator:\n{pformat(info)}"
 
     def discover(self):
         self.scan().resolve()
@@ -59,9 +59,10 @@ def resolve(self) -> "HTMLImageTranslator":
         """
         Process discovered image elements, computing effective paths.
         """
-        if self.source_path is None:
+        if self.source is None:
+            logger.warning("No resolving without source path")
             return self
-        parent_path = Path(self.source_path)
+        parent_path = self.source
         if parent_path.is_file():
             parent_path = parent_path.parent
         self.images_local = []
@@ -74,7 +75,7 @@ def resolve(self) -> "HTMLImageTranslator":
 
                 # Relative paths are relative to the original document.
                 else:
-                    image_new.src = str(Path(parent_path) / image.src)
+                    image_new.src = parent_path / image.src
             self.images_local.append(image_new)
         return self
 
@@ -86,7 +87,7 @@ def upload(self) -> "HTMLImageTranslator":
             logger.warning("No upload without uploader")
             return self
         for image_local in self.images_local:
-            hs_file = self.uploader(source=image_local.src, name=Path(image_local.src).name)
+            hs_file = self.uploader(source=image_local.src, name=image_local.src.name)
             image_url = hs_file.url
             image_remote: HTMLImage = deepcopy(image_local)
             image_remote.src = image_url

diff --git a/hubspot_tech_writing/util/io.py b/hubspot_tech_writing/util/io.py
@@ -3,20 +3,63 @@
 import typing as t
 from pathlib import Path
 
-import requests
+from pathlibfs import Path as PathPlus
+from yarl import URL
 
 
 @contextlib.contextmanager
 def to_io(source: t.Union[str, Path, t.IO]) -> t.Generator[t.IO, None, None]:
-    if isinstance(source, (str, Path)):
+    fp: t.IO
+    if isinstance(source, io.TextIOWrapper):
+        fp = source
+    elif isinstance(source, (str, Path, PathPlus)):
         source = str(source)
-        fp: t.IO
+        path = path_from_url(source)
+        fp = path.open(mode="rt")
+        """
         if source.startswith("http://") or source.startswith("https://"):
             response = requests.get(source, timeout=10.0)
             fp = io.StringIO(response.text)
         else:
             fp = open(source, "r")
+        """
     else:
-        fp = source
+        raise TypeError(f"Unable to converge to IO handle. type={type(source)}, value={source}")
     yield fp
     fp.close()
+
+
+def path_from_url(url: str) -> PathPlus:
+    """
+    Convert a URL to a pathlibfs path, translating `github+https://` URLs to the fsspec `github://` form.
+
+    Input URLs
+    ----------
+    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/path/to/document.md
+    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/blob/main/path/to/document.md
+
+    Output Path
+    -----------
+    fs = Path("github://path/to/document.md", username="foobar", token="ghp_lalala", org="acme", repo="sweet-camino")
+    """
+    uri = URL(url)
+
+    if uri.scheme.startswith("github+https"):
+        path_fragments = uri.path.split("/")[1:]
+        path_kwargs = {
+            "username": uri.user,
+            "token": uri.password,
+            "org": path_fragments[0],
+            "repo": path_fragments[1],
+        }
+
+        real_path_fragments = path_fragments[2:]
+        if path_fragments[2] == "blob":
+            real_path_fragments = path_fragments[4:]
+
+        downstream_url = "github://" + "/".join(real_path_fragments)
+        path = PathPlus(downstream_url, **path_kwargs)
+
+    else:
+        path = PathPlus(url)
+    return path
diff --git a/pyproject.toml b/pyproject.toml
@@ -70,7 +70,9 @@ dependencies = [
   "hubspot-api-client<9",
   "markdown<4",
   "mkdocs-linkcheck<2",
+  "pathlibfs<0.6",
   "requests<3",
+  "yarl<2",
 ]
 
 [project.optional-dependencies]