Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use pathlibfs for scheme-agnostic file access #4

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@


## Unreleased
- Use `pathlibfs` for scheme-agnostic source access

## 2023-10-07 0.1.0
- Add example data files in different formats
Expand Down
12 changes: 9 additions & 3 deletions hubspot_tech_writing/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from hubspot_tech_writing.hubspot_api import HubSpotAdapter, HubSpotBlogPost, HubSpotFile
from hubspot_tech_writing.util.common import ContentTypeResolver
from hubspot_tech_writing.util.html import HTMLImageTranslator
from hubspot_tech_writing.util.io import to_io
from hubspot_tech_writing.util.io import open_url, to_io

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -75,9 +75,14 @@ def upload(
folder_id: t.Optional[str] = None,
folder_path: t.Optional[str] = None,
):
source_path = Path(source)
source_path: Path
if isinstance(source, str):
source_path = open_url(source)
else:
source_path = source
logger.info(f"Source: {source_path}")

ctr = ContentTypeResolver(name=source_path)
ctr = ContentTypeResolver(filepath=source_path)

logger.info(f"Uploading file: {source}")
hsa = HubSpotAdapter(access_token=access_token)
Expand All @@ -101,6 +106,7 @@ def upload(
)
hit = HTMLImageTranslator(html=html, source_path=source_path, uploader=uploader)
hit.discover().process()
logger.debug(hit)
html = hit.html_out

# Upload blog post.
Expand Down
15 changes: 9 additions & 6 deletions hubspot_tech_writing/hubspot_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
import os
import typing as t
from copy import deepcopy
from pathlib import Path
from tempfile import NamedTemporaryFile

import hubspot
from click import confirm
from hubspot import HubSpot
from hubspot.cms.blogs.blog_posts import BlogPost
from hubspot.files.files import File
from pathlibfs import Path

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -130,12 +131,14 @@ def get_file_by_name(self, file: "HubSpotFile") -> File:
logger.info(f"Found file: id={result.id}, path={result.path}, url={result.url}")
return result

def save_file(self, file_id: str, source: str):
def save_file(self, file_id: str, source: Path):
"""
Save / overwrite existing file.
"""
tmpfile = NamedTemporaryFile()
tmpfile.write(source.read_bytes())
return self.hs.files.files.files_api.replace(
file_id=file_id, file=source, options=json.dumps(self.FILE_OPTIONS)
file_id=file_id, file=tmpfile.name, options=json.dumps(self.FILE_OPTIONS)
)

def delete_file_by_id(self, identifier: str) -> t.Optional[File]:
Expand Down Expand Up @@ -254,7 +257,7 @@ class HubSpotFile:
def __init__(
self,
hubspot_adapter: HubSpotAdapter,
source: t.Union[str, Path],
source: Path,
identifier: t.Optional[str] = None,
name: t.Optional[str] = None,
folder_id: t.Optional[str] = None,
Expand Down Expand Up @@ -286,7 +289,7 @@ def __init__(
def __str__(self):
return (
f"{self.__class__.__name__} identifier={self.identifier}, "
f"name={self.name}, folder={self.folder_id or self.folder_path}"
f"name={self.name}, folder={self.folder_id or self.folder_path}, source={self.source}"
)

def load(self):
Expand All @@ -310,7 +313,7 @@ def save(self):
if not self.source:
raise ValueError(f"Unable to save file without source: {self}")
logger.info(f"Saving file: {self}")
return self.hsa.save_file(file_id=self.identifier, source=str(self.source))
return self.hsa.save_file(file_id=self.identifier, source=self.source)

def delete(self):
"""
Expand Down
10 changes: 6 additions & 4 deletions hubspot_tech_writing/util/common.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import logging
import typing as t
from pathlib import Path

import colorlog
from colorlog.escape_codes import escape_codes
from pathlibfs import Path

from hubspot_tech_writing.util.io import path_without_scheme


def setup_logging(level=logging.INFO, verbose: bool = False):
Expand All @@ -23,9 +25,9 @@ class ContentTypeResolver:
HTML_SUFFIXES = [".html", ".html5", ".htm"]
TEXT_SUFFIXES = MARKUP_SUFFIXES + HTML_SUFFIXES + [".txt"]

def __init__(self, name: t.Union[str, Path]):
self.name = name
self.suffix = Path(name).suffix
def __init__(self, filepath: t.Union[str, Path]):
self.path = path_without_scheme(filepath)
self.suffix = self.path.suffix

def is_markup(self):
return self.suffix in self.MARKUP_SUFFIXES
Expand Down
26 changes: 13 additions & 13 deletions hubspot_tech_writing/util/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@
import logging
import typing as t
from copy import deepcopy
from pathlib import Path
from pprint import pformat

from bs4 import BeautifulSoup
from pathlibfs import Path

logger = logging.getLogger(__name__)


@dataclasses.dataclass
class HTMLImage:
alt: str
src: str
src: Path


class HTMLImageTranslator:
Expand All @@ -21,19 +22,18 @@ class HTMLImageTranslator:
After that, replace URLs in HTML document.
"""

def __init__(self, html: str, source_path: t.Union[str, Path], uploader: t.Optional[t.Callable] = None):
def __init__(self, html: str, source_path: Path, uploader: t.Optional[t.Callable] = None):
self.html_in: str = html
self.html_out: t.Optional[str] = None
self.source_path = source_path
self.source = source_path
self.uploader = uploader
self.images_in: t.List[HTMLImage] = []
self.images_local: t.List[HTMLImage] = []
self.images_remote: t.List[HTMLImage] = []

def __str__(self):
return (
f"HTMLImageTranslator:\nin: {self.images_in}\nlocal: {self.images_local}\nremote: {self.images_remote}"
)
info = {"source": self.source, "in": self.images_in, "local": self.images_local, "remote": self.images_remote}
return f"HTMLImageTranslator:\n{pformat(info)}"

def discover(self):
self.scan().resolve()
Expand All @@ -59,9 +59,10 @@ def resolve(self) -> "HTMLImageTranslator":
"""
Process discovered image elements, computing effective paths.
"""
if self.source_path is None:
if self.source is None:
logger.warning("No resolving without source path")
return self
parent_path = Path(self.source_path)
parent_path = self.source
if parent_path.is_file():
parent_path = parent_path.parent
self.images_local = []
Expand All @@ -74,7 +75,7 @@ def resolve(self) -> "HTMLImageTranslator":

# Relative paths are relative to the original document.
else:
image_new.src = str(Path(parent_path) / image.src)
image_new.src = parent_path / image.src
self.images_local.append(image_new)
return self

Expand All @@ -86,10 +87,9 @@ def upload(self) -> "HTMLImageTranslator":
logger.warning("No upload without uploader")
return self
for image_local in self.images_local:
hs_file = self.uploader(source=image_local.src, name=Path(image_local.src).name)
image_url = hs_file.url
hs_file = self.uploader(source=image_local.src, name=image_local.src.name)
image_remote: HTMLImage = deepcopy(image_local)
image_remote.src = image_url
image_remote.src = hs_file.url
self.images_remote.append(image_remote)
return self

Expand Down
71 changes: 62 additions & 9 deletions hubspot_tech_writing/util/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,73 @@
import typing as t
from pathlib import Path

import requests
from pathlibfs import Path as PathPlus
from yarl import URL


@contextlib.contextmanager
def to_io(source: t.Union[str, Path, t.IO]) -> t.Generator[t.IO, None, None]:
if isinstance(source, (str, Path)):
"""
Main context manager for accessing resources.
Before accessing / opening, it converges a path string, object, or IO handle, to an IO handle.
"""
fp: t.IO
if isinstance(source, io.TextIOWrapper):
fp = source
elif isinstance(source, (str, Path, PathPlus)):
source = str(source)
fp: t.IO
if source.startswith("http://") or source.startswith("https://"):
response = requests.get(source, timeout=10.0)
fp = io.StringIO(response.text)
else:
fp = open(source, "r")
path = open_url(source)
fp = path.open(mode="rt")
else:
fp = source
raise TypeError(f"Unable to converge to IO handle. type={type(source)}, value={source}")
yield fp
fp.close()


def open_url(url: str) -> PathPlus:
    """
    Translate a URL into a pathlibfs path object, with specific handling for GitHub URLs.

    When approached using a `github+https://` URL, converge it to a pathlibfs / fsspec
    `github://` URL, and return a path object for it. Any other URL is handed to
    pathlibfs unchanged.

    Input URLs
    ----------
    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/path/to/document.md
    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/blob/main/path/to/document.md

    Output Path
    -----------
    fs = Path("github://path/to/document.md", username="foobar", token="ghp_lalala", org="acme", repo="sweet-camino")

    :param url: The URL to translate.
    :return: A pathlibfs path object addressing the resource.
    :raises ValueError: When a GitHub URL does not include both organization and repository.
    """
    uri = URL(url)

    # Everything which is not a GitHub URL does not need any translation.
    if not uri.scheme.startswith("github+https"):
        return PathPlus(url)

    # The URL path starts with a slash, so the first fragment is always empty.
    path_fragments = uri.path.split("/")[1:]
    if len(path_fragments) < 2 or not all(path_fragments[:2]):
        # Only report the path here: The full URL may carry credentials.
        raise ValueError(f"GitHub URL must include organization and repository. path={uri.path}")

    path_kwargs = {
        "username": uri.user,
        "token": uri.password,
        "org": path_fragments[0],
        "repo": path_fragments[1],
    }

    # URLs copied from the GitHub web interface carry a `blob/<ref>/` or `raw/<ref>/` infix.
    # Both segments are skipped; the ref itself is not forwarded to the filesystem.
    real_path_fragments = path_fragments[2:]
    if real_path_fragments and real_path_fragments[0] in ("blob", "raw"):
        real_path_fragments = path_fragments[4:]

    downstream_url = "github://" + "/".join(real_path_fragments)
    return PathPlus(downstream_url, **path_kwargs)


def path_without_scheme(url_like: t.Union[str, Path, PathPlus]) -> PathPlus:
    """
    Return a pathlibfs Path, without the scheme.

    The input is converted to a string before parsing, so path objects are accepted
    as well as plain strings. The scheme is only removed when yarl considers the
    value an absolute URL; otherwise, no scheme removal is attempted.

    :param url_like: A URL or path, either as string or as path object.
    :return: A pathlibfs path object built from the scheme-less value.
    """
    url = URL(str(url_like))
    if url.is_absolute():
        url = url.with_scheme("")
    return PathPlus(str(url))
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ dependencies = [
"hubspot-api-client<9",
"markdown<4",
"mkdocs-linkcheck<2",
"pathlibfs<0.6",
"requests<3",
"yarl<2",
]

[project.optional-dependencies]
Expand Down
24 changes: 24 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,30 @@ def markdownfile() -> Path:
return Path(__file__).parent / "data" / "hubspot-blog-post-original.md"


def get_markdownurl(infix: str = "", scheme: str = "https:") -> str:
    """
    Compose the URL of the example Markdown document within the GitHub repository.

    :param infix: Path segment inserted right after the repository, e.g. `raw/main/`.
    :param scheme: URL scheme including the trailing colon, e.g. `github+https:`.
    """
    repository = "github.com/crate-workbench/hubspot-tech-writing"
    document = "tests/data/hubspot-blog-post-original.md"
    return f"{scheme}//{repository}/{infix}{document}"


@pytest.fixture
def markdownurl_https_raw() -> str:
    """
    URL of the example Markdown document, using the default `https:` scheme and a `raw/main/` infix.
    """
    url = get_markdownurl(infix="raw/main/")
    return url


@pytest.fixture
def markdownurl_github_https_bare() -> str:
    """
    URL of the example Markdown document, using the `github+https:` scheme and no infix.
    """
    url = get_markdownurl(scheme="github+https:")
    return url


@pytest.fixture
def markdownurl_github_https_raw() -> str:
    """
    URL of the example Markdown document, using the `github+https:` scheme and a `raw/main/` infix.
    """
    url = get_markdownurl(infix="raw/main/", scheme="github+https:")
    return url


@pytest.fixture
def markdownurl_github_https_blob() -> str:
    """
    URL of the example Markdown document, using the `github+https:` scheme and a `blob/main/` infix.
    """
    url = get_markdownurl(infix="blob/main/", scheme="github+https:")
    return url


@pytest.fixture
def markdownfile_minimal_broken_links() -> Path:
return Path(__file__).parent / "data" / "minimal-broken-links.md"
Expand Down
5 changes: 2 additions & 3 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ def test_convert_file(markdownfile):
check_content(html)


def test_convert_url():
url = "https://github.com/crate-workbench/hubspot-tech-writing/raw/main/tests/data/hubspot-blog-post-original.md"
html = convert(url)
def test_convert_url(markdownurl_https_raw):
html = convert(markdownurl_https_raw)
check_content(html)


Expand Down
Loading
Loading