Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use pathlibfs for scheme-agnostic file access #4

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Use pathlibfs for scheme-agnostic source access
  • Loading branch information
amotl committed May 28, 2024
commit 2f79fc9aaf0c6166948cf783ffd84968e5615d76
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@


## Unreleased
- Use `pathlibfs` for scheme-agnostic source access

## 2023-10-07 0.1.0
- Add example data files in different formats
Expand Down
12 changes: 9 additions & 3 deletions hubspot_tech_writing/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from hubspot_tech_writing.hubspot_api import HubSpotAdapter, HubSpotBlogPost, HubSpotFile
from hubspot_tech_writing.util.common import ContentTypeResolver
from hubspot_tech_writing.util.html import HTMLImageTranslator
from hubspot_tech_writing.util.io import to_io
from hubspot_tech_writing.util.io import path_from_url, to_io

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -75,9 +75,14 @@ def upload(
folder_id: t.Optional[str] = None,
folder_path: t.Optional[str] = None,
):
source_path = Path(source)
source_path: Path
if isinstance(source, str):
source_path = path_from_url(source)
else:
source_path = source
logger.info(f"Source: {source_path}")

ctr = ContentTypeResolver(name=source_path)
ctr = ContentTypeResolver(filepath=source_path)

logger.info(f"Uploading file: {source}")
hsa = HubSpotAdapter(access_token=access_token)
Expand All @@ -101,6 +106,7 @@ def upload(
)
hit = HTMLImageTranslator(html=html, source_path=source_path, uploader=uploader)
hit.discover().process()
logger.debug(hit)
html = hit.html_out

# Upload blog post.
Expand Down
15 changes: 9 additions & 6 deletions hubspot_tech_writing/hubspot_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
import os
import typing as t
from copy import deepcopy
from pathlib import Path
from tempfile import NamedTemporaryFile

import hubspot
from click import confirm
from hubspot import HubSpot
from hubspot.cms.blogs.blog_posts import BlogPost
from hubspot.files.files import File
from pathlibfs import Path

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -130,12 +131,14 @@ def get_file_by_name(self, file: "HubSpotFile") -> File:
logger.info(f"Found file: id={result.id}, path={result.path}, url={result.url}")
return result

def save_file(self, file_id: str, source: str):
def save_file(self, file_id: str, source: Path):
"""
Save / overwrite existing file.
"""
tmpfile = NamedTemporaryFile()
tmpfile.write(source.read_bytes())
return self.hs.files.files.files_api.replace(
file_id=file_id, file=source, options=json.dumps(self.FILE_OPTIONS)
file_id=file_id, file=tmpfile.name, options=json.dumps(self.FILE_OPTIONS)
)

def delete_file_by_id(self, identifier: str) -> t.Optional[File]:
Expand Down Expand Up @@ -254,7 +257,7 @@ class HubSpotFile:
def __init__(
self,
hubspot_adapter: HubSpotAdapter,
source: t.Union[str, Path],
source: Path,
identifier: t.Optional[str] = None,
name: t.Optional[str] = None,
folder_id: t.Optional[str] = None,
Expand Down Expand Up @@ -286,7 +289,7 @@ def __init__(
def __str__(self):
return (
f"{self.__class__.__name__} identifier={self.identifier}, "
f"name={self.name}, folder={self.folder_id or self.folder_path}"
f"name={self.name}, folder={self.folder_id or self.folder_path}, source={self.source}"
)

def load(self):
Expand All @@ -310,7 +313,7 @@ def save(self):
if not self.source:
raise ValueError(f"Unable to save file without source: {self}")
logger.info(f"Saving file: {self}")
return self.hsa.save_file(file_id=self.identifier, source=str(self.source))
return self.hsa.save_file(file_id=self.identifier, source=self.source)

def delete(self):
"""
Expand Down
17 changes: 13 additions & 4 deletions hubspot_tech_writing/util/common.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import typing as t
from pathlib import Path

import colorlog
from colorlog.escape_codes import escape_codes
from pathlibfs import Path
from yarl import URL


def setup_logging(level=logging.INFO, verbose: bool = False):
Expand All @@ -23,9 +24,12 @@ class ContentTypeResolver:
HTML_SUFFIXES = [".html", ".html5", ".htm"]
TEXT_SUFFIXES = MARKUP_SUFFIXES + HTML_SUFFIXES + [".txt"]

def __init__(self, name: t.Union[str, Path]):
self.name = name
self.suffix = Path(name).suffix
def __init__(self, filepath: t.Union[str, Path]):
self.url = URL(str(filepath))
if self.url.is_absolute():
self.url = self.url.with_scheme("")
self.path = Path(str(self.url))
self.suffix = self.path.suffix

def is_markup(self):
return self.suffix in self.MARKUP_SUFFIXES
Expand All @@ -38,3 +42,8 @@ def is_text(self):

def is_file(self):
return not self.is_text()


def url_to_path(filepath: str) -> Path:
    """
    Convert a URL or a plain file path into a `Path`, dropping any URL scheme.

    :param filepath: URL (e.g. ``https://example.org/doc.md``) or plain,
        scheme-less file path (e.g. ``path/to/doc.md``).
    :return: `Path` built from the input with its scheme removed.
    """
    url = URL(str(filepath))
    # yarl only permits replacing the scheme of absolute URLs; scheme-less
    # inputs such as plain file paths are relative and are used unchanged.
    # This is the same guard `ContentTypeResolver.__init__` applies.
    if url.is_absolute():
        url = url.with_scheme("")
    return Path(str(url))
23 changes: 12 additions & 11 deletions hubspot_tech_writing/util/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@
import logging
import typing as t
from copy import deepcopy
from pathlib import Path
from pprint import pformat

from bs4 import BeautifulSoup
from pathlibfs import Path

logger = logging.getLogger(__name__)


@dataclasses.dataclass
class HTMLImage:
alt: str
src: str
src: Path


class HTMLImageTranslator:
Expand All @@ -21,19 +22,18 @@ class HTMLImageTranslator:
After that, replace URLs in HTML document.
"""

def __init__(self, html: str, source_path: t.Union[str, Path], uploader: t.Optional[t.Callable] = None):
def __init__(self, html: str, source_path: Path, uploader: t.Optional[t.Callable] = None):
self.html_in: str = html
self.html_out: t.Optional[str] = None
self.source_path = source_path
self.source = source_path
self.uploader = uploader
self.images_in: t.List[HTMLImage] = []
self.images_local: t.List[HTMLImage] = []
self.images_remote: t.List[HTMLImage] = []

def __str__(self):
return (
f"HTMLImageTranslator:\nin: {self.images_in}\nlocal: {self.images_local}\nremote: {self.images_remote}"
)
info = {"source": self.source, "in": self.images_in, "local": self.images_local, "remote": self.images_remote}
return f"HTMLImageTranslator:\n{pformat(info)}"

def discover(self):
self.scan().resolve()
Expand All @@ -59,9 +59,10 @@ def resolve(self) -> "HTMLImageTranslator":
"""
Process discovered image elements, computing effective paths.
"""
if self.source_path is None:
if self.source is None:
logger.warning("No resolving without source path")
return self
parent_path = Path(self.source_path)
parent_path = self.source
if parent_path.is_file():
parent_path = parent_path.parent
self.images_local = []
Expand All @@ -74,7 +75,7 @@ def resolve(self) -> "HTMLImageTranslator":

# Relative paths are relative to the original document.
else:
image_new.src = str(Path(parent_path) / image.src)
image_new.src = parent_path / image.src
self.images_local.append(image_new)
return self

Expand All @@ -86,7 +87,7 @@ def upload(self) -> "HTMLImageTranslator":
logger.warning("No upload without uploader")
return self
for image_local in self.images_local:
hs_file = self.uploader(source=image_local.src, name=Path(image_local.src).name)
hs_file = self.uploader(source=image_local.src, name=image_local.src.name)
image_url = hs_file.url
image_remote: HTMLImage = deepcopy(image_local)
image_remote.src = image_url
Expand Down
51 changes: 47 additions & 4 deletions hubspot_tech_writing/util/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,63 @@
import typing as t
from pathlib import Path

import requests
from pathlibfs import Path as PathPlus
from yarl import URL


@contextlib.contextmanager
def to_io(source: t.Union[str, Path, t.IO]) -> t.Generator[t.IO, None, None]:
if isinstance(source, (str, Path)):
fp: t.IO
if isinstance(source, io.TextIOWrapper):
fp = source
elif isinstance(source, (str, Path, PathPlus)):
source = str(source)
fp: t.IO
path = path_from_url(source)
fp = path.open(mode="rt")
"""
if source.startswith("http://") or source.startswith("https://"):
response = requests.get(source, timeout=10.0)
fp = io.StringIO(response.text)
else:
fp = open(source, "r")
"""
else:
fp = source
raise TypeError(f"Unable to converge to IO handle. type={type(source)}, value={source}")
yield fp
fp.close()


def path_from_url(url: str) -> PathPlus:
    """
    Convert a URL or file path into a pathlibfs / fsspec path.

    URLs whose scheme starts with ``github+https`` are rewritten to a
    ``github://`` path; user, password (token), organization and repository
    are taken from the URL and forwarded as keyword arguments. Any other
    input is handed to ``PathPlus`` unchanged.

    Input URLs
    ----------
    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/path/to/document.md
    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/blob/main/path/to/document.md

    Output Path
    -----------
    fs = Path("github://path/to/document.md", username="foobar", token="ghp_lalala", org="acme", repo="sweet-camino")

    :param url: Source URL or plain file path.
    :return: Path object addressing the resource.
    :raises ValueError: If a ``github+https`` URL does not include both an
        organization and a repository in its path.
    """
    uri = URL(url)

    # Anything that is not a GitHub HTTP URL is passed through verbatim.
    if not uri.scheme.startswith("github+https"):
        return PathPlus(url)

    # The path starts with a slash, so drop the empty leading fragment.
    path_fragments = uri.path.split("/")[1:]
    if len(path_fragments) < 2 or not all(path_fragments[:2]):
        # The message deliberately omits the full URL: it may embed a token.
        raise ValueError(
            f"GitHub URL must include organization and repository, like '/<org>/<repo>/...'. path={uri.path}"
        )
    org, repo, *real_path_fragments = path_fragments

    # Web UI URLs use `/<org>/<repo>/blob/<ref>/<path>`: skip `blob` and `<ref>`.
    if real_path_fragments[:1] == ["blob"]:
        real_path_fragments = real_path_fragments[2:]

    # An empty remainder addresses the repository root.
    downstream_url = "github://" + "/".join(real_path_fragments)
    return PathPlus(
        downstream_url,
        username=uri.user,
        token=uri.password,
        org=org,
        repo=repo,
    )
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ dependencies = [
"hubspot-api-client<9",
"markdown<4",
"mkdocs-linkcheck<2",
"pathlibfs<0.6",
"requests<3",
"yarl<2",
]

[project.optional-dependencies]
Expand Down