Skip to content

rework github _open() implementation to support LFS #1810

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 17, 2025
64 changes: 46 additions & 18 deletions fsspec/implementations/github.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import requests
import base64

import fsspec
import requests

from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
Expand All @@ -16,8 +16,10 @@ class GithubFileSystem(AbstractFileSystem):
repository. You may specify a point in the repo's history, by SHA, branch
or tag (default is current master).

Given that code files tend to be small, and that github does not support
retrieving partial content, we always fetch whole files.
For files less than 1 MB in size, file content is returned directly in a
MemoryFile. For larger files, or for files tracked by git-lfs, file content
is returned as an HTTPFile wrapping the ``download_url`` provided by the
GitHub API.

When using fsspec.open, allows URIs of the form:

Expand All @@ -36,7 +38,7 @@ class GithubFileSystem(AbstractFileSystem):
"""

url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
protocol = "github"
timeout = (60, 60) # connect, read timeouts

Expand All @@ -63,6 +65,12 @@ def __init__(

self.root = sha
self.ls("")
try:
from .http import HTTPFileSystem

self.http_fs = HTTPFileSystem(**kwargs)
except ImportError:
self.http_fs = None

@property
def kw(self):
Expand Down Expand Up @@ -212,28 +220,48 @@ def _open(
path,
mode="rb",
block_size=None,
autocommit=True,
cache_options=None,
sha=None,
**kwargs,
):
if mode != "rb":
raise NotImplementedError
url = self.rurl.format(

# construct a url to hit the GitHub API's repo contents API
url = self.content_url.format(
org=self.org, repo=self.repo, path=path, sha=sha or self.root
)

# make a request to this API, and parse the response as JSON
r = requests.get(url, timeout=self.timeout, **self.kw)
if r.status_code == 404:
raise FileNotFoundError(path)
r.raise_for_status()
return MemoryFile(None, None, r.content)

def cat(self, path, recursive=False, on_error="raise", **kwargs):
paths = self.expand_path(path, recursive=recursive)
urls = [
self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root)
for u, sh in paths
]
fs = fsspec.filesystem("http")
data = fs.cat(urls, on_error="return")
return {u: v for ((k, v), u) in zip(data.items(), urls)}
content_json = r.json()

# if the response's content key is not empty, try to parse it as base64
if content_json["content"]:
content = base64.b64decode(content_json["content"])

# as long as the content does not start with the string
# "version https://git-lfs.github.com/"
# then it is probably not a git-lfs pointer and we can just return
# the content directly
if not content.startswith(b"version https://git-lfs.github.com/"):
return MemoryFile(None, None, content)

# we land here if the content was not present in the first response
# (regular file over 1MB or git-lfs tracked file)
# in this case, we let the HTTPFileSystem handle the download
if self.http_fs is None:
raise ImportError(
"Please install fsspec[http] to access github files >1 MB "
"or git-lfs tracked files."
)
return self.http_fs.open(
content_json["download_url"],
mode=mode,
block_size=block_size,
cache_options=cache_options,
**kwargs,
)
48 changes: 48 additions & 0 deletions fsspec/implementations/tests/test_github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import fsspec


def test_github_open_small_file():
    """Open a file smaller than 1 MB and check how its first line begins."""
    url = "github://mwaskom:seaborn-data@4e06bf0/penguins.csv"
    with fsspec.open(url) as handle:
        first_line = handle.readline()
    assert first_line.startswith(b"species,island")


def test_github_open_large_file():
    """Open a file larger than 1 MB and read only its leading bytes.

    ``block_size=0`` is passed to get a streaming interface to the file, so
    that only the requested part is fetched instead of downloading the full
    file all at once.
    """
    url = "github://mwaskom:seaborn-data@83bfba7/brain_networks.csv"
    with fsspec.open(url, block_size=0) as stream:
        # the first 20 bytes are enough to identify the file
        head = stream.read(20)
    assert head == b"network,1,1,2,2,3,3,"


def test_github_open_lfs_file():
    """Open a git-lfs tracked file and check the start of its content."""
    url = (
        "github://cBioPortal:datahub@55cd360"
        "/public/acc_2019/data_gene_panel_matrix.txt"
    )
    with fsspec.open(url, block_size=0) as stream:
        head = stream.read(19)
    assert head == b"SAMPLE_ID\tmutations"


def test_github_cat():
    """Fetch several files in one ``cat`` call and check each payload."""
    # file name -> bytes its content is expected to start with
    expected_prefixes = {
        "penguins.csv": b"species,island",
        "mpg.csv": b"mpg,cylinders",
    }
    fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
    fetched = fs.cat(list(expected_prefixes))
    assert set(fetched.keys()) == set(expected_prefixes)
    for name, prefix in expected_prefixes.items():
        assert fetched[name].startswith(prefix)


def test_github_ls():
    """List the repository root with ``ls`` and check known entries exist."""
    fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data")
    listed = set(fs.ls(""))
    required = {
        "brain_networks.csv",
        "mpg.csv",
        "penguins.csv",
        "README.md",
        "raw",
    }
    # every required entry must appear in the listing; the listing itself
    # may contain additional files
    assert required <= listed
Loading