From c085e2d3b7b27d8720502ea55c21af7d904d4bff Mon Sep 17 00:00:00 2001 From: Thomas Gilgenast Date: Fri, 14 Mar 2025 14:29:13 -0400 Subject: [PATCH 1/9] rework github implementation to support lfs --- fsspec/implementations/github.py | 42 ++++++++++++++------- fsspec/implementations/tests/test_github.py | 41 ++++++++++++++++++++ 2 files changed, 69 insertions(+), 14 deletions(-) create mode 100644 fsspec/implementations/tests/test_github.py diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py index 3650b8eba..944fdf033 100644 --- a/fsspec/implementations/github.py +++ b/fsspec/implementations/github.py @@ -1,6 +1,6 @@ -import requests +import base64 -import fsspec +import requests from ..spec import AbstractFileSystem from ..utils import infer_storage_options @@ -36,7 +36,7 @@ class GithubFileSystem(AbstractFileSystem): """ url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}" - rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}" + content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}" protocol = "github" timeout = (60, 60) # connect, read timeouts @@ -219,21 +219,35 @@ def _open( ): if mode != "rb": raise NotImplementedError - url = self.rurl.format( + + # construct a url to hit the GitHub API's repo contents API + url = self.content_url.format( org=self.org, repo=self.repo, path=path, sha=sha or self.root ) + + # make a request to this API, and parse the response as JSON r = requests.get(url, timeout=self.timeout, **self.kw) if r.status_code == 404: raise FileNotFoundError(path) r.raise_for_status() + content_json = r.json() + + # if the response's content key is not empty, try to parse it as base64 + if content_json["content"]: + content = base64.b64decode(content_json["content"]) + + # as long as the content does not start with the string + # "version https://git-lfs.github.com/" + # then it is probably not a git-lfs pointer and we can just return + # the content directly + if not content.startswith(b"version https://git-lfs.github.com/"): + return MemoryFile(None, None, content) + + # we land here if the content was not present in the first response + # (regular file over 1MB or git-lfs tracked file) + # in this case, we get the content from the download_url + r = requests.get(content_json["download_url"], timeout=self.timeout, **self.kw) + if r.status_code == 404: + raise FileNotFoundError(path) + r.raise_for_status() return MemoryFile(None, None, r.content) - - def cat(self, path, recursive=False, on_error="raise", **kwargs): - paths = self.expand_path(path, recursive=recursive) - urls = [ - self.rurl.format(org=self.org, repo=self.repo, path=u, sha=self.root) - for u, sh in paths - ] - fs = fsspec.filesystem("http") - data = fs.cat(urls, on_error="return") - return {u: v for ((k, v), u) in zip(data.items(), urls)} diff --git a/fsspec/implementations/tests/test_github.py b/fsspec/implementations/tests/test_github.py new file mode 100644 index 000000000..540ec6b23 --- /dev/null +++ b/fsspec/implementations/tests/test_github.py @@ -0,0 +1,41 @@ +import fsspec + + +def test_github_open_small_file(): + # test opening a small file <1 MB + with fsspec.open("github://mwaskom:seaborn-data@4e06bf0/penguins.csv") as f: + assert f.readline().startswith(b"species,island") + + +def test_github_open_large_file(): + # test opening a large file >1 MB + with fsspec.open("github://mwaskom:seaborn-data@83bfba7/brain_networks.csv") as f: + assert f.readline().startswith(b"network,1,1,2,2") + + +def test_github_open_lfs_file(): + # test opening a git-lfs tracked file + with fsspec.open( + "github://cBioPortal:datahub@55cd360" + "/public/acc_2019/data_gene_panel_matrix.txt", + ) as f: + assert f.readline().startswith(b"SAMPLE_ID\tmutations") + + +def test_github_cat(): + # test using cat to fetch the content of multiple files + fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data") + paths = ["penguins.csv", "mpg.csv"] + cat_result = fs.cat(paths) + assert set(cat_result.keys()) == {"penguins.csv", "mpg.csv"} + assert cat_result["penguins.csv"].startswith(b"species,island") + assert cat_result["mpg.csv"].startswith(b"mpg,cylinders") + + +def test_github_ls(): + # test using ls to list the files in a resository + fs = fsspec.filesystem("github", org="mwaskom", repo="seaborn-data") + ls_result = set(fs.ls("")) + expected = {"brain_networks.csv", "mpg.csv", "penguins.csv", "README.md", "raw"} + # check if the result is a subset of the expected files + assert expected.issubset(ls_result) From a751042e5cd446a2d4b917bbc3341322961da91c Mon Sep 17 00:00:00 2001 From: Thomas Gilgenast Date: Fri, 14 Mar 2025 19:53:57 -0400 Subject: [PATCH 2/9] make github _open() return HTTPFile when getting content from download_url --- fsspec/implementations/github.py | 26 ++++++++++++++++----- fsspec/implementations/tests/test_github.py | 10 ++++++-- pyproject.toml | 2 +- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py index 944fdf033..e7d1f2847 100644 --- a/fsspec/implementations/github.py +++ b/fsspec/implementations/github.py @@ -2,8 +2,10 @@ import requests +from ..asyn import get_loop, sync from ..spec import AbstractFileSystem from ..utils import infer_storage_options +from .http import HTTPFile, HTTPFileSystem from .memory import MemoryFile # TODO: add GIST backend, would be very similar @@ -64,6 +66,11 @@ def __init__( self.root = sha self.ls("") + # prepare elements needed to return HTTPFile + self.http_fs = HTTPFileSystem(**kwargs) + self.loop = get_loop() + self.session = sync(self.loop, self.http_fs.set_session) + @property def kw(self): if self.username: @@ -245,9 +252,16 @@ def _open( # we land here if the content was not present in the first response # (regular file over 1MB or git-lfs tracked file) - # in this case, we get the content from the download_url - r = requests.get(content_json["download_url"], timeout=self.timeout, **self.kw) - if r.status_code == 404: - raise FileNotFoundError(path) - r.raise_for_status() - return MemoryFile(None, None, r.content) + # in this case, we get return an HTTPFile object wrapping the + # download_url + return HTTPFile( + self.http_fs, + content_json["download_url"], + session=self.session, + block_size=block_size, + autocommit=autocommit, + cache_options=cache_options, + size=content_json["size"], + loop=self.loop, + **kwargs, + ) diff --git a/fsspec/implementations/tests/test_github.py b/fsspec/implementations/tests/test_github.py index 540ec6b23..55072ac20 100644 --- a/fsspec/implementations/tests/test_github.py +++ b/fsspec/implementations/tests/test_github.py @@ -9,8 +9,14 @@ def test_github_open_small_file(): def test_github_open_large_file(): # test opening a large file >1 MB - with fsspec.open("github://mwaskom:seaborn-data@83bfba7/brain_networks.csv") as f: - assert f.readline().startswith(b"network,1,1,2,2") + # use block_size=0 to get a streaming interface to the file, ensuring that + # we fetch only the parts we need instead of downloading the full file all + # at once + with fsspec.open( + "github://mwaskom:seaborn-data@83bfba7/brain_networks.csv", block_size=0 + ) as f: + # read only the first 20 bytes of the file + assert f.read(20).startswith(b"network,1,1,2,2,3,3,") def test_github_open_lfs_file(): diff --git a/pyproject.toml b/pyproject.toml index 9c2355625..aa84f9047 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ full = [ fuse = ["fusepy"] gcs = ["gcsfs"] git = ["pygit2"] -github = ["requests"] +github = ["fsspec[http]", "requests"] gs = ["gcsfs"] gui = ["panel"] hdfs = ["pyarrow >= 1"] From f505affd792c10fcd2da28bfd7c7d16b7bb168cb Mon Sep 17 00:00:00 2001 From: Thomas Gilgenast Date: Fri, 14 Mar 2025 20:17:30 -0400 Subject: [PATCH 3/9] tweak tests for github implementation --- fsspec/implementations/tests/test_github.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fsspec/implementations/tests/test_github.py b/fsspec/implementations/tests/test_github.py index 55072ac20..a256f82dd 100644 --- a/fsspec/implementations/tests/test_github.py +++ b/fsspec/implementations/tests/test_github.py @@ -16,7 +16,7 @@ def test_github_open_large_file(): "github://mwaskom:seaborn-data@83bfba7/brain_networks.csv", block_size=0 ) as f: # read only the first 20 bytes of the file - assert f.read(20).startswith(b"network,1,1,2,2,3,3,") + assert f.read(20) == b"network,1,1,2,2,3,3," def test_github_open_lfs_file(): @@ -25,7 +25,7 @@ def test_github_open_lfs_file(): "github://cBioPortal:datahub@55cd360" "/public/acc_2019/data_gene_panel_matrix.txt", ) as f: - assert f.readline().startswith(b"SAMPLE_ID\tmutations") + assert f.read(19) == b"SAMPLE_ID\tmutations" def test_github_cat(): From 7d88086cf1a2721f529e56aa66e7a1c469db8866 Mon Sep 17 00:00:00 2001 From: Thomas Gilgenast Date: Fri, 14 Mar 2025 20:17:59 -0400 Subject: [PATCH 4/9] tweak tests for github implementation --- fsspec/implementations/tests/test_github.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fsspec/implementations/tests/test_github.py b/fsspec/implementations/tests/test_github.py index a256f82dd..32df94f74 100644 --- a/fsspec/implementations/tests/test_github.py +++ b/fsspec/implementations/tests/test_github.py @@ -24,6 +24,7 @@ def test_github_open_lfs_file(): with fsspec.open( "github://cBioPortal:datahub@55cd360" "/public/acc_2019/data_gene_panel_matrix.txt", + block_size=0, ) as f: assert f.read(19) == b"SAMPLE_ID\tmutations" From 75a6f4e2dcd3ba73a96230362c9f2c35d474cf22 Mon Sep 17 00:00:00 2001 From: Thomas Gilgenast Date: Fri, 14 Mar 2025 20:27:15 -0400 Subject: [PATCH 5/9] delegate HTTPFile creation to HTTPFileSystem.open() --- fsspec/implementations/github.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py index e7d1f2847..ca1980baa 100644 --- a/fsspec/implementations/github.py +++ b/fsspec/implementations/github.py @@ -2,10 +2,9 @@ import requests -from ..asyn import get_loop, sync from ..spec import AbstractFileSystem from ..utils import infer_storage_options -from .http import HTTPFile, HTTPFileSystem +from .http import HTTPFileSystem from .memory import MemoryFile # TODO: add GIST backend, would be very similar @@ -65,11 +64,7 @@ def __init__( self.root = sha self.ls("") - - # prepare elements needed to return HTTPFile self.http_fs = HTTPFileSystem(**kwargs) - self.loop = get_loop() - self.session = sync(self.loop, self.http_fs.set_session) @property def kw(self): @@ -219,7 +214,6 @@ def _open( path, mode="rb", block_size=None, - autocommit=True, cache_options=None, sha=None, **kwargs, @@ -252,16 +246,11 @@ def _open( # we land here if the content was not present in the first response # (regular file over 1MB or git-lfs tracked file) - # in this case, we get return an HTTPFile object wrapping the - # download_url - return HTTPFile( - self.http_fs, + # in this case, we get let the HTTPFileSystem handle the download + return self.http_fs.open( content_json["download_url"], - session=self.session, + mode=mode, block_size=block_size, - autocommit=autocommit, cache_options=cache_options, - size=content_json["size"], - loop=self.loop, **kwargs, ) From 8e7fb56130a7584c2566ed1dca91d8a440c3a62f Mon Sep 17 00:00:00 2001 From: Thomas Gilgenast Date: Mon, 17 Mar 2025 16:32:27 -0400 Subject: [PATCH 6/9] rework github large file open to treat http extra as optional --- fsspec/implementations/github.py | 12 +++++++++--- pyproject.toml | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py index ca1980baa..956510537 100644 --- a/fsspec/implementations/github.py +++ b/fsspec/implementations/github.py @@ -4,7 +4,6 @@ from ..spec import AbstractFileSystem from ..utils import infer_storage_options -from .http import HTTPFileSystem from .memory import MemoryFile # TODO: add GIST backend, would be very similar @@ -64,7 +63,7 @@ def __init__( self.root = sha self.ls("") - self.http_fs = HTTPFileSystem(**kwargs) + self.kwargs = kwargs @property def kw(self): @@ -247,7 +246,14 @@ def _open( # we land here if the content was not present in the first response # (regular file over 1MB or git-lfs tracked file) # in this case, we get let the HTTPFileSystem handle the download - return self.http_fs.open( + try: + from .http import HTTPFileSystem + except ImportError as e: + raise ImportError( + "Please install fsspec[http] to acccess github files >1 MB " + "or git-lfs tracked files." + ) from e + return HTTPFileSystem(**self.kwargs).open( content_json["download_url"], mode=mode, block_size=block_size, diff --git a/pyproject.toml b/pyproject.toml index aa84f9047..9c2355625 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ full = [ fuse = ["fusepy"] gcs = ["gcsfs"] git = ["pygit2"] -github = ["fsspec[http]", "requests"] +github = ["requests"] gs = ["gcsfs"] gui = ["panel"] hdfs = ["pyarrow >= 1"] From 083392bdb4e603ab7be93db36260f8de6aef6969 Mon Sep 17 00:00:00 2001 From: Thomas Gilgenast Date: Mon, 17 Mar 2025 16:35:56 -0400 Subject: [PATCH 7/9] update docstring --- fsspec/implementations/github.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py index 956510537..d22953458 100644 --- a/fsspec/implementations/github.py +++ b/fsspec/implementations/github.py @@ -16,8 +16,10 @@ class GithubFileSystem(AbstractFileSystem): repository. You may specify a point in the repos history, by SHA, branch or tag (default is current master). - Given that code files tend to be small, and that github does not support - retrieving partial content, we always fetch whole files. + For files less than 1 MB in size, file content is returned directly in a + MemoryFile. For larger files, or for files tracked by git-lfs, file content + is returned as an HTTPFile wrapping the ``download_url`` provided by the + GitHub API. When using fsspec.open, allows URIs of the form: From 65e09cf4cf7b5d1de30bb3d31ce4c3c0431a7a71 Mon Sep 17 00:00:00 2001 From: Thomas Gilgenast Date: Mon, 17 Mar 2025 16:41:31 -0400 Subject: [PATCH 8/9] alternate approach to deferred import that allows HTTPFileSystem reuse --- fsspec/implementations/github.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py index d22953458..36c3647cf 100644 --- a/fsspec/implementations/github.py +++ b/fsspec/implementations/github.py @@ -65,7 +65,12 @@ def __init__( self.root = sha self.ls("") - self.kwargs = kwargs + try: + from .http import HTTPFileSystem + + self.http_fs = HTTPFileSystem(**kwargs) + except ImportError: + self.http_fs = None @property def kw(self): @@ -248,14 +253,12 @@ def _open( # we land here if the content was not present in the first response # (regular file over 1MB or git-lfs tracked file) # in this case, we get let the HTTPFileSystem handle the download - try: - from .http import HTTPFileSystem - except ImportError as e: + if self.http_fs is None: raise ImportError( "Please install fsspec[http] to acccess github files >1 MB " "or git-lfs tracked files." - ) from e - return HTTPFileSystem(**self.kwargs).open( + ) + return self.http_fs.open( content_json["download_url"], mode=mode, block_size=block_size, From 38aadce4b9263d3253f5635099631ac7eaa4d776 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 17 Mar 2025 17:36:23 -0400 Subject: [PATCH 9/9] lint --- fsspec/implementations/github.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fsspec/implementations/github.py b/fsspec/implementations/github.py index 36c3647cf..b624ca522 100644 --- a/fsspec/implementations/github.py +++ b/fsspec/implementations/github.py @@ -255,7 +255,7 @@ def _open( # in this case, we get let the HTTPFileSystem handle the download if self.http_fs is None: raise ImportError( - "Please install fsspec[http] to acccess github files >1 MB " + "Please install fsspec[http] to access github files >1 MB " "or git-lfs tracked files." ) return self.http_fs.open(