Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow reading from remote resources over http #19

Merged
merged 7 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@
# --------------------
# This file records the packages and requirements needed in order for
# the library to work as expected. And to run tests.
aiohttp
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is aiohttp implicitly required by fsspec? Or does it change the behavior of fsspec if it's available at runtime?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It changes the behavior. The http protocol did not seem to be supported at all without it:

>>> of = fsspec.open('https://storage.googleapis.com/gef-ckan-public-data/awc-isric-soilgrids/awc.tif.yml')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\Users\dmf\projects\geometamaker\env-test\Lib\site-packages\fsspec\core.py", line 459, in open
    out = open_files(
          ^^^^^^^^^^^
  File "C:\Users\dmf\projects\geometamaker\env-test\Lib\site-packages\fsspec\core.py", line 283, in open_files
    fs, fs_token, paths = get_fs_token_paths(
                          ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dmf\projects\geometamaker\env-test\Lib\site-packages\fsspec\core.py", line 623, in get_fs_token_paths
    chain = _un_chain(urlpath0, storage_options or {})
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dmf\projects\geometamaker\env-test\Lib\site-packages\fsspec\core.py", line 332, in _un_chain
    cls = get_filesystem_class(protocol)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dmf\projects\geometamaker\env-test\Lib\site-packages\fsspec\registry.py", line 238, in get_filesystem_class
    raise ImportError(bit["err"]) from e
ImportError: HTTPFileSystem requires "requests" and "aiohttp" to be installed

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. It's strange to me that aiohttp is listed as an extra requirement in fsspec's setup.py but not requests. Oh well!

fsspec
GDAL
jsonschema
numpy
pygeometa
pygeoprocessing>=2.4.2
shapely
pyyaml
pygeoprocessing>=2.4.3
pyyaml
requests
shapely
67 changes: 51 additions & 16 deletions src/geometamaker/geometamaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import uuid
from datetime import datetime

import fsspec
import jsonschema
from jsonschema.exceptions import ValidationError
import pygeometa.core
Expand Down Expand Up @@ -189,32 +190,47 @@ def __init__(self, source_dataset_path=None):
Instantiating without a ``source_dataset_path`` creates an MCF template.

Args:
source_dataset_path (string): path to dataset to which the metadata
applies
source_dataset_path (string): path or URL to dataset to which the
metadata applies

"""
self.mcf = None
if source_dataset_path is not None:
self.datasource = source_dataset_path
self.mcf_path = f'{self.datasource}.yml'

if os.path.exists(self.mcf_path):
try:
# pygeometa.core.read_mcf can parse nested MCF documents,
# where one MCF refers to another
self.mcf = pygeometa.core.read_mcf(self.mcf_path)
self.validate()
except (pygeometa.core.MCFReadError, ValidationError,
AttributeError) as err:
# AttributeError in read_mcf not caught by pygeometa
LOGGER.warning(err)
self.mcf = None
# Despite naming, this does not open a resource that must be closed
of = fsspec.open(self.datasource)
if not of.fs.exists(self.datasource):
raise FileNotFoundError(f'{self.datasource} does not exist')

try:
with fsspec.open(self.mcf_path, 'r') as file:
yaml_string = file.read()

# pygeometa.core.read_mcf can parse nested MCF documents,
# where one MCF refers to another
self.mcf = pygeometa.core.read_mcf(yaml_string)
LOGGER.info(f'loaded existing metadata from {self.mcf_path}')
self.validate()

# Common path: MCF often does not already exist
except FileNotFoundError as err:
LOGGER.debug(err)

# Uncommon path: MCF already exists but cannot be used
except (pygeometa.core.MCFReadError,
ValidationError, AttributeError) as err:
# AttributeError in read_mcf not caught by pygeometa
LOGGER.warning(err)
self.mcf = None

if self.mcf is None:
self.mcf = _get_template(MCF_SCHEMA)
self.mcf['metadata']['identifier'] = str(uuid.uuid4())

# fill all values that can be derived from the dataset
LOGGER.debug(f'getting properties from {source_dataset_path}')
self._set_spatial_info()

else:
Expand Down Expand Up @@ -525,7 +541,7 @@ def _write_mcf(self, target_path):
with open(target_path, 'w') as file:
file.write(yaml.dump(self.mcf, Dumper=_NoAliasDumper))

def write(self):
def write(self, workspace=None):
"""Write MCF and ISO-19139 XML to disk.

This creates sidecar files with '.yml' and '.xml' extensions
Expand All @@ -535,13 +551,30 @@ def write(self):
- 'myraster.tif.yml'
- 'myraster.tif.xml'

Args:
workspace (str): if ``None``, files are written to the same location
as the source data. If not ``None``, a path to a local directory
in which to write the files. They will still be named to match the
source filename. Use this option if the source data is not on the
local filesystem.

"""
if workspace is None:
target_mcf_path = self.mcf_path
target_xml_path = f'{self.datasource}.xml'
else:
target_mcf_path = os.path.join(
workspace, f'{os.path.basename(self.datasource)}.yml')
target_xml_path = os.path.join(
workspace, f'{os.path.basename(self.datasource)}.xml')

self.mcf['metadata']['datestamp'] = datetime.utcnow().strftime(
'%Y-%m-%d')
self._write_mcf(self.mcf_path)
self._write_mcf(target_mcf_path)

schema_obj = load_schema('iso19139')
xml_string = schema_obj.write(self.mcf)
with open(f'{self.datasource}.xml', 'w') as xmlfile:
with open(target_xml_path, 'w') as xmlfile:
xmlfile.write(xml_string)

def validate(self):
Expand All @@ -561,6 +594,7 @@ def _set_spatial_info(self):
self.mcf['metadata']['hierarchylevel'] = 'dataset'

if gis_type == pygeoprocessing.VECTOR_TYPE:
LOGGER.debug('opening as GDAL vector')
self.mcf['content_info']['type'] = 'coverage'
self.mcf['spatial']['datatype'] = 'vector'
open_options = []
Expand Down Expand Up @@ -616,6 +650,7 @@ def _set_spatial_info(self):
gis_info = pygeoprocessing.get_vector_info(self.datasource)

if gis_type == pygeoprocessing.RASTER_TYPE:
LOGGER.debug('opening as GDAL raster')
self.mcf['spatial']['datatype'] = 'grid'
self.mcf['spatial']['geomtype'] = 'surface'
self.mcf['content_info']['type'] = 'image'
Expand Down
25 changes: 25 additions & 0 deletions tests/test_geometamaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ def tearDown(self):
"""Override tearDown function to remove temporary directory."""
shutil.rmtree(self.workspace_dir)

def test_file_does_not_exist(self):
    """MetadataControl: raises exception if given file does not exist."""
    from geometamaker import MetadataControl

    # Build the missing path inside the test's temporary workspace so the
    # outcome cannot depend on what happens to be in the current working
    # directory (a stray ``foo.tif`` there would make this test fail).
    missing_path = os.path.join(self.workspace_dir, 'foo.tif')
    with self.assertRaises(FileNotFoundError):
        _ = MetadataControl(missing_path)

def test_blank_MetadataControl(self):
"""MetadataControl: template has expected properties."""
from geometamaker import MetadataControl
Expand Down Expand Up @@ -685,3 +692,21 @@ def test_invalid_preexisting_mcf(self):
self.fail(
'unexpected write error occurred\n'
f'{e}')

def test_write_to_local_workspace(self):
    """MetadataControl: sidecar files can be written to another directory."""
    from geometamaker import MetadataControl

    datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
    create_raster(numpy.int16, datasource_path)
    metadata = MetadataControl(datasource_path)

    # Write to a directory other than the one holding the source raster.
    target_dir = tempfile.mkdtemp(dir=self.workspace_dir)
    metadata.write(workspace=target_dir)

    # Both sidecars keep the source file's name and gain an extension.
    source_name = os.path.basename(datasource_path)
    for extension in ('yml', 'xml'):
        sidecar_path = os.path.join(target_dir, f'{source_name}.{extension}')
        self.assertTrue(os.path.exists(sidecar_path))
Loading