Update/align with paper #11

Open · wants to merge 124 commits into base: main

Changes from 1 commit

Commits (124)
8e06855
Remove previous mirror scripts
nick-j-roberts Jun 15, 2023
e27244c
Delete removed transfer scripts from readme
nick-j-roberts Jun 15, 2023
f857484
Start scaffolding structure of mirror job
nick-j-roberts Jun 15, 2023
fd7da25
Upgrade pip
nick-j-roberts Jun 16, 2023
653707b
Add pyshacl
nick-j-roberts Jun 16, 2023
a4f8058
Reorganize commonly used classes
nick-j-roberts Jun 16, 2023
8ccd345
Finish url verification and zip download functions
nick-j-roberts Jun 16, 2023
67c3a86
Finish url verificiation, download, and geospatial extent for RFCs
nick-j-roberts Jun 16, 2023
633531d
Delete cloud utils
nick-j-roberts Jun 16, 2023
b5b137b
Moved class and create source dataset skolemizer
nick-j-roberts Jun 16, 2023
94d719a
Temporarily change compose statement
nick-j-roberts Jun 19, 2023
daaceb9
Start on handwritten TTL ontology
nick-j-roberts Jun 19, 2023
bb623fe
Make transposition region and watershed region apply only to transpos…
nick-j-roberts Jun 20, 2023
b38544f
Add AORC namespace pointing to v0.9
nick-j-roberts Jun 20, 2023
f0728a3
Update aorc ontology
nick-j-roberts Jun 20, 2023
9a6025c
Create SHACL shapes for AORC ontology
nick-j-roberts Jun 20, 2023
cc83438
Get git and docker hashes and expose them to docker container
nick-j-roberts Jun 20, 2023
c984c0b
Fix reachable git url check
nick-j-roberts Jun 20, 2023
1ff1293
Add docker push before compose
nick-j-roberts Jun 20, 2023
e84d03a
use load_dotenv
nick-j-roberts Jun 20, 2023
cc4dee4
Add graphdb to docker compose
nick-j-roberts Jun 21, 2023
2c3cde2
Add guide on adding CKAN extension within docker environment
nick-j-roberts Jun 26, 2023
97f467c
Ignore graphdb temp data changes
nick-j-roberts Jul 1, 2023
07d39e1
Get rid of extension guide
nick-j-roberts Jul 1, 2023
b0311cd
Start on integration of CKAN extensions to mirror job
nick-j-roberts Jul 1, 2023
41ebcee
Progressed on mirror upload function
nick-j-roberts Jul 2, 2023
bfc078f
Add docker ignore file
nick-j-roberts Jul 2, 2023
d8d3072
Make compose which uses graphdb a dev version
nick-j-roberts Jul 2, 2023
45bd276
Revise docker compose statement
nick-j-roberts Jul 2, 2023
74deb31
Add env vars
nick-j-roberts Jul 2, 2023
85385a1
Reduce uncessesary copies
nick-j-roberts Jul 2, 2023
2992bbf
Change to safe requests version
nick-j-roberts Jul 2, 2023
a7096d8
Update structure and provenance resources
nick-j-roberts Jul 2, 2023
216d6f7
Update initiation shell script
nick-j-roberts Jul 2, 2023
03b6744
Attempt error fix
nick-j-roberts Jul 2, 2023
f29016c
End if block
nick-j-roberts Jul 2, 2023
460e9df
Include env vars in compose
nick-j-roberts Jul 2, 2023
447f6ff
Get rid of useless push statement
nick-j-roberts Jul 2, 2023
ec9bad5
Copy reqs
nick-j-roberts Jul 2, 2023
501985e
Put in pull statement
nick-j-roberts Jul 2, 2023
2983071
Took volume out of prod compose
nick-j-roberts Jul 2, 2023
a073e3a
Move timedelta translation
nick-j-roberts Jul 2, 2023
6654c77
Fix default arg
nick-j-roberts Jul 2, 2023
8baa31a
Fix namespace for IANA
nick-j-roberts Jul 2, 2023
69e7fd4
Fix return
nick-j-roberts Jul 2, 2023
53d7877
Comment unused graph creator class
nick-j-roberts Jul 2, 2023
a8434d9
Add TODO
nick-j-roberts Jul 2, 2023
ee62d31
Add argument validator to init.sh
nick-j-roberts Jul 2, 2023
bb449e1
Undo mistaken copy
nick-j-roberts Jul 2, 2023
d68f766
Narrow copy
nick-j-roberts Jul 2, 2023
b0a6fa8
Fixed equality comparison
nick-j-roberts Jul 2, 2023
a8abc15
Change to relative imports, fix upload
nick-j-roberts Jul 3, 2023
88e01c4
Simplify geoms to allow for CKAN upload
nick-j-roberts Jul 3, 2023
bb381b4
Undid relative imports
nick-j-roberts Jul 3, 2023
7ae1adc
Added geom simplification to allow CKAN upload
nick-j-roberts Jul 3, 2023
def8126
Replace simplifier with convex hull
nick-j-roberts Jul 3, 2023
f174b90
Change JSON-LD serialization to TTL
nick-j-roberts Jul 3, 2023
121a9e2
Get rid of sys.argv extensions
nick-j-roberts Jul 3, 2023
72e5537
Do modification of upload params in main mirror.py
nick-j-roberts Jul 3, 2023
4458b33
Rename preserved 'id'
nick-j-roberts Jul 3, 2023
f866cc6
Take out left over formatting in upload
nick-j-roberts Jul 3, 2023
e21c777
Bind namespaces
nick-j-roberts Jul 3, 2023
33bf793
Add resource creation to dataset upload
nick-j-roberts Jul 3, 2023
bfcc9e7
Add resources to upload
nick-j-roberts Jul 3, 2023
e8eaa8c
Re-enable upload of mirror
nick-j-roberts Jul 3, 2023
aed4377
Reorganize general utils
nick-j-roberts Jul 3, 2023
26c699b
Started structuring composite refactor
nick-j-roberts Jul 3, 2023
a47206f
Take metadata retrieval
nick-j-roberts Jul 5, 2023
973d4b5
Create pseudocode
nick-j-roberts Jul 5, 2023
d502045
First shot at composite job
nick-j-roberts Jul 6, 2023
ea952b8
Delete done TODO
nick-j-roberts Jul 6, 2023
d359e9a
Fix exit method
nick-j-roberts Jul 6, 2023
84a15f6
Fix temporal property path
nick-j-roberts Jul 6, 2023
f0d385f
Fix source url attribution
nick-j-roberts Jul 6, 2023
f683787
Reenable json writing
nick-j-roberts Jul 6, 2023
e194a6a
Fix import, take dev limit off urls
nick-j-roberts Jul 6, 2023
4b43252
Fix bucket parameter
nick-j-roberts Jul 6, 2023
1555138
Correct dataset id and description assignments
nick-j-roberts Jul 6, 2023
eca0af1
Align ids with hourly data
nick-j-roberts Jul 6, 2023
5482650
Unnest dataset creation
nick-j-roberts Jul 7, 2023
94e55a5
Fix key misname in upload
nick-j-roberts Jul 7, 2023
4c4f0d1
Make URIs lowercase
nick-j-roberts Jul 7, 2023
670c358
Move demo to full
nick-j-roberts Jul 7, 2023
61d8497
Fix json writing
nick-j-roberts Jul 7, 2023
cd7981a
Fix sqlite syntax
nick-j-roberts Jul 7, 2023
d7e2c5c
Add required compress type param
nick-j-roberts Jul 7, 2023
5da0551
Ensure all source mirrors are recorded
nick-j-roberts Jul 7, 2023
d11a2f8
Undo JSON writes for mirror uploads
nick-j-roberts Jul 7, 2023
422fdf3
Get rid of old methods for composite RDF creation
nick-j-roberts Jul 7, 2023
497bf14
Update psuedocode for transposition job
nick-j-roberts Jul 7, 2023
f1dd99e
Reorganize composite imports
nick-j-roberts Jul 7, 2023
b62fd8d
Add ms index and dss URI
nick-j-roberts Jul 7, 2023
373d240
Remove unused import
nick-j-roberts Jul 7, 2023
0792859
First draft of transposition metadata creator
nick-j-roberts Jul 7, 2023
e1572ef
First draft of transposition metadata creator
nick-j-roberts Jul 7, 2023
bca941d
Edit pseudocode
nick-j-roberts Jul 7, 2023
23d80a6
Fix broken import, convert list to dict
nick-j-roberts Jul 7, 2023
c027c14
Fix geojson streaming, convert to convex hull
nick-j-roberts Jul 7, 2023
3167afe
Remove unneeded imports
nick-j-roberts Jul 7, 2023
0519a37
Add meilisearch and dependencies
nick-j-roberts Jul 7, 2023
772c193
Remove irrelevant ckan scripts
nick-j-roberts Jul 7, 2023
009a4b5
Remove outdated DCAT-US extension work
nick-j-roberts Jul 7, 2023
3cf4b9c
Remove unused utils
nick-j-roberts Jul 7, 2023
e297240
Add # to aorc namespace
nick-j-roberts Jul 7, 2023
9a37f31
Fix syntax errors
nick-j-roberts Jul 7, 2023
096189a
Regenerate ontology HTML
nick-j-roberts Jul 7, 2023
fa8bd4e
Get rid of logs and mirrors directories
nick-j-roberts Jul 7, 2023
e83f4a5
Get rid of outdates rdf writing utils
nick-j-roberts Jul 7, 2023
8a76b31
modify rdf2py
nick-j-roberts Jul 7, 2023
bcfbe6e
Delete pseudocode
nick-j-roberts Jul 7, 2023
62ea943
Update readmes
nick-j-roberts Jul 7, 2023
c261941
Move bucket to .env
nick-j-roberts Jul 7, 2023
93c7761
Add .env example
nick-j-roberts Jul 7, 2023
f25440b
Update usage in readme
nick-j-roberts Jul 7, 2023
ef575f6
Remove broken dependency
nick-j-roberts Jul 7, 2023
044e407
Fix filter in sparql
nick-j-roberts Jul 7, 2023
1e9a864
Remove test prefix
nick-j-roberts Jul 7, 2023
516e13e
Get rid of unused class
nick-j-roberts Jul 7, 2023
bd03917
Move ckan AORC plugin to blobfish
nick-j-roberts Jul 11, 2023
254b1be
Add logging to mirror job
nick-j-roberts Jul 11, 2023
b024d5c
Add logging to composite and transposition jobs
nick-j-roberts Jul 11, 2023
2bd811e
Add script summary headers
nick-j-roberts Jul 11, 2023
94e6e90
Finish adding docstring, fix typo
nick-j-roberts Jul 11, 2023
824abbb
Update aorc.ttl
nick-j-roberts Aug 9, 2023
Finish adding docstring, fix typo
nick-j-roberts committed Jul 11, 2023
commit 94e6e9035081a4fe6ac8eb64a796e7b95597c7c0
2 changes: 1 addition & 1 deletion README.md
@@ -1,7 +1,7 @@
# blobfish

## Summary
This repository is a proof of concept to evaluate the usability of resource description framework (RDF) metadata in documenting extensible data pipelines. The data pipeline documented by this repository is the mirroring, transformation, and subsequent use of NOAA AORC gridded precipitation data for stochaistic storm transposition (SST) modeling.
This repository is a proof of concept to evaluate the usability of resource description framework (RDF) metadata in documenting extensible data pipelines. The data pipeline documented by this repository is the mirroring, transformation, and subsequent use of NOAA AORC gridded precipitation data for stochastic storm transposition (SST) modeling.

## Pipeline Description
This pipeline can be broken into 3 stages
2 changes: 1 addition & 1 deletion blobfish/aorc/README.md
@@ -28,4 +28,4 @@ Main script for the mirror creation task
Responsible for verifying the total available data from NOAA, its asynchronous acquisition, and its upload to s3, as well as collecting and uploading relevant metadata not only for the source data but also the mirror datasets created in the process

### transposition_meta.py
Main script for collecting, parsing, and submitting metadata created during stochaistic storm transposition models to CKAN for serialization as RDF
Main script for collecting, parsing, and submitting metadata created during stochastic storm transposition models to CKAN for serialization as RDF
27 changes: 27 additions & 0 deletions blobfish/aorc/classes/composite.py
@@ -41,6 +41,12 @@ def __exit__(self, exception_type, exception_value, traceback) -> Self:
logging.warning(f"Traceback: {traceback}")

def register_netcdfs(self, mirror_dataset: URIRef, nc_paths: list[str]) -> None:
"""Adds mirror dataset uris and netCDF paths to sqlite database for tracking

Args:
mirror_dataset (URIRef): Mirror dataset URI
nc_paths (list[str]): List of netCDF files associated with the mirror dataset
"""
insert_rows = []
for nc_path in nc_paths:
match = re.search(self.datetime_pattern, nc_path)
@@ -55,16 +61,37 @@ def register_netcdfs(self, mirror_dataset: URIRef, nc_paths: list[str]) -> None:
self.cur.executemany("insert into mirror_datasets VALUES(?, ?, ?)", insert_rows)

def get_nc_files(self, timestamp: datetime.datetime) -> list[str]:
"""Retrieve netCDF file names with a given timestamp

Args:
timestamp (datetime.datetime): Timestamp of interest

Returns:
list[str]: List of netCDF files
"""
self.cur.execute("select nc_path from mirror_datasets where t = ?", timestamp.isoformat())
nc_files = [f[0] for f in self.cur.fetchall()]
return nc_files

def get_mirror_datasets(self, timestamp: datetime.datetime) -> list[URIRef]:
"""Retrieve mirror dataset URIs with a given timestamp

Args:
timestamp (datetime.datetime): Timestamp of interest

Returns:
list[URIRef]: List of mirror dataset URIs
"""
self.cur.execute("select DISTINCT uri from mirror_datasets where t = ?", timestamp.isoformat())
mirror_datasets = [URIRef(f[0]) for f in self.cur.fetchall()]
return mirror_datasets

def group_data_by_time(self) -> list[tuple[list[str], list[URIRef], datetime.datetime]]:
"""Gets all netCDF files and mirror URIs grouped by timestamp

Returns:
list[tuple[list[str], list[URIRef], datetime.datetime]]: Tuple containing a list of netCDF files, a list of mirror dataset URIs, and the datetime which is within the temporal coverage of the mirror datasets and netCDF files, in that order
"""
results = []
for nc_path_concat, uri_concat, t in self.cur.execute(
"select GROUP_CONCAT(nc_path) AS nc_path_concat, GROUP_CONCAT(uri) AS uri_concat, t from mirror_datasets group by t"
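For orientation, a minimal sketch of how rows grouped by group_data_by_time could be unpacked downstream, run against an in-memory SQLite table; the (uri, nc_path, t) column layout is assumed from the insert statement shown above and is not taken verbatim from the repository.

```python
import datetime
import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("create table mirror_datasets(uri TEXT, nc_path TEXT, t TEXT)")
cur.executemany(
    "insert into mirror_datasets VALUES(?, ?, ?)",
    [
        ("https://example.org/mirror/abrfc", "AORC_APCP_ABRFC_1979020100.nc4", "1979-02-01T00:00:00"),
        ("https://example.org/mirror/cnrfc", "AORC_APCP_CNRFC_1979020100.nc4", "1979-02-01T00:00:00"),
    ],
)
for nc_path_concat, uri_concat, t in cur.execute(
    "select GROUP_CONCAT(nc_path), GROUP_CONCAT(uri), t from mirror_datasets group by t"
):
    nc_paths = nc_path_concat.split(",")            # concatenated netCDF paths back to a list
    uris = uri_concat.split(",")                    # concatenated mirror URIs back to a list
    timestamp = datetime.datetime.fromisoformat(t)  # shared temporal coverage for the group
    print(nc_paths, uris, timestamp)
```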
8 changes: 8 additions & 0 deletions blobfish/aorc/classes/mirror.py
@@ -28,6 +28,14 @@ class AORCDataURL:
additional_args: dict = None

def s3_key(self, prefix: str = "mirrors/aorc/precip/") -> str:
"""Construct assumed s3 key for mirror dataset

Args:
prefix (str, optional): prefix to attach to zip file. Defaults to "mirrors/aorc/precip/".

Returns:
str: mirror dataset s3 key
"""
if prefix[-1] != "/":
prefix += "/"
url_parts = self.url.split("/")
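A rough usage sketch of the key construction that s3_key appears to perform; the method body is truncated in this hunk, so both the source URL and the assumption that the key reuses the zip file's basename are illustrative only.

```python
def mirror_s3_key(url: str, prefix: str = "mirrors/aorc/precip/") -> str:
    if prefix[-1] != "/":
        prefix += "/"
    # Assumption: the mirror key keeps the final path segment (the zip name) of the source URL
    zip_name = url.split("/")[-1]
    return f"{prefix}{zip_name}"


print(mirror_s3_key("https://example.noaa.gov/AORC_ABRFC_4km/AORC_APCP_4KM_ABRFC.zip"))
# mirrors/aorc/precip/AORC_APCP_4KM_ABRFC.zip
```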
22 changes: 22 additions & 0 deletions blobfish/aorc/classes/transposition.py
@@ -33,23 +33,45 @@ class TranspositionMetadata:
atlas_s3: str = field(init=False)

def __get_s3_base(self) -> tuple[str, list[str], str]:
"""Gets transposition s3 resource divided into the bucket, the parts of the uri between the bucket and basename, and the basename of the key
ex: s3://bucket/1/2/3/key.zip -> ('bucket', [1, 2, 3], 'key.zip')

Returns:
tuple[str, list[str], str]: s3 resource divided into the bucket, the parts of the uri between the bucket and basename, and the basename of the key
"""
geo_bucket, geo_key = extract_bucketname_and_keyname(self.transposition_geo_s3)
key_parts = geo_key.split("/")
return geo_bucket, key_parts[:-1], key_parts[-1]

def __create_dss_s3_path(self) -> str:
"""Creates presumed path to dss s3 resource

Returns:
str: DSS s3 URI
"""
bucket, key_parts, transpo_file_name = self.__get_s3_base()
transpo_folder = transpo_file_name.replace(".geojson", "")
path = os.path.join(bucket, *key_parts, transpo_folder, "72h", "dss", self.start_time.strftime("%Y%m%d.dss"))
return f"s3://{path}"

def __create_atlas_s3_path(self) -> str:
"""Gives the ATLAS14 path used during normalization - this uses a single .vrt file currently so just returns that value

Returns:
str: s3 URI of ATLAS dataset
"""
return "s3://tempest/noaa-atlas-14/2yr03da.vrt"

def __create_max_precip_point(self) -> Point:
"""Creates Point feature from metadata for max precip

Returns:
Point: Point conversion of max precip metadata
"""
return Point(self.max_precip_x, self.max_precip_y)

def __post_init__(self):
"""Creates dss_s3, atlas_s3, and max_precip_point attributes"""
self.max_precip_point = self.__create_max_precip_point()
self.dss_s3 = self.__create_dss_s3_path()
self.atlas_s3 = self.__create_atlas_s3_path()
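To make the path derivation in __get_s3_base and __create_dss_s3_path concrete, a standalone sketch with a hypothetical transposition geojson key; only the derivation steps mirror the code above, while the bucket and key are invented.

```python
import datetime
import os

transposition_geo_s3 = "s3://example-bucket/watersheds/duwamish/duwamish-transpo-area-v01.geojson"
start_time = datetime.datetime(1979, 2, 1)

bucket, _, key = transposition_geo_s3[5:].partition("/")  # split bucket from key
key_parts = key.split("/")
transpo_folder = key_parts[-1].replace(".geojson", "")    # basename without extension
path = os.path.join(bucket, *key_parts[:-1], transpo_folder, "72h", "dss", start_time.strftime("%Y%m%d.dss"))
print(f"s3://{path}")
# s3://example-bucket/watersheds/duwamish/duwamish-transpo-area-v01/72h/dss/19790201.dss
```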
19 changes: 19 additions & 0 deletions blobfish/aorc/composite.py
@@ -10,6 +10,16 @@
def create_composite_dataset_identifiers(
start_date: datetime.datetime, end_date: datetime.datetime, location_name: str
) -> BasicDescriptors:
"""Creates required identifiers for composite dataset

Args:
start_date (datetime.datetime): start date for composite dataset
end_date (datetime.datetime): end date for composite dataset
location_name (str): name of spatial coverage location

Returns:
BasicDescriptors: Descriptors including title, id, name, URL, and description
"""
dataset_id = f"composite_{start_date.strftime('%Y%m%d%H')}".lower()
dataset_name = dataset_id
start_time_formatted = start_date.strftime("%Y-%m-%d %H:%M")
@@ -22,6 +32,15 @@ def create_composite_dataset_identifiers(


def create_composite_s3_path(bucket: str, start_time: datetime.datetime) -> str:
"""Creates s3 URI of zarr for composite dataset

Args:
bucket (str): Target bucket
start_time (datetime.datetime): start time for dataset

Returns:
str: s3 URI
"""
return f"s3://{bucket}/transforms/aorc/precipitation/{start_time.year}/{start_time.strftime('%Y%m%d%H')}.zarr"


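A short sketch of the values the two helpers above would produce for one hourly dataset, using an assumed bucket name:

```python
import datetime

bucket = "example-bucket"
start = datetime.datetime(1979, 2, 1, 0)

dataset_id = f"composite_{start.strftime('%Y%m%d%H')}".lower()
zarr_uri = f"s3://{bucket}/transforms/aorc/precipitation/{start.year}/{start.strftime('%Y%m%d%H')}.zarr"
print(dataset_id)  # composite_1979020100
print(zarr_uri)    # s3://example-bucket/transforms/aorc/precipitation/1979/1979020100.zarr
```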
14 changes: 14 additions & 0 deletions blobfish/aorc/composite_utils/array.py
@@ -4,6 +4,14 @@


def create_composite_dataset(dataset_paths: set[str]) -> xr.Dataset:
"""Merges netCDF files provided into single dataset based on shared time coordinate

Args:
dataset_paths (set[str]): List of netCDF file paths

Returns:
xr.Dataset: Spatially merged data
"""
datasets = []
for dataset_path in dataset_paths:
ds = xr.open_dataset(dataset_path)
@@ -14,4 +22,10 @@ def create_composite_dataset(dataset_paths: set[str]) -> xr.Dataset:


def upload_zarr(zarr_s3_path: str, dataset: xr.Dataset) -> None:
"""Uploads dataset to zarr format

Args:
zarr_s3_path (str): s3 target path for zarr dataset
dataset (xr.Dataset): Dataset to upload
"""
dataset.to_zarr(store=storage.FSStore(zarr_s3_path))
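A hedged usage sketch of the two helpers above: the netCDF paths are hypothetical, the merge-on-shared-time behaviour is inferred from the docstring rather than the truncated body, and writing zarr directly to s3 assumes fsspec/s3fs credentials are configured.

```python
import xarray as xr

nc_paths = {
    "/tmp/aorc/AORC_APCP_ABRFC_1979020100.nc4",
    "/tmp/aorc/AORC_APCP_CNRFC_1979020100.nc4",
}

# Roughly what create_composite_dataset() describes: open each file and merge
# datasets that share a time coordinate but cover different spatial extents
composite = xr.merge([xr.open_dataset(p) for p in nc_paths])

# Roughly what upload_zarr() describes: persist the merged dataset as zarr
composite.to_zarr("s3://example-bucket/transforms/aorc/precipitation/1979/1979020100.zarr")
```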
19 changes: 19 additions & 0 deletions blobfish/aorc/composite_utils/cloud.py
@@ -8,6 +8,16 @@


def stream_s3_zipped(s3_resource, s3_uri: str, extract_directory: str) -> list[str]:
"""Streams zipped s3 resource to provided directory

Args:
s3_resource: s3 service resource to use in streaming
s3_uri (str): s3 uri of zipped resource (ie s3://bucket/key.zip)
extract_directory (str): Directory to which zipped resources will be extracted

Returns:
list[str]: Paths of unzipped data
"""
bucket, key = extract_bucketname_and_keyname(s3_uri)
response = s3_resource.meta.client.get_object(Bucket=bucket, Key=key)
zip_data = response["Body"].read()
@@ -18,6 +28,15 @@ def stream_s3_zipped(s3_resource, s3_uri: str, extract_directory: str) -> list[str]:


def check_zarr_modification(s3_resource, zarr_path: str) -> datetime.datetime:
"""Checks modification date of a .zmetadata resource in a zarr dataset

Args:
s3_resource: s3 service resource to use in streaming
zarr_path (str): s3 uri of zarr dataset (ie s3://bucket/zarr_dataset)

Returns:
datetime.datetime: Last modification datetime of the zarr dataset's .zmetadata resource
"""
bucket, key = extract_bucketname_and_keyname(zarr_path)
key += "/.zmetadata"
obj = s3_resource.Object(bucket, key)
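For orientation, a minimal sketch of the .zmetadata modification check described above, calling boto3 directly with a hypothetical bucket and key:

```python
import boto3

s3_resource = boto3.Session().resource("s3")
zarr_key = "transforms/aorc/precipitation/1979/1979020100.zarr"
obj = s3_resource.Object("example-bucket", zarr_key + "/.zmetadata")
print(obj.last_modified)  # timezone-aware datetime of the last write to the zarr metadata
```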
10 changes: 10 additions & 0 deletions blobfish/aorc/composite_utils/general.py
@@ -33,6 +33,8 @@ def upload_composite_to_ckan(
resources: list[dict],
**kwargs,
) -> int:
"""Uploads composite dataset JSON data to CKAN"""

if ckan_base_url.endswith("/"):
ckan_base_url = ckan_base_url[:-1]
upload_endpoint = f"{ckan_base_url}/api/3/action/package_create"
@@ -73,6 +75,14 @@


def create_composite_wkt(mirror_wkts: list[str]) -> str:
"""Converts list of polygon WKTs into a convex hull of all the provided WKTs merged together

Args:
mirror_wkts (list[str]): List of polygon WKTs

Returns:
str: Convex hull of merged polygons
"""
logging.info("Merging geometries from RFC regions into composite coverage area")
polys = [shapely.wkt.loads(wkt) for wkt in mirror_wkts]
multipoly = MultiPolygon(polys)
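A minimal, runnable illustration of the convex-hull merge that create_composite_wkt describes, with two toy square polygons standing in for real RFC coverage WKTs:

```python
import shapely.wkt
from shapely.geometry import MultiPolygon

mirror_wkts = [
    "POLYGON ((0 0, 2 0, 2 2, 0 2, 0 0))",
    "POLYGON ((2 0, 4 0, 4 2, 2 2, 2 0))",
]
polys = [shapely.wkt.loads(wkt) for wkt in mirror_wkts]
hull = MultiPolygon(polys).convex_hull
print(hull.wkt)  # a single polygon covering both squares, spanning (0 0) to (4 2)
```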
21 changes: 21 additions & 0 deletions blobfish/aorc/composite_utils/rdf.py
@@ -10,6 +10,15 @@


def retrieve_mirror_dataset_metadata(ckan_base_url: str, rfc_count: int) -> Iterator[list[RetrievedMirror]]:
"""Get mirror dataset metadata from CKAN catalog

Args:
ckan_base_url (str): Base CKAN instance URL
rfc_count (int): RFC count

Yields:
Iterator[list[RetrievedMirror]]: Yields mirror dataset metadata
"""
logging.info("Retrieving mirror dataset metadata")
if not ckan_base_url.endswith("/"):
ckan_base_url += "/"
@@ -43,6 +52,18 @@


def verify_date_rfc_count(catalog_graph: Graph, rfc_count: int) -> Iterator[tuple[Literal, Literal]]:
"""Verifies that mirror dataset count in a catalog matches expected count

Args:
catalog_graph (Graph): Graph of mirror dataset catalog
rfc_count (int): Count of RFCs that will be merged together when composite dataset is created

Raises:
ValueError: Error if count doesn't match

Yields:
Iterator[tuple[Literal, Literal]]: Yields tuple of unique start date and end date literals found in catalog associated with mirror datasets
"""
logging.info(f"Verifying match between mirror dataset count and RFC count")
query_string = """
SELECT ?sd ?ed
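A hedged sketch of the kind of grouped SPARQL check verify_date_rfc_count describes, run against a tiny in-memory graph; the dct:temporal / dcat:startDate / dcat:endDate terms are assumptions for illustration, not necessarily the exact vocabulary used in the repository query.

```python
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import DCAT, DCTERMS, XSD

g = Graph()
mirror = URIRef("https://example.org/mirror/abrfc-1979-02")
period = URIRef("https://example.org/mirror/abrfc-1979-02/period")
g.add((mirror, DCTERMS.temporal, period))
g.add((period, DCAT.startDate, Literal("1979-02-01T00:00:00", datatype=XSD.dateTime)))
g.add((period, DCAT.endDate, Literal("1979-03-01T00:00:00", datatype=XSD.dateTime)))

query = """
SELECT ?sd ?ed (COUNT(?ds) AS ?n)
WHERE {
    ?ds dct:temporal ?t .
    ?t dcat:startDate ?sd ;
       dcat:endDate ?ed .
}
GROUP BY ?sd ?ed
"""
for row in g.query(query, initNs={"dct": DCTERMS, "dcat": DCAT}):
    # one row per unique start/end pair; row.n is the mirror dataset count to compare against the RFC count
    print(row.sd, row.ed, row.n)
```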
23 changes: 23 additions & 0 deletions blobfish/aorc/general_utils/ckan.py
@@ -12,6 +12,18 @@ def create_ckan_resource(
description: str,
s3: bool,
) -> dict:
"""Creates the general dict form of a dcat:Distribution instance as expected in CKAN

Args:
download_url (str): Download URL
format (str): URI of data format
compress_format (str | None): URI of compression format, if there is compression
description (str): Description of distribution
s3 (bool): True if distribution is on s3 and has implied access rights constraint

Returns:
dict: Distribution data
"""
args_dict = {}
if s3:
args_dict["access_rights"] = "Access to distribution requires access to parent s3 bucket"
@@ -28,6 +40,17 @@


def query_ckan_catalog(catalog_url: str) -> Graph:
"""Queries provided catalog for RDF data

Args:
catalog_url (str): URL of dataset catalog

Raises:
ValueError: Catalog URL doesn't have expected format

Returns:
Graph: Parsed catalog graph
"""
logging.info(f"Querying CKAN catalog {catalog_url}")
graph = Graph()
if catalog_url.endswith(".ttl"):
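To show the shape of the distribution dict create_ckan_resource describes and the TTL parsing query_ckan_catalog performs, a small sketch; the URLs, format URIs, and key names are illustrative assumptions, not values from the repository.

```python
from rdflib import Graph

# Hypothetical dcat:Distribution payload in the dict form CKAN expects
resource = {
    "url": "s3://example-bucket/mirrors/aorc/precip/AORC_APCP_4KM_ABRFC.zip",
    "format": "https://example.org/formats/netcdf",
    "compress_format": "https://example.org/formats/zip",
    "description": "Zipped mirror of AORC precipitation data for the ABRFC region",
    "access_rights": "Access to distribution requires access to parent s3 bucket",
}

# Parsing a .ttl catalog URL into a graph, as query_ckan_catalog does for turtle catalogs
graph = Graph()
graph.parse("https://ckan.example.org/catalog.ttl", format="turtle")
print(len(graph))  # number of triples parsed from the catalog
```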
21 changes: 21 additions & 0 deletions blobfish/aorc/general_utils/cloud.py
@@ -4,13 +4,34 @@


def create_s3_resource(access_key_id: str, secret_access_key: str, region_name: str):
"""Creates s3 resource

Args:
access_key_id (str): AWS access key ID
secret_access_key (str): AWS secret access key
region_name (str): Default region for AWS resource to use

Returns:
Any: s3 resource
"""
logging.info("Creating s3 resource")
session = boto3.Session(access_key_id, secret_access_key, region_name=region_name)
resource = session.resource("s3")
return resource


def extract_bucketname_and_keyname(s3path: str) -> tuple[str, str]:
"""Extracts bucket and key name from s3 URI

Args:
s3path (str): s3 URI (ex: s3://bucket/key)

Raises:
ValueError: Error if path doesn't have correct format

Returns:
tuple[str, str]: Tuple of bucket and key
"""
if not s3path.startswith("s3://"):
raise ValueError(f"s3path does not start with s3://: {s3path}")
bucket, _, key = s3path[5:].partition("/")
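A quick worked example of the bucket/key split performed by extract_bucketname_and_keyname, with a hypothetical path:

```python
s3path = "s3://example-bucket/mirrors/aorc/precip/AORC_APCP_4KM_ABRFC.zip"
bucket, _, key = s3path[5:].partition("/")
print(bucket)  # example-bucket
print(key)     # mirrors/aorc/precip/AORC_APCP_4KM_ABRFC.zip
```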
22 changes: 22 additions & 0 deletions blobfish/aorc/general_utils/provenance.py
@@ -7,6 +7,11 @@


def retrieve_meta() -> ProvenanceMetadata:
"""Retrieves provenance metadata from environment variables

Returns:
ProvenanceMetadata: Provenance metadata
"""
relative_composite_path = os.environ["COMPOSE_FILE_PATH"]
relative_docker_file_path = os.environ["DOCKER_FILE_PATH"]
git_url = os.environ["GIT_REPO"]
@@ -26,6 +31,18 @@


def create_raw_content_url(relative_path: str, git_url: str) -> str:
"""Creates the publicly accessible format of raw content at github address

Args:
relative_path (str): relative path of resource in git repo
git_url (str): github repo url

Raises:
ValueError: GitHub url is not in correct format

Returns:
str: Raw content URL
"""
if not git_url.startswith("raw."):
git_url = git_url.replace("https://github.com", "https://raw.githubusercontent.com/", 1)
git_url = git_url.replace("/commit", "", 1)
@@ -35,6 +52,11 @@


def get_command_list() -> list[str]:
"""Retrieves command list supplied to python

Returns:
list[str]: Commands, split by space delimiter
"""
command_list = [sys.executable, *sys.argv]
logging.info(f"Command list: {command_list}")
return command_list
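A simplified sketch of the raw-content rewrite in create_raw_content_url, using a hypothetical org/repo name together with the commit hash shown in this PR; the real values come from the GIT_REPO and path environment variables.

```python
git_url = "https://github.com/example-org/blobfish/commit/94e6e9035081a4fe6ac8eb64a796e7b95597c7c0"
relative_path = "Dockerfile"  # hypothetical relative path within the repo

raw_url = git_url.replace("https://github.com", "https://raw.githubusercontent.com", 1)
raw_url = raw_url.replace("/commit", "", 1)
print(f"{raw_url}/{relative_path}")
# https://raw.githubusercontent.com/example-org/blobfish/94e6e9035081a4fe6ac8eb64a796e7b95597c7c0/Dockerfile
```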