Merge pull request #264 from vinisalazar/refactor-url-methods
Refactor url methods
ocefpaf authored Aug 31, 2022
2 parents 0a340dc + 66545d1 commit 29f6e69
Showing 6 changed files with 254 additions and 105 deletions.
217 changes: 214 additions & 3 deletions erddapy/core/url.py
@@ -1,16 +1,20 @@
"""URL handling."""

import copy
import functools
import io
from datetime import datetime
from typing import Dict, Optional, Union
from typing import Dict, List, Optional, Tuple, Union
from typing.io import BinaryIO
from urllib.parse import quote_plus

import httpx
import pytz
from pandas._libs.tslibs.parsing import parse_time_string

ListLike = Union[List[str], Tuple[str]]
OptionalStr = Optional[str]


@functools.lru_cache(maxsize=256)
def _urlopen(url: str, auth: Optional[tuple] = None, **kwargs: Dict) -> BinaryIO:
@@ -59,7 +63,7 @@ def _distinct(url: str, **kwargs: Dict) -> str:
For example, a query for the variables ["stationType", "stationID"] with `distinct=True`
will return a sorted list of "stationIDs" associated with each "stationType".
See https://coastwatch.pfeg.noaa.gov/erddap/tabledap/documentation.html#distinct
See http://erddap.ioos.us/erddap/tabledap/documentation.html#distinct
"""
distinct = kwargs.pop("distinct", False)
@@ -129,7 +133,7 @@ def parse_dates(date_time: Union[datetime, str]) -> float:
return parse_date_time.timestamp()


def _search_url(
def get_search_url(
server: str,
response: str = "html",
search_for: Optional[str] = None,
@@ -138,6 +142,39 @@ def _search_url(
page: int = 1,
**kwargs,
):
"""
Build the search URL for the `server` endpoint provided.
Args:
search_for: "Google-like" search of the datasets' metadata.
- Type the words you want to search for, with spaces between the words.
ERDDAP will search for the words separately, not as a phrase.
- To search for a phrase, put double quotes around the phrase
(for example, `"wind speed"`).
- To exclude datasets with a specific word, use `-excludedWord`.
- To exclude datasets with a specific phrase, use `-"excluded phrase"`.
- Searches are not case-sensitive.
- You can search for any part of a word. For example,
searching for `spee` will find datasets with `speed` and datasets with
`WindSpeed`.
- The last word in a phrase may be a partial word. For example,
to find datasets from a specific website (usually the start of the datasetID),
include (for example) `"datasetID=erd"` in your search.
response: default is HTML.
items_per_page: how many items per page in the return,
default is 1000 for HTML, 1e6 (hopefully all items) for CSV, JSON.
page: which page to display, default is the first page (1).
kwargs: extra search constraints based on metadata and/or coordinates key/value pairs.
metadata: `cdm_data_type`, `institution`, `ioos_category`,
`keywords`, `long_name`, `standard_name`, and `variableName`.
coordinates: `minLon`, `maxLon`, `minLat`, `maxLat`, `minTime`, and `maxTime`.
Returns:
url: the search URL.
"""
server = server.rstrip("/")
base = (
"{server}/search/advanced.{response}"
@@ -191,6 +228,25 @@ def _search_url(
lowercase = kwargs[search_term].lower()
kwargs.update({search_term: lowercase})

# These responses should not be paginated b/c pagination silently truncates the returned data
# and can surprise users when the number of items is greater than ERDDAP's default (1000 items).
# Ideally there should be no pagination for these responses on the ERDDAP side, but for now we settle for a
# "really big" `items_per_page` number.
non_paginated_responses = [
"csv",
"csvp",
"csv0",
"json",
"jsonlCSV1",
"jsonlCSV",
"jsonlKVP",
"tsv",
"tsvp",
"tsv0",
]
if response in non_paginated_responses:
items_per_page = int(1e6)

default = "(ANY)"
url = base.format(
server=server,
@@ -217,3 +273,158 @@
# Removing them entirely should be OK for older versions too.
url = url.replace("&minTime=(ANY)", "").replace("&maxTime=(ANY)", "")
return url
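
A minimal usage sketch for the new public get_search_url helper, assuming the module is importable as erddapy.core.url (the file path shown above); the server URL comes from the docstrings in this file and the search terms are illustrative:

from erddapy.core.url import get_search_url

# Phrase search plus an extra metadata constraint passed through **kwargs.
url = get_search_url(
    server="http://erddap.ioos.us/erddap",
    search_for='"wind speed"',
    response="csv",  # tabular responses get the large items_per_page automatically
    standard_name="wind_speed",  # metadata constraint listed in the docstring
)
print(url)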


def get_info_url(
server: str,
dataset_id: OptionalStr = None,
response: OptionalStr = None,
) -> str:
"""
Build the info URL for the `server` endpoint.
Args:
dataset_id: a dataset unique id.
response: default is HTML.
Returns:
url: the info URL for the `response` chosen.
"""
if not dataset_id:
raise ValueError(f"You must specify a valid dataset_id, got {dataset_id}")

url = f"{server}/info/{dataset_id}/index.{response}"
return url
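
A short sketch of get_info_url under the same import assumption; the dataset id is a placeholder:

from erddapy.core.url import get_info_url

# Builds "{server}/info/{dataset_id}/index.{response}".
url = get_info_url(
    server="http://erddap.ioos.us/erddap",
    dataset_id="example_dataset_id",  # hypothetical id
    response="csv",
)
# -> "http://erddap.ioos.us/erddap/info/example_dataset_id/index.csv"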


def get_categorize_url(
server: str,
categorize_by: str,
value: OptionalStr = None,
response: OptionalStr = None,
) -> str:
"""
Build the categorize URL for the `server` endpoint.
Args:
categorize_by: a valid attribute, e.g.: ioos_category or standard_name.
Valid attributes are shown on the http://erddap.ioos.us/erddap/categorize page.
value: an attribute value.
response: default is HTML.
Returns:
url: the categorized URL for the `response` chosen.
"""
if value:
url = f"{server}/categorize/{categorize_by}/{value}/index.{response}"
else:
url = f"{server}/categorize/{categorize_by}/index.{response}"
return url
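
Likewise for get_categorize_url, with and without a value; the category follows the docstring's example and the value is illustrative:

from erddapy.core.url import get_categorize_url

# All values of the "standard_name" category:
url_all = get_categorize_url(
    server="http://erddap.ioos.us/erddap",
    categorize_by="standard_name",
    response="csv",
)
# -> ".../categorize/standard_name/index.csv"

# Datasets under a single value of that category:
url_one = get_categorize_url(
    server="http://erddap.ioos.us/erddap",
    categorize_by="standard_name",
    value="air_temperature",
    response="csv",
)
# -> ".../categorize/standard_name/air_temperature/index.csv"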


def get_download_url(
server: str,
dataset_id: OptionalStr = None,
protocol: OptionalStr = None,
variables: Optional[ListLike] = None,
dim_names: Optional[ListLike] = None,
response=None,
constraints=None,
**kwargs,
) -> str:
"""
Build the download URL for the `server` endpoint.
Args:
dataset_id: a dataset unique id.
protocol: tabledap or griddap.
variables (list/tuple): a list of the variables to download.
response (str): default is HTML.
constraints (dict): download constraints, default None (opendap-like url)
example: constraints = {'latitude<=': 41.0,
'latitude>=': 38.0,
'longitude<=': -69.0,
'longitude>=': -72.0,
'time<=': '2017-02-10T00:00:00+00:00',
'time>=': '2016-07-10T00:00:00+00:00',}
One can also use relative constraints like {'time>': 'now-7days',
'latitude<': 'min(longitude)+180',
'depth>': 'max(depth)-23',}
Returns:
url (str): the download URL for the `response` chosen.
"""
if not dataset_id:
raise ValueError(f"Please specify a valid `dataset_id`, got {dataset_id}")

if not protocol:
raise ValueError(f"Please specify a valid `protocol`, got {protocol}")

if (
protocol == "griddap"
and constraints is not None
and variables is not None
and dim_names is not None
):
download_url = [
server,
"/",
protocol,
"/",
dataset_id,
".",
response,
"?",
]
for var in variables:
sub_url = [var]
for dim in dim_names:
sub_url.append(
f"[({constraints[dim + '>=']}):"
f"{constraints[dim + '_step']}:"
f"({constraints[dim + '<=']})]",
)
sub_url.append(",")
download_url.append("".join(sub_url))
url = "".join(download_url)[:-1]
return url

# This is an unconstrained OPeNDAP response b/c
# the integer based constrained version is just not worth supporting ;-p
if response == "opendap":
return f"{server}/{protocol}/{dataset_id}"
else:
url = f"{server}/{protocol}/{dataset_id}.{response}?"

if variables:
url += ",".join(variables)

if constraints:
_constraints = copy.copy(constraints)
for k, v in _constraints.items():
if _check_substrings(v):
continue
# The valid operators are
# =, != (not equals), =~ (a regular expression test), <, <=, >, and >=
valid_time_constraints = (
"time=",
"time!=",
"time=~",
"time<",
"time<=",
"time>",
"time>=",
)
if k.startswith(valid_time_constraints):
_constraints.update({k: parse_dates(v)})
_constraints = _quote_string_constraints(_constraints)
_constraints_url = _format_constraints_url(_constraints)

url += f"{_constraints_url}"

url = _distinct(url, **kwargs)
return url
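
Finally, a tabledap sketch for get_download_url that reuses the constraint example from the docstring; the dataset id and variable names are placeholders, not real datasets:

from erddapy.core.url import get_download_url

constraints = {
    "time>=": "2016-07-10T00:00:00+00:00",
    "time<=": "2017-02-10T00:00:00+00:00",
    "latitude>=": 38.0,
    "latitude<=": 41.0,
    "longitude>=": -72.0,
    "longitude<=": -69.0,
}
url = get_download_url(
    server="http://erddap.ioos.us/erddap",
    dataset_id="example_dataset_id",  # hypothetical id
    protocol="tabledap",
    variables=["time", "latitude", "longitude", "sea_water_temperature"],  # illustrative
    response="csv",
    constraints=constraints,
)
# Time constraints are converted by parse_dates and the final URL is passed
# through _distinct before being returned.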