Merge pull request #264 from vinisalazar/refactor-url-methods
Refactor url methods
ocefpaf authored Aug 31, 2022
2 parents 0a340dc + 66545d1 commit 29f6e69
Showing 6 changed files with 254 additions and 105 deletions.
217 changes: 214 additions & 3 deletions erddapy/core/url.py
@@ -1,16 +1,20 @@
"""URL handling."""

import copy
import functools
import io
from datetime import datetime
from typing import Dict, Optional, Union
from typing import Dict, List, Optional, Tuple, Union
from typing.io import BinaryIO
from urllib.parse import quote_plus

import httpx
import pytz
from pandas._libs.tslibs.parsing import parse_time_string

ListLike = Union[List[str], Tuple[str]]
OptionalStr = Optional[str]


@functools.lru_cache(maxsize=256)
def _urlopen(url: str, auth: Optional[tuple] = None, **kwargs: Dict) -> BinaryIO:
@@ -59,7 +63,7 @@ def _distinct(url: str, **kwargs: Dict) -> str:
For example, a query for the variables ["stationType", "stationID"] with `distinct=True`
will return a sorted list of "stationIDs" associated with each "stationType".
See https://coastwatch.pfeg.noaa.gov/erddap/tabledap/documentation.html#distinct
See http://erddap.ioos.us/erddap/tabledap/documentation.html#distinct
"""
distinct = kwargs.pop("distinct", False)
@@ -129,7 +133,7 @@ def parse_dates(date_time: Union[datetime, str]) -> float:
return parse_date_time.timestamp()


def _search_url(
def get_search_url(
server: str,
response: str = "html",
search_for: Optional[str] = None,
@@ -138,6 +142,39 @@ def _search_url(
page: int = 1,
**kwargs,
):
"""
Build the search URL for the `server` endpoint provided.
Args:
search_for: "Google-like" search of the datasets' metadata.
- Type the words you want to search for, with spaces between the words.
ERDDAP will search for the words separately, not as a phrase.
- To search for a phrase, put double quotes around the phrase
(for example, `"wind speed"`).
- To exclude datasets with a specific word, use `-excludedWord`.
- To exclude datasets with a specific phrase, use `-"excluded phrase"`.
- Searches are not case-sensitive.
- You can search for any part of a word. For example,
searching for `spee` will find datasets with `speed` and datasets with
`WindSpeed`.
- The last word in a phrase may be a partial word. For example,
to find datasets from a specific website (usually the start of the datasetID),
include (for example) `"datasetID=erd"` in your search.
response: default is HTML.
items_per_page: how many items per page in the return,
default is 1000 for HTML, 1e6 (hopefully all items) for CSV, JSON.
page: which page to display, default is the first page (1).
kwargs: extra search constraints based on metadata and/or coordinates key/value pairs.
metadata: `cdm_data_type`, `institution`, `ioos_category`,
`keywords`, `long_name`, `standard_name`, and `variableName`.
coordinates: `minLon`, `maxLon`, `minLat`, `maxLat`, `minTime`, and `maxTime`.
Returns:
url: the search URL.
"""
server = server.rstrip("/")
base = (
"{server}/search/advanced.{response}"
@@ -191,6 +228,25 @@ def _search_url(
lowercase = kwargs[search_term].lower()
kwargs.update({search_term: lowercase})

# These responses should not be paginated b/c pagination silently truncates the returned data
# and can surprise users when the number of items is greater than ERDDAP's default (1000 items).
# Ideally there should be no pagination for these responses on the ERDDAP side, but for now we settle for a
# "really big" `items_per_page` number.
non_paginated_responses = [
"csv",
"csvp",
"csv0",
"json",
"jsonlCSV1",
"jsonlCSV",
"jsonlKVP",
"tsv",
"tsvp",
"tsv0",
]
if response in non_paginated_responses:
items_per_page = int(1e6)

default = "(ANY)"
url = base.format(
server=server,
@@ -217,3 +273,158 @@
# Removing them entirely should be OK for older versions too.
url = url.replace("&minTime=(ANY)", "").replace("&maxTime=(ANY)", "")
return url
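
A minimal usage sketch for the new public get_search_url helper, assuming the module is importable as erddapy.core.url (the file path shown above); the server URL comes from the docstrings in this file and the search terms are illustrative:

from erddapy.core.url import get_search_url

# Phrase search plus an extra metadata constraint passed through **kwargs.
url = get_search_url(
    server="http://erddap.ioos.us/erddap",
    search_for='"wind speed"',
    response="csv",  # tabular responses get the large items_per_page automatically
    standard_name="wind_speed",  # metadata constraint listed in the docstring
)
print(url)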


def get_info_url(
server: str,
dataset_id: OptionalStr = None,
response: OptionalStr = None,
) -> str:
"""
Build the info URL for the `server` endpoint.
Args:
dataset_id: a dataset unique id.
response: default is HTML.
Returns:
url: the info URL for the `response` chosen.
"""
if not dataset_id:
raise ValueError(f"You must specify a valid dataset_id, got {dataset_id}")

url = f"{server}/info/{dataset_id}/index.{response}"
return url
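
A short sketch of get_info_url under the same import assumption; the dataset id is a placeholder:

from erddapy.core.url import get_info_url

# Builds "{server}/info/{dataset_id}/index.{response}".
url = get_info_url(
    server="http://erddap.ioos.us/erddap",
    dataset_id="example_dataset_id",  # hypothetical id
    response="csv",
)
# -> "http://erddap.ioos.us/erddap/info/example_dataset_id/index.csv"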


def get_categorize_url(
server: str,
categorize_by: str,
value: OptionalStr = None,
response: OptionalStr = None,
) -> str:
"""
Build the categorize URL for the `server` endpoint.
Args:
categorize_by: a valid attribute, e.g.: ioos_category or standard_name.
Valid attributes are shown on the http://erddap.ioos.us/erddap/categorize page.
value: an attribute value.
response: default is HTML.
Returns:
url: the categorized URL for the `response` chosen.
"""
if value:
url = f"{server}/categorize/{categorize_by}/{value}/index.{response}"
else:
url = f"{server}/categorize/{categorize_by}/index.{response}"
return url
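
Likewise for get_categorize_url, with and without a value; the category follows the docstring's example and the value is illustrative:

from erddapy.core.url import get_categorize_url

# All values of the "standard_name" category:
url_all = get_categorize_url(
    server="http://erddap.ioos.us/erddap",
    categorize_by="standard_name",
    response="csv",
)
# -> ".../categorize/standard_name/index.csv"

# Datasets under a single value of that category:
url_one = get_categorize_url(
    server="http://erddap.ioos.us/erddap",
    categorize_by="standard_name",
    value="air_temperature",
    response="csv",
)
# -> ".../categorize/standard_name/air_temperature/index.csv"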


def get_download_url(
server: str,
dataset_id: OptionalStr = None,
protocol: OptionalStr = None,
variables: Optional[ListLike] = None,
dim_names: Optional[ListLike] = None,
response=None,
constraints=None,
**kwargs,
) -> str:
"""
Build the download URL for the `server` endpoint.
Args:
dataset_id: a dataset unique id.
protocol: tabledap or griddap.
variables (list/tuple): a list of the variables to download.
response (str): default is HTML.
constraints (dict): download constraints, default None (opendap-like url)
example: constraints = {'latitude<=': 41.0,
'latitude>=': 38.0,
'longitude<=': -69.0,
'longitude>=': -72.0,
'time<=': '2017-02-10T00:00:00+00:00',
'time>=': '2016-07-10T00:00:00+00:00',}
One can also use relative constraints like {'time>': 'now-7days',
'latitude<': 'min(longitude)+180',
'depth>': 'max(depth)-23',}
Returns:
url (str): the download URL for the `response` chosen.
"""
if not dataset_id:
raise ValueError(f"Please specify a valid `dataset_id`, got {dataset_id}")

if not protocol:
raise ValueError(f"Please specify a valid `protocol`, got {protocol}")

if (
protocol == "griddap"
and constraints is not None
and variables is not None
and dim_names is not None
):
download_url = [
server,
"/",
protocol,
"/",
dataset_id,
".",
response,
"?",
]
for var in variables:
sub_url = [var]
for dim in dim_names:
sub_url.append(
f"[({constraints[dim + '>=']}):"
f"{constraints[dim + '_step']}:"
f"({constraints[dim + '<=']})]",
)
sub_url.append(",")
download_url.append("".join(sub_url))
url = "".join(download_url)[:-1]
return url

# This is an unconstrained OPeNDAP response b/c
# the integer based constrained version is just not worth supporting ;-p
if response == "opendap":
return f"{server}/{protocol}/{dataset_id}"
else:
url = f"{server}/{protocol}/{dataset_id}.{response}?"

if variables:
url += ",".join(variables)

if constraints:
_constraints = copy.copy(constraints)
for k, v in _constraints.items():
if _check_substrings(v):
continue
# The valid operators are
# =, != (not equals), =~ (a regular expression test), <, <=, >, and >=
valid_time_constraints = (
"time=",
"time!=",
"time=~",
"time<",
"time<=",
"time>",
"time>=",
)
if k.startswith(valid_time_constraints):
_constraints.update({k: parse_dates(v)})
_constraints = _quote_string_constraints(_constraints)
_constraints_url = _format_constraints_url(_constraints)

url += f"{_constraints_url}"

url = _distinct(url, **kwargs)
return url
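
Finally, a tabledap sketch for get_download_url that reuses the constraint example from the docstring; the dataset id and variable names are placeholders, not real datasets:

from erddapy.core.url import get_download_url

constraints = {
    "time>=": "2016-07-10T00:00:00+00:00",
    "time<=": "2017-02-10T00:00:00+00:00",
    "latitude>=": 38.0,
    "latitude<=": 41.0,
    "longitude>=": -72.0,
    "longitude<=": -69.0,
}
url = get_download_url(
    server="http://erddap.ioos.us/erddap",
    dataset_id="example_dataset_id",  # hypothetical id
    protocol="tabledap",
    variables=["time", "latitude", "longitude", "sea_water_temperature"],  # illustrative
    response="csv",
    constraints=constraints,
)
# Time constraints are converted by parse_dates and the final URL is passed
# through _distinct before being returned.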