Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

io.utils.download_file update #270

Merged
merged 8 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions doc/source/notebooks/01. IO/EarthRanger_IO.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,11 @@
"metadata": {},
"outputs": [],
"source": [
"patrol_df = er_io.get_patrols()\n",
"patrol_df = er_io.get_patrols(\n",
" since=pd.Timestamp(\"2017-01-01\").isoformat(),\n",
" until=pd.Timestamp(\"2017-04-01\").isoformat(),\n",
")\n",
"\n",
"\n",
"relocs = er_io.get_patrol_observations(\n",
" patrol_df,\n",
Expand Down Expand Up @@ -709,8 +713,8 @@
")\n",
"\n",
"if not elephants.empty:\n",
" for i, value in elephants.iterrows():\n",
" er_io.delete_observation(observation_id=elephants.loc[i, \"extra__id\"])"
" for observation_id in elephants[\"extra__id\"].unique():\n",
" er_io.delete_observation(observation_id)"
]
},
{
Expand Down Expand Up @@ -863,11 +867,7 @@
"metadata": {},
"outputs": [],
"source": [
"relocs.drop(\n",
" columns=relocs.columns[relocs.applymap(lambda x: isinstance(x, list)).any()],\n",
" errors=\"ignore\",\n",
" inplace=True,\n",
")\n",
"relocs = relocs.select_dtypes(exclude=[list])\n",
"\n",
"relocs.to_file(os.path.join(output_dir, \"observations.gpkg\"), layer=\"observations\")"
]
Expand Down
20 changes: 1 addition & 19 deletions doc/source/notebooks/01. IO/GEE_IO.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
"source": [
"import os\n",
"import sys\n",
"import zipfile\n",
"\n",
"import shapely\n",
"\n",
Expand Down Expand Up @@ -208,26 +207,9 @@
"ecoscope.io.utils.download_file(\n",
" url=img.getDownloadUrl(download_config),\n",
" path=img_zip_file,\n",
" unzip=True,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Unzip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with zipfile.ZipFile(img_zip_file) as z:\n",
" for name in z.namelist():\n",
" z.extract(name, output_dir)"
]
}
],
"metadata": {
Expand Down
18 changes: 1 addition & 17 deletions doc/source/notebooks/01. IO/Landscape Dynamics Data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
"source": [
"import os\n",
"import sys\n",
"import zipfile\n",
"\n",
"import geopandas as gpd\n",
"\n",
Expand Down Expand Up @@ -90,25 +89,10 @@
" url=\"https://maraelephant.maps.arcgis.com/sharing/rest/content/items/162e299f0c7d472b8e36211e946bb273/data\",\n",
" path=output_dir,\n",
" overwrite_existing=False,\n",
" unzip=True,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extract ZIP"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"zipfile.ZipFile(os.path.join(output_dir, \"active_public_uncategorized_shpfiles.zip\")).extractall(path=output_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@
"metadata": {},
"outputs": [],
"source": [
"relocs[[\"groupby_col\", \"fixtime\", \"geometry\"]].explore()"
"relocs[[\"groupby_col\", \"geometry\"]].explore()"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@
"metadata": {},
"outputs": [],
"source": [
"traj.explore()"
"traj[\"geometry\"].explore()"
]
},
{
Expand Down Expand Up @@ -546,7 +546,7 @@
" individual=\"1d22ff96-44d4-45c4-adc3-db1513acbe7d\",\n",
" interpolation=\"dofjojfs\",\n",
" )\n",
"except NotImplemented as e:\n",
"except NotImplementedError as e:\n",
" print(e)"
]
},
Expand Down
1 change: 0 additions & 1 deletion doc/source/notebooks/04. EcoMap & EcoPlot/EcoPlot.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@
"ecoscope.io.download_file(\n",
" f\"{ECOSCOPE_RAW}/tests/sample_data/vector/er_relocs.csv.zip\",\n",
" os.path.join(output_dir, \"er_relocs.csv.zip\"),\n",
" unzip=False,\n",
")\n",
"\n",
"data = pd.read_csv(os.path.join(output_dir, \"er_relocs.csv.zip\"), header=0, index_col=0)\n",
Expand Down
2 changes: 1 addition & 1 deletion ecoscope/analysis/ecograph.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def compute(df):
G = self._get_ecograph(df, subject_name, radius, cutoff, tortuosity_length)
self.graphs[subject_name] = G

self.trajectory.groupby("groupby_col")[self.trajectory.columns].progress_apply(compute)
self.trajectory.groupby("groupby_col")[self.trajectory.columns].apply(compute)

def to_csv(self, output_path):
"""
Expand Down
2 changes: 1 addition & 1 deletion ecoscope/io/earthranger.py
Original file line number Diff line number Diff line change
Expand Up @@ -1088,7 +1088,7 @@ def upload(obs):
else:
return pd.DataFrame(results)

return observations.groupby(source_id_col, group_keys=False).progress_apply(upload)
return observations.groupby(source_id_col, group_keys=False).apply(upload)

def post_event(
self,
Expand Down
15 changes: 11 additions & 4 deletions ecoscope/io/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from tqdm.auto import tqdm
from urllib3.util import Retry


def to_hex(val, default="#ff0000"):
Expand All @@ -27,22 +29,26 @@ def pack_columns(dataframe: pd.DataFrame, columns: typing.List):
return dataframe


def download_file(url, path, overwrite_existing=False, chunk_size=1024, unzip=True, **request_kwargs):
def download_file(url, path, retries=2, overwrite_existing=False, chunk_size=1024, unzip=False, **request_kwargs):
"""
Download a file from a URL to a local path. If the path is a directory, the filename will be inferred from
the response header
"""

s = requests.Session()
retries = Retry(total=retries, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))

if __is_gdrive_url(url):
url = __transform_gdrive_url(url)
elif __is_dropbox_url(url):
url = __transform_dropbox_url(url)

r = requests.get(url, stream=True, **request_kwargs)
r = s.get(url, stream=True, **request_kwargs)

if os.path.isdir(path):
m = email.message.Message()
m["content-type"] = r.headers["content-disposition"]
m["content-type"] = r.headers.get("content-disposition")
filename = m.get_param("filename")
if filename is None:
raise ValueError("URL has no RFC 6266 filename.")
Expand All @@ -53,7 +59,8 @@ def download_file(url, path, overwrite_existing=False, chunk_size=1024, unzip=Tr
return

with open(path, "wb") as f:
with tqdm.wrapattr(f, "write", total=int(r.headers["Content-Length"])) as fout:
content_length = r.headers.get("content-length")
with tqdm.wrapattr(f, "write", total=int(content_length)) if content_length else f as fout:
for chunk in r.iter_content(chunk_size=chunk_size):
fout.write(chunk)

Expand Down
13 changes: 1 addition & 12 deletions nb-tests/test_notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,7 @@

NB_DIR = pathlib.Path(__file__).parent.parent / "doc" / "source" / "notebooks"

KNOWN_ERRORS_REGEXES = { # This is basically a GitHub ticket queue
"EarthRanger_IO.ipynb": "Series found",
"Relocations_and_Trajectories.ipynb": "No module named 'branca'",
"EcoGraph.ipynb": "not a zip file",
"EcoPlot.ipynb": "not a zip file",
"Landscape Grid.ipynb": "No module named 'branca'",
"Seasonal Calculation.ipynb": "No module named 'branca'",
"Tracking Data Gantt Chart.ipynb": "Bad CRC-32 for file 'er_relocs.csv.zip'",
"Remote Sensing Time Series Anomaly.ipynb": "No module named 'branca'",
"Reduce Regions.ipynb": "No module named 'branca'",
"Landscape Dynamics Data.ipynb": "No module named 'branca'",
}
KNOWN_ERRORS_REGEXES = {} # This is basically a GitHub ticket queue


@dataclass
Expand Down
3 changes: 2 additions & 1 deletion requirements-notebooks-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ pytest
papermill
.[all]
ipykernel
pytest-xdist
pytest-xdist
folium
11 changes: 9 additions & 2 deletions tests/test_asyncearthranger_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,10 @@ async def test_get_patrols(er_io_async, get_patrols_fields):

@pytest.mark.asyncio
async def test_get_patrol_observations(er_io_async, get_patrol_observations_fields):
observations = await er_io_async.get_patrol_observations_with_patrol_filter()
observations = await er_io_async.get_patrol_observations_with_patrol_filter(
since=pd.Timestamp("2017-01-01").isoformat(),
until=pd.Timestamp("2017-04-01").isoformat(),
)
assert not observations.empty
assert set(observations.columns) == set(get_patrol_observations_fields)
assert type(observations["fixtime"] == pd.Timestamp)
Expand All @@ -228,7 +231,11 @@ async def test_get_patrol_observations(er_io_async, get_patrol_observations_fiel
async def test_get_patrol_observations_with_patrol_details(
er_io_async, get_patrol_observations_fields, get_patrol_details_fields
):
observations = await er_io_async.get_patrol_observations_with_patrol_filter(include_patrol_details=True)
observations = await er_io_async.get_patrol_observations_with_patrol_filter(
since=pd.Timestamp("2017-01-01").isoformat(),
until=pd.Timestamp("2017-04-01").isoformat(),
include_patrol_details=True,
)
assert not observations.empty
assert set(observations.columns) == set(get_patrol_observations_fields).union(get_patrol_details_fields)
assert type(observations["fixtime"] == pd.Timestamp)
Expand Down
11 changes: 9 additions & 2 deletions tests/test_earthranger_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,10 @@ def test_get_patrols(er_io):


def test_get_patrol_events(er_io):
events = er_io.get_patrol_events()
events = er_io.get_patrol_events(
since=pd.Timestamp("2017-01-01").isoformat(),
until=pd.Timestamp("2017-04-01").isoformat(),
)
assert "id" in events
assert "event_type" in events
assert "geometry" in events
Expand Down Expand Up @@ -196,7 +199,11 @@ def test_patch_event(er_io):


def test_get_patrol_observations(er_io):
patrols = er_io.get_patrols()
patrols = er_io.get_patrols(
since=pd.Timestamp("2017-01-01").isoformat(),
until=pd.Timestamp("2017-04-01").isoformat(),
)

observations = er_io.get_patrol_observations(
patrols,
include_source_details=False,
Expand Down
25 changes: 25 additions & 0 deletions tests/test_io_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import json
import os
import pytest

import fsspec
import pandas as pd
from unittest.mock import Mock, patch
from http.client import HTTPMessage
from requests.exceptions import RetryError

import ecoscope

Expand Down Expand Up @@ -80,3 +84,24 @@ def test_download_file_dropbox_share_link():

data = pd.read_csv(os.path.join(output_dir, "download_data.csv"))
assert len(data) > 0


@patch("urllib3.connectionpool.HTTPConnectionPool._get_conn")
def test_download_file_retry_on_error(mock):
mock.return_value.getresponse.side_effect = [
Mock(status=500, msg=HTTPMessage(), headers={}),
Mock(status=504, msg=HTTPMessage(), headers={}),
Mock(status=503, msg=HTTPMessage(), headers={}),
]

url = "https://totallyreal.com"
output_dir = "tests/test_output"

with pytest.raises(RetryError):
ecoscope.io.download_file(
url,
output_dir,
overwrite_existing=True,
)

assert mock.call_count == 3
Loading