download: work around timeouts + fix for nginx index listings
nim65s committed Dec 14, 2023
1 parent 223ee4b commit 2c56263
Showing 1 changed file with 44 additions and 22 deletions.
66 changes: 44 additions & 22 deletions happypose/toolbox/utils/download.py
@@ -3,6 +3,7 @@
 import asyncio
 import logging
 import os
+import random
 import re
 import zipfile
 from pathlib import Path
@@ -20,6 +21,8 @@
 logger = get_logger(__name__)
 
 MIRRORS = {
+    "wired": "https://aws-w.saurel.me/",
+    "eduroam": "https://aws.saurel.me/",
     "inria": "https://www.paris.inria.fr/archive_ylabbeprojectsdata/",
     "laas": "https://gepettoweb.laas.fr/data/happypose/",
     "bop": "https://bop.felk.cvut.cz/media/data/bop_datasets/",
@@ -349,7 +352,11 @@ async def adownload(self, download_path, local_path, flags=None):
 
         for mirror in self.mirrors:
             dl = mirror + download_path
-            head = await self.client.head(dl)
+            try:
+                await asyncio.sleep(random.randint(1, 5))
+                head = await self.client.head(dl)
+            except (httpx.PoolTimeout, httpx.ReadTimeout, httpx.ConnectTimeout):
+                continue
             if head.is_success or head.is_redirect:
                 download_path = dl
                 break
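Two things changed in the mirror probe: a random 1-5 s sleep before each HEAD request spreads concurrent coroutines out in time so they stop exhausting the shared connection pool (the timeout workaround from the commit title), and a mirror that still times out is now skipped instead of letting the exception propagate. A self-contained sketch of the same pattern, assuming a plain list of candidate URLs:

import asyncio
import random

import httpx

async def first_reachable(urls):
    """Return the first URL whose HEAD probe succeeds, skipping timeouts."""
    async with httpx.AsyncClient() as client:
        for url in urls:
            await asyncio.sleep(random.randint(1, 5))  # jitter between probes
            try:
                head = await client.head(url)
            except (httpx.PoolTimeout, httpx.ReadTimeout, httpx.ConnectTimeout):
                continue  # this mirror is unreachable right now; try the next
            if head.is_success or head.is_redirect:
                return url
    return None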
@@ -368,9 +375,10 @@ async def adownload(self, download_path, local_path, flags=None):
 
     async def download_dir(self, download_path, local_path, flags):
         try:
+            await asyncio.sleep(random.randint(1, 5))
             r = await self.client.get(download_path)
-        except (httpx.PoolTimeout, httpx.ReadTimeout):
-            logger.error(f"Failed {download_path} with timeout")
+        except (httpx.PoolTimeout, httpx.ReadTimeout, httpx.ConnectTimeout):
+            logger.error(f"Failed {download_path} with GET timeout")
             return
         if r.status_code != 200:
             logger.error(f"Failed {download_path} with code {r.status_code}")
@@ -379,8 +387,10 @@ async def download_dir(self, download_path, local_path, flags):
         soup = BeautifulSoup(r.content, "html.parser")
         logger.info(f"Copying {download_path} to {local_path}")
 
-        for link in soup.find_all("a")[5:]:
+        for link in soup.find_all("a"):
             href: str = link.get("href")
+            if any(href.startswith(wrong) for wrong in ["?", ".."]):
+                continue
             if not flags.flags_managing(href):
                 continue
             if href.endswith("/"):
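The old code sliced off the first five anchors, which matched Apache's index header links but not nginx's leaner listings. Directory indexes typically begin with a parent-directory link ("../") and, on Apache-style pages, sort links whose hrefs start with "?", so filtering by prefix handles both layouts. A rough sketch of the filter against a made-up listing fragment:

from bs4 import BeautifulSoup

# Made-up directory-index fragment, for illustration only.
html = (
    '<a href="../">../</a>'
    '<a href="?C=N;O=D">Name</a>'
    '<a href="models/">models/</a>'
    '<a href="models.zip">models.zip</a>'
)
soup = BeautifulSoup(html, "html.parser")
hrefs = [a.get("href") for a in soup.find_all("a")]
kept = [h for h in hrefs if not any(h.startswith(w) for w in ["?", ".."])]
print(kept)  # ['models/', 'models.zip']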
@@ -397,7 +407,12 @@ async def download_file(self, download_path, local_path):
         if local_path.exists():
             # logger.info(f"Existing {download_path=}")
             local_size = local_path.stat().st_size
-            head = await self.client.head(download_path)
+            try:
+                await asyncio.sleep(random.randint(1, 5))
+                head = await self.client.head(download_path)
+            except (httpx.PoolTimeout, httpx.ReadTimeout, httpx.ConnectTimeout):
+                logger.error(f"Failed {download_path} with HEAD timeout")
+                return
             if "content-length" in head.headers:
                 if local_size == int(head.headers["content-length"]):
                     logger.info(f"Skipping {download_path} already fully downloaded")
@@ -407,24 +422,31 @@ async def download_file(self, download_path, local_path):
         logger.info(f"Copying {download_path} to {local_path}")
         local_path.parent.mkdir(parents=True, exist_ok=True)
         with local_path.open("wb") as f:
-            async with self.client.stream("GET", download_path) as r:
-                total = None
-                if "Content-Length" in r.headers:
-                    total = int(r.headers["Content-Length"])
-                with tqdm(
-                    desc=local_path.name,
-                    total=total,
-                    unit_scale=True,
-                    unit_divisor=1024,
-                    unit="B",
-                ) as progress:
-                    num_bytes_downloaded = r.num_bytes_downloaded
-                    if r.status_code != 200:
-                        logger.error(f"Failed {download_path} with code {r.status_code}")
-                    async for chunk in r.aiter_bytes():
-                        f.write(chunk)
-                        progress.update(r.num_bytes_downloaded - num_bytes_downloaded)
+            try:
+                await asyncio.sleep(random.randint(5, 20))
+                async with self.client.stream("GET", download_path) as r:
+                    total = None
+                    if "Content-Length" in r.headers:
+                        total = int(r.headers["Content-Length"])
+                    with tqdm(
+                        desc=local_path.name,
+                        total=total,
+                        unit_scale=True,
+                        unit_divisor=1024,
+                        unit="B",
+                    ) as progress:
+                        num_bytes_downloaded = r.num_bytes_downloaded
+                        async for chunk in r.aiter_bytes():
+                            f.write(chunk)
+                            progress.update(r.num_bytes_downloaded - num_bytes_downloaded)
+                            num_bytes_downloaded = r.num_bytes_downloaded
+                        if r.status_code != 200:
+                            logger.error(f"Failed {download_path} with code {r.status_code}")
+                            return
+            except (httpx.PoolTimeout, httpx.ReadTimeout, httpx.ConnectTimeout):
+                logger.error(f"Failed {download_path} with stream timeout")
+                return
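The streaming GET gets the same treatment as the HEAD probes, with a longer 5-20 s jitter since a full download holds a pool connection much longer, and any timeout is logged instead of raised. The body is written chunk by chunk while tqdm is fed from httpx's own byte counter; the commit also fixes the progress arithmetic by refreshing the counter inside the loop. A standalone sketch of that streaming pattern, assuming a URL and a destination path:

import asyncio

import httpx
from tqdm import tqdm

async def stream_to_file(url, dest):
    """Stream url into dest with a byte-accurate progress bar."""
    async with httpx.AsyncClient() as client, client.stream("GET", url) as r:
        total = int(r.headers["Content-Length"]) if "Content-Length" in r.headers else None
        with open(dest, "wb") as f, tqdm(
            total=total, unit="B", unit_scale=True, unit_divisor=1024
        ) as bar:
            seen = r.num_bytes_downloaded
            async for chunk in r.aiter_bytes():
                f.write(chunk)
                bar.update(r.num_bytes_downloaded - seen)  # delta since last chunk
                seen = r.num_bytes_downloaded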


