Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add torrent downloading to CLI #28

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions zod/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from zod.cli.download import app as download_app
from zod.cli.extract_tsr_patches import cli_dummy as tsr_dummy
from zod.cli.generate_coco_json import convert_to_coco
from zod.cli.torrent_download import app as torrent_download_app
from zod.cli.verify import app as verify_app
from zod.cli.visualize_lidar import app as visualize_lidar_app

Expand All @@ -29,6 +30,7 @@ def add_child(parent: typer.Typer, child: typer.Typer, name: str):
app = typer.Typer(help="Zenseact Open Dataset CLI.", no_args_is_help=True)

add_child(app, download_app, "download")
add_child(app, torrent_download_app, "torrent_download")
add_child(app, verify_app, "verify")
add_child(app, visualize_app, "visualize")
add_child(app, convert_app, "generate")
Expand Down
347 changes: 347 additions & 0 deletions zod/cli/torrent_download.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have some hardcoded things like torrent paths and file sizes here, that could potentially become outdated in the future. How about we just print a generic message if something goes wrong, along the lines of:
'this and this went wrong. please make sure to update to the latest cli version. if this doesn't fix it, please submit a github issue. ciao bye'

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this where it applies.

Original file line number Diff line number Diff line change
@@ -0,0 +1,347 @@
"""Helper script to download the dataset using academic torrents."""

import os
import shutil
import subprocess
import sys
import tarfile
import urllib.request
from pathlib import Path
from typing import List

import typer
from tqdm import tqdm

from zod.cli.download import DownloadSettings, FilterSettings, _print_summary
from zod.cli.utils import SubDataset, Version
from zod.constants import MINI

TORRENT_FILEMAP = {
SubDataset.FRAMES: "https://academictorrents.com/download/329ae74426e067ef06b82241b5906400ca3caf03.torrent", # noqa
SubDataset.SEQUENCES: "https://academictorrents.com/download/95ece9c22470c4f7df51a82bcc3407cc038419be.torrent", # noqa
SubDataset.DRIVES: "https://academictorrents.com/download/a05ab2e524dc38c1498fcc9fb621329cc97e3837.torrent", # noqa
}

SIZE_FILEMAP = {
SubDataset.FRAMES: 9096553,
SubDataset.SEQUENCES: 1280237,
SubDataset.DRIVES: 290441,
}

DRIVES_FILES_TO_IDX = {
"annotations": 1,
"drives_mini": 2,
"images_front_blur": 3,
"infos": 4,
"lidar_velodyne": 5,
"oxts": 6,
"vehicle_data": 7,
}

SEQUENCES_FILES_TO_IDX = {
"annotations": 1,
"images_front_blur": 2,
"infos": 3,
"lidar_velodyne": 4,
"oxts": 5,
"sequences_mini": 6,
"vehicle_data": 7,
}

FRAMES_FILES_TO_IDX = {
"annotations": 1,
"frames_mini": 2,
"images_front_blur": 3,
"images_front_dnat": 4,
"infos": 5,
"lidar_velodyne_01after": 6,
"lidar_velodyne_01before": 7,
"lidar_velodyne_02after": 8,
"lidar_velodyne_02before": 9,
"lidar_velodyne_03after": 10,
"lidar_velodyne_03before": 11,
"lidar_velodyne_04after": 12,
"lidar_velodyne_04before": 13,
"lidar_velodyne_05after": 14,
"lidar_velodyne_05before": 15,
"lidar_velodyne_06after": 16,
"lidar_velodyne_06before": 17,
"lidar_velodyne_07after": 18,
"lidar_velodyne_07before": 19,
"lidar_velodyne_08after": 20,
"lidar_velodyne_08before": 21,
"lidar_velodyne_09after": 22,
"lidar_velodyne_09before": 23,
"lidar_velodyne_10after": 24,
"lidar_velodyne_10before": 25,
"lidar_velodyne": 26,
"oxts": 27,
}

app = typer.Typer(help="Zenseact Open Dataset Downloader", no_args_is_help=True)


def check_aria_install():
# assert that aria2c is installed
try:
subprocess.check_call(["aria2c", "--version"])
except FileNotFoundError:
print(
"aria2c is not installed. Please install aria2c to download the dataset. Using: `apt install aria2`" # noqa
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be nice to provide the user with a browser link to academic torrents, and a message stating that you don't have to install aria2 and can instead download and extract the dataset yourself with your chosen torrent client.
Bonus points if we tell the user which files to download depending on the provided flags :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Dont think adding the file indices is useful.

)
sys.exit(1)


def get_files_to_download(subset: SubDataset, filter_settings: FilterSettings) -> List[int]:
if subset == SubDataset.DRIVES:
files_to_idx = DRIVES_FILES_TO_IDX
elif subset == SubDataset.SEQUENCES:
files_to_idx = SEQUENCES_FILES_TO_IDX
elif subset == SubDataset.FRAMES:
files_to_idx = FRAMES_FILES_TO_IDX
else:
raise ValueError(f"Unknown subset: {subset}")

# mini dataset
if filter_settings.version == Version.MINI:
return [f"{subset.name.lower()}_mini.tar.gz"], [files_to_idx[f"{subset.name.lower()}_mini"]]

files = []
# annotations
if filter_settings.annotations:
files.append("annotations")

# camera data
if filter_settings.images:
if filter_settings.blur:
files.append("images_front_blur")
# we only have dnat images for frames
if filter_settings.dnat and subset == SubDataset.FRAMES:
files.append("images_front_dnat")

# gnss data
if filter_settings.oxts:
files.append("oxts")

# lidar_velodyne data
if filter_settings.lidar:
files.append("lidar_velodyne")
if subset == SubDataset.FRAMES:
if not filter_settings.num_scans_before == 0:
n_scans = (
10
if filter_settings.num_scans_before == -1
else filter_settings.num_scans_before
)
for i in range(1, n_scans + 1):
files.append(f"lidar_velodyne_{i:02d}before")

if not filter_settings.num_scans_after == 0:
n_scans = (
10 if filter_settings.num_scans_after == -1 else filter_settings.num_scans_after
)
for i in range(1, n_scans + 1):
files.append(f"lidar_velodyne_{i:02d}after")

# vehicle data
if filter_settings.vehicle_data:
if subset != SubDataset.FRAMES:
files.append("vehicle_data")

# frame infos
if filter_settings.infos:
files.append("infos")

file_indicies = [files_to_idx[f] for f in files]

files = [f"{file}.tar.gz" for file in files]

return files, file_indicies


def download_torrent_file(subset: SubDataset, output_dir: Path):
"""Download the torrent file for the given subset and version."""
url = TORRENT_FILEMAP[subset]
filename = f"{subset.name.lower()}.torrent"
print("Downloading torrent file: {}".format(filename))
try:
urllib.request.urlretrieve(url, filename)
# move the torrent file to the output directory
os.rename(filename, output_dir / filename)

assert os.path.getsize(output_dir / filename) == SIZE_FILEMAP[subset], (
f"Downloaded torrent file is not the correct size. "
f"Expected: {SIZE_FILEMAP[subset]}, "
f"Actual: {os.path.getsize(output_dir / filename)}"
)
return output_dir / filename
except Exception as e:
print(f"Failed to download torrent file: {e}")
exit(1)


def _download_dataset(
dl_settings: DownloadSettings, filter_settings: FilterSettings, subset: SubDataset
):
dl_dir = Path(dl_settings.output_dir) / "downloads"
dl_dir.mkdir(parents=True, exist_ok=True)

torrent_filepath = download_torrent_file(subset, dl_dir)

files, file_indicies = get_files_to_download(subset, filter_settings)
# create the aria2c command
cmd = [
"aria2c",
f"--select-file={','.join([str(i) for i in file_indicies])}",
f"--torrent-file={torrent_filepath}",
f"--dir={str(dl_dir)}",
"--seed-time=0",
"--continue=true",
wljungbergh marked this conversation as resolved.
Show resolved Hide resolved
]

if dl_settings.dry_run:
print(cmd)
cmd.append("--dry-run")

subprocess.check_call(cmd)

# aria2c will download adjacent files in the torrent, so we need to remove those files
# that we don't want
dl_dir = dl_dir / subset.name.lower()
for f in dl_dir.iterdir():
if f.name not in files:
f.unlink()

# if we are extracting, extract the tar gz files
if dl_settings.extract:
for f in dl_dir.iterdir():
if f.name.endswith(".tar.gz"):
with tarfile.open(f, "r:gz") as tar:
for member in tqdm(
iterable=tar.getmembers(),
total=len(tar.getmembers()),
desc=f"Extracting {f.name}",
):
tar.extract(member=member, path=dl_settings.output_dir)

# if we are removing the archives, remove them
if dl_settings.rm:
shutil.rmtree(dl_dir)


REQ = "Required Arguments"
GEN = "General Download Settings"
FIL = "General Filter Settings"
FRA = "Frames Filter Settings"
SEQ = "Sequences/Drives Filter Settings"


@app.command()
def download(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

almost everything here is shared with the dropbox download script. Can we have a single, shared, entrypoint to simplify maintenance?
something like:
zod download --torrent (with interactive question if not specified)
or hierarchical like
zod download torrent vs zod download dropbox (but maybe messy to share arguments then?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made it to be an argument in the download script, acc. your first suggestion. Now we have an entrypoint that chooses what downloading script to run based on user choice.

output_dir: str = typer.Option(
...,
help="Output directory where dataset will be extracted",
prompt="Where do you want to extract the dataset (e.g. ~/data/zod)?",
prompt_required=False,
rich_help_panel=REQ,
),
subset: SubDataset = typer.Option(
...,
help="The sub-dataset to download",
prompt="Which sub-dataset do you want to download?",
prompt_required=False,
rich_help_panel=REQ,
),
version: Version = typer.Option(
...,
help="The version of the dataset to download",
prompt="Which version do you want to download?",
prompt_required=False,
rich_help_panel=REQ,
),
# General download settings
rm: bool = typer.Option(False, help="Remove the downloaded archives", rich_help_panel=GEN),
dry_run: bool = typer.Option(False, help="Print what would be downloaded", rich_help_panel=GEN),
extract: bool = typer.Option(True, help="Unpack the archives", rich_help_panel=GEN),
extract_already_downloaded: bool = typer.Option(
False, help="Extract already downloaded archives", rich_help_panel=GEN
),
no_confirm: bool = typer.Option(
False,
"-y",
"--no-confirm/--confirm",
help="Don't ask for confirmation",
is_flag=True,
flag_value=False,
rich_help_panel=GEN,
),
# Filter settings
annotations: bool = typer.Option(True, help="Download annotations", rich_help_panel=FIL),
images: bool = typer.Option(True, help="Whether to download the images", rich_help_panel=FIL),
blur: bool = typer.Option(True, help="Download blur images", rich_help_panel=FIL),
lidar: bool = typer.Option(True, help="Download lidar data", rich_help_panel=FIL),
oxts: bool = typer.Option(True, help="Download oxts data", rich_help_panel=FIL),
infos: bool = typer.Option(True, help="Download infos", rich_help_panel=FIL),
vehicle_data: bool = typer.Option(True, help="Download the vehicle data", rich_help_panel=SEQ),
dnat: bool = typer.Option(False, help="Download DNAT images", rich_help_panel=FRA),
num_scans_before: int = typer.Option(
0,
help="Number of earlier lidar scans to download (-1 == all)",
rich_help_panel=FRA,
),
num_scans_after: int = typer.Option(
0, help="Number of later lidar scans to download (-1 == all)", rich_help_panel=FRA
),
parallel: bool = typer.Option(
False,
help="Not used in this script. Exist for compatibility with zod/cli/download.py",
rich_help_panel=GEN,
),
max_workers: int = typer.Option(
1,
help="Not used in this script. Exist for compatibility with zod/cli/download.py",
rich_help_panel=GEN,
),
):
# we have to make sure aria2c is installed
check_aria_install()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be nice if this check considered the dry_run flag. So that I could see what command would run even without having aria installed :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion. Added!


# initialize the download settings
download_settings = DownloadSettings(
url="",
output_dir=os.path.expanduser(output_dir),
rm=rm,
dry_run=dry_run,
extract=extract,
extract_already_downloaded=extract_already_downloaded,
parallel=False,
max_workers=1,
)

# initialize the filter settings
filter_settings = FilterSettings(
version=version,
annotations=annotations,
images=images,
blur=blur,
dnat=dnat,
lidar=lidar,
oxts=oxts,
infos=infos,
vehicle_data=vehicle_data,
num_scans_before=num_scans_before if subset == SubDataset.FRAMES else -1,
num_scans_after=num_scans_after if subset == SubDataset.FRAMES else -1,
)

if not no_confirm:
_print_summary(download_settings, filter_settings, subset)
typer.confirm(
f"Download with the above settings?",
abort=True,
)

_download_dataset(download_settings, filter_settings, subset)


if __name__ == "__main__":
app()