UniDepth training #76

Open · wants to merge 2 commits into `main`
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@
__pycache__/

#scripts
*.sh
_*.sh

# package
unidepth.egg-info
6 changes: 5 additions & 1 deletion README.md
@@ -1,6 +1,5 @@
[![arXiv](https://img.shields.io/badge/arXiv-2403.18913-blue?logo=arxiv&color=%23B31B1B)](https://arxiv.org/abs/2403.18913)
[![ProjectPage](https://img.shields.io/badge/Project_Page-UniDepth-blue)](https://lpiccinelli-eth.github.io/pub/unidepth/)
<!-- [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Coming%20Soon-yellow)](https://huggingface.co/spaces/lpiccinelli/UniDepth) -->

[![KITTI Benchmark](https://img.shields.io/badge/KITTI%20Benchmark-1st%20(at%20submission%20time)-orange)](https://www.cvlibs.net/datasets/kitti/eval_depth.php?benchmark=depth_prediction)
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/unidepth-universal-monocular-metric-depth/monocular-depth-estimation-on-nyu-depth-v2)](https://paperswithcode.com/sota/monocular-depth-estimation-on-nyu-depth-v2?p=unidepth-universal-monocular-metric-depth)
@@ -211,6 +210,11 @@ To summarize the main differences are:
- ONNX support


## Training

Please [visit the training README](scripts/README.md) for more information.


## Results

### Metric Depth Estimation
109 changes: 109 additions & 0 deletions configs/train_v1_vitl14.json
@@ -0,0 +1,109 @@
{
"generic": {
"seed": 13,
"deterministic": true
},
"training": {
"n_iters": 300000,
"batch_size": 32,
"validation_interval": 1000,
"nsteps_accumulation_gradient": 1,
"use_checkpoint": false,
"lr": 1e-4,
"lr_final": 1e-6,
"lr_warmup": 1.0,
"cycle_beta": false,

"wd": 0.1,
"wd_final": 0.1,
"warmup_iters": 75000,
"ld": 1.0,

"drop_path": 0.0,
"ema": true,
"f16": true,
"clipping": 1.0,
"losses": {
"depth": {
"name": "SILog",
"weight": 1.0,
"output_fn": "sqrt",
"input_fn": "log",
"dims": [-2,-1],
"integrated": 0.15
},
"invariance": {
"name": "SelfDistill",
"weight": 0.1,
"output_fn": "sqrt"
},
"camera": {
"name": "Regression",
"weight": 0.25,
"gamma": 1.0,
"alpha": 1.0,
"fn": "l2",
"output_fn": "sqrt",
"input_fn": "linear"
}
}},
"data": {
"image_shape": [480, 640],
"normalization": "imagenet",
"pairs": 2,
"num_frames": 1,
"sampling":{
"Sintel": 1.0,
"ADT": 1.0,
"KITTI": 1.0,
"HM3D": 1.0,
"ScanNet": 1.0
},
"train_datasets": [
"ScanNet"
],
"val_datasets": [
"IBims"
],
"data_root": "datasets",
"crop": "garg",
"augmentations": {
"random_scale": 2.0,
"random_jitter": 0.4,
"jitter_p": 0.8,
"random_blur": 2.0,
"blur_p": 0.2,
"random_gamma": 0.2,
"gamma_p": 0.8,
"grayscale_p": 0.2,
"flip_p": 0.5,
"test_context": 1.0,
"shape_constraints": {
"ratio_bounds": [0.66, 2.0],
"pixels_max": 2600,
"pixels_min": 1200,
"height_min": 15,
"width_min": 15,
"shape_mult": 14,
"sample": true
}
}
},
"model": {
"name": "UniDepthV1",
"num_heads": 8,
"expansion": 4,
"pixel_decoder": {
"hidden_dim": 512,
"depths": [3, 2, 1],
"dropout": 0.0
},
"pixel_encoder": {
"name": "dinov2_vits14",
Collaborator: Should it be large here?

"norm": true,
"pretrained": "",
"lr": 1e-5,
"frozen_stages": 0
}
}
}
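For reference, a config like the one above is plain JSON and can be parsed and inspected before launching training. The sketch below loads a fragment with the same key names as the file above; how `train.py` actually consumes these fields is not shown here and is an assumption.

```python
import json

# A fragment of the training config above (same key names as the JSON file).
cfg_text = """
{
  "training": {"n_iters": 300000, "batch_size": 32, "lr": 1e-4},
  "data": {"image_shape": [480, 640], "train_datasets": ["ScanNet"]},
  "model": {"name": "UniDepthV1", "pixel_encoder": {"name": "dinov2_vits14"}}
}
"""

cfg = json.loads(cfg_text)
lr = cfg["training"]["lr"]         # learning rate used by the optimizer
h, w = cfg["data"]["image_shape"]  # training resolution (480 x 640)
```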
11 changes: 0 additions & 11 deletions install.sh

This file was deleted.

3 changes: 3 additions & 0 deletions requirements.txt
@@ -55,6 +55,9 @@ pycodestyle
pyflakes
pyparsing
python-dateutil
# pytorch3d is needed only for chamfer distance calculation
# you can compile it from ops/knn and avoid this dependency
pytorch3d @ "git+https://github.com/facebookresearch/pytorch3d.git@stable"
pytz
PyYAML
requests
57 changes: 57 additions & 0 deletions scripts/README.md
@@ -0,0 +1,57 @@
## Training

We provide the `train.py` script, which loads the dataset, initializes the model, and starts the training. From the root of the repo:

```bash
export REPO=`pwd`
export PYTHONPATH=${REPO}:${PYTHONPATH}

# Adapt all this to your setup
export TMPDIR="/tmp"
export TORCH_HOME=${TMPDIR}
export HUGGINGFACE_HUB_CACHE=${TMPDIR}
export WANDB_HOME=${TMPDIR}
export DATAROOT=<where-you-stored-the-hdf5>


export MASTER_PORT=$((( RANDOM % 600 ) + 29400 ))
if [ $NNODES -gt 1 ]; then
export MASTER_PORT=29400
fi

# this is the config that will be used
export CFG="train_v1_vitl14.json"
```

If you are on a machine without SLURM, you can run the following:
```bash
# make the following input-dependent for multi-node
export NNODES=1
export RANK=0
export MASTER_ADDR=127.0.0.1
export CUDA_VISIBLE_DEVICES="0" # set yours

export GPUS=$(echo ${CUDA_VISIBLE_DEVICES} | tr ',' '\n' | wc -l)
echo "Start script with python from: `which python`"
torchrun --rdzv-backend=c10d --nnodes=${NNODES} --nproc_per_node=${GPUS} --rdzv-endpoint ${MASTER_ADDR}:${MASTER_PORT} ${REPO}/scripts/train.py --config-file ${REPO}/configs/${CFG} --distributed
```
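The `GPUS` line above counts the devices listed in `CUDA_VISIBLE_DEVICES` to size `--nproc_per_node`. A minimal Python sketch of the same logic (the function name is illustrative, not part of the repo):

```python
import os

def visible_gpu_count(env=os.environ):
    # Mirrors: echo ${CUDA_VISIBLE_DEVICES} | tr ',' '\n' | wc -l
    # Note: like the shell pipeline, an unset/empty variable yields 1.
    return len(env.get("CUDA_VISIBLE_DEVICES", "").split(","))
```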

If your system uses SLURM, the scheduler sets all of this information and you only have to run:
```bash
srun -c ${SLURM_CPUS_PER_TASK} --kill-on-bad-exit=1 python -u ${REPO}/scripts/train.py --config-file ${REPO}/configs/${CFG} --master-port ${MASTER_PORT} --distributed
```

Training is currently available only for V1.<br>
Changes for V2 are coming in the upcoming month, and its "trainable model code" will be made public then.


### Datasets

We used both image-based and sequence-based datasets. The `ImageDataset` class is kept for legacy reasons only, as we moved image-based datasets to be "dummy" single-frame sequences.<br>
We [provide two example datasets to get familiar with the pipeline and structure, namely iBims-1 and Sintel](https://drive.google.com/drive/folders/1FKsa5-b3EX0ukZq7bxord5fC5OfUiy16?usp=sharing), image- and sequence-based, respectively.<br>
You can adapt the data loading and processing to your own data; however, you will need to keep the same interface for the model to stay consistent and train "out-of-the-box".<br>
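The "dummy" single-frame sequence idea can be sketched as a thin wrapper that presents each image sample as a one-frame sequence. The class and key names below are purely illustrative assumptions, not UniDepth's actual dataset interface:

```python
# Hypothetical sketch: expose an image-based dataset as single-frame sequences,
# so image and video data share one sequence-shaped interface.
class SingleFrameSequenceDataset:
    def __init__(self, image_samples):
        # image_samples: list of per-image sample dicts (illustrative)
        self.image_samples = image_samples

    def __len__(self):
        return len(self.image_samples)

    def __getitem__(self, idx):
        # Each image becomes a sequence containing exactly one frame.
        return {"frames": [self.image_samples[idx]], "num_frames": 1}
```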


### Additional dependencies

We require Chamfer distance for the evaluation, hence we rely on PyTorch3D's KNN. If you run into any issue installing PyTorch3D, you can instead compile the KNN operation under `ops/knn` by running `bash compile.sh` from the directory `$REPO/unidepth/ops/knn`.
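For intuition, Chamfer distance is just two nearest-neighbor searches between point sets, which is why a KNN kernel is the only hard dependency. A naive O(N·M) NumPy sketch (for illustration only; PyTorch3D's KNN or the `ops/knn` kernel does the neighbor search efficiently on the GPU):

```python
import numpy as np

def chamfer_distance(a, b):
    """Symmetric Chamfer distance between point sets a (N,3) and b (M,3).

    For every point in a, take the distance to its nearest neighbor in b,
    and vice versa; the Chamfer distance is the sum of the two means."""
    # Pairwise Euclidean distances, shape (N, M).
    d = np.linalg.norm(a[:, None, :] - b[None, :, :], axis=-1)
    return d.min(axis=1).mean() + d.min(axis=0).mean()
```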