prs-eth · markkua · May 15, 2024 · May 15, 2024
diff --git a/script/dataset_preprocess/hypersim/README.md b/script/dataset_preprocess/hypersim/README.md
@@ -0,0 +1,22 @@
+# Hypersim preprocessing
+
+## Download
+
+Download the [Hypersim](https://github.com/apple/ml-hypersim) dataset using [this script](https://github.com/apple/ml-hypersim/blob/20f398f4387aeca73175494d6a2568f37f372150/code/python/tools/dataset_download_images.py).
+
+Download the scene split file from [here](https://github.com/apple/ml-hypersim/blob/main/evermotion_dataset/analysis/metadata_images_split_scene_v1.csv).
+
+## Process dataset
+
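+Run the preprocessing script. By default it reads the raw data from `data/Hypersim/raw_data` and writes to `data/Hypersim/processed`; pass `--dataset_dir` and `--output_dir` to override: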
+
+```bash
+python script/dataset_preprocess/hypersim/preprocess_hypersim.py --split_csv /path/to/metadata_images_split_scene_v1.csv
+```
+
+(optional) Tar the processed data, for example:
+
+```bash
+cd data/Hypersim/processed/train
+tar -cf ../../hypersim_processed_train.tar .
+```
diff --git a/script/dataset_preprocess/hypersim/hypersim_util.py b/script/dataset_preprocess/hypersim/hypersim_util.py
@@ -0,0 +1,69 @@
+# Author: Bingxin Ke
+# Last modified: 2024-02-19
+
+
+from pylab import count_nonzero, clip, np
+
+
+# Adapted from https://github.com/apple/ml-hypersim/blob/main/code/python/tools/scene_generate_images_tonemap.py
+def tone_map(rgb, entity_id_map):
+    """Tone-map a linear HDR image into displayable [0, 1] values.
+
+    One global scale is picked so that the 90th-percentile brightness of the
+    valid pixels lands at 0.8 after gamma correction (exponent 1/2.2). The
+    scaled image is then gamma-corrected and clipped to [0, 1].
+
+    Args:
+        rgb: image indexed as ``rgb[:, :, c]``; channels 0/1/2 are weighted
+            as R/G/B when computing brightness.
+        entity_id_map: per-pixel ids used as a validity mask; ``-1`` marks an
+            invalid pixel and ``0`` must not appear.
+
+    Returns:
+        The tone-mapped image, same shape as ``rgb``, clipped to [0, 1].
+    """
+    assert (entity_id_map != 0).all()
+
+    gamma = 1.0 / 2.2  # standard gamma correction exponent
+    inv_gamma = 1.0 / gamma
+    target_percentile = 90  # brightness percentile of the input image ...
+    target_brightness = 0.8  # ... that should end up this bright after scaling
+    # Below this percentile brightness the scale is forced to 0.0, which also
+    # avoids a divide-by-zero.
+    min_brightness = 0.0001
+
+    is_valid = entity_id_map != -1
+
+    # With no valid pixels there is nothing to measure: keep the scale at 1.0.
+    scale = 1.0
+    if count_nonzero(is_valid) != 0:
+        # "CCIR601 YIQ" method for computing brightness
+        luma = 0.3 * rgb[:, :, 0] + 0.59 * rgb[:, :, 1] + 0.11 * rgb[:, :, 2]
+        current_brightness = np.percentile(luma[is_valid], target_percentile)
+
+        if current_brightness < min_brightness:
+            scale = 0.0
+        else:
+            # Solves (scale * current_brightness) ** gamma == target_brightness.
+            # Equivalent to the expression Snavely uses in
+            # https://github.com/snavely/pbrs_tonemapper/blob/master/tonemap_rgbe.py:
+            # np.exp(np.log(desired) * inv_gamma - np.log(current))
+            scale = np.power(target_brightness, inv_gamma) / current_brightness
+
+    scaled = np.maximum(scale * rgb, 0)
+    return clip(np.power(scaled, gamma), 0, 1)
+
+
+# According to https://github.com/apple/ml-hypersim/issues/9
+def dist_2_depth(width, height, flt_focal, distance):
+    """Rescale per-pixel ``distance`` values by ``flt_focal / |ray|``.
+
+    For every pixel a vector ``(x, y, flt_focal)`` is built, where ``x`` and
+    ``y`` are the pixel-centre offsets from the image centre. ``distance`` is
+    divided by the length of that vector and multiplied by ``flt_focal``.
+    NOTE(review): presumably this turns distance-to-camera-centre into depth
+    along the optical axis, as discussed in the linked Hypersim issue - confirm.
+
+    Args:
+        width: number of pixel columns.
+        height: number of pixel rows.
+        flt_focal: focal length, in the same units as the pixel offsets.
+        distance: values that broadcast against a ``(height, width)`` array.
+
+    Returns:
+        ``distance / norm * flt_focal`` where ``norm`` has shape
+        ``(height, width)``.
+    """
+    # Pixel-centre offsets from the image centre, stored as float32.
+    xs = np.linspace((-0.5 * width) + 0.5, (0.5 * width) - 0.5, width)
+    ys = np.linspace((-0.5 * height) + 0.5, (0.5 * height) - 0.5, height)
+    grid_x, grid_y = np.meshgrid(xs.astype(np.float32), ys.astype(np.float32))
+    grid_z = np.full((height, width), flt_focal, np.float32)
+
+    # (height, width, 3) vector per pixel; its 2-norm is taken over the last axis.
+    rays = np.stack([grid_x, grid_y, grid_z], axis=2)
+    ray_length = np.linalg.norm(rays, 2, 2)
+
+    return distance / ray_length * flt_focal
diff --git a/script/dataset_preprocess/hypersim/preprocess_hypersim.py b/script/dataset_preprocess/hypersim/preprocess_hypersim.py
@@ -0,0 +1,149 @@
+# Author: Bingxin Ke
+# Last modified: 2024-02-19
+
+import argparse
+import os
+
+import cv2
+import h5py
+import numpy as np
+import pandas as pd
+from hypersim_util import dist_2_depth, tone_map
+from tqdm import tqdm
+
+# Image size and focal length passed to dist_2_depth for every frame.
+# NOTE(review): presumably all three are in pixels and match the Hypersim
+# render settings - confirm against the dataset documentation.
+IMG_WIDTH = 1024
+IMG_HEIGHT = 768
+FOCAL_LENGTH = 886.81
+
+if "__main__" == __name__:
+    # Convert raw Hypersim HDF5 frames into tone-mapped 8-bit RGB PNGs and
+    # 16-bit depth PNGs (depth multiplied by 1000), one output tree per split.
+    # For every split a "<rgb> <depth>" filename list and a CSV with per-frame
+    # statistics are written next to the images.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--split_csv",
+        type=str,
+        default="data/Hypersim/metadata_images_split_scene_v1.csv",
+    )
+    parser.add_argument("--dataset_dir", type=str, default="data/Hypersim/raw_data")
+    parser.add_argument("--output_dir", type=str, default="data/Hypersim/processed")
+
+    args = parser.parse_args()
+
+    split_csv = args.split_csv
+    dataset_dir = args.dataset_dir
+    output_dir = args.output_dir
+
+    # %%
+    raw_meta_df = pd.read_csv(split_csv)
+    meta_df = raw_meta_df[raw_meta_df.included_in_public_release].copy()
+
+    # Largest value representable in the 16-bit depth PNG.
+    depth_png_max = np.iinfo(np.uint16).max
+
+    # %%
+    for split in ["train", "val", "test"]:
+        split_output_dir = os.path.join(output_dir, split)
+        # exist_ok: allow restarting after an interrupted or partial run.
+        os.makedirs(split_output_dir, exist_ok=True)
+
+        split_meta_df = meta_df[meta_df.split_partition_name == split].copy()
+        # Per-frame result columns, filled in below (order defines CSV layout).
+        for prefix in ("rgb", "depth"):
+            split_meta_df[f"{prefix}_path"] = None
+            for stat in ("mean", "std", "min", "max"):
+                split_meta_df[f"{prefix}_{stat}"] = np.nan
+        split_meta_df["invalid_ratio"] = np.nan
+
+        for i, row in tqdm(split_meta_df.iterrows(), total=len(split_meta_df)):
+            # Locate the three input files of this frame
+            frame_tag = f"frame.{row.frame_id:04d}"
+            images_dir = os.path.join(dataset_dir, row.scene_name, "images")
+            final_dir = os.path.join(
+                images_dir, f"scene_{row.camera_name}_final_hdf5"
+            )
+            geometry_dir = os.path.join(
+                images_dir, f"scene_{row.camera_name}_geometry_hdf5"
+            )
+            rgb_file = os.path.join(final_dir, f"{frame_tag}.color.hdf5")
+            dist_file = os.path.join(geometry_dir, f"{frame_tag}.depth_meters.hdf5")
+            entity_id_file = os.path.join(
+                geometry_dir, f"{frame_tag}.render_entity_id.hdf5"
+            )
+            # Explicit check (not assert, which is stripped under -O) covering
+            # all three inputs, so a missing file is reported by name.
+            for required_file in (rgb_file, dist_file, entity_id_file):
+                if not os.path.isfile(required_file):
+                    raise FileNotFoundError(
+                        f"Missing Hypersim file: {required_file}"
+                    )
+
+            # Load data
+            with h5py.File(rgb_file, "r") as f:
+                rgb = np.array(f["dataset"]).astype(float)
+            with h5py.File(dist_file, "r") as f:
+                dist_from_center = np.array(f["dataset"]).astype(float)
+            with h5py.File(entity_id_file, "r") as f:
+                render_entity_id = np.array(f["dataset"]).astype(int)
+
+            # Tone map
+            rgb_color_tm = tone_map(rgb, render_entity_id)
+            rgb_int = (rgb_color_tm * 255).astype(np.uint8)  # [H, W, RGB]
+
+            # Distance -> depth
+            plane_depth = dist_2_depth(
+                IMG_WIDTH, IMG_HEIGHT, FOCAL_LENGTH, dist_from_center
+            )
+            valid_mask = render_entity_id != -1
+
+            # Record invalid ratio, then zero out the invalid pixels
+            invalid_ratio = np.count_nonzero(~valid_mask) / valid_mask.size
+            plane_depth[~valid_mask] = 0
+
+            # Save as png
+            os.makedirs(
+                os.path.join(split_output_dir, row.scene_name), exist_ok=True
+            )
+
+            # Paths stored in the metadata are relative to split_output_dir.
+            rgb_rel_path = os.path.join(
+                row.scene_name, f"rgb_{row.camera_name}_fr{row.frame_id:04d}.png"
+            )
+            rgb_out_file = os.path.join(split_output_dir, rgb_rel_path)
+            # cv2.imwrite reports failure through its return value only.
+            if not cv2.imwrite(rgb_out_file, cv2.cvtColor(rgb_int, cv2.COLOR_RGB2BGR)):
+                raise OSError(f"Failed to write {rgb_out_file}")
+
+            # Scale by 1000 and store as uint16. Values beyond the uint16
+            # range are saturated: a plain float -> uint16 cast would wrap
+            # around (or be undefined) and yield small, plausible-looking
+            # depths for far-away pixels.
+            depth_uint16 = np.clip(plane_depth * 1000.0, 0, depth_png_max).astype(
+                np.uint16
+            )
+            depth_rel_path = os.path.join(
+                row.scene_name,
+                f"depth_plane_{row.camera_name}_fr{row.frame_id:04d}.png",
+            )
+            depth_out_file = os.path.join(split_output_dir, depth_rel_path)
+            if not cv2.imwrite(depth_out_file, depth_uint16):
+                raise OSError(f"Failed to write {depth_out_file}")
+
+            # Meta data
+            split_meta_df.at[i, "rgb_path"] = rgb_rel_path
+            split_meta_df.at[i, "rgb_mean"] = np.mean(rgb_int)
+            split_meta_df.at[i, "rgb_std"] = np.std(rgb_int)
+            split_meta_df.at[i, "rgb_min"] = np.min(rgb_int)
+            split_meta_df.at[i, "rgb_max"] = np.max(rgb_int)
+
+            split_meta_df.at[i, "depth_path"] = depth_rel_path
+            # Statistics describe what was actually written to the PNG.
+            restored_depth = depth_uint16 / 1000.0
+            split_meta_df.at[i, "depth_mean"] = np.mean(restored_depth)
+            split_meta_df.at[i, "depth_std"] = np.std(restored_depth)
+            split_meta_df.at[i, "depth_min"] = np.min(restored_depth)
+            split_meta_df.at[i, "depth_max"] = np.max(restored_depth)
+
+            split_meta_df.at[i, "invalid_ratio"] = invalid_ratio
+
+        # One "<rgb_path> <depth_path>" pair per line, no trailing newline.
+        filename_list_file = os.path.join(
+            split_output_dir, f"filename_list_{split}.txt"
+        )
+        with open(filename_list_file, "w") as f:
+            f.write(
+                "\n".join(
+                    f"{rgb_rel} {depth_rel}"
+                    for rgb_rel, depth_rel in zip(
+                        split_meta_df["rgb_path"], split_meta_df["depth_path"]
+                    )
+                )
+            )
+
+        # Let pandas open the file itself so it controls newline handling.
+        split_meta_df.to_csv(
+            os.path.join(split_output_dir, f"filename_meta_{split}.csv"),
+            header=True,
+        )
+
+    print("Preprocess finished")