diff --git a/environment.yml b/environment.yml index dd8fec7f..b15dc206 100644 --- a/environment.yml +++ b/environment.yml @@ -1,18 +1,19 @@ -name: happypose +name: happypose2 channels: - conda-forge - pytorch + - nvidia - anaconda - defaults dependencies: + - nvidia::cudatoolkit==11.3.1 - python=3.9 - pip - wget - python-wget - joblib - - pytorch==1.11.0 + - pytorch::pytorch==1.11.0 - torchvision==0.12.0 - - cudatoolkit==11.3.1 - ipython - ipykernel - jupyterlab @@ -90,4 +91,4 @@ dependencies: - webdataset - opencv-contrib-python - roma - - torchgeometry \ No newline at end of file + - torchgeometry diff --git a/happypose/pose_estimators/megapose/src/megapose/evaluation/prediction_runner.py b/happypose/pose_estimators/megapose/src/megapose/evaluation/prediction_runner.py index 2fc853ef..a0afe405 100644 --- a/happypose/pose_estimators/megapose/src/megapose/evaluation/prediction_runner.py +++ b/happypose/pose_estimators/megapose/src/megapose/evaluation/prediction_runner.py @@ -106,6 +106,9 @@ CNOS_SUBMISSION_PATHS = {ds_name: CNOS_SUBMISSION_DIR / fname for ds_name, fname in CNOS_SUBMISSION_FILES.items()} # Check if all paths exist +print("cnos values =", CNOS_SUBMISSION_PATHS.values()) +print("len cnos =", len(CNOS_SUBMISSION_FILES)) +print("sum=", sum(p.exists() for p in CNOS_SUBMISSION_PATHS.values())) assert( sum(p.exists() for p in CNOS_SUBMISSION_PATHS.values()) == len(CNOS_SUBMISSION_FILES)) ################################## ################################## @@ -292,7 +295,7 @@ def get_predictions(self, pose_estimator: PoseEstimator) -> Dict[str, PoseEstima # ############ RUN ONLY BEGINNING OF DATASET # # if n > 0: - # if n < 298: + #if n < 220: # # if n != 582: # print('################') # print('Prediction runner SKIP') diff --git a/happypose/pose_estimators/megapose/src/megapose/scripts/distributed.py b/happypose/pose_estimators/megapose/src/megapose/scripts/distributed.py index e7a4aa4d..995f1f89 100644 --- 
a/happypose/pose_estimators/megapose/src/megapose/scripts/distributed.py +++ b/happypose/pose_estimators/megapose/src/megapose/scripts/distributed.py @@ -120,7 +120,7 @@ def init_distributed_mode(): backend="nccl", rank=rank, world_size=world_size, - timeout=datetime.timedelta(seconds=1800 * 4), + timeout=datetime.timedelta(seconds=1800 * 16), ) torch.distributed.barrier() diff --git a/happypose/toolbox/utils/distributed.py b/happypose/toolbox/utils/distributed.py index 72750675..90089555 100644 --- a/happypose/toolbox/utils/distributed.py +++ b/happypose/toolbox/utils/distributed.py @@ -39,7 +39,7 @@ def get_tmp_dir() -> Path: if "JOB_DIR" in os.environ: tmp_dir = Path(os.environ["JOB_DIR"]) / "tmp" else: - tmp_dir = Path("/tmp/megapose_job") + tmp_dir = Path("/gpfsscratch/rech/zja/udg82mu/happypose_datasets/results/tmp/megapose_job") tmp_dir.parent.mkdir(exist_ok=True) tmp_dir.mkdir(exist_ok=True) return tmp_dir @@ -149,6 +149,6 @@ def init_distributed_mode() -> None: backend="nccl", rank=rank, world_size=world_size, - timeout=datetime.timedelta(seconds=4 * 1800), # 2 hours + timeout=datetime.timedelta(seconds=16 * 1800), # 8 hours ) torch.distributed.barrier() diff --git a/happypose/toolbox/utils/tensor_collection.py b/happypose/toolbox/utils/tensor_collection.py index e019a3c1..caf3b76d 100644 --- a/happypose/toolbox/utils/tensor_collection.py +++ b/happypose/toolbox/utils/tensor_collection.py @@ -26,6 +26,9 @@ # MegaPose from happypose.toolbox.utils.distributed import get_rank, get_world_size +from happypose.pose_estimators.megapose.src.megapose.config import ( + RESULTS_DIR, +) def concatenate(datas): datas = [data for data in datas if len(data) > 0] @@ -168,6 +171,7 @@ def gather_distributed(self, tmp_dir=None): if rank > 0: tmp_file = tmp_file_template.format(rank=rank) + print("tmp_file =", tmp_file) torch.save(self, tmp_file) if world_size > 1: