Merge pull request #266 from una-auxme/260-feature-distribute-the-ser…

…ver-and-agent-on-two-pcs 260 feature distribute the server and agent on two pcs
una-auxme · Jun 28, 2024 · cf708dd · cf708dd
2 parents 9c52db2 + b052df7
commit cf708dd
Show file tree

Hide file tree

Showing 7 changed files with 259 additions and 1 deletion.
diff --git a/build/Taskfile b/build/Taskfile
@@ -16,6 +16,16 @@ task:run_dev() {
   docker:compose -f docker-compose.dev.yaml up
 }
 
+# Start the services defined in docker-compose.distributed.yaml.
+task:run_distributed() {
+  # Relax X server access control for local connections before starting the
+  # containers (presumably so GUI windows from the containers can be shown on
+  # the host display -- confirm).
+  xhost +local:docker
+  docker:compose -f docker-compose.distributed.yaml up
+}
+
+# Start the services defined in docker-compose.dev.distributed.yaml.
+task:run_dev_distributed() {
+  # Relax X server access control for local connections before starting the
+  # containers (presumably so GUI windows from the containers can be shown on
+  # the host display -- confirm).
+  xhost +local:docker
+  docker:compose -f docker-compose.dev.distributed.yaml up
+}
+
 task:restart() {
     container="$1"
     docker:compose restart "${container:-agent}"

diff --git a/build/docker-compose.dev.distributed.yaml b/build/docker-compose.dev.distributed.yaml
@@ -0,0 +1,77 @@
+version: "3"
+
+services:
+  # Runs the alpine/flake8 image with `.` as its argument; the repository root
+  # (../ relative to this file) is mounted at /apps.
+  flake8:
+    image: alpine/flake8
+    command: .
+    volumes:
+      - ../:/apps
+
+  # Image built from docker/comlipy and invoked with `.` as its argument; the
+  # repository root is mounted at /apps.
+  comlipy:
+    build: docker/comlipy
+    command: .
+    volumes:
+      - ../:/apps
+
+  # Runs `markdownlint .` in the markdownlint image pinned to 0.32.2; the
+  # repository root is mounted at /md.
+  mdlint:
+    image: peterdavehello/markdownlint:0.32.2
+    command: markdownlint .
+    volumes:
+      - ../:/md
+
+  # ROS master: runs `roscore` in the ros:noetic image on the `ros` network.
+  roscore:
+    image: ros:noetic
+    command: roscore
+    environment:
+      - ROS_MASTER_URI=http://roscore:11311
+      - ROS_HOSTNAME=roscore
+    # Port 11311 (the port used in ROS_MASTER_URI) is exposed to other
+    # containers only; `expose` does not publish it on the host.
+    expose:
+      - 11311
+    networks:
+      - ros
+
+  # Agent container for the distributed development setup: built from
+  # build/docker/agent/Dockerfile with the repository root as build context,
+  # reserves an NVIDIA GPU and launches agent/launch/dev.launch.
+  agent:
+    build:
+      dockerfile: build/docker/agent/Dockerfile
+      args:
+        # UID/GID passed to the image build; default to 1000 when the host
+        # variables are not set.
+        - USER_UID=${DOCKER_HOST_UNIX_UID:-1000}
+        - USER_GID=${DOCKER_HOST_UNIX_GID:-1000}
+      context: ../
+    init: true
+    tty: true
+    shm_size: 2gb
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [ gpu ]
+    # The commented-out commands below are alternatives kept for reference;
+    # their --host values refer to the single-machine setup.
+    #command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
+    command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
+    # command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=paf23-carla-simulator-1 --track=MAP"
+
+    logging:
+      driver: "local"
+    environment:
+      - DISPLAY
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=all
+      - ROS_MASTER_URI=http://roscore:11311
+      # Replace <carla-server-ip-address> with the IP address of the remote
+      # machine that runs the CARLA server.
+      - CARLA_SIM_HOST=<carla-server-ip-address>
+      - ROS_HOSTNAME=agent
+      - XDG_RUNTIME_DIR=/tmp/runtime-carla
+    volumes:
+      - /tmp/.X11-unix:/tmp/.X11-unix
+      # If you change this mount, also change the copy command in the
+      # Dockerfile (the original note names `build/docker/build/Dockerfile`;
+      # NOTE(review): this service builds from build/docker/agent/Dockerfile --
+      # confirm which file is meant).
+      - ../:/workspace
+      # mount git config for dvc
+      - ../.gitconfig:/home/carla/.gitconfig
+    networks:
+      - carla
+      - ros
+
+# Networks referenced by the services above; both use default settings.
+networks:
+  carla:
+  ros:
diff --git a/build/docker-compose.dev.yaml b/build/docker-compose.dev.yaml
@@ -28,11 +28,18 @@ services:
       resources:
         limits:
           memory: 16G
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [ gpu ]
     expose:
       - 2000
       - 2001
       - 2002
     environment:
+      - DISPLAY
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=all
       - XDG_RUNTIME_DIR=/tmp/runtime-carla
     networks:
       - carla
@@ -61,6 +68,12 @@ services:
     init: true
     tty: true
     shm_size: 2gb
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [ gpu ]
     #command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
     command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
     # command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=paf23-carla-simulator-1 --track=MAP"
@@ -69,6 +82,8 @@ services:
       driver: "local"
     environment:
       - DISPLAY
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=all
       - ROS_MASTER_URI=http://roscore:11311
       - CARLA_SIM_HOST=carla-simulator
       - ROS_HOSTNAME=agent

diff --git a/build/docker-compose.distributed.yaml b/build/docker-compose.distributed.yaml
@@ -0,0 +1,77 @@
+version: "3"
+
+services:
+  # Runs the alpine/flake8 image with `.` as its argument; the repository root
+  # (../ relative to this file) is mounted at /apps.
+  flake8:
+    image: alpine/flake8
+    command: .
+    volumes:
+      - ../:/apps
+
+  # Image built from docker/comlipy and invoked with `.` as its argument; the
+  # repository root is mounted at /apps.
+  comlipy:
+    build: docker/comlipy
+    command: .
+    volumes:
+      - ../:/apps
+
+  # Runs `markdownlint .` in the markdownlint image pinned to 0.32.2; the
+  # repository root is mounted at /md.
+  mdlint:
+    image: peterdavehello/markdownlint:0.32.2
+    command: markdownlint .
+    volumes:
+      - ../:/md
+
+  # ROS master: runs `roscore` in the ros:noetic image on the `ros` network.
+  roscore:
+    image: ros:noetic
+    command: roscore
+    environment:
+      - ROS_MASTER_URI=http://roscore:11311
+      - ROS_HOSTNAME=roscore
+    # Port 11311 (the port used in ROS_MASTER_URI) is exposed to other
+    # containers only; `expose` does not publish it on the host.
+    expose:
+      - 11311
+    networks:
+      - ros
+
+  # Agent container for the distributed setup: built from
+  # build/docker/agent/Dockerfile with the repository root as build context,
+  # reserves an NVIDIA GPU and runs the leaderboard evaluator against the
+  # CARLA server named by CARLA_SIM_HOST.
+  agent:
+    build:
+      dockerfile: build/docker/agent/Dockerfile
+      args:
+        # UID/GID passed to the image build; default to 1000 when the host
+        # variables are not set.
+        - USER_UID=${DOCKER_HOST_UNIX_UID:-1000}
+        - USER_GID=${DOCKER_HOST_UNIX_GID:-1000}
+      context: ../
+    init: true
+    tty: true
+    shm_size: 2gb
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [ gpu ]
+    # The commented-out commands below are alternatives kept for reference.
+    #command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
+    #command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
+    # `$$` is Compose's escape for a literal `$`, so ${CARLA_SIM_HOST} is
+    # expanded by bash inside the container from the `environment` entry
+    # below. The server address therefore has to be set in one place only,
+    # and an unreplaced `<...>` placeholder is no longer parsed by bash as
+    # input/output redirections.
+    command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=$${CARLA_SIM_HOST} --track=MAP"
+
+    logging:
+      driver: "local"
+    environment:
+      - DISPLAY
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=all
+      - ROS_MASTER_URI=http://roscore:11311
+      # Replace <carla-server-ip-address> with the IP address of the remote
+      # machine that runs the CARLA server.
+      - CARLA_SIM_HOST=<carla-server-ip-address>
+      - ROS_HOSTNAME=agent
+      - XDG_RUNTIME_DIR=/tmp/runtime-carla
+    volumes:
+      - /tmp/.X11-unix:/tmp/.X11-unix
+      # If you change this mount, also change the copy command in the
+      # Dockerfile (the original note names `build/docker/build/Dockerfile`;
+      # NOTE(review): this service builds from build/docker/agent/Dockerfile --
+      # confirm which file is meant).
+      - ../:/workspace
+      # mount git config for dvc
+      - ../.gitconfig:/home/carla/.gitconfig
+    networks:
+      - carla
+      - ros
+
+# Networks referenced by the services above; both use default settings.
+networks:
+  carla:
+  ros:
diff --git a/build/docker-compose.yml b/build/docker-compose.yml
@@ -28,11 +28,18 @@ services:
       resources:
         limits:
           memory: 16G
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [ gpu ]
     expose:
       - 2000
       - 2001
       - 2002
     environment:
+      - DISPLAY
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=all
       - XDG_RUNTIME_DIR=/tmp/runtime-carla
     networks:
       - carla
@@ -61,14 +68,22 @@ services:
     init: true
     tty: true
     shm_size: 2gb
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [ gpu ]
     #command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
     #command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
-    command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=paf23-carla-simulator-1 --track=MAP"
+    command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=carla-simulator --track=MAP"
 
     logging:
       driver: "local"
     environment:
       - DISPLAY
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=all
       - ROS_MASTER_URI=http://roscore:11311
       - CARLA_SIM_HOST=carla-simulator
       - ROS_HOSTNAME=agent

diff --git a/doc/01_general/03_commands.md b/doc/01_general/03_commands.md
@@ -1,5 +1,7 @@
 # ⌨️ Available commands
 
+A specific `b5` workflow for gpu installation in this project is specified in an issue comment: <https://github.com/una-auxme/paf23/issues/260#issuecomment-2196852330>
+
 ## General commands
 
 ### `b5 run`

diff --git a/doc/02_development/14_distributed_simulation.md b/doc/02_development/14_distributed_simulation.md
@@ -0,0 +1,62 @@
+# Distributed Simulation
+
+If you do not have enough compute resources, start the `carla-simulator-server` on a remote machine and execute the agent on your local machine.
+As far as we know, you need more than **10 GB of VRAM** to run the server and the agent on the same machine.
+
+## Author
+
+Julian Trommer and Lennart Luttkus
+
+## Date
+
+2024-06-28
+
+## Remote Machine Setup
+
+- Gain `ssh` access to the remote machine.
+- Download the [Carla leaderboard release](https://leaderboard.carla.org/get_started/#11-download-the-carla-leaderboard-package) to the remote PC
+- Extract the `.zip` file.
+- Start the server via `ssh` without rendering the spectator view:
+  - `/CARLA_Leaderboard_20/CarlaUE4.sh -RenderOffScreen`
+
+## Local Machine Setup
+
+- Get access to the remote machine via `ssh`.
+  - Start the server as described above.
+- Set the IP address of the remote machine as the CARLA server address in the distributed docker-compose files.
+- Start the agent on your local machine.
+
+### Ensure similarity between normal docker-compose and distributed docker-compose files
+
+Carefully check that there are no major differences between the `docker-compose.yml` and `docker-compose.distributed.yaml` files.
+The main difference is that the `carla-simulator` service is not started in the distributed version.
+
+### Set the `<ip-address>` of the carla simulator in docker-compose distributed files
+
+Replace the argument `<carla-server-ip-address>` with the ip address of the remote machine.
+You can find the ip address of the remote machine by executing the following command on the remote machine:
+
+```bash
+hostname -I
+```
+
+Typically, the ip address is the first one in the list.
+Addresses of the form `172.xxx.xxx.xxx` usually belong to local Docker bridge networks and are not the relevant address.
+
+Replace the ip-address in the following files:
+
+- `docker-compose.distributed.yaml`
+- `docker-compose.dev.distributed.yaml`
+
+### Start the agent on your local machine
+
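+```bash
+# leaderboard evaluation against the remote CARLA server
+b5 run_distributed
+# or: development launch (dev.launch) against the remote CARLA server
+b5 run_dev_distributed
+```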
+
+## How do you know that you do not have enough compute resources?
+
+Monitor the GPU utilization and memory usage while the simulation is running:
+
+```bash
+watch -n 1 nvidia-smi
+```