Skip to content

Commit

Permalink
Merge pull request #266 from una-auxme/260-feature-distribute-the-ser…
Browse files Browse the repository at this point in the history
…ver-and-agent-on-two-pcs

260 feature distribute the server and agent on two pcs
  • Loading branch information
ll7 authored Jun 28, 2024
2 parents 9c52db2 + b052df7 commit cf708dd
Show file tree
Hide file tree
Showing 7 changed files with 259 additions and 1 deletion.
10 changes: 10 additions & 0 deletions build/Taskfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ task:run_dev() {
docker:compose -f docker-compose.dev.yaml up
}

# Bring up the services defined in docker-compose.distributed.yaml.
# `xhost +local:docker` is run first; presumably this lets the containers
# connect to the host X server (the agent service is given DISPLAY and the
# /tmp/.X11-unix socket) -- TODO confirm.
task:run_distributed() {
xhost +local:docker
docker:compose -f docker-compose.distributed.yaml up
}

# Bring up the services defined in docker-compose.dev.distributed.yaml.
# `xhost +local:docker` is run first; presumably this lets the containers
# connect to the host X server (the agent service is given DISPLAY and the
# /tmp/.X11-unix socket) -- TODO confirm.
task:run_dev_distributed() {
xhost +local:docker
docker:compose -f docker-compose.dev.distributed.yaml up
}

task:restart() {
container="$1"
docker:compose restart "${container:-agent}"
Expand Down
77 changes: 77 additions & 0 deletions build/docker-compose.dev.distributed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Development setup for running the agent on this machine against a CARLA
# server that runs elsewhere. No carla-simulator service is defined in this
# file; the agent is pointed at the server through CARLA_SIM_HOST instead.
version: "3"

services:
  # Linters; each one mounts the repository root and checks it.
  flake8:
    image: alpine/flake8
    command: .
    volumes:
      - ../:/apps

  comlipy:
    build: docker/comlipy
    command: .
    volumes:
      - ../:/apps

  mdlint:
    image: peterdavehello/markdownlint:0.32.2
    command: markdownlint .
    volumes:
      - ../:/md

  # ROS master; the agent reaches it as http://roscore:11311 on the ros network.
  roscore:
    image: ros:noetic
    command: roscore
    environment:
      - ROS_MASTER_URI=http://roscore:11311
      - ROS_HOSTNAME=roscore
    expose:
      - "11311"
    networks:
      - ros

  agent:
    build:
      dockerfile: build/docker/agent/Dockerfile
      args:
        - USER_UID=${DOCKER_HOST_UNIX_UID:-1000}
        - USER_GID=${DOCKER_HOST_UNIX_GID:-1000}
      context: ../
    init: true
    tty: true
    shm_size: 2gb
    # Reserve an NVIDIA GPU for the agent container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    #command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
    command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
    # command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=paf23-carla-simulator-1 --track=MAP"

    logging:
      driver: "local"
    environment:
      - DISPLAY
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=all
      - ROS_MASTER_URI=http://roscore:11311
      # Placeholder: replace with the IP address of the machine that runs
      # the CARLA server before starting this setup.
      - CARLA_SIM_HOST=<carla-server-ip-address>
      - ROS_HOSTNAME=agent
      - XDG_RUNTIME_DIR=/tmp/runtime-carla
    volumes:
      - /tmp/.X11-unix:/tmp/.X11-unix
      # If you change the volume here, also change the copy command in the
      # Dockerfile (presumably build/docker/agent/Dockerfile, the one this
      # service builds -- verify).
      # The repository root is mounted exactly once; a second identical
      # `../:/workspace/` entry was redundant and has been dropped.
      - ../:/workspace
      # mount git config for dvc
      - ../.gitconfig:/home/carla/.gitconfig
    networks:
      - carla
      - ros

networks:
  carla:
  ros:
15 changes: 15 additions & 0 deletions build/docker-compose.dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,18 @@ services:
resources:
limits:
memory: 16G
reservations:
devices:
- driver: nvidia
capabilities: [ gpu ]
expose:
- 2000
- 2001
- 2002
environment:
- DISPLAY
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=all
- XDG_RUNTIME_DIR=/tmp/runtime-carla
networks:
- carla
Expand Down Expand Up @@ -61,6 +68,12 @@ services:
init: true
tty: true
shm_size: 2gb
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [ gpu ]
#command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
# command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=paf23-carla-simulator-1 --track=MAP"
Expand All @@ -69,6 +82,8 @@ services:
driver: "local"
environment:
- DISPLAY
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=all
- ROS_MASTER_URI=http://roscore:11311
- CARLA_SIM_HOST=carla-simulator
- ROS_HOSTNAME=agent
Expand Down
77 changes: 77 additions & 0 deletions build/docker-compose.distributed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Leaderboard-evaluation setup for running the agent on this machine against
# a CARLA server that runs elsewhere. No carla-simulator service is defined
# in this file; the agent is pointed at the server through CARLA_SIM_HOST.
version: "3"

services:
  # Linters; each one mounts the repository root and checks it.
  flake8:
    image: alpine/flake8
    command: .
    volumes:
      - ../:/apps

  comlipy:
    build: docker/comlipy
    command: .
    volumes:
      - ../:/apps

  mdlint:
    image: peterdavehello/markdownlint:0.32.2
    command: markdownlint .
    volumes:
      - ../:/md

  # ROS master; the agent reaches it as http://roscore:11311 on the ros network.
  roscore:
    image: ros:noetic
    command: roscore
    environment:
      - ROS_MASTER_URI=http://roscore:11311
      - ROS_HOSTNAME=roscore
    expose:
      - "11311"
    networks:
      - ros

  agent:
    build:
      dockerfile: build/docker/agent/Dockerfile
      args:
        - USER_UID=${DOCKER_HOST_UNIX_UID:-1000}
        - USER_GID=${DOCKER_HOST_UNIX_GID:-1000}
      context: ../
    init: true
    tty: true
    shm_size: 2gb
    # Reserve an NVIDIA GPU for the agent container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    #command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
    #command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
    # The server address is taken from CARLA_SIM_HOST (set under
    # `environment` below) so it is configured in one place only. `$$` is
    # Compose's escape for a literal `$`; bash inside the container performs
    # the expansion. Writing the `<...>` placeholder directly into this
    # command would be parsed by bash as I/O redirection.
    command: >-
      bash -c "sleep 10
      && sudo chown -R carla:carla ../code/
      && sudo chmod -R a+w ../code/
      && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py
      --debug=0
      --routes=/opt/leaderboard/data/routes_devtest.xml
      --agent=/workspace/code/agent/src/agent/agent.py
      --host=$$CARLA_SIM_HOST
      --track=MAP"

    logging:
      driver: "local"
    environment:
      - DISPLAY
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=all
      - ROS_MASTER_URI=http://roscore:11311
      # Placeholder: replace with the IP address of the machine that runs
      # the CARLA server before starting this setup.
      - CARLA_SIM_HOST=<carla-server-ip-address>
      - ROS_HOSTNAME=agent
      - XDG_RUNTIME_DIR=/tmp/runtime-carla
    volumes:
      - /tmp/.X11-unix:/tmp/.X11-unix
      # If you change the volume here, also change the copy command in the
      # Dockerfile (presumably build/docker/agent/Dockerfile, the one this
      # service builds -- verify).
      # The repository root is mounted exactly once; a second identical
      # `../:/workspace/` entry was redundant and has been dropped.
      - ../:/workspace
      # mount git config for dvc
      - ../.gitconfig:/home/carla/.gitconfig
    networks:
      - carla
      - ros

networks:
  carla:
  ros:
17 changes: 16 additions & 1 deletion build/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,18 @@ services:
resources:
limits:
memory: 16G
reservations:
devices:
- driver: nvidia
capabilities: [ gpu ]
expose:
- 2000
- 2001
- 2002
environment:
- DISPLAY
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=all
- XDG_RUNTIME_DIR=/tmp/runtime-carla
networks:
- carla
Expand Down Expand Up @@ -61,14 +68,22 @@ services:
init: true
tty: true
shm_size: 2gb
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [ gpu ]
#command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
#command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=paf23-carla-simulator-1 --track=MAP"
command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=carla-simulator --track=MAP"

logging:
driver: "local"
environment:
- DISPLAY
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=all
- ROS_MASTER_URI=http://roscore:11311
- CARLA_SIM_HOST=carla-simulator
- ROS_HOSTNAME=agent
Expand Down
2 changes: 2 additions & 0 deletions doc/01_general/03_commands.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# ⌨️ Available commands

A specific `b5` workflow for gpu installation in this project is specified in an issue comment: <https://github.com/una-auxme/paf23/issues/260#issuecomment-2196852330>

## General commands

### `b5 run`
Expand Down
62 changes: 62 additions & 0 deletions doc/02_development/14_distributed_simulation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Distributed Simulation

If you do not have enough compute resources on a single machine, start the `carla-simulator-server` on a remote machine and execute the agent on your local machine.
As far as we know, you need more than **10 GB of VRAM** to run the server and the agent on the same machine.

## Author

Julian Trommer and Lennart Luttkus

## Date

2024-06-28

## Remote Machine Setup

- Gain `ssh` access to the remote machine.
- Download the [Carla leaderboard release](https://leaderboard.carla.org/get_started/#11-download-the-carla-leaderboard-package) to the remote PC
- Extract the `.zip` file.
- Start the server via `ssh` without rendering the spectator view:
- `/CARLA_Leaderboard_20/CarlaUE4.sh -RenderOffScreen`

## Local Machine Setup

- get access to the remote machine via `ssh`
- Start the server as described above
- Set the IP address of the remote machine as the CARLA server address in the distributed docker-compose files (see the sections below)
- start the agent on your local machine

### Ensure similarity between normal docker-compose and distributed docker-compose files

Carefully check that there are no major differences between the `docker-compose.yml` and `docker-compose.distributed.yaml` files.
The main difference is that the `carla-simulator` service is not started in the distributed version, because the simulator runs on the remote machine.

### Set the `<ip-address>` of the carla simulator in docker-compose distributed files

Replace the argument `<carla-server-ip-address>` with the ip address of the remote machine.
You can find the ip address of the remote machine by executing the following command on the remote machine:

```bash
hostname -I
```

Typically, the ip address is the first one in the list.
An address of the form `172.xxx.xxx.xxx` usually belongs to a local Docker network interface and is not the relevant address.

Replace the ip-address in the following files:

- `build/docker-compose.distributed.yaml`
- `build/docker-compose.dev.distributed.yaml`

### Start the agent on your local machine

```bash
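# Run the leaderboard evaluation against the remote server:
b5 run_distributed
# Or start the development launch against the remote server instead:
b5 run_dev_distributed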
```

## How do you know that you do not have enough compute resources?

```bash
watch -n 1 nvidia-smi
```

0 comments on commit cf708dd

Please sign in to comment.