Skip to content

Commit

Permalink
Exception logging and cache cleaning (#34)
Browse files Browse the repository at this point in the history
* devcontainer settings example

* devcontainer example

* upd sdk

* upd sdk

* devcontainer comments

* added report parser to handle DialogWindowError

* debug msg

* process DialogWindowError

* fix immutable mutations bug

* added agent exceptions handler and docker pull exceptions

* rm import

* change exception code

* change host mkfile to agent

* launch configurations for devcontainer

* change paths from host to mounted agent paths

* added nvsmi info: capability, cuda, driver

* updates for development in devcontainer

* added comments and all app folders

* all folders size + comments

* add paths descriptions

* not reusable app data removal

* auto clean constants

* app sessions and agent logs clean by age functions

* added AppDirCleaner for session/pip files remove

* fix description

* added cleaning for terminated tasks

* files and folders separation

* comments

* files and folders separation

* added autocleaner task

* prevent raising exception in thread; fix bug

* added new clean tasks

* rm clean pip cache from app session cleaner

* pytest config; test autocleaner

* base test functions updated

* upd device capability

* just my code

* rename weights

* fix import

* change log condition

* age_limit typing

* AppDirCleaner inplace initialization

* rename node storage

* fix test path

* files removal for not existing in app_sessions

* node_storage nodes naming

* added removal of unknown sessions' files

* check unknown sessions

* fix ValueError bug

* upd auto clean range with env

* upd sdk

* upd sdk

* upd docs

* upd constant docs

* upd sdk

* prevent autoclean app files

* allow manual clean for users (update apps once a day)

* upd sdk

* mv docker exceptions to sdk

* fix comment

* add comment

* upd sdk
  • Loading branch information
TheoLisin authored Sep 13, 2023
1 parent 4e0f1df commit 0154685
Show file tree
Hide file tree
Showing 18 changed files with 1,034 additions and 72 deletions.
32 changes: 32 additions & 0 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Development image for working on the agent inside a devcontainer.
# It extends the published agent image with a non-root user and the
# directories that devcontainer.json bind-mounts into the container.
FROM supervisely/agent:latest

# Name of the non-root user; use the same name as "remoteUser" in devcontainer.json
ARG USERNAME=fedor
# UID of that user; to mirror the host user, execute `echo $UID` on the host
ARG USER_UID=1003
# GID of the user's group; defaults to the same number as the UID
ARG USER_GID=$USER_UID
# Despite its name this value is passed to `groupadd -g` below, i.e. it is the
# GID of the `docker` group; execute `getent group docker` on the host to get it
ARG DOCKER_UID=999

# Create the group and the user (with a home directory) using the ids above
RUN groupadd --gid $USER_GID $USERNAME \
&& useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
#
# [Optional] Add sudo support. Omit if you don't need to install software after connecting.
&& apt-get update \
&& apt-get install -y sudo \
&& echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
&& chmod 0440 /etc/sudoers.d/$USERNAME

# Create a `docker` group with the host's GID and add the user to it
# (presumably so the user can access the Docker socket mounted by devcontainer.json)
RUN groupadd -g $DOCKER_UID docker \
&& usermod -aG docker $USERNAME

# Mount target for AGENT_ROOT_DIR (see "mounts" in devcontainer.json)
RUN mkdir /sly_agent
# Mount target for SUPERVISELY_AGENT_FILES_CONTAINER (see "mounts" in devcontainer.json)
RUN mkdir -p /app/sly-files
# Uncomment the next line to uninstall the bundled SDK when debugging the SDK itself
# RUN pip3 uninstall -y supervisely

# If no USER is defined, all files changed in the devcontainer will have root as owner
USER $USERNAME
ENTRYPOINT ["/bin/bash"]
101 changes: 101 additions & 0 deletions .devcontainer/create-net.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@

# A pipeline returns the status of the last command that failed, not just of its last command
set -o pipefail

# ANSI color escape sequences. They are stored as literal backslash sequences
# (single quotes), so they only take effect when printed with `echo -e`.
WHITE='\033[1;37m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
# "No color": resets the terminal attributes
NC='\033[0m'

command_exists() {
  # Return the exit status of `command -v` for the given arguments;
  # both stdout and stderr are discarded, only the status matters.
  command -v "$@" &> /dev/null
}

# Command prefix used to re-run this script with elevated privileges.
# Stays empty when neither `sudo` nor `su` is available.
sudo_cmd=()

if command_exists sudo; then
  sudo_cmd=(sudo -E bash -c)
elif command_exists su; then
  sudo_cmd=(su -p -s bash -c)
fi

# Probe whether the current user is able to talk to the Docker daemon.
docker ps > /dev/null 2>&1
access_test_code=$?

# Docker is not accessible, we are not root, and an escalation tool exists:
# feed the text of this script to the escalation command and stop here.
# `${#sudo_cmd[@]}` is the number of array elements (the plain `${#sudo_cmd}`
# would be the string length of the first element only).
if [[ ${access_test_code} -ne 0 && ${EUID} -ne 0 && ${#sudo_cmd[@]} -ne 0 ]]; then
  # Read the script directly from its own path. Building the file name from
  # a %q-quoted path plus the script arguments makes `cat` fail as soon as
  # arguments are passed or the path contains characters that need quoting.
  cur_script=$(cat "${BASH_SOURCE[0]}")
  # Quoted array expansion keeps every word of the prefix intact.
  "${sudo_cmd[@]}" "${cur_script}"

  exit 0
fi

export SUPERVISELY_AGENT_IMAGE='supervisely/agent:dev'
# same as in devcontainer.json and debug.env
export AGENT_HOST_DIR="/home/fedor_lisin/agent_debug_dir/agent"

# from secret.env ↓
# Fill these in before running the script; the check below aborts if any is empty.
export ACCESS_TOKEN=''
export SERVER_ADDRESS=''
export DOCKER_REGISTRY=''
export DOCKER_LOGIN=''
export DOCKER_PASSWORD=''
# from secret.env ↑

secrets=("${ACCESS_TOKEN}" "${SERVER_ADDRESS}" "${DOCKER_REGISTRY}" "${DOCKER_LOGIN}" "${DOCKER_PASSWORD}")

# Abort with status 1 as soon as one required secret is empty.
for value in "${secrets[@]}"; do
  # The value is quoted so that an empty or multi-word secret is tested as
  # a single argument instead of vanishing or being word-split.
  if [ -z "${value}" ]; then
    # `-e` is required: the color variables hold literal backslash sequences.
    echo -e "${RED}One of the required secrets is not defined${NC}"
    exit 1
  fi
done


# Settings exported to the environment of child processes.
# NOTE(review): none of the four variables below is referenced again in this
# script (the files mount further down repeats the path literally) - confirm
# whether they are still needed here.
export DELETE_TASK_DIR_ON_FINISH='true'
export DELETE_TASK_DIR_ON_FAILURE='true'
export PULL_POLICY='ifnotpresent'
# same as in devcontainer.json and debug.env
export SUPERVISELY_AGENT_FILES=$(echo -n "/home/fedor_lisin/agent_debug_dir/files")


echo 'Supervisely Net is enabled, starting client...'
docker pull supervisely/sly-net-client:latest
# Create the per-token network; error output is discarded
# (presumably because the network may already exist)
docker network create "supervisely-net-${ACCESS_TOKEN}" 2> /dev/null
echo 'Remove existing Net Client container if any...'
# Error output is discarded, so nothing is printed when no such container exists
docker rm -fv $(docker ps -aq -f name="supervisely-net-client-${ACCESS_TOKEN}") 2> /dev/null
# Start the Net Client container detached, attached to the network created above.
# The three trailing positional arguments are the access token and two server endpoints.
# Do not put comment lines inside the command: it is one backslash-continued line.
docker run -it -d --name="supervisely-net-client-${ACCESS_TOKEN}" \
-e "SLY_NET_CLIENT_PING_INTERVAL=60" \
--privileged \
--network "supervisely-net-${ACCESS_TOKEN}" \
--restart=unless-stopped \
--log-driver=local \
--log-opt max-size=1m \
--log-opt max-file=1 \
--log-opt compress=false \
--cap-add NET_ADMIN \
--device /dev/net/tun:/dev/net/tun \
\
\
\
\
\
\
-v /var/run/docker.sock:/tmp/docker.sock:ro \
-v "${AGENT_HOST_DIR}:/app/sly" \
\
-v "/home/fedor_lisin/agent_debug_dir/files:/app/sly-files" \
"supervisely/sly-net-client:latest" \
"${ACCESS_TOKEN}" \
"https://dev.supervisely.com/net/" \
"dev.supervisely.com:51822"

# Must stay directly after `docker run`: captures its exit status
retVal=$?
if [ $retVal -ne 0 ]; then
echo -e "
${RED}Couldn't start Supervisely Net. Agent is running fine. Please, contact support and attach the log above${NC}"
fi

echo -e "${WHITE}============ You can close this terminal safely now ============${NC}"
33 changes: 33 additions & 0 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
// Devcontainer definition for agent development.
// Alternative to "build": use a prebuilt image instead of building the Dockerfile.
// "image": "agent_dev:latest",
"build": {
"dockerfile": "Dockerfile"
},
// Extra `docker run` arguments: expose all GPUs, share the host IPC and
// network namespaces, and add the NET_ADMIN capability.
"runArgs": [
"--gpus",
"all",
"--ipc=host",
"--net=host",
"--cap-add",
"NET_ADMIN"
],
"containerEnv": {
// Replace "{access token}" with the access token from secret.env
"DOCKER_NET": "supervisely-net-{access token}"
},
"mounts": [
// Docker socket of the host, mounted at the same path inside the container
"source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind",
// AGENT_HOST_DIR and AGENT_ROOT_DIR in env file; AGENT_ROOT_DIR should be created in Dockerfile
"source=/home/fedor_lisin/agent_debug_dir/agent,target=/sly_agent,type=bind",
// SUPERVISELY_AGENT_FILES and SUPERVISELY_AGENT_FILES_CONTAINER in env file; SUPERVISELY_AGENT_FILES_CONTAINER should be created in Dockerfile
"source=/home/fedor_lisin/agent_debug_dir/files,target=/app/sly-files,type=bind"
],
// Must be the same name as the USERNAME build argument in the Dockerfile
"remoteUser": "fedor",
"customizations": {
"vscode": {
// VS Code extensions listed for this devcontainer
"extensions": [
"ms-python.python"
]
}
}
}
26 changes: 26 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,32 @@
"env": {
"PYTHONPATH": "${workspaceFolder}:${PYTHONPATH}"
}
},
{
"name": "devcontainer main.py",
"type": "python",
"request": "launch",
"program": "agent/main.py",
"console": "integratedTerminal",
"justMyCode": false,
"env": {
"PYTHONPATH": "${workspaceFolder}:${PYTHONPATH}"
},
// app session creates __pycache__ after task ending;
// __pycache__ owner is root, so agent need to be launched with sudo setting
// inside devcontainer otherwise it crashes on task stop
"sudo": true
},
{
"name": "Python: Debug Tests",
"type": "python",
"request": "launch",
"program": "${file}",
"purpose": [
"debug-test"
],
"console": "integratedTerminal",
"justMyCode": false
}
]
}
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ RUN pip install requests-toolbelt>=1.0.0

RUN pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html

RUN pip install supervisely==6.72.71
RUN pip install supervisely==6.72.127
# for development
# RUN pip install git+https://github.com/supervisely/supervisely.git@minor-improvements

COPY . /workdir

Expand Down
39 changes: 34 additions & 5 deletions agent/worker/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
import docker
import json
import threading
from concurrent.futures import ThreadPoolExecutor, wait
import subprocess
import os
import supervisely_lib as sly
import uuid
import warnings
from concurrent.futures import ThreadPoolExecutor, wait
from pathlib import Path

import supervisely_lib as sly

from worker.agent_utils import TaskDirCleaner, AppDirCleaner

warnings.filterwarnings(action="ignore", category=UserWarning)

Expand Down Expand Up @@ -191,6 +195,12 @@ def stop_task(self, task_id):
"Task could not be stopped. Not found", extra={"task_id": task_id}
)

dir_task = str(Path(constants.AGENT_APP_SESSIONS_DIR()) / str(task_id))
if os.path.exists(dir_task):
cleaner = TaskDirCleaner(dir_task)
cleaner.allow_cleaning()
cleaner.clean()

self.logger.info(
"TASK_MISSED",
extra={
Expand Down Expand Up @@ -353,9 +363,7 @@ def follow_daemon(self, process_cls, name, sleep_sec=5):
self.api.simple_request(
"UpdateTelemetry",
sly.api_proto.Empty,
sly.api_proto.AgentInfo(
info=json.dumps({"gpu_info": gpu_info})
),
sly.api_proto.AgentInfo(info=json.dumps({"gpu_info": gpu_info})),
)
last_gpu_message = GPU_FREQ

Expand Down Expand Up @@ -389,6 +397,11 @@ def inf_loop(self):
sly.function_wrapper_external_logger, self.send_connect_info, self.logger
)
)
self.thread_list.append(
self.thread_pool.submit(
sly.function_wrapper_external_logger, self.task_clear_old_data, self.logger
)
)
if constants.DISABLE_TELEMETRY() is None:
self.thread_list.append(
self.thread_pool.submit(
Expand Down Expand Up @@ -429,3 +442,19 @@ def terminate_all_deamons():

if len(futures_statuses.not_done) != 0:
raise RuntimeError("AGENT: EXCEPTION IN BASE FUTURE !!!")

def task_clear_old_data(self):
    """Run the periodic auto-clean loop; this method never returns.

    Each iteration snapshots the ids currently present in ``self.task_pool``
    while holding ``self.task_pool_lock`` and passes that set to
    ``AppDirCleaner.auto_clean``. An exception raised by the cleaner is
    logged through ``self.logger`` and swallowed, so the loop keeps
    running. Consecutive iterations are separated by a 24-hour sleep.
    """
    seconds_per_day = 24 * 60 * 60
    dir_cleaner = AppDirCleaner(self.logger)

    while True:
        # Only the key snapshot is taken under the lock; the cleaning
        # itself works on the copy after the lock has been released.
        with self.task_pool_lock:
            known_task_ids = set(self.task_pool.keys())

        try:
            dir_cleaner.auto_clean(known_task_ids)
        except Exception as exc:
            # Deliberately not re-raised: a failed cleanup attempt must not
            # terminate this background loop.
            self.logger.exception(exc)

        time.sleep(seconds_per_day)
Loading

0 comments on commit 0154685

Please sign in to comment.