Skip to content

Commit

Permalink
Major fixes and improvements (#90)
Browse files Browse the repository at this point in the history
* docker login rework to store auth in task memory

* add retries on docker image pull

* add https redirect to requests

* docker pull rework

* change requests to sly Api

* bugfix

* add timeout for tasks

* pull + timeout

* bugfixes + timeout error

* agent options check

---------

Co-authored-by: Tony Bartsits <[email protected]>
  • Loading branch information
NikolaiPetukhov and tonybart1337 authored Oct 30, 2024
1 parent 4db1a02 commit ed4f876
Show file tree
Hide file tree
Showing 8 changed files with 326 additions and 223 deletions.
5 changes: 2 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,9 @@ RUN apt-get update \
COPY requirements.txt /workdir/requirements.txt
RUN pip install --no-cache-dir -r /workdir/requirements.txt

COPY . /workdir
COPY agent /workdir/agent

#ENV PYTHONPATH /workdir:/workdir/src:/workdir/supervisely_lib/worker_proto:$PYTHONPATH
WORKDIR /workdir/agent

ENTRYPOINT ["sh", "-c", "python -u /workdir/agent/main.py"]

ENTRYPOINT ["python", "-u", "/workdir/agent/main.py"]
22 changes: 16 additions & 6 deletions agent/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,22 @@ def init_envs():
try:
agent_utils.check_instance_version()
new_envs, new_volumes, ca_cert = agent_utils.updated_agent_options()
except agent_utils.AgentOptionsNotAvailable:
sly.logger.debug("Can not update agent options", exc_info=True)
sly.logger.warning(
"Can not update agent options. Agent will be started with current options"
)
return
except Exception as e:
if not agent_utils.is_agent_container_ready_to_continue():
sly.logger.error(
"Agent options are not available. Agent will be stopped. Please, check the connection to the server"
)
raise

if isinstance(e, agent_utils.AgentOptionsNotAvailable):
sly.logger.debug("Can not update agent options", exc_info=True)
sly.logger.warning(
"Can not update agent options. Agent will be started with current options"
)
return

raise

if new_envs.get(constants._FORCE_CPU_ONLY, "false") == "true":
runtime = "runc"
runtime_changed = _is_runtime_changed(runtime)
Expand Down
35 changes: 7 additions & 28 deletions agent/worker/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ def __init__(self):
self.docker_api = docker.from_env(
version="auto", timeout=constants.DOCKER_API_CALL_TIMEOUT()
)
self._docker_login()

self.logger.info("Agent is ready to get tasks.")
self.api = sly.AgentAPI(
Expand Down Expand Up @@ -451,9 +450,6 @@ def _stop_missed_containers(self, ecosystem_token):
},
)

def _docker_login(self):
agent_utils.docker_login(self.docker_api, self.logger)

def submit_log(self):
while True:
log_lines = self.log_queue.get_log_batch_nowait()
Expand Down Expand Up @@ -657,30 +653,13 @@ def update_base_layers(self):
)
image = f"{constants.SLY_APPS_DOCKER_REGISTRY()}/{image}"

try:
docker_utils.docker_pull_if_needed(
self.docker_api,
image,
policy=docker_utils.PullPolicy.ALWAYS,
logger=self.logger,
progress=False,
)
except DockerException as e:
if "no basic auth credentials" in str(e).lower():
self.logger.warn(
f"Failed to pull docker image '{image}'. Will try to login and pull again",
exc_info=True,
)
self._docker_login()
docker_utils.docker_pull_if_needed(
self.docker_api,
image,
policy=docker_utils.PullPolicy.ALWAYS,
logger=self.logger,
progress=False,
)
else:
raise e
docker_utils.docker_pull_if_needed(
self.docker_api,
image,
policy=docker_utils.PullPolicy.ALWAYS,
logger=self.logger,
progress=False,
)

self.logger.info(f"Docker image '{image}' has been pulled successfully")
pulled.append(image)
Expand Down
51 changes: 27 additions & 24 deletions agent/worker/agent_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,9 +616,14 @@ def get_agent_options(server_address=None, token=None, timeout=60) -> dict:
if token is None:
token = constants.TOKEN()

url = constants.PUBLIC_API_SERVER_ADDRESS() + "agents.options.info"
resp = requests.post(url=url, json={"token": token}, timeout=timeout)
if resp.status_code != requests.codes.ok: # pylint: disable=no-member
api = sly.Api(server_address=server_address)
method = "agents.options.info"

resp = api.post(
method,
data={"token": token},
)
if resp is None or resp.status_code != requests.codes.ok: # pylint: disable=no-member
try:
text = resp.text
except:
Expand All @@ -633,10 +638,11 @@ def get_agent_options(server_address=None, token=None, timeout=60) -> dict:
def get_instance_version(server_address=None, timeout=60):
if server_address is None:
server_address = constants.SERVER_ADDRESS()
url = constants.PUBLIC_API_SERVER_ADDRESS() + "instance.version"
resp = requests.get(url=url, timeout=timeout)
if resp.status_code != requests.codes.ok: # pylint: disable=no-member
if resp.status_code in (400, 401, 403, 404):

api = sly.Api(server_address=server_address)
resp = api.get("instance.version", {})
if resp is None or resp.status_code != requests.codes.ok: # pylint: disable=no-member
if resp is not None and resp.status_code in (400, 401, 403, 404):
return None
try:
text = resp.text
Expand Down Expand Up @@ -928,6 +934,20 @@ def _ca_cert_changed(ca_cert) -> str:
return cert_path


def is_agent_container_ready_to_continue():
container_info = get_container_info()
volumes = binds_to_volumes_dict(container_info.get("HostConfig", {}).get("Binds", []))

# should contain at least 3 volumes:
# docker socket
# agent data files
# apps data files
if len(volumes) < 3:
return False

return True


def get_options_changes(envs: dict, volumes: dict, ca_cert: str) -> Tuple[dict, dict, str]:
return _envs_changes(envs), _volumes_changes(volumes), _ca_cert_changed(ca_cert)

Expand Down Expand Up @@ -1080,20 +1100,3 @@ def maybe_update_runtime():
def convert_millicores_to_cpu_quota(millicores, cpu_period=100000):
cpu_quota = (millicores / 1000) * cpu_period
return int(cpu_quota)


def docker_login(docker_api, logger):
doc_logs = constants.DOCKER_LOGIN().split(",")
doc_pasws = constants.DOCKER_PASSWORD().split(",")
doc_regs = constants.DOCKER_REGISTRY().split(",")

for login, password, registry in zip(doc_logs, doc_pasws, doc_regs):
if registry:
try:
doc_login = docker_api.login(username=login, password=password, registry=registry)
logger.info(
"DOCKER_CLIENT_LOGIN_SUCCESS", extra={**doc_login, "registry": registry}
)
except Exception as e:
if not constants.OFFLINE_MODE():
raise e
Loading

0 comments on commit ed4f876

Please sign in to comment.