From 7c5e02ded7a92e5936df4028361e32bb0e0fb12e Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 01:17:49 +0000 Subject: [PATCH 001/114] otaclient._utils --- src/otaclient/{utils.py => _utils.py} | 0 src/otaclient/errors.py | 2 +- src/otaclient/main.py | 2 +- src/otaclient/ota_core.py | 2 +- tests/test_otaclient/test_utils.py | 2 +- 5 files changed, 4 insertions(+), 4 deletions(-) rename src/otaclient/{utils.py => _utils.py} (100%) diff --git a/src/otaclient/utils.py b/src/otaclient/_utils.py similarity index 100% rename from src/otaclient/utils.py rename to src/otaclient/_utils.py diff --git a/src/otaclient/errors.py b/src/otaclient/errors.py index 2aa7cc791..52b5ce23e 100644 --- a/src/otaclient/errors.py +++ b/src/otaclient/errors.py @@ -20,7 +20,7 @@ from typing import ClassVar from otaclient._types import FailureType -from otaclient.utils import get_traceback +from otaclient._utils import get_traceback @unique diff --git a/src/otaclient/main.py b/src/otaclient/main.py index bc34fdc9c..2c3a82b08 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -85,7 +85,7 @@ async def launch_otaclient_grpc_server(): def main() -> None: from otaclient._logging import configure_logging from otaclient.configs.cfg import cfg, ecu_info - from otaclient.utils import check_other_otaclient, create_otaclient_rundir + from otaclient._utils import check_other_otaclient, create_otaclient_rundir # configure logging before any code being executed configure_logging() diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 452487ea7..bfbd2afe3 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -57,7 +57,7 @@ get_standby_slot_creator, ) from otaclient.create_standby.common import DeltaBundle -from otaclient.utils import get_traceback, wait_and_log +from otaclient._utils import get_traceback, wait_and_log from otaclient_common.common import ensure_otaproxy_start from otaclient_common.downloader import ( EMPTY_FILE_SHA256, 
diff --git a/tests/test_otaclient/test_utils.py b/tests/test_otaclient/test_utils.py index cde15e0b6..cf23594f9 100644 --- a/tests/test_otaclient/test_utils.py +++ b/tests/test_otaclient/test_utils.py @@ -20,7 +20,7 @@ import pytest -from otaclient.utils import wait_and_log +from otaclient._utils import wait_and_log logger = logging.getLogger(__name__) From 141dba94ee56331fe8845fcdc044d09ac269da67 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 01:22:48 +0000 Subject: [PATCH 002/114] add otaclient_common.shm_status module --- src/otaclient_common/shm_status.py | 154 +++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 src/otaclient_common/shm_status.py diff --git a/src/otaclient_common/shm_status.py b/src/otaclient_common/shm_status.py new file mode 100644 index 000000000..2ee397d26 --- /dev/null +++ b/src/otaclient_common/shm_status.py @@ -0,0 +1,154 @@ +# Copyright 2022 TIER IV, INC. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A lib for sharing status between processes. + +shared memory layout: + +rwlock(1byte) | hmac-sha3_512(64bytes) | msg_len(4bytes,big) | msg(bytes) +In which, msg is pickled python object. 
+""" + + +from __future__ import annotations + +import hmac +import multiprocessing.shared_memory as mp_shm +import pickle +import time +from typing import Generic + +from otaclient_common.typing import T + +HASH_ALG = "sha3_512" +DEFAULT_KEY_LEN = 64 # bytes + +RWLOCK_LEN = 1 # byte +HMAC_SHA3_512_LEN = 64 # bytes +PAYLOAD_LEN_BYTES = 4 # bytes +MIN_ENCAP_MSG_LEN = RWLOCK_LEN + HMAC_SHA3_512_LEN + PAYLOAD_LEN_BYTES + +RWLOCK_LOCKED = b"\xab" +RWLOCK_OPEN = b"\x54" + + +class MPSharedStatusReader(Generic[T]): + + def __init__( + self, *, name: str, key: bytes, max_retry: int = 6, retry_interval: int = 1 + ) -> None: + for _ in range(max_retry): + try: + self._shm = shm = mp_shm.SharedMemory(name=name, create=False) + break + except Exception: + print("retrying ...") + time.sleep(retry_interval) + else: + raise ValueError("failed to connect share memory") + + self.mem_size = size = shm.size + self.msg_max_size = size - MIN_ENCAP_MSG_LEN + self._key = key + + def atexit(self) -> None: + self._shm.close() + + def sync_msg(self) -> T: + buffer = self._shm.buf + + # check if we can read + _cursor = 0 + rwlock = bytes(buffer[_cursor:RWLOCK_LEN]) + if rwlock != RWLOCK_OPEN: + if rwlock == RWLOCK_LOCKED: + raise ValueError("write in progress, abort") + raise ValueError(f"invalid input_msg: wrong rwlock bytes: {rwlock=}") + _cursor += RWLOCK_LEN + + # parsing the msg + input_hmac = bytes(buffer[_cursor : _cursor + HMAC_SHA3_512_LEN]) + _cursor += HMAC_SHA3_512_LEN + + _payload_len_bytes = bytes(buffer[_cursor : _cursor + PAYLOAD_LEN_BYTES]) + payload_len = int.from_bytes(_payload_len_bytes, "big", signed=False) + _cursor += PAYLOAD_LEN_BYTES + + if payload_len > self.msg_max_size: + raise ValueError(f"invalid msg: {payload_len=} > {self.msg_max_size}") + + payload = bytes(buffer[_cursor : _cursor + payload_len]) + payload_hmac = hmac.digest(key=self._key, msg=payload, digest=HASH_ALG) + + if hmac.compare_digest(payload_hmac, input_hmac): + return pickle.loads(payload) + 
raise ValueError("failed to validate input msg") + + +class MPSharedStatusWriter(Generic[T]): + + def __init__( + self, + *, + name: str | None = None, + size: int = 0, + create: bool = False, + msg_max_size: int | None = None, + key: bytes, + ) -> None: + if create: + _msg_max_size = size - MIN_ENCAP_MSG_LEN + if _msg_max_size < 0: + raise ValueError(f"{size=} < {MIN_ENCAP_MSG_LEN=}") + self._shm = shm = mp_shm.SharedMemory(name=name, size=size, create=True) + self.mem_size = shm.size + else: + self._shm = shm = mp_shm.SharedMemory(name=name, create=False) + self.mem_size = size = shm.size + _msg_max_size = size - MIN_ENCAP_MSG_LEN + if _msg_max_size < 0: + shm.close() + raise ValueError(f"{size=} < {MIN_ENCAP_MSG_LEN=}") + + self.name = shm.name + self._key = key + self.msg_max_size = min(_msg_max_size, msg_max_size or float("infinity")) + + def atexit(self, *, unlink: bool = False) -> None: + self._shm.close() + if unlink: + self._shm.unlink() + + def write_msg(self, obj: T) -> None: + buffer = self._shm.buf + _pickled = pickle.dumps(obj) + _pickled_len = len(_pickled) + + if _pickled_len > self.msg_max_size: + raise ValueError(f"exceed {self.msg_max_size=}: {_pickled_len=}") + + _hmac = hmac.digest(key=self._key, msg=_pickled, digest=HASH_ALG) + msg = b"".join( + [ + RWLOCK_LOCKED, + _hmac, + _pickled_len.to_bytes(PAYLOAD_LEN_BYTES, "big", signed=False), + _pickled, + ] + ) + msg_len = len(msg) + if msg_len > self.mem_size: + raise ValueError(f"{msg_len=} > {self.mem_size=}") + + buffer[:msg_len] = msg + buffer[:1] = RWLOCK_OPEN From 8617f5fe3526647619e608d0176addc977e242aa Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 01:24:37 +0000 Subject: [PATCH 003/114] utils: add SharedOTAClientStatusWriter/Reader types --- src/otaclient/_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/otaclient/_utils.py b/src/otaclient/_utils.py index e98a3aabd..e2ef1690d 100644 --- a/src/otaclient/_utils.py +++ b/src/otaclient/_utils.py 
@@ -26,7 +26,9 @@ from pathlib import Path from typing import Callable, Protocol +from otaclient._types import OTAClientStatus from otaclient_common._io import read_str_from_file, write_str_to_file_atomic +from otaclient_common.shm_status import MPSharedStatusReader, MPSharedStatusWriter from otaclient_common.typing import StrOrPath logger = logging.getLogger(__name__) @@ -88,3 +90,11 @@ def create_otaclient_rundir(run_dir: StrOrPath = "/run/otaclient"): def get_traceback(exc: Exception, *, splitter: str = "\n") -> str: """Format the traceback as string.""" return splitter.join(traceback.format_exception(type(exc), exc, exc.__traceback__)) + + +class SharedOTAClientStatusWriter(MPSharedStatusWriter[OTAClientStatus]): + """Util for writing OTAClientStatus to shm.""" + + +class SharedOTAClientStatusReader(MPSharedStatusReader[OTAClientStatus]): + """Util for reading OTAClientStatus from shm.""" From 9564b52e805bcdb0de4b21c42844686bcea2571b Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 01:28:20 +0000 Subject: [PATCH 004/114] ecu_tracker: use new shm_status --- src/otaclient/grpc/api_v2/ecu_tracker.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/otaclient/grpc/api_v2/ecu_tracker.py b/src/otaclient/grpc/api_v2/ecu_tracker.py index 7a3a8a3c8..609029566 100644 --- a/src/otaclient/grpc/api_v2/ecu_tracker.py +++ b/src/otaclient/grpc/api_v2/ecu_tracker.py @@ -16,10 +16,11 @@ from __future__ import annotations +import contextlib import asyncio import logging -from otaclient._status_monitor import OTAClientStatusCollector +from otaclient._utils import SharedOTAClientStatusReader from otaclient.configs import ECUContact from otaclient.configs.cfg import cfg, ecu_info from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage @@ -34,10 +35,10 @@ class ECUTracker: def __init__( self, ecu_status_storage: ECUStatusStorage, - *, - local_status_collector: OTAClientStatusCollector, + /, + local_ecu_status_reader: 
SharedOTAClientStatusReader, ) -> None: - self._local_status_collector = local_status_collector + self._local_ecu_status_reader = local_ecu_status_reader self._ecu_status_storage = ecu_status_storage self._polling_waiter = self._ecu_status_storage.get_polling_waiter() @@ -68,8 +69,8 @@ async def _polling_direct_subecu_status(self, ecu_contact: ECUContact): async def _polling_local_ecu_status(self): """Task entry for loop polling local ECU status.""" while not self._debug_ecu_status_polling_shutdown_event.is_set(): - status_report = self._local_status_collector.otaclient_status - if status_report: + with contextlib.suppress(Exception): + status_report = self._local_ecu_status_reader.sync_msg() await self._ecu_status_storage.update_from_local_ecu(status_report) await self._polling_waiter() From 1ffbf7d4db39ddb42cccedfb7c6e87c3c273614e Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 02:15:42 +0000 Subject: [PATCH 005/114] _status_monitor: integrate shm --- src/otaclient/_status_monitor.py | 28 +++++++++++++++++++++--- src/otaclient/grpc/api_v2/ecu_tracker.py | 2 +- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index f6686b6cd..0d8201512 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -17,6 +17,7 @@ from __future__ import annotations import atexit +import contextlib import logging import queue import time @@ -34,11 +35,13 @@ UpdateProgress, UpdateTiming, ) +from otaclient._utils import SharedOTAClientStatusWriter logger = logging.getLogger(__name__) _otaclient_shutdown = False _status_report_queue: queue.Queue | None = None +_shm_status: SharedOTAClientStatusWriter | None = None def _global_shutdown(): @@ -48,6 +51,9 @@ def _global_shutdown(): if _status_report_queue: _status_report_queue.put_nowait(TERMINATE_SENTINEL) + if _shm_status: + _shm_status.atexit(unlink=True) + atexit.register(_global_shutdown) @@ -220,6 +226,7 @@ 
def _on_update_meta(status_storage: OTAClientStatus, payload: SetUpdateMetaRepor # TERMINATE_SENTINEL = cast(StatusReport, object()) +SHM_PUSH_INTERVAL = 1 class OTAClientStatusCollector: @@ -227,17 +234,26 @@ class OTAClientStatusCollector: def __init__( self, msg_queue: queue.Queue[StatusReport], + shm_status: SharedOTAClientStatusWriter, *, min_collect_interval: int = 1, - min_push_interval: int = 1, + shm_push_interval: int = SHM_PUSH_INTERVAL, ) -> None: self.min_collect_interval = min_collect_interval - self.min_push_interval = min_push_interval + self.shm_push_interval = shm_push_interval self._input_queue = msg_queue self._status = None + self._shm_status = shm_status + self._next_shm_push = 0 + + # register the shm_status to global for cleanup atexit + global _shm_status + _shm_status = shm_status + + def load_report(self, report: StatusReport) -> None: + _now = int(time.time()) - def load_report(self, report: StatusReport): if self._status is None: self._status = OTAClientStatus() status_storage = self._status @@ -269,6 +285,12 @@ def load_report(self, report: StatusReport): if isinstance(payload, SetUpdateMetaReport): return _on_update_meta(status_storage, payload) + # ------ push status to shm ------ # + if _now > self._next_shm_push: + with contextlib.suppress(Exception): + self._shm_status.write_msg(self._status) + self._next_shm_push = _now + self.shm_push_interval + def _status_collector_thread(self) -> None: """Main entry of status monitor working thread.""" while not _otaclient_shutdown: diff --git a/src/otaclient/grpc/api_v2/ecu_tracker.py b/src/otaclient/grpc/api_v2/ecu_tracker.py index 609029566..83795e780 100644 --- a/src/otaclient/grpc/api_v2/ecu_tracker.py +++ b/src/otaclient/grpc/api_v2/ecu_tracker.py @@ -16,8 +16,8 @@ from __future__ import annotations -import contextlib import asyncio +import contextlib import logging from otaclient._utils import SharedOTAClientStatusReader From dfb231ffa1de6d9f1963541e2adb114665e821da Mon Sep 17 00:00:00 
2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 02:18:14 +0000 Subject: [PATCH 006/114] ecu_tracker: cleanup at exit --- src/otaclient/grpc/api_v2/ecu_tracker.py | 26 +++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/otaclient/grpc/api_v2/ecu_tracker.py b/src/otaclient/grpc/api_v2/ecu_tracker.py index 83795e780..13d16a766 100644 --- a/src/otaclient/grpc/api_v2/ecu_tracker.py +++ b/src/otaclient/grpc/api_v2/ecu_tracker.py @@ -17,6 +17,7 @@ from __future__ import annotations import asyncio +import atexit import contextlib import logging @@ -29,6 +30,20 @@ logger = logging.getLogger(__name__) +_otaclient_shutdown = False +_shm_status: SharedOTAClientStatusReader | None = None + + +def _global_shutdown(): + global _otaclient_shutdown + _otaclient_shutdown = True + + if _shm_status: + _shm_status.atexit() + + +atexit.register(_global_shutdown) + class ECUTracker: @@ -42,15 +57,12 @@ def __init__( self._ecu_status_storage = ecu_status_storage self._polling_waiter = self._ecu_status_storage.get_polling_waiter() - # launch ECU trackers for all defined ECUs - # NOTE: _debug_ecu_status_polling_shutdown_event is for test only, - # allow us to stop background task without changing codes. - # In normal running this event will never be set. 
- self._debug_ecu_status_polling_shutdown_event = asyncio.Event() + global _shm_status + _shm_status = local_ecu_status_reader async def _polling_direct_subecu_status(self, ecu_contact: ECUContact): """Task entry for loop polling one subECU's status.""" - while not self._debug_ecu_status_polling_shutdown_event.is_set(): + while not _otaclient_shutdown: try: _ecu_resp = await OTAClientCall.status_call( ecu_contact.ecu_id, @@ -68,7 +80,7 @@ async def _polling_direct_subecu_status(self, ecu_contact: ECUContact): async def _polling_local_ecu_status(self): """Task entry for loop polling local ECU status.""" - while not self._debug_ecu_status_polling_shutdown_event.is_set(): + while not _otaclient_shutdown: with contextlib.suppress(Exception): status_report = self._local_ecu_status_reader.sync_msg() await self._ecu_status_storage.update_from_local_ecu(status_report) From 82f945ce3cd5ebd8644d2559a744b363f810d3be Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 03:12:22 +0000 Subject: [PATCH 007/114] ota_core: control_flag now becomes mp_sync.Event, cleanup unused code --- src/otaclient/ota_core.py | 41 +++++++++------------------------------ 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index bfbd2afe3..3c0ee6cc7 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -18,6 +18,7 @@ import errno import json import logging +import multiprocessing.synchronize as mp_sync import os import threading import time @@ -50,6 +51,7 @@ UpdateProgressReport, ) from otaclient._types import FailureType, OTAStatus, UpdatePhase, UpdateRequestV2 +from otaclient._utils import get_traceback, wait_and_log from otaclient.boot_control import BootControllerProtocol, get_boot_controller from otaclient.configs.cfg import cfg, ecu_info from otaclient.create_standby import ( @@ -57,7 +59,6 @@ get_standby_slot_creator, ) from otaclient.create_standby.common import DeltaBundle -from 
otaclient._utils import get_traceback, wait_and_log from otaclient_common.common import ensure_otaproxy_start from otaclient_common.downloader import ( EMPTY_FILE_SHA256, @@ -79,30 +80,6 @@ class OTAClientError(Exception): ... -class OTAClientControlFlags: - """ - When self ECU's otaproxy is enabled, all the child ECUs of this ECU - and self ECU OTA update will depend on its otaproxy, we need to - control when otaclient can start its downloading/reboot with considering - whether local otaproxy is started/required. - """ - - def __init__(self) -> None: - self._can_reboot = threading.Event() - - def is_can_reboot_flag_set(self) -> bool: - return self._can_reboot.is_set() - - def wait_can_reboot_flag(self): - self._can_reboot.wait() - - def set_can_reboot_flag(self): - self._can_reboot.set() - - def clear_can_reboot_flag(self): - self._can_reboot.clear() - - def _download_exception_handler(_fut: Future[Any]) -> bool: """Parse the exception raised by a downloading task. @@ -165,7 +142,7 @@ def __init__( upper_otaproxy: str | None = None, boot_controller: BootControllerProtocol, create_standby_cls: Type[StandbySlotCreatorProtocol], - control_flags: OTAClientControlFlags, + control_flag: mp_sync.Event, status_report_queue: Queue[StatusReport], session_id: str, ) -> None: @@ -210,7 +187,7 @@ def __init__( proxies["http"] = upper_otaproxy # ------ init updater implementation ------ # - self._control_flags = control_flags + self._control_flag = control_flag self._boot_controller = boot_controller self._create_standby_cls = create_standby_cls @@ -562,7 +539,7 @@ def _execute_update(self): ) ) wait_and_log( - flag=self._control_flags._can_reboot, + flag=self._control_flag, message="permit reboot flag", log_func=logger.info, ) @@ -611,20 +588,20 @@ class OTAClient: boot_controller: boot control instance create_standby_cls: type of create standby slot mechanism to use my_ecu_id: ECU id of the device running this otaclient instance - control_flags: flags used by otaclient and 
ota_service stub for synchronization + control_flag: flags used by otaclient and ota_service stub for synchronization proxy: upper otaproxy URL """ def __init__( self, *, - control_flags: OTAClientControlFlags, + control_flag: mp_sync.Event, proxy: Optional[str] = None, status_report_queue: Queue[StatusReport], ) -> None: self.my_ecu_id = ecu_info.ecu_id self.proxy = proxy - self.control_flags = control_flags + self.control_flag = control_flag self._status_report_queue = status_report_queue self._live_ota_status = OTAStatus.INITIALIZED @@ -770,7 +747,7 @@ def update(self, request: UpdateRequestV2) -> None: ca_chains_store=self.ca_chains_store, boot_controller=self.boot_controller, create_standby_cls=self.create_standby_cls, - control_flags=self.control_flags, + control_flag=self.control_flag, upper_otaproxy=self.proxy, status_report_queue=self._status_report_queue, session_id=new_session_id, From 3a518b228c1c252345c94bf86626698874dbbd91 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 03:20:35 +0000 Subject: [PATCH 008/114] ota_core: use session_id from request --- src/otaclient/ota_core.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 3c0ee6cc7..6e99af2fe 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -19,7 +19,6 @@ import json import logging import multiprocessing.synchronize as mp_sync -import os import threading import time from concurrent.futures import Future @@ -50,7 +49,13 @@ StatusReport, UpdateProgressReport, ) -from otaclient._types import FailureType, OTAStatus, UpdatePhase, UpdateRequestV2 +from otaclient._types import ( + FailureType, + OTAStatus, + RollbackRequestV2, + UpdatePhase, + UpdateRequestV2, +) from otaclient._utils import get_traceback, wait_and_log from otaclient.boot_control import BootControllerProtocol, get_boot_controller from otaclient.configs.cfg import cfg, ecu_info @@ -664,17 
+669,6 @@ def __init__( self.started = True logger.info("otaclient started") - def _gen_session_id(self, update_version: str = "") -> str: - """Generate a unique session_id for the new OTA session. - - token schema: - --<4bytes_hex> - """ - _time_factor = str(int(time.time())) - _random_factor = os.urandom(4).hex() - - return f"{update_version}-{_time_factor}-{_random_factor}" - def _on_failure( self, exc: Exception, @@ -720,7 +714,7 @@ def update(self, request: UpdateRequestV2) -> None: if self.is_busy: return - new_session_id = self._gen_session_id(request.version) + new_session_id = request.session_id self._status_report_queue.put_nowait( StatusReport( payload=OTAStatusChangeReport( @@ -761,11 +755,11 @@ def update(self, request: UpdateRequestV2) -> None: failure_type=e.failure_type, ) - def rollback(self) -> None: + def rollback(self, request: RollbackRequestV2) -> None: if self.is_busy: return - new_session_id = self._gen_session_id("___rollback") + new_session_id = request.session_id self._status_report_queue.put_nowait( StatusReport( payload=OTAStatusChangeReport( From c83d17c707e19226cb25ebce7e0174f8d65971f9 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 03:21:13 +0000 Subject: [PATCH 009/114] move gen_session_id to otaclient._utils --- src/otaclient/_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/otaclient/_utils.py b/src/otaclient/_utils.py index e2ef1690d..05a99d0f6 100644 --- a/src/otaclient/_utils.py +++ b/src/otaclient/_utils.py @@ -98,3 +98,15 @@ class SharedOTAClientStatusWriter(MPSharedStatusWriter[OTAClientStatus]): class SharedOTAClientStatusReader(MPSharedStatusReader[OTAClientStatus]): """Util for reading OTAClientStatus from shm.""" + + +def gen_session_id(update_version: str) -> str: + """Generate a unique session_id for the new OTA session. 
+ + token schema: + --<4bytes_hex> + """ + _time_factor = str(int(time.time())) + _random_factor = os.urandom(4).hex() + + return f"{update_version}-{_time_factor}-{_random_factor}" From 19e4e34b73f92ac49d814230aaab18f4edc071a7 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 03:21:33 +0000 Subject: [PATCH 010/114] _types: add types for IPC --- src/otaclient/_types.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index 2f54a954b..14e9fc27f 100644 --- a/src/otaclient/_types.py +++ b/src/otaclient/_types.py @@ -20,7 +20,6 @@ from typing import ClassVar, Optional from _otaclient_version import __version__ - from otaclient.configs.cfg import ecu_info from otaclient_common.typing import StrEnum @@ -123,6 +122,26 @@ class OTAClientStatus: failure_traceback: str = "" +# +# ------ OTA requests IPC ------ # +# + + +class ReqHandleRes(StrEnum): + ACCEPT = "ACCEPT" + REJECT_BUSY = "REJECT_BUSY" + """The request has been rejected due to otaclient is busy.""" + REJECT_OTHER = "REJECT_OTHER" + """The request has been rejected for other reason.""" + + +@dataclass +class ReqResponse: + res: ReqHandleRes + session_id: str + msg: str = "" + + @dataclass class UpdateRequestV2: """Compatible with OTA API version 2.""" @@ -130,7 +149,10 @@ class UpdateRequestV2: version: str url_base: str cookies_json: str + session_id: str class RollbackRequestV2: """Compatbile with OTA API version 2.""" + + session_id: str From 4d9b43d6aa5cd3535affc584a37b06985e2f1a70 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 03:52:39 +0000 Subject: [PATCH 011/114] api_v2.servicer: integrate IPC --- src/otaclient/_types.py | 9 +- src/otaclient/grpc/api_v2/servicer.py | 211 ++++++++++++++------------ 2 files changed, 119 insertions(+), 101 deletions(-) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index 14e9fc27f..0b0274276 100644 --- a/src/otaclient/_types.py +++ 
b/src/otaclient/_types.py @@ -136,14 +136,17 @@ class ReqHandleRes(StrEnum): @dataclass -class ReqResponse: +class IPCResponse: res: ReqHandleRes session_id: str msg: str = "" +class IPCRequest: ... + + @dataclass -class UpdateRequestV2: +class UpdateRequestV2(IPCRequest): """Compatible with OTA API version 2.""" version: str @@ -152,7 +155,7 @@ class UpdateRequestV2: session_id: str -class RollbackRequestV2: +class RollbackRequestV2(IPCRequest): """Compatbile with OTA API version 2.""" session_id: str diff --git a/src/otaclient/grpc/api_v2/servicer.py b/src/otaclient/grpc/api_v2/servicer.py index 6b233795b..ecb5b04f3 100644 --- a/src/otaclient/grpc/api_v2/servicer.py +++ b/src/otaclient/grpc/api_v2/servicer.py @@ -18,22 +18,32 @@ import asyncio import logging -import time +import multiprocessing.queues as mp_queue +import multiprocessing.synchronize as mp_sync from concurrent.futures import ThreadPoolExecutor from functools import partial +from queue import Empty from typing import Dict +from otaclient._types import ( + IPCRequest, + IPCResEnum, + IPCResponse, + RollbackRequestV2, + UpdateRequestV2, +) +from otaclient._utils import gen_session_id from otaclient.configs import ECUContact from otaclient.configs.cfg import cfg, ecu_info, proxy_info from otaclient.grpc._otaproxy_ctx import OTAProxyContext, OTAProxyLauncher from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage -from otaclient.grpc.api_v2.types import convert_from_apiv2_update_request -from otaclient.ota_core import OTAClient, OTAClientControlFlags from otaclient_api.v2 import types as api_types from otaclient_api.v2.api_caller import ECUNoResponse, OTAClientCall logger = logging.getLogger(__name__) +WAIT_FOR_ACK_TIMEOUT = 6 # seconds + class OTAClientAPIServicer: """Handlers for otaclient service API. 
@@ -45,10 +55,10 @@ class OTAClientAPIServicer: def __init__( self, - otaclient_inst: OTAClient, ecu_status_storage: ECUStatusStorage, + ipc_queue: mp_queue.Queue[IPCRequest | IPCResponse], *, - control_flag: OTAClientControlFlags, + control_flag: mp_sync.Event, executor: ThreadPoolExecutor, ): self._executor = executor @@ -61,8 +71,8 @@ def __init__( self.listen_port = cfg.OTA_API_SERVER_PORT self.my_ecu_id = ecu_info.ecu_id - self._otaclient_control_flags = control_flag - self._otaclient_inst = otaclient_inst + self._otaclient_control_flag = control_flag + self._ipc_queue = ipc_queue self._ecu_status_storage = ecu_status_storage self._polling_waiter = self._ecu_status_storage.get_polling_waiter() @@ -77,70 +87,73 @@ def __init__( executor=executor, subprocess_ctx=OTAProxyContext(), ) - asyncio.create_task(self._otaproxy_lifecycle_managing()) - asyncio.create_task(self._otaclient_control_flags_managing()) + asyncio.create_task(self._otaclient_control_flag_managing()) else: # if otaproxy is not enabled, no dependency relationship will be formed, # always allow local otaclient to reboot - self._otaclient_control_flags.set_can_reboot_flag() + self._otaclient_control_flag.set() # internal - async def _otaproxy_lifecycle_managing(self): - """Task entry for managing otaproxy's launching/shutdown. - - NOTE: cache_dir cleanup is handled here, when all ECUs are in SUCCESS ota_status, - cache_dir will be removed. - """ - otaproxy_last_launched_timestamp = 0 - while not self._debug_status_checking_shutdown_event.is_set(): - cur_timestamp = int(time.time()) - any_requires_network = self._ecu_status_storage.any_requires_network - if self._otaproxy_launcher.is_running: - # NOTE: do not shutdown otaproxy too quick after it just starts! - # If otaproxy just starts less than seconds, - # skip the shutdown this time. 
- if ( - not any_requires_network - and cur_timestamp - > otaproxy_last_launched_timestamp + self.OTAPROXY_SHUTDOWN_DELAY - ): - await self._otaproxy_launcher.stop() - otaproxy_last_launched_timestamp = 0 - else: # otaproxy is not running - if any_requires_network: - await self._otaproxy_launcher.start(init_cache=False) - otaproxy_last_launched_timestamp = cur_timestamp - # when otaproxy is not running and any_requires_network is False, - # cleanup the cache dir when all ECUs are in SUCCESS ota_status - elif self._ecu_status_storage.all_success: - self._otaproxy_launcher.cleanup_cache_dir() - await self._polling_waiter() - - async def _otaclient_control_flags_managing(self): + async def _otaclient_control_flag_managing(self): """Task entry for set/clear otaclient control flags. Prevent self ECU from rebooting when their is at least one ECU under UPDATING ota_status. """ while not self._debug_status_checking_shutdown_event.is_set(): - _can_reboot = self._otaclient_control_flags.is_can_reboot_flag_set() + _can_reboot = self._otaclient_control_flag.is_set() if not self._ecu_status_storage.in_update_child_ecus_id: if not _can_reboot: logger.info( "local otaclient can reboot as no child ECU is in UPDATING ota_status" ) - self._otaclient_control_flags.set_can_reboot_flag() + self._otaclient_control_flag.set() else: if _can_reboot: logger.info( f"local otaclient cannot reboot as child ECUs {self._ecu_status_storage.in_update_child_ecus_id}" " are in UPDATING ota_status" ) - self._otaclient_control_flags.clear_can_reboot_flag() + self._otaclient_control_flag.clear() await self._polling_waiter() - # API stub + # API servicer + + def _local_update(self, request: UpdateRequestV2) -> api_types.UpdateResponseEcu: + self._ipc_queue.put_nowait(request) + try: + _req_response = self._ipc_queue.get(timeout=WAIT_FOR_ACK_TIMEOUT) + assert isinstance(_req_response, IPCResponse), "unexpected msg" + assert ( + _req_response.session_id == request.session_id + ), "mismatched session_id" + 
+ if _req_response.res == IPCResEnum.ACCEPT: + return api_types.UpdateResponseEcu( + ecu_id=self.my_ecu_id, + result=api_types.FailureType.NO_FAILURE, + ) + else: + logger.error( + f"local otaclient doesn't accept upate request: {_req_response.msg}" + ) + return api_types.UpdateResponseEcu( + ecu_id=self.my_ecu_id, + result=api_types.FailureType.RECOVERABLE, + ) + except AssertionError as e: + logger.error(f"local otaclient response with unexpected msg: {e!r}") + return api_types.UpdateResponseEcu( + ecu_id=self.my_ecu_id, + result=api_types.FailureType.RECOVERABLE, + ) + except Exception as e: # failed to get ACK from otaclient within timeout + logger.error(f"local otaclient failed to ACK request: {e!r}") + return api_types.UpdateResponseEcu( + ecu_id=self.my_ecu_id, + result=api_types.FailureType.UNRECOVERABLE, + ) async def update( self, request: api_types.UpdateRequest @@ -190,35 +203,19 @@ async def update( # second: dispatch update request to local if required by incoming request if update_req_ecu := request.find_ecu(self.my_ecu_id): - if not self._otaclient_inst.started: - logger.error("otaclient is not running, abort") - response.add_ecu( - api_types.UpdateResponseEcu( - ecu_id=self.my_ecu_id, - result=api_types.FailureType.UNRECOVERABLE, - ) - ) - elif self._otaclient_inst.is_busy: - response.add_ecu( - api_types.UpdateResponseEcu( - ecu_id=self.my_ecu_id, - result=api_types.FailureType.RECOVERABLE, - ) - ) - else: - self._run_in_executor( - self._otaclient_inst.update, - convert_from_apiv2_update_request(update_req_ecu), - ).add_done_callback( - lambda _: logger.info("update execution thread finished") + new_session_id = gen_session_id(update_req_ecu.version) + _resp = self._local_update( + UpdateRequestV2( + version=update_req_ecu.version, + url_base=update_req_ecu.url, + cookies_json=update_req_ecu.cookies, + session_id=new_session_id, ) + ) + + if _resp.result == api_types.FailureType.NO_FAILURE: update_acked_ecus.add(self.my_ecu_id) - 
response.add_ecu( - api_types.UpdateResponseEcu( - ecu_id=self.my_ecu_id, - result=api_types.FailureType.NO_FAILURE, - ) - ) + response.add_ecu(_resp) # finally, trigger ecu_status_storage entering active mode if needed if update_acked_ecus: @@ -230,6 +227,45 @@ async def update( ) return response + def _local_rollback( + self, rollback_request: RollbackRequestV2 + ) -> api_types.RollbackResponseEcu: + self._ipc_queue.put_nowait(rollback_request) + try: + _req_response = self._ipc_queue.get(timeout=WAIT_FOR_ACK_TIMEOUT) + assert isinstance( + _req_response, IPCResponse + ), f"unexpected response: {type(_req_response)}" + assert ( + _req_response.session_id == rollback_request.session_id + ), "mismatched session_id" + + if _req_response.res == IPCResEnum.ACCEPT: + return api_types.RollbackResponseEcu( + ecu_id=self.my_ecu_id, + result=api_types.FailureType.NO_FAILURE, + ) + else: + logger.error( + f"local otaclient doesn't accept upate request: {_req_response.msg}" + ) + return api_types.RollbackResponseEcu( + ecu_id=self.my_ecu_id, + result=api_types.FailureType.RECOVERABLE, + ) + except AssertionError as e: + logger.error(f"local otaclient response with unexpected msg: {e!r}") + return api_types.RollbackResponseEcu( + ecu_id=self.my_ecu_id, + result=api_types.FailureType.RECOVERABLE, + ) + except Exception as e: # failed to get ACK from otaclient within timeout + logger.error(f"local otaclient failed to ACK request: {e!r}") + return api_types.RollbackResponseEcu( + ecu_id=self.my_ecu_id, + result=api_types.FailureType.UNRECOVERABLE, + ) + async def rollback( self, request: api_types.RollbackRequest ) -> api_types.RollbackResponse: @@ -276,31 +312,10 @@ async def rollback( # second: dispatch rollback request to local if required if request.find_ecu(self.my_ecu_id): - if not self._otaclient_inst.started: - logger.error("otaclient is not running, abort") - response.add_ecu( - api_types.RollbackResponseEcu( - ecu_id=self.my_ecu_id, - 
result=api_types.FailureType.UNRECOVERABLE, - ) - ) - elif self._otaclient_inst.is_busy: - response.add_ecu( - api_types.RollbackResponseEcu( - ecu_id=self.my_ecu_id, - result=api_types.FailureType.RECOVERABLE, - ) - ) - else: - self._run_in_executor(self._otaclient_inst.rollback).add_done_callback( - lambda _: logger.info("rollback execution thread finished") - ) - response.add_ecu( - api_types.RollbackResponseEcu( - ecu_id=self.my_ecu_id, - result=api_types.FailureType.NO_FAILURE, - ) - ) + new_session_id = gen_session_id("__rollback") + response.add_ecu( + self._local_rollback(RollbackRequestV2(session_id=new_session_id)) + ) return response async def status(self, _=None) -> api_types.StatusResponse: From 747d247627625488ae8b91e6cbfe90b743f0a3c1 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 03:52:56 +0000 Subject: [PATCH 012/114] minor fix --- src/otaclient/_types.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index 0b0274276..4fd7c1b9c 100644 --- a/src/otaclient/_types.py +++ b/src/otaclient/_types.py @@ -127,7 +127,7 @@ class OTAClientStatus: # -class ReqHandleRes(StrEnum): +class IPCResEnum(StrEnum): ACCEPT = "ACCEPT" REJECT_BUSY = "REJECT_BUSY" """The request has been rejected due to otaclient is busy.""" @@ -137,7 +137,7 @@ class ReqHandleRes(StrEnum): @dataclass class IPCResponse: - res: ReqHandleRes + res: IPCResEnum session_id: str msg: str = "" @@ -155,6 +155,7 @@ class UpdateRequestV2(IPCRequest): session_id: str +@dataclass class RollbackRequestV2(IPCRequest): """Compatbile with OTA API version 2.""" From 89e3ca0c010d1effa09c3808d2bd1fb04b0af537 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 03:54:02 +0000 Subject: [PATCH 013/114] types: cleanup unused --- src/otaclient/grpc/api_v2/types.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/otaclient/grpc/api_v2/types.py b/src/otaclient/grpc/api_v2/types.py index 
9386966ac..8ec31a9d9 100644 --- a/src/otaclient/grpc/api_v2/types.py +++ b/src/otaclient/grpc/api_v2/types.py @@ -101,13 +101,3 @@ def convert_to_apiv2_status(_in: OTAClientStatus) -> api_types.StatusResponseEcu base_res.update_status = update_status return base_res - - -def convert_from_apiv2_update_request( - _in: api_types.UpdateRequestEcu, -) -> UpdateRequestV2: - return UpdateRequestV2( - version=_in.version, - url_base=_in.url, - cookies_json=_in.cookies, - ) From 3ae0f4be6784ab22de08def02dbf6fd36ab30d94 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 09:40:54 +0000 Subject: [PATCH 014/114] ota_core: implement IPC interface --- src/otaclient/ota_core.py | 79 +++++++++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 6e99af2fe..72fd96094 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -18,6 +18,7 @@ import errno import json import logging +import multiprocessing.queues as mp_queue import multiprocessing.synchronize as mp_sync import threading import time @@ -27,8 +28,8 @@ from http import HTTPStatus from json.decoder import JSONDecodeError from pathlib import Path -from queue import Queue -from typing import Any, Iterator, Optional, Type +from queue import Empty, Queue +from typing import Any, Iterator, NoReturn, Optional, Type from urllib.parse import urlparse import requests.exceptions as requests_exc @@ -51,6 +52,9 @@ ) from otaclient._types import ( FailureType, + IPCRequest, + IPCResEnum, + IPCResponse, OTAStatus, RollbackRequestV2, UpdatePhase, @@ -81,6 +85,9 @@ DOWNLOAD_STATS_REPORT_BATCH = 300 DOWNLOAD_REPORT_INTERVAL = 1 # second +OP_CHECK_INTERVAL = 1 # second +HOLD_REQ_HANDLING_ON_ACK_REQUEST = 8 # seconds + class OTAClientError(Exception): ... @@ -711,9 +718,6 @@ def update(self, request: UpdateRequestV2) -> None: NOTE that update API will not raise any exceptions. 
The failure information is available via status API. """ - if self.is_busy: - return - new_session_id = request.session_id self._status_report_queue.put_nowait( StatusReport( @@ -733,7 +737,6 @@ def update(self, request: UpdateRequestV2) -> None: module=__name__, ) - self._live_ota_status = OTAStatus.UPDATING _OTAUpdater( version=request.version, raw_url_base=request.url_base, @@ -756,9 +759,6 @@ def update(self, request: UpdateRequestV2) -> None: ) def rollback(self, request: RollbackRequestV2) -> None: - if self.is_busy: - return - new_session_id = request.session_id self._status_report_queue.put_nowait( StatusReport( @@ -768,17 +768,74 @@ def rollback(self, request: RollbackRequestV2) -> None: session_id=new_session_id, ) ) - logger.info(f"start new OTA rollback session: {new_session_id=}") + logger.info(f"start new OTA rollback session: {new_session_id=}") try: logger.info("[rollback] entering...") self._live_ota_status = OTAStatus.ROLLBACKING _OTARollbacker(boot_controller=self.boot_controller).execute() except ota_errors.OTAError as e: - self._live_ota_status = OTAStatus.FAILURE self._on_failure( e, ota_status=OTAStatus.FAILURE, failure_reason=e.get_failure_reason(), failure_type=e.failure_type, ) + + def main(self, op_queue: mp_queue.Queue[IPCRequest | IPCResponse]) -> NoReturn: + """Main loop of ota_core process.""" + _allow_request_after = 0 + while True: + _now = int(time.time()) + try: + request = op_queue.get(timeout=OP_CHECK_INTERVAL) + except Empty: + continue + + if _now < _allow_request_after or self.is_busy: + _err_msg = ( + f"otaclient is busy at {self._live_ota_status} or " + f"request too quickly({_allow_request_after=}), " + f"reject {request}" + ) + logger.warning(_err_msg) + op_queue.put_nowait( + IPCResponse( + res=IPCResEnum.REJECT_BUSY, + msg=_err_msg, + session_id=request.session_id, + ) + ) + elif isinstance(request, UpdateRequestV2): + self._live_ota_status = OTAStatus.UPDATING + self.update(request) + op_queue.put_nowait( + 
IPCResponse( + res=IPCResEnum.ACCEPT, + session_id=request.session_id, + ) + ) + _allow_request_after = _now + HOLD_REQ_HANDLING_ON_ACK_REQUEST + elif ( + isinstance(request, RollbackRequestV2) + and self._live_ota_status == OTAStatus.SUCCESS + ): + self._live_ota_status = OTAStatus.FAILURE + self.rollback(request) + op_queue.put_nowait( + IPCResponse( + res=IPCResEnum.ACCEPT, + session_id=request.session_id, + ) + ) + _allow_request_after = _now + HOLD_REQ_HANDLING_ON_ACK_REQUEST + else: + _err_msg = f"request is invalid: {request=}, {self._live_ota_status=}" + logger.error(_err_msg) + op_queue.put_nowait( + IPCResponse( + res=IPCResEnum.REJECT_OTHER, + msg=_err_msg, + session_id=request.session_id, + ) + ) From 844862808ec12ed96433250de52361de7593fa6e Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 09:41:09 +0000 Subject: [PATCH 015/114] minor update --- src/otaclient/_types.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index 4fd7c1b9c..32f2e4f10 100644 --- a/src/otaclient/_types.py +++ b/src/otaclient/_types.py @@ -142,7 +142,9 @@ class IPCResponse: msg: str = "" -class IPCRequest: ... 
+@dataclass +class IPCRequest: + session_id: str @dataclass @@ -152,11 +154,8 @@ class UpdateRequestV2(IPCRequest): version: str url_base: str cookies_json: str - session_id: str @dataclass class RollbackRequestV2(IPCRequest): """Compatbile with OTA API version 2.""" - - session_id: str From e473dcc625c68fe8f2f3b2c6d343e003038e95b0 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 10:00:49 +0000 Subject: [PATCH 016/114] api_v2.servicer: nolonger in charge of launching/shutting down otaproxy --- src/otaclient/grpc/api_v2/servicer.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/src/otaclient/grpc/api_v2/servicer.py b/src/otaclient/grpc/api_v2/servicer.py index ecb5b04f3..bf8c8d377 100644 --- a/src/otaclient/grpc/api_v2/servicer.py +++ b/src/otaclient/grpc/api_v2/servicer.py @@ -20,10 +20,6 @@ import logging import multiprocessing.queues as mp_queue import multiprocessing.synchronize as mp_sync -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from queue import Empty -from typing import Dict from otaclient._types import ( IPCRequest, @@ -35,7 +31,6 @@ from otaclient._utils import gen_session_id from otaclient.configs import ECUContact from otaclient.configs.cfg import cfg, ecu_info, proxy_info -from otaclient.grpc._otaproxy_ctx import OTAProxyContext, OTAProxyLauncher from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage from otaclient_api.v2 import types as api_types from otaclient_api.v2.api_caller import ECUNoResponse, OTAClientCall @@ -59,13 +54,7 @@ def __init__( ipc_queue: mp_queue.Queue[IPCRequest | IPCResponse], *, control_flag: mp_sync.Event, - executor: ThreadPoolExecutor, ): - self._executor = executor - self._run_in_executor = partial( - asyncio.get_running_loop().run_in_executor, executor - ) - self.sub_ecus = ecu_info.secondaries self.listen_addr = ecu_info.ip_addr self.listen_port = cfg.OTA_API_SERVER_PORT @@ -83,10 +72,6 @@ def __init__( # In normal 
running this event will never be set. self._debug_status_checking_shutdown_event = asyncio.Event() if proxy_info.enable_local_ota_proxy: - self._otaproxy_launcher = OTAProxyLauncher( - executor=executor, - subprocess_ctx=OTAProxyContext(), - ) asyncio.create_task(self._otaclient_control_flag_managing()) else: # if otaproxy is not enabled, no dependency relationship will be formed, @@ -163,7 +148,7 @@ async def update( response = api_types.UpdateResponse() # first: dispatch update request to all directly connected subECUs - tasks: Dict[asyncio.Task, ECUContact] = {} + tasks: dict[asyncio.Task, ECUContact] = {} for ecu_contact in self.sub_ecus: if not request.if_contains_ecu(ecu_contact.ecu_id): continue @@ -273,7 +258,7 @@ async def rollback( response = api_types.RollbackResponse() # first: dispatch rollback request to all directly connected subECUs - tasks: Dict[asyncio.Task, ECUContact] = {} + tasks: dict[asyncio.Task, ECUContact] = {} for ecu_contact in self.sub_ecus: if not request.if_contains_ecu(ecu_contact.ecu_id): continue From dec832a365890352bb04077ca6a3b5baba99216b Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 10:07:43 +0000 Subject: [PATCH 017/114] api_v2.ecu_status: expose any_requires_network and all_ecus_succeeded flags --- src/otaclient/grpc/api_v2/ecu_status.py | 38 ++++++++++++++++++------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/otaclient/grpc/api_v2/ecu_status.py b/src/otaclient/grpc/api_v2/ecu_status.py index 5809e6e1b..f2f21fb8b 100644 --- a/src/otaclient/grpc/api_v2/ecu_status.py +++ b/src/otaclient/grpc/api_v2/ecu_status.py @@ -43,16 +43,18 @@ import logging import time from itertools import chain -from typing import Dict, Iterable, Optional, Set, TypeVar +from typing import TYPE_CHECKING, Dict, Iterable, Optional from otaclient._types import OTAClientStatus from otaclient.configs.cfg import cfg, ecu_info from otaclient.grpc.api_v2.types import convert_to_apiv2_status from otaclient_api.v2 
import types as api_types +from otaclient_common.typing import T logger = logging.getLogger(__name__) -T = TypeVar("T") +if TYPE_CHECKING: + import multiprocessing.synchronize as mp_sync # NOTE(20230522): # ECU will be treated as disconnected if we cannot get in touch with it @@ -83,7 +85,12 @@ def discard(self, value: T): class ECUStatusStorage: - def __init__(self) -> None: + def __init__( + self, + *, + all_ecus_succeeded: mp_sync.Event, + any_requires_network: mp_sync.Event, + ) -> None: self.my_ecu_id = ecu_info.ecu_id self._writer_lock = asyncio.Lock() # ECU status storage @@ -127,10 +134,12 @@ def __init__(self) -> None: self.in_update_ecus_id = set() self.in_update_child_ecus_id = set() - self.any_requires_network = False self.success_ecus_id = set() - self.all_success = False + + # exposed external events + self.any_requires_network: mp_sync.Event = any_requires_network + self.all_success: mp_sync.Event = all_ecus_succeeded # property update task # NOTE: _debug_properties_update_shutdown_event is for test only, @@ -213,7 +222,7 @@ async def _generate_overall_status_report(self): ) # check if any ECUs in the tracked tracked active ECUs set require network - self.any_requires_network = any( + if any( ( status.requires_network for status in chain( @@ -222,7 +231,10 @@ async def _generate_overall_status_report(self): if status.ecu_id in self._tracked_active_ecus and status.ecu_id not in lost_ecus ) - ) + ): + self.any_requires_network.set() + else: + self.any_requires_network.clear() # check if all tracked active_ota_ecus are in SUCCESS ota_status _old_all_success, _old_success_ecus_id = self.all_success, self.success_ecus_id @@ -236,7 +248,11 @@ async def _generate_overall_status_report(self): and status.ecu_id not in lost_ecus } # NOTE: all_success doesn't count the lost ECUs - self.all_success = len(self.success_ecus_id) == len(self._tracked_active_ecus) + if len(self.success_ecus_id) == len(self._tracked_active_ecus): + self.all_success.set() + else: + 
self.all_success.clear() + if _new_success_ecu := self.success_ecus_id.difference(_old_success_ecus_id): logger.info(f"new succeeded ECU(s) detected: {_new_success_ecu}") if not _old_all_success and self.all_success: @@ -311,7 +327,7 @@ async def update_from_local_ecu(self, local_status: OTAClientStatus): self._all_ecus_status_v2[ecu_id] = convert_to_apiv2_status(local_status) self._all_ecus_last_contact_timestamp[ecu_id] = cur_timestamp - async def on_ecus_accept_update_request(self, ecus_accept_update: Set[str]): + async def on_ecus_accept_update_request(self, ecus_accept_update: set[str]): """Update overall ECU status report directly on ECU(s) accept OTA update request. for the ECUs that accepts OTA update request, we: @@ -335,8 +351,8 @@ async def on_ecus_accept_update_request(self, ecus_accept_update: Set[str]): self.in_update_ecus_id.update(ecus_accept_update) self.in_update_child_ecus_id = self.in_update_ecus_id - {self.my_ecu_id} - self.any_requires_network = True - self.all_success = False + self.any_requires_network.set() + self.all_success.clear() self.success_ecus_id -= ecus_accept_update self.active_ota_update_present.set() From 72cd46c2933d80635b56ef7162a04d8fa3c7d915 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 10:09:23 +0000 Subject: [PATCH 018/114] WIP: main --- src/otaclient/main.py | 268 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 219 insertions(+), 49 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index 2c3a82b08..5fb9d06e7 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -17,75 +17,174 @@ from __future__ import annotations import asyncio +import atexit import logging -from concurrent.futures import ThreadPoolExecutor +import multiprocessing as mp +import multiprocessing.shared_memory as mp_shm +import multiprocessing.synchronize as mp_sync +import secrets +import shutil +import threading +import time +from functools import partial +from multiprocessing.queues import 
Queue as mp_Queue +from pathlib import Path from queue import Queue +from typing import NoReturn from otaclient import __version__ +from otaclient._status_monitor import OTAClientStatusCollector +from otaclient._types import IPCRequest, IPCResponse +from otaclient._utils import SharedOTAClientStatusReader, SharedOTAClientStatusWriter logger = logging.getLogger(__name__) +HEALTH_CHECK_INTERAVL = 6 # seconds +OTAPROXY_CHECK_INTERVAL = 3 +OTAPROXY_MIN_STARTUP_TIME = 60 +"""Keep otaproxy running at least 60 seconds after startup.""" +OTA_CACHE_DIR_CHECK_INTERVAL = 60 +SHUTDOWN_AFTER_CORE_EXIT = 16 +SHUTDOWN_AFTER_API_SERVER_EXIT = 3 -async def create_otaclient_grpc_server(): - import grpc.aio +_global_shutdown: bool = False - from otaclient._status_monitor import OTAClientStatusCollector - from otaclient.configs.cfg import cfg, ecu_info, proxy_info - from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage - from otaclient.grpc.api_v2.ecu_tracker import ECUTracker - from otaclient.grpc.api_v2.servicer import OTAClientAPIServicer - from otaclient.ota_core import OTAClient, OTAClientControlFlags - from otaclient_api.v2 import otaclient_v2_pb2_grpc as v2_grpc - from otaclient_api.v2.api_stub import OtaClientServiceV2 - - _executor = ThreadPoolExecutor(thread_name_prefix="otaclient_main") - _control_flag = OTAClientControlFlags() - - status_report_queue = Queue() - status_collector = OTAClientStatusCollector(status_report_queue) - - ecu_status_storage = ECUStatusStorage() - ecu_tracker = ECUTracker( - ecu_status_storage, - local_status_collector=status_collector, + +def _on_global_shutdown(): + global _global_shutdown + _global_shutdown = True + + +def ota_core_process( + shm_writer_factory, + control_flag: mp_sync.Event, + op_queue: mp_Queue[IPCRequest | IPCResponse], +): + from otaclient._logging import configure_logging + from otaclient.configs.cfg import proxy_info + from otaclient.ota_core import OTAClient + + atexit.register(_on_global_shutdown) + shm_writer = 
shm_writer_factory() + atexit.register(shm_writer.atexit) + + configure_logging() + + _local_status_report_queue = Queue() + _status_monitor = OTAClientStatusCollector( + msg_queue=_local_status_report_queue, + shm_status=shm_writer, ) - ecu_tracker.start() + _status_monitor.start() - otaclient_inst = OTAClient( - control_flags=_control_flag, + _ota_core = OTAClient( + control_flag=control_flag, proxy=proxy_info.get_proxy_for_local_ota(), - status_report_queue=status_report_queue, + status_report_queue=_local_status_report_queue, ) - status_collector.start() + _ota_core.main(op_queue) - service_stub = OTAClientAPIServicer( - otaclient_inst, - ecu_status_storage, - control_flag=_control_flag, - executor=_executor, - ) - ota_client_service_v2 = OtaClientServiceV2(service_stub) - server = grpc.aio.server() - v2_grpc.add_OtaClientServiceServicer_to_server( - server=server, servicer=ota_client_service_v2 - ) - server.add_insecure_port(f"{ecu_info.ip_addr}:{cfg.OTA_API_SERVER_PORT}") - return server +def grpc_server_process( + shm_reader_factory, + control_flag: mp_sync.Event, + op_queue: mp_Queue[IPCRequest | IPCResponse], + all_ecus_succeeded: mp_sync.Event, + any_requires_network: mp_sync.Event, +) -> NoReturn: # type: ignore + from otaclient._logging import configure_logging + + configure_logging() + atexit.register(_on_global_shutdown) + + shm_reader = shm_reader_factory() + atexit.register(shm_reader.atexit) + + async def _grpc_server_launcher(): + import grpc.aio + + from otaclient.configs.cfg import cfg, ecu_info + from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage + from otaclient.grpc.api_v2.ecu_tracker import ECUTracker + from otaclient.grpc.api_v2.servicer import OTAClientAPIServicer + from otaclient_api.v2 import otaclient_v2_pb2_grpc as v2_grpc + from otaclient_api.v2.api_stub import OtaClientServiceV2 + + ecu_status_storage = ECUStatusStorage( + all_ecus_succeeded=all_ecus_succeeded, + any_requires_network=any_requires_network, + ) + ecu_tracker 
= ECUTracker(ecu_status_storage, shm_reader) + ecu_tracker.start() + + api_servicer = OTAClientAPIServicer( + ecu_status_storage, + op_queue, + control_flag=control_flag, + ) + ota_client_service_v2 = OtaClientServiceV2(api_servicer) + + server = grpc.aio.server() + v2_grpc.add_OtaClientServiceServicer_to_server( + server=server, servicer=ota_client_service_v2 + ) + server.add_insecure_port(f"{ecu_info.ip_addr}:{cfg.OTA_API_SERVER_PORT}") + + await server.start() + try: + await server.wait_for_termination() + finally: + await server.stop(1) + + asyncio.run(_grpc_server_launcher()) -async def launch_otaclient_grpc_server(): - server = await create_otaclient_grpc_server() - await server.start() - try: - await server.wait_for_termination() - finally: - await server.stop(1) + +def otaproxy_control_thread( + *, + any_requires_network: mp_sync.Event, + all_ecus_succeeded: mp_sync.Event, +) -> None: # pragma: no cover + from ota_proxy.config import config + + # TODO: use the otaproxy base_dir config from otaclient.configs + ota_cache_dir = Path(config.BASE_DIR) + next_ota_cache_dir_checkpoint = 0 + + while not _global_shutdown: + time.sleep(OTAPROXY_CHECK_INTERVAL) + + _otaproxy_running = otaproxy_running() + _otaproxy_should_run = any_requires_network.is_set() + + if not _otaproxy_should_run and not _otaproxy_running: + _now = time.time() + if ( + _now > next_ota_cache_dir_checkpoint + and all_ecus_succeeded.is_set() + and ota_cache_dir.is_dir() + ): + logger.info( + "all tracked ECUs are in SUCCESS OTA status, cleanup ota cache dir ..." 
+ ) + next_ota_cache_dir_checkpoint = _now + OTA_CACHE_DIR_CHECK_INTERVAL + shutil.rmtree(ota_cache_dir) + + elif _otaproxy_should_run and not _otaproxy_running: + start_otaproxy_server(init_cache=False) + time.sleep(OTAPROXY_MIN_STARTUP_TIME) # prevent pre-mature shutdown + + elif not _otaproxy_should_run and _otaproxy_running: + shutdown_otaproxy_server() + + +STATUS_SHM_SIZE = 4096 +SHM_HMAC_KEY_LEN = 64 # bytes def main() -> None: from otaclient._logging import configure_logging - from otaclient.configs.cfg import cfg, ecu_info from otaclient._utils import check_other_otaclient, create_otaclient_rundir + from otaclient.configs.cfg import cfg, ecu_info, proxy_info # configure logging before any code being executed configure_logging() @@ -97,4 +196,75 @@ def main() -> None: check_other_otaclient(cfg.OTACLIENT_PID_FILE) create_otaclient_rundir(cfg.RUN_DIR) - asyncio.run(launch_otaclient_grpc_server()) + mp_ctx = mp.get_context("spawn") + shm = mp_shm.SharedMemory(size=STATUS_SHM_SIZE, create=True) + _key = secrets.token_bytes(SHM_HMAC_KEY_LEN) + atexit.register(shm.close) + atexit.register(shm.unlink) + + # shared queus and flags + local_otaclient_control_flag = mp_ctx.Event() + local_otaclient_op_queue = mp_ctx.Queue() + all_ecus_succeeded = mp_ctx.Event() + any_requires_network = mp_ctx.Event() + + _ota_core_p = mp_ctx.Process( + target=partial( + ota_core_process, + partial(SharedOTAClientStatusWriter, name=shm.name, key=_key), + local_otaclient_control_flag, + local_otaclient_op_queue, + ), + name="otaclient_ota_core", + ) + _ota_core_p.start() + + _grpc_server_p = mp_ctx.Process( + target=partial( + grpc_server_process, + partial(SharedOTAClientStatusReader, name=shm.name, key=_key), + local_otaclient_control_flag, + local_otaclient_op_queue, + all_ecus_succeeded, + any_requires_network, + ), + name="otaclient_api_server", + ) + _grpc_server_p.start() + + # we only setup the resources in main process + del _key, local_otaclient_control_flag, 
local_otaclient_op_queue + + # ------ configuring main process ------ # + + atexit.register(_on_global_shutdown) + _otaproxy_control_t = None + if proxy_info.enable_local_ota_proxy: + _otaproxy_control_t = threading.Thread( + target=partial( + otaproxy_control_thread, + any_requires_network=any_requires_network, + all_ecus_succeeded=all_ecus_succeeded, + ), + daemon=True, + name="otaclient_otaproxy_control_t", + ) + _otaproxy_control_t.start() + + while not _global_shutdown: + time.sleep(HEALTH_CHECK_INTERAVL) + + if not _ota_core_p.is_alive(): + logger.error( + "ota_core process is dead! " + f"otaclient will exit in {SHUTDOWN_AFTER_CORE_EXIT}seconds ..." + ) + time.sleep(SHUTDOWN_AFTER_CORE_EXIT) + # TODO: shutdown + + if not _grpc_server_p.is_alive(): + logger.error( + f"ota API server is dead, whole otaclient will exit in {SHUTDOWN_AFTER_API_SERVER_EXIT}seconds ..." + ) + time.sleep(SHUTDOWN_AFTER_API_SERVER_EXIT) + # TODO: shutdown From 2009c0994900f56a6484cf085a1294e9ac2d3a64 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 10:25:41 +0000 Subject: [PATCH 019/114] re-implement _otaproxy_ctx, move it to otaclient package --- src/otaclient/{grpc => }/_otaproxy_ctx.py | 171 +++++++++------------- 1 file changed, 72 insertions(+), 99 deletions(-) rename src/otaclient/{grpc => }/_otaproxy_ctx.py (55%) diff --git a/src/otaclient/grpc/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py similarity index 55% rename from src/otaclient/grpc/_otaproxy_ctx.py rename to src/otaclient/_otaproxy_ctx.py index 5e645893d..15eec5913 100644 --- a/src/otaclient/grpc/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -11,31 +11,37 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Control of the launch/shutdown of otaproxy according to sub ECUs' status.""" +"""Control of the otaproxy server startup/shutdown. 
+ +The API exposed by this module is meant to be controlled by otaproxy managing thread only. +See otaclient.main.otaproxy_control_thread for more details. + +A atexit hook is installed to ensure the otaproxy process is terminated on otaclient shutdown. +""" from __future__ import annotations -import asyncio +import atexit import logging +import multiprocessing.context as mp_ctx import shutil import sys -from concurrent.futures import ThreadPoolExecutor -from functools import partial from pathlib import Path -from typing import Any, Dict, Optional, Type +from typing import Any, Optional, Type from typing_extensions import Self from ota_proxy import OTAProxyContextProto, subprocess_otaproxy_launcher from ota_proxy import config as local_otaproxy_cfg -from otaclient import _logging from otaclient.configs.cfg import cfg, proxy_info from otaclient_common import cmdhelper from otaclient_common.common import ensure_otaproxy_start logger = logging.getLogger(__name__) +_otaproxy_p: mp_ctx.SpawnProcess | None = None + class OTAProxyContext(OTAProxyContextProto): EXTERNAL_CACHE_KEY = "external_cache" @@ -57,10 +63,8 @@ def __init__( self._external_cache_dev_mp = external_cache_dev_mp self._external_cache_data_dir = external_cache_path - self.logger = logging.getLogger("ota_proxy") - @property - def extra_kwargs(self) -> Dict[str, Any]: + def extra_kwargs(self) -> dict[str, Any]: """Inject kwargs to otaproxy startup entry. Currently only inject if external cache storage is used. 
@@ -74,14 +78,14 @@ def extra_kwargs(self) -> Dict[str, Any]: def _subprocess_init(self): """Initializing the subprocess before launching it.""" + from otaclient._logging import configure_logging + # configure logging for otaproxy subprocess # NOTE: on otaproxy subprocess, we first set log level of the root logger # to CRITICAL to filter out third_party libs' logging(requests, urllib3, etc.), # and then set the ota_proxy logger to DEFAULT_LOG_LEVEL - _logging.configure_logging() + configure_logging() otaproxy_logger = logging.getLogger("ota_proxy") - otaproxy_logger.setLevel(cfg.DEFAULT_LOG_LEVEL) - self.logger = otaproxy_logger # wait for upper otaproxy if any if self.upper_proxy: @@ -104,7 +108,7 @@ def _mount_external_cache_storage(self): ) _cache_dev = _cache_dev[0] - self.logger.info(f"external cache dev detected at {_cache_dev}") + logger.info(f"external cache dev detected at {_cache_dev}") self._external_cache_dev = _cache_dev # try to unmount the mount_point and cache_dev unconditionally @@ -112,7 +116,6 @@ def _mount_external_cache_storage(self): cmdhelper.umount(_cache_dev, raise_exception=False) _mp.mkdir(parents=True, exist_ok=True) - # try to mount cache_dev ro try: cmdhelper.mount_ro( target=_cache_dev, mount_point=self._external_cache_dev_mp @@ -142,7 +145,7 @@ def __enter__(self) -> Self: return self except Exception as e: # if subprocess init failed, directly let the process exit - self.logger.error(f"otaproxy subprocess init failed, exit: {e!r}") + logger.error(f"otaproxy subprocess init failed, exit: {e!r}") sys.exit(1) def __exit__( @@ -153,98 +156,68 @@ def __exit__( ) -> Optional[bool]: if __exc_type: _exc = __exc_value if __exc_value else __exc_type() - self.logger.warning(f"exception during otaproxy shutdown: {_exc!r}") + logger.warning(f"exception during otaproxy shutdown: {_exc!r}") # otaproxy post-shutdown cleanup: # 1. 
umount external cache storage self._umount_external_cache_storage() -class OTAProxyLauncher: - """Launcher of start/stop otaproxy in subprocess.""" - - def __init__( - self, *, executor: ThreadPoolExecutor, subprocess_ctx: OTAProxyContextProto - ) -> None: - self.enabled = proxy_info.enable_local_ota_proxy - self.upper_otaproxy = ( - str(proxy_info.upper_ota_proxy) if proxy_info.upper_ota_proxy else "" - ) - self.subprocess_ctx = subprocess_ctx - - self._lock = asyncio.Lock() - # process start/shutdown will be dispatched to thread pool - self._run_in_executor = partial( - asyncio.get_event_loop().run_in_executor, executor - ) - self._otaproxy_subprocess = None - - @property - def is_running(self) -> bool: - return ( - self.enabled - and self._otaproxy_subprocess is not None - and self._otaproxy_subprocess.is_alive() - ) - - # API +def cleanup_cache_dir(): + """Cleanup the OTA cache dir. - def cleanup_cache_dir(self): - """ - NOTE: this method should only be called when all ECUs in the cluster - are in SUCCESS ota_status(overall_ecu_status.all_success==True). - """ - if (cache_dir := Path(local_otaproxy_cfg.BASE_DIR)).is_dir(): - logger.info("cleanup ota_cache on success") - shutil.rmtree(cache_dir, ignore_errors=True) + NOTE: this method should only be called when all ECUs in the cluster + are in SUCCESS ota_status(overall_ecu_status.all_success==True). 
+ """ + if (cache_dir := Path(local_otaproxy_cfg.BASE_DIR)).is_dir(): + logger.info("cleanup ota_cache on success") + shutil.rmtree(cache_dir, ignore_errors=True) - async def start(self, *, init_cache: bool) -> Optional[int]: - """Start the otaproxy in a subprocess.""" - if not self.enabled or self._lock.locked() or self.is_running: - return - async with self._lock: - # launch otaproxy server process - _subprocess_entry = subprocess_otaproxy_launcher( - subprocess_ctx=self.subprocess_ctx - ) +def otaproxy_running() -> bool: + return _otaproxy_p is not None and _otaproxy_p.is_alive() - otaproxy_subprocess = await self._run_in_executor( - partial( - _subprocess_entry, - host=str(proxy_info.local_ota_proxy_listen_addr), - port=proxy_info.local_ota_proxy_listen_port, - init_cache=init_cache, - cache_dir=local_otaproxy_cfg.BASE_DIR, - cache_db_f=local_otaproxy_cfg.DB_FILE, - upper_proxy=self.upper_otaproxy, - enable_cache=proxy_info.enable_local_ota_proxy_cache, - enable_https=proxy_info.gateway_otaproxy, - ) - ) - self._otaproxy_subprocess = otaproxy_subprocess - logger.info( - f"otaproxy({otaproxy_subprocess.pid=}) started at " - f"{proxy_info.local_ota_proxy_listen_addr}:{proxy_info.local_ota_proxy_listen_port}" - ) - return otaproxy_subprocess.pid - - async def stop(self): - """Stop the otaproxy subprocess. - - NOTE: This method only shutdown the otaproxy process, it will not cleanup the - cache dir. cache dir cleanup is handled by other mechanism. - Check cleanup_cache_dir API for more details. 
- """ - if not self.enabled or self._lock.locked() or not self.is_running: - return - def _shutdown(): - if self._otaproxy_subprocess and self._otaproxy_subprocess.is_alive(): - logger.info("shuting down otaproxy server process...") - self._otaproxy_subprocess.terminate() - self._otaproxy_subprocess.join() - self._otaproxy_subprocess = None +def start_otaproxy_server( + *, init_cache: bool, enable_external_cache: bool = True +) -> None: + global _otaproxy_p + if _otaproxy_p and _otaproxy_p.is_alive(): + logger.warning("otaproxy is already running, abort") + return - async with self._lock: - await self._run_in_executor(_shutdown) - logger.info("otaproxy closed") + _subprocess_entry = subprocess_otaproxy_launcher( + OTAProxyContext( + external_cache_enabled=enable_external_cache, + ) + ) + host, port = ( + str(proxy_info.local_ota_proxy_listen_addr), + proxy_info.local_ota_proxy_listen_port, + ) + upper_proxy = str(proxy_info.upper_ota_proxy or "") + logger.info(f"will launch otaproxy at http://{host}:{port}, with {upper_proxy=}") + + _otaproxy_p = _subprocess_entry( + host=host, + port=port, + init_cache=init_cache, + cache_dir=local_otaproxy_cfg.BASE_DIR, + cache_db_f=local_otaproxy_cfg.DB_FILE, + upper_proxy=upper_proxy, + enable_cache=proxy_info.enable_local_ota_proxy_cache, + enable_https=proxy_info.gateway_otaproxy, + ) + logger.info("otaproxy started") + + +def shutdown_otaproxy_server() -> None: + global _otaproxy_p + if _otaproxy_p and _otaproxy_p.is_alive(): + logger.info("shuting down otaproxy server process...") + _otaproxy_p.terminate() + _otaproxy_p.join() + _otaproxy_p = None + logger.info("otaproxy closed") + + +atexit.register(shutdown_otaproxy_server) From 544e3561f6b84adaf4e24ce7918df21679614410 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 10:30:34 +0000 Subject: [PATCH 020/114] _otaproxy_ctx: minor cleanup --- src/otaclient/_otaproxy_ctx.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git 
a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index 15eec5913..d9d005885 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -15,17 +15,13 @@ The API exposed by this module is meant to be controlled by otaproxy managing thread only. See otaclient.main.otaproxy_control_thread for more details. - -A atexit hook is installed to ensure the otaproxy process is terminated on otaclient shutdown. """ from __future__ import annotations -import atexit import logging import multiprocessing.context as mp_ctx -import shutil import sys from pathlib import Path from typing import Any, Optional, Type @@ -162,17 +158,6 @@ def __exit__( self._umount_external_cache_storage() -def cleanup_cache_dir(): - """Cleanup the OTA cache dir. - - NOTE: this method should only be called when all ECUs in the cluster - are in SUCCESS ota_status(overall_ecu_status.all_success==True). - """ - if (cache_dir := Path(local_otaproxy_cfg.BASE_DIR)).is_dir(): - logger.info("cleanup ota_cache on success") - shutil.rmtree(cache_dir, ignore_errors=True) - - def otaproxy_running() -> bool: return _otaproxy_p is not None and _otaproxy_p.is_alive() @@ -218,6 +203,3 @@ def shutdown_otaproxy_server() -> None: _otaproxy_p.join() _otaproxy_p = None logger.info("otaproxy closed") - - -atexit.register(shutdown_otaproxy_server) From a26a4e8bff6499ffe9bf260e02a88bc8254e1ca2 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 22 Nov 2024 10:30:58 +0000 Subject: [PATCH 021/114] WIP: main --- src/otaclient/main.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index 5fb9d06e7..ca7654b53 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -145,11 +145,17 @@ def otaproxy_control_thread( all_ecus_succeeded: mp_sync.Event, ) -> None: # pragma: no cover from ota_proxy.config import config + from otaclient._otaproxy_ctx import ( + otaproxy_running, + shutdown_otaproxy_server, + 
start_otaproxy_server, + ) - # TODO: use the otaproxy base_dir config from otaclient.configs ota_cache_dir = Path(config.BASE_DIR) next_ota_cache_dir_checkpoint = 0 + atexit.register(shutdown_otaproxy_server) + while not _global_shutdown: time.sleep(OTAPROXY_CHECK_INTERVAL) @@ -167,7 +173,7 @@ def otaproxy_control_thread( "all tracked ECUs are in SUCCESS OTA status, cleanup ota cache dir ..." ) next_ota_cache_dir_checkpoint = _now + OTA_CACHE_DIR_CHECK_INTERVAL - shutil.rmtree(ota_cache_dir) + shutil.rmtree(ota_cache_dir, ignore_errors=True) elif _otaproxy_should_run and not _otaproxy_running: start_otaproxy_server(init_cache=False) From 467de3f125463efa59cfb93381073b4336855807 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 11:39:34 +0000 Subject: [PATCH 022/114] implement grpc.api_v2.main --- src/otaclient/grpc/api_v2/main.py | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/otaclient/grpc/api_v2/main.py diff --git a/src/otaclient/grpc/api_v2/main.py b/src/otaclient/grpc/api_v2/main.py new file mode 100644 index 000000000..2c211a032 --- /dev/null +++ b/src/otaclient/grpc/api_v2/main.py @@ -0,0 +1,82 @@ +# Copyright 2022 TIER IV, INC. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Main entry for OTA API v2 grpc server.""" + + +from __future__ import annotations + +import asyncio +import atexit +import logging +import multiprocessing.synchronize as mp_sync +from multiprocessing.queues import Queue as mp_Queue +from typing import Callable, NoReturn + +from otaclient._types import IPCRequest, IPCResponse +from otaclient._utils import SharedOTAClientStatusReader + +logger = logging.getLogger(__name__) + + +def grpc_server_process( + shm_reader_factory: Callable[[], SharedOTAClientStatusReader], + control_flag: mp_sync.Event, + op_queue: mp_Queue[IPCRequest | IPCResponse], + all_ecus_succeeded: mp_sync.Event, + any_requires_network: mp_sync.Event, +) -> NoReturn: # type: ignore + from otaclient._logging import configure_logging + + configure_logging() + + shm_reader = shm_reader_factory() + atexit.register(shm_reader.atexit) + + async def _grpc_server_launcher(): + import grpc.aio + + from otaclient.configs.cfg import cfg, ecu_info + from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage + from otaclient.grpc.api_v2.ecu_tracker import ECUTracker + from otaclient.grpc.api_v2.servicer import OTAClientAPIServicer + from otaclient_api.v2 import otaclient_v2_pb2_grpc as v2_grpc + from otaclient_api.v2.api_stub import OtaClientServiceV2 + + ecu_status_storage = ECUStatusStorage( + all_ecus_succeeded=all_ecus_succeeded, + any_requires_network=any_requires_network, + ) + ecu_tracker = ECUTracker(ecu_status_storage, shm_reader) + ecu_tracker.start() + + api_servicer = OTAClientAPIServicer( + ecu_status_storage, + op_queue, + control_flag=control_flag, + ) + ota_client_service_v2 = OtaClientServiceV2(api_servicer) + + server = grpc.aio.server() + v2_grpc.add_OtaClientServiceServicer_to_server( + server=server, servicer=ota_client_service_v2 + ) + server.add_insecure_port(f"{ecu_info.ip_addr}:{cfg.OTA_API_SERVER_PORT}") + + await server.start() + try: + await server.wait_for_termination() + finally: + await server.stop(1) + + 
asyncio.run(_grpc_server_launcher()) From d6df546d643a19054e97020c906c02f2d537d783 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 11:44:19 +0000 Subject: [PATCH 023/114] move otaproxy control thread logic into otaproxy_ctx --- src/otaclient/_otaproxy_ctx.py | 60 +++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index d9d005885..2d53f28a0 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -14,15 +14,21 @@ """Control of the otaproxy server startup/shutdown. The API exposed by this module is meant to be controlled by otaproxy managing thread only. -See otaclient.main.otaproxy_control_thread for more details. + +TODO: simplify this module! """ from __future__ import annotations +import atexit import logging import multiprocessing.context as mp_ctx +import multiprocessing.synchronize as mp_sync +import shutil import sys +import threading +import time from pathlib import Path from typing import Any, Optional, Type @@ -38,6 +44,13 @@ _otaproxy_p: mp_ctx.SpawnProcess | None = None +OTAPROXY_CHECK_INTERVAL = 3 +OTAPROXY_MIN_STARTUP_TIME = 60 +"""Keep otaproxy running at least 60 seconds after startup.""" +OTA_CACHE_DIR_CHECK_INTERVAL = 60 +SHUTDOWN_AFTER_CORE_EXIT = 16 +SHUTDOWN_AFTER_API_SERVER_EXIT = 3 + class OTAProxyContext(OTAProxyContextProto): EXTERNAL_CACHE_KEY = "external_cache" @@ -203,3 +216,48 @@ def shutdown_otaproxy_server() -> None: _otaproxy_p.join() _otaproxy_p = None logger.info("otaproxy closed") + + +def otaproxy_control_thread( + *, + shutdown_event: threading.Event, + any_requires_network: mp_sync.Event, + all_ecus_succeeded: mp_sync.Event, +) -> None: # pragma: no cover + from ota_proxy.config import config + from otaclient._otaproxy_ctx import ( + otaproxy_running, + shutdown_otaproxy_server, + start_otaproxy_server, + ) + + ota_cache_dir = Path(config.BASE_DIR) + 
next_ota_cache_dir_checkpoint = 0 + + atexit.register(shutdown_otaproxy_server) + + while not shutdown_event.is_set(): + time.sleep(OTAPROXY_CHECK_INTERVAL) + + _otaproxy_running = otaproxy_running() + _otaproxy_should_run = any_requires_network.is_set() + + if not _otaproxy_should_run and not _otaproxy_running: + _now = time.time() + if ( + _now > next_ota_cache_dir_checkpoint + and all_ecus_succeeded.is_set() + and ota_cache_dir.is_dir() + ): + logger.info( + "all tracked ECUs are in SUCCESS OTA status, cleanup ota cache dir ..." + ) + next_ota_cache_dir_checkpoint = _now + OTA_CACHE_DIR_CHECK_INTERVAL + shutil.rmtree(ota_cache_dir, ignore_errors=True) + + elif _otaproxy_should_run and not _otaproxy_running: + start_otaproxy_server(init_cache=False) + time.sleep(OTAPROXY_MIN_STARTUP_TIME) # prevent pre-mature shutdown + + elif not _otaproxy_should_run and _otaproxy_running: + shutdown_otaproxy_server() From 256d8221095ad1b6946fe638c2f26f1e7c47dbd4 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 12:03:10 +0000 Subject: [PATCH 024/114] move ota_core_process into ota_core module --- src/otaclient/ota_core.py | 42 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 72fd96094..a52c15fc7 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -20,6 +20,8 @@ import logging import multiprocessing.queues as mp_queue import multiprocessing.synchronize as mp_sync +import signal +import sys import threading import time from concurrent.futures import Future @@ -29,7 +31,7 @@ from json.decoder import JSONDecodeError from pathlib import Path from queue import Empty, Queue -from typing import Any, Iterator, NoReturn, Optional, Type +from typing import Any, Callable, Iterator, NoReturn, Optional, Type from urllib.parse import urlparse import requests.exceptions as requests_exc @@ -43,6 +45,7 @@ ) from otaclient import errors as 
ota_errors from otaclient._status_monitor import ( + OTAClientStatusCollector, OTAStatusChangeReport, OTAUpdatePhaseChangeReport, SetOTAClientMetaReport, @@ -60,7 +63,7 @@ UpdatePhase, UpdateRequestV2, ) -from otaclient._utils import get_traceback, wait_and_log +from otaclient._utils import SharedOTAClientStatusWriter, get_traceback, wait_and_log from otaclient.boot_control import BootControllerProtocol, get_boot_controller from otaclient.configs.cfg import cfg, ecu_info from otaclient.create_standby import ( @@ -839,3 +842,38 @@ def main(self, op_queue: mp_queue.Queue[IPCRequest | IPCResponse]) -> NoReturn: session_id=request.session_id, ) ) + + +def _sign_handler(signame, frame) -> NoReturn: + logger.info(f"ota_core process receives {signame=}, exits ...") + sys.exit(1) + + +def ota_core_process( + shm_writer_factory: Callable[[], SharedOTAClientStatusWriter], + control_flag: mp_sync.Event, + op_queue: mp_queue.Queue[IPCRequest | IPCResponse], +): + from otaclient._logging import configure_logging + from otaclient.configs.cfg import proxy_info + from otaclient.ota_core import OTAClient + + signal.signal(signal.SIGTERM, _sign_handler) + signal.signal(signal.SIGINT, _sign_handler) + configure_logging() + + shm_writer = shm_writer_factory() + + _local_status_report_queue = Queue() + _status_monitor = OTAClientStatusCollector( + msg_queue=_local_status_report_queue, + shm_status=shm_writer, + ) + _status_monitor.start() + + _ota_core = OTAClient( + control_flag=control_flag, + proxy=proxy_info.get_proxy_for_local_ota(), + status_report_queue=_local_status_report_queue, + ) + _ota_core.main(op_queue) From bd7a183c83d6f8dbc6396261c00ce20e2c1a1095 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 12:23:10 +0000 Subject: [PATCH 025/114] otaproxy_ctx: simplify the implementation of otaproxy control --- src/otaclient/_otaproxy_ctx.py | 53 ++++++++++++---------------------- 1 file changed, 18 insertions(+), 35 deletions(-) diff --git 
a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index 2d53f28a0..6b4fdeea7 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -23,7 +23,6 @@ import atexit import logging -import multiprocessing.context as mp_ctx import multiprocessing.synchronize as mp_sync import shutil import sys @@ -42,8 +41,6 @@ logger = logging.getLogger(__name__) -_otaproxy_p: mp_ctx.SpawnProcess | None = None - OTAPROXY_CHECK_INTERVAL = 3 OTAPROXY_MIN_STARTUP_TIME = 60 """Keep otaproxy running at least 60 seconds after startup.""" @@ -171,18 +168,7 @@ def __exit__( self._umount_external_cache_storage() -def otaproxy_running() -> bool: - return _otaproxy_p is not None and _otaproxy_p.is_alive() - - -def start_otaproxy_server( - *, init_cache: bool, enable_external_cache: bool = True -) -> None: - global _otaproxy_p - if _otaproxy_p and _otaproxy_p.is_alive(): - logger.warning("otaproxy is already running, abort") - return - +def start_otaproxy_server(*, init_cache: bool, enable_external_cache: bool = True): _subprocess_entry = subprocess_otaproxy_launcher( OTAProxyContext( external_cache_enabled=enable_external_cache, @@ -206,16 +192,7 @@ def start_otaproxy_server( enable_https=proxy_info.gateway_otaproxy, ) logger.info("otaproxy started") - - -def shutdown_otaproxy_server() -> None: - global _otaproxy_p - if _otaproxy_p and _otaproxy_p.is_alive(): - logger.info("shuting down otaproxy server process...") - _otaproxy_p.terminate() - _otaproxy_p.join() - _otaproxy_p = None - logger.info("otaproxy closed") + return _otaproxy_p def otaproxy_control_thread( @@ -224,22 +201,25 @@ def otaproxy_control_thread( any_requires_network: mp_sync.Event, all_ecus_succeeded: mp_sync.Event, ) -> None: # pragma: no cover - from ota_proxy.config import config - from otaclient._otaproxy_ctx import ( - otaproxy_running, - shutdown_otaproxy_server, - start_otaproxy_server, - ) - ota_cache_dir = Path(config.BASE_DIR) - next_ota_cache_dir_checkpoint = 0 + _otaproxy_p 
= None + + def shutdown_otaproxy_server() -> None: + if _otaproxy_p and _otaproxy_p.is_alive(): + logger.info("shuting down otaproxy server process...") + _otaproxy_p.terminate() + _otaproxy_p.join() + logger.info("otaproxy closed") atexit.register(shutdown_otaproxy_server) + ota_cache_dir = Path(local_otaproxy_cfg.BASE_DIR) + next_ota_cache_dir_checkpoint = 0 + while not shutdown_event.is_set(): time.sleep(OTAPROXY_CHECK_INTERVAL) - _otaproxy_running = otaproxy_running() + _otaproxy_running = _otaproxy_p and _otaproxy_p.is_alive() _otaproxy_should_run = any_requires_network.is_set() if not _otaproxy_should_run and not _otaproxy_running: @@ -256,8 +236,11 @@ def otaproxy_control_thread( shutil.rmtree(ota_cache_dir, ignore_errors=True) elif _otaproxy_should_run and not _otaproxy_running: - start_otaproxy_server(init_cache=False) + # NOTE: always try to re-use cache. If the cache dir is empty, otaproxy + # will still init the cache even init_cache is False. + _otaproxy_p = start_otaproxy_server(init_cache=False) time.sleep(OTAPROXY_MIN_STARTUP_TIME) # prevent pre-mature shutdown elif not _otaproxy_should_run and _otaproxy_running: shutdown_otaproxy_server() + _otaproxy_p = None From 4149c39d11daa82b84165bd4d4a30b8b8117129b Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 12:25:21 +0000 Subject: [PATCH 026/114] finish up main --- src/otaclient/main.py | 203 +++++++++--------------------------------- 1 file changed, 44 insertions(+), 159 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index ca7654b53..ca59409fe 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -16,181 +16,38 @@ from __future__ import annotations -import asyncio import atexit import logging import multiprocessing as mp import multiprocessing.shared_memory as mp_shm -import multiprocessing.synchronize as mp_sync import secrets -import shutil +import signal +import sys import threading import time from functools import partial -from 
multiprocessing.queues import Queue as mp_Queue -from pathlib import Path -from queue import Queue from typing import NoReturn from otaclient import __version__ -from otaclient._status_monitor import OTAClientStatusCollector -from otaclient._types import IPCRequest, IPCResponse from otaclient._utils import SharedOTAClientStatusReader, SharedOTAClientStatusWriter logger = logging.getLogger(__name__) HEALTH_CHECK_INTERAVL = 6 # seconds -OTAPROXY_CHECK_INTERVAL = 3 -OTAPROXY_MIN_STARTUP_TIME = 60 -"""Keep otaproxy running at least 60 seconds after startup.""" -OTA_CACHE_DIR_CHECK_INTERVAL = 60 -SHUTDOWN_AFTER_CORE_EXIT = 16 -SHUTDOWN_AFTER_API_SERVER_EXIT = 3 +SHUTDOWN_AFTER_CORE_EXIT = 16 # seconds +SHUTDOWN_AFTER_API_SERVER_EXIT = 3 # seconds -_global_shutdown: bool = False - - -def _on_global_shutdown(): - global _global_shutdown - _global_shutdown = True - - -def ota_core_process( - shm_writer_factory, - control_flag: mp_sync.Event, - op_queue: mp_Queue[IPCRequest | IPCResponse], -): - from otaclient._logging import configure_logging - from otaclient.configs.cfg import proxy_info - from otaclient.ota_core import OTAClient - - atexit.register(_on_global_shutdown) - shm_writer = shm_writer_factory() - atexit.register(shm_writer.atexit) - - configure_logging() - - _local_status_report_queue = Queue() - _status_monitor = OTAClientStatusCollector( - msg_queue=_local_status_report_queue, - shm_status=shm_writer, - ) - _status_monitor.start() - - _ota_core = OTAClient( - control_flag=control_flag, - proxy=proxy_info.get_proxy_for_local_ota(), - status_report_queue=_local_status_report_queue, - ) - _ota_core.main(op_queue) - - -def grpc_server_process( - shm_reader_factory, - control_flag: mp_sync.Event, - op_queue: mp_Queue[IPCRequest | IPCResponse], - all_ecus_succeeded: mp_sync.Event, - any_requires_network: mp_sync.Event, -) -> NoReturn: # type: ignore - from otaclient._logging import configure_logging - - configure_logging() - atexit.register(_on_global_shutdown) - - 
shm_reader = shm_reader_factory() - atexit.register(shm_reader.atexit) - - async def _grpc_server_launcher(): - import grpc.aio - - from otaclient.configs.cfg import cfg, ecu_info - from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage - from otaclient.grpc.api_v2.ecu_tracker import ECUTracker - from otaclient.grpc.api_v2.servicer import OTAClientAPIServicer - from otaclient_api.v2 import otaclient_v2_pb2_grpc as v2_grpc - from otaclient_api.v2.api_stub import OtaClientServiceV2 - - ecu_status_storage = ECUStatusStorage( - all_ecus_succeeded=all_ecus_succeeded, - any_requires_network=any_requires_network, - ) - ecu_tracker = ECUTracker(ecu_status_storage, shm_reader) - ecu_tracker.start() - - api_servicer = OTAClientAPIServicer( - ecu_status_storage, - op_queue, - control_flag=control_flag, - ) - ota_client_service_v2 = OtaClientServiceV2(api_servicer) - - server = grpc.aio.server() - v2_grpc.add_OtaClientServiceServicer_to_server( - server=server, servicer=ota_client_service_v2 - ) - server.add_insecure_port(f"{ecu_info.ip_addr}:{cfg.OTA_API_SERVER_PORT}") - - await server.start() - try: - await server.wait_for_termination() - finally: - await server.stop(1) - - asyncio.run(_grpc_server_launcher()) - - -def otaproxy_control_thread( - *, - any_requires_network: mp_sync.Event, - all_ecus_succeeded: mp_sync.Event, -) -> None: # pragma: no cover - from ota_proxy.config import config - from otaclient._otaproxy_ctx import ( - otaproxy_running, - shutdown_otaproxy_server, - start_otaproxy_server, - ) - - ota_cache_dir = Path(config.BASE_DIR) - next_ota_cache_dir_checkpoint = 0 - - atexit.register(shutdown_otaproxy_server) - - while not _global_shutdown: - time.sleep(OTAPROXY_CHECK_INTERVAL) - - _otaproxy_running = otaproxy_running() - _otaproxy_should_run = any_requires_network.is_set() - - if not _otaproxy_should_run and not _otaproxy_running: - _now = time.time() - if ( - _now > next_ota_cache_dir_checkpoint - and all_ecus_succeeded.is_set() - and 
ota_cache_dir.is_dir() - ): - logger.info( - "all tracked ECUs are in SUCCESS OTA status, cleanup ota cache dir ..." - ) - next_ota_cache_dir_checkpoint = _now + OTA_CACHE_DIR_CHECK_INTERVAL - shutil.rmtree(ota_cache_dir, ignore_errors=True) - - elif _otaproxy_should_run and not _otaproxy_running: - start_otaproxy_server(init_cache=False) - time.sleep(OTAPROXY_MIN_STARTUP_TIME) # prevent pre-mature shutdown - - elif not _otaproxy_should_run and _otaproxy_running: - shutdown_otaproxy_server() - - -STATUS_SHM_SIZE = 4096 +STATUS_SHM_SIZE = 4096 # bytes SHM_HMAC_KEY_LEN = 64 # bytes def main() -> None: from otaclient._logging import configure_logging + from otaclient._otaproxy_ctx import otaproxy_control_thread from otaclient._utils import check_other_otaclient, create_otaclient_rundir from otaclient.configs.cfg import cfg, ecu_info, proxy_info + from otaclient.grpc.api_v2.main import grpc_server_process + from otaclient.ota_core import ota_core_process # configure logging before any code being executed configure_logging() @@ -202,13 +59,42 @@ def main() -> None: check_other_otaclient(cfg.OTACLIENT_PID_FILE) create_otaclient_rundir(cfg.RUN_DIR) + # + # ------ start each processes ------ # + # + _ota_core_p, _grpc_server_p = None, None + shm = None + + def _on_shutdown(signame=None, _=None) -> NoReturn: + if signame: + logger.info( + f"otaclient main process receives {signame=}, shutting down ..." 
+ ) + + if _ota_core_p: + _ota_core_p.terminate() + _ota_core_p.join() + + if _grpc_server_p: + _grpc_server_p.terminate() + _grpc_server_p.join() + + if shm: + shm.close() + shm.unlink() + + logger.info("otaclient shutdown") + sys.exit(1) + + atexit.register(_on_shutdown) + signal.signal(signal.SIGTERM, _on_shutdown) + signal.signal(signal.SIGINT, _on_shutdown) + mp_ctx = mp.get_context("spawn") shm = mp_shm.SharedMemory(size=STATUS_SHM_SIZE, create=True) _key = secrets.token_bytes(SHM_HMAC_KEY_LEN) - atexit.register(shm.close) - atexit.register(shm.unlink) - # shared queus and flags + # shared queues and flags local_otaclient_control_flag = mp_ctx.Event() local_otaclient_op_queue = mp_ctx.Queue() all_ecus_succeeded = mp_ctx.Event() @@ -241,9 +127,8 @@ def main() -> None: # we only setup the resources in main process del _key, local_otaclient_control_flag, local_otaclient_op_queue - # ------ configuring main process ------ # + # ------ setup main process ------ # - atexit.register(_on_global_shutdown) _otaproxy_control_t = None if proxy_info.enable_local_ota_proxy: _otaproxy_control_t = threading.Thread( @@ -257,7 +142,7 @@ def main() -> None: ) _otaproxy_control_t.start() - while not _global_shutdown: + while True: time.sleep(HEALTH_CHECK_INTERAVL) if not _ota_core_p.is_alive(): @@ -266,11 +151,11 @@ def main() -> None: f"otaclient will exit in {SHUTDOWN_AFTER_CORE_EXIT}seconds ..." ) time.sleep(SHUTDOWN_AFTER_CORE_EXIT) - # TODO: shutdown + _on_shutdown() if not _grpc_server_p.is_alive(): logger.error( f"ota API server is dead, whole otaclient will exit in {SHUTDOWN_AFTER_API_SERVER_EXIT}seconds ..." 
) time.sleep(SHUTDOWN_AFTER_API_SERVER_EXIT) - # TODO: shutdown + _on_shutdown() From 772c5ba0ce9427281093628ce6eb3cfbf08c165a Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 12:51:18 +0000 Subject: [PATCH 027/114] fix main --- src/otaclient/_otaproxy_ctx.py | 10 +++++----- src/otaclient/main.py | 13 ++++++++++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index 6b4fdeea7..fd390b3bc 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -197,7 +197,6 @@ def start_otaproxy_server(*, init_cache: bool, enable_external_cache: bool = Tru def otaproxy_control_thread( *, - shutdown_event: threading.Event, any_requires_network: mp_sync.Event, all_ecus_succeeded: mp_sync.Event, ) -> None: # pragma: no cover @@ -209,14 +208,13 @@ def shutdown_otaproxy_server() -> None: logger.info("shuting down otaproxy server process...") _otaproxy_p.terminate() _otaproxy_p.join() - logger.info("otaproxy closed") atexit.register(shutdown_otaproxy_server) ota_cache_dir = Path(local_otaproxy_cfg.BASE_DIR) next_ota_cache_dir_checkpoint = 0 - while not shutdown_event.is_set(): + while True: time.sleep(OTAPROXY_CHECK_INTERVAL) _otaproxy_running = _otaproxy_p and _otaproxy_p.is_alive() @@ -234,13 +232,15 @@ def shutdown_otaproxy_server() -> None: ) next_ota_cache_dir_checkpoint = _now + OTA_CACHE_DIR_CHECK_INTERVAL shutil.rmtree(ota_cache_dir, ignore_errors=True) + continue - elif _otaproxy_should_run and not _otaproxy_running: + if _otaproxy_should_run and not _otaproxy_running: # NOTE: always try to re-use cache. If the cache dir is empty, otaproxy # will still init the cache even init_cache is False. 
_otaproxy_p = start_otaproxy_server(init_cache=False) time.sleep(OTAPROXY_MIN_STARTUP_TIME) # prevent pre-mature shutdown + continue - elif not _otaproxy_should_run and _otaproxy_running: + if not _otaproxy_should_run and _otaproxy_running: shutdown_otaproxy_server() _otaproxy_p = None diff --git a/src/otaclient/main.py b/src/otaclient/main.py index ca59409fe..d6c825d62 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -33,6 +33,8 @@ logger = logging.getLogger(__name__) +_on_shutdown_triggered = False + HEALTH_CHECK_INTERAVL = 6 # seconds SHUTDOWN_AFTER_CORE_EXIT = 16 # seconds SHUTDOWN_AFTER_API_SERVER_EXIT = 3 # seconds @@ -65,7 +67,13 @@ def main() -> None: _ota_core_p, _grpc_server_p = None, None shm = None - def _on_shutdown(signame=None, _=None) -> NoReturn: + def _on_shutdown(signame=None, _=None): + global _on_shutdown_triggered + + if _on_shutdown_triggered: + return + _on_shutdown_triggered = True + if signame: logger.info( f"otaclient main process receives {signame=}, shutting down ..." 
@@ -124,8 +132,7 @@ def _on_shutdown(signame=None, _=None) -> NoReturn: ) _grpc_server_p.start() - # we only setup the resources in main process - del _key, local_otaclient_control_flag, local_otaclient_op_queue + del _key # ------ setup main process ------ # From 57d594787a1983a3ee77949d936b08596dd705f3 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 12:58:55 +0000 Subject: [PATCH 028/114] status_monitor: not unlink the shm --- src/otaclient/_status_monitor.py | 2 +- src/otaclient_common/shm_status.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 0d8201512..c50a722b3 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -52,7 +52,7 @@ def _global_shutdown(): _status_report_queue.put_nowait(TERMINATE_SENTINEL) if _shm_status: - _shm_status.atexit(unlink=True) + _shm_status.atexit() atexit.register(_global_shutdown) diff --git a/src/otaclient_common/shm_status.py b/src/otaclient_common/shm_status.py index 2ee397d26..b187d6af9 100644 --- a/src/otaclient_common/shm_status.py +++ b/src/otaclient_common/shm_status.py @@ -124,10 +124,8 @@ def __init__( self._key = key self.msg_max_size = min(_msg_max_size, msg_max_size or float("infinity")) - def atexit(self, *, unlink: bool = False) -> None: + def atexit(self) -> None: self._shm.close() - if unlink: - self._shm.unlink() def write_msg(self, obj: T) -> None: buffer = self._shm.buf From 3575406ffdafea634ae44a92626cb19cad4733e0 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 13:15:49 +0000 Subject: [PATCH 029/114] api_v2.ecu_tracker: actively polling until we get the first valid resp when startup --- src/otaclient/grpc/api_v2/ecu_tracker.py | 26 ++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/otaclient/grpc/api_v2/ecu_tracker.py b/src/otaclient/grpc/api_v2/ecu_tracker.py index 13d16a766..d3c358d13 100644 --- 
a/src/otaclient/grpc/api_v2/ecu_tracker.py +++ b/src/otaclient/grpc/api_v2/ecu_tracker.py @@ -20,6 +20,7 @@ import atexit import contextlib import logging +from collections import defaultdict from otaclient._utils import SharedOTAClientStatusReader from otaclient.configs import ECUContact @@ -32,6 +33,9 @@ _otaclient_shutdown = False _shm_status: SharedOTAClientStatusReader | None = None +# actively polling ECUs status until we get the first valid response +# when otaclient is just starting. +_active_polling_interval_on_startup = 1 def _global_shutdown(): @@ -56,12 +60,14 @@ def __init__( self._local_ecu_status_reader = local_ecu_status_reader self._ecu_status_storage = ecu_status_storage self._polling_waiter = self._ecu_status_storage.get_polling_waiter() + self._startup_matrix: defaultdict[str, bool] = defaultdict(lambda: True) global _shm_status _shm_status = local_ecu_status_reader async def _polling_direct_subecu_status(self, ecu_contact: ECUContact): """Task entry for loop polling one subECU's status.""" + this_ecu_id = ecu_contact.ecu_id while not _otaclient_shutdown: try: _ecu_resp = await OTAClientCall.status_call( @@ -71,20 +77,36 @@ async def _polling_direct_subecu_status(self, ecu_contact: ECUContact): timeout=cfg.QUERYING_SUBECU_STATUS_TIMEOUT, request=api_types.StatusRequest(), ) + if self._startup_matrix[this_ecu_id] and ( + _ecu_resp.find_ecu_v2(this_ecu_id) + or _ecu_resp.find_ecu(this_ecu_id) + ): + self._startup_matrix[this_ecu_id] = False await self._ecu_status_storage.update_from_child_ecu(_ecu_resp) except ECUNoResponse as e: logger.debug( f"ecu@{ecu_contact} doesn't respond to status request: {e!r}" ) - await self._polling_waiter() + + if self._startup_matrix[this_ecu_id]: + await asyncio.sleep(_active_polling_interval_on_startup) + else: + await self._polling_waiter() async def _polling_local_ecu_status(self): """Task entry for loop polling local ECU status.""" + my_ecu_id = ecu_info.ecu_id while not _otaclient_shutdown: with 
contextlib.suppress(Exception): status_report = self._local_ecu_status_reader.sync_msg() + if status_report: + self._startup_matrix[my_ecu_id] = False await self._ecu_status_storage.update_from_local_ecu(status_report) - await self._polling_waiter() + + if self._startup_matrix[my_ecu_id]: + await asyncio.sleep(_active_polling_interval_on_startup) + else: + await self._polling_waiter() def start(self) -> None: asyncio.create_task(self._polling_local_ecu_status()) From 712b441b5f1a54372996c93d5cd49594c707f923 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 13:22:03 +0000 Subject: [PATCH 030/114] ecu_tracker: minor cleanup --- src/otaclient/grpc/api_v2/ecu_tracker.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/otaclient/grpc/api_v2/ecu_tracker.py b/src/otaclient/grpc/api_v2/ecu_tracker.py index d3c358d13..95f713aef 100644 --- a/src/otaclient/grpc/api_v2/ecu_tracker.py +++ b/src/otaclient/grpc/api_v2/ecu_tracker.py @@ -31,24 +31,11 @@ logger = logging.getLogger(__name__) -_otaclient_shutdown = False -_shm_status: SharedOTAClientStatusReader | None = None # actively polling ECUs status until we get the first valid response # when otaclient is just starting. 
_active_polling_interval_on_startup = 1 -def _global_shutdown(): - global _otaclient_shutdown - _otaclient_shutdown = True - - if _shm_status: - _shm_status.atexit() - - -atexit.register(_global_shutdown) - - class ECUTracker: def __init__( @@ -62,13 +49,12 @@ def __init__( self._polling_waiter = self._ecu_status_storage.get_polling_waiter() self._startup_matrix: defaultdict[str, bool] = defaultdict(lambda: True) - global _shm_status - _shm_status = local_ecu_status_reader + atexit.register(local_ecu_status_reader.atexit) async def _polling_direct_subecu_status(self, ecu_contact: ECUContact): """Task entry for loop polling one subECU's status.""" this_ecu_id = ecu_contact.ecu_id - while not _otaclient_shutdown: + while True: try: _ecu_resp = await OTAClientCall.status_call( ecu_contact.ecu_id, @@ -96,7 +82,7 @@ async def _polling_direct_subecu_status(self, ecu_contact: ECUContact): async def _polling_local_ecu_status(self): """Task entry for loop polling local ECU status.""" my_ecu_id = ecu_info.ecu_id - while not _otaclient_shutdown: + while True: with contextlib.suppress(Exception): status_report = self._local_ecu_status_reader.sync_msg() if status_report: From 83c4e49ce2c38936ce69cc56ddf84567deb3e961 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 13:26:30 +0000 Subject: [PATCH 031/114] status_monitor: minor cleanup --- src/otaclient/_status_monitor.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index c50a722b3..d5dab34ae 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -41,7 +41,6 @@ _otaclient_shutdown = False _status_report_queue: queue.Queue | None = None -_shm_status: SharedOTAClientStatusWriter | None = None def _global_shutdown(): @@ -51,9 +50,6 @@ def _global_shutdown(): if _status_report_queue: _status_report_queue.put_nowait(TERMINATE_SENTINEL) - if _shm_status: - _shm_status.atexit() - 
atexit.register(_global_shutdown) @@ -247,9 +243,7 @@ def __init__( self._shm_status = shm_status self._next_shm_push = 0 - # register the shm_status to global for cleanup atexit - global _shm_status - _shm_status = shm_status + atexit.register(shm_status.atexit) def load_report(self, report: StatusReport) -> None: _now = int(time.time()) From afc8ef505f0109438a7ac24ba7a520d1080ce47e Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 13:27:28 +0000 Subject: [PATCH 032/114] status_monitor: minor cleanup --- src/otaclient/_status_monitor.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index d5dab34ae..0b098e4e8 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -39,14 +39,10 @@ logger = logging.getLogger(__name__) -_otaclient_shutdown = False _status_report_queue: queue.Queue | None = None def _global_shutdown(): - global _otaclient_shutdown - _otaclient_shutdown = True - if _status_report_queue: _status_report_queue.put_nowait(TERMINATE_SENTINEL) @@ -287,7 +283,7 @@ def load_report(self, report: StatusReport) -> None: def _status_collector_thread(self) -> None: """Main entry of status monitor working thread.""" - while not _otaclient_shutdown: + while True: try: report = self._input_queue.get_nowait() if report is TERMINATE_SENTINEL: From 5ed81289e8a1a926dd86127ca5fdb3a5128599a5 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 14:02:21 +0000 Subject: [PATCH 033/114] main: refine --- src/otaclient/_otaproxy_ctx.py | 4 +- src/otaclient/main.py | 72 +++++++++++++++++----------------- 2 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index fd390b3bc..bd0b4b790 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -26,7 +26,6 @@ import multiprocessing.synchronize as mp_sync import shutil import sys 
-import threading import time from pathlib import Path from typing import Any, Optional, Type @@ -205,7 +204,7 @@ def otaproxy_control_thread( def shutdown_otaproxy_server() -> None: if _otaproxy_p and _otaproxy_p.is_alive(): - logger.info("shuting down otaproxy server process...") + print("shuting down otaproxy server process...") _otaproxy_p.terminate() _otaproxy_p.join() @@ -242,5 +241,6 @@ def shutdown_otaproxy_server() -> None: continue if not _otaproxy_should_run and _otaproxy_running: + logger.info("shutting down otaproxy as not needed now ...") shutdown_otaproxy_server() _otaproxy_p = None diff --git a/src/otaclient/main.py b/src/otaclient/main.py index d6c825d62..6bbf351fb 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -19,6 +19,7 @@ import atexit import logging import multiprocessing as mp +import multiprocessing.context as mp_ctx import multiprocessing.shared_memory as mp_shm import secrets import signal @@ -26,15 +27,12 @@ import threading import time from functools import partial -from typing import NoReturn from otaclient import __version__ from otaclient._utils import SharedOTAClientStatusReader, SharedOTAClientStatusWriter logger = logging.getLogger(__name__) -_on_shutdown_triggered = False - HEALTH_CHECK_INTERAVL = 6 # seconds SHUTDOWN_AFTER_CORE_EXIT = 16 # seconds SHUTDOWN_AFTER_API_SERVER_EXIT = 3 # seconds @@ -42,6 +40,34 @@ STATUS_SHM_SIZE = 4096 # bytes SHM_HMAC_KEY_LEN = 64 # bytes +_ota_core_p: mp_ctx.SpawnProcess | None = None +_grpc_server_p: mp_ctx.SpawnProcess | None = None +_shm: mp_shm.SharedMemory | None = None + + +def _on_shutdown() -> None: + global _ota_core_p, _grpc_server_p, _shm + if _ota_core_p: + _ota_core_p.terminate() + _ota_core_p.join() + _ota_core_p = None + + if _grpc_server_p: + _grpc_server_p.terminate() + _grpc_server_p.join() + _grpc_server_p = None + + if _shm: + _shm.close() + _shm.unlink() + _shm = None + + +def _signal_handler(signame, _) -> None: + logger.info(f"otaclient receives 
{signame=}, shutting down ...") + _on_shutdown() + sys.exit(1) + def main() -> None: from otaclient._logging import configure_logging @@ -64,42 +90,14 @@ def main() -> None: # # ------ start each processes ------ # # - _ota_core_p, _grpc_server_p = None, None - shm = None - - def _on_shutdown(signame=None, _=None): - global _on_shutdown_triggered - - if _on_shutdown_triggered: - return - _on_shutdown_triggered = True - - if signame: - logger.info( - f"otaclient main process receives {signame=}, shutting down ..." - ) - - if _ota_core_p: - _ota_core_p.terminate() - _ota_core_p.join() - - if _grpc_server_p: - _grpc_server_p.terminate() - _grpc_server_p.join() - - if shm: - shm.close() - shm.unlink() - - logger.info("otaclient shutdown") - sys.exit(1) + global _ota_core_p, _grpc_server_p, _shm atexit.register(_on_shutdown) - signal.signal(signal.SIGTERM, _on_shutdown) - signal.signal(signal.SIGINT, _on_shutdown) + signal.signal(signal.SIGTERM, _signal_handler) + signal.signal(signal.SIGINT, _signal_handler) mp_ctx = mp.get_context("spawn") - shm = mp_shm.SharedMemory(size=STATUS_SHM_SIZE, create=True) + _shm = mp_shm.SharedMemory(size=STATUS_SHM_SIZE, create=True) _key = secrets.token_bytes(SHM_HMAC_KEY_LEN) # shared queues and flags @@ -111,7 +109,7 @@ def _on_shutdown(signame=None, _=None): _ota_core_p = mp_ctx.Process( target=partial( ota_core_process, - partial(SharedOTAClientStatusWriter, name=shm.name, key=_key), + partial(SharedOTAClientStatusWriter, name=_shm.name, key=_key), local_otaclient_control_flag, local_otaclient_op_queue, ), @@ -122,7 +120,7 @@ def _on_shutdown(signame=None, _=None): _grpc_server_p = mp_ctx.Process( target=partial( grpc_server_process, - partial(SharedOTAClientStatusReader, name=shm.name, key=_key), + partial(SharedOTAClientStatusReader, name=_shm.name, key=_key), local_otaclient_control_flag, local_otaclient_op_queue, all_ecus_succeeded, From a9e1770044a6a111bb659ce89625da532253c9ef Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: 
Sun, 24 Nov 2024 14:04:22 +0000 Subject: [PATCH 034/114] add logging for grpc server startup --- src/otaclient/grpc/api_v2/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/otaclient/grpc/api_v2/main.py b/src/otaclient/grpc/api_v2/main.py index 2c211a032..00fa36825 100644 --- a/src/otaclient/grpc/api_v2/main.py +++ b/src/otaclient/grpc/api_v2/main.py @@ -39,6 +39,7 @@ def grpc_server_process( from otaclient._logging import configure_logging configure_logging() + logger.info("otaclient OTA API grpc server started") shm_reader = shm_reader_factory() atexit.register(shm_reader.atexit) @@ -71,8 +72,10 @@ async def _grpc_server_launcher(): v2_grpc.add_OtaClientServiceServicer_to_server( server=server, servicer=ota_client_service_v2 ) - server.add_insecure_port(f"{ecu_info.ip_addr}:{cfg.OTA_API_SERVER_PORT}") + _address_info = f"{ecu_info.ip_addr}:{cfg.OTA_API_SERVER_PORT}" + server.add_insecure_port(_address_info) + logger.info(f"launch grpc API server at {_address_info}") await server.start() try: await server.wait_for_termination() From 9b50fa908e3c50c537261979661cf7ee89bd9bc1 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 14:58:18 +0000 Subject: [PATCH 035/114] do not use logger in signal handler --- src/otaclient/main.py | 2 +- src/otaclient/ota_core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index 6bbf351fb..bff8264a5 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -64,7 +64,7 @@ def _on_shutdown() -> None: def _signal_handler(signame, _) -> None: - logger.info(f"otaclient receives {signame=}, shutting down ...") + print(f"otaclient receives {signame=}, shutting down ...") _on_shutdown() sys.exit(1) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index a52c15fc7..5ffbd2723 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -845,7 +845,7 @@ def main(self, op_queue: 
mp_queue.Queue[IPCRequest | IPCResponse]) -> NoReturn: def _sign_handler(signame, frame) -> NoReturn: - logger.info(f"ota_core process receives {signame=}, exits ...") + print(f"ota_core process receives {signame=}, exits ...") sys.exit(1) From cfee25c8360862b2557440ddd665a86b61489bac Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 15:05:09 +0000 Subject: [PATCH 036/114] ota_core: use two channels for req and resp --- src/otaclient/ota_core.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 5ffbd2723..58ecd1c5a 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -785,13 +785,18 @@ def rollback(self, request: RollbackRequestV2) -> None: failure_type=e.failure_type, ) - def main(self, op_queue: mp_queue.Queue[IPCRequest | IPCResponse]) -> NoReturn: + def main( + self, + *, + req_queue: mp_queue.Queue[IPCRequest], + resp_queue: mp_queue.Queue[IPCResponse], + ) -> NoReturn: """Main loop of ota_core process.""" _allow_request_after = 0 while True: _now = int(time.time()) try: - request = op_queue.get(timeout=OP_CHECK_INTERVAL) + request = req_queue.get(timeout=OP_CHECK_INTERVAL) except Empty: continue @@ -802,7 +807,7 @@ def main(self, op_queue: mp_queue.Queue[IPCRequest | IPCResponse]) -> NoReturn: f"reject {request}" ) logger.warning(_err_msg) - op_queue.put_nowait( + resp_queue.put_nowait( IPCResponse( res=IPCResEnum.REJECT_BUSY, msg=_err_msg, @@ -812,7 +817,7 @@ def main(self, op_queue: mp_queue.Queue[IPCRequest | IPCResponse]) -> NoReturn: elif isinstance(request, UpdateRequestV2): self._live_ota_status = OTAStatus.UPDATING self.update(request) - op_queue.put_nowait( + resp_queue.put_nowait( IPCResponse( res=IPCResEnum.ACCEPT, session_id=request.session_id, @@ -825,7 +830,7 @@ def main(self, op_queue: mp_queue.Queue[IPCRequest | IPCResponse]) -> NoReturn: ): self._live_ota_status = OTAStatus.FAILURE self.rollback(request) - 
op_queue.put_nowait( + resp_queue.put_nowait( IPCResponse( res=IPCResEnum.ACCEPT, session_id=request.session_id, @@ -835,7 +840,7 @@ def main(self, op_queue: mp_queue.Queue[IPCRequest | IPCResponse]) -> NoReturn: else: _err_msg = f"request is invalid: {request=}, {self._live_ota_status=}" logger.error(_err_msg) - op_queue.put_nowait( + resp_queue.put_nowait( IPCResponse( res=IPCResEnum.REJECT_OTHER, msg=_err_msg, @@ -852,7 +857,8 @@ def _sign_handler(signame, frame) -> NoReturn: def ota_core_process( shm_writer_factory: Callable[[], SharedOTAClientStatusWriter], control_flag: mp_sync.Event, - op_queue: mp_queue.Queue[IPCRequest | IPCResponse], + op_queue: mp_queue.Queue[IPCRequest], + resp_queue: mp_queue.Queue[IPCResponse], ): from otaclient._logging import configure_logging from otaclient.configs.cfg import proxy_info @@ -876,4 +882,4 @@ def ota_core_process( proxy=proxy_info.get_proxy_for_local_ota(), status_report_queue=_local_status_report_queue, ) - _ota_core.main(op_queue) + _ota_core.main(req_queue=op_queue, resp_queue=resp_queue) From 445d38e0de376892340f498c997ac020dfcce968 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 15:08:01 +0000 Subject: [PATCH 037/114] servicer: use two channels for req and resp --- src/otaclient/grpc/api_v2/servicer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/otaclient/grpc/api_v2/servicer.py b/src/otaclient/grpc/api_v2/servicer.py index bf8c8d377..f2a33dae9 100644 --- a/src/otaclient/grpc/api_v2/servicer.py +++ b/src/otaclient/grpc/api_v2/servicer.py @@ -51,7 +51,8 @@ class OTAClientAPIServicer: def __init__( self, ecu_status_storage: ECUStatusStorage, - ipc_queue: mp_queue.Queue[IPCRequest | IPCResponse], + op_queue: mp_queue.Queue[IPCRequest], + resp_queue: mp_queue.Queue[IPCResponse], *, control_flag: mp_sync.Event, ): @@ -61,7 +62,8 @@ def __init__( self.my_ecu_id = ecu_info.ecu_id self._otaclient_control_flag = control_flag - self._ipc_queue = ipc_queue + 
self._op_queue = op_queue + self._resp_queue = resp_queue self._ecu_status_storage = ecu_status_storage self._polling_waiter = self._ecu_status_storage.get_polling_waiter() @@ -106,9 +108,9 @@ async def _otaclient_control_flag_managing(self): # API servicer def _local_update(self, request: UpdateRequestV2) -> api_types.UpdateResponseEcu: - self._ipc_queue.put_nowait(request) + self._op_queue.put_nowait(request) try: - _req_response = self._ipc_queue.get(timeout=WAIT_FOR_ACK_TIMEOUT) + _req_response = self._resp_queue.get(timeout=WAIT_FOR_ACK_TIMEOUT) assert isinstance(_req_response, IPCResponse), "unexpected msg" assert ( _req_response.session_id == request.session_id @@ -215,9 +217,9 @@ async def update( def _local_rollback( self, rollback_request: RollbackRequestV2 ) -> api_types.RollbackResponseEcu: - self._ipc_queue.put_nowait(rollback_request) + self._op_queue.put_nowait(rollback_request) try: - _req_response = self._ipc_queue.get(timeout=WAIT_FOR_ACK_TIMEOUT) + _req_response = self._resp_queue.get(timeout=WAIT_FOR_ACK_TIMEOUT) assert isinstance( _req_response, IPCResponse ), f"unexpected response: {type(_req_response)}" From 42a7cb2e980fab6a0773cbb94e87f31ec0cf13fc Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 15:09:50 +0000 Subject: [PATCH 038/114] finish up channel split --- src/otaclient/grpc/api_v2/main.py | 8 +++++--- src/otaclient/main.py | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/otaclient/grpc/api_v2/main.py b/src/otaclient/grpc/api_v2/main.py index 00fa36825..84fc63240 100644 --- a/src/otaclient/grpc/api_v2/main.py +++ b/src/otaclient/grpc/api_v2/main.py @@ -32,7 +32,8 @@ def grpc_server_process( shm_reader_factory: Callable[[], SharedOTAClientStatusReader], control_flag: mp_sync.Event, - op_queue: mp_Queue[IPCRequest | IPCResponse], + op_queue: mp_Queue[IPCRequest], + resp_queue: mp_Queue[IPCResponse], all_ecus_succeeded: mp_sync.Event, any_requires_network: mp_sync.Event, ) -> NoReturn: # type: 
ignore @@ -62,8 +63,9 @@ async def _grpc_server_launcher(): ecu_tracker.start() api_servicer = OTAClientAPIServicer( - ecu_status_storage, - op_queue, + ecu_status_storage=ecu_status_storage, + op_queue=op_queue, + resp_queue=resp_queue, control_flag=control_flag, ) ota_client_service_v2 = OtaClientServiceV2(api_servicer) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index bff8264a5..ac5ff4d35 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -103,6 +103,7 @@ def main() -> None: # shared queues and flags local_otaclient_control_flag = mp_ctx.Event() local_otaclient_op_queue = mp_ctx.Queue() + local_otaclient_resp_queue = mp_ctx.Queue() all_ecus_succeeded = mp_ctx.Event() any_requires_network = mp_ctx.Event() @@ -112,6 +113,7 @@ def main() -> None: partial(SharedOTAClientStatusWriter, name=_shm.name, key=_key), local_otaclient_control_flag, local_otaclient_op_queue, + local_otaclient_resp_queue, ), name="otaclient_ota_core", ) @@ -123,6 +125,7 @@ def main() -> None: partial(SharedOTAClientStatusReader, name=_shm.name, key=_key), local_otaclient_control_flag, local_otaclient_op_queue, + local_otaclient_resp_queue, all_ecus_succeeded, any_requires_network, ), From 194c6cae5f7cfe7e0cf3e902512ba374f016d06c Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 15:22:21 +0000 Subject: [PATCH 039/114] do not wait otaproxy at OTAUpdater.__init__ method --- src/otaclient/ota_core.py | 67 ++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 58ecd1c5a..348689516 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -89,7 +89,8 @@ DOWNLOAD_REPORT_INTERVAL = 1 # second OP_CHECK_INTERVAL = 1 # second -HOLD_REQ_HANDLING_ON_ACK_REQUEST = 8 # seconds +HOLD_REQ_HANDLING_ON_ACK_REQUEST = 6 # seconds +WAIT_FOR_OTAPROXY_ONLINE = 3 * 60 # 3mins class OTAClientError(Exception): ... 
@@ -161,6 +162,24 @@ def __init__( status_report_queue: Queue[StatusReport], session_id: str, ) -> None: + status_report_queue.put_nowait( + StatusReport( + payload=OTAUpdatePhaseChangeReport( + new_update_phase=UpdatePhase.INITIALIZING, + trigger_timestamp=self.update_start_timestamp, + ), + session_id=self.session_id, + ) + ) + status_report_queue.put_nowait( + StatusReport( + payload=SetUpdateMetaReport( + update_firmware_version=version, + ), + session_id=self.session_id, + ) + ) + self.ca_chains_store = ca_chains_store self.session_id = session_id self._status_report_queue = status_report_queue @@ -187,19 +206,7 @@ def __init__( # ------ parse upper proxy ------ # logger.debug("configure proxy setting...") - proxies = {} - if upper_otaproxy: - logger.info( - f"use {upper_otaproxy} for local OTA update, " - f"wait for otaproxy@{upper_otaproxy} online..." - ) - ensure_otaproxy_start( - upper_otaproxy, - probing_timeout=cfg.DOWNLOAD_INACTIVE_TIMEOUT, - ) - # NOTE(20221013): check requests document for how to set proxy, - # we only support using http proxy here. 
- proxies["http"] = upper_otaproxy + self._upper_proxy = upper_otaproxy # ------ init updater implementation ------ # self._control_flag = control_flag @@ -210,24 +217,6 @@ def __init__( self.update_version = version self.update_start_timestamp = int(time.time()) - status_report_queue.put_nowait( - StatusReport( - payload=OTAUpdatePhaseChangeReport( - new_update_phase=UpdatePhase.INITIALIZING, - trigger_timestamp=self.update_start_timestamp, - ), - session_id=self.session_id, - ) - ) - status_report_queue.put_nowait( - StatusReport( - payload=SetUpdateMetaReport( - update_firmware_version=version, - ), - session_id=self.session_id, - ) - ) - # ------ init variables needed for update ------ # _url_base = urlparse(raw_url_base) _path = f"{_url_base.path.rstrip('/')}/" @@ -239,7 +228,9 @@ def __init__( hash_func=sha256, chunk_size=cfg.CHUNK_SIZE, cookies=cookies, - proxies=proxies, + # NOTE(20221013): check requests document for how to set proxy, + # we only support using http proxy here. + proxies={"http": upper_otaproxy} if upper_otaproxy else None, ) self._downloader_mapper: dict[int, Downloader] = {} @@ -412,6 +403,16 @@ def _execute_update(self): """Implementation of OTA updating.""" logger.info(f"execute local update({ecu_info.ecu_id=}): {self.update_version=}") + if _upper_proxy := self._upper_proxy: + logger.info( + f"use {_upper_proxy} for local OTA update, " + f"wait for otaproxy@{_upper_proxy} online..." 
+ ) + ensure_otaproxy_start( + _upper_proxy, + probing_timeout=WAIT_FOR_OTAPROXY_ONLINE, + ) + # ------ init, processing metadata ------ # logger.debug("process metadata.jwt...") self._status_report_queue.put_nowait( From 968afb6ef9cf00e1990e23cc192ab7ba59318cf6 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 15:26:07 +0000 Subject: [PATCH 040/114] ota_core: use thread for OTA operation executing --- src/otaclient/ota_core.py | 47 ++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 348689516..da70582be 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -815,9 +815,19 @@ def main( session_id=request.session_id, ) ) - elif isinstance(request, UpdateRequestV2): + continue + + if isinstance(request, UpdateRequestV2): self._live_ota_status = OTAStatus.UPDATING - self.update(request) + + _update_thread = threading.Thread( + target=self.update, + args=[request], + daemon=True, + name="ota_update_executor", + ) + _update_thread.start() + resp_queue.put_nowait( IPCResponse( res=IPCResEnum.ACCEPT, @@ -825,12 +835,22 @@ def main( ) ) _allow_request_after = _now + HOLD_REQ_HANDLING_ON_ACK_REQUEST - elif ( + continue + + if ( isinstance(request, RollbackRequestV2) and self._live_ota_status == OTAStatus.SUCCESS ): self._live_ota_status = OTAStatus.FAILURE - self.rollback(request) + + _rollback_thread = threading.Thread( + target=self.rollback, + args=[request], + daemon=True, + name="ota_rollback_executor", + ) + _rollback_thread.start() + resp_queue.put_nowait( IPCResponse( res=IPCResEnum.ACCEPT, @@ -838,16 +858,17 @@ def main( ) ) _allow_request_after = _now + HOLD_REQ_HANDLING_ON_ACK_REQUEST - else: - _err_msg = f"request is invalid: {request=}, {self._live_ota_status=}" - logger.error(_err_msg) - resp_queue.put_nowait( - IPCResponse( - res=IPCResEnum.REJECT_OTHER, - msg=_err_msg, - session_id=request.session_id, - ) + 
continue + + _err_msg = f"request is invalid: {request=}, {self._live_ota_status=}" + logger.error(_err_msg) + resp_queue.put_nowait( + IPCResponse( + res=IPCResEnum.REJECT_OTHER, + msg=_err_msg, + session_id=request.session_id, ) + ) def _sign_handler(signame, frame) -> NoReturn: From fef3a9c0197a6406e50fcf77e8b68177d00dcdf6 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 15:31:33 +0000 Subject: [PATCH 041/114] minor fix --- src/otaclient/ota_core.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index da70582be..9ce4407a8 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -162,13 +162,19 @@ def __init__( status_report_queue: Queue[StatusReport], session_id: str, ) -> None: + self.update_version = version + self.update_start_timestamp = int(time.time()) + self.ca_chains_store = ca_chains_store + self.session_id = session_id + self._status_report_queue = status_report_queue + status_report_queue.put_nowait( StatusReport( payload=OTAUpdatePhaseChangeReport( new_update_phase=UpdatePhase.INITIALIZING, trigger_timestamp=self.update_start_timestamp, ), - session_id=self.session_id, + session_id=session_id, ) ) status_report_queue.put_nowait( @@ -176,14 +182,10 @@ def __init__( payload=SetUpdateMetaReport( update_firmware_version=version, ), - session_id=self.session_id, + session_id=session_id, ) ) - self.ca_chains_store = ca_chains_store - self.session_id = session_id - self._status_report_queue = status_report_queue - # ------ define OTA temp paths ------ # self._ota_tmp_on_standby = Path(cfg.STANDBY_SLOT_MNT) / Path( cfg.OTA_TMP_STORE @@ -213,10 +215,6 @@ def __init__( self._boot_controller = boot_controller self._create_standby_cls = create_standby_cls - # ------ init update status ------ # - self.update_version = version - self.update_start_timestamp = int(time.time()) - # ------ init variables needed for update ------ # _url_base 
= urlparse(raw_url_base) _path = f"{_url_base.path.rstrip('/')}/" From d26518485a84e359ddf4696d5d07e551fde153a7 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 16:14:19 +0000 Subject: [PATCH 042/114] fix status_monitor --- src/otaclient/_status_monitor.py | 48 +++++++++++++++++++------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 0b098e4e8..2abfc98e0 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -24,7 +24,7 @@ from dataclasses import asdict, dataclass from enum import Enum, auto from threading import Thread -from typing import Union, cast +from typing import Literal, Union, cast from otaclient._types import ( FailureType, @@ -118,7 +118,7 @@ class StatusReport: # def _on_session_finished( status_storage: OTAClientStatus, payload: OTAStatusChangeReport -): +) -> Literal[True]: status_storage.session_id = "" status_storage.update_phase = UpdatePhase.INITIALIZING status_storage.update_meta = UpdateMeta() @@ -135,10 +135,12 @@ def _on_session_finished( status_storage.failure_reason = "" status_storage.failure_traceback = "" + return True + def _on_new_ota_session( status_storage: OTAClientStatus, payload: OTAStatusChangeReport -): +) -> Literal[True]: status_storage.ota_status = payload.new_ota_status status_storage.update_phase = UpdatePhase.INITIALIZING status_storage.update_meta = UpdateMeta() @@ -147,6 +149,8 @@ def _on_new_ota_session( status_storage.failure_type = FailureType.NO_FAILURE status_storage.failure_reason = "" + return True + def _on_update_phase_changed( status_storage: OTAClientStatus, payload: OTAUpdatePhaseChangeReport @@ -155,7 +159,7 @@ def _on_update_phase_changed( logger.warning( "attempt to update update_timing when no OTA update session on-going" ) - return + return False phase, trigger_timestamp = payload.new_update_phase, payload.trigger_timestamp if phase == 
UpdatePhase.PROCESSING_POSTUPDATE: @@ -168,14 +172,17 @@ def _on_update_phase_changed( update_timing.update_apply_start_timestamp = trigger_timestamp status_storage.update_phase = phase + return True -def _on_update_progress(status_storage: OTAClientStatus, payload: UpdateProgressReport): +def _on_update_progress( + status_storage: OTAClientStatus, payload: UpdateProgressReport +) -> bool: if (update_progress := status_storage.update_progress) is None: logger.warning( "attempt to update update_progress when no OTA update session on-going" ) - return + return False op = payload.operation if ( @@ -193,6 +200,7 @@ def _on_update_progress(status_storage: OTAClientStatus, payload: UpdateProgress update_progress.downloading_errors += payload.errors elif op == UpdateProgressReport.Type.APPLY_REMOVE_DELTA: update_progress.removed_files_num += payload.processed_file_num + return True def _on_update_meta(status_storage: OTAClientStatus, payload: SetUpdateMetaReport): @@ -202,7 +210,7 @@ def _on_update_meta(status_storage: OTAClientStatus, payload: SetUpdateMetaRepor logger.warning( "attempt to update update_meta when no OTA update session on-going" ) - return + return False _input = asdict(payload) for k, v in _input.items(): @@ -211,6 +219,7 @@ def _on_update_meta(status_storage: OTAClientStatus, payload: SetUpdateMetaRepor continue if v: setattr(update_meta, k, v) + return True # @@ -237,13 +246,10 @@ def __init__( self._input_queue = msg_queue self._status = None self._shm_status = shm_status - self._next_shm_push = 0 atexit.register(shm_status.atexit) - def load_report(self, report: StatusReport) -> None: - _now = int(time.time()) - + def load_report(self, report: StatusReport) -> bool: if self._status is None: self._status = OTAClientStatus() status_storage = self._status @@ -252,6 +258,7 @@ def load_report(self, report: StatusReport) -> None: # ------ update otaclient meta ------ # if isinstance(payload, SetOTAClientMetaReport): status_storage.firmware_version = 
payload.firmware_version + return True # ------ on session start/end ------ # if isinstance(payload, OTAStatusChangeReport): @@ -259,7 +266,6 @@ def load_report(self, report: StatusReport) -> None: if new_ota_status in [OTAStatus.UPDATING, OTAStatus.ROLLBACKING]: status_storage.session_id = report.session_id return _on_new_ota_session(status_storage, payload) - status_storage.session_id = "" # clear session if we are not in an OTA return _on_session_finished(status_storage, payload) @@ -267,28 +273,30 @@ def load_report(self, report: StatusReport) -> None: report_session_id = report.session_id if report_session_id != status_storage.session_id: logger.warning(f"drop reports from mismatched session: {report}") - return # drop invalid report + return False if isinstance(payload, OTAUpdatePhaseChangeReport): return _on_update_phase_changed(status_storage, payload) if isinstance(payload, UpdateProgressReport): return _on_update_progress(status_storage, payload) if isinstance(payload, SetUpdateMetaReport): return _on_update_meta(status_storage, payload) - - # ------ push status to shm ------ # - if _now > self._next_shm_push: - with contextlib.suppress(Exception): - self._shm_status.write_msg(self._status) - self._next_shm_push = _now + self.shm_push_interval + return False def _status_collector_thread(self) -> None: """Main entry of status monitor working thread.""" + _next_shm_push = 0 while True: + _now = int(time.time()) try: report = self._input_queue.get_nowait() if report is TERMINATE_SENTINEL: break - self.load_report(report) + + # ------ push status on load_report ------ # + if self.load_report(report) and self._status and _now > _next_shm_push: + with contextlib.suppress(Exception): + self._shm_status.write_msg(self._status) + _next_shm_push = _now + self.shm_push_interval except queue.Empty: time.sleep(self.min_collect_interval) From 809311a70eddc8bfcbb69d5c2772ab66108e6d93 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 16:19:40 +0000 
Subject: [PATCH 043/114] status_monitor: increase minimum shm write interval --- src/otaclient/_status_monitor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 2abfc98e0..11a862458 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -227,7 +227,7 @@ def _on_update_meta(status_storage: OTAClientStatus, payload: SetUpdateMetaRepor # TERMINATE_SENTINEL = cast(StatusReport, object()) -SHM_PUSH_INTERVAL = 1 +SHM_PUSH_INTERVAL = 0.5 class OTAClientStatusCollector: @@ -238,7 +238,7 @@ def __init__( shm_status: SharedOTAClientStatusWriter, *, min_collect_interval: int = 1, - shm_push_interval: int = SHM_PUSH_INTERVAL, + shm_push_interval: float = SHM_PUSH_INTERVAL, ) -> None: self.min_collect_interval = min_collect_interval self.shm_push_interval = shm_push_interval @@ -286,7 +286,7 @@ def _status_collector_thread(self) -> None: """Main entry of status monitor working thread.""" _next_shm_push = 0 while True: - _now = int(time.time()) + _now = time.time() try: report = self._input_queue.get_nowait() if report is TERMINATE_SENTINEL: From b983b9187d5a5689cb73e959faaa0ae6c01848a3 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 17:08:50 +0000 Subject: [PATCH 044/114] Squashed commit of the following: commit 452d3d6766fcf730558f7cca3a2c8e53b7f7dd10 Author: bodong.yang Date: Sun Nov 24 16:55:26 2024 +0000 simplify and cleanup otaproxy_ctx commit f3ee2e4193c23ebb2a0e3f709b0dc773ada0cd27 Author: bodong.yang Date: Sun Nov 24 16:32:06 2024 +0000 cleanup otaproxy.__init__ --- src/ota_proxy/__init__.py | 73 +--------------- src/otaclient/_otaproxy_ctx.py | 153 ++++++++++++++++----------------- 2 files changed, 75 insertions(+), 151 deletions(-) diff --git a/src/ota_proxy/__init__.py b/src/ota_proxy/__init__.py index 7e87bbf70..56893a364 100644 --- a/src/ota_proxy/__init__.py +++ b/src/ota_proxy/__init__.py @@ -13,16 +13,9 @@ # 
limitations under the License. -import asyncio -import logging -import multiprocessing -from abc import abstractmethod -from contextlib import AbstractContextManager -from functools import partial -from multiprocessing.context import SpawnProcess -from typing import Any, Callable, Coroutine, Dict, Optional, Protocol +from __future__ import annotations -from typing_extensions import ParamSpec, Self +import logging from .cache_control_header import OTAFileCacheControl from .config import config @@ -37,12 +30,8 @@ "OTACache", "OTAFileCacheControl", "config", - "OTAProxyContextProto", - "subprocess_otaproxy_launcher", ) -_P = ParamSpec("_P") - async def run_otaproxy( host: str, @@ -54,7 +43,7 @@ async def run_otaproxy( upper_proxy: str, enable_cache: bool, enable_https: bool, - external_cache: Optional[str] = None, + external_cache: str | None = None, ): import uvicorn @@ -80,59 +69,3 @@ async def run_otaproxy( ) _server = uvicorn.Server(_config) await _server.serve() - - -class OTAProxyContextProto(AbstractContextManager, Protocol): - @abstractmethod - def __init__(self, *args, **kwargs) -> None: ... - - @property - def extra_kwargs(self) -> Dict[str, Any]: - return {} - - @abstractmethod - def __enter__(self) -> Self: ... - - -def _subprocess_main( - subprocess_ctx: OTAProxyContextProto, - otaproxy_entry: Callable[..., Coroutine], -): - """Main entry for launching otaproxy server at subprocess.""" - import uvloop # NOTE: only import uvloop at subprocess - - uvloop.install() - with subprocess_ctx as ctx: - asyncio.run(otaproxy_entry(**ctx.extra_kwargs)) - - -def subprocess_otaproxy_launcher( - subprocess_ctx: OTAProxyContextProto, - otaproxy_entry: Callable[_P, Any] = run_otaproxy, -): - """ - Returns: - A callable main entry for launching otaproxy in subprocess. - """ - - def _inner(*args: _P.args, **kwargs: _P.kwargs) -> SpawnProcess: - """Helper method to launch otaproxy in subprocess. 
- - This method works like a wrapper and passthrough all args and kwargs - to the _subprocess_main function, and then execute the function in - a subprocess. - check _subprocess_main function for more details. - """ - # prepare otaproxy coro - _otaproxy_entry = partial(otaproxy_entry, *args, **kwargs) - - # run otaproxy in async loop in new subprocess - mp_ctx = multiprocessing.get_context("spawn") - otaproxy_subprocess = mp_ctx.Process( - target=partial(_subprocess_main, subprocess_ctx, _otaproxy_entry), - daemon=True, # kill otaproxy if the parent process exists - ) - otaproxy_subprocess.start() - return otaproxy_subprocess - - return _inner diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index bd0b4b790..955a840ff 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -14,32 +14,41 @@ """Control of the otaproxy server startup/shutdown. The API exposed by this module is meant to be controlled by otaproxy managing thread only. - -TODO: simplify this module! 
""" from __future__ import annotations +import asyncio import atexit import logging +import multiprocessing as mp +import multiprocessing.context as mp_ctx import multiprocessing.synchronize as mp_sync import shutil -import sys import time +from functools import partial from pathlib import Path -from typing import Any, Optional, Type - -from typing_extensions import Self -from ota_proxy import OTAProxyContextProto, subprocess_otaproxy_launcher from ota_proxy import config as local_otaproxy_cfg +from ota_proxy import run_otaproxy from otaclient.configs.cfg import cfg, proxy_info from otaclient_common import cmdhelper from otaclient_common.common import ensure_otaproxy_start logger = logging.getLogger(__name__) +_otaproxy_p: mp_ctx.SpawnProcess | None = None + + +def shutdown_otaproxy_server() -> None: + global _otaproxy_p + if _otaproxy_p: + _otaproxy_p.terminate() + _otaproxy_p.join() + _otaproxy_p = None + + OTAPROXY_CHECK_INTERVAL = 3 OTAPROXY_MIN_STARTUP_TIME = 60 """Keep otaproxy running at least 60 seconds after startup.""" @@ -48,7 +57,7 @@ SHUTDOWN_AFTER_API_SERVER_EXIT = 3 -class OTAProxyContext(OTAProxyContextProto): +class OTAProxyContext: EXTERNAL_CACHE_KEY = "external_cache" def __init__( @@ -68,36 +77,7 @@ def __init__( self._external_cache_dev_mp = external_cache_dev_mp self._external_cache_data_dir = external_cache_path - @property - def extra_kwargs(self) -> dict[str, Any]: - """Inject kwargs to otaproxy startup entry. - - Currently only inject if external cache storage is used. 
- """ - _res = {} - if self.external_cache_enabled and self._external_cache_activated: - _res[self.EXTERNAL_CACHE_KEY] = self._external_cache_data_dir - else: - _res.pop(self.EXTERNAL_CACHE_KEY, None) - return _res - - def _subprocess_init(self): - """Initializing the subprocess before launching it.""" - from otaclient._logging import configure_logging - - # configure logging for otaproxy subprocess - # NOTE: on otaproxy subprocess, we first set log level of the root logger - # to CRITICAL to filter out third_party libs' logging(requests, urllib3, etc.), - # and then set the ota_proxy logger to DEFAULT_LOG_LEVEL - configure_logging() - otaproxy_logger = logging.getLogger("ota_proxy") - - # wait for upper otaproxy if any - if self.upper_proxy: - otaproxy_logger.info(f"wait for {self.upper_proxy=} online...") - ensure_otaproxy_start(str(self.upper_proxy)) - - def _mount_external_cache_storage(self): + def _mount_external_cache_storage(self) -> None: # detect cache_dev on every startup _cache_dev = cmdhelper.get_dev_by_token( "LABEL", @@ -143,55 +123,64 @@ def _umount_external_cache_storage(self): finally: self.started = self._external_cache_activated = False - def __enter__(self) -> Self: + def __enter__(self) -> str | None: try: - self._subprocess_init() self._mount_external_cache_storage() - return self + if self._external_cache_activated: + return self._external_cache_data_dir except Exception as e: - # if subprocess init failed, directly let the process exit - logger.error(f"otaproxy subprocess init failed, exit: {e!r}") - sys.exit(1) + logger.warning(f"failed to enable external cache source: {e!r}") def __exit__( self, - __exc_type: Optional[Type[BaseException]], - __exc_value: Optional[BaseException], + __exc_type: type[BaseException] | None, + __exc_value: BaseException | None, __traceback, - ) -> Optional[bool]: + ): if __exc_type: _exc = __exc_value if __exc_value else __exc_type() logger.warning(f"exception during otaproxy shutdown: {_exc!r}") - # otaproxy 
post-shutdown cleanup: - # 1. umount external cache storage - self._umount_external_cache_storage() + return True # suppress exception + try: + # otaproxy post-shutdown cleanup: + # 1. umount external cache storage + self._umount_external_cache_storage() + except Exception as e: + logger.warning(f"failed to umount external cache source: {e!r}") + + +def otaproxy_process(*, init_cache: bool, enable_external_cache: bool) -> None: + from otaclient._logging import configure_logging + + configure_logging() + logger.info("otaproxy process started") -def start_otaproxy_server(*, init_cache: bool, enable_external_cache: bool = True): - _subprocess_entry = subprocess_otaproxy_launcher( - OTAProxyContext( - external_cache_enabled=enable_external_cache, - ) - ) host, port = ( str(proxy_info.local_ota_proxy_listen_addr), proxy_info.local_ota_proxy_listen_port, ) + upper_proxy = str(proxy_info.upper_ota_proxy or "") logger.info(f"will launch otaproxy at http://{host}:{port}, with {upper_proxy=}") - - _otaproxy_p = _subprocess_entry( - host=host, - port=port, - init_cache=init_cache, - cache_dir=local_otaproxy_cfg.BASE_DIR, - cache_db_f=local_otaproxy_cfg.DB_FILE, - upper_proxy=upper_proxy, - enable_cache=proxy_info.enable_local_ota_proxy_cache, - enable_https=proxy_info.gateway_otaproxy, - ) - logger.info("otaproxy started") - return _otaproxy_p + if upper_proxy: + logger.info(f"wait for {upper_proxy=} online...") + ensure_otaproxy_start(str(upper_proxy)) + + with OTAProxyContext(external_cache_enabled=enable_external_cache) as _cache_dir: + asyncio.run( + run_otaproxy( + host=host, + port=port, + init_cache=init_cache, + cache_dir=local_otaproxy_cfg.BASE_DIR, + cache_db_f=local_otaproxy_cfg.DB_FILE, + upper_proxy=upper_proxy, + enable_cache=proxy_info.enable_local_ota_proxy_cache, + enable_https=proxy_info.gateway_otaproxy, + external_cache=_cache_dir, + ) + ) def otaproxy_control_thread( @@ -199,28 +188,22 @@ def otaproxy_control_thread( any_requires_network: mp_sync.Event, 
all_ecus_succeeded: mp_sync.Event, ) -> None: # pragma: no cover - - _otaproxy_p = None - - def shutdown_otaproxy_server() -> None: - if _otaproxy_p and _otaproxy_p.is_alive(): - print("shuting down otaproxy server process...") - _otaproxy_p.terminate() - _otaproxy_p.join() - atexit.register(shutdown_otaproxy_server) + _mp_ctx = mp.get_context("spawn") + ota_cache_dir = Path(local_otaproxy_cfg.BASE_DIR) next_ota_cache_dir_checkpoint = 0 + global _otaproxy_p while True: + _now = time.time() time.sleep(OTAPROXY_CHECK_INTERVAL) _otaproxy_running = _otaproxy_p and _otaproxy_p.is_alive() _otaproxy_should_run = any_requires_network.is_set() if not _otaproxy_should_run and not _otaproxy_running: - _now = time.time() if ( _now > next_ota_cache_dir_checkpoint and all_ecus_succeeded.is_set() @@ -236,11 +219,19 @@ def shutdown_otaproxy_server() -> None: if _otaproxy_should_run and not _otaproxy_running: # NOTE: always try to re-use cache. If the cache dir is empty, otaproxy # will still init the cache even init_cache is False. 
- _otaproxy_p = start_otaproxy_server(init_cache=False) + _otaproxy_p = _mp_ctx.Process( + target=partial( + otaproxy_process, + init_cache=False, + enable_external_cache=True, + ), + name="otaproxy", + ) + _otaproxy_p.start() + next_ota_cache_dir_checkpoint = _now + OTAPROXY_MIN_STARTUP_TIME time.sleep(OTAPROXY_MIN_STARTUP_TIME) # prevent pre-mature shutdown continue if not _otaproxy_should_run and _otaproxy_running: logger.info("shutting down otaproxy as not needed now ...") shutdown_otaproxy_server() - _otaproxy_p = None From 03e5c93f68ca1d0d3dbd8a3739ec03e4abc662fa Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sun, 24 Nov 2024 17:15:51 +0000 Subject: [PATCH 045/114] main._on_shutdown: add sys_exit arg, if this func is called by atexit, DO NOT do sys.exit --- src/otaclient/main.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index ac5ff4d35..acdde0072 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -45,7 +45,7 @@ _shm: mp_shm.SharedMemory | None = None -def _on_shutdown() -> None: +def _on_shutdown(sys_exit: bool = False): global _ota_core_p, _grpc_server_p, _shm if _ota_core_p: _ota_core_p.terminate() @@ -62,11 +62,14 @@ def _on_shutdown() -> None: _shm.unlink() _shm = None + if sys_exit: + sys.exit(1) + def _signal_handler(signame, _) -> None: print(f"otaclient receives {signame=}, shutting down ...") - _on_shutdown() - sys.exit(1) + # do not sys.exit when we are already shutting down + _on_shutdown(sys_exit=True) def main() -> None: From 67462c2dd5a9817f9f53f9a10b33f486668bcf27 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 03:01:35 +0000 Subject: [PATCH 046/114] split ensure_* series helpers into otaclient_common.cmdhelper module --- .../boot_control/_slot_mnt_helper.py | 105 ++---------------- src/otaclient_common/cmdhelper.py | 99 +++++++++++++++++ 2 files changed, 109 insertions(+), 95 deletions(-) diff --git 
a/src/otaclient/boot_control/_slot_mnt_helper.py b/src/otaclient/boot_control/_slot_mnt_helper.py index 0d1d369ba..ae5055026 100644 --- a/src/otaclient/boot_control/_slot_mnt_helper.py +++ b/src/otaclient/boot_control/_slot_mnt_helper.py @@ -19,8 +19,6 @@ import logging import shutil from pathlib import Path -from subprocess import CalledProcessError -from time import sleep from otaclient.configs.cfg import cfg from otaclient_common import cmdhelper, replace_root @@ -28,91 +26,6 @@ logger = logging.getLogger(__name__) -MAX_RETRY_COUNT = 6 -RETRY_INTERVAL = 2 - - -def ensure_mount( - target: StrOrPath, mnt_point: StrOrPath, *, mount_func, raise_exception: bool -) -> None: # pragma: no cover - """Ensure the mounted on by our best. - - Raises: - If is True, raises the last failed attemp's CalledProcessError. - """ - for _retry in range(MAX_RETRY_COUNT + 1): - try: - mount_func(target=target, mount_point=mnt_point) - cmdhelper.is_target_mounted(mnt_point, raise_exception=True) - return - except CalledProcessError as e: - logger.error( - f"retry#{_retry} failed to mount {target} on {mnt_point}: {e!r}" - ) - logger.error(f"{e.stderr=}\n{e.stdout=}") - - if _retry >= MAX_RETRY_COUNT: - logger.error( - f"exceed max retry count mounting {target} on {mnt_point}, abort" - ) - if raise_exception: - raise - return - - sleep(RETRY_INTERVAL) - continue - - -def ensure_umount( - mnt_point: StrOrPath, *, ignore_error: bool -) -> None: # pragma: no cover - """Try to umount the at our best. - - Raises: - If is False, raises the last failed attemp's CalledProcessError. 
- """ - for _retry in range(MAX_RETRY_COUNT + 1): - try: - if not cmdhelper.is_target_mounted(mnt_point, raise_exception=False): - break - cmdhelper.umount(mnt_point, raise_exception=True) - except CalledProcessError as e: - logger.warning(f"retry#{_retry} failed to umount {mnt_point}: {e!r}") - logger.warning(f"{e.stderr}\n{e.stdout}") - - if _retry >= MAX_RETRY_COUNT: - logger.error(f"reached max retry on umounting {mnt_point}, abort") - if not ignore_error: - raise - return - - sleep(RETRY_INTERVAL) - continue - - -def ensure_mointpoint( - mnt_point: Path, *, ignore_error: bool -) -> None: # pragma: no cover - """Ensure the exists, has no mount on it and ready for mount. - - If the is valid, but we failed to umount any previous mounts on it, - we still keep use the mountpoint as later mount will override the previous one. - """ - if mnt_point.is_symlink() or not mnt_point.is_dir(): - mnt_point.unlink(missing_ok=True) - - if not mnt_point.exists(): - mnt_point.mkdir(exist_ok=True, parents=True) - return - - try: - ensure_umount(mnt_point, ignore_error=ignore_error) - except Exception: - logger.warning( - f"{mnt_point} still has other mounts on it, " - f"but still use {mnt_point} and override the previous mount" - ) - class SlotMountHelper: # pragma: no cover """Helper class that provides methods for mounting slots.""" @@ -148,10 +61,10 @@ def mount_standby(self) -> None: CalledProcessedError on the last failed attemp. 
""" logger.debug("mount standby slot rootfs dev...") - ensure_mointpoint(self.standby_slot_mount_point, ignore_error=True) - ensure_umount(self.standby_slot_dev, ignore_error=False) + cmdhelper.ensure_mointpoint(self.standby_slot_mount_point, ignore_error=True) + cmdhelper.ensure_umount(self.standby_slot_dev, ignore_error=False) - ensure_mount( + cmdhelper.ensure_mount( target=self.standby_slot_dev, mnt_point=self.standby_slot_mount_point, mount_func=cmdhelper.mount_rw, @@ -165,8 +78,8 @@ def mount_active(self) -> None: CalledProcessedError on the last failed attemp. """ logger.debug("mount active slot rootfs dev...") - ensure_mointpoint(self.active_slot_mount_point, ignore_error=True) - ensure_mount( + cmdhelper.ensure_mointpoint(self.active_slot_mount_point, ignore_error=True) + cmdhelper.ensure_mount( target=self.active_rootfs, mnt_point=self.active_slot_mount_point, mount_func=cmdhelper.bind_mount_ro, @@ -195,7 +108,7 @@ def prepare_standby_dev( erase_standby: bool = False, fslabel: str | None = None, ) -> None: - ensure_umount(self.standby_slot_dev, ignore_error=True) + cmdhelper.ensure_umount(self.standby_slot_dev, ignore_error=True) if erase_standby: return cmdhelper.mkfs_ext4(self.standby_slot_dev, fslabel=fslabel) @@ -206,5 +119,7 @@ def prepare_standby_dev( def umount_all(self, *, ignore_error: bool = True): logger.debug("unmount standby slot and active slot mount point...") - ensure_umount(self.active_slot_mount_point, ignore_error=ignore_error) - ensure_umount(self.standby_slot_mount_point, ignore_error=ignore_error) + cmdhelper.ensure_umount(self.active_slot_mount_point, ignore_error=ignore_error) + cmdhelper.ensure_umount( + self.standby_slot_mount_point, ignore_error=ignore_error + ) diff --git a/src/otaclient_common/cmdhelper.py b/src/otaclient_common/cmdhelper.py index 5ca2e9ef2..af2557598 100644 --- a/src/otaclient_common/cmdhelper.py +++ b/src/otaclient_common/cmdhelper.py @@ -23,6 +23,8 @@ import logging import sys +import time +from pathlib 
import Path from subprocess import CalledProcessError from typing import Literal, NoReturn @@ -455,3 +457,100 @@ def reboot(args: list[str] | None = None) -> NoReturn: # pragma: no cover logger.warning("system will reboot now!") subprocess_call(cmd, raise_exception=True) sys.exit(0) + + +MAX_RETRY_COUNT = 6 +RETRY_INTERVAL = 2 + + +def ensure_mount( + target: StrOrPath, + mnt_point: StrOrPath, + *, + mount_func, + raise_exception: bool, + max_retry: int = MAX_RETRY_COUNT, + retry_interval: int = RETRY_INTERVAL, +) -> None: # pragma: no cover + """Ensure the mounted on by our best. + + Raises: + If is True, raises the last failed attemp's CalledProcessError. + """ + for _retry in range(max_retry + 1): + try: + mount_func(target=target, mount_point=mnt_point) + is_target_mounted(mnt_point, raise_exception=True) + return + except CalledProcessError as e: + logger.error( + f"retry#{_retry} failed to mount {target} on {mnt_point}: {e!r}" + ) + logger.error(f"{e.stderr=}\n{e.stdout=}") + + if _retry >= max_retry: + logger.error( + f"exceed max retry count mounting {target} on {mnt_point}, abort" + ) + if raise_exception: + raise + return + + time.sleep(retry_interval) + continue + + +def ensure_umount( + mnt_point: StrOrPath, + *, + ignore_error: bool, + max_retry: int = MAX_RETRY_COUNT, + retry_interval: int = RETRY_INTERVAL, +) -> None: # pragma: no cover + """Try to umount the at our best. + + Raises: + If is False, raises the last failed attemp's CalledProcessError. 
+ """ + for _retry in range(max_retry + 1): + try: + if not is_target_mounted(mnt_point, raise_exception=False): + break + umount(mnt_point, raise_exception=True) + except CalledProcessError as e: + logger.warning(f"retry#{_retry} failed to umount {mnt_point}: {e!r}") + logger.warning(f"{e.stderr}\n{e.stdout}") + + if _retry >= max_retry: + logger.error(f"reached max retry on umounting {mnt_point}, abort") + if not ignore_error: + raise + return + + time.sleep(retry_interval) + continue + + +def ensure_mointpoint( + mnt_point: StrOrPath, *, ignore_error: bool +) -> None: # pragma: no cover + """Ensure the exists, has no mount on it and ready for mount. + + If the is valid, but we failed to umount any previous mounts on it, + we still keep use the mountpoint as later mount will override the previous one. + """ + mnt_point = Path(mnt_point) + if mnt_point.is_symlink() or not mnt_point.is_dir(): + mnt_point.unlink(missing_ok=True) + + if not mnt_point.exists(): + mnt_point.mkdir(exist_ok=True, parents=True) + return + + try: + ensure_umount(mnt_point, ignore_error=ignore_error) + except Exception: + logger.warning( + f"{mnt_point} still has other mounts on it, " + f"but still use {mnt_point} and override the previous mount" + ) From 21f47cf88527e94863dd0c891c0dda068e150b08 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 03:01:55 +0000 Subject: [PATCH 047/114] remove unused test_subprocess_launch_otaproxy --- .../test_subprocess_launch_otaproxy.py | 70 ------------------- 1 file changed, 70 deletions(-) delete mode 100644 tests/test_ota_proxy/test_subprocess_launch_otaproxy.py diff --git a/tests/test_ota_proxy/test_subprocess_launch_otaproxy.py b/tests/test_ota_proxy/test_subprocess_launch_otaproxy.py deleted file mode 100644 index 552f2ad44..000000000 --- a/tests/test_ota_proxy/test_subprocess_launch_otaproxy.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2022 TIER IV, INC. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import time -from pathlib import Path -from typing import Any, Dict - -from ota_proxy import OTAProxyContextProto, subprocess_otaproxy_launcher - - -class _DummyOTAProxyContext(OTAProxyContextProto): - def __init__(self, sentinel) -> None: - self.sentinel = sentinel - - @property - def extra_kwargs(self) -> Dict[str, Any]: - return {} - - def __enter__(self): - _subprocess_init(self.sentinel) - return self - - def __exit__(self, __exc_type, __exc_value, __traceback): - return - - -def _subprocess_init(_sentinel_file): - Path(_sentinel_file).touch() - - -def test_subprocess_start_otaproxy(tmp_path: Path): - # --- setup --- # - (ota_cache_dir := tmp_path / "ota-cache").mkdir(exist_ok=True) - ota_cache_db = ota_cache_dir / "cache_db" - subprocess_init_sentinel = tmp_path / "otaproxy_started" - - # --- execution --- # - _subprocess_entry = subprocess_otaproxy_launcher( - subprocess_ctx=_DummyOTAProxyContext(sentinel=str(subprocess_init_sentinel)) - ) - otaproxy_subprocess = _subprocess_entry( - host="127.0.0.1", - port=8082, - init_cache=True, - cache_dir=str(ota_cache_dir), - cache_db_f=str(ota_cache_db), - upper_proxy="", - enable_cache=True, - enable_https=False, - ) - time.sleep(3) # wait for subprocess to finish up initializing - - # --- assertion --- # - try: - assert otaproxy_subprocess.is_alive() - assert subprocess_init_sentinel.is_file() - finally: - otaproxy_subprocess.terminate() From 
667931f39adba296da124e8037c7cd13b932d795 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 03:05:57 +0000 Subject: [PATCH 048/114] move some otaproxy related settings to otaproxy.config --- src/ota_proxy/config.py | 5 +++++ src/otaclient/configs/_cfg_configurable.py | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/ota_proxy/config.py b/src/ota_proxy/config.py index a6c483877..49af2d894 100644 --- a/src/ota_proxy/config.py +++ b/src/ota_proxy/config.py @@ -53,8 +53,13 @@ class Config: TMP_FILE_PREFIX = "tmp" URL_BASED_HASH_PREFIX = "URL_" + # ------ external cache ------ # # the file extension for compressed files in external cache storage EXTERNAL_CACHE_STORAGE_COMPRESS_ALG = "zst" + EXTERNAL_CACHE_DEV_FSLABEL: str = "ota_cache_src" + EXTERNAL_CACHE_DATA_DNAME: str = "data" + """The cache blob storage is located at /data.""" + config = Config() diff --git a/src/otaclient/configs/_cfg_configurable.py b/src/otaclient/configs/_cfg_configurable.py index 40b56850b..78d0027d4 100644 --- a/src/otaclient/configs/_cfg_configurable.py +++ b/src/otaclient/configs/_cfg_configurable.py @@ -110,9 +110,7 @@ class _MultipleECUSettings(BaseModel): class _OTAProxySettings(BaseModel): OTAPROXY_ENABLE_EXTERNAL_CACHE: bool = True - EXTERNAL_CACHE_DEV_FSLABEL: str = "ota_cache_src" EXTERNAL_CACHE_DEV_MOUNTPOINT: str = f"{cfg_consts.MOUNT_SPACE}/external_cache" - EXTERNAL_CACHE_SRC_PATH: str = f"{EXTERNAL_CACHE_DEV_MOUNTPOINT}/data" class ConfigurableSettings(_OTAClientSettings, _MultipleECUSettings, _OTAProxySettings): From 0e94cdc12c181e11142e23e872a2f82362f99938 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 03:12:36 +0000 Subject: [PATCH 049/114] minor fix to cmdhelper.ensure_mointpoint --- src/otaclient_common/cmdhelper.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/otaclient_common/cmdhelper.py b/src/otaclient_common/cmdhelper.py index af2557598..48fd93c58 100644 --- 
a/src/otaclient_common/cmdhelper.py +++ b/src/otaclient_common/cmdhelper.py @@ -548,9 +548,12 @@ def ensure_mointpoint( return try: - ensure_umount(mnt_point, ignore_error=ignore_error) - except Exception: + ensure_umount(mnt_point, ignore_error=False) + except Exception as e: + if not ignore_error: + logger.error(f"failed to prepare {mnt_point=}: {e!r}") + raise logger.warning( - f"{mnt_point} still has other mounts on it, " - f"but still use {mnt_point} and override the previous mount" + f"failed to prepare {mnt_point=}: {e!r} \n" + f"But still use {mnt_point} and override the previous mount" ) From 8da3a33d502917e3b99bb6b1cf449f07484db22c Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 03:19:42 +0000 Subject: [PATCH 050/114] otaproxy: implement external cache helper module --- src/ota_proxy/external_cache.py | 64 +++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 src/ota_proxy/external_cache.py diff --git a/src/ota_proxy/external_cache.py b/src/ota_proxy/external_cache.py new file mode 100644 index 000000000..03ef57f94 --- /dev/null +++ b/src/ota_proxy/external_cache.py @@ -0,0 +1,64 @@ +# Copyright 2022 TIER IV, INC. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Implementation of mounting/umounting external cache.""" + + +from __future__ import annotations + +import logging + +from ota_proxy.config import config +from otaclient_common import cmdhelper +from otaclient_common.typing import StrOrPath + +logger = logging.getLogger(__name__) + + +def mount_external_cache( + mnt_point: StrOrPath, *, cache_dev_fslabel: str = config.EXTERNAL_CACHE_DEV_FSLABEL +) -> StrOrPath | None: + _cache_dev = cmdhelper.get_dev_by_token( + "LABEL", + cache_dev_fslabel, + raise_exception=False, + ) + if not _cache_dev: + return + + if len(_cache_dev) > 1: + logger.warning( + f"multiple external cache storage device found, use the first one: {_cache_dev[0]}" + ) + _cache_dev = _cache_dev[0] + logger.info(f"external cache dev detected at {_cache_dev}") + + cmdhelper.ensure_mointpoint(mnt_point, ignore_error=True) + try: + cmdhelper.ensure_mount( + target=_cache_dev, + mnt_point=mnt_point, + mount_func=cmdhelper.mount_ro, + raise_exception=True, + max_retry=3, + ) + return mnt_point + except Exception as e: + logger.warning(f"failed to mount external cache: {e!r}") + + +def umount_external_cache(mnt_point: StrOrPath) -> None: + try: + cmdhelper.ensure_umount(mnt_point, ignore_error=False) + except Exception as e: + logger.warning(f"failed to umount external cache {mnt_point=}: {e!r}") From 02a105cb089a7a1dba0ce591ab6067116e877c51 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 03:24:57 +0000 Subject: [PATCH 051/114] OTACache now takes external_cache_mnt_point instead --- src/ota_proxy/ota_cache.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/ota_proxy/ota_cache.py b/src/ota_proxy/ota_cache.py index c68ff7731..980f53615 100644 --- a/src/ota_proxy/ota_cache.py +++ b/src/ota_proxy/ota_cache.py @@ -108,7 +108,7 @@ def __init__( db_file: Optional[StrOrPath] = None, upper_proxy: str = "", enable_https: bool = False, - external_cache: Optional[str] = None, + 
external_cache_mnt_point: str | None = None, ): """Init ota_cache instance with configurations.""" logger.info( @@ -136,9 +136,14 @@ def __init__( thread_name_prefix="ota_cache_fileio_executor" ) - if external_cache and cache_enabled: - logger.info(f"external cache source is enabled at: {external_cache}") - self._external_cache = Path(external_cache) if external_cache else None + self._external_cache_data_dir = None + if external_cache_mnt_point and cache_enabled: + logger.info( + f"external cache source is enabled at: {external_cache_mnt_point}" + ) + self._external_cache_data_dir = ( + Path(external_cache_mnt_point) / cfg.EXTERNAL_CACHE_DATA_DNAME + ) self._storage_below_hard_limit_event = threading.Event() self._storage_below_soft_limit_event = threading.Event() @@ -443,13 +448,12 @@ async def _retrieve_file_by_cache( async def _retrieve_file_by_external_cache( self, client_cache_policy: OTAFileCacheControl - ) -> Optional[Tuple[AsyncIterator[bytes], Mapping[str, str]]]: - # skip if not external cache or otaclient doesn't sent valid file_sha256 - if not self._external_cache or not client_cache_policy.file_sha256: + ) -> tuple[AsyncIterator[bytes], Mapping[str, str]] | None: + if not self._external_cache_data_dir or not client_cache_policy.file_sha256: return cache_identifier = client_cache_policy.file_sha256 - cache_file = self._external_cache / cache_identifier + cache_file = self._external_cache_data_dir / cache_identifier cache_file_zst = cache_file.with_suffix( f".{cfg.EXTERNAL_CACHE_STORAGE_COMPRESS_ALG}" ) @@ -522,7 +526,7 @@ async def retrieve_file( # NOTE: if client requsts with retry_caching directive, it may indicate cache corrupted # in external cache storage, in such case we should skip the use of external cache. 
if ( - self._external_cache + self._external_cache_data_dir and not cache_policy.retry_caching and (_res := await self._retrieve_file_by_external_cache(cache_policy)) ): From 681e02f380fc86b3fe772afe40cc7852b46168cc Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 03:34:38 +0000 Subject: [PATCH 052/114] otaproxy.__main__: now otaproxy CLI will try to mount external cache by itself. --- src/ota_proxy/__init__.py | 5 +++-- src/ota_proxy/__main__.py | 30 ++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/ota_proxy/__init__.py b/src/ota_proxy/__init__.py index 56893a364..8491084e0 100644 --- a/src/ota_proxy/__init__.py +++ b/src/ota_proxy/__init__.py @@ -43,7 +43,7 @@ async def run_otaproxy( upper_proxy: str, enable_cache: bool, enable_https: bool, - external_cache: str | None = None, + external_cache_mnt_point: str | None = None, ): import uvicorn @@ -56,7 +56,7 @@ async def run_otaproxy( upper_proxy=upper_proxy, enable_https=enable_https, init_cache=init_cache, - external_cache=external_cache, + external_cache_mnt_point=external_cache_mnt_point, ) _config = uvicorn.Config( App(_ota_cache), @@ -65,6 +65,7 @@ async def run_otaproxy( log_level="error", lifespan="on", loop="uvloop", + # NOTE: must use h11, other http implementation will break HTTP proxy http="h11", ) _server = uvicorn.Server(_config) diff --git a/src/ota_proxy/__main__.py b/src/ota_proxy/__main__.py index ea99ce2ce..0a816af9c 100644 --- a/src/ota_proxy/__main__.py +++ b/src/ota_proxy/__main__.py @@ -13,8 +13,11 @@ # limitations under the License. 
+from __future__ import annotations + import argparse import asyncio +import atexit import logging import uvloop @@ -25,6 +28,17 @@ logger = logging.getLogger(__name__) if __name__ == "__main__": + from .external_cache import mount_external_cache, umount_external_cache + + _external_cache_mnt_point = None + + def _atexit() -> None: + global _external_cache_mnt_point + if _external_cache_mnt_point: + umount_external_cache(_external_cache_mnt_point) + + atexit.register(_atexit) + parser = argparse.ArgumentParser( prog="ota_proxy", formatter_class=argparse.ArgumentDefaultsHelpFormatter, @@ -66,15 +80,23 @@ default=cfg.DB_FILE, ) parser.add_argument( - "--external-cache-folder", + "--external-cache-mnt-point", help=( - "if specified, otaproxy will use the files in " - "this folder as extra cache source" + "if specified, otaproxy will try to detect external cache dev, " + "mount the dev on this mount point, and use the cache store in it." ), default=None, ) args = parser.parse_args() + _expected_mnt_point = args.external_cache_mnt_point + if _expected_mnt_point: + logger.info( + f"otaproxy will try to detect external cache dev and mount to {_expected_mnt_point}" + ) + if _loaded_mnt := mount_external_cache(_expected_mnt_point): + _external_cache_mnt_point = str(_loaded_mnt) + logger.info(f"launch ota_proxy at {args.host}:{args.port}") uvloop.install() asyncio.run( @@ -87,6 +109,6 @@ upper_proxy=args.upper_proxy, enable_https=args.enable_https, init_cache=args.init_cache, - external_cache=args.external_cache_folder, + external_cache_mnt_point=_external_cache_mnt_point, ) ) From 5cbc91eaa59fe6ec8b48ea66099fd7783c6e6587 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 04:05:24 +0000 Subject: [PATCH 053/114] otaproxy: now run_otaproxy will mount/umount external cache dev --- src/ota_proxy/__init__.py | 21 ++++++++++++++++++--- src/ota_proxy/__main__.py | 22 +--------------------- src/ota_proxy/external_cache.py | 10 +++++++++- 3 files changed, 28 
insertions(+), 25 deletions(-) diff --git a/src/ota_proxy/__init__.py b/src/ota_proxy/__init__.py index 8491084e0..e1025d598 100644 --- a/src/ota_proxy/__init__.py +++ b/src/ota_proxy/__init__.py @@ -15,7 +15,11 @@ from __future__ import annotations +import atexit import logging +from functools import partial + +from ota_proxy.external_cache import mount_external_cache, umount_external_cache from .cache_control_header import OTAFileCacheControl from .config import config @@ -38,8 +42,8 @@ async def run_otaproxy( port: int, *, init_cache: bool, - cache_dir: str, - cache_db_f: str, + cache_dir: str = config.BASE_DIR, + cache_db_f: str = config.DB_FILE, upper_proxy: str, enable_cache: bool, enable_https: bool, @@ -49,6 +53,17 @@ async def run_otaproxy( from . import App, OTACache + _loaded_mnt_point = None + if external_cache_mnt_point and ( + _loaded_mnt_point := mount_external_cache(external_cache_mnt_point) + ): + _loaded_mnt_point = str(_loaded_mnt_point) + + def _atexit(_mnt_point) -> None: + umount_external_cache(_mnt_point) + + atexit.register(partial(_atexit, _loaded_mnt_point)) + _ota_cache = OTACache( base_dir=cache_dir, db_file=cache_db_f, @@ -56,7 +71,7 @@ async def run_otaproxy( upper_proxy=upper_proxy, enable_https=enable_https, init_cache=init_cache, - external_cache_mnt_point=external_cache_mnt_point, + external_cache_mnt_point=_loaded_mnt_point, ) _config = uvicorn.Config( App(_ota_cache), diff --git a/src/ota_proxy/__main__.py b/src/ota_proxy/__main__.py index 0a816af9c..2c7aad4ff 100644 --- a/src/ota_proxy/__main__.py +++ b/src/ota_proxy/__main__.py @@ -17,7 +17,6 @@ import argparse import asyncio -import atexit import logging import uvloop @@ -28,17 +27,6 @@ logger = logging.getLogger(__name__) if __name__ == "__main__": - from .external_cache import mount_external_cache, umount_external_cache - - _external_cache_mnt_point = None - - def _atexit() -> None: - global _external_cache_mnt_point - if _external_cache_mnt_point: - 
umount_external_cache(_external_cache_mnt_point) - - atexit.register(_atexit) - parser = argparse.ArgumentParser( prog="ota_proxy", formatter_class=argparse.ArgumentDefaultsHelpFormatter, @@ -89,14 +77,6 @@ def _atexit() -> None: ) args = parser.parse_args() - _expected_mnt_point = args.external_cache_mnt_point - if _expected_mnt_point: - logger.info( - f"otaproxy will try to detect external cache dev and mount to {_expected_mnt_point}" - ) - if _loaded_mnt := mount_external_cache(_expected_mnt_point): - _external_cache_mnt_point = str(_loaded_mnt) - logger.info(f"launch ota_proxy at {args.host}:{args.port}") uvloop.install() asyncio.run( @@ -109,6 +89,6 @@ def _atexit() -> None: upper_proxy=args.upper_proxy, enable_https=args.enable_https, init_cache=args.init_cache, - external_cache_mnt_point=_external_cache_mnt_point, + external_cache_mnt_point=args.external_cache_mnt_point, ) ) diff --git a/src/ota_proxy/external_cache.py b/src/ota_proxy/external_cache.py index 03ef57f94..87edf60b7 100644 --- a/src/ota_proxy/external_cache.py +++ b/src/ota_proxy/external_cache.py @@ -28,12 +28,17 @@ def mount_external_cache( mnt_point: StrOrPath, *, cache_dev_fslabel: str = config.EXTERNAL_CACHE_DEV_FSLABEL ) -> StrOrPath | None: + logger.info( + f"otaproxy will try to detect external cache dev and mount to {mnt_point}" + ) + _cache_dev = cmdhelper.get_dev_by_token( "LABEL", cache_dev_fslabel, raise_exception=False, ) if not _cache_dev: + logger.info("no cache dev is attached") return if len(_cache_dev) > 1: @@ -43,8 +48,8 @@ def mount_external_cache( _cache_dev = _cache_dev[0] logger.info(f"external cache dev detected at {_cache_dev}") - cmdhelper.ensure_mointpoint(mnt_point, ignore_error=True) try: + cmdhelper.ensure_mointpoint(mnt_point, ignore_error=True) cmdhelper.ensure_mount( target=_cache_dev, mnt_point=mnt_point, @@ -52,6 +57,9 @@ def mount_external_cache( raise_exception=True, max_retry=3, ) + logger.info( + f"successfully mount external cache dev {_cache_dev} on 
{mnt_point}" + ) return mnt_point except Exception as e: logger.warning(f"failed to mount external cache: {e!r}") From f602e25fef196eb598ea24e63daf5d0d489e1292 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 04:08:49 +0000 Subject: [PATCH 054/114] _otaproxy_ctx: integrate --- src/otaclient/_otaproxy_ctx.py | 118 +++------------------------------ 1 file changed, 11 insertions(+), 107 deletions(-) diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index 955a840ff..84956918f 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -32,8 +32,8 @@ from ota_proxy import config as local_otaproxy_cfg from ota_proxy import run_otaproxy +from ota_proxy.config import config as otaproxy_cfg from otaclient.configs.cfg import cfg, proxy_info -from otaclient_common import cmdhelper from otaclient_common.common import ensure_otaproxy_start logger = logging.getLogger(__name__) @@ -50,112 +50,21 @@ def shutdown_otaproxy_server() -> None: OTAPROXY_CHECK_INTERVAL = 3 -OTAPROXY_MIN_STARTUP_TIME = 60 +OTAPROXY_MIN_STARTUP_TIME = 120 """Keep otaproxy running at least 60 seconds after startup.""" OTA_CACHE_DIR_CHECK_INTERVAL = 60 -SHUTDOWN_AFTER_CORE_EXIT = 16 -SHUTDOWN_AFTER_API_SERVER_EXIT = 3 - - -class OTAProxyContext: - EXTERNAL_CACHE_KEY = "external_cache" - - def __init__( - self, - *, - external_cache_enabled: bool = True, - external_cache_dev_fslable: str = cfg.EXTERNAL_CACHE_DEV_FSLABEL, - external_cache_dev_mp: str = cfg.EXTERNAL_CACHE_DEV_MOUNTPOINT, - external_cache_path: str = cfg.EXTERNAL_CACHE_SRC_PATH, - ) -> None: - self.upper_proxy = proxy_info.upper_ota_proxy - self.external_cache_enabled = external_cache_enabled - - self._external_cache_activated = False - self._external_cache_dev_fslabel = external_cache_dev_fslable - self._external_cache_dev = None # type: ignore[assignment] - self._external_cache_dev_mp = external_cache_dev_mp - self._external_cache_data_dir = external_cache_path - - def 
_mount_external_cache_storage(self) -> None: - # detect cache_dev on every startup - _cache_dev = cmdhelper.get_dev_by_token( - "LABEL", - self._external_cache_dev_fslabel, - raise_exception=False, - ) - if not _cache_dev: - return - - if len(_cache_dev) > 1: - logger.warning( - f"multiple external cache storage device found, use the first one: {_cache_dev[0]}" - ) - _cache_dev = _cache_dev[0] - logger.info(f"external cache dev detected at {_cache_dev}") - self._external_cache_dev = _cache_dev - # try to unmount the mount_point and cache_dev unconditionally - _mp = Path(self._external_cache_dev_mp) - cmdhelper.umount(_cache_dev, raise_exception=False) - _mp.mkdir(parents=True, exist_ok=True) - - try: - cmdhelper.mount_ro( - target=_cache_dev, mount_point=self._external_cache_dev_mp - ) - self._external_cache_activated = True - except Exception as e: - logger.warning( - f"failed to mount external cache dev({_cache_dev}) to {self._external_cache_dev_mp=}: {e!r}" - ) - - def _umount_external_cache_storage(self): - if not self._external_cache_activated or not self._external_cache_dev: - return - try: - cmdhelper.umount(self._external_cache_dev) - except Exception as e: - logger.warning( - f"failed to unmount external cache_dev {self._external_cache_dev}: {e!r}" - ) - finally: - self.started = self._external_cache_activated = False - - def __enter__(self) -> str | None: - try: - self._mount_external_cache_storage() - if self._external_cache_activated: - return self._external_cache_data_dir - except Exception as e: - logger.warning(f"failed to enable external cache source: {e!r}") - - def __exit__( - self, - __exc_type: type[BaseException] | None, - __exc_value: BaseException | None, - __traceback, - ): - if __exc_type: - _exc = __exc_value if __exc_value else __exc_type() - logger.warning(f"exception during otaproxy shutdown: {_exc!r}") - return True # suppress exception - - try: - # otaproxy post-shutdown cleanup: - # 1. 
umount external cache storage - self._umount_external_cache_storage() - except Exception as e: - logger.warning(f"failed to umount external cache source: {e!r}") - - -def otaproxy_process(*, init_cache: bool, enable_external_cache: bool) -> None: +def otaproxy_process(*, init_cache: bool) -> None: from otaclient._logging import configure_logging configure_logging() logger.info("otaproxy process started") + external_cache_mnt_point = None + if cfg.OTAPROXY_ENABLE_EXTERNAL_CACHE: + external_cache_mnt_point = cfg.EXTERNAL_CACHE_DEV_MOUNTPOINT + host, port = ( str(proxy_info.local_ota_proxy_listen_addr), proxy_info.local_ota_proxy_listen_port, @@ -167,7 +76,6 @@ def otaproxy_process(*, init_cache: bool, enable_external_cache: bool) -> None: logger.info(f"wait for {upper_proxy=} online...") ensure_otaproxy_start(str(upper_proxy)) - with OTAProxyContext(external_cache_enabled=enable_external_cache) as _cache_dir: asyncio.run( run_otaproxy( host=host, @@ -178,7 +86,7 @@ def otaproxy_process(*, init_cache: bool, enable_external_cache: bool) -> None: upper_proxy=upper_proxy, enable_cache=proxy_info.enable_local_ota_proxy_cache, enable_https=proxy_info.gateway_otaproxy, - external_cache=_cache_dir, + external_cache_mnt_point=external_cache_mnt_point, ) ) @@ -192,7 +100,7 @@ def otaproxy_control_thread( _mp_ctx = mp.get_context("spawn") - ota_cache_dir = Path(local_otaproxy_cfg.BASE_DIR) + ota_cache_dir = Path(otaproxy_cfg.BASE_DIR) next_ota_cache_dir_checkpoint = 0 global _otaproxy_p @@ -220,11 +128,7 @@ def otaproxy_control_thread( # NOTE: always try to re-use cache. If the cache dir is empty, otaproxy # will still init the cache even init_cache is False. 
_otaproxy_p = _mp_ctx.Process( - target=partial( - otaproxy_process, - init_cache=False, - enable_external_cache=True, - ), + target=partial(run_otaproxy, init_cache=False), name="otaproxy", ) _otaproxy_p.start() @@ -232,6 +136,6 @@ def otaproxy_control_thread( time.sleep(OTAPROXY_MIN_STARTUP_TIME) # prevent pre-mature shutdown continue - if not _otaproxy_should_run and _otaproxy_running: + if _otaproxy_p and _otaproxy_running and not _otaproxy_should_run: logger.info("shutting down otaproxy as not needed now ...") shutdown_otaproxy_server() From 0ee26210d18cf1f17477301a3d65cbaa46eeec10 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 04:13:18 +0000 Subject: [PATCH 055/114] minor fix --- src/otaclient/_otaproxy_ctx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index 84956918f..d3e14351d 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -128,7 +128,7 @@ def otaproxy_control_thread( # NOTE: always try to re-use cache. If the cache dir is empty, otaproxy # will still init the cache even init_cache is False. 
_otaproxy_p = _mp_ctx.Process( - target=partial(run_otaproxy, init_cache=False), + target=partial(otaproxy_process, init_cache=False), name="otaproxy", ) _otaproxy_p.start() From a18db9ed25b542d4bcfa6d29a0ed5a814dc53f06 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 04:16:21 +0000 Subject: [PATCH 056/114] minor fix --- src/otaclient/_otaproxy_ctx.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index d3e14351d..5f9e28989 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -76,19 +76,19 @@ def otaproxy_process(*, init_cache: bool) -> None: logger.info(f"wait for {upper_proxy=} online...") ensure_otaproxy_start(str(upper_proxy)) - asyncio.run( - run_otaproxy( - host=host, - port=port, - init_cache=init_cache, - cache_dir=local_otaproxy_cfg.BASE_DIR, - cache_db_f=local_otaproxy_cfg.DB_FILE, - upper_proxy=upper_proxy, - enable_cache=proxy_info.enable_local_ota_proxy_cache, - enable_https=proxy_info.gateway_otaproxy, - external_cache_mnt_point=external_cache_mnt_point, - ) + asyncio.run( + run_otaproxy( + host=host, + port=port, + init_cache=init_cache, + cache_dir=local_otaproxy_cfg.BASE_DIR, + cache_db_f=local_otaproxy_cfg.DB_FILE, + upper_proxy=upper_proxy, + enable_cache=proxy_info.enable_local_ota_proxy_cache, + enable_https=proxy_info.gateway_otaproxy, + external_cache_mnt_point=external_cache_mnt_point, ) + ) def otaproxy_control_thread( From 52cb24365ca0c25d5e8afed2dc367ad0ca8ab2d3 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 04:30:28 +0000 Subject: [PATCH 057/114] minor fix --- src/otaclient/configs/_cfg_configurable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otaclient/configs/_cfg_configurable.py b/src/otaclient/configs/_cfg_configurable.py index 78d0027d4..c1300d9fc 100644 --- a/src/otaclient/configs/_cfg_configurable.py +++ 
b/src/otaclient/configs/_cfg_configurable.py @@ -41,7 +41,7 @@ class _OTAClientSettings(BaseModel): "otaclient": "INFO", "otaclient_api": "INFO", "otaclient_common": "INFO", - "otaproxy": "INFO", + "ota_proxy": "INFO", } LOG_FORMAT: str = ( "[%(asctime)s][%(levelname)s]-%(name)s:%(funcName)s:%(lineno)d,%(message)s" From 31b3169cb4eccb6234ad26149593616ee59f4fd0 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 04:43:17 +0000 Subject: [PATCH 058/114] slot_mnt_helper: register atexit hooks for umounting mounted devs --- .../boot_control/_slot_mnt_helper.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/otaclient/boot_control/_slot_mnt_helper.py b/src/otaclient/boot_control/_slot_mnt_helper.py index ae5055026..f993b6977 100644 --- a/src/otaclient/boot_control/_slot_mnt_helper.py +++ b/src/otaclient/boot_control/_slot_mnt_helper.py @@ -16,6 +16,9 @@ from __future__ import annotations +import atexit +from codecs import ignore_errors +from functools import partial import logging import shutil from pathlib import Path @@ -54,6 +57,9 @@ def __init__( ) ) + self._standby_slot_atexit = None + self._active_slot_atexit = None + def mount_standby(self) -> None: """Mount standby slot dev rw to . @@ -71,6 +77,19 @@ def mount_standby(self) -> None: raise_exception=True, ) + # ensure the standby slot is umounted at termination + atexit.register( + ( + _standby_slot_atexit := partial( + cmdhelper.ensure_umount, + self.standby_slot_mount_point, + ignore_error=True, + max_retry=3, + ) + ) + ) + self._standby_slot_atexit = _standby_slot_atexit + def mount_active(self) -> None: """Mount current active rootfs ready-only. 
@@ -86,6 +105,16 @@ def mount_active(self) -> None: raise_exception=True, ) + # ensure the active slot is umounted at termination + atexit.register( + _active_slot_atexit := partial( + cmdhelper.ensure_umount, + self.active_slot_mount_point, + ignore_error=True, + ) + ) + self._active_slot_atexit = _active_slot_atexit + def preserve_ota_folder_to_standby(self): """Copy the /boot/ota folder to standby slot to preserve it. @@ -123,3 +152,9 @@ def umount_all(self, *, ignore_error: bool = True): cmdhelper.ensure_umount( self.standby_slot_mount_point, ignore_error=ignore_error ) + + # also unregister the atexit umount + if self._active_slot_atexit: + atexit.unregister(self._active_slot_atexit) + if self._standby_slot_atexit: + atexit.unregister(self._standby_slot_atexit) From 76b7ee295985289ab20085984ca01653b6489c8e Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 25 Nov 2024 04:50:49 +0000 Subject: [PATCH 059/114] slot_mnt_helper: keep the atexit hooks all the time --- .../boot_control/_slot_mnt_helper.py | 50 +++++++------------ 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/src/otaclient/boot_control/_slot_mnt_helper.py b/src/otaclient/boot_control/_slot_mnt_helper.py index f993b6977..868850b57 100644 --- a/src/otaclient/boot_control/_slot_mnt_helper.py +++ b/src/otaclient/boot_control/_slot_mnt_helper.py @@ -17,10 +17,9 @@ from __future__ import annotations import atexit -from codecs import ignore_errors -from functools import partial import logging import shutil +from functools import partial from pathlib import Path from otaclient.configs.cfg import cfg @@ -57,8 +56,22 @@ def __init__( ) ) - self._standby_slot_atexit = None - self._active_slot_atexit = None + # ensure the each mount points being umounted at termination + atexit.register( + partial( + cmdhelper.ensure_umount, + self.active_slot_mount_point, + ignore_error=True, + ) + ) + atexit.register( + partial( + cmdhelper.ensure_umount, + self.standby_slot_mount_point, + 
ignore_error=True, + max_retry=3, + ) + ) def mount_standby(self) -> None: """Mount standby slot dev rw to . @@ -77,19 +90,6 @@ def mount_standby(self) -> None: raise_exception=True, ) - # ensure the standby slot is umounted at termination - atexit.register( - ( - _standby_slot_atexit := partial( - cmdhelper.ensure_umount, - self.standby_slot_mount_point, - ignore_error=True, - max_retry=3, - ) - ) - ) - self._standby_slot_atexit = _standby_slot_atexit - def mount_active(self) -> None: """Mount current active rootfs ready-only. @@ -105,16 +105,6 @@ def mount_active(self) -> None: raise_exception=True, ) - # ensure the active slot is umounted at termination - atexit.register( - _active_slot_atexit := partial( - cmdhelper.ensure_umount, - self.active_slot_mount_point, - ignore_error=True, - ) - ) - self._active_slot_atexit = _active_slot_atexit - def preserve_ota_folder_to_standby(self): """Copy the /boot/ota folder to standby slot to preserve it. @@ -152,9 +142,3 @@ def umount_all(self, *, ignore_error: bool = True): cmdhelper.ensure_umount( self.standby_slot_mount_point, ignore_error=ignore_error ) - - # also unregister the atexit umount - if self._active_slot_atexit: - atexit.unregister(self._active_slot_atexit) - if self._standby_slot_atexit: - atexit.unregister(self._standby_slot_atexit) From e31859ea353d54f0011c6f409e5aa4b6673c4c17 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 01:44:34 +0000 Subject: [PATCH 060/114] revert src/ota_proxy to main branch's status for merge --- src/ota_proxy/__init__.py | 17 +---------------- src/ota_proxy/ota_cache.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/src/ota_proxy/__init__.py b/src/ota_proxy/__init__.py index e1025d598..7425d2811 100644 --- a/src/ota_proxy/__init__.py +++ b/src/ota_proxy/__init__.py @@ -15,11 +15,7 @@ from __future__ import annotations -import atexit import logging -from functools import partial - -from ota_proxy.external_cache import 
mount_external_cache, umount_external_cache from .cache_control_header import OTAFileCacheControl from .config import config @@ -53,17 +49,6 @@ async def run_otaproxy( from . import App, OTACache - _loaded_mnt_point = None - if external_cache_mnt_point and ( - _loaded_mnt_point := mount_external_cache(external_cache_mnt_point) - ): - _loaded_mnt_point = str(_loaded_mnt_point) - - def _atexit(_mnt_point) -> None: - umount_external_cache(_mnt_point) - - atexit.register(partial(_atexit, _loaded_mnt_point)) - _ota_cache = OTACache( base_dir=cache_dir, db_file=cache_db_f, @@ -71,7 +56,7 @@ def _atexit(_mnt_point) -> None: upper_proxy=upper_proxy, enable_https=enable_https, init_cache=init_cache, - external_cache_mnt_point=_loaded_mnt_point, + external_cache_mnt_point=external_cache_mnt_point, ) _config = uvicorn.Config( App(_ota_cache), diff --git a/src/ota_proxy/ota_cache.py b/src/ota_proxy/ota_cache.py index 980f53615..87efea741 100644 --- a/src/ota_proxy/ota_cache.py +++ b/src/ota_proxy/ota_cache.py @@ -37,6 +37,7 @@ from .config import config as cfg from .db import CacheMeta, check_db, init_db from .errors import BaseOTACacheError +from .external_cache import mount_external_cache, umount_external_cache from .lru_cache_helper import LRUCacheHelper from .utils import read_file, url_based_hash @@ -137,10 +138,16 @@ def __init__( ) self._external_cache_data_dir = None - if external_cache_mnt_point and cache_enabled: + self._external_cache_mp = None + if ( + cache_enabled + and external_cache_mnt_point + and mount_external_cache(external_cache_mnt_point) + ): logger.info( f"external cache source is enabled at: {external_cache_mnt_point}" ) + self._external_cache_mp = external_cache_mnt_point self._external_cache_data_dir = ( Path(external_cache_mnt_point) / cfg.EXTERNAL_CACHE_DATA_DNAME ) @@ -224,6 +231,9 @@ async def close(self): if self._cache_enabled: self._lru_helper.close() + if self._external_cache_mp: + umount_external_cache(self._external_cache_mp) + 
logger.info("shutdown ota-cache completed") def _background_check_free_space(self): From a9bf9e527303a4aa7fa150104ab7df5ad1019d77 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 01:45:20 +0000 Subject: [PATCH 061/114] revert src/otaclient_common/cmdhelper.py to main branch's status for merge --- src/otaclient_common/cmdhelper.py | 258 +++++++++++++++++------------- 1 file changed, 144 insertions(+), 114 deletions(-) diff --git a/src/otaclient_common/cmdhelper.py b/src/otaclient_common/cmdhelper.py index 48fd93c58..21af23af7 100644 --- a/src/otaclient_common/cmdhelper.py +++ b/src/otaclient_common/cmdhelper.py @@ -26,7 +26,7 @@ import time from pathlib import Path from subprocess import CalledProcessError -from typing import Literal, NoReturn +from typing import Literal, NoReturn, Protocol from otaclient_common.common import subprocess_call, subprocess_check_output from otaclient_common.typing import StrOrPath @@ -239,6 +239,103 @@ def set_ext4_fslabel( subprocess_call(cmd, raise_exception=raise_exception) +def mkfs_ext4( + dev: str, + *, + fslabel: str | None = None, + fsuuid: str | None = None, + raise_exception: bool = True, +) -> None: # pragma: no cover + """Create new ext4 formatted filesystem on , optionally with + and/or . + + Args: + dev (str): device to be formatted to ext4. + fslabel (Optional[str], optional): fslabel of the new ext4 filesystem. Defaults to None. + When it is None, this function will try to preserve the previous fslabel. + fsuuid (Optional[str], optional): fsuuid of the new ext4 filesystem. Defaults to None. + When it is None, this function will try to preserve the previous fsuuid. + raise_exception (bool, optional): raise exception on subprocess call failed. + Defaults to True. 
+ """ + cmd = ["mkfs.ext4", "-F"] + + if not fsuuid: + try: + fsuuid = get_attrs_by_dev("UUID", dev) + assert fsuuid + logger.debug(f"reuse previous UUID: {fsuuid}") + except Exception: + pass + if fsuuid: + logger.debug(f"using UUID: {fsuuid}") + cmd.extend(["-U", fsuuid]) + + if not fslabel: + try: + fslabel = get_attrs_by_dev("LABEL", dev) + assert fslabel + logger.debug(f"reuse previous fs LABEL: {fslabel}") + except Exception: + pass + if fslabel: + logger.debug(f"using fs LABEL: {fslabel}") + cmd.extend(["-L", fslabel]) + + cmd.append(dev) + logger.warning(f"format {dev} to ext4: {cmd=}") + subprocess_call(cmd, raise_exception=raise_exception) + + +def reboot(args: list[str] | None = None) -> NoReturn: # pragma: no cover + """Reboot the system, with optional args passed to reboot command. + + This is implemented by calling: + reboot [args[0], args[1], ...] + + NOTE(20230614): this command makes otaclient exit immediately. + NOTE(20240421): rpi_boot's reboot takes args. + + Args: + args (Optional[list[str]], optional): args passed to reboot command. + Defaults to None, not passing any args. + + Raises: + CalledProcessError for the reboot call, or SystemExit on sys.exit(0). + """ + cmd = ["reboot"] + if args: + logger.info(f"will reboot with argument: {args=}") + cmd.extend(args) + + logger.warning("system will reboot now!") + subprocess_call(cmd, raise_exception=True) + sys.exit(0) + + +# +# ------ mount related helpers ------ # +# + +MAX_RETRY_COUNT = 6 +RETRY_INTERVAL = 2 + + +class MountHelper(Protocol): + """Protocol for mount helper functions. + + This is for typing purpose. + """ + + def __call__( + self, + target: StrOrPath, + mount_point: StrOrPath, + *, + raise_exception: bool = True, + ) -> None: ... 
+ + def mount( target: StrOrPath, mount_point: StrOrPath, @@ -269,7 +366,7 @@ def mount( def mount_rw( - target: str, mount_point: StrOrPath, *, raise_exception: bool = True + target: StrOrPath, mount_point: StrOrPath, *, raise_exception: bool = True ) -> None: # pragma: no cover """Mount the to read-write. @@ -280,7 +377,7 @@ def mount_rw( mount events propagation to/from this mount point. Args: - target (str): target to be mounted. + target (StrOrPath): target to be mounted. mount_point (StrOrPath): mount point to mount to. raise_exception (bool, optional): raise exception on subprocess call failed. Defaults to True. @@ -290,7 +387,7 @@ def mount_rw( "mount", "-o", "rw", "--make-private", "--make-unbindable", - target, + str(target), str(mount_point), ] # fmt: on @@ -298,7 +395,7 @@ def mount_rw( def bind_mount_ro( - target: str, mount_point: StrOrPath, *, raise_exception: bool = True + target: StrOrPath, mount_point: StrOrPath, *, raise_exception: bool = True ) -> None: # pragma: no cover """Bind mount the to read-only. @@ -306,7 +403,7 @@ def bind_mount_ro( mount -o bind,ro --make-private --make-unbindable Args: - target (str): target to be mounted. + target (StrOrPath): target to be mounted. mount_point (StrOrPath): mount point to mount to. raise_exception (bool, optional): raise exception on subprocess call failed. Defaults to True. @@ -316,89 +413,15 @@ def bind_mount_ro( "mount", "-o", "bind,ro", "--make-private", "--make-unbindable", - target, + str(target), str(mount_point) ] # fmt: on subprocess_call(cmd, raise_exception=raise_exception) -def umount( - target: StrOrPath, *, raise_exception: bool = True -) -> None: # pragma: no cover - """Try to umount the . - - This is implemented by calling: - umount - - Before calling umount, the will be check whether it is mounted, - if it is not mounted, this function will return directly. - - Args: - target (StrOrPath): target to be umounted. 
- raise_exception (bool, optional): raise exception on subprocess call failed. - Defaults to True. - """ - # first try to check whether the target(either a mount point or a dev) - # is mounted - if not is_target_mounted(target, raise_exception=False): - return - - # if the target is mounted, try to unmount it. - _cmd = ["umount", str(target)] - subprocess_call(_cmd, raise_exception=raise_exception) - - -def mkfs_ext4( - dev: str, - *, - fslabel: str | None = None, - fsuuid: str | None = None, - raise_exception: bool = True, -) -> None: # pragma: no cover - """Create new ext4 formatted filesystem on , optionally with - and/or . - - Args: - dev (str): device to be formatted to ext4. - fslabel (Optional[str], optional): fslabel of the new ext4 filesystem. Defaults to None. - When it is None, this function will try to preserve the previous fslabel. - fsuuid (Optional[str], optional): fsuuid of the new ext4 filesystem. Defaults to None. - When it is None, this function will try to preserve the previous fsuuid. - raise_exception (bool, optional): raise exception on subprocess call failed. - Defaults to True. - """ - cmd = ["mkfs.ext4", "-F"] - - if not fsuuid: - try: - fsuuid = get_attrs_by_dev("UUID", dev) - assert fsuuid - logger.debug(f"reuse previous UUID: {fsuuid}") - except Exception: - pass - if fsuuid: - logger.debug(f"using UUID: {fsuuid}") - cmd.extend(["-U", fsuuid]) - - if not fslabel: - try: - fslabel = get_attrs_by_dev("LABEL", dev) - assert fslabel - logger.debug(f"reuse previous fs LABEL: {fslabel}") - except Exception: - pass - if fslabel: - logger.debug(f"using fs LABEL: {fslabel}") - cmd.extend(["-L", fslabel]) - - cmd.append(dev) - logger.warning(f"format {dev} to ext4: {cmd=}") - subprocess_call(cmd, raise_exception=raise_exception) - - def mount_ro( - *, target: str, mount_point: StrOrPath, raise_exception: bool = True + target: StrOrPath, mount_point: StrOrPath, *, raise_exception: bool = True ) -> None: # pragma: no cover """Mount to read-only. 
@@ -406,14 +429,16 @@ def mount_ro( if the target device is not mounted, we directly mount it to the mount_point. Args: - target (str): target to be mounted. + target (StrOrPath): target to be mounted. mount_point (StrOrPath): mount point to mount to. raise_exception (bool, optional): raise exception on subprocess call failed. Defaults to True. """ # NOTE: set raise_exception to false to allow not mounted # not mounted dev will have empty return str - if _active_mount_point := get_mount_point_by_dev(target, raise_exception=False): + if _active_mount_point := get_mount_point_by_dev( + str(target), raise_exception=False + ): bind_mount_ro( _active_mount_point, mount_point, @@ -426,48 +451,44 @@ def mount_ro( "mount", "-o", "ro", "--make-private", "--make-unbindable", - target, + str(target), str(mount_point), ] # fmt: on subprocess_call(cmd, raise_exception=raise_exception) -def reboot(args: list[str] | None = None) -> NoReturn: # pragma: no cover - """Reboot the system, with optional args passed to reboot command. +def umount( + target: StrOrPath, *, raise_exception: bool = True +) -> None: # pragma: no cover + """Try to umount the . This is implemented by calling: - reboot [args[0], args[1], ...] + umount - NOTE(20230614): this command makes otaclient exit immediately. - NOTE(20240421): rpi_boot's reboot takes args. + Before calling umount, the will be check whether it is mounted, + if it is not mounted, this function will return directly. Args: - args (Optional[list[str]], optional): args passed to reboot command. - Defaults to None, not passing any args. - - Raises: - CalledProcessError for the reboot call, or SystemExit on sys.exit(0). + target (StrOrPath): target to be umounted. + raise_exception (bool, optional): raise exception on subprocess call failed. + Defaults to True. 
""" - cmd = ["reboot"] - if args: - logger.info(f"will reboot with argument: {args=}") - cmd.extend(args) - - logger.warning("system will reboot now!") - subprocess_call(cmd, raise_exception=True) - sys.exit(0) - + # first try to check whether the target(either a mount point or a dev) + # is mounted + if not is_target_mounted(target, raise_exception=False): + return -MAX_RETRY_COUNT = 6 -RETRY_INTERVAL = 2 + # if the target is mounted, try to unmount it. + _cmd = ["umount", str(target)] + subprocess_call(_cmd, raise_exception=raise_exception) def ensure_mount( target: StrOrPath, mnt_point: StrOrPath, *, - mount_func, + mount_func: MountHelper, raise_exception: bool, max_retry: int = MAX_RETRY_COUNT, retry_interval: int = RETRY_INTERVAL, @@ -483,10 +504,13 @@ def ensure_mount( is_target_mounted(mnt_point, raise_exception=True) return except CalledProcessError as e: - logger.error( - f"retry#{_retry} failed to mount {target} on {mnt_point}: {e!r}" + logger.info( + ( + f"retry#{_retry} failed to mount {target} on {mnt_point}: {e!r}\n" + f"{e.stderr=}\n{e.stdout=}\n" + "retrying another mount ..." 
+ ) ) - logger.error(f"{e.stderr=}\n{e.stdout=}") if _retry >= max_retry: logger.error( @@ -518,8 +542,12 @@ def ensure_umount( break umount(mnt_point, raise_exception=True) except CalledProcessError as e: - logger.warning(f"retry#{_retry} failed to umount {mnt_point}: {e!r}") - logger.warning(f"{e.stderr}\n{e.stdout}") + logger.info( + ( + f"retry#{_retry} failed to umount {mnt_point}: {e!r}\n" + f"{e.stderr=}\n{e.stdout=}" + ) + ) if _retry >= max_retry: logger.error(f"reached max retry on umounting {mnt_point}, abort") @@ -554,6 +582,8 @@ def ensure_mointpoint( logger.error(f"failed to prepare {mnt_point=}: {e!r}") raise logger.warning( - f"failed to prepare {mnt_point=}: {e!r} \n" - f"But still use {mnt_point} and override the previous mount" + ( + f"failed to prepare {mnt_point=}: {e!r} \n" + f"But still use {mnt_point} and override the previous mount" + ) ) From 4648dc7be132838e443415055865075b53d160ab Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 01:55:24 +0000 Subject: [PATCH 062/114] do not install SIGINT handler; SIGNTERM handler now only raises a SystemExit exception --- src/otaclient/main.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index acdde0072..a604d8f67 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -68,8 +68,7 @@ def _on_shutdown(sys_exit: bool = False): def _signal_handler(signame, _) -> None: print(f"otaclient receives {signame=}, shutting down ...") - # do not sys.exit when we are already shutting down - _on_shutdown(sys_exit=True) + sys.exit(1) def main() -> None: @@ -97,7 +96,6 @@ def main() -> None: atexit.register(_on_shutdown) signal.signal(signal.SIGTERM, _signal_handler) - signal.signal(signal.SIGINT, _signal_handler) mp_ctx = mp.get_context("spawn") _shm = mp_shm.SharedMemory(size=STATUS_SHM_SIZE, create=True) From 8275cf99bd85a8ad987ae68aac3bc1ec126b5b11 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 
02:02:55 +0000 Subject: [PATCH 063/114] ota_core: do not replace the original SIGINT handler --- src/otaclient/ota_core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 9ce4407a8..bb201e794 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -885,7 +885,6 @@ def ota_core_process( from otaclient.ota_core import OTAClient signal.signal(signal.SIGTERM, _sign_handler) - signal.signal(signal.SIGINT, _sign_handler) configure_logging() shm_writer = shm_writer_factory() From fbb1272721a95011f62280b942b1d9089799c635 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 02:04:11 +0000 Subject: [PATCH 064/114] signame -> signal_value --- src/otaclient/main.py | 4 ++-- src/otaclient/ota_core.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index a604d8f67..e8b4169aa 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -66,8 +66,8 @@ def _on_shutdown(sys_exit: bool = False): sys.exit(1) -def _signal_handler(signame, _) -> None: - print(f"otaclient receives {signame=}, shutting down ...") +def _signal_handler(signal_value, _) -> None: + print(f"otaclient receives {signal_value=}, shutting down ...") sys.exit(1) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index bb201e794..775a294e5 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -869,8 +869,8 @@ def main( ) -def _sign_handler(signame, frame) -> NoReturn: - print(f"ota_core process receives {signame=}, exits ...") +def _sign_handler(signal_value, frame) -> NoReturn: + print(f"ota_core process receives {signal_value=}, exits ...") sys.exit(1) From 693402132915d2df350174418e0bc36c16b0f148 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 02:20:00 +0000 Subject: [PATCH 065/114] ota_core: not install SIGTERM handler for now --- src/otaclient/ota_core.py | 6 ------ 1 file changed, 6 
deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 775a294e5..2308e6a80 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -869,11 +869,6 @@ def main( ) -def _sign_handler(signal_value, frame) -> NoReturn: - print(f"ota_core process receives {signal_value=}, exits ...") - sys.exit(1) - - def ota_core_process( shm_writer_factory: Callable[[], SharedOTAClientStatusWriter], control_flag: mp_sync.Event, @@ -884,7 +879,6 @@ def ota_core_process( from otaclient.configs.cfg import proxy_info from otaclient.ota_core import OTAClient - signal.signal(signal.SIGTERM, _sign_handler) configure_logging() shm_writer = shm_writer_factory() From 66ae925f96ba6099b93da11d212de0db873b42b5 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 02:21:52 +0000 Subject: [PATCH 066/114] main: call on_shutdown on sigint and sigterm --- src/otaclient/main.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index e8b4169aa..289b06799 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -23,7 +23,6 @@ import multiprocessing.shared_memory as mp_shm import secrets import signal -import sys import threading import time from functools import partial @@ -45,7 +44,7 @@ _shm: mp_shm.SharedMemory | None = None -def _on_shutdown(sys_exit: bool = False): +def _on_shutdown(): global _ota_core_p, _grpc_server_p, _shm if _ota_core_p: _ota_core_p.terminate() @@ -62,13 +61,10 @@ def _on_shutdown(sys_exit: bool = False): _shm.unlink() _shm = None - if sys_exit: - sys.exit(1) - def _signal_handler(signal_value, _) -> None: print(f"otaclient receives {signal_value=}, shutting down ...") - sys.exit(1) + _on_shutdown() def main() -> None: @@ -96,6 +92,7 @@ def main() -> None: atexit.register(_on_shutdown) signal.signal(signal.SIGTERM, _signal_handler) + signal.signal(signal.SIGINT, _signal_handler) mp_ctx = mp.get_context("spawn") _shm = 
mp_shm.SharedMemory(size=STATUS_SHM_SIZE, create=True) From 732b5e18c7ac6e8f04b02b4c7136f291ae32ca90 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 02:27:15 +0000 Subject: [PATCH 067/114] main: still use sys.exit --- src/otaclient/main.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index 289b06799..1778e68fc 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -23,6 +23,7 @@ import multiprocessing.shared_memory as mp_shm import secrets import signal +import sys import threading import time from functools import partial @@ -44,7 +45,7 @@ _shm: mp_shm.SharedMemory | None = None -def _on_shutdown(): +def _on_shutdown(sys_exit: bool = False): global _ota_core_p, _grpc_server_p, _shm if _ota_core_p: _ota_core_p.terminate() @@ -61,10 +62,13 @@ def _on_shutdown(): _shm.unlink() _shm = None + if sys_exit: + sys.exit(1) + def _signal_handler(signal_value, _) -> None: print(f"otaclient receives {signal_value=}, shutting down ...") - _on_shutdown() + _on_shutdown(sys_exit=True) def main() -> None: @@ -157,11 +161,11 @@ def main() -> None: f"otaclient will exit in {SHUTDOWN_AFTER_CORE_EXIT}seconds ..." ) time.sleep(SHUTDOWN_AFTER_CORE_EXIT) - _on_shutdown() + return _on_shutdown() if not _grpc_server_p.is_alive(): logger.error( f"ota API server is dead, whole otaclient will exit in {SHUTDOWN_AFTER_API_SERVER_EXIT}seconds ..." ) time.sleep(SHUTDOWN_AFTER_API_SERVER_EXIT) - _on_shutdown() + return _on_shutdown() From 7f1c7e5b6c627187b2d694ec21e13565536f5d01 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Wed, 27 Nov 2024 02:30:53 +0000 Subject: [PATCH 068/114] Revert "ota_core: not install SIGTERM handler for now" This reverts commit 693402132915d2df350174418e0bc36c16b0f148. 
--- src/otaclient/ota_core.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 2308e6a80..775a294e5 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -869,6 +869,11 @@ def main( ) +def _sign_handler(signal_value, frame) -> NoReturn: + print(f"ota_core process receives {signal_value=}, exits ...") + sys.exit(1) + + def ota_core_process( shm_writer_factory: Callable[[], SharedOTAClientStatusWriter], control_flag: mp_sync.Event, @@ -879,6 +884,7 @@ def ota_core_process( from otaclient.configs.cfg import proxy_info from otaclient.ota_core import OTAClient + signal.signal(signal.SIGTERM, _sign_handler) configure_logging() shm_writer = shm_writer_factory() From 187d8dcc182105e24e0a636e676b97cc4f38f25e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Nov 2024 02:37:08 +0000 Subject: [PATCH 069/114] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/otaclient/_types.py | 1 + src/otaclient/grpc/api_v2/types.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index 32f2e4f10..c87393d83 100644 --- a/src/otaclient/_types.py +++ b/src/otaclient/_types.py @@ -20,6 +20,7 @@ from typing import ClassVar, Optional from _otaclient_version import __version__ + from otaclient.configs.cfg import ecu_info from otaclient_common.typing import StrEnum diff --git a/src/otaclient/grpc/api_v2/types.py b/src/otaclient/grpc/api_v2/types.py index 8ec31a9d9..208672248 100644 --- a/src/otaclient/grpc/api_v2/types.py +++ b/src/otaclient/grpc/api_v2/types.py @@ -18,7 +18,7 @@ import time -from otaclient._types import OTAClientStatus, OTAStatus, UpdateRequestV2, UpdateTiming +from otaclient._types import OTAClientStatus, OTAStatus, UpdateTiming from otaclient_api.v2 import types as api_types from 
otaclient_common.proto_wrapper import Duration From f1c187adaaf49a1bae99db59c00ccefba5fd5c71 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 01:28:21 +0000 Subject: [PATCH 070/114] ready for merging main branch's shm_status --- src/otaclient_common/shm_status.py | 125 +++++++++++++++++++++-------- 1 file changed, 90 insertions(+), 35 deletions(-) diff --git a/src/otaclient_common/shm_status.py b/src/otaclient_common/shm_status.py index b187d6af9..4dc7e025a 100644 --- a/src/otaclient_common/shm_status.py +++ b/src/otaclient_common/shm_status.py @@ -15,14 +15,16 @@ shared memory layout: -rwlock(1byte) | hmac-sha3_512(64bytes) | msg_len(4bytes,big) | msg(bytes) +rwlock(1byte) | hmac-sha512 of msg(64bytes) | msg_len(4bytes,big) | msg(bytes) In which, msg is pickled python object. """ from __future__ import annotations +import hashlib import hmac +import logging import multiprocessing.shared_memory as mp_shm import pickle import time @@ -30,41 +32,85 @@ from otaclient_common.typing import T -HASH_ALG = "sha3_512" -DEFAULT_KEY_LEN = 64 # bytes +logger = logging.getLogger(__name__) + +DEFAULT_HASH_ALG = "sha512" +DEFAULT_KEY_LEN = hashlib.new(DEFAULT_HASH_ALG).digest_size RWLOCK_LEN = 1 # byte -HMAC_SHA3_512_LEN = 64 # bytes PAYLOAD_LEN_BYTES = 4 # bytes -MIN_ENCAP_MSG_LEN = RWLOCK_LEN + HMAC_SHA3_512_LEN + PAYLOAD_LEN_BYTES RWLOCK_LOCKED = b"\xab" RWLOCK_OPEN = b"\x54" -class MPSharedStatusReader(Generic[T]): +class RWBusy(Exception): ... 
+ + +class SHA512Verifier: + """Base class for specifying hash alg related configurations.""" + + DIGEST_ALG = "sha512" + DIGEST_SIZE = hashlib.new(DIGEST_ALG).digest_size + MIN_ENCAP_MSG_LEN = RWLOCK_LEN + DIGEST_SIZE + PAYLOAD_LEN_BYTES + + _key: bytes + + def cal_hmac(self, _raw_msg: bytes) -> bytes: + return hmac.digest(key=self._key, msg=_raw_msg, digest=self.DIGEST_ALG) + + def verify_msg(self, _raw_msg: bytes, _expected_hmac: bytes) -> bool: + return hmac.compare_digest( + hmac.digest( + key=self._key, + msg=_raw_msg, + digest=self.DIGEST_ALG, + ), + _expected_hmac, + ) + + +def _ensure_connect_shm( + name: str, *, max_retry: int, retry_interval: int +) -> mp_shm.SharedMemory: + for _idx in range(max_retry): + try: + return mp_shm.SharedMemory(name=name, create=False) + except Exception as e: + logger.warning( + f"retry #{_idx}: failed to connect to {name=}: {e!r}, keep retrying ..." + ) + time.sleep(retry_interval) + raise ValueError(f"failed to connect share memory with {name=}") + + +class MPSharedStatusReader(SHA512Verifier, Generic[T]): def __init__( - self, *, name: str, key: bytes, max_retry: int = 6, retry_interval: int = 1 + self, + *, + name: str, + key: bytes, + max_retry: int = 6, + retry_interval: int = 1, ) -> None: - for _ in range(max_retry): - try: - self._shm = shm = mp_shm.SharedMemory(name=name, create=False) - break - except Exception: - print("retrying ...") - time.sleep(retry_interval) - else: - raise ValueError("failed to connect share memory") - + self._shm = shm = _ensure_connect_shm( + name, max_retry=max_retry, retry_interval=retry_interval + ) self.mem_size = size = shm.size - self.msg_max_size = size - MIN_ENCAP_MSG_LEN + self.msg_max_size = size - self.MIN_ENCAP_MSG_LEN self._key = key def atexit(self) -> None: self._shm.close() def sync_msg(self) -> T: + """Get msg from shared memory. + + Raises: + RWBusy if rwlock indicates the writer is writing or not yet ready. + ValueError for invalid msg. 
+ """ buffer = self._shm.buf # check if we can read @@ -72,13 +118,13 @@ def sync_msg(self) -> T: rwlock = bytes(buffer[_cursor:RWLOCK_LEN]) if rwlock != RWLOCK_OPEN: if rwlock == RWLOCK_LOCKED: - raise ValueError("write in progress, abort") - raise ValueError(f"invalid input_msg: wrong rwlock bytes: {rwlock=}") + raise RWBusy("write in progress, abort") + raise RWBusy("no msg has been written yet") _cursor += RWLOCK_LEN # parsing the msg - input_hmac = bytes(buffer[_cursor : _cursor + HMAC_SHA3_512_LEN]) - _cursor += HMAC_SHA3_512_LEN + input_hmac = bytes(buffer[_cursor : _cursor + self.DIGEST_SIZE]) + _cursor += self.DIGEST_SIZE _payload_len_bytes = bytes(buffer[_cursor : _cursor + PAYLOAD_LEN_BYTES]) payload_len = int.from_bytes(_payload_len_bytes, "big", signed=False) @@ -88,37 +134,42 @@ def sync_msg(self) -> T: raise ValueError(f"invalid msg: {payload_len=} > {self.msg_max_size}") payload = bytes(buffer[_cursor : _cursor + payload_len]) - payload_hmac = hmac.digest(key=self._key, msg=payload, digest=HASH_ALG) - - if hmac.compare_digest(payload_hmac, input_hmac): + if self.verify_msg(payload, input_hmac): return pickle.loads(payload) raise ValueError("failed to validate input msg") -class MPSharedStatusWriter(Generic[T]): +class MPSharedStatusWriter(SHA512Verifier, Generic[T]): def __init__( self, *, name: str | None = None, size: int = 0, + key: bytes, create: bool = False, msg_max_size: int | None = None, - key: bytes, + max_retry: int = 6, + retry_interval: int = 1, ) -> None: if create: - _msg_max_size = size - MIN_ENCAP_MSG_LEN + _msg_max_size = size - self.MIN_ENCAP_MSG_LEN if _msg_max_size < 0: - raise ValueError(f"{size=} < {MIN_ENCAP_MSG_LEN=}") + raise ValueError(f"{size=} < {self.MIN_ENCAP_MSG_LEN=}") self._shm = shm = mp_shm.SharedMemory(name=name, size=size, create=True) self.mem_size = shm.size - else: - self._shm = shm = mp_shm.SharedMemory(name=name, create=False) + + elif name: + self._shm = shm = _ensure_connect_shm( + name, 
max_retry=max_retry, retry_interval=retry_interval + ) self.mem_size = size = shm.size - _msg_max_size = size - MIN_ENCAP_MSG_LEN + _msg_max_size = size - self.MIN_ENCAP_MSG_LEN if _msg_max_size < 0: shm.close() - raise ValueError(f"{size=} < {MIN_ENCAP_MSG_LEN=}") + raise ValueError(f"{size=} < {self.MIN_ENCAP_MSG_LEN=}") + else: + raise ValueError(" must be specified if is False") self.name = shm.name self._key = key @@ -128,6 +179,11 @@ def atexit(self) -> None: self._shm.close() def write_msg(self, obj: T) -> None: + """Write msg to shared memory. + + Raises: + ValueError on invalid msg or exceeding shared memory size. + """ buffer = self._shm.buf _pickled = pickle.dumps(obj) _pickled_len = len(_pickled) @@ -135,11 +191,10 @@ def write_msg(self, obj: T) -> None: if _pickled_len > self.msg_max_size: raise ValueError(f"exceed {self.msg_max_size=}: {_pickled_len=}") - _hmac = hmac.digest(key=self._key, msg=_pickled, digest=HASH_ALG) msg = b"".join( [ RWLOCK_LOCKED, - _hmac, + self.cal_hmac(_pickled), _pickled_len.to_bytes(PAYLOAD_LEN_BYTES, "big", signed=False), _pickled, ] From 5645585adf32fefba46d4da266063a0cf4d39cc9 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 04:59:42 +0000 Subject: [PATCH 071/114] _types: add MultipleECUStatusFlags --- src/otaclient/_types.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index c87393d83..6e7722117 100644 --- a/src/otaclient/_types.py +++ b/src/otaclient/_types.py @@ -16,11 +16,11 @@ from __future__ import annotations +import multiprocessing.synchronize as mp_sync from dataclasses import dataclass from typing import ClassVar, Optional from _otaclient_version import __version__ - from otaclient.configs.cfg import ecu_info from otaclient_common.typing import StrEnum @@ -123,6 +123,13 @@ class OTAClientStatus: failure_traceback: str = "" +@dataclass +class MultipleECUStatusFlags: + any_in_update: mp_sync.Event + 
any_requires_network: mp_sync.Event + all_success: mp_sync.Event + + # # ------ OTA requests IPC ------ # # From 17a58c89d4b59b1938f33915c80c6135ca399f31 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 05:13:48 +0000 Subject: [PATCH 072/114] ecu_status: use MultipleECUStatusFlags --- src/otaclient/grpc/api_v2/ecu_status.py | 75 +++++++++++-------------- 1 file changed, 33 insertions(+), 42 deletions(-) diff --git a/src/otaclient/grpc/api_v2/ecu_status.py b/src/otaclient/grpc/api_v2/ecu_status.py index f2f21fb8b..11e9b7bcd 100644 --- a/src/otaclient/grpc/api_v2/ecu_status.py +++ b/src/otaclient/grpc/api_v2/ecu_status.py @@ -41,11 +41,12 @@ import asyncio import logging +import math import time from itertools import chain -from typing import TYPE_CHECKING, Dict, Iterable, Optional +from typing import Dict, Iterable, Optional -from otaclient._types import OTAClientStatus +from otaclient._types import MultipleECUStatusFlags, OTAClientStatus from otaclient.configs.cfg import cfg, ecu_info from otaclient.grpc.api_v2.types import convert_to_apiv2_status from otaclient_api.v2 import types as api_types @@ -53,9 +54,6 @@ logger = logging.getLogger(__name__) -if TYPE_CHECKING: - import multiprocessing.synchronize as mp_sync - # NOTE(20230522): # ECU will be treated as disconnected if we cannot get in touch with it # longer than * . 
@@ -88,11 +86,12 @@ class ECUStatusStorage: def __init__( self, *, - all_ecus_succeeded: mp_sync.Event, - any_requires_network: mp_sync.Event, + ecu_status_flags: MultipleECUStatusFlags, ) -> None: self.my_ecu_id = ecu_info.ecu_id self._writer_lock = asyncio.Lock() + + self.ecu_status_flags = ecu_status_flags # ECU status storage self.storage_last_updated_timestamp = 0 @@ -120,14 +119,13 @@ def __init__( ecu_info.get_available_ecu_ids() ) - self._all_ecus_status_v2: Dict[str, api_types.StatusResponseEcuV2] = {} - self._all_ecus_status_v1: Dict[str, api_types.StatusResponseEcu] = {} - self._all_ecus_last_contact_timestamp: Dict[str, int] = {} + self._all_ecus_status_v2: dict[str, api_types.StatusResponseEcuV2] = {} + self._all_ecus_status_v1: dict[str, api_types.StatusResponseEcu] = {} + self._all_ecus_last_contact_timestamp: dict[str, int] = {} # overall ECU status report self._properties_update_lock = asyncio.Lock() self.properties_last_update_timestamp = 0 - self.active_ota_update_present = asyncio.Event() self.lost_ecus_id = set() self.failed_ecus_id = set() @@ -137,10 +135,6 @@ def __init__( self.success_ecus_id = set() - # exposed external events - self.any_requires_network: mp_sync.Event = any_requires_network - self.all_success: mp_sync.Event = all_ecus_succeeded - # property update task # NOTE: _debug_properties_update_shutdown_event is for test only, # allow us to stop background task without changing codes. @@ -166,6 +160,7 @@ async def _generate_overall_status_report(self): NOTE: as special case, lost_ecus set is calculated against all reachable ECUs. 
""" self.properties_last_update_timestamp = cur_timestamp = int(time.time()) + ecu_status_flags = self.ecu_status_flags # check unreachable ECUs # NOTE(20230801): this property is calculated against all reachable ECUs, @@ -201,9 +196,9 @@ async def _generate_overall_status_report(self): f"{_new_in_update_ecu}, current updating ECUs: {in_update_ecus_id}" ) if in_update_ecus_id: - self.active_ota_update_present.set() + ecu_status_flags.any_in_update.set() else: - self.active_ota_update_present.clear() + ecu_status_flags.any_in_update.clear() # check if there is any failed child/self ECU in tracked active ECUs set _old_failed_ecus_id = self.failed_ecus_id @@ -232,12 +227,14 @@ async def _generate_overall_status_report(self): and status.ecu_id not in lost_ecus ) ): - self.any_requires_network.set() + ecu_status_flags.any_requires_network.set() else: - self.any_requires_network.clear() + ecu_status_flags.any_requires_network.clear() # check if all tracked active_ota_ecus are in SUCCESS ota_status - _old_all_success, _old_success_ecus_id = self.all_success, self.success_ecus_id + _old_all_success = ecu_status_flags.all_success.is_set() + _old_success_ecus_id = self.success_ecus_id + self.success_ecus_id = { status.ecu_id for status in chain( @@ -249,21 +246,15 @@ async def _generate_overall_status_report(self): } # NOTE: all_success doesn't count the lost ECUs if len(self.success_ecus_id) == len(self._tracked_active_ecus): - self.all_success.set() + ecu_status_flags.all_success.set() else: - self.all_success.clear() + ecu_status_flags.all_success.clear() if _new_success_ecu := self.success_ecus_id.difference(_old_success_ecus_id): logger.info(f"new succeeded ECU(s) detected: {_new_success_ecu}") - if not _old_all_success and self.all_success: + if ecu_status_flags.all_success.is_set() and not _old_all_success: logger.info("all ECUs in the cluster are in SUCCESS ota_status") - logger.debug( - "overall ECU status reporrt updated:" - f"{self.lost_ecus_id=}, 
{self.in_update_ecus_id=},{self.any_requires_network=}," - f"{self.failed_ecus_id=}, {self.success_ecus_id=}, {self.all_success=}" - ) - async def _loop_updating_properties(self): """ECU status storage's self generating overall ECU status report task. @@ -341,6 +332,7 @@ async def on_ecus_accept_update_request(self, ecus_accept_update: set[str]): their ota_status to UPDATING on-time due to status polling interval mismatch), the above set value will be kept for seconds. """ + ecu_status_flags = self.ecu_status_flags async with self._properties_update_lock: self._tracked_active_ecus = _OrderedSet(ecus_accept_update) @@ -350,12 +342,11 @@ async def on_ecus_accept_update_request(self, ecus_accept_update: set[str]): self.in_update_ecus_id.update(ecus_accept_update) self.in_update_child_ecus_id = self.in_update_ecus_id - {self.my_ecu_id} - - self.any_requires_network.set() - self.all_success.clear() self.success_ecus_id -= ecus_accept_update - self.active_ota_update_present.set() + ecu_status_flags.all_success.clear() + ecu_status_flags.any_requires_network.set() + ecu_status_flags.any_in_update.set() def get_polling_interval(self) -> int: """Return if there is active OTA update, @@ -364,9 +355,10 @@ def get_polling_interval(self) -> int: NOTE: use get_polling_waiter if want to wait, only call this method if one only wants to get the polling interval value. """ + ecu_status_flags = self.ecu_status_flags return ( ACTIVE_POLLING_INTERVAL - if self.active_ota_update_present.is_set() + if ecu_status_flags.any_in_update.is_set() else IDLE_POLLING_INTERVAL ) @@ -380,19 +372,18 @@ def get_polling_waiter(self): or self.active_ota_update_present is set, return when one of the condition is met. 
""" + _inner_wait_interval = 1 async def _waiter(): - if self.active_ota_update_present.is_set(): + ecu_status_flags = self.ecu_status_flags + if ecu_status_flags.any_in_update.is_set(): await asyncio.sleep(ACTIVE_POLLING_INTERVAL) return - try: - await asyncio.wait_for( - self.active_ota_update_present.wait(), - timeout=IDLE_POLLING_INTERVAL, - ) - except asyncio.TimeoutError: - return + for _ in range(math.ceil(IDLE_POLLING_INTERVAL / _inner_wait_interval)): + if ecu_status_flags.any_in_update.is_set(): + return + await asyncio.sleep(_inner_wait_interval) return _waiter From 95e300ba2f0aa17bc8a2cbe6e75352d7a8e9c51d Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 05:16:54 +0000 Subject: [PATCH 073/114] otaproxy_ctx: use MultipleECUStatusFlags --- src/otaclient/_otaproxy_ctx.py | 19 ++++++++----------- src/otaclient/ota_core.py | 8 -------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index 5f9e28989..8186bcbcf 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -24,7 +24,6 @@ import logging import multiprocessing as mp import multiprocessing.context as mp_ctx -import multiprocessing.synchronize as mp_sync import shutil import time from functools import partial @@ -33,6 +32,7 @@ from ota_proxy import config as local_otaproxy_cfg from ota_proxy import run_otaproxy from ota_proxy.config import config as otaproxy_cfg +from otaclient._types import MultipleECUStatusFlags from otaclient.configs.cfg import cfg, proxy_info from otaclient_common.common import ensure_otaproxy_start @@ -92,9 +92,7 @@ def otaproxy_process(*, init_cache: bool) -> None: def otaproxy_control_thread( - *, - any_requires_network: mp_sync.Event, - all_ecus_succeeded: mp_sync.Event, + ecu_status_flags: MultipleECUStatusFlags, ) -> None: # pragma: no cover atexit.register(shutdown_otaproxy_server) @@ -105,16 +103,17 @@ def otaproxy_control_thread( global _otaproxy_p 
while True: - _now = time.time() time.sleep(OTAPROXY_CHECK_INTERVAL) + _now = time.time() _otaproxy_running = _otaproxy_p and _otaproxy_p.is_alive() - _otaproxy_should_run = any_requires_network.is_set() + _otaproxy_should_run = ecu_status_flags.any_requires_network.is_set() + _all_success = ecu_status_flags.all_success.is_set() if not _otaproxy_should_run and not _otaproxy_running: if ( _now > next_ota_cache_dir_checkpoint - and all_ecus_succeeded.is_set() + and _all_success and ota_cache_dir.is_dir() ): logger.info( @@ -122,9 +121,8 @@ def otaproxy_control_thread( ) next_ota_cache_dir_checkpoint = _now + OTA_CACHE_DIR_CHECK_INTERVAL shutil.rmtree(ota_cache_dir, ignore_errors=True) - continue - if _otaproxy_should_run and not _otaproxy_running: + elif _otaproxy_should_run and not _otaproxy_running: # NOTE: always try to re-use cache. If the cache dir is empty, otaproxy # will still init the cache even init_cache is False. _otaproxy_p = _mp_ctx.Process( @@ -134,8 +132,7 @@ def otaproxy_control_thread( _otaproxy_p.start() next_ota_cache_dir_checkpoint = _now + OTAPROXY_MIN_STARTUP_TIME time.sleep(OTAPROXY_MIN_STARTUP_TIME) # prevent pre-mature shutdown - continue - if _otaproxy_p and _otaproxy_running and not _otaproxy_should_run: + elif _otaproxy_p and _otaproxy_running and not _otaproxy_should_run: logger.info("shutting down otaproxy as not needed now ...") shutdown_otaproxy_server() diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 775a294e5..8b97844e6 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -597,14 +597,6 @@ def execute(self): class OTAClient: - """ - Init params: - boot_controller: boot control instance - create_standby_cls: type of create standby slot mechanism to use - my_ecu_id: ECU id of the device running this otaclient instance - control_flag: flags used by otaclient and ota_service stub for synchronization - proxy: upper otaproxy URL - """ def __init__( self, From 
d9d791e398a600d2e3d02c26936981fcbf04efa2 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 06:02:30 +0000 Subject: [PATCH 074/114] api_v2.servicer: not manage otaclient control flags here --- src/otaclient/grpc/api_v2/servicer.py | 44 ++------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/src/otaclient/grpc/api_v2/servicer.py b/src/otaclient/grpc/api_v2/servicer.py index f2a33dae9..87f02d2f1 100644 --- a/src/otaclient/grpc/api_v2/servicer.py +++ b/src/otaclient/grpc/api_v2/servicer.py @@ -19,7 +19,6 @@ import asyncio import logging import multiprocessing.queues as mp_queue -import multiprocessing.synchronize as mp_sync from otaclient._types import ( IPCRequest, @@ -30,7 +29,7 @@ ) from otaclient._utils import gen_session_id from otaclient.configs import ECUContact -from otaclient.configs.cfg import cfg, ecu_info, proxy_info +from otaclient.configs.cfg import cfg, ecu_info from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage from otaclient_api.v2 import types as api_types from otaclient_api.v2.api_caller import ECUNoResponse, OTAClientCall @@ -50,61 +49,22 @@ class OTAClientAPIServicer: def __init__( self, + *, ecu_status_storage: ECUStatusStorage, op_queue: mp_queue.Queue[IPCRequest], resp_queue: mp_queue.Queue[IPCResponse], - *, - control_flag: mp_sync.Event, ): self.sub_ecus = ecu_info.secondaries self.listen_addr = ecu_info.ip_addr self.listen_port = cfg.OTA_API_SERVER_PORT self.my_ecu_id = ecu_info.ecu_id - self._otaclient_control_flag = control_flag self._op_queue = op_queue self._resp_queue = resp_queue self._ecu_status_storage = ecu_status_storage self._polling_waiter = self._ecu_status_storage.get_polling_waiter() - # otaproxy lifecycle and dependency managing - # NOTE: _debug_status_checking_shutdown_event is for test only, - # allow us to stop background task without changing codes. - # In normal running this event will never be set. 
- self._debug_status_checking_shutdown_event = asyncio.Event() - if proxy_info.enable_local_ota_proxy: - asyncio.create_task(self._otaclient_control_flag_managing()) - else: - # if otaproxy is not enabled, no dependency relationship will be formed, - # always allow local otaclient to reboot - self._otaclient_control_flag.set() - - # internal - - async def _otaclient_control_flag_managing(self): - """Task entry for set/clear otaclient control flags. - - Prevent self ECU from rebooting when their is at least one ECU - under UPDATING ota_status. - """ - while not self._debug_status_checking_shutdown_event.is_set(): - _can_reboot = self._otaclient_control_flag.is_set() - if not self._ecu_status_storage.in_update_child_ecus_id: - if not _can_reboot: - logger.info( - "local otaclient can reboot as no child ECU is in UPDATING ota_status" - ) - self._otaclient_control_flag.set() - else: - if _can_reboot: - logger.info( - f"local otaclient cannot reboot as child ECUs {self._ecu_status_storage.in_update_child_ecus_id}" - " are in UPDATING ota_status" - ) - self._otaclient_control_flag.clear() - await self._polling_waiter() - # API servicer def _local_update(self, request: UpdateRequestV2) -> api_types.UpdateResponseEcu: From a0cee327c5d71a82c7824180829aa577774ff148 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 06:03:10 +0000 Subject: [PATCH 075/114] api_v2.main: cleanup accordingly --- src/otaclient/grpc/api_v2/main.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/otaclient/grpc/api_v2/main.py b/src/otaclient/grpc/api_v2/main.py index 84fc63240..83a29b455 100644 --- a/src/otaclient/grpc/api_v2/main.py +++ b/src/otaclient/grpc/api_v2/main.py @@ -19,23 +19,21 @@ import asyncio import atexit import logging -import multiprocessing.synchronize as mp_sync from multiprocessing.queues import Queue as mp_Queue from typing import Callable, NoReturn -from otaclient._types import IPCRequest, IPCResponse +from otaclient._types 
import IPCRequest, IPCResponse, MultipleECUStatusFlags from otaclient._utils import SharedOTAClientStatusReader logger = logging.getLogger(__name__) def grpc_server_process( + *, shm_reader_factory: Callable[[], SharedOTAClientStatusReader], - control_flag: mp_sync.Event, op_queue: mp_Queue[IPCRequest], resp_queue: mp_Queue[IPCResponse], - all_ecus_succeeded: mp_sync.Event, - any_requires_network: mp_sync.Event, + ecu_status_flags: MultipleECUStatusFlags, ) -> NoReturn: # type: ignore from otaclient._logging import configure_logging @@ -55,10 +53,7 @@ async def _grpc_server_launcher(): from otaclient_api.v2 import otaclient_v2_pb2_grpc as v2_grpc from otaclient_api.v2.api_stub import OtaClientServiceV2 - ecu_status_storage = ECUStatusStorage( - all_ecus_succeeded=all_ecus_succeeded, - any_requires_network=any_requires_network, - ) + ecu_status_storage = ECUStatusStorage(ecu_status_flags=ecu_status_flags) ecu_tracker = ECUTracker(ecu_status_storage, shm_reader) ecu_tracker.start() @@ -66,7 +61,6 @@ async def _grpc_server_launcher(): ecu_status_storage=ecu_status_storage, op_queue=op_queue, resp_queue=resp_queue, - control_flag=control_flag, ) ota_client_service_v2 = OtaClientServiceV2(api_servicer) From d45dd27de63789ec2694da81f0d1fe8bab0d22ea Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 06:07:52 +0000 Subject: [PATCH 076/114] main: finish up integration --- src/otaclient/main.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index 1778e68fc..8b3f46036 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -29,6 +29,7 @@ from functools import partial from otaclient import __version__ +from otaclient._types import MultipleECUStatusFlags from otaclient._utils import SharedOTAClientStatusReader, SharedOTAClientStatusWriter logger = logging.getLogger(__name__) @@ -103,19 +104,23 @@ def main() -> None: _key = 
secrets.token_bytes(SHM_HMAC_KEY_LEN) # shared queues and flags - local_otaclient_control_flag = mp_ctx.Event() local_otaclient_op_queue = mp_ctx.Queue() local_otaclient_resp_queue = mp_ctx.Queue() - all_ecus_succeeded = mp_ctx.Event() - any_requires_network = mp_ctx.Event() + ecu_status_flags = MultipleECUStatusFlags( + any_in_update=mp_ctx.Event(), + any_requires_network=mp_ctx.Event(), + all_success=mp_ctx.Event(), + ) _ota_core_p = mp_ctx.Process( target=partial( ota_core_process, - partial(SharedOTAClientStatusWriter, name=_shm.name, key=_key), - local_otaclient_control_flag, - local_otaclient_op_queue, - local_otaclient_resp_queue, + shm_writer_factory=partial( + SharedOTAClientStatusWriter, name=_shm.name, key=_key + ), + ecu_status_flags=ecu_status_flags, + op_queue=local_otaclient_op_queue, + resp_queue=local_otaclient_resp_queue, ), name="otaclient_ota_core", ) @@ -124,12 +129,12 @@ def main() -> None: _grpc_server_p = mp_ctx.Process( target=partial( grpc_server_process, - partial(SharedOTAClientStatusReader, name=_shm.name, key=_key), - local_otaclient_control_flag, - local_otaclient_op_queue, - local_otaclient_resp_queue, - all_ecus_succeeded, - any_requires_network, + shm_reader_factory=partial( + SharedOTAClientStatusReader, name=_shm.name, key=_key + ), + op_queue=local_otaclient_op_queue, + resp_queue=local_otaclient_resp_queue, + ecu_status_flags=ecu_status_flags, ), name="otaclient_api_server", ) @@ -142,11 +147,7 @@ def main() -> None: _otaproxy_control_t = None if proxy_info.enable_local_ota_proxy: _otaproxy_control_t = threading.Thread( - target=partial( - otaproxy_control_thread, - any_requires_network=any_requires_network, - all_ecus_succeeded=all_ecus_succeeded, - ), + target=partial(otaproxy_control_thread, ecu_status_flags), daemon=True, name="otaclient_otaproxy_control_t", ) From 674ea3e8f1b38483f403fdf7d98985540feb4421 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 07:14:46 +0000 Subject: [PATCH 077/114] 
utils.wait_and_log: take a func that returns bool --- src/otaclient/_utils.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/otaclient/_utils.py b/src/otaclient/_utils.py index 05a99d0f6..6ac55ce49 100644 --- a/src/otaclient/_utils.py +++ b/src/otaclient/_utils.py @@ -22,9 +22,8 @@ import sys import time import traceback -from abc import abstractmethod from pathlib import Path -from typing import Callable, Protocol +from typing import Callable from otaclient._types import OTAClientStatus from otaclient_common._io import read_str_from_file, write_str_to_file_atomic @@ -34,16 +33,11 @@ logger = logging.getLogger(__name__) -class CheckableFlag(Protocol): - - @abstractmethod - def is_set(self) -> bool: ... - - def wait_and_log( - flag: CheckableFlag, + check_flag: Callable[[], bool], message: str = "", *, + check_for: bool = True, check_interval: int = 2, log_interval: int = 30, log_func: Callable[[str], None] = logger.info, @@ -51,7 +45,7 @@ def wait_and_log( """Wait for until it is set while print a log every .""" log_round = 0 for seconds in itertools.count(step=check_interval): - if flag.is_set(): + if check_flag() == check_for: return _new_log_round = seconds // log_interval From f1be14b067684b57fe0b43e9d6a19c9d1e3d4ce6 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 07:17:56 +0000 Subject: [PATCH 078/114] ota_core: use MultipleECUStatusFlags instead --- src/otaclient/ota_core.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 8b97844e6..df3e1bfff 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -58,6 +58,7 @@ IPCRequest, IPCResEnum, IPCResponse, + MultipleECUStatusFlags, OTAStatus, RollbackRequestV2, UpdatePhase, @@ -65,7 +66,7 @@ ) from otaclient._utils import SharedOTAClientStatusWriter, get_traceback, wait_and_log from otaclient.boot_control import 
BootControllerProtocol, get_boot_controller -from otaclient.configs.cfg import cfg, ecu_info +from otaclient.configs.cfg import cfg, ecu_info, proxy_info from otaclient.create_standby import ( StandbySlotCreatorProtocol, get_standby_slot_creator, @@ -158,7 +159,7 @@ def __init__( upper_otaproxy: str | None = None, boot_controller: BootControllerProtocol, create_standby_cls: Type[StandbySlotCreatorProtocol], - control_flag: mp_sync.Event, + ecu_status_flags: MultipleECUStatusFlags, status_report_queue: Queue[StatusReport], session_id: str, ) -> None: @@ -211,7 +212,7 @@ def __init__( self._upper_proxy = upper_otaproxy # ------ init updater implementation ------ # - self._control_flag = control_flag + self.ecu_status_flags = ecu_status_flags self._boot_controller = boot_controller self._create_standby_cls = create_standby_cls @@ -552,11 +553,16 @@ def _execute_update(self): session_id=self.session_id, ) ) - wait_and_log( - flag=self._control_flag, - message="permit reboot flag", - log_func=logger.info, - ) + + # NOTE: we don't need to wait for sub ECUs if sub ECUs don't + # depend on otaproxy on this ECU. 
+ if proxy_info.enable_local_ota_proxy: + wait_and_log( + check_flag=self.ecu_status_flags.any_requires_network.is_set, + check_for=False, + message="permit reboot flag", + log_func=logger.info, + ) logger.info(f"device will reboot in {WAIT_BEFORE_REBOOT} seconds!") time.sleep(WAIT_BEFORE_REBOOT) @@ -601,13 +607,13 @@ class OTAClient: def __init__( self, *, - control_flag: mp_sync.Event, + ecu_status_flags: MultipleECUStatusFlags, proxy: Optional[str] = None, status_report_queue: Queue[StatusReport], ) -> None: self.my_ecu_id = ecu_info.ecu_id self.proxy = proxy - self.control_flag = control_flag + self.ecu_status_flags = ecu_status_flags self._status_report_queue = status_report_queue self._live_ota_status = OTAStatus.INITIALIZED @@ -738,7 +744,7 @@ def update(self, request: UpdateRequestV2) -> None: ca_chains_store=self.ca_chains_store, boot_controller=self.boot_controller, create_standby_cls=self.create_standby_cls, - control_flag=self.control_flag, + ecu_status_flags=self.ecu_status_flags, upper_otaproxy=self.proxy, status_report_queue=self._status_report_queue, session_id=new_session_id, @@ -867,8 +873,9 @@ def _sign_handler(signal_value, frame) -> NoReturn: def ota_core_process( + *, shm_writer_factory: Callable[[], SharedOTAClientStatusWriter], - control_flag: mp_sync.Event, + ecu_status_flags: MultipleECUStatusFlags, op_queue: mp_queue.Queue[IPCRequest], resp_queue: mp_queue.Queue[IPCResponse], ): @@ -889,7 +896,7 @@ def ota_core_process( _status_monitor.start() _ota_core = OTAClient( - control_flag=control_flag, + ecu_status_flags=ecu_status_flags, proxy=proxy_info.get_proxy_for_local_ota(), status_report_queue=_local_status_report_queue, ) From aaa19849851ce4cb8ffb2a2fba70c0c263bbc42b Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 28 Nov 2024 07:19:10 +0000 Subject: [PATCH 079/114] fix up test_utils --- src/otaclient/_utils.py | 4 ++-- tests/test_otaclient/test_utils.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git 
a/src/otaclient/_utils.py b/src/otaclient/_utils.py index 6ac55ce49..32fb0b6ca 100644 --- a/src/otaclient/_utils.py +++ b/src/otaclient/_utils.py @@ -23,7 +23,7 @@ import time import traceback from pathlib import Path -from typing import Callable +from typing import Callable, Literal from otaclient._types import OTAClientStatus from otaclient_common._io import read_str_from_file, write_str_to_file_atomic @@ -37,7 +37,7 @@ def wait_and_log( check_flag: Callable[[], bool], message: str = "", *, - check_for: bool = True, + check_for: Literal[True] | Literal[False] = True, check_interval: int = 2, log_interval: int = 30, log_func: Callable[[str], None] = logger.info, diff --git a/tests/test_otaclient/test_utils.py b/tests/test_otaclient/test_utils.py index cf23594f9..0ffed4e0f 100644 --- a/tests/test_otaclient/test_utils.py +++ b/tests/test_otaclient/test_utils.py @@ -42,8 +42,9 @@ def test_wait_and_log(caplog: pytest.LogCaptureFixture): _msg = "ticking flag" wait_and_log( - _flag, + _flag.is_set, _msg, + check_for=True, check_interval=1, log_interval=2, log_func=logger.warning, From c2d8f5e2ff2a1d7dc000df1c59e21d86f5e2a11b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 07:23:08 +0000 Subject: [PATCH 080/114] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/otaclient/_types.py | 1 + src/otaclient/ota_core.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index 6e7722117..92981290d 100644 --- a/src/otaclient/_types.py +++ b/src/otaclient/_types.py @@ -21,6 +21,7 @@ from typing import ClassVar, Optional from _otaclient_version import __version__ + from otaclient.configs.cfg import ecu_info from otaclient_common.typing import StrEnum diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index df3e1bfff..88cec593b 100644 --- a/src/otaclient/ota_core.py 
+++ b/src/otaclient/ota_core.py @@ -19,7 +19,6 @@ import json import logging import multiprocessing.queues as mp_queue -import multiprocessing.synchronize as mp_sync import signal import sys import threading From c075d90cf8dd1557e663f255113119e415ef6301 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 29 Nov 2024 06:43:17 +0000 Subject: [PATCH 081/114] status_monitor: reduce min collect interval to 0.5 --- src/otaclient/_status_monitor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 11a862458..5626ec357 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -227,6 +227,7 @@ def _on_update_meta(status_storage: OTAClientStatus, payload: SetUpdateMetaRepor # TERMINATE_SENTINEL = cast(StatusReport, object()) +MIN_COLLECT_INTERVAL = 0.5 SHM_PUSH_INTERVAL = 0.5 @@ -237,7 +238,7 @@ def __init__( msg_queue: queue.Queue[StatusReport], shm_status: SharedOTAClientStatusWriter, *, - min_collect_interval: int = 1, + min_collect_interval: float = MIN_COLLECT_INTERVAL, shm_push_interval: float = SHM_PUSH_INTERVAL, ) -> None: self.min_collect_interval = min_collect_interval From 77c4e9483c099e93409c4eaa72118d6014e430ad Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Fri, 29 Nov 2024 06:47:28 +0000 Subject: [PATCH 082/114] grpc.api_v2.ECUTracker: reduce local active poll interval on startup to 0.1 --- src/otaclient/grpc/api_v2/ecu_tracker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/otaclient/grpc/api_v2/ecu_tracker.py b/src/otaclient/grpc/api_v2/ecu_tracker.py index 95f713aef..4641e8250 100644 --- a/src/otaclient/grpc/api_v2/ecu_tracker.py +++ b/src/otaclient/grpc/api_v2/ecu_tracker.py @@ -33,7 +33,8 @@ # actively polling ECUs status until we get the first valid response # when otaclient is just starting. 
-_active_polling_interval_on_startup = 1 +_ACTIVE_POLL_SUB_ON_STARTUP = 1 +_ACTIVE_POLL_LOCAL_ON_STARTUP = 0.1 class ECUTracker: @@ -75,7 +76,7 @@ async def _polling_direct_subecu_status(self, ecu_contact: ECUContact): ) if self._startup_matrix[this_ecu_id]: - await asyncio.sleep(_active_polling_interval_on_startup) + await asyncio.sleep(_ACTIVE_POLL_SUB_ON_STARTUP) else: await self._polling_waiter() @@ -90,7 +91,7 @@ async def _polling_local_ecu_status(self): await self._ecu_status_storage.update_from_local_ecu(status_report) if self._startup_matrix[my_ecu_id]: - await asyncio.sleep(_active_polling_interval_on_startup) + await asyncio.sleep(_ACTIVE_POLL_LOCAL_ON_STARTUP) else: await self._polling_waiter() From 2b72d57a1ebc0215d34df96e3c96d9dce51c4d53 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Sat, 30 Nov 2024 13:14:47 +0000 Subject: [PATCH 083/114] squashed merge from fix/ota_image_invalid_handling --- src/ota_metadata/legacy/parser.py | 23 ++++++++++++++++++++ src/ota_proxy/server_app.py | 26 +++++++++++----------- src/otaclient/ota_core.py | 36 +++++++++++++++++++++++++------ 3 files changed, 66 insertions(+), 19 deletions(-) diff --git a/src/ota_metadata/legacy/parser.py b/src/ota_metadata/legacy/parser.py index 9ac5a2dad..36f5669aa 100644 --- a/src/ota_metadata/legacy/parser.py +++ b/src/ota_metadata/legacy/parser.py @@ -47,6 +47,7 @@ import shutil import time from dataclasses import dataclass, fields +from http import HTTPStatus from os import PathLike from pathlib import Path from tempfile import NamedTemporaryFile, TemporaryDirectory @@ -70,6 +71,8 @@ from cryptography.hazmat.primitives import hashes from cryptography.hazmat.primitives.asymmetric.ec import ECDSA, EllipticCurvePublicKey from cryptography.x509 import load_pem_x509_certificate +from requests import Response +from requests import exceptions as requests_exc from typing_extensions import Self from ota_metadata.utils.cert_store import CAChainStore @@ -106,6 +109,14 @@ def _python_exit(): 
atexit.register(_python_exit) +class OTAImageInvalid(Exception): + """OTA image itself is incompleted or metadata is missing.""" + + +class OTARequestsAuthTokenInvalid(Exception): + """Hit 401 or 403 when downloading metadata.""" + + class MetadataJWTPayloadInvalid(Exception): """Raised when verification passed, but input metadata.jwt is invalid.""" @@ -672,6 +683,18 @@ def _process_metadata_jwt(self) -> _MetadataJWTClaimsLayout: ) break except Exception as e: + if isinstance(e, requests_exc.HTTPError) and isinstance( + (_response := e.response), Response + ): + if _response.status_code == HTTPStatus.NOT_FOUND: + raise OTAImageInvalid("failed to download metadata") from e + + if _response.status_code in [ + HTTPStatus.FORBIDDEN, + HTTPStatus.UNAUTHORIZED, + ]: + raise OTARequestsAuthTokenInvalid from e + logger.warning(f"failed to download {cert_info}, retrying: {e!r}") time.sleep(self.retry_interval) diff --git a/src/ota_proxy/server_app.py b/src/ota_proxy/server_app.py index 0ab4e467c..d1f815b30 100644 --- a/src/ota_proxy/server_app.py +++ b/src/ota_proxy/server_app.py @@ -43,11 +43,11 @@ from .ota_cache import OTACache logger = logging.getLogger(__name__) -connection_err_logger = logging.getLogger(f"{__name__}.connection_err") -# NOTE: for connection_error, only allow max 6 lines of logging per 30 seconds -connection_err_logger.addFilter( +burst_suppressed_logger = logging.getLogger(f"{__name__}.request_error") +# NOTE: for request_error, only allow max 6 lines of logging per 30 seconds +burst_suppressed_logger.addFilter( BurstSuppressFilter( - f"{__name__}.connection_err", + f"{__name__}.request_error", upper_logger_name=__name__, burst_round_length=30, burst_max=6, @@ -211,11 +211,11 @@ async def _error_handling_for_cache_retrieving(self, url: str, send): yield _is_succeeded _is_succeeded.set() except aiohttp.ClientResponseError as e: - logger.error(f"{_common_err_msg} due to HTTP error: {e!r}") + burst_suppressed_logger.error(f"{_common_err_msg} due to HTTP 
error: {e!r}") # passthrough 4xx(currently 403 and 404) to otaclient await self._respond_with_error(e.status, e.message, send) except aiohttp.ClientConnectionError as e: - connection_err_logger.error( + burst_suppressed_logger.error( f"{_common_err_msg} due to connection error: {e!r}" ) await self._respond_with_error( @@ -224,12 +224,14 @@ async def _error_handling_for_cache_retrieving(self, url: str, send): send, ) except aiohttp.ClientError as e: - logger.error(f"{_common_err_msg} due to aiohttp client error: {e!r}") + burst_suppressed_logger.error( + f"{_common_err_msg} due to aiohttp client error: {e!r}" + ) await self._respond_with_error( HTTPStatus.SERVICE_UNAVAILABLE, f"client error: {e!r}", send ) except (BaseOTACacheError, StopAsyncIteration) as e: - logger.error( + burst_suppressed_logger.error( f"{_common_err_msg} due to handled ota_cache internal error: {e!r}" ) await self._respond_with_error( @@ -238,7 +240,7 @@ async def _error_handling_for_cache_retrieving(self, url: str, send): except Exception as e: # exceptions rather than aiohttp error indicates # internal errors of ota_cache - logger.exception( + burst_suppressed_logger.exception( f"{_common_err_msg} due to unhandled ota_cache internal error: {e!r}" ) await self._respond_with_error( @@ -255,13 +257,13 @@ async def _error_handling_during_transferring(self, url: str, send): try: yield except (BaseOTACacheError, StopAsyncIteration) as e: - logger.error( + burst_suppressed_logger.error( f"{_common_err_msg=} due to handled ota_cache internal error: {e!r}" ) await self._send_chunk(b"", False, send) except Exception as e: # unexpected internal errors of ota_cache - logger.exception( + burst_suppressed_logger.error( f"{_common_err_msg=} due to unhandled ota_cache internal error: {e!r}" ) await self._send_chunk(b"", False, send) @@ -292,7 +294,7 @@ async def _pull_data_and_send(self, url: str, scope, send): # retrieve_file executed successfully, but return nothing if _is_succeeded.is_set(): _msg = 
f"failed to retrieve fd for {url} from otacache" - logger.warning(_msg) + burst_suppressed_logger.warning(_msg) await self._respond_with_error( HTTPStatus.INTERNAL_SERVER_ERROR, _msg, send ) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index e3d7aa86b..97dfcc713 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -34,6 +34,7 @@ from urllib.parse import urlparse import requests.exceptions as requests_exc +from requests import Response from ota_metadata.legacy import parser as ota_metadata_parser from ota_metadata.legacy import types as ota_metadata_types @@ -79,7 +80,10 @@ DownloadPoolWatchdogFuncContext, ) from otaclient_common.persist_file_handling import PersistFilesHandler -from otaclient_common.retry_task_map import ThreadPoolExecutorWithRetry +from otaclient_common.retry_task_map import ( + TasksEnsureFailed, + ThreadPoolExecutorWithRetry, +) logger = logging.getLogger(__name__) @@ -117,12 +121,15 @@ def _download_exception_handler(_fut: Future[Any]) -> bool: try: # exceptions that cannot be handled by us if isinstance(exc, requests_exc.HTTPError): - http_errcode = exc.errno - - if http_errcode in [ - HTTPStatus.FORBIDDEN, - HTTPStatus.UNAUTHORIZED, - ]: + _response = exc.response + # NOTE(20241129): if somehow HTTPError doesn't contain response, + # don't do anything but let upper retry. + # NOTE: bool(Response) is False when status_code != 200. 
+ if not isinstance(_response, Response): + return False + + http_errcode = _response.status_code + if http_errcode in [HTTPStatus.FORBIDDEN, HTTPStatus.UNAUTHORIZED]: raise ota_errors.UpdateRequestCookieInvalid( f"download failed with critical HTTP error: {exc.errno}, {exc!r}", module=__name__, @@ -451,6 +458,16 @@ def _execute_update(self): _err_msg = f"metadata.jwt is invalid: {e!r}" logger.error(_err_msg) raise ota_errors.MetadataJWTInvalid(_err_msg, module=__name__) from e + except ota_metadata_parser.OTAImageInvalid as e: + _err_msg = f"OTA image is invalid: {e!r}" + logger.error(_err_msg) + raise ota_errors.OTAImageInvalid(_err_msg, module=__name__) from e + except ota_metadata_parser.OTARequestsAuthTokenInvalid as e: + _err_msg = f"OTA requests auth token is invalid: {e!r}" + logger.error(_err_msg) + raise ota_errors.UpdateRequestCookieInvalid( + _err_msg, module=__name__ + ) from e except Exception as e: _err_msg = f"failed to prepare ota metafiles: {e!r}" logger.error(_err_msg) @@ -509,6 +526,11 @@ def _execute_update(self): # NOTE(20240705): download_files raises OTA Error directly, no need to capture exc here try: self._download_files(otameta, delta_bundle.get_download_list()) + except TasksEnsureFailed: + # NOTE: the only cause of a TaskEnsureFailed being raised is the download_watchdog timeout. 
+ _err_msg = f"download stalls longer than {cfg.DOWNLOAD_INACTIVE_TIMEOUT}, abort OTA" + logger.error(_err_msg) + raise ota_errors.NetworkError(_err_msg, module=__name__) from None finally: del delta_bundle self._downloader_pool.shutdown() From b771c3fe9bb6e0f3df7e3a6dbc2128e7855a885c Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 02:56:32 +0000 Subject: [PATCH 084/114] ecu_tracker: log errors from failed local ecu status query --- src/otaclient/grpc/api_v2/ecu_tracker.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/otaclient/grpc/api_v2/ecu_tracker.py b/src/otaclient/grpc/api_v2/ecu_tracker.py index 4641e8250..f1a3fdeb2 100644 --- a/src/otaclient/grpc/api_v2/ecu_tracker.py +++ b/src/otaclient/grpc/api_v2/ecu_tracker.py @@ -18,7 +18,6 @@ import asyncio import atexit -import contextlib import logging from collections import defaultdict @@ -28,8 +27,19 @@ from otaclient.grpc.api_v2.ecu_status import ECUStatusStorage from otaclient_api.v2 import types as api_types from otaclient_api.v2.api_caller import ECUNoResponse, OTAClientCall +from otaclient_common.logging import BurstSuppressFilter logger = logging.getLogger(__name__) +burst_suppressed_logger = logging.getLogger(f"{__name__}.local_ecu_check") +# NOTE: for request_error, only allow max 6 lines of logging per 30 seconds +burst_suppressed_logger.addFilter( + BurstSuppressFilter( + f"{__name__}.local_ecu_check", + upper_logger_name=__name__, + burst_round_length=30, + burst_max=6, + ) +) # actively polling ECUs status until we get the first valid response # when otaclient is just starting. 
@@ -84,11 +94,15 @@ async def _polling_local_ecu_status(self): """Task entry for loop polling local ECU status.""" my_ecu_id = ecu_info.ecu_id while True: - with contextlib.suppress(Exception): + try: status_report = self._local_ecu_status_reader.sync_msg() if status_report: self._startup_matrix[my_ecu_id] = False await self._ecu_status_storage.update_from_local_ecu(status_report) + except Exception as e: + burst_suppressed_logger.warning( + f"failed to query local ECU's status: {e!r}" + ) if self._startup_matrix[my_ecu_id]: await asyncio.sleep(_ACTIVE_POLL_LOCAL_ON_STARTUP) From 85de82e8ba711890b521080e3a3ccd8251ec1b07 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 03:13:16 +0000 Subject: [PATCH 085/114] _status_monitor: still record the exception during shm write with burst_limited_logger; use global shutdown cleanup --- src/otaclient/_status_monitor.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 5626ec357..757fda592 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -17,7 +17,6 @@ from __future__ import annotations import atexit -import contextlib import logging import queue import time @@ -36,8 +35,19 @@ UpdateTiming, ) from otaclient._utils import SharedOTAClientStatusWriter +from otaclient_common.logging import BurstSuppressFilter logger = logging.getLogger(__name__) +burst_suppressed_logger = logging.getLogger(f"{__name__}.shm_push") +# NOTE: for request_error, only allow max 6 lines of logging per 30 seconds +burst_suppressed_logger.addFilter( + BurstSuppressFilter( + f"{__name__}.shm_push", + upper_logger_name=__name__, + burst_round_length=30, + burst_max=6, + ) +) _status_report_queue: queue.Queue | None = None @@ -232,6 +242,7 @@ def _on_update_meta(status_storage: OTAClientStatus, payload: SetUpdateMetaRepor class OTAClientStatusCollector: + """NOTE: status_monitor should only 
be started once during whole otaclient lifecycle!""" def __init__( self, @@ -245,6 +256,9 @@ def __init__( self.shm_push_interval = shm_push_interval self._input_queue = msg_queue + global _status_report_queue + _status_report_queue = msg_queue + self._status = None self._shm_status = shm_status @@ -295,9 +309,13 @@ def _status_collector_thread(self) -> None: # ------ push status on load_report ------ # if self.load_report(report) and self._status and _now > _next_shm_push: - with contextlib.suppress(Exception): + try: self._shm_status.write_msg(self._status) _next_shm_push = _now + self.shm_push_interval + except Exception as e: + burst_suppressed_logger.warning( + f"failed to push status to shm: {e!r}" + ) except queue.Empty: time.sleep(self.min_collect_interval) From 6e30cbb345050cef5d87066b599978831f374f67 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 03:15:02 +0000 Subject: [PATCH 086/114] minor update --- src/otaclient/grpc/api_v2/servicer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/otaclient/grpc/api_v2/servicer.py b/src/otaclient/grpc/api_v2/servicer.py index 87f02d2f1..79d7baaa6 100644 --- a/src/otaclient/grpc/api_v2/servicer.py +++ b/src/otaclient/grpc/api_v2/servicer.py @@ -36,7 +36,7 @@ logger = logging.getLogger(__name__) -WAIT_FOR_ACK_TIMEOUT = 6 # seconds +WAIT_FOR_LOCAL_ECU_ACK_TIMEOUT = 6 # seconds class OTAClientAPIServicer: @@ -70,7 +70,7 @@ def __init__( def _local_update(self, request: UpdateRequestV2) -> api_types.UpdateResponseEcu: self._op_queue.put_nowait(request) try: - _req_response = self._resp_queue.get(timeout=WAIT_FOR_ACK_TIMEOUT) + _req_response = self._resp_queue.get(timeout=WAIT_FOR_LOCAL_ECU_ACK_TIMEOUT) assert isinstance(_req_response, IPCResponse), "unexpected msg" assert ( _req_response.session_id == request.session_id @@ -179,7 +179,7 @@ def _local_rollback( ) -> api_types.RollbackResponseEcu: self._op_queue.put_nowait(rollback_request) try: - _req_response = 
self._resp_queue.get(timeout=WAIT_FOR_ACK_TIMEOUT) + _req_response = self._resp_queue.get(timeout=WAIT_FOR_LOCAL_ECU_ACK_TIMEOUT) assert isinstance( _req_response, IPCResponse ), f"unexpected response: {type(_req_response)}" From d7cf32e0c902b7e37dc3900c364d279b6bdfe266 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 03:47:29 +0000 Subject: [PATCH 087/114] api_v2.servicer: use threadpool to execute local update/rollback --- src/otaclient/grpc/api_v2/servicer.py | 34 ++++++++++++++++++++------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/otaclient/grpc/api_v2/servicer.py b/src/otaclient/grpc/api_v2/servicer.py index 79d7baaa6..b529da45e 100644 --- a/src/otaclient/grpc/api_v2/servicer.py +++ b/src/otaclient/grpc/api_v2/servicer.py @@ -19,6 +19,8 @@ import asyncio import logging import multiprocessing.queues as mp_queue +from concurrent.futures import ThreadPoolExecutor +from functools import partial from otaclient._types import ( IPCRequest, @@ -53,11 +55,13 @@ def __init__( ecu_status_storage: ECUStatusStorage, op_queue: mp_queue.Queue[IPCRequest], resp_queue: mp_queue.Queue[IPCResponse], + executor: ThreadPoolExecutor, ): self.sub_ecus = ecu_info.secondaries self.listen_addr = ecu_info.ip_addr self.listen_port = cfg.OTA_API_SERVER_PORT self.my_ecu_id = ecu_info.ecu_id + self._executor = executor self._op_queue = op_queue self._resp_queue = resp_queue @@ -68,6 +72,7 @@ def __init__( # API servicer def _local_update(self, request: UpdateRequestV2) -> api_types.UpdateResponseEcu: + """Thread worker for dispatching a local update.""" self._op_queue.put_nowait(request) try: _req_response = self._resp_queue.get(timeout=WAIT_FOR_LOCAL_ECU_ACK_TIMEOUT) @@ -151,13 +156,17 @@ async def update( # second: dispatch update request to local if required by incoming request if update_req_ecu := request.find_ecu(self.my_ecu_id): new_session_id = gen_session_id(update_req_ecu.version) - _resp = self._local_update( - UpdateRequestV2( - 
version=update_req_ecu.version, - url_base=update_req_ecu.url, - cookies_json=update_req_ecu.cookies, - session_id=new_session_id, - ) + _resp = await asyncio.get_running_loop().run_in_executor( + executor=self._executor, + func=partial( + self._local_update, + UpdateRequestV2( + version=update_req_ecu.version, + url_base=update_req_ecu.url, + cookies_json=update_req_ecu.cookies, + session_id=new_session_id, + ), + ), ) if _resp.result == api_types.FailureType.NO_FAILURE: @@ -177,6 +186,8 @@ async def update( def _local_rollback( self, rollback_request: RollbackRequestV2 ) -> api_types.RollbackResponseEcu: + """Thread worker for dispatching a local rollback.""" + self._op_queue.put_nowait(rollback_request) try: _req_response = self._resp_queue.get(timeout=WAIT_FOR_LOCAL_ECU_ACK_TIMEOUT) @@ -260,9 +271,14 @@ async def rollback( # second: dispatch rollback request to local if required if request.find_ecu(self.my_ecu_id): new_session_id = gen_session_id("__rollback") - response.add_ecu( - self._local_rollback(RollbackRequestV2(session_id=new_session_id)) + _local_resp = await asyncio.get_running_loop().run_in_executor( + executor=self._executor, + func=partial( + self._local_rollback, + RollbackRequestV2(session_id=new_session_id), + ), ) + response.add_ecu(_local_resp) return response async def status(self, _=None) -> api_types.StatusResponse: From c2322a0d161f3b5d834b180c0476a31d8ec628e8 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 03:51:15 +0000 Subject: [PATCH 088/114] api_v2: use thread pool for blocking operations --- src/otaclient/grpc/api_v2/main.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/otaclient/grpc/api_v2/main.py b/src/otaclient/grpc/api_v2/main.py index 83a29b455..38ca6481d 100644 --- a/src/otaclient/grpc/api_v2/main.py +++ b/src/otaclient/grpc/api_v2/main.py @@ -19,6 +19,7 @@ import asyncio import atexit import logging +from concurrent.futures import ThreadPoolExecutor from 
multiprocessing.queues import Queue as mp_Queue from typing import Callable, NoReturn @@ -57,14 +58,18 @@ async def _grpc_server_launcher(): ecu_tracker = ECUTracker(ecu_status_storage, shm_reader) ecu_tracker.start() + thread_pool = ThreadPoolExecutor( + thread_name_prefix="ota_api_server", + ) api_servicer = OTAClientAPIServicer( ecu_status_storage=ecu_status_storage, op_queue=op_queue, resp_queue=resp_queue, + executor=thread_pool, ) ota_client_service_v2 = OtaClientServiceV2(api_servicer) - server = grpc.aio.server() + server = grpc.aio.server(migration_thread_pool=thread_pool) v2_grpc.add_OtaClientServiceServicer_to_server( server=server, servicer=ota_client_service_v2 ) @@ -77,5 +82,6 @@ async def _grpc_server_launcher(): await server.wait_for_termination() finally: await server.stop(1) + thread_pool.shutdown(wait=True) asyncio.run(_grpc_server_launcher()) From a9e1293b1a278d93e72645d533c8e60aff9cb19d Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 03:55:05 +0000 Subject: [PATCH 089/114] minor update --- src/otaclient/grpc/api_v2/servicer.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/otaclient/grpc/api_v2/servicer.py b/src/otaclient/grpc/api_v2/servicer.py index b529da45e..a380ae43f 100644 --- a/src/otaclient/grpc/api_v2/servicer.py +++ b/src/otaclient/grpc/api_v2/servicer.py @@ -20,7 +20,6 @@ import logging import multiprocessing.queues as mp_queue from concurrent.futures import ThreadPoolExecutor -from functools import partial from otaclient._types import ( IPCRequest, @@ -157,15 +156,13 @@ async def update( if update_req_ecu := request.find_ecu(self.my_ecu_id): new_session_id = gen_session_id(update_req_ecu.version) _resp = await asyncio.get_running_loop().run_in_executor( - executor=self._executor, - func=partial( - self._local_update, - UpdateRequestV2( - version=update_req_ecu.version, - url_base=update_req_ecu.url, - cookies_json=update_req_ecu.cookies, - 
session_id=new_session_id, - ), + self._executor, + self._local_update, + UpdateRequestV2( + version=update_req_ecu.version, + url_base=update_req_ecu.url, + cookies_json=update_req_ecu.cookies, + session_id=new_session_id, ), ) @@ -272,11 +269,9 @@ async def rollback( if request.find_ecu(self.my_ecu_id): new_session_id = gen_session_id("__rollback") _local_resp = await asyncio.get_running_loop().run_in_executor( - executor=self._executor, - func=partial( - self._local_rollback, - RollbackRequestV2(session_id=new_session_id), - ), + self._executor, + self._local_rollback, + RollbackRequestV2(session_id=new_session_id), ) response.add_ecu(_local_resp) return response From 9dcf67a2f36c73d5086f0abdd4ef83c57687ba2e Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 04:50:25 +0000 Subject: [PATCH 090/114] minor update --- src/otaclient/_status_monitor.py | 2 +- src/otaclient/grpc/api_v2/ecu_tracker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 757fda592..02a6d2554 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -313,7 +313,7 @@ def _status_collector_thread(self) -> None: self._shm_status.write_msg(self._status) _next_shm_push = _now + self.shm_push_interval except Exception as e: - burst_suppressed_logger.warning( + burst_suppressed_logger.debug( f"failed to push status to shm: {e!r}" ) except queue.Empty: diff --git a/src/otaclient/grpc/api_v2/ecu_tracker.py b/src/otaclient/grpc/api_v2/ecu_tracker.py index f1a3fdeb2..ba454f138 100644 --- a/src/otaclient/grpc/api_v2/ecu_tracker.py +++ b/src/otaclient/grpc/api_v2/ecu_tracker.py @@ -100,7 +100,7 @@ async def _polling_local_ecu_status(self): self._startup_matrix[my_ecu_id] = False await self._ecu_status_storage.update_from_local_ecu(status_report) except Exception as e: - burst_suppressed_logger.warning( + burst_suppressed_logger.debug( f"failed to query local ECU's 
status: {e!r}" ) From d85823ffc60e88e61e9502a89640b5bda640b3e1 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 04:59:17 +0000 Subject: [PATCH 091/114] minor update --- src/otaclient/ota_core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 97dfcc713..fb0bdabb3 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -527,8 +527,10 @@ def _execute_update(self): try: self._download_files(otameta, delta_bundle.get_download_list()) except TasksEnsureFailed: - # NOTE: the only cause of a TaskEnsureFailed being raised is the download_watchdog timeout. - _err_msg = f"download stalls longer than {cfg.DOWNLOAD_INACTIVE_TIMEOUT}, abort OTA" + _err_msg = ( + "download aborted due to download stalls longer than " + f"{cfg.DOWNLOAD_INACTIVE_TIMEOUT}, or otaclient process is terminated, abort OTA" + ) logger.error(_err_msg) raise ota_errors.NetworkError(_err_msg, module=__name__) from None finally: From 9a61c79c29ba14f960f3df79134fc4e4e6c7daa6 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 06:34:09 +0000 Subject: [PATCH 092/114] limit the failure_traceback field's length --- src/otaclient/_status_monitor.py | 7 +++++++ src/otaclient/main.py | 2 ++ src/otaclient/ota_core.py | 2 ++ 3 files changed, 11 insertions(+) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 02a6d2554..9d85972ef 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -251,7 +251,9 @@ def __init__( *, min_collect_interval: float = MIN_COLLECT_INTERVAL, shm_push_interval: float = SHM_PUSH_INTERVAL, + max_traceback_size: int, ) -> None: + self.max_traceback_size = max_traceback_size self.min_collect_interval = min_collect_interval self.shm_push_interval = shm_push_interval @@ -277,6 +279,11 @@ def load_report(self, report: StatusReport) -> bool: # ------ on session start/end ------ # if 
isinstance(payload, OTAStatusChangeReport): + if (_traceback := payload.failure_traceback) and len( + _traceback + ) > self.max_traceback_size: + payload.failure_traceback = _traceback[-self.max_traceback_size :] + new_ota_status = payload.new_ota_status if new_ota_status in [OTAStatus.UPDATING, OTAStatus.ROLLBACKING]: status_storage.session_id = report.session_id diff --git a/src/otaclient/main.py b/src/otaclient/main.py index 8b3f46036..ead66d666 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -39,6 +39,7 @@ SHUTDOWN_AFTER_API_SERVER_EXIT = 3 # seconds STATUS_SHM_SIZE = 4096 # bytes +MAX_TRACEBACK_SIZE = 2048 # bytes SHM_HMAC_KEY_LEN = 64 # bytes _ota_core_p: mp_ctx.SpawnProcess | None = None @@ -121,6 +122,7 @@ def main() -> None: ecu_status_flags=ecu_status_flags, op_queue=local_otaclient_op_queue, resp_queue=local_otaclient_resp_queue, + max_traceback_size=MAX_TRACEBACK_SIZE, ), name="otaclient_ota_core", ) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index fb0bdabb3..b2e9b56e2 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -901,6 +901,7 @@ def ota_core_process( ecu_status_flags: MultipleECUStatusFlags, op_queue: mp_queue.Queue[IPCRequest], resp_queue: mp_queue.Queue[IPCResponse], + max_traceback_size: int, # in bytes ): from otaclient._logging import configure_logging from otaclient.configs.cfg import proxy_info @@ -915,6 +916,7 @@ def ota_core_process( _status_monitor = OTAClientStatusCollector( msg_queue=_local_status_report_queue, shm_status=shm_writer, + max_traceback_size=max_traceback_size, ) _status_monitor.start() From 4de3d8ea9f72fdd5c8410d5e16aacceb969e3551 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 06:42:45 +0000 Subject: [PATCH 093/114] conftest: fix up ota_status_collector --- tests/conftest.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4697c4960..1df11b532 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -262,12 +262,21 @@ def proxy_info_fixture(tmp_path: Path) -> ProxyInfo: return parse_proxy_info(_yaml_f) +MAX_TRACEBACK_SIZE = 2048 + + @pytest.fixture(scope="class") -def ota_status_collector() -> ( - Generator[tuple[OTAClientStatusCollector, Queue[StatusReport]], Any, None] -): +def ota_status_collector( + class_mocker: pytest_mock.MockerFixture, +) -> Generator[tuple[OTAClientStatusCollector, Queue[StatusReport]], Any, None]: + _shm_mock = class_mocker.MagicMock() + _report_queue: Queue[StatusReport] = Queue() - _status_collector = OTAClientStatusCollector(_report_queue) + _status_collector = OTAClientStatusCollector( + msg_queue=_report_queue, + shm_status=_shm_mock, + max_traceback_size=MAX_TRACEBACK_SIZE, + ) _collector_thread = _status_collector.start() try: From dd51d91a46844e38b80c65cd0c730ad818591200 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 06:52:08 +0000 Subject: [PATCH 094/114] fix up test_status_monitor --- tests/test_otaclient/test_status_monitor.py | 54 ++++++++++----------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/test_otaclient/test_status_monitor.py b/tests/test_otaclient/test_status_monitor.py index d423d14da..411927e24 100644 --- a/tests/test_otaclient/test_status_monitor.py +++ b/tests/test_otaclient/test_status_monitor.py @@ -21,12 +21,10 @@ import random import time from queue import Queue -from typing import Generator import pytest from otaclient._status_monitor import ( - TERMINATE_SENTINEL, OTAClientStatusCollector, OTAStatusChangeReport, OTAUpdatePhaseChangeReport, @@ -51,24 +49,11 @@ class TestStatusMonitor: DOWNLOAD_NUM = DWONLOAD_SIZE = TOTAL_DOWNLOAD_SIZE = 600 MULTI_PATHS_FILE = MULTI_PATHS_FILE_SIZE = 100 - @pytest.fixture(autouse=True, scope="class") - def msg_queue(self) -> Generator[Queue[StatusReport], None, None]: - _queue = Queue() - yield _queue - - @pytest.fixture(autouse=True, scope="class") - def status_collector(self, 
msg_queue: Queue[StatusReport]): - status_collector = OTAClientStatusCollector(msg_queue=msg_queue) - _thread = status_collector.start() - try: - yield status_collector - finally: - msg_queue.put_nowait(TERMINATE_SENTINEL) - _thread.join() - def test_otaclient_start( - self, status_collector: OTAClientStatusCollector, msg_queue: Queue[StatusReport] + self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]] ): + status_collector, msg_queue = ota_status_collector + _test_failure_reason = "test_no_failure_reason" _test_current_version = "test_current_version" msg_queue.put_nowait( @@ -98,8 +83,10 @@ def test_otaclient_start( assert otaclient_status.failure_reason == _test_failure_reason def test_start_ota_update( - self, status_collector: OTAClientStatusCollector, msg_queue: Queue[StatusReport] + self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]] ): + status_collector, msg_queue = ota_status_collector + # ------ execution ------ # msg_queue.put_nowait( StatusReport( @@ -139,8 +126,9 @@ def test_start_ota_update( assert update_meta.update_firmware_version == self.UPDATE_VERSION_FOR_TEST def test_process_metadata( - self, status_collector: OTAClientStatusCollector, msg_queue: Queue[StatusReport] + self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]] ) -> None: + status_collector, msg_queue = ota_status_collector # ------ execution ------ # msg_queue.put_nowait( StatusReport( @@ -176,12 +164,16 @@ def test_process_metadata( assert update_progress.downloaded_bytes == self.METADATA_SIZE def test_filter_invalid_session_id( - self, msg_queue: Queue[StatusReport], caplog: pytest.LogCaptureFixture + self, + ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]], + caplog: pytest.LogCaptureFixture, ) -> None: """This test put reports with invalid session_id into the msg_queue. If the filter is working, all the later test methods will not fail. 
""" + _, msg_queue = ota_status_collector + _invalid_session_id = "invalid_session_id" # put an update meta change report @@ -224,8 +216,9 @@ def test_filter_invalid_session_id( assert all(_record.levelno == logging.WARNING for _record in caplog.records) def test_calculate_delta( - self, status_collector: OTAClientStatusCollector, msg_queue: Queue[StatusReport] + self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]] ) -> None: + status_collector, msg_queue = ota_status_collector _now = int(time.time()) # ------ execution ------ # @@ -282,8 +275,9 @@ def test_calculate_delta( assert update_meta.total_remove_files_num == 123 def test_download_ota_files( - self, status_collector: OTAClientStatusCollector, msg_queue: Queue[StatusReport] + self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]] ) -> None: + status_collector, msg_queue = ota_status_collector _now = int(time.time()) # ------ execution ------ # @@ -336,8 +330,10 @@ def test_download_ota_files( ) def test_apply_update( - self, status_collector: OTAClientStatusCollector, msg_queue: Queue[StatusReport] + self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]] ) -> None: + status_collector, msg_queue = ota_status_collector + _now = int(time.time()) # ------ execution ------ # @@ -374,8 +370,9 @@ def test_apply_update( ) and update_timing.update_apply_start_timestamp == _now def test_post_update( - self, status_collector: OTAClientStatusCollector, msg_queue: Queue[StatusReport] + self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]] ) -> None: + status_collector, msg_queue = ota_status_collector _now = int(time.time()) msg_queue.put_nowait( StatusReport( @@ -397,8 +394,10 @@ def test_post_update( ) and update_timing.post_update_start_timestamp == _now def test_finalizing_update( - self, status_collector: OTAClientStatusCollector, msg_queue: Queue[StatusReport] + self, ota_status_collector: 
tuple[OTAClientStatusCollector, Queue[StatusReport]] ) -> None: + status_collector, msg_queue = ota_status_collector + _now = int(time.time()) msg_queue.put_nowait( StatusReport( @@ -417,8 +416,9 @@ def test_finalizing_update( assert otaclient_status.update_phase == UpdatePhase.FINALIZING_UPDATE def test_confirm_update_progress( - self, status_collector: OTAClientStatusCollector + self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]] ) -> None: + status_collector, _ = ota_status_collector time.sleep(2) # wait for reports being processed otaclient_status = status_collector.otaclient_status From 36170dbdf1e59502e519408004189ec35ed98c21 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 06:55:52 +0000 Subject: [PATCH 095/114] fix up test_create_standby --- tests/test_otaclient/test_create_standby.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/test_otaclient/test_create_standby.py b/tests/test_otaclient/test_create_standby.py index d2c7e1f2e..3f9ae2e6c 100644 --- a/tests/test_otaclient/test_create_standby.py +++ b/tests/test_otaclient/test_create_standby.py @@ -37,7 +37,7 @@ from otaclient.configs.cfg import cfg as otaclient_cfg from otaclient.create_standby import common, rebuild_mode from otaclient.create_standby.rebuild_mode import RebuildMode -from otaclient.ota_core import OTAClientControlFlags, _OTAUpdater +from otaclient.ota_core import _OTAUpdater from tests.conftest import TestConfiguration as cfg from tests.utils import SlotMeta, compare_dir @@ -105,14 +105,12 @@ def test_update_with_rebuild_mode( mocker: MockerFixture, ): status_collector, status_report_queue = ota_status_collector - - # ------ execution ------ # - otaclient_control_flags = typing.cast( - OTAClientControlFlags, mocker.MagicMock(spec=OTAClientControlFlags) + ecu_status_flags = mocker.MagicMock() + ecu_status_flags.any_requires_network.is_set = mocker.MagicMock( + return_value=True ) - 
otaclient_control_flags._can_reboot = _can_reboot = mocker.MagicMock() - _can_reboot.is_set = mocker.MagicMock(return_value=True) + # ------ execution ------ # ca_store = load_ca_cert_chains(cfg.CERTS_DIR) _updater = _OTAUpdater( @@ -122,8 +120,8 @@ def test_update_with_rebuild_mode( ca_chains_store=ca_store, upper_otaproxy=None, boot_controller=self._boot_control, + ecu_status_flags=ecu_status_flags, create_standby_cls=RebuildMode, - control_flags=otaclient_control_flags, status_report_queue=status_report_queue, session_id=self.SESSION_ID, ) @@ -144,7 +142,7 @@ def test_update_with_rebuild_mode( # ------ assertions ------ # persist_handler.assert_called_once() - otaclient_control_flags._can_reboot.is_set.assert_called_once() + ecu_status_flags.any_requires_network.is_set.assert_called_once() # --- ensure the update stats are collected collected_status = status_collector.otaclient_status assert collected_status From d9d5fb267302fc224fe0f275ec8879b30fc795be Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 06:56:19 +0000 Subject: [PATCH 096/114] remove test_main for now --- tests/test_otaclient/test_main.py | 68 ------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 tests/test_otaclient/test_main.py diff --git a/tests/test_otaclient/test_main.py b/tests/test_otaclient/test_main.py deleted file mode 100644 index 14b96cbc0..000000000 --- a/tests/test_otaclient/test_main.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2022 TIER IV, INC. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -import time -from multiprocessing import Process -from pathlib import Path - -import pytest -from pytest import LogCaptureFixture -from pytest_mock import MockerFixture - -from otaclient.configs.cfg import cfg as otaclient_cfg - -FIRST_LINE_LOG = "d3b6bdb | 2021-10-27 09:36:48 +0900 | Initial commit" -MAIN_MODULE = "otaclient.main" -UTILS_MODULE = "otaclient.utils" - - -class TestMain: - @pytest.fixture(autouse=True) - def patch_main(self, mocker: MockerFixture, tmp_path: Path): - mocker.patch(f"{MAIN_MODULE}.launch_otaclient_grpc_server") - mocker.patch("otaclient._logging.configure_logging") - - self._sys_exit_mocker = mocker.MagicMock(side_effect=ValueError) - mocker.patch(f"{UTILS_MODULE}.sys.exit", self._sys_exit_mocker) - - @pytest.fixture - def background_process(self): - def _waiting(): - time.sleep(1234) - - _p = Process(target=_waiting) - try: - _p.start() - Path(otaclient_cfg.OTACLIENT_PID_FILE).write_text(f"{_p.pid}") - yield _p.pid - finally: - _p.kill() - - def test_main(self, caplog: LogCaptureFixture): - from otaclient.main import main - - main() - assert caplog.records[0].msg == "started" - assert Path(otaclient_cfg.OTACLIENT_PID_FILE).read_text() == f"{os.getpid()}" - - def test_with_other_otaclient_started(self, background_process): - from otaclient.main import main - - _other_pid = f"{background_process}" - with pytest.raises(ValueError): - main() - self._sys_exit_mocker.assert_called_once() - assert Path(otaclient_cfg.OTACLIENT_PID_FILE).read_text() == _other_pid From 137f952688116c04074ce2ec20f839fda66d9f2a Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 07:03:28 +0000 Subject: [PATCH 097/114] fix up test_ota_core --- tests/test_otaclient/test_create_standby.py | 2 +- tests/test_otaclient/test_ota_core.py | 29 ++++++++++++--------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git 
a/tests/test_otaclient/test_create_standby.py b/tests/test_otaclient/test_create_standby.py index 3f9ae2e6c..b9fffd563 100644 --- a/tests/test_otaclient/test_create_standby.py +++ b/tests/test_otaclient/test_create_standby.py @@ -107,7 +107,7 @@ def test_update_with_rebuild_mode( status_collector, status_report_queue = ota_status_collector ecu_status_flags = mocker.MagicMock() ecu_status_flags.any_requires_network.is_set = mocker.MagicMock( - return_value=True + return_value=False ) # ------ execution ------ # diff --git a/tests/test_otaclient/test_ota_core.py b/tests/test_otaclient/test_ota_core.py index 0c283a5ad..fbcf456ea 100644 --- a/tests/test_otaclient/test_ota_core.py +++ b/tests/test_otaclient/test_ota_core.py @@ -38,7 +38,7 @@ from otaclient.create_standby import StandbySlotCreatorProtocol from otaclient.create_standby.common import DeltaBundle, RegularDelta from otaclient.errors import OTAErrorRecoverable -from otaclient.ota_core import OTAClient, OTAClientControlFlags, _OTAUpdater +from otaclient.ota_core import OTAClient, _OTAUpdater from tests.conftest import TestConfiguration as cfg from tests.utils import SlotMeta @@ -158,16 +158,14 @@ def test_otaupdater( self, ota_status_collector: tuple[OTAClientStatusCollector, Queue[StatusReport]], mocker: pytest_mock.MockerFixture, - ): - from otaclient.ota_core import OTAClientControlFlags, _OTAUpdater - + ) -> None: _, report_queue = ota_status_collector + ecu_status_flags = mocker.MagicMock() + ecu_status_flags.any_requires_network.is_set = mocker.MagicMock( + return_value=False + ) # ------ execution ------ # - otaclient_control_flags = mocker.MagicMock(spec=OTAClientControlFlags) - otaclient_control_flags._can_reboot = _can_reboot = mocker.MagicMock() - _can_reboot.is_set = mocker.MagicMock(return_value=True) - ca_store = load_ca_cert_chains(cfg.CERTS_DIR) _updater = _OTAUpdater( @@ -178,7 +176,7 @@ def test_otaupdater( boot_controller=self._boot_control, upper_otaproxy=None, 
create_standby_cls=self._create_standby_cls, - control_flags=otaclient_control_flags, + ecu_status_flags=ecu_status_flags, session_id=self.SESSION_ID, status_report_queue=report_queue, ) @@ -203,7 +201,7 @@ def test_otaupdater( assert _downloaded_files_size == self._delta_bundle.total_download_files_size # assert the control_flags has been waited - otaclient_control_flags._can_reboot.is_set.assert_called_once() + ecu_status_flags.any_requires_network.is_set.assert_called_once() assert _updater.update_version == str(cfg.UPDATE_VERSION) @@ -235,9 +233,13 @@ def mock_setup( mocker: pytest_mock.MockerFixture, ): _, status_report_queue = ota_status_collector + ecu_status_flags = mocker.MagicMock() + ecu_status_flags.any_requires_network.is_set = mocker.MagicMock( + return_value=False + ) # --- mock setup --- # - self.control_flags = mocker.MagicMock(spec=OTAClientControlFlags) + self.control_flags = ecu_status_flags self.ota_updater = mocker.MagicMock(spec=_OTAUpdater) self.boot_controller = mocker.MagicMock(spec=BootControllerProtocol) @@ -254,7 +256,7 @@ def mock_setup( # start otaclient self.ota_client = OTAClient( - control_flags=self.control_flags, + ecu_status_flags=ecu_status_flags, status_report_queue=status_report_queue, ) @@ -265,6 +267,7 @@ def test_update_normal_finished(self): version=self.UPDATE_FIRMWARE_VERSION, url_base=self.OTA_IMAGE_URL, cookies_json=self.UPDATE_COOKIES_JSON, + session_id="test_update_normal_finished", ) ) @@ -283,6 +286,7 @@ def test_update_interrupted(self): version=self.UPDATE_FIRMWARE_VERSION, url_base=self.OTA_IMAGE_URL, cookies_json=self.UPDATE_COOKIES_JSON, + session_id="test_updaste_interrupted", ) ) @@ -302,6 +306,7 @@ def test_status_in_update(self, mocker: pytest_mock.MockerFixture): version=self.UPDATE_FIRMWARE_VERSION, url_base=self.OTA_IMAGE_URL, cookies_json=self.UPDATE_COOKIES_JSON, + session_id="test_status_in_update", ) ) From 422ec3314320a064d146ed024167acd364a72234 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: 
Thu, 5 Dec 2024 07:08:12 +0000 Subject: [PATCH 098/114] fix up test_ecu_status --- .../test_grpc/test_api_v2/test_ecu_status.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_otaclient/test_grpc/test_api_v2/test_ecu_status.py b/tests/test_otaclient/test_grpc/test_api_v2/test_ecu_status.py index 1da659250..a0aea754d 100644 --- a/tests/test_otaclient/test_grpc/test_api_v2/test_ecu_status.py +++ b/tests/test_otaclient/test_grpc/test_api_v2/test_ecu_status.py @@ -17,6 +17,7 @@ import asyncio import logging +import threading from typing import Any import pytest @@ -25,6 +26,7 @@ from otaclient import __version__ from otaclient import _types as _internal_types +from otaclient._types import MultipleECUStatusFlags from otaclient.configs import DefaultOTAClientConfigs from otaclient.configs._ecu_info import ECUInfo from otaclient.grpc.api_v2.servicer import ECUStatusStorage @@ -49,7 +51,13 @@ async def setup_test(self, mocker: MockerFixture, ecu_info_fixture: ECUInfo): mocker.patch(f"{ECU_STATUS_MODULE}.ecu_info", ecu_info) # init and setup the ecu_storage - self.ecu_storage = ECUStatusStorage() + # NOTE: here we use threading.Event instead + self.ecu_status_flags = ecu_status_flags = MultipleECUStatusFlags( + any_in_update=threading.Event(), # type: ignore[assignment] + any_requires_network=threading.Event(), # type: ignore[assignment] + all_success=threading.Event(), # type: ignore[assignment] + ) + self.ecu_storage = ECUStatusStorage(ecu_status_flags=ecu_status_flags) _mocked_otaclient_cfg = DefaultOTAClientConfigs() # NOTE: decrease the interval for faster testing @@ -571,7 +579,7 @@ async def test_on_receive_update_request( # --- assertion --- # for k, v in properties_dict.items(): assert getattr(self.ecu_storage, k) == v, f"status_report attr {k} mismatch" - assert self.ecu_storage.active_ota_update_present.is_set() + assert self.ecu_status_flags.any_in_update.is_set() async def 
test_polling_waiter_switching_from_idling_to_active(self): """Waiter should immediately return if active_ota_update_present is set.""" @@ -579,9 +587,9 @@ async def test_polling_waiter_switching_from_idling_to_active(self): async def _event_setter(): await asyncio.sleep(_sleep_time) - self.ecu_storage.active_ota_update_present.set() + self.ecu_status_flags.any_in_update.set() - self.ecu_storage.active_ota_update_present.clear() + self.ecu_status_flags.any_in_update.clear() _waiter = self.ecu_storage.get_polling_waiter() asyncio.create_task(_event_setter()) # waiter should return on active_ota_update_present is set, instead of waiting the From 0f330d7424c0af060024d563decd61c18e5b68a1 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 07:47:48 +0000 Subject: [PATCH 099/114] fix up test_ecu_status --- .../test_grpc/test_api_v2/test_ecu_status.py | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/tests/test_otaclient/test_grpc/test_api_v2/test_ecu_status.py b/tests/test_otaclient/test_grpc/test_api_v2/test_ecu_status.py index a0aea754d..8e3081f8d 100644 --- a/tests/test_otaclient/test_grpc/test_api_v2/test_ecu_status.py +++ b/tests/test_otaclient/test_grpc/test_api_v2/test_ecu_status.py @@ -334,7 +334,7 @@ async def test_export( compare_message(exported, expected) @pytest.mark.parametrize( - "local_ecu_status,sub_ecus_status,properties_dict", + "local_ecu_status,sub_ecus_status,properties_dict,flags_status", ( # case 1: ( @@ -376,8 +376,12 @@ async def test_export( "in_update_ecus_id": {"autoware", "p2"}, "in_update_child_ecus_id": {"p2"}, "failed_ecus_id": {"p1"}, - "any_requires_network": True, "success_ecus_id": set(), + }, + # ecu_status_flags + { + "any_in_update": True, + "any_requires_network": True, "all_success": False, }, ), @@ -421,8 +425,12 @@ async def test_export( "in_update_ecus_id": {"p2"}, "in_update_child_ecus_id": {"p2"}, "failed_ecus_id": {"p1"}, - "any_requires_network": True, "success_ecus_id": 
{"autoware"}, + }, + # ecu_status_flags + { + "any_in_update": True, + "any_requires_network": True, "all_success": False, }, ), @@ -433,6 +441,7 @@ async def test_overall_ecu_status_report_generation( local_ecu_status: _internal_types.OTAClientStatus, sub_ecus_status: list[api_types.StatusResponse], properties_dict: dict[str, Any], + flags_status: dict[str, bool], ): # --- prepare --- # await self.ecu_storage.update_from_local_ecu(local_ecu_status) @@ -446,8 +455,11 @@ async def test_overall_ecu_status_report_generation( for k, v in properties_dict.items(): assert getattr(self.ecu_storage, k) == v, f"status_report attr {k} mismatch" + for k, v in flags_status.items(): + assert getattr(self.ecu_status_flags, k).is_set() == v + @pytest.mark.parametrize( - "local_ecu_status,sub_ecus_status,ecus_accept_update_request,properties_dict", + "local_ecu_status,sub_ecus_status,ecus_accept_update_request,properties_dict,flags_status", ( # case 1: # There is FAILED/UPDATING ECUs existed in the cluster. 
@@ -494,8 +506,12 @@ async def test_overall_ecu_status_report_generation( "in_update_ecus_id": {"autoware", "p2"}, "in_update_child_ecus_id": {"p2"}, "failed_ecus_id": {"p1"}, - "any_requires_network": True, "success_ecus_id": set(), + }, + # ecu_status_flags + { + "any_in_update": True, + "any_requires_network": True, "all_success": False, }, ), @@ -542,8 +558,12 @@ async def test_overall_ecu_status_report_generation( "in_update_ecus_id": {"autoware", "p1"}, "in_update_child_ecus_id": {"p1"}, "failed_ecus_id": set(), - "any_requires_network": True, "success_ecus_id": {"p2"}, + }, + # ecu_status_flags + { + "any_in_update": True, + "any_requires_network": True, "all_success": False, }, ), @@ -555,6 +575,7 @@ async def test_on_receive_update_request( sub_ecus_status: list[api_types.StatusResponse], ecus_accept_update_request: list[str], properties_dict: dict[str, Any], + flags_status: dict[str, bool], mocker: pytest_mock.MockerFixture, ): # --- prepare --- # @@ -579,7 +600,9 @@ async def test_on_receive_update_request( # --- assertion --- # for k, v in properties_dict.items(): assert getattr(self.ecu_storage, k) == v, f"status_report attr {k} mismatch" - assert self.ecu_status_flags.any_in_update.is_set() + + for k, v in flags_status.items(): + assert getattr(self.ecu_status_flags, k).is_set() == v async def test_polling_waiter_switching_from_idling_to_active(self): """Waiter should immediately return if active_ota_update_present is set.""" From 2d44b6c990523b0aed56f853103bb7a31ea77107 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 07:58:53 +0000 Subject: [PATCH 100/114] ota_core: minor fix, now handler do the live_ota_status change, instead of main loop --- src/otaclient/ota_core.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index b2e9b56e2..f3df26eb3 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -741,6 +741,7 
@@ def update(self, request: UpdateRequestV2) -> None: NOTE that update API will not raise any exceptions. The failure information is available via status API. """ + self._live_ota_status = OTAStatus.UPDATING new_session_id = request.session_id self._status_report_queue.put_nowait( StatusReport( @@ -782,6 +783,7 @@ def update(self, request: UpdateRequestV2) -> None: ) def rollback(self, request: RollbackRequestV2) -> None: + self._live_ota_status = OTAStatus.ROLLBACKING new_session_id = request.session_id self._status_report_queue.put_nowait( StatusReport( @@ -834,10 +836,8 @@ def main( session_id=request.session_id, ) ) - continue - if isinstance(request, UpdateRequestV2): - self._live_ota_status = OTAStatus.UPDATING + elif isinstance(request, UpdateRequestV2): _update_thread = threading.Thread( target=self.update, @@ -854,14 +854,11 @@ def main( ) ) _allow_request_after = _now + HOLD_REQ_HANDLING_ON_ACK_REQUEST - continue - if ( + elif ( isinstance(request, RollbackRequestV2) and self._live_ota_status == OTAStatus.SUCCESS ): - self._live_ota_status = OTAStatus.FAILURE - _rollback_thread = threading.Thread( target=self.rollback, args=[request], @@ -877,17 +874,17 @@ def main( ) ) _allow_request_after = _now + HOLD_REQ_HANDLING_ON_ACK_REQUEST - continue + else: - _err_msg = f"request is invalid: {request=}, {self._live_ota_status=}" - logger.error(_err_msg) - resp_queue.put_nowait( - IPCResponse( - res=IPCResEnum.REJECT_OTHER, - msg=_err_msg, - session_id=request.session_id, + _err_msg = f"request is invalid: {request=}, {self._live_ota_status=}" + logger.error(_err_msg) + resp_queue.put_nowait( + IPCResponse( + res=IPCResEnum.REJECT_OTHER, + msg=_err_msg, + session_id=request.session_id, + ) ) - ) def _sign_handler(signal_value, frame) -> NoReturn: From 04f497aaee6f5d5a11d6d007f4a3900fb4f1d777 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 08:00:52 +0000 Subject: [PATCH 101/114] ota_core: increase the minimum request interval to 16 seconds --- 
src/otaclient/ota_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index f3df26eb3..9bf3a000f 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -93,7 +93,7 @@ DOWNLOAD_REPORT_INTERVAL = 1 # second OP_CHECK_INTERVAL = 1 # second -HOLD_REQ_HANDLING_ON_ACK_REQUEST = 6 # seconds +HOLD_REQ_HANDLING_ON_ACK_REQUEST = 16 # seconds WAIT_FOR_OTAPROXY_ONLINE = 3 * 60 # 3mins From 17e71091708686acf75a87102d4d2cf3b34d8d83 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 08:02:46 +0000 Subject: [PATCH 102/114] fix up test_ota_core.py --- tests/test_otaclient/test_ota_core.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/tests/test_otaclient/test_ota_core.py b/tests/test_otaclient/test_ota_core.py index fbcf456ea..921113eee 100644 --- a/tests/test_otaclient/test_ota_core.py +++ b/tests/test_otaclient/test_ota_core.py @@ -246,7 +246,9 @@ def mock_setup( # patch boot_controller for otaclient initializing self.boot_controller.load_version.return_value = self.CURRENT_FIRMWARE_VERSION - self.boot_controller.get_booted_ota_status.return_value = OTAStatus.SUCCESS + self.boot_controller.get_booted_ota_status = mocker.MagicMock( + return_value=OTAStatus.SUCCESS + ) # patch inject mocked updater mocker.patch(f"{OTA_CORE_MODULE}._OTAUpdater", return_value=self.ota_updater) @@ -293,23 +295,3 @@ def test_update_interrupted(self): # --- assertion on interrupted OTA update --- # self.ota_updater.execute.assert_called_once() assert self.ota_client.live_ota_status == OTAStatus.FAILURE - - def test_status_in_update(self, mocker: pytest_mock.MockerFixture): - # --- mock setup --- # - _ota_updater_mocker = mocker.MagicMock(spec=_OTAUpdater) - mocker.patch(f"{OTA_CORE_MODULE}._OTAUpdater", _ota_updater_mocker) - self.ota_client._live_ota_status = OTAStatus.UPDATING - - # --- execution --- # - self.ota_client.update( - 
request=UpdateRequestV2( - version=self.UPDATE_FIRMWARE_VERSION, - url_base=self.OTA_IMAGE_URL, - cookies_json=self.UPDATE_COOKIES_JSON, - session_id="test_status_in_update", - ) - ) - - # --- assertion --- # - # confirm that the OTA update doesn't happen - _ota_updater_mocker.assert_not_called() From 420557f7f396665b2b5226325e254335ff88c893 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 08:11:36 +0000 Subject: [PATCH 103/114] minor update --- src/otaclient/_status_monitor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 9d85972ef..7f0369320 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -294,7 +294,9 @@ def load_report(self, report: StatusReport) -> bool: # ------ during OTA session ------ # report_session_id = report.session_id if report_session_id != status_storage.session_id: - logger.warning(f"drop reports from mismatched session: {report}") + logger.warning( + f"drop reports from mismatched session (expect {status_storage.session_id=}): {report}" + ) return False if isinstance(payload, OTAUpdatePhaseChangeReport): return _on_update_phase_changed(status_storage, payload) From 8b30cd78e38420b1ac05eee99c25b09b90d9ad64 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 08:14:48 +0000 Subject: [PATCH 104/114] minor fix --- src/otaclient/_types.py | 1 - tests/test_otaclient/test_create_standby.py | 23 ++++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index 92981290d..6e7722117 100644 --- a/src/otaclient/_types.py +++ b/src/otaclient/_types.py @@ -21,7 +21,6 @@ from typing import ClassVar, Optional from _otaclient_version import __version__ - from otaclient.configs.cfg import ecu_info from otaclient_common.typing import StrEnum diff --git a/tests/test_otaclient/test_create_standby.py 
b/tests/test_otaclient/test_create_standby.py index b9fffd563..b0abbbec8 100644 --- a/tests/test_otaclient/test_create_standby.py +++ b/tests/test_otaclient/test_create_standby.py @@ -113,6 +113,16 @@ def test_update_with_rebuild_mode( # ------ execution ------ # ca_store = load_ca_cert_chains(cfg.CERTS_DIR) + # update OTA status to update and assign session_id before OTAUpdate initialized + status_report_queue.put_nowait( + StatusReport( + payload=OTAStatusChangeReport( + new_ota_status=OTAStatus.UPDATING, + ), + session_id=self.SESSION_ID, + ) + ) + _updater = _OTAUpdater( version=cfg.UPDATE_VERSION, raw_url_base=cfg.OTA_IMAGE_URL, @@ -127,17 +137,10 @@ def test_update_with_rebuild_mode( ) _updater._process_persistents = persist_handler = mocker.MagicMock() - # update OTA status to update and assign session_id before execution - status_report_queue.put_nowait( - StatusReport( - payload=OTAStatusChangeReport( - new_ota_status=OTAStatus.UPDATING, - ), - session_id=self.SESSION_ID, - ) - ) + time.sleep(2) + + # ------ execution ------ # _updater.execute() - time.sleep(2) # wait for downloader to record stats # ------ assertions ------ # persist_handler.assert_called_once() From 14884149db5d197641ebae26a9c972e08c0afd11 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 08:15:33 +0000 Subject: [PATCH 105/114] minor fix --- tests/test_otaclient/test_ota_core.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/test_otaclient/test_ota_core.py b/tests/test_otaclient/test_ota_core.py index 921113eee..ffaab77b0 100644 --- a/tests/test_otaclient/test_ota_core.py +++ b/tests/test_otaclient/test_ota_core.py @@ -168,6 +168,16 @@ def test_otaupdater( # ------ execution ------ # ca_store = load_ca_cert_chains(cfg.CERTS_DIR) + # update OTA status to update and assign session_id before execution + report_queue.put_nowait( + StatusReport( + payload=OTAStatusChangeReport( + new_ota_status=OTAStatus.UPDATING, + ), + 
session_id=self.SESSION_ID, + ) + ) + _updater = _OTAUpdater( version=cfg.UPDATE_VERSION, raw_url_base=cfg.OTA_IMAGE_URL, @@ -182,15 +192,6 @@ def test_otaupdater( ) _updater._process_persistents = process_persists_handler = mocker.MagicMock() - # update OTA status to update and assign session_id before execution - report_queue.put_nowait( - StatusReport( - payload=OTAStatusChangeReport( - new_ota_status=OTAStatus.UPDATING, - ), - session_id=self.SESSION_ID, - ) - ) _updater.execute() # ------ assertions ------ # From 25688e189629c5545a446b2aec0c2a40c1bb2541 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Thu, 5 Dec 2024 08:25:17 +0000 Subject: [PATCH 106/114] temporary remove test_servicer as api_v2.servicer module changes a lot(being simplified a lot) --- src/otaclient/grpc/api_v2/servicer.py | 2 - .../test_grpc/test_api_v2/test_servicer.py | 298 ------------------ 2 files changed, 300 deletions(-) delete mode 100644 tests/test_otaclient/test_grpc/test_api_v2/test_servicer.py diff --git a/src/otaclient/grpc/api_v2/servicer.py b/src/otaclient/grpc/api_v2/servicer.py index a380ae43f..a44ccb1a5 100644 --- a/src/otaclient/grpc/api_v2/servicer.py +++ b/src/otaclient/grpc/api_v2/servicer.py @@ -46,8 +46,6 @@ class OTAClientAPIServicer: This class also handles otaproxy lifecyle and dependence managing. """ - OTAPROXY_SHUTDOWN_DELAY = cfg.OTAPROXY_MINIMUM_SHUTDOWN_INTERVAL - def __init__( self, *, diff --git a/tests/test_otaclient/test_grpc/test_api_v2/test_servicer.py b/tests/test_otaclient/test_grpc/test_api_v2/test_servicer.py deleted file mode 100644 index c9a96cb52..000000000 --- a/tests/test_otaclient/test_grpc/test_api_v2/test_servicer.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright 2022 TIER IV, INC. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from __future__ import annotations - -import asyncio -import logging -from concurrent.futures import ThreadPoolExecutor -from typing import Set - -import pytest -from pytest_mock import MockerFixture - -from otaclient.configs._ecu_info import ECUInfo -from otaclient.configs._proxy_info import ProxyInfo -from otaclient.grpc.api_v2 import ecu_status, servicer -from otaclient.grpc.api_v2.ecu_tracker import ECUTracker -from otaclient.grpc.api_v2.servicer import ( - ECUStatusStorage, - OTAClientAPIServicer, - OTAProxyLauncher, -) -from otaclient.grpc.api_v2.types import convert_from_apiv2_update_request -from otaclient.ota_core import OTAClient, OTAClientControlFlags -from otaclient_api.v2 import types as api_types -from otaclient_api.v2.api_caller import OTAClientCall -from tests.utils import compare_message - -logger = logging.getLogger(__name__) - -SERVICER_MODULE = servicer.__name__ -ECU_STATUS_MODULE = ecu_status.__name__ - - -class TestOTAClientServiceStub: - POLLING_INTERVAL = 1 - ENSURE_NEXT_CHECKING_ROUND = 1.2 - - @staticmethod - async def _subecu_accept_update_request( - ecu_id, *args, **kwargs - ) -> api_types.UpdateResponse: - return api_types.UpdateResponse( - ecu=[ - api_types.UpdateResponseEcu( - ecu_id=ecu_id, result=api_types.FailureType.NO_FAILURE - ) - ] - ) - - @pytest.fixture(autouse=True) - async def setup_test( - self, - mocker: MockerFixture, - ecu_info_fixture: ECUInfo, - proxy_info_fixture: ProxyInfo, - ): - threadpool = ThreadPoolExecutor() - - # ------ mock and patch ------ # - self.ecu_info = ecu_info = ecu_info_fixture - 
mocker.patch(f"{SERVICER_MODULE}.ecu_info", ecu_info) - - # NOTE: decrease the interval to speed up testing - # (used by _otaproxy_lifecycle_managing/_otaclient_control_flags_managing task) - mocker.patch( - f"{ECU_STATUS_MODULE}.ACTIVE_POLLING_INTERVAL", self.POLLING_INTERVAL - ) - mocker.patch( - f"{ECU_STATUS_MODULE}.IDLE_POLLING_INTERVAL", self.POLLING_INTERVAL - ) - - # ------ init and setup the ecu_storage ------ # - self.control_flag = OTAClientControlFlags() - self.ecu_storage = ECUStatusStorage() - self.ecu_storage.on_ecus_accept_update_request = mocker.AsyncMock() - # NOTE: disable internal overall ecu status generation task as we - # will manipulate the values by ourselves. - self.ecu_storage._debug_properties_update_shutdown_event.set() - await asyncio.sleep(self.ENSURE_NEXT_CHECKING_ROUND) # ensure the task stopping - - # --- mocker --- # - self.otaclient_inst = mocker.MagicMock(spec=OTAClient) - type(self.otaclient_inst).started = mocker.PropertyMock(return_value=True) - type(self.otaclient_inst).is_busy = mocker.PropertyMock(return_value=False) - - self.ecu_status_tracker = mocker.MagicMock(spec=ECUTracker) - self.otaproxy_launcher = mocker.MagicMock(spec=OTAProxyLauncher) - # mock OTAClientCall, make update_call return success on any update dispatches to subECUs - self.otaclient_call = mocker.AsyncMock(spec=OTAClientCall) - self.otaclient_call.update_call = mocker.AsyncMock( - wraps=self._subecu_accept_update_request - ) - - # ------ mock and patch proxy_info ------ # - self.proxy_info = proxy_info = proxy_info_fixture - mocker.patch(f"{SERVICER_MODULE}.proxy_info", proxy_info) - - # --- patching and mocking --- # - mocker.patch( - f"{SERVICER_MODULE}.ECUStatusStorage", - mocker.MagicMock(return_value=self.ecu_storage), - ) - mocker.patch( - f"{SERVICER_MODULE}.OTAProxyLauncher", - mocker.MagicMock(return_value=self.otaproxy_launcher), - ) - mocker.patch(f"{SERVICER_MODULE}.OTAClientCall", self.otaclient_call) - - # --- start the 
OTAClientServiceStub --- # - self.otaclient_service_stub = OTAClientAPIServicer( - otaclient_inst=self.otaclient_inst, - ecu_status_storage=self.ecu_storage, - control_flag=self.control_flag, - executor=threadpool, - ) - - try: - yield - finally: - self.otaclient_service_stub._debug_status_checking_shutdown_event.set() - threadpool.shutdown() - await asyncio.sleep(self.ENSURE_NEXT_CHECKING_ROUND) # ensure shutdown - - async def test__otaproxy_lifecycle_managing(self): - """ - otaproxy startup/shutdown is only controlled by any_requires_network - in overall ECU status report. - """ - # ------ otaproxy startup ------- # - # --- prepartion --- # - self.otaproxy_launcher.is_running = False - self.ecu_storage.any_requires_network = True - - # --- wait for execution --- # - # wait for _otaproxy_lifecycle_managing to launch - # the otaproxy on overall ecu status changed - await asyncio.sleep(self.ENSURE_NEXT_CHECKING_ROUND) - - # --- assertion --- # - self.otaproxy_launcher.start.assert_called_once() - - # ------ otaproxy shutdown ------ # - # --- prepartion --- # - # set the OTAPROXY_SHUTDOWN_DELAY to allow start/stop in single test - self.otaclient_service_stub.OTAPROXY_SHUTDOWN_DELAY = 1 # type: ignore - self.otaproxy_launcher.is_running = True - self.ecu_storage.any_requires_network = False - - # --- wait for execution --- # - # wait for _otaproxy_lifecycle_managing to shutdown - # the otaproxy on overall ecu status changed - await asyncio.sleep(self.ENSURE_NEXT_CHECKING_ROUND) - - # --- assertion --- # - self.otaproxy_launcher.stop.assert_called_once() - - # ---- cache dir cleanup --- # - # only cleanup cache dir on all ECUs in SUCCESS ota_status - self.ecu_storage.any_requires_network = False - self.ecu_storage.all_success = True - self.otaproxy_launcher.is_running = False - await asyncio.sleep(self.ENSURE_NEXT_CHECKING_ROUND) - - # --- assertion --- # - self.otaproxy_launcher.cleanup_cache_dir.assert_called_once() - - async def 
test__otaclient_control_flags_managing(self): - otaclient_control_flags = self.control_flag - # there are child ECUs in UPDATING - self.ecu_storage.in_update_child_ecus_id = {"p1", "p2"} - await asyncio.sleep(self.ENSURE_NEXT_CHECKING_ROUND) - assert not otaclient_control_flags._can_reboot.is_set() - - # no more child ECUs in UPDATING - self.ecu_storage.in_update_child_ecus_id = set() - await asyncio.sleep(self.ENSURE_NEXT_CHECKING_ROUND) - assert otaclient_control_flags._can_reboot.is_set() - - @pytest.mark.parametrize( - "update_request, update_target_ids, expected", - ( - # update request for autoware, p1 ecus - ( - api_types.UpdateRequest( - ecu=[ - api_types.UpdateRequestEcu( - ecu_id="autoware", - version="789.x", - url="url", - cookies="cookies", - ), - api_types.UpdateRequestEcu( - ecu_id="p1", - version="789.x", - url="url", - cookies="cookies", - ), - ] - ), - {"autoware", "p1"}, - # NOTE: order matters! - # update request dispatching to subECUs happens first, - # and then to the local ECU. 
- api_types.UpdateResponse( - ecu=[ - api_types.UpdateResponseEcu( - ecu_id="p1", - result=api_types.FailureType.NO_FAILURE, - ), - api_types.UpdateResponseEcu( - ecu_id="autoware", - result=api_types.FailureType.NO_FAILURE, - ), - ] - ), - ), - # update only p2 - ( - api_types.UpdateRequest( - ecu=[ - api_types.UpdateRequestEcu( - ecu_id="p2", - version="789.x", - url="url", - cookies="cookies", - ), - ] - ), - {"p2"}, - api_types.UpdateResponse( - ecu=[ - api_types.UpdateResponseEcu( - ecu_id="p2", - result=api_types.FailureType.NO_FAILURE, - ), - ] - ), - ), - ), - ) - async def test_update_normal( - self, - update_request: api_types.UpdateRequest, - update_target_ids: Set[str], - expected: api_types.UpdateResponse, - ): - # --- execution --- # - resp = await self.otaclient_service_stub.update(update_request) - - # --- assertion --- # - compare_message(resp, expected) - - self.otaclient_call.update_call.assert_called() - self.ecu_storage.on_ecus_accept_update_request.assert_called_once_with( # type: ignore - update_target_ids - ) - # assert otaclient_inst receives the update request if we have update request for self ECU - if update_request.if_contains_ecu("autoware"): - _update_request_ecu = update_request.find_ecu("autoware") - assert _update_request_ecu - - self.otaclient_inst.update.assert_called_once_with( - convert_from_apiv2_update_request(_update_request_ecu) - ) - - async def test_update_local_ecu_busy( - self, - mocker: MockerFixture, - ): - # --- preparation --- # - is_busy_mock = mocker.PropertyMock(return_value=True) # is busy - type(self.otaclient_inst).is_busy = is_busy_mock - - update_request_ecu = api_types.UpdateRequestEcu( - ecu_id="autoware", version="version", url="url", cookies="cookies" - ) - - # --- execution --- # - await self.otaclient_service_stub.update( - api_types.UpdateRequest(ecu=[update_request_ecu]) - ) - - # --- assertion --- # - # assert otaclient_inst doesn't receive the update request - 
self.otaclient_inst.update.assert_not_called() From 0ebd9c62ef7730d58f067fbb0e4376c14bdbcf43 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Dec 2024 08:25:33 +0000 Subject: [PATCH 107/114] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/otaclient/_types.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/otaclient/_types.py b/src/otaclient/_types.py index 6e7722117..92981290d 100644 --- a/src/otaclient/_types.py +++ b/src/otaclient/_types.py @@ -21,6 +21,7 @@ from typing import ClassVar, Optional from _otaclient_version import __version__ + from otaclient.configs.cfg import ecu_info from otaclient_common.typing import StrEnum From 6868bd40846c052609df7417dc03bbc09e94065d Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 9 Dec 2024 08:05:33 +0000 Subject: [PATCH 108/114] otaclient._utils: minor update to check_other_otaclient --- src/otaclient/_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/otaclient/_utils.py b/src/otaclient/_utils.py index 32fb0b6ca..73017ab95 100644 --- a/src/otaclient/_utils.py +++ b/src/otaclient/_utils.py @@ -55,20 +55,20 @@ def wait_and_log( time.sleep(check_interval) -def check_other_otaclient(pid_fpath: StrOrPath) -> None: +def check_other_otaclient(pid_fpath: StrOrPath) -> None: # pragma: no cover """Check if there is another otaclient instance running, and then - create a pid lock file for this otaclient instance.""" - pid_fpath = Path(pid_fpath) + create a pid lock file for this otaclient instance. + NOTE that otaclient should not run inside a PID namespace. 
+ """ + pid_fpath = Path(pid_fpath) if pid := read_str_from_file(pid_fpath, _default=""): # running process will have a folder under /proc if Path(f"/proc/{pid}").is_dir(): logger.error(f"another instance of ota-client({pid=}) is running, abort") sys.exit() - logger.warning(f"dangling otaclient lock file({pid=}) detected, cleanup") - Path(pid_fpath).unlink(missing_ok=True) - + pid_fpath.unlink(missing_ok=True) write_str_to_file_atomic(pid_fpath, f"{os.getpid()}") From d094d9f6550bc16ded99d053eea129d37fa23a22 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 9 Dec 2024 08:08:55 +0000 Subject: [PATCH 109/114] minor update to _utils --- src/otaclient/_utils.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/otaclient/_utils.py b/src/otaclient/_utils.py index 73017ab95..828c0cc02 100644 --- a/src/otaclient/_utils.py +++ b/src/otaclient/_utils.py @@ -72,7 +72,9 @@ def check_other_otaclient(pid_fpath: StrOrPath) -> None: # pragma: no cover write_str_to_file_atomic(pid_fpath, f"{os.getpid()}") -def create_otaclient_rundir(run_dir: StrOrPath = "/run/otaclient"): +def create_otaclient_rundir( + run_dir: StrOrPath = "/run/otaclient", +) -> None: # pragma: no cover """Create the otaclient runtime working dir. TODO: make a helper class for managing otaclient runtime dir. 
@@ -81,7 +83,7 @@ def create_otaclient_rundir(run_dir: StrOrPath = "/run/otaclient"): run_dir.mkdir(exist_ok=True, parents=True) -def get_traceback(exc: Exception, *, splitter: str = "\n") -> str: +def get_traceback(exc: Exception, *, splitter: str = "\n") -> str: # pragma: no cover """Format the traceback as string.""" return splitter.join(traceback.format_exception(type(exc), exc, exc.__traceback__)) @@ -94,13 +96,18 @@ class SharedOTAClientStatusReader(MPSharedStatusReader[OTAClientStatus]): """Util for reading OTAClientStatus from shm.""" -def gen_session_id(update_version: str) -> str: +SESSION_RANDOM_LEN = 4 # bytes, the corresponding hex string will be 8 chars + + +def gen_session_id( + update_version: str, *, random_bytes_num: int = SESSION_RANDOM_LEN +) -> str: # pragma: no cover """Generate a unique session_id for the new OTA session. token schema: --<4bytes_hex> """ _time_factor = str(int(time.time())) - _random_factor = os.urandom(4).hex() + _random_factor = os.urandom(random_bytes_num).hex() return f"{update_version}-{_time_factor}-{_random_factor}" From 96353cdac1d23cd57369d67fb175c37d4e1e8141 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 9 Dec 2024 08:49:13 +0000 Subject: [PATCH 110/114] no need to write test code for main module --- src/otaclient/_utils.py | 4 +--- src/otaclient/main.py | 7 ++++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/otaclient/_utils.py b/src/otaclient/_utils.py index 828c0cc02..9fffe2c21 100644 --- a/src/otaclient/_utils.py +++ b/src/otaclient/_utils.py @@ -72,9 +72,7 @@ def check_other_otaclient(pid_fpath: StrOrPath) -> None: # pragma: no cover write_str_to_file_atomic(pid_fpath, f"{os.getpid()}") -def create_otaclient_rundir( - run_dir: StrOrPath = "/run/otaclient", -) -> None: # pragma: no cover +def create_otaclient_rundir(run_dir: StrOrPath = "/run/otaclient") -> None: """Create the otaclient runtime working dir. TODO: make a helper class for managing otaclient runtime dir. 
diff --git a/src/otaclient/main.py b/src/otaclient/main.py index ead66d666..2e782db5a 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -47,7 +47,7 @@ _shm: mp_shm.SharedMemory | None = None -def _on_shutdown(sys_exit: bool = False): +def _on_shutdown(sys_exit: bool = False) -> None: # pragma: no cover global _ota_core_p, _grpc_server_p, _shm if _ota_core_p: _ota_core_p.terminate() @@ -68,12 +68,13 @@ def _on_shutdown(sys_exit: bool = False): sys.exit(1) -def _signal_handler(signal_value, _) -> None: +def _signal_handler(signal_value, _) -> None: # pragma: no cover print(f"otaclient receives {signal_value=}, shutting down ...") + # NOTE: the daemon_process needs to exit also. _on_shutdown(sys_exit=True) -def main() -> None: +def main() -> None: # pragma: no cover from otaclient._logging import configure_logging from otaclient._otaproxy_ctx import otaproxy_control_thread from otaclient._utils import check_other_otaclient, create_otaclient_rundir From 209a302d31632554b479fc9cbd9d513a1a3159fb Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 9 Dec 2024 08:51:46 +0000 Subject: [PATCH 111/114] add some comments --- src/otaclient/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index 2e782db5a..1a278138d 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -35,6 +35,9 @@ logger = logging.getLogger(__name__) HEALTH_CHECK_INTERAVL = 6 # seconds +# NOTE: the reason to let daemon_process exits after 16 seconds of ota_core dead +# is to allow grpc API server to respond to the status API calls with up-to-date +# failure information from ota_core. 
SHUTDOWN_AFTER_CORE_EXIT = 16 # seconds SHUTDOWN_AFTER_API_SERVER_EXIT = 3 # seconds From e3b21f6a925744d869ef46794c5a8192463d9951 Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 9 Dec 2024 08:58:26 +0000 Subject: [PATCH 112/114] minor update --- src/otaclient/main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/otaclient/main.py b/src/otaclient/main.py index 1a278138d..4d0316993 100644 --- a/src/otaclient/main.py +++ b/src/otaclient/main.py @@ -100,6 +100,10 @@ def main() -> None: # pragma: no cover # global _ota_core_p, _grpc_server_p, _shm + # NOTE: if the atexit hook is triggered by signal received, + # first the signal handler will be executed, and then atexit hook. + # At the time atexit hook is executed, the _ota_core_p, _grpc_server_p + # and _shm are set to None by signal handler. atexit.register(_on_shutdown) signal.signal(signal.SIGTERM, _signal_handler) signal.signal(signal.SIGINT, _signal_handler) From 22c5d3e95df1d0ed301fbbb74cb0be5d892a4b8b Mon Sep 17 00:00:00 2001 From: "bodong.yang" Date: Mon, 9 Dec 2024 09:03:49 +0000 Subject: [PATCH 113/114] _otaproxy_ctx: add a _global_shutdown flag --- src/otaclient/_otaproxy_ctx.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/otaclient/_otaproxy_ctx.py b/src/otaclient/_otaproxy_ctx.py index 8186bcbcf..e08a50638 100644 --- a/src/otaclient/_otaproxy_ctx.py +++ b/src/otaclient/_otaproxy_ctx.py @@ -39,10 +39,12 @@ logger = logging.getLogger(__name__) _otaproxy_p: mp_ctx.SpawnProcess | None = None +_global_shutdown: bool = False def shutdown_otaproxy_server() -> None: - global _otaproxy_p + global _otaproxy_p, _global_shutdown + _global_shutdown = True if _otaproxy_p: _otaproxy_p.terminate() _otaproxy_p.join() @@ -102,7 +104,7 @@ def otaproxy_control_thread( next_ota_cache_dir_checkpoint = 0 global _otaproxy_p - while True: + while not _global_shutdown: time.sleep(OTAPROXY_CHECK_INTERVAL) _now = time.time() From 7ce694e17a1f5eea006b03155cd9c896b08baa63 Mon Sep 17 
00:00:00 2001 From: "bodong.yang" Date: Wed, 11 Dec 2024 01:13:24 +0000 Subject: [PATCH 114/114] add some comments in the code --- src/otaclient/_status_monitor.py | 7 ++++--- src/otaclient/grpc/api_v2/ecu_status.py | 4 +++- src/otaclient/ota_core.py | 2 ++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/otaclient/_status_monitor.py b/src/otaclient/_status_monitor.py index 7f0369320..9db09cfe0 100644 --- a/src/otaclient/_status_monitor.py +++ b/src/otaclient/_status_monitor.py @@ -236,13 +236,14 @@ def _on_update_meta(status_storage: OTAClientStatus, payload: SetUpdateMetaRepor # ------ status monitor implementation ------ # # +# A sentinel object to tell the thread to stop TERMINATE_SENTINEL = cast(StatusReport, object()) -MIN_COLLECT_INTERVAL = 0.5 -SHM_PUSH_INTERVAL = 0.5 +MIN_COLLECT_INTERVAL = 0.5 # seconds +SHM_PUSH_INTERVAL = 0.5 # seconds class OTAClientStatusCollector: - """NOTE: status_monitor should only be started once during whole otaclient lifecycle!""" + """NOTE: status_monitor will only be started once during whole otaclient lifecycle!""" def __init__( self, diff --git a/src/otaclient/grpc/api_v2/ecu_status.py b/src/otaclient/grpc/api_v2/ecu_status.py index 11e9b7bcd..66b8b68b3 100644 --- a/src/otaclient/grpc/api_v2/ecu_status.py +++ b/src/otaclient/grpc/api_v2/ecu_status.py @@ -372,7 +372,9 @@ def get_polling_waiter(self): or self.active_ota_update_present is set, return when one of the condition is met. """ - _inner_wait_interval = 1 + # waiter closure will slice the waiting time by <_inner_wait_interval>, + # and wait for each slice one by one while checking the ecu_status_flags.
+ _inner_wait_interval = 1 # second async def _waiter(): ecu_status_flags = self.ecu_status_flags diff --git a/src/otaclient/ota_core.py b/src/otaclient/ota_core.py index 9bf3a000f..77696ff64 100644 --- a/src/otaclient/ota_core.py +++ b/src/otaclient/ota_core.py @@ -413,6 +413,8 @@ def _execute_update(self): f"use {_upper_proxy} for local OTA update, " f"wait for otaproxy@{_upper_proxy} online..." ) + + # NOTE: will raise a built-in ConnectionError at timeout ensure_otaproxy_start( _upper_proxy, probing_timeout=WAIT_FOR_OTAPROXY_ONLINE,