Skip to content

Commit

Permalink
feat(retry_task_map): introduce worker-thread-scope backoff wait on f…
Browse files Browse the repository at this point in the history
…ailed tasks rescheduling. (#450)

This PR introduces worker-thread-scope backoff wait on failed tasks rescheduling for otaclient_common.retry_task_map.
For each worker thread, a coutinues_failed_count is maintained, when rescheduling failed tasks, now we will wait with backoff before put the failed task back into task queue.
This will help lower the CPU usage a lot when the network is fully disconnected.
  • Loading branch information
Bodong-Yang authored Dec 3, 2024
1 parent e6a93db commit 19c4c5c
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 12 deletions.
77 changes: 66 additions & 11 deletions src/otaclient_common/retry_task_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,23 @@
from queue import Empty, SimpleQueue
from typing import TYPE_CHECKING, Any, Callable, Generator, Iterable, Optional

from typing_extensions import ParamSpec

from otaclient_common.common import wait_with_backoff
from otaclient_common.typing import RT, T

P = ParamSpec("P")

logger = logging.getLogger(__name__)


class TasksEnsureFailed(Exception):
"""Exception for tasks ensuring failed."""


CONTINUES_FAILURE_COUNT_ATTRNAME = "continues_failed_count"


class _ThreadPoolExecutorWithRetry(ThreadPoolExecutor):

def __init__(
Expand All @@ -47,6 +55,8 @@ def __init__(
watchdog_check_interval: int = 3, # seconds
initializer: Callable[..., Any] | None = None,
initargs: tuple = (),
backoff_factor: float = 0.01,
backoff_max: float = 1,
) -> None:
self._start_lock, self._started = threading.Lock(), False
self._total_task_num = 0
Expand All @@ -57,6 +67,8 @@ def __init__(
no tasks, the task execution gen should stop immediately.
3. only value >=1 is valid.
"""
self.backoff_factor = backoff_factor
self.backoff_max = backoff_max

self._retry_counter = itertools.count(start=1)
self._retry_count = 0
Expand All @@ -70,13 +82,26 @@ def __init__(
if callable(watchdog_func):
self._checker_funcs.append(watchdog_func)

self._thread_local = threading.local()
super().__init__(
max_workers=max_workers,
thread_name_prefix=thread_name_prefix,
initializer=initializer,
initializer=self._rtm_initializer_gen(initializer),
initargs=initargs,
)

def _rtm_initializer_gen(
self, _input_initializer: Callable[P, RT] | None
) -> Callable[P, None]:
def _real_initializer(*args: P.args, **kwargs: P.kwargs) -> None:
_thread_local = self._thread_local
setattr(_thread_local, CONTINUES_FAILURE_COUNT_ATTRNAME, 0)

if callable(_input_initializer):
_input_initializer(*args, **kwargs)

return _real_initializer

def _max_retry_check(self, max_total_retry: int) -> None:
if self._retry_count > max_total_retry:
raise TasksEnsureFailed("exceed max retry count, abort")
Expand Down Expand Up @@ -110,16 +135,40 @@ def _task_done_cb(
return # on shutdown, no need to put done fut into fut_queue
self._fut_queue.put_nowait(fut)

# ------ on task failed ------ #
if fut.exception():
self._retry_count = next(self._retry_counter)
try: # try to re-schedule the failed task
self.submit(func, item).add_done_callback(
partial(self._task_done_cb, item=item, func=func)
)
except Exception: # if re-schedule doesn't happen, release se
self._concurrent_semaphore.release()
else: # release semaphore when succeeded
_thread_local = self._thread_local

# release semaphore only on success
# reset continues failure count on success
if not fut.exception():
self._concurrent_semaphore.release()
_thread_local.continues_failed_count = 0
return

# NOTE: when for some reason the continues_failed_count is gone,
# handle the AttributeError here and re-assign the count.
try:
_continues_failed_count = getattr(
_thread_local, CONTINUES_FAILURE_COUNT_ATTRNAME
)
except AttributeError:
_continues_failed_count = 0

_continues_failed_count += 1
setattr(
_thread_local, CONTINUES_FAILURE_COUNT_ATTRNAME, _continues_failed_count
)
wait_with_backoff(
_continues_failed_count,
_backoff_factor=self.backoff_factor,
_backoff_max=self.backoff_max,
)

self._retry_count = next(self._retry_counter)
try: # try to re-schedule the failed task
self.submit(func, item).add_done_callback(
partial(self._task_done_cb, item=item, func=func)
)
except Exception: # if re-schedule doesn't happen, release se
self._concurrent_semaphore.release()

def _fut_gen(self, interval: int) -> Generator[Future[Any], Any, None]:
Expand Down Expand Up @@ -221,6 +270,8 @@ def __init__(
watchdog_check_interval: int = 3, # seconds
initializer: Callable[..., Any] | None = None,
initargs: tuple = (),
backoff_factor: float = 0.01,
backoff_max: float = 1,
) -> None:
"""Initialize a ThreadPoolExecutorWithRetry instance.
Expand All @@ -236,6 +287,10 @@ def __init__(
Defaults to None.
initargs (tuple): The same <initargs> param passed through to ThreadPoolExecutor.
Defaults to ().
backoff_factor (float): The backoff factor on thread-scope backoff wait for failed tasks rescheduling.
Defaults to 0.01.
backoff_max (float): The backoff max on thread-scope backoff wait for failed tasks rescheduling.
Defaults to 1.
"""
raise NotImplementedError

Expand Down
12 changes: 11 additions & 1 deletion tests/test_otaclient_common/test_retry_task_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
MAX_CONCURRENT = 128
MAX_WAIT_BEFORE_SUCCESS = 10
THREAD_INIT_MSG = "thread init message"
BACKOFF_FACTOR = 0.001 # for faster test
BACKOFF_MAX = 0.1


class _RetryTaskMapTestErr(Exception):
Expand All @@ -47,7 +49,7 @@ def _thread_initializer(msg: str) -> None:
class TestRetryTaskMap:

@pytest.fixture(autouse=True)
def setup(self):
def setup(self) -> None:
self._start_time = time.time()
self._success_wait_dict = {
idx: random.randint(0, MAX_WAIT_BEFORE_SUCCESS)
Expand Down Expand Up @@ -83,6 +85,8 @@ def _exit_on_exceed_max_count():
watchdog_func=_exit_on_exceed_max_count,
initializer=_thread_initializer,
initargs=(THREAD_INIT_MSG,),
backoff_factor=BACKOFF_FACTOR,
backoff_max=BACKOFF_MAX,
) as executor:
with pytest.raises(retry_task_map.TasksEnsureFailed):
for _fut in executor.ensure_tasks(
Expand All @@ -99,6 +103,8 @@ def test_retry_exceed_retry_limit(self):
max_total_retry=MAX_TOTAL_RETRY,
initializer=_thread_initializer,
initargs=(THREAD_INIT_MSG,),
backoff_factor=BACKOFF_FACTOR,
backoff_max=BACKOFF_MAX,
) as executor:
with pytest.raises(retry_task_map.TasksEnsureFailed):
for _fut in executor.ensure_tasks(
Expand All @@ -115,6 +121,8 @@ def test_retry_finally_succeeded(self):
max_concurrent=MAX_CONCURRENT,
initializer=_thread_initializer,
initargs=(THREAD_INIT_MSG,),
backoff_factor=BACKOFF_FACTOR,
backoff_max=BACKOFF_MAX,
) as executor:
for _fut in executor.ensure_tasks(
self.workload_failed_and_then_succeed, range(TASKS_COUNT)
Expand All @@ -130,6 +138,8 @@ def test_succeeded_in_one_try(self):
max_concurrent=MAX_CONCURRENT,
initializer=_thread_initializer,
initargs=(THREAD_INIT_MSG,),
backoff_factor=BACKOFF_FACTOR,
backoff_max=BACKOFF_MAX,
) as executor:
for _fut in executor.ensure_tasks(
self.workload_succeed, range(TASKS_COUNT)
Expand Down

1 comment on commit 19c4c5c

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
src/ota_metadata/legacy
   __init__.py10100% 
   parser.py3354885%106, 170, 175, 211–212, 222–223, 226, 238, 289–291, 295–298, 324–327, 396, 399, 407–409, 422, 431–432, 435–436, 601–603, 653–654, 657, 685–686, 689–690, 692, 696, 698–699, 753, 756–758
   types.py841384%37, 40–42, 112–116, 122–125
src/ota_metadata/utils
   cert_store.py86890%58–59, 73, 87, 91, 102, 123, 127
src/ota_proxy
   __init__.py15660%48, 50, 52, 61, 71–72
   __main__.py880%16, 18–20, 22, 24–25, 27
   _consts.py170100% 
   cache_control_header.py68494%71, 91, 113, 121
   cache_streaming.py1442284%154–156, 184–186, 211, 225, 229–230, 265–266, 268, 280, 349, 355–356, 359, 367–370
   config.py200100% 
   db.py741875%110, 116, 154, 160–161, 164, 170, 172, 193–200, 202–203
   errors.py50100% 
   external_cache.py282028%31, 35, 40–42, 44–45, 48–49, 51–53, 60, 63–65, 69–72
   lru_cache_helper.py47295%84–85
   ota_cache.py2346472%71–72, 143, 146–147, 159–160, 192–193, 210, 231, 250–254, 258–260, 262, 264–271, 273–275, 278–279, 283–284, 288, 335, 343–345, 418, 445, 448–449, 471–473, 477–479, 485, 487–489, 494, 520–522, 557–559, 586, 592, 607
   server_app.py1413972%79, 82, 88, 107, 111, 170, 179, 221–222, 224–226, 229, 234–235, 238, 241–242, 245, 248, 251, 254, 267–268, 271–272, 274, 277, 303–306, 309, 323–325, 331–333
   utils.py140100% 
src/otaclient
   __init__.py5260%17, 19
   __main__.py110%16
   _logging.py513335%43–44, 46–47, 49–54, 56–57, 59–60, 62–65, 67, 77, 80–82, 84–86, 89–90, 92–96
   _status_monitor.py1611193%46, 48–49, 157, 160, 177, 180, 196–197, 204, 207
   _types.py750100% 
   errors.py120199%97
   main.py492940%30, 32–39, 41–42, 44–45, 47–48, 52, 54, 59, 61, 67–69, 72–73, 77–80, 82
   ota_core.py31610566%101, 128, 130–131, 135–136, 138–140, 144–145, 150–151, 157, 159, 198–201, 207, 211, 217, 356, 388–389, 391, 400, 403, 408–409, 412, 418, 420–424, 460–463, 466–477, 480–483, 519–522, 538, 540–542, 605–612, 617, 620–627, 660–661, 667, 671–672, 678, 703–705, 707, 782, 810–811, 813–814, 822, 824–830
   utils.py37294%73–74
src/otaclient/boot_control
   __init__.py40100% 
   _firmware_package.py932276%82, 86, 136, 180, 186, 209–210, 213–218, 220–221, 224–229, 231
   _grub.py41812769%214, 262–265, 271–275, 312–313, 320–325, 328–334, 337, 340–341, 346, 348–350, 359–365, 367–368, 370–372, 381–383, 385–387, 466–467, 471–472, 524, 530, 556, 578, 582–583, 598–600, 624–627, 639, 643–645, 647–649, 708–711, 736–739, 762–765, 777–778, 781–782, 817, 823, 843–844, 846, 871–873, 891–894, 919–922, 929–932, 937–945, 950–957
   _jetson_cboot.py2612610%20, 22–25, 27–29, 35–40, 42, 58–60, 62, 64–65, 71, 75, 134, 137, 139–140, 143, 150–151, 159–160, 163, 169–170, 178, 187–191, 193, 199, 202–203, 209, 212–213, 218–219, 221, 227–228, 231–232, 235–237, 239, 245, 250–252, 254–256, 261, 263–266, 268–269, 278–279, 282–283, 288–289, 292–296, 299–300, 305–306, 309, 312–316, 321–324, 327, 330–331, 334, 337–338, 341, 345–350, 354–355, 359, 362–363, 366, 369–372, 374, 377–378, 382, 385, 388–391, 393, 400, 404–405, 408–409, 415–416, 422, 424–425, 429, 431, 433–435, 438, 442, 445, 448–449, 451, 454, 462–463, 470, 480, 483, 491–492, 497–500, 502, 509, 511–513, 519–520, 524–525, 528, 532, 535, 537, 544–548, 550, 562–565, 568, 571, 573, 580, 590–592, 594, 596, 599, 602, 605, 607–608, 611–615, 619–621, 623, 631–635, 637, 640, 644, 647, 658–659, 664, 674, 677–683, 687–693, 697–706, 710–718, 722, 724, 726–728
   _jetson_common.py1724573%132, 140, 288–291, 294, 311, 319, 354, 359–364, 382, 408–409, 411–413, 417–420, 422–423, 425–429, 431, 438–439, 442–443, 453, 456–457, 460, 462, 506–507
   _jetson_uefi.py40427432%124–126, 131–132, 151–153, 158–161, 328, 446, 448–451, 455, 459–460, 462–470, 472, 484–485, 488–489, 492–493, 496–498, 502–503, 508–510, 514, 518–519, 522–523, 526–527, 531, 534–535, 537, 542–543, 547, 550–551, 556, 560–561, 564, 568–570, 572, 576–579, 581–582, 604–605, 609–610, 612, 616, 620–621, 624–625, 632, 635–637, 640, 642–643, 648–649, 652–655, 657–658, 663, 665–666, 674, 677–680, 682–683, 685, 689–690, 694, 702–706, 709–710, 712, 715–719, 722, 725–729, 733–734, 737–742, 745–746, 749–752, 754–755, 762–763, 773–776, 779, 782–785, 788–792, 795–796, 799, 802–805, 808, 810, 815–816, 819, 822–825, 827, 833, 838–839, 858–859, 862, 870–871, 878, 888, 891, 898–899, 904–907, 915–918, 926–927, 939–942, 944, 947, 950, 958, 969–971, 973–975, 977–981, 986–987, 989, 1002, 1006, 1009, 1019, 1024, 1032–1033, 1036, 1040, 1042–1044, 1050–1051, 1056, 1064–1069, 1074–1079, 1084–1092, 1097–1104, 1112–1114
   _ota_status_control.py1021189%117, 122, 127, 240, 244–245, 248, 255, 257–258, 273
   _rpi_boot.py28713353%53, 56, 120–121, 125, 133–136, 150–153, 158–159, 161–162, 167–168, 171–172, 181–182, 222, 228–232, 235, 253–255, 259–261, 266–268, 272–274, 284–285, 288, 291, 293–294, 296–297, 299–301, 307, 310–311, 321–324, 332–336, 338, 340–341, 346–347, 354, 357–362, 393, 395–398, 408–411, 415–416, 418–422, 450–453, 472–475, 501–504, 509–517, 522–529, 544–547, 554–557, 565–567
   _slot_mnt_helper.py100100% 
   configs.py510100% 
   protocol.py60100% 
   selecter.py412929%44–46, 49–50, 54–55, 58–60, 63, 65, 69, 77–79, 81–82, 84–85, 89, 91, 93–94, 96, 98–99, 101, 103
src/otaclient/configs
   __init__.py170100% 
   _cfg_configurable.py470100% 
   _cfg_consts.py47197%97
   _common.py80100% 
   _ecu_info.py56394%59, 64–65
   _proxy_info.py51590%85, 87–88, 90, 101
   cfg.py190100% 
src/otaclient/create_standby
   __init__.py13192%36
   common.py2264480%59, 62–63, 67–69, 71, 75–76, 78, 126, 174–176, 178–180, 182, 185–188, 192, 203, 279–280, 282–287, 299, 339, 367, 370–372, 388–389, 403, 407, 429–430
   interface.py70100% 
   rebuild_mode.py1151091%98–100, 119, 150–155
src/otaclient/grpc
   _otaproxy_ctx.py644135%38, 40–41, 43–45, 47, 52–56, 58, 77–78, 80, 83, 87, 100–102, 106–107, 109, 111–112, 118–120, 124, 133–134, 136–141, 143–145
src/otaclient/grpc/api_v2
   ecu_status.py1531093%75, 78, 81, 147, 171, 173, 300, 370–371, 410
   ecu_tracker.py341944%40–42, 48, 52–54, 61–63, 66, 70–74, 77–79
   servicer.py1273671%82, 171–173, 180, 191–192, 233–234, 237–241, 250–259, 266, 272, 275–278, 284–285, 292, 295, 301, 304
   types.py46295%78–79
src/otaclient_api/v2
   api_caller.py39684%45–47, 83–85
   types.py2562391%86, 89–92, 131, 209–210, 212, 259, 262–263, 506–508, 512–513, 515, 518–519, 522–523, 586
src/otaclient_common
   __init__.py341555%42–44, 61, 63, 68–77
   _io.py64198%41
   cmdhelper.py130100% 
   common.py1061090%148, 151–153, 168, 175–177, 271, 275
   downloader.py1991094%107–108, 126, 153, 369, 424, 428, 516–517, 526
   linux.py611575%51–53, 59, 69, 74, 76, 108–109, 133–134, 190, 195–196, 198
   logging.py29196%55
   persist_file_handling.py1181884%113, 118, 150–152, 163, 192–193, 228–232, 242–244, 246–247
   proto_streamer.py42880%33, 48, 66–67, 72, 81–82, 100
   proto_wrapper.py3984688%87, 165, 172, 184–186, 205, 210, 221, 257, 263, 268, 299, 303, 307, 402, 462, 469, 472, 492, 499, 501, 526, 532, 535, 537, 562, 568, 571, 573, 605, 609, 611, 625, 642, 669, 672, 676, 707, 713, 762–763, 765, 803–805
   retry_task_map.py129993%134–135, 153–154, 207–208, 210, 230–231
   shm_status.py952177%79–80, 83–84, 105, 120–122, 134, 139, 156–160, 169–170, 172, 179, 192, 204
   typing.py31487%48, 97–98, 100
TOTAL6629169774% 

Tests Skipped Failures Errors Time
236 0 💤 0 ❌ 0 🔥 12m 5s ⏱️

Please sign in to comment.