Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lost my socks #15751

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion awx/main/dispatch/worker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,12 @@
except psycopg.InterfaceError:
logger.warning("Stale Postgres message bus connection, reconnecting")
continue
except (db.DatabaseError, psycopg.OperationalError):
except (db.DatabaseError, psycopg.OperationalError) as exc:

Check warning on line 255 in awx/main/dispatch/worker/base.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/base.py#L255

Added line #L255 was not covered by tests
# If we never connected to begin with, then be brief, no risk of losing work
if init is False:
logger.info(f'Could not create listener connection: {exc}')
time.sleep(1) # Patience to avoid log spam
sys.exit(1)

Check warning on line 260 in awx/main/dispatch/worker/base.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/base.py#L258-L260

Added lines #L258 - L260 were not covered by tests
# If we have attained steady state operation, tolerate short-term database hiccups
if not self.pg_is_down:
logger.exception(f"Error consuming new events from postgres, will retry for {self.pg_max_wait} s")
Expand Down
11 changes: 11 additions & 0 deletions awx/main/management/commands/run_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
# All Rights Reserved.
import logging
import yaml
import os
import sys
import time

Check warning on line 7 in awx/main/management/commands/run_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_dispatcher.py#L5-L7

Added lines #L5 - L7 were not covered by tests

from django.conf import settings
from django.core.management.base import BaseCommand
Expand All @@ -11,6 +14,8 @@
from awx.main.dispatch.pool import AutoscalePool
from awx.main.dispatch.worker import AWXConsumerPG, TaskWorker
from awx.main.analytics.subsystem_metrics import DispatcherMetricsServer
from awx.main.utils.redis import exit_if_redis_down
from awx.main.tasks.receptor import RECEPTOR_SOCK_FILE

Check warning on line 18 in awx/main/management/commands/run_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_dispatcher.py#L17-L18

Added lines #L17 - L18 were not covered by tests

logger = logging.getLogger('awx.main.dispatch')

Expand Down Expand Up @@ -63,8 +68,14 @@

consumer = None

exit_if_redis_down(logger)

Check warning on line 71 in awx/main/management/commands/run_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_dispatcher.py#L71

Added line #L71 was not covered by tests
DispatcherMetricsServer().start()

if not os.path.exists(RECEPTOR_SOCK_FILE):
logger.info(f'Receptor sock file does not exist at {RECEPTOR_SOCK_FILE}')
time.sleep(1) # Patience to avoid log spam
sys.exit(1)

Check warning on line 77 in awx/main/management/commands/run_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_dispatcher.py#L75-L77

Added lines #L75 - L77 were not covered by tests

try:
queues = ['tower_broadcast_all', 'tower_settings_change', get_task_queuename()]
consumer = AWXConsumerPG('dispatcher', TaskWorker(), queues, AutoscalePool(min_workers=4), schedule=settings.CELERYBEAT_SCHEDULE)
Expand Down
3 changes: 2 additions & 1 deletion awx/main/models/ha.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)
from awx.main.models.unified_jobs import UnifiedJob
from awx.main.utils.common import get_corrected_cpu, get_cpu_effective_capacity, get_corrected_memory, get_mem_effective_capacity
from awx.main.utils.redis import ping_redis
from awx.main.models.mixins import RelatedJobsMixin, ResourceMixin
from awx.main.models.receptor_address import ReceptorAddress

Expand Down Expand Up @@ -397,7 +398,7 @@ def local_health_check(self):
try:
# if redis is down for some reason, that means we can't persist
# playbook event data; we should consider this a zero capacity event
redis.Redis.from_url(settings.BROKER_URL).ping()
ping_redis()
except redis.ConnectionError:
errors = _('Failed to connect to Redis')

Expand Down
3 changes: 2 additions & 1 deletion awx/main/tasks/receptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

logger = logging.getLogger('awx.main.tasks.receptor')
__RECEPTOR_CONF = '/etc/receptor/receptor.conf'
RECEPTOR_SOCK_FILE = '/var/run/receptor/receptor.sock'
__RECEPTOR_CONF_LOCKFILE = f'{__RECEPTOR_CONF}.lock'
RECEPTOR_ACTIVE_STATES = ('Pending', 'Running')

Expand Down Expand Up @@ -758,7 +759,7 @@ def kube_config(self):
{'local-only': None},
{'log-level': settings.RECEPTOR_LOG_LEVEL},
{'node': {'firewallrules': [{'action': 'reject', 'tonode': settings.CLUSTER_HOST_ID, 'toservice': 'control'}]}},
{'control-service': {'service': 'control', 'filename': '/var/run/receptor/receptor.sock', 'permissions': '0660'}},
{'control-service': {'service': 'control', 'filename': RECEPTOR_SOCK_FILE, 'permissions': '0660'}},
{'work-command': {'worktype': 'local', 'command': 'ansible-runner', 'params': 'worker', 'allowruntimeparams': True}},
{'work-signing': {'privatekey': '/etc/receptor/work_private_key.pem', 'tokenexpiration': '1m'}},
{
Expand Down
19 changes: 19 additions & 0 deletions awx/main/utils/redis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import sys
import time

from django.conf import settings

import redis


def ping_redis():
    """Send a PING to the Redis server configured at ``settings.BROKER_URL``.

    Nothing is caught here: any error raised by the redis client (callers
    in this change handle ``redis.ConnectionError``) propagates unchanged.
    """
    client = redis.Redis.from_url(settings.BROKER_URL)
    client.ping()


def exit_if_redis_down(logger):
    """Exit the process with status 1 when Redis cannot be pinged.

    logger: object whose ``info`` method is used to report the failure.

    Returns normally when ``ping_redis()`` succeeds. When it raises
    ``redis.ConnectionError`` the error is logged, the process waits one
    second, and then ``sys.exit(1)`` is called. Other exception types are
    not handled here and propagate to the caller.
    """
    try:
        ping_redis()
    except redis.ConnectionError as ping_error:
        message = f'Redis ping error: {ping_error}'
        logger.info(message)
        # Patience to avoid log spam
        time.sleep(1)
        sys.exit(1)

Check warning on line 19 in awx/main/utils/redis.py

View check run for this annotation

Codecov / codecov/patch

awx/main/utils/redis.py#L14-L19

Added lines #L14 - L19 were not covered by tests
Loading