Skip to content

Commit

Permalink
Merge branch 'skypilot-org:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
hyoxt121 authored Feb 11, 2025
2 parents a14b4cf + 8136023 commit 660ead7
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 12 deletions.
14 changes: 11 additions & 3 deletions sky/jobs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,18 @@
# We use 50 GB disk size to reduce the cost.
CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}

# TODO(zhwu): This is no longer accurate after #4592, which increased the
# length of the user hash appended to the cluster name from 4 to 8 chars. This
# causes the cluster name on GCP to be wrapped twice. However, we cannot
# directly update this constant, because the job cluster cleanup and much other
# logic in managed jobs depend on it, i.e., updating this constant would
# break backward compatibility and existing jobs.
#
# Max length of the cluster name for GCP is 35, the user hash to be attached is
# 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
# length of the cluster name prefix is 25 to avoid the cluster name being too
# long and truncated twice during the cluster creation.
# 4(now 8)+1 chars, and we assume the maximum length of the job id is
# 4(now 8)+1, so the max length of the cluster name prefix is 25(should be 21
# now) to avoid the cluster name being too long and truncated twice during the
# cluster creation.
JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25

# The version of the lib files that jobs/utils use. Whenever there is an API
Expand Down
8 changes: 6 additions & 2 deletions tests/smoke_tests/smoke_tests_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,16 +240,20 @@ def get_cluster_name() -> str:
"""
caller_func_name = inspect.stack()[1][3]
test_name = caller_func_name.replace('_', '-').replace('test-', 't-')
test_name = test_name.replace('managed-jobs', 'jobs')
    # Use 20 to avoid the cluster name being truncated twice for managed jobs.
test_name = common_utils.make_cluster_name_on_cloud(test_name,
24,
20,
add_user_hash=False)
return f'{test_name}-{test_id}'


def terminate_gcp_replica(name: str, zone: str, replica_id: int) -> str:
cluster_name = serve.generate_replica_cluster_name(name, replica_id)
name_on_cloud = common_utils.make_cluster_name_on_cloud(
cluster_name, sky.GCP.max_cluster_name_length())
query_cmd = (f'gcloud compute instances list --filter='
f'"(labels.ray-cluster-name:{cluster_name})" '
f'"(labels.ray-cluster-name:{name_on_cloud})" '
f'--zones={zone} --format="value(name)"')
return (f'gcloud compute instances delete --zone={zone}'
f' --quiet $({query_cmd})')
Expand Down
18 changes: 11 additions & 7 deletions tests/smoke_tests/test_managed_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,9 @@ def test_managed_jobs_cancellation_aws(aws_config_region):
@pytest.mark.managed_jobs
def test_managed_jobs_cancellation_gcp():
name = smoke_tests_utils.get_cluster_name()
name_3 = f'{name}-3'
    # Reduce the name length further to avoid the cluster name being truncated
    # twice after adding the suffix '-3'.
name_3 = name.replace('-jobs', '-j') + '-3'
name_3_on_cloud = common_utils.make_cluster_name_on_cloud(
name_3, jobs.JOBS_CLUSTER_NAME_PREFIX_LENGTH, add_user_hash=False)
zone = 'us-west3-b'
Expand Down Expand Up @@ -630,20 +632,20 @@ def test_managed_jobs_cancellation_gcp():
job_status=[sky.ManagedJobStatus.CANCELLED],
timeout=155),
# Test cancellation during spot job is recovering.
f'sky jobs launch --cloud gcp --zone {zone} -n {name}-3 --use-spot "sleep 1000" -y -d',
f'sky jobs launch --cloud gcp --zone {zone} -n {name_3} --use-spot "sleep 1000" -y -d',
smoke_tests_utils.
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
job_name=f'{name}-3',
job_name=name_3,
job_status=[sky.ManagedJobStatus.RUNNING],
timeout=335),
# Terminate the cluster manually.
terminate_cmd,
smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=f'{name}-3'),
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name}-3 | head -n1 | grep "RECOVERING"',
f'sky jobs cancel -y -n {name}-3',
smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name_3),
f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name_3} | head -n1 | grep "RECOVERING"',
f'sky jobs cancel -y -n {name_3}',
smoke_tests_utils.
get_cmd_wait_until_managed_job_status_contains_matching_job_name(
job_name=f'{name}-3',
job_name=name_3,
job_status=[sky.ManagedJobStatus.CANCELLED],
timeout=155),
# The cluster should be terminated (STOPPING) after cancellation. We don't use the `=` operator here because
Expand All @@ -669,6 +671,8 @@ def test_managed_jobs_retry_logs(generic_cloud: str):
test = smoke_tests_utils.Test(
'managed_jobs_retry_logs',
[
# TODO(zhwu): we should make the override for generic_cloud work
# with multiple stages in pipeline.
f'sky jobs launch -n {name} {yaml_path} -y -d',
f'sky jobs logs -n {name} | tee {log_file.name}',
# First attempt
Expand Down

0 comments on commit 660ead7

Please sign in to comment.