From fab6d0770decd52d97b74caa248a20a9072b5e19 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 24 Sep 2024 13:45:06 -0700 Subject: [PATCH 1/7] [UX] default to minimal logging (no module/line number/timestamp). --- sky/optimizer.py | 7 +++---- sky/utils/env_options.py | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/sky/optimizer.py b/sky/optimizer.py index 4326329579d..a4ce4f39f83 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -965,10 +965,10 @@ def _print_candidates(node_to_candidate_map: _TaskToPerCloudCandidates): f'Multiple {cloud} instances satisfy ' f'{acc_name}:{int(acc_count)}. ' f'The cheapest {candidate_list[0]!r} is considered ' - f'among:\n{instance_list}.\n') + f'among:\n{instance_list}.') if is_multi_instances: logger.info( - f'To list more details, run \'sky show-gpus {acc_name}\'.') + f'To list more details, run: sky show-gpus {acc_name}\n') @staticmethod def _optimize_dag( @@ -1101,8 +1101,7 @@ def ordinal_number(n): Optimizer.print_optimized_plan(graph, topo_order, best_plan, total_time, total_cost, node_to_cost_map, minimize_cost) - if not env_options.Options.MINIMIZE_LOGGING.get(): - Optimizer._print_candidates(local_node_to_candidate_map) + Optimizer._print_candidates(local_node_to_candidate_map) return best_plan diff --git a/sky/utils/env_options.py b/sky/utils/env_options.py index 166bf42ce80..d56ac15ba3e 100644 --- a/sky/utils/env_options.py +++ b/sky/utils/env_options.py @@ -5,17 +5,24 @@ class Options(enum.Enum): """Environment variables for SkyPilot.""" - IS_DEVELOPER = 'SKYPILOT_DEV' - SHOW_DEBUG_INFO = 'SKYPILOT_DEBUG' - DISABLE_LOGGING = 'SKYPILOT_DISABLE_USAGE_COLLECTION' - MINIMIZE_LOGGING = 'SKYPILOT_MINIMIZE_LOGGING' + + # (env var name, default value) + IS_DEVELOPER = ('SKYPILOT_DEV', False) + SHOW_DEBUG_INFO = ('SKYPILOT_DEBUG', False) + DISABLE_LOGGING = ('SKYPILOT_DISABLE_USAGE_COLLECTION', False) + MINIMIZE_LOGGING = ('SKYPILOT_MINIMIZE_LOGGING', True) # Internal: this is used to skip the cloud user identity check, which is # used to protect cluster operations in a multi-identity scenario. # Currently, this is only used in the job and serve controller, as there # will not be multiple identities, and skipping the check can increase # robustness. - SKIP_CLOUD_IDENTITY_CHECK = 'SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK' + SKIP_CLOUD_IDENTITY_CHECK = ('SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK', False) + + def __init__(self, env_var, default): + self.env_var = env_var + self.default = default def get(self): """Check if an environment variable is set to True.""" - return os.getenv(self.value, 'False').lower() in ('true', '1') + return os.getenv(self.env_var, + str(self.default)).lower() in ('true', '1') From 34ad9b1ce6d184351344c8f4a9257b1758f211bb Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 24 Sep 2024 13:55:36 -0700 Subject: [PATCH 2/7] Fix mypy. --- sky/utils/controller_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 866aaf1ee1a..548b734b27a 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -362,7 +362,7 @@ def shared_controller_vars_to_fill( 'sky_python_cmd': constants.SKY_PYTHON_CMD, } env_vars: Dict[str, str] = { - env.value: '1' for env in env_options.Options if env.get() + env.value: str(int(env.get())) for env in env_options.Options } env_vars.update({ # Should not use $USER here, as that env var can be empty when @@ -370,7 +370,7 @@ def shared_controller_vars_to_fill( constants.USER_ENV_VAR: getpass.getuser(), constants.USER_ID_ENV_VAR: common_utils.get_user_hash(), # Skip cloud identity check to avoid the overhead. - env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value: '1', + env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value[0]: '1', }) if skypilot_config.loaded(): # Only set the SKYPILOT_CONFIG env var if the user has a config file. From 9fe7f478aab413988af67cf3e1f28af779d3ab03 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Tue, 24 Sep 2024 14:20:51 -0700 Subject: [PATCH 3/7] Fix typing --- sky/execution.py | 2 +- sky/utils/controller_utils.py | 4 ++-- sky/utils/env_options.py | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 1f6bd09f9c3..3faa6c84be1 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -334,7 +334,7 @@ def _execute( # # Disable the usage collection for this status command. env = dict(os.environ, - **{env_options.Options.DISABLE_LOGGING.value: '1'}) + **{str(env_options.Options.DISABLE_LOGGING): '1'}) subprocess_utils.run( 'sky status --no-show-managed-jobs --no-show-services', env=env) print() diff --git a/sky/utils/controller_utils.py b/sky/utils/controller_utils.py index 548b734b27a..9680af065aa 100644 --- a/sky/utils/controller_utils.py +++ b/sky/utils/controller_utils.py @@ -362,7 +362,7 @@ def shared_controller_vars_to_fill( 'sky_python_cmd': constants.SKY_PYTHON_CMD, } env_vars: Dict[str, str] = { - env.value: str(int(env.get())) for env in env_options.Options + str(env): str(int(env.get())) for env in env_options.Options } env_vars.update({ # Should not use $USER here, as that env var can be empty when @@ -370,7 +370,7 @@ def shared_controller_vars_to_fill( constants.USER_ENV_VAR: getpass.getuser(), constants.USER_ID_ENV_VAR: common_utils.get_user_hash(), # Skip cloud identity check to avoid the overhead. - env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.value[0]: '1', + str(env_options.Options.SKIP_CLOUD_IDENTITY_CHECK): '1', }) if skypilot_config.loaded(): # Only set the SKYPILOT_CONFIG env var if the user has a config file. diff --git a/sky/utils/env_options.py b/sky/utils/env_options.py index d56ac15ba3e..d7ab87d0368 100644 --- a/sky/utils/env_options.py +++ b/sky/utils/env_options.py @@ -22,6 +22,9 @@ def __init__(self, env_var, default): self.env_var = env_var self.default = default + def __repr__(self) -> str: + return self.env_var + def get(self): """Check if an environment variable is set to True.""" return os.getenv(self.env_var, From 60bcec50d5b49d79059edeee6a2c24bc8a839072 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Wed, 25 Sep 2024 14:33:18 -0700 Subject: [PATCH 4/7] Update sky/utils/env_options.py Co-authored-by: Tian Xia --- sky/utils/env_options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/env_options.py b/sky/utils/env_options.py index d7ab87d0368..849d81678d9 100644 --- a/sky/utils/env_options.py +++ b/sky/utils/env_options.py @@ -18,7 +18,7 @@ class Options(enum.Enum): # robustness. SKIP_CLOUD_IDENTITY_CHECK = ('SKYPILOT_SKIP_CLOUD_IDENTITY_CHECK', False) - def __init__(self, env_var, default): + def __init__(self, env_var: str, default: bool) -> None: self.env_var = env_var self.default = default From 9b2c43768d95142c3999aaa7ce895c6cb3f74a5e Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Wed, 25 Sep 2024 14:33:24 -0700 Subject: [PATCH 5/7] Update sky/utils/env_options.py Co-authored-by: Tian Xia --- sky/utils/env_options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/utils/env_options.py b/sky/utils/env_options.py index 849d81678d9..48855e6cbf6 100644 --- a/sky/utils/env_options.py +++ b/sky/utils/env_options.py @@ -25,7 +25,7 @@ def __init__(self, env_var: str, default: bool) -> None: def __repr__(self) -> str: return self.env_var - def get(self): + def get(self) -> bool: """Check if an environment variable is set to True.""" return os.getenv(self.env_var, str(self.default)).lower() in ('true', '1') From 8ef0dd7e543b28e0ac47368f1fbbe67ced20395f Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Wed, 25 Sep 2024 14:46:28 -0700 Subject: [PATCH 6/7] Account for debug flag. --- sky/sky_logging.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sky/sky_logging.py b/sky/sky_logging.py index c8a243c72cf..232fc6dd9d5 100644 --- a/sky/sky_logging.py +++ b/sky/sky_logging.py @@ -10,10 +10,11 @@ from sky.utils import env_options from sky.utils import rich_utils -# If the SKYPILOT_MINIMIZE_LOGGING environment variable is set to True, -# remove logging prefixes and unnecessary information in optimizer -_FORMAT = (None if env_options.Options.MINIMIZE_LOGGING.get() else - '%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s') +# UX: Should we show logging prefixes and some extra information in optimizer? +_show_logging_prefix = (env_options.Options.SHOW_DEBUG_INFO.get() or + not env_options.Options.MINIMIZE_LOGGING.get()) +_FORMAT = ('%(levelname).1s %(asctime)s %(filename)s:%(lineno)d] %(message)s' + if _show_logging_prefix else None) _DATE_FORMAT = '%m-%d %H:%M:%S' From fe7b739ec11034c172daaee644c6181f81461a43 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Wed, 25 Sep 2024 14:54:33 -0700 Subject: [PATCH 7/7] Remove prefixes from docs. --- docs/source/examples/auto-failover.rst | 91 ++++++++++++++------------ 1 file changed, 49 insertions(+), 42 deletions(-) diff --git a/docs/source/examples/auto-failover.rst b/docs/source/examples/auto-failover.rst index 99ee5703738..c23f6273697 100644 --- a/docs/source/examples/auto-failover.rst +++ b/docs/source/examples/auto-failover.rst @@ -60,18 +60,22 @@ provisioner handles such a request: .. code-block:: console $ sky launch -c gpu --gpus V100 - ... # optimizer output - I 02-11 21:17:43 cloud_vm_ray_backend.py:1034] Creating a new cluster: "gpu" [1x GCP(n1-highmem-8, {'V100': 1.0})]. - I 02-11 21:17:43 cloud_vm_ray_backend.py:1034] Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters. - I 02-11 21:17:43 cloud_vm_ray_backend.py:614] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-11-21-17-43-171661/provision.log - I 02-11 21:17:43 cloud_vm_ray_backend.py:624] - I 02-11 21:17:43 cloud_vm_ray_backend.py:624] Launching on GCP us-central1 (us-central1-a) - W 02-11 21:17:56 cloud_vm_ray_backend.py:358] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + + ... + Creating a new cluster: "gpu" [1x GCP(n1-highmem-8, {'V100': 1.0})]. + Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters. + To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-11-21-17-43-171661/provision.log + + Launching on GCP us-central1 (us-central1-a) + Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + ... + + Launching on GCP us-central1 (us-central1-f) + Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-f (message: The zone 'projects/intercloud-320520/zones/us-central1-f' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + ... + + Launching on GCP us-west1 (us-west1-a) ... - I 02-11 21:18:24 cloud_vm_ray_backend.py:624] Launching on GCP us-central1 (us-central1-f) - W 02-11 21:18:38 cloud_vm_ray_backend.py:358] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-f (message: The zone 'projects/intercloud-320520/zones/us-central1-f' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) - I 02-11 21:18:38 cloud_vm_ray_backend.py:624] - I 02-11 21:18:38 cloud_vm_ray_backend.py:624] Launching on GCP us-west1 (us-west1-a) Successfully connected to 35.230.120.87. GCP was chosen as the best cloud to run the task. There was no capacity in any of the regions in US Central, so the auto-failover provisioner moved to US West instead, allowing for our instance to be successfully provisioned. @@ -88,21 +92,24 @@ AWS, where it succeeded after two regions: .. code-block:: console $ sky launch -c v100-8 --gpus V100:8 - ... # optimizer output - I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Creating a new cluster: "v100-8" [1x GCP(n1-highmem-8, {'V100': 8.0})]. - I 02-23 16:39:59 cloud_vm_ray_backend.py:1010] Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters. - I 02-23 16:39:59 cloud_vm_ray_backend.py:658] To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-23-16-39-58-577551/provision.log - I 02-23 16:39:59 cloud_vm_ray_backend.py:668] - I 02-23 16:39:59 cloud_vm_ray_backend.py:668] Launching on GCP us-central1 (us-central1-a) - W 02-23 16:40:17 cloud_vm_ray_backend.py:403] Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + ... - I 02-23 16:42:15 cloud_vm_ray_backend.py:668] Launching on AWS us-east-2 (us-east-2a,us-east-2b,us-east-2c) - W 02-23 16:42:26 cloud_vm_ray_backend.py:477] Got error(s) in all zones of us-east-2: - W 02-23 16:42:26 cloud_vm_ray_backend.py:479] create_instances: Attempt failed with An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 0): We currently do not have sufficient p3.16xlarge capacity in the Availability Zone you requested (us-east-2a). Our system will be working on provisioning additional capacity. You can currently get p3.16xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-2b., retrying. + Creating a new cluster: "v100-8" [1x GCP(n1-highmem-8, {'V100': 8.0})]. + Tip: to reuse an existing cluster, specify --cluster-name (-c) in the CLI or use sky.launch(.., cluster_name=..) in the Python API. Run `sky status` to see existing clusters. + To view detailed progress: tail -n100 -f sky_logs/sky-2022-02-23-16-39-58-577551/provision.log + + Launching on GCP us-central1 (us-central1-a) + Got ZONE_RESOURCE_POOL_EXHAUSTED in us-central1-a (message: The zone 'projects/intercloud-320520/zones/us-central1-a' does not have enough resources available to fulfill the request. Try a different zone, or try again later.) + ... + + Launching on AWS us-east-2 (us-east-2a,us-east-2b,us-east-2c) + Got error(s) in all zones of us-east-2: + create_instances: Attempt failed with An error occurred (InsufficientInstanceCapacity) when calling the RunInstances operation (reached max retries: 0): We currently do not have sufficient p3.16xlarge capacity in the Availability Zone you requested (us-east-2a). Our system will be working on provisioning additional capacity. You can currently get p3.16xlarge capacity by not specifying an Availability Zone in your request or choosing us-east-2b., retrying. ... - I 02-23 16:42:26 cloud_vm_ray_backend.py:668] - I 02-23 16:42:26 cloud_vm_ray_backend.py:668] Launching on AWS us-west-2 (us-west-2a,us-west-2b,us-west-2c,us-west-2d) - I 02-23 16:47:04 cloud_vm_ray_backend.py:740] Successfully provisioned or found existing VM. Setup completed. + + Launching on AWS us-west-2 (us-west-2a,us-west-2b,us-west-2c,us-west-2d) + ... + Successfully provisioned or found existing VM. Setup completed. Multiple Candidate GPUs @@ -125,13 +132,13 @@ A10, L4, and A10g GPUs, using :code:`sky launch task.yaml`. $ sky launch task.yaml ... - I 11-19 08:07:45 optimizer.py:910] ----------------------------------------------------------------------------------------------------- - I 11-19 08:07:45 optimizer.py:910] CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN - I 11-19 08:07:45 optimizer.py:910] ----------------------------------------------------------------------------------------------------- - I 11-19 08:07:45 optimizer.py:910] Azure Standard_NV6ads_A10_v5 6 55 A10:1 eastus 0.45 ✔ - I 11-19 08:07:45 optimizer.py:910] GCP g2-standard-4 4 16 L4:1 us-east4-a 0.70 - I 11-19 08:07:45 optimizer.py:910] AWS g5.xlarge 4 16 A10G:1 us-east-1 1.01 - I 11-19 08:07:45 optimizer.py:910] ----------------------------------------------------------------------------------------------------- + ----------------------------------------------------------------------------------------------------- + CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN + ----------------------------------------------------------------------------------------------------- + Azure Standard_NV6ads_A10_v5 6 55 A10:1 eastus 0.45 ✔ + GCP g2-standard-4 4 16 L4:1 us-east4-a 0.70 + AWS g5.xlarge 4 16 A10G:1 us-east-1 1.01 + ----------------------------------------------------------------------------------------------------- @@ -212,15 +219,15 @@ This will generate the following output: $ sky launch -c mycluster task.yaml ... - I 12-20 23:55:56 optimizer.py:717] - I 12-20 23:55:56 optimizer.py:840] Considered resources (1 node): - I 12-20 23:55:56 optimizer.py:910] --------------------------------------------------------------------------------------------- - I 12-20 23:55:56 optimizer.py:910] CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN - I 12-20 23:55:56 optimizer.py:910] --------------------------------------------------------------------------------------------- - I 12-20 23:55:56 optimizer.py:910] GCP g2-standard-96 96 384 L4:8 us-east4-a 7.98 ✔ - I 12-20 23:55:56 optimizer.py:910] AWS g5.48xlarge 192 768 A10G:8 us-east-1 16.29 - I 12-20 23:55:56 optimizer.py:910] GCP a2-highgpu-8g 96 680 A100:8 us-east1-b 29.39 - I 12-20 23:55:56 optimizer.py:910] AWS p4d.24xlarge 96 1152 A100:8 us-east-1 32.77 - I 12-20 23:55:56 optimizer.py:910] --------------------------------------------------------------------------------------------- - I 12-20 23:55:56 optimizer.py:910] + + Considered resources (1 node): + --------------------------------------------------------------------------------------------- + CLOUD INSTANCE vCPUs Mem(GB) ACCELERATORS REGION/ZONE COST ($) CHOSEN + --------------------------------------------------------------------------------------------- + GCP g2-standard-96 96 384 L4:8 us-east4-a 7.98 ✔ + AWS g5.48xlarge 192 768 A10G:8 us-east-1 16.29 + GCP a2-highgpu-8g 96 680 A100:8 us-east1-b 29.39 + AWS p4d.24xlarge 96 1152 A100:8 us-east-1 32.77 + --------------------------------------------------------------------------------------------- + Launching a new cluster 'mycluster'. Proceed? [Y/n]: