diff --git a/.gitignore b/.gitignore index 21b4b31a483..3172e74a312 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ docs/build/ docs/_build/ build/ sky_logs/ +sky/clouds/service_catalog/data_fetchers/*.csv diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index fcbc54133ba..bf86942d105 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -235,7 +235,12 @@ def make_deploy_resources_variables( else: # Convert to GCP names: # https://cloud.google.com/compute/docs/gpus - resources_vars['gpu'] = 'nvidia-tesla-{}'.format(acc.lower()) + if acc == 'A100-80GB': + # A100-80GB has a different name pattern. + resources_vars['gpu'] = 'nvidia-{}'.format(acc.lower()) + else: + resources_vars['gpu'] = 'nvidia-tesla-{}'.format( + acc.lower()) resources_vars['gpu_count'] = acc_count if acc == 'K80': # CUDA driver version 470.57.02, CUDA Library 11.4 diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index cd827106c50..a5e1f3b88e8 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -3,7 +3,6 @@ This module loads the service catalog file and can be used to query instance types and pricing information for Azure. 
""" -import ast from typing import Dict, List, Optional, Tuple from sky.clouds import cloud @@ -68,13 +67,7 @@ def get_region_zones_for_instance_type(instance_type: str, def get_gen_version_from_instance_type(instance_type: str) -> Optional[int]: - cell = _df[_df['InstanceType'] == instance_type]['capabilities'].iloc[0] - cap_list = ast.literal_eval(cell) - gen_version = None - for cap in cap_list: - if cap['name'] == 'HyperVGenerations': - gen_version = cap['value'] - return gen_version + return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0] def list_accelerators(gpus_only: bool, diff --git a/sky/clouds/service_catalog/constants.py b/sky/clouds/service_catalog/constants.py index baa2cf8c6eb..acec209e923 100644 --- a/sky/clouds/service_catalog/constants.py +++ b/sky/clouds/service_catalog/constants.py @@ -2,5 +2,5 @@ import os HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long -CATALOG_SCHEMA_VERSION = 'v2' +CATALOG_SCHEMA_VERSION = 'v3' LOCAL_CATALOG_DIR = os.path.expanduser('~/.sky/catalogs/') diff --git a/sky/clouds/service_catalog/data_fetchers/analyze.py b/sky/clouds/service_catalog/data_fetchers/analyze.py new file mode 100644 index 00000000000..c9de9e9a06c --- /dev/null +++ b/sky/clouds/service_catalog/data_fetchers/analyze.py @@ -0,0 +1,54 @@ +import copy +from typing import Tuple +import pandas as pd + +from sky.clouds.service_catalog import common + + +def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame, + check_tuple: Tuple[str]) -> pd.DataFrame: + """Returns the difference between two dataframes.""" + original_resources = original_df[check_tuple] + new_resources = new_df[check_tuple] + + return new_resources.merge( + original_resources, on=check_tuple, how='left', + indicator=True)[lambda x: x['_merge'] == 'left_only'].sort_values( + by=check_tuple) + + +CLOUD_CHECKS = { + 'aws': ['InstanceType', 'Region', 'AvailabilityZone'], + 
'azure': ['InstanceType', 'Region'], + 'gcp': ['InstanceType', 'Region', 'AcceleratorName', 'AcceleratorCount'] +} + +table = {} + +for cloud in CLOUD_CHECKS: + result = {} + print(f'=> Checking {cloud}') + original_df = common.read_catalog(f'{cloud}.csv') + new_df = pd.read_csv(f'{cloud}.csv') + + current_check_tuple = CLOUD_CHECKS[cloud] + + diff_df = resource_diff(original_df, new_df, current_check_tuple) + diff_df.merge(new_df, on=current_check_tuple, + how='left').to_csv(f'{cloud}_diff.csv', index=False) + + result['#resources'] = len(diff_df) + + check_price = current_check_tuple + ['Price'] + diff_df = resource_diff(original_df, new_df, check_price) + result['#prices'] = len(diff_df) + + check_price = current_check_tuple + ['SpotPrice'] + diff_df = resource_diff(original_df, new_df, check_price) + result['#spot_prices'] = len(diff_df) + + table[cloud] = result + +summary = pd.DataFrame(table).T +summary.to_csv('diff_summary.csv') +print(summary) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index bf61cff7996..2e674cce295 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -1,9 +1,8 @@ """A script that queries AWS API to get instance types and pricing information. - This script takes about 1 minute to finish. 
""" import datetime -from typing import Tuple +from typing import Tuple, Union import numpy as np import pandas as pd @@ -11,7 +10,39 @@ from sky.adaptors import aws -REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2'] +ALL_REGIONS = [ + 'us-east-1', + 'us-east-2', + 'us-west-1', + 'us-west-2', + 'ca-central-1', + 'eu-central-1', + 'eu-west-1', + 'eu-west-2', + 'eu-south-1', + 'eu-west-3', + 'eu-north-1', + 'me-south-1', + # 'me-central-1', # failed for no credential + 'af-south-1', + 'ap-east-1', + 'ap-southeast-3', + # 'ap-south-1', # failed for no credential + 'ap-northeast-3', + 'ap-northeast-2', + 'ap-southeast-1', + 'ap-southeast-2', + 'ap-northeast-1', +] +US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2'] + +REGIONS = US_REGIONS + +USEFUL_COLUMNS = [ + 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone' +] + # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is # only available in this region, but it serves pricing information for all regions. 
PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' @@ -44,12 +75,18 @@ def get_pricing_table(region: str) -> pd.DataFrame: print(f'{region} downloading pricing table') url = PRICING_TABLE_URL_FMT.format(region=region) df = pd.read_csv(url, skiprows=5, low_memory=False) + df.rename(columns={ + 'Instance Type': 'InstanceType', + 'PricePerUnit': 'Price', + }, + inplace=True) return df[(df['TermType'] == 'OnDemand') & (df['Operating System'] == 'Linux') & df['Pre Installed S/W'].isnull() & (df['CapacityStatus'] == 'Used') & - (df['Tenancy'].isin(['Host', 'Shared'])) & - df['PricePerUnit'] > 0].set_index('Instance Type') + (df['Tenancy'].isin(['Host', 'Shared'])) & df['Price'] > 0][[ + 'InstanceType', 'Price', 'vCPU', 'Memory' + ]] @ray.remote @@ -62,78 +99,97 @@ def get_spot_pricing_table(region: str) -> pd.DataFrame: ret = [] for response in response_iterator: ret = ret + response['SpotPriceHistory'] - df = pd.DataFrame(ret).set_index(['InstanceType', 'AvailabilityZone']) + df = pd.DataFrame(ret)[['InstanceType', 'AvailabilityZone', 'SpotPrice']] + df = df.set_index(['InstanceType', 'AvailabilityZone']) return df @ray.remote -def get_instance_types_df(region: str) -> pd.DataFrame: - df, zone_df, pricing_df, spot_pricing_df = ray.get([ - get_instance_types.remote(region), - get_availability_zones.remote(region), - get_pricing_table.remote(region), - get_spot_pricing_table.remote(region) - ]) - print(f'{region} Processing dataframes') - - def get_price(row): - t = row['InstanceType'] - try: - return pricing_df.loc[t]['PricePerUnit'] - except KeyError: - return np.nan - - def get_spot_price(row): - instance = row['InstanceType'] - zone = row['AvailabilityZone'] - try: - return spot_pricing_df.loc[(instance, zone)]['SpotPrice'] - except KeyError: - return np.nan - - def get_acc_info(row) -> Tuple[str, float]: - accelerator = None - for col, info_key in [('GpuInfo', 'Gpus'), - ('InferenceAcceleratorInfo', 
'Accelerators'), - ('FpgaInfo', 'Fpgas')]: - info = row.get(col) - if isinstance(info, dict): - accelerator = info[info_key][0] - if accelerator is None: - return None, np.nan - return accelerator['Name'], accelerator['Count'] - - def get_vcpus(row) -> float: - return float(row['VCpuInfo']['DefaultVCpus']) - - def get_memory_gib(row) -> float: - return row['MemoryInfo']['SizeInMiB'] // 1024 - - def get_additional_columns(row): - acc_name, acc_count = get_acc_info(row) - # AWS p3dn.24xlarge offers a different V100 GPU. - # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/ - if row['InstanceType'] == 'p3dn.24xlarge': - acc_name = 'V100-32GB' - return pd.Series({ - 'Price': get_price(row), - 'SpotPrice': get_spot_price(row), - 'AcceleratorName': acc_name, - 'AcceleratorCount': acc_count, - 'vCPUs': get_vcpus(row), - 'MemoryGiB': get_memory_gib(row), - }) - - df['Region'] = region - df = df.merge(pd.DataFrame(zone_df), how='cross') - df = pd.concat([df, df.apply(get_additional_columns, axis='columns')], - axis='columns') +def get_instance_types_df(region: str) -> Union[str, pd.DataFrame]: + try: + df, zone_df, pricing_df, spot_pricing_df = ray.get([ + get_instance_types.remote(region), + get_availability_zones.remote(region), + get_pricing_table.remote(region), + get_spot_pricing_table.remote(region), + ]) + print(f'{region} Processing dataframes') + + def get_acc_info(row) -> Tuple[str, float]: + accelerator = None + for col, info_key in [('GpuInfo', 'Gpus'), + ('InferenceAcceleratorInfo', 'Accelerators'), + ('FpgaInfo', 'Fpgas')]: + info = row.get(col) + if isinstance(info, dict): + accelerator = info[info_key][0] + if accelerator is None: + return None, np.nan + return accelerator['Name'], accelerator['Count'] + + def get_vcpus(row) -> float: + if not np.isnan(row['vCPU']): + return float(row['vCPU']) + return float(row['VCpuInfo']['DefaultVCpus']) + + def get_memory_gib(row) -> float: + if isinstance(row['MemoryInfo'], 
dict): + return row['MemoryInfo']['SizeInMiB'] // 1024 + return int(row['Memory'].split(' GiB')[0]) + + def get_additional_columns(row) -> pd.Series: + acc_name, acc_count = get_acc_info(row) + # AWS p3dn.24xlarge offers a different V100 GPU. + # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/ + if row['InstanceType'] == 'p3dn.24xlarge': + acc_name = 'V100-32GB' + if row['InstanceType'] == 'p4de.24xlarge': + acc_name = 'A100-80GB' + acc_count = 8 + return pd.Series({ + 'AcceleratorName': acc_name, + 'AcceleratorCount': acc_count, + 'vCPUs': get_vcpus(row), + 'MemoryGiB': get_memory_gib(row), + }) + + # The AWS API may not have all the instance types in the pricing table, + # so we need to merge the two dataframes. + df = df.merge(pricing_df, on=['InstanceType'], how='outer') + df['Region'] = region + # Cartesian product of instance types and availability zones, so that + # we can join the spot pricing table per instance type and zone. + df = df.merge(pd.DataFrame(zone_df), how='cross') + + # Add spot price column, by joining the spot pricing table. + df = df.merge(spot_pricing_df, + left_on=['InstanceType', 'AvailabilityZone'], + right_index=True, + how='outer') + + # Extract vCPUs, memory, and accelerator info from the columns. 
+ df = pd.concat( + [df, df.apply(get_additional_columns, axis='columns')], + axis='columns') + # patch the GpuInfo for p4de.24xlarge + df.loc[df['InstanceType'] == 'p4de.24xlarge', 'GpuInfo'] = 'A100-80GB' + df = df[USEFUL_COLUMNS] + except Exception as e: + print(f'{region} failed with {e}') + return region return df def get_all_regions_instance_types_df(): - dfs = ray.get([get_instance_types_df.remote(r) for r in REGIONS]) - df = pd.concat(dfs) + df_or_regions = ray.get([get_instance_types_df.remote(r) for r in REGIONS]) + new_dfs = [] + for df_or_region in df_or_regions: + if isinstance(df_or_region, str): + print(f'{df_or_region} failed') + else: + new_dfs.append(df_or_region) + + df = pd.concat(new_dfs) df.sort_values(['InstanceType', 'Region'], inplace=True) return df diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 15ab13e70eb..922cdf65216 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -4,7 +4,7 @@ """ import json import subprocess -from typing import Optional, Tuple +from typing import Optional import urllib import numpy as np @@ -12,7 +12,7 @@ import ray import requests -REGIONS = [ +US_REGIONS = [ 'centralus', 'eastus', 'eastus2', @@ -23,11 +23,28 @@ 'westus2', # 'WestUS3', # WestUS3 pricing table is broken as of 2021/11. ] + +# To enable all the regions, uncomment the following lines and re-add `Tuple` to the typing import. 
+# def get_regions() -> Tuple[str]: +# """Get all available regions.""" +# proc = subprocess.run('az account list-locations --query "[?not_null(metadata.latitude)] .{RegionName:name , RegionDisplayName:regionalDisplayName}" -o json', shell=True, check=True, stdout=subprocess.PIPE) +# items = json.loads(proc.stdout.decode('utf-8')) +# regions = [item['RegionName'] for item in items if not item['RegionName'].endswith('stg')] +# return tuple(regions) +# all_regions = get_regions() + +# REGIONS = all_regions +REGIONS = US_REGIONS REGION_SET = set(REGIONS) # Azure secretly deprecated the M60 family which is still returned by its API. # We have to manually remove it. DEPRECATED_FAMILIES = ['standardNVSv2Family'] +USEFUL_COLUMNS = [ + 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation' +] + def get_pricing_url(region: Optional[str] = None) -> str: filters = [ @@ -61,6 +78,7 @@ def get_pricing_df(region: Optional[str] = None) -> pd.DataFrame: url = content.get('NextPageLink') print(f'Done fetching pricing {region}') df = pd.DataFrame(all_items) + assert 'productName' in df.columns, (region, df.columns) return df[(~df['productName'].str.contains(' Windows')) & (df['unitPrice'] > 0)] @@ -83,10 +101,18 @@ def get_sku_df() -> pd.DataFrame: ) print(f'Done fetching SKUs') items = json.loads(proc.stdout.decode('ascii')) - df = pd.DataFrame(items) + filtered_items = [] + for item in items: + # zones = item['locationInfo'][0]['zones'] + region = item['locations'][0] + if region not in REGION_SET: + continue + item['Region'] = region + filtered_items.append(item) + + df = pd.DataFrame(filtered_items) df = df[(df['resourceType'] == 'virtualMachines')] - df['Region'] = df.apply(lambda row: row['locations'][0], axis='columns') - return df[df.apply(lambda row: row['Region'] in REGION_SET, axis='columns')] + return df def get_gpu_name(family: str) -> str: @@ -120,47 +146,51 @@ def 
get_all_regions_instance_types_df(): get_all_regions_pricing_df.remote(), get_sku_df.remote(), ]) - df.drop_duplicates(inplace=True) print(f'Processing dataframes') + df.drop_duplicates(inplace=True) - def get_price(row): - is_promo = row['name'].endswith('_Promo') - sku = row['name'].replace('_Promo', '') - region = row['Region'] - pricing_rows = df[(df['armSkuName'] == sku) & - (df['armRegionName'] == region) & - (df['unitPrice'] > 0) & - (~df['skuName'].str.contains(' Spot'))] - if is_promo: - pricing_rows = pricing_rows[pricing_rows['skuName'].str.contains( - ' Low Priority')] - else: - pricing_rows = pricing_rows[~pricing_rows['skuName'].str. - contains(' Low Priority')] - assert len(pricing_rows) <= 1, (sku, pricing_rows) - if len(pricing_rows) == 0: - return np.nan - return pricing_rows.iloc[0]['unitPrice'] - - def get_spot_price(row): - sku = row['name'] - region = row['Region'] - spot_pricing_rows = df[(df['armSkuName'] == sku) & - (df['armRegionName'] == region) & - (df['unitPrice'] > 0) & - (df['skuName'].str.contains(' Spot'))] - assert len(spot_pricing_rows) <= 1, (sku, spot_pricing_rows) - if len(spot_pricing_rows) == 0: - return np.nan - return spot_pricing_rows.iloc[0]['unitPrice'] + df = df[df['unitPrice'] > 0] + + print('Getting price df') + df['merge_name'] = df['armSkuName'] + df['is_promo'] = df['skuName'].str.endswith(' Low Priority') + df.rename(columns={ + 'armSkuName': 'InstanceType', + 'armRegionName': 'Region', + }, + inplace=True) + demand_df = df[~df['skuName'].str.contains(' Spot')][[ + 'is_promo', 'InstanceType', 'Region', 'unitPrice' + ]] + spot_df = df[df['skuName'].str.contains(' Spot')][[ + 'is_promo', 'InstanceType', 'Region', 'unitPrice' + ]] + demand_df.set_index(['InstanceType', 'Region', 'is_promo'], inplace=True) + spot_df.set_index(['InstanceType', 'Region', 'is_promo'], inplace=True) + + demand_df = demand_df.rename(columns={'unitPrice': 'Price'}) + spot_df = spot_df.rename(columns={'unitPrice': 'SpotPrice'}) + + 
print('Getting sku df') + df_sku['is_promo'] = df_sku['name'].str.endswith('_Promo') + df_sku.rename(columns={'name': 'InstanceType'}, inplace=True) + df_sku['merge_name'] = df_sku['InstanceType'].str.replace('_Promo', '') + + print('Joining') + df = df_sku.join(demand_df, + on=['merge_name', 'Region', 'is_promo'], + how='left') + df = df.join(spot_df, on=['merge_name', 'Region', 'is_promo'], how='left') def get_capabilities(row): gpu_name = None gpu_count = np.nan vcpus = np.nan memory_gb = np.nan + gen_version = None caps = row['capabilities'] for item in caps: + assert isinstance(item, dict), (item, caps) if item['name'] == 'GPUs': gpu_name = get_gpu_name(row['family']) if gpu_name is not None: @@ -169,26 +199,35 @@ def get_capabilities(row): vcpus = float(item['value']) elif item['name'] == 'MemoryGB': memory_gb = item['value'] - return gpu_name, gpu_count, vcpus, memory_gb + elif item['name'] == 'HyperVGenerations': + gen_version = item['value'] + return gpu_name, gpu_count, vcpus, memory_gb, gen_version def get_additional_columns(row): - gpu_name, gpu_count, vcpus, memory_gb = get_capabilities(row) + gpu_name, gpu_count, vcpus, memory_gb, gen_version = get_capabilities( + row) return pd.Series({ - 'Price': get_price(row), - 'SpotPrice': get_spot_price(row), 'AcceleratorName': gpu_name, 'AcceleratorCount': gpu_count, 'vCPUs': vcpus, 'MemoryGiB': memory_gb, 'GpuInfo': gpu_name, + 'Generation': gen_version, }) df_ret = pd.concat( - [df_sku, df_sku.apply(get_additional_columns, axis='columns')], + [df, df.apply(get_additional_columns, axis='columns')], axis='columns', - ).rename(columns={'name': 'InstanceType'}) + ) + + before_drop_len = len(df_ret) + df_ret.dropna(subset=['InstanceType'], inplace=True, how='all') + after_drop_len = len(df_ret) + print('Dropped {} duplicated rows'.format(before_drop_len - after_drop_len)) + # Filter out deprecated families df_ret = df_ret.loc[~df_ret['family'].isin(DEPRECATED_FAMILIES)] + df_ret = df_ret[USEFUL_COLUMNS] return 
df_ret diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py index 9bcf75f1853..b29c0864345 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py @@ -17,6 +17,12 @@ NOT_AVAILABLE_STR = 'Not available in this region' +ALL_REGION_PREFIX = '' +US_REGION_PREFIX = 'us-' +REGION_PREFIX = US_REGION_PREFIX +# Uncomment the following line to fetch VM pricings from all regions. +# REGION_PREFIX = ALL_REGION_PREFIX + # Refer to: https://github.com/skypilot-org/skypilot/issues/1006 UNSUPPORTED_VMS = ['t2a-standard', 'f1-micro'] @@ -35,7 +41,6 @@ } # FIXME(woosuk): This URL can change. -A2_PRICING_URL = '/compute/vm-instance-pricing_04e9ea153c2ab1ea37254107b92dc081d4459ca9fe9a46ef5b39e5d63353f089.frame' # pylint: disable=line-too-long A2_INSTANCE_TYPES = { 'a2-highgpu-1g': { 'vCPUs': 12, @@ -235,7 +240,7 @@ def parse_price(price_str): if 'InstanceType' in df.columns: # Price table for pre-defined instance types. instance_type = df['InstanceType'].iloc[0] - if instance_type == 'a2-highgpu-1g': + if instance_type in ['a2-highgpu-1g', 'a2-ultragpu-1g']: # The A2 price table includes the GPU cost. return None @@ -259,6 +264,9 @@ def parse_price(price_str): else: # Others (e.g., per vCPU hour or per GB hour pricing rule table). 
df = df[['Item', 'Region', 'Price', 'SpotPrice']] + item = df['Item'].iloc[0] + if item == 'Predefined vCPUs': + df = get_a2_df(df) return df @@ -294,13 +302,12 @@ def parse_machine_type_list(list_str): return df -def get_a2_df(): - a2_pricing = get_vm_price_table(GCP_URL + A2_PRICING_URL) - cpu_pricing = a2_pricing[a2_pricing['Item'] == 'Predefined vCPUs'] - memory_pricing = a2_pricing[a2_pricing['Item'] == 'Predefined Memory'] +def get_a2_df(a2_pricing_df): + cpu_pricing = a2_pricing_df[a2_pricing_df['Item'] == 'Predefined vCPUs'] + memory_pricing = a2_pricing_df[a2_pricing_df['Item'] == 'Predefined Memory'] table = [] - for region in a2_pricing['Region'].unique(): + for region in a2_pricing_df['Region'].unique(): per_cpu_price = cpu_pricing[cpu_pricing['Region'] == region]['Price'].values[0] per_cpu_spot_price = cpu_pricing[cpu_pricing['Region'] == @@ -344,9 +351,7 @@ def get_vm_df(): df for df in vm_dfs if df is not None and 'InstanceType' in df.columns ] - # Handle A2 instance types separately. - a2_df = get_a2_df() - vm_df = pd.concat(vm_dfs + [a2_df]) + vm_df = pd.concat(vm_dfs) vm_zones = get_vm_zones(GCP_VM_ZONES_URL) # Remove regions not in the pricing data. @@ -370,7 +375,7 @@ def get_vm_df(): # Block non-US regions. # FIXME(woosuk): Allow all regions. - vm_df = vm_df[vm_df['Region'].str.startswith('us-')] + vm_df = vm_df[vm_df['Region'].str.startswith(REGION_PREFIX)] return vm_df @@ -526,7 +531,7 @@ def get_gpu_df(): # Block non-US regions. # FIXME(woosuk): Allow all regions. - gpu_df = gpu_df[gpu_df['Region'].str.startswith('us-')] + gpu_df = gpu_df[gpu_df['Region'].str.startswith(REGION_PREFIX)] return gpu_df diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 67f0cf931d8..e63015be0a4 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -3,6 +3,7 @@ For now this service catalog is manually coded. In the future it should be queried from GCP API. 
""" +from collections import defaultdict import typing from typing import Dict, List, Optional, Tuple @@ -30,12 +31,20 @@ # TODO(zongheng): fix A100 info directly in catalog. # https://cloud.google.com/blog/products/compute/a2-vms-with-nvidia-a100-gpus-are-ga # count -> vm type -_A100_INSTANCE_TYPES = { - 1: 'a2-highgpu-1g', - 2: 'a2-highgpu-2g', - 4: 'a2-highgpu-4g', - 8: 'a2-highgpu-8g', - 16: 'a2-megagpu-16g', +_A100_INSTANCE_TYPE_DICTS = { + 'A100': { + 1: 'a2-highgpu-1g', + 2: 'a2-highgpu-2g', + 4: 'a2-highgpu-4g', + 8: 'a2-highgpu-8g', + 16: 'a2-megagpu-16g', + }, + 'A100-80GB': { + 1: 'a2-ultragpu-1g', + 2: 'a2-ultragpu-2g', + 4: 'a2-ultragpu-4g', + 8: 'a2-ultragpu-8g', + } } # Number of CPU cores per GPU based on the AWS setting. @@ -167,10 +176,10 @@ def get_instance_type_for_accelerator( if instance_list is None: return None, fuzzy_candidate_list - if acc_name == 'A100': + if acc_name in _A100_INSTANCE_TYPE_DICTS: # If A100 is used, host VM type must be A2. # https://cloud.google.com/compute/docs/gpus#a100-gpus - return [_A100_INSTANCE_TYPES[acc_count]], [] + return [_A100_INSTANCE_TYPE_DICTS[acc_name][acc_count]], [] if acc_name not in _NUM_ACC_TO_NUM_CPU: acc_name = 'DEFAULT' @@ -258,17 +267,18 @@ def list_accelerators( results = common.list_accelerators_impl('GCP', _df, gpus_only, name_filter, case_sensitive) - a100_infos = results.get('A100', None) - if a100_infos is None: + a100_infos = results.get('A100', []) + results.get('A100-80GB', []) + if not a100_infos: return results # Unlike other GPUs that can be attached to different sizes of N1 VMs, # A100 GPUs can only be attached to fixed-size A2 VMs. # Thus, we can show their exact cost including the host VM prices. 
- new_infos = [] + new_infos = defaultdict(list) for info in a100_infos: assert pd.isna(info.instance_type) and pd.isna(info.memory), a100_infos - a100_host_vm_type = _A100_INSTANCE_TYPES[info.accelerator_count] + a100_host_vm_type = _A100_INSTANCE_TYPE_DICTS[info.accelerator_name][ + info.accelerator_count] df = _df[_df['InstanceType'] == a100_host_vm_type] cpu_count = df['vCPUs'].iloc[0] memory = df['MemoryGiB'].iloc[0] @@ -280,7 +290,7 @@ def list_accelerators( a100_host_vm_type, None, use_spot=True) - new_infos.append( + new_infos[info.accelerator_name].append( info._replace( instance_type=a100_host_vm_type, cpu_count=cpu_count, @@ -289,7 +299,7 @@ def list_accelerators( price=info.price + vm_price, spot_price=info.spot_price + vm_spot_price, )) - results['A100'] = new_infos + results.update(new_infos) return results @@ -342,7 +352,7 @@ def check_host_accelerator_compatibility( return # Treat A100 as a special case. - if acc_name == 'A100': + if acc_name in _A100_INSTANCE_TYPE_DICTS: # A100 must be attached to A2 instance type. if not instance_type.startswith('a2-'): with ux_utils.print_exception_no_traceback(): @@ -382,8 +392,8 @@ def check_accelerator_attachable_to_host(instance_type: str, assert instance_type == 'TPU-VM' or instance_type.startswith('n1-') return - if acc_name == 'A100': - valid_counts = list(_A100_INSTANCE_TYPES.keys()) + if acc_name in _A100_INSTANCE_TYPE_DICTS: + valid_counts = list(_A100_INSTANCE_TYPE_DICTS[acc_name].keys()) else: valid_counts = list(_NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name].keys()) if acc_count not in valid_counts: @@ -392,8 +402,8 @@ def check_accelerator_attachable_to_host(instance_type: str, f'{acc_name}:{acc_count} is not launchable on GCP. 
' f'The valid {acc_name} counts are {valid_counts}.') - if acc_name == 'A100': - a100_instance_type = _A100_INSTANCE_TYPES[acc_count] + if acc_name in _A100_INSTANCE_TYPE_DICTS: + a100_instance_type = _A100_INSTANCE_TYPE_DICTS[acc_name][acc_count] if instance_type != a100_instance_type: with ux_utils.print_exception_no_traceback(): raise exceptions.ResourcesMismatchError( diff --git a/tests/test_list_accelerators.py b/tests/test_list_accelerators.py index ffabe5de897..56bde6b4f9d 100644 --- a/tests/test_list_accelerators.py +++ b/tests/test_list_accelerators.py @@ -6,6 +6,7 @@ def test_list_accelerators(): assert 'V100' in result, result assert 'tpu-v3-8' in result, result assert 'Inferentia' not in result, result + assert 'A100-80GB' in result, result def test_list_ccelerators_all(): @@ -13,6 +14,7 @@ def test_list_ccelerators_all(): assert 'V100' in result, result assert 'tpu-v3-8' in result, result assert 'Inferentia' in result, result + assert 'A100-80GB' in result, result def test_list_accelerators_filters():