From 6411b3a21d88a35608bb9fd81bbf00791d6ed42f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Oct 2022 22:42:00 -0700 Subject: [PATCH 01/30] Port changes --- .../data_fetchers/fetch_aws.py | 167 +++++++++++------- .../data_fetchers/fetch_azure.py | 110 ++++++++---- .../data_fetchers/fetch_gcp.py | 10 +- 3 files changed, 180 insertions(+), 107 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index bf61cff7996..4063742d489 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -1,9 +1,8 @@ """A script that queries AWS API to get instance types and pricing information. - This script takes about 1 minute to finish. """ import datetime -from typing import Tuple +from typing import Tuple, Union import numpy as np import pandas as pd @@ -11,7 +10,34 @@ from sky.adaptors import aws -REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2'] +ALL_REGIONS = [ + 'us-east-1', + 'us-east-2', + 'us-west-1', + 'us-west-2', + 'ca-central-1', + 'eu-central-1', + 'eu-west-1', + 'eu-west-2', + 'eu-south-1', + 'eu-west-3', + 'eu-north-1', + 'me-south-1', + # 'me-central-1', # failed for no credential + 'af-south-1', + 'ap-east-1', + 'ap-southeast-3', + # 'ap-south-1', # failed for no credential + 'ap-northeast-3', + 'ap-northeast-2', + 'ap-southeast-1', + 'ap-southeast-2', + 'ap-northeast-1', +] +US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2'] + +REGIONS = US_REGIONS + # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is # only available in this region, but it serves pricing information for all regions. PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' @@ -67,73 +93,84 @@ def get_spot_pricing_table(region: str) -> pd.DataFrame: @ray.remote -def get_instance_types_df(region: str) -> pd.DataFrame: - df, zone_df, pricing_df, spot_pricing_df = ray.get([ - get_instance_types.remote(region), - get_availability_zones.remote(region), - get_pricing_table.remote(region), - get_spot_pricing_table.remote(region) - ]) - print(f'{region} Processing dataframes') - - def get_price(row): - t = row['InstanceType'] - try: - return pricing_df.loc[t]['PricePerUnit'] - except KeyError: - return np.nan - - def get_spot_price(row): - instance = row['InstanceType'] - zone = row['AvailabilityZone'] - try: - return spot_pricing_df.loc[(instance, zone)]['SpotPrice'] - except KeyError: - return np.nan - - def get_acc_info(row) -> Tuple[str, float]: - accelerator = None - for col, info_key in [('GpuInfo', 'Gpus'), - ('InferenceAcceleratorInfo', 'Accelerators'), - ('FpgaInfo', 'Fpgas')]: - info = row.get(col) - if isinstance(info, dict): - accelerator = info[info_key][0] - if accelerator is None: - return None, np.nan - return accelerator['Name'], accelerator['Count'] - - def get_vcpus(row) -> float: - return float(row['VCpuInfo']['DefaultVCpus']) - - def get_memory_gib(row) -> float: - return row['MemoryInfo']['SizeInMiB'] // 1024 - - def get_additional_columns(row): - acc_name, acc_count = get_acc_info(row) - # AWS p3dn.24xlarge offers a different V100 GPU. 
- # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/ - if row['InstanceType'] == 'p3dn.24xlarge': - acc_name = 'V100-32GB' - return pd.Series({ - 'Price': get_price(row), - 'SpotPrice': get_spot_price(row), - 'AcceleratorName': acc_name, - 'AcceleratorCount': acc_count, - 'vCPUs': get_vcpus(row), - 'MemoryGiB': get_memory_gib(row), - }) - - df['Region'] = region - df = df.merge(pd.DataFrame(zone_df), how='cross') - df = pd.concat([df, df.apply(get_additional_columns, axis='columns')], - axis='columns') +def get_instance_types_df(region: str) -> Union[str, pd.DataFrame]: + try: + df, zone_df, pricing_df, spot_pricing_df = ray.get([ + get_instance_types.remote(region), + get_availability_zones.remote(region), + get_pricing_table.remote(region), + get_spot_pricing_table.remote(region) + ]) + print(f'{region} Processing dataframes') + + def get_price(row): + t = row['InstanceType'] + try: + return pricing_df.loc[t]['PricePerUnit'] + except KeyError: + return np.nan + + def get_spot_price(row): + instance = row['InstanceType'] + zone = row['AvailabilityZone'] + try: + return spot_pricing_df.loc[(instance, zone)]['SpotPrice'] + except KeyError: + return np.nan + + def get_acc_info(row) -> Tuple[str, float]: + accelerator = None + for col, info_key in [('GpuInfo', 'Gpus'), + ('InferenceAcceleratorInfo', 'Accelerators'), + ('FpgaInfo', 'Fpgas')]: + info = row.get(col) + if isinstance(info, dict): + accelerator = info[info_key][0] + if accelerator is None: + return None, np.nan + return accelerator['Name'], accelerator['Count'] + + def get_vcpus(row) -> float: + return float(row['VCpuInfo']['DefaultVCpus']) + + def get_memory_gib(row) -> float: + return row['MemoryInfo']['SizeInMiB'] // 1024 + + def get_additional_columns(row): + acc_name, acc_count = get_acc_info(row) + # AWS p3dn.24xlarge offers a different V100 GPU. + # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/ + if row['InstanceType'] == 'p3dn.24xlarge': + acc_name = 'V100-32GB' + return pd.Series({ + 'Price': get_price(row), + 'SpotPrice': get_spot_price(row), + 'AcceleratorName': acc_name, + 'AcceleratorCount': acc_count, + 'vCPUs': get_vcpus(row), + 'MemoryGiB': get_memory_gib(row), + }) + + df['Region'] = region + df = df.merge(pd.DataFrame(zone_df), how='cross') + df = pd.concat([df, df.apply(get_additional_columns, axis='columns')], + axis='columns') + except Exception as e: + print(f'{region} failed with {e}') + return region return df def get_all_regions_instance_types_df(): dfs = ray.get([get_instance_types_df.remote(r) for r in REGIONS]) - df = pd.concat(dfs) + new_dfs = [] + for df in dfs: + if isinstance(df, str): + print(f'{df} failed') + else: + new_dfs.append(df) + + df = pd.concat(new_dfs) df.sort_values(['InstanceType', 'Region'], inplace=True) return df diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 15ab13e70eb..31d99e4a8ae 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -3,6 +3,7 @@ This script takes about 1 minute to finish. """ import json +import os import subprocess from typing import Optional, Tuple import urllib @@ -12,7 +13,7 @@ import ray import requests -REGIONS = [ +US_REGIONS = [ 'centralus', 'eastus', 'eastus2', @@ -23,11 +24,24 @@ 'westus2', # 'WestUS3', # WestUS3 pricing table is broken as of 2021/11. 
] + +# def get_regions() -> Tuple[str]: +# """Get all available regions.""" +# proc = subprocess.run('az account list-locations --query "[?not_null(metadata.latitude)] .{RegionName:name , RegionDisplayName:regionalDisplayName}" -o json', shell=True, check=True, stdout=subprocess.PIPE) +# items = json.loads(proc.stdout.decode('utf-8')) +# regions = [item['RegionName'] for item in items if not item['RegionName'].endswith('stg')] +# return tuple(regions) +# all_regions = get_regions() + +# REGIONS = all_regions +REGIONS = US_REGIONS REGION_SET = set(REGIONS) # Azure secretly deprecated the M60 family which is still returned by its API. # We have to manually remove it. DEPRECATED_FAMILIES = ['standardNVSv2Family'] +USEFULE_COLUMNS = ['InstanceType','AcceleratorName','AcceleratorCount','vCPUs','MemoryGiB','GpuInfo','Price','SpotPrice','Region','AvailabilityZone'] + def get_pricing_url(region: Optional[str] = None) -> str: filters = [ @@ -61,6 +75,7 @@ def get_pricing_df(region: Optional[str] = None) -> pd.DataFrame: url = content.get('NextPageLink') print(f'Done fetching pricing {region}') df = pd.DataFrame(all_items) + assert 'productName' in df.columns, (region, df.columns) return df[(~df['productName'].str.contains(' Windows')) & (df['unitPrice'] > 0)] @@ -83,10 +98,27 @@ def get_sku_df() -> pd.DataFrame: ) print(f'Done fetching SKUs') items = json.loads(proc.stdout.decode('ascii')) - df = pd.DataFrame(items) + new_items = [] + for item in items: + zones = item['locationInfo'][0]['zones'] + region = item['locations'][0] + if region not in REGION_SET: + continue + if len(zones) == 0: + new_item = item.copy() + new_item['Region'] = region + new_item['AvailabilityZone'] = f'{region}-0' + new_items.append(new_item) + + for zone in zones: + new_item = item.copy() + new_item['Region'] = region + new_item['AvailabilityZone'] = f'{region}-{zone}' + new_items.append(new_item) + + df = pd.DataFrame(new_items) df = df[(df['resourceType'] == 'virtualMachines')] - df['Region'] = df.apply(lambda row: row['locations'][0], axis='columns') - return df[df.apply(lambda row: row['Region'] in REGION_SET, axis='columns')] + return df def get_gpu_name(family: str) -> str: @@ -120,39 +152,33 @@ def get_all_regions_instance_types_df(): get_all_regions_pricing_df.remote(), get_sku_df.remote(), ]) - df.drop_duplicates(inplace=True) print(f'Processing dataframes') + df.drop_duplicates(inplace=True) + + df = df[df['unitPrice'] > 0] + + print('Getting price df') + df['merge_name'] = df['armSkuName'] + df['is_promo'] = df['skuName'].str.endswith(' Low Priority') + df.rename(columns={'armSkuName': 'InstanceType', 'armRegionName': 'Region'}, inplace=True) + demand_df = df[~df['skuName'].str.contains(' Spot')][['is_promo', 'InstanceType', 'Region', 'unitPrice']] + spot_df = df[df['skuName'].str.contains(' Spot')][['is_promo', 'InstanceType', 'Region', 'unitPrice']] + demand_df.set_index(['InstanceType', 'Region', 'is_promo'], inplace=True) + spot_df.set_index(['InstanceType', 'Region', 'is_promo'], inplace=True) + + demand_df = demand_df.rename(columns={'unitPrice': 'Price'}) + spot_df = spot_df.rename(columns={'unitPrice': 'SpotPrice'}) + + print('Getting sku df') + df_sku['is_promo'] = df_sku['name'].str.endswith('_Promo') + df_sku.rename(columns={'name': 'InstanceType'}, inplace=True) + df_sku['merge_name'] = df_sku['InstanceType'].str.replace('_Promo', '') + + print('Joining') + df = df_sku.join(demand_df, on=['merge_name', 'Region', 'is_promo'], how='left') + df = df.join(spot_df, on=['merge_name', 'Region', 
'is_promo'], how='left') + # df.dropna(subset=['Price', 'SpotPrice'], inplace=True, how='all') - def get_price(row): - is_promo = row['name'].endswith('_Promo') - sku = row['name'].replace('_Promo', '') - region = row['Region'] - pricing_rows = df[(df['armSkuName'] == sku) & - (df['armRegionName'] == region) & - (df['unitPrice'] > 0) & - (~df['skuName'].str.contains(' Spot'))] - if is_promo: - pricing_rows = pricing_rows[pricing_rows['skuName'].str.contains( - ' Low Priority')] - else: - pricing_rows = pricing_rows[~pricing_rows['skuName'].str. - contains(' Low Priority')] - assert len(pricing_rows) <= 1, (sku, pricing_rows) - if len(pricing_rows) == 0: - return np.nan - return pricing_rows.iloc[0]['unitPrice'] - - def get_spot_price(row): - sku = row['name'] - region = row['Region'] - spot_pricing_rows = df[(df['armSkuName'] == sku) & - (df['armRegionName'] == region) & - (df['unitPrice'] > 0) & - (df['skuName'].str.contains(' Spot'))] - assert len(spot_pricing_rows) <= 1, (sku, spot_pricing_rows) - if len(spot_pricing_rows) == 0: - return np.nan - return spot_pricing_rows.iloc[0]['unitPrice'] def get_capabilities(row): gpu_name = None @@ -161,6 +187,7 @@ def get_capabilities(row): memory_gb = np.nan caps = row['capabilities'] for item in caps: + assert isinstance(item, dict), (item, caps) if item['name'] == 'GPUs': gpu_name = get_gpu_name(row['family']) if gpu_name is not None: @@ -174,8 +201,6 @@ def get_capabilities(row): def get_additional_columns(row): gpu_name, gpu_count, vcpus, memory_gb = get_capabilities(row) return pd.Series({ - 'Price': get_price(row), - 'SpotPrice': get_spot_price(row), 'AcceleratorName': gpu_name, 'AcceleratorCount': gpu_count, 'vCPUs': vcpus, @@ -184,11 +209,18 @@ def get_additional_columns(row): }) df_ret = pd.concat( - [df_sku, df_sku.apply(get_additional_columns, axis='columns')], + [df, df.apply(get_additional_columns, axis='columns')], axis='columns', - ).rename(columns={'name': 'InstanceType'}) + ) + + before_drop_len = len(df_ret) + df_ret.dropna(subset=['InstanceType', 'AvailabilityZone'], inplace=True, how='all') + after_drop_len = len(df_ret) + print('Dropped {} duplicated rows'.format(before_drop_len - after_drop_len)) + # Filter out deprecated families df_ret = df_ret.loc[~df_ret['family'].isin(DEPRECATED_FAMILIES)] + df_ret = df_ret[USEFULE_COLUMNS] return df_ret diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py index 9bcf75f1853..3f8edd91a7d 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py @@ -17,8 +17,12 @@ NOT_AVAILABLE_STR = 'Not available in this region' +ALL_REGION_PREFIX = '' +US_REGION_PREFIX = 'us-' +REGION_PREFIX = US_REGION_PREFIX + # Refer to: https://github.com/skypilot-org/skypilot/issues/1006 -UNSUPPORTED_VMS = ['t2a-standard', 'f1-micro'] +UNSUPPORTED_VMS = ['f1-micro'] # Supported GPU types and counts. # NOTE: GCP officially uses 'A100 40GB' and 'A100 80GB' as the names of the @@ -370,7 +374,7 @@ def get_vm_df(): # Block non-US regions. # FIXME(woosuk): Allow all regions. - vm_df = vm_df[vm_df['Region'].str.startswith('us-')] + vm_df = vm_df[vm_df['Region'].str.startswith(REGION_PREFIX)] return vm_df @@ -526,7 +530,7 @@ def get_gpu_df(): # Block non-US regions. # FIXME(woosuk): Allow all regions. 
- gpu_df = gpu_df[gpu_df['Region'].str.startswith('us-')] + gpu_df = gpu_df[gpu_df['Region'].str.startswith(REGION_PREFIX)] return gpu_df From 7b81e3864f2342388d836119f059ee1bc2df69e7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Oct 2022 22:47:14 -0700 Subject: [PATCH 02/30] format --- .../data_fetchers/fetch_aws.py | 11 ++++--- .../data_fetchers/fetch_azure.py | 33 ++++++++++++++----- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index 4063742d489..b23e62b5ac3 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -121,8 +121,8 @@ def get_spot_price(row): def get_acc_info(row) -> Tuple[str, float]: accelerator = None for col, info_key in [('GpuInfo', 'Gpus'), - ('InferenceAcceleratorInfo', 'Accelerators'), - ('FpgaInfo', 'Fpgas')]: + ('InferenceAcceleratorInfo', 'Accelerators'), + ('FpgaInfo', 'Fpgas')]: info = row.get(col) if isinstance(info, dict): accelerator = info[info_key][0] @@ -153,8 +153,9 @@ def get_additional_columns(row): df['Region'] = region df = df.merge(pd.DataFrame(zone_df), how='cross') - df = pd.concat([df, df.apply(get_additional_columns, axis='columns')], - axis='columns') + df = pd.concat( + [df, df.apply(get_additional_columns, axis='columns')], + axis='columns') except Exception as e: print(f'{region} failed with {e}') return region @@ -169,7 +170,7 @@ def get_all_regions_instance_types_df(): print(f'{df} failed') else: new_dfs.append(df) - + df = pd.concat(new_dfs) df.sort_values(['InstanceType', 'Region'], inplace=True) return df diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 31d99e4a8ae..8c4175bf86d 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -40,7 +40,10 @@ # We have to manually remove it. 
DEPRECATED_FAMILIES = ['standardNVSv2Family'] -USEFULE_COLUMNS = ['InstanceType','AcceleratorName','AcceleratorCount','vCPUs','MemoryGiB','GpuInfo','Price','SpotPrice','Region','AvailabilityZone'] +USEFULE_COLUMNS = [ + 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone' +] def get_pricing_url(region: Optional[str] = None) -> str: @@ -156,13 +159,21 @@ def get_all_regions_instance_types_df(): df.drop_duplicates(inplace=True) df = df[df['unitPrice'] > 0] - + print('Getting price df') - df['merge_name'] = df['armSkuName'] + df['merge_name'] = df['armSkuName'] df['is_promo'] = df['skuName'].str.endswith(' Low Priority') - df.rename(columns={'armSkuName': 'InstanceType', 'armRegionName': 'Region'}, inplace=True) - demand_df = df[~df['skuName'].str.contains(' Spot')][['is_promo', 'InstanceType', 'Region', 'unitPrice']] - spot_df = df[df['skuName'].str.contains(' Spot')][['is_promo', 'InstanceType', 'Region', 'unitPrice']] + df.rename(columns={ + 'armSkuName': 'InstanceType', + 'armRegionName': 'Region' + }, + inplace=True) + demand_df = df[~df['skuName'].str.contains(' Spot')][[ + 'is_promo', 'InstanceType', 'Region', 'unitPrice' + ]] + spot_df = df[df['skuName'].str.contains(' Spot')][[ + 'is_promo', 'InstanceType', 'Region', 'unitPrice' + ]] demand_df.set_index(['InstanceType', 'Region', 'is_promo'], inplace=True) spot_df.set_index(['InstanceType', 'Region', 'is_promo'], inplace=True) @@ -175,10 +186,12 @@ def get_all_regions_instance_types_df(): df_sku['merge_name'] = df_sku['InstanceType'].str.replace('_Promo', '') print('Joining') - df = df_sku.join(demand_df, on=['merge_name', 'Region', 'is_promo'], how='left') + df = df_sku.join(demand_df, + on=['merge_name', 'Region', 'is_promo'], + how='left') df = df.join(spot_df, on=['merge_name', 'Region', 'is_promo'], how='left') - # df.dropna(subset=['Price', 'SpotPrice'], inplace=True, how='all') + # df.dropna(subset=['Price', 'SpotPrice'], inplace=True, how='all') def get_capabilities(row): gpu_name = None @@ -214,7 +227,9 @@ def get_additional_columns(row): ) before_drop_len = len(df_ret) - df_ret.dropna(subset=['InstanceType', 'AvailabilityZone'], inplace=True, how='all') + df_ret.dropna(subset=['InstanceType', 'AvailabilityZone'], + inplace=True, + how='all') after_drop_len = len(df_ret) print('Dropped {} duplicated rows'.format(before_drop_len - after_drop_len)) From 7546a321a300f234637cab429a17349740ebdb09 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 5 Oct 2022 23:29:31 -0700 Subject: [PATCH 03/30] add t2a exclusion back --- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py index 3f8edd91a7d..0cd241cb30b 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py @@ -22,7 +22,7 @@ REGION_PREFIX = US_REGION_PREFIX # Refer to: https://github.com/skypilot-org/skypilot/issues/1006 -UNSUPPORTED_VMS = ['f1-micro'] +UNSUPPORTED_VMS = ['t2a-standard', 'f1-micro'] # Supported GPU types and counts. 
# NOTE: GCP officially uses 'A100 40GB' and 'A100 80GB' as the names of the From 4416ca6fca235cf0ef02ce7928e73c9e27a6bea9 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 6 Oct 2022 00:29:11 -0700 Subject: [PATCH 04/30] fix A100 for GCP --- .../data_fetchers/fetch_gcp.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py index 0cd241cb30b..b48c4410799 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py @@ -39,7 +39,6 @@ } # FIXME(woosuk): This URL can change. -A2_PRICING_URL = '/compute/vm-instance-pricing_04e9ea153c2ab1ea37254107b92dc081d4459ca9fe9a46ef5b39e5d63353f089.frame' # pylint: disable=line-too-long A2_INSTANCE_TYPES = { 'a2-highgpu-1g': { 'vCPUs': 12, @@ -239,7 +238,7 @@ def parse_price(price_str): if 'InstanceType' in df.columns: # Price table for pre-defined instance types. instance_type = df['InstanceType'].iloc[0] - if instance_type == 'a2-highgpu-1g': + if instance_type in ['a2-highgpu-1g', 'a2-ultragpu-1g']: # The A2 price table includes the GPU cost. return None @@ -263,6 +262,9 @@ def parse_price(price_str): else: # Others (e.g., per vCPU hour or per GB hour pricing rule table). df = df[['Item', 'Region', 'Price', 'SpotPrice']] + item = df['Item'].iloc[0] + if item == 'Predefined vCPUs': + return get_a2_df(df) return df @@ -298,13 +300,12 @@ def parse_machine_type_list(list_str): return df -def get_a2_df(): - a2_pricing = get_vm_price_table(GCP_URL + A2_PRICING_URL) - cpu_pricing = a2_pricing[a2_pricing['Item'] == 'Predefined vCPUs'] - memory_pricing = a2_pricing[a2_pricing['Item'] == 'Predefined Memory'] +def get_a2_df(a2_pricing_df): + cpu_pricing = a2_pricing_df[a2_pricing_df['Item'] == 'Predefined vCPUs'] + memory_pricing = a2_pricing_df[a2_pricing_df['Item'] == 'Predefined Memory'] table = [] - for region in a2_pricing['Region'].unique(): + for region in a2_pricing_df['Region'].unique(): per_cpu_price = cpu_pricing[cpu_pricing['Region'] == region]['Price'].values[0] per_cpu_spot_price = cpu_pricing[cpu_pricing['Region'] == @@ -348,9 +349,7 @@ def get_vm_df(): df for df in vm_dfs if df is not None and 'InstanceType' in df.columns ] - # Handle A2 instance types separately. - a2_df = get_a2_df() - vm_df = pd.concat(vm_dfs + [a2_df]) + vm_df = pd.concat(vm_dfs) vm_zones = get_vm_zones(GCP_VM_ZONES_URL) # Remove regions not in the pricing data. 
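Note: with the change above, get_a2_df() now derives A2 host-VM prices from
the 'Predefined vCPUs' and 'Predefined Memory' rows of the parsed pricing
table instead of scraping the hard-coded A2_PRICING_URL frame. A minimal
sketch of that composition, using made-up unit prices and the a2-highgpu-1g
spec from A2_INSTANCE_TYPES (illustrative only, not part of the patch):

    # Made-up per-region unit prices; the real values come from the parsed
    # pricing table ('Predefined vCPUs' / 'Predefined Memory' rows).
    per_cpu_price, per_memory_price = 0.0316, 0.0042  # $/vCPU-hr, $/GiB-hr
    spec = {'vCPUs': 12, 'MemoryGiB': 85}  # a2-highgpu-1g
    # Host VM price = vCPUs * per-vCPU price + memory * per-GiB price; the
    # attached A100 GPUs are priced separately in the accelerator table.
    price = spec['vCPUs'] * per_cpu_price + spec['MemoryGiB'] * per_memory_price
    print(f'a2-highgpu-1g host VM: ${price:.4f}/hr')  # -> $0.7362/hr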
From dcca1e3072dc899cc30098abfe8b5c1d2ae823a8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 6 Oct 2022 02:17:07 -0700 Subject: [PATCH 05/30] fix aws fetching for p4de.24xlarge --- .../data_fetchers/fetch_aws.py | 46 ++++++++++--------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index b23e62b5ac3..937366b5a01 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -70,12 +70,18 @@ def get_pricing_table(region: str) -> pd.DataFrame: print(f'{region} downloading pricing table') url = PRICING_TABLE_URL_FMT.format(region=region) df = pd.read_csv(url, skiprows=5, low_memory=False) + df.rename(columns={ + 'Instance Type': 'InstanceType', + 'PricePerUnit': 'Price' + }, + inplace=True) return df[(df['TermType'] == 'OnDemand') & (df['Operating System'] == 'Linux') & df['Pre Installed S/W'].isnull() & (df['CapacityStatus'] == 'Used') & - (df['Tenancy'].isin(['Host', 'Shared'])) & - df['PricePerUnit'] > 0].set_index('Instance Type') + (df['Tenancy'].isin(['Host', 'Shared'])) & df['Price'] > 0][[ + 'InstanceType', 'Price', 'vCPU', 'Memory' + ]] @ray.remote @@ -88,7 +94,8 @@ def get_spot_pricing_table(region: str) -> pd.DataFrame: ret = [] for response in response_iterator: ret = ret + response['SpotPriceHistory'] - df = pd.DataFrame(ret).set_index(['InstanceType', 'AvailabilityZone']) + df = pd.DataFrame(ret)[['InstanceType', 'AvailabilityZone', 'SpotPrice' + ]].set_index(['InstanceType', 'AvailabilityZone']) return df @@ -103,21 +110,6 @@ def get_instance_types_df(region: str) -> Union[str, pd.DataFrame]: ]) print(f'{region} Processing dataframes') - def get_price(row): - t = row['InstanceType'] - try: - return pricing_df.loc[t]['PricePerUnit'] - except KeyError: - return np.nan - - def get_spot_price(row): - instance = row['InstanceType'] - zone = row['AvailabilityZone'] - try: - return spot_pricing_df.loc[(instance, zone)]['SpotPrice'] - except KeyError: - return np.nan - def get_acc_info(row) -> Tuple[str, float]: accelerator = None for col, info_key in [('GpuInfo', 'Gpus'), @@ -131,10 +123,14 @@ def get_acc_info(row) -> Tuple[str, float]: return accelerator['Name'], accelerator['Count'] def get_vcpus(row) -> float: + if not np.isnan(row['vCPU']): + return float(row['vCPU']) return float(row['VCpuInfo']['DefaultVCpus']) def get_memory_gib(row) -> float: - return row['MemoryInfo']['SizeInMiB'] // 1024 + if isinstance(row['MemoryInfo'], dict): + return row['MemoryInfo']['SizeInMiB'] // 1024 + return int(row['Memory'].split(' GiB')[0]) def get_additional_columns(row): acc_name, acc_count = get_acc_info(row) @@ -142,17 +138,25 @@ def get_additional_columns(row): # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/ if row['InstanceType'] == 'p3dn.24xlarge': acc_name = 'V100-32GB' + if row['InstanceType'] == 'p4de.24xlarge': + acc_name = 'A100-80GB' + acc_count = 8 return pd.Series({ - 'Price': get_price(row), - 'SpotPrice': get_spot_price(row), 'AcceleratorName': acc_name, 'AcceleratorCount': acc_count, 'vCPUs': get_vcpus(row), 'MemoryGiB': get_memory_gib(row), }) + df = df.merge(pricing_df, on=['InstanceType'], how='outer') df['Region'] = region df = df.merge(pd.DataFrame(zone_df), how='cross') + + df = df.merge(spot_pricing_df, + left_on=['InstanceType', 'AvailabilityZone'], + right_index=True, + how='outer') + df = pd.concat( [df, 
df.apply(get_additional_columns, axis='columns')], axis='columns') From 0c1c31dab431309d98c6aa2910f8263c75ba0ef8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 6 Oct 2022 02:40:48 -0700 Subject: [PATCH 06/30] Fill GPUInfo --- sky/clouds/service_catalog/data_fetchers/fetch_aws.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index 937366b5a01..a11272a2ef6 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -160,6 +160,7 @@ def get_additional_columns(row): df = pd.concat( [df, df.apply(get_additional_columns, axis='columns')], axis='columns') + df['GPUInfo'].fillna(df['AcceleratorName'], inplace=True) except Exception as e: print(f'{region} failed with {e}') return region From deeba693a280ea5cf0ea247da5b63ff7be7979b8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 6 Oct 2022 02:41:50 -0700 Subject: [PATCH 07/30] fix --- sky/clouds/service_catalog/data_fetchers/fetch_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index a11272a2ef6..cdaaf61233b 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -160,7 +160,7 @@ def get_additional_columns(row): df = pd.concat( [df, df.apply(get_additional_columns, axis='columns')], axis='columns') - df['GPUInfo'].fillna(df['AcceleratorName'], inplace=True) + df['GpuInfo'].fillna(df['AcceleratorName'], inplace=True) except Exception as e: print(f'{region} failed with {e}') return region From 820434f64bc0965ef9def64218eaad18573dfc7b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sat, 8 Oct 2022 23:07:09 -0700 Subject: [PATCH 08/30] address part of comments --- sky/clouds/service_catalog/common.py | 2 +- .../data_fetchers/fetch_aws.py | 24 ++++++++++++------- .../data_fetchers/fetch_azure.py | 1 + 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 5e0d2c6cf2d..0ac3562c7de 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -237,7 +237,7 @@ def list_accelerators_impl( instance types offered by this cloud. 
""" if gpus_only: - df = df[~pd.isna(df['GpuInfo'])] + df = df[~pd.isna(df['AcceleratorName'])] df = df[[ 'InstanceType', 'AcceleratorName', diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index cdaaf61233b..f3a22604e93 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -94,8 +94,8 @@ def get_spot_pricing_table(region: str) -> pd.DataFrame: ret = [] for response in response_iterator: ret = ret + response['SpotPriceHistory'] - df = pd.DataFrame(ret)[['InstanceType', 'AvailabilityZone', 'SpotPrice' - ]].set_index(['InstanceType', 'AvailabilityZone']) + df = pd.DataFrame(ret)[['InstanceType', 'AvailabilityZone', 'SpotPrice']] + df = df.set_index(['InstanceType', 'AvailabilityZone']) return df @@ -106,7 +106,7 @@ def get_instance_types_df(region: str) -> Union[str, pd.DataFrame]: get_instance_types.remote(region), get_availability_zones.remote(region), get_pricing_table.remote(region), - get_spot_pricing_table.remote(region) + get_spot_pricing_table.remote(region), ]) print(f'{region} Processing dataframes') @@ -132,7 +132,7 @@ def get_memory_gib(row) -> float: return row['MemoryInfo']['SizeInMiB'] // 1024 return int(row['Memory'].split(' GiB')[0]) - def get_additional_columns(row): + def get_additional_columns(row) -> pd.Series: acc_name, acc_count = get_acc_info(row) # AWS p3dn.24xlarge offers a different V100 GPU. # See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/ @@ -148,15 +148,21 @@ def get_additional_columns(row): 'MemoryGiB': get_memory_gib(row), }) + # The AWS API may not have all the instance types in the pricing table, + # so we need to merge the two dataframes. df = df.merge(pricing_df, on=['InstanceType'], how='outer') df['Region'] = region + # Cartesian product of instance types and availability zones, so that + # we can join the spot pricing table per instance type and zone. df = df.merge(pd.DataFrame(zone_df), how='cross') + # Add spot price column, by joining the spot pricing table. df = df.merge(spot_pricing_df, left_on=['InstanceType', 'AvailabilityZone'], right_index=True, how='outer') + # Extract vCPUs, memory, and accelerator info from the columns. df = pd.concat( [df, df.apply(get_additional_columns, axis='columns')], axis='columns') @@ -168,13 +174,13 @@ def get_additional_columns(row): def get_all_regions_instance_types_df(): - dfs = ray.get([get_instance_types_df.remote(r) for r in REGIONS]) + df_or_regions = ray.get([get_instance_types_df.remote(r) for r in REGIONS]) new_dfs = [] - for df in dfs: - if isinstance(df, str): - print(f'{df} failed') + for df_or_region in df_or_regions: + if isinstance(df_or_region, str): + print(f'{df_or_region} failed') else: - new_dfs.append(df) + new_dfs.append(df_or_region) df = pd.concat(new_dfs) df.sort_values(['InstanceType', 'Region'], inplace=True) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 8c4175bf86d..ae8074a2512 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -25,6 +25,7 @@ # 'WestUS3', # WestUS3 pricing table is broken as of 2021/11. ] +# To enable all the regions, uncomment the following line. 
# def get_regions() -> Tuple[str]: # """Get all available regions.""" # proc = subprocess.run('az account list-locations --query "[?not_null(metadata.latitude)] .{RegionName:name , RegionDisplayName:regionalDisplayName}" -o json', shell=True, check=True, stdout=subprocess.PIPE) From 8d06ad9d12ffb4cea4f4b59f096ef6b54d955ad6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 01:14:41 -0700 Subject: [PATCH 09/30] address comments --- .../service_catalog/data_fetchers/fetch_azure.py | 14 +++++--------- .../service_catalog/data_fetchers/fetch_gcp.py | 4 +++- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index ae8074a2512..9b3ddd35335 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -41,7 +41,7 @@ # We have to manually remove it. DEPRECATED_FAMILIES = ['standardNVSv2Family'] -USEFULE_COLUMNS = [ +USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone' ] @@ -109,10 +109,8 @@ def get_sku_df() -> pd.DataFrame: if region not in REGION_SET: continue if len(zones) == 0: - new_item = item.copy() - new_item['Region'] = region - new_item['AvailabilityZone'] = f'{region}-0' - new_items.append(new_item) + # The default zone is '0'. + zones = ['0'] for zone in zones: new_item = item.copy() @@ -166,7 +164,7 @@ def get_all_regions_instance_types_df(): df['is_promo'] = df['skuName'].str.endswith(' Low Priority') df.rename(columns={ 'armSkuName': 'InstanceType', - 'armRegionName': 'Region' + 'armRegionName': 'Region', }, inplace=True) demand_df = df[~df['skuName'].str.contains(' Spot')][[ @@ -192,8 +190,6 @@ def get_all_regions_instance_types_df(): how='left') df = df.join(spot_df, on=['merge_name', 'Region', 'is_promo'], how='left') - # df.dropna(subset=['Price', 'SpotPrice'], inplace=True, how='all') - def get_capabilities(row): gpu_name = None gpu_count = np.nan @@ -236,7 +232,7 @@ def get_additional_columns(row): # Filter out deprecated families df_ret = df_ret.loc[~df_ret['family'].isin(DEPRECATED_FAMILIES)] - df_ret = df_ret[USEFULE_COLUMNS] + df_ret = df_ret[USEFUL_COLUMNS] return df_ret diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py index b48c4410799..b29c0864345 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_gcp.py @@ -20,6 +20,8 @@ ALL_REGION_PREFIX = '' US_REGION_PREFIX = 'us-' REGION_PREFIX = US_REGION_PREFIX +# Uncomment the following line to VM pricings from all regions. 
+# REGION_PREFIX = ALL_REGION_PREFIX # Refer to: https://github.com/skypilot-org/skypilot/issues/1006 UNSUPPORTED_VMS = ['t2a-standard', 'f1-micro'] @@ -264,7 +266,7 @@ def parse_price(price_str): df = df[['Item', 'Region', 'Price', 'SpotPrice']] item = df['Item'].iloc[0] if item == 'Predefined vCPUs': - return get_a2_df(df) + df = get_a2_df(df) return df From fcf00a307c0093202c1aae51ab70f6a2e4c9d375 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 01:38:40 -0700 Subject: [PATCH 10/30] add test for A100 --- sky/clouds/service_catalog/data_fetchers/fetch_aws.py | 2 +- tests/test_list_accelerators.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index f3a22604e93..ff24747c3da 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -72,7 +72,7 @@ def get_pricing_table(region: str) -> pd.DataFrame: df = pd.read_csv(url, skiprows=5, low_memory=False) df.rename(columns={ 'Instance Type': 'InstanceType', - 'PricePerUnit': 'Price' + 'PricePerUnit': 'Price', }, inplace=True) return df[(df['TermType'] == 'OnDemand') & diff --git a/tests/test_list_accelerators.py b/tests/test_list_accelerators.py index ffabe5de897..56bde6b4f9d 100644 --- a/tests/test_list_accelerators.py +++ b/tests/test_list_accelerators.py @@ -6,6 +6,7 @@ def test_list_accelerators(): assert 'V100' in result, result assert 'tpu-v3-8' in result, result assert 'Inferentia' not in result, result + assert 'A100-80GB' in result, result def test_list_ccelerators_all(): @@ -13,6 +14,7 @@ def test_list_ccelerators_all(): assert 'V100' in result, result assert 'tpu-v3-8' in result, result assert 'Inferentia' in result, result + assert 'A100-80GB' in result, result def test_list_accelerators_filters(): From 0fca40f70117c88542518e5896439c57f5b42e03 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 02:00:20 -0700 Subject: [PATCH 11/30] patch GpuInfo --- sky/clouds/service_catalog/common.py | 2 +- sky/clouds/service_catalog/data_fetchers/fetch_aws.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sky/clouds/service_catalog/common.py b/sky/clouds/service_catalog/common.py index 0ac3562c7de..5e0d2c6cf2d 100644 --- a/sky/clouds/service_catalog/common.py +++ b/sky/clouds/service_catalog/common.py @@ -237,7 +237,7 @@ def list_accelerators_impl( instance types offered by this cloud. 
""" if gpus_only: - df = df[~pd.isna(df['AcceleratorName'])] + df = df[~pd.isna(df['GpuInfo'])] df = df[[ 'InstanceType', 'AcceleratorName', diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index ff24747c3da..8a9b7a668c1 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -166,7 +166,8 @@ def get_additional_columns(row) -> pd.Series: df = pd.concat( [df, df.apply(get_additional_columns, axis='columns')], axis='columns') - df['GpuInfo'].fillna(df['AcceleratorName'], inplace=True) + # patch the GpuInfo for p4de.24xlarge + df[df['InstanceType'] == 'p4de.24xlarge']['GpuInfo'] = 'A100-80GB' except Exception as e: print(f'{region} failed with {e}') return region From 7af5c324113c990b5b2034ad28668b68b1543f00 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 02:21:00 -0700 Subject: [PATCH 12/30] Add generation info --- sky/clouds/service_catalog/azure_catalog.py | 8 +------- .../service_catalog/data_fetchers/fetch_azure.py | 10 +++++++--- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index cd827106c50..e2f6fae4586 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -68,13 +68,7 @@ def get_region_zones_for_instance_type(instance_type: str, def get_gen_version_from_instance_type(instance_type: str) -> Optional[int]: - cell = _df[_df['InstanceType'] == instance_type]['capabilities'].iloc[0] - cap_list = ast.literal_eval(cell) - gen_version = None - for cap in cap_list: - if cap['name'] == 'HyperVGenerations': - gen_version = cap['value'] - return gen_version + return _df[_df['InstanceType'] == instance_type]['Generation'] def list_accelerators(gpus_only: bool, diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 9b3ddd35335..b1ea6c7517a 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -43,7 +43,7 @@ USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', - 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone' + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Generation' ] @@ -195,6 +195,7 @@ def get_capabilities(row): gpu_count = np.nan vcpus = np.nan memory_gb = np.nan + gen_version = None caps = row['capabilities'] for item in caps: assert isinstance(item, dict), (item, caps) @@ -206,16 +207,19 @@ def get_capabilities(row): vcpus = float(item['value']) elif item['name'] == 'MemoryGB': memory_gb = item['value'] - return gpu_name, gpu_count, vcpus, memory_gb + elif item['name'] == 'HyperVGenerations': + gen_version = item['value'] + return gpu_name, gpu_count, vcpus, memory_gb, gen_version def get_additional_columns(row): - gpu_name, gpu_count, vcpus, memory_gb = get_capabilities(row) + gpu_name, gpu_count, vcpus, memory_gb, gen_version = get_capabilities(row) return pd.Series({ 'AcceleratorName': gpu_name, 'AcceleratorCount': gpu_count, 'vCPUs': vcpus, 'MemoryGiB': memory_gb, 'GpuInfo': gpu_name, + 'Generation': gen_version, }) df_ret = pd.concat( From a83ea77c0bd2a94891215d0da32db2c11dabc8d3 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 02:30:47 -0700 Subject: [PATCH 13/30] Add capabilities back to azure and fix aws --- 
sky/clouds/service_catalog/data_fetchers/fetch_aws.py | 2 +- sky/clouds/service_catalog/data_fetchers/fetch_azure.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index 8a9b7a668c1..bab26227ec0 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -167,7 +167,7 @@ def get_additional_columns(row) -> pd.Series: [df, df.apply(get_additional_columns, axis='columns')], axis='columns') # patch the GpuInfo for p4de.24xlarge - df[df['InstanceType'] == 'p4de.24xlarge']['GpuInfo'] = 'A100-80GB' + df.loc[df['InstanceType'] == 'p4de.24xlarge', 'GpuInfo'] = 'A100-80GB' except Exception as e: print(f'{region} failed with {e}') return region diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index b1ea6c7517a..b8ac1354be5 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -3,9 +3,8 @@ This script takes about 1 minute to finish. """ import json -import os import subprocess -from typing import Optional, Tuple +from typing import Optional import urllib import numpy as np @@ -43,7 +42,8 @@ USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', - 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Generation' + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Generation', + 'capabilities' ] From a49becd81670011094fb20e1298d2db4ebdbd546 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 02:37:39 -0700 Subject: [PATCH 14/30] fix azure catalog --- sky/clouds/service_catalog/azure_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index e2f6fae4586..bb02e9a887e 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -68,7 +68,7 @@ def get_region_zones_for_instance_type(instance_type: str, def get_gen_version_from_instance_type(instance_type: str) -> Optional[int]: - return _df[_df['InstanceType'] == instance_type]['Generation'] + return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0] def list_accelerators(gpus_only: bool, From c6032249f976277584f5fdeee779c93a503c3f8a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 02:46:00 -0700 Subject: [PATCH 15/30] format --- sky/clouds/service_catalog/data_fetchers/fetch_azure.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index b8ac1354be5..d7d9caeb911 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -212,7 +212,8 @@ def get_capabilities(row): return gpu_name, gpu_count, vcpus, memory_gb, gen_version def get_additional_columns(row): - gpu_name, gpu_count, vcpus, memory_gb, gen_version = get_capabilities(row) + gpu_name, gpu_count, vcpus, memory_gb, gen_version = get_capabilities( + row) return pd.Series({ 'AcceleratorName': gpu_name, 'AcceleratorCount': gpu_count, From 42d51d333c7fb057519fdc2ee8f4404ba3400a1f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 02:46:14 -0700 Subject: [PATCH 16/30] 
lint --- sky/clouds/service_catalog/azure_catalog.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index bb02e9a887e..a5e1f3b88e8 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -3,7 +3,6 @@ This module loads the service catalog file and can be used to query instance types and pricing information for Azure. """ -import ast from typing import Dict, List, Optional, Tuple from sky.clouds import cloud From 54869e20442ed2306c29e44a93ef851b4cb33790 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 11:35:11 -0700 Subject: [PATCH 17/30] remove zone from azure --- .../data_fetchers/fetch_azure.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index d7d9caeb911..f3bbda15552 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -42,7 +42,7 @@ USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', - 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Generation', + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation', 'capabilities' ] @@ -102,23 +102,14 @@ def get_sku_df() -> pd.DataFrame: ) print(f'Done fetching SKUs') items = json.loads(proc.stdout.decode('ascii')) - new_items = [] for item in items: - zones = item['locationInfo'][0]['zones'] + # zones = item['locationInfo'][0]['zones'] region = item['locations'][0] if region not in REGION_SET: continue - if len(zones) == 0: - # The default zone is '0'. - zones = ['0'] + item['Region'] = region - for zone in zones: - new_item = item.copy() - new_item['Region'] = region - new_item['AvailabilityZone'] = f'{region}-{zone}' - new_items.append(new_item) - - df = pd.DataFrame(new_items) + df = pd.DataFrame(items) df = df[(df['resourceType'] == 'virtualMachines')] return df @@ -229,7 +220,7 @@ def get_additional_columns(row): ) before_drop_len = len(df_ret) - df_ret.dropna(subset=['InstanceType', 'AvailabilityZone'], + df_ret.dropna(subset=['InstanceType'], inplace=True, how='all') after_drop_len = len(df_ret) From 95d935581baef9558d2b3318dfb2894cfa4710d0 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 12:14:26 -0700 Subject: [PATCH 18/30] fix azure --- sky/clouds/service_catalog/data_fetchers/fetch_azure.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index f3bbda15552..8e38dd2e65d 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -102,14 +102,16 @@ def get_sku_df() -> pd.DataFrame: ) print(f'Done fetching SKUs') items = json.loads(proc.stdout.decode('ascii')) + filtered_items = [] for item in items: # zones = item['locationInfo'][0]['zones'] region = item['locations'][0] if region not in REGION_SET: continue item['Region'] = region + filtered_items.append(item) - df = pd.DataFrame(items) + df = pd.DataFrame(filtered_items) df = df[(df['resourceType'] == 'virtualMachines')] return df From 0251b0046f0252e56cb51225257cb0713ac8351d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 12:28:18 -0700 Subject: [PATCH 19/30] Add analyze for csv --- .gitignore | 1 + 
.../service_catalog/data_fetchers/analyze.py | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 sky/clouds/service_catalog/data_fetchers/analyze.py diff --git a/.gitignore b/.gitignore index 21b4b31a483..3172e74a312 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ docs/build/ docs/_build/ build/ sky_logs/ +sky/clouds/service_catalog/data_fetchers/*.csv diff --git a/sky/clouds/service_catalog/data_fetchers/analyze.py b/sky/clouds/service_catalog/data_fetchers/analyze.py new file mode 100644 index 00000000000..3f75e914331 --- /dev/null +++ b/sky/clouds/service_catalog/data_fetchers/analyze.py @@ -0,0 +1,47 @@ +import copy +from typing import Tuple +import pandas as pd + +from sky.clouds.service_catalog import common + + +def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame, check_tuple: Tuple[str]) -> pd.DataFrame: + """Returns the difference between two dataframes.""" + original_resources = original_df[check_tuple] + new_resources = new_df[check_tuple] + + return new_resources.merge(original_resources, on=check_tuple, how='left', indicator=True)[lambda x: x['_merge'] == 'left_only'].sort_values(by=check_tuple) + + +CLOUD_CHECKS = { + 'aws': ['InstanceType', 'Region', 'AvailabilityZone'], + 'azure': ['InstanceType', 'Region'], + 'gcp': ['InstanceType', 'Region', 'AcceleratorName', 'AcceleratorCount']} + + + +for cloud in CLOUD_CHECKS: + print(f'=> Checking {cloud}') + original_df = common.read_catalog(f'{cloud}.csv') + new_df = pd.read_csv(f'{cloud}.csv') + + current_check_tuple = CLOUD_CHECKS[cloud] + + diff_df = resource_diff(original_df, new_df, current_check_tuple) + diff_df.merge(new_df, on=current_check_tuple, how='left').to_csv(f'{cloud}_diff.csv', index=False) + print(f'New resources in {cloud}: {len(diff_df)}') + + check_price = current_check_tuple + ['Price'] + diff_df = resource_diff(original_df, new_df, check_price) + print(f'New prices in {cloud}: {len(diff_df)}') + + check_price = current_check_tuple + ['SpotPrice'] + diff_df = resource_diff(original_df, new_df, check_price) + print(f'New spot prices in {cloud}: {len(diff_df)}') + + + + + + + From 893de58c189b888e9d2dc705f6445b56eb997013 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 12:37:20 -0700 Subject: [PATCH 20/30] update catalog analysis --- .../service_catalog/data_fetchers/analyze.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/analyze.py b/sky/clouds/service_catalog/data_fetchers/analyze.py index 3f75e914331..e2144626f83 100644 --- a/sky/clouds/service_catalog/data_fetchers/analyze.py +++ b/sky/clouds/service_catalog/data_fetchers/analyze.py @@ -19,8 +19,10 @@ def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame, check_tuple: 'gcp': ['InstanceType', 'Region', 'AcceleratorName', 'AcceleratorCount']} +table = {} for cloud in CLOUD_CHECKS: + result = {} print(f'=> Checking {cloud}') original_df = common.read_catalog(f'{cloud}.csv') new_df = pd.read_csv(f'{cloud}.csv') @@ -29,15 +31,22 @@ def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame, check_tuple: diff_df = resource_diff(original_df, new_df, current_check_tuple) diff_df.merge(new_df, on=current_check_tuple, how='left').to_csv(f'{cloud}_diff.csv', index=False) - print(f'New resources in {cloud}: {len(diff_df)}') + + result['#resources'] = len(diff_df) check_price = current_check_tuple + ['Price'] diff_df = resource_diff(original_df, new_df, check_price) - print(f'New prices in {cloud}: 
{len(diff_df)}') + result['#prices'] = len(diff_df) check_price = current_check_tuple + ['SpotPrice'] diff_df = resource_diff(original_df, new_df, check_price) - print(f'New spot prices in {cloud}: {len(diff_df)}') + result['#spot_prices'] = len(diff_df) + + table[cloud] = result + +summary = pd.DataFrame(table).T +summary.to_csv('diff_summary.csv') +print(summary) From 3bc4503c9aff06af850cd06a372e0cd65eba9df3 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 12:38:22 -0700 Subject: [PATCH 21/30] format --- .../service_catalog/data_fetchers/analyze.py | 28 +++++++++---------- .../data_fetchers/fetch_azure.py | 7 ++--- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/sky/clouds/service_catalog/data_fetchers/analyze.py b/sky/clouds/service_catalog/data_fetchers/analyze.py index e2144626f83..c9de9e9a06c 100644 --- a/sky/clouds/service_catalog/data_fetchers/analyze.py +++ b/sky/clouds/service_catalog/data_fetchers/analyze.py @@ -5,19 +5,23 @@ from sky.clouds.service_catalog import common -def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame, check_tuple: Tuple[str]) -> pd.DataFrame: +def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame, + check_tuple: Tuple[str]) -> pd.DataFrame: """Returns the difference between two dataframes.""" original_resources = original_df[check_tuple] new_resources = new_df[check_tuple] - - return new_resources.merge(original_resources, on=check_tuple, how='left', indicator=True)[lambda x: x['_merge'] == 'left_only'].sort_values(by=check_tuple) - + + return new_resources.merge( + original_resources, on=check_tuple, how='left', + indicator=True)[lambda x: x['_merge'] == 'left_only'].sort_values( + by=check_tuple) + CLOUD_CHECKS = { 'aws': ['InstanceType', 'Region', 'AvailabilityZone'], 'azure': ['InstanceType', 'Region'], - 'gcp': ['InstanceType', 'Region', 'AcceleratorName', 'AcceleratorCount']} - + 'gcp': ['InstanceType', 'Region', 'AcceleratorName', 'AcceleratorCount'] +} table = {} @@ -28,9 +32,10 @@ def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame, check_tuple: new_df = pd.read_csv(f'{cloud}.csv') current_check_tuple = CLOUD_CHECKS[cloud] - + diff_df = resource_diff(original_df, new_df, current_check_tuple) - diff_df.merge(new_df, on=current_check_tuple, how='left').to_csv(f'{cloud}_diff.csv', index=False) + diff_df.merge(new_df, on=current_check_tuple, + how='left').to_csv(f'{cloud}_diff.csv', index=False) result['#resources'] = len(diff_df) @@ -47,10 +52,3 @@ def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame, check_tuple: summary = pd.DataFrame(table).T summary.to_csv('diff_summary.csv') print(summary) - - - - - - - diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 8e38dd2e65d..6559d6f0e71 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -42,8 +42,7 @@ USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', - 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation', - 'capabilities' + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation', 'capabilities' ] @@ -222,9 +221,7 @@ def get_additional_columns(row): ) before_drop_len = len(df_ret) - df_ret.dropna(subset=['InstanceType'], - inplace=True, - how='all') + df_ret.dropna(subset=['InstanceType'], inplace=True, how='all') after_drop_len = len(df_ret) print('Dropped {} duplicated rows'.format(before_drop_len - 
after_drop_len)) From 39b8a25850913a6744a212c896ff3ee3b3d9c503 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 12:42:25 -0700 Subject: [PATCH 22/30] backward compatible for azure_catalog --- sky/clouds/service_catalog/azure_catalog.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index a5e1f3b88e8..345261c9fd3 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -3,6 +3,7 @@ This module loads the service catalog file and can be used to query instance types and pricing information for Azure. """ +import ast from typing import Dict, List, Optional, Tuple from sky.clouds import cloud @@ -67,7 +68,17 @@ def get_region_zones_for_instance_type(instance_type: str, def get_gen_version_from_instance_type(instance_type: str) -> Optional[int]: - return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0] + if 'Generation' in _df.columns: + return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0] + + # Backward compatibility for the older catalog. + cell = _df[_df['InstanceType'] == instance_type]['capabilities'].iloc[0] + cap_list = ast.literal_eval(cell) + gen_version = None + for cap in cap_list: + if cap['name'] == 'HyperVGenerations': + gen_version = cap['value'] + return gen_version def list_accelerators(gpus_only: bool, From fdb56c8b422b60967ed7024a7eff42f7ce7a7e47 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 12:43:57 -0700 Subject: [PATCH 23/30] yapf --- sky/clouds/service_catalog/azure_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py index 345261c9fd3..5ac7c3846bb 100644 --- a/sky/clouds/service_catalog/azure_catalog.py +++ b/sky/clouds/service_catalog/azure_catalog.py @@ -70,7 +70,7 @@ def get_region_zones_for_instance_type(instance_type: str, def get_gen_version_from_instance_type(instance_type: str) -> Optional[int]: if 'Generation' in _df.columns: return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0] - + # Backward compatibility for the older catalog. cell = _df[_df['InstanceType'] == instance_type]['capabilities'].iloc[0] cap_list = ast.literal_eval(cell) From a802c8554af5d0199fd8b306daa2ba9bc0bae231 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 9 Oct 2022 14:46:14 -0700 Subject: [PATCH 24/30] fix GCP catalog --- sky/clouds/service_catalog/gcp_catalog.py | 37 ++++++++++++++--------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py index 67f0cf931d8..554c350f5c2 100644 --- a/sky/clouds/service_catalog/gcp_catalog.py +++ b/sky/clouds/service_catalog/gcp_catalog.py @@ -3,6 +3,7 @@ For now this service catalog is manually coded. In the future it should be queried from GCP API. """ +from collections import defaultdict import typing from typing import Dict, List, Optional, Tuple @@ -30,12 +31,20 @@ # TODO(zongheng): fix A100 info directly in catalog. 
From a802c8554af5d0199fd8b306daa2ba9bc0bae231 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sun, 9 Oct 2022 14:46:14 -0700
Subject: [PATCH 24/30] fix GCP catalog

---
 sky/clouds/service_catalog/gcp_catalog.py | 37 ++++++++++++++---------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py
index 67f0cf931d8..554c350f5c2 100644
--- a/sky/clouds/service_catalog/gcp_catalog.py
+++ b/sky/clouds/service_catalog/gcp_catalog.py
@@ -3,6 +3,7 @@
 For now this service catalog is manually coded. In the future it should be
 queried from GCP API.
 """
+from collections import defaultdict
 import typing
 from typing import Dict, List, Optional, Tuple

@@ -30,12 +31,20 @@
 # TODO(zongheng): fix A100 info directly in catalog.
 # https://cloud.google.com/blog/products/compute/a2-vms-with-nvidia-a100-gpus-are-ga
 # count -> vm type
-_A100_INSTANCE_TYPES = {
+_A100_INSTANCE_TYPE_DICTS = {
+    'A100': {
     1: 'a2-highgpu-1g',
     2: 'a2-highgpu-2g',
     4: 'a2-highgpu-4g',
     8: 'a2-highgpu-8g',
     16: 'a2-megagpu-16g',
+    },
+    'A100-80GB': {
+        1: 'a2-ultragpu-1g',
+        2: 'a2-ultragpu-2g',
+        4: 'a2-ultragpu-4g',
+        8: 'a2-ultragpu-8g',
+    }
 }

 # Number of CPU cores per GPU based on the AWS setting.
@@ -167,10 +176,10 @@ def get_instance_type_for_accelerator(
     if instance_list is None:
         return None, fuzzy_candidate_list

-    if acc_name == 'A100':
+    if acc_name in _A100_INSTANCE_TYPE_DICTS:
         # If A100 is used, host VM type must be A2.
         # https://cloud.google.com/compute/docs/gpus#a100-gpus
-        return [_A100_INSTANCE_TYPES[acc_count]], []
+        return [_A100_INSTANCE_TYPE_DICTS[acc_name][acc_count]], []
     if acc_name not in _NUM_ACC_TO_NUM_CPU:
         acc_name = 'DEFAULT'

@@ -258,17 +267,17 @@ def list_accelerators(

     results = common.list_accelerators_impl('GCP', _df, gpus_only, name_filter,
                                             case_sensitive)

-    a100_infos = results.get('A100', None)
-    if a100_infos is None:
+    a100_infos = results.get('A100', []) + results.get('A100-80GB', [])
+    if not a100_infos:
         return results

     # Unlike other GPUs that can be attached to different sizes of N1 VMs,
     # A100 GPUs can only be attached to fixed-size A2 VMs.
     # Thus, we can show their exact cost including the host VM prices.
-    new_infos = []
+    new_infos = defaultdict(list)
     for info in a100_infos:
         assert pd.isna(info.instance_type) and pd.isna(info.memory), a100_infos
-        a100_host_vm_type = _A100_INSTANCE_TYPES[info.accelerator_count]
+        a100_host_vm_type = _A100_INSTANCE_TYPE_DICTS[info.accelerator_name][info.accelerator_count]
         df = _df[_df['InstanceType'] == a100_host_vm_type]
         cpu_count = df['vCPUs'].iloc[0]
         memory = df['MemoryGiB'].iloc[0]
@@ -280,7 +289,7 @@ def list_accelerators(
             a100_host_vm_type,
             None,
             use_spot=True)
-        new_infos.append(
+        new_infos[info.accelerator_name].append(
             info._replace(
                 instance_type=a100_host_vm_type,
                 cpu_count=cpu_count,
@@ -289,7 +298,7 @@ def list_accelerators(
                 price=info.price + vm_price,
                 spot_price=info.spot_price + vm_spot_price,
             ))
-    results['A100'] = new_infos
+    results.update(new_infos)
     return results


@@ -342,7 +351,7 @@ def check_host_accelerator_compatibility(
         return

     # Treat A100 as a special case.
-    if acc_name == 'A100':
+    if acc_name in _A100_INSTANCE_TYPE_DICTS:
         # A100 must be attached to A2 instance type.
         if not instance_type.startswith('a2-'):
             with ux_utils.print_exception_no_traceback():
@@ -382,8 +391,8 @@ def check_accelerator_attachable_to_host(instance_type: str,
         assert instance_type == 'TPU-VM' or instance_type.startswith('n1-')
         return

-    if acc_name == 'A100':
-        valid_counts = list(_A100_INSTANCE_TYPES.keys())
+    if acc_name in _A100_INSTANCE_TYPE_DICTS:
+        valid_counts = list(_A100_INSTANCE_TYPE_DICTS[acc_name].keys())
     else:
         valid_counts = list(_NUM_ACC_TO_MAX_CPU_AND_MEMORY[acc_name].keys())
     if acc_count not in valid_counts:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ResourcesMismatchError(
                 f'{acc_name}:{acc_count} is not launchable on GCP. '
                 f'The valid {acc_name} counts are {valid_counts}.')

-    if acc_name == 'A100':
-        a100_instance_type = _A100_INSTANCE_TYPES[acc_count]
+    if acc_name in _A100_INSTANCE_TYPE_DICTS:
+        a100_instance_type = _A100_INSTANCE_TYPE_DICTS[acc_name][acc_count]
     if instance_type != a100_instance_type:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ResourcesMismatchError(
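A note on PATCH 24 above: because A100s attach only to fixed-size A2 hosts, the catalog maps (accelerator name, count) to a single host VM type. A sketch of the two-level lookup; the helper name and the plain ValueError are illustrative only (the real code raises exceptions.ResourcesMismatchError):

    # Mirrors the _A100_INSTANCE_TYPE_DICTS mapping added in this patch;
    # not imported from the real module.
    A100_HOST_VMS = {
        'A100': {1: 'a2-highgpu-1g', 2: 'a2-highgpu-2g', 4: 'a2-highgpu-4g',
                 8: 'a2-highgpu-8g', 16: 'a2-megagpu-16g'},
        'A100-80GB': {1: 'a2-ultragpu-1g', 2: 'a2-ultragpu-2g',
                      4: 'a2-ultragpu-4g', 8: 'a2-ultragpu-8g'},
    }

    def a100_host_vm(acc_name: str, acc_count: int) -> str:
        # Each A100 variant and count maps to exactly one A2 host VM type.
        counts = A100_HOST_VMS[acc_name]
        if acc_count not in counts:
            raise ValueError(f'{acc_name}:{acc_count} is not launchable; '
                             f'valid counts are {list(counts)}.')
        return counts[acc_count]

    assert a100_host_vm('A100-80GB', 4) == 'a2-ultragpu-4g'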
From 349caed3d75cbc6c9c1a22ee0ec0eb27aafdcde8 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sun, 9 Oct 2022 15:31:08 -0700
Subject: [PATCH 25/30] fix A100-80GB

---
 sky/clouds/gcp.py                         |  6 +++++-
 sky/clouds/service_catalog/gcp_catalog.py | 19 ++++++++++---------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index fcbc54133ba..989ad959577 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -235,7 +235,11 @@ def make_deploy_resources_variables(
             else:
                 # Convert to GCP names:
                 # https://cloud.google.com/compute/docs/gpus
-                resources_vars['gpu'] = 'nvidia-tesla-{}'.format(acc.lower())
+                if acc == 'A100-80GB':
+                    # A100-80GB has a different name pattern.
+                    resources_vars['gpu'] = 'nvidia-{}'.format(acc.lower())
+                else:
+                    resources_vars['gpu'] = 'nvidia-tesla-{}'.format(acc.lower())
                 resources_vars['gpu_count'] = acc_count
                 if acc == 'K80':
                     # CUDA driver version 470.57.02, CUDA Library 11.4
diff --git a/sky/clouds/service_catalog/gcp_catalog.py b/sky/clouds/service_catalog/gcp_catalog.py
index 554c350f5c2..e63015be0a4 100644
--- a/sky/clouds/service_catalog/gcp_catalog.py
+++ b/sky/clouds/service_catalog/gcp_catalog.py
@@ -33,17 +33,17 @@
 # count -> vm type
 _A100_INSTANCE_TYPE_DICTS = {
     'A100': {
-    1: 'a2-highgpu-1g',
-    2: 'a2-highgpu-2g',
-    4: 'a2-highgpu-4g',
-    8: 'a2-highgpu-8g',
-    16: 'a2-megagpu-16g',
+        1: 'a2-highgpu-1g',
+        2: 'a2-highgpu-2g',
+        4: 'a2-highgpu-4g',
+        8: 'a2-highgpu-8g',
+        16: 'a2-megagpu-16g',
     },
     'A100-80GB': {
         1: 'a2-ultragpu-1g',
-        2: 'a2-ultragpu-2g',
-        4: 'a2-ultragpu-4g',
-        8: 'a2-ultragpu-8g',
+        2: 'a2-ultragpu-2g',
+        4: 'a2-ultragpu-4g',
+        8: 'a2-ultragpu-8g',
     }
 }

@@ -277,7 +277,8 @@ def list_accelerators(
     new_infos = defaultdict(list)
     for info in a100_infos:
         assert pd.isna(info.instance_type) and pd.isna(info.memory), a100_infos
-        a100_host_vm_type = _A100_INSTANCE_TYPE_DICTS[info.accelerator_name][info.accelerator_count]
+        a100_host_vm_type = _A100_INSTANCE_TYPE_DICTS[info.accelerator_name][
+            info.accelerator_count]
         df = _df[_df['InstanceType'] == a100_host_vm_type]
         cpu_count = df['vCPUs'].iloc[0]
         memory = df['MemoryGiB'].iloc[0]
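A note on PATCH 25 above: GCP accelerator IDs mostly follow the 'nvidia-tesla-<gpu>' pattern, with A100-80GB as the exception. A sketch of the mapping; gcp_accelerator_id is an illustrative name, not a function in the codebase:

    def gcp_accelerator_id(acc: str) -> str:
        # Most GCP GPUs are named 'nvidia-tesla-<gpu>'; A100-80GB breaks
        # the pattern and is exposed as 'nvidia-a100-80gb'.
        if acc == 'A100-80GB':
            return 'nvidia-{}'.format(acc.lower())
        return 'nvidia-tesla-{}'.format(acc.lower())

    assert gcp_accelerator_id('V100') == 'nvidia-tesla-v100'
    assert gcp_accelerator_id('A100-80GB') == 'nvidia-a100-80gb'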
From 1a83602bbaa05df5f0ac41603609eed45b71f576 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Sun, 9 Oct 2022 15:31:47 -0700
Subject: [PATCH 26/30] format

---
 sky/clouds/gcp.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index 989ad959577..bf86942d105 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -239,7 +239,8 @@ def make_deploy_resources_variables(
                     # A100-80GB has a different name pattern.
                     resources_vars['gpu'] = 'nvidia-{}'.format(acc.lower())
                 else:
-                    resources_vars['gpu'] = 'nvidia-tesla-{}'.format(acc.lower())
+                    resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
+                        acc.lower())
                 resources_vars['gpu_count'] = acc_count
                 if acc == 'K80':
                     # CUDA driver version 470.57.02, CUDA Library 11.4

From 6e839db52d6d923167b277577c4c1e01346f76c5 Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Mon, 10 Oct 2022 13:38:36 -0700
Subject: [PATCH 27/30] increase version number

---
 sky/clouds/service_catalog/azure_catalog.py | 13 +------------
 sky/clouds/service_catalog/constants.py     |  2 +-
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/sky/clouds/service_catalog/azure_catalog.py b/sky/clouds/service_catalog/azure_catalog.py
index 5ac7c3846bb..a5e1f3b88e8 100644
--- a/sky/clouds/service_catalog/azure_catalog.py
+++ b/sky/clouds/service_catalog/azure_catalog.py
@@ -3,7 +3,6 @@
 This module loads the service catalog file and can be used to query instance
 types and pricing information for Azure.
 """
-import ast
 from typing import Dict, List, Optional, Tuple

 from sky.clouds import cloud
@@ -68,17 +67,7 @@ def get_region_zones_for_instance_type(instance_type: str,


 def get_gen_version_from_instance_type(instance_type: str) -> Optional[int]:
-    if 'Generation' in _df.columns:
-        return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0]
-
-    # Backward compatibility for the older catalog.
-    cell = _df[_df['InstanceType'] == instance_type]['capabilities'].iloc[0]
-    cap_list = ast.literal_eval(cell)
-    gen_version = None
-    for cap in cap_list:
-        if cap['name'] == 'HyperVGenerations':
-            gen_version = cap['value']
-    return gen_version
+    return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0]


 def list_accelerators(gpus_only: bool,
diff --git a/sky/clouds/service_catalog/constants.py b/sky/clouds/service_catalog/constants.py
index baa2cf8c6eb..acec209e923 100644
--- a/sky/clouds/service_catalog/constants.py
+++ b/sky/clouds/service_catalog/constants.py
@@ -2,5 +2,5 @@
 import os

 HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'  # pylint: disable=line-too-long
-CATALOG_SCHEMA_VERSION = 'v2'
+CATALOG_SCHEMA_VERSION = 'v3'
 LOCAL_CATALOG_DIR = os.path.expanduser('~/.sky/catalogs/')

From 6c3acd23127c82f82ba76805033da8df1067fafe Mon Sep 17 00:00:00 2001
From: Zhanghao Wu
Date: Mon, 10 Oct 2022 17:02:56 -0700
Subject: [PATCH 28/30] only keep useful columns for aws

---
 sky/clouds/service_catalog/data_fetchers/fetch_aws.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
index bab26227ec0..a68940083ac 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -38,6 +38,11 @@

 REGIONS = US_REGIONS

+USEFUL_COLUMNS = [
+    'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
+    'GpuInfo', 'Price', 'SpotPrice', 'Region'
+]
+
 # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
 # only available in this region, but it serves pricing information for all regions.
PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' @@ -168,6 +173,7 @@ def get_additional_columns(row) -> pd.Series: axis='columns') # patch the GpuInfo for p4de.24xlarge df.loc[df['InstanceType'] == 'p4de.24xlarge', 'GpuInfo'] = 'A100-80GB' + df = df[USEFUL_COLUMNS] except Exception as e: print(f'{region} failed with {e}') return region From 068d2e82ef14a4a9f9d21fca4d333044ef8e7fa6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 17:06:48 -0700 Subject: [PATCH 29/30] remove capabilities from azure --- sky/clouds/service_catalog/data_fetchers/fetch_azure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py index 6559d6f0e71..922cdf65216 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_azure.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_azure.py @@ -42,7 +42,7 @@ USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', - 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation', 'capabilities' + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'Generation' ] From c543aaf77986c575fb7b04eddf3b5ccc1f8fd7c7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 10 Oct 2022 17:10:49 -0700 Subject: [PATCH 30/30] add az to AWS --- sky/clouds/service_catalog/data_fetchers/fetch_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py index a68940083ac..2e674cce295 100644 --- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py +++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py @@ -40,7 +40,7 @@ USEFUL_COLUMNS = [ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB', - 'GpuInfo', 'Price', 'SpotPrice', 'Region' + 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone' ] # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is