Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add all region option in catalog fetcher and speed up azure fetcher #1204

Merged
merged 31 commits into from
Oct 11, 2022
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
6411b3a
Port changes
Michaelvll Oct 6, 2022
7b81e38
format
Michaelvll Oct 6, 2022
27938ac
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Oct 6, 2022
7546a32
add t2a exclusion back
Michaelvll Oct 6, 2022
4416ca6
fix A100 for GCP
Michaelvll Oct 6, 2022
dcca1e3
fix aws fetching for p4de.24xlarge
Michaelvll Oct 6, 2022
0c1c31d
Fill GPUInfo
Michaelvll Oct 6, 2022
deeba69
fix
Michaelvll Oct 6, 2022
820434f
address part of comments
Michaelvll Oct 9, 2022
8d06ad9
address comments
Michaelvll Oct 9, 2022
fcf00a3
add test for A100
Michaelvll Oct 9, 2022
0fca40f
patch GpuInfo
Michaelvll Oct 9, 2022
7af5c32
Add generation info
Michaelvll Oct 9, 2022
a83ea77
Add capabilities back to azure and fix aws
Michaelvll Oct 9, 2022
a49becd
fix azure catalog
Michaelvll Oct 9, 2022
c603224
format
Michaelvll Oct 9, 2022
42d51d3
lint
Michaelvll Oct 9, 2022
54869e2
remove zone from azure
Michaelvll Oct 9, 2022
95d9355
fix azure
Michaelvll Oct 9, 2022
0251b00
Add analyze for csv
Michaelvll Oct 9, 2022
893de58
update catalog analysis
Michaelvll Oct 9, 2022
3bc4503
format
Michaelvll Oct 9, 2022
39b8a25
backward compatible for azure_catalog
Michaelvll Oct 9, 2022
fdb56c8
yapf
Michaelvll Oct 9, 2022
a802c85
fix GCP catalog
Michaelvll Oct 9, 2022
349caed
fix A100-80GB
Michaelvll Oct 9, 2022
1a83602
format
Michaelvll Oct 9, 2022
6e839db
increase version number
Michaelvll Oct 10, 2022
6c3acd2
only keep useful columns for aws
Michaelvll Oct 11, 2022
068d2e8
remove capabilities from azure
Michaelvll Oct 11, 2022
c543aaf
add az to AWS
Michaelvll Oct 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ docs/build/
docs/_build/
build/
sky_logs/
sky/clouds/service_catalog/data_fetchers/*.csv
7 changes: 6 additions & 1 deletion sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,12 @@ def make_deploy_resources_variables(
else:
# Convert to GCP names:
# https://cloud.google.com/compute/docs/gpus
resources_vars['gpu'] = 'nvidia-tesla-{}'.format(acc.lower())
if acc == 'A100-80GB':
# A100-80GB has a different name pattern.
resources_vars['gpu'] = 'nvidia-{}'.format(acc.lower())
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
else:
resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
acc.lower())
resources_vars['gpu_count'] = acc_count
if acc == 'K80':
# CUDA driver version 470.57.02, CUDA Library 11.4
Expand Down
9 changes: 1 addition & 8 deletions sky/clouds/service_catalog/azure_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
This module loads the service catalog file and can be used to query
instance types and pricing information for Azure.
"""
import ast
from typing import Dict, List, Optional, Tuple

from sky.clouds import cloud
Expand Down Expand Up @@ -68,13 +67,7 @@ def get_region_zones_for_instance_type(instance_type: str,


def get_gen_version_from_instance_type(instance_type: str) -> Optional[int]:
cell = _df[_df['InstanceType'] == instance_type]['capabilities'].iloc[0]
cap_list = ast.literal_eval(cell)
gen_version = None
for cap in cap_list:
if cap['name'] == 'HyperVGenerations':
gen_version = cap['value']
return gen_version
return _df[_df['InstanceType'] == instance_type]['Generation'].iloc[0]


def list_accelerators(gpus_only: bool,
Expand Down
2 changes: 1 addition & 1 deletion sky/clouds/service_catalog/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
import os

HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
CATALOG_SCHEMA_VERSION = 'v2'
CATALOG_SCHEMA_VERSION = 'v3'
LOCAL_CATALOG_DIR = os.path.expanduser('~/.sky/catalogs/')
54 changes: 54 additions & 0 deletions sky/clouds/service_catalog/data_fetchers/analyze.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import copy
from typing import Tuple
import pandas as pd

from sky.clouds.service_catalog import common


def resource_diff(original_df: pd.DataFrame, new_df: pd.DataFrame,
check_tuple: Tuple[str]) -> pd.DataFrame:
"""Returns the difference between two dataframes."""
original_resources = original_df[check_tuple]
new_resources = new_df[check_tuple]

return new_resources.merge(
original_resources, on=check_tuple, how='left',
indicator=True)[lambda x: x['_merge'] == 'left_only'].sort_values(
by=check_tuple)


CLOUD_CHECKS = {
'aws': ['InstanceType', 'Region', 'AvailabilityZone'],
'azure': ['InstanceType', 'Region'],
'gcp': ['InstanceType', 'Region', 'AcceleratorName', 'AcceleratorCount']
}

table = {}

for cloud in CLOUD_CHECKS:
result = {}
print(f'=> Checking {cloud}')
original_df = common.read_catalog(f'{cloud}.csv')
new_df = pd.read_csv(f'{cloud}.csv')

current_check_tuple = CLOUD_CHECKS[cloud]

diff_df = resource_diff(original_df, new_df, current_check_tuple)
diff_df.merge(new_df, on=current_check_tuple,
how='left').to_csv(f'{cloud}_diff.csv', index=False)

result['#resources'] = len(diff_df)

check_price = current_check_tuple + ['Price']
diff_df = resource_diff(original_df, new_df, check_price)
result['#prices'] = len(diff_df)

check_price = current_check_tuple + ['SpotPrice']
diff_df = resource_diff(original_df, new_df, check_price)
result['#spot_prices'] = len(diff_df)

table[cloud] = result

summary = pd.DataFrame(table).T
summary.to_csv('diff_summary.csv')
print(summary)
188 changes: 119 additions & 69 deletions sky/clouds/service_catalog/data_fetchers/fetch_aws.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,43 @@
"""A script that queries AWS API to get instance types and pricing information.

This script takes about 1 minute to finish.
"""
import datetime
from typing import Tuple
from typing import Tuple, Union

import numpy as np
import pandas as pd
import ray

from sky.adaptors import aws

REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
ALL_REGIONS = [
'us-east-1',
'us-east-2',
'us-west-1',
'us-west-2',
'ca-central-1',
'eu-central-1',
'eu-west-1',
'eu-west-2',
'eu-south-1',
'eu-west-3',
'eu-north-1',
'me-south-1',
# 'me-central-1', # failed for no credential
'af-south-1',
'ap-east-1',
'ap-southeast-3',
# 'ap-south-1', # failed for no credential
'ap-northeast-3',
'ap-northeast-2',
'ap-southeast-1',
'ap-southeast-2',
'ap-northeast-1',
WoosukKwon marked this conversation as resolved.
Show resolved Hide resolved
]
US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']

REGIONS = US_REGIONS

# NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
# only available in this region, but it serves pricing information for all regions.
PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv'
Expand Down Expand Up @@ -44,12 +70,18 @@ def get_pricing_table(region: str) -> pd.DataFrame:
print(f'{region} downloading pricing table')
url = PRICING_TABLE_URL_FMT.format(region=region)
df = pd.read_csv(url, skiprows=5, low_memory=False)
df.rename(columns={
'Instance Type': 'InstanceType',
'PricePerUnit': 'Price',
},
inplace=True)
return df[(df['TermType'] == 'OnDemand') &
(df['Operating System'] == 'Linux') &
df['Pre Installed S/W'].isnull() &
(df['CapacityStatus'] == 'Used') &
(df['Tenancy'].isin(['Host', 'Shared'])) &
df['PricePerUnit'] > 0].set_index('Instance Type')
(df['Tenancy'].isin(['Host', 'Shared'])) & df['Price'] > 0][[
'InstanceType', 'Price', 'vCPU', 'Memory'
]]


@ray.remote
Expand All @@ -62,78 +94,96 @@ def get_spot_pricing_table(region: str) -> pd.DataFrame:
ret = []
for response in response_iterator:
ret = ret + response['SpotPriceHistory']
df = pd.DataFrame(ret).set_index(['InstanceType', 'AvailabilityZone'])
df = pd.DataFrame(ret)[['InstanceType', 'AvailabilityZone', 'SpotPrice']]
df = df.set_index(['InstanceType', 'AvailabilityZone'])
return df


@ray.remote
def get_instance_types_df(region: str) -> pd.DataFrame:
df, zone_df, pricing_df, spot_pricing_df = ray.get([
get_instance_types.remote(region),
get_availability_zones.remote(region),
get_pricing_table.remote(region),
get_spot_pricing_table.remote(region)
])
print(f'{region} Processing dataframes')

def get_price(row):
t = row['InstanceType']
try:
return pricing_df.loc[t]['PricePerUnit']
except KeyError:
return np.nan

def get_spot_price(row):
instance = row['InstanceType']
zone = row['AvailabilityZone']
try:
return spot_pricing_df.loc[(instance, zone)]['SpotPrice']
except KeyError:
return np.nan

def get_acc_info(row) -> Tuple[str, float]:
accelerator = None
for col, info_key in [('GpuInfo', 'Gpus'),
('InferenceAcceleratorInfo', 'Accelerators'),
('FpgaInfo', 'Fpgas')]:
info = row.get(col)
if isinstance(info, dict):
accelerator = info[info_key][0]
if accelerator is None:
return None, np.nan
return accelerator['Name'], accelerator['Count']

def get_vcpus(row) -> float:
return float(row['VCpuInfo']['DefaultVCpus'])

def get_memory_gib(row) -> float:
return row['MemoryInfo']['SizeInMiB'] // 1024

def get_additional_columns(row):
acc_name, acc_count = get_acc_info(row)
# AWS p3dn.24xlarge offers a different V100 GPU.
# See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/
if row['InstanceType'] == 'p3dn.24xlarge':
acc_name = 'V100-32GB'
return pd.Series({
'Price': get_price(row),
'SpotPrice': get_spot_price(row),
'AcceleratorName': acc_name,
'AcceleratorCount': acc_count,
'vCPUs': get_vcpus(row),
'MemoryGiB': get_memory_gib(row),
})

df['Region'] = region
df = df.merge(pd.DataFrame(zone_df), how='cross')
df = pd.concat([df, df.apply(get_additional_columns, axis='columns')],
axis='columns')
def get_instance_types_df(region: str) -> Union[str, pd.DataFrame]:
try:
df, zone_df, pricing_df, spot_pricing_df = ray.get([
get_instance_types.remote(region),
get_availability_zones.remote(region),
get_pricing_table.remote(region),
get_spot_pricing_table.remote(region),
])
print(f'{region} Processing dataframes')

def get_acc_info(row) -> Tuple[str, float]:
accelerator = None
for col, info_key in [('GpuInfo', 'Gpus'),
('InferenceAcceleratorInfo', 'Accelerators'),
('FpgaInfo', 'Fpgas')]:
info = row.get(col)
if isinstance(info, dict):
accelerator = info[info_key][0]
if accelerator is None:
return None, np.nan
return accelerator['Name'], accelerator['Count']

def get_vcpus(row) -> float:
if not np.isnan(row['vCPU']):
return float(row['vCPU'])
return float(row['VCpuInfo']['DefaultVCpus'])

def get_memory_gib(row) -> float:
if isinstance(row['MemoryInfo'], dict):
return row['MemoryInfo']['SizeInMiB'] // 1024
return int(row['Memory'].split(' GiB')[0])

def get_additional_columns(row) -> pd.Series:
acc_name, acc_count = get_acc_info(row)
# AWS p3dn.24xlarge offers a different V100 GPU.
# See https://aws.amazon.com/blogs/compute/optimizing-deep-learning-on-p3-and-p3dn-with-efa/
if row['InstanceType'] == 'p3dn.24xlarge':
acc_name = 'V100-32GB'
if row['InstanceType'] == 'p4de.24xlarge':
acc_name = 'A100-80GB'
acc_count = 8
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious: why do we need to override acc_count from get_acc_info?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The describe_instance_types from AWS API will not include the p4de.8xlarge. Since we merge the pricing table with the instance_type table (the GpuInfo in get_acc_info comes from) together to found out those missing instance types with outer join, the GpuInfo is null and we have to set the accelerator and number ourselves here.

return pd.Series({
'AcceleratorName': acc_name,
'AcceleratorCount': acc_count,
'vCPUs': get_vcpus(row),
'MemoryGiB': get_memory_gib(row),
})

# The AWS API may not have all the instance types in the pricing table,
# so we need to merge the two dataframes.
df = df.merge(pricing_df, on=['InstanceType'], how='outer')
df['Region'] = region
# Cartesian product of instance types and availability zones, so that
# we can join the spot pricing table per instance type and zone.
df = df.merge(pd.DataFrame(zone_df), how='cross')

# Add spot price column, by joining the spot pricing table.
df = df.merge(spot_pricing_df,
left_on=['InstanceType', 'AvailabilityZone'],
right_index=True,
how='outer')

# Extract vCPUs, memory, and accelerator info from the columns.
df = pd.concat(
[df, df.apply(get_additional_columns, axis='columns')],
axis='columns')
# patch the GpuInfo for p4de.24xlarge
df.loc[df['InstanceType'] == 'p4de.24xlarge', 'GpuInfo'] = 'A100-80GB'
except Exception as e:
print(f'{region} failed with {e}')
return region
return df


def get_all_regions_instance_types_df():
dfs = ray.get([get_instance_types_df.remote(r) for r in REGIONS])
df = pd.concat(dfs)
df_or_regions = ray.get([get_instance_types_df.remote(r) for r in REGIONS])
new_dfs = []
for df_or_region in df_or_regions:
if isinstance(df_or_region, str):
print(f'{df_or_region} failed')
else:
new_dfs.append(df_or_region)

df = pd.concat(new_dfs)
df.sort_values(['InstanceType', 'Region'], inplace=True)
return df

Expand Down
Loading