Skip to content

Commit

Permalink
Don't refresh during ondemand cluster init
Browse files (browse the repository at this point in the history)
  • Loading branch information
carolineechen committed Sep 27, 2024
1 parent f210fca commit bdad4b0
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 20 deletions.
44 changes: 25 additions & 19 deletions runhouse/resources/hardware/cluster.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -136,36 +136,42 @@ def address(self, addr):

@property
def client(self):
if not self._http_client:
if not self._ping(retry=True):
# ping cluster, and refresh ips if ondemand cluster and first ping fails
raise Exception(
f"Could not reach cluster {self.name} ({self.ips}). Is it up?"
)

def check_connect_server():
connect_call = threading.Thread(
target=self.connect_server_client, kwargs={"force_reconnect": True}
)
connect_call.start()
connect_call.join(timeout=5)
if connect_call.is_alive():
return False
return True

if not self._http_client and not check_connect_server():
if self.__class__.__name__ == "OnDemandCluster":
self._update_from_sky_status(dryrun=False)
if not self._ping(retry=False):
raise ConnectionError(
f"Timed out trying to form connection for cluster {self.name}."
f"Could not reach {self.name} {self.ips}. Is cluster up?"
)
if not self._http_client:
if not check_connect_server():
raise ConnectionError(
f"Error occurred trying to form connection for cluster {self.name}."
f"Timed out trying to form connection for cluster {self.name}."
)

try:
self._http_client.check_server()
except (
requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout,
requests.exceptions.ChunkedEncodingError,
ValueError,
) as e:
raise ConnectionError(f"Check server failed: {e}.")
if not self._http_client:
raise ConnectionError(
f"Error occurred trying to form connection for cluster {self.name}."
)

try:
self._http_client.check_server()
except (
requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout,
requests.exceptions.ChunkedEncodingError,
ValueError,
) as e:
raise ConnectionError(f"Check server failed: {e}.")
return self._http_client

@property
Expand Down
2 changes: 1 addition & 1 deletion runhouse/resources/hardware/on_demand_cluster.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -116,7 +116,7 @@ def __init__(
# Checks if state info is in local sky db, populates if so.
if not dryrun and not self.ips and not self.creds_values:
# Cluster status is set to INIT in the Sky DB right after starting, so we need to refresh once
self._update_from_sky_status(dryrun=False)
self._update_from_sky_status(dryrun=True)

@property
def client(self):
Expand Down

0 comments on commit bdad4b0

Please sign in to comment.