✨Computational backend: connect dv2 to clusters keeper for on-demand clusters (🗃️ + ⚠️devops) #4703

Merged
83 commits
9b895c4  no json (sanderegg, Sep 11, 2023)
00a323d  fix test (sanderegg, Sep 11, 2023)
6c5bbf2  add boolean in comp_runs (sanderegg, Sep 4, 2023)
a17aa80  refactoring (sanderegg, Sep 4, 2023)
45cd4dc  pass pipeline params (sanderegg, Sep 4, 2023)
b1c74be  typing (sanderegg, Sep 4, 2023)
9d65028  refactor (sanderegg, Sep 4, 2023)
71e3705  added RPC client (sanderegg, Sep 4, 2023)
671c5c1  pass rabbitmq rpc client into scheduler (sanderegg, Sep 4, 2023)
77ea2d4  moved cluster model into rpc_schemas (sanderegg, Sep 4, 2023)
48590ba  add temporary variable to simulate default wallet (sanderegg, Sep 4, 2023)
649baed  moved cluster definition (sanderegg, Sep 4, 2023)
f72285c  added ad-hoc errors for on demand backend (sanderegg, Sep 4, 2023)
99d5c46  initial connections to on demand backend (sanderegg, Sep 4, 2023)
2b0e7a3  keep the timeout for now (sanderegg, Sep 4, 2023)
6ac329a  remove confusing log (sanderegg, Sep 4, 2023)
c8961a9  fixed returned url (sanderegg, Sep 4, 2023)
8b41403  fake cluster creation (sanderegg, Sep 4, 2023)
c21d736  properly count tasks (sanderegg, Sep 4, 2023)
8e1c252  moved calls to publish messages (sanderegg, Sep 4, 2023)
e3279b1  refactor (sanderegg, Sep 4, 2023)
b6b502f  pass some info (sanderegg, Sep 4, 2023)
b00a421  fix merge (sanderegg, Sep 4, 2023)
9986e07  testing body parameters (sanderegg, Sep 4, 2023)
114b829  refactor (sanderegg, Sep 4, 2023)
8ecbd5f  rename (sanderegg, Sep 4, 2023)
8d1f7c6  force usage of on-demand clusters for testing (sanderegg, Sep 4, 2023)
06c6cee  added ENV variables (sanderegg, Sep 4, 2023)
0f50722  mypy (sanderegg, Sep 4, 2023)
eeb3af5  fixed unit tests (sanderegg, Sep 4, 2023)
1f72299  fix test (sanderegg, Sep 4, 2023)
90d368e  hack 1 (sanderegg, Sep 4, 2023)
781a721  hack 2 (sanderegg, Sep 4, 2023)
404fa9c  return waiting for resources if the cluster is not ready yet (sanderegg, Sep 4, 2023)
4f8337c  fix test (sanderegg, Sep 4, 2023)
cbb7ced  fix (sanderegg, Sep 11, 2023)
563d79a  fix merge (sanderegg, Sep 11, 2023)
afa9dda  causes problem as this is already installed by the clusters-keeper in… (sanderegg, Sep 11, 2023)
7183f2d  better logs (sanderegg, Sep 11, 2023)
8c0b014  also check for retrials from the dask-gateway (sanderegg, Sep 11, 2023)
6fac408  ensure the configuration is done correctly (sanderegg, Sep 11, 2023)
7f3c516  correct merge (sanderegg, Sep 11, 2023)
cbaa122  fixed last merge (sanderegg, Sep 11, 2023)
f2aaceb  ruff (sanderegg, Sep 11, 2023)
37d0e16  ruff (sanderegg, Sep 11, 2023)
4a6203a  ruff (sanderegg, Sep 11, 2023)
41cec54  add new cluster type (sanderegg, Sep 11, 2023)
d58036a  without wallet (sanderegg, Sep 11, 2023)
a68a99e  remove wallet hack (sanderegg, Sep 11, 2023)
1849e7b  Revert "add new cluster type" (sanderegg, Sep 11, 2023)
bd7b482  use the url as unique identifier in the pool (sanderegg, Sep 11, 2023)
61c8d43  we do not require the ID anymore (sanderegg, Sep 11, 2023)
8aab819  refactor (sanderegg, Sep 11, 2023)
259b96e  ruff (sanderegg, Sep 11, 2023)
245bafd  move out clusters-keeper call (sanderegg, Sep 11, 2023)
6eb94ad  ruff (sanderegg, Sep 11, 2023)
1bdc68c  more logs (sanderegg, Sep 11, 2023)
19293a2  fixed logo (sanderegg, Sep 11, 2023)
4a0e7fc  reduce noisy libs (sanderegg, Sep 11, 2023)
2f0571d  improve logs (sanderegg, Sep 11, 2023)
c18b150  improve logs (sanderegg, Sep 11, 2023)
b1fac88  added function to wait for workers (sanderegg, Sep 11, 2023)
4e593e9  wait for workers after maximizing (sanderegg, Sep 11, 2023)
2c01155  set back to published (sanderegg, Sep 11, 2023)
e8a90f5  added waiting for cluster state (sanderegg, Sep 11, 2023)
26a0ff0  remove hack (sanderegg, Sep 11, 2023)
5a1b314  disable clusters-keeper by default (sanderegg, Sep 11, 2023)
57dcf09  ruff (sanderegg, Sep 11, 2023)
dce333f  linter (sanderegg, Sep 11, 2023)
95fd892  removed probably unnecessary method (sanderegg, Sep 11, 2023)
7623c5c  reverted (sanderegg, Sep 11, 2023)
81915a2  fix tests (sanderegg, Sep 11, 2023)
2a82edf  reverted (sanderegg, Sep 11, 2023)
13cc941  fix test (sanderegg, Sep 11, 2023)
0e460e8  fix variables (sanderegg, Sep 11, 2023)
4e07125  mypy (sanderegg, Sep 11, 2023)
bc97113  fix tests (sanderegg, Sep 11, 2023)
4c9e1a5  test fixed (sanderegg, Sep 11, 2023)
4e0629e  test fix? (sanderegg, Sep 11, 2023)
6a28e80  clean docker-compose (sanderegg, Sep 12, 2023)
31038e7  removed temporary variable (sanderegg, Sep 12, 2023)
5d58d9b  added test for rpc calls (sanderegg, Sep 12, 2023)
1af9f61  better handling of issues in comp scheduler (sanderegg, Sep 12, 2023)

3 changes: 3 additions & 0 deletions .env-devel
@@ -25,6 +25,9 @@ CATALOG_DEV_FEATURES_ENABLED=0
CATALOG_SERVICES_DEFAULT_RESOURCES='{"CPU": {"limit": 0.1, "reservation": 0.1}, "RAM": {"limit": 2147483648, "reservation": 2147483648}}'
CATALOG_SERVICES_DEFAULT_SPECIFICATIONS='{}'

CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION=5
CLUSTERS_KEEPER_TASK_INTERVAL=60

DASK_SCHEDULER_HOST=dask-scheduler
DASK_SCHEDULER_PORT=8786

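A minimal sketch of how these two new variables could be consumed, assuming a pydantic v1 BaseSettings class (pydantic v1 is what the validators below suggest); the class name and the unit comment are illustrative, not the actual clusters-keeper settings module:

from pydantic import BaseSettings, NonNegativeInt, PositiveInt

class _ClustersKeeperSettingsSketch(BaseSettings):
    # how many heartbeats a cluster may miss before the keeper terminates it
    CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION: NonNegativeInt = 5
    # interval between two runs of the keeper's periodic task (presumably seconds)
    CLUSTERS_KEEPER_TASK_INTERVAL: PositiveInt = 60

settings = _ClustersKeeperSettingsSketch()  # values are read from the environment
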
@@ -36,6 +36,10 @@ class ComputationCreate(BaseModel):
description="the computation shall use the cluster described by its id, 0 is the default cluster",
)
simcore_user_agent: str = ""
use_on_demand_clusters: bool = Field(
default=False,
description="if True, a cluster will be created as necessary (wallet_id cannot be None, and cluster_id must be None)",
)

@validator("product_name", always=True)
@classmethod
@@ -45,6 +49,14 @@ def ensure_product_name_defined_if_computation_starts(cls, v, values):
raise ValueError(msg)
return v

@validator("use_on_demand_clusters", always=True)
@classmethod
def ensure_expected_options(cls, v, values):
if v is True and ("cluster_id" in values and values["cluster_id"] is not None):
msg = "cluster_id cannot be set if use_on_demand_clusters is set"
raise ValueError(msg)
return v


class ComputationStop(BaseModel):
user_id: UserID
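The new validator on ComputationCreate makes cluster_id and use_on_demand_clusters mutually exclusive. A standalone repro of just that rule (pydantic v1, hypothetical model name):

from pydantic import BaseModel, ValidationError, validator

class _ComputationCreateSketch(BaseModel):
    cluster_id: int | None = None
    use_on_demand_clusters: bool = False

    @validator("use_on_demand_clusters", always=True)
    @classmethod
    def _ensure_exclusive_options(cls, v, values):
        # mirrors ensure_expected_options in the diff above
        if v is True and values.get("cluster_id") is not None:
            msg = "cluster_id cannot be set if use_on_demand_clusters is set"
            raise ValueError(msg)
        return v

_ComputationCreateSketch(use_on_demand_clusters=True)  # valid: on-demand, no cluster_id
try:
    _ComputationCreateSketch(cluster_id=0, use_on_demand_clusters=True)
except ValidationError as exc:
    print(exc)  # rejected: cluster_id cannot be set if use_on_demand_clusters is set
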
2 changes: 1 addition & 1 deletion packages/models-library/src/models_library/clusters.py
@@ -109,7 +109,7 @@ class BaseCluster(BaseModel):
type: ClusterTypeInModel
owner: GroupID
thumbnail: HttpUrl | None = Field(
None,
default=None,
description="url to the image describing this cluster",
examples=["https://placeimg.com/171/96/tech/grayscale/?0.jpg"],
)
@@ -26,13 +26,15 @@ class RunningState(str, Enum):
SUCCESS = "SUCCESS"
FAILED = "FAILED"
ABORTED = "ABORTED"
WAITING_FOR_CLUSTER = "WAITING_FOR_CLUSTER"

def is_running(self) -> bool:
return self in (
RunningState.PUBLISHED,
RunningState.PENDING,
RunningState.WAITING_FOR_RESOURCES,
RunningState.STARTED,
RunningState.WAITING_FOR_CLUSTER,
)


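The practical effect of the new member: a pipeline parked on WAITING_FOR_CLUSTER still counts as running, so clients keep polling while the on-demand cluster spins up. A condensed, runnable check (members as visible in this diff; the real enum may define more):

from enum import Enum

class RunningState(str, Enum):
    PUBLISHED = "PUBLISHED"
    PENDING = "PENDING"
    WAITING_FOR_RESOURCES = "WAITING_FOR_RESOURCES"
    STARTED = "STARTED"
    SUCCESS = "SUCCESS"
    FAILED = "FAILED"
    ABORTED = "ABORTED"
    WAITING_FOR_CLUSTER = "WAITING_FOR_CLUSTER"

    def is_running(self) -> bool:
        return self in (
            RunningState.PUBLISHED,
            RunningState.PENDING,
            RunningState.WAITING_FOR_RESOURCES,
            RunningState.STARTED,
            RunningState.WAITING_FOR_CLUSTER,
        )

assert RunningState.WAITING_FOR_CLUSTER.is_running()
assert not RunningState.FAILED.is_running()
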
@@ -0,0 +1,23 @@
from enum import auto

from pydantic import AnyUrl, BaseModel

from ..clusters import ClusterAuthentication
from ..users import UserID
from ..utils.enums import StrAutoEnum
from ..wallets import WalletID


class ClusterState(StrAutoEnum):
STARTED = auto()
RUNNING = auto()
STOPPED = auto()


class OnDemandCluster(BaseModel):
endpoint: AnyUrl
authentication: ClusterAuthentication
state: ClusterState
user_id: UserID
wallet_id: WalletID
gateway_ready: bool
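
For illustration, the kind of payload the clusters-keeper could hand back over RPC while the EC2 instance is booted but its embedded dask-gateway is not answering yet. All values here are made up, and SimpleAuthentication is one concrete ClusterAuthentication taken from models_library.clusters:

from models_library.clusters import SimpleAuthentication
from pydantic import AnyUrl, SecretStr, parse_obj_as

cluster = OnDemandCluster(
    endpoint=parse_obj_as(AnyUrl, "http://10.0.0.1"),
    authentication=SimpleAuthentication(username="42", password=SecretStr("a-password")),
    state=ClusterState.RUNNING,  # the EC2 instance reports "running"
    user_id=42,
    wallet_id=1,
    gateway_ready=False,  # caller should keep waiting before submitting tasks
)
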
@@ -0,0 +1,58 @@
"""add use on demand clusters in comp_runs

Revision ID: 2cd329e47ea1
Revises: 763666c698fb
Create Date: 2023-09-04 06:57:51.291084+00:00

"""
import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "2cd329e47ea1"
down_revision = "f53806935760"
branch_labels = None
depends_on = None


def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.add_column(
"comp_runs", sa.Column("use_on_demand_clusters", sa.Boolean(), nullable=True)
)
# ### end Alembic commands ###
op.execute(
sa.DDL(
"UPDATE comp_runs SET use_on_demand_clusters = false WHERE use_on_demand_clusters IS NULL"
)
)

op.alter_column(
"comp_runs",
"use_on_demand_clusters",
existing_type=sa.Boolean(),
nullable=False,
)

# new statetype
op.execute(
sa.DDL("ALTER TYPE statetype ADD VALUE IF NOT EXISTS 'WAITING_FOR_CLUSTER'")
)


def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column("comp_runs", "use_on_demand_clusters")
# ### end Alembic commands ###

# no need to downgrade the enum type: Postgres does not allow removing a value
# from an enum type; instead, the rows in the tables that use it are updated
op.execute(
sa.DDL(
"""
UPDATE comp_tasks SET state = 'PUBLISHED' WHERE state = 'WAITING_FOR_CLUSTER';
UPDATE comp_pipeline SET state = 'PUBLISHED' WHERE state = 'WAITING_FOR_CLUSTER';
UPDATE comp_runs SET result = 'PUBLISHED' WHERE result = 'WAITING_FOR_CLUSTER';
"""
)
)
@@ -23,6 +23,7 @@ class StateType(enum.Enum):
FAILED = "FAILED"
ABORTED = "ABORTED"
WAITING_FOR_RESOURCES = "WAITING_FOR_RESOURCES"
WAITING_FOR_CLUSTER = "WAITING_FOR_CLUSTER"


def _new_uuid():
@@ -100,5 +100,11 @@
doc="When the run was finished",
),
sa.Column("metadata", JSONB, nullable=True, doc="the run optional metadata"),
sa.Column(
"use_on_demand_clusters",
sa.Boolean(),
nullable=False,
doc="the run uses on demand clusters",
),
sa.UniqueConstraint("project_uuid", "user_id", "iteration"),
)
4 changes: 0 additions & 4 deletions services/clusters-keeper/docker/entrypoint.sh
@@ -63,10 +63,6 @@ if [ "${SC_BUILD_TARGET}" = "development" ]; then
fi
fi

if [ "${SC_BOOT_MODE}" = "debug-ptvsd" ]; then
# NOTE: production does NOT pre-installs ptvsd
pip install --no-cache-dir debugpy
fi

# Appends docker group if socket is mounted
DOCKER_MOUNT=/var/run/docker.sock
@@ -43,12 +43,12 @@ def get_summary() -> str:
_______ _ _______ _________ _______ _______ _______ _ _______ _______ _______ _______ _______
( ____ \( \ |\ /|( ____ \\__ __/( ____ \( ____ )( ____ \ | \ /\( ____ \( ____ \( ____ )( ____ \( ____ )
| ( \/| ( | ) ( || ( \/ ) ( | ( \/| ( )|| ( \/ | \ / /| ( \/| ( \/| ( )|| ( \/| ( )|
| | | | | | | || (_____ | | | (__ | (____)|| (_____ | (_/ / | (__ | (__ | (____)|| (__ | (____)|
| | | | | | | |(_____ ) | | | __) | __)(_____ ) | _ ( | __) | __) | _____)| __) | __)
| | | | | | | || (_____ | | | (__ | (____)|| (_____ _____ | (_/ / | (__ | (__ | (____)|| (__ | (____)|
| | | | | | | |(_____ ) | | | __) | __)(_____ )(_____)| _ ( | __) | __) | _____)| __) | __)
| | | | | | | | ) | | | | ( | (\ ( ) | | ( \ \ | ( | ( | ( | ( | (\ (
| (____/\| (____/\| (___) |/\____) | | | | (____/\| ) \ \__/\____) | | / \ \| (____/\| (____/\| ) | (____/\| ) \ \__
(_______/(_______/(_______)\_______) )_( (_______/|/ \__/\_______) _____ |_/ \/(_______/(_______/|/ (_______/|/ \__/
(_____) {}
(_______/(_______/(_______)\_______) )_( (_______/|/ \__/\_______) |_/ \/(_______/(_______/|/ (_______/|/ \__/
{}
""".format(
f"v{__version__}"
)
@@ -1,13 +1,8 @@
import datetime
from dataclasses import dataclass
from enum import auto
from typing import TypeAlias

from models_library.clusters import ClusterAuthentication, SimpleAuthentication
from models_library.users import UserID
from models_library.utils.enums import StrAutoEnum
from models_library.wallets import WalletID
from pydantic import AnyUrl, BaseModel, ByteSize, PositiveInt, SecretStr, parse_obj_as
from pydantic import ByteSize, PositiveInt
from types_aiobotocore_ec2.literals import InstanceStateNameType, InstanceTypeType


@@ -31,48 +26,3 @@ class EC2InstanceData:
type: InstanceTypeType # noqa: A003
state: InstanceStateNameType
tags: EC2Tags


class ClusterState(StrAutoEnum):
STARTED = auto()
RUNNING = auto()
STOPPED = auto()


def _convert_ec2_state_to_cluster_state(
ec2_state: InstanceStateNameType,
) -> ClusterState:
match ec2_state:
case "pending":
return ClusterState.STARTED # type: ignore
case "running":
return ClusterState.RUNNING # type: ignore
case _:
return ClusterState.STOPPED # type: ignore


class ClusterGet(BaseModel):
endpoint: AnyUrl
authentication: ClusterAuthentication
state: ClusterState
user_id: UserID
wallet_id: WalletID
gateway_ready: bool = False

@classmethod
def from_ec2_instance_data(
cls,
instance: EC2InstanceData,
user_id: UserID,
wallet_id: WalletID,
gateway_password: SecretStr,
) -> "ClusterGet":
return cls(
endpoint=parse_obj_as(AnyUrl, f"http://{instance.aws_public_ip}"),
authentication=SimpleAuthentication(
username=f"{user_id}", password=gateway_password
),
state=_convert_ec2_state_to_cluster_state(instance.state),
user_id=user_id,
wallet_id=wallet_id,
)
@@ -80,7 +80,7 @@ async def cluster_heartbeat(

async def set_instance_heartbeat(app: FastAPI, *, instance: EC2InstanceData) -> None:
with log_context(
_logger, logging.INFO, msg=f"set instance heartbeat for {instance.id}"
_logger, logging.DEBUG, msg=f"set instance heartbeat for {instance.id}"
):
ec2_client = get_ec2_client(app)
await ec2_client.set_instances_tags(
@@ -30,17 +30,22 @@ async def _find_terminateable_instances(
# get the corresponding ec2 instance data
terminateable_instances: list[EC2InstanceData] = []

time_to_wait_before_termination = (
app_settings.CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION
* app_settings.SERVICE_TRACKING_HEARTBEAT
)
for instance in instances:
last_heartbeat = _get_instance_last_heartbeat(instance)

elapsed_time_since_heartbeat = (
datetime.datetime.now(datetime.timezone.utc) - last_heartbeat
)

if elapsed_time_since_heartbeat >= (
app_settings.CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION
* app_settings.SERVICE_TRACKING_HEARTBEAT
):
_logger.info(
"%s has still %ss before being terminateable",
f"{instance.id=}",
f"{(time_to_wait_before_termination - elapsed_time_since_heartbeat).total_seconds()}",
)
if elapsed_time_since_heartbeat >= time_to_wait_before_termination:
# let's terminate that one
terminateable_instances.append(instance)

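The refactor hoists the grace-period computation out of the loop and logs, per instance, how long until it becomes terminateable. In numbers, assuming SERVICE_TRACKING_HEARTBEAT is 60 seconds (its actual default lives in the service settings, not in this diff):

import datetime

max_missed = 5  # CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION
heartbeat_interval = datetime.timedelta(seconds=60)  # SERVICE_TRACKING_HEARTBEAT (assumed)
time_to_wait_before_termination = max_missed * heartbeat_interval  # 5 minutes

now = datetime.datetime.now(datetime.timezone.utc)
last_heartbeat = now - datetime.timedelta(minutes=3)
elapsed_time_since_heartbeat = now - last_heartbeat

# 3 minutes elapsed < 5 minutes grace period: not terminateable for another 120s
assert elapsed_time_since_heartbeat < time_to_wait_before_termination
remaining = time_to_wait_before_termination - elapsed_time_since_heartbeat
print(f"{remaining.total_seconds()}s before being terminateable")
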
@@ -1,6 +1,7 @@
import asyncio
import logging
from typing import Any, Coroutine, Final
from collections.abc import Coroutine
from typing import Any, Final

import dask_gateway
from aiohttp.client_exceptions import ClientError
@@ -22,15 +23,19 @@ async def ping_gateway(*, url: AnyUrl, password: SecretStr) -> bool:
auth=basic_auth,
asynchronous=True,
) as gateway:
cluster_reports = await asyncio.wait_for(gateway.list_clusters(), timeout=5)
_logger.info("found %s clusters", len(cluster_reports))
await asyncio.wait_for(gateway.list_clusters(), timeout=5)
return True
except asyncio.TimeoutError:
_logger.debug("gateway ping timed-out, it is still starting...")
except ClientError:
_logger.info(
"osparc-gateway %s ping timed-out, the machine is likely still starting...",
url,
)
except (ClientError, ValueError):
# this could happen if the gateway is not properly started, but it should not last
# unless the wrong password is used.
_logger.info("dask-gateway is not reachable", exc_info=True)
_logger.info(
"Machine is up but osparc-gateway %s is not reachable...yet?!", url
)

return False
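
A possible caller-side pattern (hypothetical helper, not part of this diff): since ping_gateway swallows timeouts and client errors and simply returns False while the machine or its gateway is still starting, callers can poll it until it flips to True:

import asyncio
from pydantic import AnyUrl, SecretStr

async def wait_until_gateway_ready(url: AnyUrl, password: SecretStr) -> None:
    # ping_gateway (above) returns False while the gateway is still starting
    while not await ping_gateway(url=url, password=password):
        await asyncio.sleep(5)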

@@ -66,9 +71,20 @@ async def is_gateway_busy(*, url: AnyUrl, gateway_auth: SimpleAuthentication) ->
client.list_datasets()
)
_logger.info(
"cluster currently has %s datasets, it is %s",
len(datasets_on_scheduler),
"BUSY" if len(datasets_on_scheduler) > 0 else "NOT BUSY",
"cluster currently has %s datasets", len(datasets_on_scheduler)
)
currently_processing = await _wrap_client_async_routine(client.processing())
return bool(datasets_on_scheduler or currently_processing)
num_processing_tasks = 0
if worker_to_processing_tasks := await _wrap_client_async_routine(
client.processing()
):
_logger.info(
"cluster current workers: %s", worker_to_processing_tasks.keys()
)
num_processing_tasks = sum(
len(tasks) for tasks in worker_to_processing_tasks.values()
)
_logger.info(
"cluster currently processes %s tasks", num_processing_tasks
)

return bool(datasets_on_scheduler or num_processing_tasks)
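
The busy test now counts in-flight tasks across all workers instead of logging only dataset counts. A toy check of the final expression:

# no datasets published, but one worker still has two tasks in processing
datasets_on_scheduler = ()
worker_to_processing_tasks = {"tcp://10.0.0.2:40001": ["task-a", "task-b"]}
num_processing_tasks = sum(len(tasks) for tasks in worker_to_processing_tasks.values())
assert bool(datasets_on_scheduler or num_processing_tasks)  # busy: keep the cluster alive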