Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ocm-machine-pools] nodepool subnet specdrift #4140

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions reconcile/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2394,11 +2394,14 @@ def ocm_addons_upgrade_tests_trigger(ctx):


@integration.command(short_help="Manage Machine Pools in OCM.")
@gitlab_project_id
@click.pass_context
def ocm_machine_pools(ctx):
def ocm_machine_pools(ctx, gitlab_project_id):
import reconcile.ocm_machine_pools

run_integration(reconcile.ocm_machine_pools, ctx.obj)
run_integration(
reconcile.ocm_machine_pools, ctx.obj, gitlab_project_id=gitlab_project_id
)


@integration.command(short_help="Manage Upgrade Policy schedules in OCM organizations.")
Expand Down
121 changes: 110 additions & 11 deletions reconcile/ocm_machine_pools.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
root_validator,
)

from reconcile import queries
from reconcile import mr_client_gateway, queries
from reconcile.gql_definitions.common.clusters import (
ClusterMachinePoolV1,
ClusterSpecAutoScaleV1,
Expand All @@ -25,6 +25,7 @@
from reconcile.typed_queries.clusters import get_clusters
from reconcile.utils.differ import diff_mappings
from reconcile.utils.disabled_integrations import integration_is_enabled
from reconcile.utils.mr.cluster_machine_pool_updates import ClustersMachinePoolUpdates
from reconcile.utils.ocm import (
DEFAULT_OCM_MACHINE_POOL_ID,
OCM,
Expand Down Expand Up @@ -134,18 +135,32 @@ def has_diff(self, pool: ClusterMachinePoolV1) -> bool:
def invalid_diff(self, pool: ClusterMachinePoolV1) -> Optional[str]:
pass

def get_pool_spec_to_update(
self, pool: ClusterMachinePoolV1
) -> Optional[ClusterMachinePoolV1]:
"""
For situations where OCM holds the source of truth for parts of the spec,
this function detects a drift in the app-interface spec and returns a
new pool spec to be updated in app-interface.

If no drift is detected, None is returned.
"""
return None

@abstractmethod
def deletable(self) -> bool:
pass

def _has_diff_autoscale(self, pool):
def _has_diff_autoscale(self, pool: ClusterMachinePoolV1) -> bool:
match (self.autoscaling, pool.autoscale):
case (None, None):
return False
case (None, _) | (_, None):
return True
case _:
case _ if self.autoscaling and pool.autoscale:
return self.autoscaling.has_diff(pool.autoscale)
case _:
return False


class MachinePool(AbstractPool):
Expand Down Expand Up @@ -232,7 +247,7 @@ class NodePool(AbstractPool):
subnet: Optional[str]

def delete(self, ocm: OCM) -> None:
ocm.delete_node_pool(self.cluster, self.dict(by_alias=True))
ocm.delete_node_pool(self.cluster, self.id)

def create(self, ocm: OCM) -> None:
spec = self.dict(by_alias=True)
Expand Down Expand Up @@ -262,7 +277,11 @@ def has_diff(self, pool: ClusterMachinePoolV1) -> bool:
or self.taints != pool.taints
or self.labels != pool.labels
or self.aws_node_pool.instance_type != pool.instance_type
or self.subnet != pool.subnet
# if the nodepool in app-interface does not define a subnet explicitely
# we don't consider it a diff. we don't manage it, but `get_pool_spec_to_update`
# will highlight it as a spec drift that is going to be fixed with an MR
# towards app-interface
or (pool.subnet is not None and self.subnet != pool.subnet)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this means we'll have non matching data between app-interface and the reality ?

Copy link
Contributor Author

@geoberle geoberle Mar 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i added the promised phase 2 to this MR in the to open an app-interface MR to patch the missing subnets. this way we don't have non-matching data.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does that mean we should never apply subnet from app-interface to ocm? When app-interface pool subnet is not None and we have a different subnet value in current pool, this condition will be true, and show as diff, is this expected?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can and we should specify subnets for day2 node pool operations. but at installtime we can't influence it and we end up with a set of pools where ROSA decides how to spread them accross the available private subnets.

we can improve on this behaviour once there is the ability to create HCP clusters without node pools at install time. becaues then we wait until the cluster is ready and ocm-machine-pools can create the pools in the explicitely defined subnets.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does that mean we should never apply subnet from app-interface to ocm? When app-interface pool subnet is not None and we have a different subnet value in current pool, this condition will be true, and show as diff, is this expected?

once a node pool has been created, its subnet ID can't be changed anymore. at this point OCM becomes the source of truth for it. we implemented the notion of an invalid_diff to highlight the problem, so it is expected that has_diff shows a diff in subnet between a-i and OCM.

So for the use case of nodepools have been recreated on OCM side without app-interface involvement, this diff will show in has_diff but will fail invalid_diff check, the integration will fail until patch mr merged, is this expected?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, it still makes us aware and we need to intervene. we should not do things manually in OCM.

or self._has_diff_autoscale(pool)
)

Expand All @@ -273,6 +292,27 @@ def invalid_diff(self, pool: ClusterMachinePoolV1) -> Optional[str]:
return "subnet"
return None

def get_pool_spec_to_update(
self, pool: ClusterMachinePoolV1
) -> Optional[ClusterMachinePoolV1]:
"""
if app-interface does not define a subnet explicitely or if the
subnet is different from OCM, we consider the spec in app-interface oudated.

In such a case this method returns a clone of the current pool definition
in app-interface with the subnet updated to the one in OCM. if no update
is required, None is returned

How can this happen?
* a cluster was created without specified subnets and ROSA CLI picks the subnet
assignment for the nodepools
* nodepools have been recreated on OCM side without app-interface involvement
and the subnet changed
"""
if pool.subnet is None or self.subnet != pool.subnet:
return pool.copy(update={"subnet": self.subnet})
return None

def deletable(self) -> bool:
return True

Expand Down Expand Up @@ -327,6 +367,7 @@ def act(self, dry_run: bool, ocm: OCM) -> None:


class DesiredMachinePool(BaseModel):
cluster_path: str
cluster_name: str
cluster_type: ClusterType
pools: list[ClusterMachinePoolV1]
Expand Down Expand Up @@ -354,6 +395,7 @@ def fetch_current_state(
return {
c.name: fetch_current_state_for_cluster(c, ocm_map.get(c.name))
for c in clusters
if c.spec and c.spec.q_id
}


Expand All @@ -373,7 +415,7 @@ def _classify_cluster_type(cluster: ClusterV1) -> ClusterType:
raise ValueError(f"unknown cluster type for cluster {cluster.name}")


def fetch_current_state_for_cluster(cluster, ocm):
def fetch_current_state_for_cluster(cluster: ClusterV1, ocm: OCM) -> list[AbstractPool]:
cluster_type = _classify_cluster_type(cluster)
if cluster_type == ClusterType.ROSA_HCP:
return [
Expand Down Expand Up @@ -423,12 +465,13 @@ def create_desired_state_from_gql(
) -> dict[str, DesiredMachinePool]:
return {
cluster.name: DesiredMachinePool(
cluster_path=cluster.path,
cluster_name=cluster.name,
cluster_type=_classify_cluster_type(cluster),
pools=cluster.machine_pools,
)
for cluster in clusters
if cluster.machine_pools is not None
if cluster.machine_pools is not None and cluster.spec and cluster.spec.q_id
}


Expand Down Expand Up @@ -504,19 +547,70 @@ def calculate_diff(
return diffs, errors


def act(dry_run: bool, diffs: Iterable[PoolHandler], ocm_map: OCMMap) -> None:
def _drift_updates(
current_pools: list[AbstractPool],
desired: DesiredMachinePool,
) -> list[ClusterMachinePoolV1]:
current_machine_pools = {pool.id: pool for pool in current_pools}
return [
spec
for pool in desired.pools
if (current_pool := current_machine_pools.get(pool.q_id))
and (spec := current_pool.get_pool_spec_to_update(pool)) is not None
]


def calculate_spec_drift(
current_state: Mapping[str, list[AbstractPool]],
desired_state: Mapping[str, DesiredMachinePool],
) -> dict[str, list[ClusterMachinePoolV1]]:
"""
Finds spec drifts between OCM and app-interface and returns a list of them.
"""
return {
desired_state[k].cluster_path: updates
for k in current_state.keys() & desired_state.keys()
if (updates := _drift_updates(current_state[k], desired_state[k]))
}


def update_ocm(dry_run: bool, diffs: Iterable[PoolHandler], ocm_map: OCMMap) -> None:
for diff in diffs:
logging.info([diff.action, diff.pool.cluster, diff.pool.id])
if not dry_run:
ocm = ocm_map.get(diff.pool.cluster)
diff.act(dry_run, ocm)


def update_app_interface(
dry_run: bool,
gitlab_project_id: Optional[str],
diffs: dict[str, list[ClusterMachinePoolV1]],
) -> None:
if not diffs:
return

mr = ClustersMachinePoolUpdates(
machine_pool_updates={
cluster_path: [pool.dict(by_alias=True) for pool in pool_updates]
for cluster_path, pool_updates in diffs.items()
}
)
if not dry_run:
with mr_client_gateway.init(gitlab_project_id=gitlab_project_id) as mr_cli:
mr.submit(cli=mr_cli)


def _cluster_is_compatible(cluster: ClusterV1) -> bool:
return cluster.ocm is not None and cluster.machine_pools is not None
return (
cluster.ocm is not None
and cluster.machine_pools is not None
and cluster.spec is not None
and cluster.spec.q_id is not None
)


def run(dry_run: bool):
def run(dry_run: bool, gitlab_project_id: Optional[str] = None):
clusters = get_clusters()

filtered_clusters = [
Expand All @@ -540,9 +634,14 @@ def run(dry_run: bool):

current_state = fetch_current_state(ocm_map, filtered_clusters)
desired_state = create_desired_state_from_gql(filtered_clusters)

# handle diff towards OCM
diffs, errors = calculate_diff(current_state, desired_state)
update_ocm(dry_run, diffs, ocm_map)

act(dry_run, diffs, ocm_map)
# handle diffs towards app-interface
spec_drift = calculate_spec_drift(current_state, desired_state)
update_app_interface(dry_run, gitlab_project_id, spec_drift)

if errors:
for err in errors:
Expand Down
Loading