Skip to content

Commit

Permalink
fix: Wrong creation of raw_configs in _create_kernels_in_one_agent (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jopemachine authored and Yaminyam committed Feb 12, 2025
1 parent 375b45c commit fd71c07
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 50 deletions.
1 change: 1 addition & 0 deletions changes/2896.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `raw_configs` in `_create_kernels_in_one_agent` being assigned a single-element list instead of accumulating one config entry per kernel binding
97 changes: 47 additions & 50 deletions src/ai/backend/manager/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,7 @@ def get_image_conf(kernel: KernelRow) -> ImageConfig:

kernel_image_refs: dict[KernelId, ImageRef] = {}

raw_configs = []
async with self.db.begin_readonly_session() as db_sess:
for binding in items:
kernel_image_refs[binding.kernel.id] = (
Expand All @@ -1647,57 +1648,53 @@ def get_image_conf(kernel: KernelRow) -> ImageConfig:
)
).image_ref

raw_configs = [
{
"image": {
# TODO: refactor registry and is_local to be specified per kernel.
"registry": get_image_conf(binding.kernel)["registry"],
"digest": get_image_conf(binding.kernel)["digest"],
"repo_digest": get_image_conf(binding.kernel)["repo_digest"],
"canonical": get_image_conf(binding.kernel)["canonical"],
"architecture": get_image_conf(binding.kernel)["architecture"],
"labels": get_image_conf(binding.kernel)["labels"],
"is_local": get_image_conf(binding.kernel)["is_local"],
},
"session_type": scheduled_session.session_type.value,
"cluster_role": binding.kernel.cluster_role,
"cluster_idx": binding.kernel.cluster_idx,
"local_rank": binding.kernel.local_rank,
"cluster_hostname": binding.kernel.cluster_hostname,
"idle_timeout": idle_timeout,
"mounts": [
item.to_json() for item in scheduled_session.vfolder_mounts
raw_configs.append({
"image": {
# TODO: refactor registry and is_local to be specified per kernel.
"registry": get_image_conf(binding.kernel)["registry"],
"digest": get_image_conf(binding.kernel)["digest"],
"repo_digest": get_image_conf(binding.kernel)["repo_digest"],
"canonical": get_image_conf(binding.kernel)["canonical"],
"architecture": get_image_conf(binding.kernel)["architecture"],
"labels": get_image_conf(binding.kernel)["labels"],
"is_local": get_image_conf(binding.kernel)["is_local"],
},
"session_type": scheduled_session.session_type.value,
"cluster_role": binding.kernel.cluster_role,
"cluster_idx": binding.kernel.cluster_idx,
"local_rank": binding.kernel.local_rank,
"cluster_hostname": binding.kernel.cluster_hostname,
"idle_timeout": idle_timeout,
"mounts": [item.to_json() for item in scheduled_session.vfolder_mounts],
"environ": {
# inherit per-session environment variables
**scheduled_session.environ,
# set per-kernel environment variables
"BACKENDAI_KERNEL_ID": str(binding.kernel.id),
"BACKENDAI_KERNEL_IMAGE": get_image_conf(binding.kernel)[
"canonical"
],
"environ": {
# inherit per-session environment variables
**scheduled_session.environ,
# set per-kernel environment variables
"BACKENDAI_KERNEL_ID": str(binding.kernel.id),
"BACKENDAI_KERNEL_IMAGE": get_image_conf(binding.kernel)[
"canonical"
],
"BACKENDAI_CLUSTER_ROLE": binding.kernel.cluster_role,
"BACKENDAI_CLUSTER_IDX": str(binding.kernel.cluster_idx),
"BACKENDAI_CLUSTER_LOCAL_RANK": str(binding.kernel.local_rank),
"BACKENDAI_CLUSTER_HOST": str(binding.kernel.cluster_hostname),
"BACKENDAI_SERVICE_PORTS": str(
get_image_conf(binding.kernel)["labels"].get(
"ai.backend.service-ports"
)
),
},
"resource_slots": binding.kernel.requested_slots.to_json(),
"resource_opts": binding.kernel.resource_opts,
"bootstrap_script": binding.kernel.bootstrap_script,
"startup_command": binding.kernel.startup_command,
"internal_data": scheduled_session.main_kernel.internal_data,
"auto_pull": get_image_conf(binding.kernel)["auto_pull"],
"preopen_ports": scheduled_session.main_kernel.preopen_ports,
"allocated_host_ports": list(binding.allocated_host_ports),
"agent_addr": binding.agent_alloc_ctx.agent_addr,
"scaling_group": binding.agent_alloc_ctx.scaling_group,
}
]
"BACKENDAI_CLUSTER_ROLE": binding.kernel.cluster_role,
"BACKENDAI_CLUSTER_IDX": str(binding.kernel.cluster_idx),
"BACKENDAI_CLUSTER_LOCAL_RANK": str(binding.kernel.local_rank),
"BACKENDAI_CLUSTER_HOST": str(binding.kernel.cluster_hostname),
"BACKENDAI_SERVICE_PORTS": str(
get_image_conf(binding.kernel)["labels"].get(
"ai.backend.service-ports"
)
),
},
"resource_slots": binding.kernel.requested_slots.to_json(),
"resource_opts": binding.kernel.resource_opts,
"bootstrap_script": binding.kernel.bootstrap_script,
"startup_command": binding.kernel.startup_command,
"internal_data": scheduled_session.main_kernel.internal_data,
"auto_pull": get_image_conf(binding.kernel)["auto_pull"],
"preopen_ports": scheduled_session.main_kernel.preopen_ports,
"allocated_host_ports": list(binding.allocated_host_ports),
"agent_addr": binding.agent_alloc_ctx.agent_addr,
"scaling_group": binding.agent_alloc_ctx.scaling_group,
})

raw_kernel_ids = [str(binding.kernel.id) for binding in items]

Expand Down

0 comments on commit fd71c07

Please sign in to comment.