Skip to content

Commit

Permalink
fix: Make DataSourceOracle more resilient to early network issues (#5025)
Browse files Browse the repository at this point in the history

If DataSourceOracle fails to retrieve IMDS info in init-local, try
again during network timeframe. Additionally, ensure max_wait,
timeout, and retries are respected.

LP: #2056194
  • Loading branch information
TheRealFalcon authored Mar 8, 2024
1 parent 621c522 commit eadaaf0
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 95 deletions.
116 changes: 87 additions & 29 deletions cloudinit/sources/DataSourceOracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@

import base64
import ipaddress
import json
import logging
import time
from collections import namedtuple
from typing import Optional, Tuple
from typing import Dict, Optional, Tuple

from cloudinit import atomic_helper, dmi, net, sources, util
from cloudinit.distros.networking import NetworkConfig
Expand All @@ -27,7 +29,7 @@
get_interfaces_by_mac,
is_netfail_master,
)
from cloudinit.url_helper import UrlError, readurl
from cloudinit.url_helper import wait_for_url

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -122,6 +124,11 @@ class DataSourceOracle(sources.DataSource):
)

_network_config: dict = {"config": [], "version": 1}
perform_dhcp_setup = True

# Careful...these can be overridden in __init__
url_max_wait = 30
url_timeout = 5

def __init__(self, sys_cfg, *args, **kwargs):
super(DataSourceOracle, self).__init__(sys_cfg, *args, **kwargs)
Expand All @@ -135,6 +142,10 @@ def __init__(self, sys_cfg, *args, **kwargs):
)
self._network_config_source = KlibcOracleNetworkConfigSource()

url_params = self.get_url_params()
self.url_max_wait = url_params.max_wait_seconds
self.url_timeout = url_params.timeout_seconds

def _has_network_config(self) -> bool:
return bool(self._network_config.get("config", []))

Expand All @@ -147,23 +158,31 @@ def _get_data(self):

self.system_uuid = _read_system_uuid()

network_context = ephemeral.EphemeralDHCPv4(
self.distro,
iface=net.find_fallback_nic(),
connectivity_url_data={
"url": METADATA_PATTERN.format(version=2, path="instance"),
"headers": V2_HEADERS,
},
)
if self.perform_dhcp_setup:
network_context = ephemeral.EphemeralDHCPv4(
self.distro,
iface=net.find_fallback_nic(),
connectivity_url_data={
"url": METADATA_PATTERN.format(version=2, path="instance"),
"headers": V2_HEADERS,
},
)
else:
network_context = util.nullcontext()
fetch_primary_nic = not self._is_iscsi_root()
fetch_secondary_nics = self.ds_cfg.get(
"configure_secondary_nics",
BUILTIN_DS_CONFIG["configure_secondary_nics"],
)

with network_context:
fetched_metadata = read_opc_metadata(
fetch_vnics_data=fetch_primary_nic or fetch_secondary_nics
fetch_vnics_data=fetch_primary_nic or fetch_secondary_nics,
max_wait=self.url_max_wait,
timeout=self.url_timeout,
)
if not fetched_metadata:
return False

data = self._crawled_metadata = fetched_metadata.instance_data
self.metadata_address = METADATA_ROOT.format(
Expand Down Expand Up @@ -331,6 +350,10 @@ def _add_network_config_from_opc_imds(self, set_primary: bool = False):
self._network_config["ethernets"][name] = interface_config


class DataSourceOracleNet(DataSourceOracle):
    """Variant of DataSourceOracle with ``perform_dhcp_setup`` disabled."""

    perform_dhcp_setup = False


def _read_system_uuid() -> Optional[str]:
    """Return the DMI ``system-uuid`` value in lower case.

    :return: The lower-cased UUID string, or None when the DMI lookup
        yields None.
    """
    raw_uuid = dmi.read_dmi_data("system-uuid")
    if raw_uuid is None:
        return None
    return raw_uuid.lower()
Expand All @@ -341,15 +364,20 @@ def _is_platform_viable() -> bool:
return asset_tag == CHASSIS_ASSET_TAG


def _fetch(metadata_version: int, path: str, retries: int = 2) -> dict:
    """Read one IMDS endpoint and return its JSON-decoded body.

    :param metadata_version: Version substituted into METADATA_PATTERN;
        versions above 1 are requested with V2_HEADERS.
    :param path: Endpoint path substituted into METADATA_PATTERN.
    :param retries: Retry count passed through to ``readurl``.
    :return: The result of calling ``.json()`` on the underlying response.
    """
    target_url = METADATA_PATTERN.format(version=metadata_version, path=path)
    request_headers = None
    if metadata_version > 1:
        request_headers = V2_HEADERS
    url_response = readurl(
        url=target_url,
        headers=request_headers,
        retries=retries,
    )
    return url_response._response.json()
def _url_version(url: str) -> int:
return 2 if url.startswith("http://169.254.169.254/opc/v2") else 1


def _headers_cb(url: str) -> Optional[Dict[str, str]]:
    """Choose the request headers for *url* based on its API version.

    :param url: The URL about to be requested.
    :return: V2_HEADERS when ``_url_version(url)`` is 2, otherwise None.
    """
    if _url_version(url) != 2:
        return None
    return V2_HEADERS


def read_opc_metadata(*, fetch_vnics_data: bool = False) -> OpcMetadata:
def read_opc_metadata(
*,
fetch_vnics_data: bool = False,
max_wait=DataSourceOracle.url_max_wait,
timeout=DataSourceOracle.url_timeout,
) -> Optional[OpcMetadata]:
"""Fetch metadata from the /opc/ routes.
:return:
Expand All @@ -358,30 +386,60 @@ def read_opc_metadata(*, fetch_vnics_data: bool = False) -> OpcMetadata:
The JSON-decoded value of the instance data endpoint on the IMDS
The JSON-decoded value of the vnics data endpoint if
`fetch_vnics_data` is True, else None
or None if fetching metadata failed
"""
# Per Oracle, there are short windows (measured in milliseconds) throughout
# an instance's lifetime where the IMDS is being updated and may 404 as a
# result. To work around these windows, we retry a couple of times.
metadata_version = 2
try:
instance_data = _fetch(metadata_version, path="instance")
except UrlError:
metadata_version = 1
instance_data = _fetch(metadata_version, path="instance")
# result.
urls = [
METADATA_PATTERN.format(version=2, path="instance"),
METADATA_PATTERN.format(version=1, path="instance"),
]
start_time = time.time()
instance_url, instance_response = wait_for_url(
urls,
max_wait=max_wait,
timeout=timeout,
headers_cb=_headers_cb,
sleep_time=0,
)
if not instance_url:
LOG.warning("Failed to fetch IMDS metadata!")
return None
instance_data = json.loads(instance_response.decode("utf-8"))

metadata_version = _url_version(instance_url)

vnics_data = None
if fetch_vnics_data:
try:
vnics_data = _fetch(metadata_version, path="vnics")
except UrlError:
util.logexc(LOG, "Failed to fetch IMDS network configuration!")
# This allows us to go over the max_wait time by the timeout length,
# but if we were able to retrieve instance metadata, that seems
# like a worthwhile tradeoff rather than having incomplete metadata.
vnics_url, vnics_response = wait_for_url(
[METADATA_PATTERN.format(version=metadata_version, path="vnics")],
max_wait=max_wait - (time.time() - start_time),
timeout=timeout,
headers_cb=_headers_cb,
sleep_time=0,
)
if vnics_url:
vnics_data = json.loads(vnics_response.decode("utf-8"))
else:
LOG.warning("Failed to fetch IMDS network configuration!")
return OpcMetadata(metadata_version, instance_data, vnics_data)


# Used to match classes to dependencies.
# Each entry pairs a datasource class with the tuple of dependencies it
# is registered with: DataSourceOracle lists only the filesystem, while
# DataSourceOracleNet lists both the filesystem and the network.
datasources = [
(DataSourceOracle, (sources.DEP_FILESYSTEM,)),
(
DataSourceOracleNet,
(
sources.DEP_FILESYSTEM,
sources.DEP_NETWORK,
),
),
]


Expand Down
13 changes: 12 additions & 1 deletion cloudinit/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,17 @@
import time
from base64 import b64decode
from collections import deque, namedtuple
from contextlib import suppress
from contextlib import contextmanager, suppress
from errno import ENOENT
from functools import lru_cache, total_ordering
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Callable,
Deque,
Dict,
Generator,
List,
Mapping,
Optional,
Expand Down Expand Up @@ -3293,3 +3295,12 @@ def read_hotplug_enabled_file(paths: "Paths") -> dict:
if "scopes" not in content:
content["scopes"] = []
return content


@contextmanager
def nullcontext() -> Generator[None, Any, None]:
    """Provide a no-op context manager.

    Entering yields ``None`` and nothing runs on exit, so it can stand in
    wherever a ``with`` block needs a context manager but no setup or
    teardown is wanted.

    Note: In python-3.7+, this can be substituted by contextlib.nullcontext
    """
    yield None
14 changes: 14 additions & 0 deletions doc/rtd/reference/datasources/oracle.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ to configure the non-primary network interface controllers in the system. If
set to True on an OCI Bare Metal Machine, it will have no effect (though this
may change in the future).

``max_wait``
------------

An integer, defaulting to 30. The maximum time in seconds to wait for the
metadata service to become available. If the metadata service is not
available within this time, the datasource will fail.

``timeout``
-----------
An integer, defaulting to 5. The time in seconds to wait for a response from
the metadata service before retrying.

Example configuration
---------------------

Expand All @@ -49,5 +61,7 @@ An example configuration with the default values is provided below:
datasource:
Oracle:
configure_secondary_nics: false
max_wait: 30
timeout: 5
.. _Oracle Compute Infrastructure: https://cloud.oracle.com/
1 change: 1 addition & 0 deletions tests/unittests/sources/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
MAAS.DataSourceMAAS,
NoCloud.DataSourceNoCloudNet,
OpenStack.DataSourceOpenStack,
Oracle.DataSourceOracleNet,
OVF.DataSourceOVFNet,
UpCloud.DataSourceUpCloud,
Akamai.DataSourceAkamai,
Expand Down
Loading

0 comments on commit eadaaf0

Please sign in to comment.