diff --git a/src/oomd/cfgen/src/cfgen.rs b/src/oomd/cfgen/src/cfgen.rs index 6cdc9e62..2702a07d 100644 --- a/src/oomd/cfgen/src/cfgen.rs +++ b/src/oomd/cfgen/src/cfgen.rs @@ -67,18 +67,20 @@ fn devserver_json_config(node: &Node, attrs: &ConfigParams) -> json::JsonValue { } } -fn od_json_config(_attrs: &ConfigParams) -> json::JsonValue { - // TODO(chengxiong): implement this. - json::object! {} +fn od_json_config(attrs: &ConfigParams) -> json::JsonValue { + let mut rulesets = json::Array::new(); + rulesets.push(rule_system_overview(attrs)); + rulesets.push(rule_protection_against_high_memory_pressure(attrs)); + rulesets.append(&mut rules_restart_cgroup_on_mem_threshold(attrs)); + rulesets.push(rule_senpai_drop_in_ruleset(attrs)); + rulesets.push(rule_od_protection_against_low_swap(attrs)); + json::object! { + "rulesets": rulesets, + "version": CONFIG_VERSION, + } } fn rule_system_overview(attrs: &ConfigParams) -> json::JsonValue { - let cgroup = if [HostType::DevServer, HostType::OnDemand].contains(&attrs.host_type) { - attrs.oomd2.oomd_target.as_str() - } else { - "workload.slice" - }; - let mut rule = json::object! { "name": "system overview", "silence-logs": "engine", @@ -88,7 +90,7 @@ fn rule_system_overview(attrs: &ConfigParams) -> json::JsonValue { { "name": "dump_cgroup_overview", "args": { - "cgroup": cgroup, + "cgroup": attrs.oomd2.oomd_target.as_str(), } } ] @@ -243,7 +245,7 @@ fn rule_protection_against_heavy_workload_thrashing_detectors( } _ = slow_growing_mem_pressure_detector.push(json::object! { - "name": "pressure_rising_beyong", + "name": "pressure_rising_beyond", "args": { "cgroup": attrs.fbtax2.workload_monitoring_slice.as_str(), "resource": "memory", @@ -416,7 +418,7 @@ fn rule_senpai_ruleset(attrs: &ConfigParams) -> json::JsonValue { fn rule_senpai_drop_in_ruleset(attrs: &ConfigParams) -> json::JsonValue { json::object! { "name": "senpai drop-in ruleset", - "silence-logs": "engine", + "silence-logs": if attrs.host_type == HostType::OnDemand {"engine,plugins"} else {"engine"}, "drop-in": { "actions": true, "disable-on-drop-in": true, @@ -566,6 +568,97 @@ fn rule_user_session_protection(node: &Node, attrs: &ConfigParams) -> json::Json } } +fn rule_protection_against_high_memory_pressure(attrs: &ConfigParams) -> json::JsonValue { + json::object! { + "name": "protection against high memory pressure", + "drop-in": { + "detectors": true, + "actions": true, + "disable-on-drop-in": attrs.oomd2.oomd_disable_on_drop_in, + }, + "detectors": [ + [ + "detects fast growing memory pressure", + { + "name": attrs.oomd2.plugins["pressure_above"].as_str(), + "args": { + "cgroup": attrs.oomd2.oomd_target.as_str(), + "resource": "memory", + "threshold": attrs.oomd2.oomd_high_threshold.as_str(), + "duration": attrs.oomd2.oomd_high_threshold_duration.as_str(), + } + }, + { + "name": attrs.oomd2.plugins["memory_reclaim"].as_str(), + "args": { + "cgroup": attrs.oomd2.oomd_target.as_str(), + "duration": attrs.oomd2.oomd_reclaim_duation.as_str(), + } + } + ], + [ + "detects slow growing memory pressure", + { + "name": attrs.oomd2.plugins["pressure_rising_beyond"].as_str(), + "args": { + "cgroup": attrs.oomd2.oomd_target.as_str(), + "resource": "memory", + "threshold": attrs.oomd2.oomd_threshold.as_str(), + "duration": attrs.oomd2.oomd_threshold_duration.as_str(), + } + }, + { + "name": attrs.oomd2.plugins["memory_reclaim"].as_str(), + "args": { + "cgroup": attrs.oomd2.oomd_target.as_str(), + "duration": attrs.oomd2.oomd_reclaim_duation.as_str(), + } + } + ] + ], + "actions": [ + { + "name": attrs.oomd2.plugins["kill_by_memory_size_or_growth"].as_str(), + "args": { + "cgroup": attrs.oomd2.oomd_action_target.as_str(), + "dry": if attrs.oomd2.oomd_dry { "true" } else {"false"}, + } + } + ] + } +} + +fn rule_od_protection_against_low_swap(attrs: &ConfigParams) -> json::JsonValue { + json::object! { + "name": "protection against low swap", + "drop-in": { + "detectors": true, + "actions": true, + "disable-on-drop-in": attrs.oomd2.oomd_disable_on_drop_in, + }, + "detectors": [ + [ + "free swap goes below 5 percent", + { + "name": attrs.oomd2.plugins["swap_free"].as_str(), + "args": { + "threshold_pct": "5", + } + } + ] + ], + "actions": [ + { + "name": attrs.oomd2.plugins["kill_by_swap_usage"].as_str(), + "args": { + "cgroup": attrs.oomd2.oomd_action_target.as_str(), + "dry": if attrs.oomd2.oomd_dry { "true" } else {"false"}, + } + } + ] + } +} + fn get_attributes(node: &Node) -> ConfigParams { ConfigParams { host_type: get_host_type(node), @@ -597,14 +690,13 @@ fn get_attributes(node: &Node) -> ConfigParams { "senpai" => "senpai", )), oomd_dry: true, - oomd_disable_on_drop_in: false, - oomd_target: String::from("system.slice"), - oomd_action_target: String::from("system.slice"), + oomd_disable_on_drop_in: true, + oomd_target: oomd2_oomd_target(node), + oomd_action_target: String::from("system.slice/*"), oomd_high_threshold: String::from("80"), oomd_high_threshold_duration: String::from("60"), oomd_threshold: String::from("60"), oomd_threshold_duration: String::from("90"), - oomd_min_swap_pct: String::from("15"), oomd_restart_threshold: oomd2_oomd_restart_threshold(), oomd_reclaim_duation: String::from("10"), oomd_post_action_delay: String::from("15"), @@ -625,7 +717,7 @@ fn get_attributes(node: &Node) -> ConfigParams { memory_high_timeout_ms: String::from("20"), scuba_logger_dataset: String::from("perfpipe_senpai_events"), }, - disable_senpai_dropin: false, + disable_senpai_dropin: disable_senpai_dropin(node), } } @@ -707,12 +799,33 @@ fn senpai_limit_min_bytes(node: &Node) -> Option { None } +fn oomd2_oomd_target(node: &Node) -> String { + match get_host_type(node) { + HostType::DevServer => String::from("system.slice"), + HostType::OnDemand => { + String::from("system.slice,workload.slice/workload-tw.slice/quicksand*.service") + } + _ => String::from("workload.slice"), + } +} + +fn disable_senpai_dropin(node: &Node) -> bool { + if get_host_type(node) == HostType::OnDemand { + return true; + } + false +} + fn get_host_type(node: &Node) -> HostType { // TODO(chengxiong): add logic to determine host types. if node.hostname_prefix() == "twshared".into() { return HostType::TwShared; } + if node.hostname_prefix() == "od".into() { + return HostType::OnDemand; + } + if node.is_devserver() { return HostType::DevServer; } @@ -736,6 +849,7 @@ mod tests { #[rstest] #[case::shard99("twshared2434.02.cco1", HostType::TwShared)] #[case::shard99("devvm3170.cln0", HostType::DevServer)] + #[case::shard99("od2228.eag1", HostType::OnDemand)] fn test_get_host_type(#[case] hostname: &str, #[case] expected: HostType) { let node = FakeNodeBuilder::new().hostname(hostname).build(); assert_eq!(get_host_type(&node), expected); diff --git a/src/oomd/cfgen/src/types.rs b/src/oomd/cfgen/src/types.rs index 83cdb1e6..72ebb27e 100644 --- a/src/oomd/cfgen/src/types.rs +++ b/src/oomd/cfgen/src/types.rs @@ -34,7 +34,7 @@ pub struct FBTax2Attributes { } pub struct Oomd2Attributes { - pub blacklisted_jobs: Vec, + pub blacklisted_jobs: Vec<&'static str>, pub disable_swap_protection: bool, pub kill_target: String, pub plugins: BTreeMap, @@ -46,7 +46,6 @@ pub struct Oomd2Attributes { pub oomd_high_threshold_duration: String, pub oomd_threshold: String, pub oomd_threshold_duration: String, - pub oomd_min_swap_pct: String, pub oomd_restart_threshold: BTreeMap, pub oomd_reclaim_duation: String, pub oomd_post_action_delay: String, diff --git a/src/oomd/cfgen/test/cfgen_test_inputs/ondemand.json b/src/oomd/cfgen/test/cfgen_test_inputs/ondemand.json new file mode 100644 index 00000000..937aee44 --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_inputs/ondemand.json @@ -0,0 +1,93 @@ +@generated SignedSource<> +@codegen-command arc cfgen update-inputs fb-oomd +{ + "fqdn": "od2228.eag1.facebook.com", + "region": "utah", + "clusterType": "SERVICE_GENERIC_NON_MEMCACHE", + "modelId": 341072, + "kernelRelease": "5.19.0-0_fbk12_11583_g0bef9520ca2b", + "serverType": "TYPE_X_SEARCH", + "experiments": [], + "cpuArchitecture": "cooperlake", + "metalosRootfs": false, + "provisioningConfig": { + "ethtoolByInterface": { + "eth0": { + "maxChannelsCombined": 52 + } + }, + "cpuCoreCount": 26, + "parentModelId": 338998, + "recoveryEnvironment": false, + "deviceType": "SERVER", + "datacenter": "eag1", + "cluster": "05", + "memTotal": 66870956032, + "osVersion": { + "distribution_name": "CentOS Stream release", + "version": 9, + "is_in_ramdisk": false, + "is_metalos": false + }, + "pciByAddress": { + "0000:65:00.0": { + "vendor_id": 5555, + "device_id": 4125, + "class_code": 131072, + "board_part_number": "MCX623435MC-CDAE_FB" + } + }, + "static_smc_tiers": [], + "machine": "x86_64" + }, + "bootConfig": { + "ethtoolByInterface": { + "eth0": { + "driver": "mlx5_core", + "driver_version": "5.19.0-0_fbk12_11583_g0bef9520c", + "firmware_version": "22.32.1206 (FB_0000000018)", + "bus_info": "0000:65:00.0" + } + } + }, + "runtimeConfig": { + "hasHighPrivCert": true, + "regionRoutableCluster": "eag1.02", + "block_devices": { + "block_devices": { + "nvme0n1": { + "size_bytes": 256055095296, + "is_rotational": false, + "model": "HFS512GDE9X083N", + "serial": "2621CDA6N79781110H6O", + "physical_block_size": 512, + "logical_block_size": 512, + "is_root": true + }, + "nvme1n1": { + "size_bytes": 1800360124416, + "is_rotational": false, + "model": "MZOL21T9HCJR-00AFB", + "serial": "S5X8NG0T524955", + "physical_block_size": 4096, + "logical_block_size": 4096, + "is_root": false + } + } + }, + "dynamic_smc_tiers": [], + "cluster_state": "CLUSTER_IN_USE", + "installed_platforms": [ + "platform010", + "platform010-compat" + ], + "device_nics_enum": [ + "ETH0", + "SVC0" + ] + }, + "reservationConfig": { + "active_machine_materialization_id": "", + "current_reservation_host_profile_id": "NEWLY_PROVISIONED_PROFILE" + } +} diff --git a/src/oomd/cfgen/test/cfgen_test_manifest.yml b/src/oomd/cfgen/test/cfgen_test_manifest.yml index 1c4b37a5..fb04a6d1 100644 --- a/src/oomd/cfgen/test/cfgen_test_manifest.yml +++ b/src/oomd/cfgen/test/cfgen_test_manifest.yml @@ -12,3 +12,7 @@ library_samples: - devvm - twshared_vll_shard00 # Add more samples from https://fburl.com/code/vjwmkoa1 if needed +samples: + ondemand: + # A random host with od hostname prefix. + production_host: od2228.eag1.facebook.com diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/50-change-propagator.conf b/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/50-change-propagator.conf new file mode 100644 index 00000000..240e90fc --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/50-change-propagator.conf @@ -0,0 +1,4 @@ +@generated SignedSource<<31b3f2f747768088bd5523d8e690bfac>> +@codegen-command arc cfgen update-outputs fb-oomd +[Service] +[Unit] diff --git a/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/oomd2.json b/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/oomd2.json new file mode 100644 index 00000000..4fda3f59 --- /dev/null +++ b/src/oomd/cfgen/test/cfgen_test_outputs/ondemand/oomd2.json @@ -0,0 +1,169 @@ +@generated SignedSource<<60b23c1a40e237850d38e23d98d5a1bb>> +@codegen-command arc cfgen update-outputs fb-oomd +{ + "rulesets": [ + { + "name": "system overview", + "silence-logs": "engine", + "detectors": [ + [ + "records system stats", + { + "name": "dump_cgroup_overview", + "args": { + "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service" + } + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ], + "drop-in": { + "detectors": true, + "actions": true + } + }, + { + "name": "protection against high memory pressure", + "drop-in": { + "detectors": true, + "actions": true, + "disable-on-drop-in": true + }, + "detectors": [ + [ + "detects fast growing memory pressure", + { + "name": "pressure_above", + "args": { + "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service", + "resource": "memory", + "threshold": "80", + "duration": "60" + } + }, + { + "name": "memory_reclaim", + "args": { + "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service", + "duration": "10" + } + } + ], + [ + "detects slow growing memory pressure", + { + "name": "pressure_rising_beyond", + "args": { + "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service", + "resource": "memory", + "threshold": "60", + "duration": "90" + } + }, + { + "name": "memory_reclaim", + "args": { + "cgroup": "system.slice,workload.slice/workload-tw.slice/quicksand*.service", + "duration": "10" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_memory_size_or_growth", + "args": { + "cgroup": "system.slice/*", + "dry": "true" + } + } + ] + }, + { + "name": "restart smc_proxy.service on memory threshold", + "detectors": [ + [ + "memory usage above", + { + "name": "memory_above", + "args": { + "cgroup": "smc_proxy.service", + "threshold_anon": "15G", + "duration": "10" + } + } + ] + ], + "actions": [ + { + "name": "systemd_restart", + "args": { + "service": "smc_proxy.service", + "post_action_delay": "20", + "dry": "false" + } + } + ] + }, + { + "name": "senpai drop-in ruleset", + "silence-logs": "engine,plugins", + "drop-in": { + "actions": true, + "disable-on-drop-in": true + }, + "detectors": [ + [ + "stop detector group", + { + "name": "exists", + "args": { + "cgroup": "/", + "negate": true + } + } + ] + ], + "actions": [ + { + "name": "continue", + "args": {} + } + ] + }, + { + "name": "protection against low swap", + "drop-in": { + "detectors": true, + "actions": true, + "disable-on-drop-in": true + }, + "detectors": [ + [ + "free swap goes below 5 percent", + { + "name": "swap_free", + "args": { + "threshold_pct": "5" + } + } + ] + ], + "actions": [ + { + "name": "kill_by_swap_usage", + "args": { + "cgroup": "system.slice/*", + "dry": "true" + } + } + ] + } + ], + "version": "1.0.0" +} \ No newline at end of file