diff --git a/trace-normalization/src/normalize_utils.rs b/trace-normalization/src/normalize_utils.rs index 9a0d18e23..3a7e28b7b 100644 --- a/trace-normalization/src/normalize_utils.rs +++ b/trace-normalization/src/normalize_utils.rs @@ -3,10 +3,17 @@ // developed at Datadog (https://www.datadoghq.com/). Copyright 2023-Present // Datadog, Inc. +// DEFAULT_SERVICE_NAME is the default name we assign a service if it's missing and we have no reasonable fallback +const DEFAULT_SERVICE_NAME: &str = "unnamed-service"; + // MAX_NAME_LEN the maximum length a name can have pub(crate) const MAX_NAME_LEN: usize = 100; +// MAX_SERVICE_LEN the maximum length a service can have +const MAX_SERVICE_LEN: usize = 100; +// MAX_TAG_LEN the maximum length a tag can have +const MAX_TAG_LEN: usize = 200; -// TruncateUTF8 truncates the given string to make sure it uses less than limit bytes. +// truncate_utf8 truncates the given string to make sure it uses less than limit bytes. // If the last character is a utf8 character that would be split, it removes it // entirely to make sure the resulting string is not broken. pub(crate) fn truncate_utf8(s: &str, limit: usize) -> &str { @@ -25,23 +32,132 @@ pub(crate) fn truncate_utf8(s: &str, limit: usize) -> &str { s } -// NormalizeService returns a span service or an error describing why normalization failed. 
-// TODO: Implement this in a future PR -// pub fn normalize_service(svc: String, lang: String) -> (String, Option) { -// if svc == "" { -// return (fallback_service(lang), errors::NormalizeErrors::ErrorEmpty); -// } -// if svc.len() > MAX_SERVICE_LEN { -// return (truncate_utf8(svc, MAX_SERVICE_LEN), errors::NormalizeErrors::ErrorTooLong.into()); -// } -// TODO: implement tag normalization -// let s: String = normalize_tag(svc); -// if s == "" { -// return (fallbackService(lang), errors::NormalizeErrors::ErrorInvalid) -// } -// return (s, err) -// (svc, None) -// } +// fallback_service returns the fallback service name for a service +// belonging to language lang. +// In the go agent implementation, if a lang was specified in TagStats +// (extracted from the payload header) the fallback_service name would be "unnamed-{lang}-service". +pub(crate) fn fallback_service() -> String { + DEFAULT_SERVICE_NAME.to_string() +} + +// normalize_service normalizes a span service +pub(crate) fn normalize_service(svc: &str) -> anyhow::Result { + anyhow::ensure!(!svc.is_empty(), "Normalizer Error: Empty service name."); + + let truncated_service = truncate_utf8(svc, MAX_SERVICE_LEN); + + normalize_tag(truncated_service) +} + +// normalize_tag applies some normalization to ensure the tags match the backend requirements. +pub(crate) fn normalize_tag(tag: &str) -> anyhow::Result { + // Fast path: Check if the tag is valid and only contains ASCII characters, + // if yes return it as-is right away. For most use-cases this reduces CPU usage. 
+ if is_normalized_ascii_tag(tag) { + return Ok(tag.to_string()); + } + + anyhow::ensure!(!tag.is_empty(), "Normalizer Error: Empty tag name."); + + // given a dummy value + let mut last_char: char = 'a'; + + let mut result = String::with_capacity(tag.len()); + + for cur_char in tag.chars() { + if result.len() == MAX_TAG_LEN { + break; + } + if cur_char.is_lowercase() { + result.push(cur_char); + last_char = cur_char; + continue; + } + if cur_char.is_uppercase() { + let mut iter = cur_char.to_lowercase(); + if let Some(c) = iter.next() { + result.push(c); + last_char = c; + } + continue; + } + if cur_char.is_alphabetic() { + result.push(cur_char); + last_char = cur_char; + continue; + } + if cur_char == ':' { + result.push(cur_char); + last_char = cur_char; + continue; + } + if !result.is_empty() + && (cur_char.is_ascii_digit() || cur_char == '.' || cur_char == '/' || cur_char == '-') + { + result.push(cur_char); + last_char = cur_char; + continue; + } + if !result.is_empty() && last_char != '_' { + result.push('_'); + last_char = '_'; + } + } + + if last_char == '_' { + result.remove(result.len() - 1); + } + + Ok(result.to_string()) +} + +pub(crate) fn is_normalized_ascii_tag(tag: &str) -> bool { + if tag.is_empty() { + return true; + } + if tag.len() > MAX_TAG_LEN { + return false; + } + + let mut tag_iter = tag.chars(); + + match tag_iter.next() { + Some(c) => { + if !is_valid_ascii_start_char(c) { + return false; + } + } + None => return false, + } + + while let Some(cur_char) = tag_iter.next() { + if is_valid_ascii_tag_char(cur_char) { + continue; + } + if cur_char == '_' { + // an underscore is only okay if followed by a valid non-underscore character + match tag_iter.next() { + Some(c) => { + if !is_valid_ascii_tag_char(c) { + return false; + } + } + None => return false, + }; + } else { + return false; + } + } + true +} + +pub(crate) fn is_valid_ascii_start_char(c: char) -> bool { + ('a'..='z').contains(&c) || c == ':' +} + +pub(crate) fn 
is_valid_ascii_tag_char(c: char) -> bool { + is_valid_ascii_start_char(c) || ('0'..='9').contains(&c) || c == '.' || c == '/' || c == '-' +} // normalize_name normalizes a span name or an error describing why normalization failed. pub(crate) fn normalize_name(name: &str) -> anyhow::Result { @@ -56,58 +172,6 @@ pub(crate) fn normalize_name(name: &str) -> anyhow::Result { normalize_metric_names(truncated_name) } -// TODO: Implement this in a future PR -// NormalizeTag applies some normalization to ensure the tags match the backend requirements. -// pub fn normalize_tag(v: String) -> String { -// Fast path: Check if the tag is valid and only contains ASCII characters, -// if yes return it as-is right away. For most use-cases this reduces CPU usage. -// if is_normalized_ascii_tag(v.clone()) { -// return v; -// } - -// if v.is_empty() { -// return "".to_string(); -// } - -// "".to_string() -// } - -// pub fn is_normalized_ascii_tag(tag: String) -> bool { -// if tag.is_empty() { -// return true; -// } -// if tag.len() > MAX_TAG_LEN { -// return false; -// } -// if !is_valid_ascii_start_char(tag.chars().next().unwrap()) { -// return false; -// } -// for mut i in 0..tag.len() { -// let b: char = tag.chars().nth(i).unwrap(); -// if is_valid_ascii_tag_char(b) { -// continue; -// } -// if b == '_' { -// // an underscore is only okay if followed by a valid non-underscore character -// i+=1; -// if i == tag.len() || !is_valid_ascii_tag_char(tag.chars().nth(i).unwrap()) { -// return false; -// } -// } else { -// return false; -// } -// } -// true -// } - -// pub fn is_valid_ascii_start_char(c: char) -> bool { -// ('a'..='z').contains(&c) || c == ':' -// } - -// pub fn is_valid_ascii_tag_char(c: char) -> bool { -// is_valid_ascii_start_char(c) || ('0'..='9').contains(&c) || c == '.' 
|| c == '/' || c == '-' -// } - pub(crate) fn normalize_metric_names(name: &str) -> anyhow::Result { let mut result = String::with_capacity(name.len()); @@ -190,4 +254,80 @@ mod tests { } } } + + #[duplicate_item( + test_name input expected expected_err; + [test_normalize_empty_service] [""] [normalize_utils::DEFAULT_SERVICE_NAME] ["Normalizer Error: Empty service name."]; + [test_normalize_valid_service] ["good"] ["good"] [""]; + [test_normalize_long_service] ["Too$Long$.".repeat(20).as_str()] ["too_long_.".repeat(10)] [""]; + [test_normalize_dash_service] ["bad&service"] ["bad_service"] [""]; + )] + #[test] + fn test_name() { + match normalize_utils::normalize_service(input) { + Ok(val) => { + assert_eq!(expected_err, ""); + assert_eq!(val, expected) + } + Err(err) => { + assert_eq!(format!("{err}"), expected_err); + } + } + } + #[duplicate_item( + test_name input expected expected_err; + [test_normalize_tag_1] ["#test_starting_hash"] ["test_starting_hash"] [""]; + [test_normalize_tag_2] ["TestCAPSandSuch"] ["testcapsandsuch"] [""]; + [test_normalize_tag_3] ["Test Conversion Of Weird !@#$%^&**() Characters"] ["test_conversion_of_weird_characters"] [""]; + [test_normalize_tag_4] ["$#weird_starting"] ["weird_starting"] [""]; + [test_normalize_tag_5] ["allowed:c0l0ns"] ["allowed:c0l0ns"] [""]; + [test_normalize_tag_6] ["1love"] ["love"] [""]; + [test_normalize_tag_7] ["ünicöde"] ["ünicöde"] [""]; + [test_normalize_tag_8] ["ünicöde:metäl"] ["ünicöde:metäl"] [""]; + [test_normalize_tag_9] ["Data🐨dog🐶 繋がっ⛰てて"] ["data_dog_繋がっ_てて"] [""]; + [test_normalize_tag_10] [" spaces "] ["spaces"] [""]; + [test_normalize_tag_11] [" #hashtag!@#spaces #__<># "] ["hashtag_spaces"] [""]; + [test_normalize_tag_12] [":testing"] [":testing"] [""]; + [test_normalize_tag_13] ["_foo"] ["foo"] [""]; + [test_normalize_tag_14] [":::test"] [":::test"] [""]; + [test_normalize_tag_15] ["contiguous_____underscores"] ["contiguous_underscores"] [""]; + [test_normalize_tag_16] ["foo_"] ["foo"] [""]; + 
[test_normalize_tag_17] ["\u{017F}odd_\u{017F}case\u{017F}"] ["\u{017F}odd_\u{017F}case\u{017F}"] [""]; // edge-case + [test_normalize_tag_18] [""] [""] [""]; + [test_normalize_tag_19] [" "] [""] [""]; + [test_normalize_tag_20] ["ok"] ["ok"] [""]; + [test_normalize_tag_21] ["™Ö™Ö™™Ö™"] ["ö_ö_ö"] [""]; + [test_normalize_tag_22] ["AlsO:ök"] ["also:ök"] [""]; + [test_normalize_tag_23] [":still_ok"] [":still_ok"] [""]; + [test_normalize_tag_24] ["___trim"] ["trim"] [""]; + [test_normalize_tag_25] ["12.:trim@"] [":trim"] [""]; + [test_normalize_tag_26] ["12.:trim@@"] [":trim"] [""]; + [test_normalize_tag_27] ["fun:ky__tag/1"] ["fun:ky_tag/1"] [""]; + [test_normalize_tag_28] ["fun:ky@tag/2"] ["fun:ky_tag/2"] [""]; + [test_normalize_tag_29] ["fun:ky@@@tag/3"] ["fun:ky_tag/3"] [""]; + [test_normalize_tag_30] ["tag:1/2.3"] ["tag:1/2.3"] [""]; + [test_normalize_tag_31] ["---fun:k####y_ta@#g/1_@@#"]["fun:k_y_ta_g/1"] [""]; + [test_normalize_tag_32] ["AlsO:œ#@ö))œk"] ["also:œ_ö_œk"] [""]; + [test_normalize_tag_33] ["a".repeat(888).as_str()] ["a".repeat(200)] [""]; + [test_normalize_tag_34] [("a".to_owned() + &"🐶".repeat(799)).as_str()] ["a"] [""]; + [test_normalize_tag_35] [("a".to_string() + &char::REPLACEMENT_CHARACTER.to_string()).as_str()] ["a"] [""]; + [test_normalize_tag_36] [("a".to_string() + &char::REPLACEMENT_CHARACTER.to_string() + &char::REPLACEMENT_CHARACTER.to_string()).as_str()] ["a"] [""]; + [test_normalize_tag_37] [("a".to_string() + &char::REPLACEMENT_CHARACTER.to_string() + &char::REPLACEMENT_CHARACTER.to_string() + "b").as_str()] ["a_b"] [""]; + [test_normalize_tag_38] + ["A00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 000000000000"] + 
["a00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000_0"] + [""]; + )] + #[test] + fn test_name() { + match normalize_utils::normalize_tag(input) { + Ok(normalized_tag) => { + assert_eq!(expected_err, ""); + assert_eq!(normalized_tag, expected) + } + Err(err) => { + assert_eq!(format!("{err}"), expected_err); + } + } + } } diff --git a/trace-normalization/src/normalizer.rs b/trace-normalization/src/normalizer.rs index 099904d9d..345ad80a3 100644 --- a/trace-normalization/src/normalizer.rs +++ b/trace-normalization/src/normalizer.rs @@ -21,11 +21,14 @@ pub fn normalize(s: &mut pb::Span) -> anyhow::Result<()> { anyhow::ensure!(s.trace_id != 0, "TraceID is zero (reason:trace_id_zero)"); anyhow::ensure!(s.span_id != 0, "SpanID is zero (reason:span_id_zero)"); - // TODO: Implement service name normalizer in future PR - // let (svc, _) = normalize_utils::normalize_service(s.service.clone(), "".to_string()); - // s.service = svc; + let normalized_service = match normalize_utils::normalize_service(&s.service) { + Ok(service) => service, + Err(_) => normalize_utils::fallback_service(), + }; + + s.service = normalized_service; - // TODO: check for a feature flag to determine the component tag to become the span name + // TODO: component2name: check for a feature flag to determine the component tag to become the span name // https://github.com/DataDog/datadog-agent/blob/dc88d14851354cada1d15265220a39dce8840dcc/pkg/trace/agent/normalizer.go#L64 let normalized_name = match normalize_utils::normalize_name(&s.name) { @@ -76,14 +79,16 @@ pub fn normalize(s: &mut pb::Span) -> anyhow::Result<()> { s.r#type = normalize_utils::truncate_utf8(&s.r#type, MAX_TYPE_LEN).to_string(); } - // TODO: Implement tag normalization in future PR - // if s.meta.contains_key("env") { - // let env_tag: String = s.meta.get("env").unwrap().to_string(); - // 
s.meta.insert("env".to_string(), normalize_utils::normalize_tag(env_tag)); - // } + if s.meta.contains_key("env") { + if let Some(env_tag) = s.meta.get("env") { + if let Ok(normalized_tag) = normalize_utils::normalize_tag(env_tag) { + s.meta.insert("env".to_string(), normalized_tag); + } + } + }; if let Some(code) = s.meta.get("http.status_code") { - if !is_valid_status_code(code.to_string()) { + if !is_valid_status_code(code) { s.meta.remove("http.status_code"); } }; @@ -91,7 +96,7 @@ pub fn normalize(s: &mut pb::Span) -> anyhow::Result<()> { Ok(()) } -pub(crate) fn is_valid_status_code(sc: String) -> bool { +pub(crate) fn is_valid_status_code(sc: &str) -> bool { if let Ok(code) = sc.parse::() { return (100..600).contains(&code); } @@ -106,8 +111,9 @@ mod tests { use crate::pb; use rand::Rng; use std::collections::HashMap; + use std::time::SystemTime; - pub fn new_test_span() -> pb::Span { + fn new_test_span() -> pb::Span { let mut rng = rand::thread_rng(); pb::Span { @@ -131,7 +137,7 @@ mod tests { } #[test] - pub fn test_normalize_name_passes() { + fn test_normalize_name_passes() { let mut test_span = new_test_span(); let before_name = test_span.name.clone(); assert!(normalizer::normalize(&mut test_span).is_ok()); @@ -139,7 +145,7 @@ mod tests { } #[test] - pub fn test_normalize_empty_name() { + fn test_normalize_empty_name() { let mut test_span = new_test_span(); test_span.name = "".to_string(); assert!(normalizer::normalize(&mut test_span).is_ok()); @@ -147,7 +153,7 @@ mod tests { } #[test] - pub fn test_normalize_long_name() { + fn test_normalize_long_name() { let mut test_span = new_test_span(); test_span.name = "CAMEMBERT".repeat(100); assert!(normalizer::normalize(&mut test_span).is_ok()); @@ -155,7 +161,7 @@ mod tests { } #[test] - pub fn test_normalize_name_no_alphanumeric() { + fn test_normalize_name_no_alphanumeric() { let mut test_span = new_test_span(); test_span.name = "/".to_string(); assert!(normalizer::normalize(&mut test_span).is_ok()); @@ 
-163,7 +169,7 @@ mod tests { } #[test] - pub fn test_normalize_name_for_metrics() { + fn test_normalize_name_for_metrics() { let expected_names = HashMap::from([ ( "pylons.controller".to_string(), @@ -184,7 +190,7 @@ mod tests { } #[test] - pub fn test_normalize_resource_passes() { + fn test_normalize_resource_passes() { let mut test_span = new_test_span(); let before_resource = test_span.resource.clone(); assert!(normalizer::normalize(&mut test_span).is_ok()); @@ -192,10 +198,237 @@ mod tests { } #[test] - pub fn test_normalize_empty_resource() { + fn test_normalize_empty_resource() { let mut test_span = new_test_span(); test_span.resource = "".to_string(); assert!(normalizer::normalize(&mut test_span).is_ok()); assert_eq!(test_span.resource, test_span.name); } + + #[test] + fn test_normalize_trace_id_passes() { + let mut test_span = new_test_span(); + let before_trace_id = test_span.trace_id; + assert!(normalizer::normalize(&mut test_span).is_ok()); + assert_eq!(before_trace_id, test_span.trace_id); + } + + #[test] + fn test_normalize_no_trace_id() { + let mut test_span = new_test_span(); + test_span.trace_id = 0; + assert!(normalizer::normalize(&mut test_span).is_err()); + } + + #[test] + fn test_normalize_component_to_name() { + let mut test_span = new_test_span(); + let before_trace_id = test_span.trace_id; + assert!(normalizer::normalize(&mut test_span).is_ok()); + assert_eq!(before_trace_id, test_span.trace_id); + } + + // TODO: Add a unit test for testing Component2Name, one that is + // implemented within the normalize function. 
// A non-zero span id is expected to survive normalization unchanged.
#[test]
fn test_normalize_span_id_passes() {
    let mut test_span = new_test_span();
    let before_span_id = test_span.span_id;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(before_span_id, test_span.span_id);
}

// A span id of zero is expected to make normalization fail.
#[test]
fn test_normalize_no_span_id() {
    let mut test_span = new_test_span();
    test_span.span_id = 0;

    assert!(normalizer::normalize(&mut test_span).is_err());
}

// The start timestamp of the default test span is expected to be left as-is.
#[test]
fn test_normalize_start_passes() {
    let mut test_span = new_test_span();
    let before_start = test_span.start;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(before_start, test_span.start);
}

// get_current_time returns the current wall-clock time as nanoseconds since
// the Unix epoch. It panics with an explicit message if the clock is set
// before the epoch or if the value does not fit in an i64, instead of silently
// truncating.
fn get_current_time() -> i64 {
    use std::convert::TryFrom;

    let nanos = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)
        .expect("system clock is set before the Unix epoch")
        .as_nanos();

    i64::try_from(nanos).expect("nanoseconds since the Unix epoch do not fit in an i64")
}

// A start timestamp far in the past is expected to be replaced by a value
// between (now - duration) and now.
#[test]
fn test_normalize_start_too_small() {
    let mut test_span = new_test_span();

    test_span.start = 42;
    let min_start = get_current_time() - test_span.duration;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert!(test_span.start >= min_start);
    assert!(test_span.start <= get_current_time());
}

// Same as above, but with a duration larger than the current time: the start
// is expected to be reset to the current time.
#[test]
fn test_normalize_start_too_small_with_large_duration() {
    let mut test_span = new_test_span();

    test_span.start = 42;
    test_span.duration = get_current_time() * 2;
    let min_start = get_current_time();

    assert!(normalizer::normalize(&mut test_span).is_ok());
    // start should have been reset to current time
    assert!(test_span.start >= min_start);
    assert!(test_span.start <= get_current_time());
}

// The duration of the default test span is expected to be left as-is.
#[test]
fn test_normalize_duration_passes() {
    let mut test_span = new_test_span();
    let before_duration = test_span.duration;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(before_duration, test_span.duration);
}

// A zero duration is expected to stay zero.
#[test]
fn test_normalize_empty_duration() {
    let mut test_span = new_test_span();
    test_span.duration = 0;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(test_span.duration, 0);
}

// A negative duration is expected to be reset to zero.
#[test]
fn test_normalize_negative_duration() {
    let mut test_span = new_test_span();
    test_span.duration = -50;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(test_span.duration, 0);
}

// The largest representable duration is expected to be reset to zero.
#[test]
fn test_normalize_large_duration() {
    let mut test_span = new_test_span();
    test_span.duration = i64::MAX;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(test_span.duration, 0);
}

// The error field is expected to be left untouched.
#[test]
fn test_normalize_error_passes() {
    let mut test_span = new_test_span();
    let before_error = test_span.error;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(before_error, test_span.error);
}

// The metrics map of the default test span is expected to be left untouched.
#[test]
fn test_normalize_metrics_passes() {
    let mut test_span = new_test_span();
    let before_metrics = test_span.metrics.clone();

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(before_metrics, test_span.metrics);
}

// The meta map of the default test span is expected to be left untouched.
#[test]
fn test_normalize_meta_passes() {
    let mut test_span = new_test_span();
    let before_meta = test_span.meta.clone();

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(before_meta, test_span.meta);
}

// The parent id of the default test span is expected to be left untouched.
#[test]
fn test_normalize_parent_id_passes() {
    let mut test_span = new_test_span();
    let before_parent_id = test_span.parent_id;

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(before_parent_id, test_span.parent_id);
}

// The type of the default test span is expected to be left untouched.
#[test]
fn test_normalize_type_passes() {
    let mut test_span = new_test_span();
    let before_type = test_span.r#type.clone();

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(before_type, test_span.r#type);
}

// An over-long type is expected to be truncated to exactly MAX_TYPE_LEN bytes.
#[test]
fn test_normalize_type_too_long() {
    let mut test_span = new_test_span();
    test_span.r#type = "sql".repeat(1000);

    assert!(normalizer::normalize(&mut test_span).is_ok());
    assert_eq!(test_span.r#type.len(), normalizer::MAX_TYPE_LEN);
}
#[test] + fn test_normalize_service_tag() { + let mut test_span = new_test_span(); + test_span.service = "retargeting(api-Staging ".to_string(); + + assert!(normalizer::normalize(&mut test_span).is_ok()); + assert_eq!(test_span.service, "retargeting_api-staging"); + } + + #[test] + fn test_normalize_env() { + let mut test_span = new_test_span(); + test_span + .meta + .insert("env".to_string(), "DEVELOPMENT".to_string()); + + assert!(normalizer::normalize(&mut test_span).is_ok()); + assert_eq!("development", test_span.meta.get("env").unwrap()); + } + + #[test] + fn test_special_zipkin_root_span() { + let mut test_span = new_test_span(); + test_span.parent_id = 42; + test_span.trace_id = 42; + test_span.span_id = 42; + + let before_trace_id = test_span.trace_id; + let before_span_id = test_span.span_id; + + assert!(normalizer::normalize(&mut test_span).is_ok()); + assert_eq!(test_span.parent_id, 0); + assert_eq!(test_span.trace_id, before_trace_id); + assert_eq!(test_span.span_id, before_span_id); + } + + #[test] + fn test_normalize_trace_empty() { + let mut test_span = new_test_span(); + test_span + .meta + .insert("env".to_string(), "DEVELOPMENT".to_string()); + + assert!(normalizer::normalize(&mut test_span).is_ok()); + assert_eq!("development", test_span.meta.get("env").unwrap()); + } + + #[test] + fn test_is_valid_status_code() { + assert!(normalizer::is_valid_status_code("100")); + assert!(normalizer::is_valid_status_code("599")); + assert!(!normalizer::is_valid_status_code("99")); + assert!(!normalizer::is_valid_status_code("600")); + assert!(!normalizer::is_valid_status_code("Invalid status code")); + } }