diff --git a/src/uu/join/BENCHMARKING.md b/src/uu/join/BENCHMARKING.md
index 80814ed1cb..988259aa73 100644
--- a/src/uu/join/BENCHMARKING.md
+++ b/src/uu/join/BENCHMARKING.md
@@ -55,7 +55,7 @@ The following options can have a non-trivial impact on performance:
 
 - `-a`/`-v` if one of the two files has significantly more lines than the other
 - `-j`/`-1`/`-2` cause work to be done to grab the appropriate field
-- `-i` adds a call to `to_ascii_lowercase()` that adds some time for allocating and dropping memory for the lowercase key
+- `-i` uses our custom code for case-insensitive text comparisons
 - `--nocheck-order` causes some calls of `Input::compare` to be skipped
 
 The content of the files being joined has a very significant impact on the performance.
diff --git a/src/uu/join/src/join.rs b/src/uu/join/src/join.rs
index e4f3cdba22..e7bc7da696 100644
--- a/src/uu/join/src/join.rs
+++ b/src/uu/join/src/join.rs
@@ -288,6 +288,40 @@ impl<'a, Sep: Separator> Repr<'a, Sep> {
     }
 }
 
+/// Byte slice wrapper whose Ord implementation is case-insensitive on ASCII.
+#[derive(Eq)]
+struct CaseInsensitiveSlice<'a> {
+    v: &'a [u8],
+}
+
+impl Ord for CaseInsensitiveSlice<'_> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        if let Some((s, o)) =
+            std::iter::zip(self.v.iter(), other.v.iter()).find(|(s, o)| !s.eq_ignore_ascii_case(o))
+        {
+            // first characters that differ, return the case-insensitive comparison
+            let s = s.to_ascii_lowercase();
+            let o = o.to_ascii_lowercase();
+            s.cmp(&o)
+        } else {
+            // one of the strings is a prefix of, or equal to, the other (ignoring ASCII case)
+            self.v.len().cmp(&other.v.len())
+        }
+    }
+}
+
+impl PartialOrd for CaseInsensitiveSlice<'_> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl PartialEq for CaseInsensitiveSlice<'_> {
+    fn eq(&self, other: &Self) -> bool {
+        self.v.eq_ignore_ascii_case(other.v)
+    }
+}
+
 /// Input processing parameters.
 struct Input<Sep: Separator> {
     separator: Sep,
@@ -307,9 +341,9 @@ impl<Sep: Separator> Input<Sep> {
     fn compare(&self, field1: Option<&[u8]>, field2: Option<&[u8]>) -> Ordering {
         if let (Some(field1), Some(field2)) = (field1, field2) {
             if self.ignore_case {
-                field1
-                    .to_ascii_lowercase()
-                    .cmp(&field2.to_ascii_lowercase())
+                let field1 = CaseInsensitiveSlice { v: field1 };
+                let field2 = CaseInsensitiveSlice { v: field2 };
+                field1.cmp(&field2)
             } else {
                 field1.cmp(field2)
             }