From 485abcdc6f83b2a0281eecd4f8df6c37070f2d13 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 15 Mar 2023 08:40:42 -0400 Subject: [PATCH] syntax: \p{Lc} should map to \p{Cased_Letter} This is more similar to the \p{Cf} bug than the \p{Sc} bug, but basically, 'lc' is an abbreviation for both 'Cased_Letter' and 'Lowercase_Mapping'. Since we don't support the latter (currently), we make 'lc' map to 'Cased_Letter'. If we do ever add 'Lowercase_Mapping' in the future, then we will just require users to type out its full form. Fixes #965 --- regex-syntax/src/unicode.rs | 7 ++++++- tests/unicode.rs | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 84e781db4..5c22f66ac 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -248,7 +248,12 @@ impl<'a> ClassQuery<'a> { // also the abbreviation for the 'Script' property. So we avoid calling // 'canonical_prop' for it too, which would erroneously normalize it // to 'Script'. - if norm != "cf" && norm != "sc" { + // + // Another case: 'lc' is an abbreviation for the 'Cased_Letter' + // general category, but is also an abbreviation for the 'Lowercase_Mapping' + // property. We don't currently support the latter, so as with 'cf' + // above, we treat 'lc' as 'Cased_Letter'. + if norm != "cf" && norm != "sc" && norm != "lc" { if let Some(canon) = canonical_prop(&norm)? { return Ok(CanonicalClassQuery::Binary(canon)); } diff --git a/tests/unicode.rs b/tests/unicode.rs index 748bbb79c..d7dbdd31b 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -35,6 +35,8 @@ mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); // We should test more, but there's a lot. Write a script to generate more of // these tests. 
mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "Ａ", Some((0, 3))); +mat!(uni_class_gencat_cased_letter2, r"\p{gc=LC}", "Ａ", Some((0, 3))); +mat!(uni_class_gencat_cased_letter3, r"\p{LC}", "Ａ", Some((0, 3))); mat!( uni_class_gencat_close_punctuation, r"\p{Close_Punctuation}",