Skip to content

Commit

Permalink
Implement RegExp v flag (#90)
Browse files Browse the repository at this point in the history
* Implement RegExp `v` flag
  • Loading branch information
raskad committed Feb 26, 2024
1 parent 2ab0dfb commit 0108770
Show file tree
Hide file tree
Showing 22 changed files with 33,622 additions and 19,300 deletions.
2 changes: 2 additions & 0 deletions gen-unicode/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ This crate generates unicode tables and code specific for regress.
cd /tmp/ucd-15.0.0
curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip
unzip UCD.zip
curl -LO https://www.unicode.org/Public/emoji/15.0/emoji-sequences.txt
curl -LO https://www.unicode.org/Public/emoji/15.0/emoji-zwj-sequences.txt
```

2. Run this crate and redirect the output into the corresponding `.rs` file in the regress crate:
Expand Down
124 changes: 60 additions & 64 deletions gen-unicode/src/binary_properties.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{codepoints_to_range, codepoints_to_ranges, pack_adjacent_codepoints, GenUnicode};
use crate::{codepoints_to_range, format_interval_table, pack_adjacent_codepoints, GenUnicode};
use codegen::{Block, Enum, Function};

impl GenUnicode {
Expand All @@ -10,14 +10,13 @@ impl GenUnicode {
.derive("Clone")
.derive("Copy");

let mut is_property_fn = Function::new("is_property_binary");
is_property_fn
let mut as_ranges_fn = Function::new("binary_property_ranges");
as_ranges_fn
.vis("pub(crate)")
.arg("cp", "u32")
.arg("value", "&UnicodePropertyBinary")
.ret("bool")
.ret("&'static [Interval]")
.line("use UnicodePropertyBinary::*;");
let mut is_property_fn_match_block = Block::new("match value");
let mut as_ranges_fn_match_block = Block::new("match value");

let mut property_from_str_fn = Function::new("unicode_property_binary_from_str");
property_from_str_fn
Expand All @@ -30,48 +29,28 @@ impl GenUnicode {
for (alias, orig_name, name, ucd_file) in BINARY_PROPERTIES {
let mut codepoints = ucd_file.chars(orig_name, self);

codepoints.sort();
pack_adjacent_codepoints(&mut codepoints);

// Some properties cannot be packed into a CodePointRange.
if ["Noncharacter_Code_Point"].contains(orig_name) {
self.scope.raw(&format!(
"pub(crate) const {}: [CodePointRangeUnpacked; {}] = [\n {}\n];",
orig_name.to_uppercase(),
codepoints.len(),
codepoints
.iter()
.map(|cs| format!("CodePointRangeUnpacked::from({}, {}),", cs.0, cs.1))
.collect::<Vec<String>>()
.join("\n ")
));
} else {
let ranges = codepoints_to_ranges(&codepoints);
self.scope.raw(&format!(
"pub(crate) const {}: [CodePointRange; {}] = [\n {}\n];",
orig_name.to_uppercase(),
ranges.len(),
ranges.join("\n ")
));
}
self.scope.raw(format_interval_table(
&orig_name.to_uppercase(),
&codepoints,
));

self.scope
.new_fn(&format!("is_{}", orig_name.to_lowercase()))
.new_fn(&format!("{}_ranges", orig_name.to_lowercase()))
.vis("pub(crate)")
.arg("cp", "u32")
.ret("bool")
.line(&format!(
"{}.binary_search_by(|&cpr| cpr.compare(cp)).is_ok()",
orig_name.to_uppercase()
))
.ret("&'static [Interval]")
.line(&format!("&{}", orig_name.to_uppercase()))
.doc(&format!(
"Return whether cp has the '{}' Unicode property.",
"Return the code point ranges of the '{}' Unicode property.",
orig_name
));

property_enum.new_variant(*name);

is_property_fn_match_block.line(format!(
"{} => is_{}(cp),",
as_ranges_fn_match_block.line(format!(
"{} => {}_ranges(),",
name,
orig_name.to_lowercase()
));
Expand All @@ -88,54 +67,71 @@ impl GenUnicode {
property_enum.new_variant("Any");
property_enum.new_variant("Assigned");

let ascii_ranges = codepoints_to_ranges(&[(0, 127)]);

self.scope.raw(&format!(
"pub(crate) const ASCII: [CodePointRange; 1] = [\n {}\n];",
ascii_ranges.join("\n ")
));
self.scope
.raw("pub(crate) const ASCII: [Interval; 1] = [Interval::new(0, 127)];");

self.scope
.new_fn("is_ascii")
.new_fn("ascii_ranges")
.vis("pub(crate)")
.arg("cp", "u32")
.ret("bool")
.line("ASCII.binary_search_by(|&cpr| cpr.compare(cp)).is_ok()")
.doc("Return whether cp has the 'ASCII' Unicode property.");
.ret("&'static [Interval]")
.line("&ASCII")
.doc("Return the code point ranges of the 'ASCII' Unicode property.");

self.scope.raw("pub(crate) const ANY: [CodePointRangeUnpacked; 1] = [\n CodePointRangeUnpacked::from(0, 1114111)\n];");
self.scope
.raw("pub(crate) const ANY: [Interval; 1] = [Interval::new(0, 1114111)];");

self.scope
.new_fn("is_any")
.new_fn("any_ranges")
.vis("pub(crate)")
.arg("cp", "u32")
.ret("bool")
.line("ANY.binary_search_by(|&cpr| cpr.compare(cp)).is_ok()")
.doc("Return whether cp has the 'Any' Unicode property.");
.ret("&'static [Interval]")
.line("&ANY")
.doc("Return the code point ranges of the 'ANY' Unicode property.");

let mut unassigned_codepoints = Vec::new();
for row in &self.derived_general_category {
if row.general_category == "Cn" {
unassigned_codepoints.push(codepoints_to_range(&row.codepoints));
}
}
unassigned_codepoints.sort();
pack_adjacent_codepoints(&mut unassigned_codepoints);
let mut assigned_codepoints = Vec::new();
let mut start = 0;
for iv in unassigned_codepoints {
if start < iv.0 {
assigned_codepoints.push((start, iv.0 - 1))
}
start = iv.1 + 1;
}
if start <= 0x10FFFF {
assigned_codepoints.push((start, 0x10FFFF))
}

self.scope
.raw(format_interval_table("ASSIGNED", &assigned_codepoints));

self.scope
.new_fn("is_assigned")
.new_fn("assigned_ranges")
.vis("pub(crate)")
.arg("cp", "u32")
.ret("bool")
.line("UNASSIGNED.binary_search_by(|&cpr| cpr.compare(cp)).is_err()")
.doc("Return whether cp has the 'Any' Unicode property.");
.ret("&'static [Interval]")
.line("&ASSIGNED")
.doc("Return the code point ranges of the 'ANY' Unicode property.");

is_property_fn_match_block.line("Ascii => is_ascii(cp),");
is_property_fn_match_block.line("Any => is_any(cp),");
is_property_fn_match_block.line("Assigned => is_assigned(cp),");
as_ranges_fn_match_block.line("Ascii => ascii_ranges(),");
as_ranges_fn_match_block.line("Any => any_ranges(),");
as_ranges_fn_match_block.line("Assigned => assigned_ranges(),");

property_from_str_fn_match_block.line("\"ASCII\" => Some(Ascii),");
property_from_str_fn_match_block.line("\"Any\" => Some(Any),");
property_from_str_fn_match_block.line("\"Assigned\" => Some(Assigned),");

is_property_fn.push_block(is_property_fn_match_block);
as_ranges_fn.push_block(as_ranges_fn_match_block);

property_from_str_fn_match_block.line("_ => None,");
property_from_str_fn.push_block(property_from_str_fn_match_block);

self.scope
.push_fn(is_property_fn)
.push_fn(as_ranges_fn)
.push_enum(property_enum)
.push_fn(property_from_str_fn);
}
Expand Down
114 changes: 53 additions & 61 deletions gen-unicode/src/general_category_values.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{codepoints_to_range, codepoints_to_ranges, pack_adjacent_codepoints, GenUnicode};
use crate::{codepoints_to_range, format_interval_table, pack_adjacent_codepoints, GenUnicode};
use codegen::{Block, Enum, Function};
use std::collections::HashMap;

Expand All @@ -11,14 +11,13 @@ impl GenUnicode {
.derive("Clone")
.derive("Copy");

let mut is_property_fn = Function::new("is_property_value_general_category");
is_property_fn
let mut as_ranges_fn = Function::new("general_category_property_value_ranges");
as_ranges_fn
.vis("pub(crate)")
.arg("cp", "u32")
.arg("value", "&UnicodePropertyValueGeneralCategory")
.ret("bool")
.ret("&'static [Interval]")
.line("use UnicodePropertyValueGeneralCategory::*;");
let mut is_property_fn_match_block = Block::new("match value");
let mut as_ranges_fn_match_block = Block::new("match value");

let mut property_from_str_fn =
Function::new("unicode_property_value_general_category_from_str");
Expand All @@ -38,48 +37,28 @@ impl GenUnicode {
}
}

codepoints.sort();
pack_adjacent_codepoints(&mut codepoints);

// Some properties cannot be packed into a CodePointRange.
if ["Unassigned", "Private_Use"].contains(orig_name) {
self.scope.raw(&format!(
"pub(crate) const {}: [CodePointRangeUnpacked; {}] = [\n {}\n];",
orig_name.to_uppercase(),
codepoints.len(),
codepoints
.iter()
.map(|cs| format!("CodePointRangeUnpacked::from({}, {}),", cs.0, cs.1))
.collect::<Vec<String>>()
.join("\n ")
));
} else {
let ranges = codepoints_to_ranges(&codepoints);
self.scope.raw(&format!(
"pub(crate) const {}: [CodePointRange; {}] = [\n {}\n];",
orig_name.to_uppercase(),
ranges.len(),
ranges.join("\n ")
));
}
self.scope.raw(format_interval_table(
&orig_name.to_uppercase(),
&codepoints,
));

self.scope
.new_fn(&format!("is_{}", orig_name.to_lowercase()))
.new_fn(&format!("{}_ranges", orig_name.to_lowercase()))
.vis("pub(crate)")
.arg("cp", "u32")
.ret("bool")
.line(&format!(
"{}.binary_search_by(|&cpr| cpr.compare(cp)).is_ok()",
orig_name.to_uppercase()
))
.ret("&'static [Interval]")
.line(&format!("&{}", orig_name.to_uppercase()))
.doc(&format!(
"Return whether cp has the '{}' Unicode property.",
"Return the code point ranges of the '{}' Unicode property.",
orig_name
));

property_enum.new_variant(*name);

is_property_fn_match_block.line(format!(
"{} => is_{}(cp),",
as_ranges_fn_match_block.line(format!(
"{} => {}_ranges(),",
name,
orig_name.to_lowercase()
));
Expand All @@ -94,27 +73,39 @@ impl GenUnicode {
});
}

for (alias0, alias1, orig_name, name, value_names_str) in GENERAL_CATEGORY_VALUES_DERIVED {
let value_name_ifs: Vec<String> = value_names_str
.split(',')
.map(|name| format!("is_{}(cp)", name.to_lowercase()))
.collect();
for (alias0, alias1, orig_name, name, _, alias1_names) in GENERAL_CATEGORY_VALUES_DERIVED {
let alias1_strings: Vec<&str> = alias1_names.split(',').collect();

let mut codepoints = Vec::new();

for row in &self.derived_general_category {
if alias1_strings.contains(&row.general_category.as_str()) {
codepoints.push(codepoints_to_range(&row.codepoints));
}
}

codepoints.sort();
pack_adjacent_codepoints(&mut codepoints);

self.scope.raw(format_interval_table(
&orig_name.to_uppercase(),
&codepoints,
));

self.scope
.new_fn(&format!("is_{}", orig_name.to_lowercase()))
.new_fn(&format!("{}_ranges", orig_name.to_lowercase()))
.vis("pub(crate)")
.arg("cp", "u32")
.ret("bool")
.line(value_name_ifs.join(" || "))
.ret("&'static [Interval]")
.line(&format!("&{}", orig_name.to_uppercase()))
.doc(&format!(
"Return whether cp has the '{}' Unicode property.",
"Return the code point ranges of the '{}' Unicode property.",
orig_name
));

property_enum.new_variant(*name);

is_property_fn_match_block.line(format!(
"{} => is_{}(cp),",
as_ranges_fn_match_block.line(format!(
"{} => {}_ranges(),",
name,
orig_name.to_lowercase()
));
Expand All @@ -129,13 +120,13 @@ impl GenUnicode {
});
}

is_property_fn.push_block(is_property_fn_match_block);
as_ranges_fn.push_block(as_ranges_fn_match_block);

property_from_str_fn_match_block.line("_ => None,");
property_from_str_fn.push_block(property_from_str_fn_match_block);

self.scope
.push_fn(is_property_fn)
.push_fn(as_ranges_fn)
.push_enum(property_enum)
.push_fn(property_from_str_fn);
}
Expand Down Expand Up @@ -223,7 +214,8 @@ impl GenUnicode {
f.push_block(b);
}

for (alias0, alias1, orig_name, name, value_names_str) in GENERAL_CATEGORY_VALUES_DERIVED {
for (alias0, alias1, orig_name, name, value_names_str, _) in GENERAL_CATEGORY_VALUES_DERIVED
{
let mut chars = Vec::new();

for value_name in value_names_str.split(',') {
Expand Down Expand Up @@ -299,15 +291,15 @@ impl GenUnicode {
}

// Structure: (Alias, Alias, Name, CamelCaseName, CommaSeparatedValueNames, CommaSeparatedValueAliases)
const GENERAL_CATEGORY_VALUES_DERIVED: &[(&str, &str,&str, &str, &str); 8] = &[
("", "LC", "Cased_Letter", "CasedLetter", "Lowercase_Letter,Titlecase_Letter,Uppercase_Letter"),
("", "C", "Other", "Other", "Control,Format,Surrogate,Unassigned,Private_Use"),
("", "L", "Letter", "Letter", "Lowercase_Letter,Modifier_Letter,Other_Letter,Titlecase_Letter,Uppercase_Letter"),
("Combining_Mark", "M", "Mark", "Mark", "Spacing_Mark,Enclosing_Mark,Nonspacing_Mark"),
("", "N", "Number", "Number","Decimal_Number,Letter_Number,Other_Number"),
("punct", "P", "Punctuation", "Punctuation", "Connector_Punctuation,Dash_Punctuation,Close_Punctuation,Final_Punctuation,Initial_Punctuation,Other_Punctuation,Open_Punctuation"),
("", "S", "Symbol", "Symbol", "Currency_Symbol,Modifier_Symbol,Math_Symbol,Other_Symbol"),
("", "Z", "Separator", "Separator", "Line_Separator,Paragraph_Separator,Space_Separator"),
const GENERAL_CATEGORY_VALUES_DERIVED: &[(&str, &str, &str, &str, &str, &str); 8] = &[
("", "LC", "Cased_Letter", "CasedLetter", "Lowercase_Letter,Titlecase_Letter,Uppercase_Letter", "Ll,Lt,Lu"),
("", "C", "Other", "Other", "Control,Format,Surrogate,Unassigned,Private_Use", "Cc,Cf,Cs,Cn,Co"),
("", "L", "Letter", "Letter", "Lowercase_Letter,Modifier_Letter,Other_Letter,Titlecase_Letter,Uppercase_Letter", "Ll,Lm,Lo,Lt,Lu"),
("Combining_Mark", "M", "Mark", "Mark", "Spacing_Mark,Enclosing_Mark,Nonspacing_Mark", "Mc,Me,Mn"),
("", "N", "Number", "Number","Decimal_Number,Letter_Number,Other_Number", "Nd,Nl,No"),
("punct", "P", "Punctuation", "Punctuation", "Connector_Punctuation,Dash_Punctuation,Close_Punctuation,Final_Punctuation,Initial_Punctuation,Other_Punctuation,Open_Punctuation", "Pc,Pd,Pe,Pf,Pi,Po,Ps"),
("", "S", "Symbol", "Symbol", "Currency_Symbol,Modifier_Symbol,Math_Symbol,Other_Symbol", "Sc,Sk,Sm,So"),
("", "Z", "Separator", "Separator", "Line_Separator,Paragraph_Separator,Space_Separator", "Zl,Zp,Zs"),
];

// Structure: (Alias, Alias, Name, CamelCaseName)
Expand Down
Loading

0 comments on commit 0108770

Please sign in to comment.