Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove _manual_slice constructors #5312

Merged
merged 6 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 5 additions & 9 deletions components/locale_core/src/extensions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,12 @@ impl ExtensionType {
}
}

pub(crate) const fn try_from_utf8_manual_slice(
code_units: &[u8],
start: usize,
end: usize,
) -> Result<Self, ParseError> {
if end - start != 1 {
pub(crate) const fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
let &[first] = code_units else {
return Err(ParseError::InvalidExtension);
}
#[allow(clippy::indexing_slicing)]
Self::try_from_byte(code_units[start])
};

Self::try_from_byte(first)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ impl Keywords {
}
current_keyword = Some(Key::try_from_utf8(subtag)?);
} else if current_keyword.is_some() {
match Value::parse_subtag(subtag) {
match Value::parse_subtag_from_utf8(subtag) {
Ok(Some(t)) => current_value.push(t),
Ok(None) => {}
Err(_) => break,
Expand Down
12 changes: 2 additions & 10 deletions components/locale_core/src/extensions/unicode/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,16 +255,8 @@ impl Value {
Self(input)
}

pub(crate) fn parse_subtag(t: &[u8]) -> Result<Option<Subtag>, ParseError> {
Self::parse_subtag_from_utf8_manual_slice(t, 0, t.len())
}

pub(crate) const fn parse_subtag_from_utf8_manual_slice(
code_units: &[u8],
start: usize,
end: usize,
) -> Result<Option<Subtag>, ParseError> {
match Subtag::try_from_utf8_manual_slice(code_units, start, end) {
pub(crate) const fn parse_subtag_from_utf8(t: &[u8]) -> Result<Option<Subtag>, ParseError> {
match Subtag::try_from_utf8(t) {
Ok(TRUE_VALUE) => Ok(None),
Ok(s) => Ok(Some(s)),
Err(_) => Err(ParseError::InvalidSubtag),
Expand Down
19 changes: 4 additions & 15 deletions components/locale_core/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,26 +42,15 @@ macro_rules! impl_tinystr_subtag {
}

/// See [`Self::try_from_str`]
#[inline]
pub const fn try_from_utf8(code_units: &[u8]) -> Result<Self, crate::parser::errors::ParseError> {
Self::try_from_utf8_manual_slice(code_units, 0, code_units.len())
}

/// Equivalent to [`try_from_utf8(bytes[start..end])`](Self::try_from_utf8),
/// but callable in a `const` context (which range indexing is not).
pub const fn try_from_utf8_manual_slice(
pub const fn try_from_utf8(
code_units: &[u8],
start: usize,
end: usize,
) -> Result<Self, crate::parser::errors::ParseError> {
let slen = end - start;

#[allow(clippy::double_comparisons)] // if len_start == len_end
if slen < $len_start || slen > $len_end {
#[allow(clippy::double_comparisons)] // if code_units.len() === 0
if code_units.len() < $len_start || code_units.len() > $len_end {
return Err(crate::parser::errors::ParseError::$error);
}

match tinystr::TinyAsciiStr::try_from_utf8_manual_slice(code_units, start, end) {
match tinystr::TinyAsciiStr::try_from_utf8(code_units) {
Ok($tinystr_ident) if $validate => Ok(Self($normalize)),
_ => Err(crate::parser::errors::ParseError::$error),
}
Expand Down
50 changes: 21 additions & 29 deletions components/locale_core/src/parser/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,9 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
let mut variant = None;
let mut keyword = None;

if let (i, Some((start, end))) = iter.next_manual() {
if let (i, Some(subtag)) = iter.next_const() {
iter = i;
match subtags::Language::try_from_utf8_manual_slice(iter.slice, start, end) {
match subtags::Language::try_from_utf8(subtag) {
Ok(l) => language = l,
Err(e) => return Err(e),
}
Expand All @@ -139,23 +139,19 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f

let mut position = ParserPosition::Script;

while let Some((start, end)) = iter.peek_manual() {
if !matches!(mode, ParserMode::LanguageIdentifier) && end - start == 1 {
while let Some(subtag) = iter.peek() {
if !matches!(mode, ParserMode::LanguageIdentifier) && subtag.len() == 1 {
break;
}

if matches!(position, ParserPosition::Script) {
if let Ok(s) = subtags::Script::try_from_utf8_manual_slice(iter.slice, start, end) {
if let Ok(s) = subtags::Script::try_from_utf8(subtag) {
script = Some(s);
position = ParserPosition::Region;
} else if let Ok(r) =
subtags::Region::try_from_utf8_manual_slice(iter.slice, start, end)
{
} else if let Ok(r) = subtags::Region::try_from_utf8(subtag) {
region = Some(r);
position = ParserPosition::Variant;
} else if let Ok(v) =
subtags::Variant::try_from_utf8_manual_slice(iter.slice, start, end)
{
} else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
// We cannot handle multiple variants in a const context
debug_assert!(variant.is_none());
variant = Some(v);
Expand All @@ -166,12 +162,10 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
return Err(ParseError::InvalidSubtag);
}
} else if matches!(position, ParserPosition::Region) {
if let Ok(s) = subtags::Region::try_from_utf8_manual_slice(iter.slice, start, end) {
if let Ok(s) = subtags::Region::try_from_utf8(subtag) {
region = Some(s);
position = ParserPosition::Variant;
} else if let Ok(v) =
subtags::Variant::try_from_utf8_manual_slice(iter.slice, start, end)
{
} else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
// We cannot handle multiple variants in a const context
debug_assert!(variant.is_none());
variant = Some(v);
Expand All @@ -181,7 +175,7 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
} else {
return Err(ParseError::InvalidSubtag);
}
} else if let Ok(v) = subtags::Variant::try_from_utf8_manual_slice(iter.slice, start, end) {
} else if let Ok(v) = subtags::Variant::try_from_utf8(subtag) {
debug_assert!(matches!(position, ParserPosition::Variant));
if variant.is_some() {
// We cannot handle multiple variants in a const context
Expand All @@ -194,16 +188,16 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
return Err(ParseError::InvalidSubtag);
}

iter = iter.next_manual().0;
iter = iter.next_const().0;
}

if matches!(mode, ParserMode::Locale) {
if let Some((start, end)) = iter.peek_manual() {
match ExtensionType::try_from_utf8_manual_slice(iter.slice, start, end) {
if let Some(subtag) = iter.peek() {
match ExtensionType::try_from_utf8(subtag) {
Ok(ExtensionType::Unicode) => {
iter = iter.next_manual().0;
if let Some((start, end)) = iter.peek_manual() {
if Attribute::try_from_utf8_manual_slice(iter.slice, start, end).is_ok() {
iter = iter.next_const().0;
if let Some(peek) = iter.peek() {
if Attribute::try_from_utf8(peek).is_ok() {
// We cannot handle Attributes in a const context
return Err(ParseError::InvalidSubtag);
}
Expand All @@ -212,20 +206,18 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
let mut key = None;
let mut current_type = None;

while let Some((start, end)) = iter.peek_manual() {
let slen = end - start;
if slen == 2 {
while let Some(peek) = iter.peek() {
if peek.len() == 2 {
if key.is_some() {
// We cannot handle more than one Key in a const context
return Err(ParseError::InvalidSubtag);
}
match Key::try_from_utf8_manual_slice(iter.slice, start, end) {
match Key::try_from_utf8(peek) {
Ok(k) => key = Some(k),
Err(e) => return Err(e),
};
} else if key.is_some() {
match Value::parse_subtag_from_utf8_manual_slice(iter.slice, start, end)
{
match Value::parse_subtag_from_utf8(peek) {
Ok(Some(t)) => {
if current_type.is_some() {
// We cannot handle more than one type in a const context
Expand All @@ -239,7 +231,7 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
} else {
break;
}
iter = iter.next_manual().0
iter = iter.next_const().0
}
if let Some(k) = key {
keyword = Some((k, current_type));
Expand Down
28 changes: 15 additions & 13 deletions components/locale_core/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) {
// If it's a separator, set the start to idx+1 and advance the idx to the next char.
(idx + 1, idx + 1)
} else {
// If it's idx=0, start is 0 and end is set to 1
// If it's idx=0, end is set to 1
debug_assert!(idx == 0);
(0, 1)
};
Expand All @@ -45,6 +45,14 @@ const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) {
(start, end)
}

pub const fn split_out_range(slice: &[u8], start: usize, end: usize) -> &[u8] {
assert!(start <= slice.len());
assert!(end <= slice.len());
assert!(start <= end);
// SAFETY: assertions and align = size = 1.
unsafe { core::slice::from_raw_parts(slice.as_ptr().add(start), end - start) }
}
Comment on lines +48 to +54
Copy link
Member

@sffc sffc Jul 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

split_at_checked is new (and const) in 1.80: rust-lang/rust#85122

I think I prefer holding this until we can use that in MSRV so we don't introduce unsafe code.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We did this because clients didn't have a safe way to do this at all. I'm assuming the majority of our clients will be on stable, so now the majority of our clients have a safe way to do this. One line more or less unsafe code to make this work isn't the end of the world. Also note that I have a follow-up PR that simplifies the unsafety, but it needs to be benched.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, if this is just about getting rid of _manual_slice in public APIs, then just remove those functions without adding unsafe code?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parser calls the _manual_slice functions, because it needs to const-slice the input. I'm doing this in one location in the parser, and this is not the only unsafe code in this crate.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add at least a comment here saying something like

// TODO(MSRV): Use slice_at_checked in Rust 1.80


// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
//
// It is quite extraordinary due to focus on performance and Rust limitations for `const`
Expand Down Expand Up @@ -81,7 +89,7 @@ impl<'a> SubtagIterator<'a> {
}
}

pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) {
pub const fn next_const(mut self) -> (Self, Option<&'a [u8]>) {
if self.done {
return (self, None);
}
Expand All @@ -91,30 +99,24 @@ impl<'a> SubtagIterator<'a> {
} else {
self.done = true;
}
(self, Some(result))
(self, Some(split_out_range(self.slice, result.0, result.1)))
}

pub const fn peek_manual(&self) -> Option<(usize, usize)> {
pub const fn peek(&self) -> Option<&'a [u8]> {
if self.done {
return None;
}
Some(self.subtag)
}

pub fn peek(&self) -> Option<&'a [u8]> {
#[allow(clippy::indexing_slicing)] // peek_manual returns valid indices
self.peek_manual().map(|(s, e)| &self.slice[s..e])
Some(split_out_range(self.slice, self.subtag.0, self.subtag.1))
}
}

impl<'a> Iterator for SubtagIterator<'a> {
type Item = &'a [u8];

fn next(&mut self) -> Option<Self::Item> {
let (s, res) = self.next_manual();
let (s, res) = self.next_const();
*self = s;
#[allow(clippy::indexing_slicing)] // next_manual returns valid indices
res.map(|(s, e)| &self.slice[s..e])
res
}
}

Expand Down
40 changes: 9 additions & 31 deletions utils/tinystr/src/ascii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ impl<const N: usize> TinyAsciiStr<N> {
/// `code_units` may contain at most `N` non-null ASCII code points.
#[inline]
pub const fn try_from_utf8(code_units: &[u8]) -> Result<Self, TinyStrError> {
Self::try_from_utf8_inner(code_units, 0, code_units.len(), false)
Self::try_from_utf8_inner(code_units, false)
}

/// Creates a `TinyAsciiStr<N>` from the given UTF-16 slice.
Expand Down Expand Up @@ -126,49 +126,27 @@ impl<const N: usize> TinyAsciiStr<N> {
/// assert!(matches!(TinyAsciiStr::<3>::try_from_raw(*b"\0A\0"), Err(_)));
/// ```
pub const fn try_from_raw(raw: [u8; N]) -> Result<Self, TinyStrError> {
Self::try_from_utf8_inner(&raw, 0, N, true)
}

/// Equivalent to [`try_from_utf8(bytes[start..end])`](Self::try_from_utf8),
/// but callable in a `const` context (which range indexing is not).
#[inline]
pub const fn try_from_utf8_manual_slice(
code_units: &[u8],
start: usize,
end: usize,
) -> Result<Self, TinyStrError> {
Self::try_from_utf8_inner(code_units, start, end, false)
}

/// Equivalent to [`try_from_utf16(bytes[start..end])`](Self::try_from_utf16),
/// but callable in a `const` context (which range indexing is not).
#[inline]
pub const fn try_from_utf16_manual_slice(
code_units: &[u16],
start: usize,
end: usize,
) -> Result<Self, TinyStrError> {
Self::try_from_utf16_inner(code_units, start, end, false)
Self::try_from_utf8_inner(&raw, true)
}

pub(crate) const fn try_from_utf8_inner(
code_units: &[u8],
start: usize,
end: usize,
allow_trailing_null: bool,
) -> Result<Self, TinyStrError> {
let len = end - start;
if len > N {
return Err(TinyStrError::TooLarge { max: N, len });
if code_units.len() > N {
return Err(TinyStrError::TooLarge {
max: N,
len: code_units.len(),
});
}

let mut out = [0; N];
let mut i = 0;
let mut found_null = false;
// Indexing is protected by TinyStrError::TooLarge
#[allow(clippy::indexing_slicing)]
while i < len {
let b = code_units[start + i];
while i < code_units.len() {
let b = code_units[i];

if b == 0 {
found_null = true;
Expand Down
2 changes: 1 addition & 1 deletion utils/tinystr/src/ule.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ unsafe impl<const N: usize> ULE for TinyAsciiStr<N> {
}
// Validate the bytes
for chunk in bytes.chunks_exact(N) {
let _ = TinyAsciiStr::<N>::try_from_utf8_inner(chunk, 0, N, true)
let _ = TinyAsciiStr::<N>::try_from_utf8_inner(chunk, true)
.map_err(|_| ZeroVecError::parse::<Self>())?;
}
Ok(())
Expand Down