Skip to content

Commit

Permalink
Auto merge of rust-lang#109698 - epage:wtf, r=Amanieu
Browse files Browse the repository at this point in the history
Allow limited access to `OsStr` bytes

`OsStr` has historically kept its implementation details private out of
concern for locking us into a specific encoding on Windows.

This is an alternative to rust-lang#95290 which proposed specifying the encoding on Windows.  Instead, this
only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines
rules for safely interacting with it

At minimum, this can greatly simplify the `os_str_bytes` crate and every
arg parser that interacts with `OsStr` directly (which is most of those
that support invalid UTF-8).

Tracking issue: rust-lang#111544
  • Loading branch information
bors committed May 30, 2023
2 parents f0411ff + e6a35c4 commit 9610dfe
Show file tree
Hide file tree
Showing 14 changed files with 159 additions and 83 deletions.
8 changes: 8 additions & 0 deletions library/std/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@
//! trait, which provides a [`from_wide`] method to convert a native Windows
//! string (without the terminating nul character) to an [`OsString`].
//!
//! ## On all platforms
//!
//! On all platforms, [`OsStr`] consists of a sequence of bytes that is encoded as a superset of
//! UTF-8; see [`OsString`] for more details on its encoding on different platforms.
//!
//! For limited, inexpensive conversions from and to bytes, see [`OsStr::as_os_str_bytes`] and
//! [`OsStr::from_os_str_bytes_unchecked`].
//!
//! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
//! [Unicode code point]: https://www.unicode.org/glossary/#code_point
//! [`env::set_var()`]: crate::env::set_var "env::set_var"
Expand Down
82 changes: 69 additions & 13 deletions library/std/src/ffi/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,51 @@ impl OsStr {
s.as_ref()
}

/// Converts a slice of bytes to an OS string slice without checking that the string contains
/// valid `OsStr`-encoded data.
///
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
/// ASCII.
///
/// See the [module's toplevel documentation about conversions][conversions] for safe,
/// cross-platform [conversions] from/to native representations.
///
/// # Safety
///
/// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
/// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same rust version
/// built for the same target platform. For example, reconstructing an `OsStr` from bytes sent
/// over the network or stored in a file will likely violate these safety rules.
///
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
/// split either immediately before or immediately after any valid non-empty UTF-8 substring.
///
/// # Example
///
/// ```
/// #![feature(os_str_bytes)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("Mary had a little lamb");
/// let bytes = os_str.as_os_str_bytes();
/// let words = bytes.split(|b| *b == b' ');
/// let words: Vec<&OsStr> = words.map(|word| {
/// // SAFETY:
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
/// }).collect();
/// ```
///
/// [conversions]: super#conversions
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub unsafe fn from_os_str_bytes_unchecked(bytes: &[u8]) -> &Self {
Self::from_inner(Slice::from_os_str_bytes_unchecked(bytes))
}

#[inline]
fn from_inner(inner: &Slice) -> &OsStr {
// SAFETY: OsStr is just a wrapper of Slice,
Expand Down Expand Up @@ -837,13 +882,24 @@ impl OsStr {
OsString { inner: Buf::from_box(boxed) }
}

/// Gets the underlying byte representation.
/// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS
/// string slice, use the [`OsStr::from_os_str_bytes_unchecked`] function.
///
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
/// ASCII.
///
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
/// be treated as opaque and only comparable within the same rust version built for the same
/// target platform. For example, sending the slice over the network or storing it in a file
/// will likely result in incompatible byte slices. See [`OsString`] for more encoding details
/// and [`std::ffi`] for platform-specific, specified conversions.
///
/// Note: it is *crucial* that this API is not externally public, to avoid
/// revealing the internal, platform-specific encodings.
/// [`std::ffi`]: crate::ffi
#[inline]
pub(crate) fn bytes(&self) -> &[u8] {
unsafe { &*(&self.inner as *const _ as *const [u8]) }
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn as_os_str_bytes(&self) -> &[u8] {
self.inner.as_os_str_bytes()
}

/// Converts this string to its ASCII lower case equivalent in-place.
Expand Down Expand Up @@ -1131,7 +1187,7 @@ impl Default for &OsStr {
impl PartialEq for OsStr {
#[inline]
fn eq(&self, other: &OsStr) -> bool {
self.bytes().eq(other.bytes())
self.as_os_str_bytes().eq(other.as_os_str_bytes())
}
}

Expand All @@ -1158,23 +1214,23 @@ impl Eq for OsStr {}
impl PartialOrd for OsStr {
#[inline]
fn partial_cmp(&self, other: &OsStr) -> Option<cmp::Ordering> {
self.bytes().partial_cmp(other.bytes())
self.as_os_str_bytes().partial_cmp(other.as_os_str_bytes())
}
#[inline]
fn lt(&self, other: &OsStr) -> bool {
self.bytes().lt(other.bytes())
self.as_os_str_bytes().lt(other.as_os_str_bytes())
}
#[inline]
fn le(&self, other: &OsStr) -> bool {
self.bytes().le(other.bytes())
self.as_os_str_bytes().le(other.as_os_str_bytes())
}
#[inline]
fn gt(&self, other: &OsStr) -> bool {
self.bytes().gt(other.bytes())
self.as_os_str_bytes().gt(other.as_os_str_bytes())
}
#[inline]
fn ge(&self, other: &OsStr) -> bool {
self.bytes().ge(other.bytes())
self.as_os_str_bytes().ge(other.as_os_str_bytes())
}
}

Expand All @@ -1193,7 +1249,7 @@ impl PartialOrd<str> for OsStr {
impl Ord for OsStr {
#[inline]
fn cmp(&self, other: &OsStr) -> cmp::Ordering {
self.bytes().cmp(other.bytes())
self.as_os_str_bytes().cmp(other.as_os_str_bytes())
}
}

Expand Down Expand Up @@ -1243,7 +1299,7 @@ impl_cmp!(Cow<'a, OsStr>, OsString);
impl Hash for OsStr {
#[inline]
fn hash<H: Hasher>(&self, state: &mut H) {
self.bytes().hash(state)
self.as_os_str_bytes().hash(state)
}
}

Expand Down
52 changes: 24 additions & 28 deletions library/std/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ impl<'a> Prefix<'a> {
fn len(&self) -> usize {
use self::Prefix::*;
fn os_str_len(s: &OsStr) -> usize {
s.bytes().len()
s.as_os_str_bytes().len()
}
match *self {
Verbatim(x) => 4 + os_str_len(x),
Expand Down Expand Up @@ -299,20 +299,6 @@ where
}
}

unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
// SAFETY: See note at the top of this module to understand why this and
// `OsStr::bytes` are used:
//
// This casts are safe as OsStr is internally a wrapper around [u8] on all
// platforms.
//
// Note that currently this relies on the special knowledge that std has;
// these types are single-element structs but are not marked
// repr(transparent) or repr(C) which would make these casts not allowable
// outside std.
unsafe { &*(s as *const [u8] as *const OsStr) }
}

// Detect scheme on Redox
fn has_redox_scheme(s: &[u8]) -> bool {
cfg!(target_os = "redox") && s.contains(&b':')
Expand All @@ -330,26 +316,31 @@ fn has_physical_root(s: &[u8], prefix: Option<Prefix<'_>>) -> bool {

// basic workhorse for splitting stem and extension
fn rsplit_file_at_dot(file: &OsStr) -> (Option<&OsStr>, Option<&OsStr>) {
if file.bytes() == b".." {
if file.as_os_str_bytes() == b".." {
return (Some(file), None);
}

// The unsafety here stems from converting between &OsStr and &[u8]
// and back. This is safe to do because (1) we only look at ASCII
// contents of the encoding and (2) new &OsStr values are produced
// only from ASCII-bounded slices of existing &OsStr values.
let mut iter = file.bytes().rsplitn(2, |b| *b == b'.');
let mut iter = file.as_os_str_bytes().rsplitn(2, |b| *b == b'.');
let after = iter.next();
let before = iter.next();
if before == Some(b"") {
(Some(file), None)
} else {
unsafe { (before.map(|s| u8_slice_as_os_str(s)), after.map(|s| u8_slice_as_os_str(s))) }
unsafe {
(
before.map(|s| OsStr::from_os_str_bytes_unchecked(s)),
after.map(|s| OsStr::from_os_str_bytes_unchecked(s)),
)
}
}
}

fn split_file_at_dot(file: &OsStr) -> (&OsStr, Option<&OsStr>) {
let slice = file.bytes();
let slice = file.as_os_str_bytes();
if slice == b".." {
return (file, None);
}
Expand All @@ -364,7 +355,12 @@ fn split_file_at_dot(file: &OsStr) -> (&OsStr, Option<&OsStr>) {
};
let before = &slice[..i];
let after = &slice[i + 1..];
unsafe { (u8_slice_as_os_str(before), Some(u8_slice_as_os_str(after))) }
unsafe {
(
OsStr::from_os_str_bytes_unchecked(before),
Some(OsStr::from_os_str_bytes_unchecked(after)),
)
}
}

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -743,7 +739,7 @@ impl<'a> Components<'a> {
// separately via `include_cur_dir`
b".." => Some(Component::ParentDir),
b"" => None,
_ => Some(Component::Normal(unsafe { u8_slice_as_os_str(comp) })),
_ => Some(Component::Normal(unsafe { OsStr::from_os_str_bytes_unchecked(comp) })),
}
}

Expand Down Expand Up @@ -900,7 +896,7 @@ impl<'a> Iterator for Components<'a> {
let raw = &self.path[..self.prefix_len()];
self.path = &self.path[self.prefix_len()..];
return Some(Component::Prefix(PrefixComponent {
raw: unsafe { u8_slice_as_os_str(raw) },
raw: unsafe { OsStr::from_os_str_bytes_unchecked(raw) },
parsed: self.prefix.unwrap(),
}));
}
Expand Down Expand Up @@ -972,7 +968,7 @@ impl<'a> DoubleEndedIterator for Components<'a> {
State::Prefix if self.prefix_len() > 0 => {
self.back = State::Done;
return Some(Component::Prefix(PrefixComponent {
raw: unsafe { u8_slice_as_os_str(self.path) },
raw: unsafe { OsStr::from_os_str_bytes_unchecked(self.path) },
parsed: self.prefix.unwrap(),
}));
}
Expand Down Expand Up @@ -1481,17 +1477,17 @@ impl PathBuf {
fn _set_extension(&mut self, extension: &OsStr) -> bool {
let file_stem = match self.file_stem() {
None => return false,
Some(f) => f.bytes(),
Some(f) => f.as_os_str_bytes(),
};

// truncate until right after the file stem
let end_file_stem = file_stem[file_stem.len()..].as_ptr().addr();
let start = self.inner.bytes().as_ptr().addr();
let start = self.inner.as_os_str_bytes().as_ptr().addr();
let v = self.as_mut_vec();
v.truncate(end_file_stem.wrapping_sub(start));

// add the new extension, if any
let new = extension.bytes();
let new = extension.as_os_str_bytes();
if !new.is_empty() {
v.reserve_exact(new.len() + 1);
v.push(b'.');
Expand Down Expand Up @@ -2011,11 +2007,11 @@ impl Path {
// The following (private!) function allows construction of a path from a u8
// slice, which is only safe when it is known to follow the OsStr encoding.
unsafe fn from_u8_slice(s: &[u8]) -> &Path {
unsafe { Path::new(u8_slice_as_os_str(s)) }
unsafe { Path::new(OsStr::from_os_str_bytes_unchecked(s)) }
}
// The following (private!) function reveals the byte encoding used for OsStr.
fn as_u8_slice(&self) -> &[u8] {
self.inner.bytes()
self.inner.as_os_str_bytes()
}

/// Directly wraps a string slice as a `Path` slice.
Expand Down
2 changes: 1 addition & 1 deletion library/std/src/sys/common/small_c_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub fn run_path_with_cstr<T, F>(path: &Path, f: F) -> io::Result<T>
where
F: FnOnce(&CStr) -> io::Result<T>,
{
run_with_cstr(path.as_os_str().bytes(), f)
run_with_cstr(path.as_os_str().as_os_str_bytes(), f)
}

#[inline]
Expand Down
4 changes: 2 additions & 2 deletions library/std/src/sys/common/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use core::iter::repeat;
fn stack_allocation_works() {
let path = Path::new("abc");
let result = run_path_with_cstr(path, |p| {
assert_eq!(p, &*CString::new(path.as_os_str().bytes()).unwrap());
assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap());
Ok(42)
});
assert_eq!(result.unwrap(), 42);
Expand All @@ -25,7 +25,7 @@ fn heap_allocation_works() {
let path = repeat("a").take(384).collect::<String>();
let path = Path::new(&path);
let result = run_path_with_cstr(path, |p| {
assert_eq!(p, &*CString::new(path.as_os_str().bytes()).unwrap());
assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap());
Ok(42)
});
assert_eq!(result.unwrap(), 42);
Expand Down
9 changes: 7 additions & 2 deletions library/std/src/sys/unix/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,13 +193,18 @@ impl Buf {

impl Slice {
#[inline]
fn from_u8_slice(s: &[u8]) -> &Slice {
pub fn as_os_str_bytes(&self) -> &[u8] {
&self.inner
}

#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
unsafe { mem::transmute(s) }
}

#[inline]
pub fn from_str(s: &str) -> &Slice {
Slice::from_u8_slice(s.as_bytes())
unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
}

pub fn to_str(&self) -> Option<&str> {
Expand Down
9 changes: 4 additions & 5 deletions library/std/src/sys/unix/os_str/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use super::*;

#[test]
fn slice_debug_output() {
let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
let expected = r#""\xF0hello,\tworld""#;
let output = format!("{input:?}");

Expand All @@ -11,8 +11,7 @@ fn slice_debug_output() {

#[test]
fn display() {
assert_eq!(
"Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
);
assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
},);
}
2 changes: 1 addition & 1 deletion library/std/src/sys/unix/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> {

// Get the components, skipping the redundant leading "." component if it exists.
let mut components = path.strip_prefix(".").unwrap_or(path).components();
let path_os = path.as_os_str().bytes();
let path_os = path.as_os_str().as_os_str_bytes();

let mut normalized = if path.is_absolute() {
// "If a pathname begins with two successive <slash> characters, the
Expand Down
4 changes: 2 additions & 2 deletions library/std/src/sys/unix/process/process_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,9 @@ pub enum ProgramKind {

impl ProgramKind {
fn new(program: &OsStr) -> Self {
if program.bytes().starts_with(b"/") {
if program.as_os_str_bytes().starts_with(b"/") {
Self::Absolute
} else if program.bytes().contains(&b'/') {
} else if program.as_os_str_bytes().contains(&b'/') {
// If the program has more than one component in it, it is a relative path.
Self::Relative
} else {
Expand Down
Loading

0 comments on commit 9610dfe

Please sign in to comment.