From 27393d5ca63c55313182bd6cbb8ef2ecd4c1472f Mon Sep 17 00:00:00 2001 From: Count Count Date: Sun, 21 Mar 2021 08:19:34 +0100 Subject: [PATCH 01/12] fix incomplete UTF-8 writes in Windows console stdio --- library/std/src/sys/windows/stdio.rs | 116 +++++++++++++++++++++++---- 1 file changed, 102 insertions(+), 14 deletions(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index be3141e46a1cc..160d9bcd4d226 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -14,8 +14,18 @@ use crate::sys::handle::Handle; pub struct Stdin { surrogate: u16, } -pub struct Stdout; -pub struct Stderr; +pub struct Stdout { + incomplete_utf8: IncompleteUtf8, +} + +pub struct Stderr { + incomplete_utf8: IncompleteUtf8, +} + +struct IncompleteUtf8 { + bytes: [u8; 4], + len: u8, +} // Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see // #13304 for details). @@ -50,7 +60,27 @@ fn is_console(handle: c::HANDLE) -> bool { unsafe { c::GetConsoleMode(handle, &mut mode) != 0 } } -fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result { +// Simple reimplementation of std::str::utf8_char_width() which is feature-gated +fn utf8_char_width(b: u8) -> usize { + match b { + 0x00..=0x7F => 1, + 0x80..=0xC1 => 0, + 0xC2..=0xDF => 2, + 0xE0..=0xEF => 3, + 0xF0..=0xF4 => 4, + 0xF5..=0xFF => 0, + } +} + +fn write( + handle_id: c::DWORD, + data: &[u8], + incomplete_utf8: &mut IncompleteUtf8, +) -> io::Result { + if data.is_empty() { + return Ok(0); + } + let handle = get_handle(handle_id)?; if !is_console(handle) { let handle = Handle::new(handle); @@ -59,22 +89,74 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result { return ret; } - // As the console is meant for presenting text, we assume bytes of `data` come from a string - // and are encoded as UTF-8, which needs to be encoded as UTF-16. 
+ match incomplete_utf8.len { + 0 => {} + 1..=3 => { + if data[0] >> 6 != 0b10 { + incomplete_utf8.len = 0; + // not a continuation byte - reject + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } + incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0]; + incomplete_utf8.len += 1; + let char_width = utf8_char_width(incomplete_utf8.bytes[0]); + if (incomplete_utf8.len as usize) < char_width { + // more bytes needed + return Ok(1); + } + let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]); + incomplete_utf8.len = 0; + match s { + Ok(s) => { + assert_eq!(char_width, s.len()); + let written = write_valid_utf8(handle, s)?; + assert_eq!(written, s.len()); // guaranteed by write0() for single codepoint writes + return Ok(1); + } + Err(_) => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } + } + } + _ => { + panic!("Unexpected number of incomplete UTF-8 chars."); + } + } + + // As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8, + // which needs to be encoded as UTF-16. // // If the data is not valid UTF-8 we write out as many bytes as are valid. - // Only when there are no valid bytes (which will happen on the next call), return an error. + // If the first byte is invalid it is either first byte of a multi-byte sequence but the + // provided byte slice is too short or it is the first byte of an invalid multi-byte sequence. 
let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2); let utf8 = match str::from_utf8(&data[..len]) { Ok(s) => s, Err(ref e) if e.valid_up_to() == 0 => { - return Err(io::Error::new_const( - io::ErrorKind::InvalidData, - &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", - )); + if data.len() < utf8_char_width(data[0]) { + incomplete_utf8.bytes[0] = data[0]; + incomplete_utf8.len = 1; + return Ok(1); + } else { + return Err(io::Error::new_const( + io::ErrorKind::InvalidData, + &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } } Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(), }; + + write_valid_utf8(handle, utf8) +} + +fn write_valid_utf8(handle: c::HANDLE, utf8: &str) -> io::Result { let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2]; let mut len_utf16 = 0; for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) { @@ -254,15 +336,21 @@ fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result { Ok(written) } +impl IncompleteUtf8 { + pub const fn new() -> IncompleteUtf8 { + IncompleteUtf8 { bytes: [0; 4], len: 0 } + } +} + impl Stdout { pub const fn new() -> Stdout { - Stdout + Stdout { incomplete_utf8: IncompleteUtf8::new() } } } impl io::Write for Stdout { fn write(&mut self, buf: &[u8]) -> io::Result { - write(c::STD_OUTPUT_HANDLE, buf) + write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8) } fn flush(&mut self) -> io::Result<()> { @@ -272,13 +360,13 @@ impl io::Write for Stdout { impl Stderr { pub const fn new() -> Stderr { - Stderr + Stderr { incomplete_utf8: IncompleteUtf8::new() } } } impl io::Write for Stderr { fn write(&mut self, buf: &[u8]) -> io::Result { - write(c::STD_ERROR_HANDLE, buf) + write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8) } fn flush(&mut self) -> io::Result<()> { From a941e68e08e3a20bf918d2291a01fda9facefb74 Mon Sep 17 00:00:00 2001 From: Count Count Date: Sun, 21 Mar 2021 11:48:51 +0100 Subject: [PATCH 02/12] fix fmt --- 
library/std/src/sys/windows/stdio.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 160d9bcd4d226..0fade85d8d4db 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -144,9 +144,9 @@ fn write( incomplete_utf8.len = 1; return Ok(1); } else { - return Err(io::Error::new_const( + return Err(io::Error::new( io::ErrorKind::InvalidData, - &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", )); } } From 60b149f1821377bad19d6323c9603b336922a777 Mon Sep 17 00:00:00 2001 From: Count Count Date: Sun, 21 Mar 2021 17:42:30 +0100 Subject: [PATCH 03/12] Export utf8_char_width() publicly in core::str behind the "str_internals" feature gate and use it in sys::windows::stdio instead of reimplementing it there. --- library/core/src/str/mod.rs | 3 +++ library/std/src/sys/windows/stdio.rs | 13 +------------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 03ed301eacf8c..8f30ce91933ee 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -71,6 +71,9 @@ pub use iter::SplitInclusive; #[unstable(feature = "str_internals", issue = "none")] pub use validations::next_code_point; +#[unstable(feature = "str_internals", issue = "none")] +pub use validations::utf8_char_width; + use iter::MatchIndicesInternal; use iter::SplitInternal; use iter::{MatchesInternal, SplitNInternal}; diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 0fade85d8d4db..570d32b89b4d0 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -8,6 +8,7 @@ use crate::str; use crate::sys::c; use crate::sys::cvt; use crate::sys::handle::Handle; +use core::str::utf8_char_width; // Don't 
cache handles but get them fresh for every read/write. This allows us to track changes to // the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490. @@ -60,18 +61,6 @@ fn is_console(handle: c::HANDLE) -> bool { unsafe { c::GetConsoleMode(handle, &mut mode) != 0 } } -// Simple reimplementation of std::str::utf8_char_width() which is feature-gated -fn utf8_char_width(b: u8) -> usize { - match b { - 0x00..=0x7F => 1, - 0x80..=0xC1 => 0, - 0xC2..=0xDF => 2, - 0xE0..=0xEF => 3, - 0xF0..=0xF4 => 4, - 0xF5..=0xFF => 0, - } -} - fn write( handle_id: c::DWORD, data: &[u8], From 0202273d403d534b4cd9a27d6de332197b85b60f Mon Sep 17 00:00:00 2001 From: Count Count Date: Sun, 21 Mar 2021 22:15:33 +0100 Subject: [PATCH 04/12] fix c&p error --- library/std/src/sys/windows/stdio.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 570d32b89b4d0..0812fa51f07a9 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -339,7 +339,7 @@ impl Stdout { impl io::Write for Stdout { fn write(&mut self, buf: &[u8]) -> io::Result { - write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8) + write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8) } fn flush(&mut self) -> io::Result<()> { From d11469404274c7162b295105846f98111bc273ab Mon Sep 17 00:00:00 2001 From: Count Count Date: Wed, 24 Mar 2021 07:05:17 +0100 Subject: [PATCH 05/12] Reject byte if it cannot start a valid UTF-8 sequence. 
--- library/std/src/sys/windows/stdio.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 0812fa51f07a9..19fd900580b3d 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -128,7 +128,8 @@ fn write( let utf8 = match str::from_utf8(&data[..len]) { Ok(s) => s, Err(ref e) if e.valid_up_to() == 0 => { - if data.len() < utf8_char_width(data[0]) { + first_byte_char_width = utf8_char_width(data[0]); + if first_byte_char_width > 1 && data.len() < first_byte_char_width { incomplete_utf8.bytes[0] = data[0]; incomplete_utf8.len = 1; return Ok(1); From 52713a478188203a4b4a6592e7014d5a4ef44495 Mon Sep 17 00:00:00 2001 From: Count Count Date: Wed, 24 Mar 2021 07:06:24 +0100 Subject: [PATCH 06/12] fix --- library/std/src/sys/windows/stdio.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 19fd900580b3d..1ca10de037c2d 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -128,7 +128,7 @@ fn write( let utf8 = match str::from_utf8(&data[..len]) { Ok(s) => s, Err(ref e) if e.valid_up_to() == 0 => { - first_byte_char_width = utf8_char_width(data[0]); + let first_byte_char_width = utf8_char_width(data[0]); if first_byte_char_width > 1 && data.len() < first_byte_char_width { incomplete_utf8.bytes[0] = data[0]; incomplete_utf8.len = 1; From fb1fa97fdcad00785cece5fbd9d42157c253e976 Mon Sep 17 00:00:00 2001 From: Count Count Date: Wed, 24 Mar 2021 07:17:24 +0100 Subject: [PATCH 07/12] use io::Error::new_const() everywhere --- library/std/src/sys/windows/stdio.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 1ca10de037c2d..24bc2a1e54046 100644 --- a/library/std/src/sys/windows/stdio.rs +++ 
b/library/std/src/sys/windows/stdio.rs @@ -84,9 +84,9 @@ fn write( if data[0] >> 6 != 0b10 { incomplete_utf8.len = 0; // not a continuation byte - reject - return Err(io::Error::new( + return Err(io::Error::new_const( io::ErrorKind::InvalidData, - "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", )); } incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0]; @@ -106,9 +106,9 @@ fn write( return Ok(1); } Err(_) => { - return Err(io::Error::new( + return Err(io::Error::new_const( io::ErrorKind::InvalidData, - "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", )); } } @@ -134,9 +134,9 @@ fn write( incomplete_utf8.len = 1; return Ok(1); } else { - return Err(io::Error::new( + return Err(io::Error::new_const( io::ErrorKind::InvalidData, - "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", )); } } From 34cfe383e5ecdea538582bd08736a80f555c0adc Mon Sep 17 00:00:00 2001 From: Count Count Date: Wed, 24 Mar 2021 10:06:31 +0100 Subject: [PATCH 08/12] correct comment --- library/std/src/sys/windows/stdio.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 24bc2a1e54046..a2def41625d43 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -102,7 +102,7 @@ fn write( Ok(s) => { assert_eq!(char_width, s.len()); let written = write_valid_utf8(handle, s)?; - assert_eq!(written, s.len()); // guaranteed by write0() for single codepoint writes + assert_eq!(written, s.len()); // guaranteed by write_valid_utf8() for single codepoint writes return Ok(1); } Err(_) => { From 
3103f5f5501cf10c04023b74d738b245dd42cea2 Mon Sep 17 00:00:00 2001 From: Count Count Date: Wed, 24 Mar 2021 10:24:05 +0100 Subject: [PATCH 09/12] rename fn write_valid_utf8() to write_valid_utf8_to_console() --- library/std/src/sys/windows/stdio.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index a2def41625d43..5789484bad07a 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -101,8 +101,8 @@ fn write( match s { Ok(s) => { assert_eq!(char_width, s.len()); - let written = write_valid_utf8(handle, s)?; - assert_eq!(written, s.len()); // guaranteed by write_valid_utf8() for single codepoint writes + let written = write_valid_utf8_to_console(handle, s)?; + assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes return Ok(1); } Err(_) => { @@ -143,10 +143,10 @@ fn write( Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(), }; - write_valid_utf8(handle, utf8) + write_valid_utf8_to_console(handle, utf8) } -fn write_valid_utf8(handle: c::HANDLE, utf8: &str) -> io::Result { +fn write_valid_utf8_to_console(handle: c::HANDLE, utf8: &str) -> io::Result { let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2]; let mut len_utf16 = 0; for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) { From 7cfbe5429458151f6cb2fbd5fe2e44c56a18b644 Mon Sep 17 00:00:00 2001 From: Count Count Date: Wed, 24 Mar 2021 18:22:09 +0100 Subject: [PATCH 10/12] assert!() instead of panic!() for expected invariant --- library/std/src/sys/windows/stdio.rs | 62 ++++++++++++++-------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 5789484bad07a..8fb0f2f9ee7f5 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -78,43 +78,41 @@ fn write( return ret; } - match 
incomplete_utf8.len { - 0 => {} - 1..=3 => { - if data[0] >> 6 != 0b10 { - incomplete_utf8.len = 0; - // not a continuation byte - reject + if incomplete_utf8.len > 0 { + assert!( + incomplete_utf8.len < 4, + "Unexpected number of bytes for incomplete UTF-8 codepoint." + ); + if data[0] >> 6 != 0b10 { + incomplete_utf8.len = 0; + // not a continuation byte - reject + return Err(io::Error::new_const( + io::ErrorKind::InvalidData, + &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } + incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0]; + incomplete_utf8.len += 1; + let char_width = utf8_char_width(incomplete_utf8.bytes[0]); + if (incomplete_utf8.len as usize) < char_width { + // more bytes needed + return Ok(1); + } + let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]); + incomplete_utf8.len = 0; + match s { + Ok(s) => { + assert_eq!(char_width, s.len()); + let written = write_valid_utf8_to_console(handle, s)?; + assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes + return Ok(1); + } + Err(_) => { return Err(io::Error::new_const( io::ErrorKind::InvalidData, &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", )); } - incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0]; - incomplete_utf8.len += 1; - let char_width = utf8_char_width(incomplete_utf8.bytes[0]); - if (incomplete_utf8.len as usize) < char_width { - // more bytes needed - return Ok(1); - } - let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]); - incomplete_utf8.len = 0; - match s { - Ok(s) => { - assert_eq!(char_width, s.len()); - let written = write_valid_utf8_to_console(handle, s)?; - assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes - return Ok(1); - } - Err(_) => { - return Err(io::Error::new_const( - io::ErrorKind::InvalidData, - &"Windows stdio in 
console mode does not support writing non-UTF-8 byte sequences", - )); - } - } - } - _ => { - panic!("Unexpected number of incomplete UTF-8 chars."); } } From dd3b79e9ffd7559896d20546b6ba4069032f2811 Mon Sep 17 00:00:00 2001 From: Count Count Date: Wed, 24 Mar 2021 18:23:01 +0100 Subject: [PATCH 11/12] comment pos --- library/std/src/sys/windows/stdio.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 8fb0f2f9ee7f5..0379a54e507f1 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -84,8 +84,8 @@ fn write( "Unexpected number of bytes for incomplete UTF-8 codepoint." ); if data[0] >> 6 != 0b10 { - incomplete_utf8.len = 0; // not a continuation byte - reject + incomplete_utf8.len = 0; return Err(io::Error::new_const( io::ErrorKind::InvalidData, &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", From fbfde7eaaf6e95ec186c311d5044281d7b2415c6 Mon Sep 17 00:00:00 2001 From: Count Count Date: Wed, 7 Apr 2021 08:07:09 +0200 Subject: [PATCH 12/12] Style only: merge with other pub use statement --- library/core/src/str/mod.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 8f30ce91933ee..77487f436ea7d 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -69,10 +69,7 @@ pub use iter::SplitAsciiWhitespace; pub use iter::SplitInclusive; #[unstable(feature = "str_internals", issue = "none")] -pub use validations::next_code_point; - -#[unstable(feature = "str_internals", issue = "none")] -pub use validations::utf8_char_width; +pub use validations::{next_code_point, utf8_char_width}; use iter::MatchIndicesInternal; use iter::SplitInternal;