diff --git a/crates/js-sys/src/lib.rs b/crates/js-sys/src/lib.rs index 85fcab47ce7..840d7fdfe66 100644 --- a/crates/js-sys/src/lib.rs +++ b/crates/js-sys/src/lib.rs @@ -3522,6 +3522,22 @@ impl JsString { None } } + + /// Returns whether this string is a valid UTF-16 string. + /// + /// This is useful for learning whether `as_string()` will return a lossless + /// representation of the JS string. If this string contains lone surrogates + /// then `as_string()` will succeed but it will be a lossy representation of + /// the JS string because lone surrogates will become replacement + /// characters. + /// + /// If this function returns `false` then to get a lossless representation + /// of the string you'll need to manually use the `char_code_at` accessor to + /// access the raw UTF-16 code units. + pub fn is_valid_utf16(&self) -> bool { + let iter = (0..self.length()).map(|i| self.char_code_at(i) as u16); + std::char::decode_utf16(iter).all(|i| i.is_ok()) + } } impl PartialEq for JsString { diff --git a/crates/js-sys/tests/wasm/JsString.rs b/crates/js-sys/tests/wasm/JsString.rs index bb4a6ac0510..c7f229f1613 100644 --- a/crates/js-sys/tests/wasm/JsString.rs +++ b/crates/js-sys/tests/wasm/JsString.rs @@ -541,3 +541,15 @@ fn raw() { ); assert!(JsString::raw_0(&JsValue::null().unchecked_into()).is_err()); } + +#[wasm_bindgen_test] +fn is_valid_utf16() { + assert!(JsString::from("a").is_valid_utf16()); + assert!(JsString::from("").is_valid_utf16()); + assert!(JsString::from("🥑").is_valid_utf16()); + assert!(JsString::from("Why hello there this, 🥑, is 🥑 and is 🥑").is_valid_utf16()); + + assert!(JsString::from_char_code1(0x00).is_valid_utf16()); + assert!(!JsString::from_char_code1(0xd800).is_valid_utf16()); + assert!(!JsString::from_char_code1(0xdc00).is_valid_utf16()); +} diff --git a/guide/src/reference/types/str.md b/guide/src/reference/types/str.md index 999bbc183f9..740e3d8b15d 100644 --- a/guide/src/reference/types/str.md +++ b/guide/src/reference/types/str.md @@ -20,3 +20,24 
@@ with handles to JavaScript string values, use the `js_sys::JsString` type. ```js {{#include ../../../../examples/guide-supported-types-examples/str.js}} ``` + +## UTF-16 vs UTF-8 + +Strings in JavaScript are by default encoded as if they're almost UTF-16. They +may, however, contain lone surrogates (only one element of a two-u16 pair to +create one unicode code point). + +When passing a string to Rust from JS the `TextEncoder` API will be used to +convert between utf-16 and utf-8. If there are no lone surrogates then both +strings will be equivalent in terms of the unicode code point sequences they +describe. + +If the JS string has a lone surrogate, however, then the `TextEncoder` +implementation will replace lone surrogates with a unicode replacement +character. This means that the string Rust receives is a lossy representation of +the string in JS. + +If you want to guarantee a lossless representation of the JS string in +Rust it's recommended you use `js_sys::JsString` as an argument type, and then +afterwards use `js_sys::JsString::is_valid_utf16` to determine whether +the string can be losslessly represented as `String` in Rust. diff --git a/guide/src/reference/types/string.md b/guide/src/reference/types/string.md index 568e20b63e1..3b846704abf 100644 --- a/guide/src/reference/types/string.md +++ b/guide/src/reference/types/string.md @@ -8,6 +8,9 @@ Copies the string's contents back and forth between the JavaScript garbage-collected heap and the Wasm linear memory with `TextDecoder` and `TextEncoder` +> **Note**: Be sure to check out the [documentation for `str`](str.html) to +> learn about some caveats when working with strings between JS and Rust. + ## Example Rust Usage ```rust diff --git a/src/lib.rs b/src/lib.rs index 0cd5034b812..10d1ab23d65 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -260,6 +260,27 @@ impl JsValue { /// /// If this JS value is not an instance of a string or if it's not valid /// utf-8 then this returns `None`. 
+ /// + /// # UTF-16 vs UTF-8 + /// + /// Strings in JavaScript are by default encoded as if they're almost + /// UTF-16. They may, however, contain lone surrogates (only one element of + /// a two-u16 pair to create one unicode code point). + /// + /// If the `JsValue` is a string, then `TextEncoder` will be used to convert + /// between utf-16 and utf-8. If there are no lone surrogates then both + /// strings will be equivalent in terms of the unicode code point sequences + /// they describe. + /// + /// If the JS string has a lone surrogate, however, then this function will + /// still return `Some`. The `TextEncoder` implementation will replace lone + /// surrogates with a unicode replacement character. + /// + /// If you want to guarantee a lossless representation of the JS string in + /// Rust it's recommended you use `js_sys::JsString::is_valid_utf16`. + /// If that returns `true` then this function is lossless. If that function + /// returns `false` then this function is lossy and you'll need to access + /// the raw u16 values instead. #[cfg(feature = "std")] pub fn as_string(&self) -> Option { unsafe {