From 3ef1133c366a7a4779cf384e9536ba6ca891db08 Mon Sep 17 00:00:00 2001
From: Steven Bosnick <sbosnick@sympatico.ca>
Date: Sun, 19 Apr 2020 20:01:21 -0400
Subject: [PATCH] Add comments to describe safety of Scheme

The comments describe the postcondition on parse_exact() that makes the
one use of "unsafe" in Scheme::try_from(&'a [u8]) sound.
---
 src/uri/scheme.rs | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/src/uri/scheme.rs b/src/uri/scheme.rs
index 0ed83f1f..682b11ee 100644
--- a/src/uri/scheme.rs
+++ b/src/uri/scheme.rs
@@ -77,10 +77,13 @@ impl<'a> TryFrom<&'a [u8]> for Scheme {
             None => Err(ErrorKind::InvalidScheme.into()),
             Standard(p) => Ok(Standard(p).into()),
             Other(_) => {
-                // Unsafe: parse_exact already checks for a strict subset of UTF-8
-                Ok(Other(Box::new(unsafe {
-                    ByteStr::from_utf8_unchecked(Bytes::copy_from_slice(s))
-                })).into())
+                let bytes = Bytes::copy_from_slice(s);
+
+                // Safety: postcondition on parse_exact() means that s and
+                // hence bytes are valid UTF-8.
+                let string = unsafe { ByteStr::from_utf8_unchecked(bytes) };
+
+                Ok(Other(Box::new(string)).into())
             }
         }
     }
@@ -195,6 +198,12 @@ const MAX_SCHEME_LEN: usize = 64;
 
 // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
 //
+// SCHEME_CHARS is a table of valid characters in the scheme part of a URI. An
+// entry in the table is 0 for invalid characters. For valid characters the
+// entry is the byte itself (e.g. the entry for 43 is b'+' because b'+' == 43u8).
+// An important characteristic of this table is that all entries at indices
+// above 127 are invalid. This makes every valid entry a valid single-byte UTF-8
+// code point, which means that a slice of such valid entries is valid UTF-8.
 const SCHEME_CHARS: [u8; 256] = [
     //  0      1      2      3      4      5      6      7      8      9
         0,     0,     0,     0,     0,     0,     0,     0,     0,     0, //   x
@@ -226,6 +235,7 @@ const SCHEME_CHARS: [u8; 256] = [
 ];
 
 impl Scheme2<usize> {
+    // Postcondition: On all Ok() returns, s is valid UTF-8
     fn parse_exact(s: &[u8]) -> Result<Scheme2<()>, InvalidUri> {
         match s {
             b"http" => Ok(Protocol::Http.into()),
@@ -235,6 +245,8 @@ impl Scheme2<usize> {
                     return Err(ErrorKind::SchemeTooLong.into());
                 }
 
+                // Check that each byte in s is a valid SCHEME_CHARS entry, which
+                // implies that it is a valid single-byte UTF-8 code point.
                 for &b in s {
                     match SCHEME_CHARS[b as usize] {
                         b':' => {