-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Tweak diagnostics with invalid UTF-8 so they can pass over the wire (#237)

A correct provider should only ever return valid UTF-8 strings as the diagnostic Summary or Detail, but since diagnostics tend to describe unexpected situations and are often derived from errors in downstream libraries, it's possible that a provider might incorrectly return garbage as part of a diagnostic message.

The protobuf serializer rejects non-UTF-8 strings with a generic message that is unhelpful to end-users:

    string field contains invalid UTF-8

Here we make the compromise that it's better to make a best effort to return a diagnostic that is probably only partially invalid, so that the end user has a chance of still getting some clue about what problem occurred.

The new helper functions here achieve that by replacing any invalid bytes with a correctly-encoded version of the Unicode Replacement Character, which will then allow the string to pass over the wire protocol successfully and hopefully end up as an obviously-invalid character in the CLI output or web UI that's rendering the diagnostics.

This does introduce some slight additional overhead when returning responses, but it should be immaterial for any response that doesn't include any diagnostics, relatively minor for responses that include valid diagnostics, and only markedly expensive for a diagnostic string with invalid bytes that will therefore need to be re-encoded on a rune-by-rune basis.
- Loading branch information
1 parent
434a0b0
commit 0488e08
Showing
5 changed files
with
237 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
```release-note:bug | ||
tfprotov5: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case. | ||
``` | ||
|
||
```release-note:bug | ||
tfprotov6: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case. | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package toproto | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestForceValidUTF8(t *testing.T) { | ||
tests := []struct { | ||
Input string | ||
Want string | ||
}{ | ||
{ | ||
"hello", | ||
"hello", | ||
}, | ||
{ | ||
"こんにちは", | ||
"こんにちは", | ||
}, | ||
{ | ||
"baffle", // NOTE: "ffl" is a single-character ligature | ||
"baffle", // ligature is preserved exactly | ||
}, | ||
{ | ||
"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics | ||
"wé́́é́́é́́!", // diacritics are preserved exactly | ||
}, | ||
{ | ||
"😸😾", // Astral-plane characters | ||
"😸😾", // preserved exactly | ||
}, | ||
{ | ||
"\xff\xff", // neither byte is valid UTF-8 | ||
"\ufffd\ufffd", // both are replaced by replacement character | ||
}, | ||
{ | ||
"\xff\xff\xff\xff\xff", // more than three invalid bytes | ||
"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation | ||
}, | ||
{ | ||
"t\xffe\xffst", // invalid bytes interleaved with other content | ||
"t\ufffde\ufffdst", // the valid content is preserved | ||
}, | ||
{ | ||
"\xffこんにちは\xffこんにちは", // invalid bytes interacting with multibyte sequences | ||
"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved | ||
}, | ||
} | ||
|
||
for _, test := range tests { | ||
t.Run(test.Input, func(t *testing.T) { | ||
got := forceValidUTF8(test.Input) | ||
if got != test.Want { | ||
t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want) | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package toproto | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestForceValidUTF8(t *testing.T) { | ||
tests := []struct { | ||
Input string | ||
Want string | ||
}{ | ||
{ | ||
"hello", | ||
"hello", | ||
}, | ||
{ | ||
"こんにちは", | ||
"こんにちは", | ||
}, | ||
{ | ||
"baffle", // NOTE: "ffl" is a single-character ligature | ||
"baffle", // ligature is preserved exactly | ||
}, | ||
{ | ||
"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics | ||
"wé́́é́́é́́!", // diacritics are preserved exactly | ||
}, | ||
{ | ||
"😸😾", // Astral-plane characters | ||
"😸😾", // preserved exactly | ||
}, | ||
{ | ||
"\xff\xff", // neither byte is valid UTF-8 | ||
"\ufffd\ufffd", // both are replaced by replacement character | ||
}, | ||
{ | ||
"\xff\xff\xff\xff\xff", // more than three invalid bytes | ||
"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation | ||
}, | ||
{ | ||
"t\xffe\xffst", // invalid bytes interleaved with other content | ||
"t\ufffde\ufffdst", // the valid content is preserved | ||
}, | ||
{ | ||
"\xffこんにちは\xffこんにちは", // invalid bytes interacting with multibyte sequences | ||
"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved | ||
}, | ||
} | ||
|
||
for _, test := range tests { | ||
t.Run(test.Input, func(t *testing.T) { | ||
got := forceValidUTF8(test.Input) | ||
if got != test.Want { | ||
t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want) | ||
} | ||
}) | ||
} | ||
} |