diff --git a/CHANGELOG.md b/CHANGELOG.md index fa46a5cf..15e59feb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - Add new `\e` shorthand for the escape character. - Add \x00 notation to basic strings. - Seconds in Date-Time and Time values are now optional. +- Allow non-English scripts in unquoted (bare) keys. ## 1.0.0 / 2021-01-11 diff --git a/toml.abnf b/toml.abnf index a902c19b..01eabd6b 100644 --- a/toml.abnf +++ b/toml.abnf @@ -46,19 +46,32 @@ comment = comment-start-symbol *non-eol ;; Key-Value pairs keyval = key keyval-sep val - key = simple-key / dotted-key +val = string / boolean / array / inline-table / date-time / float / integer + simple-key = quoted-key / unquoted-key -unquoted-key = 1*( ALPHA / DIGIT / %x2D / %x5F ) ; A-Z / a-z / 0-9 / - / _ +;; Unquoted key + +unquoted-key = 1*unquoted-key-char +unquoted-key-char = ALPHA / DIGIT / %x2D / %x5F ; a-z A-Z 0-9 - _ +unquoted-key-char =/ %xB2 / %xB3 / %xB9 / %xBC-BE ; superscript digits, fractions +unquoted-key-char =/ %xC0-D6 / %xD8-F6 / %xF8-37D ; non-symbol chars in Latin block +unquoted-key-char =/ %x37F-1FFF ; exclude GREEK QUESTION MARK (U+037E), which is basically a semicolon +unquoted-key-char =/ %x200C-200D / %x203F-2040 ; from General Punctuation Block, include the two tie symbols and ZWNJ, ZWJ +unquoted-key-char =/ %x2070-218F / %x2460-24FF ; include super-/subscripts, letterlike/numberlike forms, enclosed alphanumerics +unquoted-key-char =/ %x2C00-2FEF / %x3001-D7FF ; skip arrows, math, box drawing etc., skip 2FF0-3000 ideographic up/down markers and spaces +unquoted-key-char =/ %xF900-FDCF / %xFDF0-FFFD ; skip D800-DFFF surrogate block, E000-F8FF Private Use area, FDD0-FDEF intended for process-internal use (Unicode) +unquoted-key-char =/ %x10000-EFFFF ; all chars outside BMP range, excluding Private Use planes (F0000-10FFFF) + +;; Quoted and dotted key + quoted-key = basic-string / literal-string dotted-key = simple-key 1*( dot-sep simple-key ) dot-sep = ws %x2E ws ; . 
Period keyval-sep = ws %x3D ws ; = -val = string / boolean / array / inline-table / date-time / float / integer - ;; String string = ml-basic-string / basic-string / ml-literal-string / literal-string diff --git a/toml.md b/toml.md index e3e16776..df076598 100644 --- a/toml.md +++ b/toml.md @@ -97,27 +97,37 @@ first = "Tom" last = "Preston-Werner" # INVALID A key may be either bare, quoted, or dotted. -**Bare keys** may only contain ASCII letters, ASCII digits, underscores, and -dashes (`A-Za-z0-9_-`). Note that bare keys are allowed to be composed of only -ASCII digits, e.g. `1234`, but are always interpreted as strings. +**Bare keys** may contain any letter-like or number-like Unicode character from +any Unicode script, as well as ASCII digits, dashes and underscores. +Punctuation, spaces, arrows, box drawing and private use characters are not +allowed. Note that bare keys are allowed to be composed of only ASCII digits, +e.g. `1234`, but are always interpreted as strings. + +ℹ️ The exact ranges of allowed code points can be found in the +[ABNF grammar file][abnf]. ```toml key = "value" bare_key = "value" bare-key = "value" 1234 = "value" +Fuß = "value" +😂 = "value" +汉语大字典 = "value" +辭源 = "value" +பெண்டிரேம் = "value" ``` **Quoted keys** follow the exact same rules as either basic strings or literal -strings and allow you to use a much broader set of key names. Best practice is -to use bare keys except when absolutely necessary. +strings and allow you to use any Unicode character in a key name, including +spaces. Best practice is to use bare keys except when absolutely necessary. 
```toml "127.0.0.1" = "value" "character encoding" = "value" -"ʎǝʞ" = "value" -'key2' = "value" 'quoted "value"' = "value" +"╠═╣" = "value" +"⋰∫∬∭⋱" = "value" ``` A bare key must be non-empty, but an empty quoted key is allowed (though @@ -138,6 +148,7 @@ name = "Orange" physical.color = "orange" physical.shape = "round" site."google.com" = true +பெண்.டிரேம் = "we are women" ``` In JSON land, that would give you the following structure: @@ -151,6 +162,9 @@ In JSON land, that would give you the following structure: }, "site": { "google.com": true + }, + "பெண்": { + "டிரேம்": "we are women" } } ```