diff --git a/NEWS.md b/NEWS.md index 5fd6406725e68..9edc8836abf79 100644 --- a/NEWS.md +++ b/NEWS.md @@ -84,6 +84,10 @@ This section lists changes that do not have deprecation warnings. * In macro calls with parentheses, e.g. `@m(a=1)`, assignments are now parsed as `=` expressions, instead of as `kw` expressions. ([#7669]) + * `µ` (U+00B5, micro sign) and `ɛ` (U+025B, Latin small letter open e) are now treated as equivalent to + the corresponding Greek letters (`μ` and `ε`) in identifiers. `\varepsilon` + now tab-completes to U+03B5 (Greek small letter epsilon) ([#19464]). + Library improvements -------------------- diff --git a/base/latex_symbols.jl b/base/latex_symbols.jl index ffa122464446e..2e2d2af8da657 100644 --- a/base/latex_symbols.jl +++ b/base/latex_symbols.jl @@ -257,7 +257,7 @@ const latex_symbols = Dict( "\\Elzopeno" => "ɔ", "\\Elzrtld" => "ɖ", "\\Elzschwa" => "ə", - "\\varepsilon" => "ɛ", + "\\varepsilon" => "ε", "\\Elzpgamma" => "ɣ", "\\Elzpbgam" => "ɤ", "\\Elztrnh" => "ɥ", diff --git a/deps/checksums/utf8proc-40e605959eb5cb90b2587fa88e3b661558fbc55a.tar.gz/md5 b/deps/checksums/utf8proc-40e605959eb5cb90b2587fa88e3b661558fbc55a.tar.gz/md5 new file mode 100644 index 0000000000000..0a257a0c03698 --- /dev/null +++ b/deps/checksums/utf8proc-40e605959eb5cb90b2587fa88e3b661558fbc55a.tar.gz/md5 @@ -0,0 +1 @@ +f33af304538c3afba3b1d0ebae8e4555 diff --git a/deps/checksums/utf8proc-40e605959eb5cb90b2587fa88e3b661558fbc55a.tar.gz/sha512 b/deps/checksums/utf8proc-40e605959eb5cb90b2587fa88e3b661558fbc55a.tar.gz/sha512 new file mode 100644 index 0000000000000..7fb13653ee1fa --- /dev/null +++ b/deps/checksums/utf8proc-40e605959eb5cb90b2587fa88e3b661558fbc55a.tar.gz/sha512 @@ -0,0 +1 @@ +17a2df079e726a4ae1f10fcf48a7a771c2bcc93c7938f88148e1aa3b6cf9d250eb33cd7a9d8de54f29360e71c71e59b77996ba28dd894676888dc0453d67e9bb diff --git a/deps/checksums/utf8proc-e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b.tar.gz/md5 b/deps/checksums/utf8proc-e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b.tar.gz/md5 deleted file mode 100644 index 
2b394ed2436f8..0000000000000 --- a/deps/checksums/utf8proc-e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b.tar.gz/md5 +++ /dev/null @@ -1 +0,0 @@ -c85e1275fb558d9bad5c3fa0b496e457 diff --git a/deps/checksums/utf8proc-e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b.tar.gz/sha512 b/deps/checksums/utf8proc-e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b.tar.gz/sha512 deleted file mode 100644 index 08976e711026b..0000000000000 --- a/deps/checksums/utf8proc-e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b.tar.gz/sha512 +++ /dev/null @@ -1 +0,0 @@ -e7dd0a575f94621c7b56eca28fced5934c3dd6675877842287d04ad74bacaf446d8e8c8cdce14963c819b09786037c0c46679ff2dc42818497bb1acbbb9751e6 diff --git a/deps/utf8proc.version b/deps/utf8proc.version index ab5f84a39687f..cf8cc7de8f10c 100644 --- a/deps/utf8proc.version +++ b/deps/utf8proc.version @@ -1,2 +1,2 @@ -UTF8PROC_BRANCH=v2.0.2 -UTF8PROC_SHA1=e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b +UTF8PROC_BRANCH=v2.1 +UTF8PROC_SHA1=40e605959eb5cb90b2587fa88e3b661558fbc55a diff --git a/doc/src/manual/variables.md b/doc/src/manual/variables.md index 087f4d98d5473..e834150771a86 100644 --- a/doc/src/manual/variables.md +++ b/doc/src/manual/variables.md @@ -108,6 +108,13 @@ ERROR: syntax: unexpected "=" ... ``` +Some Unicode characters are considered to be equivalent in identifiers. +Different ways of entering Unicode combining characters (e.g., accents) +are treated as equivalent (specifically, Julia identifiers are NFC-normalized). +The Unicode characters `ɛ` (U+025B: Latin small letter open e) +and `µ` (U+00B5: micro sign) are treated as equivalent to the corresponding +Greek letters, because the former are easily accessible via some input methods. 
+ ## Stylistic Conventions While Julia imposes few restrictions on valid names, it has become useful to adopt the following diff --git a/src/flisp/flisp.c b/src/flisp/flisp.c index 6b485d0362428..bc3b2e93137a4 100644 --- a/src/flisp/flisp.c +++ b/src/flisp/flisp.c @@ -2305,6 +2305,7 @@ static const builtinspec_t core_builtin_info[] = { extern void builtins_init(fl_context_t *fl_ctx); extern void comparehash_init(fl_context_t *fl_ctx); +extern void jl_charmap_init(fl_context_t *fl_ctx); static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize) { @@ -2337,6 +2338,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize) fl_ctx->consflags = bitvector_new(fl_ctx->heapsize/sizeof(cons_t), 1); fl_print_init(fl_ctx); comparehash_init(fl_ctx); + jl_charmap_init(fl_ctx); fl_ctx->N_STACK = 262144; fl_ctx->Stack = (value_t*)malloc(fl_ctx->N_STACK*sizeof(value_t)); CHECK_ALIGN8(fl_ctx->Stack); diff --git a/src/flisp/flisp.h b/src/flisp/flisp.h index fea46e570ea5e..c661d7a319205 100644 --- a/src/flisp/flisp.h +++ b/src/flisp/flisp.h @@ -389,6 +389,7 @@ struct _fl_context_t { fltype_t *builtintype; htable_t equal_eq_hashtable; + htable_t jl_charmap; value_t tablesym; fltype_t *tabletype; diff --git a/src/flisp/julia_charmap.h b/src/flisp/julia_charmap.h new file mode 100644 index 0000000000000..bed88a9ace4cd --- /dev/null +++ b/src/flisp/julia_charmap.h @@ -0,0 +1,7 @@ +/* Array of {original codepoint, replacement codepoint} normalizations + to perform on Julia identifiers, to canonicalize characters that + are both easily confused and easily inputted by accident. 
*/ +static const uint32_t charmap[][2] = { + { 0x025B, 0x03B5 }, // latin small letter open e -> greek small letter epsilon + { 0x00B5, 0x03BC }, // micro sign -> greek small letter mu +}; diff --git a/src/flisp/julia_extensions.c b/src/flisp/julia_extensions.c index f43fbd7479ec2..32aa5cab31d80 100644 --- a/src/flisp/julia_extensions.c +++ b/src/flisp/julia_extensions.c @@ -152,14 +152,46 @@ value_t fl_julia_identifier_start_char(fl_context_t *fl_ctx, value_t *args, uint return jl_id_start_char(wc) ? fl_ctx->T : fl_ctx->F; } -// return NFC-normalized UTF8-encoded version of s +#include "julia_charmap.h" +#define _equal_wchar_(x, y, ctx) ((x) == (y)) +#define _hash_wchar_(x, ctx) inthash((uint32_t) ((uintptr_t) (x))) +#include "htable.inc" +HTIMPL_R(wcharhash, _hash_wchar_, _equal_wchar_) +/* Fill fl_ctx->jl_charmap with the {original, replacement} codepoint pairs listed in charmap. */ +void jl_charmap_init(fl_context_t *fl_ctx) +{ + size_t charmap_len = sizeof(charmap) / sizeof(charmap[0]); + size_t i; + htable_t *h = htable_new(&fl_ctx->jl_charmap, charmap_len); + assert(sizeof(uint32_t) <= sizeof(void*)); + for (i = 0; i < charmap_len; ++i) { + /* Store charmap in a hash table. Typecasting codepoints + directly to pointer keys works because pointers are at + least 32 bits on all Julia-supported systems, and because + we never map anything to U+0001 (since HT_NOTFOUND is (void*)1). */ + assert((void*)((uintptr_t)charmap[i][1]) != HT_NOTFOUND); + wcharhash_put_r(h, (void*)((uintptr_t)charmap[i][0]), + (void*)((uintptr_t)charmap[i][1]), (void*)fl_ctx); + } +} +utf8proc_int32_t jl_charmap_map(utf8proc_int32_t c, void *fl_ctx_) +{ /* fl_ctx_ is the fl_context_t* whose jl_charmap is consulted; returns the replacement for c, or c itself when c has no entry */ + fl_context_t *fl_ctx = (fl_context_t *) fl_ctx_; + htable_t *h = &fl_ctx->jl_charmap; + void *v = wcharhash_get_r(h, (void*)((uintptr_t)c), (void*) fl_ctx); + return v == HT_NOTFOUND ? c : (utf8proc_int32_t) ((uintptr_t) v); +} + +// return NFC-normalized UTF8-encoded version of s, with +// additional custom normalizations defined by jl_charmap above. 
static char *normalize(fl_context_t *fl_ctx, char *s) { // options equivalent to utf8proc_NFC: const int options = UTF8PROC_NULLTERM|UTF8PROC_STABLE|UTF8PROC_COMPOSE; ssize_t result; size_t newlen; - result = utf8proc_decompose((uint8_t*) s, 0, NULL, 0, (utf8proc_option_t)options); + result = utf8proc_decompose_custom((uint8_t*) s, 0, NULL, 0, (utf8proc_option_t)options, + jl_charmap_map, (void*) fl_ctx); if (result < 0) goto error; newlen = result * sizeof(int32_t) + 1; if (newlen > fl_ctx->jlbuflen) { @@ -167,7 +199,8 @@ static char *normalize(fl_context_t *fl_ctx, char *s) fl_ctx->jlbuf = realloc(fl_ctx->jlbuf, fl_ctx->jlbuflen); if (!fl_ctx->jlbuf) lerror(fl_ctx, fl_ctx->OutOfMemoryError, "error allocating UTF8 buffer"); } - result = utf8proc_decompose((uint8_t*)s,0, (int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options); + result = utf8proc_decompose_custom((uint8_t*)s,0, (int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options, + jl_charmap_map, (void*) fl_ctx); if (result < 0) goto error; result = utf8proc_reencode((int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options); if (result < 0) goto error; diff --git a/test/parse.jl b/test/parse.jl index a73eb5de957c2..eb3ac9f351849 100644 --- a/test/parse.jl +++ b/test/parse.jl @@ -888,6 +888,19 @@ let f = function (x; kw...) @test g(1) == (1, 2) end +# normalization of Unicode symbols (#19464) +let ε=1, μ=2, x=3, î=4 + # issue #5434 (mu vs micro): + @test parse("\u00b5") === parse("\u03bc") + @test µ == μ == 2 + # NFC normalization of identifiers: + @test parse("\u0069\u0302") === parse("\u00ee") + @test î == 4 + # latin vs greek ε (#14751) + @test parse("\u025B") === parse("\u03B5") + @test ɛ == ε == 1 +end + # issue #8925 let global const (c8925, d8925) = (3, 4)