Skip to content

Commit

Permalink
WIP: custom Unicode normalization for Julia identifiers (#19464)
Browse files Browse the repository at this point in the history
* implement custom Julia Unicode normalization for confusable characters in identifiers

* whoops

* separated julia_charmap into its own file to make it easier to update

* normalize fullwidth -> halfwidth in identifiers, ala NFKC

* make \varepsilon complete to ε (u+03b5), fixes #14751

* docs for canonicalization

* normalize fullwidth characters during parsing (fixes #5903)

* typo

* tests

* be more cautious about normalizing chars when parsing, so as not to normalize string literals

* test fullwidth numeric literals and parens

* typo/clarification

* update to utf8proc-2.1

* checksum for utf8proc 2.1

* moved symbol-normalization test from test/core to test/parse

* Revert "be more cautious about normalizing chars when parsing, so as not to normalize string literals"

This reverts commit 81033fa.

* Revert "normalize fullwidth characters during parsing (fixes #5903)"

This reverts commit cf61972.

* remove more references to fullwidth normalization

* rm fullwidth identifier normalization
  • Loading branch information
stevengj authored and tkelman committed Jan 6, 2017
1 parent 5407a0d commit 62c423b
Show file tree
Hide file tree
Showing 13 changed files with 75 additions and 8 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ This section lists changes that do not have deprecation warnings.
* In macro calls with parentheses, e.g. `@m(a=1)`, assignments are now parsed as
`=` expressions, instead of as `kw` expressions. ([#7669])

* The characters `µ` (U+00B5, micro sign) and `ɛ` (U+025B, Latin small letter open e)
are now considered equivalent to the corresponding Greek letters `μ` (U+03BC) and `ε` (U+03B5)
in identifiers. `\varepsilon` now tab-completes to U+03B5 (Greek small letter epsilon) ([#19464]).

Library improvements
--------------------

Expand Down
2 changes: 1 addition & 1 deletion base/latex_symbols.jl
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ const latex_symbols = Dict(
"\\Elzopeno" => "ɔ",
"\\Elzrtld" => "ɖ",
"\\Elzschwa" => "ə",
"\\varepsilon" => "ɛ",
"\\varepsilon" => "ε",
"\\Elzpgamma" => "ɣ",
"\\Elzpbgam" => "ɤ",
"\\Elztrnh" => "ɥ",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
f33af304538c3afba3b1d0ebae8e4555
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
17a2df079e726a4ae1f10fcf48a7a771c2bcc93c7938f88148e1aa3b6cf9d250eb33cd7a9d8de54f29360e71c71e59b77996ba28dd894676888dc0453d67e9bb

This file was deleted.

This file was deleted.

4 changes: 2 additions & 2 deletions deps/utf8proc.version
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
UTF8PROC_BRANCH=v2.0.2
UTF8PROC_SHA1=e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b
UTF8PROC_BRANCH=v2.1
UTF8PROC_SHA1=40e605959eb5cb90b2587fa88e3b661558fbc55a
7 changes: 7 additions & 0 deletions doc/src/manual/variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,13 @@ ERROR: syntax: unexpected "="
...
```

Some Unicode characters are considered to be equivalent in identifiers.
Different ways of entering Unicode combining characters (e.g., accents)
are treated as equivalent (specifically, Julia identifiers are NFC-normalized).
The Unicode characters `ɛ` (U+025B: Latin small letter open e)
and `µ` (U+00B5: micro sign) are treated as equivalent to the corresponding
Greek letters `ε` (U+03B5) and `μ` (U+03BC), because the former are easily accessible via some input methods.

## Stylistic Conventions

While Julia imposes few restrictions on valid names, it has become useful to adopt the following
Expand Down
2 changes: 2 additions & 0 deletions src/flisp/flisp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2305,6 +2305,7 @@ static const builtinspec_t core_builtin_info[] = {

extern void builtins_init(fl_context_t *fl_ctx);
extern void comparehash_init(fl_context_t *fl_ctx);
extern void jl_charmap_init(fl_context_t *fl_ctx);

static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
{
Expand Down Expand Up @@ -2337,6 +2338,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
fl_ctx->consflags = bitvector_new(fl_ctx->heapsize/sizeof(cons_t), 1);
fl_print_init(fl_ctx);
comparehash_init(fl_ctx);
jl_charmap_init(fl_ctx);
fl_ctx->N_STACK = 262144;
fl_ctx->Stack = (value_t*)malloc(fl_ctx->N_STACK*sizeof(value_t));
CHECK_ALIGN8(fl_ctx->Stack);
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ struct _fl_context_t {
fltype_t *builtintype;

htable_t equal_eq_hashtable;
htable_t jl_charmap;

value_t tablesym;
fltype_t *tabletype;
Expand Down
7 changes: 7 additions & 0 deletions src/flisp/julia_charmap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/* Custom identifier normalizations for Julia.  Each row is a pair
   {source codepoint, canonical codepoint}; the entries cover characters
   that are easy to confuse with one another and easy to type by mistake. */
static const uint32_t charmap[][2] = {
    /* U+025B latin small letter open e -> U+03B5 greek small letter epsilon */
    { 0x025b, 0x03b5 },
    /* U+00B5 micro sign -> U+03BC greek small letter mu */
    { 0x00b5, 0x03bc },
};
39 changes: 36 additions & 3 deletions src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -152,22 +152,55 @@ value_t fl_julia_identifier_start_char(fl_context_t *fl_ctx, value_t *args, uint
return jl_id_start_char(wc) ? fl_ctx->T : fl_ctx->F;
}

// return NFC-normalized UTF8-encoded version of s
// Brings in `charmap`, the table of {original, replacement} codepoint pairs.
#include "julia_charmap.h"
// Key equality for the codepoint table: keys are compared by value with ==.
// The `ctx` argument is accepted but not used.
#define _equal_wchar_(x, y, ctx) ((x) == (y))
// Key hash for the codepoint table: the key is narrowed to 32 bits and
// passed to inthash.  The `ctx` argument is accepted but not used.
#define _hash_wchar_(x, ctx) inthash((uint32_t) ((uintptr_t) (x)))
#include "htable.inc"
// Instantiate the `wcharhash` hash-table implementation with the hash and
// equality macros above (presumably this is what provides wcharhash_put_r
// and wcharhash_get_r -- confirm against htable.inc).
HTIMPL_R(wcharhash, _hash_wchar_, _equal_wchar_)

// Fill fl_ctx->jl_charmap with the {original -> replacement} codepoint
// pairs listed in `charmap`.
//
// fl_ctx: the flisp context whose jl_charmap table is (re)created here.
void jl_charmap_init(fl_context_t *fl_ctx)
{
    // Derive the row count from the array itself so it stays correct if the
    // element type or row shape of `charmap` ever changes.
    size_t charmap_len = sizeof(charmap) / sizeof(charmap[0]);
    size_t i;
    htable_t *h = htable_new(&fl_ctx->jl_charmap, charmap_len);
    assert(sizeof(uint32_t) <= sizeof(void*));
    for (i = 0; i < charmap_len; ++i) {
        /* Store charmap in a hash table.  Typecasting codepoints
           directly to pointer keys works because pointers are at
           least 32 bits on all Julia-supported systems, and because
           we never map anything to U+0001 (since HT_NOTFOUND is (void*)1). */
        // Widen through uintptr_t before converting to a pointer, exactly as
        // the put call below does, to avoid an int-to-pointer cast of a
        // different size on 64-bit targets.
        assert((void*)((uintptr_t)charmap[i][1]) != HT_NOTFOUND);
        wcharhash_put_r(h, (void*)((uintptr_t)charmap[i][0]),
                        (void*)((uintptr_t)charmap[i][1]), (void*)fl_ctx);
    }
}
// Look up codepoint `c` in the context's jl_charmap table.
//
// c:       the codepoint to translate.
// fl_ctx_: untyped pointer to the fl_context_t that owns the table.
//
// Returns the replacement codepoint stored for `c`, or `c` itself when the
// table has no entry for it.
utf8proc_int32_t jl_charmap_map(utf8proc_int32_t c, void *fl_ctx_)
{
    fl_context_t *ctx = (fl_context_t *) fl_ctx_;
    // Codepoints are stored directly in the pointer-sized key/value slots.
    void *key = (void*)((uintptr_t)c);
    void *hit = wcharhash_get_r(&ctx->jl_charmap, key, (void*) ctx);
    if (hit != HT_NOTFOUND) {
        return (utf8proc_int32_t) ((uintptr_t) hit);
    }
    return c;
}

// return NFC-normalized UTF8-encoded version of s, with
// additional custom normalizations defined by jl_charmap above.
static char *normalize(fl_context_t *fl_ctx, char *s)
{
// options equivalent to utf8proc_NFC:
const int options = UTF8PROC_NULLTERM|UTF8PROC_STABLE|UTF8PROC_COMPOSE;
ssize_t result;
size_t newlen;
result = utf8proc_decompose((uint8_t*) s, 0, NULL, 0, (utf8proc_option_t)options);
result = utf8proc_decompose_custom((uint8_t*) s, 0, NULL, 0, (utf8proc_option_t)options,
jl_charmap_map, (void*) fl_ctx);
if (result < 0) goto error;
newlen = result * sizeof(int32_t) + 1;
if (newlen > fl_ctx->jlbuflen) {
fl_ctx->jlbuflen = newlen * 2;
fl_ctx->jlbuf = realloc(fl_ctx->jlbuf, fl_ctx->jlbuflen);
if (!fl_ctx->jlbuf) lerror(fl_ctx, fl_ctx->OutOfMemoryError, "error allocating UTF8 buffer");
}
result = utf8proc_decompose((uint8_t*)s,0, (int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options);
result = utf8proc_decompose_custom((uint8_t*)s,0, (int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options,
jl_charmap_map, (void*) fl_ctx);
if (result < 0) goto error;
result = utf8proc_reencode((int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options);
if (result < 0) goto error;
Expand Down
13 changes: 13 additions & 0 deletions test/parse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,19 @@ let f = function (x; kw...)
@test g(1) == (1, 2)
end

# normalization of Unicode symbols (#19464)
let ε=1, μ=2, x=3, î=4
    # issue #5434 (mu vs micro):
    @test parse("\u00b5") === parse("\u03bc")
    @test µ == μ == 2
    # NFC normalization of identifiers:
    @test parse("\u0069\u0302") === parse("\u00ee")
    # NOTE(review): the identifier on the next line was lost in extraction and
    # has been reconstructed; it is presumably meant to be spelled in the
    # decomposed form (U+0069 U+0302) so that it exercises NFC normalization
    # against the `î` bound in the `let` header -- confirm against upstream.
    @test î == 4
    # latin vs greek ε (#14751)
    @test parse("\u025B") === parse("\u03B5")
    @test ɛ == ε == 1
end

# issue #8925
let
global const (c8925, d8925) = (3, 4)
Expand Down

0 comments on commit 62c423b

Please sign in to comment.