Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: custom Unicode normalization for Julia identifiers #19464

Merged
merged 20 commits into from
Jan 6, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ This section lists changes that do not have deprecation warnings.
* In macro calls with parentheses, e.g. `@m(a=1)`, assignments are now parsed as
`=` expressions, instead of as `kw` expressions. ([#7669])

* The characters `µ` (U+00B5, micro sign) and `ɛ` (U+025B, Latin small letter
open e) are now considered equivalent to the corresponding Greek letters in
identifiers. `\varepsilon` now tab-completes to U+03B5 (Greek small letter
epsilon) ([#19464]).

Library improvements
--------------------

Expand Down
2 changes: 1 addition & 1 deletion base/latex_symbols.jl
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ const latex_symbols = Dict(
"\\Elzopeno" => "ɔ",
"\\Elzrtld" => "ɖ",
"\\Elzschwa" => "ə",
"\\varepsilon" => "ɛ",
"\\varepsilon" => "ε",
"\\Elzpgamma" => "ɣ",
"\\Elzpbgam" => "ɤ",
"\\Elztrnh" => "ɥ",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
f33af304538c3afba3b1d0ebae8e4555
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
17a2df079e726a4ae1f10fcf48a7a771c2bcc93c7938f88148e1aa3b6cf9d250eb33cd7a9d8de54f29360e71c71e59b77996ba28dd894676888dc0453d67e9bb

This file was deleted.

This file was deleted.

4 changes: 2 additions & 2 deletions deps/utf8proc.version
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
UTF8PROC_BRANCH=v2.0.2
UTF8PROC_SHA1=e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b
UTF8PROC_BRANCH=v2.1
UTF8PROC_SHA1=40e605959eb5cb90b2587fa88e3b661558fbc55a
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update the checksums

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, fixed

7 changes: 7 additions & 0 deletions doc/src/manual/variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ ERROR: syntax: unexpected "="
...
```

Some Unicode characters are considered to be equivalent in identifiers.
Different ways of entering Unicode combining characters (e.g., accents)
are treated as equivalent (specifically, Julia identifiers are NFC-normalized).
The Unicode characters `ɛ` (U+025B: Latin small letter open e)
and `µ` (U+00B5: micro sign) are treated as equivalent to the corresponding
Greek letters, because the former are easily accessible via some input methods.

## Stylistic Conventions

While Julia imposes few restrictions on valid names, it has become useful to adopt the following
Expand Down
2 changes: 2 additions & 0 deletions src/flisp/flisp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2305,6 +2305,7 @@ static const builtinspec_t core_builtin_info[] = {

/* Subsystem initializers that take the flisp context. Each is declared
   here rather than defined in this file; comparehash_init and
   jl_charmap_init are invoked from lisp_init during context setup.
   NOTE(review): builtins_init's call site is not visible in this excerpt. */
extern void builtins_init(fl_context_t *fl_ctx);
extern void comparehash_init(fl_context_t *fl_ctx);
extern void jl_charmap_init(fl_context_t *fl_ctx);

static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
{
Expand Down Expand Up @@ -2337,6 +2338,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
fl_ctx->consflags = bitvector_new(fl_ctx->heapsize/sizeof(cons_t), 1);
fl_print_init(fl_ctx);
comparehash_init(fl_ctx);
jl_charmap_init(fl_ctx);
fl_ctx->N_STACK = 262144;
fl_ctx->Stack = (value_t*)malloc(fl_ctx->N_STACK*sizeof(value_t));
CHECK_ALIGN8(fl_ctx->Stack);
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ struct _fl_context_t {
fltype_t *builtintype;

htable_t equal_eq_hashtable;
htable_t jl_charmap;

value_t tablesym;
fltype_t *tabletype;
Expand Down
7 changes: 7 additions & 0 deletions src/flisp/julia_charmap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/* Custom identifier normalizations for Julia.
 *
 * Each row is a pair { source code point, canonical code point }: any
 * occurrence of the first value in an identifier is rewritten to the
 * second.  The table is reserved for characters that are both easy to
 * confuse visually and easy to type by accident.
 */
static const uint32_t charmap[][2] = {
    /* U+025B LATIN SMALL LETTER OPEN E  ->  U+03B5 GREEK SMALL LETTER EPSILON */
    { 0x025B, 0x03B5 },
    /* U+00B5 MICRO SIGN                 ->  U+03BC GREEK SMALL LETTER MU */
    { 0x00B5, 0x03BC },
};
39 changes: 36 additions & 3 deletions src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -152,22 +152,55 @@ value_t fl_julia_identifier_start_char(fl_context_t *fl_ctx, value_t *args, uint
return jl_id_start_char(wc) ? fl_ctx->T : fl_ctx->F;
}

// return NFC-normalized UTF8-encoded version of s
#include "julia_charmap.h"
// Hash-table specialization whose keys are code points stored directly in
// the pointer-sized key slot (no pointed-to object).
// Key equality is plain value comparison; the ctx argument is ignored.
#define _equal_wchar_(x, y, ctx) ((x) == (y))
// Hash a key by converting it to an integer, narrowing to 32 bits, and
// feeding it to inthash; the ctx argument is ignored.
#define _hash_wchar_(x, ctx) inthash((uint32_t) ((uintptr_t) (x)))
#include "htable.inc"
// Instantiate the wcharhash_* family with the hash/equality macros above;
// wcharhash_put_r and wcharhash_get_r are the members used in this file.
HTIMPL_R(wcharhash, _hash_wchar_, _equal_wchar_)

/* Build the per-context lookup table used for Julia's custom identifier
   normalization.

   Every {source, replacement} pair in `charmap` is inserted into
   fl_ctx->jl_charmap, keyed by the source code point, so that
   jl_charmap_map can look replacements up by hash.

   fl_ctx: the flisp context whose jl_charmap table is initialized. */
void jl_charmap_init(fl_context_t *fl_ctx)
{
    /* Number of rows, derived from the row type so it stays correct if the
       element type or row width of charmap ever changes. */
    size_t charmap_len = sizeof(charmap) / sizeof(charmap[0]);
    size_t i;
    htable_t *h = htable_new(&fl_ctx->jl_charmap, charmap_len);
    assert(sizeof(uint32_t) <= sizeof(void*));
    for (i = 0; i < charmap_len; ++i) {
        /* Store charmap in a hash table.  Typecasting codepoints
           directly to pointer keys works because pointers are at
           least 32 bits on all Julia-supported systems, and because
           we never map anything to U+0001 (since HT_NOTFOUND is (void*)1).
           All integer<->pointer conversions go through uintptr_t so the
           cast is between same-sized types, matching what is stored below. */
        assert((void*)((uintptr_t)charmap[i][1]) != HT_NOTFOUND);
        wcharhash_put_r(h, (void*)((uintptr_t)charmap[i][0]),
                        (void*)((uintptr_t)charmap[i][1]), (void*)fl_ctx);
    }
}
/* Per-code-point mapping callback for utf8proc_decompose_custom.

   c:       the code point to translate.
   fl_ctx_: opaque callback data; must be the fl_context_t* whose
            jl_charmap table is consulted.

   Returns the replacement recorded for c in that table, or c itself
   when the table has no entry for it. */
utf8proc_int32_t jl_charmap_map(utf8proc_int32_t c, void *fl_ctx_)
{
    fl_context_t *ctx = (fl_context_t *) fl_ctx_;
    /* Code points are stored directly as pointer-sized keys and values. */
    void *mapped = wcharhash_get_r(&ctx->jl_charmap, (void*)((uintptr_t)c), (void*) ctx);
    if (mapped == HT_NOTFOUND)
        return c;
    return (utf8proc_int32_t) ((uintptr_t) mapped);
}

// return NFC-normalized UTF8-encoded version of s, with
// additional custom normalizations defined by jl_charmap above.
static char *normalize(fl_context_t *fl_ctx, char *s)
{
// options equivalent to utf8proc_NFC:
const int options = UTF8PROC_NULLTERM|UTF8PROC_STABLE|UTF8PROC_COMPOSE;
ssize_t result;
size_t newlen;
result = utf8proc_decompose((uint8_t*) s, 0, NULL, 0, (utf8proc_option_t)options);
result = utf8proc_decompose_custom((uint8_t*) s, 0, NULL, 0, (utf8proc_option_t)options,
stevengj marked this conversation as resolved.
Show resolved Hide resolved
jl_charmap_map, (void*) fl_ctx);
if (result < 0) goto error;
newlen = result * sizeof(int32_t) + 1;
if (newlen > fl_ctx->jlbuflen) {
fl_ctx->jlbuflen = newlen * 2;
fl_ctx->jlbuf = realloc(fl_ctx->jlbuf, fl_ctx->jlbuflen);
if (!fl_ctx->jlbuf) lerror(fl_ctx, fl_ctx->OutOfMemoryError, "error allocating UTF8 buffer");
}
result = utf8proc_decompose((uint8_t*)s,0, (int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options);
result = utf8proc_decompose_custom((uint8_t*)s,0, (int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options,
jl_charmap_map, (void*) fl_ctx);
if (result < 0) goto error;
result = utf8proc_reencode((int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options);
if (result < 0) goto error;
Expand Down
13 changes: 13 additions & 0 deletions test/parse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,19 @@ let f = function (x; kw...)
@test g(1) == (1, 2)
end

# normalization of Unicode symbols (#19464)
# Each pair of checks below first confirms that two differently-encoded
# spellings parse to the identical symbol, then confirms that an identifier
# written in the alternate spelling resolves to the `let`-bound variable.
let ε=1, μ=2, x=3, î=4
# issue #5434 (mu vs micro):
# U+00B5 (micro sign) must parse to the same symbol as U+03BC (Greek mu)
@test parse("\u00b5") === parse("\u03bc")
@test µ == μ == 2
# NFC normalization of identifiers:
# "i" followed by a combining circumflex (U+0069 U+0302) must parse to the
# same symbol as the precomposed U+00EE
@test parse("\u0069\u0302") === parse("\u00ee")
@test î == 4
# latin vs greek ε (#14751)
# U+025B (Latin open e) must parse to the same symbol as U+03B5 (Greek epsilon)
@test parse("\u025B") === parse("\u03B5")
@test ɛ == ε == 1
end

# issue #8925
let
global const (c8925, d8925) = (3, 4)
Expand Down