Skip to content

Commit

Permalink
implement custom Julia Unicode normalization for confusable character…
Browse files Browse the repository at this point in the history
…s in identifiers
  • Loading branch information
stevengj committed Nov 30, 2016
1 parent 4a805b3 commit c477dcf
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 5 deletions.
4 changes: 2 additions & 2 deletions deps/utf8proc.version
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
UTF8PROC_BRANCH=v2.0.2
UTF8PROC_SHA1=e3a5ed7b8bb5d0c6bb313d3e1f4d072c04113c4b
UTF8PROC_BRANCH=master
UTF8PROC_SHA1=e46d213241b254144e5109611f228023219efa84
2 changes: 2 additions & 0 deletions src/flisp/flisp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2305,6 +2305,7 @@ static const builtinspec_t core_builtin_info[] = {

extern void builtins_init(fl_context_t *fl_ctx);
extern void comparehash_init(fl_context_t *fl_ctx);
extern void jl_charmap_init(fl_context_t *fl_ctx);

static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
{
Expand Down Expand Up @@ -2337,6 +2338,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
fl_ctx->consflags = bitvector_new(fl_ctx->heapsize/sizeof(cons_t), 1);
fl_print_init(fl_ctx);
comparehash_init(fl_ctx);
jl_charmap_init(fl_ctx);
fl_ctx->N_STACK = 262144;
fl_ctx->Stack = (value_t*)malloc(fl_ctx->N_STACK*sizeof(value_t));
CHECK_ALIGN8(fl_ctx->Stack);
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ struct _fl_context_t {
fltype_t *builtintype;

htable_t equal_eq_hashtable;
htable_t jl_charmap;

value_t tablesym;
fltype_t *tabletype;
Expand Down
48 changes: 45 additions & 3 deletions src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -152,22 +152,64 @@ value_t fl_julia_identifier_start_char(fl_context_t *fl_ctx, value_t *args, uint
return jl_id_start_char(wc) ? fl_ctx->T : fl_ctx->F;
}

// return NFC-normalized UTF8-encoded version of s
/* Table of {original codepoint, replacement codepoint} pairs applied to
   Julia identifiers during normalization.  Its purpose is to canonicalize
   characters that are both easily confused with one another and easily
   typed by accident.  The table is loaded into a hash table (see
   jl_charmap_init) in the expectation that it may grow large in the
   future (e.g. if a subset of NFKC is implemented).
   No entry may map to U+0001: jl_charmap_init asserts that a replacement
   is never equal to HT_NOTFOUND. */
static const uint32_t charmap[][2] = {
{ 0x025B, 0x03B5 }, // U+025B latin small letter open e -> U+03B5 greek small letter epsilon
{ 0x00B5, 0x03BC }, // U+00B5 micro sign -> U+03BC greek small letter mu
};

// Key equality for the codepoint hash table: keys are codepoints stored
// directly in pointer-sized slots, so they are compared by value. The
// context argument is unused.
#define _equal_wchar_(x, y, ctx) ((x) == (y))
// Key hash: narrow the pointer-sized key to 32 bits and feed it to inthash.
// The context argument is unused.
#define _hash_wchar_(x, ctx) inthash((uint32_t) ((uintptr_t) (x)))
#include "htable.inc"
// NOTE(review): presumably instantiates the wcharhash_*_r accessors
// (wcharhash_put_r / wcharhash_get_r are called in this file) using the
// hash and equality macros above -- confirm against htable.inc.
HTIMPL_R(wcharhash, _hash_wchar_, _equal_wchar_)

/* Build fl_ctx->jl_charmap, the hash table used by jl_charmap_map, from
   the static `charmap` array of {original, replacement} codepoint pairs.

   fl_ctx: the flisp context whose jl_charmap table is initialized; it is
           also passed through as the context argument of wcharhash_put_r. */
void jl_charmap_init(fl_context_t *fl_ctx)
{
    size_t charmap_len = sizeof(charmap) / sizeof(charmap[0]);
    size_t i;
    htable_t *h = htable_new(&fl_ctx->jl_charmap, charmap_len);
    /* Codepoints are stored directly in the table's void* key/value
       slots, so a pointer must be at least as wide as a uint32_t. */
    assert(sizeof(void*) >= sizeof(uint32_t));
    for (i = 0; i < charmap_len; ++i) {
        /* Store charmap in a hash table.  Typecasting codepoints
           directly to pointer keys works because pointers are at
           least 32 bits on all Julia-supported systems, and because
           we never map anything to U+0001 (since HT_NOTFOUND is (void*)1). */
        assert((void*)((uintptr_t)charmap[i][1]) != HT_NOTFOUND);
        wcharhash_put_r(h, (void*)((uintptr_t)charmap[i][0]),
                        (void*)((uintptr_t)charmap[i][1]), (void*)fl_ctx);
    }
}
/* Custom character-mapping callback handed to utf8proc_decompose_custom.
   Looks up codepoint `c` in the context's jl_charmap table and returns the
   replacement codepoint if one is registered, or `c` unchanged otherwise.

   c:       codepoint to map.
   fl_ctx_: opaque pointer to the fl_context_t that owns the table. */
utf8proc_int32_t jl_charmap_map(utf8proc_int32_t c, void *fl_ctx_)
{
    fl_context_t *ctx = (fl_context_t *) fl_ctx_;
    /* Codepoints are used directly as pointer-sized keys and values. */
    void *key = (void*)((uintptr_t)c);
    void *hit = wcharhash_get_r(&ctx->jl_charmap, key, (void*) ctx);
    if (hit == HT_NOTFOUND)
        return c;
    return (utf8proc_int32_t) ((uintptr_t) hit);
}

// return NFC-normalized UTF8-encoded version of s, with
// additional custom normalizations defined by jl_charmap above.
static char *normalize(fl_context_t *fl_ctx, char *s)
{
// options equivalent to utf8proc_NFC:
const int options = UTF8PROC_NULLTERM|UTF8PROC_STABLE|UTF8PROC_COMPOSE;
ssize_t result;
size_t newlen;
result = utf8proc_decompose((uint8_t*) s, 0, NULL, 0, (utf8proc_option_t)options);
result = utf8proc_decompose_custom((uint8_t*) s, 0, NULL, 0, (utf8proc_option_t)options,
jl_charmap_map, (void*) fl_ctx);
if (result < 0) goto error;
newlen = result * sizeof(int32_t) + 1;
if (newlen > fl_ctx->jlbuflen) {
fl_ctx->jlbuflen = newlen * 2;
fl_ctx->jlbuf = realloc(fl_ctx->jlbuf, fl_ctx->jlbuflen);
if (!fl_ctx->jlbuf) lerror(fl_ctx, fl_ctx->OutOfMemoryError, "error allocating UTF8 buffer");
}
result = utf8proc_decompose((uint8_t*)s,0, (int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options);
result = utf8proc_decompose_custom((uint8_t*)s,0, (int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options,
jl_charmap_map, (void*) fl_ctx);
if (result < 0) goto error;
result = utf8proc_reencode((int32_t*)fl_ctx->jlbuf,result, (utf8proc_option_t)options);
if (result < 0) goto error;
Expand Down

0 comments on commit c477dcf

Please sign in to comment.