Skip to content

Commit

Permalink
net: use icu's punycode implementation
Browse files Browse the repository at this point in the history
ICU has a punycode implementation built in. Use it instead of the
javascript implementation because it's much faster.

PR-URL: #7355
Reviewed-By: Trevor Norris <trev.norris@gmail.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
  • Loading branch information
jasnell authored and Fishrock123 committed Jul 5, 2016
1 parent 12b1993 commit d0e2492
Show file tree
Hide file tree
Showing 6 changed files with 291 additions and 12 deletions.
75 changes: 75 additions & 0 deletions benchmark/net/punycode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
'use strict';

const common = require('../common.js');
const icu = process.binding('icu');
const punycode = require('punycode');

const bench = common.createBenchmark(main, {
method: ['punycode', 'icu'],
n: [1024],
val: [
'افغانستا.icom.museum',
'الجزائر.icom.museum',
'österreich.icom.museum',
'বাংলাদেশ.icom.museum',
'беларусь.icom.museum',
'belgië.icom.museum',
'българия.icom.museum',
'تشادر.icom.museum',
'中国.icom.museum',
'القمر.icom.museum',
'κυπρος.icom.museum',
'českárepublika.icom.museum',
'مصر.icom.museum',
'ελλάδα.icom.museum',
'magyarország.icom.museum',
'ísland.icom.museum',
'भारत.icom.museum',
'ايران.icom.museum',
'éire.icom.museum',
'איקו״ם.ישראל.museum',
'日本.icom.museum',
'الأردن.icom.museum'
]
});

function usingPunycode(val) {
punycode.toUnicode(punycode.toASCII(val));
}

function usingICU(val) {
icu.toUnicode(icu.toASCII(val));
}

function runPunycode(n, val) {
common.v8ForceOptimization(usingPunycode, val);
var i = 0;
bench.start();
for (; i < n; i++)
usingPunycode(val);
bench.end(n);
}

function runICU(n, val) {
common.v8ForceOptimization(usingICU, val);
var i = 0;
bench.start();
for (; i < n; i++)
usingICU(val);
bench.end(n);
}

function main(conf) {
const n = +conf.n;
const val = conf.val;
switch (conf.method) {
case 'punycode':
runPunycode(n, val);
break;
case 'icu':
runICU(n, val);
break;
default:
throw new Error('Unexpected method');
}
}
12 changes: 10 additions & 2 deletions lib/url.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
'use strict';

const punycode = require('punycode');
function importPunycode() {
try {
return process.binding('icu');
} catch (e) {
return require('punycode');
}
}

const { toASCII } = importPunycode();

exports.parse = urlParse;
exports.resolve = urlResolve;
Expand Down Expand Up @@ -309,7 +317,7 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) {
// It only converts parts of the domain name that
// have non-ASCII characters, i.e. it doesn't matter if
// you call it with a domain that already is ASCII-only.
this.hostname = punycode.toASCII(this.hostname);
this.hostname = toASCII(this.hostname);
}

var p = this.port ? ':' + this.port : '';
Expand Down
132 changes: 132 additions & 0 deletions src/node_i18n.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,16 @@

#if defined(NODE_HAVE_I18N_SUPPORT)

#include "node.h"
#include "env.h"
#include "env-inl.h"
#include "util.h"
#include "util-inl.h"
#include "v8.h"

#include <unicode/putil.h>
#include <unicode/udata.h>
#include <unicode/uidna.h>

#ifdef NODE_HAVE_SMALL_ICU
/* if this is defined, we have a 'secondary' entry point.
Expand All @@ -43,6 +51,13 @@ extern "C" const char U_DATA_API SMALL_ICUDATA_ENTRY_POINT[];

namespace node {

using v8::Context;
using v8::FunctionCallbackInfo;
using v8::Local;
using v8::Object;
using v8::String;
using v8::Value;

bool flag_icu_data_dir = false;

namespace i18n {
Expand All @@ -64,7 +79,124 @@ bool InitializeICUDirectory(const char* icu_data_path) {
}
}

static int32_t ToUnicode(MaybeStackBuffer<char>* buf,
const char* input,
size_t length) {
UErrorCode status = U_ZERO_ERROR;
uint32_t options = UIDNA_DEFAULT;
options |= UIDNA_NONTRANSITIONAL_TO_UNICODE;
UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status))
return -1;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;

int32_t len = uidna_nameToUnicodeUTF8(uidna,
input, length,
**buf, buf->length(),
&info,
&status);

if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
buf->AllocateSufficientStorage(len);
len = uidna_nameToUnicodeUTF8(uidna,
input, length,
**buf, buf->length(),
&info,
&status);
}

if (U_FAILURE(status))
len = -1;

uidna_close(uidna);
return len;
}

static int32_t ToASCII(MaybeStackBuffer<char>* buf,
const char* input,
size_t length) {
UErrorCode status = U_ZERO_ERROR;
uint32_t options = UIDNA_DEFAULT;
options |= UIDNA_NONTRANSITIONAL_TO_ASCII;
UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status))
return -1;
UIDNAInfo info = UIDNA_INFO_INITIALIZER;

int32_t len = uidna_nameToASCII_UTF8(uidna,
input, length,
**buf, buf->length(),
&info,
&status);

if (status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
buf->AllocateSufficientStorage(len);
len = uidna_nameToASCII_UTF8(uidna,
input, length,
**buf, buf->length(),
&info,
&status);
}

if (U_FAILURE(status))
len = -1;

uidna_close(uidna);
return len;
}

static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);
MaybeStackBuffer<char> buf;
int32_t len = ToUnicode(&buf, *val, val.length());

if (len < 0) {
return env->ThrowError("Cannot convert name to Unicode");
}

args.GetReturnValue().Set(
String::NewFromUtf8(env->isolate(),
*buf,
v8::NewStringType::kNormal,
len).ToLocalChecked());
}

static void ToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);
MaybeStackBuffer<char> buf;
int32_t len = ToASCII(&buf, *val, val.length());

if (len < 0) {
return env->ThrowError("Cannot convert name to ASCII");
}

args.GetReturnValue().Set(
String::NewFromUtf8(env->isolate(),
*buf,
v8::NewStringType::kNormal,
len).ToLocalChecked());
}

void Init(Local<Object> target,
Local<Value> unused,
Local<Context> context,
void* priv) {
Environment* env = Environment::GetCurrent(context);
env->SetMethod(target, "toUnicode", ToUnicode);
env->SetMethod(target, "toASCII", ToASCII);
}

} // namespace i18n
} // namespace node

NODE_MODULE_CONTEXT_AWARE_BUILTIN(icu, node::i18n::Init)

#endif // NODE_HAVE_I18N_SUPPORT
72 changes: 72 additions & 0 deletions test/parallel/test-icu-punycode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
'use strict';

const common = require('../common');
const icu = getPunycode();
const assert = require('assert');

function getPunycode() {
try {
return process.binding('icu');
} catch (err) {
return undefined;
}
}

if (!icu) {
common.skip('icu punycode tests because ICU is not present.');
return;
}

// Credit for list: http://www.i18nguy.com/markup/idna-examples.html
const tests = [
'افغانستا.icom.museum',
'الجزائر.icom.museum',
'österreich.icom.museum',
'বাংলাদেশ.icom.museum',
'беларусь.icom.museum',
'belgië.icom.museum',
'българия.icom.museum',
'تشادر.icom.museum',
'中国.icom.museum',
'القمر.icom.museum',
'κυπρος.icom.museum',
'českárepublika.icom.museum',
'مصر.icom.museum',
'ελλάδα.icom.museum',
'magyarország.icom.museum',
'ísland.icom.museum',
'भारत.icom.museum',
'ايران.icom.museum',
'éire.icom.museum',
'איקו״ם.ישראל.museum',
'日本.icom.museum',
'الأردن.icom.museum',
'қазақстан.icom.museum',
'한국.icom.museum',
'кыргызстан.icom.museum',
'ລາວ.icom.museum',
'لبنان.icom.museum',
'македонија.icom.museum',
'موريتانيا.icom.museum',
'méxico.icom.museum',
'монголулс.icom.museum',
'المغرب.icom.museum',
'नेपाल.icom.museum',
'عمان.icom.museum',
'قطر.icom.museum',
'românia.icom.museum',
'россия.иком.museum',
'србијаицрнагора.иком.museum',
'இலங்கை.icom.museum',
'españa.icom.museum',
'ไทย.icom.museum',
'تونس.icom.museum',
'türkiye.icom.museum',
'украина.icom.museum',
'việtnam.icom.museum'
];

// Testing the roundtrip
tests.forEach((i) => {
assert.strictEqual(i, icu.toUnicode(icu.toASCII(i)));
});
9 changes: 1 addition & 8 deletions tools/icu/icu-generic.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@
'defines': [
# ICU cannot swap the initial data without this.
# http://bugs.icu-project.org/trac/ticket/11046
'UCONFIG_NO_LEGACY_CONVERSION=1',
'UCONFIG_NO_IDNA=1',
'UCONFIG_NO_LEGACY_CONVERSION=1'
],
}],
],
Expand Down Expand Up @@ -428,9 +427,6 @@
#'<(icu_path)/source/common/ubidi_props_data.h',
# and the callers
'<(icu_path)/source/common/ushape.cpp',
'<(icu_path)/source/common/usprep.cpp',
'<(icu_path)/source/common/uts46.cpp',
'<(icu_path)/source/common/uidna.cpp',
]}],
[ 'icu_ver_major == 57', { 'sources!': [
# work around http://bugs.icu-project.org/trac/ticket/12451
Expand All @@ -447,9 +443,6 @@
#'<(icu_path)/source/common/ubidi_props_data.h',
# and the callers
'<(icu_path)/source/common/ushape.cpp',
'<(icu_path)/source/common/usprep.cpp',
'<(icu_path)/source/common/uts46.cpp',
'<(icu_path)/source/common/uidna.cpp',
]}],
[ 'OS == "solaris"', { 'defines': [
'_XOPEN_SOURCE_EXTENDED=0',
Expand Down
3 changes: 1 addition & 2 deletions tools/icu/icu_small.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"region": "none",
"zone": "locales",
"converters": "none",
"stringprep": "none",
"stringprep": "locales",
"translit": "none",
"brkfiles": "none",
"brkdict": "none",
Expand All @@ -34,7 +34,6 @@
"remove": [
"cnvalias.icu",
"postalCodeData.res",
"uts46.nrm",
"genderList.res",
"brkitr/root.res",
"unames.icu"
Expand Down

0 comments on commit d0e2492

Please sign in to comment.