From 2ee246fb5ebf2bacf1d8b6ee1d16e9f7eb100f8c Mon Sep 17 00:00:00 2001 From: Rajaram Gaunker Date: Thu, 11 May 2017 00:19:15 -0700 Subject: [PATCH 1/4] url: ignore IDN errors when domainname have two hyphens There are valid domain names with hyphens at 3 and 4th position, new node WHATWG URL parser was failing for it assume its an invalid IDN. Fixes: https://github.com/nodejs/node/issues/12965 --- src/node_i18n.cc | 1 + test/parallel/test-url-parse-format.js | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 6d966bb117d382..5e412c7ba88332 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -516,6 +516,7 @@ int32_t ToASCII(MaybeStackBuffer* buf, info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; + info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; if (U_FAILURE(status) || (!lenient && info.errors != 0)) { len = -1; diff --git a/test/parallel/test-url-parse-format.js b/test/parallel/test-url-parse-format.js index 0e12fe52516f16..b8db6d6f6424aa 100644 --- a/test/parallel/test-url-parse-format.js +++ b/test/parallel/test-url-parse-format.js @@ -487,6 +487,16 @@ const parseTests = { path: '/bar' }, + 'https://r4---sn-a5mlrn7s.gevideo.com/bar': { + href: 'https://r4---sn-a5mlrn7s.gevideo.com/bar', + host: 'r4---sn-a5mlrn7s.gevideo.com', + hostname: 'r4---sn-a5mlrn7s.gevideo.com', + protocol: 'https:', + slashes: true, + pathname: '/bar', + path: '/bar' + }, + // IDNA tests 'http://www.日本語.com/': { href: 'http://www.xn--wgv71a119e.com/', From 9dee867d73ba23237232b376213d9c69d1351618 Mon Sep 17 00:00:00 2001 From: Rajaram Gaunker Date: Thu, 11 May 2017 00:19:15 -0700 Subject: [PATCH 2/4] url: ignore IDN errors when domainname have hyphens There are valid domain names with hyphens at 3 and 4th position, new node WHATWG URL parser was failing for it assume its an invalid IDN. Also filters IDN errors when domain label start or end with hyphen. Fixes: https://github.com/nodejs/node/issues/12965 Refs: https://www.icann.org/news/announcement-2000-01-07-en --- src/node_i18n.cc | 29 ++++++++++++++++++++++++++ test/fixtures/url-idna.js | 13 ++++++++++++ test/parallel/test-url-parse-format.js | 10 --------- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 5e412c7ba88332..f740e77e69f97c 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -468,6 +468,21 @@ int32_t ToUnicode(MaybeStackBuffer* buf, // #46. Therefore, explicitly filters out that error here. info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; + // These error conditions are mandated unconditionally by UTS #46 version + // 9.0.0 (rev. 17), but were found to be incompatible with many actual domain + // names in the wild. As such, in the current UTS #46 draft (rev. 18) these + // checks are made optional depending on the CheckHyphens flag, which will be + // disabled in WHATWG URL's "domain to ASCII" algorithm as soon as the UTS #46 + // draft becomes standard. + // Refs: + // - https://github.com/whatwg/url/issues/53 + // - http://www.unicode.org/review/pri317/ + // - http://www.unicode.org/reports/tr46/tr46-18.html + // - https://www.icann.org/news/announcement-2000-01-07-en + info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; + info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; + info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; + if (U_FAILURE(status) || (!lenient && info.errors != 0)) { len = -1; buf->SetLength(0); @@ -516,7 +531,21 @@ int32_t ToASCII(MaybeStackBuffer* buf, info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; + + // These error conditions are mandated unconditionally by UTS #46 version + // 9.0.0 (rev. 17), but were found to be incompatible with many actual domain + // names in the wild. As such, in the current UTS #46 draft (rev. 18) these + // checks are made optional depending on the CheckHyphens flag, which will be + // disabled in WHATWG URL's "domain to ASCII" algorithm as soon as the UTS #46 + // draft becomes standard. + // Refs: + // - https://github.com/whatwg/url/issues/53 + // - http://www.unicode.org/review/pri317/ + // - http://www.unicode.org/reports/tr46/tr46-18.html + // - https://www.icann.org/news/announcement-2000-01-07-en info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; + info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; + info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; if (U_FAILURE(status) || (!lenient && info.errors != 0)) { len = -1; diff --git a/test/fixtures/url-idna.js b/test/fixtures/url-idna.js index f105adeed1243b..6ef0089d5258b2 100644 --- a/test/fixtures/url-idna.js +++ b/test/fixtures/url-idna.js @@ -191,6 +191,19 @@ module.exports = { { ascii: `${`${'a'.repeat(64)}.`.repeat(4)}com`, unicode: `${`${'a'.repeat(64)}.`.repeat(4)}com` + }, + // URLs with hyphen + { + ascii: 'r4---sn-a5mlrn7s.gevideo.com', + unicode: 'r4---sn-a5mlrn7s.gevideo.com' + }, + { + ascii: '-sn-a5mlrn7s.gevideo.com', + unicode: '-sn-a5mlrn7s.gevideo.com' + }, + { + ascii: 'sn-a5mlrn7s-.gevideo.com', + unicode: 'sn-a5mlrn7s-.gevideo.com' } ], invalid: [ diff --git a/test/parallel/test-url-parse-format.js b/test/parallel/test-url-parse-format.js index b8db6d6f6424aa..0e12fe52516f16 100644 --- a/test/parallel/test-url-parse-format.js +++ b/test/parallel/test-url-parse-format.js @@ -487,16 +487,6 @@ const parseTests = { path: '/bar' }, - 'https://r4---sn-a5mlrn7s.gevideo.com/bar': { - href: 'https://r4---sn-a5mlrn7s.gevideo.com/bar', - host: 'r4---sn-a5mlrn7s.gevideo.com', - hostname: 'r4---sn-a5mlrn7s.gevideo.com', - protocol: 'https:', - slashes: true, - pathname: '/bar', - path: '/bar' - }, - // IDNA tests 'http://www.日本語.com/': { href: 'http://www.xn--wgv71a119e.com/', From 311667bc415ebd81d9a22104caf1a7b30e607bf9 Mon Sep 17 00:00:00 2001 From: Rajaram Gaunker Date: Thu, 11 May 2017 00:19:15 -0700 Subject: [PATCH 3/4] url: ignore IDN errors when domainname have hyphens There are valid domain names with hyphens at 3 and 4th position, new node WHATWG URL parser was failing for it assume its an invalid IDN. Also filters IDN errors when domain label start or end with hyphen. Fixes: https://github.com/nodejs/node/issues/12965 Refs: https://www.icann.org/news/announcement-2000-01-07-en --- src/node_i18n.cc | 4 ++-- test/fixtures/url-idna.js | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/node_i18n.cc b/src/node_i18n.cc index f740e77e69f97c..06ecf85eab9234 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -472,8 +472,8 @@ int32_t ToUnicode(MaybeStackBuffer* buf, // 9.0.0 (rev. 17), but were found to be incompatible with many actual domain // names in the wild. As such, in the current UTS #46 draft (rev. 18) these // checks are made optional depending on the CheckHyphens flag, which will be - // disabled in WHATWG URL's "domain to ASCII" algorithm as soon as the UTS #46 - // draft becomes standard. + // disabled in WHATWG URL's "domain to unicode" algorithm as soon as the UTS + // #46 draft becomes standard. // Refs: // - https://github.com/whatwg/url/issues/53 // - http://www.unicode.org/review/pri317/ diff --git a/test/fixtures/url-idna.js b/test/fixtures/url-idna.js index 6ef0089d5258b2..9a6a206abbaf21 100644 --- a/test/fixtures/url-idna.js +++ b/test/fixtures/url-idna.js @@ -204,6 +204,14 @@ module.exports = { { ascii: 'sn-a5mlrn7s-.gevideo.com', unicode: 'sn-a5mlrn7s-.gevideo.com' + }, + { + ascii: '-sn-a5mlrn7s-.gevideo.com', + unicode: '-sn-a5mlrn7s-.gevideo.com' + }, + { + ascii: '-sn--a5mlrn7s-.gevideo.com', + unicode: '-sn--a5mlrn7s-.gevideo.com' } ], invalid: [ From 5e0f58c9405b94f4b169f3e883454b7ad37986d4 Mon Sep 17 00:00:00 2001 From: Rajaram Gaunker Date: Thu, 11 May 2017 00:19:15 -0700 Subject: [PATCH 4/4] url: ignore IDN errors when domainname have hyphens There are valid domain names with hyphens at 3 and 4th position, new node WHATWG URL parser was failing for it assume its an invalid IDN. Also filters IDN errors when domain label start or end with hyphen. Also fix error in ToUnicode Fixes: https://github.com/nodejs/node/issues/12965 Refs: https://www.icann.org/news/announcement-2000-01-07-en Refs: https://github.com/whatwg/url/pull/309#issuecomment-302096434 --- src/node_i18n.cc | 31 +++---------------------------- src/node_i18n.h | 3 +-- test/fixtures/url-idna.js | 9 --------- 3 files changed, 4 insertions(+), 39 deletions(-) diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 06ecf85eab9234..2e7801fa896ef7 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -435,8 +435,7 @@ bool InitializeICUDirectory(const std::string& path) { int32_t ToUnicode(MaybeStackBuffer* buf, const char* input, - size_t length, - bool lenient) { + size_t length) { UErrorCode status = U_ZERO_ERROR; uint32_t options = UIDNA_DEFAULT; options |= UIDNA_NONTRANSITIONAL_TO_UNICODE; @@ -461,29 +460,7 @@ int32_t ToUnicode(MaybeStackBuffer* buf, &status); } - // UTS #46's ToUnicode operation applies no validation of domain name length - // (nor a flag requesting it to do so, like VerifyDnsLength for ToASCII). For - // that reason, unlike ToASCII below, ICU4C correctly accepts long domain - // names. However, ICU4C still sets the EMPTY_LABEL error in contrary to UTS - // #46. Therefore, explicitly filters out that error here. - info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; - - // These error conditions are mandated unconditionally by UTS #46 version - // 9.0.0 (rev. 17), but were found to be incompatible with many actual domain - // names in the wild. As such, in the current UTS #46 draft (rev. 18) these - // checks are made optional depending on the CheckHyphens flag, which will be - // disabled in WHATWG URL's "domain to unicode" algorithm as soon as the UTS - // #46 draft becomes standard. - // Refs: - // - https://github.com/whatwg/url/issues/53 - // - http://www.unicode.org/review/pri317/ - // - http://www.unicode.org/reports/tr46/tr46-18.html - // - https://www.icann.org/news/announcement-2000-01-07-en - info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; - info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; - info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; - - if (U_FAILURE(status) || (!lenient && info.errors != 0)) { + if (U_FAILURE(status)) { len = -1; buf->SetLength(0); } else { @@ -563,11 +540,9 @@ static void ToUnicode(const FunctionCallbackInfo& args) { CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); - // optional arg - bool lenient = args[1]->BooleanValue(env->context()).FromJust(); MaybeStackBuffer buf; - int32_t len = ToUnicode(&buf, *val, val.length(), lenient); + int32_t len = ToUnicode(&buf, *val, val.length()); if (len < 0) { return env->ThrowError("Cannot convert name to Unicode"); diff --git a/src/node_i18n.h b/src/node_i18n.h index 78e18fdb370c94..cc1f3e6ea53569 100644 --- a/src/node_i18n.h +++ b/src/node_i18n.h @@ -43,8 +43,7 @@ int32_t ToASCII(MaybeStackBuffer* buf, bool lenient = false); int32_t ToUnicode(MaybeStackBuffer* buf, const char* input, - size_t length, - bool lenient = false); + size_t length); } // namespace i18n } // namespace node diff --git a/test/fixtures/url-idna.js b/test/fixtures/url-idna.js index 9a6a206abbaf21..41f12edc6c2f7d 100644 --- a/test/fixtures/url-idna.js +++ b/test/fixtures/url-idna.js @@ -219,15 +219,6 @@ module.exports = { { url: '\ufffd.com', mode: 'ascii' - }, - { - url: '\ufffd.com', - mode: 'unicode' - }, - // invalid Punycode - { - url: 'xn---abc.com', - mode: 'unicode' } ] }