diff --git a/include/ada/url_aggregator.h b/include/ada/url_aggregator.h index 0490de74c..2c269a3c3 100644 --- a/include/ada/url_aggregator.h +++ b/include/ada/url_aggregator.h @@ -232,10 +232,12 @@ struct url_aggregator : url_base { } /** - * Return true on success. + * Return true on success. The 'in_place' parameter indicates whether the + * the string_view input is pointing in the buffer. When in_place is false, + * we must nearly always update the buffer. * @see https://url.spec.whatwg.org/#concept-ipv4-parser */ - [[nodiscard]] bool parse_ipv4(std::string_view input); + [[nodiscard]] bool parse_ipv4(std::string_view input, bool in_place); /** * Return true on success. diff --git a/src/url.cpp b/src/url.cpp index 32d81fd8c..d39ae054b 100644 --- a/src/url.cpp +++ b/src/url.cpp @@ -9,7 +9,7 @@ namespace ada { bool url::parse_opaque_host(std::string_view input) { - ada_log("parse_opaque_host ", input, "[", input.size(), " bytes]"); + ada_log("parse_opaque_host ", input, " [", input.size(), " bytes]"); if (std::any_of(input.begin(), input.end(), ada::unicode::is_forbidden_host_code_point)) { return is_valid = false; @@ -23,7 +23,7 @@ bool url::parse_opaque_host(std::string_view input) { } bool url::parse_ipv4(std::string_view input) { - ada_log("parse_ipv4 ", input, "[", input.size(), " bytes]"); + ada_log("parse_ipv4 ", input, " [", input.size(), " bytes]"); if (input.back() == '.') { input.remove_suffix(1); } @@ -98,7 +98,7 @@ bool url::parse_ipv4(std::string_view input) { } bool url::parse_ipv6(std::string_view input) { - ada_log("parse_ipv6 ", input, "[", input.size(), " bytes]"); + ada_log("parse_ipv6 ", input, " [", input.size(), " bytes]"); if (input.empty()) { return is_valid = false; @@ -422,7 +422,7 @@ ada_really_inline bool url::parse_scheme(const std::string_view input) { } ada_really_inline bool url::parse_host(std::string_view input) { - ada_log("parse_host ", input, "[", input.size(), " bytes]"); + ada_log("parse_host ", input, " [", input.size(), " bytes]"); if (input.empty()) { return is_valid = false; } // technically unnecessary. @@ -474,6 +474,8 @@ ada_really_inline bool url::parse_host(std::string_view input) { ada_log("parse_host to_ascii returns false"); return is_valid = false; } + ada_log("parse_host to_ascii succeeded ", *host, " [", host->size(), + " bytes]"); if (std::any_of(host.value().begin(), host.value().end(), ada::unicode::is_forbidden_domain_code_point)) { @@ -484,7 +486,7 @@ ada_really_inline bool url::parse_host(std::string_view input) { // If asciiDomain ends in a number, then return the result of IPv4 parsing // asciiDomain. if (checkers::is_ipv4(host.value())) { - ada_log("parse_host got ipv4", *host); + ada_log("parse_host got ipv4 ", *host); return parse_ipv4(host.value()); } diff --git a/src/url_aggregator.cpp b/src/url_aggregator.cpp index e9a3a1546..bd5c99d1f 100644 --- a/src/url_aggregator.cpp +++ b/src/url_aggregator.cpp @@ -411,7 +411,7 @@ void url_aggregator::set_hash(const std::string_view input) { bool url_aggregator::set_href(const std::string_view input) { ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer)); - ada_log("url_aggregator::set_href ", input, "[", input.size(), " bytes]"); + ada_log("url_aggregator::set_href ", input, " [", input.size(), " bytes]"); ada::result out = ada::parse(input); ada_log("url_aggregator::set_href, success :", out.has_value()); @@ -425,7 +425,8 @@ bool url_aggregator::set_href(const std::string_view input) { } ada_really_inline bool url_aggregator::parse_host(std::string_view input) { - ada_log("url_aggregator:parse_host ", input, "[", input.size(), " bytes]"); + ada_log("url_aggregator:parse_host \"", input, "\" [", input.size(), + " bytes]"); ADA_ASSERT_TRUE(validate()); ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer)); if (input.empty()) { @@ -475,7 +476,7 @@ ada_really_inline bool url_aggregator::parse_host(std::string_view input) { update_base_hostname(input); if (checkers::is_ipv4(get_hostname())) { ada_log("parse_host fast path ipv4"); - return parse_ipv4(get_hostname()); + return parse_ipv4(get_hostname(), true); } ada_log("parse_host fast path ", get_hostname()); return true; @@ -491,6 +492,8 @@ ada_really_inline bool url_aggregator::parse_host(std::string_view input) { ada_log("parse_host to_ascii returns false"); return is_valid = false; } + ada_log("parse_host to_ascii succeeded ", *host, " [", host->size(), + " bytes]"); if (std::any_of(host.value().begin(), host.value().end(), ada::unicode::is_forbidden_domain_code_point)) { @@ -500,8 +503,8 @@ ada_really_inline bool url_aggregator::parse_host(std::string_view input) { // If asciiDomain ends in a number, then return the result of IPv4 parsing // asciiDomain. if (checkers::is_ipv4(host.value())) { - ada_log("parse_host got ipv4", *host); - return parse_ipv4(host.value()); + ada_log("parse_host got ipv4 ", *host); + return parse_ipv4(host.value(), false); } update_base_hostname(host.value()); @@ -754,7 +757,7 @@ bool url_aggregator::set_hostname(const std::string_view input) { } [[nodiscard]] std::string ada::url_aggregator::to_string() const { - ada_log("url_aggregator::to_string buffer:", buffer, "[", buffer.size(), + ada_log("url_aggregator::to_string buffer:", buffer, " [", buffer.size(), " bytes]"); if (!is_valid) { return "null"; @@ -853,8 +856,8 @@ bool url_aggregator::set_hostname(const std::string_view input) { return checkers::verify_dns_length(get_hostname()); } -bool url_aggregator::parse_ipv4(std::string_view input) { - ada_log("parse_ipv4 ", input, "[", input.size(), +bool url_aggregator::parse_ipv4(std::string_view input, bool in_place) { + ada_log("parse_ipv4 ", input, " [", input.size(), " bytes], overlaps with buffer: ", helpers::overlaps(input, buffer) ? "yes" : "no"); ADA_ASSERT_TRUE(validate()); @@ -878,20 +881,25 @@ bool url_aggregator::parse_ipv4(std::string_view input) { } else { std::from_chars_result r; if (is_hex) { + ada_log("parse_ipv4 trying to parse hex number"); r = std::from_chars(input.data() + 2, input.data() + input.size(), segment_result, 16); } else if ((input.length() >= 2) && input[0] == '0' && checkers::is_digit(input[1])) { + ada_log("parse_ipv4 trying to parse octal number"); r = std::from_chars(input.data() + 1, input.data() + input.size(), segment_result, 8); } else { + ada_log("parse_ipv4 trying to parse decimal number"); pure_decimal_count++; r = std::from_chars(input.data(), input.data() + input.size(), segment_result, 10); } if (r.ec != std::errc()) { + ada_log("parse_ipv4 parsing failed"); return is_valid = false; } + ada_log("parse_ipv4 parsed ", segment_result); input.remove_prefix(r.ptr - input.data()); } if (input.empty()) { @@ -916,6 +924,7 @@ bool url_aggregator::parse_ipv4(std::string_view input) { } } if ((digit_count != 4) || (!input.empty())) { + ada_log("parse_ipv4 found invalid (more than 4 numbers or empty) "); return is_valid = false; } final: @@ -923,10 +932,14 @@ bool url_aggregator::parse_ipv4(std::string_view input) { " host: ", get_host()); // We could also check r.ptr to see where the parsing ended. - if (pure_decimal_count == 4 && !trailing_dot) { + if (in_place && pure_decimal_count == 4 && !trailing_dot) { + ada_log( + "url_aggregator::parse_ipv4 completed and was already correct in the " + "buffer"); // The original input was already all decimal and we validated it. So we // don't need to do anything. } else { + ada_log("url_aggregator::parse_ipv4 completed and we need to update it"); // Optimization opportunity: Get rid of unnecessary string return in ipv4 // serializer. // TODO: This is likely a bug because it goes back update_base_hostname, not @@ -940,8 +953,11 @@ bool url_aggregator::parse_ipv4(std::string_view input) { } bool url_aggregator::parse_ipv6(std::string_view input) { + // TODO: Implement in_place optimization: we know that input points + // in the buffer, so we can just check whether the buffer is already + // well formatted. // TODO: Find a way to merge parse_ipv6 with url.cpp implementation. - ada_log("parse_ipv6 ", input, "[", input.size(), " bytes]"); + ada_log("parse_ipv6 ", input, " [", input.size(), " bytes]"); ADA_ASSERT_TRUE(validate()); ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer)); if (input.empty()) { @@ -1175,7 +1191,7 @@ bool url_aggregator::parse_ipv6(std::string_view input) { } bool url_aggregator::parse_opaque_host(std::string_view input) { - ada_log("parse_opaque_host ", input, "[", input.size(), " bytes]"); + ada_log("parse_opaque_host ", input, " [", input.size(), " bytes]"); ADA_ASSERT_TRUE(validate()); ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer)); if (std::any_of(input.begin(), input.end(), diff --git a/tests/basic_tests.cpp b/tests/basic_tests.cpp index 4f44badc8..335137f58 100644 --- a/tests/basic_tests.cpp +++ b/tests/basic_tests.cpp @@ -402,3 +402,10 @@ TYPED_TEST(basic_tests, nodejs_51514) { auto out = ada::parse("http://1.1.1.256"); ASSERT_FALSE(out); } +// https://github.com/nodejs/node/issues/51593 +TYPED_TEST(basic_tests, nodejs_51593) { + auto out = ada::parse("http://\u200b123.123.123.123"); + ASSERT_TRUE(out); + ASSERT_EQ(out->get_href(), "http://123.123.123.123/"); + SUCCEED(); +}