Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add url_components for start/end #250

Merged
merged 14 commits into from
Mar 3, 2023
1 change: 1 addition & 0 deletions include/ada.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "ada/state.h"
#include "ada/unicode.h"
#include "ada/url-inl.h"
#include "ada/url_components.h"

// Public API
#include "ada/ada_version.h"
Expand Down
6 changes: 6 additions & 0 deletions include/ada/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@
*/
namespace ada::helpers {

/**
* @private
* Declaration only; the definition is not part of this header.
* NOTE(review): presumably writes `view` to the output iterator `out` with
* JSON string escaping applied - confirm against the definition.
*/
template <typename out_iter>
void encode_json(std::string_view view, out_iter out);

/**
* Prunes the fragment from a URL and returns the removed string if the input has a fragment.
*
Expand Down
107 changes: 107 additions & 0 deletions include/ada/url-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@

#include "ada/checkers.h"
#include "ada/url.h"
#include "ada/url_components.h"
#include <optional>
#include <string>
#if ADA_REGULAR_VISUAL_STUDIO
#include <intrin.h>
#endif // ADA_REGULAR_VISUAL_STUDIO

namespace ada {
[[nodiscard]] ada_really_inline bool url::includes_credentials() const noexcept {
Expand Down Expand Up @@ -73,6 +79,107 @@ namespace ada {
/**
 * Stream insertion for ada::url: writes the text returned by
 * u.to_string() to the stream and hands the stream back for chaining.
 */
inline std::ostream& operator<<(std::ostream& out, const ada::url& u) {
  out << u.to_string();
  return out;
}

/**
 * Counts the leading zero bits of a 32-bit value.
 *
 * @param input_num value to inspect.
 * @return the number of zero bits above the most significant set bit,
 *         in the range [0, 32]; 32 when input_num is 0.
 */
inline int leading_zeroes(uint32_t input_num) {
#if ADA_REGULAR_VISUAL_STUDIO
  unsigned long leading_zero(0);
  unsigned long in(input_num);
  // _BitScanReverse returns 0 when no bit is set; report 32 in that case.
  return _BitScanReverse(&leading_zero, in) ? int(31 - leading_zero) : 32;
#else
  // __builtin_clz has an undefined result for 0, so that input is answered
  // explicitly, matching the Visual Studio branch above.
  if (input_num == 0) { return 32; }
  return __builtin_clz(input_num);
#endif // ADA_REGULAR_VISUAL_STUDIO
}

/**
 * Integer base-2 logarithm, rounded down: the index of the highest set bit
 * of x. The lowest bit is forced on before counting so that an input of 0
 * is treated like 1 and yields 0.
 */
inline int int_log2(uint32_t x) {
  const uint32_t never_zero = x | 1;
  const int highest_bit_index = 31 - leading_zeroes(never_zero);
  return highest_bit_index;
}

/**
 * Returns the number of decimal digits of x; intended as a faster
 * replacement for std::to_string(x).size().
 */
inline int fast_digit_count(uint32_t x) {
  // One entry per possible int_log2(x) result (0..31). The table is a
  // function-local constant because it has no meaning outside this routine.
  static constexpr uint64_t digit_table[] = {
      4294967296,  8589934582,  8589934582,  8589934582,  12884901788,
      12884901788, 12884901788, 17179868184, 17179868184, 17179868184,
      21474826480, 21474826480, 21474826480, 21474826480, 25769703776,
      25769703776, 25769703776, 30063771072, 30063771072, 30063771072,
      34349738368, 34349738368, 34349738368, 34349738368, 38554705664,
      38554705664, 38554705664, 41949672960, 41949672960, 41949672960,
      42949672960, 42949672960};
  // The digit count ends up in the upper 32 bits of the 64-bit sum.
  const uint64_t biased = x + digit_table[int_log2(x)];
  return int(biased >> 32);
}

/**
 * Computes the offsets of this URL's components.
 *
 * The offsets are accumulated from left to right in a running index:
 * scheme, credentials and host (when a host is present), port, path,
 * search and finally hash.
 *
 * @return a url_components value. `port` is assigned only when the URL has a
 *         port, `search_start` only when it has a query, and `hash_start`
 *         only when it has a fragment; otherwise they keep the value given
 *         by url_components' default initialization.
 */
[[nodiscard]] ada_really_inline ada::url_components url::get_components() noexcept {
  url_components out{};

  // protocol ends with ':'. for example: "https:"
  out.protocol_end = uint32_t(get_scheme().size());

  // Index of the character that follows what has been accounted for so far.
  size_t running_index = out.protocol_end + 1;

  if (host.has_value()) {
    // 2 characters for "//" and 1 character for starting index
    out.host_start = out.protocol_end + 3;

    if (includes_credentials()) {
      out.username_end = uint32_t(out.host_start + username.size() - 1);

      // Skip the username plus the single delimiter that follows it.
      out.host_start += uint32_t(username.size() + 1);

      if (!password.empty()) {
        // Skip the password plus the single delimiter that follows it.
        out.host_start += uint32_t(password.size() + 1);
      }
    } else {
      out.username_end = out.host_start;
    }

    out.host_end = uint32_t(out.host_start + host.value().size()) - 1;
    running_index = out.host_end + 1;
  } else {
    // There is no host: host_start and host_end share the same index.
    out.host_start = out.protocol_end + 1;
    out.host_end = out.protocol_end + 1;

    // If url's host is null, url does not have an opaque path, url's path's
    // size is greater than 1, and url's path[0] is the empty string, then
    // U+002F (/) followed by U+002E (.) is part of the output, so two extra
    // characters precede the path. A path that starts with "//" necessarily
    // contains at least two '/', so no separate count of '/' is required.
    if (!has_opaque_path && path.length() >= 2 && path[0] == '/' && path[1] == '/') {
      running_index = out.protocol_end + 3;
    }
  }

  if (port.has_value()) {
    out.port = *port;
    // The digits of the port plus one character for the ':' preceding them.
    running_index += fast_digit_count(*port) + 1;
  }

  out.pathname_start = uint32_t(running_index);
  running_index += path.size();

  if (query.has_value()) {
    out.search_start = uint32_t(running_index);
    // Query the search string once. When it is empty the index still
    // advances by exactly one character.
    const size_t search_size = get_search().size();
    running_index += (search_size == 0) ? 1 : search_size;
  }

  if (fragment.has_value()) {
    out.hash_start = uint32_t(running_index);
  }

  return out;
}

} // namespace ada

#endif // ADA_URL_H
20 changes: 20 additions & 0 deletions include/ada/url.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "ada/serializers.h"
#include "ada/unicode.h"
#include "ada/log.h"
#include "ada/url_components.h"

#include <algorithm>
#include <charconv>
Expand Down Expand Up @@ -349,6 +350,25 @@ namespace ada {
*/
std::string to_string() const;

/**
* Useful for implementing efficient serialization for the URL.
*
* https://user:pass@example.com:1234/foo/bar?baz#quux
*      |     |      |         | ^^^^|       |   |
*      |     |      |         | |   |       |   `----- hash_start
*      |     |      |         | |   |       `--------- search_start
*      |     |      |         | |   `----------------- pathname_start
*      |     |      |         | `--------------------- port
*      |     |      |         `----------------------- host_end
*      |     |      `--------------------------------- host_start
*      |     `---------------------------------------- username_end
*      `---------------------------------------------- protocol_end
*
* NOTE(review): the marker columns above were re-derived from the offset
* arithmetic in the definition of get_components() - confirm.
*
* Inspired by servo/rust-url.
* @see https://github.com/servo/rust-url/blob/b65a45515c10713f6d212e6726719a020203cc98/url/src/quirks.rs#L31
*
* @return the computed url_components for this URL.
*/
[[nodiscard]] ada_really_inline ada::url_components get_components() noexcept;

private:

/**
Expand Down
64 changes: 64 additions & 0 deletions include/ada/url_components.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/**
* @file url_components.h
* @brief Declaration for the URL Components
*/
#ifndef ADA_URL_COMPONENTS_H
#define ADA_URL_COMPONENTS_H

#include "ada/common_defs.h"

#include <cstdint>
#include <optional>
#include <string>
#include <string_view>

namespace ada {

/**
* We design the url_components struct so that it is as small
* and simple as possible. This version uses 32 bytes.
*
* This struct is used to extract components from a single 'href'.
*/
struct url_components {
// Sentinel stored in port, search_start and hash_start by default; it is the
// largest uint32_t value.
constexpr static uint32_t omitted = uint32_t(-1);

// All special member functions are the compiler-generated ones.
url_components() = default;
url_components(const url_components &u) = default;
url_components(url_components &&u) noexcept = default;
url_components &operator=(url_components &&u) noexcept = default;
url_components &operator=(const url_components &u) = default;
~url_components() = default;

/*
* By using 32-bit integers, we implicitly assume that the URL string
* cannot exceed 4 GB.
*
* https://user:pass@example.com:1234/foo/bar?baz#quux
*      |     |      |         | ^^^^|       |   |
*      |     |      |         | |   |       |   `----- hash_start
*      |     |      |         | |   |       `--------- search_start
*      |     |      |         | |   `----------------- pathname_start
*      |     |      |         | `--------------------- port
*      |     |      |         `----------------------- host_end
*      |     |      `--------------------------------- host_start
*      |     `---------------------------------------- username_end
*      `---------------------------------------------- protocol_end
*
* NOTE(review): the marker columns were re-derived from the arithmetic in
* url::get_components() - confirm.
*/
uint32_t protocol_end{0};
uint32_t username_end{0};
uint32_t host_start{0};
uint32_t host_end{0};
// The port number itself (not an offset); `omitted` by default.
uint32_t port{omitted};
uint32_t pathname_start{0};
// `omitted` by default.
uint32_t search_start{omitted};
// `omitted` by default.
uint32_t hash_start{omitted};

/**
* @private
* Converts a url_components to JSON stringified version.
*/
std::string to_string() const;

}; // struct url_components

} // namespace ada
#endif
1 change: 1 addition & 0 deletions src/ada.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
#include "url-getters.cpp"
#include "url-setters.cpp"
#include "parser.cpp"
#include "url_components.cpp"
6 changes: 5 additions & 1 deletion src/parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "ada/log.h"

#include <iostream>

#include <limits>
#include <optional>
#include <string_view>

Expand All @@ -22,6 +22,10 @@ namespace ada::parser {
ada::state state = ada::state::SCHEME_START;
ada::url url = ada::url();

// We refuse to parse URL strings that exceed 4GB. Such strings are almost
// surely the result of a bug or are otherwise a security concern.
// std::numeric_limits<uint32_t>::max must be *called*: naming the function
// without parentheses would convert its address to an integer instead of
// producing the largest uint32_t value.
if(user_input.size() >= std::string_view::size_type(std::numeric_limits<uint32_t>::max())) { url.is_valid = false; }

// If we are provided with an invalid base, or the optional_url was invalid,
// we must return.
if(base_url != nullptr) { url.is_valid &= base_url->is_valid; }
Expand Down
1 change: 1 addition & 0 deletions src/url.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,4 +522,5 @@ namespace ada {
if(!host.has_value()) { return false; }
return checkers::verify_dns_length(host.value());
}

} // namespace ada
51 changes: 51 additions & 0 deletions src/url_components.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#include "ada.h"
#include "ada/helpers.h"

#include <numeric>
#include <algorithm>
#include <string>

namespace ada {

/**
 * Serializes the eight fields of this url_components as a JSON object.
 *
 * Each field is written on its own tab-indented line as a "name":"value"
 * pair, where the value is the decimal text of the field. Members are
 * separated by commas and the last member is not followed by one, so the
 * object has no trailing comma.
 *
 * @return the JSON text.
 */
std::string url_components::to_string() const {
  std::string answer;
  auto back = std::back_insert_iterator(answer);

  // Appends one `"key":"value"` member; every member except the last one is
  // terminated with a comma and a newline.
  const auto append_member = [&answer, &back](std::string_view key, uint32_t value, bool is_last) {
    answer.append("\t\"");
    answer.append(key);
    answer.append("\":\"");
    helpers::encode_json(std::to_string(value), back);
    answer.append(is_last ? "\"" : "\",\n");
  };

  answer.append("{\n");

  append_member("protocol_end", protocol_end, false);
  append_member("username_end", username_end, false);
  append_member("host_start", host_start, false);
  append_member("host_end", host_end, false);
  append_member("port", port, false);
  append_member("pathname_start", pathname_start, false);
  append_member("search_start", search_start, false);
  append_member("hash_start", hash_start, true);

  answer.append("\n}");
  return answer;
}

} // namespace ada
11 changes: 6 additions & 5 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@ include(${PROJECT_SOURCE_DIR}/cmake/add-cpp-test.cmake)
link_libraries(ada)

add_cpp_test(wpt_tests)
add_cpp_test(url_components)
target_link_libraries(wpt_tests PRIVATE simdjson)
target_link_libraries(url_components PRIVATE simdjson)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)
target_link_libraries(wpt_tests PUBLIC stdc++fs)
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9)
target_link_libraries(wpt_tests PUBLIC stdc++fs)
target_link_libraries(url_components PUBLIC stdc++fs)
endif()
endif()
endif()


add_cpp_test(basic_fuzzer)
add_cpp_test(from_file_tests)
add_cpp_test(basic_tests)

Loading