From 431cfb139d74cc405ceb8917c6af2c90e22f2aa0 Mon Sep 17 00:00:00 2001 From: Roy Shilkrot Date: Sun, 27 Oct 2024 22:00:12 -0400 Subject: [PATCH] Fix major mem leak in HTML parsing (#126) * Refactor README.md to update download links and badges for different platforms * Refactor HTML and JSON parsers --- README.md | 6 +- buildspec.json | 2 +- src/parsers/html.cpp | 166 ++++++++++++++++++++++----------------- src/parsers/jsonpath.cpp | 63 ++++++--------- src/parsers/regex.cpp | 51 ++++++------ src/parsers/xml.cpp | 95 ++++++++++++---------- 6 files changed, 208 insertions(+), 175 deletions(-) diff --git a/README.md b/README.md index 8cb8e17..d01a932 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,9 @@
Download
- - - + + +
## Introduction diff --git a/buildspec.json b/buildspec.json index 7eaa80e..6ddb303 100644 --- a/buildspec.json +++ b/buildspec.json @@ -37,7 +37,7 @@ } }, "name": "obs-urlsource", - "version": "0.3.6", + "version": "0.3.7", "author": "Roy Shilkrot", "website": "https://github.com/locaal-ai/obs-urlsource", "email": "roy.shil@gmail.com", diff --git a/src/parsers/html.cpp b/src/parsers/html.cpp index 2b710a7..caade5a 100644 --- a/src/parsers/html.cpp +++ b/src/parsers/html.cpp @@ -20,99 +20,125 @@ lxb_status_t find_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t { UNUSED_PARAMETER(spec); std::string str; - (void)lxb_html_serialize_deep_cb(node, serializer_callback, &str); - ((std::vector *)data)->push_back(str); - return LXB_STATUS_OK; + lxb_status_t status = lxb_html_serialize_deep_cb(node, serializer_callback, &str); + if (status == LXB_STATUS_OK) { + ((std::vector *)data)->push_back(str); + } + return status; } lxb_status_t find_with_selectors(const std::string &slctrs, lxb_html_document_t *document, std::vector &found) { - /* Create CSS parser. */ - lxb_css_parser_t *parser; - lxb_css_selector_list_t *list; - lxb_status_t status; - lxb_dom_node_t *body; - lxb_selectors_t *selectors; - - parser = lxb_css_parser_create(); - status = lxb_css_parser_init(parser, NULL); - if (status != LXB_STATUS_OK) { - obs_log(LOG_ERROR, "Failed to setup CSS parser"); - return EXIT_FAILURE; - } + lxb_css_parser_t *parser = nullptr; + lxb_css_selector_list_t *list = nullptr; + lxb_selectors_t *selectors = nullptr; + lxb_status_t status = LXB_STATUS_ERROR; + + do { + parser = lxb_css_parser_create(); + if (!parser) { + obs_log(LOG_ERROR, "Failed to create CSS parser"); + break; + } - /* Selectors. */ - selectors = lxb_selectors_create(); - status = lxb_selectors_init(selectors); - if (status != LXB_STATUS_OK) { - obs_log(LOG_ERROR, "Failed to setup Selectors"); - return EXIT_FAILURE; - } + status = lxb_css_parser_init(parser, nullptr); + if (status != LXB_STATUS_OK) { + obs_log(LOG_ERROR, "Failed to init CSS parser"); + break; + } - /* Parse and get the log. */ + selectors = lxb_selectors_create(); + if (!selectors) { + obs_log(LOG_ERROR, "Failed to create selectors"); + break; + } - list = lxb_css_selectors_parse(parser, (const lxb_char_t *)slctrs.c_str(), slctrs.length()); - if (parser->status != LXB_STATUS_OK) { - obs_log(LOG_ERROR, "Failed to parse CSS selectors"); - return EXIT_FAILURE; - } + status = lxb_selectors_init(selectors); + if (status != LXB_STATUS_OK) { + obs_log(LOG_ERROR, "Failed to init selectors"); + break; + } - /* Find HTML nodes by CSS Selectors. */ - body = lxb_dom_interface_node(lxb_html_document_body_element(document)); + list = lxb_css_selectors_parse(parser, (const lxb_char_t *)slctrs.c_str(), + slctrs.length()); + if (!list || parser->status != LXB_STATUS_OK) { + obs_log(LOG_ERROR, "Failed to parse CSS selectors"); + break; + } - status = lxb_selectors_find(selectors, body, list, find_callback, &found); - if (status != LXB_STATUS_OK) { - obs_log(LOG_ERROR, "Failed to find HTML nodes by CSS Selectors"); - return EXIT_FAILURE; - } + lxb_dom_node_t *body = + lxb_dom_interface_node(lxb_html_document_body_element(document)); + if (!body) { + obs_log(LOG_ERROR, "Failed to get document body"); + break; + } - /* Destroy Selectors object. */ - (void)lxb_selectors_destroy(selectors, true); + status = lxb_selectors_find(selectors, body, list, find_callback, &found); + if (status != LXB_STATUS_OK) { + obs_log(LOG_ERROR, "Failed to find nodes by CSS Selectors"); + break; + } - /* Destroy resources for CSS Parser. */ - (void)lxb_css_parser_destroy(parser, true); + } while (0); - /* Destroy all object for all CSS Selector List. */ - lxb_css_selector_list_destroy_memory(list); + // Cleanup + if (list) { + lxb_css_selector_list_destroy_memory(list); + } + if (selectors) { + lxb_selectors_destroy(selectors, true); + } + if (parser) { + lxb_css_parser_destroy(parser, true); + } - return LXB_STATUS_OK; + return status; } struct request_data_handler_response parse_html(struct request_data_handler_response response, const url_source_request_data *request_data) { - lxb_status_t status; - lxb_html_document_t *document; + lxb_html_document_t *document = nullptr; - document = lxb_html_document_create(); - if (document == NULL) { - return make_fail_parse_response("Failed to setup HTML parser"); - } + try { + document = lxb_html_document_create(); + if (!document) { + return make_fail_parse_response("Failed to create HTML document"); + } - status = lxb_html_document_parse(document, (const lxb_char_t *)response.body.c_str(), - response.body.length()); - if (status != LXB_STATUS_OK) { - return make_fail_parse_response("Failed to parse HTML"); - } + lxb_status_t status = + lxb_html_document_parse(document, (const lxb_char_t *)response.body.c_str(), + response.body.length()); - std::string parsed_output = response.body; - // Get the output value - if (request_data->output_cssselector != "") { - std::vector found; - if (find_with_selectors(request_data->output_cssselector, document, found) != - LXB_STATUS_OK) { - return make_fail_parse_response("Failed to find element with CSS selector"); - } else { - if (found.size() > 0) { - std::copy(found.begin(), found.end(), - std::back_inserter(response.body_parts_parsed)); + if (status != LXB_STATUS_OK) { + lxb_html_document_destroy(document); + return make_fail_parse_response("Failed to parse HTML"); + } + + if (!request_data->output_cssselector.empty()) { + std::vector found; + status = find_with_selectors(request_data->output_cssselector, document, + found); + + if (status != LXB_STATUS_OK) { + lxb_html_document_destroy(document); + return make_fail_parse_response( + "Failed to find element with CSS selector"); } + + response.body_parts_parsed = std::move(found); + } else { + response.body_parts_parsed.push_back(response.body); } - } else { - // Return the whole HTML object - response.body_parts_parsed.push_back(parsed_output); - } - return response; + lxb_html_document_destroy(document); + return response; + + } catch (const std::exception &e) { + if (document) { + lxb_html_document_destroy(document); + } + return make_fail_parse_response(std::string("HTML parsing exception: ") + e.what()); + } } diff --git a/src/parsers/jsonpath.cpp b/src/parsers/jsonpath.cpp index 993a1d7..bec1ab8 100644 --- a/src/parsers/jsonpath.cpp +++ b/src/parsers/jsonpath.cpp @@ -2,67 +2,56 @@ #include "errors.h" #include -#include #include -#include #include +#include struct request_data_handler_response parse_json(struct request_data_handler_response response, const url_source_request_data *request_data) { UNUSED_PARAMETER(request_data); - - // Parse the response as JSON - jsoncons::json json; try { - json = jsoncons::json::parse(response.body); + // Parse JSON only once and store in both formats + auto json_cons = jsoncons::json::parse(response.body); response.body_json = nlohmann::json::parse(response.body); - } catch (jsoncons::json_exception &e) { + return response; + } catch (const jsoncons::json_exception &e) { return make_fail_parse_response(e.what()); - } catch (nlohmann::json::parse_error &e) { + } catch (const nlohmann::json::exception &e) { return make_fail_parse_response(e.what()); } - // Return the whole JSON object - response.body_parts_parsed.push_back(json.as_string()); - return response; } struct request_data_handler_response parse_json_path(struct request_data_handler_response response, const url_source_request_data *request_data) { - - // Parse the response as JSON - jsoncons::json json; try { - json = jsoncons::json::parse(response.body); + auto json = jsoncons::json::parse(response.body); response.body_json = nlohmann::json::parse(response.body); - } catch (jsoncons::json_exception &e) { - return make_fail_parse_response(e.what()); - } catch (nlohmann::json::parse_error &e) { - return make_fail_parse_response(e.what()); - } - std::vector parsed_output = {}; - // Get the output value - if (request_data->output_json_path != "") { - try { - const auto value = jsoncons::jsonpath::json_query( - json, request_data->output_json_path); + + if (!request_data->output_json_path.empty()) { + // Create and evaluate JSONPath expression + auto value = jsoncons::jsonpath::json_query(json, + request_data->output_json_path); + if (value.is_array()) { - // extract array items as strings + response.body_parts_parsed.reserve(value.size()); for (const auto &item : value.array_range()) { - parsed_output.push_back(item.as_string()); + response.body_parts_parsed.push_back( + item.as()); } } else { - parsed_output.push_back(value.as_string()); + response.body_parts_parsed.push_back(value.as()); } - } catch (jsoncons::json_exception &e) { - return make_fail_parse_response(e.what()); + } else { + response.body_parts_parsed.push_back(json.as()); } - } else { - // Return the whole JSON object - parsed_output.clear(); - parsed_output.push_back(json.as_string()); + + return response; + + } catch (const jsoncons::jsonpath::jsonpath_error &e) { + return make_fail_parse_response(std::string("JSONPath error: ") + e.what()); + } catch (const std::exception &e) { + return make_fail_parse_response(std::string("JSON parse error: ") + e.what()); } - response.body_parts_parsed = parsed_output; - return response; } diff --git a/src/parsers/regex.cpp b/src/parsers/regex.cpp index 4fc3bf3..dae2842 100644 --- a/src/parsers/regex.cpp +++ b/src/parsers/regex.cpp @@ -1,6 +1,6 @@ - #include "request-data.h" #include "plugin-support.h" +#include "errors.h" #include #include @@ -8,30 +8,35 @@ struct request_data_handler_response parse_regex(struct request_data_handler_response response, const url_source_request_data *request_data) { - std::string parsed_output = ""; - if (request_data->output_regex == "") { - // Return the whole response body - parsed_output = response.body; - } else { - // Parse the response as a regex - std::regex regex(request_data->output_regex, - std::regex_constants::ECMAScript | std::regex_constants::optimize); + try { + if (request_data->output_regex.empty()) { + response.body_parts_parsed.push_back(response.body); + return response; + } + + // Cache compiled regex patterns for better performance + static thread_local std::unordered_map regex_cache; + + auto ®ex = regex_cache[request_data->output_regex]; + if (regex_cache.find(request_data->output_regex) == regex_cache.end()) { + regex = std::regex(request_data->output_regex, + std::regex_constants::ECMAScript | + std::regex_constants::optimize); + } + std::smatch match; if (std::regex_search(response.body, match, regex)) { - if (match.size() > 1) { - parsed_output = match[1].str(); - } else { - parsed_output = match[0].str(); - } - } else { - obs_log(LOG_INFO, "Failed to match regex"); - // Return an error response - struct request_data_handler_response responseFail; - responseFail.error_message = "Failed to match regex"; - responseFail.status_code = URL_SOURCE_REQUEST_PARSING_ERROR_CODE; - return responseFail; + // Get the appropriate capture group + size_t group = match.size() > 1 ? 1 : 0; + response.body_parts_parsed.push_back(match[group].str()); + return response; } + + return make_fail_parse_response("No regex match found"); + + } catch (const std::regex_error &e) { + return make_fail_parse_response(std::string("Regex error: ") + e.what()); + } catch (const std::exception &e) { + return make_fail_parse_response(std::string("Parse error: ") + e.what()); } - response.body_parts_parsed.push_back(parsed_output); - return response; } diff --git a/src/parsers/xml.cpp b/src/parsers/xml.cpp index 8bca2e1..de6929f 100644 --- a/src/parsers/xml.cpp +++ b/src/parsers/xml.cpp @@ -1,4 +1,3 @@ - #include "request-data.h" #include "plugin-support.h" #include "errors.h" @@ -9,69 +8,83 @@ struct request_data_handler_response parse_xml(struct request_data_handler_response response, const url_source_request_data *request_data) { - // Parse the response as XML using pugixml pugi::xml_document doc; + try { - pugi::xml_parse_result result = doc.load_string(response.body.c_str()); - if (!result) { + // Use load_buffer instead of load_string for better performance with known size + pugi::xml_parse_result result = + doc.load_buffer(response.body.c_str(), response.body.size(), + pugi::parse_default, pugi::encoding_utf8); + if (!result) { obs_log(LOG_INFO, "Failed to parse XML response: %s", result.description()); return make_fail_parse_response(result.description()); } + if (doc.empty()) { - obs_log(LOG_INFO, "Failed to parse XML response: Empty XML"); - return make_fail_parse_response("Empty XML"); + return make_fail_parse_response("Empty XML document"); } - std::string parsed_output = ""; - // Get the output value - if (request_data->output_xpath != "") { + + if (!request_data->output_xpath.empty()) { obs_log(LOG_INFO, "Parsing XML with XPath: %s", request_data->output_xpath.c_str()); - pugi::xpath_node_set nodes = - doc.select_nodes(request_data->output_xpath.c_str()); - if (nodes.size() > 0) { - parsed_output = nodes[0].node().text().get(); - } else { - obs_log(LOG_INFO, "Failed to get XML value"); - return make_fail_parse_response("Failed to get XML value"); + + // Compile XPath expression once for better performance + pugi::xpath_query query(request_data->output_xpath.c_str()); + pugi::xpath_node_set nodes = query.evaluate_node_set(doc); + + if (nodes.empty()) { + return make_fail_parse_response("XPath query returned no results"); + } + + // Get all matching nodes + for (const auto &node : nodes) { + response.body_parts_parsed.push_back(node.node().text().get()); } } else { // Return the whole XML object - parsed_output = response.body; + response.body_parts_parsed.push_back(response.body); } - response.body_parts_parsed.push_back(parsed_output); + + return response; + + } catch (const pugi::xpath_exception &e) { + obs_log(LOG_INFO, "XPath evaluation failed: %s", e.what()); + return make_fail_parse_response(e.what()); } catch (const std::exception &e) { - obs_log(LOG_INFO, "Failed to parse XML response: %s", e.what()); + obs_log(LOG_INFO, "XML parsing failed: %s", e.what()); return make_fail_parse_response(e.what()); } - return response; } struct request_data_handler_response parse_xml_by_xquery(struct request_data_handler_response response, const url_source_request_data *request_data) { - // Parse the response as XML using pugixml pugi::xml_document doc; - pugi::xml_parse_result result = doc.load_string(response.body.c_str()); - if (!result) { - obs_log(LOG_INFO, "Failed to parse XML response: %s", result.description()); - // Return an error response - struct request_data_handler_response responseFail; - responseFail.error_message = result.description(); - responseFail.status_code = URL_SOURCE_REQUEST_PARSING_ERROR_CODE; - return responseFail; - } - std::string parsed_output = ""; - // Get the output value - if (request_data->output_xquery != "") { - pugi::xpath_query query_entity(request_data->output_xquery.c_str()); - std::string s = query_entity.evaluate_string(doc); - parsed_output = s; - } else { - // Return the whole XML object - parsed_output = response.body; + + try { + pugi::xml_parse_result result = + doc.load_buffer(response.body.c_str(), response.body.size(), + pugi::parse_default, pugi::encoding_utf8); + + if (!result) { + return make_fail_parse_response(result.description()); + } + + if (!request_data->output_xquery.empty()) { + pugi::xpath_query query(request_data->output_xquery.c_str()); + std::string result = query.evaluate_string(doc); + response.body_parts_parsed.push_back(std::move(result)); + } else { + response.body_parts_parsed.push_back(response.body); + } + + return response; + + } catch (const pugi::xpath_exception &e) { + return make_fail_parse_response(e.what()); + } catch (const std::exception &e) { + return make_fail_parse_response(e.what()); } - response.body_parts_parsed.push_back(parsed_output); - return response; }