Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix major mem leak in HTML parsing #126

Merged
merged 2 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@

<div align="center">
Download<br/>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.6/obs-urlsource-0.3.6-windows-x64-Installer.exe"><img src="https://img.shields.io/badge/Windows-0078D6?style=for-the-badge" /></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.6/obs-urlsource-0.3.6-macos-universal.pkg"><img src="https://img.shields.io/badge/Mac-000000?style=for-the-badge&logo=Apple" /></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.6/obs-urlsource-0.3.6-x86_64-linux-gnu.deb"><img src="https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=black"/></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.7/obs-urlsource-0.3.7-windows-x64-Installer.exe"><img src="https://img.shields.io/badge/Windows-0078D6?style=for-the-badge" /></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.7/obs-urlsource-0.3.7-macos-universal.pkg"><img src="https://img.shields.io/badge/Mac-000000?style=for-the-badge&logo=Apple" /></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.7/obs-urlsource-0.3.7-x86_64-linux-gnu.deb"><img src="https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=black"/></a>
</div>

## Introduction
Expand Down
2 changes: 1 addition & 1 deletion buildspec.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
}
},
"name": "obs-urlsource",
"version": "0.3.6",
"version": "0.3.7",
"author": "Roy Shilkrot",
"website": "https://github.com/locaal-ai/obs-urlsource",
"email": "roy.shil@gmail.com",
Expand Down
166 changes: 96 additions & 70 deletions src/parsers/html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,99 +20,125 @@ lxb_status_t find_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t
{
UNUSED_PARAMETER(spec);
std::string str;
(void)lxb_html_serialize_deep_cb(node, serializer_callback, &str);
((std::vector<std::string> *)data)->push_back(str);
return LXB_STATUS_OK;
lxb_status_t status = lxb_html_serialize_deep_cb(node, serializer_callback, &str);
if (status == LXB_STATUS_OK) {
((std::vector<std::string> *)data)->push_back(str);
}
return status;
}

lxb_status_t find_with_selectors(const std::string &slctrs, lxb_html_document_t *document,
std::vector<std::string> &found)
{
/* Create CSS parser. */
lxb_css_parser_t *parser;
lxb_css_selector_list_t *list;
lxb_status_t status;
lxb_dom_node_t *body;
lxb_selectors_t *selectors;

parser = lxb_css_parser_create();
status = lxb_css_parser_init(parser, NULL);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to setup CSS parser");
return EXIT_FAILURE;
}
lxb_css_parser_t *parser = nullptr;
lxb_css_selector_list_t *list = nullptr;
lxb_selectors_t *selectors = nullptr;
lxb_status_t status = LXB_STATUS_ERROR;

do {
parser = lxb_css_parser_create();
if (!parser) {
obs_log(LOG_ERROR, "Failed to create CSS parser");
break;
}

/* Selectors. */
selectors = lxb_selectors_create();
status = lxb_selectors_init(selectors);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to setup Selectors");
return EXIT_FAILURE;
}
status = lxb_css_parser_init(parser, nullptr);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to init CSS parser");
break;
}

/* Parse and get the log. */
selectors = lxb_selectors_create();
if (!selectors) {
obs_log(LOG_ERROR, "Failed to create selectors");
break;
}

list = lxb_css_selectors_parse(parser, (const lxb_char_t *)slctrs.c_str(), slctrs.length());
if (parser->status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to parse CSS selectors");
return EXIT_FAILURE;
}
status = lxb_selectors_init(selectors);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to init selectors");
break;
}

/* Find HTML nodes by CSS Selectors. */
body = lxb_dom_interface_node(lxb_html_document_body_element(document));
list = lxb_css_selectors_parse(parser, (const lxb_char_t *)slctrs.c_str(),
slctrs.length());
if (!list || parser->status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to parse CSS selectors");
break;
}

status = lxb_selectors_find(selectors, body, list, find_callback, &found);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to find HTML nodes by CSS Selectors");
return EXIT_FAILURE;
}
lxb_dom_node_t *body =
lxb_dom_interface_node(lxb_html_document_body_element(document));
if (!body) {
obs_log(LOG_ERROR, "Failed to get document body");
break;
}

/* Destroy Selectors object. */
(void)lxb_selectors_destroy(selectors, true);
status = lxb_selectors_find(selectors, body, list, find_callback, &found);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to find nodes by CSS Selectors");
break;
}

/* Destroy resources for CSS Parser. */
(void)lxb_css_parser_destroy(parser, true);
} while (0);

/* Destroy all objects for all CSS Selector Lists. */
lxb_css_selector_list_destroy_memory(list);
// Cleanup
if (list) {
lxb_css_selector_list_destroy_memory(list);
}
if (selectors) {
lxb_selectors_destroy(selectors, true);
}
if (parser) {
lxb_css_parser_destroy(parser, true);
}

return LXB_STATUS_OK;
return status;
}

struct request_data_handler_response parse_html(struct request_data_handler_response response,
const url_source_request_data *request_data)
{
lxb_status_t status;
lxb_html_document_t *document;
lxb_html_document_t *document = nullptr;

document = lxb_html_document_create();
if (document == NULL) {
return make_fail_parse_response("Failed to setup HTML parser");
}
try {
document = lxb_html_document_create();
if (!document) {
return make_fail_parse_response("Failed to create HTML document");
}

status = lxb_html_document_parse(document, (const lxb_char_t *)response.body.c_str(),
response.body.length());
if (status != LXB_STATUS_OK) {
return make_fail_parse_response("Failed to parse HTML");
}
lxb_status_t status =
lxb_html_document_parse(document, (const lxb_char_t *)response.body.c_str(),
response.body.length());

std::string parsed_output = response.body;
// Get the output value
if (request_data->output_cssselector != "") {
std::vector<std::string> found;
if (find_with_selectors(request_data->output_cssselector, document, found) !=
LXB_STATUS_OK) {
return make_fail_parse_response("Failed to find element with CSS selector");
} else {
if (found.size() > 0) {
std::copy(found.begin(), found.end(),
std::back_inserter(response.body_parts_parsed));
if (status != LXB_STATUS_OK) {
lxb_html_document_destroy(document);
return make_fail_parse_response("Failed to parse HTML");
}

if (!request_data->output_cssselector.empty()) {
std::vector<std::string> found;
status = find_with_selectors(request_data->output_cssselector, document,
found);

if (status != LXB_STATUS_OK) {
lxb_html_document_destroy(document);
return make_fail_parse_response(
"Failed to find element with CSS selector");
}

response.body_parts_parsed = std::move(found);
} else {
response.body_parts_parsed.push_back(response.body);
}
} else {
// Return the whole HTML object
response.body_parts_parsed.push_back(parsed_output);
}

return response;
lxb_html_document_destroy(document);
return response;

} catch (const std::exception &e) {
if (document) {
lxb_html_document_destroy(document);
}
return make_fail_parse_response(std::string("HTML parsing exception: ") + e.what());
}
}
63 changes: 26 additions & 37 deletions src/parsers/jsonpath.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,67 +2,56 @@
#include "errors.h"

#include <jsoncons/basic_json.hpp>
#include <jsoncons/json_parser.hpp>
#include <jsoncons_ext/jsonpath/jsonpath.hpp>
#include <obs-module.h>
#include <nlohmann/json.hpp>
#include <util/c99defs.h>

struct request_data_handler_response parse_json(struct request_data_handler_response response,
const url_source_request_data *request_data)
{
UNUSED_PARAMETER(request_data);

// Parse the response as JSON
jsoncons::json json;
try {
json = jsoncons::json::parse(response.body);
// Parse the body twice: once with jsoncons and once with nlohmann (stored in response.body_json)
auto json_cons = jsoncons::json::parse(response.body);
response.body_json = nlohmann::json::parse(response.body);
} catch (jsoncons::json_exception &e) {
return response;
} catch (const jsoncons::json_exception &e) {
return make_fail_parse_response(e.what());
} catch (nlohmann::json::parse_error &e) {
} catch (const nlohmann::json::exception &e) {
return make_fail_parse_response(e.what());
}
// Return the whole JSON object
response.body_parts_parsed.push_back(json.as_string());
return response;
}

struct request_data_handler_response parse_json_path(struct request_data_handler_response response,
const url_source_request_data *request_data)
{

// Parse the response as JSON
jsoncons::json json;
try {
json = jsoncons::json::parse(response.body);
auto json = jsoncons::json::parse(response.body);
response.body_json = nlohmann::json::parse(response.body);
} catch (jsoncons::json_exception &e) {
return make_fail_parse_response(e.what());
} catch (nlohmann::json::parse_error &e) {
return make_fail_parse_response(e.what());
}
std::vector<std::string> parsed_output = {};
// Get the output value
if (request_data->output_json_path != "") {
try {
const auto value = jsoncons::jsonpath::json_query(
json, request_data->output_json_path);

if (!request_data->output_json_path.empty()) {
// Create and evaluate JSONPath expression
auto value = jsoncons::jsonpath::json_query(json,
request_data->output_json_path);

if (value.is_array()) {
// extract array items as strings
response.body_parts_parsed.reserve(value.size());
for (const auto &item : value.array_range()) {
parsed_output.push_back(item.as_string());
response.body_parts_parsed.push_back(
item.as<std::string>());
}
} else {
parsed_output.push_back(value.as_string());
response.body_parts_parsed.push_back(value.as<std::string>());
}
} catch (jsoncons::json_exception &e) {
return make_fail_parse_response(e.what());
} else {
response.body_parts_parsed.push_back(json.as<std::string>());
}
} else {
// Return the whole JSON object
parsed_output.clear();
parsed_output.push_back(json.as_string());

return response;

} catch (const jsoncons::jsonpath::jsonpath_error &e) {
return make_fail_parse_response(std::string("JSONPath error: ") + e.what());
} catch (const std::exception &e) {
return make_fail_parse_response(std::string("JSON parse error: ") + e.what());
}
response.body_parts_parsed = parsed_output;
return response;
}
51 changes: 28 additions & 23 deletions src/parsers/regex.cpp
Original file line number Diff line number Diff line change
@@ -1,37 +1,42 @@

#include "request-data.h"
#include "plugin-support.h"
#include "errors.h"

#include <regex>
#include <obs-module.h>

struct request_data_handler_response parse_regex(struct request_data_handler_response response,
const url_source_request_data *request_data)
{
std::string parsed_output = "";
if (request_data->output_regex == "") {
// Return the whole response body
parsed_output = response.body;
} else {
// Parse the response as a regex
std::regex regex(request_data->output_regex,
std::regex_constants::ECMAScript | std::regex_constants::optimize);
try {
if (request_data->output_regex.empty()) {
response.body_parts_parsed.push_back(response.body);
return response;
}

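// Cache compiled regex patterns for better performance
// NOTE(review): operator[] below default-inserts the key, so the following find() never returns end() and the pattern is never compiled — look up with find() first, then insert the compiled regex.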
static thread_local std::unordered_map<std::string, std::regex> regex_cache;

auto &regex = regex_cache[request_data->output_regex];
if (regex_cache.find(request_data->output_regex) == regex_cache.end()) {
regex = std::regex(request_data->output_regex,
std::regex_constants::ECMAScript |
std::regex_constants::optimize);
}

std::smatch match;
if (std::regex_search(response.body, match, regex)) {
if (match.size() > 1) {
parsed_output = match[1].str();
} else {
parsed_output = match[0].str();
}
} else {
obs_log(LOG_INFO, "Failed to match regex");
// Return an error response
struct request_data_handler_response responseFail;
responseFail.error_message = "Failed to match regex";
responseFail.status_code = URL_SOURCE_REQUEST_PARSING_ERROR_CODE;
return responseFail;
// Get the appropriate capture group
size_t group = match.size() > 1 ? 1 : 0;
response.body_parts_parsed.push_back(match[group].str());
return response;
}

return make_fail_parse_response("No regex match found");

} catch (const std::regex_error &e) {
return make_fail_parse_response(std::string("Regex error: ") + e.what());
} catch (const std::exception &e) {
return make_fail_parse_response(std::string("Parse error: ") + e.what());
}
response.body_parts_parsed.push_back(parsed_output);
return response;
}
Loading