Skip to content

Commit

Permalink
Fix major mem leak in HTML parsing (#126)
Browse files: browse the repository at this point in the history
* Refactor README.md to update download links and badges for different platforms

* Refactor HTML and JSON parsers
  • Loading branch information
royshil authored Oct 28, 2024
1 parent b162f12 commit 431cfb1
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 175 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@

<div align="center">
Download</br>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.6/obs-urlsource-0.3.6-windows-x64-Installer.exe"><img src="https://img.shields.io/badge/Windows-0078D6?style=for-the-badge" /></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.6/obs-urlsource-0.3.6-macos-universal.pkg"><img src="https://img.shields.io/badge/Mac-000000?style=for-the-badge&logo=Apple" /></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.6/obs-urlsource-0.3.6-x86_64-linux-gnu.deb"><img src="https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=black"/></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.7/obs-urlsource-0.3.7-windows-x64-Installer.exe"><img src="https://img.shields.io/badge/Windows-0078D6?style=for-the-badge" /></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.7/obs-urlsource-0.3.7-macos-universal.pkg"><img src="https://img.shields.io/badge/Mac-000000?style=for-the-badge&logo=Apple" /></a>
<a href="https://github.com/locaal-ai/obs-urlsource/releases/download/0.3.7/obs-urlsource-0.3.7-x86_64-linux-gnu.deb"><img src="https://img.shields.io/badge/Linux-FCC624?style=for-the-badge&logo=linux&logoColor=black"/></a>
</div>

## Introduction
Expand Down
2 changes: 1 addition & 1 deletion buildspec.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
}
},
"name": "obs-urlsource",
"version": "0.3.6",
"version": "0.3.7",
"author": "Roy Shilkrot",
"website": "https://github.com/locaal-ai/obs-urlsource",
"email": "roy.shil@gmail.com",
Expand Down
166 changes: 96 additions & 70 deletions src/parsers/html.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,99 +20,125 @@ lxb_status_t find_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t
{
UNUSED_PARAMETER(spec);
std::string str;
(void)lxb_html_serialize_deep_cb(node, serializer_callback, &str);
((std::vector<std::string> *)data)->push_back(str);
return LXB_STATUS_OK;
lxb_status_t status = lxb_html_serialize_deep_cb(node, serializer_callback, &str);
if (status == LXB_STATUS_OK) {
((std::vector<std::string> *)data)->push_back(str);
}
return status;
}

lxb_status_t find_with_selectors(const std::string &slctrs, lxb_html_document_t *document,
std::vector<std::string> &found)
{
/* Create CSS parser. */
lxb_css_parser_t *parser;
lxb_css_selector_list_t *list;
lxb_status_t status;
lxb_dom_node_t *body;
lxb_selectors_t *selectors;

parser = lxb_css_parser_create();
status = lxb_css_parser_init(parser, NULL);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to setup CSS parser");
return EXIT_FAILURE;
}
lxb_css_parser_t *parser = nullptr;
lxb_css_selector_list_t *list = nullptr;
lxb_selectors_t *selectors = nullptr;
lxb_status_t status = LXB_STATUS_ERROR;

do {
parser = lxb_css_parser_create();
if (!parser) {
obs_log(LOG_ERROR, "Failed to create CSS parser");
break;
}

/* Selectors. */
selectors = lxb_selectors_create();
status = lxb_selectors_init(selectors);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to setup Selectors");
return EXIT_FAILURE;
}
status = lxb_css_parser_init(parser, nullptr);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to init CSS parser");
break;
}

/* Parse and get the log. */
selectors = lxb_selectors_create();
if (!selectors) {
obs_log(LOG_ERROR, "Failed to create selectors");
break;
}

list = lxb_css_selectors_parse(parser, (const lxb_char_t *)slctrs.c_str(), slctrs.length());
if (parser->status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to parse CSS selectors");
return EXIT_FAILURE;
}
status = lxb_selectors_init(selectors);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to init selectors");
break;
}

/* Find HTML nodes by CSS Selectors. */
body = lxb_dom_interface_node(lxb_html_document_body_element(document));
list = lxb_css_selectors_parse(parser, (const lxb_char_t *)slctrs.c_str(),
slctrs.length());
if (!list || parser->status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to parse CSS selectors");
break;
}

status = lxb_selectors_find(selectors, body, list, find_callback, &found);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to find HTML nodes by CSS Selectors");
return EXIT_FAILURE;
}
lxb_dom_node_t *body =
lxb_dom_interface_node(lxb_html_document_body_element(document));
if (!body) {
obs_log(LOG_ERROR, "Failed to get document body");
break;
}

/* Destroy Selectors object. */
(void)lxb_selectors_destroy(selectors, true);
status = lxb_selectors_find(selectors, body, list, find_callback, &found);
if (status != LXB_STATUS_OK) {
obs_log(LOG_ERROR, "Failed to find nodes by CSS Selectors");
break;
}

/* Destroy resources for CSS Parser. */
(void)lxb_css_parser_destroy(parser, true);
} while (0);

/* Destroy all object for all CSS Selector List. */
lxb_css_selector_list_destroy_memory(list);
// Cleanup
if (list) {
lxb_css_selector_list_destroy_memory(list);
}
if (selectors) {
lxb_selectors_destroy(selectors, true);
}
if (parser) {
lxb_css_parser_destroy(parser, true);
}

return LXB_STATUS_OK;
return status;
}

struct request_data_handler_response parse_html(struct request_data_handler_response response,
const url_source_request_data *request_data)
{
lxb_status_t status;
lxb_html_document_t *document;
lxb_html_document_t *document = nullptr;

document = lxb_html_document_create();
if (document == NULL) {
return make_fail_parse_response("Failed to setup HTML parser");
}
try {
document = lxb_html_document_create();
if (!document) {
return make_fail_parse_response("Failed to create HTML document");
}

status = lxb_html_document_parse(document, (const lxb_char_t *)response.body.c_str(),
response.body.length());
if (status != LXB_STATUS_OK) {
return make_fail_parse_response("Failed to parse HTML");
}
lxb_status_t status =
lxb_html_document_parse(document, (const lxb_char_t *)response.body.c_str(),
response.body.length());

std::string parsed_output = response.body;
// Get the output value
if (request_data->output_cssselector != "") {
std::vector<std::string> found;
if (find_with_selectors(request_data->output_cssselector, document, found) !=
LXB_STATUS_OK) {
return make_fail_parse_response("Failed to find element with CSS selector");
} else {
if (found.size() > 0) {
std::copy(found.begin(), found.end(),
std::back_inserter(response.body_parts_parsed));
if (status != LXB_STATUS_OK) {
lxb_html_document_destroy(document);
return make_fail_parse_response("Failed to parse HTML");
}

if (!request_data->output_cssselector.empty()) {
std::vector<std::string> found;
status = find_with_selectors(request_data->output_cssselector, document,
found);

if (status != LXB_STATUS_OK) {
lxb_html_document_destroy(document);
return make_fail_parse_response(
"Failed to find element with CSS selector");
}

response.body_parts_parsed = std::move(found);
} else {
response.body_parts_parsed.push_back(response.body);
}
} else {
// Return the whole HTML object
response.body_parts_parsed.push_back(parsed_output);
}

return response;
lxb_html_document_destroy(document);
return response;

} catch (const std::exception &e) {
if (document) {
lxb_html_document_destroy(document);
}
return make_fail_parse_response(std::string("HTML parsing exception: ") + e.what());
}
}
63 changes: 26 additions & 37 deletions src/parsers/jsonpath.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,67 +2,56 @@
#include "errors.h"

#include <jsoncons/basic_json.hpp>
#include <jsoncons/json_parser.hpp>
#include <jsoncons_ext/jsonpath/jsonpath.hpp>
#include <obs-module.h>
#include <nlohmann/json.hpp>
#include <util/c99defs.h>

struct request_data_handler_response parse_json(struct request_data_handler_response response,
const url_source_request_data *request_data)
{
UNUSED_PARAMETER(request_data);

// Parse the response as JSON
jsoncons::json json;
try {
json = jsoncons::json::parse(response.body);
// Parse JSON only once and store in both formats
auto json_cons = jsoncons::json::parse(response.body);
response.body_json = nlohmann::json::parse(response.body);
} catch (jsoncons::json_exception &e) {
return response;
} catch (const jsoncons::json_exception &e) {
return make_fail_parse_response(e.what());
} catch (nlohmann::json::parse_error &e) {
} catch (const nlohmann::json::exception &e) {
return make_fail_parse_response(e.what());
}
// Return the whole JSON object
response.body_parts_parsed.push_back(json.as_string());
return response;
}

struct request_data_handler_response parse_json_path(struct request_data_handler_response response,
const url_source_request_data *request_data)
{

// Parse the response as JSON
jsoncons::json json;
try {
json = jsoncons::json::parse(response.body);
auto json = jsoncons::json::parse(response.body);
response.body_json = nlohmann::json::parse(response.body);
} catch (jsoncons::json_exception &e) {
return make_fail_parse_response(e.what());
} catch (nlohmann::json::parse_error &e) {
return make_fail_parse_response(e.what());
}
std::vector<std::string> parsed_output = {};
// Get the output value
if (request_data->output_json_path != "") {
try {
const auto value = jsoncons::jsonpath::json_query(
json, request_data->output_json_path);

if (!request_data->output_json_path.empty()) {
// Create and evaluate JSONPath expression
auto value = jsoncons::jsonpath::json_query(json,
request_data->output_json_path);

if (value.is_array()) {
// extract array items as strings
response.body_parts_parsed.reserve(value.size());
for (const auto &item : value.array_range()) {
parsed_output.push_back(item.as_string());
response.body_parts_parsed.push_back(
item.as<std::string>());
}
} else {
parsed_output.push_back(value.as_string());
response.body_parts_parsed.push_back(value.as<std::string>());
}
} catch (jsoncons::json_exception &e) {
return make_fail_parse_response(e.what());
} else {
response.body_parts_parsed.push_back(json.as<std::string>());
}
} else {
// Return the whole JSON object
parsed_output.clear();
parsed_output.push_back(json.as_string());

return response;

} catch (const jsoncons::jsonpath::jsonpath_error &e) {
return make_fail_parse_response(std::string("JSONPath error: ") + e.what());
} catch (const std::exception &e) {
return make_fail_parse_response(std::string("JSON parse error: ") + e.what());
}
response.body_parts_parsed = parsed_output;
return response;
}
51 changes: 28 additions & 23 deletions src/parsers/regex.cpp
Original file line number Diff line number Diff line change
@@ -1,37 +1,42 @@

#include "request-data.h"
#include "plugin-support.h"
#include "errors.h"

#include <regex>
#include <obs-module.h>

struct request_data_handler_response parse_regex(struct request_data_handler_response response,
const url_source_request_data *request_data)
{
std::string parsed_output = "";
if (request_data->output_regex == "") {
// Return the whole response body
parsed_output = response.body;
} else {
// Parse the response as a regex
std::regex regex(request_data->output_regex,
std::regex_constants::ECMAScript | std::regex_constants::optimize);
try {
if (request_data->output_regex.empty()) {
response.body_parts_parsed.push_back(response.body);
return response;
}

// Cache compiled regex patterns for better performance
static thread_local std::unordered_map<std::string, std::regex> regex_cache;

auto &regex = regex_cache[request_data->output_regex];
if (regex_cache.find(request_data->output_regex) == regex_cache.end()) {
regex = std::regex(request_data->output_regex,
std::regex_constants::ECMAScript |
std::regex_constants::optimize);
}

std::smatch match;
if (std::regex_search(response.body, match, regex)) {
if (match.size() > 1) {
parsed_output = match[1].str();
} else {
parsed_output = match[0].str();
}
} else {
obs_log(LOG_INFO, "Failed to match regex");
// Return an error response
struct request_data_handler_response responseFail;
responseFail.error_message = "Failed to match regex";
responseFail.status_code = URL_SOURCE_REQUEST_PARSING_ERROR_CODE;
return responseFail;
// Get the appropriate capture group
size_t group = match.size() > 1 ? 1 : 0;
response.body_parts_parsed.push_back(match[group].str());
return response;
}

return make_fail_parse_response("No regex match found");

} catch (const std::regex_error &e) {
return make_fail_parse_response(std::string("Regex error: ") + e.what());
} catch (const std::exception &e) {
return make_fail_parse_response(std::string("Parse error: ") + e.what());
}
response.body_parts_parsed.push_back(parsed_output);
return response;
}
Loading

0 comments on commit 431cfb1

Please sign in to comment.