Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[experiment] gumbo using a malloc arena #2790

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion ext/nokogiri/gumbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ VALUE cNokogiriHtml5Document;
static ID internal_subset;
static ID parent;

/* Size in bytes (10 MiB) of the backing buffer passed to gumbo_arena_init()
 * before a document or fragment parse. */
#define GUMBO_ARENA_SIZE (10 * 1024 * 1024)

/* Backwards compatibility to Ruby 2.1.0 */
#if RUBY_API_VERSION_CODE < 20200
#define ONIG_ESCAPE_UCHAR_COLLISION 1
Expand Down Expand Up @@ -273,7 +275,7 @@ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
char *msg;
size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
VALUE err_str = rb_utf8_str_new(msg, size);
free(msg);
gumbo_free(msg);
VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
const char *error_code = gumbo_error_code(err);
VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
Expand Down Expand Up @@ -313,6 +315,7 @@ parse_cleanup(VALUE parse_args)
if (args->doc != NULL) {
xmlFreeDoc(args->doc);
}
gumbo_arena_free_all();
return Qnil;
}

Expand All @@ -329,6 +332,8 @@ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors
options.max_errors = NUM2INT(max_errors);
options.max_tree_depth = NUM2INT(max_depth);

gumbo_arena_init(GUMBO_ARENA_SIZE);

GumboOutput *output = perform_parse(&options, input);
ParseArgs args = {
.output = output,
Expand Down Expand Up @@ -547,6 +552,8 @@ fragment(
options.quirks_mode = quirks_mode;
options.fragment_context_has_form_ancestor = form;

gumbo_arena_init(GUMBO_ARENA_SIZE);

GumboOutput *output = perform_parse(&options, tags);
ParseArgs args = {
.output = output,
Expand Down
4 changes: 4 additions & 0 deletions gumbo-parser/src/nokogiri_gumbo.h
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,10 @@ void gumbo_print_caret_diagnostic (
size_t source_length
);

/**
 * Releases `ptr` through Gumbo's allocator. When the arena allocator is
 * compiled in this does nothing; otherwise it forwards to free().
 */
void gumbo_free(void* ptr);

/**
 * Allocates a backing buffer of `backing_buffer_length` bytes for the arena
 * allocator and resets its offsets. Does nothing when the arena allocator is
 * not compiled in.
 */
void gumbo_arena_init(size_t backing_buffer_length);

/**
 * Frees the arena's backing buffer and resets the arena state; every pointer
 * previously handed out from the arena becomes invalid. Does nothing when the
 * arena allocator is not compiled in.
 */
void gumbo_arena_free_all(void);

#ifdef __cplusplus
}
#endif
Expand Down
3 changes: 2 additions & 1 deletion gumbo-parser/src/string_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,13 @@ static void maybe_resize_string_buffer (
GumboStringBuffer* buffer
) {
size_t new_length = buffer->length + additional_chars;
size_t prev_capacity = buffer->capacity;
size_t new_capacity = buffer->capacity;
while (new_capacity < new_length) {
new_capacity *= 2;
}
if (new_capacity != buffer->capacity) {
buffer->data = gumbo_realloc(buffer->data, new_capacity);
buffer->data = gumbo_realloc(buffer->data, prev_capacity, new_capacity);
buffer->capacity = new_capacity;
}
}
Expand Down
3 changes: 2 additions & 1 deletion gumbo-parser/src/token_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,13 @@ void gumbo_character_token_buffer_append (
assert(token->type == GUMBO_TOKEN_WHITESPACE
|| token->type == GUMBO_TOKEN_CHARACTER);
if (buffer->length == buffer->capacity) {
size_t prev_bytes = sizeof(*buffer->data) * buffer->capacity;
if (buffer->capacity == 0)
buffer->capacity = 10;
else
buffer->capacity *= 2;
size_t bytes = sizeof(*buffer->data) * buffer->capacity;
buffer->data = gumbo_realloc(buffer->data, bytes);
buffer->data = gumbo_realloc(buffer->data, prev_bytes, bytes);
}
size_t index = buffer->length++;
buffer->data[index].position = token->position;
Expand Down
144 changes: 142 additions & 2 deletions gumbo-parser/src/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,153 @@
#include "util.h"
#include "nokogiri_gumbo.h"

#if GUMBO_USE_ARENA
#include <stdint.h>
#include <assert.h>

// Returns true when 'x' has exactly one bit set, i.e. is a power of two.
// Zero is rejected explicitly: (0 & (0-1)) == 0 would otherwise report it as
// a power of two, and an alignment of zero is never valid.
static bool is_power_of_two(uintptr_t x) {
  return x != 0 && (x & (x - 1)) == 0;
}

// Rounds 'ptr' up to the next multiple of 'align', which must be a power of
// two. An address that is already aligned is returned unchanged.
static uintptr_t align_forward(uintptr_t ptr, size_t align) {
  assert(is_power_of_two(align));

  const uintptr_t alignment = (uintptr_t)align;
  // With a power-of-two alignment, masking the low bits equals 'ptr % align'.
  const uintptr_t remainder = ptr & (alignment - 1);

  if (remainder == 0) {
    return ptr;
  }
  // Skip ahead by however many bytes are missing to reach the next boundary.
  return ptr + (alignment - remainder);
}

// Alignment used by the arena wrappers that take no explicit alignment:
// twice the size of a pointer. Can be overridden by defining
// DEFAULT_ALIGNMENT before this point.
#ifndef DEFAULT_ALIGNMENT
#define DEFAULT_ALIGNMENT (2*sizeof(void *))
#endif

// State of the arena allocator: one backing buffer plus the offsets that
// record how much of it has been handed out.
typedef struct Arena {
  unsigned char *buf;  // Backing buffer; NULL while no arena is set up
  size_t buf_len;      // Size of 'buf' in bytes
  size_t prev_offset;  // Start of the most recent allocation (lets it be resized in place)
  size_t curr_offset;  // Offset of the first byte not yet handed out
} Arena;

// The one arena instance used by the gumbo_arena_* functions below.
static Arena gumbo_arena;

// Sets up the arena with a freshly malloc'd backing buffer of
// 'backing_buffer_length' bytes and rewinds both offsets to the start.
//
// Allocation failure is fatal, matching gumbo_alloc(): the error is reported
// with perror() and the process aborts. Previously a failed malloc left
// 'buf' NULL while 'buf_len' still advertised the full size, so the first
// allocation would have written through a near-NULL pointer.
//
// NOTE: a buffer left over from an earlier call is not released here; each
// call is expected to be paired with gumbo_arena_free_all().
void gumbo_arena_init(size_t backing_buffer_length) {
  unsigned char *backing_buffer = malloc(backing_buffer_length);

  // malloc(0) may legitimately return NULL, so only a non-empty request
  // counts as a failure.
  if (backing_buffer == NULL && backing_buffer_length != 0) {
    perror(__func__);
    abort();
  }

  gumbo_arena.buf = backing_buffer;
  // Never advertise capacity that is not actually backed by memory.
  gumbo_arena.buf_len = (backing_buffer != NULL) ? backing_buffer_length : 0;
  gumbo_arena.curr_offset = 0;
  gumbo_arena.prev_offset = 0;
}

// Releases the arena's backing buffer in one go and returns the arena to its
// empty state. Every pointer handed out from the arena is invalid afterwards.
void gumbo_arena_free_all(void) {
  free(gumbo_arena.buf);

  // Clear all bookkeeping so nothing refers to the freed buffer.
  gumbo_arena.prev_offset = 0;
  gumbo_arena.curr_offset = 0;
  gumbo_arena.buf_len = 0;
  gumbo_arena.buf = NULL;
}

// Carves 'size' zero-filled bytes out of the arena, aligned to 'align'
// (a power of two). Returns NULL (after asserting in debug builds) when the
// arena has not been set up or does not have enough room left.
static void *gumbo_arena_alloc_align(size_t size, size_t align) {
  // Align the absolute address, then convert back to an offset into 'buf'.
  uintptr_t base = (uintptr_t)gumbo_arena.buf;
  uintptr_t curr_ptr = base + (uintptr_t)gumbo_arena.curr_offset;
  size_t offset = (size_t)(align_forward(curr_ptr, align) - base);

  // Compare against the remaining space with a subtraction instead of
  // computing 'offset + size', which can wrap around for a huge 'size' and
  // wrongly pass the bounds check.
  if (gumbo_arena.buf != NULL
      && offset <= gumbo_arena.buf_len
      && size <= gumbo_arena.buf_len - offset) {
    void *ptr = &gumbo_arena.buf[offset];
    gumbo_arena.prev_offset = offset;
    gumbo_arena.curr_offset = offset + size;

    // Zero new memory by default
    memset(ptr, 0, size);
    return ptr;
  }

  // Out of space (or no backing buffer): loud in debug builds, NULL otherwise.
  assert(0 && "arena out of memory");
  return NULL;
}

// Shorthand for gumbo_arena_alloc_align() using DEFAULT_ALIGNMENT; the
// returned memory is zero-filled.
static void *gumbo_arena_alloc(size_t size) {
  void *block = gumbo_arena_alloc_align(size, DEFAULT_ALIGNMENT);
  return block;
}

// Individual arena allocations are never released on their own; the memory
// is reclaimed all at once by gumbo_arena_free_all().
static void gumbo_arena_free(void *ptr) {
  (void)ptr;  // Intentionally unused
}

// Resizes an arena allocation from 'old_size' to 'new_size' bytes.
//
//  - NULL / zero-sized input behaves like a fresh allocation.
//  - If 'old_memory' is the most recent allocation it is grown or shrunk in
//    place; any newly exposed bytes are zeroed.
//  - Otherwise a new block is allocated and min(old_size, new_size) bytes are
//    copied over; the old block is simply abandoned.
//
// Returns NULL (after asserting in debug builds) when 'old_memory' does not
// belong to the arena or the arena cannot satisfy the request.
static void *gumbo_arena_resize_align(void *old_memory, size_t old_size, size_t new_size, size_t align) {
  unsigned char *old_mem = (unsigned char *)old_memory;

  assert(is_power_of_two(align));

  if (old_mem == NULL || old_size == 0) {
    return gumbo_arena_alloc_align(new_size, align);
  }

  if (old_mem < gumbo_arena.buf || old_mem >= gumbo_arena.buf + gumbo_arena.buf_len) {
    assert(0 && "Memory is out of bounds of the buffer in this arena");
    return NULL;
  }

  if (gumbo_arena.buf + gumbo_arena.prev_offset == old_mem) {
    // In-place resize of the last allocation. Make sure the new extent still
    // fits in the backing buffer; without this check a grow could run past
    // the end of 'buf'.
    if (new_size > gumbo_arena.buf_len - gumbo_arena.prev_offset) {
      assert(0 && "arena out of memory");
      return NULL;
    }
    if (new_size > old_size) {
      // Zero only the bytes that were just added, i.e. the tail that starts
      // right after the old contents.
      memset(old_mem + old_size, 0, new_size - old_size);
    }
    gumbo_arena.curr_offset = gumbo_arena.prev_offset + new_size;
    return old_memory;
  }

  // Not the last allocation: move the contents into a new block.
  void *new_memory = gumbo_arena_alloc_align(new_size, align);
  if (new_memory == NULL) {
    // Allocation failed (asserts may be compiled out); do not copy into NULL.
    return NULL;
  }
  size_t copy_size = old_size < new_size ? old_size : new_size;
  memmove(new_memory, old_memory, copy_size);
  return new_memory;
}

// Shorthand for gumbo_arena_resize_align() using DEFAULT_ALIGNMENT.
static void *gumbo_arena_resize(void *old_memory, size_t old_size, size_t new_size) {
  void *resized = gumbo_arena_resize_align(old_memory, old_size, new_size,
                                           DEFAULT_ALIGNMENT);
  return resized;
}
#else
// Arena support is compiled out: there is nothing to set up.
void gumbo_arena_init(size_t backing_buffer_length) {
  (void)backing_buffer_length;  // Intentionally unused
}

// Arena support is compiled out: there is nothing to release.
void gumbo_arena_free_all(void) {
  return;
}
#endif /* GUMBO_USE_ARENA */

// Allocates 'size' bytes: from the arena when GUMBO_USE_ARENA is enabled,
// from malloc() otherwise. Never returns NULL; if the allocation fails the
// error is reported with perror() and the process aborts.
void* gumbo_alloc(size_t size) {
  void* ptr;

#if GUMBO_USE_ARENA
  ptr = gumbo_arena_alloc(size);
#else
  ptr = malloc(size);
#endif

  if (unlikely(!ptr)) {
    perror(__func__);
    abort();
  }
  return ptr;
}

void* gumbo_realloc(void* ptr, size_t size) {
ptr = realloc(ptr, size);
void* gumbo_realloc(void* prev_ptr, size_t prev_size, size_t size) {
#if GUMBO_USE_ARENA
void* ptr = gumbo_arena_resize(prev_ptr, prev_size, size);
#else
void* ptr = realloc(prev_ptr, size);
#endif
if (unlikely(ptr == NULL)) {
perror(__func__);
abort();
Expand All @@ -40,7 +176,11 @@ void* gumbo_realloc(void* ptr, size_t size) {
}

// Releases 'ptr'. Without the arena the pointer is returned to the C heap
// via free(); with the arena enabled it is passed to gumbo_arena_free().
void gumbo_free(void* ptr) {
#if !GUMBO_USE_ARENA
  free(ptr);
#else
  gumbo_arena_free(ptr);
#endif
}

char* gumbo_strdup(const char* str) {
Expand Down
5 changes: 3 additions & 2 deletions gumbo-parser/src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <stddef.h>
#include "macros.h"

#define GUMBO_USE_ARENA 1

#ifdef __cplusplus
extern "C" {
#endif
Expand All @@ -17,8 +19,7 @@ extern "C" {
char* gumbo_strdup(const char* str) XMALLOC NONNULL_ARGS;

void* gumbo_alloc(size_t size) XMALLOC;
void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
void gumbo_free(void* ptr);
void* gumbo_realloc(void* prev_ptr, size_t prev_size, size_t size) RETURNS_NONNULL;

// Debug wrapper for printf
#ifdef GUMBO_DEBUG
Expand Down
3 changes: 2 additions & 1 deletion gumbo-parser/src/vector.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ void gumbo_vector_destroy(GumboVector* vector) {
static void enlarge_vector_if_full(GumboVector* vector) {
if (vector->length >= vector->capacity) {
if (vector->capacity) {
size_t prev_num_bytes = sizeof(void*) * vector->capacity;
vector->capacity *= 2;
size_t num_bytes = sizeof(void*) * vector->capacity;
vector->data = gumbo_realloc(vector->data, num_bytes);
vector->data = gumbo_realloc(vector->data, prev_num_bytes, num_bytes);
} else {
// 0-capacity vector; no previous array to deallocate.
vector->capacity = 2;
Expand Down