Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add remaining data types to schema visiting in ffi #187

Merged
merged 29 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1d5a9dd
Add most types other than a few less common primitive ones
nicklan May 1, 2024
32428a5
add remaining types
nicklan May 1, 2024
7f59e54
include pragma once
nicklan May 2, 2024
bcc5a4f
include schema printing
nicklan May 2, 2024
9593038
docs mostly
nicklan May 2, 2024
6fd15cf
fmt
nicklan May 2, 2024
4baa68a
free schema!
nicklan May 2, 2024
8747fe3
cleanup
nicklan May 2, 2024
6c4a173
fix pointer issue
nicklan May 6, 2024
87ef58c
strndup + macro
nicklan May 6, 2024
21a3402
Apply suggestions from code review
nicklan May 6, 2024
78378b8
Merge branch 'main' into add-most-schema-types-to-ffi
nicklan May 7, 2024
0fc9bd7
don't forget long
nicklan May 7, 2024
a3ff414
untabify :sigh:
nicklan May 7, 2024
03c34db
Merge branch 'main' into add-most-schema-types-to-ffi
nicklan May 7, 2024
184e2c3
fix up selection vector stuff
nicklan May 7, 2024
0805f38
maps can have nulls + print if array/map has null in schema.h
nicklan May 7, 2024
76b942b
cleaner visit macro
nicklan May 8, 2024
245da8a
comment update
nicklan May 8, 2024
d479631
clearer todo
nicklan May 8, 2024
872b0e7
fix schema printing for deep nesting
nicklan May 8, 2024
47abc39
use schema appropriate names
nicklan May 8, 2024
84af4ba
Merge branch 'main' into add-most-schema-types-to-ffi
nicklan May 8, 2024
cd86766
Merge branch 'main' into add-most-schema-types-to-ffi
nicklan May 10, 2024
93fc17c
macros are fun
nicklan May 10, 2024
711b16b
Apply suggestions from code review
nicklan May 10, 2024
953c3e8
doc update
nicklan May 10, 2024
2498d10
slightly more consistent formatting
nicklan May 10, 2024
8cdf7f8
more consistent array indexing
nicklan May 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ffi/cbindgen.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
# default to generating c bindings
language = "C"

pragma_once = true

# only applies to Cxx
namespace = "ffi"

Expand Down
13 changes: 8 additions & 5 deletions ffi/examples/read-table/read_table.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <stdio.h>
#include <string.h>

#include "delta_kernel_ffi.h"
#include "schema.h"

#ifdef PRINT_ARROW_DATA
#include "arrow.h"
Expand Down Expand Up @@ -65,13 +67,13 @@ void visit_callback(void* engine_context, const KernelStringSlice path, long siz
return;
}
KernelBoolSlice selection_vector = selection_vector_res.ok;
if (selection_vector) {
if (selection_vector.len > 0) {
printf(" Selection vector:\n");
print_selection_vector(" ", selection_vector);
drop_bool_slice(selection_vector);
print_selection_vector(" ", &selection_vector);
} else {
printf(" No selection vector for this call\n");
}
drop_bool_slice(selection_vector);
// normally this would be picked out of the schema
char* letter_key = "letter";
KernelStringSlice key = {letter_key, strlen(letter_key)};
Expand All @@ -89,7 +91,7 @@ void visit_data(void *engine_context, EngineDataHandle *engine_data, KernelBoolS
printf(" Of this data, here is a selection vector\n");
print_selection_vector(" ", &selection_vec);
visit_scan_data(engine_data, selection_vec, engine_context, visit_callback);
drop_bool_slice(&selection_vec);
drop_bool_slice(selection_vec);
}

int main(int argc, char* argv[]) {
Expand Down Expand Up @@ -145,7 +147,8 @@ int main(int argc, char* argv[]) {
const SnapshotHandle *snapshot_handle = snapshot_handle_res.ok;

uint64_t v = version(snapshot_handle);
printf("version: %llu\n", v);
printf("version: %llu\n\n", v);
print_schema(snapshot_handle);

ExternResultScan scan_res = scan(snapshot_handle, engine, NULL);
if (scan_res.tag != OkScan) {
Expand Down
234 changes: 234 additions & 0 deletions ffi/examples/read-table/schema.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
#include <stdint.h>
#include "delta_kernel_ffi.h"

/**
* This module defines a very simple model of a schema, used only to be able to print the schema of
* a table. It consists of a "SchemaBuilder" which is our user data that gets passed into each visit_x
* call. This simply keeps track of all the lists we are asked to allocate.
*
* Each list is a "SchemaItemList", which tracks its length an an array of "SchemaItem"s.
*
* Each "SchemaItem" has a name and a type, which are just strings. It can also have a list which is
* its children. This is initially always UINTPTR_MAX, but when visiting a struct, map, or array, we
* point this at the list id specified in the callback, which allows us to traverse the schema when
* printing it.
*/

// If you want the visitor to print out what it's being asked to do at each step, uncomment the
// following line
//#define VERBOSE

#ifdef VERBOSE
#define _NTH_ARG(_1, _2, _3, _4, _5, N, ...) N
#define NUMARGS(...) _NTH_ARG(__VA_ARGS__, 5, 4, 3, 2, 1)
#define CHILD_FMT "Asked to visit %s named %s belonging to list %i. %s are in %i.\n"
#define NO_CHILD_FMT "Asked to visit %s named %s belonging to list %i.\n"
#define PRINT_VISIT(...) NUMARGS(__VA_ARGS__) == 5?\
printf(CHILD_FMT, __VA_ARGS__): \
printf(NO_CHILD_FMT, __VA_ARGS__)
#else
#define PRINT_VISIT(...)
#endif

typedef struct SchemaItemList SchemaItemList;

typedef struct {
char* name;
char* type;
uintptr_t children;
} SchemaItem;

typedef struct SchemaItemList {
uint32_t len;
SchemaItem* list;
} SchemaItemList;

typedef struct {
int list_count;
SchemaItemList* lists;
} SchemaBuilder;

char* allocate_name(const KernelStringSlice slice) {
return strndup(slice.ptr, slice.len);
}

// lists are preallocated to have exactly enough space, so we just fill in the next open slot and
// increment our length
SchemaItem* add_to_list(SchemaItemList *list, char* name, char* type) {
int idx = list->len;
list->list[idx].name = name;
list->list[idx].type = type;
list->len++;
return &list->list[idx];
}

// print out all items in a list, recursing into any children they may have
void print_list(SchemaBuilder* builder, uintptr_t list_id, int indent, int parents_on_last) {
SchemaItemList *list = &builder->lists[list_id];
for (int i = 0; i < list->len; i++) {
bool is_last = i == list->len - 1;
for (int j = 0; j < indent; j++) {
if ((indent - parents_on_last) <= j) {
// don't print a dangling | on any parents that are on their last item
printf(" ");
} else {
printf("│ ");
}
}
SchemaItem* item = &list->list[i];
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably decide whether to use &array[index] vs. array + index and be consistent?

(I personally prefer the former because it's obvious at a glance that it's not normal arithmetic... but I recognize it's also a bit more typing)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, makes sense. I tend to use array + index when I know I just want the pointer, and array[index] when I want to use the item there (like array[index].name), but consistency is good. I've updated to all &array[index] style for getting pointers.

char* prefix = is_last? "└" : "├";
printf("%s─ %s: %s\n", prefix, item->name, item->type);
if (list->list[i].children != UINTPTR_MAX) {
print_list(builder, list->list[i].children, indent+1, parents_on_last + is_last);
}
}
}

// declare all our visitor methods
uintptr_t make_field_list(void *data, uintptr_t reserve) {
SchemaBuilder *builder = data;
int id = builder->list_count;
#ifdef VERBOSE
printf("Making a list of lenth %i with id %i\n", reserve, id);
#endif
builder->list_count++;
builder->lists = realloc(builder->lists, sizeof(SchemaItemList) * builder->list_count);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why realloc, out of curiosity? What previous usage could there have been?

(if there was some valid previous usage, wouldn't we need to free the lists it contained?)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yikes, this is being used to grow the array... but realloc often (usually?) moves the memory to a new location, which would invalidate all the child pointers. To avoid dangling pointers, SchemaItem::children needs to be an index into SchemaBuilder::lists. Doing that would also make it obvious in the free_builder function, why we don't need to free the child list when freeing its parent.

Copy link
Collaborator Author

@nicklan nicklan May 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, real bad on the pointers 🤦🏽 .

I've updated things to use indexes.

Why realloc, out of curiosity? What previous usage could there have been?

We don't know the total number of lists ahead of time. SchemaBuilder.lists is an array of SchemaItemList, so this realloc is just adding one more to it (list_count gets incremented before this realloc call). Previous usage was when we made the builder and did

  SchemaBuilder builder = {
    .list_count = 0,
    .lists = calloc(0, sizeof(SchemaItem*)),
  };

as well as each other call to this method.

(if there was some valid previous usage, wouldn't we need to free the lists it contained?)

realloc does that for you (afaiu): "Unless ptr is NULL, it must have been returned by an earlier call to malloc or related functions. If the area pointed to was moved, a free(ptr) is done."

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(if there was some valid previous usage, wouldn't we need to free the lists it contained?)

realloc does that for you (afaiu)

I was referring to the lists of SchemaItem that each SchemaItemList in the builder referred to. But it's not a problem as you say, because we're doing realloc to extend rather than replace.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One gotcha with using realloc this way -- it has O(n**2) cost, where n is the final list_count. The usual workaround is to increase the capacity of the array by a fixed ratio like 3/2; in return for potentially 50% wasted space, it is amortized O(n) cost. Probably doesn't matter for this example code tho.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep. I considered increasing more each time, but:
a) it requires tracking an extra capacity param and
b) for example code as you say it's probably overkill

SchemaItem* list = calloc(reserve, sizeof(SchemaItem));
for (int i = 0; i < reserve; i++) {
list[i].children = UINTPTR_MAX;
}
builder->lists[id].len = 0;
builder->lists[id].list = list;
return id;
}

void visit_struct(void *data,
uintptr_t sibling_list_id,
struct KernelStringSlice name,
uintptr_t child_list_id) {
SchemaBuilder *builder = data;
char* name_ptr = allocate_name(name);
PRINT_VISIT("struct", name_ptr, sibling_list_id, "Children", child_list_id);
SchemaItem* struct_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "struct");
struct_item->children = child_list_id;
}
void visit_array(void *data,
uintptr_t sibling_list_id,
struct KernelStringSlice name,
bool contains_null,
uintptr_t child_list_id) {
SchemaBuilder *builder = data;
char* name_ptr = malloc(sizeof(char) * (name.len + 24));
snprintf(name_ptr, name.len+1, "%s", name.ptr);
snprintf(name_ptr+name.len, 24, " (contains null: %s)", contains_null ? "true" : "false");
PRINT_VISIT("array", name_ptr, sibling_list_id, "Types", child_list_id);
SchemaItem* array_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "array");
array_item->children = child_list_id;
}
void visit_map(void *data,
uintptr_t sibling_list_id,
struct KernelStringSlice name,
bool value_contains_null,
uintptr_t child_list_id) {
SchemaBuilder *builder = data;
char* name_ptr = malloc(sizeof(char) * (name.len + 24));
snprintf(name_ptr, name.len+1, "%s", name.ptr);
snprintf(name_ptr+name.len, 24, " (contains null: %s)", value_contains_null ? "true" : "false");
PRINT_VISIT("map", name_ptr, sibling_list_id, "Types", child_list_id);
SchemaItem* map_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "map");
map_item->children = child_list_id;
}

void visit_decimal(void *data,
uintptr_t sibling_list_id,
struct KernelStringSlice name,
uint8_t precision,
int8_t scale) {
SchemaBuilder *builder = data;
char* name_ptr = allocate_name(name);
char* type = malloc(16 * sizeof(char));
snprintf(type, 16, "decimal(%i)(%i)", precision, scale);
PRINT_VISIT(type, name_ptr, sibling_list_id);
add_to_list(&builder->lists[sibling_list_id], name_ptr, type);
}



void visit_simple_type(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name, char* type) {
SchemaBuilder *builder = data;
char* name_ptr = allocate_name(name);
PRINT_VISIT(type, name_ptr, sibling_list_id);
add_to_list(&builder->lists[sibling_list_id], name_ptr, type);
}

#define DEFINE_VISIT_SIMPLE_TYPE(typename) \
void visit_##typename(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name) { \
visit_simple_type(data, sibling_list_id, name, #typename); \
}

DEFINE_VISIT_SIMPLE_TYPE(string);
DEFINE_VISIT_SIMPLE_TYPE(integer);
DEFINE_VISIT_SIMPLE_TYPE(short);
DEFINE_VISIT_SIMPLE_TYPE(byte);
DEFINE_VISIT_SIMPLE_TYPE(long);
DEFINE_VISIT_SIMPLE_TYPE(float);
DEFINE_VISIT_SIMPLE_TYPE(double);
DEFINE_VISIT_SIMPLE_TYPE(boolean);
DEFINE_VISIT_SIMPLE_TYPE(binary);
DEFINE_VISIT_SIMPLE_TYPE(date);
DEFINE_VISIT_SIMPLE_TYPE(timestamp);
DEFINE_VISIT_SIMPLE_TYPE(timestamp_ntz);

// free all the data in the builder (but not the builder itself, it's stack allocated)
void free_builder(SchemaBuilder builder) {
scovich marked this conversation as resolved.
Show resolved Hide resolved
for (int i = 0; i < builder.list_count; i++) {
SchemaItemList *list = (builder.lists)+i;
for (int j = 0; j < list->len; j++) {
SchemaItem *item = list->list+j;
free(item->name);
// don't free item->type, those are static strings
if (!strncmp(item->type, "decimal", 7)) {
// except decimal types, we malloc'd those :)
free(item->type);
}
}
free(list->list); // free all the items in this list (we alloc'd them together)
}
free(builder.lists);
}

// Print the schema of the snapshot
void print_schema(const SnapshotHandle *snapshot) {
SchemaBuilder builder = {
.list_count = 0,
.lists = calloc(0, sizeof(SchemaItem*)),
};
EngineSchemaVisitor visitor = {
nicklan marked this conversation as resolved.
Show resolved Hide resolved
.data = &builder,
.make_field_list = make_field_list,
.visit_struct = visit_struct,
.visit_array = visit_array,
.visit_map = visit_map,
.visit_decimal = visit_decimal,
.visit_string = visit_string,
.visit_long = visit_long,
.visit_integer = visit_integer,
.visit_short = visit_short,
.visit_byte = visit_byte,
.visit_float = visit_float,
.visit_double = visit_double,
.visit_boolean = visit_boolean,
.visit_binary = visit_binary,
.visit_date = visit_date,
.visit_timestamp = visit_timestamp,
.visit_timestamp_ntz = visit_timestamp_ntz
};
uintptr_t schema_list_id = visit_schema(snapshot, &visitor);
#ifdef PRINT_VISITS
printf("Schema returned in list %i\n", schema_list_id);
#endif
printf("Schema:\n");
print_list(&builder, schema_list_id, 0, 0);
printf("\n");
free_builder(builder);
}
Loading
Loading