Skip to content

Commit

Permalink
Add remaining data types to schema visiting in ffi (#187)
Browse files Browse the repository at this point in the history
Adds more types, and uses a macro to reduce repetition for the primitive
ones. Also adds some rust docs to try and explain how the visitors for
schema work.

Tested by adding a visitor to the `read_table` program that builds and
prints the schema.

Output:
```
.$ /read_table file:///home/nick/databricks/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/nested_types/delta/
Reading table at file:///home/nick/databricks/delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/nested_types/delta/
version: 0

Schema:
├─ pk: integer
├─ struct: struct
│  ├─ float64: double
│  └─ bool: boolean
├─ array: array
│  └─ array_data_type: short
└─ map: map
   ├─ map_key_type: string
   └─ map_value_type: integer

Got some data
  Of this data, here is a selection vector
    sel[0] = 0
    sel[1] = 0
    sel[2] = 0
    sel[3] = 1
called back to actually read!
  path: part-00000-3b8ca3f0-6444-4ed5-8961-c605cba95bf1-c000.snappy.parquet
  No selection vector for this call
  no partition here
Iterator done
All done
```

---------

Co-authored-by: Ryan Johnson <scovich@users.noreply.github.com>
  • Loading branch information
nicklan and scovich authored May 10, 2024
1 parent 0e844a6 commit 663585d
Show file tree
Hide file tree
Showing 6 changed files with 427 additions and 45 deletions.
2 changes: 2 additions & 0 deletions ffi/cbindgen.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
# default to generating c bindings
language = "C"

pragma_once = true

# only applies to Cxx
namespace = "ffi"

Expand Down
13 changes: 8 additions & 5 deletions ffi/examples/read-table/read_table.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <stdio.h>
#include <string.h>

#include "delta_kernel_ffi.h"
#include "schema.h"

#ifdef PRINT_ARROW_DATA
#include "arrow.h"
Expand Down Expand Up @@ -65,13 +67,13 @@ void visit_callback(void* engine_context, const KernelStringSlice path, long siz
return;
}
KernelBoolSlice selection_vector = selection_vector_res.ok;
if (selection_vector) {
if (selection_vector.len > 0) {
printf(" Selection vector:\n");
print_selection_vector(" ", selection_vector);
drop_bool_slice(selection_vector);
print_selection_vector(" ", &selection_vector);
} else {
printf(" No selection vector for this call\n");
}
drop_bool_slice(selection_vector);
// normally this would be picked out of the schema
char* letter_key = "letter";
KernelStringSlice key = {letter_key, strlen(letter_key)};
Expand All @@ -89,7 +91,7 @@ void visit_data(void *engine_context, EngineDataHandle *engine_data, KernelBoolS
printf(" Of this data, here is a selection vector\n");
print_selection_vector(" ", &selection_vec);
visit_scan_data(engine_data, selection_vec, engine_context, visit_callback);
drop_bool_slice(&selection_vec);
drop_bool_slice(selection_vec);
}

int main(int argc, char* argv[]) {
Expand Down Expand Up @@ -145,7 +147,8 @@ int main(int argc, char* argv[]) {
const SnapshotHandle *snapshot_handle = snapshot_handle_res.ok;

uint64_t v = version(snapshot_handle);
printf("version: %llu\n", v);
printf("version: %llu\n\n", v);
print_schema(snapshot_handle);

ExternResultScan scan_res = scan(snapshot_handle, engine, NULL);
if (scan_res.tag != OkScan) {
Expand Down
234 changes: 234 additions & 0 deletions ffi/examples/read-table/schema.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
#include <stdint.h>
#include "delta_kernel_ffi.h"

/**
* This module defines a very simple model of a schema, used only to be able to print the schema of
* a table. It consists of a "SchemaBuilder" which is our user data that gets passed into each visit_x
* call. This simply keeps track of all the lists we are asked to allocate.
*
* Each list is a "SchemaItemList", which tracks its length an an array of "SchemaItem"s.
*
* Each "SchemaItem" has a name and a type, which are just strings. It can also have a list which is
* its children. This is initially always UINTPTR_MAX, but when visiting a struct, map, or array, we
* point this at the list id specified in the callback, which allows us to traverse the schema when
* printing it.
*/

// If you want the visitor to print out what it's being asked to do at each step, uncomment the
// following line
//#define VERBOSE

#ifdef VERBOSE
#define _NTH_ARG(_1, _2, _3, _4, _5, N, ...) N
#define NUMARGS(...) _NTH_ARG(__VA_ARGS__, 5, 4, 3, 2, 1)
#define CHILD_FMT "Asked to visit %s named %s belonging to list %i. %s are in %i.\n"
#define NO_CHILD_FMT "Asked to visit %s named %s belonging to list %i.\n"
#define PRINT_VISIT(...) NUMARGS(__VA_ARGS__) == 5?\
printf(CHILD_FMT, __VA_ARGS__): \
printf(NO_CHILD_FMT, __VA_ARGS__)
#else
#define PRINT_VISIT(...)
#endif

typedef struct SchemaItemList SchemaItemList;

typedef struct {
char* name;
char* type;
uintptr_t children;
} SchemaItem;

typedef struct SchemaItemList {
uint32_t len;
SchemaItem* list;
} SchemaItemList;

typedef struct {
int list_count;
SchemaItemList* lists;
} SchemaBuilder;

char* allocate_name(const KernelStringSlice slice) {
return strndup(slice.ptr, slice.len);
}

// lists are preallocated to have exactly enough space, so we just fill in the next open slot and
// increment our length
SchemaItem* add_to_list(SchemaItemList *list, char* name, char* type) {
int idx = list->len;
list->list[idx].name = name;
list->list[idx].type = type;
list->len++;
return &list->list[idx];
}

// print out all items in a list, recursing into any children they may have
void print_list(SchemaBuilder* builder, uintptr_t list_id, int indent, int parents_on_last) {
SchemaItemList *list = &builder->lists[list_id];
for (int i = 0; i < list->len; i++) {
bool is_last = i == list->len - 1;
for (int j = 0; j < indent; j++) {
if ((indent - parents_on_last) <= j) {
// don't print a dangling | on any parents that are on their last item
printf(" ");
} else {
printf("│ ");
}
}
SchemaItem* item = &list->list[i];
char* prefix = is_last? "└" : "├";
printf("%s─ %s: %s\n", prefix, item->name, item->type);
if (list->list[i].children != UINTPTR_MAX) {
print_list(builder, list->list[i].children, indent+1, parents_on_last + is_last);
}
}
}

// declare all our visitor methods
uintptr_t make_field_list(void *data, uintptr_t reserve) {
SchemaBuilder *builder = data;
int id = builder->list_count;
#ifdef VERBOSE
printf("Making a list of lenth %i with id %i\n", reserve, id);
#endif
builder->list_count++;
builder->lists = realloc(builder->lists, sizeof(SchemaItemList) * builder->list_count);
SchemaItem* list = calloc(reserve, sizeof(SchemaItem));
for (int i = 0; i < reserve; i++) {
list[i].children = UINTPTR_MAX;
}
builder->lists[id].len = 0;
builder->lists[id].list = list;
return id;
}

void visit_struct(void *data,
uintptr_t sibling_list_id,
struct KernelStringSlice name,
uintptr_t child_list_id) {
SchemaBuilder *builder = data;
char* name_ptr = allocate_name(name);
PRINT_VISIT("struct", name_ptr, sibling_list_id, "Children", child_list_id);
SchemaItem* struct_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "struct");
struct_item->children = child_list_id;
}
void visit_array(void *data,
uintptr_t sibling_list_id,
struct KernelStringSlice name,
bool contains_null,
uintptr_t child_list_id) {
SchemaBuilder *builder = data;
char* name_ptr = malloc(sizeof(char) * (name.len + 24));
snprintf(name_ptr, name.len+1, "%s", name.ptr);
snprintf(name_ptr+name.len, 24, " (contains null: %s)", contains_null ? "true" : "false");
PRINT_VISIT("array", name_ptr, sibling_list_id, "Types", child_list_id);
SchemaItem* array_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "array");
array_item->children = child_list_id;
}
void visit_map(void *data,
uintptr_t sibling_list_id,
struct KernelStringSlice name,
bool value_contains_null,
uintptr_t child_list_id) {
SchemaBuilder *builder = data;
char* name_ptr = malloc(sizeof(char) * (name.len + 24));
snprintf(name_ptr, name.len+1, "%s", name.ptr);
snprintf(name_ptr+name.len, 24, " (contains null: %s)", value_contains_null ? "true" : "false");
PRINT_VISIT("map", name_ptr, sibling_list_id, "Types", child_list_id);
SchemaItem* map_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "map");
map_item->children = child_list_id;
}

void visit_decimal(void *data,
uintptr_t sibling_list_id,
struct KernelStringSlice name,
uint8_t precision,
int8_t scale) {
SchemaBuilder *builder = data;
char* name_ptr = allocate_name(name);
char* type = malloc(16 * sizeof(char));
snprintf(type, 16, "decimal(%i)(%i)", precision, scale);
PRINT_VISIT(type, name_ptr, sibling_list_id);
add_to_list(&builder->lists[sibling_list_id], name_ptr, type);
}



void visit_simple_type(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name, char* type) {
SchemaBuilder *builder = data;
char* name_ptr = allocate_name(name);
PRINT_VISIT(type, name_ptr, sibling_list_id);
add_to_list(&builder->lists[sibling_list_id], name_ptr, type);
}

#define DEFINE_VISIT_SIMPLE_TYPE(typename) \
void visit_##typename(void *data, uintptr_t sibling_list_id, struct KernelStringSlice name) { \
visit_simple_type(data, sibling_list_id, name, #typename); \
}

DEFINE_VISIT_SIMPLE_TYPE(string);
DEFINE_VISIT_SIMPLE_TYPE(integer);
DEFINE_VISIT_SIMPLE_TYPE(short);
DEFINE_VISIT_SIMPLE_TYPE(byte);
DEFINE_VISIT_SIMPLE_TYPE(long);
DEFINE_VISIT_SIMPLE_TYPE(float);
DEFINE_VISIT_SIMPLE_TYPE(double);
DEFINE_VISIT_SIMPLE_TYPE(boolean);
DEFINE_VISIT_SIMPLE_TYPE(binary);
DEFINE_VISIT_SIMPLE_TYPE(date);
DEFINE_VISIT_SIMPLE_TYPE(timestamp);
DEFINE_VISIT_SIMPLE_TYPE(timestamp_ntz);

// free all the data in the builder (but not the builder itself, it's stack allocated)
void free_builder(SchemaBuilder builder) {
for (int i = 0; i < builder.list_count; i++) {
SchemaItemList *list = (builder.lists)+i;
for (int j = 0; j < list->len; j++) {
SchemaItem *item = list->list+j;
free(item->name);
// don't free item->type, those are static strings
if (!strncmp(item->type, "decimal", 7)) {
// except decimal types, we malloc'd those :)
free(item->type);
}
}
free(list->list); // free all the items in this list (we alloc'd them together)
}
free(builder.lists);
}

// Print the schema of the snapshot
void print_schema(const SnapshotHandle *snapshot) {
SchemaBuilder builder = {
.list_count = 0,
.lists = calloc(0, sizeof(SchemaItem*)),
};
EngineSchemaVisitor visitor = {
.data = &builder,
.make_field_list = make_field_list,
.visit_struct = visit_struct,
.visit_array = visit_array,
.visit_map = visit_map,
.visit_decimal = visit_decimal,
.visit_string = visit_string,
.visit_long = visit_long,
.visit_integer = visit_integer,
.visit_short = visit_short,
.visit_byte = visit_byte,
.visit_float = visit_float,
.visit_double = visit_double,
.visit_boolean = visit_boolean,
.visit_binary = visit_binary,
.visit_date = visit_date,
.visit_timestamp = visit_timestamp,
.visit_timestamp_ntz = visit_timestamp_ntz
};
uintptr_t schema_list_id = visit_schema(snapshot, &visitor);
#ifdef PRINT_VISITS
printf("Schema returned in list %i\n", schema_list_id);
#endif
printf("Schema:\n");
print_list(&builder, schema_list_id, 0, 0);
printf("\n");
free_builder(builder);
}
Loading

0 comments on commit 663585d

Please sign in to comment.