Vector stubbing (rust-lang#377)

Vector Stubbing for the Rust Standard Library * This PR includes stub implementations for the Vector module for the Standard Library. * It includes 3 abstractions - RMCVec, CVec and NobackVec. * It also includes an experimental CHashSet implementation. Co-authored-by: Chinmay <chindesh@amazon.com> Co-authored-by: Daniel Schwartz-Narbonne <danielsn@users.noreply.github.com>
celinval · Sep 10, 2021 · 1bdea21 · 1bdea21
1 parent 2a41fcd
commit 1bdea21
Show file tree

Hide file tree

Showing 13 changed files with 2,071 additions and 8 deletions.
diff --git a/library/rmc/stubs/C/hashset/hashset.c b/library/rmc/stubs/C/hashset/hashset.c
@@ -0,0 +1,153 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#include <stdint.h>
+#include <assert.h>
+#include <stdlib.h>
+
+// This HashSet stub implementation is supposed to work with c_hashset.rs.
+// Please refer to that file for an introduction to the idea of a HashSet and
+// some other implemenntation details. Public methods defined in c_hashset.rs
+// act as wrappers around methods implemented here.
+
+// As noted before, this HashSet implementation is specifically for inputs which
+// are u16. The details below can be extended to larger sets if necessary. The 
+// domain of the output is i16.
+//
+// The hash function that we choose is the identity function.
+// For all input x, hasher(x) = x. For our case, this satisfies all the
+// requirements of an ideal hash function, it is 1:1 and there exists only one
+// element in the input domain for each value in the output domain.
+//
+// An important thing to note here is that the hash function can be
+// appropriately modified depending on the type of the input value which is
+// stored in the hashset. As an example, if the HashSet needs to store a tuple
+// of integers, say <u32, u32>, the hash function can be modified to be:
+//
+// hash((x, y)) = prime * x + y;
+//
+// Although this value can be greater than the chosen output domain, the
+// function is still sound if the value wraps around because it guarantees a
+// unique output for a given pair of x and y.
+//
+// Another way to think about this problem could be through the lens of
+// uninterpreted functions where : if x == y => f(x) == f(y). Exploring this can
+// be future work. The idea would be to implement a HashSet similar to that seen
+// in functional programming languages.
+//
+// For the purpose of a HashSet, we dont necessarily need a SENTINEL outside the
+// range of the hashing function because of the way we design the HashSet
+// operations.
+const uint16_t SENTINEL = 1;
+
+uint16_t hasher(uint16_t value) {
+	return value;
+}
+
+// We initialize all values of the domain to be 0 by initializing it with
+// calloc. This lets us get around the problem of looping through all elements
+// to initialize them individually with a special value.
+//
+// The domain array is to be interpreted such that 
+// if domain[index] != 0, value such that hash(value) = index is present.
+//
+// However, this logic does not work for the value 0. For this, we choose the
+// SENTINEL value to initialize that element. 
+typedef struct {
+	uint16_t* domain;
+} hashset;
+
+// Ideally, this approach is much more suitable if we can work with arrays of
+// arbitrary size, specifically infinity. This would allow us to define hash
+// functions for any type because the output domain can be considered to be
+// infinite. However, CBMC currently does not handle unbounded arrays correctly.
+// Please see: https://github.com/diffblue/cbmc/issues/6261. Even in that case,
+// we might run into theoretical limitations of how solvers handle uninterpreted
+// symbols such as unbounded arrays. For the case of this API, the consumer can
+// request for an arbitrary number of HashSets which can be dynamically chosen.
+// As a result, the solver cannot know apriori how many unbounded arrays it
+// needs to initialize which might lead to errors.
+//
+// Firecracker uses HashSet<u32> (src/devices/src/virtio/vsock/unix/muxer.rs).
+// But for working with u32s, we run into the problem that the entire domain
+// cannot be allocated through malloc. We run into the error "array too large
+// for flattening". For that reason, we choose to work with u16 to demonstrate
+// the feasability of this approach. However, it should be extensible to other
+// integer types.
+//
+// Returns: pointer to a hashset instance which tracks the domain memory. This
+// pointer is used in later callbacks such as insert() and remove().
+hashset* hashset_new() {
+	hashset* set = (hashset *) malloc(sizeof(hashset));
+	// Initializes value all indexes to be 0, indicating that those elements are
+	// not present in the HashSet.
+	set->domain = calloc(UINT16_MAX, sizeof(uint16_t));
+	// For 0, choose another value to achieve the same.
+	set->domain[0] = SENTINEL;
+	return set;
+}
+
+// For insert, we need to first check if the value exists in the HashSet. If it
+// does, we immediately return a 0 (false) value back.
+//
+// If it doesnt, then we mark that element of the domain array with the value to 
+// indicate that this element has been inserted. For element 0, we mark it with
+// the SENTINEL.
+//
+// To check if a value exists, we simply check if domain[hash] != 0 and
+// in the case of 0 if domain[0] != SENTINEL.
+//
+// Returns: an integer value 1 or 0. If the value is already present in the
+// hashset, this function returns a 0. If the value is sucessfully inserted, we
+// return a 1.
+uint32_t hashset_insert(hashset* s, uint16_t value) {
+	uint16_t hash = hasher(value);
+
+	if ((hash == 0 && s->domain[hash] != SENTINEL) || 
+		(hash !=0 && s->domain[hash] != 0)) {
+		return 0;
+	}
+
+	s->domain[hash] = value;
+	return 1;
+}
+
+// We perform a similar check here as described in hashset_insert(). We do not
+// duplicate code so as to not compute the hash twice. This can be improved.
+//
+// Returns: an integer value 1 or 0. If the value is present in the hashset,
+// this function returns a 1, otherwise 0.
+uint32_t hashset_contains(hashset* s, uint16_t value) {
+	uint16_t hash = hasher(value);
+
+	if ((hash == 0 && s->domain[hash] != SENTINEL) || 
+		(hash != 0 && s->domain[hash] != 0)) {
+		return 1;
+	}
+
+	return 0;
+}
+
+// We check if the element exists in the array. If it does not, we return a 0
+// (false) value back. If it does, we mark it with 0 and in the case of 0, we
+// mark it with the SENTINEL and return 1.
+//
+// Returns: an integer value 1 or 0. If the value is not present in the hashset,
+// this function returns a 0. If the value is sucessfully removed from the
+// hashset, it returns a 1.
+uint32_t hashset_remove(hashset* s, uint16_t value) {
+	uint16_t hash = hasher(value);
+
+	if ((hash == 0 && s->domain[hash] == SENTINEL) || 
+		(hash !=0 && s->domain[hash] == 0)) {
+		return 0;
+	}
+
+	if (hash == 0) {
+		s->domain[hash] = SENTINEL;
+	} else {
+		s->domain[hash] = 0;
+	}
+
+	return 1;
+}
diff --git a/library/rmc/stubs/C/vec/vec.c b/library/rmc/stubs/C/vec/vec.c
@@ -0,0 +1,182 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#include <stdint.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+// This Vector stub implementation is supposed to work with c_vec.rs. Please
+// refer to that file for a detailed explanation about the workings of this 
+// abstraction. Public methods implemented in c_vec.rs act as wrappers around 
+// methods implemented here.
+
+// __CPROVER_max_malloc_size is dependent on the number of offset bits used to
+// represent a pointer variable. By default, this is chosen to be 56, in which
+// case the max_malloc_size is 2 ** (offset_bits - 1). We could go as far as to
+// assign the default capacity to be the max_malloc_size but that would be overkill.
+// Instead, we choose a high-enough value 2 ** 10. Another reason to do
+// this is that it would be easier for the solver to reason about memory if multiple
+// Vectors are initialized by the abstraction consumer.
+//
+// For larger array sizes such as 2 ** (31 - 1) we encounter "array size too large
+// for flattening" error.
+#define DEFAULT_CAPACITY 1024
+#define MAX_MALLOC_SIZE 18014398509481984
+
+// A Vector is a dynamically growing array type with contiguous memory. We track
+// allocated memory, the length of the Vector and the capacity of the
+// allocation.
+
+// As can be seen from the pointer to mem (unint32_t*), we track memory in terms
+// of words. The current implementation works only if the containing type is
+// u32. This was specifically chosen due to a use case seen in the Firecracker 
+// codebase. This structure is used to communicate over the FFI boundary.
+// Future work:
+// Ideally, the pointer to memory would be uint8_t* - representing that we treat
+// memory as an array of bytes. This would allow us to be generic over the type
+// of the element contained in the Vector. In that case, we would have to treat
+// every sizeof(T) bytes as an indivdual element and cast memory accordingly.
+typedef struct {
+	uint32_t* mem;
+	size_t len;
+	size_t capacity;
+} vec;
+
+// The grow operation resizes the vector and copies its original contents into a
+// new allocation. This is one of the more expensive operations for the solver
+// to reason about and one way to get around this problem is to use a large
+// allocation size. We also implement sized_grow which takes a argument
+// definining the minimum number of additional elements that need to be fit into
+// the Vector memory. This aims to replicate behavior as seen in the Rust
+// standard library where the size of the vector is decided based on the
+// following equation:
+// new_capacity = max(capacity * 2, capacity + additional).
+// Please refer to method amortized_grow in raw_vec.rs in the Standard Library
+// for additional information.
+// The current implementation performance depends on CBMCs performance about
+// reasoning about realloc. If CBMC does better, do would we in the case of
+// this abstraction.
+//
+// One important callout to make here is that because we allocate a large enough
+// buffer, we cant reason about buffer overflow bugs. This is because the
+// allocated memory will (most-likely) always have enough space allocated after
+// the required vec capacity.
+//
+// Future work:
+// Ideally, we would like to get around the issue of resizing altogether since
+// CBMC supports unbounded arrays. In that case, we would allocate memory of
+// size infinity and work with that. For program verification, this would
+// optimize a lot of operations since the solver does not really have to worry
+// about the bounds of memory. The appropriate constant for capacity would be 
+// __CPROVER_constant_infinity_uint but this is currently blocked due to
+// incorrect translation of the constant: https://github.com/diffblue/cbmc/issues/6261.
+//
+// Another way to approach this problem would be to implement optimizations in
+// the realloc operation of CBMC. Rather than allocating a new memory block and
+// copying over elements, we can track only the end pointer of the memory and
+// shift it to track the new length. Since this behavior is that of the
+// allocator, the consumer of the API is blind to it.
+void vec_grow_exact(vec* v, size_t new_cap) {
+	uint32_t* new_mem = (uint32_t* ) realloc(v->mem, new_cap * sizeof(*v->mem));
+
+	v->mem = new_mem;
+	v->capacity = new_cap;
+}
+
+void vec_grow(vec* v) {
+	size_t new_cap = v->capacity * 2;
+	if (new_cap > MAX_MALLOC_SIZE) {
+		// Panic if the new size requirement is greater than max size that can
+		// be allocated through malloc.
+		assert(0);
+	}
+
+	vec_grow_exact(v, new_cap);
+}
+
+void vec_sized_grow(vec* v, size_t additional) {
+	size_t min_cap = v->capacity + additional;
+	size_t grow_cap = v->capacity * 2;
+
+	// This resembles the Rust Standard Library behavior - amortized_grow in
+	// alloc/raw_vec.rs
+	// 
+	// Reference: https://doc.rust-lang.org/src/alloc/raw_vec.rs.html#421
+	size_t new_cap = min_cap > grow_cap ? min_cap : grow_cap;
+	if (new_cap > MAX_MALLOC_SIZE) {
+		// Panic if the new size requirement is greater than max size that can
+		// be allocated through malloc.
+		assert(0);
+	}
+
+	vec_grow_exact(v, new_cap);
+}
+
+vec* vec_new() {
+	vec *v = (vec *) malloc(sizeof(vec));
+	// Default size is DEFAULT_CAPACITY. We compute the maximum number of
+	// elements to ensure that allocation size is aligned.
+	size_t max_elements = DEFAULT_CAPACITY / sizeof(*v->mem);
+	v->mem = (uint32_t *) malloc(max_elements * sizeof(*v->mem));
+	v->len = 0;
+	v->capacity = max_elements;
+	// Return a pointer to the allocated vec structure, which is used in future
+	// callbacks.
+	return v;
+}
+
+vec* vec_with_capacity(size_t capacity) {
+	vec *v = (vec *) malloc(sizeof(vec));
+	if (capacity > MAX_MALLOC_SIZE) {
+		// Panic if the new size requirement is greater than max size that can
+		// be allocated through malloc.
+		assert(0);
+	}
+
+	v->mem = (uint32_t *) malloc(capacity * sizeof(*v->mem));
+	v->len = 0;
+	v->capacity = capacity;
+	return v;
+}
+
+void vec_push(vec* v, uint32_t elem) {
+	// If we have already reached capacity, resize the Vector before
+	// pushing in new elements.
+	if (v->len == v->capacity) {
+		// Ensure that we have capacity to hold atleast one more element
+		vec_sized_grow(v, 1);
+	}
+
+	v->mem[v->len] = elem;
+	v->len += 1;
+}
+
+uint32_t vec_pop(vec* v) {
+	assert(v->len > 0);
+	v->len -= 1;
+
+	return v->mem[v->len];
+}
+
+void vec_append(vec* v1, vec* v2) {
+	// Reserve enough space before adding in new elements.
+	vec_sized_grow(v1, v2->len);
+	// Perform a memcpy of elements which is cheaper than pushing each element
+	// at once.
+	memcpy(v1->mem + v1->len, v2->mem, v2->len * sizeof(*v2->mem));
+	v1->len = v1->len + v2->len;
+}
+
+size_t vec_len(vec* v) {
+	return v->len;
+}
+
+size_t vec_cap(vec* v) {
+	return v->capacity;
+}
+
+void vec_free(vec* v) {
+	free(v->mem);
+	free(v);
+}
diff --git a/library/rmc/stubs/README.md b/library/rmc/stubs/README.md
@@ -0,0 +1,4 @@
+Verification-friendly Vector stubs
+----------
+
+