Skip to content

Commit

Permalink
Merge pull request #98425 from darksylinc/matias-breadcrumbs-race-fix
Browse files Browse the repository at this point in the history
Fix race conditions in breadcrumbs
  • Loading branch information
Repiteo committed Oct 25, 2024
2 parents 15bec19 + 668c9b7 commit edad871
Show file tree
Hide file tree
Showing 9 changed files with 236 additions and 59 deletions.
6 changes: 6 additions & 0 deletions core/config/engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,12 @@ bool Engine::is_extra_gpu_memory_tracking_enabled() const {
return extra_gpu_memory_tracking;
}

#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
bool Engine::is_accurate_breadcrumbs_enabled() const {
return accurate_breadcrumbs;
}
#endif

void Engine::set_print_to_stdout(bool p_enabled) {
CoreGlobals::print_line_enabled = p_enabled;
}
Expand Down
6 changes: 6 additions & 0 deletions core/config/engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ class Engine {
bool use_validation_layers = false;
bool generate_spirv_debug_info = false;
bool extra_gpu_memory_tracking = false;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
bool accurate_breadcrumbs = false;
#endif
int32_t gpu_idx = -1;

uint64_t _process_frames = 0;
Expand Down Expand Up @@ -186,6 +189,9 @@ class Engine {
bool is_validation_layers_enabled() const;
bool is_generate_spirv_debug_info_enabled() const;
bool is_extra_gpu_memory_tracking_enabled() const;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
bool is_accurate_breadcrumbs_enabled() const;
#endif
int32_t get_gpu_index() const;

void increment_frames_drawn();
Expand Down
22 changes: 22 additions & 0 deletions core/error/error_macros.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,28 @@ void _err_print_error(const char *p_function, const char *p_file, int p_line, co
_global_unlock();
}

// For printing errors when we may crash at any point, so we must flush ASAP a lot of lines
// but we don't want to make it noisy by printing lots of file & line info (because it's already
// been printing by a preceding _err_print_error).
void _err_print_error_asap(const String &p_error, ErrorHandlerType p_type) {
if (OS::get_singleton()) {
OS::get_singleton()->printerr("ERROR: %s\n", p_error.utf8().get_data());
} else {
// Fallback if errors happen before OS init or after it's destroyed.
const char *err_details = p_error.utf8().get_data();
fprintf(stderr, "ERROR: %s\n", err_details);
}

_global_lock();
ErrorHandlerList *l = error_handler_list;
while (l) {
l->errfunc(l->userdata, "", "", 0, p_error.utf8().get_data(), "", false, p_type);
l = l->next;
}

_global_unlock();
}

// Errors with message. (All combinations of p_error and p_message as String or char*.)
void _err_print_error(const char *p_function, const char *p_file, int p_line, const String &p_error, const char *p_message, bool p_editor_notify, ErrorHandlerType p_type) {
_err_print_error(p_function, p_file, p_line, p_error.utf8().get_data(), p_message, p_editor_notify, p_type);
Expand Down
1 change: 1 addition & 0 deletions core/error/error_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ void _err_print_error(const char *p_function, const char *p_file, int p_line, co
void _err_print_error(const char *p_function, const char *p_file, int p_line, const String &p_error, const char *p_message, bool p_editor_notify = false, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
void _err_print_error(const char *p_function, const char *p_file, int p_line, const char *p_error, const String &p_message, bool p_editor_notify = false, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
void _err_print_error(const char *p_function, const char *p_file, int p_line, const String &p_error, const String &p_message, bool p_editor_notify = false, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
void _err_print_error_asap(const String &p_error, ErrorHandlerType p_type = ERR_HANDLER_ERROR);
void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const char *p_message = "", bool p_editor_notify = false, bool fatal = false);
void _err_print_index_error(const char *p_function, const char *p_file, int p_line, int64_t p_index, int64_t p_size, const char *p_index_str, const char *p_size_str, const String &p_message, bool p_editor_notify = false, bool fatal = false);
void _err_flush_stdout();
Expand Down
235 changes: 178 additions & 57 deletions drivers/vulkan/rendering_device_driver_vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@
/**** GENERIC ****/
/*****************/

#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
static const uint32_t BREADCRUMB_BUFFER_ENTRIES = 512u;
#endif

static const VkFormat RD_TO_VK_FORMAT[RDD::DATA_FORMAT_MAX] = {
VK_FORMAT_R4G4_UNORM_PACK8,
VK_FORMAT_R4G4B4A4_UNORM_PACK16,
Expand Down Expand Up @@ -1370,7 +1374,10 @@ Error RenderingDeviceDriverVulkan::initialize(uint32_t p_device_index, uint32_t
ERR_FAIL_COND_V(err != OK, err);

max_descriptor_sets_per_pool = GLOBAL_GET("rendering/rendering_device/vulkan/max_descriptors_per_pool");
breadcrumb_buffer = buffer_create(sizeof(uint32_t), BufferUsageBits::BUFFER_USAGE_TRANSFER_TO_BIT, MemoryAllocationType::MEMORY_ALLOCATION_TYPE_CPU);

#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
breadcrumb_buffer = buffer_create(2u * sizeof(uint32_t) * BREADCRUMB_BUFFER_ENTRIES, BufferUsageBits::BUFFER_USAGE_TRANSFER_TO_BIT, MemoryAllocationType::MEMORY_ALLOCATION_TYPE_CPU);
#endif

return OK;
}
Expand Down Expand Up @@ -5004,10 +5011,65 @@ void RenderingDeviceDriverVulkan::command_end_label(CommandBufferID p_cmd_buffer
/**** DEBUG *****/
/****************/
void RenderingDeviceDriverVulkan::command_insert_breadcrumb(CommandBufferID p_cmd_buffer, uint32_t p_data) {
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
if (p_data == BreadcrumbMarker::NONE) {
return;
}
vkCmdFillBuffer((VkCommandBuffer)p_cmd_buffer.id, ((BufferInfo *)breadcrumb_buffer.id)->vk_buffer, 0, sizeof(uint32_t), p_data);

if (Engine::get_singleton()->is_accurate_breadcrumbs_enabled()) {
// Force a full barrier so commands are not executed in parallel.
// This will mean that the last breadcrumb to see was actually the
// last (group of) command to be executed (hence, the one causing the crash).
VkMemoryBarrier memoryBarrier;
memoryBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
memoryBarrier.pNext = nullptr;
memoryBarrier.srcAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
VK_ACCESS_INDEX_READ_BIT |
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
VK_ACCESS_UNIFORM_READ_BIT |
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
VK_ACCESS_SHADER_READ_BIT |
VK_ACCESS_SHADER_WRITE_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_TRANSFER_READ_BIT |
VK_ACCESS_TRANSFER_WRITE_BIT |
VK_ACCESS_HOST_READ_BIT |
VK_ACCESS_HOST_WRITE_BIT;
memoryBarrier.dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT |
VK_ACCESS_INDEX_READ_BIT |
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
VK_ACCESS_UNIFORM_READ_BIT |
VK_ACCESS_INPUT_ATTACHMENT_READ_BIT |
VK_ACCESS_SHADER_READ_BIT |
VK_ACCESS_SHADER_WRITE_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_TRANSFER_READ_BIT |
VK_ACCESS_TRANSFER_WRITE_BIT |
VK_ACCESS_HOST_READ_BIT |
VK_ACCESS_HOST_WRITE_BIT;

vkCmdPipelineBarrier(
(VkCommandBuffer)p_cmd_buffer.id,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, 1u, &memoryBarrier, 0u, nullptr, 0u, nullptr);
}

// We write to a circular buffer. If you're getting barrier sync errors here,
// increase the value of BREADCRUMB_BUFFER_ENTRIES.
vkCmdFillBuffer((VkCommandBuffer)p_cmd_buffer.id, ((BufferInfo *)breadcrumb_buffer.id)->vk_buffer, breadcrumb_offset, sizeof(uint32_t), breadcrumb_id++);
vkCmdFillBuffer((VkCommandBuffer)p_cmd_buffer.id, ((BufferInfo *)breadcrumb_buffer.id)->vk_buffer, breadcrumb_offset + sizeof(uint32_t), sizeof(uint32_t), p_data);
breadcrumb_offset += sizeof(uint32_t) * 2u;
if (breadcrumb_offset >= BREADCRUMB_BUFFER_ENTRIES * sizeof(uint32_t) * 2u) {
breadcrumb_offset = 0u;
}
#endif
}

void RenderingDeviceDriverVulkan::on_device_lost() const {
Expand Down Expand Up @@ -5089,64 +5151,121 @@ void RenderingDeviceDriverVulkan::on_device_lost() const {

void RenderingDeviceDriverVulkan::print_lost_device_info() {
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
void *breadcrumb_ptr;
vmaFlushAllocation(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, 0, sizeof(uint32_t));
vmaInvalidateAllocation(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, 0, sizeof(uint32_t));

vmaMapMemory(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, &breadcrumb_ptr);
uint32_t last_breadcrumb = *(uint32_t *)breadcrumb_ptr;
vmaUnmapMemory(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle);
uint32_t phase = last_breadcrumb & uint32_t(~((1 << 16) - 1));
uint32_t user_data = last_breadcrumb & ((1 << 16) - 1);
String error_msg = "Last known breadcrumb: ";

switch (phase) {
case BreadcrumbMarker::ALPHA_PASS:
error_msg += "ALPHA_PASS";
break;
case BreadcrumbMarker::BLIT_PASS:
error_msg += "BLIT_PASS";
break;
case BreadcrumbMarker::DEBUG_PASS:
error_msg += "DEBUG_PASS";
break;
case BreadcrumbMarker::LIGHTMAPPER_PASS:
error_msg += "LIGHTMAPPER_PASS";
break;
case BreadcrumbMarker::OPAQUE_PASS:
error_msg += "OPAQUE_PASS";
break;
case BreadcrumbMarker::POST_PROCESSING_PASS:
error_msg += "POST_PROCESSING_PASS";
break;
case BreadcrumbMarker::REFLECTION_PROBES:
error_msg += "REFLECTION_PROBES";
break;
case BreadcrumbMarker::SHADOW_PASS_CUBE:
error_msg += "SHADOW_PASS_CUBE";
break;
case BreadcrumbMarker::SHADOW_PASS_DIRECTIONAL:
error_msg += "SHADOW_PASS_DIRECTIONAL";
break;
case BreadcrumbMarker::SKY_PASS:
error_msg += "SKY_PASS";
break;
case BreadcrumbMarker::TRANSPARENT_PASS:
error_msg += "TRANSPARENT_PASS";
break;
case BreadcrumbMarker::UI_PASS:
error_msg += "UI_PASS";
break;
default:
error_msg += "UNKNOWN_BREADCRUMB(" + itos((uint32_t)phase) + ')';
break;
{
String error_msg = "Printing last known breadcrumbs in reverse order (last executed first).";
if (!Engine::get_singleton()->is_accurate_breadcrumbs_enabled()) {
error_msg += "\nSome of them might be inaccurate. Try running with --accurate-breadcrumbs for precise information.";
}
_err_print_error(FUNCTION_STR, __FILE__, __LINE__, error_msg);
}

if (user_data != 0) {
error_msg += " | User data: " + itos(user_data);
}
uint8_t *breadcrumb_ptr = nullptr;
VkResult map_result = VK_SUCCESS;

vmaFlushAllocation(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, 0, BREADCRUMB_BUFFER_ENTRIES * sizeof(uint32_t) * 2u);
vmaInvalidateAllocation(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, 0, BREADCRUMB_BUFFER_ENTRIES * sizeof(uint32_t) * 2u);
{
void *ptr = nullptr;
map_result = vmaMapMemory(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle, &ptr);
breadcrumb_ptr = reinterpret_cast<uint8_t *>(ptr);
}

if (breadcrumb_ptr && map_result == VK_SUCCESS) {
uint32_t last_breadcrumb_offset = 0;
{
_err_print_error_asap("Searching last breadcrumb. We've sent up to ID: " + itos(breadcrumb_id - 1u));

// Scan the whole buffer to find the offset with the highest ID.
// That means that was the last one to be written.
//
// We use "breadcrumb_id - id" to account for wraparound.
// e.g. breadcrumb_id = 2 and id = 4294967294; then 2 - 4294967294 = 4.
// The one with the smallest difference is the closest to breadcrumb_id, which means it's
// the last written command.
uint32_t biggest_id = 0u;
uint32_t smallest_id_diff = std::numeric_limits<uint32_t>::max();
const uint32_t *breadcrumb_ptr32 = reinterpret_cast<const uint32_t *>(breadcrumb_ptr);
for (size_t i = 0u; i < BREADCRUMB_BUFFER_ENTRIES; ++i) {
const uint32_t id = breadcrumb_ptr32[i * 2u];
const uint32_t id_diff = breadcrumb_id - id;
if (id_diff < smallest_id_diff) {
biggest_id = i;
smallest_id_diff = id_diff;
}
}

_err_print_error_asap("Last breadcrumb ID found: " + itos(breadcrumb_ptr32[biggest_id * 2u]));

last_breadcrumb_offset = biggest_id * sizeof(uint32_t) * 2u;
}

const size_t entries_to_print = 8u; // Note: The value is arbitrary.
for (size_t i = 0u; i < entries_to_print; ++i) {
const uint32_t last_breadcrumb = *reinterpret_cast<uint32_t *>(breadcrumb_ptr + last_breadcrumb_offset + sizeof(uint32_t));
const uint32_t phase = last_breadcrumb & uint32_t(~((1 << 16) - 1));
const uint32_t user_data = last_breadcrumb & ((1 << 16) - 1);
String error_msg = "Last known breadcrumb: ";

switch (phase) {
case BreadcrumbMarker::ALPHA_PASS:
error_msg += "ALPHA_PASS";
break;
case BreadcrumbMarker::BLIT_PASS:
error_msg += "BLIT_PASS";
break;
case BreadcrumbMarker::DEBUG_PASS:
error_msg += "DEBUG_PASS";
break;
case BreadcrumbMarker::LIGHTMAPPER_PASS:
error_msg += "LIGHTMAPPER_PASS";
break;
case BreadcrumbMarker::OPAQUE_PASS:
error_msg += "OPAQUE_PASS";
break;
case BreadcrumbMarker::POST_PROCESSING_PASS:
error_msg += "POST_PROCESSING_PASS";
break;
case BreadcrumbMarker::REFLECTION_PROBES:
error_msg += "REFLECTION_PROBES";
break;
case BreadcrumbMarker::SHADOW_PASS_CUBE:
error_msg += "SHADOW_PASS_CUBE";
break;
case BreadcrumbMarker::SHADOW_PASS_DIRECTIONAL:
error_msg += "SHADOW_PASS_DIRECTIONAL";
break;
case BreadcrumbMarker::SKY_PASS:
error_msg += "SKY_PASS";
break;
case BreadcrumbMarker::TRANSPARENT_PASS:
error_msg += "TRANSPARENT_PASS";
break;
case BreadcrumbMarker::UI_PASS:
error_msg += "UI_PASS";
break;
default:
error_msg += "UNKNOWN_BREADCRUMB(" + itos((uint32_t)phase) + ')';
break;
}

if (user_data != 0) {
error_msg += " | User data: " + itos(user_data);
}

_err_print_error_asap(error_msg);

_err_print_error(FUNCTION_STR, __FILE__, __LINE__, error_msg);
if (last_breadcrumb_offset == 0u) {
// Decrement last_breadcrumb_idx, wrapping underflow.
last_breadcrumb_offset = BREADCRUMB_BUFFER_ENTRIES * sizeof(uint32_t) * 2u;
}
last_breadcrumb_offset -= sizeof(uint32_t) * 2u;
}

vmaUnmapMemory(allocator, ((BufferInfo *)breadcrumb_buffer.id)->allocation.handle);
breadcrumb_ptr = nullptr;
} else {
_err_print_error(FUNCTION_STR, __FILE__, __LINE__, "Couldn't map breadcrumb buffer. VkResult = " + itos(map_result));
}
#endif
on_device_lost();
}
Expand Down Expand Up @@ -5417,7 +5536,9 @@ RenderingDeviceDriverVulkan::RenderingDeviceDriverVulkan(RenderingContextDriverV
}

RenderingDeviceDriverVulkan::~RenderingDeviceDriverVulkan() {
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
buffer_free(breadcrumb_buffer);
#endif

while (small_allocs_pools.size()) {
HashMap<uint32_t, VmaPool>::Iterator E = small_allocs_pools.begin();
Expand Down
5 changes: 5 additions & 0 deletions drivers/vulkan/rendering_device_driver_vulkan.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,12 @@ class RenderingDeviceDriverVulkan : public RenderingDeviceDriver {
VmaPool _find_or_create_small_allocs_pool(uint32_t p_mem_type_index);

private:
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
// It's a circular buffer.
BufferID breadcrumb_buffer;
uint32_t breadcrumb_offset = 0u;
uint32_t breadcrumb_id = 0u;
#endif

public:
/*****************/
Expand Down
5 changes: 5 additions & 0 deletions main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,7 @@ void Main::print_help(const char *p_binary) {
print_help_option("--generate-spirv-debug-info", "Generate SPIR-V debug information. This allows source-level shader debugging with RenderDoc.\n");
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
print_help_option("--extra-gpu-memory-tracking", "Enables additional memory tracking (see class reference for `RenderingDevice.get_driver_and_device_memory_report()` and linked methods). Currently only implemented for Vulkan. Enabling this feature may cause crashes on some systems due to buggy drivers or bugs in the Vulkan Loader. See https://github.com/godotengine/godot/issues/95967\n");
print_help_option("--accurate-breadcrumbs", "Force barriers between breadcrumbs. Useful for narrowing down a command causing GPU resets. Currently only implemented for Vulkan.\n");
#endif
print_help_option("--remote-debug <uri>", "Remote debug (<protocol>://<host/IP>[:<port>], e.g. tcp://127.0.0.1:6007).\n");
print_help_option("--single-threaded-scene", "Force scene tree to run in single-threaded mode. Sub-thread groups are disabled and run on the main thread.\n");
Expand Down Expand Up @@ -1236,8 +1237,12 @@ Error Main::setup(const char *execpath, int argc, char *argv[], bool p_second_ph
#endif
} else if (arg == "--generate-spirv-debug-info") {
Engine::singleton->generate_spirv_debug_info = true;
#if defined(DEBUG_ENABLED) || defined(DEV_ENABLED)
} else if (arg == "--extra-gpu-memory-tracking") {
Engine::singleton->extra_gpu_memory_tracking = true;
} else if (arg == "--accurate-breadcrumbs") {
Engine::singleton->accurate_breadcrumbs = true;
#endif
} else if (arg == "--tablet-driver") {
if (N) {
tablet_driver = N->get();
Expand Down
Loading

0 comments on commit edad871

Please sign in to comment.