diff --git a/.clangd b/.clangd index 32523ca..7ac321a 100644 --- a/.clangd +++ b/.clangd @@ -10,3 +10,4 @@ CompileFlags: - '-fif-conversion2' - '-fmove-loop-invariants' - '-ftree-switch-conversion' + - '-mpoke-function-name' diff --git a/CMakeLists.txt b/CMakeLists.txt index ee5c645..d2d0542 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,6 +171,12 @@ add_compile_options( # includes macros. This, however, incrases the object file sizes by 50 times. # Oh no. By default only level 2 is enabled. $<$,$>:-ggdb3> + # Getting rid of the frame pointer gives the compiler one more + # general-purpose register to work with and makes the function + # prologues/epilogues much shorter. The only downside is that a more + # complicated stack unwinding strategy must be employed, but we exploit the + # exception handling metadata for that. For some reason this option is not + # enabled by default on Clang. -fomit-frame-pointer ) @@ -205,14 +211,23 @@ add_compile_options( $<$:-fno-exceptions> # Don't generate extra code for exception handling ) -FetchContent_GetProperties(stm32f4xx_hal_driver) +# However, generate the metadata for stack unwinding for both C++ and C. +add_compile_options(-funwind-tables) +add_compile_definitions(ARM_UNWIND_DIRECTIVES) + +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + # + add_compile_options(-mpoke-function-name) + add_compile_definitions(ARM_POKE_FUNCTION_NAME) +endif() + # Substitute file paths with a shorter string in expansions of the `__FILE__` # preprocessor variable (most notably used by the assertion macros), which has # non-negligible impact on the executable size. add_compile_options( -fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/= - -fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/src/stmes=stmes - -fmacro-prefix-map=${stm32f4xx_hal_driver_SOURCE_DIR}/Src/= + -fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/src/= + -fmacro-prefix-map=${DEPENDENCIES_DIR}/stm32f4xx_hal_driver/Src/= ) # We are now done with project-wide compilation flags! 
External dependencies, diff --git a/lib/printf/printf.h b/lib/printf/printf.h index 24f026d..4ae5bee 100644 --- a/lib/printf/printf.h +++ b/lib/printf/printf.h @@ -37,9 +37,9 @@ #ifdef __GNUC__ -#define GCC_ATTRIBUTE(expr) __attribute__((expr)) +#define PRINTF_GCC_ATTRIBUTE(expr) __attribute__((expr)) #else -#define GCC_ATTRIBUTE(expr) +#define PRINTF_GCC_ATTRIBUTE(expr) #endif @@ -65,7 +65,7 @@ void _putchar(char character); * \return The number of characters that are written into the array, not counting the terminating null character */ #define printf printf_ -GCC_ATTRIBUTE(format(__printf__, 1, 2)) +PRINTF_GCC_ATTRIBUTE(format(__printf__, 1, 2)) int printf_(const char* format, ...); @@ -77,7 +77,7 @@ int printf_(const char* format, ...); * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character */ #define sprintf sprintf_ -GCC_ATTRIBUTE(format(__printf__, 2, 3)) +PRINTF_GCC_ATTRIBUTE(format(__printf__, 2, 3)) int sprintf_(char* buffer, const char* format, ...); @@ -92,9 +92,9 @@ int sprintf_(char* buffer, const char* format, ...); */ #define snprintf snprintf_ #define vsnprintf vsnprintf_ -GCC_ATTRIBUTE(format(__printf__, 3, 4)) +PRINTF_GCC_ATTRIBUTE(format(__printf__, 3, 4)) int snprintf_(char* buffer, size_t count, const char* format, ...); -GCC_ATTRIBUTE(format(__printf__, 3, 0)) +PRINTF_GCC_ATTRIBUTE(format(__printf__, 3, 0)) int vsnprintf_(char* buffer, size_t count, const char* format, va_list va); @@ -105,7 +105,7 @@ int vsnprintf_(char* buffer, size_t count, const char* format, va_list va); * \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character */ #define vprintf vprintf_ -GCC_ATTRIBUTE(format(__printf__, 1, 0)) +PRINTF_GCC_ATTRIBUTE(format(__printf__, 1, 0)) int vprintf_(const char* format, va_list va); @@ -117,7 +117,7 @@ int vprintf_(const char* format, va_list va); * \param format A string that specifies the format of the output * \return 
The number of characters that are sent to the output function, not counting the terminating null character */ -GCC_ATTRIBUTE(format(__printf__, 3, 4)) +PRINTF_GCC_ATTRIBUTE(format(__printf__, 3, 4)) int fctprintf(void (*out)(char character, void* arg), void* arg, const char* format, ...); diff --git a/src/stmes/drivers/sdmmc.c b/src/stmes/drivers/sdmmc.c index e04e64a..e91d012 100644 --- a/src/stmes/drivers/sdmmc.c +++ b/src/stmes/drivers/sdmmc.c @@ -60,6 +60,7 @@ #include #include +#define SDMMC_DRIVER_LOGS 1 #define SDMMC_LOG_COMMANDS 0 // The SDIO peripheral has two DMA streams attached to it, DMA2_Stream3 and @@ -78,6 +79,12 @@ #define rca_arg(card) ((u32)(card)->rca << 16) +#if SDMMC_DRIVER_LOGS +#define sdmmc_log(...) printf(__VA_ARGS__) +#else +#define sdmmc_log(...) ((void)0) +#endif + static struct SdmmcCard sdmmc_card; static struct Mutex sdio_lock = MUTEX_INIT; // A global lock for the SDIO peripheral. @@ -254,7 +261,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { .rca = 0x0000, // The card starts out with a zero RCA. }; - printf("enabling the SDIO peripheral\n"); + sdmmc_log("enabling the SDIO peripheral\n"); __HAL_RCC_SDIO_CLK_ENABLE(); __HAL_RCC_DMA2_CLK_ENABLE(); @@ -270,7 +277,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // TODO: Implement card power control through an external voltage regulator. // NOTE: The power up procedure is described in section 6.4 "Power scheme". - printf("power cycling the card\n"); + sdmmc_log("power cycling the card\n"); // Disable the SDIO clock and reset all of its parameters. CLEAR_BIT(SDIO->CLKCR, SDIO_CLKCR_CLKEN | CLKCR_CLEAR_MASK); // Cut off the power from the SDIO peripheral (the name of this register is a @@ -304,12 +311,12 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // is covered in great detail in the section 4.2 "Card Identification Mode" // of the SD specification, and an overview is given on figures 4-1 and 4-2. 
- printf("starting card initialization\n"); + sdmmc_log("starting card initialization\n"); Systime init_start_time = systime_now(); // The card here may be in any state if the MCU has been rebooted. - printf("resetting the card\n"); + sdmmc_log("resetting the card\n"); for (u32 attempt = 0; attempt < 10; attempt++) { if (!(err = sdmmc_command(SDMMC_CMD0_GO_IDLE_STATE, 0, response))) break; task_sleep(1); @@ -319,7 +326,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // After a software reset with CMD0, the card switches to the Idle state. // Now, CMD8 must be issued to check support for SD spec v2.00-or-later. - printf("sending interface conditions... "); + sdmmc_log("sending interface conditions... "); bool is_v2_x_card; union SdmmcIfCond if_cond = { .word = 0 }; if_cond.bits.check_pattern = 0xAA; // 0b10101010, recommended by the spec @@ -330,9 +337,9 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // The spec recommends using the check pattern (in addition to the CRC // built into the protocol) to test signal integrity. if (resp_if_cond.bits.check_pattern == if_cond.bits.check_pattern) { - printf("card is at least v2.x\n"); + sdmmc_log("card is at least v2.x\n"); } else { - printf("check pattern error\n"); + sdmmc_log("check pattern error\n"); err = SDMMC_ERROR_CMD_CRC_FAIL; goto exit; } @@ -346,7 +353,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // voltages, so if the execution has reached here, it really must've been // an illegal command. is_v2_x_card = false; - printf("card is v1.x\n"); + sdmmc_log("card is v1.x\n"); // Upon receiving an illegal command, the card will set the ILLEGAL_COMMAND // error bit returned in the R1 response of the next command. 
This bit is // reset simply by issuing any other valid command, but in the Idle state @@ -360,7 +367,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // support for High/Extra Capacity cards, and wait until the card reaches // the Ready state. - printf("sending operating conditions...\n"); + sdmmc_log("sending operating conditions...\n"); // A timeout of 1 second for initialization with ACMD41 is recommended by the spec. deadline = timeout_to_deadline(1000); while (true) { @@ -403,15 +410,15 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { } Systime init_time = systime_now() - init_start_time; - printf("initialization completed in %" PRIu32 " ms\n", (u32)systime_as_millis(init_time)); + sdmmc_log("initialization completed in %" PRIu32 " ms\n", (u32)systime_as_millis(init_time)); - printf("starting card identification\n"); + sdmmc_log("starting card identification\n"); Systime ident_start_time = systime_now(); // The card is now in the Ready state. The host now requests its unique CID // register by issuing CMD2. - printf("reading CID data\n"); + sdmmc_log("reading CID data\n"); if ((err = sdmmc_command(SDMMC_CMD2_ALL_SEND_CID, 0, response))) goto exit; // The words in the CID must be flipped. The individual bytes are fine though. card->cid.words[0] = response[3]; @@ -423,10 +430,10 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // state. The host then asks the card to publish its RCA which will be later // used for addressing it when transferring data. - printf("requesting RCA... "); + sdmmc_log("requesting RCA... 
"); if ((err = sdmmc_command(SD_CMD3_SEND_RELATIVE_ADDR, 0, response))) goto exit; card->rca = (u16)(response[0] >> 16); // Extract the top 16 bits of the response - printf("0x%04" PRIX16 "\n", card->rca); + sdmmc_log("0x%04" PRIX16 "\n", card->rca); // NOTE: Strictly speaking the identification procedure is complete - the // card now enters the Stand-by state in the data transfer mode. However, a @@ -435,7 +442,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // section 4.3 "Data Transfer Mode" of the spec and on figure 4-13 "SD Memory // Card State Diagram". - printf("reading CSD register\n"); + sdmmc_log("reading CSD register\n"); // CMD9 must be issued before selecting the card. if ((err = sdmmc_command(SDMMC_CMD9_SEND_CSD, rca_arg(card), response))) goto exit; // The words of the CSD must be reversed as well. @@ -449,17 +456,17 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { bytes_str[0] = '\0'; u32 blocks = sdmmc_get_blocks_count(card); humanize_bytes(bytes_str, sizeof(bytes_str), (u64)blocks * SDMMC_BLOCK_SIZE); - printf("blocks = %" PRIu32 ", capacity = %sB\n", blocks, bytes_str); + sdmmc_log("blocks = %" PRIu32 ", capacity = %sB\n", blocks, bytes_str); } - printf("selecting card 0x%04" PRIX16 "\n", card->rca); + sdmmc_log("selecting card 0x%04" PRIX16 "\n", card->rca); // Selecting the card moves it to the Transfer state. if ((err = sdmmc_command(SDMMC_CMD7_SELECT_CARD, rca_arg(card), response))) goto exit; // TODO: Check the CARD_IS_LOCKED bit at this point u32 scr_data[2]; - printf("reading SCR data\n"); + sdmmc_log("reading SCR data\n"); if ((err = sdmmc_command(SDMMC_CMD16_SET_BLOCKLEN, sizeof(scr_data), response))) goto exit; // For some reason the SCR is sent via the DAT lines and not, for example, // with the long response format R2. 
@@ -489,7 +496,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { case SD_SPEC_V8_X: spec_version_name = "8.00"; break; case SD_SPEC_V9_X: spec_version_name = "9.00"; break; } - printf("card implements specification v%s\n", spec_version_name); + sdmmc_log("card implements specification v%s\n", spec_version_name); // Here comes the most interesting part of the card initialization sequence // and what actually makes my driver the fastest: activation of the @@ -502,13 +509,13 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { const u32 ACCESS_MODE_HIGH_SPEED = 1; struct SdFuncStatus status; - printf("checking support of High-Speed mode... "); + sdmmc_log("checking support of High-Speed mode... "); if ((err = sd_switch_function(false, ACCESS_MODE_GROUP, ACCESS_MODE_HIGH_SPEED, &status))) { goto exit; } if (status.supported && !status.busy && status.selected) { - printf("ok\n"); - printf("switching to High-Speed mode... "); + sdmmc_log("ok\n"); + sdmmc_log("switching to High-Speed mode... "); if ((err = sd_switch_function(true, ACCESS_MODE_GROUP, ACCESS_MODE_HIGH_SPEED, &status))) { goto exit; } @@ -516,9 +523,9 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // The specification requires the host to wait for at least 8 clocks // after CMD8 before making use of the new functions. task_sleep(1); - printf("ok\n"); + sdmmc_log("ok\n"); } else { - printf("fail\n"); + sdmmc_log("fail\n"); } // Confirm the success of the switch by refreshing the CSD register. @@ -527,7 +534,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // is accepted only in the Stand-by state, so the card must be first // deselected, and then re-selected again afterwards to be returned to // the Transfer state. 
- printf("reading CSD register\n"); + sdmmc_log("reading CSD register\n"); if ((err = sdmmc_command(SDMMC_CMD7_DESELECT_CARD, 0, response))) goto exit; if ((err = sdmmc_command(SDMMC_CMD9_SEND_CSD, rca_arg(card), response))) goto exit; card->csd.words[0] = response[3]; @@ -536,14 +543,14 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { card->csd.words[3] = response[0]; if ((err = sdmmc_command(SDMMC_CMD7_SELECT_CARD, rca_arg(card), response))) goto exit; } else { - printf("unsupported\n"); + sdmmc_log("unsupported\n"); } } } // This step isn't strictly necessary, but I guess we might want to equalize // the electrical characteristics of all data pins. - printf("disabling pull-up resistor on CD/DAT3 pin\n"); + sdmmc_log("disabling pull-up resistor on CD/DAT3 pin\n"); if ((err = sdmmc_command(SD_ACMD42_SET_CLR_CARD_DETECT, 0, response))) goto exit; u32 bus_width = SDIO_BUS_WIDE_1B; @@ -551,7 +558,7 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // bus, but a check beforehand won't hurt. if (host_caps->use_4bit_data_bus && card->scr.v1.supports_4bit_wide_bus) { bus_width = SDIO_BUS_WIDE_4B; - printf("switching data bus to 4-bit mode\n"); + sdmmc_log("switching data bus to 4-bit mode\n"); if ((err = sdmmc_command(SD_ACMD6_SET_BUS_WIDTH, 2, response))) goto exit; } @@ -564,11 +571,11 @@ u32 sdmmc_init_card(const struct SdmmcHostCapabilities* host_caps) { // which have variable logical block sizes, with the default not necessarily // being 512 bytes. Newer cards simply ignore CMD16. 
const u32 block_len = SDMMC_BLOCK_SIZE; - printf("setting block length to %" PRIu32 "\n", block_len); + sdmmc_log("setting block length to %" PRIu32 "\n", block_len); if ((err = sdmmc_command(SDMMC_CMD16_SET_BLOCKLEN, block_len, response))) goto exit; Systime ident_time = systime_now() - ident_start_time; - printf( + sdmmc_log( "indentification sequence completed in %" PRIu32 " ms\n", (u32)systime_as_millis(ident_time) ); @@ -832,7 +839,7 @@ static u32 configure_sdio_bus(u32 freq, u32 bus_width, u32 power_saving) { char str[10]; humanize_units(str, sizeof(str), sdio_ck); - printf("setting SDIO_CK to %sHz\n", str); + sdmmc_log("setting SDIO_CK to %sHz\n", str); WRITE_REG(SDIO->CLKCR, clkcr); return sdio_ck; } diff --git a/src/stmes/kernel/CMakeLists.txt b/src/stmes/kernel/CMakeLists.txt index 4102d2a..89ab4f9 100644 --- a/src/stmes/kernel/CMakeLists.txt +++ b/src/stmes/kernel/CMakeLists.txt @@ -4,4 +4,5 @@ target_sources(firmware PRIVATE sync.c task.c time.c + unwind.c ) diff --git a/src/stmes/kernel/crash.c b/src/stmes/kernel/crash.c index 9193e01..5f0b2cc 100644 --- a/src/stmes/kernel/crash.c +++ b/src/stmes/kernel/crash.c @@ -47,12 +47,6 @@ // TODO: A CrashReporter API for formatting the error messages after the crash // and outputting more contextual information. -// TODO: Backtrace and/or stack unwinder. Some ideas: -// -// -// -// - // TODO: Add more CFI directives to the inline assembly because the debugger // probably won't be able to unwind and restore local variables correctly due // to us carelessly messing with the registers. More information: @@ -276,7 +270,7 @@ static u32 hardfault_handler_impl(u32 exc_return, u32 msp, u32 psp) { // Bit 2 of EXC_RETURN determines which stack pointer was in use prior to // entering the exception handler, which, consequently, contains the frame // with the stacked registers. - u32 active_sp = (exc_return & BIT(2)) == 0 ? msp : psp; + u32 active_sp = READ_BIT(exc_return, BIT(2)) == 0 ? 
msp : psp; // See PM0214 section 2.3.7. The hardware ensures correct stack alignment, so // this struct doesn't need to have the __PACKED attribute (as suggested by @@ -303,7 +297,7 @@ static u32 hardfault_handler_impl(u32 exc_return, u32 msp, u32 psp) { ctx->mpu_diagnosis = MPU_FAULT_UNKNOWN; if (ctx->cfsr & SCB_CFSR_MEMFAULTSR_Msk) { - u32 ipsr = regs->xpsr & 0x1FF; + u32 ipsr = regs->xpsr & MASK(9); // This won't tell apart unprivileged load instructions (LDR*T)... Not a // big deal though. bool privileged = ipsr != 0 || (__get_CONTROL() & CONTROL_nPRIV_Msk) == 0; @@ -614,7 +608,7 @@ __NO_RETURN void enter_crash_screen(void) { console_putchar('\n'); console_set_color(0x07); - u32 interrupt_nr = crash_context.cpu_registers.xpsr & 0x1FF; + u32 interrupt_nr = crash_context.cpu_registers.xpsr & MASK(9); if (interrupt_nr != 0) { console_print("[in ISR "); print_number(interrupt_nr, 10); diff --git a/src/stmes/kernel/task.c b/src/stmes/kernel/task.c index 366bd42..436e782 100644 --- a/src/stmes/kernel/task.c +++ b/src/stmes/kernel/task.c @@ -384,8 +384,17 @@ static void syscall_handler_entry(usize arg1, usize arg2, usize arg3, enum Sysca // This function is the bottom-most frame of every task stack and exists // largely to be a barrier for debugger's stack trace unwinder (by being // implemented in assembly and having no hint directives whatsoever). -static __NO_RETURN __NAKED void task_launchpad(__UNUSED void* user_data, __UNUSED TaskFn* func) { +static CLANG_ATTRIBUTE(nouwtable) __NO_RETURN __NAKED + void task_launchpad(__UNUSED void* user_data, __UNUSED TaskFn* func) { __ASM volatile( // + // Clang doesn't support the ARM unwinding directives in naked functions. + // Well, at least it has an attribute for marking functions as CANTUNWIND. 
+#if !defined(__clang__) && ARM_UNWIND_DIRECTIVES + // Generate a "refuse to unwind" opcode instead of using the ".cantunwind" + // directive to make the runtime stack unwinder stop at this function, but + // not ignore it, and include it in the backtrace. + ".unwind_raw 0, 0x80, 0x00\n\t" +#endif // Call the function passed in the second parameter with the user data as // its first parameter which is already in r0. "blx r1\n\t" @@ -495,13 +504,14 @@ static __NAKED void context_switch(__UNUSED enum Syscall syscall_nr) { // can be freely clobbered by us since the hardware will restore them. // + // TODO: Add the ARM stack unwinding instructions for this function __ASM volatile( // // We need to back up the LR register (since we will need its EXC_RETURN // value for returning from the exception) before calling the scheduler // function, but to keep the stack aligned, another register has to be // backed up as well (which will come in handy later). "push {r4, lr}\n\t" -#ifdef CFI_DIRECTIVES +#if CFI_DIRECTIVES // The CFI directives make the assembler put a certain section into the // binary that informs the debugger how to unwind the stack and recover // local variables in caller frames. They don't generate any machine code @@ -539,7 +549,7 @@ static __NAKED void context_switch(__UNUSED enum Syscall syscall_nr) { // otherwise it will be lost: "mov r1, r4\n\t" "pop {r4, lr}\n\t" -#ifdef CFI_DIRECTIVES +#if CFI_DIRECTIVES // Here come the CFI directives once again, this time to inform the // debugger that the values of the backed up registers are the same that // they were at the beginning of the function. @@ -585,7 +595,7 @@ static __NAKED void context_switch(__UNUSED enum Syscall syscall_nr) { // Handler mode, and we always use PSP for tasks anyway) and CONTROL.FPCA // are reset upon entering the interrupt. 
"mrs r3, control\n\t" -#if __FPU_USED == 1 +#if __FPU_USED // The bit 4 of EXC_RETURN determines whether the stacked state includes // the FPU registers, and thus whether the task has issued any FPU // instructions so far. @@ -615,7 +625,7 @@ static __NAKED void context_switch(__UNUSED enum Syscall syscall_nr) { // Pop the other task's core registers from its stack, its value of the // CONTROL register into r3 and its EXC_RETURN value into LR. "ldmia r2!, {r3, r4-r11, lr}\n\t" -#if __FPU_USED == 1 +#if __FPU_USED // Reload the next task's floating-point context. See the note about // preserving it above. "tst lr, #16\n\t" @@ -672,6 +682,7 @@ __NAKED void SVC_Handler(void) { // tail-chained, the register values will be those at the point of the `SVC` // instruction. + // TODO: Add the ARM stack unwinding instructions for this function __ASM volatile( // // The bit 2 of LR specifies which stack was in use prior to entering the // interrupt. Use it to figure out whether the caller was using MSP or PSP. @@ -698,7 +709,7 @@ __NAKED void SVC_Handler(void) { // The normal syscall entry path. Save the LR and the syscall number before // calling the handler. "push {r0, lr}\n\t" -#ifdef CFI_DIRECTIVES +#if CFI_DIRECTIVES ".cfi_adjust_cfa_offset 8\n\t" ".cfi_rel_offset r0, 0\n\t" ".cfi_rel_offset lr, 4\n\t" @@ -720,7 +731,7 @@ __NAKED void SVC_Handler(void) { // Restore the LR and the syscall number, loading it into the first // argument register. "pop {r0, lr}\n\t" -#ifdef CFI_DIRECTIVES +#if CFI_DIRECTIVES ".cfi_adjust_cfa_offset -8\n\t" ".cfi_restore r0\n\t" ".cfi_restore lr\n\t" diff --git a/src/stmes/kernel/unwind.c b/src/stmes/kernel/unwind.c new file mode 100644 index 0000000..317b937 --- /dev/null +++ b/src/stmes/kernel/unwind.c @@ -0,0 +1,532 @@ +// This module performs stack unwinding by utilizing the metadata used for C++ +// exception handling. 
For an exception to be able to "bubble up" the call +// stack until it reaches a try-catch block, the compiler generates special +// tables with instructions on how to interpret the stack frame created by any +// given function, even in the absence of a frame pointer (i.e. when using +// -fomit-frame-pointer), plus some additional information like where on the +// stack local variables reside. These tables can also be generated when +// compiling C functions, so that exceptions may be thrown through them (even +// though they have no way of catching them) when mixing C and C++ code. The +// language runtime then uses this information when an exception is thrown to +// essentially forcibly perform an early return from every function by +// restoring the registers from the stack, running the destructors of local +// variables in the process, and one by one pop each frame off the stack before +// reaching the enclosing catch block and resuming execution there - this is +// what's called stack unwinding. However, this metadata may also be used for +// other purposes, e.g. to just analyze and recover the function call stack and +// produce a backtrace, which is this module's primary use-case. +// +// It turns out that ARM defines its own special ABI for the stack unwinding +// tables, found in sections `.ARM.extab` and `.ARM.exidx` of the binary, which +// are pretty compact and relatively easy to parse and interpret. The format of +// these tables is principally described in the ARM EHABI32 specification: +// +// +// (with the relevant info being contained in sections 6, 7 and 10). 
+// +// Additional resources that were helpful in understanding that stuff: +// +// +// +// +// - even more interesting links on the matter +// +// +// +// +// Some existing unwinder implementations: +// +// +// +// +// (the last two are largely derived from the Linux code) +// +// These are involved in implementing the whole exception handling runtime +// infrastructure, not only unwinding: +// +// +// +// +// +// +// +// There exist alternative approaches of stack unwinding, based on analyzing +// the function prologues and epilogues: +// +// +// +// +// Or using frame pointers, which turn the stack into a linked list of frames: +// +// Or by looking for LR pointers on the stack with sophisticated heuristics: +// +// +// This link is not super relevant, but contains interesting discussions on the +// internals of the ARM architecture: +// + +// TODO: Unwinding through interrupt frames +// TODO: Stack pointer range checks + +#include "stmes/kernel/unwind.h" +#include "stmes/kernel/crash.h" +#include "stmes/kernel/task.h" +#include "stmes/utils.h" +#include + +#if 0 +#define unwind_log(...) printf(__VA_ARGS__) +#else +#define unwind_log(...) ((void)0) +#endif + +// An entry in the `.ARM.exidx` table. +struct UnwindIndex { + // An R_ARM_PREL31 relocation or a so-called "prel31 offset" which points to + // the start of a function covered by this entry. The region of this function + // ends where that of the next entry starts. + u32 function_addr; + // Contains either a prel31 offset to the list of unwinding instructions in + // the `.ARM.extab` segment or an inlined list of instructions when it is + // short enough (which it often is for C functions). + u32 instructions; +}; + +// The pointers to the boundaries of the `.ARM.exidx` segment, are provided by +// the linker script. +extern const struct UnwindIndex __exidx_start[], __exidx_end[]; + +// Decodes a prel31 offset to an absolute address. 
+__STATIC_FORCEINLINE usize prel31_to_addr(const u32* ptr) { + // Sign-extend the low 31 bits (the 32nd bit is used for data) to a 32-bit + // int. + i32 offset = ((i32)*ptr) << 1 >> 1; + return (usize)ptr + offset; +} + +// The `.ARM.exidx` table is sorted by the function address in ascending order, +// which means we can binary-search it. +static const struct UnwindIndex* +unwind_search_index(usize addr, const struct UnwindIndex* start, const struct UnwindIndex* end) { + if (end <= start) { + return NULL; // Return immediately if the table is empty + } + while (start < end - 1) { // Unsure about the plus/minus ones here, but, well, it works + const struct UnwindIndex* middle = start + (end - start + 1) / 2; + if (addr < prel31_to_addr(&middle->function_addr)) { + end = middle; + } else { + start = middle; + } + } + if (addr < prel31_to_addr(&start->function_addr)) { + // The requested address points outside the index table. + // NOTE: This branch only handles addresses before the index with the + // smallest address. For catching addresses after the end of the memory + // region covered by the unwind tables the compiler emits an entry with the + // EXIDX_CANTUNWIND bit at the very end of the index table. 
+ return NULL; + } else { + return start; + } +} + +// A crude way of getting the symbol names from within the firmware without +// access to the symbol table, depends on the conveniently available (since at +// least 1999) option -mpoke-function-name: +// +// +const char* peek_function_name(usize addr, usize* name_len) { +#if ARM_POKE_FUNCTION_NAME + const u32 flag_bits = 0xFF000000; + u32 word = *(const u32*)(addr - 4); + if (test_all_bits(word, flag_bits)) { + *name_len = clear_bit(word, flag_bits); + return (const char*)(addr - 4 - *name_len); + } +#else + UNUSED(addr); +#endif + *name_len = 0; + return NULL; +} + +struct UnwindReader { + const u32* ptr; + usize len, pos; + u32 modified_registers; +}; + +static enum UnwindError +unwind_create_reader(struct UnwindReader* reader, const struct UnwindIndex* index) { + const u32* instructions; + bool is_inline = false; + if (index->instructions & BIT(31)) { + is_inline = true; // An inline entry embedded in `.ARM.exidx` + instructions = &index->instructions; + } else { + // A prel31 pointer to an entry in `.ARM.extab` + instructions = (const u32*)prel31_to_addr(&index->instructions); + } + + bool is_compact = *instructions & BIT(31); + if (!is_compact) return UNWIND_NOT_COMPACT_ENTRY; + reader->ptr = instructions; + + u8 personality = EXTRACT_BITS(*instructions, 24, 7); + if (personality == 0) { + // A short description inlined into the remaining bytes of the word. + reader->pos = 1; + reader->len = 4; + return UNWIND_OK; + } else if (is_inline) { + // Entries inlined into the index can only have personality 0. + return UNWIND_UNSUPPORTED_PERSONALITY; + } else if (personality == 1 || personality == 2) { + // A long description consisting of N words, plus two inlined bytes. 
+ reader->pos = 2; + reader->len = 4 + 4 * EXTRACT_BITS(*instructions, 16, 8); + return UNWIND_OK; + } else { + return UNWIND_UNSUPPORTED_PERSONALITY; + } +} + +static u8 unwind_read_byte(struct UnwindReader* reader) { + u32 pos = reader->pos; + ASSERT(pos < reader->len); // TODO: Handle malformed instructions more gracefully... + reader->pos = pos + 1; + u8 byte = reader->ptr[pos / 4] >> ((3 - pos % 4) * 8); + unwind_log("%02" PRIX8 " ", byte); + return byte; +} + +static u32 unwind_read_uleb128(struct UnwindReader* reader) { + // + // NOTE: I chose to decode at most 4 bytes, which can encode a 28 bit number + // with the max value of 0xFFFFFFF. This gives a maximum possible SP + // increment of 1073742336, which is more than enough for all practical + // intents and purposes. + u32 result = 0; + u32 shift = 0; + u8 byte; + do { + ASSERT(shift < sizeof(u32) * 7); + byte = unwind_read_byte(reader); + result |= (byte & MASK(7)) << shift; + shift += 7; + } while (byte & BIT(7)); + return result; +} + +static enum UnwindError +unwind_pop_registers(struct UnwindReader* reader, struct UnwindContext* ctx, u32 regs_mask) { + if (regs_mask == 0) { + return UNWIND_RESERVED_SPARE_INSTRUCTION; + } + unwind_log("pop {"); + const u32* sp = (const u32*)ctx->registers[REG_SP]; + // Find the first and the last set bits in the mask to reduce the number of + // useless iterations in the loop below. Typically, a contiguous range of + // core registers will be popped. + u32 first_reg = __builtin_ctz(regs_mask), last_reg = 31 - __builtin_clz(regs_mask); + last_reg = MIN(last_reg, SIZEOF(ctx->registers) - 1); + for (u32 i = first_reg; i <= last_reg; i++) { + if (regs_mask & BIT(i)) { + ctx->registers[i] = *sp++; + unwind_log("%sr%" PRIu32, i != first_reg ? 
", " : "", i); + } + } + if (!(regs_mask & BIT(REG_SP))) { + ctx->registers[REG_SP] = (usize)sp; + } + reader->modified_registers |= regs_mask | BIT(REG_SP); + unwind_log("}\n"); + return UNWIND_OK; +} + +// What the hell is the FSTMFDX instruction? +// +enum FpuFrame { FPU_FRAME_VPUSH, FPU_FRAME_FSTMFDX }; + +static enum UnwindError unwind_pop_fpu_registers( + struct UnwindReader* reader, + struct UnwindContext* ctx, + u32 first_reg, + u32 regs_count, + enum FpuFrame frame_type +) { + u32 max_reg = frame_type == FPU_FRAME_FSTMFDX ? 16 : 32; + if (first_reg >= max_reg || first_reg + regs_count > max_reg || regs_count < 1) { + return UNWIND_RESERVED_SPARE_INSTRUCTION; + } + // The FPU registers aren't actually tracked right now, only the SP is + // sufficiently adjusted AS IF a bunch of double-precision FPU registers have + // been popped. + ctx->registers[REG_SP] += 8 * regs_count + (frame_type == FPU_FRAME_FSTMFDX ? 4 : 0); + reader->modified_registers |= BIT(REG_SP); + if (regs_count > 1) { + unwind_log("vpop {d%" PRIu32 "-d%" PRIu32 "}\n", first_reg, first_reg + regs_count - 1); + } else { + unwind_log("vpop {d%" PRIu32 "}\n", first_reg); + } + return UNWIND_OK; +} + +// A comprehensive summary of the unwind instruction encodings: +// +static enum UnwindError +unwind_exec_instruction(struct UnwindReader* reader, struct UnwindContext* ctx) { + u8 insn = unwind_read_byte(reader); + + if ((insn & BIT(7)) == 0) { + // 00xxxxxx: vsp = vsp + (xxxxxx << 2) + 4 + // 01xxxxxx: vsp = vsp - (xxxxxx << 2) - 4 + u32 offset = 4 + (insn & MASK(6)) * 4; + if ((insn & BIT(6)) == 0) { + ctx->registers[REG_SP] += offset; + } else { + ctx->registers[REG_SP] -= offset; + } + reader->modified_registers |= BIT(REG_SP); + unwind_log("sp %c= %" PRIu32 "\n", (insn & BIT(6)) ? 
'-' : '+', offset); + return UNWIND_OK; + } + + switch ((insn >> 4) & MASK(3)) { + case 0: { + // 1000iiii iiiiiiii: pop under masks {r15-r12}, {r11-r4} + u8 insn2 = unwind_read_byte(reader); + u32 regs_mask = ((insn & MASK(4)) << 12) | ((insn2 & MASK(8)) << 4); + if (regs_mask == 0) { + // 10000000 00000000: refuse to unwind + unwind_log("refused\n"); + return UNWIND_REFUSED; + } + return unwind_pop_registers(reader, ctx, regs_mask); + } + + case 1: { + // 1001nnnn: vsp = r[nnnn] + u32 reg = insn & MASK(4); + if (reg == 13 || reg == 15) { + // 10011101: reserved for register-to-register moves + // 10011111: reserved for Intel WMMX register to ARM register moves + return UNWIND_RESERVED_SPARE_INSTRUCTION; + } + ctx->registers[REG_SP] = ctx->registers[reg]; + reader->modified_registers |= BIT(REG_SP); + unwind_log("sp = r%" PRIu32 "\n", reg); + return UNWIND_OK; + } + + case 2: { + // 10100nnn: pop r4-r[4+nnn] + // 10101nnn: pop r4-r[4+nnn], r14 + u32 last_reg = 4 + (insn & MASK(3)); + u32 regs_mask = MASK(last_reg + 1) & ~MASK(4); + if ((insn & BIT(3)) != 0) regs_mask |= BIT(REG_LR); + return unwind_pop_registers(reader, ctx, regs_mask); + } + + case 3: { + if ((insn & BIT(3)) != 0) { + // 10111nnn: pop D[8]-D[8+nnn] saved with FSTMFDX (inclusive range, i.e. nnn+1 registers) + u32 regs_count = (insn & MASK(3)) + 1; + return unwind_pop_fpu_registers(reader, ctx, 8, regs_count, FPU_FRAME_FSTMFDX); + } + + switch (insn & MASK(3)) { + case 0: { + // 10110000: finish + unwind_log("finish\n"); + reader->pos = reader->len; // Skip to the end of the instruction stream + return UNWIND_OK; + } + + case 1: { + // 10110001 0000iiii: pop under mask {r3,r2,r1,r0} + u8 insn2 = unwind_read_byte(reader); + u32 regs_mask = insn2 & MASK(4); + if (regs_mask == 0 || (insn2 & ~MASK(4)) != 0) { + // 10110001 00000000: spare + // 10110001 xxxxyyyy: spare (xxxx != 0) + return UNWIND_RESERVED_SPARE_INSTRUCTION; + } + return unwind_pop_registers(reader, ctx, regs_mask); + } + + case 2: { + // 10110010 uleb128: vsp = vsp + 0x204 + 
(uleb128 << 2) + u32 offset = 0x204 + unwind_read_uleb128(reader) * 4; + ctx->registers[REG_SP] += offset; + reader->modified_registers |= BIT(REG_SP); + unwind_log("sp += %" PRIu32 "\n", offset); + return UNWIND_OK; + } + + case 3: { + // 10110011 sssscccc: pop D[ssss]-D[ssss+cccc] saved with FSTMFDX + u8 insn2 = unwind_read_byte(reader); + u32 first_reg = (insn2 >> 4) & MASK(4), regs_count = (insn2 & MASK(4)) + 1; + return unwind_pop_fpu_registers(reader, ctx, first_reg, regs_count, FPU_FRAME_FSTMFDX); + } + } + break; + } + + case 4: { + if ((insn & BIT(3)) != 0) { + if ((insn & BITS4(0, 1, 1, 0)) != 0) { + // 11001yyy: spare (yyy != 000, 001) + return UNWIND_RESERVED_SPARE_INSTRUCTION; + } + // 11001000 sssscccc: pop D[16+ssss]-D[16+ssss+cccc] saved with VPUSH + // 11001001 sssscccc: pop D[ssss]-D[ssss+cccc] saved with VPUSH + u8 insn2 = unwind_read_byte(reader); + u32 first_reg = (insn2 >> 4) & MASK(4), regs_count = (insn2 & MASK(4)) + 1; + if ((insn & BIT(0)) == 0) first_reg += 16; + return unwind_pop_fpu_registers(reader, ctx, first_reg, regs_count, FPU_FRAME_VPUSH); + } + break; + } + + case 5: { + if ((insn & BIT(3)) == 0) { + // 11010nnn: pop D[8]-D[8+nnn] saved with VPUSH (inclusive range, i.e. nnn+1 registers) + u32 regs_count = (insn & MASK(3)) + 1; + return unwind_pop_fpu_registers(reader, ctx, 8, regs_count, FPU_FRAME_VPUSH); + } + break; + } + } + + return UNWIND_RESERVED_SPARE_INSTRUCTION; +} + +enum UnwindError unwind_frame(struct UnwindContext* ctx, struct UnwindFrame* frame) { + enum UnwindError err = UNWIND_OK; + + *frame = (struct UnwindFrame){ 0 }; + + // Clear the Thumb state bit of the address in PC + usize pc = clear_bit(ctx->registers[REG_PC], BIT(0)); + frame->instruction_addr = (void*)pc; + frame->stack_ptr = (void*)ctx->registers[REG_SP]; + + // NOTE: This is a bad idea, I am disassembling the wrong instruction. 
+ // + // u8 pc_bits_8_15 = *(const u8*)(pc + 1); + // pc -= 2; + // if ((pc_bits_8_15 >> 5) == 0x7) { + // u8 op1 = (pc_bits_8_15 >> 3) & MASK(2); + // if (op1 == 0x1 || op1 == 0x2 || op1 == 0x3) { + // pc -= 2; + // } + // } + + const struct UnwindIndex* index = unwind_search_index(pc, __exidx_start, __exidx_end); + if (index == NULL || index->instructions == /* EXIDX_CANTUNWIND */ 0x1) { + return UNWIND_NO_MATCHING_ENTRY; + } + + usize func_addr = prel31_to_addr(&index->function_addr); + frame->function_addr = (void*)func_addr; + frame->function_name = peek_function_name(func_addr, &frame->function_name_len); + + struct UnwindReader reader; + if ((err = unwind_create_reader(&reader, index))) return err; + + reader.modified_registers = 0; + while (reader.pos < reader.len) { + if ((err = unwind_exec_instruction(&reader, ctx))) return err; + } + + // Prevent infinite loops if none of the important registers were changed. + if (!test_any_bit(reader.modified_registers, BIT(REG_SP) | BIT(REG_LR) | BIT(REG_PC))) { + return UNWIND_REFUSED; + } + + // `mov pc, lr` if the PC hasn't already been touched. + if (!test_bit(reader.modified_registers, BIT(REG_PC))) { + ctx->registers[REG_PC] = ctx->registers[REG_LR]; + } + + return err; +} + +// Records the register values AT THE CALL SITE into the given context struct, +// so that unwinding starts directly at the caller function. Unfortunately, due +// to the fact that the pointer to the context is passed in r0, the first +// argument register, its original value will be lost, but honestly, who cares? +__NOINLINE __NAKED void unwind_capture_context(__UNUSED struct UnwindContext* ctx) { + __ASM volatile( // + // r13 and r15 (SP and PC respectively) can't be present in the register + // list of the `STM` instruction (the encoding doesn't allow it), so save + // what we can first. The address in r0 is not incremented, so it can be + // present in the list. 
+ "stmia r0, {r0-r12}\n\t" + // Now, since the task at hand is to create a context as if being located + // at the call site, write our current return address and stack pointer as + // the values of PC and SP. The calling convention didn't require changing + // the SP here in any way, and the LR points to the next instruction in the + // calling function, so as far as I can tell this is completely legal. + "str sp, [r0, #(4 * 13)]\n\t" // REG_SP + "str lr, [r0, #(4 * 14)]\n\t" // REG_LR + "str lr, [r0, #(4 * 15)]\n\t" // REG_PC + // Aaand return. + "bx lr" + ); +} + +// A prototype of recording the context of another running task to be able to +// get its backtrace. +void unwind_capture_task_context(struct UnwindContext* ctx, const struct Task* task) { + struct ExceptionStackedContext { + u32 r0, r1, r2, r3, r12, lr, pc, xpsr; + }; + + struct TaskStackedContext { + u32 control, r4, r5, r6, r7, r8, r9, r10, r11, exc_return; + }; + + // We can't trace ourselves like that - the stack must contain a valid + // context switching frame. 
+ ASSERT(task != get_current_task()); + const u8* stack_ptr = task->stack_ptr; + + const struct TaskStackedContext* task_ctx = (void*)stack_ptr; + stack_ptr += sizeof(*task_ctx); + ASSERT((task_ctx->exc_return & BIT(4)) != 0); // TODO + for (usize reg = 4; reg <= 11; reg++) { + ctx->registers[reg] = (&task_ctx->r4)[reg - 4]; + } + + const struct ExceptionStackedContext* exc_ctx = (void*)stack_ptr; + stack_ptr += sizeof(*exc_ctx); + for (usize reg = 0; reg <= 3; reg++) { + ctx->registers[reg] = (&exc_ctx->r0)[reg]; + } + ctx->registers[12] = exc_ctx->r12; + ctx->registers[REG_LR] = exc_ctx->lr; + ctx->registers[REG_PC] = exc_ctx->pc; + ctx->registers[REG_SP] = (usize)stack_ptr; +} + +enum UnwindError backtrace(struct UnwindContext* ctx) { + struct UnwindFrame frame; + for (u32 i = 0; true; i++) { + enum UnwindError err = unwind_frame(ctx, &frame); + if (err != UNWIND_OK && err != UNWIND_REFUSED) break; + printf( + "%2" PRIu32 ": %p %.*s+0x%0" PRIXPTR "\n", + i + 1, + frame.instruction_addr, + frame.function_name_len, + frame.function_name != NULL ? 
frame.function_name : "", + (usize)frame.instruction_addr - (usize)frame.function_addr + ); + if (err == UNWIND_REFUSED) break; + } + return UNWIND_OK; +} diff --git a/src/stmes/kernel/unwind.h b/src/stmes/kernel/unwind.h new file mode 100644 index 0000000..b53bc1d --- /dev/null +++ b/src/stmes/kernel/unwind.h @@ -0,0 +1,50 @@ +#pragma once + +#include "stmes/kernel/task.h" +#include "stmes/utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + REG_SL = 10, + REG_FP = 11, + REG_IP = 12, + REG_SP = 13, + REG_LR = 14, + REG_PC = 15, +}; + +enum UnwindError { + UNWIND_OK = 0, + UNWIND_REFUSED, + UNWIND_RESERVED_SPARE_INSTRUCTION, + UNWIND_NO_MATCHING_ENTRY, + UNWIND_UNSUPPORTED_PERSONALITY, + UNWIND_NOT_COMPACT_ENTRY, +}; + +struct UnwindContext { + usize registers[16]; +}; + +struct UnwindFrame { + void* stack_ptr; + void* instruction_addr; + void* function_addr; + const char* function_name; + usize function_name_len; +}; + +void unwind_capture_context(struct UnwindContext* ctx); +void unwind_capture_task_context(struct UnwindContext* ctx, const struct Task* task); + +enum UnwindError unwind_frame(struct UnwindContext* ctx, struct UnwindFrame* frame); +enum UnwindError backtrace(struct UnwindContext* ctx); + +const char* peek_function_name(usize addr, usize* name_len); + +#ifdef __cplusplus +} +#endif diff --git a/src/stmes/newlib_support.c b/src/stmes/newlib_support.c index b8533ca..23b68d5 100644 --- a/src/stmes/newlib_support.c +++ b/src/stmes/newlib_support.c @@ -13,14 +13,27 @@ #include #include +#ifndef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-declarations" +#endif + // Define fallbacks for _init/_fini as weak symbols, so that linking doesn't // fail in case the crt*.o objects are missing. 
-void _init(void); -void _fini(void); __WEAK void _init(void) {} __WEAK void _fini(void) {} -__USED void* _sbrk(ptrdiff_t incr) { +void __aeabi_unwind_cpp_pr0(void) { + __builtin_trap(); +} +void __aeabi_unwind_cpp_pr1(void) { + __builtin_trap(); +} +void __aeabi_unwind_cpp_pr2(void) { + __builtin_trap(); +} + +void* _sbrk(ptrdiff_t incr) { extern u32 __heap_start[], __heap_end[]; // These are defined by the linker script // Static initializers are allowed to reference addresses of other symbols static usize heap_ptr = (usize)__heap_start; @@ -69,6 +82,15 @@ off_t _lseek(int fd, off_t offset, int whence) { return 0; } +void _exit(int code) { + UNUSED(code); + __builtin_trap(); +} + +#ifndef __clang__ +#pragma GCC diagnostic pop +#endif + // Newlib has a facility for locking under a multi-threaded system, which // simply requires providing all locking-related subroutines to override the // default empty stubs. Note that replacements for every symbol must be diff --git a/src/stmes/newlib_support.h b/src/stmes/newlib_support.h index a159a0e..38baad4 100644 --- a/src/stmes/newlib_support.h +++ b/src/stmes/newlib_support.h @@ -12,6 +12,7 @@ int _write(int fd, const char* buf, size_t len); int _read(int fd, char* buf, size_t len); off_t _lseek(int fd, off_t offset, int whence); int _close(int fd); +void _exit(int code); #ifdef __cplusplus } diff --git a/src/stmes/utils.h b/src/stmes/utils.h index 109a108..d5c8666 100644 --- a/src/stmes/utils.h +++ b/src/stmes/utils.h @@ -51,6 +51,12 @@ extern "C" { #define unlikely(x) (x) #endif +#ifdef __clang__ +#define CLANG_ATTRIBUTE(expr) __attribute__((expr)) +#else +#define CLANG_ATTRIBUTE(expr) +#endif + #define __WEAK_ALIAS(name) __attribute__((weak, alias(name))) #define __ALIAS(name) __attribute__((alias(name))) #define __SECTION(name) __attribute__((section(name))) @@ -112,6 +118,10 @@ i32 humanize_bytes(char* buf, usize buf_size, i64 bytes); #define test_bit(x, bit) (((x) & (bit)) != 0) #define test_any_bit(x, bit) (((x) & 
(bit)) != 0) #define test_all_bits(x, bit) (((x) & (bit)) == bit) +#define set_bit(x, bit) ((x) | (bit)) +#define clear_bit(x, bit) ((x) & ~(bit)) +#define read_bit(x, bit) ((x) & (bit)) +#define toggle_bit(x, bit) ((x) ^ (bit)) #ifdef __cplusplus }