From 74ada5be8a2f2b675ac605069f0a1feea34fe83f Mon Sep 17 00:00:00 2001 From: Yu Ding Date: Mon, 5 Aug 2024 16:01:29 -0700 Subject: [PATCH] Remove the hard coded page tables Currently the guest VM starts with a hardcoded page table to jump to 64-bit long mode, then sets up another one in the low memory area, and we accept a 2MiB page for the low memory area. However, the Linux kernel may not like it if the memory between 640KiB and 1MiB has already been accepted. In this CR we use the 640KiB TEMP_MEM section in the low memory area for the page tables. The hypervisor adds it and the guest can use it without doing a tdcall, so we can dynamically set up the page tables in the .bss section. Change-Id: I7dfc18304e91fbbb7849d2bd2fbc1784ba5bae91 --- stage0_bin_tdx/layout.ld | 34 --------------- stage0_bin_tdx/src/asm/tdx.s | 80 +++++++++++++++++------------------- 2 files changed, 38 insertions(+), 76 deletions(-) diff --git a/stage0_bin_tdx/layout.ld b/stage0_bin_tdx/layout.ld index 7ea222cd36..8bacdb5a32 100644 --- a/stage0_bin_tdx/layout.ld +++ b/stage0_bin_tdx/layout.ld @@ -246,40 +246,6 @@ SECTIONS { ASSERT(. == TOP - BFV_SIZE + 120 + 24 + 4, "wrong BFV header size") - /* The ROM area is read only. We need to hard code the page table here - * and use it to switch to 64-bit long mode. After entering the 64-bit - * long mode, we are able to make tdcall to accept memory for ram_low - * section and create another set of page tables there. */ - .page_tables ALIGN(4K): AT (TOP - BFV_SIZE + 0x1000) { - bios_pml4 = .; /* CR3 */ - QUAD(ADDR(.page_tables) + 0x1000 + 0x23) /* 0..512GiB */ - FILL(0) - . = bios_pml4 + 0x1000; - - bios_pdpt = .; - QUAD(ADDR(.page_tables) + 0x2000 + 0x23) /* 0..1GiB */ - QUAD(0) - QUAD(0) - QUAD(ADDR(.page_tables) + 0x3000 + 0x23) /* 3..4GiB */ - FILL(0) - . = bios_pdpt + 0x1000; - - bios_pd_0 = .; - QUAD(0x0 + 0xA3) /* 0..2MiB, HUGE_PAGE | PAGE_ACCESSED | PAGE_READ_WRITE | PAGE_PRESENT */ - FILL(0) - . = bios_pd_0 + 0x1000; - - bios_pd_3 = .; - FILL(0) - . 
= bios_pd_3 + 0xFF8; - QUAD(0xFFE000A1) /* 4GiB-2MiB..4GiB, HUGE_PAGE | PAGE_ACCESSED | PAGE_PRESENT */ - } > bios - ASSERT((bios_pml4 == ADDR(.page_tables)), "wrong pml4 address") - ASSERT((bios_pdpt == ADDR(.page_tables) + 0x1000), "wrong pdpt address") - ASSERT((bios_pd_0 == ADDR(.page_tables) + 0x2000), "wrong pd_0 address") - ASSERT((bios_pd_3 == ADDR(.page_tables) + 0x3000), "wrong pd_3 address") - ASSERT((. == ADDR(.page_tables) + 0x4000), "wrong page table size") - .rodata : { /* Include large section (.lrodata) to support large code model. * See . diff --git a/stage0_bin_tdx/src/asm/tdx.s b/stage0_bin_tdx/src/asm/tdx.s index 3453e14936..04c6e65a41 100644 --- a/stage0_bin_tdx/src/asm/tdx.s +++ b/stage0_bin_tdx/src/asm/tdx.s @@ -43,9 +43,44 @@ _tdx_32bit_long_mode_start: bts $0x05, %eax # PAE movl %eax, %cr4 - # page tables are set in the linker script - movl $bios_pml4, %ecx - movl %ecx, %cr3 + # Clear BSS: base address goes to EDI, value (0) goes to EAX, + # count goes into ECX. Page tables will be located in BSS. + movl $bss_start, %edi + movl $bss_size, %ecx + xorl %eax, %eax + rep stosb + + # Set the first entry of PML4 to point to PDPT (0..512GiB). + movl ${pdpt}, %esi + orl $3, %esi # esi |= 3 (PRESENT and WRITABLE) + movl %esi, ({pml4}) # set first half of PML4[0] + + # Set the first entry of PDPT to point to PD_0 (0..1GiB). + movl ${pd_0}, %esi + orl $3, %esi # esi |= 3 (PRESENT and WRITABLE) + movl %esi, ({pdpt}) # set first half of PDPT[0] + + # Set the fourth entry of PDPT to point to PD_3 (3..4GiB). + movl ${pdpt}, %eax + movl ${pd_3}, %esi + orl $3, %esi # esi |= 3 (PRESENT and WRITABLE) + movl %esi, 24(%eax) # set first half of PDPT[3], each entry is 8 bytes + + # Set the first entry of PD_0 to point to an identity-mapped huge page (0..2MiB). 
+ movl $0x83, %esi # esi = 0x0 | 131 (PRESENT and WRITABLE and HUGE_PAGE) + movl %esi, ({pd_0}) # set first half of PD_0[0] + + # Set the last entry of PD_3 to point to an identity-mapped 2MiB huge page ((4GiB-2MiB)..4GiB). + # This is where the firmware ROM image is mapped, so we don't make it writable. + movl ${pd_3}, %eax + movl $0xFFE00000, %esi # address of 4GiB-2MiB + orl $0x81, %esi # esi |= 129 (PRESENT and HUGE_PAGE) + movl %esi, 0xFF8(%eax) # set first half of PD_3[511], each entry is 8 bytes + + # Load CR3 with the address of the PML4 built above + #xorl %eax, %eax + movl ${pml4}, %eax + movl %eax, %cr3 # In a TDX VM, IA32_EFER msr is set by tdx module. # No need for rdmsr/wrmsr @@ -80,50 +115,11 @@ _tdx_64bit_start: movl $data_size, %ecx rep movsd - # Clear BSS: base address goes to EDI, value (0) goes to EAX, - # count goes into ECX. - movq $bss_start, %rdi - movq $bss_size, %rcx - xorq %rax, %rax - rep stosq - # Set up the stack. Stack now is in ram_low movl $stack_start, %esp push $0 movl $0xdeadbeaf, (TEST_DATA) - # Set the first entry of PML4 to point to PDPT (0..512GiB). - movl ${pdpt}, %esi - orl $3, %esi # esi |= 3 (PRESENT and WRITABLE) - movl %esi, ({pml4}) # set first half of PML4[0] - - # Set the first entry of PDPT to point to PD_0 (0..1GiB). - movl ${pd_0}, %esi - orl $3, %esi # esi |= 3 (PRESENT and WRITABLE) - movl %esi, ({pdpt}) # set first half of PDPT[0] - - # Set the fourth entry of PDPT to point to PD_3 (3..4GiB). - movl ${pdpt}, %eax - movl ${pd_3}, %esi - orl $3, %esi # esi |= 3 (PRESENT and WRITABLE) - movl %esi, 24(%eax) # set first half of PDPT[3], each entry is 8 bytes - - # Set the first entry of PD_0 to point to and identity mapped huge page (0..2MiB). - movl $0x83, %esi # esi = 0x0 | 131 (PRESENT and WRITABLE and HUGE_PAGE) - movl %esi, ({pd_0}) # set first half of PD_0[0] - - # Set the last entry of PD_3 to point to an identity-mapped 2MiB huge page ((4GiB-2MiB)..4GiB). 
- # This is where the firmware ROM image is mapped, so we don't make it writable. - movl ${pd_3}, %eax - movl $0xFFE00000, %esi # address of 4GiB-2MiB - orl $0x81, %esi # esi |= 129 (PRESENT and HUGE_PAGE) - movl %esi, 0xFF8(%eax) # set first half of PML4[511], each entry is 8 bytes - - # Reload PML4 to use the writable PML4 - xorq %rax, %rax - movl ${pml4}, %eax - movq %rax, %cr3 - # ...and jump to Rust code. jmp rust64_start