[PMP] add support for NA4 and NAPOT modes #566

Merged (6 commits, Apr 1, 2023)
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -31,6 +31,7 @@ mimpid = 0x01040312 => Version 01.04.03.12 => v1.4.3.12

| Date (*dd.mm.yyyy*) | Version | Comment |
|:-------------------:|:-------:|:--------|
| 01.04.2023 | 1.8.3.1 | :sparkles: add full `NA4` and `NAPOT` support to the (now) RISC-V-compatible **physical memory protection (PMP)**; [#566](https://github.com/stnolting/neorv32/pull/566) |
| 31.03.2023 | [**:rocket:1.8.3**](https://github.com/stnolting/neorv32/releases/tag/v1.8.3) | **New release** |
| 29.03.2023 | 1.8.2.9 | :warning: remove `CPU_EXTENSION_RISCV_Zicsr` generic - `Zicsr` ISA extension is always enabled; optimize bus switch; VHDL code cleanups; [#562](https://github.com/stnolting/neorv32/pull/562) |
| 25.03.2023 | 1.8.2.8 | :test_tube: add configurable data cache (**dCACHE**); [#560](https://github.com/stnolting/neorv32/pull/560) |
13 changes: 4 additions & 9 deletions docs/datasheet/cpu.adoc
@@ -35,11 +35,6 @@ instruction exception (-> <<_full_virtualization>>).

**Incompatibility Issues and Limitations**

.Physical Memory Protection (PMP)
[WARNING]
The RISC-V-compatible NEORV32 <<_machine_physical_memory_protection_csrs>> only implements the **TOR**
(top of region) mode and only up to 16 PMP regions.

.No Hardware Support of Misaligned Memory Accesses
[IMPORTANT]
The CPU does not support resolving unaligned memory access by the hardware (this is not a
@@ -561,14 +556,14 @@
to the RISC-V Privileged Architecture Specifications. In general, the PMP can **grant permissions to U-mode**,
which by default has none, and can **revoke permissions from M-mode**, which by default has full permissions.
The PMP is configured via the <<_machine_physical_memory_protection_csrs>>.

[IMPORTANT]
The NEORV32 PMP only supports **TOR** (top of region) mode, which basically is a "base-and-bound" concept, and only
up to 16 PMP regions.

.PMP Rules when in Debug Mode
[NOTE]
When in debug mode, all PMP rules are ignored, giving the debugger maximum access rights.

[IMPORTANT]
Instruction fetches are still issued even if they are denied by a PMP rule. However, the fetched
instruction(s) will not be executed and will not change CPU core state, so memory access protection is preserved.
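
.NAPOT Region Encoding
[TIP]
====
For illustration (hypothetical values, not part of this diff): a NAPOT entry encodes base and size in a single `pmpaddr` CSR. The CSR holds the byte address shifted right by two; each trailing `1` bit doubles the region size, starting at 8 bytes for a single trailing `0`.

 pmpaddr = 0x20000007            -- (0x80000000 >> 2) or "0111"
 => 3 trailing '1' bits          -- region size = 2^(3+3) = 64 bytes
 => 64-byte region at 0x80000000
====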


==== `Sdext` ISA Extension

169 changes: 83 additions & 86 deletions docs/datasheet/cpu_csr.adoc

Large diffs are not rendered by default.
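
Although the CSR documentation diff is not rendered, the new address-matching modes it documents follow the standard RISC-V `pmpcfg` "A" field encoding. A reference sketch (the `na4`/`napot` constant names below are illustrative; only `pmp_mode_off_c` and `pmp_mode_tor_c` appear in the RTL diff):

  -- standard RISC-V pmpcfg(4 downto 3) address-matching mode ("A") encodings --
  constant pmp_mode_off_c   : std_ulogic_vector(1 downto 0) := "00"; -- null region (entry disabled)
  constant pmp_mode_tor_c   : std_ulogic_vector(1 downto 0) := "01"; -- top of region
  constant pmp_mode_na4_c   : std_ulogic_vector(1 downto 0) := "10"; -- naturally aligned four-byte region
  constant pmp_mode_napot_c : std_ulogic_vector(1 downto 0) := "11"; -- naturally aligned power-of-two region (>= 8 bytes)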

204 changes: 104 additions & 100 deletions rtl/core/neorv32_cpu_bus.vhd
@@ -92,8 +92,9 @@ architecture neorv32_cpu_bus_rtl of neorv32_cpu_bus is
constant pmp_cfg_ah_c : natural := 4; -- mode bit high
constant pmp_cfg_l_c : natural := 7; -- locked entry

-- PMP minimal granularity --
constant pmp_lsb_c : natural := index_size_f(PMP_MIN_GRANULARITY); -- min = 2
-- PMP helpers --
constant pmp_lsb_c : natural := index_size_f(PMP_MIN_GRANULARITY); -- min = 2
constant pmp_zero_c : std_ulogic_vector(XLEN-1 downto pmp_lsb_c) := (others => '0');
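-- e.g. PMP_MIN_GRANULARITY = 4 (bytes) => pmp_lsb_c = 2 => byte-offset bits 1 downto 0 never take part in address matching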

-- misc --
signal data_sign : std_ulogic; -- signed load
@@ -110,21 +111,28 @@ architecture neorv32_cpu_bus_rtl of neorv32_cpu_bus is
signal arbiter : bus_arbiter_t;

-- physical memory protection --
type pmp_mask_t is array (0 to PMP_NUM_REGIONS-1) of std_ulogic_vector(XLEN-1 downto pmp_lsb_c);
type pmp_t is record
i_cmp_mm : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
i_cmp_ge : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
i_cmp_lt : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
d_cmp_mm : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
d_cmp_ge : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
d_cmp_lt : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
i_match : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
d_match : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
perm_ex : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
perm_rd : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
perm_wr : std_ulogic_vector(PMP_NUM_REGIONS-1 downto 0);
fail_ex : std_ulogic_vector(PMP_NUM_REGIONS downto 0);
fail_rd : std_ulogic_vector(PMP_NUM_REGIONS downto 0);
fail_wr : std_ulogic_vector(PMP_NUM_REGIONS downto 0);
if_fault : std_ulogic;
ld_fault : std_ulogic;
st_fault : std_ulogic;
end record;
signal pmp : pmp_t;
signal pmp_mask : pmp_mask_t;
signal pmp : pmp_t;

-- pmp faults --
signal if_pmp_fault : std_ulogic; -- pmp instruction access fault
@@ -380,8 +388,7 @@ begin
arbiter.pmp_r_err <= '0';
arbiter.pmp_w_err <= '0';
elsif rising_edge(clk_i) then
arbiter.pmp_r_err <= ld_pmp_fault;
arbiter.pmp_w_err <= st_pmp_fault;
-- arbiter --
if (arbiter.pend = '0') then -- idle
if (ctrl_i.bus_req = '1') then -- start bus access
arbiter.pend <= '1';
@@ -399,6 +406,11 @@
arbiter.pend <= '0';
end if;
end if;
-- PMP error --
if (ctrl_i.bus_mo_we = '1') then -- sample PMP errors only once
arbiter.pmp_r_err <= ld_pmp_fault;
arbiter.pmp_w_err <= st_pmp_fault;
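-- (sampling once at the start of the access keeps the fault flags stable while it is pending)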
end if;
end if;
end process data_access_arbiter;

@@ -421,111 +433,103 @@
-- RISC-V Physical Memory Protection (PMP) ------------------------------------------------
-- -------------------------------------------------------------------------------------------

-- check address --
pmp_check_address: process(fetch_pc_i, addr_i, pmp_addr_i)
begin
for r in 0 to PMP_NUM_REGIONS-1 loop
if (r = 0) then -- first entry: use ZERO as base and current entry as bound
pmp.i_cmp_ge(r) <= '1'; -- address is always greater than or equal to zero
pmp.i_cmp_lt(r) <= '0'; -- unused
pmp.d_cmp_ge(r) <= '1'; -- address is always greater than or equal to zero
pmp.d_cmp_lt(r) <= '0'; -- unused
else -- use previous entry as base and current entry as bound
pmp.i_cmp_ge(r) <= bool_to_ulogic_f(unsigned(fetch_pc_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(pmp_addr_i(r-1)(XLEN-1 downto pmp_lsb_c)));
pmp.i_cmp_lt(r) <= bool_to_ulogic_f(unsigned(fetch_pc_i(XLEN-1 downto pmp_lsb_c)) < unsigned(pmp_addr_i(r-0)(XLEN-1 downto pmp_lsb_c)));
pmp.d_cmp_ge(r) <= bool_to_ulogic_f(unsigned( addr_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(pmp_addr_i(r-1)(XLEN-1 downto pmp_lsb_c)));
pmp.d_cmp_lt(r) <= bool_to_ulogic_f(unsigned( addr_i(XLEN-1 downto pmp_lsb_c)) < unsigned(pmp_addr_i(r-0)(XLEN-1 downto pmp_lsb_c)));
end if;
end loop; -- r
end process pmp_check_address;


-- check mode --
pmp_check_mode: process(pmp_ctrl_i, pmp)
begin
for r in 0 to PMP_NUM_REGIONS-1 loop
if (pmp_ctrl_i(r)(pmp_cfg_ah_c downto pmp_cfg_al_c) = pmp_mode_tor_c) then -- TOR mode
if (r < (PMP_NUM_REGIONS-1)) then
-- this saves a LOT of comparators --
pmp.i_match(r) <= pmp.i_cmp_ge(r) and (not pmp.i_cmp_ge(r+1));
pmp.d_match(r) <= pmp.d_cmp_ge(r) and (not pmp.d_cmp_ge(r+1));
else -- very last entry
pmp.i_match(r) <= pmp.i_cmp_ge(r) and pmp.i_cmp_lt(r);
pmp.d_match(r) <= pmp.d_cmp_ge(r) and pmp.d_cmp_lt(r);
-- compute address masks for NAPOT modes (iterative!) --
pmp_masking_gen:
for r in 0 to PMP_NUM_REGIONS-1 generate
pmp_masking: process(rstn_i, clk_i)
begin
if (rstn_i = '0') then
pmp_mask(r) <= (others => '0');
elsif rising_edge(clk_i) then -- address mask computation has a latency of max 32 cycles
if (pmp_ctrl_i(r)(pmp_cfg_al_c) = '1') then -- NAPOT (or TOR, but that's irrelevant here)
pmp_mask(r)(pmp_lsb_c) <= '0';
for i in pmp_lsb_c+1 to XLEN-1 loop
pmp_mask(r)(i) <= pmp_mask(r)(i-1) or (not pmp_addr_i(r)(i-1)); -- skip address byte offset
end loop; -- i
else -- NA4
pmp_mask(r) <= (others => '1');
end if;
else -- entry disabled
pmp.i_match(r) <= '0';
pmp.d_match(r) <= '0';
end if;
end loop; -- r
end process pmp_check_mode;
end process pmp_masking;
end generate;
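-- Worked example (hypothetical values): pmpaddr CSR = 0x20000007 encodes a 64-byte NAPOT region
-- at 0x80000000; pmp_addr_i(r) then carries the byte address 0x8000001C (CSR shifted left by 2)
-- and the loop above clears mask bits 5 downto 2, excluding address bits 5 downto 0 from matching.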


-- check permission --
pmp_check_permission: process(ctrl_i, pmp_ctrl_i)
begin
for r in 0 to PMP_NUM_REGIONS-1 loop
-- check address --
pmp_check_address:
for r in 0 to PMP_NUM_REGIONS-1 generate
-- NA4 and NAPOT --
pmp.i_cmp_mm(r) <= '1' when ((fetch_pc_i(XLEN-1 downto pmp_lsb_c) and pmp_mask(r)) = (pmp_addr_i(r)(XLEN-1 downto pmp_lsb_c) and pmp_mask(r))) else '0';
pmp.d_cmp_mm(r) <= '1' when (( addr_i(XLEN-1 downto pmp_lsb_c) and pmp_mask(r)) = (pmp_addr_i(r)(XLEN-1 downto pmp_lsb_c) and pmp_mask(r))) else '0';
-- TOR region 0 --
pmp_check_address_r0:
if (r = 0) generate -- first entry: use ZERO as base and current entry as bound
pmp.i_cmp_ge(r) <= '1'; -- address is always greater than or equal to zero
pmp.i_cmp_lt(r) <= '0'; -- unused
pmp.d_cmp_ge(r) <= '1'; -- address is always greater than or equal to zero
pmp.d_cmp_lt(r) <= '0'; -- unused
end generate;
-- TOR region any --
pmp_check_address_rany:
if (r > 0) generate -- use previous entry as base and current entry as bound
pmp.i_cmp_ge(r) <= '1' when (unsigned(fetch_pc_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(pmp_addr_i(r-1)(XLEN-1 downto pmp_lsb_c))) else '0';
pmp.i_cmp_lt(r) <= '1' when (unsigned(fetch_pc_i(XLEN-1 downto pmp_lsb_c)) < unsigned(pmp_addr_i(r )(XLEN-1 downto pmp_lsb_c))) else '0';
pmp.d_cmp_ge(r) <= '1' when (unsigned( addr_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(pmp_addr_i(r-1)(XLEN-1 downto pmp_lsb_c))) else '0';
pmp.d_cmp_lt(r) <= '1' when (unsigned( addr_i(XLEN-1 downto pmp_lsb_c)) < unsigned(pmp_addr_i(r )(XLEN-1 downto pmp_lsb_c))) else '0';
end generate;
end generate;
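-- Continuing the example above: a data access to 0x8000003C gives d_cmp_mm = '1' (both sides of
-- the masked compare equal 0x80000000), while an access to 0x80000040 gives d_cmp_mm = '0'.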

-- instruction fetch access --
if (ctrl_i.cpu_priv = priv_mode_m_c) then -- M mode: always allow if lock bit not set, otherwise check permission
pmp.perm_ex(r) <= (not pmp_ctrl_i(r)(pmp_cfg_l_c)) or pmp_ctrl_i(r)(pmp_cfg_x_c);
else -- U mode: always check permission
pmp.perm_ex(r) <= pmp_ctrl_i(r)(pmp_cfg_x_c);
end if;

-- load/store accesses from M mode (can also use U mode's permissions if MSTATUS.MPRV is set) --
if (ctrl_i.bus_priv = priv_mode_m_c) then -- M mode: always allow if lock bit not set, otherwise check permission
pmp.perm_rd(r) <= (not pmp_ctrl_i(r)(pmp_cfg_l_c)) or pmp_ctrl_i(r)(pmp_cfg_r_c);
pmp.perm_wr(r) <= (not pmp_ctrl_i(r)(pmp_cfg_l_c)) or pmp_ctrl_i(r)(pmp_cfg_w_c);
else -- U mode: always check permission
pmp.perm_rd(r) <= pmp_ctrl_i(r)(pmp_cfg_r_c);
pmp.perm_wr(r) <= pmp_ctrl_i(r)(pmp_cfg_w_c);
end if;
-- check mode --
pmp_check_mode_gen:
for r in 0 to PMP_NUM_REGIONS-1 generate
pmp_check_mode: process(pmp_ctrl_i, pmp)
begin
case pmp_ctrl_i(r)(pmp_cfg_ah_c downto pmp_cfg_al_c) is
when pmp_mode_off_c => -- entry disabled
pmp.i_match(r) <= '0';
pmp.d_match(r) <= '0';
when pmp_mode_tor_c => -- top of region
if (r = (PMP_NUM_REGIONS-1)) then -- very last entry
pmp.i_match(r) <= pmp.i_cmp_ge(r) and pmp.i_cmp_lt(r);
pmp.d_match(r) <= pmp.d_cmp_ge(r) and pmp.d_cmp_lt(r);
else -- this saves a LOT of comparators
pmp.i_match(r) <= pmp.i_cmp_ge(r) and (not pmp.i_cmp_ge(r+1));
pmp.d_match(r) <= pmp.d_cmp_ge(r) and (not pmp.d_cmp_ge(r+1));
end if;
when others => -- naturally-aligned region
pmp.i_match(r) <= pmp.i_cmp_mm(r);
pmp.d_match(r) <= pmp.d_cmp_mm(r);
end case;
end process pmp_check_mode;
end generate;
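-- Note: for TOR, "address < pmp_addr_i(r)" is the complement of "address >= pmp_addr_i(r)",
-- which the next region already computes as its greater-or-equal check (i_cmp_ge(r+1) compares
-- against pmp_addr_i(r)); hence only the very last region needs a dedicated less-than comparator.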

end loop; -- r
end process pmp_check_permission;

-- check permission --
-- M mode: always allow if lock bit not set, otherwise check permission
pmp_check_permission:
for r in 0 to PMP_NUM_REGIONS-1 generate
pmp.perm_ex(r) <= pmp_ctrl_i(r)(pmp_cfg_x_c) or (not pmp_ctrl_i(r)(pmp_cfg_l_c)) when (ctrl_i.cpu_priv = priv_mode_m_c) else pmp_ctrl_i(r)(pmp_cfg_x_c);
pmp.perm_rd(r) <= pmp_ctrl_i(r)(pmp_cfg_r_c) or (not pmp_ctrl_i(r)(pmp_cfg_l_c)) when (ctrl_i.bus_priv = priv_mode_m_c) else pmp_ctrl_i(r)(pmp_cfg_r_c);
pmp.perm_wr(r) <= pmp_ctrl_i(r)(pmp_cfg_w_c) or (not pmp_ctrl_i(r)(pmp_cfg_l_c)) when (ctrl_i.bus_priv = priv_mode_m_c) else pmp_ctrl_i(r)(pmp_cfg_w_c);
end generate;
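-- e.g. an unlocked entry (L = '0') never restricts M-mode accesses, while a locked entry
-- (L = '1') enforces its R/W/X permission bits for M-mode as well.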


-- check for access fault (using static prioritization) --
pmp_check_fault: process(ctrl_i, pmp)
variable tmp_if_v, tmp_ld_v, tmp_st_v : std_ulogic_vector(PMP_NUM_REGIONS downto 0);
begin
-- > This is a *structural* description of a prioritization logic (a multiplexer chain).
-- > I prefer this style as I do not like using a loop with 'exit' - and I also think this style might be smaller
-- > and faster (could use the carry chain?!) as the synthesizer has less freedom doing what *I* want. ;)
tmp_if_v(PMP_NUM_REGIONS) := bool_to_ulogic_f(ctrl_i.cpu_priv /= priv_mode_m_c); -- default: fault if U mode
tmp_ld_v(PMP_NUM_REGIONS) := bool_to_ulogic_f(ctrl_i.bus_priv /= priv_mode_m_c); -- default: fault if U mode
tmp_st_v(PMP_NUM_REGIONS) := bool_to_ulogic_f(ctrl_i.bus_priv /= priv_mode_m_c); -- default: fault if U mode

for r in PMP_NUM_REGIONS-1 downto 0 loop -- start with lowest priority
-- instruction fetch access --
if (pmp.i_match(r) = '1') then -- address matches region r
tmp_if_v(r) := not pmp.perm_ex(r); -- fault if no execute permission
else
tmp_if_v(r) := tmp_if_v(r+1);
end if;
-- data load/store access --
if (pmp.d_match(r) = '1') then -- address matches region r
tmp_ld_v(r) := not pmp.perm_rd(r); -- fault if no read permission
tmp_st_v(r) := not pmp.perm_wr(r); -- fault if no write permission
else
tmp_ld_v(r) := tmp_ld_v(r+1);
tmp_st_v(r) := tmp_st_v(r+1);
end if;
end loop; -- r
pmp.if_fault <= tmp_if_v(0);
pmp.ld_fault <= tmp_ld_v(0);
pmp.st_fault <= tmp_st_v(0);

-- > this is the behavioral version of the code above (instruction fetch access)
-- pmp.if_fault <= bool_to_ulogic_f(ctrl_i.cpu_priv /= priv_mode_m_c); -- default: fault if U mode
-- for r in 0 to PMP_NUM_REGIONS-1 loop
-- if (pmp.i_match(r) = '1') then
-- pmp.if_fault <= not pmp.perm_ex(r); -- fault if no execute permission
-- exit;
-- end if;
-- end loop; -- r
end process pmp_check_fault;
-- default: fault if not M-mode --
pmp.fail_ex(PMP_NUM_REGIONS) <= '1' when (ctrl_i.cpu_priv /= priv_mode_m_c) else '0';
pmp.fail_rd(PMP_NUM_REGIONS) <= '1' when (ctrl_i.bus_priv /= priv_mode_m_c) else '0';
pmp.fail_wr(PMP_NUM_REGIONS) <= '1' when (ctrl_i.bus_priv /= priv_mode_m_c) else '0';
-- This is a *structural* description of a prioritization logic implemented as a multiplexer chain. --
pmp_check_fault:
for r in PMP_NUM_REGIONS-1 downto 0 generate -- start with lowest priority
pmp.fail_ex(r) <= not pmp.perm_ex(r) when (pmp.i_match(r) = '1') else pmp.fail_ex(r+1);
pmp.fail_rd(r) <= not pmp.perm_rd(r) when (pmp.d_match(r) = '1') else pmp.fail_rd(r+1);
pmp.fail_wr(r) <= not pmp.perm_wr(r) when (pmp.d_match(r) = '1') else pmp.fail_wr(r+1);
end generate;
pmp.if_fault <= pmp.fail_ex(0);
pmp.ld_fault <= pmp.fail_rd(0);
pmp.st_fault <= pmp.fail_wr(0);
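-- e.g. if regions 0 and 3 both match an address, entry 0 (lowest index = highest priority) decides;
-- if no region matches at all, the default entry (index PMP_NUM_REGIONS) faults for non-M-mode accesses.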


-- final PMP access fault signals (ignored when in debug mode) --
if_pmp_fault <= '1' when (pmp.if_fault = '1') and (PMP_NUM_REGIONS > 0) and (ctrl_i.cpu_debug = '0') else '0';