From 385eb5c1820d8ab503c334bc1d7f512226f619a0 Mon Sep 17 00:00:00 2001 From: stnolting Date: Tue, 26 Nov 2024 21:28:11 +0100 Subject: [PATCH 1/6] [pmp] use time-multiplex for I/D access checks reducing hardware costs by ~50% --- rtl/core/neorv32_cpu_pmp.vhd | 287 +++++++++++++++++------------------ 1 file changed, 136 insertions(+), 151 deletions(-) diff --git a/rtl/core/neorv32_cpu_pmp.vhd b/rtl/core/neorv32_cpu_pmp.vhd index 9e941b044..daf22ce85 100644 --- a/rtl/core/neorv32_cpu_pmp.vhd +++ b/rtl/core/neorv32_cpu_pmp.vhd @@ -3,6 +3,8 @@ -- -------------------------------------------------------------------------------- -- -- Compatible to the RISC-V PMP privilege architecture specifications. Granularity -- -- and supported modes can be constrained via generics to reduce area requirements. -- +-- This PMP module uses a "time multiplex" architecture to check instruction fetch -- +-- and load/store requests in a serial way to minimize area requirements. -- -- -------------------------------------------------------------------------------- -- -- The NEORV32 RISC-V Processor - https://github.com/stnolting/neorv32 -- -- Copyright (c) NEORV32 contributors. 
-- @@ -38,16 +40,15 @@ entity neorv32_cpu_pmp is -- address input -- addr_if_i : in std_ulogic_vector(XLEN-1 downto 0); -- instruction fetch address addr_ls_i : in std_ulogic_vector(XLEN-1 downto 0); -- load/store address - -- faults -- - fault_ex_o : out std_ulogic; -- instruction fetch fault - fault_rw_o : out std_ulogic -- read/write access fault + -- access error -- + fault_o : out std_ulogic -- permission violation ); end neorv32_cpu_pmp; architecture neorv32_cpu_pmp_rtl of neorv32_cpu_pmp is -- auto-configuration -- - constant granularity_c : natural := cond_sel_natural_f(boolean(GRANULARITY < 4), 4, 2**index_size_f(GRANULARITY)); + constant g_c : natural := cond_sel_natural_f(boolean(GRANULARITY < 4), 4, 2**index_size_f(GRANULARITY)); -- configuration register bits -- constant cfg_r_c : natural := 0; -- read permit @@ -55,124 +56,132 @@ architecture neorv32_cpu_pmp_rtl of neorv32_cpu_pmp is constant cfg_x_c : natural := 2; -- execute permit constant cfg_al_c : natural := 3; -- mode bit low constant cfg_ah_c : natural := 4; -- mode bit high - constant cfg_rl_c : natural := 5; -- reserved - constant cfg_rh_c : natural := 6; -- reserved constant cfg_l_c : natural := 7; -- locked entry -- operation modes -- constant mode_off_c : std_ulogic_vector(1 downto 0) := "00"; -- null region (disabled) constant mode_tor_c : std_ulogic_vector(1 downto 0) := "01"; -- top of range constant mode_na4_c : std_ulogic_vector(1 downto 0) := "10"; -- naturally aligned four-byte region - constant mode_napot_c : std_ulogic_vector(1 downto 0) := "11"; -- naturally aligned power-of-two region (>= 8 bytes) + constant mode_napot_c : std_ulogic_vector(1 downto 0) := "11"; -- naturally aligned power-of-two region (> 4 bytes) -- address LSB according to granularity -- - constant pmp_lsb_c : natural := index_size_f(granularity_c); -- min = 2 + constant pmp_lsb_c : natural := index_size_f(g_c); -- min = 2 - -- CSRs -- - type csr_cfg_t is array (0 to NUM_REGIONS-1) of std_ulogic_vector(7 
downto 0); - type csr_addr_t is array (0 to NUM_REGIONS-1) of std_ulogic_vector(XLEN-1 downto 0); + -- configuration CSRs -- + type pmpcfg_t is array (0 to NUM_REGIONS-1) of std_ulogic_vector(7 downto 0); + signal pmpcfg : pmpcfg_t; + signal pmpcfg_we : std_ulogic_vector(3 downto 0); + + -- address CSRs -- + type pmpaddr_t is array (0 to NUM_REGIONS-1) of std_ulogic_vector(XLEN-1 downto 0); + signal pmpaddr : pmpaddr_t; + signal pmpaddr_we : std_ulogic_vector(15 downto 0); + + -- CSR read-back -- type csr_cfg_rd_t is array (0 to 15) of std_ulogic_vector(7 downto 0); type csr_cfg_rd32_t is array (0 to 03) of std_ulogic_vector(XLEN-1 downto 0); type csr_addr_rd_t is array (0 to 15) of std_ulogic_vector(XLEN-1 downto 0); - type csr_t is record - we_cfg : std_ulogic_vector(3 downto 0); - we_addr : std_ulogic_vector(15 downto 0); - cfg : csr_cfg_t; - addr : csr_addr_t; - end record; - signal csr : csr_t; signal cfg_rd : csr_cfg_rd_t; signal cfg_rd32 : csr_cfg_rd32_t; signal addr_rd : csr_addr_rd_t; - -- extended address (34-bit) -- - type xaddr_t is array (0 to NUM_REGIONS-1) of std_ulogic_vector(XLEN+1 downto 0); - signal xaddr : xaddr_t; + -- CPU access -- + signal acc_addr : std_ulogic_vector(XLEN-1 downto 0); + signal acc_priv : std_ulogic; - -- region access logic -- + -- address mask (NA4/NAPOT) -- type addr_mask_t is array (0 to NUM_REGIONS-1) of std_ulogic_vector(XLEN-1 downto pmp_lsb_c); signal addr_mask_napot, addr_mask : addr_mask_t; - type region_t is record - i_cmp_mm, d_cmp_mm : std_ulogic_vector(NUM_REGIONS-1 downto 0); -- masked match - i_cmp_ge, d_cmp_ge : std_ulogic_vector(NUM_REGIONS-1 downto 0); -- greater or equal - i_cmp_lt, d_cmp_lt : std_ulogic_vector(NUM_REGIONS-1 downto 0); -- less than - i_match, d_match : std_ulogic_vector(NUM_REGIONS-1 downto 0); -- region address match - perm_ex, perm_rw : std_ulogic_vector(NUM_REGIONS-1 downto 0); -- region's permission - end record; - signal region : region_t; - - -- permission check violation -- - signal
fail_ex, fail_rw : std_ulogic_vector(NUM_REGIONS downto 0); + + -- comparators -- + signal cmp_na, cmp_ge, cmp_lt : std_ulogic_vector(NUM_REGIONS-1 downto 0); + + -- region access logic -- + signal match : std_ulogic_vector(NUM_REGIONS-1 downto 0); -- region address match + signal allow : std_ulogic_vector(NUM_REGIONS-1 downto 0); -- access allowed (permission OK) + signal fail : std_ulogic_vector(NUM_REGIONS downto 0); -- access failed (prioritized) begin -- Sanity Checks -------------------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - assert (GRANULARITY = granularity_c) report + assert (GRANULARITY = g_c) report "[NEORV32] Auto-adjusting invalid PMP granularity configuration." severity warning; - -- CSR Write Access ----------------------------------------------------------------------- + -- CSR Write Access: Configuration (PMPCFG) ----------------------------------------------- -- ------------------------------------------------------------------------------------------- - csr_we: process(csr_we_i, csr_addr_i) -- write enable decoder + csr_we_cfg: process(csr_we_i, csr_addr_i) -- write enable decoder begin - -- Configuration registers -- - csr.we_cfg <= (others => '0'); + pmpcfg_we <= (others => '0'); if (csr_addr_i(11 downto 2) = csr_pmpcfg0_c(11 downto 2)) and (csr_we_i = '1') then - csr.we_cfg(to_integer(unsigned(csr_addr_i(1 downto 0)))) <= '1'; - end if; - -- Address registers -- - csr.we_addr <= (others => '0'); - if (csr_addr_i(11 downto 4) = csr_pmpaddr0_c(11 downto 4)) and (csr_we_i = '1') then - csr.we_addr(to_integer(unsigned(csr_addr_i(3 downto 0)))) <= '1'; + pmpcfg_we(to_integer(unsigned(csr_addr_i(1 downto 0)))) <= '1'; end if; - end process csr_we; + end process csr_we_cfg; - -- PMP CSR registers -- - csr_reg_gen: + -- CSRs -- + csr_pmpcfg_gen: for i in 0 to NUM_REGIONS-1 generate - csr_reg: process(rstn_i, clk_i) + csr_pmpcfg: 
process(rstn_i, clk_i) variable mode_v : std_ulogic_vector(1 downto 0); begin if (rstn_i = '0') then - csr.cfg(i) <= (others => '0'); - csr.addr(i) <= (others => '0'); + pmpcfg(i) <= (others => '0'); elsif rising_edge(clk_i) then - - -- configuration -- - if (csr.we_cfg(i/4) = '1') and (csr.cfg(i)(cfg_l_c) = '0') then -- unlocked write access - csr.cfg(i)(cfg_r_c) <= csr_wdata_i((i mod 4)*8+cfg_r_c); -- R (read) - csr.cfg(i)(cfg_w_c) <= csr_wdata_i((i mod 4)*8+cfg_w_c); -- W (write) - csr.cfg(i)(cfg_x_c) <= csr_wdata_i((i mod 4)*8+cfg_x_c); -- X (execute) - -- A (mode) -- + if (pmpcfg_we(i/4) = '1') and (pmpcfg(i)(cfg_l_c) = '0') then -- unlocked write access + -- permissions -- + pmpcfg(i)(cfg_r_c) <= csr_wdata_i((i mod 4)*8+cfg_r_c); -- R (read) + pmpcfg(i)(cfg_w_c) <= csr_wdata_i((i mod 4)*8+cfg_w_c); -- W (write) + pmpcfg(i)(cfg_x_c) <= csr_wdata_i((i mod 4)*8+cfg_x_c); -- X (execute) + -- mode -- mode_v := csr_wdata_i((i mod 4)*8+cfg_ah_c downto (i mod 4)*8+cfg_al_c); if ((mode_v = mode_tor_c) and (not TOR_EN)) or -- TOR mode not implemented ((mode_v = mode_na4_c) and (not NAP_EN)) or -- NA4 mode not implemented ((mode_v = mode_napot_c) and (not NAP_EN)) or -- NAPOT mode not implemented - ((mode_v = mode_na4_c) and (granularity_c > 4)) then -- NA4 not available - csr.cfg(i)(cfg_ah_c downto cfg_al_c) <= mode_off_c; + ((mode_v = mode_na4_c) and (g_c > 4)) then -- NA4 not available + pmpcfg(i)(cfg_ah_c downto cfg_al_c) <= mode_off_c; else -- valid configuration - csr.cfg(i)(cfg_ah_c downto cfg_al_c) <= mode_v; + pmpcfg(i)(cfg_ah_c downto cfg_al_c) <= mode_v; end if; - -- - csr.cfg(i)(cfg_rl_c) <= '0'; -- reserved - csr.cfg(i)(cfg_rh_c) <= '0'; -- reserved - csr.cfg(i)(cfg_l_c) <= csr_wdata_i((i mod 4)*8+cfg_l_c); -- L (locked) + -- reserved -- + pmpcfg(i)(6 downto 5) <= (others => '0'); + -- locked -- + pmpcfg(i)(cfg_l_c) <= csr_wdata_i((i mod 4)*8+cfg_l_c); end if; + end if; + end process csr_pmpcfg; + end generate; + + + -- CSR Write Access: Address (PMPADDR) 
---------------------------------------------------- + -- ------------------------------------------------------------------------------------------- + csr_we_addr: process(csr_we_i, csr_addr_i) -- write enable decoder + begin + pmpaddr_we <= (others => '0'); + if (csr_addr_i(11 downto 4) = csr_pmpaddr0_c(11 downto 4)) and (csr_we_i = '1') then + pmpaddr_we(to_integer(unsigned(csr_addr_i(3 downto 0)))) <= '1'; + end if; + end process csr_we_addr; - -- address -- - if (csr.we_addr(i) = '1') and (csr.cfg(i)(cfg_l_c) = '0') then -- unlocked write access + -- CSRs -- + csr_pmpaddr_gen: + for i in 0 to NUM_REGIONS-1 generate + csr_pmpaddr: process(rstn_i, clk_i) + begin + if (rstn_i = '0') then + pmpaddr(i) <= (others => '0'); + elsif rising_edge(clk_i) then + if (pmpaddr_we(i) = '1') and (pmpcfg(i)(cfg_l_c) = '0') then -- unlocked write access if (i < NUM_REGIONS-1) then - if (csr.cfg(i+1)(cfg_l_c) = '0') or (csr.cfg(i+1)(cfg_ah_c downto cfg_al_c) /= mode_tor_c) then -- cfg(i+1) not "LOCKED TOR" - csr.addr(i) <= "00" & csr_wdata_i(XLEN-3 downto 0); + if (pmpcfg(i+1)(cfg_l_c) = '0') or (pmpcfg(i+1)(cfg_ah_c downto cfg_al_c) /= mode_tor_c) then -- pmpcfg(i+1) not "LOCKED TOR" + pmpaddr(i) <= "00" & csr_wdata_i(XLEN-3 downto 0); end if; else -- very last entry - csr.addr(i) <= "00" & csr_wdata_i(XLEN-3 downto 0); + pmpaddr(i) <= "00" & csr_wdata_i(XLEN-3 downto 0); end if; end if; - end if; - end process csr_reg; + end process csr_pmpaddr; end generate; @@ -195,22 +204,22 @@ begin csr_read_back_gen: for i in 0 to NUM_REGIONS-1 generate -- configuration -- - cfg_rd(i) <= csr.cfg(i); + cfg_rd(i) <= pmpcfg(i); -- address -- - address_read_back: process(csr) + address_read_back: process(pmpaddr, pmpcfg) begin addr_rd(i) <= (others => '0'); - addr_rd(i)(XLEN-1 downto pmp_lsb_c-2) <= csr.addr(i)(XLEN-1 downto pmp_lsb_c-2); - if (granularity_c = 8) and TOR_EN then -- bit G-1 reads as zero in TOR or OFF mode - if (csr.cfg(i)(cfg_ah_c) = '0') then -- TOR/OFF mode + 
addr_rd(i)(XLEN-3 downto pmp_lsb_c-2) <= pmpaddr(i)(XLEN-3 downto pmp_lsb_c-2); + if (g_c = 8) and TOR_EN then -- bit G-1 reads as zero in TOR or OFF mode + if (pmpcfg(i)(cfg_ah_c) = '0') then -- TOR/OFF mode addr_rd(i)(pmp_lsb_c) <= '0'; end if; - elsif (granularity_c > 8) then + elsif (g_c > 8) then if NAP_EN then addr_rd(i)(pmp_lsb_c-2 downto 0) <= (others => '1'); -- in NAPOT mode bits G-2:0 must read as one end if; if TOR_EN then - if (csr.cfg(i)(cfg_ah_c) = '0') then -- TOR/OFF mode + if (pmpcfg(i)(cfg_ah_c) = '0') then -- TOR/OFF mode addr_rd(i)(pmp_lsb_c-1 downto 0) <= (others => '0'); -- in TOR or OFF mode bits G-1:0 must read as zero end if; end if; @@ -232,14 +241,16 @@ begin end generate; - -- Region Access Logic -------------------------------------------------------------------- + -- Region Access and Permission Check Logic ----------------------------------------------- -- ------------------------------------------------------------------------------------------- + + -- access switch (check I/D in time-multiplex) -- + acc_addr <= addr_if_i when (ctrl_i.lsu_mo_we = '0') else addr_ls_i; + acc_priv <= ctrl_i.cpu_priv when (ctrl_i.lsu_mo_we = '0') else ctrl_i.lsu_priv; + region_gen: for r in 0 to NUM_REGIONS-1 generate - -- extend region addresses to 34-bit -- - xaddr(r) <= csr.addr(r) & "00"; -- mask byte offset - -- naturally-aligned address mask -- nap_mode_enable: if NAP_EN generate @@ -248,7 +259,7 @@ begin addr_mask_napot(r)(pmp_lsb_c) <= '0'; addr_mask_napot_gen: for i in pmp_lsb_c+1 to XLEN-1 generate - addr_mask_napot(r)(i) <= addr_mask_napot(r)(i-1) or (not xaddr(r)(i-1)); + addr_mask_napot(r)(i) <= addr_mask_napot(r)(i-1) or (not pmpaddr(r)(i-3)); end generate; -- address mask select -- @@ -257,7 +268,7 @@ begin if (rstn_i = '0') then addr_mask(r) <= (others => '0'); elsif rising_edge(clk_i) then - if (csr.cfg(r)(cfg_al_c) = '1') then -- NAPOT + if (pmpcfg(r)(cfg_al_c) = '1') then -- NAPOT addr_mask(r) <= addr_mask_napot(r); else -- NA4 
addr_mask(r) <= (others => '1'); @@ -270,82 +281,61 @@ begin -- check region address match -- -- NA4 and NAPOT -- - region.i_cmp_mm(r) <= '1' when ((addr_if_i(XLEN-1 downto pmp_lsb_c) and addr_mask(r)) = (xaddr(r)(XLEN-1 downto pmp_lsb_c) and addr_mask(r))) and NAP_EN else '0'; - region.d_cmp_mm(r) <= '1' when ((addr_ls_i(XLEN-1 downto pmp_lsb_c) and addr_mask(r)) = (xaddr(r)(XLEN-1 downto pmp_lsb_c) and addr_mask(r))) and NAP_EN else '0'; + cmp_na(r) <= '1' when ((acc_addr(XLEN-1 downto pmp_lsb_c) and addr_mask(r)) = (pmpaddr(r)(XLEN-3 downto pmp_lsb_c-2) and addr_mask(r))) and NAP_EN else '0'; -- TOR region 0 -- addr_match_r0_gen: if (r = 0) generate -- first entry: use ZERO as base and current entry as bound - region.i_cmp_ge(r) <= '1' when TOR_EN else '0'; -- address is always greater than or equal to zero (and TOR mode enabled) - region.i_cmp_lt(r) <= '0'; -- unused - region.d_cmp_ge(r) <= '1' when TOR_EN else '0'; -- address is always greater than or equal to zero (and TOR mode enabled) - region.d_cmp_lt(r) <= '0'; -- unused + cmp_ge(r) <= '1' when TOR_EN else '0'; -- address is always greater than or equal to zero (and TOR mode enabled) + cmp_lt(r) <= '0'; -- unused end generate; -- TOR region any -- - addr_match_rx_gen: + addr_match_rn_gen: if (r > 0) generate -- use previous entry as base and current entry as bound - region.i_cmp_ge(r) <= '1' when (unsigned(addr_if_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(xaddr(r-1)(XLEN-1 downto pmp_lsb_c))) and TOR_EN else '0'; - region.i_cmp_lt(r) <= '1' when (unsigned(addr_if_i(XLEN-1 downto pmp_lsb_c)) < unsigned(xaddr(r )(XLEN-1 downto pmp_lsb_c))) and TOR_EN else '0'; - region.d_cmp_ge(r) <= '1' when (unsigned(addr_ls_i(XLEN-1 downto pmp_lsb_c)) >= unsigned(xaddr(r-1)(XLEN-1 downto pmp_lsb_c))) and TOR_EN else '0'; - region.d_cmp_lt(r) <= '1' when (unsigned(addr_ls_i(XLEN-1 downto pmp_lsb_c)) < unsigned(xaddr(r )(XLEN-1 downto pmp_lsb_c))) and TOR_EN else '0'; + cmp_ge(r) <= '1' when (unsigned(acc_addr(XLEN-1 downto 
pmp_lsb_c)) >= unsigned(pmpaddr(r-1)(XLEN-3 downto pmp_lsb_c-2))) and TOR_EN else '0'; + cmp_lt(r) <= '1' when (unsigned(acc_addr(XLEN-1 downto pmp_lsb_c)) < unsigned(pmpaddr(r )(XLEN-3 downto pmp_lsb_c-2))) and TOR_EN else '0'; end generate; -- check region match according to configured mode -- - match_gen: process(csr, region) - variable tmp_v : std_ulogic_vector(1 downto 0); + match_gen: process(pmpcfg, cmp_ge, cmp_lt, cmp_na) begin - tmp_v := csr.cfg(r)(cfg_ah_c downto cfg_al_c); - case tmp_v is -- VHDL/GHDL issue: "object type is not locally static" - when mode_off_c => -- entry disabled - region.i_match(r) <= '0'; - region.d_match(r) <= '0'; - when mode_tor_c => -- top of region - if TOR_EN then -- TOR mode implemented? - if (r = (NUM_REGIONS-1)) then -- very last entry - region.i_match(r) <= region.i_cmp_ge(r) and region.i_cmp_lt(r); - region.d_match(r) <= region.d_cmp_ge(r) and region.d_cmp_lt(r); - else -- this saves a LOT of comparators - region.i_match(r) <= region.i_cmp_ge(r) and (not region.i_cmp_ge(r+1)); - region.d_match(r) <= region.d_cmp_ge(r) and (not region.d_cmp_ge(r+1)); - end if; - else - region.i_match(r) <= '0'; - region.d_match(r) <= '0'; - end if; - when others => -- naturally-aligned region - if NAP_EN then -- NAPOT/NA4 modes implemented? 
- region.i_match(r) <= region.i_cmp_mm(r); - region.d_match(r) <= region.d_cmp_mm(r); - else - region.i_match(r) <= '0'; - region.d_match(r) <= '0'; - end if; - end case; + if (pmpcfg(r)(cfg_ah_c downto cfg_al_c) = mode_tor_c) and TOR_EN then -- TOR + if (r = (NUM_REGIONS-1)) then -- very last region + match(r) <= cmp_ge(r) and cmp_lt(r); + else -- any other region + match(r) <= cmp_ge(r) and (not cmp_ge(r+1)); -- this saves a LOT of comparators + end if; + elsif (pmpcfg(r)(cfg_ah_c) = mode_napot_c(1)) and NAP_EN then -- NA4/NAPOT + match(r) <= cmp_na(r); + else -- OFF / mode not supported + match(r) <= '0'; + end if; end process match_gen; - -- compute region permissions -- - perm_gen: process(csr.cfg, ctrl_i) + -- select region permission -- + perm_gen: process(ctrl_i, acc_priv, pmpcfg) begin -- execute (X) -- - if (ctrl_i.cpu_priv = priv_mode_m_c) then - region.perm_ex(r) <= csr.cfg(r)(cfg_x_c) or (not csr.cfg(r)(cfg_l_c)); -- M mode: always allow if not locked - else - region.perm_ex(r) <= csr.cfg(r)(cfg_x_c); - end if; + if (ctrl_i.lsu_mo_we = '0') then + if (acc_priv = priv_mode_m_c) then + allow(r) <= pmpcfg(r)(cfg_x_c) or (not pmpcfg(r)(cfg_l_c)); -- M mode: always allow if not locked + else + allow(r) <= pmpcfg(r)(cfg_x_c); + end if; -- read (R) -- - if (ctrl_i.lsu_rw = '0') then - if (ctrl_i.lsu_priv = priv_mode_m_c) then - region.perm_rw(r) <= csr.cfg(r)(cfg_r_c) or (not csr.cfg(r)(cfg_l_c)); -- M mode: always allow if not locked + elsif (ctrl_i.lsu_rw = '0') then + if (acc_priv = priv_mode_m_c) then + allow(r) <= pmpcfg(r)(cfg_r_c) or (not pmpcfg(r)(cfg_l_c)); -- M mode: always allow if not locked else - region.perm_rw(r) <= csr.cfg(r)(cfg_r_c); + allow(r) <= pmpcfg(r)(cfg_r_c); end if; -- write (W) -- else - if (ctrl_i.lsu_priv = priv_mode_m_c) then - region.perm_rw(r) <= csr.cfg(r)(cfg_w_c) or (not csr.cfg(r)(cfg_l_c)); -- M mode: always allow if not locked + if (acc_priv = priv_mode_m_c) then + allow(r) <= pmpcfg(r)(cfg_w_c) or (not 
pmpcfg(r)(cfg_l_c)); -- M mode: always allow if not locked else - region.perm_rw(r) <= csr.cfg(r)(cfg_w_c); + allow(r) <= pmpcfg(r)(cfg_w_c); end if; end if; end process perm_gen; @@ -353,30 +343,25 @@ begin end generate; - -- Access Permission Check ---------------------------------------------------------------- + -- Access Permission Check (using static prioritization) ---------------------------------- -- ------------------------------------------------------------------------------------------- - -- check for access fault (using static prioritization) -- - fail_ex(NUM_REGIONS) <= '1' when (ctrl_i.cpu_priv /= priv_mode_m_c) else '0'; -- default (if not match): fault if not M-mode - fail_rw(NUM_REGIONS) <= '1' when (ctrl_i.lsu_priv /= priv_mode_m_c) else '0'; -- default (if not match): fault if not M-mode -- this is a *structural* description of a prioritization logic implemented as a multiplexer chain -- + fail(NUM_REGIONS) <= '1' when (acc_priv /= priv_mode_m_c) else '0'; -- default (if not match): fault if not M-mode fault_check_gen: for r in NUM_REGIONS-1 downto 0 generate -- start with lowest priority - fail_ex(r) <= not region.perm_ex(r) when (region.i_match(r) = '1') else fail_ex(r+1); - fail_rw(r) <= not region.perm_rw(r) when (region.d_match(r) = '1') else fail_rw(r+1); + fail(r) <= not allow(r) when (match(r) = '1') else fail(r+1); end generate; - -- final access check -- - access_check: process(rstn_i, clk_i) + -- output buffer -- + fault_check: process(rstn_i, clk_i) begin if (rstn_i = '0') then - fault_ex_o <= '0'; - fault_rw_o <= '0'; + fault_o <= '0'; elsif rising_edge(clk_i) then - fault_ex_o <= (not ctrl_i.cpu_debug) and fail_ex(0); -- ignore PMP rules when in debug mode - fault_rw_o <= (not ctrl_i.cpu_debug) and fail_rw(0); + fault_o <= (not ctrl_i.cpu_debug) and fail(0); -- ignore PMP rules when in debug-mode end if; - end process access_check; + end process fault_check; end neorv32_cpu_pmp_rtl; From 6822fecca74bb078b70783b140a02befc7bf0888 
Mon Sep 17 00:00:00 2001 From: stnolting Date: Tue, 26 Nov 2024 21:28:30 +0100 Subject: [PATCH 2/6] [cpu] signal renaming/cleanups --- rtl/core/neorv32_cpu.vhd | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/rtl/core/neorv32_cpu.vhd b/rtl/core/neorv32_cpu.vhd index a79295e9c..1a39fa370 100644 --- a/rtl/core/neorv32_cpu.vhd +++ b/rtl/core/neorv32_cpu.vhd @@ -121,11 +121,10 @@ architecture neorv32_cpu_rtl of neorv32_cpu is signal csr_rdata : std_ulogic_vector(XLEN-1 downto 0); -- csr read data signal lsu_mar : std_ulogic_vector(XLEN-1 downto 0); -- lsu memory address register signal lsu_err : std_ulogic_vector(3 downto 0); -- lsu alignment/access errors - signal pc_fetch : std_ulogic_vector(XLEN-1 downto 0); -- pc for instruction fetch signal pc_curr : std_ulogic_vector(XLEN-1 downto 0); -- current pc (for currently executed instruction) + signal pc_next : std_ulogic_vector(XLEN-1 downto 0); -- next PC (corresponding to next instruction) signal pc_ret : std_ulogic_vector(XLEN-1 downto 0); -- return address - signal pmp_ex_fault : std_ulogic; -- pmp instruction fetch fault - signal pmp_rw_fault : std_ulogic; -- pmp read/write access fault + signal pmp_fault : std_ulogic; -- pmp permission violation signal irq_machine : std_ulogic_vector(2 downto 0); -- risc-v standard machine-level interrupts begin @@ -234,17 +233,18 @@ begin rstn_i => rstn_i, -- global reset, low-active, async ctrl_o => ctrl, -- main control bus -- instruction fetch interface -- - ibus_pmperr_i => pmp_ex_fault, -- instruction fetch pmp fault ibus_req_o => ibus_req_o, -- request ibus_rsp_i => ibus_rsp_i, -- response + -- pmp fault -- + pmp_fault_i => pmp_fault, -- instruction fetch / execute pmp fault -- data path interface -- alu_cp_done_i => alu_cp_done, -- ALU iterative operation done alu_cmp_i => alu_cmp, -- comparator status alu_add_i => alu_add, -- ALU address result alu_imm_o => alu_imm, -- immediate rf_rs1_i => rs1, -- rf source 1 - pc_fetch_o => 
pc_fetch, -- instruction fetch address pc_curr_o => pc_curr, -- current PC (corresponding to current instruction) + pc_next_o => pc_next, -- next PC (corresponding to next instruction) pc_ret_o => pc_ret, -- return address csr_rdata_o => csr_rdata, -- CSR read data -- external CSR interface -- @@ -269,8 +269,8 @@ begin xcsr_rdata_res <= xcsr_rdata_pmp or xcsr_rdata_alu; -- CPU state -- - sleep_o <= ctrl.cpu_sleep; -- set when CPU is sleeping (after WFI) - debug_o <= ctrl.cpu_debug; -- set when CPU is in debug mode + sleep_o <= ctrl.cpu_sleep; + debug_o <= ctrl.cpu_debug; -- Register File -------------------------------------------------------------------------- @@ -365,7 +365,7 @@ begin mar_o => lsu_mar, -- memory address register wait_o => lsu_wait, -- wait for access to complete err_o => lsu_err, -- alignment/access errors - pmp_fault_i => pmp_rw_fault, -- PMP read/write access fault + pmp_fault_i => pmp_fault, -- PMP read/write access fault -- data bus -- dbus_req_o => dbus_req_o, -- request dbus_rsp_i => dbus_rsp_i -- response @@ -394,19 +394,17 @@ begin csr_wdata_i => xcsr_wdata, -- write data csr_rdata_o => xcsr_rdata_pmp, -- read data -- address input -- - addr_if_i => pc_fetch, -- instruction fetch address + addr_if_i => pc_next, -- instruction fetch address addr_ls_i => alu_add, -- load/store address - -- faults -- - fault_ex_o => pmp_ex_fault, -- instruction fetch fault - fault_rw_o => pmp_rw_fault -- read/write access fault + -- access error -- + fault_o => pmp_fault -- permission violation ); end generate; pmp_inst_false: if not RISCV_ISA_Smpmp generate xcsr_rdata_pmp <= (others => '0'); - pmp_ex_fault <= '0'; - pmp_rw_fault <= '0'; + pmp_fault <= '0'; end generate; From 3e0d6261b098f5506fad8f575783bc82ec2a2e71 Mon Sep 17 00:00:00 2001 From: stnolting Date: Tue, 26 Nov 2024 21:28:55 +0100 Subject: [PATCH 3/6] [control] move inst. 
PMP check to execution engine --- rtl/core/neorv32_cpu_control.vhd | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/rtl/core/neorv32_cpu_control.vhd b/rtl/core/neorv32_cpu_control.vhd index 19812bd0b..4b53ef337 100644 --- a/rtl/core/neorv32_cpu_control.vhd +++ b/rtl/core/neorv32_cpu_control.vhd @@ -79,17 +79,18 @@ entity neorv32_cpu_control is rstn_i : in std_ulogic; -- global reset, low-active, async ctrl_o : out ctrl_bus_t; -- main control bus -- instruction fetch interface -- - ibus_pmperr_i : in std_ulogic; -- instruction fetch pmp fault ibus_req_o : out bus_req_t; -- request ibus_rsp_i : in bus_rsp_t; -- response + -- pmp fault -- + pmp_fault_i : in std_ulogic; -- instruction fetch / execute pmp fault -- data path interface -- alu_cp_done_i : in std_ulogic; -- ALU iterative operation done alu_cmp_i : in std_ulogic_vector(1 downto 0); -- comparator status alu_add_i : in std_ulogic_vector(XLEN-1 downto 0); -- ALU address result alu_imm_o : out std_ulogic_vector(XLEN-1 downto 0); -- immediate rf_rs1_i : in std_ulogic_vector(XLEN-1 downto 0); -- rf source 1 - pc_fetch_o : out std_ulogic_vector(XLEN-1 downto 0); -- instruction fetch address pc_curr_o : out std_ulogic_vector(XLEN-1 downto 0); -- current PC (corresponding to current instruction) + pc_next_o : out std_ulogic_vector(XLEN-1 downto 0); -- next PC (corresponding to next instruction) pc_ret_o : out std_ulogic_vector(XLEN-1 downto 0); -- return address csr_rdata_o : out std_ulogic_vector(XLEN-1 downto 0); -- CSR read data -- external CSR interface -- @@ -350,7 +351,6 @@ begin -- PC output for instruction fetch -- ibus_req_o.addr <= fetch_engine.pc(XLEN-1 downto 2) & "00"; -- word aligned - pc_fetch_o <= fetch_engine.pc(XLEN-1 downto 2) & "00"; -- word aligned -- instruction fetch (read) request if IPB not full -- ibus_req_o.stb <= '1' when (fetch_engine.state = IF_REQUEST) and (ipb.free = "11") else '0'; @@ -359,8 +359,8 @@ begin fetch_engine.resp <= ibus_rsp_i.ack or 
ibus_rsp_i.err; -- IPB instruction data and status -- - ipb.wdata(0) <= (ibus_rsp_i.err or ibus_pmperr_i) & ibus_rsp_i.data(15 downto 0); - ipb.wdata(1) <= (ibus_rsp_i.err or ibus_pmperr_i) & ibus_rsp_i.data(31 downto 16); + ipb.wdata(0) <= ibus_rsp_i.err & ibus_rsp_i.data(15 downto 0); + ipb.wdata(1) <= ibus_rsp_i.err & ibus_rsp_i.data(31 downto 16); -- IPB write enable -- ipb.we(0) <= '1' when (fetch_engine.state = IF_PENDING) and (fetch_engine.resp = '1') and @@ -564,6 +564,7 @@ begin -- PC output -- pc_curr_o <= exe_engine.pc(XLEN-1 downto 1) & '0'; -- address of current instruction + pc_next_o <= exe_engine.pc2(XLEN-1 downto 1) & '0'; -- address of next instruction pc_ret_o <= exe_engine.ra(XLEN-1 downto 1) & '0'; -- return address -- simplified rv32 opcode -- @@ -572,7 +573,8 @@ begin -- Execute Engine FSM Comb ---------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - execute_engine_fsm_comb: process(exe_engine, debug_ctrl, trap_ctrl, hw_trigger_match, opcode, issue_engine, csr, alu_cp_done_i, lsu_wait_i, alu_add_i, branch_taken) + execute_engine_fsm_comb: process(exe_engine, debug_ctrl, trap_ctrl, hw_trigger_match, opcode, issue_engine, csr, + alu_cp_done_i, lsu_wait_i, alu_add_i, branch_taken, pmp_fault_i) variable funct3_v : std_ulogic_vector(2 downto 0); variable funct7_v : std_ulogic_vector(6 downto 0); begin @@ -688,11 +690,11 @@ begin exe_engine_nxt.state <= BRANCHED; -- delay cycle to restart front-end when EXECUTE => -- decode and execute instruction (control will be here for exactly 1 cycle in any case) - -- [NOTE] register file is read in this stage; due to the sync read, data will be available in the _next_ state -- ------------------------------------------------------------ exe_engine_nxt.pc2 <= alu_add_i(XLEN-1 downto 1) & '0'; -- next PC (= PC + immediate) + trap_ctrl.instr_be <= pmp_fault_i; -- did this instruction cause a PMP-execute 
violation? - -- decode instruction class/type -- + -- decode instruction class/type; [NOTE] register file is read in THIS stage; due to the sync read data will be available in the NEXT state -- case opcode is -- register/immediate ALU operation -- From 25e72cc742e8a525c43084a7c04079de64e562c5 Mon Sep 17 00:00:00 2001 From: stnolting Date: Sat, 30 Nov 2024 07:49:40 +0100 Subject: [PATCH 4/6] [docs] pmp: minor updates/fixes --- docs/datasheet/cpu.adoc | 48 +++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/docs/datasheet/cpu.adoc b/docs/datasheet/cpu.adoc index 8a7df9475..76d4c338d 100644 --- a/docs/datasheet/cpu.adoc +++ b/docs/datasheet/cpu.adoc @@ -1044,28 +1044,40 @@ does not complete operation within this time window. ==== `Smpmp` ISA Extension -The NEORV32 physical memory protection (PMP) provides an elementary memory -protection mechanism that can be used to constrain read, write and execute rights of arbitrary memory regions. -The NEORV32 PMP is fully compatible to the RISC-V Privileged Architecture Specifications. In general, the PMP can -**grant permissions to user mode**, which by default has none, and can **revoke permissions from M-mode**, which -by default has full permissions. The PMP is configured via the <<_machine_physical_memory_protection_csrs>>. - -Several <<_processor_top_entity_generics>> are provided to fine-tune the CPU's PMP capabilities: - -* `PMP_NUM_REGIONS` defines the number of implemented PMP region -* `PMP_MIN_GRANULARITY` defines the minimal granularity of each region -* `PMP_TOR_MODE_EN` controls the implementation of the top-of-region (TOR) mode -* `PMP_NAP_MODE_EN` controls the implementation of the naturally-aligned-power-of-two (NA4 and NAPOT) modes +The NEORV32 physical memory protection (PMP) provides an elementary memory protection mechanism that can be used +to configure read/write/execute permissions of arbitrary memory regions.
In general, the PMP can **grant permissions +to user mode**, which by default has none, and can **revoke permissions from M-mode**, which by default has full +permissions. The NEORV32 PMP is fully compatible with the RISC-V Privileged Architecture Specifications and is +configured via several CSRs (<<_machine_physical_memory_protection_csrs>>). Several <<_processor_top_entity_generics>> +are provided to adjust the CPU's PMP capabilities according to the application requirements (pre-synthesis): + +. `PMP_NUM_REGIONS` defines the number of implemented PMP regions (0..16); setting this generic to zero will +result in absolutely no PMP logic being implemented +. `PMP_MIN_GRANULARITY` defines the minimal granularity of each region (has to be a power of 2, minimal +granularity = 4 bytes); note that a smaller granularity will lead to wider comparators and thus to a larger area footprint +and a longer critical path +. `PMP_TOR_MODE_EN` controls the implementation of the top-of-region (TOR) mode (default = true); disabling this mode +will reduce area footprint +. `PMP_NAP_MODE_EN` controls the implementation of the naturally-aligned-power-of-two (NA4 and NAPOT) modes (default = +true); disabling this mode will reduce area footprint and critical path length + +.PMP Permissions when in Debug Mode +[NOTE] +When in debug-mode all PMP rules are bypassed/ignored granting the debugger maximum access permissions. -.PMP Rules when in Debug Mode +.PMP Time-Multiplex [NOTE] -When in debug-mode all PMP rules are ignored making the debugger have maximum access rights. +Instructions are executed in a multi-cycle manner. Hence, data access (load/store) and instruction fetch cannot occur +at the same time. Therefore, the PMP hardware uses only a single set of comparators for memory access permission checks +that are switched in an iterative, time-multiplex style reducing hardware footprint by approx. 50% while maintaining +full security features and RISC-V compatibility.
-.Protected Instruction Fetches +.PMP Memory Accesses [IMPORTANT] -New instruction fetches are **always triggered even when denied** by a certain PMP rule. However, the fetched instruction(s) -will not be executed and will not change CPU core state. Instead, they will raise a bus exception when reaching the CPU's -executions stage. +Load/store accesses for which there are insufficient access permissions do not trigger any memory/bus accesses at all. +In contrast, instruction accesses for which there are insufficient access permissions nevertheless lead to a memory/bus +access (causing potential side effects on the memory side). However, the fetched instruction will be discarded and the +corresponding exception will still be triggered precisely. ==== `Sdext` ISA Extension From b037d5c968c2b676f69f8c2b8e494103d247d3e1 Mon Sep 17 00:00:00 2001 From: stnolting Date: Thu, 19 Dec 2024 20:13:27 +0100 Subject: [PATCH 5/6] [rtl] control: minor cleanups --- rtl/core/neorv32_cpu_control.vhd | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/rtl/core/neorv32_cpu_control.vhd b/rtl/core/neorv32_cpu_control.vhd index 4b53ef337..899abbcc8 100644 --- a/rtl/core/neorv32_cpu_control.vhd +++ b/rtl/core/neorv32_cpu_control.vhd @@ -251,7 +251,7 @@ architecture neorv32_cpu_control_rtl of neorv32_cpu_control is end record; signal csr : csr_t; - -- hpm event configuration CSRs -- + -- HPM event configuration CSRs -- type hpmevent_cfg_t is array (3 to 15) of std_ulogic_vector(hpmcnt_event_width_c-1 downto 0); type hpmevent_rd_t is array (3 to 15) of std_ulogic_vector(XLEN-1 downto 0); signal hpmevent_cfg : hpmevent_cfg_t; @@ -309,18 +309,11 @@ begin fetch_engine.pc <= (others => '0'); fetch_engine.priv <= '0'; elsif rising_edge(clk_i) then - -- restart request -- - if (fetch_engine.state = IF_RESTART) then -- restart done - fetch_engine.restart <= '0'; - else -- buffer request - fetch_engine.restart <= fetch_engine.restart or fetch_engine.reset; - 
end if; - - -- fsm -- case fetch_engine.state is when IF_REQUEST => -- request next 32-bit-aligned instruction word -- ------------------------------------------------------------ + fetch_engine.restart <= fetch_engine.restart or fetch_engine.reset; -- buffer restart request if (ipb.free = "11") then -- free IPB space? fetch_engine.state <= IF_PENDING; elsif (fetch_engine.restart = '1') or (fetch_engine.reset = '1') then -- restart because of branch @@ -329,6 +322,7 @@ begin when IF_PENDING => -- wait for bus response and write instruction data to prefetch buffer -- ------------------------------------------------------------ + fetch_engine.restart <= fetch_engine.restart or fetch_engine.reset; -- buffer restart request if (fetch_engine.resp = '1') then -- wait for bus response fetch_engine.pc <= std_ulogic_vector(unsigned(fetch_engine.pc) + 4); -- next word fetch_engine.pc(1) <= '0'; -- (re-)align to 32-bit @@ -341,9 +335,10 @@ begin when others => -- IF_RESTART: set new start address -- ------------------------------------------------------------ - fetch_engine.pc <= exe_engine.pc2(XLEN-1 downto 1) & '0'; -- initialize from PC incl. 16-bit-alignment bit - fetch_engine.priv <= csr.privilege_eff; -- set new privilege level - fetch_engine.state <= IF_REQUEST; + fetch_engine.restart <= '0'; -- restart done + fetch_engine.pc <= exe_engine.pc2(XLEN-1 downto 1) & '0'; -- initialize from PC incl. 
16-bit-alignment bit + fetch_engine.priv <= csr.privilege_eff; -- set new privilege level + fetch_engine.state <= IF_REQUEST; end case; end if; @@ -384,7 +379,7 @@ begin prefetch_buffer_inst: entity neorv32.neorv32_fifo generic map ( FIFO_DEPTH => 2, -- number of IPB entries; has to be a power of two, min 2 - FIFO_WIDTH => ipb.wdata(i)'length, -- size of data elements in fifo + FIFO_WIDTH => ipb.wdata(i)'length, -- size of data elements in FIFO FIFO_RSYNC => false, -- we NEED to read data asynchronously FIFO_SAFE => false, -- no safe access required (ensured by FIFO-external logic) FULL_RESET => true -- map to FFs and add a dedicated reset @@ -1966,7 +1961,7 @@ begin cnt_lo_rd(2) <= cnt.lo(2); -- instret cnt_hi_rd(2) <= cnt.hi(2); -- instreth end if; - -- hpm counters -- + -- HPM counters -- if RISCV_ISA_Zihpm and (hpm_num_c > 0) then for i in 3 to (hpm_num_c+3)-1 loop if (hpm_cnt_lo_width_c > 0) then -- constrain low word size @@ -2043,7 +2038,7 @@ begin cnt.inc(0) <= (others => (cnt_event(hpmcnt_event_cy_c) and (not csr.mcountinhibit(0)) and (not debug_ctrl.run))); cnt.inc(1) <= (others => '0'); -- time: not available cnt.inc(2) <= (others => (cnt_event(hpmcnt_event_ir_c) and (not csr.mcountinhibit(2)) and (not debug_ctrl.run))); - -- hpm counters -- + -- HPM counters -- for i in 3 to 15 loop cnt.inc(i) <= (others => (or_reduce_f(cnt_event and hpmevent_cfg(i)) and (not csr.mcountinhibit(i)) and (not debug_ctrl.run))); end loop; From 2e50b270d560a8b018242395803736dc373d1655 Mon Sep 17 00:00:00 2001 From: stnolting Date: Thu, 19 Dec 2024 20:18:05 +0100 Subject: [PATCH 6/6] [changelog] add v1.10.7.5 --- CHANGELOG.md | 1 + rtl/core/neorv32_package.vhd | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 850960ed2..9e7c8e400 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ mimpid = 0x01040312 -> Version 01.04.03.12 -> v1.4.3.12 | Date | Version | Comment | Ticket | |:----:|:-------:|:--------|:------:| +| 
19.12.2024 | 1.10.7.5 | :test_tube: use time-multiplex PMP architecture (reducing area footprint) | [#1105](https://github.com/stnolting/neorv32/pull/1105) | | 14.12.2024 | 1.10.7.4 | :sparkles: add new module: I2C-compatible **Two-Wire Device Controller (TWD)** | [#1121](https://github.com/stnolting/neorv32/pull/1121) | | 14.12.2024 | 1.10.7.3 | :warning: rework TRNG (change HAL; remove interrupt) | [#1120](https://github.com/stnolting/neorv32/pull/1120) | | 12.12.2024 | 1.10.7.2 | add external memory configuration/initialization options to testbench | [#1119](https://github.com/stnolting/neorv32/pull/1119) | diff --git a/rtl/core/neorv32_package.vhd b/rtl/core/neorv32_package.vhd index 970a16211..cdde6a5d8 100644 --- a/rtl/core/neorv32_package.vhd +++ b/rtl/core/neorv32_package.vhd @@ -29,7 +29,7 @@ package neorv32_package is -- Architecture Constants ----------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01100704"; -- hardware version + constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01100705"; -- hardware version constant archid_c : natural := 19; -- official RISC-V architecture ID constant XLEN : natural := 32; -- native data path width