6c6495bb43
RASID must not use 0 for any slot. According to the documentation: """The operation of the processor is undefined if any two of the four ASIDs are equal or if it contains an ASID of zero""" Signed-off-by: Flavio Ceolin <flavio.ceolin@intel.com>
195 lines
6.9 KiB
C
195 lines
6.9 KiB
C
/*
 * Copyright 2023 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
|
|
#include <stdint.h>
|
|
#include <stdbool.h>
|
|
#include <zephyr/kernel.h>
|
|
#include <xtensa/config/core-isa.h>
|
|
#include <xtensa_mmu_priv.h>
|
|
#include <zephyr/cache.h>
|
|
|
|
#ifdef CONFIG_USERSPACE
/* With userspace enabled, the privileged stack size must be non-zero and
 * a whole multiple of the MMU page size.
 */
BUILD_ASSERT((CONFIG_PRIVILEGED_STACK_SIZE > 0) &&
	     (CONFIG_PRIVILEGED_STACK_SIZE % CONFIG_MMU_PAGE_SIZE) == 0);
#endif
|
|
|
|
/* ASID value 0: xtensa_init_paging() passes this to compute_regs() as the
 * user ASID when installing the first page table.
 */
#define ASID_INVALID 0
|
|
|
|
/* Register and pinned data-TLB values that together select one page
 * table.  Filled in by compute_regs() and written to the hardware by the
 * asm blocks in xtensa_set_paging() and xtensa_init_paging().
 */
struct tlb_regs {
	/* Value for the RASID special register (one ASID per ring). */
	uint32_t rasid;
	/* Value for the PTEVADDR special register. */
	uint32_t ptevaddr;
	/* Pinned page-table entry: second ("as") operand of wdtlb. */
	uint32_t ptepin_as;
	/* Pinned page-table entry: first ("at") operand of wdtlb. */
	uint32_t ptepin_at;
	/* Pinned vector-page entry: second ("as") operand of wdtlb. */
	uint32_t vecpin_as;
	/* Pinned vector-page entry: first ("at") operand of wdtlb. */
	uint32_t vecpin_at;
};
|
|
|
|
static void compute_regs(uint32_t user_asid, uint32_t *l1_page, struct tlb_regs *regs)
|
|
{
|
|
uint32_t vecbase = XTENSA_RSR("VECBASE");
|
|
|
|
__ASSERT_NO_MSG((((uint32_t)l1_page) & 0xfff) == 0);
|
|
__ASSERT_NO_MSG((user_asid == 0) || ((user_asid > 2) &&
|
|
(user_asid < XTENSA_MMU_SHARED_ASID)));
|
|
|
|
/* We don't use ring 1, ring 0 ASID must be 1 */
|
|
regs->rasid = (XTENSA_MMU_SHARED_ASID << 24) |
|
|
(user_asid << 16) | 0x000201;
|
|
|
|
/* Derive PTEVADDR from ASID so each domain gets its own PTE area */
|
|
regs->ptevaddr = CONFIG_XTENSA_MMU_PTEVADDR + user_asid * 0x400000;
|
|
|
|
/* The ptables code doesn't add the mapping for the l1 page itself */
|
|
l1_page[XTENSA_MMU_L1_POS(regs->ptevaddr)] =
|
|
(uint32_t)l1_page | XTENSA_MMU_PAGE_TABLE_ATTR;
|
|
|
|
regs->ptepin_at = (uint32_t)l1_page;
|
|
regs->ptepin_as = XTENSA_MMU_PTE_ENTRY_VADDR(regs->ptevaddr, regs->ptevaddr)
|
|
| XTENSA_MMU_PTE_WAY;
|
|
|
|
/* Pin mapping for refilling the vector address into the ITLB
|
|
* (for handling TLB miss exceptions). Note: this is NOT an
|
|
* instruction TLB entry for the vector code itself, it's a
|
|
* DATA TLB entry for the page containing the vector mapping
|
|
* so the refill on instruction fetch can find it. The
|
|
* hardware doesn't have a 4k pinnable instruction TLB way,
|
|
* frustratingly.
|
|
*/
|
|
uint32_t vb_pte = l1_page[XTENSA_MMU_L1_POS(vecbase)];
|
|
|
|
regs->vecpin_at = vb_pte;
|
|
regs->vecpin_as = XTENSA_MMU_PTE_ENTRY_VADDR(regs->ptevaddr, vecbase)
|
|
| XTENSA_MMU_VECBASE_WAY;
|
|
}
|
|
|
|
/* Switch to a new page table. There are four items we have to set in
|
|
* the hardware: the PTE virtual address, the ring/ASID mapping
|
|
* register, and two pinned entries in the data TLB handling refills
|
|
* for the page tables and the vector handlers.
|
|
*
|
|
* These can be done in any order, provided that we ensure that no
|
|
* memory access which cause a TLB miss can happen during the process.
|
|
* This means that we must work entirely within registers in a single
|
|
* asm block. Also note that instruction fetches are memory accesses
|
|
* too, which means we cannot cross a page boundary which might reach
|
|
* a new page not in the TLB (a single jump to an aligned address that
|
|
* holds our five instructions is sufficient to guarantee that: I
|
|
* couldn't think of a way to do the alignment statically that also
|
|
* interoperated well with inline assembly).
|
|
*/
|
|
void xtensa_set_paging(uint32_t user_asid, uint32_t *l1_page)
|
|
{
|
|
/* Optimization note: the registers computed here are pure
|
|
* functions of the two arguments. With a minor API tweak,
|
|
* they could be cached in e.g. a thread struct instead of
|
|
* being recomputed. This is called on context switch paths
|
|
* and is performance-sensitive.
|
|
*/
|
|
struct tlb_regs regs;
|
|
|
|
compute_regs(user_asid, l1_page, ®s);
|
|
|
|
__asm__ volatile("j 1f\n"
|
|
".align 16\n" /* enough for 5 insns */
|
|
"1:\n"
|
|
"wsr %0, PTEVADDR\n"
|
|
"wsr %1, RASID\n"
|
|
"wdtlb %2, %3\n"
|
|
"wdtlb %4, %5\n"
|
|
"isync"
|
|
:: "r"(regs.ptevaddr), "r"(regs.rasid),
|
|
"r"(regs.ptepin_at), "r"(regs.ptepin_as),
|
|
"r"(regs.vecpin_at), "r"(regs.vecpin_as));
|
|
}
|
|
|
|
/* This is effectively the same algorithm from xtensa_set_paging(),
|
|
* but it also disables the hardware-initialized 512M TLB entries in
|
|
* way 6 (because the hardware disallows duplicate TLB mappings). For
|
|
* instruction fetches this produces a critical ordering constraint:
|
|
* the instruction following the invalidation of ITLB entry mapping
|
|
* the current PC will by definition create a refill condition, which
|
|
* will (because the data TLB was invalidated) cause a refill
|
|
* exception. Therefore this step must be the very last one, once
|
|
* everything else is setup up and working, which includes the
|
|
* invalidation of the virtual PTEVADDR area so that the resulting
|
|
* refill can complete.
|
|
*
|
|
* Note that we can't guarantee that the compiler won't insert a data
|
|
* fetch from our stack memory after exit from the asm block (while it
|
|
* might be double-mapped), so we invalidate that data TLB inside the
|
|
* asm for correctness. The other 13 entries get invalidated in a C
|
|
* loop at the end.
|
|
*/
|
|
void xtensa_init_paging(uint32_t *l1_page)
|
|
{
|
|
extern char z_xt_init_pc; /* defined in asm below */
|
|
struct tlb_regs regs;
|
|
unsigned int initial_rasid;
|
|
|
|
/* The initial rasid after hardware initialization is 0x04030201.
|
|
* 1 is hardwired to ring 0, other slots must be different
|
|
* from each other and must not be 0.
|
|
*
|
|
* For our initial implementation we just set the 4th slot (ring 3),
|
|
* to use the ASID value used for memory that is shared with all threads.
|
|
*/
|
|
initial_rasid = 0xff030201;
|
|
|
|
#if CONFIG_MP_MAX_NUM_CPUS > 1
|
|
/* The incoherent cache can get into terrible trouble if it's
|
|
* allowed to cache PTEs differently across CPUs. We require
|
|
* that all page tables supplied by the OS have exclusively
|
|
* uncached mappings for page data, but can't do anything
|
|
* about earlier code/firmware. Dump the cache to be safe.
|
|
*/
|
|
sys_cache_data_flush_and_invd_all();
|
|
#endif
|
|
|
|
compute_regs(ASID_INVALID, l1_page, ®s);
|
|
|
|
uint32_t idtlb_pte = (regs.ptevaddr & 0xe0000000) | XCHAL_SPANNING_WAY;
|
|
uint32_t idtlb_stk = (((uint32_t)®s) & ~0xfff) | XCHAL_SPANNING_WAY;
|
|
uint32_t iitlb_pc = (((uint32_t)&z_xt_init_pc) & ~0xfff) | XCHAL_SPANNING_WAY;
|
|
|
|
/* Note: the jump is mostly pedantry, as it's almost
|
|
* inconceivable that a hardware memory region at boot is
|
|
* going to cross a 512M page boundary. But we need the entry
|
|
* symbol to get the address above, so the jump is here for
|
|
* symmetry with the set_paging() code.
|
|
*/
|
|
__asm__ volatile("j z_xt_init_pc\n"
|
|
".align 32\n" /* room for 10 insns */
|
|
".globl z_xt_init_pc\n"
|
|
"z_xt_init_pc:\n"
|
|
"wsr %0, PTEVADDR\n"
|
|
"wsr %1, RASID\n"
|
|
"wdtlb %2, %3\n"
|
|
"wdtlb %4, %5\n"
|
|
"idtlb %6\n" /* invalidate pte */
|
|
"idtlb %7\n" /* invalidate stk */
|
|
"isync\n"
|
|
"iitlb %8\n" /* invalidate pc */
|
|
"isync\n" /* <--- traps a ITLB miss */
|
|
:: "r"(regs.ptevaddr), "r"(initial_rasid),
|
|
"r"(regs.ptepin_at), "r"(regs.ptepin_as),
|
|
"r"(regs.vecpin_at), "r"(regs.vecpin_as),
|
|
"r"(idtlb_pte), "r"(idtlb_stk), "r"(iitlb_pc));
|
|
|
|
/* Invalidate the remaining (unused by this function)
|
|
* initialization entries. Now we're flying free with our own
|
|
* page table.
|
|
*/
|
|
for (uint32_t i = 0; i < 8; i++) {
|
|
uint32_t ixtlb = (i * 0x20000000) | XCHAL_SPANNING_WAY;
|
|
|
|
if (ixtlb != iitlb_pc) {
|
|
__asm__ volatile("iitlb %0" :: "r"(ixtlb));
|
|
}
|
|
if (ixtlb != idtlb_stk && ixtlb != idtlb_pte) {
|
|
__asm__ volatile("idtlb %0" :: "r"(ixtlb));
|
|
}
|
|
}
|
|
__asm__ volatile("isync");
|
|
}
|