xtensa: New asm layer to support SMP
SMP needs a new context switch primitive (to disentangle _swap() from the scheduler) and new interrupt entry behavior (to be able to take a global spinlock on behalf of legacy drivers). The existing code is very obtuse, and working with it led me down a long path of "this would be so much better if..." So this is a new context and entry framework, intended to replace the code that exists now, at least on SMP platforms.

New features:

* The new context switch primitive is xtensa_switch(), which takes a "new" context handle as an argument instead of getting it from the scheduler, returns an "old" context handle through a pointer (e.g. to save it to the old thread context), and restores the lock state (PS register) exactly as it was at entry instead of taking it as an argument.

* The register spill code understands wrap-around register windows and can avoid spilling A4-A15 registers when they are unused by the interrupted function, saving as much as 48 bytes of stack space on the interrupted stacks.

* The "spill register windows" routine is entirely different, using a different mechanism, and is MUCH FASTER (to the tune of almost 200 cycles). See notes in comments.

* Even better, interrupt entry can be done via a clever "cross stack call" I worked up, meaning that the interrupted thread's registers do not need to be spilled at all until they are naturally pushed out by the interrupt handler or until we return from the interrupt into a different thread. This is a big efficiency win for tiny interrupts (e.g. timers), and a big latency win for all interrupts.

* Interrupt entry is 100% symmetric with respect to medium/high interrupts, avoiding the problems seen when hooking high priority interrupts with the current code (e.g. ESP-32's watchdog driver).

* Much smaller code size. No cut-and-paste assembly. No use of HAL calls.

* Assumes the "XEA2" interrupt architecture, the register window extension (i.e. no CALL0 ABI), and the "high priority interrupts" extension. Does not support the legacy processor variants for which we have no targets. The old code has some stuff in there to support this, but it seems bitrotten, untestable, and I'm all but certain it doesn't work.

Note that this simply adds the primitives to the existing tree in a form where they can be unit tested. It does not replace the existing interrupt/exception handling or _Swap() implementation.

Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
parent 8dca7ae587
commit a34f884f23
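To make the new primitive's calling convention concrete, here is a hedged C sketch of how a scheduler-side caller might use it. This is illustrative only and not code from this commit: "struct thread", its "switch_handle" field, and "do_context_switch" are assumptions standing in for the real kernel data structures.

	/* Illustrative sketch only (not part of this commit). */
	struct thread {
		void *switch_handle;	/* where the "old" context handle lands */
		/* ... other thread fields ... */
	};

	/* Prototype as documented in xtensa-asm2-util.S below. */
	void xtensa_switch(void *new, void **old_return);

	static void do_context_switch(struct thread *old, struct thread *new)
	{
		/* Suspends the caller, saving its context handle through
		 * the second argument, and resumes the "new" context.  PS
		 * (and thus the lock state) is restored exactly as it was
		 * when the new context was suspended.
		 */
		xtensa_switch(new->switch_handle, &old->switch_handle);
	}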
@@ -11,6 +11,8 @@ zephyr_sources(
   xtensa_vectors.S
   xt_zephyr.S
   window_vectors.S
+  xtensa-asm2-util.S
+  xtensa-asm2.c
 )

 zephyr_sources_ifndef(CONFIG_ATOMIC_OPERATIONS_C atomic.S)
arch/xtensa/core/xtensa-asm2-util.S (new file)
@@ -0,0 +1,209 @@
/*
 * Copyright (c) 2017, Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#include <xtensa-asm2-s.h>

/*
 * xtensa_save_high_regs
 *
 * Call with CALL0, with A2/A3 available as scratch.  Pushes the high
 * A4-A15 GPRs to the stack if needed (i.e. if those registers are not
 * part of wrapped-around frames higher up the call stack), returning
 * to the caller with the stack pointer HAVING BEEN MODIFIED to
 * contain them.
 */
.global xtensa_save_high_regs
.align 4
xtensa_save_high_regs:
	/* Generate a rotated (modulo NREGS/4 bits!) WINDOWSTART in A2
	 * by duplicating the bits twice and shifting down by WINDOWBASE
	 * bits.  Now the LSB is the register quad at WINDOWBASE.
	 */
	rsr.WINDOWSTART a2
	slli a3, a2, (XCHAL_NUM_AREGS / 4)
	or a2, a2, a3
	rsr.WINDOWBASE a3
	ssr a3
	srl a2, a2

	mov a3, a1	/* Stash our original stack pointer */

	/* For the next three bits in WINDOWSTART (which correspond to
	 * the A4-A7, A8-A11 and A12-A15 quads), if we find a one,
	 * that means that the quad is owned by a wrapped-around call
	 * in the registers, so we don't need to spill it or any
	 * further registers from the GPRs and can skip to the end.
	 */
	bbsi a2, 1, _high_gpr_spill_done
	addi a1, a1, -16
	s32i a4, a1, 0
	s32i a5, a1, 4
	s32i a6, a1, 8
	s32i a7, a1, 12

	bbsi a2, 2, _high_gpr_spill_done
	addi a1, a1, -16
	s32i a8, a1, 0
	s32i a9, a1, 4
	s32i a10, a1, 8
	s32i a11, a1, 12

	bbsi a2, 3, _high_gpr_spill_done
	addi a1, a1, -16
	s32i a12, a1, 0
	s32i a13, a1, 4
	s32i a14, a1, 8
	s32i a15, a1, 12

_high_gpr_spill_done:
	/* Push the original stack pointer so we know at restore
	 * time how many registers were spilled, then return, leaving
	 * the modified SP in A1.
	 */
	addi a1, a1, -4
	s32i a3, a1, 0

	ret

/*
 * xtensa_restore_high_regs
 *
 * Does the inverse of xtensa_save_high_regs, taking a stack pointer
 * in A1 that resulted and restoring the A4-A15 state (and the stack
 * pointer) to the state they had at the earlier call.  Call with
 * CALL0, leaving A2/A3 available as scratch.
 */
.global xtensa_restore_high_regs
.align 4
xtensa_restore_high_regs:
	/* Pop our "original" stack pointer into A2, stash it in A3 too */
	l32i a2, a1, 0
	addi a1, a1, 4
	mov a3, a2

	beq a1, a2, _high_restore_done
	addi a2, a2, -16
	l32i a4, a2, 0
	l32i a5, a2, 4
	l32i a6, a2, 8
	l32i a7, a2, 12

	beq a1, a2, _high_restore_done
	addi a2, a2, -16
	l32i a8, a2, 0
	l32i a9, a2, 4
	l32i a10, a2, 8
	l32i a11, a2, 12

	beq a1, a2, _high_restore_done
	addi a2, a2, -16
	l32i a12, a2, 0
	l32i a13, a2, 4
	l32i a14, a2, 8
	l32i a15, a2, 12

_high_restore_done:
	mov a1, a3	/* Original stack */
	ret

/*
 * _restore_context
 *
 * Arrive here via a jump.  Enters into the restored context and does
 * not return.  A1 should have a context pointer in it as received
 * from switch or an interrupt exit.  Interrupts must be disabled,
 * and register windows should have been spilled.
 *
 * Note that exit from the restore is done with the RFI instruction,
 * using the EPCn/EPSn registers.  Those will have been saved already
 * by any interrupt entry so they are safe to use.  Note that EPC1 and
 * RFE are NOT usable (they can't preserve PS).  Per the ISA spec, all
 * RFI levels do the same thing and differ only in the special
 * registers used to hold PC/PS, but Qemu has been observed to behave
 * strangely when RFI doesn't "return" to an INTLEVEL strictly lower
 * than it started from.  So pick level 6 (the highest that works on
 * Qemu; hardware doesn't care, so it doesn't matter).  In theory we
 * should test to be able to support hardware with less than 6 levels,
 * though...
 */
.global _restore_context
_restore_context:
	call0 xtensa_restore_high_regs

	l32i a0, a1, BSA_PC_OFF
	wsr.EPC6 a0
	l32i a0, a1, BSA_PS_OFF
	wsr.EPS6 a0

	l32i a0, a1, BSA_SAR_OFF
	wsr.SAR a0
#if XCHAL_HAVE_LOOPS
	l32i a0, a1, BSA_LBEG_OFF
	wsr.LBEG a0
	l32i a0, a1, BSA_LEND_OFF
	wsr.LEND a0
	l32i a0, a1, BSA_LCOUNT_OFF
	wsr.LCOUNT a0
#endif
	rsync

	l32i a0, a1, BSA_A0_OFF
	l32i a2, a1, BSA_A2_OFF
	l32i a3, a1, BSA_A3_OFF
	addi a1, a1, BASE_SAVE_AREA_SIZE

	rfi 6

/*
 * void xtensa_switch(void *new, void **old_return);
 *
 * Context switches into the previously-saved "new" handle, placing
 * the saved "old" handle into the address provided by old_return.
 */
.global xtensa_switch
.align 4
xtensa_switch:
	entry a1, 16
	SPILL_ALL_WINDOWS
	addi a1, a1, -BASE_SAVE_AREA_SIZE

	/* Stash our A0/2/3 and the shift/loop registers into the base
	 * save area so they get restored as they are now.  A2/A3
	 * don't actually get used post-restore, but they need to be
	 * stashed across the xtensa_save_high_regs call and this is a
	 * convenient place.
	 */
	s32i a0, a1, BSA_A0_OFF
	s32i a2, a1, BSA_A2_OFF
	s32i a3, a1, BSA_A3_OFF
	ODD_REG_SAVE

	/* Stash our PS register contents and a "restore" PC. */
	rsr.PS a0
	s32i a0, a1, BSA_PS_OFF
	movi a0, _switch_restore_pc
	s32i a0, a1, BSA_PC_OFF

	/* Now the high registers */
	call0 xtensa_save_high_regs

	/* Restore the A3 argument we spilled earlier (via the base
	 * save pointer pushed at the bottom of the stack) and set the
	 * stack to the "new" context out of the A2 spill slot.
	 */
	l32i a2, a1, 0
	l32i a3, a2, BSA_A3_OFF
	s32i a1, a3, 0

	/* Switch stack pointer and restore.  The jump to
	 * _restore_context does not return as such, but we arrange
	 * for the restored "next" address to be immediately after for
	 * sanity.
	 */
	l32i a1, a2, BSA_A2_OFF
	j _restore_context
_switch_restore_pc:
	retw
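To make the WINDOWSTART rotation at the top of xtensa_save_high_regs easier to follow, here is a C model of the same bit manipulation. This is an illustrative sketch of the slli/or/ssr/srl sequence, not code from the commit; the function name is hypothetical.

	/* C model of the WINDOWSTART rotation (illustrative only).
	 * After rotation, bit 0 is the quad at WINDOWBASE and bits
	 * 1..3 correspond to the A4-A7, A8-A11 and A12-A15 quads; a
	 * set bit means that quad is owned by a wrapped-around caller
	 * frame and must not be spilled from the GPRs.
	 */
	#define NQUADS (XCHAL_NUM_AREGS / 4)	/* e.g. 16 on a 64-register core */

	static unsigned int rotated_windowstart(unsigned int windowstart,
						unsigned int windowbase)
	{
		/* Duplicate the NQUADS-bit field, then shift the doubled
		 * value down by WINDOWBASE bits: a funnel shift, matching
		 * the slli/or followed by ssr/srl in the assembly.
		 */
		unsigned long long doubled =
			((unsigned long long)windowstart << NQUADS) | windowstart;

		return (unsigned int)(doubled >> windowbase);
	}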
arch/xtensa/core/xtensa-asm2.c (new file)
@@ -0,0 +1,50 @@
/*
 * Copyright (c) 2017, Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#include <string.h>
#include <xtensa-asm2.h>
#include <kernel.h>
#include <kernel_structs.h>

void *xtensa_init_stack(int *stack_top,
			void (*entry)(void *, void *, void *),
			void *arg1, void *arg2, void *arg3)
{
	/* We cheat and shave 16 bytes off: the top four words are the
	 * A0-A3 spill area for the caller of the entry function,
	 * which doesn't exist.  It will never be touched, so we
	 * arrange to enter the function with a CALLINC of 1 and a
	 * stack pointer 16 bytes above the top, so its ENTRY at the
	 * start will decrement the stack pointer by 16.
	 */
	const int bsasz = BASE_SAVE_AREA_SIZE - 16;
	void **bsa = (void **) (((char *) stack_top) - bsasz);

	memset(bsa, 0, bsasz);

	bsa[BSA_PC_OFF/4] = entry;
	bsa[BSA_PS_OFF/4] = (void *)(PS_WOE | PS_UM | PS_CALLINC(1));

	/* Arguments.  Remember these start at A6, which will be
	 * rotated into A2 by the ENTRY instruction that begins the
	 * entry function.  And A4-A7 and A8-A11 are optional quads
	 * that live below the BSA!
	 */
	bsa[-1] = arg2;	/* a7 */
	bsa[-2] = arg1;	/* a6 */
	bsa[-3] = 0;	/* a5 */
	bsa[-4] = 0;	/* a4 */

	bsa[-5] = 0;	/* a11 */
	bsa[-6] = 0;	/* a10 */
	bsa[-7] = 0;	/* a9 */
	bsa[-8] = arg3;	/* a8 */

	/* Finally push the BSA pointer and return the stack pointer
	 * as the handle.
	 */
	bsa[-9] = bsa;
	return &bsa[-9];
}
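A hedged usage sketch for the above (not in this commit): building a restorable context on a fresh stack and switching into it. The stack array, entry function, and example() wrapper are illustrative assumptions.

	/* Hypothetical caller of xtensa_init_stack()/xtensa_switch(). */
	#define STACK_WORDS 1024
	static int my_stack[STACK_WORDS];

	static void thread_entry(void *a, void *b, void *c)
	{
		/* ... runs with EXCM clear and INTLEVEL zero ... */
	}

	void example(void)
	{
		void *new_handle, *old_handle;

		/* Build a context at the top of the stack; the returned
		 * handle is what xtensa_switch()/_restore_context consume.
		 */
		new_handle = xtensa_init_stack(&my_stack[STACK_WORDS],
					       thread_entry, 0, 0, 0);

		/* Suspend the current context into old_handle and resume
		 * the new one.
		 */
		xtensa_switch(new_handle, &old_handle);
	}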
arch/xtensa/include/xtensa-asm2-context.h (new file)
@@ -0,0 +1,82 @@
/*
 * Copyright (c) 2017, Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef _XTENSA_ASM2_CONTEXT_H
#define _XTENSA_ASM2_CONTEXT_H

#include <xtensa/corebits.h>
#include <xtensa/config/core-isa.h>

/*
 * Stack frame layout for a saved processor context, in memory order,
 * high to low address:
 *
 * SP-0  <-- Interrupted stack pointer points here
 *
 * SP-4   Caller A3 spill slot \
 * SP-8   Caller A2 spill slot |
 * SP-12  Caller A1 spill slot + (Part of ABI standard)
 * SP-16  Caller A0 spill slot /
 *
 * SP-20  Saved A3
 * SP-24  Saved A2
 * SP-28  Unused (not "Saved A1" because the SP is saved externally as a handle)
 * SP-32  Saved A0
 *
 * SP-36  Saved PC (address to jump to following restore)
 * SP-40  Saved/interrupted PS special register
 *
 * SP-44  Saved SAR special register
 *
 * SP-48  Saved LBEG special register (if loops enabled)
 * SP-52  Saved LEND special register (if loops enabled)
 * SP-56  Saved LCOUNT special register (if loops enabled)
 *
 * (The above fixed-size region is known as the "base save area" in
 * the code below.)
 *
 * - Saved A7  \
 * - Saved A6  |
 * - Saved A5  +- If not in use by another frame
 * - Saved A4  /
 *
 * - Saved A11 \
 * - Saved A10 |
 * - Saved A9  +- If not in use by another frame
 * - Saved A8  /
 *
 * - Saved A15 \
 * - Saved A14 |
 * - Saved A13 +- If not in use by another frame
 * - Saved A12 /
 *
 * - Saved intermediate stack pointer (points to low word of base save
 *   area, i.e. the saved LCOUNT or SAR).  The pointer to this value
 *   (i.e. the final stack pointer) is stored externally as the
 *   "restore handle" in the thread context.
 *
 * Essentially, you can recover a pointer to the BSA by loading *SP.
 * Adding the fixed BSA size to that gets you back to the
 * original/interrupted stack pointer.
 */

#if XCHAL_HAVE_LOOPS
#define BASE_SAVE_AREA_SIZE 56
#else
#define BASE_SAVE_AREA_SIZE 44
#endif

#define BSA_A3_OFF	(BASE_SAVE_AREA_SIZE - 20)
#define BSA_A2_OFF	(BASE_SAVE_AREA_SIZE - 24)
#define BSA_SCRATCH_OFF	(BASE_SAVE_AREA_SIZE - 28)
#define BSA_A0_OFF	(BASE_SAVE_AREA_SIZE - 32)
#define BSA_PC_OFF	(BASE_SAVE_AREA_SIZE - 36)
#define BSA_PS_OFF	(BASE_SAVE_AREA_SIZE - 40)
#define BSA_SAR_OFF	(BASE_SAVE_AREA_SIZE - 44)
#define BSA_LBEG_OFF	(BASE_SAVE_AREA_SIZE - 48)
#define BSA_LEND_OFF	(BASE_SAVE_AREA_SIZE - 52)
#define BSA_LCOUNT_OFF	(BASE_SAVE_AREA_SIZE - 56)

#endif /* _XTENSA_ASM2_CONTEXT_H */
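For readers who prefer a struct view, the fixed base save area above could be modeled in C as follows, lowest address first. This is a sketch only, derived from the BSA_*_OFF offsets; the real code addresses fields by offset, and no such struct exists in the commit.

	/* Struct model of the base save area (illustrative only).
	 * sizeof() comes out to 56 with loops, 44 without, matching
	 * BASE_SAVE_AREA_SIZE.
	 */
	struct base_save_area {
	#if XCHAL_HAVE_LOOPS
		unsigned int lcount;	/* BSA_LCOUNT_OFF == 0 */
		unsigned int lend;	/* BSA_LEND_OFF   == 4 */
		unsigned int lbeg;	/* BSA_LBEG_OFF   == 8 */
	#endif
		unsigned int sar;	/* BSA_SAR_OFF  */
		unsigned int ps;	/* BSA_PS_OFF   */
		unsigned int pc;	/* BSA_PC_OFF   */
		unsigned int a0;	/* BSA_A0_OFF   */
		unsigned int scratch;	/* BSA_SCRATCH_OFF (unused "A1" slot) */
		unsigned int a2;	/* BSA_A2_OFF   */
		unsigned int a3;	/* BSA_A3_OFF   */
		unsigned int caller[4];	/* ABI A0-A3 spill region for the caller */
	};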
arch/xtensa/include/xtensa-asm2-s.h (new file)
@@ -0,0 +1,351 @@
/*
 * Copyright (c) 2017, Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include "xtensa-asm2-context.h"

/* Assembler header!  This file contains macros designed to be
 * included only by the assembler.
 */

/*
 * SPILL_ALL_WINDOWS
 *
 * Spills all windowed registers (i.e. registers not visible as
 * A0-A15) to their ABI-defined spill regions on the stack.
 *
 * Unlike the Xtensa HAL implementation, this code requires that the
 * EXCM and WOE bits be enabled in PS, and relies on repeated hardware
 * exception handling to do the register spills.  The trick is to do a
 * noop write to the high registers, which the hardware will trap
 * (into an overflow exception) in the case where those registers are
 * already used by an existing call frame.  Then it rotates the window
 * and repeats until all but the A0-A3 registers of the original frame
 * are guaranteed to be spilled, eventually rotating back around into
 * the original frame.  Advantages:
 *
 * - Vastly smaller code size
 *
 * - More easily maintained if changes are needed to window over/underflow
 *   exception handling.
 *
 * - Requires no scratch registers to do its work, so can be used safely in any
 *   context.
 *
 * - If the WOE bit is not enabled (for example, in code written for
 *   the CALL0 ABI), this becomes a silent noop and operates compatibly.
 *
 * - In memory protection situations, this relies on the existing
 *   exception handlers (and thus their use of the L/S32E
 *   instructions) to execute stores in the protected space.  AFAICT,
 *   the HAL routine does not handle this situation and isn't safe: it
 *   will happily write through the "stack pointers" found in
 *   registers regardless of where they might point.
 *
 * - Hilariously, it's ACTUALLY FASTER than the HAL routine.  And not
 *   just a little bit, it's MUCH faster.  With a mostly full register
 *   file on an LX6 core (ESP-32) I'm measuring 145 cycles to spill
 *   registers with this vs. 279 (!) to do it with
 *   xthal_spill_windows().  Apparently Xtensa exception handling is
 *   really fast, and no one told their software people.
 *
 * Note that as with the Xtensa HAL spill routine, and unlike context
 * switching code on most sane architectures, the intermediate states
 * here will have an invalid stack pointer.  That means that this code
 * must not be preempted in any context (i.e. all Zephyr situations)
 * where the interrupt code will need to use the stack to save the
 * context.  But unlike the HAL, which runs with exceptions masked via
 * EXCM, this will not: it needs the overflow handlers unmasked.  Use
 * INTLEVEL instead (which, happily, is what Zephyr's locking does
 * anyway).
 */
.macro SPILL_ALL_WINDOWS
#if XCHAL_NUM_AREGS == 64
	and a12, a12, a12
	rotw 3
	and a12, a12, a12
	rotw 3
	and a12, a12, a12
	rotw 3
	and a12, a12, a12
	rotw 3
	and a12, a12, a12
	rotw 4
#elif XCHAL_NUM_AREGS == 32
	and a12, a12, a12
	rotw 3
	and a12, a12, a12
	rotw 3
	and a4, a4, a4
	rotw 2
#else
#error Unrecognized XCHAL_NUM_AREGS
#endif
.endm

/*
 * ODD_REG_SAVE
 *
 * Stashes the oddball shift/loop context registers in the base save
 * area pointed to by the current stack pointer.  On exit, A0 will
 * have been modified but A2/A3 have not, and the shift/loop
 * instructions can be used freely (though note loops don't work in
 * exceptions for other reasons!).
 *
 * Does not populate or modify the PS/PC save locations.
 */
.macro ODD_REG_SAVE
	rsr.SAR a0
	s32i a0, a1, BSA_SAR_OFF
#if XCHAL_HAVE_LOOPS
	rsr.LBEG a0
	s32i a0, a1, BSA_LBEG_OFF
	rsr.LEND a0
	s32i a0, a1, BSA_LEND_OFF
	rsr.LCOUNT a0
	s32i a0, a1, BSA_LCOUNT_OFF
#endif
.endm

/*
 * CROSS_STACK_CALL
 *
 * Sets the stack up carefully such that a "cross stack" call can
 * spill correctly, then invokes an immediate handler.  Note that:
 *
 * 0. When spilling a frame, functions find their callEE's stack
 *    pointer (to save A0-A3) from registers.  But they find their
 *    already-spilled callER's stack pointer (to save higher GPRs)
 *    from their own stack memory.
 *
 * 1. The function that was interrupted ("interruptee") does not need
 *    to be spilled, because it already has been as part of the
 *    context save.  So it doesn't need registers allocated for it
 *    anywhere.
 *
 * 2. Interruptee's caller needs to spill into the space below the
 *    interrupted stack frame, which means that the A1 register it
 *    finds below it needs to contain the old/interrupted stack and
 *    not the context saved one.
 *
 * 3. The ISR dispatcher (called "underneath" interruptee) needs to
 *    spill high registers into the space immediately above its own
 *    stack frame, so it needs to find a caller with the "new" stack
 *    pointer instead.
 *
 * We make this work by inserting TWO 4-register frames between
 * "interruptee's caller" and "ISR dispatcher".  The top one (which
 * occupies the slot formerly held by "interruptee", whose registers
 * were saved via external means) holds the "interrupted A1" and the
 * bottom has the "top of the interrupt stack" which can be either the
 * word above a new memory area (when handling an interrupt from user
 * mode) OR the existing "post-context-save" stack pointer (when
 * handling a nested interrupt).  The code works either way.  Because
 * these are both only 4-register frames, neither needs its own caller
 * for spilling.
 *
 * The net cost is 32 wasted bytes on the interrupt stack frame to
 * spill our two "phantom frames" (actually not quite, as we'd need a
 * few of those words used somewhere for tracking the stack pointers
 * anyway).  But the benefit is that NO REGISTER FRAMES NEED TO BE
 * SPILLED on interrupt entry.  And if we return back into the same
 * context we interrupted (a common case) no windows need to be
 * explicitly spilled at all.  And in fact in the case where the ISR
 * uses significant depth on its own stack, the interrupted frames
 * will be spilled naturally as a standard cost of a function call,
 * giving register windows something like "zero cost interrupts".
 *
 * FIXME: a terrible awful really nifty idea to fix the stack waste
 * problem would be to use a SINGLE frame between the two stacks,
 * pre-spill it with one stack pointer for the "lower" call to see and
 * leave the register SP in place for the "upper" frame to use.
 * Would require modifying the Window{Over|Under}flow4 exceptions to
 * know not to spill/fill these special frames, but that's not too
 * hard, maybe...
 *
 * Enter this macro with a valid "context saved" pointer (i.e. SP
 * should point to a stored pointer which points to one BSA below the
 * interrupted/old stack) in A1, a handler function in A2, and a "new"
 * stack pointer (i.e. a pointer to the word ABOVE the allocated stack
 * area) in A3.  On return A0/A1 will be unchanged, A2 has the return
 * value of the called function, and A3 is clobbered.  A4-A15 become
 * part of called frames and MUST NOT BE IN USE by the code that
 * expands this macro.  The called function gets the context save
 * handle in A1 as its first argument.
 */
.macro CROSS_STACK_CALL
	mov a6, a3	/* place "new sp" in the next frame's A2 */
	mov a10, a1	/* pass "context handle" in 2nd frame's A2 */
	mov a3, a1	/* stash it locally in A3 too */
	mov a11, a2	/* handler in 2nd frame's A3, next frame's A7 */

	/* Recover the interrupted SP from the BSA */
	l32i a1, a1, 0
	addi a1, a1, BASE_SAVE_AREA_SIZE

	call4 _xstack_call0_\@
	mov a1, a3	/* restore original SP */
	mov a2, a6	/* copy return value */
	j _xstack_returned_\@
.align 4
_xstack_call0_\@:
	/* We want an ENTRY to set a bit in windowstart and do the
	 * rotation, but we want our own SP.
	 */
	entry a1, 16
	mov a1, a2
	call4 _xstack_call1_\@
	mov a2, a6	/* copy return value */
	retw
.align 4
_xstack_call1_\@:
	/* Remember the handler is going to do our ENTRY, so the
	 * handler pointer is still in A6 (not A2) even though this is
	 * after the second CALL4.
	 */
	jx a7
_xstack_returned_\@:
.endm

/* Entry setup for all exceptions and interrupts.  Arrive here with
 * the stack pointer decremented across a base save area, A0-A3 and
 * PS/PC already spilled to the stack in the BSA, and A2 containing a
 * level-specific C handler function.
 *
 * This is a macro (to allow for unit testing) that expands to a
 * handler body to which the vectors can jump.  It takes two static
 * (!) arguments: a special register name (which should be set up to
 * point to some kind of per-CPU record struct) and offsets within
 * that struct which contain an interrupt stack top and a "nest
 * count" word.
 */
.macro EXCINT_HANDLER SR, NEST_OFF, INTSTACK_OFF
	/* A2 contains our handler function, which will get clobbered
	 * by the save.  Stash it into the unused "A1" slot in the
	 * BSA and recover it immediately after.  Kind of a hack.
	 */
	s32i a2, a1, BSA_SCRATCH_OFF

	call0 xtensa_save_high_regs

	l32i a2, a1, 0
	l32i a2, a2, BSA_SCRATCH_OFF

	/* Unmask the EXCM bit so C code can spill/fill in window
	 * exceptions.  Note interrupts are already fully masked by
	 * INTLEVEL, so this is safe.
	 */
	rsr.PS a0
	movi a3, ~16
	and a0, a0, a3
	wsr.PS a0
	rsync

	/* A1 already contains our saved stack, and A2 our handler.
	 * So all that's needed for CROSS_STACK_CALL is to put the
	 * "new" stack into A3.  This can be either a copy of A1 or an
	 * entirely new area, depending on whether the nest count
	 * found via our \SR macro argument is nonzero.
	 */
	rsr.\SR a3
	l32i a0, a3, \NEST_OFF
	beqz a0, _switch_stacks_\@

	/* Use the same stack; just copy A1 to A3 after incrementing NEST */
	addi a0, a0, 1
	s32i a0, a3, \NEST_OFF
	mov a3, a1
	j _do_call_\@

_switch_stacks_\@:
	addi a0, a0, 1
	s32i a0, a3, \NEST_OFF
	l32i a3, a3, \INTSTACK_OFF

_do_call_\@:
	CROSS_STACK_CALL

	/* Decrement nest count */
	rsr.\SR a3
	l32i a0, a3, \NEST_OFF
	addi a0, a0, -1
	s32i a0, a3, \NEST_OFF

	/* Last trick: the called function returned the "next" handle
	 * to restore to in A6 (the CALL4'd function's A2).  If this
	 * is not the same handle as we started with, we need to do a
	 * register spill before restoring, for obvious reasons.
	 * Remember to mask interrupts (which have been unmasked
	 * during the handler execution) while we muck with the
	 * windows.  The restore will unmask them as needed.
	 */
	beq a6, a1, _restore_\@
	rsil a0, XCHAL_NMILEVEL
	SPILL_ALL_WINDOWS
	mov a1, a6

_restore_\@:
	j _restore_context
.endm

/* Defines an exception/interrupt vector for a specified level.  Saves
 * off the interrupted A0-A3 registers and the per-level PS/PC
 * registers to the stack before jumping to a handler (defined with
 * EXCINT_HANDLER) to do the rest of the work.
 *
 * Arguments are a numeric interrupt level and symbol names for the
 * entry code (defined via EXCINT_HANDLER) and a C handler for this
 * particular level.
 *
 * FIXME: needs special handling for exceptions (level 1): it's "EPC"
 * and not "EPC1" (though IIRC the assembler makes this work).
 * And there is no EPS: instead PS is simply the interrupted PS
 * with EXCM flipped from 0 to 1.
 *
 * FIXME: needs better locking.  The hardware will NOT mask out "high
 * priority" exceptions on arrival here, so we have to do it ourselves
 * with RSIL.
 */
.macro DEF_EXCINT LVL, ENTRY_SYM, C_HANDLER_SYM
.pushsection .Level\LVL\()InterruptVector.text, "ax"
.global _Level\LVL\()Vector
_Level\LVL\()Vector:
	addi a1, a1, -BASE_SAVE_AREA_SIZE
	s32i a0, a1, BSA_A0_OFF
	s32i a2, a1, BSA_A2_OFF
	s32i a3, a1, BSA_A3_OFF

	rsr.EPS\LVL a0
	s32i a0, a1, BSA_PS_OFF
	rsr.EPC\LVL a0
	s32i a0, a1, BSA_PC_OFF

	/* What's happening with this jump is that the L32R
	 * instruction to load a full 32 bit immediate must use an
	 * offset that is negative from PC.  Normally the assembler
	 * fixes this up for you by putting the "literal pool"
	 * somewhere at the start of the section.  But vectors start
	 * at a fixed address in their own section, and don't (in our
	 * current linker setup) have anywhere "definitely before
	 * vectors" to place immediates.  Some platforms and apps will
	 * link by dumb luck, others won't.  We add an extra jump just
	 * to clear space we know to be legal.
	 *
	 * The right way to fix this would be to use a "literal_prefix"
	 * to put the literals into a per-vector section, then link
	 * that section into the PREVIOUS vector's area right after
	 * the vector code.  Requires touching a lot of linker scripts
	 * though.
	 */
	j _after_imms\LVL\()
.align 4
_handle_excint_imm\LVL:
	.word \ENTRY_SYM
_c_handler_imm\LVL:
	.word \C_HANDLER_SYM
_after_imms\LVL:
	l32r a2, _c_handler_imm\LVL
	l32r a0, _handle_excint_imm\LVL
	jx a0
.popsection
.endm
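EXCINT_HANDLER's SR/NEST_OFF/INTSTACK_OFF arguments imply a per-CPU record along these lines. This is a hypothetical layout for illustration; the commit itself defines no such struct, and the field names, offsets, and the choice of MISC0 are all assumptions.

	/* Hypothetical per-CPU record compatible with EXCINT_HANDLER.
	 * A pointer to it would be stashed in the special register
	 * named by the SR argument (e.g. MISC0), with the two field
	 * offsets passed as NEST_OFF and INTSTACK_OFF.
	 */
	struct cpu_int_record {
		unsigned int nest_count;	/* nonzero while nested in an ISR */
		void *int_stack_top;		/* word above the interrupt stack */
	};

	/* Offsets as the macro would receive them (illustrative). */
	#define NEST_OFF	0	/* offsetof(struct cpu_int_record, nest_count) */
	#define INTSTACK_OFF	4	/* offsetof(struct cpu_int_record, int_stack_top) */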
arch/xtensa/include/xtensa-asm2.h (new file)
@@ -0,0 +1,23 @@
/*
 * Copyright (c) 2017, Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#ifndef _XTENSA_ASM2_H
#define _XTENSA_ASM2_H

#include "xtensa-asm2-context.h"

/**
 * Initializes a stack area such that it can be "restored" later and
 * begin running with the specified function and three arguments.  The
 * entry function takes three arguments to match the signature of
 * Zephyr's k_thread_entry_t.  The thread will start with EXCM clear
 * and INTLEVEL set to zero (i.e. it's a user thread; we don't start
 * with anything masked, so don't assume that!).
 */
void *xtensa_init_stack(int *stack_top,
			void (*entry)(void *, void *, void *),
			void *arg1, void *arg2, void *arg3);

#endif /* _XTENSA_ASM2_H */