diff --git a/arch/Kconfig b/arch/Kconfig
index 04c0ec4e06..2b1c9cbf2a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -38,6 +38,7 @@ config X86
 	select ARCH_IS_SET
 	select ATOMIC_OPERATIONS_BUILTIN
 	select HAS_DTS
+	select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !X86_64
 	help
 	  x86 architecture
 
diff --git a/arch/x86/core/ia32/thread.c b/arch/x86/core/ia32/thread.c
index 5d5ecea2f1..409c8b13c3 100644
--- a/arch/x86/core/ia32/thread.c
+++ b/arch/x86/core/ia32/thread.c
@@ -15,6 +15,7 @@
 #include <kernel.h>
 #include <ksched.h>
 #include <arch/x86/mmustructs.h>
+#include <kswap.h>
 
 /* forward declaration */
 
@@ -115,3 +116,29 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
 #endif /* CONFIG_LAZY_FPU_SHARING */
 	thread->arch.flags = 0;
 }
+
+/* The core kernel code puts the dummy thread on the stack, which unfortunately
+ * doesn't work for 32-bit x86 as k_thread objects must be aligned due to the
+ * buffer within them fed to fxsave/fxrstor. Use FP_REG_SET_ALIGN-aligned bytes
+ * in the lower memory of interrupt stack 0 instead; otherwise the logic is
+ * more or less the same. None of the four parameters is referenced here, and
+ * control is not expected to come back once z_swap_unlocked() is called.
+ */
+void arch_switch_to_main_thread(struct k_thread *main_thread,
+				k_thread_stack_t *main_stack,
+				size_t main_stack_size,
+				k_thread_entry_t _main)
+{
+	struct k_thread *dummy_thread = (struct k_thread *)
+		ROUND_UP(Z_THREAD_STACK_BUFFER(z_interrupt_stacks[0]),
+			 FP_REG_SET_ALIGN);
+
+	__ASSERT(((uintptr_t)(&dummy_thread->arch.preempFloatReg) %
+		  FP_REG_SET_ALIGN) == 0,
+		 "unaligned dummy thread %p float member %p",
+		 dummy_thread, &dummy_thread->arch.preempFloatReg);
+
+	z_dummy_thread_init(dummy_thread);
+	z_swap_unlocked();
+	CODE_UNREACHABLE;
+}
diff --git a/include/arch/x86/ia32/thread.h b/include/arch/x86/ia32/thread.h
index fbe9baa2c5..437fba7e78 100644
--- a/include/arch/x86/ia32/thread.h
+++ b/include/arch/x86/ia32/thread.h
@@ -26,12 +26,18 @@
  * since the 'fxsave' and 'fxrstor' instructions require this. In all other
  * cases a 4 byte boundary is sufficient.
  */
-
+#if defined(CONFIG_EAGER_FPU_SHARING) || defined(CONFIG_LAZY_FPU_SHARING)
 #ifdef CONFIG_SSE
 #define FP_REG_SET_ALIGN  16
 #else
 #define FP_REG_SET_ALIGN  4
 #endif
+#else
+/* Unused: there is no special alignment requirement, so use the default
+ * alignment for char buffers on this arch.
+ */
+#define FP_REG_SET_ALIGN  1
+#endif /* CONFIG_*_FPU_SHARING */
 
 /*
  * Bits for _thread_arch.flags, see their use in intstub.S et al.
@@ -230,19 +236,6 @@ struct _thread_arch {
 	unsigned excNestCount; /* nested exception count */
 #endif /* CONFIG_LAZY_FPU_SHARING */
 
-	/*
-	 * The location of all floating point related structures/fields MUST be
-	 * located at the end of struct k_thread.  This way only the
-	 * threads that actually utilize non-integer capabilities need to
-	 * account for the increased memory required for storing FP state when
-	 * sizing stacks.
-	 *
-	 * Given that stacks "grow down" on IA-32, and the TCS is located
-	 * at the start of a thread's "workspace" memory, the stacks of
-	 * threads that do not utilize floating point instruction can
-	 * effectively consume the memory occupied by the 'tPreempFloatReg'
-	 * struct without ill effect.
-	 */
 	tPreempFloatReg preempFloatReg; /* volatile float register storage */
 };