diff --git a/include/kernel/thread.h b/include/kernel/thread.h
index ee6dc8f50e..ae0930b70e 100644
--- a/include/kernel/thread.h
+++ b/include/kernel/thread.h
@@ -116,6 +116,10 @@ struct _thread_base {
 	/* this thread's entry in a timeout queue */
 	struct _timeout timeout;
 #endif
+
+#ifdef CONFIG_SCHED_THREAD_USAGE
+	uint64_t usage;
+#endif
 };
 
 typedef struct _thread_base _thread_base_t;
diff --git a/include/kernel_structs.h b/include/kernel_structs.h
index 5199f82b92..05d2ec309d 100644
--- a/include/kernel_structs.h
+++ b/include/kernel_structs.h
@@ -130,6 +130,10 @@ struct _cpu {
 	uint8_t swap_ok;
 #endif
 
+#ifdef CONFIG_SCHED_THREAD_USAGE
+	uint32_t usage0;
+#endif
+
 	/* Per CPU architecture specifics */
 	struct _cpu_arch arch;
 };
diff --git a/kernel/Kconfig b/kernel/Kconfig
index 26a078e873..7c09d4392f 100644
--- a/kernel/Kconfig
+++ b/kernel/Kconfig
@@ -377,6 +377,12 @@ config THREAD_MAX_NAME_LEN
 config INSTRUMENT_THREAD_SWITCHING
 	bool
 
+config SCHED_THREAD_USAGE
+	bool "Collect thread runtime usage"
+	depends on USE_SWITCH
+	help
+	  Alternate implementation of thread runtime cycle usage
+
 menuconfig THREAD_RUNTIME_STATS
 	bool "Thread runtime statistics"
 	select INSTRUMENT_THREAD_SWITCHING
diff --git a/kernel/include/ksched.h b/kernel/include/ksched.h
index 7d39825c52..f29e6c4166 100644
--- a/kernel/include/ksched.h
+++ b/kernel/include/ksched.h
@@ -363,4 +363,35 @@ static inline bool z_sched_wake_all(_wait_q_t *wait_q, int swap_retval,
 int z_sched_wait(struct k_spinlock *lock, k_spinlock_key_t key,
 		 _wait_q_t *wait_q, k_timeout_t timeout, void **data);
 
+
+/** @brief Halt thread cycle usage accounting.
+ *
+ * Halts the accumulation of thread cycle usage and adds the current
+ * total to the thread's counter. Called on context switch.
+ *
+ * Note that this function is idempotent. The core kernel code calls
+ * it at the end of interrupt handlers (because that is where we have
+ * a portable hook) where we are context switching, which will include
+ * any cycles spent in the ISR in the per-thread accounting. But
+ * architecture code can also call it earlier out of interrupt entry
+ * to improve measurement fidelity.
+ *
+ * This function assumes local interrupts are masked (so that the
+ * current CPU pointer and current thread are safe to modify), but
+ * requires no other synchronization. Architecture layers don't need
+ * to do anything more.
+ */
+void z_sched_usage_stop(void);
+
+void z_sched_usage_start(struct k_thread *thread);
+
+static inline void z_sched_usage_switch(struct k_thread *thread)
+{
+	ARG_UNUSED(thread);
+#ifdef CONFIG_SCHED_THREAD_USAGE
+	z_sched_usage_stop();
+	z_sched_usage_start(thread);
+#endif
+}
+
 #endif /* ZEPHYR_KERNEL_INCLUDE_KSCHED_H_ */
diff --git a/kernel/include/kswap.h b/kernel/include/kswap.h
index 917315d589..f43f194117 100644
--- a/kernel/include/kswap.h
+++ b/kernel/include/kswap.h
@@ -109,6 +109,7 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
 #ifdef CONFIG_TIMESLICING
 	z_reset_time_slice();
 #endif
+	z_sched_usage_switch(new_thread);
 
 #ifdef CONFIG_SMP
 	_current_cpu->swap_ok = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5eeb192268..32fb2341ac 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -937,6 +937,8 @@ void *z_get_next_switch_handle(void *interrupted)
 		}
 		new_thread = next_up();
 
+		z_sched_usage_switch(new_thread);
+
 		if (old_thread != new_thread) {
 			update_metairq_preempt(new_thread);
 			wait_for_switch(new_thread);
@@ -976,6 +978,7 @@ void *z_get_next_switch_handle(void *interrupted)
 	}
 	return ret;
 #else
+	z_sched_usage_switch(_kernel.ready_q.cache);
 	_current->switch_handle = interrupted;
 	set_current(_kernel.ready_q.cache);
 	return _current->switch_handle;
@@ -1731,3 +1734,55 @@ int z_sched_wait(struct k_spinlock *lock, k_spinlock_key_t key,
 	}
 	return ret;
 }
+
+#ifdef CONFIG_SCHED_THREAD_USAGE
+
+static struct k_spinlock usage_lock;
+
+static uint32_t usage_now(void)
+{
+	uint32_t now = k_cycle_get_32();
+
+	/* Edge case: we use a zero as a null ("stop() already called") */
+	return (now == 0) ? 1 : now;
+}
+
+void z_sched_usage_start(struct k_thread *thread)
+{
+	/* One write through a volatile pointer doesn't require
+	 * synchronization as long as _usage() treats it as volatile
+	 * (we can't race with _stop() by design).
+	 */
+	_current_cpu->usage0 = usage_now();
+}
+
+void z_sched_usage_stop(void)
+{
+	k_spinlock_key_t k = k_spin_lock(&usage_lock);
+	uint32_t u0 = _current_cpu->usage0;
+
+	if (u0 != 0) {
+		_current->base.usage += usage_now() - u0;
+	}
+
+	_current_cpu->usage0 = 0;
+	k_spin_unlock(&usage_lock, k);
+}
+
+uint64_t z_sched_thread_usage(struct k_thread *thread)
+{
+	k_spinlock_key_t k = k_spin_lock(&usage_lock);
+	uint32_t u0 = _current_cpu->usage0, now = usage_now();
+	uint64_t ret = thread->base.usage;
+
+	if (u0 != 0) {
+		ret += now - u0;
+		thread->base.usage = ret;
+		_current_cpu->usage0 = now;
+	}
+
+	k_spin_unlock(&usage_lock, k);
+	return ret;
+}
+
+#endif /* CONFIG_SCHED_THREAD_USAGE */
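Reviewer note: below is a rough, self-contained sketch (not part of the patch) of how kernel-internal code might consume the new counter when CONFIG_SCHED_THREAD_USAGE=y. It assumes a compilation unit that can see ksched.h (i.e. something under kernel/ or arch/); the print_thread_cycles() helper and its output format are hypothetical.

/* Illustrative sketch only -- not part of this patch.  Assumes a
 * kernel-internal source file (kernel/include on the include path)
 * built with CONFIG_SCHED_THREAD_USAGE=y; helper name and output
 * format are hypothetical.
 */
#include <kernel.h>
#include <sys/printk.h>
#include <ksched.h>

static void print_thread_cycles(struct k_thread *thread)
{
	/* Snapshot of the thread's accumulated cycle count; per the
	 * patch, any in-progress slice on the calling CPU is folded in
	 * and restarted before the total is returned.
	 */
	uint64_t cycles = z_sched_thread_usage(thread);
	const char *name = k_thread_name_get(thread);

	printk("%s: %llu cycles (~%llu us)\n",
	       (name != NULL) ? name : "<unnamed>",
	       (unsigned long long)cycles,
	       (unsigned long long)k_cyc_to_us_floor64(cycles));
}

One property worth noting: each accumulated delta is a 32-bit k_cycle_get_32() difference, so the scheme relies on stop/start (i.e. context switches or the earlier architecture hooks mentioned above) occurring more often than the hardware cycle counter wraps.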