kernel/arch: enhance the "ready thread" cache

The way the ready thread cache was implemented meant it was not always
"hot": a miss occurred whenever the cached thread was taken out of the
ready queue. When that happened, the cache was not refilled
immediately, since the replacement might never get to run: the flow
could be interrupted and another thread could take its place first.
This was the more conservative approach, ensuring that loading a
thread into the cache was never wasted work.

However, this caused two problems:

1. The cache could not be refilled until another thread context-switched
in, since there was no thread in the cache to compare priorities
against.

2. Interrupt exit code always had to call into C to find out what
thread to run next whenever the current thread was not cooperative and
did not have the scheduler locked. Furthermore, this code path could
encounter a cold cache, in which case it had to find the next thread
to run the long way (see the sketch below).
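
For reference, the old lookup path, condensed from the scheduler code
removed at the end of this diff; a cold cache forced the fallback scan
of the ready queue:

    /* old code, removed below: the cache could be NULL ("cold") */
    struct k_thread *_get_next_ready_thread(void)
    {
            struct k_thread *cache = _ready_q.cache;

            /* cold cache: scan the priority bitmap the long way */
            return cache ? cache : __get_next_ready_thread();
    }

    int _is_next_thread_current(void)
    {
            /* called through C from every interrupt exit path */
            return _get_next_ready_thread() == _current;
    }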

To fix this, filling the cache is now more aggressive: the next thread
to put in the cache is found immediately, even when the currently
cached thread is taken out of the ready queue (e.g. context-switched
out). This makes the interrupt exit code much faster on the slow path.
In addition, since finding the next thread to run is now always "get
it from the cache", a simple fetch from memory (_kernel.ready_q.cache),
there is no longer any need to call into the more complex C code.
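
Condensed from the new scheduler code at the end of this diff: the
cache is refilled on the spot whenever the cached thread leaves the
ready queue, so the lookup collapses to a single load:

    /* new code: the ready queue cache is never NULL */
    static ALWAYS_INLINE struct k_thread *_get_next_ready_thread(void)
    {
            return _ready_q.cache;    /* i.e. _kernel.ready_q.cache */
    }

    void _remove_thread_from_ready_q(struct k_thread *thread)
    {
            /* ... dequeue <thread> ... */
            struct k_thread **cache = &_ready_q.cache;

            /* refill immediately: _get_ready_q_head() peeks at the
             * head of the highest-priority non-empty queue
             */
            *cache = *cache == thread ? _get_ready_q_head() : *cache;
    }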

On the ARM FRDM-K64F board, the following improvements are seen:

Before:

1- Measure time to switch from ISR back to interrupted task

   switching time is 215 tcs = 1791 nsec

2- Measure time from ISR to executing a different task (rescheduled)

   switch time is 315 tcs = 2625 nsec

After:

1- Measure time to switch from ISR back to interrupted task

   switching time is 130 tcs = 1083 nsec

2- Measure time from ISR to executing a different task (rescheduled)

   switch time is 225 tcs = 1875 nsec

These two cases show the most dramatic gains, but most of the numbers
generated by the latency_measure test improve.
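
(The tcs-to-nanosecond conversion works out to roughly 8.33 ns per
timer cycle, consistent with the K64F's 120 MHz core clock: the
ISR-return path drops by 85 cycles (~708 ns) and the reschedule path
by 90 cycles (~750 ns).)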

Fixes ZEP-1401.

Change-Id: I2eaac147048b1ec71a93bd0a285e743a39533973
Signed-off-by: Benjamin Walsh <benjamin.walsh@windriver.com>
commit 88b3691415 (parent e6ebe3a8b4)
Benjamin Walsh, 2016-12-02 10:37:27 -05:00
17 changed files with 85 additions and 180 deletions


@ -40,7 +40,6 @@ GTEXT(_firq_stack_resume)
#if CONFIG_RGF_NUM_BANKS != 1
GDATA(_firq_stack)
GDATA(_saved_firq_stack)
GTEXT(_is_next_thread_current)
SECTION_VAR(NOINIT, _firq_stack)
.space CONFIG_FIRQ_STACK_SIZE
@ -157,26 +156,10 @@ _firq_check_for_swap:
ld_s r0, [r2, _thread_offset_to_sched_locked]
brgt r0, 0, _firq_no_reschedule
/* check if the current thread needs to be rescheduled */
push_s r2
push_s r1
push_s blink
jl _is_next_thread_current
pop_s blink
pop_s r1
pop_s r2
#if CONFIG_RGF_NUM_BANKS != 1
#ifndef CONFIG_FIRQ_NO_LPCC
/*
* restore lp_count, lp_start, lp_end from r23-r25 in case
* _is_next_thread_current() routine used them
*/
mov lp_count,r23
sr r24, [_ARC_V2_LP_START]
sr r25, [_ARC_V2_LP_END]
#endif
#endif
breq r0, 0, _firq_reschedule
/* Check if the current thread (in r2) is the cached thread */
ld_s r0, [r1, _kernel_offset_to_ready_q_cache]
brne r0, r2, _firq_reschedule
/* fall to no rescheduling */
.balign 4
@ -248,18 +231,7 @@ _firq_reschedule:
st _CAUSE_FIRQ, [r2, _thread_offset_to_relinquish_cause]
/*
* Save needed registers to callee saved ones. It is faster than
* pushing them to stack. It is possible to do since program has
* just saved them and the calling routine will save them in turn
* if it uses them.
*/
mov_s r13, blink
mov_s r14, r1
jl _get_next_ready_thread
mov_s blink, r13
mov_s r1, r14
mov_s r2, r0
ld_s r2, [r1, _kernel_offset_to_ready_q_cache]
st_s r2, [r1, _kernel_offset_to_current]
#ifdef CONFIG_ARC_STACK_CHECKING


@ -176,34 +176,15 @@ _trap_check_for_swap:
brgt r0, 0, _trap_return
/* check if the current thread needs to be rescheduled */
push_s r2
push_s r1
push_s blink
jl _is_next_thread_current
pop_s blink
pop_s r1
pop_s r2
brne r0, 0, _trap_return
ld_s r0, [r1, _kernel_offset_to_ready_q_cache]
breq r0, r2, _trap_return
_save_callee_saved_regs
st _CAUSE_RIRQ, [r2, _thread_offset_to_relinquish_cause]
/* note: Ok to use _CAUSE_RIRQ since everything is saved */
/*
* Save needed registers to callee saved ones. It is faster than
* pushing registers to stack. It is possible to do since program has
* just saved them and the calling routine will save them in turn
* if it uses them.
*/
mov_s r13, blink
mov_s r14, r0
mov_s r15, r1
jl _get_next_ready_thread
mov_s r2, r0
mov_s r1, r15
mov_s r0, r14
mov_s blink, r13
ld_s r2, [r1, _kernel_offset_to_ready_q_cache]
st_s r2, [r1, _kernel_offset_to_current]
/* clear AE bit to forget this was an exception */


@ -35,7 +35,6 @@
GTEXT(_rirq_enter)
GTEXT(_rirq_exit)
GTEXT(_rirq_common_interrupt_swap)
GTEXT(_is_next_thread_current)
#if 0 /* TODO: when FIRQ is not present, all would be regular */
#define NUM_REGULAR_IRQ_PRIO_LEVELS CONFIG_NUM_IRQ_PRIO_LEVELS
@ -156,27 +155,13 @@ _rirq_reschedule_check:
#endif
/* check if the current thread needs to be rescheduled */
push_s r2
push_s r1
push_s blink
jl _is_next_thread_current
pop_s blink
pop_s r1
pop_s r2
brne.d r0, 0, _rirq_no_reschedule
ld_s r0, [r1, _kernel_offset_to_ready_q_cache]
breq.d r0, r2, _rirq_no_reschedule
/* delay slot: always load the current thread's stack */
ld sp, [r2, _thread_offset_to_sp]
/*
* Get the next scheduled thread. On _get_next_ready_thread
* return it is stored in r0.
*/
push_s r2
push_s r1
push_s blink
jl _get_next_ready_thread
pop_s blink
pop_s r1
pop_s r2
/* cached thread to run is in r0, fall through */
.balign 4
_rirq_reschedule:


@ -34,7 +34,6 @@
#include <swap_macros.h>
GTEXT(_Swap)
GTEXT(_get_next_ready_thread)
GDATA(_k_neg_eagain)
GDATA(_kernel)
@ -106,22 +105,8 @@ SECTION_FUNC(TEXT, _Swap)
_save_callee_saved_regs
/* find out incoming thread (fiber or task) */
/*
* Save needed registers to callee saved ones. It is faster than
* pushing them to stack. It is possible to do since program has
* just saved them and the calling routine will save them in turn
* if it uses them.
*/
mov_s r13, blink
mov_s r14, r0
mov_s r15, r1
jl _get_next_ready_thread
mov_s r2, r0
mov_s r1, r15
mov_s r0, r14
mov_s blink, r13
/* get the cached thread to run */
ld_s r2, [r1, _kernel_offset_to_ready_q_cache]
/* entering here, r2 contains the new current thread */
#ifdef CONFIG_ARC_STACK_CHECKING


@ -36,7 +36,6 @@ _ASM_FILE_PROLOGUE
GTEXT(_ExcExit)
GTEXT(_IntExit)
GDATA(_kernel)
GTEXT(_is_next_thread_current)
#if CONFIG_GDB_INFO
#define _EXIT_EXC_IF_FIBER_PREEMPTED beq _ExcExitWithGdbStub
@ -88,9 +87,9 @@ SECTION_SUBSEC_FUNC(TEXT, _HandlerModeExit, _IntExit)
SECTION_SUBSEC_FUNC(TEXT, _HandlerModeExit, _ExcExit)
ldr r1, =_kernel
ldr r0, =_kernel
ldr r1, [r1, #_kernel_offset_to_current]
ldr r1, [r0, #_kernel_offset_to_current]
ldr r2, [r1, #_thread_offset_to_prio]
ldr r3, [r1, #_thread_offset_to_sched_locked]
@ -102,20 +101,9 @@ SECTION_SUBSEC_FUNC(TEXT, _HandlerModeExit, _ExcExit)
cmp r3, #0
bgt _EXIT_EXC
push {lr}
/* _is_next_thread_current must be called with interrupts locked */
cpsid i
blx _is_next_thread_current
cpsie i
#if defined(CONFIG_CPU_CORTEX_M0_M0PLUS)
pop {r1}
mov lr, r1
#else
pop {lr}
#endif /* CONFIG_CPU_CORTEX_M0_M0PLUS */
cmp r0, #0
bne _EXIT_EXC
ldr r0, [r0, _kernel_offset_to_ready_q_cache]
cmp r0, r1
beq _EXIT_EXC
/* context switch required, pend the PendSV exception */
ldr r1, =_SCS_ICSR


@ -36,7 +36,6 @@ GTEXT(_Swap)
GTEXT(__svc)
#endif
GTEXT(__pendsv)
GTEXT(_get_next_ready_thread)
GDATA(_k_neg_eagain)
GDATA(_kernel)
@ -114,16 +113,11 @@ SECTION_FUNC(TEXT, __pendsv)
msr BASEPRI, r0
#endif
/* find out incoming thread (fiber or task) */
/* _kernel is still in r1 */
mov.n v2, lr
movs.n v1, r1
blx _get_next_ready_thread
movs.n r1, v1
mov.n lr, v2
movs.n r2, r0
/* fetch the thread to run from the ready queue cache */
ldr r2, [r1, _kernel_offset_to_ready_q_cache]
/* r2 contains the new thread */
str r2, [r1, #_kernel_offset_to_current]
/*


@ -59,6 +59,8 @@ _arch_switch_to_main_thread(char *main_stack, size_t main_stack_size,
_current = (void *)main_stack;
/* the ready queue cache already contains the main thread */
__asm__ __volatile__(
/* move to main() thread stack */


@ -25,7 +25,6 @@ GTEXT(_exception)
/* import */
GTEXT(_Fault)
GTEXT(_Swap)
GTEXT(_is_next_thread_current)
#ifdef CONFIG_IRQ_OFFLOAD
GTEXT(_irq_do_offload)
GTEXT(_offload_routine)
@ -140,8 +139,8 @@ on_irq_stack:
bne r12, zero, no_reschedule
/* Call into the kernel to see if a scheduling decision is necessary */
call _is_next_thread_current
bne r2, zero, no_reschedule
ldw r2, _kernel_offset_to_ready_q_cache(r10)
beq r2, r11, no_reschedule
/*
* A context reschedule is required: keep the volatile registers of


@ -25,7 +25,6 @@ GTEXT(_thread_entry_wrapper)
/* imports */
GTEXT(_sys_k_event_logger_context_switch)
GTEXT(_get_next_ready_thread)
GTEXT(_k_neg_eagain)
/* unsigned int _Swap(unsigned int key)
@ -78,16 +77,17 @@ SECTION_FUNC(exception.other, _Swap)
ori r10, r10, %lo(_kernel)
#endif /* CONFIG_KERNEL_EVENT_LOGGER_CONTEXT_SWITCH */
/* Assign to _kernel.current the return value of
* _get_next_ready_thread()
*/
call _get_next_ready_thread
movhi r10, %hi(_kernel)
ori r10, r10, %lo(_kernel)
stw r2, _kernel_offset_to_current(r10)
/* get cached thread to run */
ldw r2, _kernel_offset_to_ready_q_cache(r10)
/* At this point r2 points to the next thread to be swapped in */
/* the thread to be swapped in is now the current thread */
stw r2, _kernel_offset_to_current(r10)
/* Restore callee-saved registers and switch to the incoming
* thread's stack
*/


@ -42,7 +42,6 @@
/* externs */
GTEXT(_Swap)
GTEXT(_is_next_thread_current)
#ifdef CONFIG_SYS_POWER_MANAGEMENT
GTEXT(_sys_power_save_idle_exit)
@ -303,9 +302,8 @@ alreadyOnIntStack:
/* reschedule only if the scheduler says that we must do so */
call _is_next_thread_current
testl %eax, %eax
jnz noReschedule
cmpl %edx, _kernel_offset_to_ready_q_cache(%ecx)
je noReschedule
/*
* Set the INT_ACTIVE bit in the k_thread to allow the upcoming call to
@ -315,11 +313,6 @@ alreadyOnIntStack:
*/
#if defined(CONFIG_FP_SHARING) || defined(CONFIG_GDB_INFO)
/*
* Reload _kernel.current as _is_next_thread_current()
* might have clobbered it.
*/
movl _kernel + _kernel_offset_to_current, %edx
orl $INT_ACTIVE, _thread_offset_to_flags(%edx)
#endif


@ -38,7 +38,6 @@
GTEXT(_Swap)
/* externs */
GTEXT(_get_next_ready_thread)
GDATA(_k_neg_eagain)
/**
@ -131,7 +130,7 @@ SECTION_FUNC(TEXT, _Swap)
/* Register the context switch */
call _sys_k_event_logger_context_switch
#endif
call _get_next_ready_thread
movl _kernel_offset_to_ready_q_cache(%edi), %eax
/*
* At this point, the %eax register contains the 'k_thread *' of the


@ -37,6 +37,9 @@ GEN_OFFSET_SYM(_kernel_t, irq_stack);
GEN_OFFSET_SYM(_kernel_t, idle);
#endif
GEN_OFFSET_SYM(_kernel_t, ready_q);
GEN_OFFSET_SYM(_ready_q_t, cache);
#ifdef CONFIG_FP_SHARING
GEN_OFFSET_SYM(_kernel_t, current_fp);
#endif


@ -146,6 +146,8 @@ struct _ready_q {
sys_dlist_t q[K_NUM_PRIORITIES];
};
typedef struct _ready_q _ready_q_t;
struct _kernel {
/* nested interrupt count */


@ -30,11 +30,17 @@ extern void _pend_thread(struct k_thread *thread,
_wait_q_t *wait_q, int32_t timeout);
extern void _pend_current_thread(_wait_q_t *wait_q, int32_t timeout);
extern void _move_thread_to_end_of_prio_q(struct k_thread *thread);
extern struct k_thread *_get_next_ready_thread(void);
extern int __must_switch_threads(void);
extern int32_t _ms_to_ticks(int32_t ms);
extern void idle(void *, void *, void *);
/* find which one is the next thread to run */
/* must be called with interrupts locked */
static ALWAYS_INLINE struct k_thread *_get_next_ready_thread(void)
{
return _ready_q.cache;
}
static inline int _is_idle_thread(void *entry_point)
{
return entry_point == idle;


@ -39,6 +39,9 @@
#define _kernel_offset_to_current_fp \
(___kernel_t_current_fp_OFFSET)
#define _kernel_offset_to_ready_q_cache \
(___kernel_t_ready_q_OFFSET + ___ready_q_t_cache_OFFSET)
/* end - kernel */
/* threads */


@ -265,6 +265,17 @@ static void prepare_multithreading(struct k_thread *dummy_thread)
sys_dlist_init(&_ready_q.q[ii]);
}
/*
* prime the cache with the main thread since:
*
* - the cache can never be NULL
* - the main thread will be the one to run first
* - no other thread is initialized yet and thus their priority fields
* contain garbage, which would prevent the cache loading algorithm
* to work as intended
*/
_ready_q.cache = _main_thread;
_new_thread(_main_stack, MAIN_STACK_SIZE,
_main, NULL, NULL, NULL,
CONFIG_MAIN_THREAD_PRIORITY, K_ESSENTIAL);


@ -41,6 +41,26 @@ static void _clear_ready_q_prio_bit(int prio)
*bmap &= ~_get_ready_q_prio_bit(prio);
}
/*
* Find the next thread to run when there is no thread in the cache and update
* the cache.
*/
static struct k_thread *_get_ready_q_head(void)
{
int prio = _get_highest_ready_prio();
int q_index = _get_ready_q_q_index(prio);
sys_dlist_t *list = &_ready_q.q[q_index];
__ASSERT(!sys_dlist_is_empty(list),
"no thread to run (prio: %d, queue index: %u)!\n",
prio, q_index);
struct k_thread *thread =
(struct k_thread *)sys_dlist_peek_head_not_empty(list);
return thread;
}
/*
* Add thread to the ready queue, in the slot for its priority; the thread
* must not be on a wait queue.
@ -61,9 +81,7 @@ void _add_thread_to_ready_q(struct k_thread *thread)
struct k_thread **cache = &_ready_q.cache;
*cache = *cache && _is_prio_higher(thread->base.prio,
(*cache)->base.prio) ?
thread : *cache;
*cache = _is_t1_higher_prio_than_t2(thread, *cache) ? thread : *cache;
}
/*
@ -85,7 +103,7 @@ void _remove_thread_from_ready_q(struct k_thread *thread)
struct k_thread **cache = &_ready_q.cache;
*cache = *cache == thread ? NULL : *cache;
*cache = *cache == thread ? _get_ready_q_head() : *cache;
}
/* reschedule threads if the scheduler is not locked */
@ -182,37 +200,6 @@ void _pend_current_thread(_wait_q_t *wait_q, int32_t timeout)
_pend_thread(_current, wait_q, timeout);
}
/*
* Find the next thread to run when there is no thread in the cache and update
* the cache.
*/
static struct k_thread *__get_next_ready_thread(void)
{
int prio = _get_highest_ready_prio();
int q_index = _get_ready_q_q_index(prio);
sys_dlist_t *list = &_ready_q.q[q_index];
__ASSERT(!sys_dlist_is_empty(list),
"no thread to run (prio: %d, queue index: %u)!\n",
prio, q_index);
struct k_thread *thread =
(struct k_thread *)sys_dlist_peek_head_not_empty(list);
_ready_q.cache = thread;
return thread;
}
/* find which one is the next thread to run */
/* must be called with interrupts locked */
struct k_thread *_get_next_ready_thread(void)
{
struct k_thread *cache = _ready_q.cache;
return cache ? cache : __get_next_ready_thread();
}
/*
* Check if there is a thread of higher prio than the current one. Should only
* be called if we already know that the current thread is preemptible.
@ -228,11 +215,6 @@ int __must_switch_threads(void)
return _is_prio_higher(_get_highest_ready_prio(), _current->base.prio);
}
int _is_next_thread_current(void)
{
return _get_next_ready_thread() == _current;
}
int k_thread_priority_get(k_tid_t thread)
{
return thread->base.prio;
@ -275,7 +257,7 @@ void _move_thread_to_end_of_prio_q(struct k_thread *thread)
struct k_thread **cache = &_ready_q.cache;
*cache = *cache == thread ? NULL : *cache;
*cache = *cache == thread ? _get_ready_q_head() : *cache;
}
void k_yield(void)