kernel: priority queues: declare as static inlines
After the move to C files we saw some drop in performance when running
latency_measure. This patch declares the priority queue functions as
static inlines, with minor optimizations.
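As an illustration of the pattern only (a minimal sketch; the foo_* names are
hypothetical and not part of this patch), a helper that previously lived in a
.c file and was reached through a cross-file call becomes a static inline in
the header, so the compiler can inline it at each call site:

/* Hypothetical example only -- not code from this patch. */
struct foo_queue {
	int count;
};

/* before: defined in foo.c, only a prototype in foo.h */
int foo_queue_depth(const struct foo_queue *q)
{
	return q->count;
}

/* after: defined directly in foo.h (ALWAYS_INLINE is Zephyr's
 * toolchain macro, as used throughout the diff below)
 */
static ALWAYS_INLINE int foo_queue_depth(const struct foo_queue *q)
{
	return q->count;
}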
The result for one metric (on qemu):
3.6, before anything was changed:
Get data from LIFO (w/ ctx switch): 13087 ns

after the original change (46484da502):
Get data from LIFO (w/ ctx switch): 13663 ns

with this change:
Get data from LIFO (w/ ctx switch): 12543 ns

So overall, a net gain of ~500 ns over the 3.6 baseline that can be seen
across the board on many of the metrics.
Signed-off-by: Anas Nashif <anas.nashif@intel.com>
parent 0b8714bcde
commit 4593f0d71c
kernel/include/priority_q.h
@@ -7,13 +7,20 @@
 #ifndef ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_
 #define ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_
 
+#include <zephyr/sys/math_extras.h>
+#include <zephyr/sys/dlist.h>
+
-/* Dump Scheduling */
+extern int32_t z_sched_prio_cmp(struct k_thread *thread_1,
+				struct k_thread *thread_2);
+
+bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b);
+
+/* Dumb Scheduling */
 #if defined(CONFIG_SCHED_DUMB)
 #define _priq_run_add z_priq_dumb_add
 #define _priq_run_remove z_priq_dumb_remove
 # if defined(CONFIG_SCHED_CPU_MASK)
-# define _priq_run_best _priq_dumb_mask_best
+# define _priq_run_best z_priq_dumb_mask_best
 # else
 # define _priq_run_best z_priq_dumb_best
 # endif /* CONFIG_SCHED_CPU_MASK */
@@ -29,7 +36,7 @@
 #define NBITS 64
 #else
 #define NBITS 32
-# endif
+#endif /* CONFIG_64BIT */
 
 #define _priq_run_add z_priq_mq_add
 #define _priq_run_remove z_priq_mq_remove
@@ -40,30 +47,99 @@ static ALWAYS_INLINE void z_priq_mq_remove(struct _priq_mq *pq, struct k_thread
 
 /* Scalable Wait Queue */
 #if defined(CONFIG_WAITQ_SCALABLE)
-#define z_priq_wait_add z_priq_rb_add
+#define _priq_wait_add z_priq_rb_add
 #define _priq_wait_remove z_priq_rb_remove
 #define _priq_wait_best z_priq_rb_best
-/* Dump Wait Queue */
+/* Dumb Wait Queue */
 #elif defined(CONFIG_WAITQ_DUMB)
-#define z_priq_wait_add z_priq_dumb_add
+#define _priq_wait_add z_priq_dumb_add
 #define _priq_wait_remove z_priq_dumb_remove
 #define _priq_wait_best z_priq_dumb_best
 #endif
 
-/* Dumb Scheduling*/
-struct k_thread *z_priq_dumb_best(sys_dlist_t *pq);
-void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread);
-
-/* Scalable Scheduling */
-void z_priq_rb_add(struct _priq_rb *pq, struct k_thread *thread);
-void z_priq_rb_remove(struct _priq_rb *pq, struct k_thread *thread);
-
-/* Multi Queue Scheduling */
-struct k_thread *z_priq_mq_best(struct _priq_mq *pq);
-struct k_thread *z_priq_rb_best(struct _priq_rb *pq);
-
-bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b);
+static ALWAYS_INLINE void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread)
+{
+	ARG_UNUSED(pq);
+
+	sys_dlist_remove(&thread->base.qnode_dlist);
+}
+
+static ALWAYS_INLINE struct k_thread *z_priq_dumb_best(sys_dlist_t *pq)
+{
+	struct k_thread *thread = NULL;
+	sys_dnode_t *n = sys_dlist_peek_head(pq);
+
+	if (n != NULL) {
+		thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
+	}
+	return thread;
+}
+
+static ALWAYS_INLINE void z_priq_rb_add(struct _priq_rb *pq, struct k_thread *thread)
+{
+	struct k_thread *t;
+
+	thread->base.order_key = pq->next_order_key++;
+
+	/* Renumber at wraparound. This is tiny code, and in practice
+	 * will almost never be hit on real systems. BUT on very
+	 * long-running systems where a priq never completely empties
+	 * AND that contains very large numbers of threads, it can be
+	 * a latency glitch to loop over all the threads like this.
+	 */
+	if (!pq->next_order_key) {
+		RB_FOR_EACH_CONTAINER(&pq->tree, t, base.qnode_rb) {
+			t->base.order_key = pq->next_order_key++;
+		}
+	}
+
+	rb_insert(&pq->tree, &thread->base.qnode_rb);
+}
+
+static ALWAYS_INLINE void z_priq_rb_remove(struct _priq_rb *pq, struct k_thread *thread)
+{
+	rb_remove(&pq->tree, &thread->base.qnode_rb);
+
+	if (!pq->tree.root) {
+		pq->next_order_key = 0;
+	}
+}
+
+static ALWAYS_INLINE struct k_thread *z_priq_rb_best(struct _priq_rb *pq)
+{
+	struct k_thread *thread = NULL;
+	struct rbnode *n = rb_get_min(&pq->tree);
+
+	if (n != NULL) {
+		thread = CONTAINER_OF(n, struct k_thread, base.qnode_rb);
+	}
+	return thread;
+}
+
+static ALWAYS_INLINE struct k_thread *z_priq_mq_best(struct _priq_mq *pq)
+{
+	struct k_thread *thread = NULL;
+
+	for (int i = 0; i < PRIQ_BITMAP_SIZE; ++i) {
+		if (!pq->bitmask[i]) {
+			continue;
+		}
+
+#ifdef CONFIG_64BIT
+		sys_dlist_t *l = &pq->queues[i * 64 + u64_count_trailing_zeros(pq->bitmask[i])];
+#else
+		sys_dlist_t *l = &pq->queues[i * 32 + u32_count_trailing_zeros(pq->bitmask[i])];
+#endif
+		sys_dnode_t *n = sys_dlist_peek_head(l);
+
+		if (n != NULL) {
+			thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
+			break;
+		}
+	}
+
+	return thread;
+}
+
 
 #ifdef CONFIG_SCHED_MULTIQ
@@ -105,4 +181,43 @@ static ALWAYS_INLINE void z_priq_mq_remove(struct _priq_mq *pq,
 	}
 }
 #endif /* CONFIG_SCHED_MULTIQ */
 
+
+#ifdef CONFIG_SCHED_CPU_MASK
+static ALWAYS_INLINE struct k_thread *z_priq_dumb_mask_best(sys_dlist_t *pq)
+{
+	/* With masks enabled we need to be prepared to walk the list
+	 * looking for one we can run
+	 */
+	struct k_thread *thread;
+
+	SYS_DLIST_FOR_EACH_CONTAINER(pq, thread, base.qnode_dlist) {
+		if ((thread->base.cpu_mask & BIT(_current_cpu->id)) != 0) {
+			return thread;
+		}
+	}
+	return NULL;
+}
+#endif /* CONFIG_SCHED_CPU_MASK */
+
+#if defined(CONFIG_SCHED_DUMB) || defined(CONFIG_WAITQ_DUMB)
+static ALWAYS_INLINE void z_priq_dumb_add(sys_dlist_t *pq,
+					  struct k_thread *thread)
+{
+	struct k_thread *t;
+
+	SYS_DLIST_FOR_EACH_CONTAINER(pq, t, base.qnode_dlist) {
+		if (z_sched_prio_cmp(thread, t) > 0) {
+			sys_dlist_insert(&t->base.qnode_dlist,
+					 &thread->base.qnode_dlist);
+			return;
+		}
+	}
+
+	sys_dlist_append(pq, &thread->base.qnode_dlist);
+}
+#endif /* CONFIG_SCHED_DUMB || CONFIG_WAITQ_DUMB */
+
 #endif /* ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_ */
kernel/priority_queues.c
@@ -7,26 +7,7 @@
 #include <zephyr/kernel.h>
 #include <ksched.h>
 #include <zephyr/sys/math_extras.h>
+#include <zephyr/sys/dlist.h>
 
-void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread)
-{
-	ARG_UNUSED(pq);
-
-	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
-
-	sys_dlist_remove(&thread->base.qnode_dlist);
-}
-
-struct k_thread *z_priq_dumb_best(sys_dlist_t *pq)
-{
-	struct k_thread *thread = NULL;
-	sys_dnode_t *n = sys_dlist_peek_head(pq);
-
-	if (n != NULL) {
-		thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
-	}
-	return thread;
-}
-
 bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b)
 {
@@ -47,73 +28,3 @@ bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b)
 	       ? 1 : 0;
 	}
 }
-
-void z_priq_rb_add(struct _priq_rb *pq, struct k_thread *thread)
-{
-	struct k_thread *t;
-
-	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
-
-	thread->base.order_key = pq->next_order_key++;
-
-	/* Renumber at wraparound. This is tiny code, and in practice
-	 * will almost never be hit on real systems. BUT on very
-	 * long-running systems where a priq never completely empties
-	 * AND that contains very large numbers of threads, it can be
-	 * a latency glitch to loop over all the threads like this.
-	 */
-	if (!pq->next_order_key) {
-		RB_FOR_EACH_CONTAINER(&pq->tree, t, base.qnode_rb) {
-			t->base.order_key = pq->next_order_key++;
-		}
-	}
-
-	rb_insert(&pq->tree, &thread->base.qnode_rb);
-}
-
-void z_priq_rb_remove(struct _priq_rb *pq, struct k_thread *thread)
-{
-	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
-
-	rb_remove(&pq->tree, &thread->base.qnode_rb);
-
-	if (!pq->tree.root) {
-		pq->next_order_key = 0;
-	}
-}
-
-struct k_thread *z_priq_rb_best(struct _priq_rb *pq)
-{
-	struct k_thread *thread = NULL;
-	struct rbnode *n = rb_get_min(&pq->tree);
-
-	if (n != NULL) {
-		thread = CONTAINER_OF(n, struct k_thread, base.qnode_rb);
-	}
-	return thread;
-}
-
-struct k_thread *z_priq_mq_best(struct _priq_mq *pq)
-{
-	struct k_thread *thread = NULL;
-
-	for (int i = 0; i < PRIQ_BITMAP_SIZE; ++i) {
-		if (!pq->bitmask[i]) {
-			continue;
-		}
-
-#ifdef CONFIG_64BIT
-		sys_dlist_t *l = &pq->queues[i * 64 + u64_count_trailing_zeros(pq->bitmask[i])];
-#else
-		sys_dlist_t *l = &pq->queues[i * 32 + u32_count_trailing_zeros(pq->bitmask[i])];
-#endif
-		sys_dnode_t *n = sys_dlist_peek_head(l);
-
-		if (n != NULL) {
-			thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
-			break;
-		}
-	}
-
-	return thread;
-}
kernel/sched.c
@@ -82,43 +82,6 @@ int32_t z_sched_prio_cmp(struct k_thread *thread_1,
 	return 0;
 }
 
-#ifdef CONFIG_SCHED_CPU_MASK
-static ALWAYS_INLINE struct k_thread *_priq_dumb_mask_best(sys_dlist_t *pq)
-{
-	/* With masks enabled we need to be prepared to walk the list
-	 * looking for one we can run
-	 */
-	struct k_thread *thread;
-
-	SYS_DLIST_FOR_EACH_CONTAINER(pq, thread, base.qnode_dlist) {
-		if ((thread->base.cpu_mask & BIT(_current_cpu->id)) != 0) {
-			return thread;
-		}
-	}
-	return NULL;
-}
-#endif /* CONFIG_SCHED_CPU_MASK */
-
-#if defined(CONFIG_SCHED_DUMB) || defined(CONFIG_WAITQ_DUMB)
-static ALWAYS_INLINE void z_priq_dumb_add(sys_dlist_t *pq,
-					  struct k_thread *thread)
-{
-	struct k_thread *t;
-
-	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
-
-	SYS_DLIST_FOR_EACH_CONTAINER(pq, t, base.qnode_dlist) {
-		if (z_sched_prio_cmp(thread, t) > 0) {
-			sys_dlist_insert(&t->base.qnode_dlist,
-					 &thread->base.qnode_dlist);
-			return;
-		}
-	}
-
-	sys_dlist_append(pq, &thread->base.qnode_dlist);
-}
-#endif /* CONFIG_SCHED_DUMB || CONFIG_WAITQ_DUMB */
-
 static ALWAYS_INLINE void *thread_runq(struct k_thread *thread)
 {
 #ifdef CONFIG_SCHED_CPU_MASK_PIN_ONLY
@@ -150,11 +113,15 @@ static ALWAYS_INLINE void *curr_cpu_runq(void)
 
 static ALWAYS_INLINE void runq_add(struct k_thread *thread)
 {
+	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
+
 	_priq_run_add(thread_runq(thread), thread);
 }
 
 static ALWAYS_INLINE void runq_remove(struct k_thread *thread)
 {
+	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
+
 	_priq_run_remove(thread_runq(thread), thread);
 }
 
@@ -616,7 +583,7 @@ static void add_to_waitq_locked(struct k_thread *thread, _wait_q_t *wait_q)
 
 	if (wait_q != NULL) {
 		thread->base.pended_on = wait_q;
-		z_priq_wait_add(&wait_q->waitq, thread);
+		_priq_wait_add(&wait_q->waitq, thread);
 	}
 }