kernel: priority queues: declare as static inlines

After the move to C files we saw a drop in performance when running latency_measure. This patch declares the priority queue functions as static inlines, with minor optimizations. The results for one metric (on qemu) are: on 3.6, before anything was changed: Get data from LIFO (w/ ctx switch): 13087 ns; after the original change (46484da502): Get data from LIFO (w/ ctx switch): 13663 ns; with this change: Get data from LIFO (w/ ctx switch): 12543 ns. So overall, a net gain of ~500 ns that can be seen across the board on many of the metrics. Signed-off-by: Anas Nashif <anas.nashif@intel.com>
2024-04-11 11:59:07 -04:00 · 2024-04-11 11:59:07 -04:00 · 4593f0d71c
parent 0b8714bcde
commit 4593f0d71c
3 changed files with 141 additions and 148 deletions
--- a/kernel/include/priority_q.h
+++ b/kernel/include/priority_q.h
@ -7,13 +7,20 @@
 #ifndef ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_
 #define ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_

+#include <zephyr/sys/math_extras.h>
+#include <zephyr/sys/dlist.h>

-/* Dump Scheduling */
+extern int32_t z_sched_prio_cmp(struct k_thread *thread_1,
+	struct k_thread *thread_2);
+
+bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b);
+
+/* Dumb Scheduling */
 #if defined(CONFIG_SCHED_DUMB)
 #define _priq_run_add		z_priq_dumb_add
 #define _priq_run_remove	z_priq_dumb_remove
 # if defined(CONFIG_SCHED_CPU_MASK)
-#  define _priq_run_best	_priq_dumb_mask_best
+#  define _priq_run_best	z_priq_dumb_mask_best
 # else
 #  define _priq_run_best	z_priq_dumb_best
 # endif /* CONFIG_SCHED_CPU_MASK */
@ -25,11 +32,11 @@
 /* Multi Queue Scheduling */
 #elif defined(CONFIG_SCHED_MULTIQ)

-# if defined(CONFIG_64BIT)
-#  define NBITS 64
-# else
-#  define NBITS 32
-# endif
+#if defined(CONFIG_64BIT)
+#define NBITS 64
+#else
+#define NBITS 32
+#endif /* CONFIG_64BIT */

 #define _priq_run_add		z_priq_mq_add
 #define _priq_run_remove	z_priq_mq_remove
@ -40,30 +47,99 @@ static ALWAYS_INLINE void z_priq_mq_remove(struct _priq_mq *pq, struct k_thread

 /* Scalable Wait Queue */
 #if defined(CONFIG_WAITQ_SCALABLE)
-#define z_priq_wait_add		z_priq_rb_add
+#define _priq_wait_add		z_priq_rb_add
 #define _priq_wait_remove	z_priq_rb_remove
 #define _priq_wait_best		z_priq_rb_best
-/* Dump Wait Queue */
+/* Dumb Wait Queue */
 #elif defined(CONFIG_WAITQ_DUMB)
-#define z_priq_wait_add		z_priq_dumb_add
+#define _priq_wait_add		z_priq_dumb_add
 #define _priq_wait_remove	z_priq_dumb_remove
 #define _priq_wait_best		z_priq_dumb_best
 #endif

-/* Dumb Scheduling*/
-struct k_thread *z_priq_dumb_best(sys_dlist_t *pq);
-void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread);
+static ALWAYS_INLINE void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread)
+{ /* Unlink thread's dlist node (base.qnode_dlist) from the list it is on. */
+	ARG_UNUSED(pq); /* never read: sys_dlist_remove() is given only the node */

+	sys_dlist_remove(&thread->base.qnode_dlist);
+}

-/* Multi Queue Scheduling */
-struct k_thread *z_priq_mq_best(struct _priq_mq *pq);
-struct k_thread *z_priq_rb_best(struct _priq_rb *pq);
+static ALWAYS_INLINE struct k_thread *z_priq_dumb_best(sys_dlist_t *pq)
+{ /* Return the thread at the head of pq, or NULL if pq has no head node. */
+	struct k_thread *thread = NULL;
+	sys_dnode_t *n = sys_dlist_peek_head(pq);

+	if (n != NULL) { /* map the embedded list node back to its k_thread */
+		thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
+	}
+	return thread;
+}

-bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b);
+static ALWAYS_INLINE void z_priq_rb_add(struct _priq_rb *pq, struct k_thread *thread)
+{ /* Stamp thread with the queue's next order key, then insert it in the tree. */
+	struct k_thread *t;
+
+	thread->base.order_key = pq->next_order_key++;
+
+	/* Renumber at wraparound.  This is tiny code, and in practice
+	 * will almost never be hit on real systems.  BUT on very
+	 * long-running systems where a priq never completely empties
+	 * AND that contains very large numbers of threads, it can be
+	 * a latency glitch to loop over all the threads like this.
+	 *
+	 * The counter reading zero right after the increment above is
+	 * what signals the wrap.  The loop re-keys only the threads
+	 * already in the tree, counting up from zero; the new thread
+	 * is inserted afterwards.
+	 *
+	 * NOTE(review): the new thread keeps the key it was given
+	 * before the wrap, while later additions restart from small
+	 * keys -- confirm the resulting ordering is intended.
+	 */
+	if (!pq->next_order_key) {
+		RB_FOR_EACH_CONTAINER(&pq->tree, t, base.qnode_rb) {
+			t->base.order_key = pq->next_order_key++;
+		}
+	}
+
+	rb_insert(&pq->tree, &thread->base.qnode_rb);
+}
+
+static ALWAYS_INLINE void z_priq_rb_remove(struct _priq_rb *pq, struct k_thread *thread)
+{ /* Remove thread's node from the tree; reset the key counter once no root remains. */
+	rb_remove(&pq->tree, &thread->base.qnode_rb);
+
+	if (!pq->tree.root) { /* no root node remains, so key numbering restarts at 0 */
+		pq->next_order_key = 0;
+	}
+}
+
+static ALWAYS_INLINE struct k_thread *z_priq_rb_best(struct _priq_rb *pq)
+{ /* Return the thread owning the node rb_get_min() yields, or NULL if it yields none. */
+	struct k_thread *thread = NULL;
+	struct rbnode *n = rb_get_min(&pq->tree);
+
+	if (n != NULL) { /* map the embedded tree node back to its k_thread */
+		thread = CONTAINER_OF(n, struct k_thread, base.qnode_rb);
+	}
+	return thread;
+}
+
+static ALWAYS_INLINE struct k_thread *z_priq_mq_best(struct _priq_mq *pq)
+{ /* Return the first head thread found by scanning the bitmask words, or NULL. */
+	struct k_thread *thread = NULL;
+
+	for (int i = 0; i < PRIQ_BITMAP_SIZE; ++i) {
+		if (!pq->bitmask[i]) { /* no bit set in word i: move on to the next word */
+			continue;
+		}
+
+#ifdef CONFIG_64BIT
+		sys_dlist_t *l = &pq->queues[i * 64 + u64_count_trailing_zeros(pq->bitmask[i])];
+#else
+		sys_dlist_t *l = &pq->queues[i * 32 + u32_count_trailing_zeros(pq->bitmask[i])];
+#endif
+		sys_dnode_t *n = sys_dlist_peek_head(l); /* l: queue picked by the lowest set bit of word i */
+
+		if (n != NULL) { /* only this one queue is examined per word */
+			thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
+			break;
+		}
+	}
+
+	return thread;
+}


 #ifdef CONFIG_SCHED_MULTIQ
@ -105,4 +181,43 @@ static ALWAYS_INLINE void z_priq_mq_remove(struct _priq_mq *pq,
 	}
 }
 #endif /* CONFIG_SCHED_MULTIQ */
+
+
+
+#ifdef CONFIG_SCHED_CPU_MASK
+static ALWAYS_INLINE struct k_thread *z_priq_dumb_mask_best(sys_dlist_t *pq)
+{
+	/* With masks enabled we need to be prepared to walk the list
+	 * looking for one we can run
+	 *
+	 * Returns the first thread, in list order, whose cpu_mask has
+	 * the bit for _current_cpu->id set; NULL if no thread does.
+	 */
+	struct k_thread *thread;
+
+	SYS_DLIST_FOR_EACH_CONTAINER(pq, thread, base.qnode_dlist) {
+		if ((thread->base.cpu_mask & BIT(_current_cpu->id)) != 0) {
+			return thread;
+		}
+	}
+	return NULL;
+}
+#endif /* CONFIG_SCHED_CPU_MASK */
+
+
+#if defined(CONFIG_SCHED_DUMB) || defined(CONFIG_WAITQ_DUMB)
+static ALWAYS_INLINE void z_priq_dumb_add(sys_dlist_t *pq,
+					  struct k_thread *thread)
+{ /* Insert thread ahead of the first entry it compares greater than, else append it. */
+	struct k_thread *t;
+
+	SYS_DLIST_FOR_EACH_CONTAINER(pq, t, base.qnode_dlist) {
+		if (z_sched_prio_cmp(thread, t) > 0) { /* presumably: thread outranks t -- verify */
+			sys_dlist_insert(&t->base.qnode_dlist,
+					 &thread->base.qnode_dlist);
+			return;
+		}
+	}
+
+	sys_dlist_append(pq, &thread->base.qnode_dlist); /* no such entry: goes to the tail */
+}
+#endif /* CONFIG_SCHED_DUMB || CONFIG_WAITQ_DUMB */
+
 #endif /* ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_ */
--- a/kernel/priority_queues.c
+++ b/kernel/priority_queues.c
@ -7,26 +7,7 @@
 #include <zephyr/kernel.h>
 #include <ksched.h>
 #include <zephyr/sys/math_extras.h>
-
-void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread)
-{
-	ARG_UNUSED(pq);
-
-	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
-
-	sys_dlist_remove(&thread->base.qnode_dlist);
-}
-
-struct k_thread *z_priq_dumb_best(sys_dlist_t *pq)
-{
-	struct k_thread *thread = NULL;
-	sys_dnode_t *n = sys_dlist_peek_head(pq);
-
-	if (n != NULL) {
-		thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
-	}
-	return thread;
-}
+#include <zephyr/sys/dlist.h>

 bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b)
 {
@ -47,73 +28,3 @@ bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b)
 			? 1 : 0;
 	}
 }
-
-void z_priq_rb_add(struct _priq_rb *pq, struct k_thread *thread)
-{
-	struct k_thread *t;
-
-	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
-
-	thread->base.order_key = pq->next_order_key++;
-
-	/* Renumber at wraparound.  This is tiny code, and in practice
-	 * will almost never be hit on real systems.  BUT on very
-	 * long-running systems where a priq never completely empties
-	 * AND that contains very large numbers of threads, it can be
-	 * a latency glitch to loop over all the threads like this.
-	 */
-	if (!pq->next_order_key) {
-		RB_FOR_EACH_CONTAINER(&pq->tree, t, base.qnode_rb) {
-			t->base.order_key = pq->next_order_key++;
-		}
-	}
-
-	rb_insert(&pq->tree, &thread->base.qnode_rb);
-}
-
-void z_priq_rb_remove(struct _priq_rb *pq, struct k_thread *thread)
-{
-	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
-
-	rb_remove(&pq->tree, &thread->base.qnode_rb);
-
-	if (!pq->tree.root) {
-		pq->next_order_key = 0;
-	}
-}
-
-struct k_thread *z_priq_rb_best(struct _priq_rb *pq)
-{
-	struct k_thread *thread = NULL;
-	struct rbnode *n = rb_get_min(&pq->tree);
-
-	if (n != NULL) {
-		thread = CONTAINER_OF(n, struct k_thread, base.qnode_rb);
-	}
-	return thread;
-}
-
-struct k_thread *z_priq_mq_best(struct _priq_mq *pq)
-{
-	struct k_thread *thread = NULL;
-
-	for (int i = 0; i < PRIQ_BITMAP_SIZE; ++i) {
-		if (!pq->bitmask[i]) {
-			continue;
-		}
-
-#ifdef CONFIG_64BIT
-		sys_dlist_t *l = &pq->queues[i * 64 + u64_count_trailing_zeros(pq->bitmask[i])];
-#else
-		sys_dlist_t *l = &pq->queues[i * 32 + u32_count_trailing_zeros(pq->bitmask[i])];
-#endif
-		sys_dnode_t *n = sys_dlist_peek_head(l);
-
-		if (n != NULL) {
-			thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
-			break;
-		}
-	}
-
-	return thread;
-}
--- a/kernel/sched.c
+++ b/kernel/sched.c
@ -82,43 +82,6 @@ int32_t z_sched_prio_cmp(struct k_thread *thread_1,
 	return 0;
 }

-#ifdef CONFIG_SCHED_CPU_MASK
-static ALWAYS_INLINE struct k_thread *_priq_dumb_mask_best(sys_dlist_t *pq)
-{
-	/* With masks enabled we need to be prepared to walk the list
-	 * looking for one we can run
-	 */
-	struct k_thread *thread;
-
-	SYS_DLIST_FOR_EACH_CONTAINER(pq, thread, base.qnode_dlist) {
-		if ((thread->base.cpu_mask & BIT(_current_cpu->id)) != 0) {
-			return thread;
-		}
-	}
-	return NULL;
-}
-#endif /* CONFIG_SCHED_CPU_MASK */
-
-#if defined(CONFIG_SCHED_DUMB) || defined(CONFIG_WAITQ_DUMB)
-static ALWAYS_INLINE void z_priq_dumb_add(sys_dlist_t *pq,
-					  struct k_thread *thread)
-{
-	struct k_thread *t;
-
-	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));
-
-	SYS_DLIST_FOR_EACH_CONTAINER(pq, t, base.qnode_dlist) {
-		if (z_sched_prio_cmp(thread, t) > 0) {
-			sys_dlist_insert(&t->base.qnode_dlist,
-					 &thread->base.qnode_dlist);
-			return;
-		}
-	}
-
-	sys_dlist_append(pq, &thread->base.qnode_dlist);
-}
-#endif /* CONFIG_SCHED_DUMB || CONFIG_WAITQ_DUMB */
-
 static ALWAYS_INLINE void *thread_runq(struct k_thread *thread)
 {
 #ifdef CONFIG_SCHED_CPU_MASK_PIN_ONLY
@ -150,11 +113,15 @@ static ALWAYS_INLINE void *curr_cpu_runq(void)

 static ALWAYS_INLINE void runq_add(struct k_thread *thread)
 {
+	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread)); /* an idle thread object must not be passed here */
+
 	_priq_run_add(thread_runq(thread), thread); /* backend is whatever _priq_run_add expands to */
 }

 static ALWAYS_INLINE void runq_remove(struct k_thread *thread)
 {
+	__ASSERT_NO_MSG(!z_is_idle_thread_object(thread)); /* an idle thread object must not be passed here */
+
 	_priq_run_remove(thread_runq(thread), thread); /* backend is whatever _priq_run_remove expands to */
 }

@ -616,7 +583,7 @@ static void add_to_waitq_locked(struct k_thread *thread, _wait_q_t *wait_q)

 	if (wait_q != NULL) {
 		thread->base.pended_on = wait_q;
-		z_priq_wait_add(&wait_q->waitq, thread);
+		_priq_wait_add(&wait_q->waitq, thread);
 	}
 }