mempool: significant reduction of memory waste

The mempool allocator implementation recursively breaks a memory block
into 4 sub-blocks until it reaches the smallest block size that still
fits the requested memory size.

The size of each sub-block is rounded up to the next word boundary to
preserve word alignment of the returned memory, and this is a problem.
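
(As a reference, the word-alignment helpers used below could be
sketched as follows, assuming a 4-byte word; the actual macros round
to the platform's word size.)

    #define WB_UP(x)  (((x) + 3) & ~3)  /* round up to the next word boundary */
    #define WB_DN(x)  ((x) & ~3)        /* round down to the previous one */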

Let's consider max_sz = 2072 and n_max = 1. That's our level 0.

At level 1, we get one level-0 block split into 4 sub-blocks whose
size is WB_UP(2072 / 4) = 520. However, 4 * 520 = 2080, so we must
discard the 4th sub-block since it doesn't fit inside our 2072-byte
parent block.

We're down to 3 * 520 = 1560 bytes of usable memory.
Our memory usage efficiency is now 1560 / 2072 = 75%.

At level 2, we get 3 level-1 blocks, and each of them may be split
into 4 sub-blocks whose size is WB_UP(520 / 4) = 132. But 4 * 132 =
528, so the 4th sub-block has to be discarded again.

We're down to 9 * 132 = 1188 bytes of usable memory.
Our memory usage efficiency is now 1188 / 2072 = 57%.

At level 3, we get 9 level-2 blocks, each split into sub-blocks of
WB_UP(132 / 4) = 36 bytes. Again, 4 * 36 = 144, so the 4th sub-block
is discarded.

We're down to 27 * 36 = 972 bytes of usable memory.
Our memory usage efficiency is now 972 / 2072 = 47%.
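
This degradation can be checked with a small standalone sketch, not
part of this patch, using the 4-byte-word WB_UP above:

    #include <stdio.h>
    #include <stddef.h>

    #define WB_UP(x) (((x) + 3) & ~3)  /* assuming a 4-byte word */

    int main(void)
    {
        size_t max_sz = 2072, sz = max_sz, nblocks = 1;

        for (int level = 1; level <= 3; level++) {
            sz = WB_UP(sz / 4);
            nblocks *= 3;  /* the 4th sub-block never fits in this example */
            printf("level %d: %zu x %zu = %zu bytes (%.0f%%)\n",
                   level, nblocks, sz, nblocks * sz,
                   100.0 * nblocks * sz / max_sz);
        }
        return 0;
    }

It prints 75%, 57% and 47% for levels 1 to 3.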

What should be done instead is to round sub-block sizes _down_, not
_up_. This way, sub-blocks still align to word boundaries, and they
always fit within their parent block, since their total size can no
longer exceed the parent's size.

Using the same max_sz = 2072 would yield a memory usage efficiency of
99% at level 3, so let's demonstrate a worst case with 2044 instead.

Level 1: 4 sub-blocks of WB_DN(2044 / 4) = 508 bytes.
We're down to 4 * 508 = 2032 bytes of usable memory.
Our memory usage efficiency is now 2032 / 2044 = 99%.

Level 2: 4 * 4 sub-blocks of WB_DN(508 / 4) = 124 bytes.
We're down to 16 * 124 = 1984 bytes of usable memory.
Our memory usage efficiency is now 1984 / 2044 = 97%.

Level 3: 16 * 4 sub-blocks of WB_DN(124 / 4) = 28 bytes.
We're down to 64 * 28 = 1792 bytes of usable memory.
Our memory usage efficiency is now 1792 / 2044 = 88%.
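
The same sketch with WB_DN and no discarded sub-blocks reproduces
these numbers (again illustration only):

    #include <stdio.h>
    #include <stddef.h>

    #define WB_DN(x) ((x) & ~3)  /* assuming a 4-byte word */

    int main(void)
    {
        size_t max_sz = 2044, sz = max_sz, nblocks = 1;

        for (int level = 1; level <= 3; level++) {
            sz = WB_DN(sz / 4);
            nblocks *= 4;  /* no sub-block is ever discarded */
            printf("level %d: %zu x %zu = %zu bytes (%.0f%%)\n",
                   level, nblocks, sz, nblocks * sz,
                   100.0 * nblocks * sz / max_sz);
        }
        return 0;
    }

It prints 99%, 97% and 88% for levels 1 to 3.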

Conclusion: if max_sz is a power of 2 then we get 100% efficiency at
all levels in both cases. But if not, then the rounding-up method has
a far worse degradation curve than the rounding-down method, wasting
more than 50% of memory in some cases.

So let's round sub-block sizes down rather than up, and remove
block_fits(), whose purpose was to identify sub-blocks that didn't
fit within their parent block and which is now useless.
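
Why block_fits() becomes redundant: with rounding down we have
4 * WB_DN(sz / 4) <= 4 * (sz / 4) <= sz for any sz, so all 4
sub-blocks end within their parent by construction. A brute-force
self-check of that invariant (illustration only, same 4-byte-word
assumption):

    #include <assert.h>
    #include <stddef.h>

    #define WB_DN(x) ((x) & ~3)  /* assuming a 4-byte word */

    int main(void)
    {
        /* the invariant that makes block_fits() redundant */
        for (size_t sz = 0; sz < (1 << 20); sz++) {
            assert(4 * WB_DN(sz / 4) <= sz);
        }
        return 0;
    }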

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
commit 629bd85612, parent 6609c12516
Nicolas Pitre, 2019-07-11 16:35:29 -04:00, committed by Andrew Boie
2 files changed, 7 insertions(+), 37 deletions(-)

@@ -109,7 +109,7 @@ void *realloc(void *ptr, size_t requested_size)
 	 */
 	block_size = blk->pool->base.max_sz;
 	for (int i = 1; i <= blk->level; i++) {
-		block_size = WB_UP(block_size / 4);
+		block_size = WB_DN(block_size / 4);
 	}
 
 	/* We really need this much memory */

@@ -67,30 +67,6 @@ static int partner_bits(struct sys_mem_pool_base *p, int level, int bn)
 	return (*word >> (4*(bit / 4))) & 0xf;
 }
 
-static size_t buf_size(struct sys_mem_pool_base *p)
-{
-	return p->n_max * p->max_sz;
-}
-
-static bool block_fits(struct sys_mem_pool_base *p,
-		       int lvl, int bn, size_t *lsizes)
-{
-	u8_t *parent, *block_end;
-	size_t parent_sz;
-
-	block_end = (u8_t *)block_ptr(p, lsizes[lvl], bn) + lsizes[lvl];
-
-	if (lvl == 0) {
-		parent_sz = buf_size(p);
-		parent = p->buf;
-	} else {
-		parent_sz = lsizes[lvl - 1];
-		parent = block_ptr(p, lsizes[lvl - 1], bn / 4);
-	}
-
-	return block_end <= (parent + parent_sz);
-}
-
 void z_sys_mem_pool_base_init(struct sys_mem_pool_base *p)
 {
 	int i;
@@ -111,7 +87,7 @@ void z_sys_mem_pool_base_init(struct sys_mem_pool_base *p)
 			bits += (nblocks + 31)/32;
 		}
 
-		sz = WB_UP(sz / 4);
+		sz = WB_DN(sz / 4);
 	}
 
 	for (i = 0; i < p->n_max; i++) {
@@ -175,8 +151,6 @@ static unsigned int bfree_recombine(struct sys_mem_pool_base *p, int level,
 	int i, lsz = lsizes[level];
 	void *block = block_ptr(p, lsz, bn);
 
-	__ASSERT(block_fits(p, level, bn, lsizes), "");
-
 	/* Put it back */
 	set_free_bit(p, level, bn);
 	sys_dlist_append(&p->levels[level].free_list, block);
@@ -193,10 +167,8 @@ static unsigned int bfree_recombine(struct sys_mem_pool_base *p, int level,
 		for (i = 0; i < 4; i++) {
 			int b = (bn & ~3) + i;
 
-			if (block_fits(p, level, b, lsizes)) {
-				clear_free_bit(p, level, b);
-				sys_dlist_remove(block_ptr(p, lsz, b));
-			}
+			clear_free_bit(p, level, b);
+			sys_dlist_remove(block_ptr(p, lsz, b));
 		}
 
 		/* Free the larger block */
@@ -234,9 +206,7 @@ static void *block_break(struct sys_mem_pool_base *p, void *block, int l,
 		void *block2 = (lsz * i) + (char *)block;
 
 		set_free_bit(p, l + 1, lbn);
-		if (block_fits(p, l + 1, lbn, lsizes)) {
-			sys_dlist_append(&p->levels[l + 1].free_list, block2);
-		}
+		sys_dlist_append(&p->levels[l + 1].free_list, block2);
 	}
 
 	return block;
@@ -259,7 +229,7 @@ int z_sys_mem_pool_block_alloc(struct sys_mem_pool_base *p, size_t size,
 	lsizes[0] = p->max_sz;
 	for (i = 0; i < p->n_levels; i++) {
 		if (i > 0) {
-			lsizes[i] = WB_UP(lsizes[i-1] / 4);
+			lsizes[i] = WB_DN(lsizes[i-1] / 4);
 		}
 
 		if (lsizes[i] < size) {
@@ -331,7 +301,7 @@ void z_sys_mem_pool_block_free(struct sys_mem_pool_base *p, u32_t level,
 	 */
 	lsizes[0] = p->max_sz;
 	for (i = 1; i <= level; i++) {
-		lsizes[i] = WB_UP(lsizes[i-1] / 4);
+		lsizes[i] = WB_DN(lsizes[i-1] / 4);
 	}
 
 	block_free(p, level, lsizes, block);