libc/printf: Use compiler-provided 64 bit math, phase 1

The _to_float() implementation had a somewhat kludgey hand-written 64 bit math implementation, which is unhelpful on Zephyr as all our toolchains provide a working uint64_t runtime. This is at best just dupicated code from libgcc, and at worst less efficient. This patch replaces the existing 64 bit minilibrary but keeps the uint32_t[2] API as is for ease of validation and review. One exception is _ldiv5, a specialized divide-by-five implementation. The 64 bit division routines are large on some architectures (ARM and ARC in particular), not pulled in by a default Zephyr build, and will swamp the benefit from this patch. So this includes a refactored/improved _ldiv5 which leverages libgcc for multiword shifts instead of just using raw division. Note also the "noinline" attribute on _ladd(). This is a workaround for an apparent compiler bug when built with -Og or -Os (hand-hacking the Makefiles to build with -O0 works), perhaps due to my aliasing the int array with a long long. This will go away in phase 2. Change-Id: I63e8c82dabe2bfaa75b63ddb59e5f11d51be538e Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
2016-08-15 11:31:58 -07:00 · 2016-08-15 11:31:58 -07:00 · d06eea4eda
parent 073cfddd0f
commit d06eea4eda
1 changed files with 42 additions and 86 deletions
--- a/lib/libc/minimal/source/stdout/prf.c
+++ b/lib/libc/minimal/source/stdout/prf.c
@ -139,113 +139,69 @@ static int _to_dec(char *buf, int32_t value, int fplus, int fspace, int precisio

 static void _llshift(uint32_t value[])
 {
-	if (value[0] & 0x80000000)
-		value[1] = (value[1] << 1) | 1;
-	else
-		value[1] <<= 1;
-	value[0] <<= 1;
+	*((uint64_t *)&value[0]) <<= 1;
 }

-static void _lrshift(uint32_t value[])
+static void __attribute__((noinline)) _ladd(uint32_t *result, uint32_t *value)
 {
-	if (value[1] & 1)
-		value[0] = (value[0] >> 1) | 0x80000000;
-	else
-		value[0] = (value[0] >> 1) & 0x7FFFFFFF;
-	value[1] = (value[1] >> 1) & 0x7FFFFFFF;
-}
-
-static void _ladd(uint32_t result[], uint32_t value[])
-{
-	uint32_t carry;
-	uint32_t temp;
-
-	carry = 0;
-	temp = result[0] + value[0];
-	if (result[0] & 0x80000000) {
-		if ((value[0] & 0x80000000) || ((temp & 0x80000000) == 0))
-			carry = 1;
-	} else {
-		if ((value[0] & 0x80000000) && ((temp & 0x80000000) == 0))
-			carry = 1;
-	}
-	result[0] = temp;
-	result[1] = result[1] + value[1] + carry;
+	uint64_t *r = (uint64_t *)&result[0];
+	uint64_t *v = (uint64_t *)&value[0];
+	*r = *r + *v;
 }

 static	void _rlrshift(uint32_t value[])
 {
-	uint32_t temp[2];
-
-	temp[0] = value[0] & 1;
-	temp[1] = 0;
-	_lrshift(value);
-	_ladd(value, temp);
+	uint64_t *v = (uint64_t *)&value[0];
+	*v = (*v & 1) + (*v >> 1);
 }

- /*
-  * 64 bit divide by 5 function for _to_float.
-  * The result is ROUNDED, not TRUNCATED.
-  */
-
-static	void _ldiv5(uint32_t value[])
+/* Tiny integer divide-by-five routine.  The full 64 bit division
+ * implementations in libgcc are very large on some architectures, and
+ * currently nothing in Zephyr pulls it into the link.  So it makes
+ * sense to define this much smaller special case here to avoid
+ * including it just for printf.
+ *
+ * It works by iteratively dividing the most significant 32 bits of
+ * the 64 bit value by 5.  This will leave a remainder of 0-4
+ * (i.e. three significant bits), ensuring that the top 29 bits of the
+ * remainder are zero for the next iteration.  Thus in the second
+ * iteration only 35 significant bits remain, and in the third only
+ * six.  This was tested exhaustively through the first ~10B values in
+ * the input space, and for ~2e12 (4 hours runtime) random inputs
+ * taken from the full 64 bit space.
+ */
+static void _ldiv5(uint32_t value[])
 {
-	uint32_t      result[2];
-	register int  shift;
-	uint32_t      temp1[2];
-	uint32_t      temp2[2];
+	uint64_t *v = (uint64_t *)&value[0];
+	uint32_t i, hi;
+	uint64_t rem = *v, quot = 0, q;
+	static const char shifts[] = { 32, 3, 0 };

-	result[0] = 0;		/* Result accumulator */
-	result[1] = value[1] / 5;
-	temp1[0] = value[0];	/* Dividend for this pass */
-	temp1[1] = value[1] % 5;
-	temp2[1] = 0;
+	/* Usage in this file wants rounded behavior, not truncation.  So add
+	 * two to get the threshold right.
+	 */
+	rem += 2;

-	while (1) {
-		for (shift = 0; temp1[1] != 0; shift++)
-			_lrshift(temp1);
-		temp2[0] = temp1[0] / 5;
-		if (temp2[0] == 0) {
-			if (temp1[0] % 5 > (5 / 2)) {
-				temp1[0] = 1;
-				_ladd(result, temp1);
-			}
-			break;
-		}
-		temp1[0] = temp2[0];
-		while (shift-- != 0)
-			_llshift(temp1);
-		_ladd(result, temp1);	/* Update result accumulator */
-		temp1[0] = result[0];
-		temp1[1] = result[1];
-		_llshift(temp1);	/* Compute (current_result*5) */
-		_llshift(temp1);
-		_ladd(temp1, result);
-		temp1[0] = ~temp1[0];	/* Compute -(current_result*5) */
-		temp1[1] = ~temp1[1];
-		temp2[0] = 1;
-		_ladd(temp1, temp2);
-		_ladd(temp1, value);	/* Compute #-(current_result*5) */
+	for (i = 0; i < 3; i++) {
+		hi = rem >> shifts[i];
+		q = (uint64_t)(hi / 5) << shifts[i];
+		rem -= q * 5;
+		quot += q;
 	}
-	value[0] = result[0];
-	value[1] = result[1];
+
+	*v = quot;
 }

 static	char _get_digit(uint32_t fract[], int *digit_count)
 {
 	int		rval;
-	uint32_t	temp[2];
+	uint64_t *fr = (uint64_t *)&fract[0];

 	if (*digit_count > 0) {
 		*digit_count -= 1;
-		temp[0] = fract[0];
-		temp[1] = fract[1];
-		_llshift(fract);	/* Multiply by 10 */
-		_llshift(fract);
-		_ladd(fract, temp);
-		_llshift(fract);
-		rval = ((fract[1] >> 28) & 0xF) + '0';
-		fract[1] &= 0x0FFFFFFF;
+		*fr = *fr * 10;
+		rval = ((*fr >> 60) & 0xF) + '0';
+		*fr &= 0x0FFFFFFFFFFFFFFFull;
 	} else
 		rval = '0';
 	return (char) (rval);