X-Git-Url: http://git.silcnet.org/gitweb/?a=blobdiff_plain;f=lib%2Fsilccrypt%2Fsha256_x86.S;h=cfc1ee83ca24290502b7930bb287cded29287085;hb=9b499de7f8fdbb24c32b8a0a84bb2fbbcdab782a;hp=58f7c6915ad3864188ad42bba5cd0cccf1d44713;hpb=1df586668c6f2f4de3bc47a285fc79bccc739084;p=crypto.git

diff --git a/lib/silccrypt/sha256_x86.S b/lib/silccrypt/sha256_x86.S
index 58f7c691..cfc1ee83 100644
--- a/lib/silccrypt/sha256_x86.S
+++ b/lib/silccrypt/sha256_x86.S
@@ -27,11 +27,30 @@
    when loading the W.  This is about as fast as we can get with less than
    8 32-bit registers on 32-bit CPU.
 
+   Benchmarks (megabytes (MB) per second), bigger is better:
+
+   Code           P4 3.60 GHz      PM 1.60 GHz     Xeon 5160 3.00 GHz
+   ----------------------------------------------------------------------
+   SHA-256, asm   110.57 MB/sec    58.50 MB/sec    146.43 MB/sec
+   SHA-256, gcc    49.07 MB/sec    39.55 MB/sec     82.14 MB/sec
+   SHA-256, icc   109.97 MB/sec    55.69 MB/sec     N/A
+
+   Notes:
+   - Test program was lib/silccrypt/tests/test_hash
+   - nice -n -20 was used with test_hash running as root
+   - P4 is Pentium 4, PM is Pentium M; the Xeon 5160 is a 64-bit CPU, but
+     the OS was running a 32-bit kernel in the test.
+   - ICC generates significantly better code than GCC for SSE2-capable
+     CPUs, and the generated code uses SSE registers; hence its speed is
+     comparable to the assembler code.  Note that the GCC code was also
+     compiled with -msse2, and that this assembler code deliberately
+     does not use SSE or MMX, for better compatibility.
+
 */
 
-#include "silcdefs.h"
+#include "../../cryptodefs.h"
 
-#ifdef SILC_SHA256_ASM
+#ifdef SILC_SHA256_X86
 
 #define STACK_STATE	(8 * 4)
 #define STACK_W		(64 * 4)
@@ -75,9 +94,9 @@
   	andl    e,   r5;		/* Ch &= e */			\
   	xorl    r3,  r5;		/* Ch ^= g */			\
 									\
+  	leal    ki(r4, r5), r4;		/* t0 = Sigma1 + Ch + ki */	\
 	movl    h(%ebp), r3;						\
   	movl    d(%ebp), r1;						\
-  	leal    ki(r4, r5), r4;		/* t0 = Sigma1 + Ch + ki */	\
   	addl    W * 4(%esp), r4;	/* t0 += W[i] */		\
   	addl    r4, r3;			/* h += t0 */			\
   	addl    r3, r1;			/* d += h (t0) */		\
@@ -124,9 +143,6 @@
 	shrl	$10, r2;		/* w-2 >> 10 */			\
 	xorl    r5,  r2;		/* Gamma1 = w-2 ^ temp */	\
 									\
-	addl    (i - 7) * 4(%esp), r2;	/* Gamma1 += w-7 */		\
-	addl	(i - 16) * 4(%esp), r2;	/* Gamma1 += w-16 */		\
-									\
   	movl    (i - 15) * 4(%esp), r3;					\
 	movl    r3,  r4;		/* w-15 to Gamma0 */		\
 	rorl    $7,  r4;		/* Gamma0 >>= 7 */		\
@@ -136,6 +152,8 @@
 	shrl	$3,  r3;		/* w-15 >> 3 */			\
 	xorl    r5,  r3;		/* Gamma0 = w-15 ^ temp */	\
 									\
+	addl    (i - 7) * 4(%esp), r2;	/* Gamma1 += w-7 */		\
+	addl	(i - 16) * 4(%esp), r2;	/* Gamma1 += w-16 */		\
 	addl    r2,  r3;		/* Gamma0 += Gamma1 */		\
 	movl    r3, i * 4(%esp);
 
@@ -150,7 +168,7 @@
 
 
 .text
-.align 4
+.balign 32
 .globl sha256_transform
 sha256_transform:
 	pushl	%ebp
@@ -295,4 +313,4 @@ sha256_transform:
 
 	ret
 
-#endif /* SILC_SHA256_ASM */
+#endif /* SILC_SHA256_X86 */