X-Git-Url: http://git.silcnet.org/gitweb/?a=blobdiff_plain;f=lib%2Fsilccrypt%2Fsha256_x86.S;h=cfc1ee83ca24290502b7930bb287cded29287085;hb=9b499de7f8fdbb24c32b8a0a84bb2fbbcdab782a;hp=58f7c6915ad3864188ad42bba5cd0cccf1d44713;hpb=1df586668c6f2f4de3bc47a285fc79bccc739084;p=crypto.git diff --git a/lib/silccrypt/sha256_x86.S b/lib/silccrypt/sha256_x86.S index 58f7c691..cfc1ee83 100644 --- a/lib/silccrypt/sha256_x86.S +++ b/lib/silccrypt/sha256_x86.S @@ -27,11 +27,30 @@ when loading the W. This is about as fast as we can get with less than 8 32-bit registers on 32-bit CPU. + Benchmarks (megabytes (MB) per second), bigger is better: + + Code P4 3.60 GHz PM 1.60 GHz Xeon 5160 3.00 GHz + ---------------------------------------------------------------------- + SHA-256, asm 110.57 MB/sec 58.50 MB/sec 146.43 MB/sec + SHA-256, gcc 49.07 MB/sec 39.55 MB/sec 82.14 MB/sec + SHA-256, icc 109.97 MB/sec 55.69 MB/sec N/A + + Notes: + - Test program was lib/silccrypt/tests/test_hash + - nice -n -20 was used with test_hash running as root + - P4 is Pentium 4, PM is Pentium M, Xeon 5160 is a 64-bit CPU but the OS + had a 32-bit kernel in the test. + - ICC generates significantly better code compared to GCC for SSE2 + capable CPUs, and the generated code uses SSE registers. Hence the + comparable speed with the assembler code. Note that the GCC code + was also compiled with -msse2. Note also that this assembler code + specifically does not use SSE or MMX, for better compatibility. 
+ */ -#include "silcdefs.h" +#include "../../cryptodefs.h" -#ifdef SILC_SHA256_ASM +#ifdef SILC_SHA256_X86 #define STACK_STATE (8 * 4) #define STACK_W (64 * 4) @@ -75,9 +94,9 @@ andl e, r5; /* Ch &= e */ \ xorl r3, r5; /* Ch ^= g */ \ \ + leal ki(r4, r5), r4; /* t0 = Sigma1 + Ch + ki */ \ movl h(%ebp), r3; \ movl d(%ebp), r1; \ - leal ki(r4, r5), r4; /* t0 = Sigma1 + Ch + ki */ \ addl W * 4(%esp), r4; /* t0 += W[i] */ \ addl r4, r3; /* h += t0 */ \ addl r3, r1; /* d += h (t0) */ \ @@ -124,9 +143,6 @@ shrl $10, r2; /* w-2 >> 10 */ \ xorl r5, r2; /* Gamma1 = w-2 ^ temp */ \ \ - addl (i - 7) * 4(%esp), r2; /* Gamma1 += w-7 */ \ - addl (i - 16) * 4(%esp), r2; /* Gamma1 += w-16 */ \ - \ movl (i - 15) * 4(%esp), r3; \ movl r3, r4; /* w-15 to Gamma0 */ \ rorl $7, r4; /* Gamma0 >>= 7 */ \ @@ -136,6 +152,8 @@ shrl $3, r3; /* w-15 >> 3 */ \ xorl r5, r3; /* Gamma0 = w-15 ^ temp */ \ \ + addl (i - 7) * 4(%esp), r2; /* Gamma1 += w-7 */ \ + addl (i - 16) * 4(%esp), r2; /* Gamma1 += w-16 */ \ addl r2, r3; /* Gamma0 += Gamma1 */ \ movl r3, i * 4(%esp); @@ -150,7 +168,7 @@ .text -.align 4 +.balign 32 .globl sha256_transform sha256_transform: pushl %ebp @@ -295,4 +313,4 @@ sha256_transform: ret -#endif /* SILC_SHA256_ASM */ +#endif /* SILC_SHA256_X86 */