Optimizations.

author Pekka Riikonen <priikone@silcnet.org>

Mon, 3 Sep 2007 13:41:09 +0000 (13:41 +0000)

committer Pekka Riikonen <priikone@silcnet.org>

Mon, 3 Sep 2007 13:41:09 +0000 (13:41 +0000)
author Pekka Riikonen <priikone@silcnet.org>
Mon, 3 Sep 2007 13:41:09 +0000 (13:41 +0000)
committer Pekka Riikonen <priikone@silcnet.org>
Mon, 3 Sep 2007 13:41:09 +0000 (13:41 +0000)
diff --git a/lib/silccrypt/configure.ad b/lib/silccrypt/configure.ad

index 721bea2aca855da7f26da32ae1ffbabd8f6a566a..01434b039b8a890702cbd3a37b1763bf165e13e4 100644 (file)
--- a/lib/silccrypt/configure.ad
+++ b/lib/silccrypt/configure.ad
@@ -25,7 +25,7 @@ aes_asm=false
  
  case "$host_cpu" in
    i?86)
-    AC_DEFINE([SILC_SHA256_ASM], [], [SILC_SHA256_ASM])
+    AC_DEFINE([SILC_SHA256_X86], [], [SILC_SHA256_X86])
  
      if test "${pic_mode:-default}" != "yes" ; then
        # Don't enable ASM AES with shared libs as the code doesn't support PIC.
diff --git a/lib/silccrypt/sha256.c b/lib/silccrypt/sha256.c

index 4b149a8d288508454caa6ffaaad4236c93bafbb6..3a5abc45b957936ad98ea3a57e747a9dd66057ac 100644 (file)
--- a/lib/silccrypt/sha256.c
+++ b/lib/silccrypt/sha256.c
@@ -53,7 +53,7 @@ SILC_HASH_API_CONTEXT_LEN(sha256)
  #define Gamma0(x)       (S(x, 7) ^ S(x, 18) ^ R(x, 3))
  #define Gamma1(x)       (S(x, 17) ^ S(x, 19) ^ R(x, 10))
  
-#ifndef SILC_SHA256_ASM
+#ifndef SILC_SHA256_X86
  
  /* Transform 512-bits */
  void  sha256_transform(SilcUInt32 *state, unsigned char *buf)
@@ -155,7 +155,7 @@ void  sha256_transform(SilcUInt32 *state, unsigned char *buf)
    }
  }
  
-#endif /* !SILC_SHA256_ASM */
+#endif /* !SILC_SHA256_X86 */
  
  int sha256_init(sha256_state * md)
  {
diff --git a/lib/silccrypt/sha256_x86.S b/lib/silccrypt/sha256_x86.S

index 58f7c6915ad3864188ad42bba5cd0cccf1d44713..85166abeff84750b158bfaaa7413660c88a8ad97 100644 (file)
--- a/lib/silccrypt/sha256_x86.S
+++ b/lib/silccrypt/sha256_x86.S
@@ -27,11 +27,28 @@
     when loading the W.  This is about as fast as we can get with less than
     8 32-bit registers on 32-bit CPU.
  
+   Benchmarks (megabytes (MB) per second), bigger is better:
+
+   Code            Pentium 4 3.60 GHz   Pentium M 1.60 GHz
+   -----------------------------------------------------------------------
+   SHA-256, asm    110.57 MB/sec        58.50 MB/sec
+   SHA-256, gcc     49.07 MB/sec        39.55 MB/sec
+   SHA-256, icc    109.97 MB/sec        55.69 MB/sec
+
+   Notes:
+   - Test program was lib/silccrypt/tests/test_hash
+   - nice -n -20 was used with test_hash running as root
+   - ICC generates significantly better code than GCC for SSE2-capable
+     CPUs, and the generated code uses SSE registers; hence its speed is
+     comparable to the assembler code.  Note that the GCC code was also
+     compiled with -msse2, and that this assembler code deliberately
+     does not use SSE or MMX, for better compatibility.
+
  */
  
  #include "silcdefs.h"
  
-#ifdef SILC_SHA256_ASM
+#ifdef SILC_SHA256_X86
  
  #define STACK_STATE    (8 * 4)
  #define STACK_W                (64 * 4)
@@ -75,9 +92,9 @@
         andl    e,   r5;                /* Ch &= e */                   \
         xorl    r3,  r5;                /* Ch ^= g */                   \
                                                                         \
+       leal    ki(r4, r5), r4;         /* t0 = Sigma1 + Ch + ki */     \
         movl    h(%ebp), r3;                                            \
         movl    d(%ebp), r1;                                            \
-       leal    ki(r4, r5), r4;         /* t0 = Sigma1 + Ch + ki */     \
         addl    W * 4(%esp), r4;        /* t0 += W[i] */                \
         addl    r4, r3;                 /* h += t0 */                   \
         addl    r3, r1;                 /* d += h (t0) */               \
@@ -124,9 +141,6 @@
         shrl    $10, r2;                /* w-2 >> 10 */                 \
         xorl    r5,  r2;                /* Gamma1 = w-2 ^ temp */       \
                                                                         \
-       addl    (i - 7) * 4(%esp), r2;  /* Gamma1 += w-7 */             \
-       addl    (i - 16) * 4(%esp), r2; /* Gamma1 += w-16 */            \
-                                                                       \
         movl    (i - 15) * 4(%esp), r3;                                 \
         movl    r3,  r4;                /* w-15 to Gamma0 */            \
         rorl    $7,  r4;                /* Gamma0 >>= 7 */              \
@@ -136,6 +150,8 @@
         shrl    $3,  r3;                /* w-15 >> 3 */                 \
         xorl    r5,  r3;                /* Gamma0 = w-15 ^ temp */      \
                                                                         \
+       addl    (i - 7) * 4(%esp), r2;  /* Gamma1 += w-7 */             \
+       addl    (i - 16) * 4(%esp), r2; /* Gamma1 += w-16 */            \
         addl    r2,  r3;                /* Gamma0 += Gamma1 */          \
         movl    r3, i * 4(%esp);
  
@@ -295,4 +311,4 @@ sha256_transform:
  
         ret
  
-#endif /* SILC_SHA256_ASM */
+#endif /* SILC_SHA256_X86 */
author	Pekka Riikonen <priikone@silcnet.org>
	Mon, 3 Sep 2007 13:41:09 +0000 (13:41 +0000)
committer	Pekka Riikonen <priikone@silcnet.org>
	Mon, 3 Sep 2007 13:41:09 +0000 (13:41 +0000)
lib/silccrypt/configure.ad		patch \| blob \| history
lib/silccrypt/sha256.c		patch \| blob \| history
lib/silccrypt/sha256_x86.S		patch \| blob \| history