lib/silccrypt/sha256_x86.S

   1 /*
   2
   3   sha256_x86.S
   4
   5   Author: Pekka Riikonen <priikone@silcnet.org>
   6
   7   Copyright (C) 2007 Pekka Riikonen
   8
   9   This program is free software; you can redistribute it and/or modify
  10   it under the terms of the GNU General Public License as published by
  11   the Free Software Foundation; version 2 of the License.
  12
  13   This program is distributed in the hope that it will be useful,
  14   but WITHOUT ANY WARRANTY; without even the implied warranty of
  15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16   GNU General Public License for more details.
  17
  18 */
  19
  20 /* SHA-256 x86 assembler implementation.  This implements only the SHA-256
  21    transform function and other parts are implemented in sha256.c.  The
  22    function preserves ebp, ebx, edi and esi but does not preserve other
  23    registers (eax, ecx and edx are clobbered).
  24
  25    This implementation uses only 32-bit registers.  It does not use MMX or
  26    SSE registers which could be used to enhance the performance, especially
  27    when loading W.  This is about as fast as we can get with fewer than
  28    eight 32-bit registers on a 32-bit CPU.
  29
  30    Benchmarks (megabytes (MB) per second), bigger is better:
  31
  32    Code           P4 3.60 GHz      PM 1.60 GHz     Xeon 5160 3.00 GHz
  33    ----------------------------------------------------------------------
  34    SHA-256, asm   110.57 MB/sec    58.50 MB/sec    146.43 MB/sec
  35    SHA-256, gcc    49.07 MB/sec    39.55 MB/sec     82.14 MB/sec
  36    SHA-256, icc   109.97 MB/sec    55.69 MB/sec     N/A
  37
  38    Notes:
  39    - Test program was lib/silccrypt/tests/test_hash
  40    - nice -n -20 was used with test_hash running as root
  41    - P4 is Pentium 4, PM is Pentium M, Xeon 5160 is 64-bit CPU but the OS
  42      had 32-bit kernel in the test.
  43    - ICC generates significantly better code than GCC for SSE2-capable
  44      CPUs, and the generated code uses SSE registers.  Hence its speed
  45      is comparable to the assembler code.  Note that the GCC code
  46      was also compiled with -msse2.  This assembler code
  47      specifically does not use SSE or MMX, for better compatibility.
  48
  49 */
  50
  51 #include "../../silcdefs.h"
  52
  53 #ifdef SILC_SHA256_X86
  54
  55 #define STACK_STATE     (8 * 4)         /* bytes for the stack copy of a..h */
  56 #define STACK_W         (64 * 4)        /* bytes for W[0..63], starting at 0(%esp) */
  57 #define STACK_SIZE      (STACK_STATE + STACK_W) /* parenthesized so it is safe in any expression */
  58 #define ARG_STATE       STACK_SIZE + 20(%esp)   /* 1st argument; 20 = return address + 4 pushed registers */
  59 #define ARG_BUF         STACK_SIZE + 24(%esp)   /* 2nd argument; valid only after the prologue, like ARG_STATE */
  60
  61 #define A               0               /* A..H: byte offsets of the eight 32-bit state words */
  62 #define B               4
  63 #define C               8
  64 #define D               12
  65 #define E               16
  66 #define F               20
  67 #define G               24
  68 #define H               28
  69
  70 #define r0              %eax            /* holds a between rounds */
  71 #define r1              %ebx            /* holds e between rounds */
  72 #define r2              %ecx            /* r2..r5: scratch */
  73 #define r3              %edx
  74 #define r4              %edi
  75 #define r5              %esi
  76
  77 /* One round of SHA-256.  a (r0) and e (r1) are registers; b, c, d, f, g, h are
  78    state word offsets from %ebp; W indexes the schedule at %esp; ki is the round
  79    constant.  Leaves the next a in r0 and the next e in r1; clobbers r2-r5. */
  80 #define RND(a, b, c, d, e, f, g, h, W, ki)                              \
  81         movl    f(%ebp), r2;            /* r2 = f */                    \
  82         movl    g(%ebp), r3;            /* r3 = g */                    \
  83                                                                         \
  84         movl    e,   r4;                /* r4 = e */                    \
  85         rorl    $6,  r4;                /* r4 = e ror 6 */              \
  86         movl    r4,  r5;                /* r5 = e ror 6 */              \
  87         rorl    $5,  r4;                /* r4 = e ror 11 */             \
  88         xorl    r4,  r5;                /* r5 = ror 6 ^ ror 11 */       \
  89         rorl    $14, r4;                /* r4 = e ror 25 */             \
  90         xorl    r5,  r4;                /* r4 = Sigma1(e) */            \
  91                                                                         \
  92         movl    r3,  r5;                /* r5 = g */                    \
  93         xorl    r2,  r5;                /* r5 = g ^ f */                \
  94         andl    e,   r5;                /* r5 = (g ^ f) & e */          \
  95         xorl    r3,  r5;                /* r5 = Ch(e, f, g) */          \
  96                                                                         \
  97         leal    ki(r4, r5), r4;         /* t0 = Sigma1 + Ch + ki */     \
  98         movl    h(%ebp), r3;            /* r3 = h */                    \
  99         movl    d(%ebp), r1;            /* r1 = d; e is dead by now */  \
 100         addl    W * 4(%esp), r4;        /* t0 += W[i] */                \
 101         addl    r4, r3;                 /* r3 = h + t0 */               \
 102         addl    r3, r1;                 /* r1 = d + h + t0 (next e) */  \
 103                                                                         \
 104         movl    a,   r4;                /* r4 = a */                    \
 105         rorl    $2,  r4;                /* r4 = a ror 2 */              \
 106         movl    r4,  r5;                /* r5 = a ror 2 */              \
 107         rorl    $11, r4;                /* r4 = a ror 13 */             \
 108         xorl    r4,  r5;                /* r5 = ror 2 ^ ror 13 */       \
 109         rorl    $9,  r4;                /* r4 = a ror 22 */             \
 110         xorl    r5,  r4;                /* r4 = Sigma0(a) */            \
 111                                                                         \
 112         addl    r3, r4;                 /* t1 = Sigma0 + h + t0 */      \
 113         movl    b(%ebp), r2;            /* r2 = b */                    \
 114         movl    c(%ebp), r3;            /* r3 = c */                    \
 115                                                                         \
 116         movl    r2,  r5;                /* r5 = b */                    \
 117         orl     a,   r5;                /* r5 = b | a */                \
 118         andl    r3,  r5;                /* r5 = (b | a) & c */          \
 119         andl    r2,  a;                 /* a = a & b */                 \
 120         orl     r5,  a;                 /* a = Maj(a, b, c) */          \
 121         addl    r4,  r0;                /* r0 = Maj + t1 (next a) */
 122 /* One round with write-back: stores the next e and next a in slots d and h */
 123 #define ROUND(a, b, c, d, e, f, g, h, W, ki)                            \
 124         RND(a, b, c, d, e, f, g, h, W, ki)                              \
 125         movl    r1, d(%ebp);            /* slot d = next e */           \
 126         movl    r0, h(%ebp);            /* slot h = next a */
 127
 128 /* Copy words i and i + 1 of the buffer at r5 to W[] at %esp, byte-swapped */
 129 #define GET_BUF(i)                                                      \
 130         movl    i * 4(r5), r4;          /* r4 = buf word i */           \
 131         movl    (i + 1) * 4(r5), r3;    /* r3 = buf word i + 1 */       \
 132         bswapl  r4;                     /* reverse byte order */        \
 133         bswapl  r3;                     /* reverse byte order */        \
 134         movl    r4, i * 4(%esp);        /* W[i] = r4 */                 \
 135         movl    r3, (i + 1) * 4(%esp);  /* W[i + 1] = r3 */
 136
 137 /* W[i] from W[i-2] (must be in r2 and r4), W[i-7], W[i-15] and W[i-16] */
 138 #define EXP_BUF(i)                                                      \
 139         rorl    $17, r4;                /* r4 = W[i-2] ror 17 */        \
 140         movl    r4,  r5;                /* r5 = W[i-2] ror 17 */        \
 141         rorl    $2,  r4;                /* r4 = W[i-2] ror 19 */        \
 142         xorl    r4,  r5;                /* r5 = ror 17 ^ ror 19 */      \
 143         shrl    $10, r2;                /* r2 = W[i-2] >> 10 */         \
 144         xorl    r5,  r2;                /* r2 = Gamma1(W[i-2]) */       \
 145                                                                         \
 146         movl    (i - 15) * 4(%esp), r3; /* r3 = W[i-15] */              \
 147         movl    r3,  r4;                /* r4 = W[i-15] */              \
 148         rorl    $7,  r4;                /* r4 = W[i-15] ror 7 */        \
 149         movl    r4,  r5;                /* r5 = W[i-15] ror 7 */        \
 150         rorl    $11, r4;                /* r4 = W[i-15] ror 18 */       \
 151         xorl    r4,  r5;                /* r5 = ror 7 ^ ror 18 */       \
 152         shrl    $3,  r3;                /* r3 = W[i-15] >> 3 */         \
 153         xorl    r5,  r3;                /* r3 = Gamma0(W[i-15]) */      \
 154                                                                         \
 155         addl    (i - 7) * 4(%esp), r2;  /* r2 = Gamma1 + W[i-7] */      \
 156         addl    (i - 16) * 4(%esp), r2; /* r2 += W[i-16] */             \
 157         addl    r2,  r3;                /* r3 = Gamma0 + r2 = W[i] */   \
 158         movl    r3, i * 4(%esp);        /* store W[i] */
 159 /* Expansion step for when W[i-2] is already in r4 (as after GET_BUF(i-2)) */
 160 #define EXP_BUF0(i)                                                     \
 161         movl    r4, r2;                 /* r2 = copy of W[i-2] */       \
 162         EXP_BUF(i)
 163 /* General expansion step: loads W[i-2] from the stack into r2 and r4 */
 164 #define EXP_BUFX(i)                                                     \
 165         movl    (i - 2) * 4(%esp), r2;  /* r2 = W[i-2] */               \
 166         movl    r2,  r4;                /* r4 = copy of W[i-2] */       \
 167         EXP_BUF(i)
 168 /* sha256_transform: stack args are state (8 32-bit words, updated in place)
 169    and buf (64 bytes of input).  Clobbers eax, ecx and edx. */
 170 .text
 171 .balign 32
 172 .globl sha256_transform
 173 sha256_transform:
 174         pushl   %ebp                    /* saved here, restored before ret */
 175         pushl   %ebx
 176         pushl   %edi
 177         pushl   %esi
 178         subl    $STACK_SIZE, %esp       /* room for W[0..63] and the state copy */
 179
 180         /* Copy state to the stack above W; a stays in r0, e ends up in r1 */
 181         movl    ARG_STATE, %ebp         /* ebp = state argument */
 182         movl    A(%ebp), r0             /* r0 = a, kept for round 0 */
 183         movl    B(%ebp), r1
 184         movl    C(%ebp), r2
 185         movl    D(%ebp), r3
 186         movl    r0, A + STACK_W(%esp)
 187         movl    r1, B + STACK_W(%esp)
 188         movl    r2, C + STACK_W(%esp)
 189         movl    r3, D + STACK_W(%esp)
 190         movl    E(%ebp), r1             /* r1 = e, kept for round 0 */
 191         movl    F(%ebp), r2
 192         movl    G(%ebp), r3
 193         movl    H(%ebp), r4
 194         movl    r1, E + STACK_W(%esp)
 195         movl    r2, F + STACK_W(%esp)
 196         movl    r3, G + STACK_W(%esp)
 197         movl    r4, H + STACK_W(%esp)
 198
 199         /* W[0..15] = buf words with bytes reversed (MSB first order) */
 200         movl    ARG_BUF, r5             /* r5 = buf argument */
 201         GET_BUF(0) GET_BUF(2) GET_BUF(4) GET_BUF(6)
 202         GET_BUF(8) GET_BUF(10) GET_BUF(12) GET_BUF(14)
 203
 204         /* Expand input, fill in W[16..63]; GET_BUF(14) left W[14] in r4 */
 205         EXP_BUF0(16) EXP_BUFX(17) EXP_BUFX(18) EXP_BUFX(19) EXP_BUFX(20)
 206         EXP_BUFX(21) EXP_BUFX(22) EXP_BUFX(23) EXP_BUFX(24) EXP_BUFX(25)
 207         EXP_BUFX(26) EXP_BUFX(27) EXP_BUFX(28) EXP_BUFX(29) EXP_BUFX(30)
 208         EXP_BUFX(31) EXP_BUFX(32) EXP_BUFX(33) EXP_BUFX(34) EXP_BUFX(35)
 209         EXP_BUFX(36) EXP_BUFX(37) EXP_BUFX(38) EXP_BUFX(39) EXP_BUFX(40)
 210         EXP_BUFX(41) EXP_BUFX(42) EXP_BUFX(43) EXP_BUFX(44) EXP_BUFX(45)
 211         EXP_BUFX(46) EXP_BUFX(47) EXP_BUFX(48) EXP_BUFX(49) EXP_BUFX(50)
 212         EXP_BUFX(51) EXP_BUFX(52) EXP_BUFX(53) EXP_BUFX(54) EXP_BUFX(55)
 213         EXP_BUFX(56) EXP_BUFX(57) EXP_BUFX(58) EXP_BUFX(59) EXP_BUFX(60)
 214         EXP_BUFX(61) EXP_BUFX(62) EXP_BUFX(63)
 215
 216         /* 64 rounds; r0 = a and r1 = e from above, ebp = stack copy of state */
 217         leal    STACK_W(%esp), %ebp
 218         /* Slot arguments rotate per round; the pattern repeats every 8 */
 219         ROUND(r0, B, C, D, r1, F, G, H, 0, 0x428a2f98);
 220         ROUND(r0, A, B, C, r1, E, F, G, 1, 0x71374491);
 221         ROUND(r0, H, A, B, r1, D, E, F, 2, 0xb5c0fbcf);
 222         ROUND(r0, G, H, A, r1, C, D, E, 3, 0xe9b5dba5);
 223         ROUND(r0, F, G, H, r1, B, C, D, 4, 0x3956c25b);
 224         ROUND(r0, E, F, G, r1, A, B, C, 5, 0x59f111f1);
 225         ROUND(r0, D, E, F, r1, H, A, B, 6, 0x923f82a4);
 226         ROUND(r0, C, D, E, r1, G, H, A, 7, 0xab1c5ed5);
 227
 228         ROUND(r0, B, C, D, r1, F, G, H, 8, 0xd807aa98);
 229         ROUND(r0, A, B, C, r1, E, F, G, 9, 0x12835b01);
 230         ROUND(r0, H, A, B, r1, D, E, F, 10, 0x243185be);
 231         ROUND(r0, G, H, A, r1, C, D, E, 11, 0x550c7dc3);
 232         ROUND(r0, F, G, H, r1, B, C, D, 12, 0x72be5d74);
 233         ROUND(r0, E, F, G, r1, A, B, C, 13, 0x80deb1fe);
 234         ROUND(r0, D, E, F, r1, H, A, B, 14, 0x9bdc06a7);
 235         ROUND(r0, C, D, E, r1, G, H, A, 15, 0xc19bf174);
 236
 237         ROUND(r0, B, C, D, r1, F, G, H, 16, 0xe49b69c1);
 238         ROUND(r0, A, B, C, r1, E, F, G, 17, 0xefbe4786);
 239         ROUND(r0, H, A, B, r1, D, E, F, 18, 0x0fc19dc6);
 240         ROUND(r0, G, H, A, r1, C, D, E, 19, 0x240ca1cc);
 241         ROUND(r0, F, G, H, r1, B, C, D, 20, 0x2de92c6f);
 242         ROUND(r0, E, F, G, r1, A, B, C, 21, 0x4a7484aa);
 243         ROUND(r0, D, E, F, r1, H, A, B, 22, 0x5cb0a9dc);
 244         ROUND(r0, C, D, E, r1, G, H, A, 23, 0x76f988da);
 245
 246         ROUND(r0, B, C, D, r1, F, G, H, 24, 0x983e5152);
 247         ROUND(r0, A, B, C, r1, E, F, G, 25, 0xa831c66d);
 248         ROUND(r0, H, A, B, r1, D, E, F, 26, 0xb00327c8);
 249         ROUND(r0, G, H, A, r1, C, D, E, 27, 0xbf597fc7);
 250         ROUND(r0, F, G, H, r1, B, C, D, 28, 0xc6e00bf3);
 251         ROUND(r0, E, F, G, r1, A, B, C, 29, 0xd5a79147);
 252         ROUND(r0, D, E, F, r1, H, A, B, 30, 0x06ca6351);
 253         ROUND(r0, C, D, E, r1, G, H, A, 31, 0x14292967);
 254
 255         ROUND(r0, B, C, D, r1, F, G, H, 32, 0x27b70a85);
 256         ROUND(r0, A, B, C, r1, E, F, G, 33, 0x2e1b2138);
 257         ROUND(r0, H, A, B, r1, D, E, F, 34, 0x4d2c6dfc);
 258         ROUND(r0, G, H, A, r1, C, D, E, 35, 0x53380d13);
 259         ROUND(r0, F, G, H, r1, B, C, D, 36, 0x650a7354);
 260         ROUND(r0, E, F, G, r1, A, B, C, 37, 0x766a0abb);
 261         ROUND(r0, D, E, F, r1, H, A, B, 38, 0x81c2c92e);
 262         ROUND(r0, C, D, E, r1, G, H, A, 39, 0x92722c85);
 263
 264         ROUND(r0, B, C, D, r1, F, G, H, 40, 0xa2bfe8a1);
 265         ROUND(r0, A, B, C, r1, E, F, G, 41, 0xa81a664b);
 266         ROUND(r0, H, A, B, r1, D, E, F, 42, 0xc24b8b70);
 267         ROUND(r0, G, H, A, r1, C, D, E, 43, 0xc76c51a3);
 268         ROUND(r0, F, G, H, r1, B, C, D, 44, 0xd192e819);
 269         ROUND(r0, E, F, G, r1, A, B, C, 45, 0xd6990624);
 270         ROUND(r0, D, E, F, r1, H, A, B, 46, 0xf40e3585);
 271         ROUND(r0, C, D, E, r1, G, H, A, 47, 0x106aa070);
 272
 273         ROUND(r0, B, C, D, r1, F, G, H, 48, 0x19a4c116);
 274         ROUND(r0, A, B, C, r1, E, F, G, 49, 0x1e376c08);
 275         ROUND(r0, H, A, B, r1, D, E, F, 50, 0x2748774c);
 276         ROUND(r0, G, H, A, r1, C, D, E, 51, 0x34b0bcb5);
 277         ROUND(r0, F, G, H, r1, B, C, D, 52, 0x391c0cb3);
 278         ROUND(r0, E, F, G, r1, A, B, C, 53, 0x4ed8aa4a);
 279         ROUND(r0, D, E, F, r1, H, A, B, 54, 0x5b9cca4f);
 280         ROUND(r0, C, D, E, r1, G, H, A, 55, 0x682e6ff3);
 281
 282         ROUND(r0, B, C, D, r1, F, G, H, 56, 0x748f82ee);
 283         ROUND(r0, A, B, C, r1, E, F, G, 57, 0x78a5636f);
 284         ROUND(r0, H, A, B, r1, D, E, F, 58, 0x84c87814);
 285         ROUND(r0, G, H, A, r1, C, D, E, 59, 0x8cc70208);
 286         ROUND(r0, F, G, H, r1, B, C, D, 60, 0x90befffa);
 287         ROUND(r0, E, F, G, r1, A, B, C, 61, 0xa4506ceb);
 288         ROUND(r0, D, E, F, r1, H, A, B, 62, 0xbef9a3f7);
 289         RND  (r0, C, D, E, r1, G, H, A, 63, 0xc67178f2);
 290         /* Round 63 used RND: its a and e exist only in r0 and r1 */
 291         /* Add the working variables to the caller state */
 292         movl    ARG_STATE, %ebp         /* ebp = state argument again */
 293         addl    r0, A(%ebp)             /* a from last round */
 294         addl    r1, E(%ebp)             /* e from last round */
 295         movl    B + STACK_W(%esp), r0   /* remaining six come from the stack copy */
 296         movl    C + STACK_W(%esp), r1
 297         movl    D + STACK_W(%esp), r2
 298         movl    F + STACK_W(%esp), r3
 299         movl    G + STACK_W(%esp), r4
 300         movl    H + STACK_W(%esp), r5
 301         addl    r0, B(%ebp)
 302         addl    r1, C(%ebp)
 303         addl    r2, D(%ebp)
 304         addl    r3, F(%ebp)
 305         addl    r4, G(%ebp)
 306         addl    r5, H(%ebp)
 307
 308         addl    $STACK_SIZE, %esp       /* release W and the state copy */
 309         popl    %esi                    /* restore in reverse push order */
 310         popl    %edi
 311         popl    %ebx
 312         popl    %ebp
 313
 314         ret
 315
 316 #endif /* SILC_SHA256_X86 */