+/*
+
+ sha256_x86.S
+
+ Author: Pekka Riikonen <priikone@silcnet.org>
+
+ Copyright (C) 2007 Pekka Riikonen
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+*/
+
+/* SHA-256 x86 assembler implementation. This implements only the SHA-256
+ transform function and other parts are implemented in sha256.c. The
+ function preserves ebp, edx, edi and esi but does not preserve other
+ registers.
+
+ This implementation uses only 32-bit registers. It does not use MMX or
+ SSE registers which could be used to enhance the performance, especially
+ when loading the W. This is about as fast as we can get with less than
+ 8 32-bit registers on 32-bit CPU.
+
+*/
+
+#ifdef SILC_SHA256_ASM
+
+#define STACK_STATE (8 * 4)
+#define STACK_W (64 * 4)
+#define STACK_SIZE STACK_STATE + STACK_W
+#define ARG_STATE STACK_SIZE + 20(%esp)
+#define ARG_BUF STACK_SIZE + 24(%esp)
+
+#define A 0
+#define B 4
+#define C 8
+#define D 12
+#define E 16
+#define F 20
+#define G 24
+#define H 28
+
+#define r0 %eax
+#define r1 %ebx
+#define r2 %ecx
+#define r3 %edx
+#define r4 %edi
+#define r5 %esi
+
+/* One round of SHA-256. The a (r0) and e (r1) are inputs already in
+ registers. r0 will be the next round a, r1 the next round e. The
+ d and h are outputs and they are the r0 and r1 for next round. */
+#define RND(a, b, c, d, e, f, g, h, W, ki) \
+ movl f(%ebp), r2; \
+ movl g(%ebp), r3; \
+ \
+ movl e, r4; /* e to Sigma1 */ \
+ rorl $6, r4; /* Sigma1 >>= 6 */ \
+ movl r4, r5; /* Sigma1 to temp */ \
+ rorl $5, r4; /* Sigma1 >>= 5 (11) */ \
+ xorl r4, r5; /* temp ^= Sigma1 */ \
+ rorl $14, r4; /* Sigma1 >>= 14 (25) */ \
+ xorl r5, r4; /* Sigma1 ^= temp */ \
+ \
+ movl r3, r5; /* g to Ch */ \
+ xorl r2, r5; /* Ch ^= f */ \
+ andl e, r5; /* Ch &= e */ \
+ xorl r3, r5; /* Ch ^= g */ \
+ \
+ movl h(%ebp), r3; \
+ movl d(%ebp), r1; \
+ leal ki(r4, r5), r4; /* t0 = Sigma1 + Ch + ki */ \
+ addl W * 4(%esp), r4; /* t0 += W[i] */ \
+ addl r4, r3; /* h += t0 */ \
+ addl r3, r1; /* d += h (t0) */ \
+ \
+ movl a, r4; /* a to Sigma0 */ \
+ rorl $2, r4; /* Sigma0 >>= 2 */ \
+ movl r4, r5; /* Sigma0 to temp */ \
+ rorl $11, r4; /* Sigma0 >>= 11 (13) */ \
+ xorl r4, r5; /* temp ^= Sigma0 */ \
+ rorl $9, r4; /* Sigma0 >>= 9 (22) */ \
+ xorl r5, r4; /* Sigma0 ^= temp */ \
+ \
+ addl r3, r4; /* t1 = Sigma0 + h (t0) */ \
+ movl b(%ebp), r2; \
+ movl c(%ebp), r3; \
+ \
+ movl r2, r5; /* b to temp */ \
+ orl a, r5; /* temp |= a */ \
+ andl r3, r5; /* temp &= c */ \
+ andl r2, a; /* a &= b */ \
+ orl r5, a; /* a |= temp */ \
+ addl r4, r0; /* h = t0 + t1 */
+
+#define ROUND(a, b, c, d, e, f, g, h, W, ki) \
+ RND(a, b, c, d, e, f, g, h, W, ki) \
+ movl r1, d(%ebp); /* Update d in stack */ \
+ movl r0, h(%ebp); /* Update h in stack */
+
+/* Get 64 bits from input buffer in MSB first order */
+#define GET_BUF(i) \
+ movl i * 4(r5), r4; \
+ movl (i + 1) * 4(r5), r3; \
+ bswapl r4; \
+ bswapl r3; \
+ movl r4, i * 4(%esp); \
+ movl r3, (i + 1) * 4(%esp);
+
+/* Expand the input */
+#define EXP_BUF(i) \
+ rorl $17, r4; /* Gamma1 >>= 17 */ \
+ movl r4, r5; /* Gamma1 to temp */ \
+ rorl $2, r4; /* Gamma1 >>= 2 (19) */ \
+ xorl r4, r5; /* temp ^= Gamma1 */ \
+ shrl $10, r2; /* w-2 >> 10 */ \
+ xorl r5, r2; /* Gamma1 = w-2 ^ temp */ \
+ \
+ addl (i - 7) * 4(%esp), r2; /* Gamma1 += w-7 */ \
+ addl (i - 16) * 4(%esp), r2; /* Gamma1 += w-16 */ \
+ \
+ movl (i - 15) * 4(%esp), r3; \
+ movl r3, r4; /* w-15 to Gamma0 */ \
+ rorl $7, r4; /* Gamma0 >>= 7 */ \
+ movl r4, r5; /* Gamma0 to temp */ \
+ rorl $11, r4; /* Gamma0 >>= 11 (18) */ \
+ xorl r4, r5; /* temp ^= Gamma0 */ \
+ shrl $3, r3; /* w-15 >> 3 */ \
+ xorl r5, r3; /* Gamma0 = w-15 ^ temp */ \
+ \
+ addl r2, r3; /* Gamma0 += Gamma1 */ \
+ movl r3, i * 4(%esp);
+
+#define EXP_BUF0(i) \
+ movl r4, r2; \
+ EXP_BUF(i)
+
+#define EXP_BUFX(i) \
+ movl (i - 2) * 4(%esp), r2; \
+ movl r2, r4; /* w-2 to Gamma1 */ \
+ EXP_BUF(i)
+
+
+.text
+.align 4
+.globl sha256_transform
+sha256_transform:
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ subl $STACK_SIZE, %esp
+
+ /* State to stack */
+ movl ARG_STATE, %ebp
+ movl A(%ebp), r0
+ movl B(%ebp), r1
+ movl C(%ebp), r2
+ movl D(%ebp), r3
+ movl r0, A + STACK_W(%esp)
+ movl r1, B + STACK_W(%esp)
+ movl r2, C + STACK_W(%esp)
+ movl r3, D + STACK_W(%esp)
+ movl E(%ebp), r1
+ movl F(%ebp), r2
+ movl G(%ebp), r3
+ movl H(%ebp), r4
+ movl r1, E + STACK_W(%esp)
+ movl r2, F + STACK_W(%esp)
+ movl r3, G + STACK_W(%esp)
+ movl r4, H + STACK_W(%esp)
+
+ /* Get buf in MSB first order, W[0..15] */
+ movl ARG_BUF, r5
+ GET_BUF(0) GET_BUF(2) GET_BUF(4) GET_BUF(6)
+ GET_BUF(8) GET_BUF(10) GET_BUF(12) GET_BUF(14)
+
+ /* Expand input, fill in W[16..63] */
+ EXP_BUF0(16) EXP_BUFX(17) EXP_BUFX(18) EXP_BUFX(19) EXP_BUFX(20)
+ EXP_BUFX(21) EXP_BUFX(22) EXP_BUFX(23) EXP_BUFX(24) EXP_BUFX(25)
+ EXP_BUFX(26) EXP_BUFX(27) EXP_BUFX(28) EXP_BUFX(29) EXP_BUFX(30)
+ EXP_BUFX(31) EXP_BUFX(32) EXP_BUFX(33) EXP_BUFX(34) EXP_BUFX(35)
+ EXP_BUFX(36) EXP_BUFX(37) EXP_BUFX(38) EXP_BUFX(39) EXP_BUFX(40)
+ EXP_BUFX(41) EXP_BUFX(42) EXP_BUFX(43) EXP_BUFX(44) EXP_BUFX(45)
+ EXP_BUFX(46) EXP_BUFX(47) EXP_BUFX(48) EXP_BUFX(49) EXP_BUFX(50)
+ EXP_BUFX(51) EXP_BUFX(52) EXP_BUFX(53) EXP_BUFX(54) EXP_BUFX(55)
+ EXP_BUFX(56) EXP_BUFX(57) EXP_BUFX(58) EXP_BUFX(59) EXP_BUFX(60)
+ EXP_BUFX(61) EXP_BUFX(62) EXP_BUFX(63)
+
+ /* Hash, r0 and r1 set above, ebp is base address to state */
+ leal STACK_W(%esp), %ebp
+
+ ROUND(r0, B, C, D, r1, F, G, H, 0, 0x428a2f98);
+ ROUND(r0, A, B, C, r1, E, F, G, 1, 0x71374491);
+ ROUND(r0, H, A, B, r1, D, E, F, 2, 0xb5c0fbcf);
+ ROUND(r0, G, H, A, r1, C, D, E, 3, 0xe9b5dba5);
+ ROUND(r0, F, G, H, r1, B, C, D, 4, 0x3956c25b);
+ ROUND(r0, E, F, G, r1, A, B, C, 5, 0x59f111f1);
+ ROUND(r0, D, E, F, r1, H, A, B, 6, 0x923f82a4);
+ ROUND(r0, C, D, E, r1, G, H, A, 7, 0xab1c5ed5);
+
+ ROUND(r0, B, C, D, r1, F, G, H, 8, 0xd807aa98);
+ ROUND(r0, A, B, C, r1, E, F, G, 9, 0x12835b01);
+ ROUND(r0, H, A, B, r1, D, E, F, 10, 0x243185be);
+ ROUND(r0, G, H, A, r1, C, D, E, 11, 0x550c7dc3);
+ ROUND(r0, F, G, H, r1, B, C, D, 12, 0x72be5d74);
+ ROUND(r0, E, F, G, r1, A, B, C, 13, 0x80deb1fe);
+ ROUND(r0, D, E, F, r1, H, A, B, 14, 0x9bdc06a7);
+ ROUND(r0, C, D, E, r1, G, H, A, 15, 0xc19bf174);
+
+ ROUND(r0, B, C, D, r1, F, G, H, 16, 0xe49b69c1);
+ ROUND(r0, A, B, C, r1, E, F, G, 17, 0xefbe4786);
+ ROUND(r0, H, A, B, r1, D, E, F, 18, 0x0fc19dc6);
+ ROUND(r0, G, H, A, r1, C, D, E, 19, 0x240ca1cc);
+ ROUND(r0, F, G, H, r1, B, C, D, 20, 0x2de92c6f);
+ ROUND(r0, E, F, G, r1, A, B, C, 21, 0x4a7484aa);
+ ROUND(r0, D, E, F, r1, H, A, B, 22, 0x5cb0a9dc);
+ ROUND(r0, C, D, E, r1, G, H, A, 23, 0x76f988da);
+
+ ROUND(r0, B, C, D, r1, F, G, H, 24, 0x983e5152);
+ ROUND(r0, A, B, C, r1, E, F, G, 25, 0xa831c66d);
+ ROUND(r0, H, A, B, r1, D, E, F, 26, 0xb00327c8);
+ ROUND(r0, G, H, A, r1, C, D, E, 27, 0xbf597fc7);
+ ROUND(r0, F, G, H, r1, B, C, D, 28, 0xc6e00bf3);
+ ROUND(r0, E, F, G, r1, A, B, C, 29, 0xd5a79147);
+ ROUND(r0, D, E, F, r1, H, A, B, 30, 0x06ca6351);
+ ROUND(r0, C, D, E, r1, G, H, A, 31, 0x14292967);
+
+ ROUND(r0, B, C, D, r1, F, G, H, 32, 0x27b70a85);
+ ROUND(r0, A, B, C, r1, E, F, G, 33, 0x2e1b2138);
+ ROUND(r0, H, A, B, r1, D, E, F, 34, 0x4d2c6dfc);
+ ROUND(r0, G, H, A, r1, C, D, E, 35, 0x53380d13);
+ ROUND(r0, F, G, H, r1, B, C, D, 36, 0x650a7354);
+ ROUND(r0, E, F, G, r1, A, B, C, 37, 0x766a0abb);
+ ROUND(r0, D, E, F, r1, H, A, B, 38, 0x81c2c92e);
+ ROUND(r0, C, D, E, r1, G, H, A, 39, 0x92722c85);
+
+ ROUND(r0, B, C, D, r1, F, G, H, 40, 0xa2bfe8a1);
+ ROUND(r0, A, B, C, r1, E, F, G, 41, 0xa81a664b);
+ ROUND(r0, H, A, B, r1, D, E, F, 42, 0xc24b8b70);
+ ROUND(r0, G, H, A, r1, C, D, E, 43, 0xc76c51a3);
+ ROUND(r0, F, G, H, r1, B, C, D, 44, 0xd192e819);
+ ROUND(r0, E, F, G, r1, A, B, C, 45, 0xd6990624);
+ ROUND(r0, D, E, F, r1, H, A, B, 46, 0xf40e3585);
+ ROUND(r0, C, D, E, r1, G, H, A, 47, 0x106aa070);
+
+ ROUND(r0, B, C, D, r1, F, G, H, 48, 0x19a4c116);
+ ROUND(r0, A, B, C, r1, E, F, G, 49, 0x1e376c08);
+ ROUND(r0, H, A, B, r1, D, E, F, 50, 0x2748774c);
+ ROUND(r0, G, H, A, r1, C, D, E, 51, 0x34b0bcb5);
+ ROUND(r0, F, G, H, r1, B, C, D, 52, 0x391c0cb3);
+ ROUND(r0, E, F, G, r1, A, B, C, 53, 0x4ed8aa4a);
+ ROUND(r0, D, E, F, r1, H, A, B, 54, 0x5b9cca4f);
+ ROUND(r0, C, D, E, r1, G, H, A, 55, 0x682e6ff3);
+
+ ROUND(r0, B, C, D, r1, F, G, H, 56, 0x748f82ee);
+ ROUND(r0, A, B, C, r1, E, F, G, 57, 0x78a5636f);
+ ROUND(r0, H, A, B, r1, D, E, F, 58, 0x84c87814);
+ ROUND(r0, G, H, A, r1, C, D, E, 59, 0x8cc70208);
+ ROUND(r0, F, G, H, r1, B, C, D, 60, 0x90befffa);
+ ROUND(r0, E, F, G, r1, A, B, C, 61, 0xa4506ceb);
+ ROUND(r0, D, E, F, r1, H, A, B, 62, 0xbef9a3f7);
+ RND (r0, C, D, E, r1, G, H, A, 63, 0xc67178f2);
+
+ /* Update state from stack */
+ movl ARG_STATE, %ebp
+ addl r0, A(%ebp) /* a from last round */
+ addl r1, E(%ebp) /* e from last round */
+ movl B + STACK_W(%esp), r0
+ movl C + STACK_W(%esp), r1
+ movl D + STACK_W(%esp), r2
+ movl F + STACK_W(%esp), r3
+ movl G + STACK_W(%esp), r4
+ movl H + STACK_W(%esp), r5
+ addl r0, B(%ebp)
+ addl r1, C(%ebp)
+ addl r2, D(%ebp)
+ addl r3, F(%ebp)
+ addl r4, G(%ebp)
+ addl r5, H(%ebp)
+
+ addl $STACK_SIZE, %esp
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ ret
+
+#endif /* SILC_SHA256_ASM */