/*

  sha256_x86.S

  Author: Pekka Riikonen

  Copyright (C) 2007 Pekka Riikonen

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; version 2 of the License.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

*/

/* SHA-256 x86 assembler implementation.  This implements only the SHA-256
   transform function and other parts are implemented in sha256.c.  The
   function preserves ebp, ebx, edi and esi but does not preserve other
   registers (eax, ecx and edx are clobbered).

   This implementation uses only 32-bit registers.  It does not use MMX or
   SSE registers which could be used to enhance the performance, especially
   when loading the W.  This is about as fast as we can get with fewer than
   8 32-bit registers on 32-bit CPU.

   Benchmarks (megabytes (MB) per second), bigger is better:

   Code            P4 3.60 GHz      PM 1.60 GHz      Xeon 5160 3.00 GHz
   ----------------------------------------------------------------------
   SHA-256, asm    110.57 MB/sec    58.50 MB/sec     146.43 MB/sec
   SHA-256, gcc     49.07 MB/sec    39.55 MB/sec      82.14 MB/sec
   SHA-256, icc    109.97 MB/sec    55.69 MB/sec     N/A

   Notes:

   - Test program was lib/silccrypt/tests/test_hash
   - nice -n -20 was used with test_hash running as root
   - P4 is Pentium 4, PM is Pentium M, Xeon 5160 is 64-bit CPU but the OS
     had 32-bit kernel in the test.
   - ICC generates significantly better code compared to GCC for SSE2
     capable CPU, and the generated code uses SSE registers.  Hence the
     comparable speed with the assembler code.  Note that, the GCC code
     was also compiled with -msse2.  Note that, this assembler code
     specifically does not use SSE or MMX, for better compatibility.

*/

#include "silcdefs.h"

#ifdef SILC_SHA256_X86

/* Stack frame layout, relative to %esp after the prologue:

     0 .. STACK_W - 1             W[0..63], the expanded message schedule
     STACK_W .. STACK_SIZE - 1    working copy of the state words a..h

   Above the frame are the four saved registers and the return address
   (20 bytes in total), followed by the two arguments. */
#define STACK_STATE     (8 * 4)
#define STACK_W         (64 * 4)
#define STACK_SIZE      (STACK_STATE + STACK_W)

#define ARG_STATE       STACK_SIZE + 20(%esp)
#define ARG_BUF         STACK_SIZE + 24(%esp)

/* Byte offsets of the state words */
#define A               0
#define B               4
#define C               8
#define D               12
#define E               16
#define F               20
#define G               24
#define H               28

/* Register aliases.  r0 and r1 carry a and e from round to round,
   r2..r5 are scratch. */
#define r0              %eax
#define r1              %ebx
#define r2              %ecx
#define r3              %edx
#define r4              %edi
#define r5              %esi

/* One round of SHA-256.  The a (r0) and e (r1) are inputs already in
   registers; b, c, d, f, g and h are byte offsets of the state slots
   relative to %ebp.  On exit r0 holds the next round a and r1 the next
   round e.  Nothing is written to memory here; ROUND stores r1 into the
   d slot and r0 into the h slot, which are the slots the next round
   reads as e and a.  Clobbers r2, r3, r4 and r5. */
#define RND(a, b, c, d, e, f, g, h, W, ki)                              \
        movl    f(%ebp), r2;            /* r2 = f */                    \
        movl    g(%ebp), r3;            /* r3 = g */                    \
                                                                        \
        movl    e, r4;                  /* e to Sigma1 */               \
        rorl    $6, r4;                 /* Sigma1 = e ror 6 */          \
        movl    r4, r5;                 /* Sigma1 to temp */            \
        rorl    $5, r4;                 /* Sigma1 = e ror 11 */         \
        xorl    r4, r5;                 /* temp ^= Sigma1 */            \
        rorl    $14, r4;                /* Sigma1 = e ror 25 */         \
        xorl    r5, r4;                 /* Sigma1 ^= temp */            \
                                                                        \
        movl    r3, r5;                 /* g to Ch */                   \
        xorl    r2, r5;                 /* Ch ^= f */                   \
        andl    e, r5;                  /* Ch &= e */                   \
        xorl    r3, r5;                 /* Ch ^= g */                   \
                                                                        \
        leal    ki(r4, r5), r4;         /* t0 = Sigma1 + Ch + ki */     \
        movl    h(%ebp), r3;            /* r3 = h */                    \
        movl    d(%ebp), r1;            /* r1 = d, e is dead now */     \
        addl    W * 4(%esp), r4;        /* t0 += W[i] */                \
        addl    r4, r3;                 /* h += t0 */                   \
        addl    r3, r1;                 /* next e = d + h + t0 */       \
                                                                        \
        movl    a, r4;                  /* a to Sigma0 */               \
        rorl    $2, r4;                 /* Sigma0 = a ror 2 */          \
        movl    r4, r5;                 /* Sigma0 to temp */            \
        rorl    $11, r4;                /* Sigma0 = a ror 13 */         \
        xorl    r4, r5;                 /* temp ^= Sigma0 */            \
        rorl    $9, r4;                 /* Sigma0 = a ror 22 */         \
        xorl    r5, r4;                 /* Sigma0 ^= temp */            \
                                                                        \
        addl    r3, r4;                 /* t1 = Sigma0 + h + t0 */      \
        movl    b(%ebp), r2;            /* r2 = b */                    \
        movl    c(%ebp), r3;            /* r3 = c */                    \
                                                                        \
        movl    r2, r5;                 /* b to temp */                 \
        orl     a, r5;                  /* temp |= a */                 \
        andl    r3, r5;                 /* temp &= c */                 \
        andl    r2, a;                  /* a &= b */                    \
        orl     r5, a;                  /* a = Maj(a, b, c) */          \
        addl    r4, r0;                 /* next a = Maj + t1 */

/* One round followed by writing the results back to the state copy, so
   that the following round finds them under its rotated slot names. */
#define ROUND(a, b, c, d, e, f, g, h, W, ki)                            \
        RND(a, b, c, d, e, f, g, h, W, ki)                              \
        movl    r1, d(%ebp);            /* Update d in stack */         \
        movl    r0, h(%ebp);            /* Update h in stack */

/* Get 64 bits (two words) from the input buffer pointed to by r5 in MSB
   first order and store them to W[i] and W[i + 1].  Leaves W[i] in r4
   and W[i + 1] in r3. */
#define GET_BUF(i)                                                      \
        movl    i * 4(r5), r4;                                          \
        movl    (i + 1) * 4(r5), r3;                                    \
        bswapl  r4;                                                     \
        bswapl  r3;                                                     \
        movl    r4, i * 4(%esp);                                        \
        movl    r3, (i + 1) * 4(%esp);

/* Expand the input: compute W[i] from W[i - 2], W[i - 7], W[i - 15] and
   W[i - 16].  Expects W[i - 2] in both r2 and r4 on entry.  Clobbers r2,
   r3, r4 and r5. */
#define EXP_BUF(i)                                                      \
        rorl    $17, r4;                /* Gamma1 = w-2 ror 17 */       \
        movl    r4, r5;                 /* Gamma1 to temp */            \
        rorl    $2, r4;                 /* Gamma1 = w-2 ror 19 */       \
        xorl    r4, r5;                 /* temp ^= Gamma1 */            \
        shrl    $10, r2;                /* w-2 >> 10 */                 \
        xorl    r5, r2;                 /* Gamma1 = w-2 ^ temp */       \
                                                                        \
        movl    (i - 15) * 4(%esp), r3;                                 \
        movl    r3, r4;                 /* w-15 to Gamma0 */            \
        rorl    $7, r4;                 /* Gamma0 = w-15 ror 7 */       \
        movl    r4, r5;                 /* Gamma0 to temp */            \
        rorl    $11, r4;                /* Gamma0 = w-15 ror 18 */      \
        xorl    r4, r5;                 /* temp ^= Gamma0 */            \
        shrl    $3, r3;                 /* w-15 >> 3 */                 \
        xorl    r5, r3;                 /* Gamma0 = w-15 ^ temp */      \
                                                                        \
        addl    (i - 7) * 4(%esp), r2;  /* Gamma1 += w-7 */             \
        addl    (i - 16) * 4(%esp), r2; /* Gamma1 += w-16 */            \
        addl    r2, r3;                 /* Gamma0 += Gamma1 */          \
        movl    r3, i * 4(%esp);        /* W[i] = Gamma0 */

/* First expansion step: W[i - 2] is still in r4 from the last GET_BUF */
#define EXP_BUF0(i)                                                     \
        movl    r4, r2;                                                 \
        EXP_BUF(i)

/* Any later expansion step: reload W[i - 2] from the stack */
#define EXP_BUFX(i)                                                     \
        movl    (i - 2) * 4(%esp), r2;                                  \
        movl    r2, r4;                 /* w-2 to Gamma1 */             \
        EXP_BUF(i)

/* sha256_transform

   Arguments are read from the stack:

     first   - pointer to the eight 32-bit state words a..h; they are
               read on entry and the result of the 64 rounds is added
               back into them on exit
     second  - pointer to the input block; sixteen 32-bit words are read
               and byte swapped into W[0..15]

   NOTE(review): presumably this is called from sha256.c with a prototype
   like void sha256_transform(state, buf) - confirm against the caller.

   Preserves ebp, ebx, edi and esi.  Clobbers eax, ecx, edx and the flags;
   no return value is set.  Uses STACK_SIZE bytes of stack in addition to
   the four saved registers. */

.text
.align 4
.globl sha256_transform
#ifdef __ELF__
.type sha256_transform, @function
#endif
sha256_transform:
        pushl   %ebp
        pushl   %ebx
        pushl   %edi
        pushl   %esi
        subl    $STACK_SIZE, %esp

        /* State to stack */
        movl    ARG_STATE, %ebp
        movl    A(%ebp), r0
        movl    B(%ebp), r1
        movl    C(%ebp), r2
        movl    D(%ebp), r3
        movl    r0, A + STACK_W(%esp)
        movl    r1, B + STACK_W(%esp)
        movl    r2, C + STACK_W(%esp)
        movl    r3, D + STACK_W(%esp)
        movl    E(%ebp), r1
        movl    F(%ebp), r2
        movl    G(%ebp), r3
        movl    H(%ebp), r4
        movl    r1, E + STACK_W(%esp)
        movl    r2, F + STACK_W(%esp)
        movl    r3, G + STACK_W(%esp)
        movl    r4, H + STACK_W(%esp)

        /* From here until the rounds r0 holds a and r1 holds e; the
           loading and expansion macros below do not touch them. */

        /* Get buf in MSB first order, W[0..15] */
        movl    ARG_BUF, r5
        GET_BUF(0)
        GET_BUF(2)
        GET_BUF(4)
        GET_BUF(6)
        GET_BUF(8)
        GET_BUF(10)
        GET_BUF(12)
        GET_BUF(14)

        /* Expand input, fill in W[16..63] */
        EXP_BUF0(16)
        EXP_BUFX(17)
        EXP_BUFX(18)
        EXP_BUFX(19)
        EXP_BUFX(20)
        EXP_BUFX(21)
        EXP_BUFX(22)
        EXP_BUFX(23)
        EXP_BUFX(24)
        EXP_BUFX(25)
        EXP_BUFX(26)
        EXP_BUFX(27)
        EXP_BUFX(28)
        EXP_BUFX(29)
        EXP_BUFX(30)
        EXP_BUFX(31)
        EXP_BUFX(32)
        EXP_BUFX(33)
        EXP_BUFX(34)
        EXP_BUFX(35)
        EXP_BUFX(36)
        EXP_BUFX(37)
        EXP_BUFX(38)
        EXP_BUFX(39)
        EXP_BUFX(40)
        EXP_BUFX(41)
        EXP_BUFX(42)
        EXP_BUFX(43)
        EXP_BUFX(44)
        EXP_BUFX(45)
        EXP_BUFX(46)
        EXP_BUFX(47)
        EXP_BUFX(48)
        EXP_BUFX(49)
        EXP_BUFX(50)
        EXP_BUFX(51)
        EXP_BUFX(52)
        EXP_BUFX(53)
        EXP_BUFX(54)
        EXP_BUFX(55)
        EXP_BUFX(56)
        EXP_BUFX(57)
        EXP_BUFX(58)
        EXP_BUFX(59)
        EXP_BUFX(60)
        EXP_BUFX(61)
        EXP_BUFX(62)
        EXP_BUFX(63)

        /* Hash, r0 and r1 set above, ebp is base address to state.  The
           slot names rotate by one position every round and are back in
           their original places after each group of eight rounds. */
        leal    STACK_W(%esp), %ebp

        ROUND(r0, B, C, D, r1, F, G, H, 0, 0x428a2f98);
        ROUND(r0, A, B, C, r1, E, F, G, 1, 0x71374491);
        ROUND(r0, H, A, B, r1, D, E, F, 2, 0xb5c0fbcf);
        ROUND(r0, G, H, A, r1, C, D, E, 3, 0xe9b5dba5);
        ROUND(r0, F, G, H, r1, B, C, D, 4, 0x3956c25b);
        ROUND(r0, E, F, G, r1, A, B, C, 5, 0x59f111f1);
        ROUND(r0, D, E, F, r1, H, A, B, 6, 0x923f82a4);
        ROUND(r0, C, D, E, r1, G, H, A, 7, 0xab1c5ed5);

        ROUND(r0, B, C, D, r1, F, G, H, 8, 0xd807aa98);
        ROUND(r0, A, B, C, r1, E, F, G, 9, 0x12835b01);
        ROUND(r0, H, A, B, r1, D, E, F, 10, 0x243185be);
        ROUND(r0, G, H, A, r1, C, D, E, 11, 0x550c7dc3);
        ROUND(r0, F, G, H, r1, B, C, D, 12, 0x72be5d74);
        ROUND(r0, E, F, G, r1, A, B, C, 13, 0x80deb1fe);
        ROUND(r0, D, E, F, r1, H, A, B, 14, 0x9bdc06a7);
        ROUND(r0, C, D, E, r1, G, H, A, 15, 0xc19bf174);

        ROUND(r0, B, C, D, r1, F, G, H, 16, 0xe49b69c1);
        ROUND(r0, A, B, C, r1, E, F, G, 17, 0xefbe4786);
        ROUND(r0, H, A, B, r1, D, E, F, 18, 0x0fc19dc6);
        ROUND(r0, G, H, A, r1, C, D, E, 19, 0x240ca1cc);
        ROUND(r0, F, G, H, r1, B, C, D, 20, 0x2de92c6f);
        ROUND(r0, E, F, G, r1, A, B, C, 21, 0x4a7484aa);
        ROUND(r0, D, E, F, r1, H, A, B, 22, 0x5cb0a9dc);
        ROUND(r0, C, D, E, r1, G, H, A, 23, 0x76f988da);

        ROUND(r0, B, C, D, r1, F, G, H, 24, 0x983e5152);
        ROUND(r0, A, B, C, r1, E, F, G, 25, 0xa831c66d);
        ROUND(r0, H, A, B, r1, D, E, F, 26, 0xb00327c8);
        ROUND(r0, G, H, A, r1, C, D, E, 27, 0xbf597fc7);
        ROUND(r0, F, G, H, r1, B, C, D, 28, 0xc6e00bf3);
        ROUND(r0, E, F, G, r1, A, B, C, 29, 0xd5a79147);
        ROUND(r0, D, E, F, r1, H, A, B, 30, 0x06ca6351);
        ROUND(r0, C, D, E, r1, G, H, A, 31, 0x14292967);

        ROUND(r0, B, C, D, r1, F, G, H, 32, 0x27b70a85);
        ROUND(r0, A, B, C, r1, E, F, G, 33, 0x2e1b2138);
        ROUND(r0, H, A, B, r1, D, E, F, 34, 0x4d2c6dfc);
        ROUND(r0, G, H, A, r1, C, D, E, 35, 0x53380d13);
        ROUND(r0, F, G, H, r1, B, C, D, 36, 0x650a7354);
        ROUND(r0, E, F, G, r1, A, B, C, 37, 0x766a0abb);
        ROUND(r0, D, E, F, r1, H, A, B, 38, 0x81c2c92e);
        ROUND(r0, C, D, E, r1, G, H, A, 39, 0x92722c85);

        ROUND(r0, B, C, D, r1, F, G, H, 40, 0xa2bfe8a1);
        ROUND(r0, A, B, C, r1, E, F, G, 41, 0xa81a664b);
        ROUND(r0, H, A, B, r1, D, E, F, 42, 0xc24b8b70);
        ROUND(r0, G, H, A, r1, C, D, E, 43, 0xc76c51a3);
        ROUND(r0, F, G, H, r1, B, C, D, 44, 0xd192e819);
        ROUND(r0, E, F, G, r1, A, B, C, 45, 0xd6990624);
        ROUND(r0, D, E, F, r1, H, A, B, 46, 0xf40e3585);
        ROUND(r0, C, D, E, r1, G, H, A, 47, 0x106aa070);

        ROUND(r0, B, C, D, r1, F, G, H, 48, 0x19a4c116);
        ROUND(r0, A, B, C, r1, E, F, G, 49, 0x1e376c08);
        ROUND(r0, H, A, B, r1, D, E, F, 50, 0x2748774c);
        ROUND(r0, G, H, A, r1, C, D, E, 51, 0x34b0bcb5);
        ROUND(r0, F, G, H, r1, B, C, D, 52, 0x391c0cb3);
        ROUND(r0, E, F, G, r1, A, B, C, 53, 0x4ed8aa4a);
        ROUND(r0, D, E, F, r1, H, A, B, 54, 0x5b9cca4f);
        ROUND(r0, C, D, E, r1, G, H, A, 55, 0x682e6ff3);

        ROUND(r0, B, C, D, r1, F, G, H, 56, 0x748f82ee);
        ROUND(r0, A, B, C, r1, E, F, G, 57, 0x78a5636f);
        ROUND(r0, H, A, B, r1, D, E, F, 58, 0x84c87814);
        ROUND(r0, G, H, A, r1, C, D, E, 59, 0x8cc70208);
        ROUND(r0, F, G, H, r1, B, C, D, 60, 0x90befffa);
        ROUND(r0, E, F, G, r1, A, B, C, 61, 0xa4506ceb);
        ROUND(r0, D, E, F, r1, H, A, B, 62, 0xbef9a3f7);
        /* Last round: the results stay in r0 and r1, no store needed */
        RND  (r0, C, D, E, r1, G, H, A, 63, 0xc67178f2);

        /* Update state from stack */
        movl    ARG_STATE, %ebp
        addl    r0, A(%ebp)             /* a from last round */
        addl    r1, E(%ebp)             /* e from last round */
        movl    B + STACK_W(%esp), r0
        movl    C + STACK_W(%esp), r1
        movl    D + STACK_W(%esp), r2
        movl    F + STACK_W(%esp), r3
        movl    G + STACK_W(%esp), r4
        movl    H + STACK_W(%esp), r5
        addl    r0, B(%ebp)
        addl    r1, C(%ebp)
        addl    r2, D(%ebp)
        addl    r3, F(%ebp)
        addl    r4, G(%ebp)
        addl    r5, H(%ebp)

        addl    $STACK_SIZE, %esp
        popl    %esi
        popl    %edi
        popl    %ebx
        popl    %ebp

        ret
#ifdef __ELF__
.size sha256_transform, . - sha256_transform
#endif

#endif /* SILC_SHA256_X86 */

/* Mark the object as not needing an executable stack.  Kept outside the
   feature guard so that the note is present even when the rest of the
   file is compiled out. */
#ifdef __ELF__
.section .note.GNU-stack,"",@progbits
#endif