5 Author: Pekka Riikonen <priikone@silcnet.org>
7 Copyright (C) 2007 Pekka Riikonen
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; version 2 of the License.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
20 /* SHA-256 x86 assembler implementation. This implements only the SHA-256
21 transform function and other parts are implemented in sha256.c. The
22 function preserves ebp, edx, edi and esi but does not preserve other registers.
25 This implementation uses only 32-bit registers. It does not use MMX or
26 SSE registers which could be used to enhance the performance, especially
27 when loading the W. This is about as fast as we can get with fewer than
28 eight 32-bit registers on a 32-bit CPU.
30 Benchmarks (megabytes (MB) per second), bigger is better:
32 Code Pentium 4 3.60 GHz Pentium M 1.60 GHz
33 -----------------------------------------------------------------------
34 SHA-256, asm 110.57 MB/sec 58.50 MB/sec
35 SHA-256, gcc 49.07 MB/sec 39.55 MB/sec
36 SHA-256, icc 109.97 MB/sec 55.69 MB/sec
39 - Test program was lib/silccrypt/tests/test_hash
40 - nice -n -20 was used with test_hash running as root
41 - ICC generates significantly better code than GCC for SSE2-capable
42 CPUs, and the generated code uses SSE registers. Hence the
43 comparable speed with the assembler code. Note that the GCC code
44 was also compiled with -msse2. Note that this assembler code
45 specifically does not use SSE or MMX, for better compatibility.
51 #ifdef SILC_SHA256_X86
/* Stack frame layout, in bytes.  After %esp is lowered by STACK_SIZE the
   W array is addressed from 0(%esp) (64 words of 4 bytes) and the eight
   working state words are addressed from STACK_W(%esp) (8 words of 4 bytes). */
53 #define STACK_STATE (8 * 4)
54 #define STACK_W (64 * 4)
/* NOTE(review): expands without outer parentheses.  That is harmless for the
   additive uses in this file, but it would mis-expand inside, for example, a
   multiplication. */
55 #define STACK_SIZE STACK_STATE + STACK_W
/* Memory operands for the slots at STACK_SIZE + 20 and STACK_SIZE + 24 above
   %esp.  Presumably these are the state and input-buffer arguments, with 20
   covering the return address plus four saved registers -- the pushes and the
   uses of these macros are not visible here, TODO confirm. */
56 #define ARG_STATE STACK_SIZE + 20(%esp)
57 #define ARG_BUF STACK_SIZE + 24(%esp)
75 /* One round of SHA-256. The a (r0) and e (r1) are inputs already in
76 registers. r0 will be the next round a, r1 the next round e. The
77 d and h are outputs and they are the r0 and r1 for next round. */
/* Arguments: a and e are register operands; W is the index of the message
   schedule word, read from W * 4(%esp); ki is the round constant, folded in
   as the leal displacement.  r4 and r5 are scratch.

   The macro overwrites a in place and its last instruction adds into r0
   directly, and d is accumulated in r1 directly, so it only works when a is
   r0 and e is r1 -- which is how every invocation in this file calls it.

   NOTE(review): in the lines shown here b, c, d, f, g and h are never
   referenced, and r2/r3/r1 are read as f/g, h, d and b/c without being
   loaded.  The loads (presumably from the state area at %ebp, matching the
   stores done by ROUND) appear to be missing from this listing -- TODO
   confirm against the complete file. */
78 #define RND(a, b, c, d, e, f, g, h, W, ki) \
82 movl e, r4; /* r4 = e, start of Sigma1 */ \
83 rorl $6, r4; /* r4 = e ror 6 */ \
84 movl r4, r5; /* r5 = e ror 6 */ \
85 rorl $5, r4; /* r4 = e ror 11 (6 + 5) */ \
86 xorl r4, r5; /* r5 = (e ror 6) ^ (e ror 11) */ \
87 rorl $14, r4; /* r4 = e ror 25 (11 + 14) */ \
88 xorl r5, r4; /* r4 = Sigma1(e) */ \
90 movl r3, r5; /* r5 = g (r3 expected to hold g) */ \
91 xorl r2, r5; /* r5 = g ^ f (r2 expected to hold f) */ \
92 andl e, r5; /* r5 = (g ^ f) & e */ \
93 xorl r3, r5; /* r5 = Ch(e, f, g) = ((g ^ f) & e) ^ g */ \
95 leal ki(r4, r5), r4; /* t0 = Sigma1 + Ch + ki, one add via leal */ \
98 addl W * 4(%esp), r4; /* t0 += W[i] */ \
99 addl r4, r3; /* r3 = h + t0 (r3 expected to hold h) */ \
100 addl r3, r1; /* r1 = d + h + t0: next round's e */ \
102 movl a, r4; /* r4 = a, start of Sigma0 */ \
103 rorl $2, r4; /* r4 = a ror 2 */ \
104 movl r4, r5; /* r5 = a ror 2 */ \
105 rorl $11, r4; /* r4 = a ror 13 (2 + 11) */ \
106 xorl r4, r5; /* r5 = (a ror 2) ^ (a ror 13) */ \
107 rorl $9, r4; /* r4 = a ror 22 (13 + 9) */ \
108 xorl r5, r4; /* r4 = Sigma0(a) */ \
110 addl r3, r4; /* r4 = Sigma0 + (h + t0) */ \
114 movl r2, r5; /* r5 = b (r2 expected to hold b) */ \
115 orl a, r5; /* r5 = b | a */ \
116 andl r3, r5; /* r5 = (b | a) & c (r3 expected to hold c) */ \
117 andl r2, a; /* a = a & b, a is overwritten in place */ \
118 orl r5, a; /* a = Maj(a, b, c) = (a & b) | ((a | b) & c) */ \
119 addl r4, r0; /* r0 = Maj + Sigma0 + h + t0: next round's a */
/* One round plus write-back: expands RND with the same arguments, then
   stores r1 into the d slot and r0 into the h slot of the state area
   addressed by %ebp.  d and h must therefore be displacements that are valid
   against %ebp. */
121 #define ROUND(a, b, c, d, e, f, g, h, W, ki) \
122 RND(a, b, c, d, e, f, g, h, W, ki) \
123 movl r1, d(%ebp); /* d slot = r1, the value RND leaves as next e */ \
124 movl r0, h(%ebp); /* h slot = r0, the value RND leaves as next a */
126 /* Get 64 bits from input buffer in MSB first order.
   NOTE(review): the macro's #define line is not present in this listing; the
   invocations GET_BUF(0) ... GET_BUF(14) suggest it is GET_BUF(i) -- TODO
   confirm.  As shown, the body copies the two 32-bit words at i * 4(r5) and
   (i + 1) * 4(r5) through r4 and r3 into W[i] and W[i + 1] on the stack.  No
   byte swap is visible between the loads and the stores, so the "MSB first"
   conversion is presumably in lines missing here. */
128 movl i * 4(r5), r4; /* r4 = word i, addressed from r5 */ \
129 movl (i + 1) * 4(r5), r3; /* r3 = word i + 1 */ \
132 movl r4, i * 4(%esp); /* W[i] = r4 */ \
133 movl r3, (i + 1) * 4(%esp); /* W[i + 1] = r3 */
135 /* Expand the input: compute one message schedule word,
   W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16].
   On entry r2 and r4 are both expected to hold W[i-2] (EXP_BUFX loads them
   that way); r3 and r5 are scratch.
   NOTE(review): the #define line for this body is not present in this
   listing (presumably EXP_BUF(i)) -- TODO confirm against the complete
   file. */
137 rorl $17, r4; /* r4 = w-2 ror 17 */ \
138 movl r4, r5; /* r5 = w-2 ror 17 */ \
139 rorl $2, r4; /* r4 = w-2 ror 19 (17 + 2) */ \
140 xorl r4, r5; /* r5 = (w-2 ror 17) ^ (w-2 ror 19) */ \
141 shrl $10, r2; /* r2 = w-2 >> 10, a shift not a rotate */ \
142 xorl r5, r2; /* r2 = Gamma1(w-2) */ \
144 movl (i - 15) * 4(%esp), r3; /* r3 = W[i-15] */ \
145 movl r3, r4; /* r4 = w-15, start of Gamma0 */ \
146 rorl $7, r4; /* r4 = w-15 ror 7 */ \
147 movl r4, r5; /* r5 = w-15 ror 7 */ \
148 rorl $11, r4; /* r4 = w-15 ror 18 (7 + 11) */ \
149 xorl r4, r5; /* r5 = (w-15 ror 7) ^ (w-15 ror 18) */ \
150 shrl $3, r3; /* r3 = w-15 >> 3, a shift not a rotate */ \
151 xorl r5, r3; /* r3 = Gamma0(w-15) */ \
153 addl (i - 7) * 4(%esp), r2; /* r2 = Gamma1 + W[i-7] */ \
154 addl (i - 16) * 4(%esp), r2; /* r2 += W[i-16] */ \
155 addl r2, r3; /* r3 = Gamma0 + Gamma1 + W[i-7] + W[i-16] */ \
156 movl r3, i * 4(%esp); /* W[i] = r3 */
/* Entry macros for the message expansion.  In this file EXP_BUF0 is invoked
   once, for i = 16, and EXP_BUFX for i = 17..63.  The visible part of
   EXP_BUFX loads W[i-2] from the stack into r2 and copies it to r4.
   NOTE(review): both macros are truncated in this listing.  EXP_BUF0 has no
   body lines before the next #define, and EXP_BUFX stops on a continuation
   line.  Presumably each goes on to invoke the expansion body, with EXP_BUF0
   relying on W[14] still being in a register after the input load -- TODO
   confirm against the complete file. */
158 #define EXP_BUF0(i) \
162 #define EXP_BUFX(i) \
163 movl (i - 2) * 4(%esp), r2; \
164 movl r2, r4; /* w-2 to Gamma1 */ \
/* sha256_transform: SHA-256 compression of one 16-word input block.
   Visible flow: reserve STACK_SIZE bytes of stack, copy the eight state words
   to the stack at STACK_W(%esp), build W[0..63] at 0(%esp), run 64 fully
   unrolled rounds, fold the result back into the state and release the stack.
   NOTE(review): this listing is incomplete.  The function label, the register
   saves and restores, the loads of the arguments and of the state words, the
   final adds for B..D and F..H and the ret are not visible.  TODO confirm
   every note below against the complete file. */
170 .globl sha256_transform
176 subl $STACK_SIZE, %esp
/* Working copy of the state on the stack.  r0..r3 are stored as A..D, then
   r1..r4 as E..H; the loads that fill these registers are not visible. */
184 movl r0, A + STACK_W(%esp)
185 movl r1, B + STACK_W(%esp)
186 movl r2, C + STACK_W(%esp)
187 movl r3, D + STACK_W(%esp)
/* r1 is reused here for E, which leaves a in r0 and e in r1 for round 0. */
192 movl r1, E + STACK_W(%esp)
193 movl r2, F + STACK_W(%esp)
194 movl r3, G + STACK_W(%esp)
195 movl r4, H + STACK_W(%esp)
197 /* Get buf in MSB first order, W[0..15] */
/* Each GET_BUF handles two words, hence the even indices only.  GET_BUF
   addresses the input through r5; the load of r5 is not visible here. */
199 GET_BUF(0) GET_BUF(2) GET_BUF(4) GET_BUF(6)
200 GET_BUF(8) GET_BUF(10) GET_BUF(12) GET_BUF(14)
202 /* Expand input, fill in W[16..63] */
/* Strictly in ascending order: each word reads W[i-2], W[i-7], W[i-15] and
   W[i-16], all of which must already have been written. */
203 EXP_BUF0(16) EXP_BUFX(17) EXP_BUFX(18) EXP_BUFX(19) EXP_BUFX(20)
204 EXP_BUFX(21) EXP_BUFX(22) EXP_BUFX(23) EXP_BUFX(24) EXP_BUFX(25)
205 EXP_BUFX(26) EXP_BUFX(27) EXP_BUFX(28) EXP_BUFX(29) EXP_BUFX(30)
206 EXP_BUFX(31) EXP_BUFX(32) EXP_BUFX(33) EXP_BUFX(34) EXP_BUFX(35)
207 EXP_BUFX(36) EXP_BUFX(37) EXP_BUFX(38) EXP_BUFX(39) EXP_BUFX(40)
208 EXP_BUFX(41) EXP_BUFX(42) EXP_BUFX(43) EXP_BUFX(44) EXP_BUFX(45)
209 EXP_BUFX(46) EXP_BUFX(47) EXP_BUFX(48) EXP_BUFX(49) EXP_BUFX(50)
210 EXP_BUFX(51) EXP_BUFX(52) EXP_BUFX(53) EXP_BUFX(54) EXP_BUFX(55)
211 EXP_BUFX(56) EXP_BUFX(57) EXP_BUFX(58) EXP_BUFX(59) EXP_BUFX(60)
212 EXP_BUFX(61) EXP_BUFX(62) EXP_BUFX(63)
214 /* Hash, r0 and r1 set above, ebp is base address to state */
/* %ebp points at the stack copy of the state, so the letter arguments below
   are displacements into that copy (A..H are defined outside this listing,
   presumably as 0, 4, ..., 28 -- TODO confirm).  Every round passes r0 and r1
   for a and e.  The slot names shift by one position per round and repeat
   every eight rounds, so values stay where they are in memory and only their
   roles rotate.  The last two arguments are the W index and the SHA-256 round
   constant K[i]. */
215 leal STACK_W(%esp), %ebp
217 ROUND(r0, B, C, D, r1, F, G, H, 0, 0x428a2f98);
218 ROUND(r0, A, B, C, r1, E, F, G, 1, 0x71374491);
219 ROUND(r0, H, A, B, r1, D, E, F, 2, 0xb5c0fbcf);
220 ROUND(r0, G, H, A, r1, C, D, E, 3, 0xe9b5dba5);
221 ROUND(r0, F, G, H, r1, B, C, D, 4, 0x3956c25b);
222 ROUND(r0, E, F, G, r1, A, B, C, 5, 0x59f111f1);
223 ROUND(r0, D, E, F, r1, H, A, B, 6, 0x923f82a4);
224 ROUND(r0, C, D, E, r1, G, H, A, 7, 0xab1c5ed5);
226 ROUND(r0, B, C, D, r1, F, G, H, 8, 0xd807aa98);
227 ROUND(r0, A, B, C, r1, E, F, G, 9, 0x12835b01);
228 ROUND(r0, H, A, B, r1, D, E, F, 10, 0x243185be);
229 ROUND(r0, G, H, A, r1, C, D, E, 11, 0x550c7dc3);
230 ROUND(r0, F, G, H, r1, B, C, D, 12, 0x72be5d74);
231 ROUND(r0, E, F, G, r1, A, B, C, 13, 0x80deb1fe);
232 ROUND(r0, D, E, F, r1, H, A, B, 14, 0x9bdc06a7);
233 ROUND(r0, C, D, E, r1, G, H, A, 15, 0xc19bf174);
235 ROUND(r0, B, C, D, r1, F, G, H, 16, 0xe49b69c1);
236 ROUND(r0, A, B, C, r1, E, F, G, 17, 0xefbe4786);
237 ROUND(r0, H, A, B, r1, D, E, F, 18, 0x0fc19dc6);
238 ROUND(r0, G, H, A, r1, C, D, E, 19, 0x240ca1cc);
239 ROUND(r0, F, G, H, r1, B, C, D, 20, 0x2de92c6f);
240 ROUND(r0, E, F, G, r1, A, B, C, 21, 0x4a7484aa);
241 ROUND(r0, D, E, F, r1, H, A, B, 22, 0x5cb0a9dc);
242 ROUND(r0, C, D, E, r1, G, H, A, 23, 0x76f988da);
244 ROUND(r0, B, C, D, r1, F, G, H, 24, 0x983e5152);
245 ROUND(r0, A, B, C, r1, E, F, G, 25, 0xa831c66d);
246 ROUND(r0, H, A, B, r1, D, E, F, 26, 0xb00327c8);
247 ROUND(r0, G, H, A, r1, C, D, E, 27, 0xbf597fc7);
248 ROUND(r0, F, G, H, r1, B, C, D, 28, 0xc6e00bf3);
249 ROUND(r0, E, F, G, r1, A, B, C, 29, 0xd5a79147);
250 ROUND(r0, D, E, F, r1, H, A, B, 30, 0x06ca6351);
251 ROUND(r0, C, D, E, r1, G, H, A, 31, 0x14292967);
253 ROUND(r0, B, C, D, r1, F, G, H, 32, 0x27b70a85);
254 ROUND(r0, A, B, C, r1, E, F, G, 33, 0x2e1b2138);
255 ROUND(r0, H, A, B, r1, D, E, F, 34, 0x4d2c6dfc);
256 ROUND(r0, G, H, A, r1, C, D, E, 35, 0x53380d13);
257 ROUND(r0, F, G, H, r1, B, C, D, 36, 0x650a7354);
258 ROUND(r0, E, F, G, r1, A, B, C, 37, 0x766a0abb);
259 ROUND(r0, D, E, F, r1, H, A, B, 38, 0x81c2c92e);
260 ROUND(r0, C, D, E, r1, G, H, A, 39, 0x92722c85);
262 ROUND(r0, B, C, D, r1, F, G, H, 40, 0xa2bfe8a1);
263 ROUND(r0, A, B, C, r1, E, F, G, 41, 0xa81a664b);
264 ROUND(r0, H, A, B, r1, D, E, F, 42, 0xc24b8b70);
265 ROUND(r0, G, H, A, r1, C, D, E, 43, 0xc76c51a3);
266 ROUND(r0, F, G, H, r1, B, C, D, 44, 0xd192e819);
267 ROUND(r0, E, F, G, r1, A, B, C, 45, 0xd6990624);
268 ROUND(r0, D, E, F, r1, H, A, B, 46, 0xf40e3585);
269 ROUND(r0, C, D, E, r1, G, H, A, 47, 0x106aa070);
271 ROUND(r0, B, C, D, r1, F, G, H, 48, 0x19a4c116);
272 ROUND(r0, A, B, C, r1, E, F, G, 49, 0x1e376c08);
273 ROUND(r0, H, A, B, r1, D, E, F, 50, 0x2748774c);
274 ROUND(r0, G, H, A, r1, C, D, E, 51, 0x34b0bcb5);
275 ROUND(r0, F, G, H, r1, B, C, D, 52, 0x391c0cb3);
276 ROUND(r0, E, F, G, r1, A, B, C, 53, 0x4ed8aa4a);
277 ROUND(r0, D, E, F, r1, H, A, B, 54, 0x5b9cca4f);
278 ROUND(r0, C, D, E, r1, G, H, A, 55, 0x682e6ff3);
280 ROUND(r0, B, C, D, r1, F, G, H, 56, 0x748f82ee);
281 ROUND(r0, A, B, C, r1, E, F, G, 57, 0x78a5636f);
282 ROUND(r0, H, A, B, r1, D, E, F, 58, 0x84c87814);
283 ROUND(r0, G, H, A, r1, C, D, E, 59, 0x8cc70208);
284 ROUND(r0, F, G, H, r1, B, C, D, 60, 0x90befffa);
285 ROUND(r0, E, F, G, r1, A, B, C, 61, 0xa4506ceb);
286 ROUND(r0, D, E, F, r1, H, A, B, 62, 0xbef9a3f7);
/* The final round uses RND, not ROUND: its a and e are left in r0 and r1 and
   are added into the state directly below instead of being stored back. */
287 RND (r0, C, D, E, r1, G, H, A, 63, 0xc67178f2);
289 /* Update state from stack */
/* NOTE(review): as shown, %ebp still holds the stack address set by the leal
   above.  A reload of %ebp with the caller's state pointer is presumably
   among the missing lines, since the loads below read the stack copy through
   %esp instead -- TODO confirm. */
291 addl r0, A(%ebp) /* a from last round */
292 addl r1, E(%ebp) /* e from last round */
/* Fetch the remaining six working words from the stack copy.  The adds that
   fold them into the state are not visible in this listing. */
293 movl B + STACK_W(%esp), r0
294 movl C + STACK_W(%esp), r1
295 movl D + STACK_W(%esp), r2
296 movl F + STACK_W(%esp), r3
297 movl G + STACK_W(%esp), r4
298 movl H + STACK_W(%esp), r5
/* Release the W array and the state copy reserved at entry. */
306 addl $STACK_SIZE, %esp
314 #endif /* SILC_SHA256_X86 */