5 Author: Pekka Riikonen <priikone@silcnet.org>
7 Copyright (C) 2007 Pekka Riikonen
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; version 2 of the License.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
20 /* SHA-256 x86 assembler implementation. This implements only the SHA-256
21 transform function and other parts are implemented in sha256.c. The
22 function preserves ebp, edx, edi and esi but does not preserve other registers.
25 This implementation uses only 32-bit registers. It does not use MMX or
26 SSE registers which could be used to enhance the performance, especially
27 when loading W. This is about as fast as we can get with fewer than
28 eight 32-bit registers on a 32-bit CPU.
30 Benchmarks (megabytes (MB) per second), bigger is better:
32 Code P4 3.60 GHz PM 1.60 GHz Xeon 5160 3.00 GHz
33 ----------------------------------------------------------------------
34 SHA-256, asm 110.57 MB/sec 58.50 MB/sec 146.43 MB/sec
35 SHA-256, gcc 49.07 MB/sec 39.55 MB/sec 82.14 MB/sec
36 SHA-256, icc 109.97 MB/sec 55.69 MB/sec N/A
39 - Test program was lib/silccrypt/tests/test_hash
40 - nice -n -20 was used with test_hash running as root
41 - P4 is Pentium 4, PM is Pentium M, Xeon 5160 is 64-bit CPU but the OS
42 had 32-bit kernel in the test.
43 - ICC generates significantly better code than GCC for SSE2-capable
44 CPUs, and the generated code uses SSE registers, hence its speed is
45 comparable to that of the assembler code. Note that the GCC code
46 was also compiled with -msse2, and that this assembler code
47 deliberately does not use SSE or MMX, for better compatibility.
51 #include "../../silcdefs.h"
53 #ifdef SILC_SHA256_X86
/* Stack frame layout.  The transform reserves STACK_SIZE bytes with
   "subl $STACK_SIZE, %esp".  The message schedule W is addressed from
   offset 0 of %esp (W[i] is at i * 4(%esp)) and the working copy of the
   eight state words is addressed from offset STACK_W.

   STACK_STATE  size in bytes of the state copy (8 words of 4 bytes)
   STACK_W      size in bytes of W (64 words of 4 bytes)
   STACK_SIZE   total number of bytes reserved
   ARG_STATE    %esp-relative memory operand, STACK_SIZE + 20 bytes up
   ARG_BUF      %esp-relative memory operand, STACK_SIZE + 24 bytes up

   STACK_SIZE expands to an unparenthesized sum; in the uses visible here
   it only ever appears as an additive term, so the expansion is safe.

   NOTE(review): the 20/24 offsets presumably skip the return address and
   four saved registers to reach the two C arguments; ARG_STATE/ARG_BUF
   are only valid while %esp is lowered by exactly STACK_SIZE.  Confirm
   against the function prologue. */
55 #define STACK_STATE (8 * 4)
56 #define STACK_W (64 * 4)
57 #define STACK_SIZE STACK_STATE + STACK_W
58 #define ARG_STATE STACK_SIZE + 20(%esp)
59 #define ARG_BUF STACK_SIZE + 24(%esp)
77 /* One round of SHA-256. The a (r0) and e (r1) are inputs already in
78 registers. r0 will be the next round a, r1 the next round e. The
79 d and h are outputs and they are the r0 and r1 for next round.

   Parameters: a and e are used directly as register operands; W is the
   index of the schedule word, read from W * 4(%esp); ki is the round
   constant, folded in as the displacement of the leal.  r4 and r5 are
   scratch registers.  The macro itself does not write to the state
   slots; ROUND stores r1 to slot d and r0 to slot h afterwards.

   In the instructions below r2/r3 are read as f/g for Ch, r3/r1 as h/d
   for the t0 additions, and r2/r3 as b/c for Maj.
   NOTE(review): the instructions that load those values into r1, r2 and
   r3 are not visible in this view of the macro -- confirm they precede
   each group.

   The rotates are applied cumulatively to r4, so the counts 6, 5, 14
   give rotations by 6, 11 and 25 (Sigma1) and the counts 2, 11, 9 give
   rotations by 2, 13 and 22 (Sigma0). */
80 #define RND(a, b, c, d, e, f, g, h, W, ki) \
84 movl e, r4; /* r4 = e */ \
85 rorl $6, r4; /* r4 = ror(e, 6) */ \
86 movl r4, r5; /* r5 = ror(e, 6) */ \
87 rorl $5, r4; /* r4 = ror(e, 11) */ \
88 xorl r4, r5; /* r5 = ror(e, 6) ^ ror(e, 11) */ \
89 rorl $14, r4; /* r4 = ror(e, 25) */ \
90 xorl r5, r4; /* r4 = Sigma1(e) */ \
92 movl r3, r5; /* r5 = g */ \
93 xorl r2, r5; /* r5 = f ^ g */ \
94 andl e, r5; /* r5 = (f ^ g) & e */ \
95 xorl r3, r5; /* r5 = Ch(e, f, g) = ((f ^ g) & e) ^ g */ \
97 leal ki(r4, r5), r4; /* t0 = Sigma1 + Ch + ki, flags untouched */ \
100 addl W * 4(%esp), r4; /* t0 += W[i] */ \
101 addl r4, r3; /* r3 = h + t0 */ \
102 addl r3, r1; /* r1 = d + h + t0, the next round e */ \
104 movl a, r4; /* r4 = a */ \
105 rorl $2, r4; /* r4 = ror(a, 2) */ \
106 movl r4, r5; /* r5 = ror(a, 2) */ \
107 rorl $11, r4; /* r4 = ror(a, 13) */ \
108 xorl r4, r5; /* r5 = ror(a, 2) ^ ror(a, 13) */ \
109 rorl $9, r4; /* r4 = ror(a, 22) */ \
110 xorl r5, r4; /* r4 = Sigma0(a) */ \
112 addl r3, r4; /* t1 = Sigma0 + h + t0 */ \
116 movl r2, r5; /* r5 = b */ \
117 orl a, r5; /* r5 = a | b */ \
118 andl r3, r5; /* r5 = (a | b) & c */ \
119 andl r2, a; /* a = a & b */ \
120 orl r5, a; /* a = Maj(a, b, c) = ((a | b) & c) | (a & b) */ \
121 addl r4, r0; /* r0 = Maj + t1, the next round a */
/* One round plus write-back: runs RND and then stores its two results
   into the state copy addressed through %ebp.  r1 (the next round e)
   goes to slot d and r0 (the next round a) goes to slot h, so the other
   six values never have to move; the caller rotates the slot names
   passed as b, c, d, f, g, h from round to round instead. */
123 #define ROUND(a, b, c, d, e, f, g, h, W, ki) \
124 RND(a, b, c, d, e, f, g, h, W, ki) \
125 movl r1, d(%ebp); /* slot d = next round e */ \
126 movl r0, h(%ebp); /* slot h = next round a */
128 /* Get 64 bits from input buffer in MSB first order: load words i and
   i + 1 from the buffer addressed by r5 and store them to W[i] and
   W[i + 1] at the bottom of the stack frame.  r4 and r3 are left
   holding those two words; r5 is not modified.
   NOTE(review): the macro header and the byte-order conversion implied
   by "MSB first" are not visible in this view -- confirm that they sit
   between the loads and the stores. */
130 movl i * 4(r5), r4; \
131 movl (i + 1) * 4(r5), r3; \
134 movl r4, i * 4(%esp); \
135 movl r3, (i + 1) * 4(%esp);
137 /* Expand the input: compute
     W[i] = Gamma1(w-2) + w-7 + Gamma0(w-15) + w-16
   and store it at i * 4(%esp), where w-n stands for W[i - n].

   On entry r2 and r4 are both expected to hold w-2 (r4 is rotated, r2 is
   shifted).  On exit r3 holds the new W[i]; r2, r4 and r5 are clobbered.
   The rotates are cumulative on r4: 17 then 2 gives rotations by 17 and
   19 (Gamma1), 7 then 11 gives rotations by 7 and 18 (Gamma0).
   NOTE(review): the #define line of this macro is not visible in this
   view -- confirm its name and parameter. */
139 rorl $17, r4; /* r4 = ror(w-2, 17) */ \
140 movl r4, r5; /* r5 = ror(w-2, 17) */ \
141 rorl $2, r4; /* r4 = ror(w-2, 19) */ \
142 xorl r4, r5; /* r5 = ror(w-2, 17) ^ ror(w-2, 19) */ \
143 shrl $10, r2; /* r2 = w-2 >> 10 (logical shift) */ \
144 xorl r5, r2; /* r2 = Gamma1(w-2) */ \
146 movl (i - 15) * 4(%esp), r3; \
147 movl r3, r4; /* r4 = w-15 */ \
148 rorl $7, r4; /* r4 = ror(w-15, 7) */ \
149 movl r4, r5; /* r5 = ror(w-15, 7) */ \
150 rorl $11, r4; /* r4 = ror(w-15, 18) */ \
151 xorl r4, r5; /* r5 = ror(w-15, 7) ^ ror(w-15, 18) */ \
152 shrl $3, r3; /* r3 = w-15 >> 3 (logical shift) */ \
153 xorl r5, r3; /* r3 = Gamma0(w-15) */ \
155 addl (i - 7) * 4(%esp), r2; /* r2 = Gamma1 + w-7 */ \
156 addl (i - 16) * 4(%esp), r2; /* r2 = Gamma1 + w-7 + w-16 */ \
157 addl r2, r3; /* r3 = Gamma0 + Gamma1 + w-7 + w-16 */ \
158 movl r3, i * 4(%esp);
/* Entry points for the expansion step.  EXP_BUFX(i) loads w-2 = W[i - 2]
   from the stack into r2 and copies it to r4.  EXP_BUF0(i) is invoked
   only for the first expanded word (i = 16).
   NOTE(review): the body of EXP_BUF0 and the remainder of EXP_BUFX are
   not visible in this view; presumably both set up r2/r4 and then
   continue into the shared expansion code -- confirm. */
160 #define EXP_BUF0(i) \
164 #define EXP_BUFX(i) \
165 movl (i - 2) * 4(%esp), r2; \
166 movl r2, r4; /* r4 = w-2 too: r4 is rotated, r2 is shifted */ \
172 .globl sha256_transform
/* sha256_transform: the SHA-256 compression function for one input block.

   Visible steps: reserve STACK_SIZE bytes, spill the eight state words to
   the stack copy, load W[0..15] from the input buffer, expand W[16..63],
   run the 64 rounds against the stack copy, then fold the results back
   and release the stack.

   NOTE(review): the function label, the register saves, the loads of
   ARG_STATE / ARG_BUF and the final additions into the caller's state are
   not visible in this view -- confirm them before relying on the notes
   below. */

/* Reserve room for W[0..63] and the working copy of the state */
178 subl $STACK_SIZE, %esp
/* Spill a, b, c, d (in r0..r3) to the state copy above W.
   NOTE(review): the loads that fill r0..r3 are not visible here. */
186 movl r0, A + STACK_W(%esp)
187 movl r1, B + STACK_W(%esp)
188 movl r2, C + STACK_W(%esp)
189 movl r3, D + STACK_W(%esp)
/* Spill e, f, g, h (in r1..r4).  r0 still holds a and r1 now holds e,
   which is what the first round expects; nothing between here and the
   rounds writes to r0 or r1 in the visible code. */
194 movl r1, E + STACK_W(%esp)
195 movl r2, F + STACK_W(%esp)
196 movl r3, G + STACK_W(%esp)
197 movl r4, H + STACK_W(%esp)
199 /* Get buf in MSB first order, W[0..15] */
/* Each GET_BUF handles two consecutive words and reads through r5.
   NOTE(review): the load of the buffer pointer into r5 is not visible. */
201 GET_BUF(0) GET_BUF(2) GET_BUF(4) GET_BUF(6)
202 GET_BUF(8) GET_BUF(10) GET_BUF(12) GET_BUF(14)
204 /* Expand input, fill in W[16..63] */
205 EXP_BUF0(16) EXP_BUFX(17) EXP_BUFX(18) EXP_BUFX(19) EXP_BUFX(20)
206 EXP_BUFX(21) EXP_BUFX(22) EXP_BUFX(23) EXP_BUFX(24) EXP_BUFX(25)
207 EXP_BUFX(26) EXP_BUFX(27) EXP_BUFX(28) EXP_BUFX(29) EXP_BUFX(30)
208 EXP_BUFX(31) EXP_BUFX(32) EXP_BUFX(33) EXP_BUFX(34) EXP_BUFX(35)
209 EXP_BUFX(36) EXP_BUFX(37) EXP_BUFX(38) EXP_BUFX(39) EXP_BUFX(40)
210 EXP_BUFX(41) EXP_BUFX(42) EXP_BUFX(43) EXP_BUFX(44) EXP_BUFX(45)
211 EXP_BUFX(46) EXP_BUFX(47) EXP_BUFX(48) EXP_BUFX(49) EXP_BUFX(50)
212 EXP_BUFX(51) EXP_BUFX(52) EXP_BUFX(53) EXP_BUFX(54) EXP_BUFX(55)
213 EXP_BUFX(56) EXP_BUFX(57) EXP_BUFX(58) EXP_BUFX(59) EXP_BUFX(60)
214 EXP_BUFX(61) EXP_BUFX(62) EXP_BUFX(63)
216 /* Hash, r0 and r1 set above, ebp is base address to state */
217 leal STACK_W(%esp), %ebp
/* 64 rounds, written as eight groups of eight.  The slot names passed as
   b, c, d, f, g, h shift by one position every round, so after each
   group of eight they are back where they started.  Only the new a (r0)
   and the new e (r1) are written to memory per round. */
219 ROUND(r0, B, C, D, r1, F, G, H, 0, 0x428a2f98);
220 ROUND(r0, A, B, C, r1, E, F, G, 1, 0x71374491);
221 ROUND(r0, H, A, B, r1, D, E, F, 2, 0xb5c0fbcf);
222 ROUND(r0, G, H, A, r1, C, D, E, 3, 0xe9b5dba5);
223 ROUND(r0, F, G, H, r1, B, C, D, 4, 0x3956c25b);
224 ROUND(r0, E, F, G, r1, A, B, C, 5, 0x59f111f1);
225 ROUND(r0, D, E, F, r1, H, A, B, 6, 0x923f82a4);
226 ROUND(r0, C, D, E, r1, G, H, A, 7, 0xab1c5ed5);
228 ROUND(r0, B, C, D, r1, F, G, H, 8, 0xd807aa98);
229 ROUND(r0, A, B, C, r1, E, F, G, 9, 0x12835b01);
230 ROUND(r0, H, A, B, r1, D, E, F, 10, 0x243185be);
231 ROUND(r0, G, H, A, r1, C, D, E, 11, 0x550c7dc3);
232 ROUND(r0, F, G, H, r1, B, C, D, 12, 0x72be5d74);
233 ROUND(r0, E, F, G, r1, A, B, C, 13, 0x80deb1fe);
234 ROUND(r0, D, E, F, r1, H, A, B, 14, 0x9bdc06a7);
235 ROUND(r0, C, D, E, r1, G, H, A, 15, 0xc19bf174);
237 ROUND(r0, B, C, D, r1, F, G, H, 16, 0xe49b69c1);
238 ROUND(r0, A, B, C, r1, E, F, G, 17, 0xefbe4786);
239 ROUND(r0, H, A, B, r1, D, E, F, 18, 0x0fc19dc6);
240 ROUND(r0, G, H, A, r1, C, D, E, 19, 0x240ca1cc);
241 ROUND(r0, F, G, H, r1, B, C, D, 20, 0x2de92c6f);
242 ROUND(r0, E, F, G, r1, A, B, C, 21, 0x4a7484aa);
243 ROUND(r0, D, E, F, r1, H, A, B, 22, 0x5cb0a9dc);
244 ROUND(r0, C, D, E, r1, G, H, A, 23, 0x76f988da);
246 ROUND(r0, B, C, D, r1, F, G, H, 24, 0x983e5152);
247 ROUND(r0, A, B, C, r1, E, F, G, 25, 0xa831c66d);
248 ROUND(r0, H, A, B, r1, D, E, F, 26, 0xb00327c8);
249 ROUND(r0, G, H, A, r1, C, D, E, 27, 0xbf597fc7);
250 ROUND(r0, F, G, H, r1, B, C, D, 28, 0xc6e00bf3);
251 ROUND(r0, E, F, G, r1, A, B, C, 29, 0xd5a79147);
252 ROUND(r0, D, E, F, r1, H, A, B, 30, 0x06ca6351);
253 ROUND(r0, C, D, E, r1, G, H, A, 31, 0x14292967);
255 ROUND(r0, B, C, D, r1, F, G, H, 32, 0x27b70a85);
256 ROUND(r0, A, B, C, r1, E, F, G, 33, 0x2e1b2138);
257 ROUND(r0, H, A, B, r1, D, E, F, 34, 0x4d2c6dfc);
258 ROUND(r0, G, H, A, r1, C, D, E, 35, 0x53380d13);
259 ROUND(r0, F, G, H, r1, B, C, D, 36, 0x650a7354);
260 ROUND(r0, E, F, G, r1, A, B, C, 37, 0x766a0abb);
261 ROUND(r0, D, E, F, r1, H, A, B, 38, 0x81c2c92e);
262 ROUND(r0, C, D, E, r1, G, H, A, 39, 0x92722c85);
264 ROUND(r0, B, C, D, r1, F, G, H, 40, 0xa2bfe8a1);
265 ROUND(r0, A, B, C, r1, E, F, G, 41, 0xa81a664b);
266 ROUND(r0, H, A, B, r1, D, E, F, 42, 0xc24b8b70);
267 ROUND(r0, G, H, A, r1, C, D, E, 43, 0xc76c51a3);
268 ROUND(r0, F, G, H, r1, B, C, D, 44, 0xd192e819);
269 ROUND(r0, E, F, G, r1, A, B, C, 45, 0xd6990624);
270 ROUND(r0, D, E, F, r1, H, A, B, 46, 0xf40e3585);
271 ROUND(r0, C, D, E, r1, G, H, A, 47, 0x106aa070);
273 ROUND(r0, B, C, D, r1, F, G, H, 48, 0x19a4c116);
274 ROUND(r0, A, B, C, r1, E, F, G, 49, 0x1e376c08);
275 ROUND(r0, H, A, B, r1, D, E, F, 50, 0x2748774c);
276 ROUND(r0, G, H, A, r1, C, D, E, 51, 0x34b0bcb5);
277 ROUND(r0, F, G, H, r1, B, C, D, 52, 0x391c0cb3);
278 ROUND(r0, E, F, G, r1, A, B, C, 53, 0x4ed8aa4a);
279 ROUND(r0, D, E, F, r1, H, A, B, 54, 0x5b9cca4f);
280 ROUND(r0, C, D, E, r1, G, H, A, 55, 0x682e6ff3);
282 ROUND(r0, B, C, D, r1, F, G, H, 56, 0x748f82ee);
283 ROUND(r0, A, B, C, r1, E, F, G, 57, 0x78a5636f);
284 ROUND(r0, H, A, B, r1, D, E, F, 58, 0x84c87814);
285 ROUND(r0, G, H, A, r1, C, D, E, 59, 0x8cc70208);
286 ROUND(r0, F, G, H, r1, B, C, D, 60, 0x90befffa);
287 ROUND(r0, E, F, G, r1, A, B, C, 61, 0xa4506ceb);
288 ROUND(r0, D, E, F, r1, H, A, B, 62, 0xbef9a3f7);
/* Last round uses plain RND: the final a and e stay in r0 and r1 and are
   not written to the stack copy. */
289 RND (r0, C, D, E, r1, G, H, A, 63, 0xc67178f2);
291 /* Update state from stack */
/* 64 is a multiple of the 8-round rotation, so slots B, C, D, F, G, H of
   the stack copy now hold the final b, c, d, f, g, h.
   NOTE(review): the two adds below go through %ebp, which the visible
   code last pointed at the stack copy; presumably %ebp is reloaded from
   ARG_STATE just before them so they reach the caller's state --
   confirm. */
293 addl r0, A(%ebp) /* a from last round */
294 addl r1, E(%ebp) /* e from last round */
/* Fetch the remaining six values from the stack copy into r0..r5.
   NOTE(review): the additions of these six into the state are not
   visible in this view. */
295 movl B + STACK_W(%esp), r0
296 movl C + STACK_W(%esp), r1
297 movl D + STACK_W(%esp), r2
298 movl F + STACK_W(%esp), r3
299 movl G + STACK_W(%esp), r4
300 movl H + STACK_W(%esp), r5
/* Release W and the state copy */
308 addl $STACK_SIZE, %esp
316 #endif /* SILC_SHA256_X86 */