From: Pekka Riikonen Date: Sat, 1 Sep 2007 10:12:37 +0000 (+0000) Subject: Added x86 optimized SHA-256. X-Git-Tag: 1.2.beta1~126 X-Git-Url: http://git.silcnet.org/gitweb/?p=crypto.git;a=commitdiff_plain;h=bc3393d27dac88c719a9a52108757d116b28db20 Added x86 optimized SHA-256. --- diff --git a/lib/silccrypt/Makefile.ad b/lib/silccrypt/Makefile.ad index e4cb4950..0d1aa46b 100644 --- a/lib/silccrypt/Makefile.ad +++ b/lib/silccrypt/Makefile.ad @@ -38,6 +38,8 @@ libsilccrypt_la_SOURCES = \ dsa.c \ sha1.c \ sha256.c \ + sha256_x86.S \ + sha512.c \ twofish.c \ blowfish.c \ cast5.c \ diff --git a/lib/silccrypt/sha256.c b/lib/silccrypt/sha256.c index 0598837e..4b149a8d 100644 --- a/lib/silccrypt/sha256.c +++ b/lib/silccrypt/sha256.c @@ -1,15 +1,8 @@ -/* Modified for SILC -Pekka */ - -/* LibTomCrypt, modular cryptographic library -- Tom St Denis - * - * LibTomCrypt is a library that provides various cryptographic - * algorithms in a highly modular and flexible manner. - * - * The library is free for all purposes without any express - * guarantee it works. - * - * Tom St Denis, tomstdenis@gmail.com, http://libtomcrypt.org - */ +/* Taken from public domain libtomcrypt library and the code and all changes + to it are in public domain. 
-Pekka */ + +/* LibTomCrypt, modular cryptographic library -- Tom St Denis */ + #include "silc.h" #include "sha256_internal.h" #include "sha256.h" @@ -35,7 +28,7 @@ SILC_HASH_API_FINAL(sha256) SILC_HASH_API_TRANSFORM(sha256) { - sha256_compress(state, (unsigned char *)buffer); + sha256_transform(state, (unsigned char *)buffer); } SILC_HASH_API_CONTEXT_LEN(sha256) @@ -60,214 +53,199 @@ SILC_HASH_API_CONTEXT_LEN(sha256) #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) -/* compress 512-bits */ -int sha256_compress(SilcUInt32 *state, unsigned char *buf) -{ - SilcUInt32 S[8], W[64], t0, t1; - int i; - - /* copy state into S */ - for (i = 0; i < 8; i++) { - S[i] = state[i]; - } - - /* copy the state into 512-bits into W[0..15] */ - for (i = 0; i < 16; i++) - SILC_GET32_MSB(W[i], buf + (4 * i)); +#ifndef SILC_SHA256_ASM - /* fill W[16..63] */ - for (i = 16; i < 64; i++) { - W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16]; - } - - /* Compress */ -#define RND(a,b,c,d,e,f,g,h,i,ki) \ - t0 = h + Sigma1(e) + Ch(e, f, g) + ki + W[i]; \ - t1 = Sigma0(a) + Maj(a, b, c); \ - d += t0; \ - h = t0 + t1; - - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],0,0x428a2f98); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],1,0x71374491); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],2,0xb5c0fbcf); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],3,0xe9b5dba5); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],4,0x3956c25b); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],5,0x59f111f1); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],6,0x923f82a4); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],7,0xab1c5ed5); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],8,0xd807aa98); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],9,0x12835b01); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],10,0x243185be); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],11,0x550c7dc3); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],12,0x72be5d74); - 
RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],13,0x80deb1fe); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],14,0x9bdc06a7); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],15,0xc19bf174); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],16,0xe49b69c1); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],17,0xefbe4786); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],18,0x0fc19dc6); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],19,0x240ca1cc); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],20,0x2de92c6f); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],21,0x4a7484aa); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],22,0x5cb0a9dc); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],23,0x76f988da); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],24,0x983e5152); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],25,0xa831c66d); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],26,0xb00327c8); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],27,0xbf597fc7); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],28,0xc6e00bf3); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],29,0xd5a79147); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],30,0x06ca6351); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],31,0x14292967); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],32,0x27b70a85); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],33,0x2e1b2138); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],34,0x4d2c6dfc); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],35,0x53380d13); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],36,0x650a7354); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],37,0x766a0abb); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],38,0x81c2c92e); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],39,0x92722c85); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],40,0xa2bfe8a1); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],41,0xa81a664b); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],42,0xc24b8b70); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],43,0xc76c51a3); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],44,0xd192e819); - 
RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],45,0xd6990624); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],46,0xf40e3585); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],47,0x106aa070); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],48,0x19a4c116); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],49,0x1e376c08); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],50,0x2748774c); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],51,0x34b0bcb5); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],52,0x391c0cb3); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],53,0x4ed8aa4a); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],54,0x5b9cca4f); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],55,0x682e6ff3); - RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],56,0x748f82ee); - RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],57,0x78a5636f); - RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],58,0x84c87814); - RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],59,0x8cc70208); - RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],60,0x90befffa); - RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],61,0xa4506ceb); - RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],62,0xbef9a3f7); - RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],63,0xc67178f2); +/* Transform 512-bits */ +void sha256_transform(SilcUInt32 *state, unsigned char *buf) +{ + SilcUInt32 S[8], W[64], t0, t1; + int i; + + /* copy state into S */ + for (i = 0; i < 8; i++) { + S[i] = state[i]; + } + + /* copy the state into 512-bits into W[0..15] */ + for (i = 0; i < 16; i++) + SILC_GET32_MSB(W[i], buf + (4 * i)); + + /* fill W[16..63] */ + for (i = 16; i < 64; i++) { + W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16]; + } + + /* Compress */ +#define RND(a,b,c,d,e,f,g,h,i,ki) \ + t0 = h + Sigma1(e) + Ch(e, f, g) + ki + W[i]; \ + t1 = Sigma0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],0,0x428a2f98); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],1,0x71374491); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],2,0xb5c0fbcf); + 
RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],3,0xe9b5dba5); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],4,0x3956c25b); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],5,0x59f111f1); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],6,0x923f82a4); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],7,0xab1c5ed5); + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],8,0xd807aa98); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],9,0x12835b01); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],10,0x243185be); + RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],11,0x550c7dc3); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],12,0x72be5d74); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],13,0x80deb1fe); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],14,0x9bdc06a7); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],15,0xc19bf174); + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],16,0xe49b69c1); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],17,0xefbe4786); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],18,0x0fc19dc6); + RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],19,0x240ca1cc); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],20,0x2de92c6f); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],21,0x4a7484aa); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],22,0x5cb0a9dc); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],23,0x76f988da); + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],24,0x983e5152); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],25,0xa831c66d); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],26,0xb00327c8); + RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],27,0xbf597fc7); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],28,0xc6e00bf3); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],29,0xd5a79147); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],30,0x06ca6351); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],31,0x14292967); + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],32,0x27b70a85); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],33,0x2e1b2138); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],34,0x4d2c6dfc); + 
RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],35,0x53380d13); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],36,0x650a7354); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],37,0x766a0abb); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],38,0x81c2c92e); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],39,0x92722c85); + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],40,0xa2bfe8a1); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],41,0xa81a664b); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],42,0xc24b8b70); + RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],43,0xc76c51a3); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],44,0xd192e819); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],45,0xd6990624); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],46,0xf40e3585); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],47,0x106aa070); + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],48,0x19a4c116); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],49,0x1e376c08); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],50,0x2748774c); + RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],51,0x34b0bcb5); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],52,0x391c0cb3); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],53,0x4ed8aa4a); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],54,0x5b9cca4f); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],55,0x682e6ff3); + RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],56,0x748f82ee); + RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],57,0x78a5636f); + RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],58,0x84c87814); + RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],59,0x8cc70208); + RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],60,0x90befffa); + RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],61,0xa4506ceb); + RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],62,0xbef9a3f7); + RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],63,0xc67178f2); #undef RND - /* feedback */ - for (i = 0; i < 8; i++) { - state[i] = state[i] + S[i]; - } - return TRUE; + /* feedback */ + for (i = 0; i < 8; i++) { + state[i] = state[i] + S[i]; + } } -/** - Initialize 
the hash state - @param md The hash state you wish to initialize - @return CRYPT_OK if successful -*/ +#endif /* !SILC_SHA256_ASM */ + int sha256_init(sha256_state * md) { - md->length = 0; - md->curlen = 0; - md->state[0] = 0x6A09E667UL; - md->state[1] = 0xBB67AE85UL; - md->state[2] = 0x3C6EF372UL; - md->state[3] = 0xA54FF53AUL; - md->state[4] = 0x510E527FUL; - md->state[5] = 0x9B05688CUL; - md->state[6] = 0x1F83D9ABUL; - md->state[7] = 0x5BE0CD19UL; - return TRUE; + md->length = 0; + md->curlen = 0; + md->state[0] = 0x6A09E667UL; + md->state[1] = 0xBB67AE85UL; + md->state[2] = 0x3C6EF372UL; + md->state[3] = 0xA54FF53AUL; + md->state[4] = 0x510E527FUL; + md->state[5] = 0x9B05688CUL; + md->state[6] = 0x1F83D9ABUL; + md->state[7] = 0x5BE0CD19UL; + return TRUE; } #if !defined(MIN) #define MIN(x,y) ((x)<(y)?(x):(y)) #endif -/** - Process a block of memory though the hash - @param md The hash state - @param in The data to hash - @param inlen The length of the data (octets) - @return CRYPT_OK if successful -*/ int sha256_process(sha256_state * md, const unsigned char *in, unsigned long inlen) { - unsigned long n; - int err, block_size = sizeof(md->buf); - - if (md->curlen > block_size) - return FALSE; - - while (inlen > 0) { - if (md->curlen == 0 && inlen >= block_size) { - if ((err = sha256_compress(md->state, (unsigned char *)in)) != TRUE) - return err; - md->length += block_size * 8; - in += block_size; - inlen -= block_size; - } else { - n = MIN(inlen, (block_size - md->curlen)); - memcpy(md->buf + md->curlen, in, (size_t)n); - md->curlen += n; - in += n; - inlen -= n; - if (md->curlen == block_size) { - if ((err = sha256_compress(md->state, md->buf)) != TRUE) - return err; - md->length += block_size * 8; - md->curlen = 0; - } - } + unsigned long n; + int block_size = sizeof(md->buf); + + if (md->curlen > block_size) + return FALSE; + + while (inlen > 0) { + if (md->curlen == 0 && inlen >= block_size) { + sha256_transform(md->state, (unsigned char *)in); + 
md->length += block_size * 8; + in += block_size; + inlen -= block_size; + } else { + n = MIN(inlen, (block_size - md->curlen)); + memcpy(md->buf + md->curlen, in, (size_t)n); + md->curlen += n; + in += n; + inlen -= n; + if (md->curlen == block_size) { + sha256_transform(md->state, md->buf); + md->length += block_size * 8; + md->curlen = 0; + } } - return TRUE; + } + return TRUE; } -/** - Terminate the hash to get the digest - @param md The hash state - @param out [out] The destination of the hash (32 bytes) - @return CRYPT_OK if successful -*/ int sha256_done(sha256_state * md, unsigned char *out) { - int i; - - if (md->curlen >= sizeof(md->buf)) - return FALSE; - - /* increase the length of the message */ - md->length += md->curlen * 8; - - /* append the '1' bit */ - md->buf[md->curlen++] = (unsigned char)0x80; - - /* if the length is currently above 56 bytes we append zeros - * then compress. Then we can fall back to padding zeros and length - * encoding like normal. - */ - if (md->curlen > 56) { - while (md->curlen < 64) { - md->buf[md->curlen++] = (unsigned char)0; - } - sha256_compress(md->state, md->buf); - md->curlen = 0; - } + int i; + + if (md->curlen >= sizeof(md->buf)) + return FALSE; - /* pad upto 56 bytes of zeroes */ - while (md->curlen < 56) { - md->buf[md->curlen++] = (unsigned char)0; + /* increase the length of the message */ + md->length += md->curlen * 8; + + /* append the '1' bit */ + md->buf[md->curlen++] = (unsigned char)0x80; + + /* if the length is currently above 56 bytes we append zeros + * then compress. Then we can fall back to padding zeros and length + * encoding like normal. 
+ */ + if (md->curlen > 56) { + while (md->curlen < 64) { + md->buf[md->curlen++] = (unsigned char)0; } + sha256_transform(md->state, md->buf); + md->curlen = 0; + } + + /* pad upto 56 bytes of zeroes */ + while (md->curlen < 56) { + md->buf[md->curlen++] = (unsigned char)0; + } - /* store length */ - SILC_PUT64_MSB(md->length, md->buf + 56); - sha256_compress(md->state, md->buf); + /* store length */ + SILC_PUT64_MSB(md->length, md->buf + 56); + sha256_transform(md->state, md->buf); - /* copy output */ - for (i = 0; i < 8; i++) - SILC_PUT32_MSB(md->state[i], out + (4 * i)); + /* copy output */ + for (i = 0; i < 8; i += 2) { + SILC_PUT32_MSB(md->state[i], out + (4 * i)); + SILC_PUT32_MSB(md->state[i + 1], out + (4 * (i + 1))); + } - return TRUE; + return TRUE; } diff --git a/lib/silccrypt/sha256_internal.h b/lib/silccrypt/sha256_internal.h index 6f61fbb6..4ad57221 100644 --- a/lib/silccrypt/sha256_internal.h +++ b/lib/silccrypt/sha256_internal.h @@ -10,7 +10,7 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. - + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the @@ -29,9 +29,9 @@ typedef struct { } sha256_state; int sha256_init(sha256_state * md); -int sha256_process(sha256_state * md, const unsigned char *in, +int sha256_process(sha256_state * md, const unsigned char *in, unsigned long inlen); int sha256_done(sha256_state * md, unsigned char *hash); -int sha256_compress(SilcUInt32 *state, unsigned char *buf); +void sha256_transform(SilcUInt32 *state, unsigned char *buf); #endif /* SHA256_INTERNAL_H */ diff --git a/lib/silccrypt/sha256_x86.S b/lib/silccrypt/sha256_x86.S new file mode 100644 index 00000000..728f3e63 --- /dev/null +++ b/lib/silccrypt/sha256_x86.S @@ -0,0 +1,296 @@ +/* + + sha256_x86.S + + Author: Pekka Riikonen + + Copyright (C) 2007 Pekka Riikonen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + +*/ + +/* SHA-256 x86 assembler implementation. This implements only the SHA-256 + transform function and other parts are implemented in sha256.c. The + function preserves ebp, ebx, edi and esi but does not preserve other + registers. + + This implementation uses only 32-bit registers. It does not use MMX or + SSE registers which could be used to enhance the performance, especially + when loading the W. This is about as fast as we can get with fewer than + 8 32-bit registers on a 32-bit CPU. 
+ +*/ + +#ifdef SILC_SHA256_ASM + +#define STACK_STATE (8 * 4) +#define STACK_W (64 * 4) +#define STACK_SIZE STACK_STATE + STACK_W +#define ARG_STATE STACK_SIZE + 20(%esp) +#define ARG_BUF STACK_SIZE + 24(%esp) + +#define A 0 +#define B 4 +#define C 8 +#define D 12 +#define E 16 +#define F 20 +#define G 24 +#define H 28 + +#define r0 %eax +#define r1 %ebx +#define r2 %ecx +#define r3 %edx +#define r4 %edi +#define r5 %esi + +/* One round of SHA-256. The a (r0) and e (r1) are inputs already in + registers. The new h is computed in r0 and the new d in r1; because the + state rotates every round, they are the next round's a (r0) and e (r1). */ +#define RND(a, b, c, d, e, f, g, h, W, ki) \ + movl f(%ebp), r2; \ + movl g(%ebp), r3; \ + \ + movl e, r4; /* e to Sigma1 */ \ + rorl $6, r4; /* Sigma1 >>= 6 */ \ + movl r4, r5; /* Sigma1 to temp */ \ + rorl $5, r4; /* Sigma1 >>= 5 (11) */ \ + xorl r4, r5; /* temp ^= Sigma1 */ \ + rorl $14, r4; /* Sigma1 >>= 14 (25) */ \ + xorl r5, r4; /* Sigma1 ^= temp */ \ + \ + movl r3, r5; /* g to Ch */ \ + xorl r2, r5; /* Ch ^= f */ \ + andl e, r5; /* Ch &= e */ \ + xorl r3, r5; /* Ch ^= g */ \ + \ + movl h(%ebp), r3; \ + movl d(%ebp), r1; \ + leal ki(r4, r5), r4; /* t0 = Sigma1 + Ch + ki */ \ + addl W * 4(%esp), r4; /* t0 += W[i] */ \ + addl r4, r3; /* h += t0 */ \ + addl r3, r1; /* d += h (t0) */ \ + \ + movl a, r4; /* a to Sigma0 */ \ + rorl $2, r4; /* Sigma0 >>= 2 */ \ + movl r4, r5; /* Sigma0 to temp */ \ + rorl $11, r4; /* Sigma0 >>= 11 (13) */ \ + xorl r4, r5; /* temp ^= Sigma0 */ \ + rorl $9, r4; /* Sigma0 >>= 9 (22) */ \ + xorl r5, r4; /* Sigma0 ^= temp */ \ + \ + addl r3, r4; /* t1 = Sigma0 + h (t0) */ \ + movl b(%ebp), r2; \ + movl c(%ebp), r3; \ + \ + movl r2, r5; /* b to temp */ \ + orl a, r5; /* temp |= a */ \ + andl r3, r5; /* temp &= c */ \ + andl r2, a; /* a &= b */ \ + orl r5, a; /* a |= temp */ \ + addl r4, r0; /* h = t0 + t1 */ + +#define ROUND(a, b, c, d, e, f, g, h, W, ki) \ + RND(a, b, c, d, e, f, g, h, W, ki) \ + movl r1, d(%ebp); 
/* Update d in stack */ \ + movl r0, h(%ebp); /* Update h in stack */ + +/* Get 64 bits from input buffer in MSB first order */ +#define GET_BUF(i) \ + movl i * 4(r5), r4; \ + movl (i + 1) * 4(r5), r3; \ + bswapl r4; \ + bswapl r3; \ + movl r4, i * 4(%esp); \ + movl r3, (i + 1) * 4(%esp); + +/* Expand the input */ +#define EXP_BUF(i) \ + rorl $17, r4; /* Gamma1 >>= 17 */ \ + movl r4, r5; /* Gamma1 to temp */ \ + rorl $2, r4; /* Gamma1 >>= 2 (19) */ \ + xorl r4, r5; /* temp ^= Gamma1 */ \ + shrl $10, r2; /* w-2 >> 10 */ \ + xorl r5, r2; /* Gamma1 = w-2 ^ temp */ \ + \ + addl (i - 7) * 4(%esp), r2; /* Gamma1 += w-7 */ \ + addl (i - 16) * 4(%esp), r2; /* Gamma1 += w-16 */ \ + \ + movl (i - 15) * 4(%esp), r3; \ + movl r3, r4; /* w-15 to Gamma0 */ \ + rorl $7, r4; /* Gamma0 >>= 7 */ \ + movl r4, r5; /* Gamma0 to temp */ \ + rorl $11, r4; /* Gamma0 >>= 11 (18) */ \ + xorl r4, r5; /* temp ^= Gamma0 */ \ + shrl $3, r3; /* w-15 >> 3 */ \ + xorl r5, r3; /* Gamma0 = w-15 ^ temp */ \ + \ + addl r2, r3; /* Gamma0 += Gamma1 */ \ + movl r3, i * 4(%esp); + +#define EXP_BUF0(i) \ + movl r4, r2; \ + EXP_BUF(i) + +#define EXP_BUFX(i) \ + movl (i - 2) * 4(%esp), r2; \ + movl r2, r4; /* w-2 to Gamma1 */ \ + EXP_BUF(i) + + +.text +.align 4 +.globl sha256_transform +sha256_transform: + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + subl $STACK_SIZE, %esp + + /* State to stack */ + movl ARG_STATE, %ebp + movl A(%ebp), r0 + movl B(%ebp), r1 + movl C(%ebp), r2 + movl D(%ebp), r3 + movl r0, A + STACK_W(%esp) + movl r1, B + STACK_W(%esp) + movl r2, C + STACK_W(%esp) + movl r3, D + STACK_W(%esp) + movl E(%ebp), r1 + movl F(%ebp), r2 + movl G(%ebp), r3 + movl H(%ebp), r4 + movl r1, E + STACK_W(%esp) + movl r2, F + STACK_W(%esp) + movl r3, G + STACK_W(%esp) + movl r4, H + STACK_W(%esp) + + /* Get buf in MSB first order, W[0..15] */ + movl ARG_BUF, r5 + GET_BUF(0) GET_BUF(2) GET_BUF(4) GET_BUF(6) + GET_BUF(8) GET_BUF(10) GET_BUF(12) GET_BUF(14) + + /* Expand input, fill in W[16..63] */ + 
EXP_BUF0(16) EXP_BUFX(17) EXP_BUFX(18) EXP_BUFX(19) EXP_BUFX(20) + EXP_BUFX(21) EXP_BUFX(22) EXP_BUFX(23) EXP_BUFX(24) EXP_BUFX(25) + EXP_BUFX(26) EXP_BUFX(27) EXP_BUFX(28) EXP_BUFX(29) EXP_BUFX(30) + EXP_BUFX(31) EXP_BUFX(32) EXP_BUFX(33) EXP_BUFX(34) EXP_BUFX(35) + EXP_BUFX(36) EXP_BUFX(37) EXP_BUFX(38) EXP_BUFX(39) EXP_BUFX(40) + EXP_BUFX(41) EXP_BUFX(42) EXP_BUFX(43) EXP_BUFX(44) EXP_BUFX(45) + EXP_BUFX(46) EXP_BUFX(47) EXP_BUFX(48) EXP_BUFX(49) EXP_BUFX(50) + EXP_BUFX(51) EXP_BUFX(52) EXP_BUFX(53) EXP_BUFX(54) EXP_BUFX(55) + EXP_BUFX(56) EXP_BUFX(57) EXP_BUFX(58) EXP_BUFX(59) EXP_BUFX(60) + EXP_BUFX(61) EXP_BUFX(62) EXP_BUFX(63) + + /* Hash, r0 and r1 set above, ebp is base address to state */ + leal STACK_W(%esp), %ebp + + ROUND(r0, B, C, D, r1, F, G, H, 0, 0x428a2f98); + ROUND(r0, A, B, C, r1, E, F, G, 1, 0x71374491); + ROUND(r0, H, A, B, r1, D, E, F, 2, 0xb5c0fbcf); + ROUND(r0, G, H, A, r1, C, D, E, 3, 0xe9b5dba5); + ROUND(r0, F, G, H, r1, B, C, D, 4, 0x3956c25b); + ROUND(r0, E, F, G, r1, A, B, C, 5, 0x59f111f1); + ROUND(r0, D, E, F, r1, H, A, B, 6, 0x923f82a4); + ROUND(r0, C, D, E, r1, G, H, A, 7, 0xab1c5ed5); + + ROUND(r0, B, C, D, r1, F, G, H, 8, 0xd807aa98); + ROUND(r0, A, B, C, r1, E, F, G, 9, 0x12835b01); + ROUND(r0, H, A, B, r1, D, E, F, 10, 0x243185be); + ROUND(r0, G, H, A, r1, C, D, E, 11, 0x550c7dc3); + ROUND(r0, F, G, H, r1, B, C, D, 12, 0x72be5d74); + ROUND(r0, E, F, G, r1, A, B, C, 13, 0x80deb1fe); + ROUND(r0, D, E, F, r1, H, A, B, 14, 0x9bdc06a7); + ROUND(r0, C, D, E, r1, G, H, A, 15, 0xc19bf174); + + ROUND(r0, B, C, D, r1, F, G, H, 16, 0xe49b69c1); + ROUND(r0, A, B, C, r1, E, F, G, 17, 0xefbe4786); + ROUND(r0, H, A, B, r1, D, E, F, 18, 0x0fc19dc6); + ROUND(r0, G, H, A, r1, C, D, E, 19, 0x240ca1cc); + ROUND(r0, F, G, H, r1, B, C, D, 20, 0x2de92c6f); + ROUND(r0, E, F, G, r1, A, B, C, 21, 0x4a7484aa); + ROUND(r0, D, E, F, r1, H, A, B, 22, 0x5cb0a9dc); + ROUND(r0, C, D, E, r1, G, H, A, 23, 0x76f988da); + + ROUND(r0, B, C, D, r1, F, G, H, 24, 
0x983e5152); + ROUND(r0, A, B, C, r1, E, F, G, 25, 0xa831c66d); + ROUND(r0, H, A, B, r1, D, E, F, 26, 0xb00327c8); + ROUND(r0, G, H, A, r1, C, D, E, 27, 0xbf597fc7); + ROUND(r0, F, G, H, r1, B, C, D, 28, 0xc6e00bf3); + ROUND(r0, E, F, G, r1, A, B, C, 29, 0xd5a79147); + ROUND(r0, D, E, F, r1, H, A, B, 30, 0x06ca6351); + ROUND(r0, C, D, E, r1, G, H, A, 31, 0x14292967); + + ROUND(r0, B, C, D, r1, F, G, H, 32, 0x27b70a85); + ROUND(r0, A, B, C, r1, E, F, G, 33, 0x2e1b2138); + ROUND(r0, H, A, B, r1, D, E, F, 34, 0x4d2c6dfc); + ROUND(r0, G, H, A, r1, C, D, E, 35, 0x53380d13); + ROUND(r0, F, G, H, r1, B, C, D, 36, 0x650a7354); + ROUND(r0, E, F, G, r1, A, B, C, 37, 0x766a0abb); + ROUND(r0, D, E, F, r1, H, A, B, 38, 0x81c2c92e); + ROUND(r0, C, D, E, r1, G, H, A, 39, 0x92722c85); + + ROUND(r0, B, C, D, r1, F, G, H, 40, 0xa2bfe8a1); + ROUND(r0, A, B, C, r1, E, F, G, 41, 0xa81a664b); + ROUND(r0, H, A, B, r1, D, E, F, 42, 0xc24b8b70); + ROUND(r0, G, H, A, r1, C, D, E, 43, 0xc76c51a3); + ROUND(r0, F, G, H, r1, B, C, D, 44, 0xd192e819); + ROUND(r0, E, F, G, r1, A, B, C, 45, 0xd6990624); + ROUND(r0, D, E, F, r1, H, A, B, 46, 0xf40e3585); + ROUND(r0, C, D, E, r1, G, H, A, 47, 0x106aa070); + + ROUND(r0, B, C, D, r1, F, G, H, 48, 0x19a4c116); + ROUND(r0, A, B, C, r1, E, F, G, 49, 0x1e376c08); + ROUND(r0, H, A, B, r1, D, E, F, 50, 0x2748774c); + ROUND(r0, G, H, A, r1, C, D, E, 51, 0x34b0bcb5); + ROUND(r0, F, G, H, r1, B, C, D, 52, 0x391c0cb3); + ROUND(r0, E, F, G, r1, A, B, C, 53, 0x4ed8aa4a); + ROUND(r0, D, E, F, r1, H, A, B, 54, 0x5b9cca4f); + ROUND(r0, C, D, E, r1, G, H, A, 55, 0x682e6ff3); + + ROUND(r0, B, C, D, r1, F, G, H, 56, 0x748f82ee); + ROUND(r0, A, B, C, r1, E, F, G, 57, 0x78a5636f); + ROUND(r0, H, A, B, r1, D, E, F, 58, 0x84c87814); + ROUND(r0, G, H, A, r1, C, D, E, 59, 0x8cc70208); + ROUND(r0, F, G, H, r1, B, C, D, 60, 0x90befffa); + ROUND(r0, E, F, G, r1, A, B, C, 61, 0xa4506ceb); + ROUND(r0, D, E, F, r1, H, A, B, 62, 0xbef9a3f7); + RND (r0, C, D, E, r1, G, H, A, 63, 
0xc67178f2); + + /* Update state from stack */ + movl ARG_STATE, %ebp + addl r0, A(%ebp) /* a from last round */ + addl r1, E(%ebp) /* e from last round */ + movl B + STACK_W(%esp), r0 + movl C + STACK_W(%esp), r1 + movl D + STACK_W(%esp), r2 + movl F + STACK_W(%esp), r3 + movl G + STACK_W(%esp), r4 + movl H + STACK_W(%esp), r5 + addl r0, B(%ebp) + addl r1, C(%ebp) + addl r2, D(%ebp) + addl r3, F(%ebp) + addl r4, G(%ebp) + addl r5, H(%ebp) + + addl $STACK_SIZE, %esp + popl %esi + popl %edi + popl %ebx + popl %ebp + + ret + +#endif /* SILC_SHA256_ASM */